diff --git a/.gitignore b/.gitignore
index 3598e611a25ae558ec4aef823985397aa55a414d..69ed0a703a1bbecfb0b30f3f513afadcac78c886 100644
--- a/.gitignore
+++ b/.gitignore
@@ -111,3 +111,8 @@ tests/check_cdo_selectors
 /tests/check_memlock
 MaestroConfig.cmake
 dst
+tests/check_pm_dist_cdo.sh
+tests/simple_dist_client_1
+tests/simple_dist_client_2
+tests/simple_dist_client_3
+tests/simple_dist_client_4
diff --git a/ChangeLog b/ChangeLog
new file mode 100644
index 0000000000000000000000000000000000000000..74ec42a676690700f3150cb1e9c7063b63838a95
--- /dev/null
+++ b/ChangeLog
@@ -0,0 +1,55 @@
+2022-01-18  Utz-Uwe Haus  <uhaus@hpe.com>
+
+	* Release 0.3
+
+	* Major changes:
+	- Include libfabric 1.14.0
+	- Include mamba 0.1.9
+
+	* New features:
+
+	- DISPOSE-and-REUSE functionality
+
+	- proper support for multi-ranked applications (using component-index in addition
+	  to component name)
+
+	- (limited) support for distributed CDOs
+
	- documentation at https://maestro-core.readthedocs.io/
+
+	- OFI endpoint serialization using a protobuf-based format, and a tool to decode
	  pool manager info to human-readable form (tests/decode_pm_info)
+
+	- asynchronous CDO operations
+
+
+
+	* Other notable changes:
+
	- add origin-side timestamps to pool protocol events and expose them in application-side events
+
+	- numerous performance enhancements and memory leak fixes
+
+	- Default to DRC node insecure mode by switching to
+	  DRC_FLAGS_FLEX_CREDENTIAL (can be disabled). This solves the problem of using GNI from
+	  multiple applications on the same compute node.
+
+	- Streamlined logging, especially on the pool manager when not doing DEBUG level tracing
+
+	- aggressive CDO state change checking, ensuring invalid state changes are detected early.
+
+	- cleaner handling of rejected JOIN (e.g., if trying to join twice with the same IDs)
+
+	- proper support for 0-sized CDOs
+
	- better separation of data for GFS transport

	- default to low-debugging compilation unless the --enable-developer configure flag is given
+
+
+
+2021-09-20  Utz-Uwe Haus  <uhaus@hpe.com>
+
+	* Release 0.2, commit 519fea863b3fc5cf8f8fb568c1220836e37dc3be
+	First stable release of maestro core.
+
diff --git a/INSTALL.md b/INSTALL.md
index b0bee5d6746b3ea9f848c93742ba33540d75f2e9..8fcbc2189cfc6fee88bd19fe08e335e8a6c79f61 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -42,6 +42,21 @@ To update a dependency project (e.g. mamba) subtree please do
 git subtree pull --prefix=deps/mamba --squash git@gitlab.com:cerl/mamba.git master
 ```
 
+If `git` refuses to perform the subtree update, insisting the repo 'was never added', create a remote for it, e.g.,
+
+```
+git remote add -f ofiwg git@github.com:ofiwg/libfabric.git
+```
+
+and use that repo name for the `subtree pull` command.
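+
+A minimal sketch of the follow-up pull, assuming the subtree lives under `deps/libfabric` and tracks the `main` branch (adjust both to your checkout):
+
+```
+git subtree pull --prefix=deps/libfabric --squash ofiwg main
+```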
+
+
 
 # Building
 
diff --git a/Makefile.am b/Makefile.am
index 7afe7a878ca777a7eb7d8061d6972f4f0cbfbbdd..8c39d4948ef67dec1f68d84f026b46aa8936d289 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -92,7 +92,7 @@ coverage: all
 	$(LCOV) --base-directory . --directory . --zerocounters -q
 	$(MAKE) check
 	rm -f maestro.info
-	for d in `find . -name \*.o|grep -v 'deps/'` ; do                            \
	for d in `find . -name \*.o|grep -v 'deps/\|\.libs/'` ; do                           \
 		if test -f `dirname $$d`/`basename $$d .o`.gcno ; then               \
 		  $(LCOV) -c -d $$d -b `dirname $$d` --gcov-tool $(GCOV) -o tmp.info;\
 		  cat <tmp.info >>maestro.info;                                      \
diff --git a/README.md b/README.md
index dc861cfad21e8097efe1aa434fdd80898a0fda46..093e574483809eab6802e29d3d75b7cd6c1cb3ec 100644
--- a/README.md
+++ b/README.md
@@ -188,7 +188,7 @@ automatically launched with make check.
 
 # Documentation
 
-Doxygen documentation is available and compiled in [docs](./docs) folder.
+Both Doxygen and Sphinx documentation are available and compiled into the [docs](./docs) folder.
 
 # Common issues/FAQs
 
diff --git a/attributes/Makefile.am b/attributes/Makefile.am
index c5b08a0b73608bbfcf1f6d80a9745f3669fe5fa7..73bc1424972400c2d116049d4de017d05de19ae9 100644
--- a/attributes/Makefile.am
+++ b/attributes/Makefile.am
@@ -40,7 +40,8 @@ BUILT_SOURCES = maestro-core-yaml.h ecmwf-yaml.h
 libattributes_la_CPPFLAGS = \
 		-I$(top_srcdir) \
 		-I$(top_srcdir)/protocols \
-		-I$(top_srcdir)/deps/mamba \
+		-I$(top_srcdir)/deps/mamba/common \
+		-I$(top_srcdir)/deps/mamba/memory \
                 -I$(top_srcdir)/include \
 		-I$(top_srcdir)/deps/libyaml/include \
 		-I$(top_srcdir)/deps/libcyaml/include 
diff --git a/attributes/maestro-core-yaml.h b/attributes/maestro-core-yaml.h
index daafd9ef8db3bec15f44eb0856da9897c251f368..7fbb6fffaf6f460a3186d1d53f52e823541a99c5 100644
--- a/attributes/maestro-core-yaml.h
+++ b/attributes/maestro-core-yaml.h
@@ -51,629 +51,655 @@ unsigned char maestro_core_yaml[] = {
   0x20, 0x75, 0x73, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x69,
   0x73, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x0a, 0x23, 0x73, 0x63,
   0x68, 0x65, 0x6d, 0x61, 0x2d, 0x74, 0x79, 0x70, 0x65, 0x73, 0x3a, 0x0a,
-  0x20, 0x20, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x2d, 0x20, 0x20, 0x74, 0x79,
-  0x70, 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x6c, 0x69, 0x66, 0x65,
-  0x74, 0x69, 0x6d, 0x65, 0x2d, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x76, 0x61,
-  0x6c, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70,
-  0x65, 0x73, 0x70, 0x65, 0x63, 0x3a, 0x20, 0x6d, 0x61, 0x70, 0x28, 0x29,
-  0x0a, 0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75,
-  0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x43,
-  0x44, 0x4f, 0x20, 0x6c, 0x69, 0x66, 0x65, 0x74, 0x69, 0x6d, 0x65, 0x20,
-  0x74, 0x79, 0x70, 0x65, 0x0a, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x2d, 0x20,
-  0x20, 0x74, 0x79, 0x70, 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x6c,
-  0x69, 0x66, 0x65, 0x74, 0x69, 0x6d, 0x65, 0x2d, 0x73, 0x74, 0x72, 0x69,
-  0x63, 0x74, 0x6e, 0x65, 0x73, 0x73, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x20,
-  0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x73, 0x70, 0x65, 0x63, 0x3a, 0x20,
-  0x2a, 0x6c, 0x69, 0x66, 0x65, 0x74, 0x69, 0x6d, 0x65, 0x2d, 0x73, 0x74,
-  0x72, 0x69, 0x63, 0x74, 0x6e, 0x65, 0x73, 0x73, 0x2d, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x73, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x20, 0x64,
-  0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e,
-  0x3a, 0x20, 0x43, 0x44, 0x4f, 0x20, 0x6c, 0x69, 0x66, 0x65, 0x74, 0x69,
-  0x6d, 0x65, 0x20, 0x73, 0x70, 0x65, 0x63, 0x69, 0x66, 0x69, 0x65, 0x72,
-  0x20, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x70, 0x72, 0x65, 0x74, 0x61, 0x74,
-  0x69, 0x6f, 0x6e, 0x0a, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x2d, 0x20, 0x20,
-  0x74, 0x79, 0x70, 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x61, 0x63,
-  0x63, 0x65, 0x73, 0x73, 0x2d, 0x6d, 0x6f, 0x64, 0x65, 0x0a, 0x20, 0x20,
-  0x23, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x73, 0x70, 0x65,
-  0x63, 0x3a, 0x20, 0x2a, 0x61, 0x63, 0x63, 0x65, 0x73, 0x73, 0x2d, 0x6d,
-  0x6f, 0x64, 0x65, 0x2d, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x73, 0x0a, 0x20,
+  0x0a, 0x20, 0x20, 0x23, 0x20, 0x2d, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65,
+  0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x6c, 0x69, 0x66, 0x65, 0x74, 0x69,
+  0x6d, 0x65, 0x2d, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x76, 0x61, 0x6c, 0x0a,
+  0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x73,
+  0x70, 0x65, 0x63, 0x3a, 0x20, 0x6d, 0x61, 0x70, 0x28, 0x29, 0x0a, 0x20,
   0x20, 0x23, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65,
   0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x43, 0x44, 0x4f,
-  0x20, 0x61, 0x63, 0x63, 0x65, 0x73, 0x73, 0x20, 0x6d, 0x6f, 0x64, 0x65,
-  0x0a, 0x0a, 0x23, 0x20, 0x49, 0x6e, 0x64, 0x69, 0x76, 0x69, 0x64, 0x75,
-  0x61, 0x6c, 0x20, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65,
-  0x20, 0x64, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x69, 0x6f, 0x6e,
-  0x73, 0x0a, 0x6d, 0x61, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x2d, 0x61, 0x74,
-  0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x73, 0x3a, 0x0a, 0x20, 0x20,
-  0x23, 0x23, 0x20, 0x54, 0x4f, 0x50, 0x4c, 0x45, 0x56, 0x45, 0x4c, 0x20,
-  0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x73, 0x0a, 0x0a,
-  0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x2e, 0x6d,
-  0x61, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x2e, 0x63, 0x6f, 0x72, 0x65, 0x2e,
-  0x63, 0x64, 0x6f, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x22, 0x20, 0x23, 0x20,
-  0x6e, 0x6f, 0x74, 0x65, 0x3a, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69,
-  0x73, 0x20, 0x61, 0x6e, 0x20, 0x61, 0x62, 0x73, 0x6f, 0x6c, 0x75, 0x74,
-  0x65, 0x20, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x20,
-  0x6e, 0x61, 0x6d, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70,
-  0x65, 0x3a, 0x20, 0x73, 0x74, 0x72, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x54,
-  0x72, 0x75, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75,
-  0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x54,
-  0x68, 0x65, 0x20, 0x75, 0x6e, 0x69, 0x71, 0x75, 0x65, 0x20, 0x6e, 0x61,
-  0x6d, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44,
-  0x4f, 0x0a, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x6e, 0x6f, 0x74, 0x65, 0x3a,
-  0x20, 0x74, 0x68, 0x65, 0x20, 0x49, 0x44, 0x20, 0x69, 0x73, 0x20, 0x6e,
-  0x6f, 0x74, 0x20, 0x75, 0x73, 0x65, 0x72, 0x2d, 0x76, 0x69, 0x73, 0x69,
-  0x62, 0x6c, 0x65, 0x3b, 0x20, 0x69, 0x74, 0x27, 0x73, 0x20, 0x73, 0x79,
-  0x6e, 0x74, 0x68, 0x65, 0x74, 0x69, 0x7a, 0x65, 0x64, 0x20, 0x64, 0x75,
-  0x72, 0x69, 0x6e, 0x67, 0x20, 0x61, 0x20, 0x77, 0x6f, 0x72, 0x6b, 0x66,
-  0x6c, 0x6f, 0x77, 0x2c, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e,
-  0x74, 0x65, 0x64, 0x20, 0x68, 0x65, 0x72, 0x65, 0x20, 0x6f, 0x6e, 0x6c,
-  0x79, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a,
-  0x20, 0x22, 0x63, 0x64, 0x6f, 0x2e, 0x69, 0x64, 0x22, 0x0a, 0x20, 0x20,
-  0x23, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x63, 0x64,
-  0x6f, 0x69, 0x64, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x72, 0x65,
-  0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x54, 0x72, 0x75, 0x65,
-  0x0a, 0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d,
-  0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x54, 0x68,
-  0x65, 0x20, 0x75, 0x6e, 0x69, 0x71, 0x75, 0x65, 0x20, 0x49, 0x44, 0x20,
-  0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x20, 0x69,
-  0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x77, 0x6f, 0x72, 0x6b, 0x66, 0x6c,
-  0x6f, 0x77, 0x0a, 0x0a, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a,
-  0x20, 0x63, 0x64, 0x6f, 0x2e, 0x72, 0x61, 0x77, 0x2d, 0x70, 0x74, 0x72,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x70,
-  0x6f, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46,
-  0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66,
-  0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20, 0x4e, 0x49, 0x4c, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74,
-  0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x41, 0x20, 0x75, 0x73, 0x65, 0x72, 0x2d,
-  0x70, 0x72, 0x6f, 0x76, 0x69, 0x64, 0x65, 0x64, 0x20, 0x70, 0x6f, 0x69,
-  0x6e, 0x74, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6f, 0x63, 0x61,
-  0x6c, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x20, 0x61, 0x73,
-  0x73, 0x6f, 0x63, 0x69, 0x61, 0x74, 0x65, 0x64, 0x20, 0x77, 0x69, 0x74,
-  0x68, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x2e, 0x0a, 0x0a,
-  0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x63, 0x64, 0x6f,
-  0x2e, 0x6d, 0x61, 0x6d, 0x62, 0x61, 0x2d, 0x61, 0x72, 0x72, 0x61, 0x79,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x70,
-  0x6f, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46,
-  0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66,
-  0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20, 0x4e, 0x49, 0x4c, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74,
-  0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x41, 0x20, 0x6d, 0x61, 0x6d, 0x62, 0x61,
-  0x2d, 0x61, 0x72, 0x72, 0x61, 0x79, 0x20, 0x61, 0x73, 0x73, 0x6f, 0x63,
-  0x69, 0x61, 0x74, 0x65, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x74,
-  0x68, 0x65, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x20, 0x73, 0x74, 0x6f,
-  0x72, 0x61, 0x67, 0x65, 0x20, 0x6f, 0x66, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x2e, 0x20,
-  0x45, 0x69, 0x74, 0x68, 0x65, 0x72, 0x20, 0x61, 0x20, 0x28, 0x31, 0x2d,
-  0x64, 0x2c, 0x20, 0x6e, 0x6f, 0x6e, 0x2d, 0x70, 0x61, 0x64, 0x64, 0x65,
-  0x64, 0x2c, 0x20, 0x6e, 0x6f, 0x6e, 0x2d, 0x61, 0x6c, 0x69, 0x67, 0x65,
-  0x6e, 0x64, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x77,
-  0x72, 0x61, 0x70, 0x70, 0x65, 0x72, 0x20, 0x61, 0x72, 0x6f, 0x75, 0x6e,
-  0x64, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x64, 0x6f, 0x2e, 0x72, 0x61,
-  0x77, 0x2d, 0x70, 0x74, 0x72, 0x20, 0x70, 0x72, 0x6f, 0x76, 0x69, 0x64,
-  0x65, 0x64, 0x20, 0x62, 0x79, 0x20, 0x74, 0x68, 0x65, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x2c, 0x20, 0x6f,
-  0x72, 0x20, 0x6d, 0x61, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x2d, 0x61, 0x6c,
-  0x6c, 0x6f, 0x63, 0x61, 0x74, 0x65, 0x64, 0x20, 0x73, 0x74, 0x6f, 0x72,
-  0x61, 0x67, 0x65, 0x20, 0x28, 0x69, 0x66, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x64, 0x6f, 0x2e, 0x6d, 0x61, 0x65, 0x73, 0x74,
-  0x72, 0x6f, 0x2d, 0x70, 0x72, 0x6f, 0x76, 0x69, 0x64, 0x65, 0x64, 0x2d,
-  0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x20, 0x69, 0x73, 0x20, 0x74,
-  0x72, 0x75, 0x65, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65,
-  0x79, 0x3a, 0x20, 0x22, 0x63, 0x64, 0x6f, 0x2e, 0x61, 0x6c, 0x6c, 0x6f,
-  0x63, 0x61, 0x74, 0x65, 0x2d, 0x6e, 0x6f, 0x77, 0x22, 0x20, 0x23, 0x20,
-  0x6e, 0x6f, 0x74, 0x65, 0x3a, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69,
-  0x73, 0x20, 0x61, 0x20, 0x72, 0x65, 0x6c, 0x61, 0x74, 0x69, 0x76, 0x65,
-  0x20, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x20, 0x6e,
-  0x61, 0x6d, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65,
-  0x3a, 0x20, 0x62, 0x6f, 0x6f, 0x6c, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46,
-  0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66,
-  0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a,
+  0x20, 0x6c, 0x69, 0x66, 0x65, 0x74, 0x69, 0x6d, 0x65, 0x20, 0x74, 0x79,
+  0x70, 0x65, 0x0a, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x2d, 0x20, 0x20, 0x74,
+  0x79, 0x70, 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x6c, 0x69, 0x66,
+  0x65, 0x74, 0x69, 0x6d, 0x65, 0x2d, 0x73, 0x74, 0x72, 0x69, 0x63, 0x74,
+  0x6e, 0x65, 0x73, 0x73, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x20,
+  0x74, 0x79, 0x70, 0x65, 0x73, 0x70, 0x65, 0x63, 0x3a, 0x20, 0x2a, 0x6c,
+  0x69, 0x66, 0x65, 0x74, 0x69, 0x6d, 0x65, 0x2d, 0x73, 0x74, 0x72, 0x69,
+  0x63, 0x74, 0x6e, 0x65, 0x73, 0x73, 0x2d, 0x76, 0x61, 0x6c, 0x75, 0x65,
+  0x73, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63,
+  0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20,
+  0x43, 0x44, 0x4f, 0x20, 0x6c, 0x69, 0x66, 0x65, 0x74, 0x69, 0x6d, 0x65,
+  0x20, 0x73, 0x70, 0x65, 0x63, 0x69, 0x66, 0x69, 0x65, 0x72, 0x20, 0x69,
+  0x6e, 0x74, 0x65, 0x72, 0x70, 0x72, 0x65, 0x74, 0x61, 0x74, 0x69, 0x6f,
+  0x6e, 0x0a, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x2d, 0x20, 0x20, 0x74, 0x79,
+  0x70, 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x61, 0x63, 0x63, 0x65,
+  0x73, 0x73, 0x2d, 0x6d, 0x6f, 0x64, 0x65, 0x0a, 0x20, 0x20, 0x23, 0x20,
+  0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x73, 0x70, 0x65, 0x63, 0x3a,
+  0x20, 0x2a, 0x61, 0x63, 0x63, 0x65, 0x73, 0x73, 0x2d, 0x6d, 0x6f, 0x64,
+  0x65, 0x2d, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x73, 0x0a, 0x20, 0x20, 0x23,
   0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74,
-  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x50, 0x65, 0x72, 0x66, 0x6f,
-  0x72, 0x6d, 0x20, 0x69, 0x6d, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x74, 0x65,
-  0x20, 0x61, 0x6c, 0x6c, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20,
-  0x66, 0x6f, 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x0a,
-  0x0a, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63,
-  0x64, 0x6f, 0x2e, 0x70, 0x65, 0x72, 0x73, 0x69, 0x73, 0x74, 0x22, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x62, 0x6f,
-  0x6f, 0x6c, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71,
-  0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74,
-  0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x43, 0x44, 0x4f, 0x20, 0x61,
+  0x63, 0x63, 0x65, 0x73, 0x73, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x0a, 0x0a,
+  0x23, 0x20, 0x49, 0x6e, 0x64, 0x69, 0x76, 0x69, 0x64, 0x75, 0x61, 0x6c,
+  0x20, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x20, 0x64,
+  0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x0a,
+  0x6d, 0x61, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x2d, 0x61, 0x74, 0x74, 0x72,
+  0x69, 0x62, 0x75, 0x74, 0x65, 0x73, 0x3a, 0x0a, 0x20, 0x20, 0x23, 0x23,
+  0x20, 0x54, 0x4f, 0x50, 0x4c, 0x45, 0x56, 0x45, 0x4c, 0x20, 0x61, 0x74,
+  0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x73, 0x0a, 0x0a, 0x20, 0x20,
+  0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x2e, 0x6d, 0x61, 0x65,
+  0x73, 0x74, 0x72, 0x6f, 0x2e, 0x63, 0x6f, 0x72, 0x65, 0x2e, 0x63, 0x64,
+  0x6f, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x22, 0x20, 0x23, 0x20, 0x6e, 0x6f,
+  0x74, 0x65, 0x3a, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20,
+  0x61, 0x6e, 0x20, 0x61, 0x62, 0x73, 0x6f, 0x6c, 0x75, 0x74, 0x65, 0x20,
+  0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x20, 0x6e, 0x61,
+  0x6d, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a,
+  0x20, 0x73, 0x74, 0x72, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72,
+  0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x54, 0x72, 0x75,
+  0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65,
+  0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x54, 0x68, 0x65,
+  0x20, 0x75, 0x6e, 0x69, 0x71, 0x75, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65,
+  0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x0a,
+  0x0a, 0x20, 0x20, 0x23, 0x20, 0x6e, 0x6f, 0x74, 0x65, 0x3a, 0x20, 0x74,
+  0x68, 0x65, 0x20, 0x49, 0x44, 0x20, 0x69, 0x73, 0x20, 0x6e, 0x6f, 0x74,
+  0x20, 0x75, 0x73, 0x65, 0x72, 0x2d, 0x76, 0x69, 0x73, 0x69, 0x62, 0x6c,
+  0x65, 0x3b, 0x20, 0x69, 0x74, 0x27, 0x73, 0x20, 0x73, 0x79, 0x6e, 0x74,
+  0x68, 0x65, 0x74, 0x69, 0x7a, 0x65, 0x64, 0x20, 0x64, 0x75, 0x72, 0x69,
+  0x6e, 0x67, 0x20, 0x61, 0x20, 0x77, 0x6f, 0x72, 0x6b, 0x66, 0x6c, 0x6f,
+  0x77, 0x2c, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x65,
+  0x64, 0x20, 0x68, 0x65, 0x72, 0x65, 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x0a,
+  0x20, 0x20, 0x23, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22,
+  0x63, 0x64, 0x6f, 0x2e, 0x69, 0x64, 0x22, 0x0a, 0x20, 0x20, 0x23, 0x20,
+  0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x63, 0x64, 0x6f, 0x69,
+  0x64, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75,
+  0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x54, 0x72, 0x75, 0x65, 0x0a, 0x20,
+  0x20, 0x23, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e,
+  0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x54, 0x68, 0x65, 0x20,
+  0x75, 0x6e, 0x69, 0x71, 0x75, 0x65, 0x20, 0x49, 0x44, 0x20, 0x6f, 0x66,
+  0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x20, 0x69, 0x6e, 0x20,
+  0x74, 0x68, 0x65, 0x20, 0x77, 0x6f, 0x72, 0x6b, 0x66, 0x6c, 0x6f, 0x77,
+  0x0a, 0x0a, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x63,
+  0x64, 0x6f, 0x2e, 0x72, 0x61, 0x77, 0x2d, 0x70, 0x74, 0x72, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x70, 0x6f, 0x69,
+  0x6e, 0x74, 0x65, 0x72, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72,
+  0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c,
+  0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75,
+  0x6c, 0x74, 0x3a, 0x20, 0x4e, 0x49, 0x4c, 0x0a, 0x20, 0x20, 0x20, 0x20,
   0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f,
-  0x6e, 0x3a, 0x20, 0x45, 0x6e, 0x73, 0x75, 0x72, 0x65, 0x20, 0x61, 0x20,
-  0x70, 0x65, 0x72, 0x73, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x63,
-  0x6f, 0x70, 0x79, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43,
-  0x44, 0x4f, 0x20, 0x69, 0x73, 0x20, 0x70, 0x72, 0x65, 0x73, 0x65, 0x72,
-  0x76, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x77,
-  0x68, 0x65, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74,
-  0x20, 0x63, 0x6f, 0x70, 0x79, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65,
-  0x20, 0x43, 0x44, 0x4f, 0x20, 0x69, 0x73, 0x20, 0x57, 0x49, 0x54, 0x48,
-  0x44, 0x52, 0x41, 0x57, 0x4e, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74,
-  0x68, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x6f,
-  0x6f, 0x6c, 0x2e, 0x20, 0x28, 0x4e, 0x6f, 0x74, 0x65, 0x20, 0x74, 0x68,
-  0x65, 0x20, 0x6e, 0x6f, 0x6e, 0x2d, 0x74, 0x72, 0x69, 0x76, 0x69, 0x61,
-  0x6c, 0x20, 0x73, 0x65, 0x6d, 0x61, 0x6e, 0x74, 0x69, 0x63, 0x20, 0x69,
-  0x6e, 0x74, 0x65, 0x72, 0x61, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x74,
-  0x68, 0x65, 0x20, 0x63, 0x64, 0x6f, 0x2e, 0x64, 0x65, 0x73, 0x69, 0x73,
-  0x74, 0x20, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x2e,
-  0x29, 0x0a, 0x0a, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20,
-  0x22, 0x63, 0x64, 0x6f, 0x2e, 0x64, 0x65, 0x73, 0x69, 0x73, 0x74, 0x22,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x62,
-  0x6f, 0x6f, 0x6c, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
-  0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73,
-  0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c,
-  0x74, 0x3a, 0x20, 0x54, 0x72, 0x75, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x6e, 0x3a, 0x20, 0x41, 0x20, 0x75, 0x73, 0x65, 0x72, 0x2d, 0x70, 0x72,
+  0x6f, 0x76, 0x69, 0x64, 0x65, 0x64, 0x20, 0x70, 0x6f, 0x69, 0x6e, 0x74,
+  0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x20,
+  0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x20, 0x61, 0x73, 0x73, 0x6f,
+  0x63, 0x69, 0x61, 0x74, 0x65, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20,
+  0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x2e, 0x0a, 0x0a, 0x20, 0x20,
+  0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x63, 0x64, 0x6f, 0x2e, 0x6d,
+  0x61, 0x6d, 0x62, 0x61, 0x2d, 0x61, 0x72, 0x72, 0x61, 0x79, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x70, 0x6f, 0x69,
+  0x6e, 0x74, 0x65, 0x72, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72,
+  0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c,
+  0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75,
+  0x6c, 0x74, 0x3a, 0x20, 0x4e, 0x49, 0x4c, 0x0a, 0x20, 0x20, 0x20, 0x20,
   0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f,
-  0x6e, 0x3a, 0x20, 0x45, 0x6e, 0x73, 0x75, 0x72, 0x65, 0x20, 0x74, 0x68,
-  0x61, 0x74, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x70, 0x65, 0x72, 0x73, 0x69,
-  0x73, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x70, 0x69, 0x65, 0x73,
-  0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x20,
-  0x61, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72,
-  0x65, 0x6d, 0x6f, 0x76, 0x65, 0x64, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20,
-  0x70, 0x65, 0x72, 0x73, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x73,
-  0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x20, 0x77, 0x68, 0x65, 0x6e, 0x20,
-  0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x0a, 0x20, 0x20, 0x20,
+  0x6e, 0x3a, 0x20, 0x41, 0x20, 0x6d, 0x61, 0x6d, 0x62, 0x61, 0x2d, 0x61,
+  0x72, 0x72, 0x61, 0x79, 0x20, 0x61, 0x73, 0x73, 0x6f, 0x63, 0x69, 0x61,
+  0x74, 0x65, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x74, 0x68, 0x65,
+  0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x61,
+  0x67, 0x65, 0x20, 0x6f, 0x66, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
   0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63,
-  0x65, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f,
-  0x20, 0x69, 0x73, 0x20, 0x57, 0x49, 0x54, 0x48, 0x44, 0x52, 0x41, 0x57,
-  0x6e, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, 0x65, 0x0a, 0x20,
+  0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x2e, 0x20, 0x45, 0x69,
+  0x74, 0x68, 0x65, 0x72, 0x20, 0x61, 0x20, 0x28, 0x31, 0x2d, 0x64, 0x2c,
+  0x20, 0x6e, 0x6f, 0x6e, 0x2d, 0x70, 0x61, 0x64, 0x64, 0x65, 0x64, 0x2c,
+  0x20, 0x6e, 0x6f, 0x6e, 0x2d, 0x61, 0x6c, 0x69, 0x67, 0x65, 0x6e, 0x64,
+  0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x77, 0x72, 0x61,
+  0x70, 0x70, 0x65, 0x72, 0x20, 0x61, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x20,
+  0x74, 0x68, 0x65, 0x20, 0x63, 0x64, 0x6f, 0x2e, 0x72, 0x61, 0x77, 0x2d,
+  0x70, 0x74, 0x72, 0x20, 0x70, 0x72, 0x6f, 0x76, 0x69, 0x64, 0x65, 0x64,
+  0x20, 0x62, 0x79, 0x20, 0x74, 0x68, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20,
   0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x6f, 0x6f, 0x6c, 0x2e, 0x20,
-  0x28, 0x4e, 0x6f, 0x74, 0x65, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6e, 0x6f,
-  0x6e, 0x2d, 0x74, 0x72, 0x69, 0x76, 0x69, 0x61, 0x6c, 0x20, 0x73, 0x65,
-  0x6d, 0x61, 0x6e, 0x74, 0x69, 0x63, 0x20, 0x69, 0x6e, 0x74, 0x65, 0x72,
-  0x61, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x2c, 0x20, 0x6f, 0x72, 0x20,
+  0x6d, 0x61, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x2d, 0x61, 0x6c, 0x6c, 0x6f,
+  0x63, 0x61, 0x74, 0x65, 0x64, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67,
+  0x65, 0x20, 0x28, 0x69, 0x66, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
   0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63,
-  0x64, 0x6f, 0x2e, 0x64, 0x65, 0x73, 0x69, 0x73, 0x74, 0x20, 0x61, 0x74,
-  0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x2e, 0x29, 0x0a, 0x0a, 0x20,
+  0x20, 0x63, 0x64, 0x6f, 0x2e, 0x6d, 0x61, 0x65, 0x73, 0x74, 0x72, 0x6f,
+  0x2d, 0x70, 0x72, 0x6f, 0x76, 0x69, 0x64, 0x65, 0x64, 0x2d, 0x73, 0x74,
+  0x6f, 0x72, 0x61, 0x67, 0x65, 0x20, 0x69, 0x73, 0x20, 0x74, 0x72, 0x75,
+  0x65, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a,
+  0x20, 0x22, 0x63, 0x64, 0x6f, 0x2e, 0x61, 0x6c, 0x6c, 0x6f, 0x63, 0x61,
+  0x74, 0x65, 0x2d, 0x6e, 0x6f, 0x77, 0x22, 0x20, 0x23, 0x20, 0x6e, 0x6f,
+  0x74, 0x65, 0x3a, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20,
+  0x61, 0x20, 0x72, 0x65, 0x6c, 0x61, 0x74, 0x69, 0x76, 0x65, 0x20, 0x61,
+  0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x20, 0x6e, 0x61, 0x6d,
+  0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20,
+  0x62, 0x6f, 0x6f, 0x6c, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72,
+  0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c,
+  0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75,
+  0x6c, 0x74, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74,
+  0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x50, 0x65, 0x72, 0x66, 0x6f, 0x72, 0x6d,
+  0x20, 0x69, 0x6d, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x74, 0x65, 0x20, 0x61,
+  0x6c, 0x6c, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x66, 0x6f,
+  0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x0a, 0x0a, 0x20,
   0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64, 0x6f,
-  0x2e, 0x6d, 0x61, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x2d, 0x70, 0x72, 0x6f,
-  0x76, 0x69, 0x64, 0x65, 0x64, 0x2d, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67,
-  0x65, 0x22, 0x20, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65,
-  0x3a, 0x20, 0x62, 0x6f, 0x6f, 0x6c, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46,
-  0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66,
-  0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74,
-  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x52, 0x65, 0x71, 0x75, 0x65,
-  0x73, 0x74, 0x2f, 0x61, 0x64, 0x76, 0x69, 0x73, 0x65, 0x20, 0x6d, 0x61,
-  0x65, 0x73, 0x74, 0x72, 0x6f, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67,
-  0x65, 0x20, 0x6d, 0x61, 0x6e, 0x61, 0x67, 0x65, 0x6d, 0x65, 0x6e, 0x74,
-  0x20, 0x66, 0x6f, 0x72, 0x20, 0x74, 0x68, 0x65, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x43, 0x44, 0x4f, 0x20, 0x28, 0x6e, 0x6f, 0x20,
-  0x52, 0x41, 0x57, 0x2d, 0x50, 0x54, 0x52, 0x20, 0x77, 0x69, 0x6c, 0x6c,
-  0x20, 0x62, 0x65, 0x20, 0x70, 0x72, 0x6f, 0x76, 0x69, 0x64, 0x65, 0x64,
-  0x20, 0x6f, 0x72, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x62, 0x65, 0x20, 0x6f,
-  0x62, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x64, 0x29, 0x0a, 0x0a, 0x20, 0x20,
-  0x23, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64,
-  0x6f, 0x2e, 0x61, 0x63, 0x63, 0x65, 0x73, 0x73, 0x2d, 0x6d, 0x6f, 0x64,
-  0x65, 0x22, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70,
-  0x65, 0x3a, 0x20, 0x61, 0x63, 0x63, 0x65, 0x73, 0x73, 0x2d, 0x6d, 0x6f,
-  0x64, 0x65, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71,
-  0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65,
-  0x0a, 0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75,
-  0x6c, 0x74, 0x3a, 0x20, 0x2a, 0x61, 0x63, 0x63, 0x65, 0x73, 0x73, 0x2d,
-  0x6d, 0x6f, 0x64, 0x65, 0x2d, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x73, 0x0a,
-  0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65,
-  0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x49, 0x6e, 0x64,
-  0x69, 0x63, 0x61, 0x74, 0x65, 0x73, 0x20, 0x68, 0x6f, 0x77, 0x20, 0x74,
-  0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20,
-  0x62, 0x65, 0x20, 0x61, 0x63, 0x63, 0x65, 0x73, 0x73, 0x65, 0x64, 0x20,
-  0x62, 0x79, 0x20, 0x74, 0x68, 0x65, 0x20, 0x75, 0x73, 0x65, 0x72, 0x0a,
-  0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x28, 0x72, 0x65,
-  0x61, 0x64, 0x2d, 0x77, 0x72, 0x69, 0x74, 0x65, 0x2c, 0x20, 0x72, 0x65,
-  0x61, 0x64, 0x2d, 0x6f, 0x6e, 0x6c, 0x79, 0x2c, 0x20, 0x65, 0x74, 0x63,
-  0x2e, 0x29, 0x2e, 0x0a, 0x0a, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79,
-  0x3a, 0x20, 0x22, 0x63, 0x64, 0x6f, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73,
-  0x66, 0x65, 0x72, 0x2d, 0x61, 0x73, 0x61, 0x70, 0x22, 0x0a, 0x20, 0x20,
+  0x2e, 0x70, 0x65, 0x72, 0x73, 0x69, 0x73, 0x74, 0x22, 0x0a, 0x20, 0x20,
   0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x62, 0x6f, 0x6f, 0x6c,
   0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69,
   0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20,
   0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20,
   0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f,
   0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a,
-  0x20, 0x49, 0x6e, 0x64, 0x69, 0x63, 0x61, 0x74, 0x65, 0x73, 0x20, 0x74,
-  0x68, 0x61, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x20,
-  0x6d, 0x75, 0x73, 0x74, 0x20, 0x62, 0x65, 0x20, 0x74, 0x72, 0x61, 0x6e,
-  0x73, 0x66, 0x65, 0x72, 0x65, 0x64, 0x20, 0x61, 0x73, 0x20, 0x73, 0x6f,
-  0x6f, 0x6e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73,
-  0x20, 0x70, 0x6f, 0x73, 0x73, 0x69, 0x62, 0x6c, 0x65, 0x2e, 0x0a, 0x0a,
-  0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64,
-  0x6f, 0x2e, 0x6e, 0x65, 0x76, 0x65, 0x72, 0x2d, 0x70, 0x6f, 0x6f, 0x6c,
-  0x65, 0x64, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65,
-  0x3a, 0x20, 0x62, 0x6f, 0x6f, 0x6c, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46,
-  0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66,
-  0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74,
-  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x49, 0x6e, 0x64, 0x69, 0x63,
-  0x61, 0x74, 0x65, 0x73, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x74, 0x68,
-  0x65, 0x20, 0x43, 0x44, 0x4f, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62,
-  0x65, 0x20, 0x6e, 0x65, 0x76, 0x65, 0x72, 0x20, 0x62, 0x65, 0x20, 0x75,
-  0x73, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x4f, 0x46, 0x46, 0x45, 0x52, 0x2f, 0x44, 0x45, 0x4d, 0x41,
-  0x4e, 0x44, 0x2c, 0x20, 0x77, 0x68, 0x69, 0x63, 0x68, 0x20, 0x70, 0x6f,
-  0x74, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x70, 0x65,
-  0x72, 0x6d, 0x69, 0x74, 0x73, 0x20, 0x65, 0x78, 0x74, 0x72, 0x61, 0x0a,
+  0x20, 0x45, 0x6e, 0x73, 0x75, 0x72, 0x65, 0x20, 0x61, 0x20, 0x70, 0x65,
+  0x72, 0x73, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x70,
+  0x79, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f,
+  0x20, 0x69, 0x73, 0x20, 0x70, 0x72, 0x65, 0x73, 0x65, 0x72, 0x76, 0x65,
+  0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x77, 0x68, 0x65,
+  0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x63,
+  0x6f, 0x70, 0x79, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43,
+  0x44, 0x4f, 0x20, 0x69, 0x73, 0x20, 0x57, 0x49, 0x54, 0x48, 0x44, 0x52,
+  0x41, 0x57, 0x4e, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, 0x65,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x6f, 0x6f, 0x6c,
+  0x2e, 0x20, 0x28, 0x4e, 0x6f, 0x74, 0x65, 0x20, 0x74, 0x68, 0x65, 0x20,
+  0x6e, 0x6f, 0x6e, 0x2d, 0x74, 0x72, 0x69, 0x76, 0x69, 0x61, 0x6c, 0x20,
+  0x73, 0x65, 0x6d, 0x61, 0x6e, 0x74, 0x69, 0x63, 0x20, 0x69, 0x6e, 0x74,
+  0x65, 0x72, 0x61, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x0a, 0x20, 0x20, 0x20,
   0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x65, 0x72, 0x66, 0x6f,
-  0x72, 0x6d, 0x61, 0x6e, 0x63, 0x65, 0x20, 0x6f, 0x70, 0x74, 0x69, 0x6d,
-  0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x2e, 0x0a, 0x0a, 0x20,
-  0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64, 0x6f,
-  0x2e, 0x69, 0x73, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x22, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x62, 0x6f, 0x6f, 0x6c,
-  0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69,
-  0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20,
-  0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f,
+  0x20, 0x20, 0x20, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x74, 0x68, 0x65,
+  0x20, 0x63, 0x64, 0x6f, 0x2e, 0x64, 0x65, 0x73, 0x69, 0x73, 0x74, 0x20,
+  0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x2e, 0x29, 0x0a,
+  0x0a, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63,
+  0x64, 0x6f, 0x2e, 0x64, 0x65, 0x73, 0x69, 0x73, 0x74, 0x22, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x62, 0x6f, 0x6f,
+  0x6c, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75,
+  0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a,
+  0x20, 0x54, 0x72, 0x75, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f,
   0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a,
-  0x20, 0x49, 0x6e, 0x64, 0x69, 0x63, 0x61, 0x74, 0x65, 0x73, 0x20, 0x74,
-  0x68, 0x61, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x20,
-  0x69, 0x73, 0x20, 0x61, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e,
-  0x65, 0x72, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x6f, 0x74, 0x68, 0x65, 0x72,
-  0x20, 0x43, 0x44, 0x4f, 0x73, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x45, 0x6e, 0x73, 0x75, 0x72, 0x65, 0x20, 0x74, 0x68, 0x61, 0x74,
+  0x20, 0x61, 0x6c, 0x6c, 0x20, 0x70, 0x65, 0x72, 0x73, 0x69, 0x73, 0x74,
+  0x65, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x70, 0x69, 0x65, 0x73, 0x20, 0x6f,
+  0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x20, 0x61, 0x72,
+  0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6d,
+  0x6f, 0x76, 0x65, 0x64, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x70, 0x65,
+  0x72, 0x73, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x73, 0x74, 0x6f,
+  0x72, 0x61, 0x67, 0x65, 0x20, 0x77, 0x68, 0x65, 0x6e, 0x20, 0x74, 0x68,
+  0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x72, 0x65, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x20,
+  0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x20, 0x69,
+  0x73, 0x20, 0x57, 0x49, 0x54, 0x48, 0x44, 0x52, 0x41, 0x57, 0x6e, 0x20,
+  0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, 0x65, 0x0a, 0x20, 0x20, 0x20,
   0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x69, 0x2e, 0x65, 0x2e, 0x20, 0x61, 0x20, 0x43, 0x44, 0x4f,
-  0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x2e, 0x20, 0x55, 0x73, 0x65, 0x72,
-  0x73, 0x20, 0x61, 0x72, 0x65, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x65, 0x78,
-  0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x20, 0x74, 0x6f, 0x20, 0x75, 0x74,
-  0x69, 0x6c, 0x69, 0x7a, 0x65, 0x20, 0x6f, 0x72, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x70, 0x6f, 0x6f, 0x6c, 0x2e, 0x20, 0x28, 0x4e,
+  0x6f, 0x74, 0x65, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6e, 0x6f, 0x6e, 0x2d,
+  0x74, 0x72, 0x69, 0x76, 0x69, 0x61, 0x6c, 0x20, 0x73, 0x65, 0x6d, 0x61,
+  0x6e, 0x74, 0x69, 0x63, 0x20, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x61, 0x63,
+  0x74, 0x69, 0x6f, 0x6e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
   0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x74, 0x20, 0x74, 0x68, 0x69, 0x73,
-  0x20, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x2e, 0x20,
+  0x77, 0x69, 0x74, 0x68, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x64, 0x6f,
+  0x2e, 0x64, 0x65, 0x73, 0x69, 0x73, 0x74, 0x20, 0x61, 0x74, 0x74, 0x72,
+  0x69, 0x62, 0x75, 0x74, 0x65, 0x2e, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x2d,
+  0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64, 0x6f, 0x2e, 0x6d,
+  0x61, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x2d, 0x70, 0x72, 0x6f, 0x76, 0x69,
+  0x64, 0x65, 0x64, 0x2d, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x22,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x62,
+  0x6f, 0x6f, 0x6c, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
+  0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73,
+  0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c,
+  0x74, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69,
+  0x6f, 0x6e, 0x3a, 0x20, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x2f,
+  0x61, 0x64, 0x76, 0x69, 0x73, 0x65, 0x20, 0x6d, 0x61, 0x65, 0x73, 0x74,
+  0x72, 0x6f, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x20, 0x6d,
+  0x61, 0x6e, 0x61, 0x67, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x20, 0x66, 0x6f,
+  0x72, 0x20, 0x74, 0x68, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x43, 0x44, 0x4f, 0x20, 0x28, 0x6e, 0x6f, 0x20, 0x52, 0x41, 0x57,
+  0x2d, 0x50, 0x54, 0x52, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65,
+  0x20, 0x70, 0x72, 0x6f, 0x76, 0x69, 0x64, 0x65, 0x64, 0x20, 0x6f, 0x72,
+  0x20, 0x63, 0x61, 0x6e, 0x20, 0x62, 0x65, 0x20, 0x6f, 0x62, 0x74, 0x61,
+  0x69, 0x6e, 0x65, 0x64, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x2d,
+  0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64, 0x6f, 0x2e, 0x61,
+  0x63, 0x63, 0x65, 0x73, 0x73, 0x2d, 0x6d, 0x6f, 0x64, 0x65, 0x22, 0x0a,
+  0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20,
+  0x61, 0x63, 0x63, 0x65, 0x73, 0x73, 0x2d, 0x6d, 0x6f, 0x64, 0x65, 0x0a,
+  0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72,
+  0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20,
+  0x23, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a,
+  0x20, 0x2a, 0x61, 0x63, 0x63, 0x65, 0x73, 0x73, 0x2d, 0x6d, 0x6f, 0x64,
+  0x65, 0x2d, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x73, 0x0a, 0x20, 0x20, 0x23,
+  0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61,
+  0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x49, 0x6e, 0x64, 0x69, 0x63, 0x61,
+  0x74, 0x65, 0x73, 0x20, 0x68, 0x6f, 0x77, 0x20, 0x74, 0x68, 0x65, 0x20,
+  0x43, 0x44, 0x4f, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65, 0x20,
+  0x61, 0x63, 0x63, 0x65, 0x73, 0x73, 0x65, 0x64, 0x20, 0x62, 0x79, 0x20,
+  0x74, 0x68, 0x65, 0x20, 0x75, 0x73, 0x65, 0x72, 0x0a, 0x20, 0x20, 0x23,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x28, 0x72, 0x65, 0x61, 0x64, 0x2d,
+  0x77, 0x72, 0x69, 0x74, 0x65, 0x2c, 0x20, 0x72, 0x65, 0x61, 0x64, 0x2d,
+  0x6f, 0x6e, 0x6c, 0x79, 0x2c, 0x20, 0x65, 0x74, 0x63, 0x2e, 0x29, 0x2e,
   0x0a, 0x0a, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22,
-  0x63, 0x64, 0x6f, 0x2e, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x2d, 0x6d, 0x65,
-  0x6d, 0x62, 0x65, 0x72, 0x73, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74,
-  0x79, 0x70, 0x65, 0x3a, 0x20, 0x62, 0x6c, 0x6f, 0x62, 0x28, 0x29, 0x0a,
+  0x63, 0x64, 0x6f, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x66, 0x65, 0x72,
+  0x2d, 0x61, 0x73, 0x61, 0x70, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74,
+  0x79, 0x70, 0x65, 0x3a, 0x20, 0x62, 0x6f, 0x6f, 0x6c, 0x28, 0x29, 0x0a,
   0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64,
   0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20, 0x22, 0x22, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74,
-  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x4f, 0x70, 0x61, 0x71, 0x75,
-  0x65, 0x20, 0x6d, 0x65, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x64, 0x65, 0x73,
-  0x63, 0x72, 0x69, 0x70, 0x74, 0x6f, 0x72, 0x20, 0x69, 0x66, 0x20, 0x2e,
-  0x6d, 0x61, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x2e, 0x63, 0x6f, 0x72, 0x65,
-  0x2e, 0x63, 0x64, 0x6f, 0x2e, 0x69, 0x73, 0x2d, 0x67, 0x72, 0x6f, 0x75,
-  0x70, 0x20, 0x69, 0x73, 0x20, 0x54, 0x72, 0x75, 0x65, 0x2e, 0x0a, 0x20,
+  0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20, 0x46, 0x61, 0x6c,
+  0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d,
+  0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x49, 0x6e,
+  0x64, 0x69, 0x63, 0x61, 0x74, 0x65, 0x73, 0x20, 0x74, 0x68, 0x61, 0x74,
+  0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x20, 0x6d, 0x75, 0x73,
+  0x74, 0x20, 0x62, 0x65, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x66, 0x65,
+  0x72, 0x65, 0x64, 0x20, 0x61, 0x73, 0x20, 0x73, 0x6f, 0x6f, 0x6e, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x20, 0x70, 0x6f,
+  0x73, 0x73, 0x69, 0x62, 0x6c, 0x65, 0x2e, 0x0a, 0x0a, 0x20, 0x20, 0x2d,
+  0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64, 0x6f, 0x2e, 0x6e,
+  0x65, 0x76, 0x65, 0x72, 0x2d, 0x70, 0x6f, 0x6f, 0x6c, 0x65, 0x64, 0x22,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x62,
+  0x6f, 0x6f, 0x6c, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
+  0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73,
+  0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c,
+  0x74, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69,
+  0x6f, 0x6e, 0x3a, 0x20, 0x49, 0x6e, 0x64, 0x69, 0x63, 0x61, 0x74, 0x65,
+  0x73, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43,
+  0x44, 0x4f, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65, 0x20, 0x6e,
+  0x65, 0x76, 0x65, 0x72, 0x20, 0x62, 0x65, 0x20, 0x75, 0x73, 0x65, 0x64,
+  0x20, 0x69, 0x6e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x4f,
+  0x46, 0x46, 0x45, 0x52, 0x2f, 0x44, 0x45, 0x4d, 0x41, 0x4e, 0x44, 0x2c,
+  0x20, 0x77, 0x68, 0x69, 0x63, 0x68, 0x20, 0x70, 0x6f, 0x74, 0x65, 0x6e,
+  0x74, 0x69, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x70, 0x65, 0x72, 0x6d, 0x69,
+  0x74, 0x73, 0x20, 0x65, 0x78, 0x74, 0x72, 0x61, 0x0a, 0x20, 0x20, 0x20,
   0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x55, 0x73, 0x65, 0x72, 0x73, 0x20, 0x61,
+  0x20, 0x20, 0x20, 0x20, 0x70, 0x65, 0x72, 0x66, 0x6f, 0x72, 0x6d, 0x61,
+  0x6e, 0x63, 0x65, 0x20, 0x6f, 0x70, 0x74, 0x69, 0x6d, 0x69, 0x7a, 0x61,
+  0x74, 0x69, 0x6f, 0x6e, 0x73, 0x2e, 0x0a, 0x0a, 0x20, 0x20, 0x2d, 0x20,
+  0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64, 0x6f, 0x2e, 0x69, 0x73,
+  0x67, 0x72, 0x6f, 0x75, 0x70, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74,
+  0x79, 0x70, 0x65, 0x3a, 0x20, 0x62, 0x6f, 0x6f, 0x6c, 0x28, 0x29, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64,
+  0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20, 0x46, 0x61, 0x6c,
+  0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d,
+  0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x49, 0x6e,
+  0x64, 0x69, 0x63, 0x61, 0x74, 0x65, 0x73, 0x20, 0x74, 0x68, 0x61, 0x74,
+  0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x20, 0x69, 0x73, 0x20,
+  0x61, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x20,
+  0x66, 0x6f, 0x72, 0x20, 0x6f, 0x74, 0x68, 0x65, 0x72, 0x20, 0x43, 0x44,
+  0x4f, 0x73, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69,
+  0x2e, 0x65, 0x2e, 0x20, 0x61, 0x20, 0x43, 0x44, 0x4f, 0x20, 0x67, 0x72,
+  0x6f, 0x75, 0x70, 0x2e, 0x20, 0x55, 0x73, 0x65, 0x72, 0x73, 0x20, 0x61,
   0x72, 0x65, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x65, 0x78, 0x70, 0x65, 0x63,
   0x74, 0x65, 0x64, 0x20, 0x74, 0x6f, 0x20, 0x75, 0x74, 0x69, 0x6c, 0x69,
-  0x7a, 0x65, 0x20, 0x6f, 0x72, 0x20, 0x73, 0x65, 0x74, 0x20, 0x74, 0x68,
-  0x69, 0x73, 0x20, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65,
-  0x2e, 0x20, 0x0a, 0x0a, 0x20, 0x20, 0x23, 0x23, 0x20, 0x4c, 0x69, 0x66,
-  0x65, 0x74, 0x69, 0x6d, 0x65, 0x20, 0x72, 0x65, 0x6c, 0x61, 0x74, 0x65,
-  0x64, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a,
-  0x20, 0x22, 0x63, 0x64, 0x6f, 0x2e, 0x6c, 0x69, 0x66, 0x65, 0x74, 0x69,
-  0x6d, 0x65, 0x22, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x74, 0x79,
-  0x70, 0x65, 0x3a, 0x20, 0x6c, 0x69, 0x66, 0x65, 0x74, 0x69, 0x6d, 0x65,
-  0x2d, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x76, 0x61, 0x6c, 0x0a, 0x20, 0x20,
-  0x23, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64,
-  0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x23, 0x20,
-  0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74,
-  0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x43, 0x44, 0x4f, 0x20, 0x6c, 0x69, 0x66,
-  0x65, 0x74, 0x69, 0x6d, 0x65, 0x20, 0x73, 0x70, 0x65, 0x63, 0x69, 0x66,
-  0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x0a, 0x20, 0x20, 0x23,
-  0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64, 0x6f,
-  0x2e, 0x6c, 0x69, 0x66, 0x65, 0x74, 0x69, 0x6d, 0x65, 0x2d, 0x73, 0x74,
-  0x72, 0x69, 0x63, 0x74, 0x6e, 0x65, 0x73, 0x73, 0x22, 0x0a, 0x20, 0x20,
-  0x23, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x6c, 0x69,
-  0x66, 0x65, 0x74, 0x69, 0x6d, 0x65, 0x2d, 0x73, 0x74, 0x72, 0x69, 0x63,
-  0x74, 0x6e, 0x65, 0x73, 0x73, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x20, 0x20,
+  0x7a, 0x65, 0x20, 0x6f, 0x72, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x73, 0x65, 0x74, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x61, 0x74,
+  0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x2e, 0x0a, 0x0a, 0x20, 0x20,
+  0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64, 0x6f, 0x2e,
+  0x67, 0x72, 0x6f, 0x75, 0x70, 0x2d, 0x6d, 0x65, 0x6d, 0x62, 0x65, 0x72,
+  0x73, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a,
+  0x20, 0x62, 0x6c, 0x6f, 0x62, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20,
   0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61,
-  0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x64, 0x65,
-  0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20, 0x2a, 0x6c, 0x69, 0x66, 0x65,
-  0x74, 0x69, 0x6d, 0x65, 0x2d, 0x73, 0x74, 0x72, 0x69, 0x63, 0x74, 0x6e,
-  0x65, 0x73, 0x73, 0x2d, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x73, 0x0a, 0x20,
-  0x20, 0x23, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e,
-  0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x43, 0x44, 0x4f, 0x20,
-  0x6c, 0x69, 0x66, 0x65, 0x74, 0x69, 0x6d, 0x65, 0x20, 0x73, 0x70, 0x65,
-  0x63, 0x69, 0x66, 0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x0a,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x0a, 0x20, 0x20, 0x23, 0x23, 0x20, 0x53,
-  0x43, 0x4f, 0x50, 0x45, 0x20, 0x72, 0x65, 0x6c, 0x61, 0x74, 0x65, 0x64,
-  0x0a, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63,
-  0x64, 0x6f, 0x2e, 0x73, 0x63, 0x6f, 0x70, 0x65, 0x2e, 0x73, 0x69, 0x7a,
-  0x65, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a,
-  0x20, 0x69, 0x6e, 0x74, 0x28, 0x6d, 0x69, 0x6e, 0x3d, 0x2d, 0x31, 0x29,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65,
-  0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20, 0x2d, 0x31,
+  0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61,
+  0x75, 0x6c, 0x74, 0x3a, 0x20, 0x22, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f,
+  0x6e, 0x3a, 0x20, 0x4f, 0x70, 0x61, 0x71, 0x75, 0x65, 0x20, 0x6d, 0x65,
+  0x6d, 0x62, 0x65, 0x72, 0x20, 0x64, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70,
+  0x74, 0x6f, 0x72, 0x20, 0x69, 0x66, 0x20, 0x2e, 0x6d, 0x61, 0x65, 0x73,
+  0x74, 0x72, 0x6f, 0x2e, 0x63, 0x6f, 0x72, 0x65, 0x2e, 0x63, 0x64, 0x6f,
+  0x2e, 0x69, 0x73, 0x2d, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x69, 0x73,
+  0x20, 0x54, 0x72, 0x75, 0x65, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x55, 0x73, 0x65, 0x72, 0x73, 0x20, 0x61, 0x72, 0x65, 0x20, 0x6e,
+  0x6f, 0x74, 0x20, 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x20,
+  0x74, 0x6f, 0x20, 0x75, 0x74, 0x69, 0x6c, 0x69, 0x7a, 0x65, 0x20, 0x6f,
+  0x72, 0x20, 0x73, 0x65, 0x74, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x61,
+  0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x2e, 0x0a, 0x0a, 0x0a,
+  0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64,
+  0x6f, 0x2e, 0x69, 0x73, 0x64, 0x69, 0x73, 0x74, 0x72, 0x69, 0x62, 0x75,
+  0x74, 0x65, 0x64, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70,
+  0x65, 0x3a, 0x20, 0x62, 0x6f, 0x6f, 0x6c, 0x28, 0x29, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20,
+  0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65,
+  0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65,
   0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e,
-  0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x54, 0x68, 0x65, 0x20,
-  0x74, 0x6f, 0x74, 0x61, 0x6c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x6f,
-  0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x2e, 0x20, 0x2d,
-  0x31, 0x20, 0x69, 0x6e, 0x64, 0x69, 0x63, 0x61, 0x74, 0x65, 0x73, 0x20,
-  0x55, 0x6e, 0x6b, 0x6e, 0x6f, 0x77, 0x6e, 0x2f, 0x55, 0x6e, 0x61, 0x6c,
-  0x6c, 0x6f, 0x63, 0x61, 0x74, 0x65, 0x64, 0x2e, 0x0a, 0x0a, 0x20, 0x20,
-  0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64, 0x6f, 0x2e,
-  0x73, 0x63, 0x6f, 0x70, 0x65, 0x2e, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x2d,
-  0x73, 0x69, 0x7a, 0x65, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79,
-  0x70, 0x65, 0x3a, 0x20, 0x69, 0x6e, 0x74, 0x28, 0x6d, 0x69, 0x6e, 0x3d,
-  0x2d, 0x31, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75,
+  0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x49, 0x6e, 0x64, 0x69,
+  0x63, 0x61, 0x74, 0x65, 0x73, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x74,
+  0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20,
+  0x61, 0x72, 0x65, 0x20, 0x64, 0x69, 0x73, 0x74, 0x72, 0x69, 0x62, 0x75,
+  0x74, 0x65, 0x64, 0x20, 0x61, 0x6d, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61,
+  0x72, 0x69, 0x6f, 0x75, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x6d, 0x61, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x20, 0x63, 0x6f, 0x6d,
+  0x70, 0x6f, 0x65, 0x6e, 0x65, 0x74, 0x73, 0x2e, 0x20, 0x55, 0x73, 0x65,
+  0x72, 0x73, 0x20, 0x61, 0x72, 0x65, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x65,
+  0x78, 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x20, 0x74, 0x6f, 0x20, 0x75,
+  0x74, 0x69, 0x6c, 0x69, 0x7a, 0x65, 0x20, 0x6f, 0x72, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x74, 0x20, 0x74, 0x68, 0x69,
+  0x73, 0x20, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x2e,
+  0x0a, 0x0a, 0x23, 0x46, 0x49, 0x58, 0x4d, 0x45, 0x20, 0x6f, 0x6e, 0x6c,
+  0x79, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x6e, 0x6f,
+  0x77, 0x20, 0x69, 0x72, 0x72, 0x65, 0x67, 0x75, 0x6c, 0x61, 0x72, 0x5f,
+  0x31, 0x44, 0x20, 0x64, 0x69, 0x73, 0x74, 0x20, 0x6c, 0x61, 0x79, 0x6f,
+  0x75, 0x74, 0x0a, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20,
+  0x22, 0x63, 0x64, 0x6f, 0x2e, 0x64, 0x69, 0x73, 0x74, 0x2d, 0x6c, 0x61,
+  0x79, 0x6f, 0x75, 0x74, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79,
+  0x70, 0x65, 0x3a, 0x20, 0x6d, 0x6d, 0x62, 0x6c, 0x61, 0x79, 0x6f, 0x75,
+  0x74, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75,
   0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a,
   0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a,
-  0x20, 0x2d, 0x31, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75,
-  0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x54,
-  0x68, 0x65, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x20, 0x73, 0x69, 0x7a,
-  0x65, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f,
-  0x2e, 0x20, 0x2d, 0x31, 0x20, 0x69, 0x6e, 0x64, 0x69, 0x63, 0x61, 0x74,
-  0x65, 0x73, 0x20, 0x55, 0x6e, 0x6b, 0x6e, 0x6f, 0x77, 0x6e, 0x2f, 0x55,
-  0x6e, 0x61, 0x6c, 0x6c, 0x6f, 0x63, 0x61, 0x74, 0x65, 0x64, 0x2e, 0x0a,
-  0x0a, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63,
-  0x64, 0x6f, 0x2e, 0x73, 0x63, 0x6f, 0x70, 0x65, 0x2e, 0x6c, 0x61, 0x79,
-  0x6f, 0x75, 0x74, 0x2e, 0x72, 0x65, 0x67, 0x75, 0x6c, 0x61, 0x72, 0x2d,
-  0x31, 0x64, 0x2e, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2d, 0x73,
-  0x69, 0x7a, 0x65, 0x22, 0x20, 0x23, 0x20, 0x64, 0x65, 0x70, 0x72, 0x65,
-  0x63, 0x61, 0x74, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79,
-  0x70, 0x65, 0x3a, 0x20, 0x75, 0x69, 0x6e, 0x74, 0x28, 0x29, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a,
-  0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64,
-  0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20, 0x31, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74,
-  0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74,
-  0x2d, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x69, 0x6e, 0x20, 0x61, 0x20, 0x31,
-  0x2d, 0x64, 0x20, 0x72, 0x65, 0x67, 0x75, 0x6c, 0x61, 0x72, 0x20, 0x6c,
-  0x61, 0x79, 0x6f, 0x75, 0x74, 0x0a, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x6d,
-  0x75, 0x6c, 0x74, 0x69, 0x64, 0x69, 0x6d, 0x65, 0x6e, 0x73, 0x69, 0x6f,
-  0x6e, 0x61, 0x6c, 0x20, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x73, 0x20,
-  0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65, 0x20, 0x61, 0x64, 0x64, 0x65,
-  0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x6d, 0x61, 0x6d, 0x62, 0x61,
-  0x0a, 0x20, 0x20, 0x23, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x61, 0x74, 0x69,
-  0x62, 0x69, 0x6c, 0x69, 0x74, 0x79, 0x3b, 0x20, 0x77, 0x69, 0x6c, 0x6c,
-  0x20, 0x6e, 0x65, 0x65, 0x64, 0x20, 0x65, 0x6e, 0x75, 0x6d, 0x73, 0x20,
-  0x61, 0x6e, 0x64, 0x20, 0x75, 0x73, 0x65, 0x72, 0x2d, 0x64, 0x65, 0x66,
-  0x69, 0x6e, 0x65, 0x64, 0x20, 0x74, 0x79, 0x70, 0x65, 0x73, 0x20, 0x66,
-  0x6f, 0x72, 0x20, 0x74, 0x68, 0x65, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x6e,
-  0x61, 0x6d, 0x65, 0x64, 0x20, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x20,
-  0x73, 0x74, 0x79, 0x6c, 0x65, 0x73, 0x20, 0x6c, 0x69, 0x6b, 0x65, 0x20,
-  0x72, 0x6f, 0x77, 0x2d, 0x6d, 0x61, 0x6a, 0x6f, 0x72, 0x2f, 0x63, 0x6f,
-  0x6c, 0x75, 0x6d, 0x6e, 0x2d, 0x6d, 0x61, 0x6a, 0x6f, 0x72, 0x0a, 0x0a,
-  0x23, 0x20, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x69, 0x63, 0x20, 0x73, 0x65,
-  0x6d, 0x61, 0x6e, 0x74, 0x69, 0x63, 0x73, 0x0a, 0x0a, 0x20, 0x20, 0x2d,
+  0x20, 0x4e, 0x49, 0x4c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63,
+  0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20,
+  0x54, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x20, 0x64, 0x61, 0x74, 0x61,
+  0x20, 0x64, 0x69, 0x73, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x69, 0x6f,
+  0x6e, 0x20, 0x61, 0x6d, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, 0x72, 0x69,
+  0x6f, 0x75, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d,
+  0x61, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6f,
+  0x6e, 0x65, 0x6e, 0x74, 0x73, 0x2e, 0x0a, 0x0a, 0x0a, 0x20, 0x20, 0x23,
+  0x23, 0x20, 0x4c, 0x69, 0x66, 0x65, 0x74, 0x69, 0x6d, 0x65, 0x20, 0x72,
+  0x65, 0x6c, 0x61, 0x74, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x2d,
   0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64, 0x6f, 0x2e, 0x6c,
-  0x61, 0x79, 0x6f, 0x75, 0x74, 0x2e, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d,
-  0x65, 0x6e, 0x74, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70,
-  0x65, 0x3a, 0x20, 0x75, 0x69, 0x6e, 0x74, 0x28, 0x6d, 0x69, 0x6e, 0x3d,
-  0x30, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69,
-  0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20,
+  0x69, 0x66, 0x65, 0x74, 0x69, 0x6d, 0x65, 0x22, 0x0a, 0x20, 0x20, 0x23,
+  0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x6c, 0x69, 0x66,
+  0x65, 0x74, 0x69, 0x6d, 0x65, 0x2d, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x76,
+  0x61, 0x6c, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71,
+  0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65,
+  0x0a, 0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d,
+  0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x43, 0x44,
+  0x4f, 0x20, 0x6c, 0x69, 0x66, 0x65, 0x74, 0x69, 0x6d, 0x65, 0x20, 0x73,
+  0x70, 0x65, 0x63, 0x69, 0x66, 0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e,
+  0x2e, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a,
+  0x20, 0x22, 0x63, 0x64, 0x6f, 0x2e, 0x6c, 0x69, 0x66, 0x65, 0x74, 0x69,
+  0x6d, 0x65, 0x2d, 0x73, 0x74, 0x72, 0x69, 0x63, 0x74, 0x6e, 0x65, 0x73,
+  0x73, 0x22, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70,
+  0x65, 0x3a, 0x20, 0x6c, 0x69, 0x66, 0x65, 0x74, 0x69, 0x6d, 0x65, 0x2d,
+  0x73, 0x74, 0x72, 0x69, 0x63, 0x74, 0x6e, 0x65, 0x73, 0x73, 0x0a, 0x20,
+  0x20, 0x23, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65,
+  0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x23,
   0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20,
-  0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65,
-  0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x6e, 0x2d, 0x62,
-  0x79, 0x74, 0x65, 0x73, 0x20, 0x62, 0x6f, 0x75, 0x6e, 0x64, 0x61, 0x72,
-  0x79, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74, 0x20,
-  0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x20, 0x66, 0x6f, 0x72, 0x20,
-  0x6d, 0x61, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x2d, 0x61, 0x6c, 0x6c, 0x6f,
-  0x63, 0x61, 0x74, 0x65, 0x64, 0x20, 0x61, 0x72, 0x72, 0x61, 0x79, 0x0a,
-  0x0a, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63,
-  0x64, 0x6f, 0x2e, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x2e, 0x70, 0x72,
-  0x65, 0x2d, 0x70, 0x61, 0x64, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74,
-  0x79, 0x70, 0x65, 0x3a, 0x20, 0x75, 0x69, 0x6e, 0x74, 0x28, 0x6d, 0x69,
-  0x6e, 0x3d, 0x30, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71,
+  0x2a, 0x6c, 0x69, 0x66, 0x65, 0x74, 0x69, 0x6d, 0x65, 0x2d, 0x73, 0x74,
+  0x72, 0x69, 0x63, 0x74, 0x6e, 0x65, 0x73, 0x73, 0x2d, 0x76, 0x61, 0x6c,
+  0x75, 0x65, 0x73, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x20, 0x20, 0x64, 0x6f,
+  0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a,
+  0x20, 0x43, 0x44, 0x4f, 0x20, 0x6c, 0x69, 0x66, 0x65, 0x74, 0x69, 0x6d,
+  0x65, 0x20, 0x73, 0x70, 0x65, 0x63, 0x69, 0x66, 0x69, 0x63, 0x61, 0x74,
+  0x69, 0x6f, 0x6e, 0x2e, 0x0a, 0x0a, 0x0a, 0x20, 0x20, 0x23, 0x23, 0x20,
+  0x53, 0x43, 0x4f, 0x50, 0x45, 0x20, 0x72, 0x65, 0x6c, 0x61, 0x74, 0x65,
+  0x64, 0x0a, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22,
+  0x63, 0x64, 0x6f, 0x2e, 0x73, 0x63, 0x6f, 0x70, 0x65, 0x2e, 0x73, 0x69,
+  0x7a, 0x65, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65,
+  0x3a, 0x20, 0x69, 0x6e, 0x74, 0x28, 0x6d, 0x69, 0x6e, 0x3d, 0x2d, 0x31,
+  0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72,
+  0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20, 0x2d,
+  0x31, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65,
+  0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x54, 0x68, 0x65,
+  0x20, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x20,
+  0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44, 0x4f, 0x2e, 0x20,
+  0x2d, 0x31, 0x20, 0x69, 0x6e, 0x64, 0x69, 0x63, 0x61, 0x74, 0x65, 0x73,
+  0x20, 0x55, 0x6e, 0x6b, 0x6e, 0x6f, 0x77, 0x6e, 0x2f, 0x55, 0x6e, 0x61,
+  0x6c, 0x6c, 0x6f, 0x63, 0x61, 0x74, 0x65, 0x64, 0x2e, 0x0a, 0x0a, 0x20,
+  0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64, 0x6f,
+  0x2e, 0x73, 0x63, 0x6f, 0x70, 0x65, 0x2e, 0x6c, 0x6f, 0x63, 0x61, 0x6c,
+  0x2d, 0x73, 0x69, 0x7a, 0x65, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74,
+  0x79, 0x70, 0x65, 0x3a, 0x20, 0x69, 0x6e, 0x74, 0x28, 0x6d, 0x69, 0x6e,
+  0x3d, 0x2d, 0x31, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71,
   0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65,
   0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74,
-  0x3a, 0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75,
-  0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x54,
-  0x68, 0x65, 0x20, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x20, 0x61, 0x74,
-  0x20, 0x74, 0x68, 0x65, 0x20, 0x76, 0x65, 0x72, 0x79, 0x20, 0x62, 0x65,
-  0x67, 0x69, 0x6e, 0x6e, 0x69, 0x6e, 0x67, 0x20, 0x6f, 0x66, 0x20, 0x74,
-  0x68, 0x65, 0x20, 0x77, 0x68, 0x6f, 0x6c, 0x65, 0x20, 0x64, 0x61, 0x74,
-  0x61, 0x20, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x0a, 0x0a, 0x20, 0x20,
+  0x3a, 0x20, 0x2d, 0x31, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63,
+  0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20,
+  0x54, 0x68, 0x65, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x20, 0x73, 0x69,
+  0x7a, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44,
+  0x4f, 0x2e, 0x20, 0x2d, 0x31, 0x20, 0x69, 0x6e, 0x64, 0x69, 0x63, 0x61,
+  0x74, 0x65, 0x73, 0x20, 0x55, 0x6e, 0x6b, 0x6e, 0x6f, 0x77, 0x6e, 0x2f,
+  0x55, 0x6e, 0x61, 0x6c, 0x6c, 0x6f, 0x63, 0x61, 0x74, 0x65, 0x64, 0x2e,
+  0x0a, 0x0a, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22,
+  0x63, 0x64, 0x6f, 0x2e, 0x73, 0x63, 0x6f, 0x70, 0x65, 0x2e, 0x6c, 0x61,
+  0x79, 0x6f, 0x75, 0x74, 0x2e, 0x72, 0x65, 0x67, 0x75, 0x6c, 0x61, 0x72,
+  0x2d, 0x31, 0x64, 0x2e, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2d,
+  0x73, 0x69, 0x7a, 0x65, 0x22, 0x20, 0x23, 0x20, 0x64, 0x65, 0x70, 0x72,
+  0x65, 0x63, 0x61, 0x74, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74,
+  0x79, 0x70, 0x65, 0x3a, 0x20, 0x75, 0x69, 0x6e, 0x74, 0x28, 0x29, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64,
+  0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20, 0x31, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61,
+  0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e,
+  0x74, 0x2d, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x69, 0x6e, 0x20, 0x61, 0x20,
+  0x31, 0x2d, 0x64, 0x20, 0x72, 0x65, 0x67, 0x75, 0x6c, 0x61, 0x72, 0x20,
+  0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x0a, 0x0a, 0x20, 0x20, 0x23, 0x20,
+  0x6d, 0x75, 0x6c, 0x74, 0x69, 0x64, 0x69, 0x6d, 0x65, 0x6e, 0x73, 0x69,
+  0x6f, 0x6e, 0x61, 0x6c, 0x20, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x73,
+  0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x65, 0x20, 0x61, 0x64, 0x64,
+  0x65, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x6d, 0x61, 0x6d, 0x62,
+  0x61, 0x0a, 0x20, 0x20, 0x23, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x61, 0x74,
+  0x69, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x79, 0x3b, 0x20, 0x77, 0x69, 0x6c,
+  0x6c, 0x20, 0x6e, 0x65, 0x65, 0x64, 0x20, 0x65, 0x6e, 0x75, 0x6d, 0x73,
+  0x20, 0x61, 0x6e, 0x64, 0x20, 0x75, 0x73, 0x65, 0x72, 0x2d, 0x64, 0x65,
+  0x66, 0x69, 0x6e, 0x65, 0x64, 0x20, 0x74, 0x79, 0x70, 0x65, 0x73, 0x20,
+  0x66, 0x6f, 0x72, 0x20, 0x74, 0x68, 0x65, 0x0a, 0x20, 0x20, 0x23, 0x20,
+  0x6e, 0x61, 0x6d, 0x65, 0x64, 0x20, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74,
+  0x20, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x73, 0x20, 0x6c, 0x69, 0x6b, 0x65,
+  0x20, 0x72, 0x6f, 0x77, 0x2d, 0x6d, 0x61, 0x6a, 0x6f, 0x72, 0x2f, 0x63,
+  0x6f, 0x6c, 0x75, 0x6d, 0x6e, 0x2d, 0x6d, 0x61, 0x6a, 0x6f, 0x72, 0x0a,
+  0x0a, 0x23, 0x20, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x69, 0x63, 0x20, 0x73,
+  0x65, 0x6d, 0x61, 0x6e, 0x74, 0x69, 0x63, 0x73, 0x0a, 0x0a, 0x20, 0x20,
   0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64, 0x6f, 0x2e,
-  0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x2e, 0x70, 0x6f, 0x73, 0x74, 0x2d,
-  0x70, 0x61, 0x64, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70,
-  0x65, 0x3a, 0x20, 0x75, 0x69, 0x6e, 0x74, 0x28, 0x6d, 0x69, 0x6e, 0x3d,
-  0x30, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69,
-  0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20,
-  0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65,
-  0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x54, 0x68, 0x65,
-  0x20, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x20, 0x61, 0x74, 0x20, 0x74,
-  0x68, 0x65, 0x20, 0x76, 0x65, 0x72, 0x79, 0x20, 0x65, 0x6e, 0x64, 0x20,
-  0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x77, 0x68, 0x6f, 0x6c, 0x65,
-  0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74,
-  0x0a, 0x0a, 0x20, 0x20, 0x23, 0x23, 0x20, 0x4c, 0x61, 0x79, 0x6f, 0x75,
-  0x74, 0x20, 0x73, 0x65, 0x6d, 0x61, 0x6e, 0x74, 0x69, 0x63, 0x73, 0x0a,
-  0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64,
-  0x6f, 0x2e, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x2e, 0x65, 0x6c, 0x65,
-  0x6d, 0x65, 0x6e, 0x74, 0x2d, 0x73, 0x69, 0x7a, 0x65, 0x22, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x69, 0x6e, 0x74,
-  0x28, 0x6d, 0x69, 0x6e, 0x3d, 0x31, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61,
-  0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61,
-  0x75, 0x6c, 0x74, 0x3a, 0x20, 0x31, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64,
-  0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e,
-  0x3a, 0x20, 0x53, 0x69, 0x7a, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x61, 0x20,
-  0x73, 0x69, 0x6e, 0x67, 0x6c, 0x65, 0x20, 0x65, 0x6c, 0x65, 0x6d, 0x65,
-  0x6e, 0x74, 0x20, 0x28, 0x64, 0x61, 0x74, 0x61, 0x20, 0x74, 0x79, 0x70,
-  0x65, 0x29, 0x20, 0x69, 0x6e, 0x20, 0x62, 0x79, 0x74, 0x65, 0x73, 0x2e,
+  0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x2e, 0x61, 0x6c, 0x69, 0x67, 0x6e,
+  0x6d, 0x65, 0x6e, 0x74, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79,
+  0x70, 0x65, 0x3a, 0x20, 0x75, 0x69, 0x6e, 0x74, 0x28, 0x6d, 0x69, 0x6e,
+  0x3d, 0x30, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75,
+  0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a,
+  0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d,
+  0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x6e, 0x2d,
+  0x62, 0x79, 0x74, 0x65, 0x73, 0x20, 0x62, 0x6f, 0x75, 0x6e, 0x64, 0x61,
+  0x72, 0x79, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x6d, 0x65, 0x6e, 0x74,
+  0x20, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x20, 0x66, 0x6f, 0x72,
+  0x20, 0x6d, 0x61, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x2d, 0x61, 0x6c, 0x6c,
+  0x6f, 0x63, 0x61, 0x74, 0x65, 0x64, 0x20, 0x61, 0x72, 0x72, 0x61, 0x79,
   0x0a, 0x0a, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22,
-  0x63, 0x64, 0x6f, 0x2e, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x2e, 0x6e,
-  0x64, 0x69, 0x6d, 0x73, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79,
+  0x63, 0x64, 0x6f, 0x2e, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x2e, 0x70,
+  0x72, 0x65, 0x2d, 0x70, 0x61, 0x64, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x75, 0x69, 0x6e, 0x74, 0x28, 0x6d,
+  0x69, 0x6e, 0x3d, 0x30, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
+  0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73,
+  0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c,
+  0x74, 0x3a, 0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63,
+  0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20,
+  0x54, 0x68, 0x65, 0x20, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x20, 0x61,
+  0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x76, 0x65, 0x72, 0x79, 0x20, 0x62,
+  0x65, 0x67, 0x69, 0x6e, 0x6e, 0x69, 0x6e, 0x67, 0x20, 0x6f, 0x66, 0x20,
+  0x74, 0x68, 0x65, 0x20, 0x77, 0x68, 0x6f, 0x6c, 0x65, 0x20, 0x64, 0x61,
+  0x74, 0x61, 0x20, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x0a, 0x0a, 0x20,
+  0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64, 0x6f,
+  0x2e, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x2e, 0x70, 0x6f, 0x73, 0x74,
+  0x2d, 0x70, 0x61, 0x64, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79,
   0x70, 0x65, 0x3a, 0x20, 0x75, 0x69, 0x6e, 0x74, 0x28, 0x6d, 0x69, 0x6e,
-  0x3d, 0x31, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75,
+  0x3d, 0x30, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75,
   0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a,
   0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a,
-  0x20, 0x31, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d,
-  0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x4e, 0x75,
-  0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x69, 0x6d, 0x65,
-  0x6e, 0x73, 0x69, 0x6f, 0x6e, 0x73, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68,
-  0x65, 0x20, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x2e, 0x0a, 0x0a, 0x20,
-  0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x63, 0x64, 0x6f, 0x2e,
-  0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x2e, 0x64, 0x69, 0x6d, 0x73, 0x2d,
-  0x73, 0x69, 0x7a, 0x65, 0x20, 0x23, 0x20, 0x46, 0x49, 0x58, 0x4d, 0x45,
-  0x3a, 0x20, 0x6e, 0x65, 0x65, 0x64, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x62,
-  0x65, 0x20, 0x63, 0x68, 0x61, 0x6e, 0x67, 0x65, 0x64, 0x20, 0x74, 0x6f,
-  0x20, 0x61, 0x74, 0x20, 0x6c, 0x65, 0x61, 0x73, 0x74, 0x20, 0x62, 0x6c,
-  0x6f, 0x62, 0x2c, 0x20, 0x62, 0x65, 0x74, 0x74, 0x65, 0x72, 0x20, 0x61,
-  0x72, 0x72, 0x61, 0x79, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70,
-  0x65, 0x3a, 0x20, 0x70, 0x6f, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x28, 0x29,
-  0x20, 0x23, 0x61, 0x72, 0x72, 0x61, 0x79, 0x28, 0x75, 0x69, 0x6e, 0x74,
-  0x28, 0x6d, 0x69, 0x6e, 0x3d, 0x31, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d,
+  0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x54, 0x68,
+  0x65, 0x20, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x20, 0x61, 0x74, 0x20,
+  0x74, 0x68, 0x65, 0x20, 0x76, 0x65, 0x72, 0x79, 0x20, 0x65, 0x6e, 0x64,
+  0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x77, 0x68, 0x6f, 0x6c,
+  0x65, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x6c, 0x61, 0x79, 0x6f, 0x75,
+  0x74, 0x0a, 0x0a, 0x20, 0x20, 0x23, 0x23, 0x20, 0x4c, 0x61, 0x79, 0x6f,
+  0x75, 0x74, 0x20, 0x73, 0x65, 0x6d, 0x61, 0x6e, 0x74, 0x69, 0x63, 0x73,
+  0x0a, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63,
+  0x64, 0x6f, 0x2e, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x2e, 0x65, 0x6c,
+  0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2d, 0x73, 0x69, 0x7a, 0x65, 0x22, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x69, 0x6e,
+  0x74, 0x28, 0x6d, 0x69, 0x6e, 0x3d, 0x31, 0x29, 0x0a, 0x20, 0x20, 0x20,
   0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46,
   0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66,
-  0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20, 0x4e, 0x49, 0x4c, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74,
-  0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x53, 0x69, 0x7a, 0x65, 0x20, 0x6f, 0x66,
-  0x20, 0x65, 0x61, 0x63, 0x68, 0x20, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74,
-  0x20, 0x64, 0x69, 0x6d, 0x65, 0x6e, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x69,
-  0x6e, 0x20, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x2e, 0x0a,
-  0x0a, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63,
-  0x64, 0x6f, 0x2e, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x2e, 0x6f, 0x72,
-  0x64, 0x65, 0x72, 0x22, 0x20, 0x23, 0x20, 0x63, 0x6f, 0x75, 0x6c, 0x64,
-  0x20, 0x62, 0x65, 0x20, 0x61, 0x6e, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x66,
-  0x6f, 0x72, 0x20, 0x6e, 0x6f, 0x77, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74,
-  0x79, 0x70, 0x65, 0x3a, 0x20, 0x69, 0x6e, 0x74, 0x28, 0x6d, 0x69, 0x6e,
-  0x3d, 0x30, 0x29, 0x20, 0x23, 0x66, 0x6f, 0x72, 0x20, 0x6e, 0x6f, 0x77,
-  0x20, 0x23, 0x20, 0x65, 0x6e, 0x75, 0x6d, 0x28, 0x20, 0x27, 0x72, 0x6f,
-  0x77, 0x6d, 0x61, 0x6a, 0x6f, 0x72, 0x27, 0x2c, 0x20, 0x27, 0x63, 0x6f,
-  0x6c, 0x6d, 0x61, 0x6a, 0x6f, 0x72, 0x27, 0x2c, 0x20, 0x27, 0x67, 0x65,
-  0x6e, 0x65, 0x72, 0x69, 0x63, 0x2d, 0x6e, 0x64, 0x27, 0x29, 0x20, 0x23,
-  0x20, 0x75, 0x6e, 0x69, 0x6d, 0x70, 0x6c, 0x3b, 0x20, 0x63, 0x6f, 0x75,
-  0x6c, 0x64, 0x20, 0x77, 0x6f, 0x72, 0x6b, 0x20, 0x69, 0x6d, 0x6d, 0x65,
-  0x64, 0x69, 0x61, 0x74, 0x65, 0x6c, 0x79, 0x20, 0x77, 0x69, 0x74, 0x68,
-  0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x70, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61,
-  0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61,
-  0x75, 0x6c, 0x74, 0x3a, 0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64,
-  0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e,
-  0x3a, 0x20, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x6c, 0x20, 0x6c, 0x61,
-  0x79, 0x6f, 0x75, 0x74, 0x20, 0x61, 0x63, 0x63, 0x65, 0x73, 0x73, 0x20,
-  0x6f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x6f, 0x6e, 0x20, 0x74, 0x68, 0x65,
-  0x20, 0x43, 0x44, 0x4f, 0x2e, 0x0a, 0x23, 0x0a, 0x23, 0x20, 0x20, 0x20,
-  0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64, 0x6f, 0x2e,
-  0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x2e, 0x70, 0x61, 0x74, 0x74, 0x65,
-  0x72, 0x6e, 0x22, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79,
-  0x70, 0x65, 0x3a, 0x20, 0x61, 0x72, 0x72, 0x61, 0x79, 0x28, 0x29, 0x20,
-  0x23, 0x20, 0x6f, 0x66, 0x20, 0x65, 0x6e, 0x75, 0x6d, 0x20, 0x28, 0x27,
-  0x61, 0x72, 0x72, 0x61, 0x79, 0x27, 0x2c, 0x20, 0x27, 0x73, 0x74, 0x72,
-  0x75, 0x63, 0x74, 0x75, 0x72, 0x65, 0x27, 0x29, 0x0a, 0x23, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a,
-  0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20, 0x4e, 0x49,
-  0x4c, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75,
-  0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x41,
-  0x73, 0x73, 0x6f, 0x63, 0x69, 0x61, 0x74, 0x65, 0x20, 0x74, 0x6f, 0x20,
-  0x65, 0x61, 0x63, 0x68, 0x20, 0x64, 0x69, 0x6d, 0x65, 0x6e, 0x73, 0x69,
-  0x6f, 0x6e, 0x20, 0x61, 0x20, 0x73, 0x65, 0x6e, 0x73, 0x65, 0x20, 0x6f,
-  0x66, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x69, 0x67, 0x75, 0x6f, 0x75, 0x73,
-  0x20, 0x61, 0x63, 0x63, 0x65, 0x73, 0x73, 0x0a, 0x23, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x28, 0x61, 0x72, 0x72, 0x61, 0x79, 0x29, 0x20, 0x6f, 0x72,
-  0x20, 0x73, 0x74, 0x72, 0x69, 0x64, 0x65, 0x64, 0x20, 0x61, 0x63, 0x63,
-  0x65, 0x73, 0x73, 0x20, 0x28, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x75,
-  0x72, 0x65, 0x29, 0x2e, 0x20, 0x60, 0x65, 0x6e, 0x75, 0x6d, 0x20, 0x4d,
-  0x4d, 0x42, 0x5f, 0x4c, 0x41, 0x59, 0x4f, 0x55, 0x54, 0x5f, 0x45, 0x4c,
-  0x45, 0x4d, 0x45, 0x4e, 0x54, 0x5f, 0x54, 0x59, 0x50, 0x45, 0x60, 0x0a,
-  0x23, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a,
-  0x20, 0x22, 0x63, 0x64, 0x6f, 0x2e, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74,
-  0x2e, 0x64, 0x69, 0x6d, 0x73, 0x2d, 0x70, 0x72, 0x65, 0x2d, 0x70, 0x61,
-  0x64, 0x22, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70,
-  0x65, 0x3a, 0x20, 0x61, 0x72, 0x72, 0x61, 0x79, 0x28, 0x29, 0x0a, 0x23,
+  0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20, 0x31, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f,
+  0x6e, 0x3a, 0x20, 0x53, 0x69, 0x7a, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x61,
+  0x20, 0x73, 0x69, 0x6e, 0x67, 0x6c, 0x65, 0x20, 0x65, 0x6c, 0x65, 0x6d,
+  0x65, 0x6e, 0x74, 0x20, 0x28, 0x64, 0x61, 0x74, 0x61, 0x20, 0x74, 0x79,
+  0x70, 0x65, 0x29, 0x20, 0x69, 0x6e, 0x20, 0x62, 0x79, 0x74, 0x65, 0x73,
+  0x2e, 0x0a, 0x0a, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20,
+  0x22, 0x63, 0x64, 0x6f, 0x2e, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x2e,
+  0x6e, 0x64, 0x69, 0x6d, 0x73, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74,
+  0x79, 0x70, 0x65, 0x3a, 0x20, 0x75, 0x69, 0x6e, 0x74, 0x28, 0x6d, 0x69,
+  0x6e, 0x3d, 0x31, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71,
+  0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74,
+  0x3a, 0x20, 0x31, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75,
+  0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x4e,
+  0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x69, 0x6d,
+  0x65, 0x6e, 0x73, 0x69, 0x6f, 0x6e, 0x73, 0x20, 0x6f, 0x66, 0x20, 0x74,
+  0x68, 0x65, 0x20, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x2e, 0x0a, 0x0a,
+  0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x63, 0x64, 0x6f,
+  0x2e, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x2e, 0x64, 0x69, 0x6d, 0x73,
+  0x2d, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x23, 0x20, 0x46, 0x49, 0x58, 0x4d,
+  0x45, 0x3a, 0x20, 0x6e, 0x65, 0x65, 0x64, 0x73, 0x20, 0x74, 0x6f, 0x20,
+  0x62, 0x65, 0x20, 0x63, 0x68, 0x61, 0x6e, 0x67, 0x65, 0x64, 0x20, 0x74,
+  0x6f, 0x20, 0x61, 0x74, 0x20, 0x6c, 0x65, 0x61, 0x73, 0x74, 0x20, 0x62,
+  0x6c, 0x6f, 0x62, 0x2c, 0x20, 0x62, 0x65, 0x74, 0x74, 0x65, 0x72, 0x20,
+  0x61, 0x72, 0x72, 0x61, 0x79, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79,
+  0x70, 0x65, 0x3a, 0x20, 0x70, 0x6f, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x28,
+  0x29, 0x20, 0x23, 0x61, 0x72, 0x72, 0x61, 0x79, 0x28, 0x75, 0x69, 0x6e,
+  0x74, 0x28, 0x6d, 0x69, 0x6e, 0x3d, 0x31, 0x29, 0x29, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20,
+  0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65,
+  0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20, 0x4e, 0x49, 0x4c, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61,
+  0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x53, 0x69, 0x7a, 0x65, 0x20, 0x6f,
+  0x66, 0x20, 0x65, 0x61, 0x63, 0x68, 0x20, 0x6c, 0x61, 0x79, 0x6f, 0x75,
+  0x74, 0x20, 0x64, 0x69, 0x6d, 0x65, 0x6e, 0x73, 0x69, 0x6f, 0x6e, 0x20,
+  0x69, 0x6e, 0x20, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x2e,
+  0x0a, 0x0a, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22,
+  0x63, 0x64, 0x6f, 0x2e, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x2e, 0x6f,
+  0x72, 0x64, 0x65, 0x72, 0x22, 0x20, 0x23, 0x20, 0x63, 0x6f, 0x75, 0x6c,
+  0x64, 0x20, 0x62, 0x65, 0x20, 0x61, 0x6e, 0x20, 0x69, 0x6e, 0x74, 0x20,
+  0x66, 0x6f, 0x72, 0x20, 0x6e, 0x6f, 0x77, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x69, 0x6e, 0x74, 0x28, 0x6d, 0x69,
+  0x6e, 0x3d, 0x30, 0x29, 0x20, 0x23, 0x66, 0x6f, 0x72, 0x20, 0x6e, 0x6f,
+  0x77, 0x20, 0x23, 0x20, 0x65, 0x6e, 0x75, 0x6d, 0x28, 0x20, 0x27, 0x72,
+  0x6f, 0x77, 0x6d, 0x61, 0x6a, 0x6f, 0x72, 0x27, 0x2c, 0x20, 0x27, 0x63,
+  0x6f, 0x6c, 0x6d, 0x61, 0x6a, 0x6f, 0x72, 0x27, 0x2c, 0x20, 0x27, 0x67,
+  0x65, 0x6e, 0x65, 0x72, 0x69, 0x63, 0x2d, 0x6e, 0x64, 0x27, 0x29, 0x20,
+  0x23, 0x20, 0x75, 0x6e, 0x69, 0x6d, 0x70, 0x6c, 0x3b, 0x20, 0x63, 0x6f,
+  0x75, 0x6c, 0x64, 0x20, 0x77, 0x6f, 0x72, 0x6b, 0x20, 0x69, 0x6d, 0x6d,
+  0x65, 0x64, 0x69, 0x61, 0x74, 0x65, 0x6c, 0x79, 0x20, 0x77, 0x69, 0x74,
+  0x68, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x70, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46,
+  0x61, 0x6c, 0x73, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66,
+  0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f,
+  0x6e, 0x3a, 0x20, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x6c, 0x20, 0x6c,
+  0x61, 0x79, 0x6f, 0x75, 0x74, 0x20, 0x61, 0x63, 0x63, 0x65, 0x73, 0x73,
+  0x20, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x6f, 0x6e, 0x20, 0x74, 0x68,
+  0x65, 0x20, 0x43, 0x44, 0x4f, 0x2e, 0x0a, 0x0a, 0x23, 0x0a, 0x23, 0x20,
+  0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64,
+  0x6f, 0x2e, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x2e, 0x70, 0x61, 0x74,
+  0x74, 0x65, 0x72, 0x6e, 0x22, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x61, 0x72, 0x72, 0x61, 0x79, 0x28,
+  0x29, 0x20, 0x23, 0x20, 0x6f, 0x66, 0x20, 0x65, 0x6e, 0x75, 0x6d, 0x20,
+  0x28, 0x27, 0x61, 0x72, 0x72, 0x61, 0x79, 0x27, 0x2c, 0x20, 0x27, 0x73,
+  0x74, 0x72, 0x75, 0x63, 0x74, 0x75, 0x72, 0x65, 0x27, 0x29, 0x0a, 0x23,
   0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65,
   0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x23, 0x20, 0x20,
   0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20,
   0x4e, 0x49, 0x4c, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f,
   0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a,
-  0x20, 0x50, 0x72, 0x65, 0x2d, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67,
-  0x20, 0x62, 0x79, 0x74, 0x65, 0x73, 0x20, 0x6f, 0x66, 0x20, 0x65, 0x61,
-  0x63, 0x68, 0x20, 0x64, 0x69, 0x6d, 0x65, 0x6e, 0x73, 0x69, 0x6f, 0x6e,
-  0x2e, 0x0a, 0x23, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65,
+  0x20, 0x41, 0x73, 0x73, 0x6f, 0x63, 0x69, 0x61, 0x74, 0x65, 0x20, 0x74,
+  0x6f, 0x20, 0x65, 0x61, 0x63, 0x68, 0x20, 0x64, 0x69, 0x6d, 0x65, 0x6e,
+  0x73, 0x69, 0x6f, 0x6e, 0x20, 0x61, 0x20, 0x73, 0x65, 0x6e, 0x73, 0x65,
+  0x20, 0x6f, 0x66, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x69, 0x67, 0x75, 0x6f,
+  0x75, 0x73, 0x20, 0x61, 0x63, 0x63, 0x65, 0x73, 0x73, 0x0a, 0x23, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x28, 0x61, 0x72, 0x72, 0x61, 0x79, 0x29, 0x20,
+  0x6f, 0x72, 0x20, 0x73, 0x74, 0x72, 0x69, 0x64, 0x65, 0x64, 0x20, 0x61,
+  0x63, 0x63, 0x65, 0x73, 0x73, 0x20, 0x28, 0x73, 0x74, 0x72, 0x75, 0x63,
+  0x74, 0x75, 0x72, 0x65, 0x29, 0x2e, 0x20, 0x60, 0x65, 0x6e, 0x75, 0x6d,
+  0x20, 0x4d, 0x4d, 0x42, 0x5f, 0x4c, 0x41, 0x59, 0x4f, 0x55, 0x54, 0x5f,
+  0x45, 0x4c, 0x45, 0x4d, 0x45, 0x4e, 0x54, 0x5f, 0x54, 0x59, 0x50, 0x45,
+  0x60, 0x0a, 0x23, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65,
   0x79, 0x3a, 0x20, 0x22, 0x63, 0x64, 0x6f, 0x2e, 0x6c, 0x61, 0x79, 0x6f,
-  0x75, 0x74, 0x2e, 0x64, 0x69, 0x6d, 0x73, 0x2d, 0x70, 0x6f, 0x73, 0x74,
-  0x2d, 0x70, 0x61, 0x64, 0x22, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x61, 0x72, 0x72, 0x61, 0x79, 0x28,
-  0x29, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75,
-  0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a,
-  0x23, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c,
-  0x74, 0x3a, 0x20, 0x4e, 0x49, 0x4c, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69,
-  0x6f, 0x6e, 0x3a, 0x20, 0x50, 0x6f, 0x73, 0x74, 0x2d, 0x70, 0x61, 0x64,
-  0x64, 0x69, 0x6e, 0x67, 0x20, 0x62, 0x79, 0x74, 0x65, 0x73, 0x20, 0x6f,
-  0x66, 0x20, 0x65, 0x61, 0x63, 0x68, 0x20, 0x64, 0x69, 0x6d, 0x65, 0x6e,
-  0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x0a, 0x23, 0x0a, 0x23, 0x20, 0x23, 0x20,
-  0x57, 0x49, 0x50, 0x20, 0x69, 0x6e, 0x20, 0x6d, 0x61, 0x6d, 0x62, 0x61,
-  0x3a, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x2d, 0x63, 0x79, 0x63, 0x6c,
-  0x69, 0x63, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x69, 0x72, 0x72, 0x65, 0x67,
-  0x75, 0x6c, 0x61, 0x72, 0x20, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x73,
-  0x0a, 0x23, 0x0a, 0x23, 0x20, 0x23, 0x20, 0x44, 0x69, 0x73, 0x74, 0x72,
-  0x69, 0x62, 0x75, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x73, 0x65, 0x6d, 0x61,
-  0x6e, 0x74, 0x69, 0x63, 0x73, 0x3a, 0x20, 0x54, 0x42, 0x44, 0x0a, 0x23,
-  0x20, 0x20, 0x20, 0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63,
-  0x64, 0x6f, 0x2e, 0x64, 0x69, 0x73, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74,
-  0x69, 0x6f, 0x6e, 0x2e, 0x69, 0x64, 0x22, 0x0a, 0x23, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x75, 0x69, 0x6e, 0x74,
-  0x28, 0x6d, 0x69, 0x6e, 0x3d, 0x30, 0x29, 0x0a, 0x23, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20,
-  0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3a, 0x20, 0x30, 0x0a, 0x23,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e,
-  0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x4c, 0x6f, 0x63, 0x61,
-  0x6c, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x69, 0x64, 0x65, 0x6e,
-  0x74, 0x69, 0x66, 0x69, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x61, 0x20,
-  0x64, 0x69, 0x73, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x64, 0x20,
-  0x43, 0x44, 0x4f, 0x2e, 0x0a, 0x23, 0x0a, 0x23, 0x0a, 0x23, 0x20, 0x23,
-  0x23, 0x20, 0x4d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x20, 0x4c, 0x61, 0x79,
-  0x65, 0x72, 0x3a, 0x20, 0x54, 0x42, 0x44, 0x0a, 0x23, 0x20, 0x20, 0x20,
-  0x2d, 0x20, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64, 0x6f, 0x2e,
-  0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x2e, 0x6c, 0x61, 0x79, 0x65, 0x72,
-  0x22, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65,
-  0x3a, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x20, 0x23, 0x20, 0x70,
-  0x6f, 0x73, 0x73, 0x69, 0x62, 0x6c, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x73, 0x20, 0x6e, 0x65, 0x65, 0x64, 0x20, 0x74, 0x6f, 0x20, 0x6d,
-  0x61, 0x74, 0x63, 0x68, 0x20, 0x65, 0x69, 0x74, 0x68, 0x65, 0x72, 0x20,
-  0x6d, 0x61, 0x6d, 0x62, 0x61, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x75, 0x74, 0x2e, 0x64, 0x69, 0x6d, 0x73, 0x2d, 0x70, 0x72, 0x65, 0x2d,
+  0x70, 0x61, 0x64, 0x22, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74,
+  0x79, 0x70, 0x65, 0x3a, 0x20, 0x61, 0x72, 0x72, 0x61, 0x79, 0x28, 0x29,
+  0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69,
+  0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x23,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74,
+  0x3a, 0x20, 0x4e, 0x49, 0x4c, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f,
+  0x6e, 0x3a, 0x20, 0x50, 0x72, 0x65, 0x2d, 0x70, 0x61, 0x64, 0x64, 0x69,
+  0x6e, 0x67, 0x20, 0x62, 0x79, 0x74, 0x65, 0x73, 0x20, 0x6f, 0x66, 0x20,
+  0x65, 0x61, 0x63, 0x68, 0x20, 0x64, 0x69, 0x6d, 0x65, 0x6e, 0x73, 0x69,
+  0x6f, 0x6e, 0x2e, 0x0a, 0x23, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x2d, 0x20,
+  0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64, 0x6f, 0x2e, 0x6c, 0x61,
+  0x79, 0x6f, 0x75, 0x74, 0x2e, 0x64, 0x69, 0x6d, 0x73, 0x2d, 0x70, 0x6f,
+  0x73, 0x74, 0x2d, 0x70, 0x61, 0x64, 0x22, 0x0a, 0x23, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x61, 0x72, 0x72, 0x61,
+  0x79, 0x28, 0x29, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
+  0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73,
+  0x65, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61,
+  0x75, 0x6c, 0x74, 0x3a, 0x20, 0x4e, 0x49, 0x4c, 0x0a, 0x23, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61,
+  0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x50, 0x6f, 0x73, 0x74, 0x2d, 0x70,
+  0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x20, 0x62, 0x79, 0x74, 0x65, 0x73,
+  0x20, 0x6f, 0x66, 0x20, 0x65, 0x61, 0x63, 0x68, 0x20, 0x64, 0x69, 0x6d,
+  0x65, 0x6e, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x0a, 0x23, 0x0a, 0x23, 0x20,
+  0x23, 0x20, 0x57, 0x49, 0x50, 0x20, 0x69, 0x6e, 0x20, 0x6d, 0x61, 0x6d,
+  0x62, 0x61, 0x3a, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x2d, 0x63, 0x79,
+  0x63, 0x6c, 0x69, 0x63, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x69, 0x72, 0x72,
+  0x65, 0x67, 0x75, 0x6c, 0x61, 0x72, 0x20, 0x6c, 0x61, 0x79, 0x6f, 0x75,
+  0x74, 0x73, 0x0a, 0x23, 0x0a, 0x23, 0x0a, 0x23, 0x20, 0x23, 0x23, 0x20,
+  0x4d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x20, 0x4c, 0x61, 0x79, 0x65, 0x72,
+  0x3a, 0x20, 0x54, 0x42, 0x44, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x2d, 0x20,
+  0x6b, 0x65, 0x79, 0x3a, 0x20, 0x22, 0x63, 0x64, 0x6f, 0x2e, 0x6d, 0x65,
+  0x6d, 0x6f, 0x72, 0x79, 0x2e, 0x6c, 0x61, 0x79, 0x65, 0x72, 0x22, 0x0a,
+  0x23, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20,
+  0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x20, 0x23, 0x20, 0x70, 0x6f, 0x73,
+  0x73, 0x69, 0x62, 0x6c, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x73,
+  0x20, 0x6e, 0x65, 0x65, 0x64, 0x20, 0x74, 0x6f, 0x20, 0x6d, 0x61, 0x74,
+  0x63, 0x68, 0x20, 0x65, 0x69, 0x74, 0x68, 0x65, 0x72, 0x20, 0x6d, 0x61,
+  0x6d, 0x62, 0x61, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
   0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x23, 0x20, 0x62,
-  0x75, 0x69, 0x6c, 0x74, 0x69, 0x6e, 0x2c, 0x20, 0x6f, 0x72, 0x20, 0x6d,
-  0x61, 0x6d, 0x62, 0x61, 0x20, 0x68, 0x77, 0x6c, 0x6f, 0x63, 0x2c, 0x20,
-  0x6f, 0x72, 0x20, 0x61, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x23, 0x20, 0x62, 0x75, 0x69,
+  0x6c, 0x74, 0x69, 0x6e, 0x2c, 0x20, 0x6f, 0x72, 0x20, 0x6d, 0x61, 0x6d,
+  0x62, 0x61, 0x20, 0x68, 0x77, 0x6c, 0x6f, 0x63, 0x2c, 0x20, 0x6f, 0x72,
+  0x20, 0x61, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
   0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x23, 0x20, 0x6d, 0x61,
-  0x65, 0x73, 0x74, 0x72, 0x6f, 0x2d, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67,
-  0x20, 0x74, 0x69, 0x6d, 0x65, 0x20, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x69,
-  0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6f, 0x66, 0x20, 0x6d, 0x65, 0x6d, 0x6f,
-  0x72, 0x79, 0x20, 0x6c, 0x61, 0x79, 0x65, 0x72, 0x20, 0x6e, 0x61, 0x6d,
-  0x65, 0x73, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71,
-  0x75, 0x69, 0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65,
-  0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75,
-  0x6c, 0x74, 0x3a, 0x20, 0x4e, 0x49, 0x4c, 0x0a, 0x23, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74,
-  0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x4d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x20,
-  0x6c, 0x61, 0x79, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65,
-  0x20, 0x72, 0x61, 0x77, 0x2d, 0x70, 0x74, 0x72, 0x20, 0x61, 0x6e, 0x64,
-  0x2f, 0x6f, 0x72, 0x20, 0x6d, 0x61, 0x6d, 0x62, 0x61, 0x2d, 0x61, 0x72,
-  0x72, 0x61, 0x79, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x74, 0x68, 0x65, 0x20,
-  0x43, 0x44, 0x4f, 0x0a, 0x23, 0x0a, 0x23, 0x0a, 0x23, 0x20, 0x23, 0x23,
-  0x20, 0x4d, 0x65, 0x74, 0x61, 0x2d, 0x49, 0x6e, 0x66, 0x6f, 0x72, 0x6d,
-  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x0a, 0x23, 0x20, 0x23, 0x23, 0x20, 0x45,
-  0x61, 0x63, 0x68, 0x20, 0x43, 0x44, 0x4f, 0x20, 0x77, 0x69, 0x6c, 0x6c,
-  0x20, 0x68, 0x61, 0x76, 0x65, 0x20, 0x6d, 0x65, 0x74, 0x61, 0x2d, 0x64,
-  0x61, 0x74, 0x61, 0x20, 0x69, 0x6e, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74,
-  0x69, 0x6f, 0x6e, 0x20, 0x61, 0x62, 0x6f, 0x75, 0x74, 0x2c, 0x20, 0x66,
-  0x6f, 0x72, 0x20, 0x69, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x63, 0x65, 0x2c,
-  0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x20,
-  0x6c, 0x61, 0x79, 0x65, 0x72, 0x28, 0x73, 0x29, 0x20, 0x75, 0x73, 0x65,
-  0x64, 0x0a, 0x0a, 0x0a, 0x23, 0x20, 0x28, 0x65, 0x6e, 0x64, 0x20, 0x6f,
-  0x66, 0x20, 0x6d, 0x61, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x2d, 0x63, 0x6f,
-  0x72, 0x65, 0x2e, 0x79, 0x61, 0x6d, 0x6c, 0x29, 0x0a, 0x20, 0x0a
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x23, 0x20, 0x6d, 0x61, 0x65, 0x73,
+  0x74, 0x72, 0x6f, 0x2d, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x74,
+  0x69, 0x6d, 0x65, 0x20, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x69, 0x74, 0x69,
+  0x6f, 0x6e, 0x20, 0x6f, 0x66, 0x20, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79,
+  0x20, 0x6c, 0x61, 0x79, 0x65, 0x72, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x73,
+  0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69,
+  0x72, 0x65, 0x64, 0x3a, 0x20, 0x46, 0x61, 0x6c, 0x73, 0x65, 0x0a, 0x23,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74,
+  0x3a, 0x20, 0x4e, 0x49, 0x4c, 0x0a, 0x23, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f,
+  0x6e, 0x3a, 0x20, 0x4d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x20, 0x6c, 0x61,
+  0x79, 0x65, 0x72, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x72,
+  0x61, 0x77, 0x2d, 0x70, 0x74, 0x72, 0x20, 0x61, 0x6e, 0x64, 0x2f, 0x6f,
+  0x72, 0x20, 0x6d, 0x61, 0x6d, 0x62, 0x61, 0x2d, 0x61, 0x72, 0x72, 0x61,
+  0x79, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x43, 0x44,
+  0x4f, 0x0a, 0x23, 0x0a, 0x23, 0x0a, 0x23, 0x20, 0x23, 0x23, 0x20, 0x4d,
+  0x65, 0x74, 0x61, 0x2d, 0x49, 0x6e, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74,
+  0x69, 0x6f, 0x6e, 0x0a, 0x23, 0x20, 0x23, 0x23, 0x20, 0x45, 0x61, 0x63,
+  0x68, 0x20, 0x43, 0x44, 0x4f, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x68,
+  0x61, 0x76, 0x65, 0x20, 0x6d, 0x65, 0x74, 0x61, 0x2d, 0x64, 0x61, 0x74,
+  0x61, 0x20, 0x69, 0x6e, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x69, 0x6f,
+  0x6e, 0x20, 0x61, 0x62, 0x6f, 0x75, 0x74, 0x2c, 0x20, 0x66, 0x6f, 0x72,
+  0x20, 0x69, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x63, 0x65, 0x2c, 0x20, 0x74,
+  0x68, 0x65, 0x20, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x20, 0x6c, 0x61,
+  0x79, 0x65, 0x72, 0x28, 0x73, 0x29, 0x20, 0x75, 0x73, 0x65, 0x64, 0x0a,
+  0x0a, 0x0a, 0x23, 0x20, 0x28, 0x65, 0x6e, 0x64, 0x20, 0x6f, 0x66, 0x20,
+  0x6d, 0x61, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x2d, 0x63, 0x6f, 0x72, 0x65,
+  0x2e, 0x79, 0x61, 0x6d, 0x6c, 0x29, 0x0a
 };
-const size_t maestro_core_yaml_len = 8111;
+const size_t maestro_core_yaml_len = 8419;
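For orientation: the byte array patched above is the generated, xxd -i-style embedding of `attributes/maestro-core.yaml` (its readable diff follows next), with `maestro_core_yaml_len` updated to the new size. A minimal sketch of how such an embedded schema can be consumed, assuming the generated file declares the two symbols below and using `mstro_schema_parse()`, whose signature appears later in this diff; the stand-in typedefs are assumptions, not maestro's real declarations:

```c
/* Sketch only -- not part of this patch. */
#include <stddef.h>
#include <stdint.h>

typedef int mstro_status;                    /* stand-in for the real status enum */
typedef struct mstro_schema_ *mstro_schema;  /* opaque handle, as used in this diff */

extern const unsigned char maestro_core_yaml[];  /* embedded YAML text (assumed decl) */
extern const size_t maestro_core_yaml_len;       /* 8419 after this patch */

/* from attributes/maestro-schema.c, further down in this diff;
 * the result parameter type is inferred, not shown verbatim there */
extern mstro_status mstro_schema_parse(const uint8_t *yaml_data,
                                       size_t data_len,
                                       mstro_schema *result);

/* parse and validate the schema text baked into the binary */
static mstro_status
load_builtin_schema(mstro_schema *schema_out)
{
  return mstro_schema_parse((const uint8_t *)maestro_core_yaml,
                            maestro_core_yaml_len, schema_out);
}
```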
diff --git a/attributes/maestro-core.yaml b/attributes/maestro-core.yaml
index de3a28921e908db939e3b7b47f3aa12bba9f18b1..9ba2692cb0453767167f1b0270f46e88606e6e33 100644
--- a/attributes/maestro-core.yaml
+++ b/attributes/maestro-core.yaml
@@ -16,7 +16,7 @@ schema-namespace: ".maestro.core."
 
 # user-defined types used in this schema
 #schema-types:
-  
+
   # -  typename: lifetime-interval
   #    typespec: map()
   #    documentation: CDO lifetime type
@@ -85,7 +85,7 @@ maestro-attributes:
                    pool. (Note the non-trivial semantic interaction
                    with the cdo.desist attribute.)
 
-  - key: "cdo.maestro-provided-storage" 
+  - key: "cdo.maestro-provided-storage"
     type: bool()
     required: False
     default: False
@@ -120,14 +120,32 @@ maestro-attributes:
     default: False
     documentation: Indicates that the CDO is a container for other CDOs,
                    i.e. a CDO group. Users are not expected to utilize or
-                   set this attribute. 
+                   set this attribute.
 
   - key: "cdo.group-members"
     type: blob()
     required: False
     default: ""
     documentation: Opaque member descriptor if .maestro.core.cdo.is-group is True.
-                  Users are not expected to utilize or set this attribute. 
+                  Users are not expected to utilize or set this attribute.
+
+
+  - key: "cdo.isdistributed"
+    type: bool()
+    required: False
+    default: False
+    documentation: Indicates that the CDO data are distributed among various
+                   maestro components. Users are not expected to utilize or
+                   set this attribute.
+
+#FIXME: only irregular_1D dist layouts supported
+  - key: "cdo.dist-layout"
+    type: mmblayout()
+    required: False
+    default: NIL
+    documentation: The CDO data distribution among various
+                   maestro components.
+
 
   ## Lifetime related
   # - key: "cdo.lifetime"
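Both attributes added in the hunk above are maestro-internal; users are told not to set them, so presumably maestro itself populates them when a CDO is declared distributed. A hedged sketch of how core code could look the new key up, using `mstro_schema_lookup_attribute()` from `attributes/maestro-schema.c` below; the fully qualified name follows the `.maestro.core.` namespace convention cited in the `cdo.group-members` documentation, and the result type is inferred from this diff, not confirmed:

```c
/* Sketch only -- not part of this patch; result type inferred from
 * the attribute_table HASH_FIND in mstro_schema_lookup_attribute(). */
struct mstro_schema_attribute_ *attr = NULL;
mstro_status s =
    mstro_schema_lookup_attribute(schema, ".maestro.core.cdo.dist-layout",
                                  &attr);
if (s != MSTRO_OK || attr == NULL) {
  /* schema predates distributed-CDO support */
}
```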
@@ -140,7 +158,7 @@ maestro-attributes:
   #   default: *lifetime-strictness-values
   #   documentation: CDO lifetime specification.
 
-    
+
   ## SCOPE related
   - key: "cdo.scope.size"
     type: int(min=-1)
@@ -208,6 +226,7 @@ maestro-attributes:
     required: False
     default: 0
     documentation: General layout access order on the CDO.
+
 #
 #   - key: "cdo.layout.pattern"
 #     type: array() # of enum ('array', 'structure')
@@ -230,13 +249,6 @@ maestro-attributes:
 #
 # # WIP in mamba: block-cyclic and irregular layouts
 #
-# # Distribution semantics: TBD
-#   - key: "cdo.distribution.id"
-#     type: uint(min=0)
-#     required: False
-#     default: 0
-#     documentation: Local chunk identifier of a distributed CDO.
-#
 #
 # ## Memory Layer: TBD
 #   - key: "cdo.memory.layer"
@@ -253,4 +265,3 @@ maestro-attributes:
 
 
 # (end of maestro-core.yaml)
- 
diff --git a/attributes/maestro-schema-schema.yaml b/attributes/maestro-schema-schema.yaml
index 1c9461a0171882b85a815cf75fb027f5889d6fc3..75f6fa0d440238dfc2d1f11dc6a69d817b494a7a 100644
--- a/attributes/maestro-schema-schema.yaml
+++ b/attributes/maestro-schema-schema.yaml
@@ -21,7 +21,7 @@ schema-types: list(include('maestro-user-type-def'),
 
 schema-type-values: list(include('maestro-user-typespec'),
                     required=False)
-          
+
 # A Maestro schema needs to be a valid yaml document
 maestro-attributes: list(include('maestro-attribute-def'), required=False)
 ---
@@ -50,17 +50,18 @@ maestro-user-type-def:
 maestro-builtin-typespec: any(regex('^str\(.*\)$'),
                               regex('^bool\(.*\)$'),
                               regex('^int\(.*\)$'),
-                              regex('^map\(.*\)$'),
                               regex('^enum\(.*\)$'),
-                              regex('^map\(.*\)$')
+                              regex('^map\(.*\)$'),
+                              regex('^mmblayout\(.*\)$')
                               )
 
 ---
 maestro-builtin-typeval:  Any(bool(),
                               int(),
                               enum('None'),
-                              str())
-  
+                              str(),
+                              mmblayout())
+
 ---
 # FIXME: this needs better inside typing
 maestro-user-typespec: any(include('maestro-builtin-typespec'),
@@ -71,7 +72,6 @@ maestro-user-typespec: any(include('maestro-builtin-typespec'),
 # things that can be checked directly
 # plus
 # things that can be run-time checked
-maestro-user-typeval: any(include('maestro-builtin-typeval'), 
-                          include('maestro-builtin-typespec') 
+maestro-user-typeval: any(include('maestro-builtin-typeval'),
+                          include('maestro-builtin-typespec')
                           )
-                          
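With `mmblayout` added to the `maestro-builtin-typespec` alternatives (and the duplicated `map` regex deduplicated), schemas validated against this schema-schema can declare layout-typed attributes. A minimal sketch; the key and documentation are hypothetical, and the field layout mirrors the `cdo.dist-layout` entry added to `attributes/maestro-core.yaml` above:

```yaml
# sketch only -- hypothetical user attribute, not part of this patch
maestro-attributes:
  - key: "myapp.halo-layout"
    type: mmblayout()
    required: False
    default: NIL
    documentation: Distribution layout of the halo-exchange CDO.
```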
diff --git a/attributes/maestro-schema.c b/attributes/maestro-schema.c
index fd42faf8439c565d3225dfee80b1c9be9de1ff4d..3cf0543d65d9501bc333a3194f11ea4a627b449f 100644
--- a/attributes/maestro-schema.c
+++ b/attributes/maestro-schema.c
@@ -1,9 +1,9 @@
 /* -*- mode:c -*- */
 /** @file
- ** @brief implementing the Maestro schema-schema in cyaml 
+ ** @brief implementing the Maestro schema-schema in cyaml
 
  * This should always be kept in sync (manually, unfortunately) with
- * attributes/maestro-schema-schema.yaml 
+ * attributes/maestro-schema-schema.yaml
  **/
 
 /*
@@ -184,7 +184,7 @@ static const cyaml_schema_value_t
 csv_schema_type_entry = {
   CYAML_VALUE_MAPPING(CYAML_FLAG_DEFAULT,
                       struct mstro_schema_type_,
-                      csf_schema_type_mapping),                      
+                      csf_schema_type_mapping),
 };
 
 struct mstro_schema_attribute_type_parse_closure {
@@ -209,7 +209,7 @@ struct mstro_schema_attribute_ {
 #define ATTRIBUTE_KEY_PATTERN_EFLAGS 0
   char *typespec;   /**< type: include('maestro-user-typespec') */
   bool required; /**<  required: bool() */
-  
+
   /* struct mstro_schema_type_val *defaultval; /\**< Any(include('maestro-builtin-typeval'), */
   /*                                              include('maestro-user-typeval'), */
   /*                                              required=False, none=False) *\/ */
@@ -251,9 +251,9 @@ csv_schema_attributes_entry = {
 /** externally visible: a full schema */
 struct mstro_schema_ {
   char    *schema_name;      /**< "schema-name: str()" */
-  
+
   uint64_t schema_version;   /**< "schema-version: int(min=0) */
-  
+
   char    *schema_namespace; /**< "schema-namespace: regex('(^$)|(^\.$)|(^(\.[^\s]+)+\.$)', ... */
 #define NAMESPACE_PATTERN "(^$)|(^\\.$)|(^(\\.[^[:space:]]+)+\\.$)"
 #define NAMESPACE_PATTERN_CFLAGS (REG_EXTENDED|REG_ICASE)
@@ -362,7 +362,7 @@ mstro_schema_free(mstro_schema sch)
     return MSTRO_INVARG;
 
   DEBUG("deallocating schema at %p (%s)\n", sch, sch->schema_name);
-  
+
   mstro_schema next=sch->next; /* save for tail recursion */
 
   if(sch->schema_name) {
@@ -413,7 +413,7 @@ mstro_schema_free(mstro_schema sch)
       if(sa->defaultval!=NULL && sa->defaultval!=MSTRO_SCHEMA_DEFAULT_VAL_UNSET) {
         free(sa->defaultval);
       }
-      if(sa->type_parse_closure.info) 
+      if(sa->type_parse_closure.info)
         mstro_stp_val_dispose(sa->type_parse_closure.info);
       if(sa->key)
         free(sa->key);
@@ -439,15 +439,15 @@ mstro_schema_free(mstro_schema sch)
     }
   }
   mstro_symtab_destroy(&sch->symtab);
-  
+
   int err= pthread_rwlock_destroy(&sch->lock);
   if(err!=0) {
     ERR("Failed to destroy schema lock: %d (%s)\n", err, strerror(err));
     return MSTRO_FAIL;
   }
-  
+
   free(sch);
-  
+
   if(next)
     return mstro_schema_free(next);
   else
@@ -480,11 +480,11 @@ mstro_schema_version(mstro_schema sch)
     WITH_LOCKED_SCHEMA_READ(sch, {
       version = sch->schema_version;
       });
-    
+
     return version;
   }
 }
-        
+
 
 
 
@@ -510,7 +510,7 @@ csf_top_mapping_schema[] = {
                          CYAML_FLAG_POINTER|CYAML_FLAG_OPTIONAL,
                          struct mstro_schema_, schema_attributes, &csv_schema_attributes_entry, 0, CYAML_UNLIMITED),
   CYAML_FIELD_END
-};  
+};
 
 
 static const
@@ -551,7 +551,7 @@ mstro_schema__ensure_regex(const char *pattern,
 {
   if(pattern==NULL || dst==NULL)
     return MSTRO_INVARG;
-  
+
   if(atomic_load(dst)==NULL) {
     /* compile regex and store. */
     regex_t *re = malloc(sizeof(regex_t));
@@ -573,7 +573,7 @@ mstro_schema__ensure_regex(const char *pattern,
       DEBUG("Concurrent regex compilation wins against us\n");
       regfree(re);
       free(re);
-    } 
+    }
   }
   assert(atomic_load(dst)!=NULL);
   return MSTRO_OK;
@@ -750,20 +750,20 @@ mstro_attribute_val__compute_size(enum mstro_stp_val_kind kind,
   assert((string==NULL && val!=NULL) || (string!=NULL && val==NULL));
 
   mstro_status status = MSTRO_OK;
-  
+
   switch(kind) {
-    case MSTRO_STP_BOOL: 
+    case MSTRO_STP_BOOL:
       *val_size = sizeof(bool); break;
-    case MSTRO_STP_UINT: 
+    case MSTRO_STP_UINT:
       *val_size = sizeof(uint64_t); break;
-    case MSTRO_STP_INT: 
+    case MSTRO_STP_INT:
       *val_size = sizeof(int64_t); break;
     case MSTRO_STP_FLOAT:
       *val_size = sizeof(float); break;
-    case MSTRO_STP_DOUBLE: 
+    case MSTRO_STP_DOUBLE:
       *val_size = sizeof(double); break;
     case MSTRO_STP_STR:
-    case MSTRO_STP_REGEX: 
+    case MSTRO_STP_REGEX:
       if(string!=NULL)
         *val_size = strlen(string)+1;
       else {
@@ -797,6 +797,9 @@ mstro_attribute_val__compute_size(enum mstro_stp_val_kind kind,
         *val_size = ((mstro_blob*)val)->len;
       }
       break;
+    case MSTRO_STP_MMBLAYOUT:
+      *val_size = sizeof(mmbLayout);
+      break;
     case MSTRO_STP_POINTER:
       *val_size = sizeof(void*);
       break;
@@ -856,7 +859,7 @@ mstro_attribute_val_parse(const struct mstro_stp_val *parsed_type, const char *s
       err = regcomp(&(regex[0]), "[+]?[[:blank:]]*[0-9]+",REG_EXTENDED|REG_NOSUB);
       WARN("Not checking numeric bounds on types\n");
       break;
-    }      
+    }
     case MSTRO_STP_INT: {
       err = regcomp(&(regex[0]), "[-+]?[[:blank:]]*[0-9]+",REG_EXTENDED|REG_NOSUB);
       WARN("Not checking numeric bounds on types\n");
@@ -901,16 +904,21 @@ mstro_attribute_val_parse(const struct mstro_stp_val *parsed_type, const char *s
         err |= regcomp(&(regex[i]), parsed_type->regex_patterns[i],
                        REG_EXTENDED|REG_NOSUB| (parsed_type->regex_ignorecase ? REG_ICASE : 0));
       }
-      break;          
+      break;
     case MSTRO_STP_BLOB:
       need_regmatch = false;
       minlen = parsed_type->blob_minlen;
       maxlen = parsed_type->blob_maxlen;
       break;
+    case MSTRO_STP_MMBLAYOUT:
+      need_regmatch = false;
+      minlen = parsed_type->mmblayout_minlen;
+      maxlen = parsed_type->mmblayout_maxlen;
+      break;
     case MSTRO_STP_TIMESTAMP:
       err = regcomp(&(regex[0]), RFC3339_PATTERN, REG_EXTENDED);
       break;
-      
+
     case MSTRO_STP_POINTER:
       err = regcomp(&(regex[0]), POINTER_PATTERN, REG_EXTENDED|REG_ICASE);
       break;
@@ -924,7 +932,7 @@ mstro_attribute_val_parse(const struct mstro_stp_val *parsed_type, const char *s
     // regerror ...
     goto BAILOUT;
   }
-  
+
   if(! (minlen<=val_len  && val_len <=maxlen)) {
     ERR("Argument |%s| (strlen %zu) not within length bounds for type: min=%zu, max=%zu\n",
         string, val_len, minlen, maxlen);
@@ -979,19 +987,19 @@ mstro_attribute_val_parse(const struct mstro_stp_val *parsed_type, const char *s
         }
         break;
       }
-      case MSTRO_STP_UINT: 
+      case MSTRO_STP_UINT:
         *((uint64_t*)dst) = strtoumax((const char*)string, NULL, 10);
         break;
 
-      case MSTRO_STP_INT: 
+      case MSTRO_STP_INT:
         *((int64_t*)dst)  = strtoimax(string, NULL, 10);
         break;
 
-      case MSTRO_STP_FLOAT: 
+      case MSTRO_STP_FLOAT:
         *((float*)dst) = strtof(string, NULL);
         break;
 
-      case MSTRO_STP_DOUBLE: 
+      case MSTRO_STP_DOUBLE:
         *((double*)dst) = strtod(string, NULL);
         break;
 
@@ -1000,12 +1008,19 @@ mstro_attribute_val_parse(const struct mstro_stp_val *parsed_type, const char *s
         strcpy((char*)dst, string);
         ((char*)dst)[(*val_size)-1] = '\0';
         break;
-        
+
       case MSTRO_STP_BLOB:
         memcpy((char*)dst, string, *val_size);
         break;
 
-      case MSTRO_STP_TIMESTAMP: 
+      case MSTRO_STP_MMBLAYOUT:
+        s = mstro_mmbLayout_parse(string, (mmbLayout **) &dst);
+        if(s!=MSTRO_OK) {
+          ERR("Failed to parse mmbLayout object |%s| \n", string);
+        }
+        break;
+
+      case MSTRO_STP_TIMESTAMP:
         s=mstro_timestamp_parse(string, *val_size, (mstro_timestamp*)dst);
         if(s!=MSTRO_OK) {
           ERR("Failed to parse timestamp value |%s| as rfc3339-timestamp\n");
@@ -1019,7 +1034,7 @@ mstro_attribute_val_parse(const struct mstro_stp_val *parsed_type, const char *s
           val=0;
         else if(strcasecmp(string, "NULL")==0)
           val=0;
-        else 
+        else
           val= strtoumax((const char*)string, NULL, 0);
         *val_p = (void*)val;
         DEBUG("parsed val %s as pointer %p\n", string, *val_p);
@@ -1043,7 +1058,7 @@ mstro_status
 mstro_schema_attribute__parse_defaultval(struct mstro_schema_attribute_ *attr)
 {
   assert(attr->type_parse_closure.info!=NULL);
-  if(attr->defaultval_string) 
+  if(attr->defaultval_string)
     return mstro_attribute_val_parse(
         attr->type_parse_closure.info, attr->defaultval_string,
         &attr->defaultval, &attr->defaultval_size);
@@ -1115,7 +1130,7 @@ mstro_schema_validate_and_instantiate(mstro_schema schema)
   DEBUG(" %zu type names: -- ok\n", schema->schema_types_count);
 
 
-  /* attribute names */ 
+  /* attribute names */
   for(size_t i=0; i<schema->schema_attributes_count; i++) {
     const char *akey = schema->schema_attributes[i].key;
     s=mstro_schema__validate_attribute_key(akey);
@@ -1163,7 +1178,7 @@ mstro_schema_validate_and_instantiate(mstro_schema schema)
     }
     HASH_ADD(hh, schema->type_table, type_symbol, sizeof(mstro_symbol), current);
   }
-  
+
   for(size_t i=0; i<schema->schema_attributes_count; i++) {
     const char *fqstr;
     struct mstro_schema_attribute_ *current = schema->schema_attributes+i;
@@ -1187,7 +1202,7 @@ mstro_schema_validate_and_instantiate(mstro_schema schema)
           current->typespec);
       goto BAILOUT;
     }
-    
+
     HASH_ADD(hh, schema->attribute_table,
              key_symbol, sizeof(mstro_symbol),
              current);
@@ -1208,7 +1223,7 @@ mstro_schema_validate_and_instantiate(mstro_schema schema)
           current->defaultval_string, fqstr);
       goto BAILOUT;
     }
-        
+
     /* check that attribute has value type */
     DEBUG("Attribute |%s| type |%s|, required %s, default %s [parsed as %p], doc: %s\n",
           mstro_symbol_name(current->key_symbol),
@@ -1216,7 +1231,7 @@ mstro_schema_validate_and_instantiate(mstro_schema schema)
           current->required ? "YES" : "NO",
           //"?"//
           current->defaultval_string ? current->defaultval_string : "(none)",
-          (current->defaultval==MSTRO_SCHEMA_DEFAULT_VAL_UNSET 
+          (current->defaultval==MSTRO_SCHEMA_DEFAULT_VAL_UNSET
            ? "(unset)" : current->defaultval),
           current->documentation ? current->documentation : "(none)"
           );
@@ -1270,7 +1285,7 @@ mstro_schema_parse(const uint8_t *yaml_data, size_t data_len,
       *result=NULL;
       return MSTRO_FAIL;
     }
-    
+
     mstro_status s = mstro_schema_validate_and_instantiate(*result);
     if(s!=MSTRO_OK) {
       ERR("Failed to validate and instantiate schema '%s' (V%zu)\n",
@@ -1290,7 +1305,7 @@ mstro_status
 mstro_schema_parse_from_file(const char *fname, mstro_schema *result)
 {
   mstro_status s=MSTRO_UNIMPL;
-  
+
   int fd;
   if((fd = open(fname, O_RDONLY, 0)) == -1) {
     ERR("Failed to open file %s: %d (%s)\n",
@@ -1326,14 +1341,14 @@ mstro_schema_parse_from_file(const char *fname, mstro_schema *result)
         fname, errno, strerror(errno));
     s=MSTRO_FAIL;
   }
-  
+
 BAILOUT_CLOSE:
   if(-1==close(fd)) {
     ERR("Failed to close file %s: %d (%s)\n",
         fname, errno, strerror(errno));
     s=MSTRO_FAIL;
   }
-  
+
 BAILOUT:
   return s;
 }
@@ -1361,9 +1376,9 @@ mstro_schema_merge(mstro_schema main,
   WITH_LOCKED_SCHEMA_WRITE(consumed, {
       WITH_LOCKED_SCHEMA_WRITE(main, {
           free(main->schema_name);
-          
+
           main->schema_name=tmpname;
-          
+
           LL_APPEND(main,consumed);
           /* FIXME: we could copy entries from consumed into main, but until
            * someone complains we'll cdr down the next list when performing
@@ -1371,7 +1386,7 @@ mstro_schema_merge(mstro_schema main,
            */
         });
     });
-  
+
   return MSTRO_OK;
 }
 
@@ -1387,9 +1402,10 @@ enum mstro_schema_builtin_type {
   MSTRO_SCHEMA_BUILTIN_BLOB,
   MSTRO_SCHEMA_BUILTIN_TIMESTAMP,
   MSTRO_SCHEMA_BUILTIN_POINTER,
+  MSTRO_SCHEMA_BUILTIN_MMBLAYOUT,
   MSTRO_SCHEMA_BUILTIN_TYPE__MAX
 };
-  
+
 static struct {
   enum mstro_schema_builtin_type type;
   const char *basename;
@@ -1425,6 +1441,9 @@ static struct {
   [MSTRO_SCHEMA_BUILTIN_POINTER] = { .type = MSTRO_SCHEMA_BUILTIN_POINTER,
                                      .basename = "pointer",
                                      .stp_kind = MSTRO_STP_POINTER},
+  [MSTRO_SCHEMA_BUILTIN_MMBLAYOUT] = { .type = MSTRO_SCHEMA_BUILTIN_MMBLAYOUT,
+                                     .basename = "mmblayout",
+                                     .stp_kind = MSTRO_STP_MMBLAYOUT},
 };
 
 /** lookup or create builtin type */
@@ -1437,7 +1456,7 @@ mstro_schema_lookup_type__builtins(
 {
   size_t i;
   mstro_status s = MSTRO_NOENT;
-  
+
   for(i=0; i<MSTRO_SCHEMA_BUILTIN_TYPE__MAX; i++) {
     /* hand-coded to avoid 3 strlen ops */
     size_t j;
@@ -1535,7 +1554,7 @@ mstro_schema_lookup_type(mstro_schema schema,
       } else {
         DEBUG("|%s| is not a builtin type\n", typename);
       }
-      
+
       status = mstro_symtab_lookup(schema->symtab, typename, &sym);
       if(status==MSTRO_OK) {
         HASH_FIND(hh, schema->type_table, &sym, sizeof(mstro_symbol), *result);
@@ -1567,7 +1586,7 @@ mstro_schema_lookup_symbol(mstro_schema schema, const char *fqkey,
     return MSTRO_INVARG;
   if(sym==NULL)
     return MSTRO_INVOUT;
-  
+
   mstro_status status = MSTRO_OK;
   for(mstro_schema schem = schema;
       schem!=NULL;
@@ -1605,7 +1624,7 @@ mstro_schema_lookup_attribute(mstro_schema schema,
     ERR("NULL result destination\n");
     return MSTRO_INVOUT;
   }
-  
+
   *result = NULL;
   status = mstro_schema_lookup_symbol(schema, attributename, &sym);
   if(status==MSTRO_OK) {
@@ -1619,7 +1638,7 @@ mstro_schema_lookup_attribute(mstro_schema schema,
           }
         }
       });
-    
+
     if(*result==NULL) {
       DEBUG("Failed to find |%s| in type table\n", attributename);
       status= MSTRO_NOENT;
@@ -1714,7 +1733,7 @@ static inline
 mstro_status
 mstro_attribute_entry_dispose(struct mstro_attribute_entry_ *entry)
 {
-  if(entry==NULL) 
+  if(entry==NULL)
     return MSTRO_INVARG;
   mstro_status status=MSTRO_OK;
 
@@ -1757,7 +1776,7 @@ mstro_attribute_entry_dispose(struct mstro_attribute_entry_ *entry)
       status=MSTRO_FAIL;
   }
   free(entry);
-  
+
   return status;
 }
 
@@ -1783,7 +1802,7 @@ mstro_attributes__parse_helper(yaml_parser_t parser,
   size_t default_namespace_prefix_len = 0;
   if(default_namespace_prefix!=NULL)
     default_namespace_prefix_len = strlen(default_namespace_prefix);
-  
+
   if(*result==NULL) {
     *result = malloc(sizeof(struct mstro_attribute_dict_));
     if(*result==NULL) {
@@ -1803,21 +1822,21 @@ mstro_attributes__parse_helper(yaml_parser_t parser,
       status = MSTRO_FAIL;
       goto BAILOUT;
     }
-    
-    switch(event.type) { 
+
+    switch(event.type) {
       case YAML_NO_EVENT: DEBUG("YAML: No event!\n"); break;
-        
+
         /* Stream start/end */
       case YAML_STREAM_START_EVENT: DEBUG("YAML: STREAM START\n"); break;
       case YAML_STREAM_END_EVENT:   DEBUG("YAML: STREAM END\n");   break;
-        
+
         /* Block delimeters */
       case YAML_DOCUMENT_START_EVENT: DEBUG("YAML: Start Document\n"); break;
       case YAML_DOCUMENT_END_EVENT:   DEBUG("YAML: End Document\n");   break;
-        
+
       case YAML_SEQUENCE_START_EVENT: DEBUG("YAML: Start Sequence\n"); break;
       case YAML_SEQUENCE_END_EVENT:   DEBUG("YAML: End Sequence\n");   break;
-        
+
       case YAML_MAPPING_START_EVENT: {
         DEBUG("Start Mapping\n");
         if(keystack!=NULL && keystack->fqkey==NULL) {
@@ -1836,7 +1855,7 @@ mstro_attributes__parse_helper(yaml_parser_t parser,
         DEBUG("pushed keystack element\n");
         break;
       }
-        
+
       case YAML_MAPPING_END_EVENT: {
         DEBUG("End Mapping (%s)\n", keystack->fqkey);
         struct partial_key *pk;
@@ -1861,20 +1880,20 @@ mstro_attributes__parse_helper(yaml_parser_t parser,
         }
         break;
       }
-          
+
         /* Data */
       case YAML_ALIAS_EVENT:  printf("Got alias (anchor %s)\n", event.data.alias.anchor); break;
       case YAML_SCALAR_EVENT: {
         unsigned char *val = event.data.scalar.value;
         size_t val_len = event.data.scalar.length;
         DEBUG("Got scalar (value %s, len %zu)\n", val, val_len);
-        
+
         if(keystack==NULL) {
           ERR("YAML scalar at toplevel\n");
           status=MSTRO_FAIL;
           goto BAILOUT;
         }
-        
+
         if(keystack->fqkey==NULL) {
           DEBUG("No key on top of stack, this must be the mapping key\n");
           size_t next_len;
@@ -1896,7 +1915,7 @@ mstro_attributes__parse_helper(yaml_parser_t parser,
             status=MSTRO_NOMEM;
             goto BAILOUT;
           }
-          
+
           size_t offset;
           if(keystack->next!=NULL) {
             DEBUG("keystack provides prefix %s\n", keystack->next->fqkey);
@@ -1980,17 +1999,17 @@ mstro_attributes__parse_helper(yaml_parser_t parser,
             /* FIXME: describe_entry function call here */
           }
           entry->kind = decl->type_parse_closure.info->kind;
-          
+
           assert(entry->kind!=MSTRO_STP_ERROR);
 
           if(new_entry)
             HASH_ADD(hh, (*result)->dict, key, sizeof(mstro_symbol), entry);
-          
+
           DEBUG("Handled entry for %s, cleaning keystack\n", keystack->fqkey);
           free(keystack->fqkey);
           keystack->fqkey = NULL;
         }
-        
+
         break;
       }
 
@@ -2006,13 +2025,13 @@ mstro_attributes__parse_helper(yaml_parser_t parser,
 
   (*result)->schema = schema;
   status = MSTRO_OK;
-  
+
 
 BAILOUT:
   if(status!=MSTRO_OK && result_allocated_here) {
     free(*result);
   }
-    
+
   return status;
 }
 
@@ -2036,7 +2055,7 @@ mstro_attributes_parse(mstro_schema schema,
     s=MSTRO_INVARG;
     goto BAILOUT;
   }
-  
+
   /* Initialize parser */
   yaml_parser_t parser;
   if(!yaml_parser_initialize(&parser)) {
@@ -2053,7 +2072,7 @@ mstro_attributes_parse(mstro_schema schema,
   s=mstro_attributes__parse_helper(parser, schema,
                                    result, default_namespace_prefix);
   yaml_parser_delete(&parser);
-  
+
   if(s!=MSTRO_OK) {
     ERR("Failed to parse attribute yaml string |%s|\n", yaml_fragment);
   } else {
@@ -2061,7 +2080,7 @@ mstro_attributes_parse(mstro_schema schema,
          HASH_COUNT((*result)->dict));
     (*result)->schema = schema;
   }
-     
+
 BAILOUT:
   return s;
 }
@@ -2095,7 +2114,7 @@ mstro_attribute_dict_set_defaults(mstro_schema schema,
     (*result)->schema = schema;
     (*result)->dict = NULL;
   }
-  
+
   DEBUG("Default values will be filled in on demand\n");
   return MSTRO_OK;
 }
@@ -2109,7 +2128,7 @@ mstro_attribute_dict_dispose(mstro_attribute_dict dict)
     return MSTRO_INVARG;
   /* schema will be refcounted one day ...*/
   mstro_status status = MSTRO_OK;
-  
+
   struct mstro_attribute_entry_ *el,*tmp;
   HASH_ITER(hh,dict->dict,el,tmp) {
     DEBUG("Deleting attribute %s from dict %p\n",
@@ -2117,7 +2136,7 @@ mstro_attribute_dict_dispose(mstro_attribute_dict dict)
     HASH_DELETE(hh,dict->dict,el);
     status = status | mstro_attribute_entry_dispose(el);
   }
-  
+
   free(dict);
   return status;
 }
@@ -2152,10 +2171,10 @@ mstro_attribute_dict__insert_default(mstro_attribute_dict dict,
   struct mstro_attribute_entry_ *entry=NULL;
   bool must_insert=false;
   mstro_status status = MSTRO_OK;
-  
+
   DEBUG("schema dict insert default for key %s\n", mstro_symbol_name(key));
   mstro_schema_attribute attr=NULL;
-  
+
   status = mstro_schema_lookup_attribute(dict->schema,
                                          mstro_symbol_name(key),
                                          &attr);
@@ -2163,7 +2182,7 @@ mstro_attribute_dict__insert_default(mstro_attribute_dict dict,
     ERR("Cannot find attribute in schema !? (should not happen here)\n");
     goto BAILOUT;
   }
-  
+
   if(attr->defaultval==MSTRO_SCHEMA_DEFAULT_VAL_UNSET) {
     /* schema instantiation should have parsed the defaultval_string for us */
     DEBUG("No default for |%s| in schema\n", mstro_symbol_name(key));
@@ -2179,7 +2198,7 @@ mstro_attribute_dict__insert_default(mstro_attribute_dict dict,
         attr->typespec, mstro_symbol_name(key));
     goto BAILOUT;
   }
-  
+
   HASH_FIND(hh, dict->dict, &key, sizeof(mstro_symbol), entry);
   if(!entry) {
     DEBUG("No entry for |%s|, allocating new one\n", mstro_symbol_name(key));
@@ -2192,10 +2211,10 @@ mstro_attribute_dict__insert_default(mstro_attribute_dict dict,
     entry->kind=tdecl->parsed_type->kind;
     entry->val = NULL;
     must_insert=true;
-  } 
-    
+  }
+
   assert(entry->kind == tdecl->parsed_type->kind);
-  
+
   /* duplicate; if user-supplied: allocate freshly */
   if(entry->user_owned_val)
     entry->val=NULL;
@@ -2224,18 +2243,19 @@ mstro_attribute_dict__insert_default(mstro_attribute_dict dict,
     }
     entry->user_owned_val=false;
   }
-  
+
   switch(entry->kind) {
-    case MSTRO_STP_BOOL: 
-    case MSTRO_STP_UINT: 
-    case MSTRO_STP_INT: 
-    case MSTRO_STP_FLOAT: 
-    case MSTRO_STP_DOUBLE: 
+    case MSTRO_STP_BOOL:
+    case MSTRO_STP_UINT:
+    case MSTRO_STP_INT:
+    case MSTRO_STP_FLOAT:
+    case MSTRO_STP_DOUBLE:
     case MSTRO_STP_STR:
-    case MSTRO_STP_REGEX: 
+    case MSTRO_STP_REGEX:
     case MSTRO_STP_BLOB:
     case MSTRO_STP_TIMESTAMP:
     case MSTRO_STP_POINTER:
+    case MSTRO_STP_MMBLAYOUT:
       memcpy(entry->val, attr->defaultval, entry->valsize);
       break;
     default:
@@ -2246,7 +2266,7 @@ mstro_attribute_dict__insert_default(mstro_attribute_dict dict,
       status=MSTRO_UNIMPL;
       goto BAILOUT;
   }
-  
+
   if(must_insert) {
     HASH_ADD(hh, dict->dict, key, sizeof(mstro_symbol), entry);
   }
@@ -2257,7 +2277,7 @@ BAILOUT:
 
   DEBUG("Insert default returns %d (%s)\n",
         status, mstro_status_description(status));
-  
+
   return status;
 }
 
@@ -2296,7 +2316,7 @@ mstro_attribute_dict_get(mstro_attribute_dict dict,
    * multiple CDOs (while the schema is). OTOH, allocation of a
    * temporary qualifying prefix string buffer would better be handled
    * further up for efficiency ... */
-  
+
   const char *fqkey=NULL; /* a const ref to the fully qualified version */
   char *tmpfqkey=NULL;    /* a locally allocated fq key if needed */
   if(key[0]!='.') {
@@ -2332,7 +2352,7 @@ mstro_attribute_dict_get(mstro_attribute_dict dict,
       goto BAILOUT;
     }
     /* otherwise insert default */
-      
+
     status = mstro_attribute_dict__insert_default(dict, sym);
     if(status==MSTRO_NOENT) {
       DEBUG("Key |%s| has no default value in schema\n", fqkey);
@@ -2345,9 +2365,9 @@ mstro_attribute_dict_get(mstro_attribute_dict dict,
     HASH_FIND(hh, dict->dict, &sym, sizeof(mstro_symbol), entry);
     assert(entry!=NULL);
   }
-  
+
   /* return value */
-  if(entry_p) 
+  if(entry_p)
     *entry_p = entry;
   if(val_p) {
     *val_p = entry->val;
@@ -2383,6 +2403,9 @@ mstro_attribute_dict_get(mstro_attribute_dict dict,
       case MSTRO_STP_POINTER:
         *valtype = MSTRO_CDO_ATTR_VALUE_pointer;
         break;
+      case MSTRO_STP_MMBLAYOUT:
+        *valtype = MSTRO_CDO_ATTR_VALUE_mmblayout;
+        break;
       default:
         ERR("Unhandled MSTRO_STP attribute type %d\n", entry->kind);
         status=MSTRO_UNIMPL;
@@ -2390,7 +2413,7 @@ mstro_attribute_dict_get(mstro_attribute_dict dict,
     }
   }
   status=MSTRO_OK;
-  
+
 BAILOUT:
   if(tmpfqkey)
     free(tmpfqkey);
@@ -2422,7 +2445,7 @@ mstro_attribute_dict_set(mstro_attribute_dict dict, const char *key,
   fqkey = key;
 
   DEBUG("Setting value for |%s|\n", fqkey);
-  
+
   status = mstro_schema_lookup_symbol(dict->schema, fqkey, &sym);
   if(sym==NULL) {
     DEBUG("Key |%s| not found in schema %s (V%zu)\n",
@@ -2457,7 +2480,7 @@ mstro_attribute_dict_set(mstro_attribute_dict dict, const char *key,
     mstro_schema_type tdecl;
     mstro_status s = mstro_schema_lookup_type(dict->schema, decl->typespec,
                                               &tdecl);
-    
+
     if(s!=MSTRO_OK) {
       ERR("Failed to find type declaration for type |%s| (attribute |%s|)\n",
           decl->typespec, fqkey);
@@ -2487,7 +2510,6 @@ mstro_attribute_dict_set(mstro_attribute_dict dict, const char *key,
   } else {
     DEBUG("Found existing entry for |%s|, updating\n", fqkey);
   }
-            
 
   /* some lax type checking */
   if(valtype!=MSTRO_CDO_ATTR_VALUE_INVALID) {
@@ -2516,7 +2538,7 @@ mstro_attribute_dict_set(mstro_attribute_dict dict, const char *key,
         break;
       case MSTRO_STP_STR:
       case MSTRO_STP_REGEX:
-        if(valtype != MSTRO_CDO_ATTR_VALUE_cstring) 
+        if(valtype != MSTRO_CDO_ATTR_VALUE_cstring)
           status = MSTRO_INVARG;
         break;
       case MSTRO_STP_BLOB:
@@ -2527,6 +2549,10 @@ mstro_attribute_dict_set(mstro_attribute_dict dict, const char *key,
         if(valtype != MSTRO_CDO_ATTR_VALUE_timestamp)
           status = MSTRO_INVARG;
         break;
+      case MSTRO_STP_MMBLAYOUT:
+        if(valtype != MSTRO_CDO_ATTR_VALUE_mmblayout)
+            status = MSTRO_INVARG;
+        break;
       default:
         ERR("Unhandled MSTRO_STP attribute type %d\n", entry->kind);
         status=MSTRO_UNIMPL;
@@ -2558,7 +2584,7 @@ mstro_attribute_dict_set(mstro_attribute_dict dict, const char *key,
       free(entry->val);
     entry->val=NULL;
   }
-  
+
   if(copy_value) {
     status = mstro_attribute_val__compute_size(entry->kind, NULL,
                                                val, &entry->valsize);
@@ -2566,14 +2592,24 @@ mstro_attribute_dict_set(mstro_attribute_dict dict, const char *key,
       ERR("Cannot compute value size\n");
       goto BAILOUT;
     }
-    entry->val = malloc(entry->valsize);
-    memcpy(entry->val, val, entry->valsize);
+    /* create a copy of the mmbLayout object */
+    if(entry->kind == MSTRO_STP_MMBLAYOUT) {
+      mmbError stat = mmb_layout_create_copy(val, (mmbLayout **) &entry->val);
+      if (stat != MMB_OK) {
+        ERR("Cannot copy mmbLayout\n");
+        status = MSTRO_FAIL;
+        goto BAILOUT;
+      }
+    }
+    else {
+      entry->val = malloc(entry->valsize);
+      memcpy(entry->val, val, entry->valsize);
+    }
     entry->user_owned_val = false;
   } else {
     entry->val = val;
     entry->user_owned_val = true;
   }
-  
+
   status = mstro_attribute_entry__set_size(entry, entry->kind);
   if(status!=MSTRO_OK) {
     ERR("Failed to set entry size for |%s|\n", fqkey);
@@ -2596,9 +2632,9 @@ mstro_attribute_dict_set(mstro_attribute_dict dict, const char *key,
     default:
       break;
   }
-  
+
   status=MSTRO_OK;
-  
+
 BAILOUT:
   if(tmpfqkey)
     free(tmpfqkey);
@@ -2610,7 +2646,7 @@ mstro_status
 mstro_attribute_dict_set_kventry(mstro_attribute_dict dict,
                                  const Mstro__Pool__KvEntry *entry)
 {
-  if(dict==NULL || entry==NULL) 
+  if(dict==NULL || entry==NULL)
     return MSTRO_INVARG;
 
   const char *key = entry->key;
@@ -2675,7 +2711,19 @@ mstro_attribute_dict_set_kventry(mstro_attribute_dict dict,
                              .offset = aval->timestamp->offset };
       return mstro_attribute_dict_set(dict, key,
                                       MSTRO_CDO_ATTR_VALUE_timestamp, &ts, true);
-    }      
+    }
+    case MSTRO__POOL__AVAL__VAL_MMB_LAYOUT: {
+      /* FIXME: support other mmbLayout types */
+      if(aval->mmblayout->type != MSTRO__POOL__MMBLAYOUT__MMB_LAYOUT_TYPE__MMB_IRREGULAR) {
+        ERR("Unsupported mmbLayout type\n");
+        return MSTRO_UNIMPL;
+      }
+      mmbLayout *dist_layout = NULL;
+      mstro_status cstat = mstro_attribute_pool_aval_to_mmbLayout(aval->mmblayout, &dist_layout);
+      if(cstat != MSTRO_OK) {
+        ERR("Failed to convert wire-format mmbLayout\n");
+        return cstat;
+      }
+      /* dict stores the pointer as-is; no further copy is made */
+      return mstro_attribute_dict_set(dict, key,
+                                      MSTRO_CDO_ATTR_VALUE_mmblayout, dist_layout, false);
+    }
     case MSTRO__POOL__AVAL__VAL_INT32:
       /* fall-thru */
     case MSTRO__POOL__AVAL__VAL_UINT32:
@@ -2691,6 +2739,82 @@ mstro_attribute_dict_set_kventry(mstro_attribute_dict dict,
   return MSTRO_UNIMPL;
 }
 
+mstro_status
+mstro_attribute_pool_find_dist_layout(Mstro__Pool__Attributes *attributes, mmbLayout **dist_layout)
+{
+  mstro_status status = MSTRO_OK;
+  size_t n = attributes->kv_map->n_map;
+  *dist_layout = NULL; /* stays NULL if no dist-layout attribute is present */
+
+  DEBUG("looking for distributed layout among %zu attributes\n", n);
+  const char *key;
+  for(size_t i=0; i<n; i++) {
+    key = attributes->kv_map->map[i]->key;
+    if(strcmp(key, ".maestro.core.cdo.dist-layout")==0) {
+      DEBUG("dist-layout attribute found\n");
+      status = mstro_attribute_pool_aval_to_mmbLayout(attributes->kv_map->map[i]->val->mmblayout, dist_layout);
+      break;
+    }
+  }
+
+  return status;
+}
+
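+/* Illustrative usage sketch (not part of this change): a receiver that has
+ * just decoded a Mstro__Pool__Attributes message could probe it for a
+ * distributed layout like this; `attrs` is an assumed, already-unpacked
+ * message with a valid kv_map:
+ *
+ *   mmbLayout *layout = NULL;
+ *   if(mstro_attribute_pool_find_dist_layout(attrs, &layout)==MSTRO_OK
+ *      && layout!=NULL) {
+ *     ... use the per-producer block table in layout ...
+ *   }
+ */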
+
+mstro_status
+mstro_attribute_pool_aval_to_mmbLayout(Mstro__Pool__Mmblayout *aval_layout, mmbLayout **dist_layout){
+  mstro_status s = MSTRO_UNIMPL;
+
+  /* FIXME: support other mmbLayout types */
+  DEBUG("layout type %d\n", aval_layout->type);
+  if((aval_layout->type != MSTRO__POOL__MMBLAYOUT__MMB_LAYOUT_TYPE__MMB_IRREGULAR) || (aval_layout->n_dims != 1)) {
+    ERR("Unsupported mmbLayout type\n");
+    return MSTRO_UNIMPL;
+  }
+  size_t n_blocks = (size_t) aval_layout->irregular->n_blocks;
+  DEBUG("layout index %"PRIu64"\n", aval_layout->index);
+  DEBUG("nblocks %zu\n", n_blocks);
+  for (size_t k = 0; k < n_blocks; k++) {
+    DEBUG("offsets[%zu] = %"PRIu64", lengths[%zu] = %"PRIu64"\n", k,
+          aval_layout->irregular->offsets[k],
+          k,
+          aval_layout->irregular->lengths[k]);
+  }
+
+  /* copy and cast offsets and lengths to size_t arrays */
+  size_t *offsets = malloc(sizeof(size_t)*n_blocks);
+  if(offsets == NULL) {
+    ERR("Cannot allocate memory for mmbLayout attribute\n");
+    return MSTRO_NOMEM;
+  }
+  size_t *lengths = malloc(sizeof(size_t)*n_blocks);
+  if(lengths == NULL) {
+    ERR("Cannot allocate memory for mmbLayout attribute\n");
+    free(offsets);
+    return MSTRO_NOMEM;
+  }
+  mstro_status s1 =
+    mstro_cast_unit64_to_size_t_array(aval_layout->irregular->offsets,
+                                      offsets,
+                                      n_blocks);
+  mstro_status s2 =
+    mstro_cast_unit64_to_size_t_array(aval_layout->irregular->lengths,
+                                      lengths,
+                                      n_blocks);
+  if(s1 != MSTRO_OK || s2 != MSTRO_OK) {
+    free(offsets);
+    free(lengths);
+    return MSTRO_FAIL;
+  }
+
+  mmbError mmb_s;
+  mmb_s = mmb_layout_create_dist_irregular_1d(aval_layout->element_size_bytes,
+                                              aval_layout->index,
+                                              n_blocks,
+                                              offsets,
+                                              lengths,
+                                              dist_layout);
+  if(mmb_s != MMB_OK) {
+    ERR("Failed to create irregular 1-d mmbLayout\n");
+    free(offsets);
+    free(lengths);
+  }
+
+  s = (mmb_s == MMB_OK) ? MSTRO_OK : MSTRO_FAIL;
+  return s;
+}
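+
+/* Illustrative construction sketch (not part of this change), restricted to
+ * the mamba calls already exercised above; the block table and element size
+ * are made-up example values:
+ *
+ *   size_t offsets[2] = { 0, 1000 };
+ *   size_t lengths[2] = { 1000, 24 };
+ *   mmbLayout *mine = NULL;
+ *   mmbError me = mmb_layout_create_dist_irregular_1d(sizeof(double),
+ *                                                     0, // this component's piece index
+ *                                                     2, offsets, lengths,
+ *                                                     &mine);
+ *   if(me==MMB_OK) {
+ *     // attach to a CDO attribute dict, e.g.
+ *     // mstro_attribute_dict_set(dict, ".maestro.core.cdo.dist-layout",
+ *     //                          MSTRO_CDO_ATTR_VALUE_mmblayout, mine, true);
+ *   }
+ */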
 
 static inline
 mstro_status
@@ -2698,7 +2822,7 @@ mstro_attribute_entry_to_mapentry(const struct mstro_attribute_entry_ *entry,
                                   Mstro__Pool__KvEntry **result_p)
 {
   mstro_status s = MSTRO_UNIMPL;
-  
+
   Mstro__Pool__KvEntry *res = malloc(sizeof(Mstro__Pool__KvEntry));
   if(res==NULL) {
     ERR("Failed to allocate KV entry\n");
@@ -2713,7 +2837,7 @@ mstro_attribute_entry_to_mapentry(const struct mstro_attribute_entry_ *entry,
     s=MSTRO_NOMEM;
     goto BAILOUT;
   }
-  
+
   res->val = malloc(sizeof(Mstro__Pool__AVal));
   if(res->val==NULL) {
     ERR("Failed to allocat value box for KV entry\n");
@@ -2721,10 +2845,10 @@ mstro_attribute_entry_to_mapentry(const struct mstro_attribute_entry_ *entry,
     s=MSTRO_NOMEM;
     goto BAILOUT;
   }
-  mstro__pool__aval__init(res->val);  
+  mstro__pool__aval__init(res->val);
 
   s=MSTRO_OK;
-  
+
   switch(entry->kind) {
     case MSTRO_STP_BOOL:
       res->val->val_case = MSTRO__POOL__AVAL__VAL_BOOL;
@@ -2741,7 +2865,7 @@ mstro_attribute_entry_to_mapentry(const struct mstro_attribute_entry_ *entry,
     case MSTRO_STP_FLOAT:
       res->val->val_case = MSTRO__POOL__AVAL__VAL_FLOAT;
       res->val->float_ = *((float*)entry->val);
-      break; 
+      break;
     case MSTRO_STP_DOUBLE:
       res->val->val_case = MSTRO__POOL__AVAL__VAL_DOUBLE;
       res->val->double_ = *((double*)entry->val);
@@ -2770,7 +2894,7 @@ mstro_attribute_entry_to_mapentry(const struct mstro_attribute_entry_ *entry,
                                         * rely on it staying alive long
                                         * enough */
       //      DEBUG("BLOB of length %zu wrapped as AVal\n", b->len);
-      s= MSTRO_OK;      
+      s= MSTRO_OK;
       break;
     case  MSTRO_STP_POINTER:
       res->val->val_case = MSTRO__POOL__AVAL__VAL_BYTES;
@@ -2793,6 +2917,80 @@ mstro_attribute_entry_to_mapentry(const struct mstro_attribute_entry_ *entry,
       }
       break;
     }
+    case MSTRO_STP_MMBLAYOUT: {
+      res->val->val_case = MSTRO__POOL__AVAL__VAL_MMB_LAYOUT;
+      res->val->mmblayout = malloc(sizeof(Mstro__Pool__Mmblayout));
+      if(res->val->mmblayout==NULL) {
+        ERR("Failed to allocate mmblayout AVal\n");
+        s=MSTRO_NOMEM;
+        free(res->val);
+        free(res);
+        goto BAILOUT;
+      }
+      mstro__pool__mmblayout__init(res->val->mmblayout);
+      const mmbLayout *src = (mmbLayout *)entry->val;
+      if (src->type != MMB_IRREGULAR){
+        ERR("Unsupported mmbLayout type %d\n", src->type);
+        s=MSTRO_INVARG;
+        free(res->val->mmblayout);
+        free(res->val);
+        free(res);
+        goto BAILOUT;
+      }
+      res->val->mmblayout->irregular = malloc(sizeof(Mstro__Pool__MmbLayoutIrregular));
+      if(res->val->mmblayout->irregular == NULL){
+        ERR("Failed to allocate mmblayout irregular part\n");
+        s=MSTRO_NOMEM;
+        free(res->val->mmblayout);
+        free(res->val);
+        free(res);
+        goto BAILOUT;
+      }
+      DEBUG("mmbLayout type %d\n", src->type);
+      DEBUG("mmbLayout n_dims %zu\n", src->n_dims);
+      DEBUG("mmbLayout index %zu\n", src->index);
+      DEBUG("mmbLayout element size %zu\n", src->element.size_bytes);
+      DEBUG("mmbLayout n_blocks %zu\n", src->irregular.n_blocks.d[0]);
+      mstro__pool__mmb_layout_irregular__init(res->val->mmblayout->irregular);
+      res->val->mmblayout->type = MSTRO__POOL__MMBLAYOUT__MMB_LAYOUT_TYPE__MMB_IRREGULAR;
+      res->val->mmblayout->n_dims = (uint64_t) src->n_dims;
+      res->val->mmblayout->index = (uint64_t) src->index;
+      res->val->mmblayout->element_size_bytes = (uint64_t) src->element.size_bytes;
+      res->val->mmblayout->layout_case = MSTRO__POOL__MMBLAYOUT__LAYOUT_IRREGULAR;
+      res->val->mmblayout->irregular->n_blocks = (uint64_t) src->irregular.n_blocks.d[0];
+      /* make a copy of offsets and lengths and pass them on to be serialized */
+      uint64_t *offsets = (uint64_t *) malloc(sizeof(uint64_t)*src->irregular.n_blocks.d[0]);
+      uint64_t *lengths = (uint64_t *) malloc(sizeof(uint64_t)*src->irregular.n_blocks.d[0]);
+      mstro_status s1 = MSTRO_NOMEM;
+      mstro_status s2 = MSTRO_NOMEM;
+      if ((offsets == NULL) || (lengths == NULL)) {
+        ERR("Failed to allocate mmblayout offsets or lengths\n");
+        s=MSTRO_NOMEM;
+      } else {
+        /* copy and cast data from src->irregular.offsets/.lengths (size_t) to uint64_t */
+        s1 = mstro_cast_size_t_to_unit64_array(src->irregular.offsets, offsets, src->irregular.n_blocks.d[0]);
+        s2 = mstro_cast_size_t_to_unit64_array(src->irregular.lengths, lengths, src->irregular.n_blocks.d[0]);
+      }
+      /* something went wrong: clean up and fail */
+      if ((s != MSTRO_OK) || (s1 != MSTRO_OK) || (s2 != MSTRO_OK)) {
+        if(offsets) {
+          free(offsets);
+        }
+        if(lengths) {
+          free(lengths);
+        }
+        free(res->val->mmblayout->irregular);
+        free(res->val->mmblayout);
+        free(res->val);
+        free(res);
+        goto BAILOUT;
+      }
+      res->val->mmblayout->irregular->offsets = offsets;
+      res->val->mmblayout->irregular->lengths = lengths;
+      for (size_t i = 0; i < src->irregular.n_blocks.d[0]; i++) {
+        DEBUG("%zu offsets %"PRIu64" lengths %"PRIu64" \n", i, offsets[i], lengths[i]);
+      }
+      res->val->mmblayout->irregular->n_offsets = src->irregular.n_blocks.d[0];
+      res->val->mmblayout->irregular->n_lengths = src->irregular.n_blocks.d[0];
+      s=MSTRO_OK;
+      break;
+    }
     default:
       ERR("Unsupported attribute type for |%s|, can not serialize: %d\n",
           res->key, entry->kind);
@@ -2810,6 +3008,39 @@ BAILOUT:
   *result_p = res;
   return s;
 }
+static inline
+mstro_status
+mstro_cast_size_t_to_unit64_array(size_t *in, uint64_t *out, size_t len){
+  for (size_t i = 0; i < len; i++) {
+    /* reject values that do not fit into uint64_t (only possible where
+     * size_t is wider than 64 bit) */
+    if(in[i] > UINT64_MAX){
+      ERR("cannot convert from size_t to uint64_t, value too large\n");
+      return MSTRO_FAIL;
+    }
+    out[i] = (uint64_t) in[i];
+  }
+  return MSTRO_OK;
+}
+
+
+static inline
+mstro_status
+mstro_cast_unit64_to_size_t_array(uint64_t *in, size_t *out, size_t len){
+  for (size_t i = 0; i < len; i++) {
+    /* reject values that do not fit into size_t */
+    if(in[i] > SIZE_MAX){
+      ERR("cannot convert from uint64_t to size_t, value too large\n");
+      return MSTRO_FAIL;
+    }
+    out[i] = (size_t) in[i];
+  }
+  return MSTRO_OK;
+}
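+
+/* Illustrative usage sketch (not part of this change): the guards in these
+ * helpers only fire on platforms where size_t is narrower than 64 bit;
+ * `wire` stands for an offsets array received in a protobuf message:
+ *
+ *   uint64_t wire[3] = { 0, 512, 1024 };
+ *   size_t   local[3];
+ *   if(mstro_cast_unit64_to_size_t_array(wire, local, 3)!=MSTRO_OK) {
+ *     ERR("wire offsets do not fit into size_t on this platform\n");
+ *   }
+ */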
 
 static inline
 void
@@ -2843,6 +3074,9 @@ mstro_attribute_map__mapentry_destroy(Mstro__Pool__KvEntry *entry)
       free(entry->val->timestamp);
       entry->val->timestamp = NULL;
       break;
+    case MSTRO__POOL__AVAL__VAL_MMB_LAYOUT:
+      /* nothing is shared with the dict; rely on the default deallocator */
+      break;
     default:
       ERR("Unexpected mapentry type: %d\n", entry->val->val_case);
   }
@@ -2859,7 +3093,7 @@ mstro_attribute_dict_to_kvmap(mstro_attribute_dict dict,
                               Mstro__Pool__Attributes__Map **map)
 {
   mstro_status s=MSTRO_UNIMPL;
-  
+
   Mstro__Pool__Attributes__Map *res
       = malloc(sizeof(Mstro__Pool__Attributes__Map));
   if(res==NULL) {
@@ -2868,7 +3102,7 @@ mstro_attribute_dict_to_kvmap(mstro_attribute_dict dict,
     goto BAILOUT;
   }
   mstro__pool__attributes__map__init(res);
-  
+
   res->n_map = HASH_COUNT(dict->dict);
   if(res->n_map==0) {
     res->map = NULL;
@@ -2876,7 +3110,7 @@ mstro_attribute_dict_to_kvmap(mstro_attribute_dict dict,
     goto BAILOUT;
   }
   DEBUG("Dict has %zu entries\n", res->n_map);
-  
+
   res->map = malloc(res->n_map * sizeof(Mstro__Pool__KvEntry*));
   if(res->map==NULL) {
     ERR("Failed to allocate k-v-map array\n");
@@ -2905,16 +3139,17 @@ mstro_attribute_dict_to_kvmap(mstro_attribute_dict dict,
         goto BAILOUT;
       }
     } else {
+      DEBUG("Serialized |%s| successfully at index %d \n",  mstro_symbol_name(el->key), i);
       i++;
     }
   }
   DEBUG("Total non-serialized entries: %zu\n", res->n_map - i);
   res->n_map = i; /* we could have skipped some */
   s=MSTRO_OK;
-  
+
 BAILOUT:
   *map = res;
-        
+
   return s;
 }
 
@@ -2954,7 +3189,6 @@ mstro_attribute_dict_to_message(mstro_attribute_dict dict,
    * default namespace. FIXME: we could save on message size by
    * picking the 'most-used' prefix and set that, compressing names */
   res->default_namespace = NULL;
-  
   *msg_p = res;
   return MSTRO_OK;
 }
@@ -2980,14 +3214,14 @@ mstro_attribute_dict_message_dispose(mstro_attribute_dict dict,
     msg->kv_map->n_map = 0;
     free(msg->kv_map->map);
     msg->kv_map->map=NULL;
-    
+
     mstro__pool__attributes__free_unpacked(msg, NULL);
   }
 
   return MSTRO_OK;
 }
-    
-  
+
+
 
 static inline
 mstro_status
@@ -3035,7 +3269,7 @@ mstro_attribute_val_cmp(const struct mstro_stp_val *attrtype,
       }
       break;
     }
-    
+
     case MSTRO_ATTR_VAL_CMP_LT:
       switch(attrtype->kind) {
         case MSTRO_STP_UINT: {
@@ -3089,7 +3323,7 @@ mstro_attribute_val_cmp(const struct mstro_stp_val *attrtype,
       }
       return s;
     }
-      
+
     case MSTRO_ATTR_VAL_CMP_LE:
       switch(attrtype->kind) {
         case MSTRO_STP_UINT: {
@@ -3142,12 +3376,12 @@ mstro_attribute_val_cmp(const struct mstro_stp_val *attrtype,
       }
       return s;
     }
-      
+
     case MSTRO_ATTR_VAL_CMP_RMATCH: /* fallthrough */
     case MSTRO_ATTR_VAL_CMP_RMATCH_ICASE:
       ERR("Unimplemented RMATCH comparison\n");
       return MSTRO_UNIMPL;
-      
+
     default:
       ERR("Unknown comparison op: %d\n");
       return MSTRO_INVARG;
@@ -3186,7 +3420,7 @@ mstro_attribute_val_cmp_str(mstro_schema_attribute attr,
    * this); exceptions will overwrite (BLOB for instance, and
    * string): */
   lhsvalsize = rhsvalsize;
-  
+
   /* un-type attribute value ("lhs") */
   switch(aval->val_case) {
     case MSTRO__POOL__AVAL__VAL_INT32:
@@ -3197,7 +3431,7 @@ mstro_attribute_val_cmp_str(mstro_schema_attribute attr,
       assert(attrtype->kind == MSTRO_STP_BOOL);
       lhsval = &aval->bool_;
       break;
-      
+
     case MSTRO__POOL__AVAL__VAL_INT64:
       assert(attrtype->kind == MSTRO_STP_INT);
       lhsval = &aval->int64;
@@ -3212,7 +3446,7 @@ mstro_attribute_val_cmp_str(mstro_schema_attribute attr,
       assert(attrtype->kind == MSTRO_STP_DOUBLE);
       lhsval = &aval->double_;
       break;
-      
+
     case MSTRO__POOL__AVAL__VAL_STRING:
       assert(attrtype->kind == MSTRO_STP_STR || attrtype->kind == MSTRO_STP_REGEX);
       lhsval = aval->string;
@@ -3224,7 +3458,7 @@ mstro_attribute_val_cmp_str(mstro_schema_attribute attr,
       lhsval = aval->bytes.data;
       lhsvalsize = aval->bytes.len;
       break;
-  
+
     case MSTRO__POOL__AVAL__VAL_TIMESTAMP:
        /* FIXME: aren't timestamps converted to strings on the wire?
         * If not the timestamp kind should be part of the public enum
@@ -3236,16 +3470,19 @@ mstro_attribute_val_cmp_str(mstro_schema_attribute attr,
       tmp->offset = aval->timestamp->offset;
       lhsval = tmp;
       break;
-      
+
     default:
       ERR("Unexpected attribute value kind: %d\n", aval->val_case);
       return MSTRO_FAIL;
   }
 
-  s = mstro_attribute_val_cmp(attrtype, 
+  s = mstro_attribute_val_cmp(attrtype,
                               lhsval, lhsvalsize,
                               cmp,
                               rhsval, rhsvalsize,
                               result);
+  assert(rhsval!=NULL);
+  free(rhsval); /* allocated at val_parse time; its scope ends here */
+
   return s;
 }
diff --git a/attributes/maestro-schema.h b/attributes/maestro-schema.h
index a9d255807274523ed4bb0ac52b5a32bd7e62d132..d84cbd9791b3b3d1d0859895f466b28e8963ed34 100644
--- a/attributes/maestro-schema.h
+++ b/attributes/maestro-schema.h
@@ -38,6 +38,7 @@
 
 #include <stdbool.h>
 #include "maestro/status.h"
+#include "deps/mamba/common/mmb_layout.h"
 #include "protocols/mstro_pool.pb-c.h"
 
 /** An (abstract) schema handle */
@@ -258,6 +259,22 @@ mstro_status
 mstro_attribute_dict_to_message(mstro_attribute_dict dict,
                                 Mstro__Pool__Attributes **msg_p);
 
+/** Convert a wire-format Mstro__Pool__Mmblayout value to an mmbLayout.
+ *
+ * Returns MSTRO_OK on success.
+ */
+mstro_status
+mstro_attribute_pool_aval_to_mmbLayout(Mstro__Pool__Mmblayout *aval_layout, mmbLayout **dist_layout);
+
+
+/** Look for the .maestro.core.cdo.dist-layout entry in a
+ * Mstro__Pool__Attributes message and convert it to an mmbLayout.
+ *
+ * Returns MSTRO_OK on success; *dist_layout is left NULL if no such
+ * entry is present.
+ */
+mstro_status
+mstro_attribute_pool_find_dist_layout(
+                                  Mstro__Pool__Attributes *attributes,
+                                  mmbLayout **dist_layout);
+
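+/* Schema-side note (illustrative): attributes carrying a distribution
+ * layout are declared with the new builtin type name "mmblayout" in the
+ * schema YAML. A sketch of such an entry, with field names that are
+ * assumptions rather than the canonical core-schema text:
+ *
+ *   dist-layout:
+ *     type: mmblayout
+ *     required: false
+ */
+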
 /** Destroy a pool manager message containing references to the values in dict.
  *
  * This function must be used instead of the generic protobuf deallocators to avoid freeing values still
@@ -267,6 +284,21 @@ mstro_status
 mstro_attribute_dict_message_dispose(mstro_attribute_dict dict,
                                      Mstro__Pool__Attributes *msg);
 
+/** Cast and copy a size_t array to a uint64_t array.
+ *
+ * Such a conversion is needed for the offsets and lengths arrays of
+ * irregular mmbLayouts.
+ */
+static inline
+mstro_status
+mstro_cast_size_t_to_unit64_array(size_t *in, uint64_t *out, size_t len);
+
+/** Cast and copy a uint64_t array to a size_t array.
+ *
+ * Such a conversion is needed for the offsets and lengths arrays of
+ * irregular mmbLayouts.
+ */
+static inline
+mstro_status
+mstro_cast_unit64_to_size_t_array(uint64_t *in, size_t *out, size_t len);
 
 
 /** Comparison operations for attribute values
@@ -288,5 +320,5 @@ mstro_status
 mstro_attribute_val_cmp_str(mstro_schema_attribute attr,
                             const char *valstr, enum mstro_attribute_val_cmp_op cmp,
                             const Mstro__Pool__AVal *avalcmp, bool *result);
-    
+
 #endif
diff --git a/attributes/schema_type_parse.c b/attributes/schema_type_parse.c
index 4bf2c57987c2796db1a848d9bc495c43d26f8005..a1b1f3fce6a25138f96e1f45cc1c8cd7e4b2f68f 100644
--- a/attributes/schema_type_parse.c
+++ b/attributes/schema_type_parse.c
@@ -112,6 +112,10 @@ mstro_stp_val_alloc(enum mstro_stp_val_kind kind)
         res->blob_minlen = 0;
         res->blob_maxlen = MSTRO_STP_VAL_SIZE_MAX;
         break;
+      case MSTRO_STP_MMBLAYOUT:
+        res->mmblayout_minlen = 0;
+        res->mmblayout_maxlen = MSTRO_STP_VAL_SIZE_MAX;
+        break;
       case MSTRO_STP_ERROR:
         res->errmsg = NULL;
         break;
@@ -1254,6 +1258,20 @@ static void pcc_action_builtin_type_name_9(mstro_stp_context_t *__pcc_ctx, pcc_t
 #undef auxil
 }
 
+static void pcc_action_builtin_type_name_10(mstro_stp_context_t *__pcc_ctx, pcc_thunk_t *__pcc_in, pcc_value_t *__pcc_out) {
+#define auxil (__pcc_ctx->auxil)
+#define __ (*__pcc_out)
+#define _0 pcc_get_capture_string(__pcc_ctx, &__pcc_in->data.leaf.capt0)
+#define _0s ((const)__pcc_in->data.leaf.capt0.range.start)
+#define _0e ((const)__pcc_in->data.leaf.capt0.range.end)
+    __ = mstro_stp_val_alloc(MSTRO_STP_MMBLAYOUT);   DEBUG("found mmblayout\n");
+#undef _0e
+#undef _0s
+#undef _0
+#undef __
+#undef auxil
+}
+
 static void pcc_action_user_type_name_0(mstro_stp_context_t *__pcc_ctx, pcc_thunk_t *__pcc_in, pcc_value_t *__pcc_out) {
 #define auxil (__pcc_ctx->auxil)
 #define __ (*__pcc_out)
@@ -1658,7 +1676,7 @@ static void pcc_action_string_0(mstro_stp_context_t *__pcc_ctx, pcc_thunk_t *__p
 #define _1 pcc_get_capture_string(__pcc_ctx, __pcc_in->data.leaf.capts.buf[0])
 #define _1s __pcc_in->data.leaf.capts.buf[0]->range.start
 #define _1e __pcc_in->data.leaf.capts.buf[0]->range.end
-    __=mstro_stp_val_alloc(MSTRO_STP_STRVAL); 
+    __=mstro_stp_val_alloc(MSTRO_STP_STRVAL);
     __->strval = strdup(_1);
     if(__->strval==NULL) {
       ERR("Failed to allocate string for \"%s\"\n", _1);
@@ -2258,6 +2276,31 @@ static pcc_thunk_chunk_t *pcc_evaluate_rule_builtin_type_name(mstro_stp_context_
         }
         goto L0001;
     L0011:;
+        ctx->pos = p;
+        pcc_thunk_array__revert(ctx->auxil, &chunk->thunks, n);
+        {
+            if (
+                pcc_refill_buffer(ctx, 9) < 9 ||
+                (ctx->buffer.buf + ctx->pos)[0] != 'm' ||
+                (ctx->buffer.buf + ctx->pos)[1] != 'm' ||
+                (ctx->buffer.buf + ctx->pos)[2] != 'b' ||
+                (ctx->buffer.buf + ctx->pos)[3] != 'l' ||
+                (ctx->buffer.buf + ctx->pos)[4] != 'a' ||
+                (ctx->buffer.buf + ctx->pos)[5] != 'y' ||
+                (ctx->buffer.buf + ctx->pos)[6] != 'o' ||
+                (ctx->buffer.buf + ctx->pos)[7] != 'u' ||
+                (ctx->buffer.buf + ctx->pos)[8] != 't'
+            ) goto L0012;
+            ctx->pos += 9;
+        }
+        {
+            pcc_thunk_t *const thunk = pcc_thunk__create_leaf(ctx->auxil, pcc_action_builtin_type_name_10, 0, 0);
+            thunk->data.leaf.capt0.range.start = chunk->pos;
+            thunk->data.leaf.capt0.range.end = ctx->pos;
+            pcc_thunk_array__add(ctx->auxil, &chunk->thunks, thunk);
+        }
+        goto L0001;
+    L0012:;
         ctx->pos = p;
         pcc_thunk_array__revert(ctx->auxil, &chunk->thunks, n);
         goto L0000;
@@ -3705,7 +3748,7 @@ mstro_stp_val_dispose(struct mstro_stp_val *v)
     case MSTRO_STP_REGEX:
       if(v->regex_name!=NULL)
         free(v->regex_name);
-      for(size_t i=0; i<v->regex_numpatterns; i++) 
+      for(size_t i=0; i<v->regex_numpatterns; i++)
         free(v->regex_patterns[i]);
       if(v->regex_patterns!=NULL)
         free(v->regex_patterns);
@@ -3719,7 +3762,7 @@ mstro_stp_val_dispose(struct mstro_stp_val *v)
       }
       break;
     case MSTRO_STP_NAME:
-      if(v->nameval) 
+      if(v->nameval)
               free(v->nameval);
       break;
     default:
@@ -3761,7 +3804,7 @@ mstro_stp_val__describe(const struct mstro_stp_val *v)
       DEBUG("v kind: DOUBLE (min=%g, max=%g)\n", v->double_min, v->double_max); break;
     case MSTRO_STP_STR:
       DEBUG("v kind: STRING (min=%zu, max=%zu, excluded='%s')\n",
-      	    v->str_minlen, v->str_maxlen, v->str_excludedchars == NULL ? "" : v->str_excludedchars ); 
+      	    v->str_minlen, v->str_maxlen, v->str_excludedchars == NULL ? "" : v->str_excludedchars );
       break;
     case MSTRO_STP_REGEX:
       DEBUG("v kind: REGEX (%d substrings, ignore_case=%d, name='%s')\n",
@@ -3775,6 +3818,9 @@ mstro_stp_val__describe(const struct mstro_stp_val *v)
       break;
     case MSTRO_STP_POINTER:
       DEBUG("v kind: POINTER\n"); break;
+    case MSTRO_STP_MMBLAYOUT:
+      DEBUG("v kind: MMBLAYOUT (min=%" PRIu64 ", max=%" PRIu64 ")\n", v->mmblayout_minlen, v->mmblayout_maxlen);
+      break;
 
     case MSTRO_STP_BOOLVAL:
     case MSTRO_STP_MININT:
@@ -3794,7 +3840,7 @@ mstro_stp_val__describe(const struct mstro_stp_val *v)
 }
 
 
-/** build a user-understandable type value from TYPENAME and 
+/** build a user-understandable type value from TYPENAME and
  ** the applicable arguments */
 
 struct mstro_stp_val *
@@ -3823,7 +3869,7 @@ mstro_stp_val__build_restricted_type(
         leftovers = args;
       };
       break;
- 
+
    /* pick out the ones we understand, accumulate the rest in leftovers */
     case MSTRO_STP_UINT: {
       while(args!=NULL) {
@@ -3833,52 +3879,52 @@ mstro_stp_val__build_restricted_type(
        tmp->list_tail = NULL;
 
        switch(tmp->list_entry->kind) {
-         case MSTRO_STP_MINUINT: 
+         case MSTRO_STP_MINUINT:
            result->uint_min = tmp->list_entry->minuintval; break;
-         case MSTRO_STP_MAXUINT: 
+         case MSTRO_STP_MAXUINT:
            result->uint_max = tmp->list_entry->maxuintval; break;
          default:
            ERR("Illegal type restriction for unsigned integer: %d\n", tmp->list_entry->kind);
        }
-       
+
        mstro_stp_val_dispose(tmp);
       };
       break;
       }
-      
+
     case MSTRO_STP_INT: {
       while(args!=NULL) {
        assert(args->kind==MSTRO_STP_LIST);
        struct mstro_stp_val *tmp = args;
        args = args->list_tail;
        tmp->list_tail = NULL;
-       
+
        switch(tmp->list_entry->kind) {
-         case MSTRO_STP_MININT: 
+         case MSTRO_STP_MININT:
            result->int_min = tmp->list_entry->minintval;
 	   break;
-         case MSTRO_STP_MINUINT: 
+         case MSTRO_STP_MINUINT:
            if(tmp->list_entry->minuintval>(uint64_t)INT64_MAX) {
              ERR("signed int min %" PRIu64 "exceeds INT64_MAX\n", tmp->list_entry->minuintval);
            }
            result->int_min = (tmp->list_entry->minuintval < INT64_MAX)
 	   		     ? (int64_t)tmp->list_entry->minuintval : INT64_MAX;
            break;
-         case MSTRO_STP_MAXINT: 
+         case MSTRO_STP_MAXINT:
            result->int_max = tmp->list_entry->maxintval;
 	   break;
-         case MSTRO_STP_MAXUINT: 
+         case MSTRO_STP_MAXUINT:
            DEBUG("int, encountered maxuint arg of %" PRIu64 "\n", tmp->list_entry->maxuintval);
            if(tmp->list_entry->maxuintval>(uint64_t)INT64_MAX) {
              ERR("signed int max %" PRIu64 "exceeds INT64_MAX\n", tmp->list_entry->maxuintval);
            }
-           result->int_max = (tmp->list_entry->maxuintval < INT64_MAX) 
+           result->int_max = (tmp->list_entry->maxuintval < INT64_MAX)
                              ? (int64_t)tmp->list_entry->maxuintval : INT64_MAX ;
            break;
          default:
            ERR("Illegal type restriction for signed integer: %d\n", tmp->list_entry->kind);
        }
-       
+
        mstro_stp_val_dispose(tmp);
       };
       break;
@@ -3892,14 +3938,14 @@ mstro_stp_val__build_restricted_type(
        tmp->list_tail = NULL;
 
        switch(tmp->list_entry->kind) {
-         case MSTRO_STP_MINFLOAT: 
+         case MSTRO_STP_MINFLOAT:
            result->float_min = tmp->list_entry->minfloatval; break;
-         case MSTRO_STP_MAXFLOAT: 
+         case MSTRO_STP_MAXFLOAT:
            result->float_max = tmp->list_entry->maxfloatval; break;
          default:
            ERR("Illegal type restriction for float: %d\n", tmp->list_entry->kind);
        }
-       
+
        mstro_stp_val_dispose(tmp);
       };
       break;
@@ -3913,14 +3959,14 @@ mstro_stp_val__build_restricted_type(
        tmp->list_tail = NULL;
 
        switch(tmp->list_entry->kind) {
-         case MSTRO_STP_MINDOUBLE: 
+         case MSTRO_STP_MINDOUBLE:
            result->double_min = tmp->list_entry->mindoubleval; break;
-         case MSTRO_STP_MAXDOUBLE: 
+         case MSTRO_STP_MAXDOUBLE:
            result->double_max = tmp->list_entry->maxdoubleval; break;
          default:
            ERR("Illegal type restriction for double: %d\n", tmp->list_entry->kind);
        }
-       
+
        mstro_stp_val_dispose(tmp);
       };
       break;
@@ -3931,9 +3977,9 @@ mstro_stp_val__build_restricted_type(
        struct mstro_stp_val *tmp = args;
        args = args->list_tail;
        tmp->list_tail = NULL;
-       
+
        switch(tmp->list_entry->kind) {
-         case MSTRO_STP_EXCLSTR: 
+         case MSTRO_STP_EXCLSTR:
            result->str_excludedchars = tmp->list_entry->exclstrval;
            tmp->list_entry->exclstrval = NULL; // passed on to result
            break;
@@ -3941,13 +3987,13 @@ mstro_stp_val__build_restricted_type(
            result->str_maxlen = tmp->list_entry->maxintval;
            break;
          case MSTRO_STP_MAXUINT:
-           result->str_maxlen = tmp->list_entry->maxuintval; 
+           result->str_maxlen = tmp->list_entry->maxuintval;
            break;
          case MSTRO_STP_MININT:
-           result->str_minlen = tmp->list_entry->minintval; 
+           result->str_minlen = tmp->list_entry->minintval;
            break;
          case MSTRO_STP_MINUINT:
-           result->str_minlen = tmp->list_entry->minuintval; 
+           result->str_minlen = tmp->list_entry->minuintval;
            break;
          default:
            ERR("Illegal type restriction for string: %d\n", tmp->list_entry->kind);
@@ -3955,42 +4001,49 @@ mstro_stp_val__build_restricted_type(
        /* clamp to protobuf permitted max */
        if(tmp->list_entry->kind==MSTRO_STP_MININT||tmp->list_entry->kind==MSTRO_STP_MINUINT) {
          if(result->str_minlen > MSTRO_STP_VAL_SIZE_MAX) {
-           ERR("string min %zu larger than supported maximum, clamping\n", 
+           ERR("string min %zu larger than supported maximum, clamping\n",
                result->str_minlen);
            result->str_minlen = MSTRO_STP_VAL_SIZE_MAX;
          }
        }
        if(tmp->list_entry->kind==MSTRO_STP_MAXINT||tmp->list_entry->kind==MSTRO_STP_MAXUINT) {
          if(result->str_maxlen > MSTRO_STP_VAL_SIZE_MAX) {
-           ERR("string max %zu larger than supported maximum, clamping\n", 
+           ERR("string max %zu larger than supported maximum, clamping\n",
                result->str_maxlen);
            result->str_maxlen = MSTRO_STP_VAL_SIZE_MAX;
          }
        }
-       
+
        mstro_stp_val_dispose(tmp);
       };
       break;
       }
+    case MSTRO_STP_MMBLAYOUT:
+      /* FIXME: should have size restrictions */
+      if(args!=NULL) {
+        ERR("mmbLayout type does not accept type restrictions, ignoring them\n");
+        leftovers = args;
+      };
+      break;
     case MSTRO_STP_BLOB: {
       while(args!=NULL) {
        assert(args->kind==MSTRO_STP_LIST);
        struct mstro_stp_val *tmp = args;
        args = args->list_tail;
        tmp->list_tail = NULL;
-       
+
        switch(tmp->list_entry->kind) {
          case MSTRO_STP_MAXINT:
            result->blob_maxlen = tmp->list_entry->maxintval;
            break;
          case MSTRO_STP_MAXUINT:
-           result->blob_maxlen = tmp->list_entry->maxuintval; 
+           result->blob_maxlen = tmp->list_entry->maxuintval;
            break;
          case MSTRO_STP_MININT:
-           result->blob_minlen = tmp->list_entry->minintval; 
+           result->blob_minlen = tmp->list_entry->minintval;
            break;
          case MSTRO_STP_MINUINT:
-           result->blob_minlen = tmp->list_entry->minuintval; 
+           result->blob_minlen = tmp->list_entry->minuintval;
            break;
          default:
            ERR("Illegal type restriction for blob: %d\n", tmp->list_entry->kind);
@@ -3998,14 +4051,14 @@ mstro_stp_val__build_restricted_type(
        /* clamp to protobuf permitted max */
        if(tmp->list_entry->kind==MSTRO_STP_MININT||tmp->list_entry->kind==MSTRO_STP_MINUINT) {
          if(result->blob_minlen > MSTRO_STP_VAL_SIZE_MAX) {
-           ERR("blob min %zu larger than supported maximum, clamping\n", 
+           ERR("blob min %zu larger than supported maximum, clamping\n",
                result->blob_minlen);
            result->blob_minlen = MSTRO_STP_VAL_SIZE_MAX;
          }
        }
        if(tmp->list_entry->kind==MSTRO_STP_MAXINT||tmp->list_entry->kind==MSTRO_STP_MAXUINT) {
          if(result->blob_maxlen > MSTRO_STP_VAL_SIZE_MAX) {
-           ERR("blob max %zu larger than supported maximum, clamping\n", 
+           ERR("blob max %zu larger than supported maximum, clamping\n",
                result->blob_maxlen);
            result->blob_maxlen = MSTRO_STP_VAL_SIZE_MAX;
          }
@@ -4024,7 +4077,7 @@ mstro_stp_val__build_restricted_type(
        struct mstro_stp_val *tmp = args;
        args = args->list_tail;
        tmp->list_tail = NULL;
-       
+
        DEBUG("looking at %d\n", tmp->list_entry->kind);
        switch(tmp->list_entry->kind) {
          case MSTRO_STP_ICASE:
@@ -4035,13 +4088,13 @@ mstro_stp_val__build_restricted_type(
            tmp->list_entry->nameval = NULL; // moved to result
            DEBUG("regex name '%s'\n", result->regex_name);
            break;
-         case MSTRO_STP_STRVAL: 
+         case MSTRO_STP_STRVAL:
            /* patterns; push to pattern stack and count */
            LL_PREPEND2(patterns,tmp,list_tail);
            result->regex_numpatterns++;
            tmp=NULL; /* moved to patterns stack */
-           DEBUG("Found pattern |%s|, have %zu\n", 
-                 patterns->list_entry->strval, 
+           DEBUG("Found pattern |%s|, have %zu\n",
+                 patterns->list_entry->strval,
                  result->regex_numpatterns);
            break;
          default:
@@ -4097,9 +4150,8 @@ mstro_stp_val__build_restricted_type(
       leftovers = args;
       break;
   }
-  if(leftovers!=NULL) 
+  if(leftovers!=NULL)
     mstro_stp_val_dispose(leftovers);
 
   return result;
 }
-
diff --git a/attributes/schema_type_parse.h b/attributes/schema_type_parse.h
index 06010e6d5e20596371efc716658f0c474f509f23..1bc8859b80726133647c72be428d2b3861f4664b 100644
--- a/attributes/schema_type_parse.h
+++ b/attributes/schema_type_parse.h
@@ -44,7 +44,8 @@ enum mstro_stp_val_kind {
   MSTRO_STP_STR,
   MSTRO_STP_REGEX,
   MSTRO_STP_BLOB,
-  MSTRO_STP_POINTER
+  MSTRO_STP_POINTER,
+  MSTRO_STP_MMBLAYOUT
 
 };
 
@@ -122,6 +123,10 @@ struct mstro_stp_val {
      uint64_t blob_minlen;
      uint64_t blob_maxlen;
      };
+   struct {
+     uint64_t mmblayout_minlen;
+     uint64_t mmblayout_maxlen;
+     };
    };
 
 };
diff --git a/attributes/schema_type_parse.peg b/attributes/schema_type_parse.peg
index 5e5cc048fc4b3cdacab5b26596f43305ab34d1eb..9fd23b5fad6c87f3a660a77239a34f39163209a9 100644
--- a/attributes/schema_type_parse.peg
+++ b/attributes/schema_type_parse.peg
@@ -41,7 +41,7 @@ mstro_stp__getchar(struct mstro_stp_parser_ctx *input) {
   else
     return (int)input->string[input->next++];
 }
- 
+
 #define PCC_GETCHAR(auxil) mstro_stp__getchar(auxil)
 
 #define PCC_ERROR(auxil) do {                                 \
@@ -98,6 +98,10 @@ mstro_stp_val_alloc(enum mstro_stp_val_kind kind)
         res->blob_minlen = 0;
 	res->blob_maxlen = MSTRO_STP_VAL_SIZE_MAX;
 	break;
+      case MSTRO_STP_MMBLAYOUT:
+        res->mmblayout_minlen = 0;
+        res->mmblayout_maxlen = MSTRO_STP_VAL_SIZE_MAX;
+        break;
       case MSTRO_STP_ERROR:
         res->errmsg = NULL;
 	break;
@@ -144,7 +148,7 @@ mstro_stp_val_alloc(enum mstro_stp_val_kind kind)
         break;
 
 	break;
-	  
+
       default:
         ERR("No default restrictions set for type %d\n", kind);
     }
@@ -161,7 +165,7 @@ struct mstro_stp_val *mstro_stp_val__cons(struct mstro_stp_val *car,struct mstro
     cons->list_entry = car;
     cons->list_tail  = cdr;
   }
-  
+
   return cons;
 }
 
@@ -208,7 +212,7 @@ enum mstro_stp_val_kind {
   MSTRO_STP_NAME,
   MSTRO_STP_LIST,
   MSTRO_STP_TIMESTAMP,  /* externally will be converted to RFC3339 regex */
-  
+
   /** user-facing node types */
   MSTRO_STP_BOOL = 64,
   MSTRO_STP_UINT,
@@ -218,8 +222,9 @@ enum mstro_stp_val_kind {
   MSTRO_STP_STR,
   MSTRO_STP_REGEX,
   MSTRO_STP_BLOB,
-  MSTRO_STP_POINTER
-  
+  MSTRO_STP_POINTER,
+  MSTRO_STP_MMBLAYOUT
+
 };
 
 /** User-facing: Protobuf imposes a size limit on strings and blobs
@@ -257,7 +262,7 @@ struct mstro_stp_val {
       struct mstro_stp_val *list_entry;
       struct mstro_stp_val *list_tail;
     };
-    
+
     struct {
       int64_t int_min;
       int64_t int_max;
@@ -273,29 +278,33 @@ struct mstro_stp_val {
      float    float_min;
      float    float_max;
      };
-     
+
    struct {
      double   double_min;
      double   double_max;
      };
-     
+
    struct {
      uint64_t str_minlen;
      uint64_t str_maxlen;
      char    *str_excludedchars;
      };
-     
+
    struct {
      size_t   regex_numpatterns;
      char   **regex_patterns; /**< num_patterns strings, concatenated as alternative regexes */
      char    *regex_name;
      bool     regex_ignorecase;
      };
-     
+
    struct {
      uint64_t blob_minlen;
      uint64_t blob_maxlen;
      };
+   struct {
+     uint64_t mmblayout_minlen;
+     uint64_t mmblayout_maxlen;
+     };
    };
 
 };
@@ -314,7 +323,7 @@ mstro_stp_val_dispose(struct mstro_stp_val *v);
 
 ############# Rule definitions ####################
 
-## Toplevel: a restricted type 
+## Toplevel: a restricted type
 type_expression <- space* t:restricted_type space* EOF {
                       $$ = t;
 		      mstro_stp_val__describe($$);
@@ -327,7 +336,7 @@ restricted_type <-  n:type_name space* args:type_arguments {
 
 ## A type_name is either a built-in or a valid type designator
 type_name <- x:builtin_type_name {$$ = x; }
-           / x:user_type_name    {$$ = x; } 
+           / x:user_type_name    {$$ = x; }
 
 ## The builtin type names are ...
 builtin_type_name <-
@@ -341,6 +350,7 @@ builtin_type_name <-
 		  / "blob"   { $$ = mstro_stp_val_alloc(MSTRO_STP_BLOB);   DEBUG("found blob\n");  }
 		  / "timestamp" { $$ = mstro_stp_val_alloc(MSTRO_STP_TIMESTAMP); DEBUG("found timestamp\n"); }
 		  / "pointer"   { $$ = mstro_stp_val_alloc(MSTRO_STP_POINTER);   DEBUG("found pointer\n"); }
+      / "mmblayout"   { $$ = mstro_stp_val_alloc(MSTRO_STP_MMBLAYOUT);   DEBUG("found mmb_layout\n"); }
 
 ## user-defined types are a name suitable for a maestro schema yaml
 ##   attribute key, possibly including (parts of) a namespace prefix
@@ -353,12 +363,12 @@ type_arguments <- '(' a:unnamed_args ','  l:type_restriction_list ')' {
 			 struct mstro_stp_val *tmp;
 			 LL_COUNT2(a,tmp,l_a,list_tail);
 			 LL_COUNT2(l,tmp,l_l,list_tail);
-			 
+
 			 assert(a->kind == MSTRO_STP_LIST);
 			 assert(l->kind == MSTRO_STP_LIST);
 			 $$=a;
 			 LL_CONCAT2($$,l,list_tail);
-			 
+
 			 LL_COUNT2($$,tmp,l_tot,list_tail);
  			 DEBUG("unnamed count %zu, named count %zu, total %zu\n", l_a, l_l, l_tot);
 
@@ -376,7 +386,7 @@ type_arguments <- '(' a:unnamed_args ','  l:type_restriction_list ')' {
 			 DEBUG("empty restrictions list\n");
 	              }
 
-		   
+
 type_restriction_list <- r:type_restriction space* ',' l:type_restriction_list {
 		              DEBUG("one more entry in type restriction list\n");
 			      $$=mstro_stp_val__cons(r,l);
@@ -516,7 +526,7 @@ identifier <- < [-.:A-Za-z][-.:A-Za-z0-9]* > {
                        }
 		       }
 
-string <- ["] < [^"]* > ["] { $$=mstro_stp_val_alloc(MSTRO_STP_STRVAL); 
+string <- ["] < [^"]* > ["] { $$=mstro_stp_val_alloc(MSTRO_STP_STRVAL);
                               $$->strval = strdup($1);
                               if($$->strval==NULL) {
                                 ERR("Failed to allocate string for \"%s\"\n", $1);
@@ -529,7 +539,7 @@ string <- ["] < [^"]* > ["] { $$=mstro_stp_val_alloc(MSTRO_STP_STRVAL);
                                 ERR("Failed to allocate string for '%s'\n", $2);
                                 auxil->got_error = true;
                               }
-                            } 
+                            }
 
 bool <- '1'                  { $$=mstro_stp_val_alloc(MSTRO_STP_BOOLVAL); $$->boolval = true; }
       / '0'                  { $$=mstro_stp_val_alloc(MSTRO_STP_BOOLVAL); $$->boolval = false; }
@@ -543,7 +553,7 @@ space <- [ \t\n\v\f\r] { ; }
 number <- space*             x:signed_number           { $$ = x; }
         / space* '+'? space* x:unsigned_number         { $$ = x; }
 
-signed_number <- '-' space* x:integer { 
+signed_number <- '-' space* x:integer {
                /* flip sign */
                if(x->uintval>INT64_MAX) {
                  ERR("integral value %" PRIu64 " too big for int64 type\n", $$->uintval);
@@ -558,7 +568,7 @@ signed_number <- '-' space* x:integer {
 
 unsigned_number <- x:integer { $$ = x; }
                  / x:decimal { $$ = x; }
-		 
+
 integer <- < [0-9]+ > {
          /* parse as uint64 */
          $$ = mstro_stp_val_alloc(MSTRO_STP_UINTVAL);
@@ -572,7 +582,7 @@ decimal <- < [0-9]+ '.' [0-9]* > {
         char *endptr;
 	$$->doubleval = atof($1);
 	}
-          
+
 ## end of input:
 EOF  <- !.
 
@@ -621,7 +631,7 @@ mstro_stp_val_dispose(struct mstro_stp_val *v)
     case MSTRO_STP_REGEX:
       if(v->regex_name!=NULL)
         free(v->regex_name);
-      for(size_t i=0; i<v->regex_numpatterns; i++) 
+      for(size_t i=0; i<v->regex_numpatterns; i++)
         free(v->regex_patterns[i]);
       if(v->regex_patterns!=NULL)
         free(v->regex_patterns);
@@ -635,7 +645,7 @@ mstro_stp_val_dispose(struct mstro_stp_val *v)
       }
       break;
     case MSTRO_STP_NAME:
-      if(v->nameval) 
+      if(v->nameval)
               free(v->nameval);
       break;
     default:
@@ -677,7 +687,7 @@ mstro_stp_val__describe(const struct mstro_stp_val *v)
       DEBUG("v kind: DOUBLE (min=%g, max=%g)\n", v->double_min, v->double_max); break;
     case MSTRO_STP_STR:
       DEBUG("v kind: STRING (min=%zu, max=%zu, excluded='%s')\n",
-      	    v->str_minlen, v->str_maxlen, v->str_excludedchars == NULL ? "" : v->str_excludedchars ); 
+      	    v->str_minlen, v->str_maxlen, v->str_excludedchars == NULL ? "" : v->str_excludedchars );
       break;
     case MSTRO_STP_REGEX:
       DEBUG("v kind: REGEX (%d substrings, ignore_case=%d, name='%s')\n",
@@ -691,6 +701,9 @@ mstro_stp_val__describe(const struct mstro_stp_val *v)
       break;
     case MSTRO_STP_POINTER:
       DEBUG("v kind: POINTER\n"); break;
+    case MSTRO_STP_MMBLAYOUT:
+      DEBUG("v kind: MMBLAYOUT (min=%" PRIu64 ", max=%" PRIu64 ")\n", v->mmblayout_minlen, v->mmblayout_maxlen);
+      break;
 
     case MSTRO_STP_BOOLVAL:
     case MSTRO_STP_MININT:
@@ -710,7 +723,7 @@ mstro_stp_val__describe(const struct mstro_stp_val *v)
 }
 
 
-/** build a user-understandable type value from TYPENAME and 
+/** build a user-understandable type value from TYPENAME and
  ** the applicable arguments */
 
 struct mstro_stp_val *
@@ -739,7 +752,7 @@ mstro_stp_val__build_restricted_type(
         leftovers = args;
       };
       break;
- 
+
    /* pick out the ones we understand, accumulate the rest in leftovers */
     case MSTRO_STP_UINT: {
       while(args!=NULL) {
@@ -749,52 +762,52 @@ mstro_stp_val__build_restricted_type(
        tmp->list_tail = NULL;
 
        switch(tmp->list_entry->kind) {
-         case MSTRO_STP_MINUINT: 
+         case MSTRO_STP_MINUINT:
            result->uint_min = tmp->list_entry->minuintval; break;
-         case MSTRO_STP_MAXUINT: 
+         case MSTRO_STP_MAXUINT:
            result->uint_max = tmp->list_entry->maxuintval; break;
          default:
            ERR("Illegal type restriction for unsigned integer: %d\n", tmp->list_entry->kind);
        }
-       
+
        mstro_stp_val_dispose(tmp);
       };
       break;
       }
-      
+
     case MSTRO_STP_INT: {
       while(args!=NULL) {
        assert(args->kind==MSTRO_STP_LIST);
        struct mstro_stp_val *tmp = args;
        args = args->list_tail;
        tmp->list_tail = NULL;
-       
+
        switch(tmp->list_entry->kind) {
-         case MSTRO_STP_MININT: 
+         case MSTRO_STP_MININT:
            result->int_min = tmp->list_entry->minintval;
 	   break;
-         case MSTRO_STP_MINUINT: 
+         case MSTRO_STP_MINUINT:
            if(tmp->list_entry->minuintval>(uint64_t)INT64_MAX) {
              ERR("signed int min %" PRIu64 "exceeds INT64_MAX\n", tmp->list_entry->minuintval);
            }
            result->int_min = (tmp->list_entry->minuintval < INT64_MAX)
 	   		     ? (int64_t)tmp->list_entry->minuintval : INT64_MAX;
            break;
-         case MSTRO_STP_MAXINT: 
+         case MSTRO_STP_MAXINT:
            result->int_max = tmp->list_entry->maxintval;
 	   break;
-         case MSTRO_STP_MAXUINT: 
+         case MSTRO_STP_MAXUINT:
            DEBUG("int, encountered maxuint arg of %" PRIu64 "\n", tmp->list_entry->maxuintval);
            if(tmp->list_entry->maxuintval>(uint64_t)INT64_MAX) {
              ERR("signed int max %" PRIu64 "exceeds INT64_MAX\n", tmp->list_entry->maxuintval);
            }
-           result->int_max = (tmp->list_entry->maxuintval < INT64_MAX) 
+           result->int_max = (tmp->list_entry->maxuintval < INT64_MAX)
                              ? (int64_t)tmp->list_entry->maxuintval : INT64_MAX ;
            break;
          default:
            ERR("Illegal type restriction for signed integer: %d\n", tmp->list_entry->kind);
        }
-       
+
        mstro_stp_val_dispose(tmp);
       };
       break;
@@ -808,14 +821,14 @@ mstro_stp_val__build_restricted_type(
        tmp->list_tail = NULL;
 
        switch(tmp->list_entry->kind) {
-         case MSTRO_STP_MINFLOAT: 
+         case MSTRO_STP_MINFLOAT:
            result->float_min = tmp->list_entry->minfloatval; break;
-         case MSTRO_STP_MAXFLOAT: 
+         case MSTRO_STP_MAXFLOAT:
            result->float_max = tmp->list_entry->maxfloatval; break;
          default:
            ERR("Illegal type restriction for float: %d\n", tmp->list_entry->kind);
        }
-       
+
        mstro_stp_val_dispose(tmp);
       };
       break;
@@ -829,14 +842,14 @@ mstro_stp_val__build_restricted_type(
        tmp->list_tail = NULL;
 
        switch(tmp->list_entry->kind) {
-         case MSTRO_STP_MINDOUBLE: 
+         case MSTRO_STP_MINDOUBLE:
            result->double_min = tmp->list_entry->mindoubleval; break;
-         case MSTRO_STP_MAXDOUBLE: 
+         case MSTRO_STP_MAXDOUBLE:
            result->double_max = tmp->list_entry->maxdoubleval; break;
          default:
            ERR("Illegal type restriction for double: %d\n", tmp->list_entry->kind);
        }
-       
+
        mstro_stp_val_dispose(tmp);
       };
       break;
@@ -847,9 +860,9 @@ mstro_stp_val__build_restricted_type(
        struct mstro_stp_val *tmp = args;
        args = args->list_tail;
        tmp->list_tail = NULL;
-       
+
        switch(tmp->list_entry->kind) {
-         case MSTRO_STP_EXCLSTR: 
+         case MSTRO_STP_EXCLSTR:
            result->str_excludedchars = tmp->list_entry->exclstrval;
            tmp->list_entry->exclstrval = NULL; // passed on to result
            break;
@@ -857,13 +870,13 @@ mstro_stp_val__build_restricted_type(
            result->str_maxlen = tmp->list_entry->maxintval;
            break;
          case MSTRO_STP_MAXUINT:
-           result->str_maxlen = tmp->list_entry->maxuintval; 
+           result->str_maxlen = tmp->list_entry->maxuintval;
            break;
          case MSTRO_STP_MININT:
-           result->str_minlen = tmp->list_entry->minintval; 
+           result->str_minlen = tmp->list_entry->minintval;
            break;
          case MSTRO_STP_MINUINT:
-           result->str_minlen = tmp->list_entry->minuintval; 
+           result->str_minlen = tmp->list_entry->minuintval;
            break;
          default:
            ERR("Illegal type restriction for string: %d\n", tmp->list_entry->kind);
@@ -871,42 +884,49 @@ mstro_stp_val__build_restricted_type(
        /* clamp to protobuf permitted max */
        if(tmp->list_entry->kind==MSTRO_STP_MININT||tmp->list_entry->kind==MSTRO_STP_MINUINT) {
          if(result->str_minlen > MSTRO_STP_VAL_SIZE_MAX) {
-           ERR("string min %zu larger than supported maximum, clamping\n", 
+           ERR("string min %zu larger than supported maximum, clamping\n",
                result->str_minlen);
            result->str_minlen = MSTRO_STP_VAL_SIZE_MAX;
          }
        }
        if(tmp->list_entry->kind==MSTRO_STP_MAXINT||tmp->list_entry->kind==MSTRO_STP_MAXUINT) {
          if(result->str_maxlen > MSTRO_STP_VAL_SIZE_MAX) {
-           ERR("string max %zu larger than supported maximum, clamping\n", 
+           ERR("string max %zu larger than supported maximum, clamping\n",
                result->str_maxlen);
            result->str_maxlen = MSTRO_STP_VAL_SIZE_MAX;
          }
        }
-       
+
        mstro_stp_val_dispose(tmp);
       };
       break;
       }
+    case MSTRO_STP_MMBLAYOUT:
+      if(args!=NULL) {
+        /** FIXME should have size restrictions **/
+        ERR("mmbLayout type does not accept type restrictions, ignoring them\n");
+        leftovers = args;
+      };
+      break;
     case MSTRO_STP_BLOB: {
       while(args!=NULL) {
        assert(args->kind==MSTRO_STP_LIST);
        struct mstro_stp_val *tmp = args;
        args = args->list_tail;
        tmp->list_tail = NULL;
-       
+
        switch(tmp->list_entry->kind) {
          case MSTRO_STP_MAXINT:
            result->blob_maxlen = tmp->list_entry->maxintval;
            break;
          case MSTRO_STP_MAXUINT:
-           result->blob_maxlen = tmp->list_entry->maxuintval; 
+           result->blob_maxlen = tmp->list_entry->maxuintval;
            break;
          case MSTRO_STP_MININT:
-           result->blob_minlen = tmp->list_entry->minintval; 
+           result->blob_minlen = tmp->list_entry->minintval;
            break;
          case MSTRO_STP_MINUINT:
-           result->blob_minlen = tmp->list_entry->minuintval; 
+           result->blob_minlen = tmp->list_entry->minuintval;
            break;
          default:
            ERR("Illegal type restriction for blob: %d\n", tmp->list_entry->kind);
@@ -914,14 +934,14 @@ mstro_stp_val__build_restricted_type(
        /* clamp to protobuf permitted max */
        if(tmp->list_entry->kind==MSTRO_STP_MININT||tmp->list_entry->kind==MSTRO_STP_MINUINT) {
          if(result->blob_minlen > MSTRO_STP_VAL_SIZE_MAX) {
-           ERR("blob min %zu larger than supported maximum, clamping\n", 
+           ERR("blob min %zu larger than supported maximum, clamping\n",
                result->blob_minlen);
            result->blob_minlen = MSTRO_STP_VAL_SIZE_MAX;
          }
        }
        if(tmp->list_entry->kind==MSTRO_STP_MAXINT||tmp->list_entry->kind==MSTRO_STP_MAXUINT) {
          if(result->blob_maxlen > MSTRO_STP_VAL_SIZE_MAX) {
-           ERR("blob max %zu larger than supported maximum, clamping\n", 
+           ERR("blob max %zu larger than supported maximum, clamping\n",
                result->blob_maxlen);
            result->blob_maxlen = MSTRO_STP_VAL_SIZE_MAX;
          }
@@ -940,7 +960,7 @@ mstro_stp_val__build_restricted_type(
        struct mstro_stp_val *tmp = args;
        args = args->list_tail;
        tmp->list_tail = NULL;
-       
+
        DEBUG("looking at %d\n", tmp->list_entry->kind);
        switch(tmp->list_entry->kind) {
          case MSTRO_STP_ICASE:
@@ -951,13 +971,13 @@ mstro_stp_val__build_restricted_type(
            tmp->list_entry->nameval = NULL; // moved to result
            DEBUG("regex name '%s'\n", result->regex_name);
            break;
-         case MSTRO_STP_STRVAL: 
+         case MSTRO_STP_STRVAL:
            /* patterns; push to pattern stack and count */
            LL_PREPEND2(patterns,tmp,list_tail);
            result->regex_numpatterns++;
            tmp=NULL; /* moved to patterns stack */
-           DEBUG("Found pattern |%s|, have %zu\n", 
-                 patterns->list_entry->strval, 
+           DEBUG("Found pattern |%s|, have %zu\n",
+                 patterns->list_entry->strval,
                  result->regex_numpatterns);
            break;
          default:
@@ -1013,9 +1033,8 @@ mstro_stp_val__build_restricted_type(
       leftovers = args;
       break;
   }
-  if(leftovers!=NULL) 
+  if(leftovers!=NULL)
     mstro_stp_val_dispose(leftovers);
 
   return result;
 }
-
diff --git a/configure.ac b/configure.ac
index bdc1e458aa2e48b996a7ad92cb570bdca4bc3983..d839788523c097b3abf8e8cc7e21a4b00f2e3794 100644
--- a/configure.ac
+++ b/configure.ac
@@ -66,7 +66,7 @@ AS_IF([test "x$enable_devel" != "xno"],
               enable_asan=yes
               dnl include NOISE level debugging ability, but default to DEBUG
 	      CPPFLAGS="-DMSTRO_DEFAULT_LOG_LEVEL=3 -DMSTRO_MAX_DEBUG_LEVEL=4 $CPPFLAGS"
-              
+
             ],
             [ dnl set logging to ERR, and limit logging to DEBUG
 	      CPPFLAGS="-DMSTRO_DEFAULT_LOG_LEVEL=0 -DMSTRO_MAX_DEBUG_LEVEL=3 $CPPFLAGS"
@@ -335,10 +335,10 @@ dnl Check if Breathe is available
 AC_CHECK_PROGS([BREATHE], [breathe-apidoc], [no])
 AS_IF([ test "x$BREATHE" == x ],
     [AC_MSG_WARN([Breathe not found - continue without Breathe support])],
+    AM_CONDITIONAL([HAVE_BREATHE], [ test "x$BREATHE" != xno ])
     )
 
 
-
 dnl libyaml for configuration purposes
 dnl AC_CHECK_HEADER([yaml.h],[],AC_MSG_ERROR([yaml.h header not found]))
 dnl AC_CHECK_LIB([yaml],[yaml_document_initialize],[],[AC_MSG_ERROR([libyaml not found])])
@@ -430,12 +430,12 @@ AS_IF([test "x$enable_ofi_pool_manager" = "xyes"], [
     AC_PREPROC_IFELSE(
 	    [AC_LANG_PROGRAM(
 		    [[#include <rdma/fabric.h>
-#if !(FI_VERSION_GE(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), FI_VERSION(1,11)))
-#error libfabric 1.10 or better required
+#if !(FI_VERSION_GE(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), FI_VERSION(1,14)))
+#error libfabric 1.14 or better required
 #endif]],
 		    [[return 0;]])],
 	    [AC_MSG_NOTICE([libfabric is recent enough, great])],
-	    [AC_MSG_WARN([libfabric is not recent enough, we require version 1.11 or better; will build our own])
+	    [AC_MSG_WARN([libfabric is not recent enough, we require version 1.14 or better; will build our own])
 	    build_private_ofi=yes])],
     [AC_MSG_WARN([building our own libfabric])
      build_private_ofi=yes])
@@ -454,8 +454,9 @@ AS_IF([test "x$enable_ofi_pool_manager" = "xyes"], [
    #AX_SUBDIRS_CONFIGURE([deps/libfabric],[[--enable-embedded],[--disable-rxm],[--disable-rxd],[--disable-psm]],[],[],[])
    dnl Configuring without kdreg to fix issue https://gitlab.com/cerl/maestro/maestro-core/-/issues/117 which relates to https://github.com/ofiwg/libfabric/issues/5313
    dnl Whether doing so creates a performance problem or not is still to be determined
-   AX_SUBDIRS_CONFIGURE([deps/libfabric],[[--enable-embedded],[--disable-rxd],[--disable-shm],[--disable-tcp],[--with-kdreg=no],[--enable-debug]],[],[],[])
+   AX_SUBDIRS_CONFIGURE([deps/libfabric],[[--enable-embedded],[--disable-rxd],[--disable-tcp],[--with-kdreg=no],[--enable-debug]],[],[],[])
    AC_MSG_NOTICE([================== done preconfiguring private libfabric library build])
+   AC_DEFINE([LOCAL_LIBFABRIC],[1],[Define if using our own libfabric build])
   ])
 
 
@@ -590,10 +591,12 @@ m4_define([TESTSCRIPTS],
    tests/check_pm_reentrant_client.sh \
    tests/check_pm_redundant_interlock.sh \
    tests/check_pm_interlock_async.sh \
+   tests/check_pm_dist_cdo.sh \
    tests/check_subscribe.sh \
    tests/check_ecmwf_events.sh \
    tests/check_ecmwf_handle.sh \
-   tests/check_ecmwf_attr.sh 
+   tests/check_ecmwf_attr.sh \
+   tests/check_decode_pminfo.sh
   ])
 
 m4_foreach_w([TESTSCRIPT],
diff --git a/deps/libfabric/.appveyor.yml b/deps/libfabric/.appveyor.yml
index 4a69c2ae37541ecd91730ae4243c6f4401ccdc5d..d2ae3e019a322a3a14ce247e00d3b84b50f7d873 100644
--- a/deps/libfabric/.appveyor.yml
+++ b/deps/libfabric/.appveyor.yml
@@ -1,6 +1,7 @@
 image:
   - Visual Studio 2015
   - Visual Studio 2017
+  - Visual Studio 2019
 
 build:
   project: libfabric.sln
@@ -8,15 +9,29 @@ build:
 configuration:
   - Debug-v140
   - Debug-v141
+  - Debug-v142
   - Release-v140
   - Release-v141
+  - Release-v142
 
 matrix:
   exclude:
+    - configuration: Debug-v140
+      image: Visual Studio 2019
     - configuration: Debug-v141
       image: Visual Studio 2015
+    - configuration: Debug-v142
+      image: Visual Studio 2015
+    - configuration: Debug-v142
+      image: Visual Studio 2017
+    - configuration: Release-v140
+      image: Visual Studio 2019
     - configuration: Release-v141
       image: Visual Studio 2015
+    - configuration: Release-v142
+      image: Visual Studio 2015
+    - configuration: Release-v142
+      image: Visual Studio 2017
 
 before_build:
   - ps: .appveyor.ps1 -Verbose
diff --git a/deps/libfabric/.github/workflows/gh-man.sh b/deps/libfabric/.github/workflows/gh-man.sh
new file mode 100755
index 0000000000000000000000000000000000000000..8767f9c80bdc5b03c6537d20440fb62e04bad9c4
--- /dev/null
+++ b/deps/libfabric/.github/workflows/gh-man.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+
+set -euxo pipefail
+
+# Only do anything meaningful in the main ofiwg/libfabric repo.  If
+# we're not in that repo, then don't do anything because it confuses
+# people who fork the libfabric repo if GH Man Page Updater commits
+# show up in their fork with different git hashes than the GH Man
+# Page Updater commits on the ofiwg/libfabric repo.
+if test -n "$REPO"; then
+    first=`echo $REPO | cut -d/ -f1`
+    second=`echo $REPO | cut -d/ -f2`
+
+    if test "$first" != "ofiwg" -o "$second" != "libfabric"; then
+        cat <<EOF
+
+The GH Man Page Updater is contractually obligated to only operate on
+the ofiwg/libfabric repository.
+
+Exiting without doing anything.
+
+EOF
+        exit 0
+    fi
+fi
+
+if test "$BASE_REF" != "main"; then
+        echo "Not the main branch -- nothing to do!"
+    exit 0
+fi
+
+git config --global user.name "OFIWG Bot"
+git config --global user.email "ofiwg@lists.openfabrics.org"
+
+mkdir tmp && cp man/*.md tmp/
+
+branch_name=pr/update-gh-man-pages
+git fetch origin gh-pages
+git checkout gh-pages
+git checkout -b $branch_name
+cp tmp/*.md main/man/
+git add main/man/*.md
+
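+# git commit exits non-zero when there is nothing to commit; relax -e so we can check $?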
+set +e
+git commit -as -m 'Update GH man pages'
+st=$?
+set -e
+
+if test $st -ne 0; then
+    echo "Nothing to commit -- nothing to do!"
+    exit 0
+fi
+
+# Yes, we committed something.  Push the branch and make a PR.
+# Extract the PR number.
+git push --set-upstream origin $branch_name
+url=`hub pull-request -b gh-pages -m 'Update GH man pages'`
+pr_num=`echo $url | cut -d/ -f7`
+
+# Wait for the required "DCO" CI to complete
+i=0
+sleep_time=5
+max_seconds=300
+i_max=`expr $max_seconds / $sleep_time`
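+# with the defaults above this is 300 / 5 = 60 polling attempts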
+
+echo "Waiting up to $max_seconds seconds for DCO CI to complete..."
+while test $i -lt $i_max; do
+    date
+    set +e
+    status=`hub ci-status --format "%t %S%n" | egrep '^DCO' | awk '{ print $2 }'`
+    set -e
+    if test "$status" = "success"; then
+        echo "DCO CI is complete!"
+        break
+    fi
+    sleep $sleep_time
+    i=`expr $i + 1`
+done
+
+status=0
+if test $i -lt $i_max; then
+    # Sadly, there is no "hub" command to merge a PR.  So do it by
+    # hand.
+    curl \
+        -XPUT \
+        -H "Authorization: token $GITHUB_TOKEN" \
+        https://api.github.com/repos/$GITHUB_REPOSITORY/pulls/$pr_num/merge
+else
+    echo "Sad panda; DCO CI didn't complete -- did not merge $url"
+    status=1
+fi
+
+# Delete the remote branch
+git push origin --delete $branch_name
+exit $status
diff --git a/deps/libfabric/.github/workflows/gh-man.yaml b/deps/libfabric/.github/workflows/gh-man.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..cb2784fda868bc18cbc54b6e8bd2c1fac64347f8
--- /dev/null
+++ b/deps/libfabric/.github/workflows/gh-man.yaml
@@ -0,0 +1,31 @@
+name: GH Man Page Updater
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - 'man/*.md'
+  workflow_dispatch:
+
+jobs:
+    gh-man-update:
+        name: GH Man Page Updater
+        runs-on: ubuntu-latest
+        steps:
+          - name: Debug information
+            env:
+              GITHUB_DATA: ${{ toJSON(github) }}
+            run: |
+              echo This is information that may be useful for debugging.
+              echo "$GITHUB_DATA"
+
+          - name: Check out the git repo
+            uses: actions/checkout@v2
+
+          - name: Update the man pages in branch gh-pages
+            run: .github/workflows/gh-man.sh
+            env:
+              GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+              REPO: ${{ github.event.repository.full_name }}
+              BASE_REF: ${{ github.event.repository.default_branch }}
diff --git a/deps/libfabric/.github/workflows/nroff-elves.sh b/deps/libfabric/.github/workflows/nroff-elves.sh
new file mode 100755
index 0000000000000000000000000000000000000000..a7176baf4012cfc8a322091ebe45a2d62a4d1bde
--- /dev/null
+++ b/deps/libfabric/.github/workflows/nroff-elves.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+
+set -euxo pipefail
+
+# Only do anything meaningful in the main ofiwg/libfabric repo.  If
+# we're not in that repo, then don't do anything because it confuses
+# people who fork the libfabric repo if Nroff Elves commits show up in
+# their fork with different git hashes than the Nroff Elves commits on
+# the ofiwg/libfabric repo.
+if test -n "$REPO"; then
+    first=`echo $REPO | cut -d/ -f1`
+    second=`echo $REPO | cut -d/ -f2`
+
+    if test "$first" != "ofiwg" -o "$second" != "libfabric"; then
+        cat <<EOF
+
+The Nroff Elves are contractually obligated to only operate on the
+ofiwg/libfabric repository.
+
+Exiting without doing anything.
+
+EOF
+        exit 0
+    fi
+fi
+
+# In June of 2021, ofiwg/libfabric changed its default branch from
+# "master" to "main".  This confuses "hub" (because it still falls
+# back to "master" when it can't figure out the target branch name).
+# In nroff-elves.yaml, we load $BASE_REF with
+# ${{github.event.repository.default_branch}} and use that to tell
+# "hub" what the base branch should be.  This works great... except
+# sometimes $BASE_REPO is blank (when it should have a valid branch
+# name in it).  So if we get here and $BASE_REPO is blank, just assume
+# that it should be "main".  This is lame and shouldn't be necessary,
+# but it prevents us all from getting Github Action failure emails.
+# Sigh.
+if test -z "$BASE_REF"; then
+    BASE_REF=main
+fi
+
+# If we're here, we want to generate some nroff.  Woo hoo!
+for file in `ls man/*.md`; do
+    perl config/md2nroff.pl --source=$file
+done
+
+git config --global user.name "OFIWG Bot"
+git config --global user.email "ofiwg@lists.openfabrics.org"
+
+branch_name=pr/update-nroff-generated-man-pages
+git checkout -b $branch_name
+
+set +e
+git commit -as -m 'Updated nroff-generated man pages'
+st=$?
+set -e
+
+if test $st -ne 0; then
+    echo "Nothing to commit -- nothing to do!"
+    exit 0
+fi
+
+# Yes, we committed something.  Push the branch and make a PR.
+# Extract the PR number.
+git push --set-upstream origin $branch_name
+url=`hub pull-request -b $BASE_REF -m 'Update nroff-generated man pages'`
+pr_num=`echo $url | cut -d/ -f7`
+
+# Wait for the required "DCO" CI to complete
+i=0
+sleep_time=5
+max_seconds=300
+i_max=`expr $max_seconds / $sleep_time`
+
+echo "Waiting up to $max_seconds seconds for DCO CI to complete..."
+while test $i -lt $i_max; do
+    date
+    set +e
+    status=`hub ci-status --format "%t %S%n" | egrep '^DCO' | awk '{ print $2 }'`
+    set -e
+    if test "$status" = "success"; then
+        echo "DCO CI is complete!"
+        break
+    fi
+    sleep $sleep_time
+    i=`expr $i + 1`
+done
+
+status=0
+if test $i -lt $i_max; then
+    # Sadly, there is no "hub" command to merge a PR.  So do it by
+    # hand.
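+    # (PUT /repos/{owner}/{repo}/pulls/{number}/merge is the REST merge endpoint)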
+    curl \
+        -XPUT \
+        -H "Authorization: token $GITHUB_TOKEN" \
+        https://api.github.com/repos/$GITHUB_REPOSITORY/pulls/$pr_num/merge
+else
+    echo "Sad panda; DCO CI didn't complete -- did not merge $url"
+    status=1
+fi
+
+# Delete the remote branch
+git push origin --delete $branch_name
+exit $status
diff --git a/deps/libfabric/.github/workflows/nroff-elves.yaml b/deps/libfabric/.github/workflows/nroff-elves.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ae1040a2ad447095089982fea9ff15a88579d36e
--- /dev/null
+++ b/deps/libfabric/.github/workflows/nroff-elves.yaml
@@ -0,0 +1,31 @@
+name: GitHub Action Schedule
+
+on:
+  schedule:
+    - cron: '0 * * * *'
+  workflow_dispatch:
+
+jobs:
+    nroff-elves-scheduled:
+        name: The Nroff Elves
+        runs-on: ubuntu-latest
+        steps:
+          - name: Debug information
+            env:
+              GITHUB_DATA: ${{ toJSON(github) }}
+            run: |
+              echo This is information that may be useful for debugging.
+              echo "$GITHUB_DATA"
+
+          - name: Check out the git repo
+            uses: actions/checkout@v2
+
+          - name: Get the required packages
+            run: sudo apt install -y pandoc hub
+
+          - name: Build the nroff man pages
+            run: .github/workflows/nroff-elves.sh
+            env:
+              GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+              REPO: ${{ github.event.repository.full_name }}
+              BASE_REF: ${{ github.event.repository.default_branch }}
diff --git a/deps/libfabric/.github/workflows/pr-ci.yml b/deps/libfabric/.github/workflows/pr-ci.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7e66800987310f89461fe877e5eabf9cc6f76eb7
--- /dev/null
+++ b/deps/libfabric/.github/workflows/pr-ci.yml
@@ -0,0 +1,136 @@
+name: Build Checks
+on: [push, pull_request]
+env:
+  APT_PACKAGES: >-
+    abi-compliance-checker
+    abi-dumper
+    build-essential
+    debhelper
+    dh-systemd
+    fakeroot
+    gcc
+    git
+    libnl-3-200 libnl-3-dev libnl-route-3-200 libnl-route-3-dev
+    libnuma-dev
+    libudev-dev
+    uuid-dev
+    make
+    ninja-build
+    pandoc
+    pkg-config
+    python
+    rpm
+    sparse
+    valgrind
+    wget
+  OFI_PROVIDER_FLAGS: >-
+    --enable-efa=rdma-core/build
+    --enable-mrail
+    --enable-psm3=rdma-core/build
+    --enable-rxd
+    --enable-rxm
+    --enable-shm
+    --enable-tcp
+    --enable-udp
+    --enable-usnic
+    --enable-verbs=rdma-core/build
+  RDMA_CORE_PATH: 'rdma-core/build'
+  RDMA_CORE_VERSION: v34.1
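+  # rdma-core is cloned and built from source in each job because distro
+  # packages are too old for the EFA/psm3 providers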
+jobs:
+  linux:
+    runs-on: '${{ matrix.os }}'
+    strategy:
+      matrix:
+        os:
+          - ubuntu-18.04
+          - ubuntu-20.04
+        cc:
+          - gcc
+          - clang
+      fail-fast: false
+    steps:
+      - name: Install dependencies (Linux)
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y ${{ env.APT_PACKAGES }}
+      - uses: actions/checkout@v2
+      - name: Build Check
+        run: |
+          set -x
+          git clone --depth 1 -b ${{ env.RDMA_CORE_VERSION }} https://github.com/linux-rdma/rdma-core.git
+          pushd rdma-core; bash build.sh; popd
+          export LD_LIBRARY_PATH="${{ env.RDMA_CORE_PATH }}/lib:$LD_LIBRARY_PATH"
+          ./autogen.sh
+          ./configure --prefix=$PWD/install ${{ env.OFI_PROVIDER_FLAGS }} CC=${{ matrix.cc }}
+          make -j 2; make install
+          $PWD/install/bin/fi_info -l
+      - name: Upload build logs
+        if: failure()
+        uses: actions/upload-artifact@v2
+        with:
+          name: config.log
+          path: config.log
+  hmem:
+    runs-on: ubuntu-20.04
+    steps:
+      - name: Install dependencies (Linux)
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y ${{ env.APT_PACKAGES }}
+      - name: Install CUDA
+        run: |
+          sudo apt-get install -y nvidia-cuda-toolkit
+      - name: Install ROCm
+        run: |
+          echo "Installing ROCm SDK"
+          # TODO: Install ROCm dependencies and add --with-rocm to build in next step
+      - name: Install Ze
+        run: |
+          echo "Installing Ze SDK"
+          sudo apt-get install -y gpg-agent wget
+          wget -qO - https://repositories.intel.com/graphics/intel-graphics.key | sudo apt-key add -
+          sudo apt-add-repository 'deb [arch=amd64] https://repositories.intel.com/graphics/ubuntu focal main'
+          sudo apt-get update
+          sudo apt-get install -y level-zero level-zero-dev
+      - uses: actions/checkout@v2
+      - name: HMEM Checks
+        run: |
+          set -x
+          # We could use 'upload-artifact' and persist the rdma-core build
+          # across jobs, but this is just as quick.
+          git clone --depth 1 -b ${{ env.RDMA_CORE_VERSION }} https://github.com/linux-rdma/rdma-core.git
+          pushd rdma-core; bash build.sh; popd
+          export LD_LIBRARY_PATH="${{ env.RDMA_CORE_PATH }}/lib:$LD_LIBRARY_PATH"
+          ./autogen.sh
+          ./configure --prefix=$PWD/install ${{ env.OFI_PROVIDER_FLAGS }} \
+                                            --with-cuda=/usr/local/cuda --with-ze \
+                                            CC=${{ matrix.cc }}
+          make -j 2; make install
+          $PWD/install/bin/fi_info -l
+          $PWD/install/bin/fi_info -c FI_HMEM
+      - name: Upload build logs
+        if: failure()
+        uses: actions/upload-artifact@v2
+        with:
+          name: config.log
+          path: config.log
+  macos:
+    runs-on: macos-10.15
+    steps:
+      - name: Install dependencies (Mac OS)
+        run: |
+           brew install automake
+           brew install libtool
+      - uses: actions/checkout@v2
+      - name: Build Check
+        run: |
+          ./autogen.sh
+          ./configure --prefix=$PWD/install
+          make -j 2; make install
+          $PWD/install/bin/fi_info -l
+      - name: Upload build logs
+        if: failure()
+        uses: actions/upload-artifact@v2
+        with:
+          name: config.log
+          path: config.log
diff --git a/deps/libfabric/.github/workflows/stale.yml b/deps/libfabric/.github/workflows/stale.yml
new file mode 100644
index 0000000000000000000000000000000000000000..cdfbdc94560d4f4a5cf3df619f9bb0ba629f26a6
--- /dev/null
+++ b/deps/libfabric/.github/workflows/stale.yml
@@ -0,0 +1,32 @@
+# This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time.
+#
+# You can adjust the behavior by modifying this file.
+# For more information, see:
+# https://github.com/actions/stale
+name: Mark stale issues and pull requests
+
+on:
+  schedule:
+  - cron: '26 10 * * *'
+
+jobs:
+  stale:
+
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      pull-requests: write
+
+    steps:
+    - uses: actions/stale@v3
+      with:
+        repo-token: ${{ secrets.GITHUB_TOKEN }}
+        days-before-stale: 360
+        days-before-close: 7
+        stale-issue-message: 'This issue is stale because it has been open 360 days with no activity. Remove stale label or comment, otherwise it will be closed in 7 days.'
+        stale-pr-message: 'This pull request is stale because it has been open 360 days with no activity. Remove stale label or comment, otherwise it will be closed in 7 days.'
+        stale-issue-label: 'stale'
+        stale-pr-label: 'stale'
+        operations-per-run: 60
+        exempt-issue-labels: 'enhancement,high priority'
+        exempt-pr-labels: 'work in progress'
diff --git a/deps/libfabric/.travis.yml b/deps/libfabric/.travis.yml
index 3be4f54e00c3efa2decbfe222a1ebb602d470d44..ed7308431ac8f36d717fc030d8ea5b9446857eb3 100644
--- a/deps/libfabric/.travis.yml
+++ b/deps/libfabric/.travis.yml
@@ -6,6 +6,7 @@ compiler:
 os:
     - linux
     - osx
+osx_image: xcode12.2
 addons:
     apt:
         packages:
@@ -34,18 +35,13 @@ addons:
             - abi-compliance-checker
             - abi-dumper
     coverity_scan:
-      project:
-        name: "ofiwg/libfabric"
-        description: "Libfabric project coverity scans"
-      notification_email: sean.hefty@intel.com
-      build_command_prepend: "./autogen.sh; ./configure"
-      build_command: "make -j2"
-      # It might be overkill to run a full scan across the compiler test matrix
-      # for every PR to master. The coverity addon can not selectively run for
-      # certain OSes or compilers. Once a couple runs succeed, change this to a
-      # coverity-scan branch that we push to on-demand during releases or as
-      # needed..
-      branch_pattern: master
+        project:
+            name: "ofiwg/libfabric"
+            description: "Libfabric project coverity scans"
+        notification_email: sean.hefty@intel.com
+        build_command_prepend: "./autogen.sh; ./configure --enable-efa=$RDMA_CORE_PATH --enable-psm2 --enable-psm3=$RDMA_CORE_PATH --enable-usnic --enable-verbs=$RDMA_CORE_PATH"
+        build_command: "make -j2"
+        branch_pattern: main
 
 env:
     global:
@@ -73,14 +69,16 @@ before_install:
 install:
     - ./autogen.sh
     # Build rdma-core because ubuntu doesn't have a sufficiently new version of
-    # ibverbs/rdma-core for EFA. OS X doesn't have verbs support.
+    # ibverbs/rdma-core for EFA and PSM3. OS X doesn't have verbs support.
     - if [[ "$TRAVIS_OS_NAME" == "linux" ]] ; then
         RDMA_CORE_BRANCH="v27.0";
         git clone --depth 1 -b $RDMA_CORE_BRANCH https://github.com/linux-rdma/rdma-core.git && cd rdma-core && bash build.sh && cd -;
         RDMA_CORE_PATH=$PWD/rdma-core/build ;
         export LD_LIBRARY_PATH="$RDMA_CORE_PATH/lib:$LD_LIBRARY_PATH" ;
         LIBFABRIC_CONFIGURE_ARGS="$LIBFABRIC_CONFIGURE_ARGS --enable-usnic
-        --enable-verbs=$RDMA_CORE_PATH --enable-efa=$RDMA_CORE_PATH";
+        --enable-psm3=$RDMA_CORE_PATH
+        --enable-verbs=$RDMA_CORE_PATH
+        --enable-efa=$RDMA_CORE_PATH";
       fi
     # Test fabric direct
     # (all other providers are automatically disabled by configure)
@@ -97,6 +95,7 @@ install:
       --disable-perf
       --disable-psm
       --disable-psm2
+      --disable-psm3
       --disable-rstream
       --disable-rxd
       --disable-rxm
@@ -124,6 +123,7 @@ install:
     - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
         make dist;
         config_options="--enable-efa=$RDMA_CORE_PATH
+                        --enable-psm3=$RDMA_CORE_PATH
                         --enable-verbs=$RDMA_CORE_PATH --enable-usnic";
         LDFLAGS=-Wl,--build-id rpmbuild -ta
           --define "configopts $config_options" libfabric-*.tar.bz2;
diff --git a/deps/libfabric/AUTHORS b/deps/libfabric/AUTHORS
index 642b4cb6e6614b59923355a449f5155ee1142f41..eae8607a9e57a0c9364be21bd40d61f4a2b53e78 100644
--- a/deps/libfabric/AUTHORS
+++ b/deps/libfabric/AUTHORS
@@ -1,8 +1,10 @@
+Adam Goldman <adam.goldman@intel.com>
 aikatz <aik49@cornell.edu>
 aingerson <aingerson@gmail.com>
 aingerson <alexia.ingerson@intel.com>
 Ajay Kulkarni <ajaykulk@cisco.com>
 aleksandra.justa <ajusta@gklab-125-155.igk.intel.com>
+Alexia Ingerson <alexia.ingerson@intel.com>
 Alex McKinley <alex.mckinley@intel.com>
 Alex McKinley <alex@mckpals.com>
 Amith Abraham <aabraham@cray.com>
@@ -18,6 +20,9 @@ Arun Ilango <arun.ilango@intel.com>
 Ashley Pittman <ampittma@ampittma-mac02.pittman.co.uk.20.20.172.in-addr.arpa>
 Ashley Pittman <ashley.m.pittman@intel.com>
 Automated bot for the OFIWG organization <ofiwg-bot@users.noreply.github.com>
+AWS ParallelCluster user <centos@ip-172-31-23-100.ec2.internal>
+AWS ParallelCluster user <ec2-user@ip-172-31-0-240.us-east-2.compute.internal>
+AWS ParallelCluster user <ec2-user@ip-172-31-15-28.ec2.internal>
 Benjamin Drung <bdrung@debian.org>
 Ben Menadue <ben.menadue@nci.org.au>
 Ben Turrubiates <bturrubiates@lanl.gov>
@@ -26,6 +31,7 @@ Bernd Schubert <bschubert@ddn.com>
 Brian Barrett <bbarrett@amazon.com>
 Brian J. Murrell <brian@interlinx.bc.ca>
 Brian Li <brian14708@gmail.com>
+Casey Carter <cacarter@microsoft.com>
 Chang Hyun Park <heartinpiece@gmail.com>
 Charles J Archer <charles.j.archer@intel.com>
 Chenwei Zhang <chenwz@amazon.com>
@@ -54,6 +60,8 @@ Gengbin Zheng <gengbin.zheng@intel.com>
 germanafro <andreasberger86@hotmail.de>
 Gilles Gouaillardet <gilles.gouaillardet@iferc.org>
 Gilles Gouaillardet <gilles@rist.or.jp>
+github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
+Goldman, Adam <adam.goldman@intel.com>
 Hefty <sean.hefty@intel.com>
 Holger Hoffstätte <holger@applied-asynchrony.com>
 Honggang Li <honli@redhat.com>
@@ -62,6 +70,7 @@ Hui Zhou <hzhou321@anl.gov>
 Ian Ziemba <ian.ziemba@hpe.com>
 Ignacio Hernandez <ignacio.hernandez@intel.com>
 Ira Weiny <ira.weiny@intel.com>
+iziemba <57813515+iziemba@users.noreply.github.com>
 Jaime Arteaga <jaime.a.arteaga.molina@intel.com>
 James Dinan <james.dinan@intel.com>
 James Shimek <jshimek@cray.com>
@@ -78,6 +87,7 @@ Jeff Squyres <jsquyres@cisco.com>
 Jerome Berryhill <Jerome.Berryhill@Intel.com>
 Jerome Boyd Berryhill <JeromeBerryhill@Intel.com>
 Jerome Soumagne <jsoumagne@hdfgroup.org>
+Jiakun Yan <jiakunyan1998@gmail.com>
 Jianxin Xiong <jianxin.xiong@intel.com>
 Jie Zhang <zhngaj@amazon.com>
 Jim Snow <jim.m.snow@intel.com>
@@ -91,6 +101,8 @@ jose <jose@cst-fs.(none)>
 jose <jose@cstnh-8.(none)>
 JoZie <JoZie@users.noreply.github.com>
 jroznova <julia.roznova@intel.com>
+Juee Himalbhai Desai <juee.himalbhai.desai@intel.com>
+Kaike Wan <kaike.wan@intel.com>
 Ken Raffenetti <raffenet@mcs.anl.gov>
 Kevan rehm <krehm@cray.com>
 Kevan Rehm <krehm@cray.com>
@@ -101,12 +113,15 @@ Lisanna Dettwyler <levi.e.dettwyler@intel.com>
 Lisanna Dettwyler <lisanna.dettwyler@intel.com>
 Marcin Salnik <marcin.salnik@intel.com>
 Martin Kontsek <mkontsek@cisco.com>
+Matt Koop <mkoop@amazon.com>
 Miao Luo <miao.luo@intel.com>
 Michael Blocksome <michael.blocksome@intel.com>
 Michael Chuvelev <michael.chuvelev@intel.com>
 Mikhail Khalilov <miharulidze@gmail.com>
 Mikhail Khalilov <mikhail.khalilov@intel.com>
+Min Si <msi@anl.gov>
 Mohan Gandhi <mohgan@amazon.com>
+muttormark <mike.uttormark@hpe.com>
 Neil Spruit <neil.r.spruit@intel.com>
 Nicolas Morey-Chaisemartin <nmoreychaisemartin@suse.com>
 nikhilnanal <nikhilnanal1@gmail.com>
@@ -118,6 +133,8 @@ Nikola Dancejic <dancejic@amazon.com>
 Oblomov, Sergey <hoopoepg@gmail.com>
 Oblomov, Sergey <sergey.oblomov@intel.com>
 OFIWG Bot <ofiwg@lists.openfabrics.org>
+Olivier Serres <oserres@google.com>
+orbea <orbea@riseup.net>
 Paolo Inaudi <p91paul@gmail.com>
 patrickbueb <70724661+patrickbueb@users.noreply.github.com>
 Patrick Bueb <patrick.bueb@hpe.com>
@@ -132,9 +149,12 @@ Philip Davis <philipdavis01@gmail.com>
 Pierre Roux <piroux@cisco.com>
 Prankur Gupta <prankgup@cisco.com>
 Raghu Raja <craghun@amazon.com>
+Raghu Raja <raghu@enfabrica.net>
+Raghu Raja <rajachan@protonmail.com>
 Raghu Raja <rajachan@users.noreply.github.com>
 Reese Faucette <rfaucett@cisco.com>
 Richard Halkyard <rhalkyard@cray.com>
+Rich Welch <rlwelch@amazon.com>
 Robert Wespetal <wesper@amazon.com>
 Rohit Zambre <rzambre@uci.edu>
 Sannikov, Alexander <alexander.sannikov@intel.com>
@@ -142,6 +162,7 @@ Sayantan Sur <sayantan.sur@intel.com>
 Sean Hefty <sean.hefty@intel.com>
 Sergey Oblomov <sergey.oblomov@intel.com>
 Shantonu Hossain <shantonu.hossain@intel.com>
+Shi Jin <sjina@amazon.com>
 soblomov <sergey.oblomov@intel.com>
 Solovyev, Dmitriy <dmitriy.solovyev@intel.com>
 Spruit, Neil R <neil.r.spruit@intel.com>
@@ -157,11 +178,14 @@ Sylvain Didelot <sdidelot@ddn.com>
 Thananon Patinyasakdikul <apatinya@cisco.com>
 Thibault BREZILLON <thibault.brezillon@techsat.com>
 Thomas Smith <thomasm2@cisco.com>
+Todd Rimmer <todd.rimmer@intel.com>
 Tony Zinger <ajz@cray.com>
 tonyzinger <ajz@cray.com>
 Trevor Hendricks <trevorhe@amazon.com>
+Ubuntu <ubuntu@ip-172-31-15-224.ec2.internal>
 Venkata Krishna Nimmagadda <nvkrishna85@gmail.com>
 Venkata Krishna Nimmagadda <venkata.krishna.nimmagadda@intel.com>
+Wei Zhang <wzam@amazonc.com>
 Wei Zhang <wzam@amazon.com>
 Wesley Bland <wesley.bland@intel.com>
 William Zhang <wilzhang@amazon.com>
@@ -173,4 +197,5 @@ Yulu Jia <yulu.jia@intel.com>
 Zach Tiffany <zachary.tiffany@hpe.com>
 Zach Tiffany <ztiffany@cray.com>
 Zach <ztiffany@cray.com>
+zdworkin <zachary.dworkin@intel.com>
 ztaylor <ztaylor@twitter.com>
diff --git a/deps/libfabric/COPYING b/deps/libfabric/COPYING
index 31bc30a75eec83f70213a798c9c5c267186e09b5..a786c78ba283e1806202e1d7b5a84c3e85063305 100644
--- a/deps/libfabric/COPYING
+++ b/deps/libfabric/COPYING
@@ -3,7 +3,7 @@ licenses.  You may choose to be licensed under the terms of the the
 BSD license or the GNU General Public License (GPL) Version
 2, both included below.
 
-Copyright (c) 2015-2019 Intel Corporation.  All rights reserved.
+Copyright (c) 2015-2021 Intel Corporation.  All rights reserved.
 Copyright (c) 2015-2019 Cisco Systems, Inc.  All rights reserved.
 
 ==================================================================
diff --git a/deps/libfabric/Makefile.am b/deps/libfabric/Makefile.am
index b857e7ddc3ba0364aefb729d87d338b8cbb78a3e..bb9898a3a4a75317bad24f158501ba09b83aaf91 100644
--- a/deps/libfabric/Makefile.am
+++ b/deps/libfabric/Makefile.am
@@ -43,6 +43,7 @@ common_srcs =				\
 	src/hmem.c			\
 	src/hmem_rocr.c			\
 	src/hmem_cuda.c			\
+	src/hmem_cuda_gdrcopy.c		\
 	src/hmem_ze.c			\
 	src/common.c			\
 	src/enosys.c			\
@@ -75,10 +76,12 @@ common_srcs =				\
 	prov/util/src/util_mr_cache.c	\
 	prov/util/src/cuda_mem_monitor.c \
 	prov/util/src/rocr_mem_monitor.c \
+	prov/util/src/ze_mem_monitor.c \
 	prov/util/src/util_coll.c
 
 
 if MACOS
+common_srcs += src/osx/osd.c
 common_srcs += src/unix/osd.c
 common_srcs += include/osx/osd.h
 common_srcs += include/unix/osd.h
@@ -183,7 +186,7 @@ src_libfabric_la_LIBADD =
 src_libfabric_la_DEPENDENCIES = libfabric.map
 
 if !EMBEDDED
-src_libfabric_la_LDFLAGS += -version-info 15:1:14
+src_libfabric_la_LDFLAGS += -version-info 18:0:17
 endif
 src_libfabric_la_LDFLAGS += -export-dynamic \
 			   $(libfabric_version_script)
@@ -194,6 +197,7 @@ rdmainclude_HEADERS += \
 	$(top_srcdir)/include/rdma/fi_collective.h \
 	$(top_srcdir)/include/rdma/fi_domain.h \
 	$(top_srcdir)/include/rdma/fi_eq.h \
+	$(top_srcdir)/include/rdma/fi_ext.h \
 	$(top_srcdir)/include/rdma/fi_rma.h \
 	$(top_srcdir)/include/rdma/fi_endpoint.h \
 	$(top_srcdir)/include/rdma/fi_errno.h \
@@ -222,6 +226,7 @@ real_man_pages = \
         man/man1/fi_info.1 \
         man/man1/fi_pingpong.1 \
         man/man1/fi_strerror.1 \
+        man/man3/fi_atomic.3 \
         man/man3/fi_av.3 \
         man/man3/fi_av_set.3 \
         man/man3/fi_cm.3 \
@@ -234,6 +239,7 @@ real_man_pages = \
         man/man3/fi_errno.3 \
         man/man3/fi_eq.3 \
         man/man3/fi_fabric.3 \
+        man/man3/fi_provider.3 \
         man/man3/fi_getinfo.3 \
         man/man3/fi_mr.3 \
         man/man3/fi_msg.3 \
@@ -254,7 +260,6 @@ dummy_man_pages = \
         man/man3/fi_allgather.3 \
         man/man3/fi_allreduce.3 \
         man/man3/fi_alltoall.3 \
-        man/man3/fi_atomic.3 \
         man/man3/fi_atomic_valid.3 \
         man/man3/fi_atomicmsg.3 \
         man/man3/fi_atomicv.3 \
@@ -407,6 +412,7 @@ include prov/efa/Makefile.include
 include prov/usnic/Makefile.include
 include prov/psm/Makefile.include
 include prov/psm2/Makefile.include
+include prov/psm3/Makefile.include
 include prov/gni/Makefile.include
 include prov/rxm/Makefile.include
 include prov/mrail/Makefile.include
@@ -422,6 +428,7 @@ include prov/hook/hook_debug/Makefile.include
 man_MANS = $(real_man_pages) $(prov_install_man_pages) $(dummy_man_pages)
 
 EXTRA_DIST += \
+        autogen.sh \
         NEWS.md \
         libfabric.spec.in \
         config/distscript.pl \
diff --git a/deps/libfabric/NEWS.md b/deps/libfabric/NEWS.md
index 27d4efa4f361ec5b6bb543f55f9d00b2479d4077..98a2f44c1a4a98dff0c277dbc8c726981ceff15e 100644
--- a/deps/libfabric/NEWS.md
+++ b/deps/libfabric/NEWS.md
@@ -3,7 +3,645 @@ Libfabric release notes
 
 This file contains the main features as well as overviews of specific
 bug fixes (and other actions) for each version of Libfabric since
-version 1.0.
+version 1.0.  New major releases include all fixes from minor
+releases with earlier release dates.
+
+v1.14.0, Fri Nov 19, 2021
+=========================
+
+## Core
+
+- Add time stamps to log messages
+- Fix gdrcopy calculation of memory region size when aligned
+- Allow user to disable use of p2p transfers
+- Update fi_tostr to print FI_SHARED_CONTEXT text instead of value
+- Update fi_tostr to output field names matching header file names
+- Fix narrow race condition in ofi_init
+- Minor optimization to pollfds to handle timeout of 0
+- Add new fi_log_sparse API to rate limit repeated log output
+- Define memory registration for buffers used for collective operations
+
+## EFA
+- Provide better support for long-lived applications utilizing the RDM
+  endpoint that may reuse an EFA queue pair after an application restarts.
+- Fixes for RNR support (enabled in v1.13.1), to allow Libfabric to manage
+  backoff when a receiver's queue is exhausted. A setopt parameter was added to
+  allow applications to set the number of re-transmissions done by the device
+  before a packet is queued by Libfabric, or if Libfabric is configured to not
+  handle resource errors, write an error entry to the application.
+- Potentially reduce memory utilization by waiting until first CQ read to
+  allocate pools
+- Deprecate the FI_EFA_SHM_MAX_MEDIUM_SIZE environment variable
+- Fix a bug in the send path which caused a performance regression for large
+  messages
+- Fix issue in MR registration path when cache is used with CUDA buffers
+- Print a clearer warning message when the reorder buffer is too small
+- Various bugfixes in send path causing unneeded copies
+- Various bugfixes caught by inspection and coverity
+- Add documentation describing version 4 of the RDM protocol
+
+## SHM
+
+- Separate HMEM caps and disable FI_ATOMIC when requested
+- Fix casting ints to pointers of different sizes
+- Add error checking in smr_setname
+- Distinguish between max shm name and max path name
+- Move allocation of sar_msg into smr_format_sar()
+
+## TCP
+
+- Use IP_BIND_ADDRESS_NO_PORT socket option to improve scaling
+- Fix situation where we can leave socket in blocking mode
+- Add specific fi_info output to fi_getinfo for srx case
+- Code restructuring and renames to improve maintenance
+- Initial implementation to support tagged messages at tcp layer
+- Optimize RMA handling at receiver
+- Remove non-defined CQ flags when reporting completions
+
+## RXM
+
+- Reset connection state if we receive a new connection request
+- Increase and update debug log messages to be more consistent
+- Force CM progress if msg ep's are actively connecting
+- Optimize handling for cm_progress_interval = 0
+
+## Util
+
+- Fix fi_getinfo check if provider requires the use of shared contexts
+- Replace deprecated pthread_yield with sched_yield
+- Fix compiler warning mixing u64 with size_t fields
+- Fix memory leak in util_av_set_close
+- Fix ofi_av_set to use passed in start_addr and end_addr values
+- Add logic to detect if another library is intercepting memory calls
+- Update 128-bit atomic support
+- Fix possible deadlock if multiple memory monitors are enabled for the
+  same memory type
+
+## Verbs
+
+- Fix setting MR access to handle read-only buffers
+- Expand debug output
+- Fail FI_HMEM support if p2p is disabled
+- Handle FI_HMEM_HOST_ALLOC flag for FI_HMEM_ZE
+
+## Fabtests
+
+- Fix rdm_rma_trigger support for hmem
+- Add key exchanges to common code to support device memory
+- Remove need for OOB address exchange when hmem is enabled
+- Always use command line provided inject size when given
+- Add ability to test tagged messages over msg ep's
+- Add support for shared rx contexts to common code
+- Update scripts to allow provider specific fabtests
+- Add an EFA RDM RNR fabtest
+
+v1.13.2, Fri Oct 15, 2021
+========================
+
+## Core
+
+- Provide work-around for segfault in Ze destructor using DL provider
+- Minor code fixes supporting Ze
+- Use copy only engine when accessing GPUs through Ze
+- Sort DL providers to ensure consistent load ordering
+- Update hooking providers to handle fi_open_ops calls to avoid crashes
+- Replace cassert with assert.h to avoid C++ headers in C code
+- Enhance serialization for memory monitors to handle external monitors
+
+## EFA
+
+- Limit memcpy in packet processing to only copy valid data
+- Removed maximum wait time when sending packets to avoid silent drops
+- Fix unconditionally growing buffer pools that should not grow
+- Handle possible large backlog of unexpected messages via SHM
+- Update Tx counter for inject operations
+- Allow in flight sends to finish when closing endpoint
+- Fix handling of prefix size when receiving data
+- Removed unnecessary data copy
+
+## SHM
+
+- Fix possible sigbus error
+- Handle errors if peer is not yet initialized
+
+## TCP
+
+- Fix reporting RMA write CQ data
+- Fix RMA read request error completion handling
+- Avoid possible use after free in reject path
+- Remove restriction where EQs and CQs may not share wait sets
+- Increase max supported rx size
+- Fix possible memory leak of CM context structure in error cases
+- Set source address for active EPs to ensure correct address is used
+- Fix memory leak of dest address in CM requests
+
+## RxM
+
+- Improve connection handling responsiveness to fix application stalls
+- Add missing locks around AV data structures
+- Add missing hmem initialization for DL builds
+- Do not ignore user specified rx/tx sizes
+- Fix source address reported to peer
+- Fix possible use of uninitialized memory handling CQ errors
+- Fix address comparison to remove duplicate connections
+- Reworked CM code to fix several possible crash scenarios
+- Fix setting the conn_id when generating 'fake' tagged headers
+
+## Util
+
+- Fix AV set to use non-zero starting address
+- Fix setting of CQ completion flags
+
+## Verbs
+
+- Work-around compilation error with Intel compiler 2018.3.222
+- Avoid possible use after free issue accessing rdma cm id in error cases
+
+## Fabtests
+
+- Add missing prints to fi_av_xfer to report failures
+- Fix memory leak in fi_multinode test
+- Add device validation for hmem tests
+- Update fi_info hints mode field based on user options
+- Fix use of incorrect message prefix size in fi_pingpong test
+
+v1.13.1, Tue Aug 24, 2021
+=========================
+
+## Core
+
+- Fix ZE check in configure
+- Enable loading ZE library with dlopen()
+- Add IPv6 support to fi_pingpong
+- Fix the call to fi_recv in fi_pingpong
+
+## EFA
+
+- Split ep->rx_entry_queued_list into two lists
+- Split ep->tx_entry_queued_list into two lists
+- Only set FI_HMEM hint for SHM getinfo when requested
+- Include qkey in smr name
+- Do not ignore send completion for a local read operation
+- Convert pkt_entry->state to pkt_entry->flags
+- Detect recvwin overflow and print an error message
+- Add function ofi_recvwin_id_processed()
+- Let efa_av_remove() remove peer with resources
+- Ignore received packets from a remove address
+- Check for and handle empty util_av->ep_list in efa_av
+- Invalidate peer's outstanding TX packets' address when removing peer
+- Extend the scope of deep cleaning resources in rxr_ep_free_res()
+- Refactor error handling functions for x_entry
+- Only write RNR error completion for send operation
+- Ignore TX completion to a removed peer.
+- Release peer's tx_entry and rx_entry when removing peer
+- Make efa_conn->ep_addr a pointer and use it to identify removed peer
+- Fix the use of a released packet in rxr_cq_handler_error()
+- Refactor tx ops counter updating
+- Make rxr_release_tx_entry() release queued pkts
+- Rename rxr_pkt_entry->type to rxr_pkt_entry->alloc_type
+- Initialize rxr_pkt_entry->x_entry to NULL
+- Fix ep->pkt_sendv_pool size
+- Add rnr_backoff prefix to variables related to RNR backoff
+- Refactor rxr_cq_queue_pkt()
+- Eliminate rnr_timeout_exp in rdm_peer
+- Eliminate the flag RXR_PEER_BACKED_OFF
+- Adjust unexpected packet pool chunk size
+- Defer memory allocation to 1st call to progress engine
+- Enable RNR support
+- Remove peer from backoff peer list in efa_rdm_peer_reset()
+- Make rxr_pkt_req_max_header_size use RXR_REQ_OPT_RAW_ADDR_HDR_SIZE
+- Use ibv_is_fork_initialized in EFA fork support
+
+## PSM3
+
+- Update Versions
+- Clean ref's to split cuda hostbufs when no longer needed
+- Fix issue when running gpudirect on gpu with small bar size
+- Fix issues with debug statistics
+- Fix issue with unreleased MR in cache
+
+## SHM
+
+- Fix unsigned comparison introduced in #6948
+- Use hmem iov copies in mmap progression
+- Correct return values in smr_progress.c
+- Fix smr_progress_ipc error handling
+
+## Util
+
+- Do not override default monitor if already set
+- Do not set impmon.impfid to NULL on monitor init
+- Initialize the import monitor
+- Add memory monitor for ZE
+
+## Fabtests
+
+- Use dlopen to load ZE library
+- Bug fixes related to IPv6 address format
+- Do not immediately kill server process
+
+v1.13.0, Thu Jul 1, 2021
+========================
+
+## Core
+
+- Fix behavior of fi_param_get parsing an invalid boolean value
+- Add new APIs to open, export, and import specialized fid's
+- Define ability to import a monitor into the registration cache
+- Add API support for INT128/UINT128 atomics
+- Fix incorrect check for provider name in getinfo filtering path
+- Allow core providers to return default attributes which are lower than the
+  maximum supported attributes in the getinfo call
+- Add option to prefer external providers (in order discovered) over internal
+  providers, regardless of provider version
+- Separate Ze (level-0) and DRM dependencies
+- Always maintain a list of all discovered providers
+- Fix incorrect CUDA warnings
+- Fix bug in cuda init/cleanup checking for gdrcopy support
+- Shift order providers are called from in fi_getinfo, move psm2 ahead of
+  psm3 and efa ahead of psmX
+
+## EFA
+
+- Minor code optimizations and bug fixes
+- Add support for fi_inject for RDM messages
+- Improve handling of RNR NACKs from NIC
+- Improve handling of zero copy receive case, especially when sender does not
+  post receive buffer
+- Numerous RMA read bug fixes
+- Add unexpected receive queue for each peer
+- Fixed issue releasing rx entries
+- Decrease the initial allocation size of the out-of-order packet pool
+  to reduce the common-case memory footprint
+- Handle FI_ADDR_NOTAVAIL in rxr_ep_get_peer
+- Identify and handle QP reuse
+- Use the memory monitor specified by the user
+- Replace provider code with common code in select places
+- Update efa_av_lookup to return correct address
+- Update rdm endpoint directly poll cq from ibv_cq
+- Avoid possible duplicate completions
+- Add reference counting for peer tracking
+- Fix EFA usage of util AV causing incorrect refcounting
+- Do not allow endpoints to share address vectors
+- Improve fork support; users can set the FI_EFA_FORK_SAFE environment variable
+  for applications which call fork()
+- Adjust the timing of clearing deferred memory registration list
+- Do not use eager protocol for cuda message and local peer
+- Fixes for shm support
+- Enable MR cache for CUDA
+- Disable shm when application requests FI_HMEM
+
+## PSM3
+
+- Added CUDA Support, GPU Direct through RV kernel module
+- Changed PSM3 Provider Version to match IEFS version
+- Expanded Multi-Rail support
+- Enhanced debug logging
+- Removed internal copy of libuuid, added as linked lib
+- Various Bug Fixes
+
+## RxD
+
+- Fix peer connection and address cleanup
+- Maintain peer connection after AV removal to send ACKs
+
+## RxM
+
+- Fix rx buffer leak in error case
+- Dynamically allocate buffer space for large unexpected messages
+- Separate the eager protocol size from allocated receive buffers
+  to reduce memory footprint
+- Make eager limit a per ep value, rather than global for all peers
+- Separate definitions and use of buffer, eager, and packet sizes
+- Fix calling fi_getinfo to the msg provider with FI_SOURCE set but
+  null parameters
+- General code cleanups, simplifications, and optimizations
+- Fix retrieving tag from dynamic receive buffer path
+- Enable dynamic receive buffer path over tcp by default
+- Use correct check to select between tagged and untagged rx queues
+- Repost rx buffers immediately to fix situation where applications can hang
+- Update help text for several environment variables
+- Fix use_srx check to enable srx by default layering over tcp provider
+- Reduce default tx/rx sizes to shrink memory footprint
+- Fix leaving stale peer entries in the AV
+- Handle error completions from the msg provider properly, and avoid passing
+  internal transfers up to the application
+- Reduce memory footprint by combining inject packets into one
+- Reduce inject copy overhead by using memcpy instead of hmem copy routines
+- Restrict the number of outstanding user transfers to prevent memory
+  overflow
+- Enable direct send feature by default for the tcp provider
+- Fix initialization of atomic headers
+- Only ignore interrupts in wait calls (e.g. poll) in debug builds, otherwise
+  return control to the caller
+- Combine and simplify internal buffer pools to reduce memory footprint
+- Remove request for huge pages for internal buffer pools
+- Add optimized tagged message path over tcp provider, removing need for
+  rxm header overhead
+- Several optimizations around supporting rxm over tcp provider
+
+## SHM
+
+- Use signal to reduce lock contention between processes
+- Fix communication with a peer that was restarted
+- Code cleanup to handle issues reported by coverity
+- Add check that IPC protocol is accessing device only memory
+- Fix interface selection used for IPC transfers
+- Change address to use a global ep index to support apps that open
+  multiple fabrics
+- Add environment variable to disable CMA transfers, to handle environments
+  where CMA checks may succeed, but CMA may not be usable
+- Add missing lock in ofi_av_insert_addr
+- Add support for GPU memory in inject operations.
+
+## Sockets
+
+- Fix possible ring buffer overflow calculating atomic lengths
+- Use correct address length (IPv6 vs 4) walking through address array
+
+## TCP
+
+- Add send side coalescing buffer to improve small message handling
+- Add receive side prefetch buffer to reduce kernel transitions
+- Fix initializing the mr_iov_limit domain attribute
+- Add support for zero copy transfers, with configurable threshold settings.
+  Disable zero copy by default due to negative impact on overall performance
+- Add environment variable overrides for default tx/rx sizes
+- Simplify and optimize handling of protocol headers
+- Add a priority transmit queue for internally generated messages (e.g. ACKs)
+- Check that the endpoint state is valid before attempting to drive progress
+  on the underlying socket
+- Limit the number of outstanding transmit and receive operations that a
+  user may post to an ep
+- Remove limitations on allocating internally generated messages to prevent
+  application hangs
+- Combine multiple internal buffer pools to one to reduce memory footprint
+- Optimize socket progress based on signaled events
+- Optimize pollfd abstraction to replace linear searches with direct indexing
+- Update both rx and tx cq's socket poll list to prevent application hangs
+- Optimize reading in extra headers to reduce loop overhead
+- Continue progressing transmit data until socket is full to reduce progress
+  overhead
+- Add msg id field to protocol headers (debug only) for protocol debugging
+- Drive rx progress when there's an unmatched 0-byte received message to
+  avoid application hangs
+- Avoid kernel transitions that are likely to fail (return EAGAIN)
+- Fail try_wait call if there's data already queued in user space prefetch
+  buffers to avoid possible hangs
+- Fix possible access to freed tx entry
+- Optimize socket receive calls in progress function to skip progress loop
+  and immediately handle a received header.  This also fixes an application
+  hang handling 0-byte messages
+- Broad code cleanups, rework, and simplifications aimed at reducing
+  overhead and improving code stability
+- Improve handling of socket disconnect or fatal protocol errors
+- Fix reporting failures of internal messages to the user
+- Disable endpoints on fatal protocol errors
+- Validate response messages are what is expected
+- Simplify and align transmit, receive, and response handling to improve code
+  maintainability and simplify related data structures
+- Copy small messages through a coalescing buffer to avoid passing SGL to
+  the kernel
+- Fix race handling a disconnected event during the CM handshake
+- Report default attributes that are lower than the supported maximums
+- Remove use of huge pages, which aren't needed by tcp, to reserve them for
+  the user
+- Increase default inject size to be larger than the rxm header
+- Add tagged message protocol header for sending tagged messages using the
+  tcp headers only
+- Separate definition of maximum header size from maximum inject size
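+
+A minimal sketch of the `fi_trywait()` loop that the try_wait bullet above
+refers to, following the fi_trywait(3) man page; the application blocks in
+the OS wait call only when the provider confirms nothing is queued in user
+space:
+
+```c
+#include <poll.h>
+#include <rdma/fabric.h>
+#include <rdma/fi_eq.h>
+#include <rdma/fi_errno.h>
+
+int wait_for_completion(struct fid_fabric *fabric, struct fid_cq *cq, int fd)
+{
+	struct fi_cq_entry entry;
+
+	for (;;) {
+		ssize_t ret = fi_cq_read(cq, &entry, 1);
+		if (ret > 0)
+			return 0;		/* got a completion */
+		if (ret != -FI_EAGAIN)
+			return (int) ret;	/* hard error */
+
+		/* fi_trywait() fails when data is already queued in user
+		 * space, in which case we must read the CQ again instead
+		 * of sleeping in the kernel. */
+		struct fid *fids[1] = { &cq->fid };
+		if (fi_trywait(fabric, fids, 1) == FI_SUCCESS) {
+			struct pollfd pfd = { .fd = fd, .events = POLLIN };
+			poll(&pfd, 1, -1);
+		}
+	}
+}
+```
+
+The `fd` is assumed to have been retrieved with
+`fi_control(&cq->fid, FI_GETWAIT, &fd)` on a CQ opened with `FI_WAIT_FD`.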
+
+## Util
+
+- Added lock validation checks to debug builds
+- Fix MR cache flush LRU behavior
+- Always remove dead memory regions from the MR cache immediately
+- Update buffer pools to handle an alignment of 0
+- Fail memory registration calls for HMEM if the interface isn't available
+- Pass through failures when a requested memory monitor fails to start
+- Always process deferred work list from pollfd wait abstraction
+
+## Verbs
+
+- Fixed checks setting CQ signaling vector
+- Internal code cleanups and clarifications
+- Fixed XRC MOFED 5.2 incompatibility
+- Add dmabuf MR support for GPU P2P transfers
+
+v1.12.1, Thu Apr 1, 2021
+========================
+
+## Core
+
+- Fix initialization checks for CUDA HMEM support
+- Fail if a memory monitor is requested but not available
+- Adjust priority of psm3 provider to prefer HW specific providers,
+  such as efa and psm2
+
+## EFA
+- Adjust timing of clearing the deferred MR list to fix a memory leak
+- Repost handshake packets on EAGAIN failure
+- Enable mr cache for CUDA memory
+- Support FI_HMEM and FI_LOCAL_COMM when used together
+- Skip using shm provider when FI_HMEM is requested
+
+## PSM3
+- Fix AVX2 configure check
+- Fix conflict with the with-psm2-src build option to prevent duplicate
+  symbols
+- Fix checksum generation to support different builddir
+- Remove dependency on librdmacm header files
+- Use AR variable instead of calling ar directly in automake tools
+- Add missing PACK_SUFFIX to header
+
+v1.12.0, Mon Mar 8, 2021
+=========================
+
+## Core
+
+- Added re-entrant version of fi_tostr (see the sketch after this list)
+- Added fi_control commands for accessing fid-specific attributes
+- Added Ze (level-0) HMEM API support
+- Fixed RoCR memory checks
+- Minor code cleanups, restructuring, and fixes
+- Fix possible stack buffer overflow with address string conversion
+- Handle macOS socket API size limitations
+- Verify and improve support for CUDA devices
+- Update internal string functions to protect against buffer overflow
+- Support gdrcopy in addition to cudaMemcpy to avoid deadlocks
+- Properly mark if addresses support only local communication
+- Prevent providers from layering over each other non-optimally
+- Fix pollfds abstraction to avoid a possible use-after-free
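+
+A minimal sketch of the re-entrant conversion; the name and signature of
+`fi_tostr_r()` are assumed from the fi_tostr(3) man page for this release:
+
+```c
+#include <stdio.h>
+#include <rdma/fabric.h>
+
+void print_caps(uint64_t caps)
+{
+	char buf[1024];
+
+	/* Unlike fi_tostr(), which returns a pointer to internal static
+	 * storage, fi_tostr_r() writes into a caller-provided buffer and
+	 * is therefore safe to call from multiple threads. */
+	printf("caps: %s\n",
+	       fi_tostr_r(buf, sizeof(buf), &caps, FI_TYPE_CAPS));
+}
+```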
+
+## EFA
+- Added support for FI_DELIVERY_COMPLETE via an acknowledgment packet in the
+  provider. Applications that request FI_DELIVERY_COMPLETE will see a
+  performance impact from this release onward. The default delivery semantic
+  for EFA is still FI_TRANSMIT_COMPLETE, and acknowledgment packets are not
+  sent in this mode (see the sketch after this list).
+- Added ability for the provider to notify device that it can correctly handle
+  receiver not ready (RNR) errors. There are still known issues so this is
+  currently turned off by default; the device is still configured to retry
+  indefinitely.
+- Disable FI_HMEM when FI_LOCAL_COMM is requested due to problems in the
+  provider with loopback support for FI_HMEM buffers.
+- Use a loopback read to copy from host memory to FI_HMEM buffers in the
+  receive path. This has a performance impact, but using the native copy API
+  for CUDA can cause a deadlock when the EFA provider is used with NCCL.
+- Only allow fork support when the cache is disabled, i.e. the application
+  handles registrations (FI_MR_LOCAL) to prevent potential data corruption.
+  General fork support will be addressed in a future release.
+- Moved EFA fork handler check to only trigger when an EFA device is present
+  and EFA is selected by an application.
+- Changed default memory registration cache monitor back to userfaultfd due to
+  a conflict with the memory hooks installed by Open MPI.
+- Fixed an issue where packets were incorrectly queued which caused message
+  ordering issues for messages the EFA provider sent via SHM provider.
+- Fixed a bug where bounce buffers were used instead of application provided
+  memory registration descriptors.
+- Various fixes for AV and FI_HMEM capability checks in the getinfo path.
+- Fix bug in the GPUDirect support detection path.
+- Various fixes and refactoring to the protocol implementation to resolve some
+  memory leaks and hangs.
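+
+A minimal sketch of requesting delivery-complete semantics through getinfo
+hints, as described in the first bullet above; this is generic libfabric
+usage rather than EFA-specific code:
+
+```c
+#include <rdma/fabric.h>
+#include <rdma/fi_errno.h>
+
+int get_info_delivery_complete(struct fi_info **info)
+{
+	struct fi_info *hints = fi_allocinfo();
+	if (!hints)
+		return -FI_ENOMEM;
+
+	hints->caps = FI_MSG;
+	/* Make FI_DELIVERY_COMPLETE the default for transmit operations;
+	 * on EFA this enables the acknowledgment packets noted above. */
+	hints->tx_attr->op_flags = FI_DELIVERY_COMPLETE;
+
+	int ret = fi_getinfo(FI_VERSION(1, 12), NULL, NULL, 0, hints, info);
+	fi_freeinfo(hints);
+	return ret;
+}
+```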
+
+## PSM3
+
+- New core provider for psm3.x protocol over verbs UD interfaces, with
+  additional features over Intel E810 RoCEv2 capable NICs
+- See fi_psm3.7 man page for more details
+
+## RxD
+
+- Added missing cleanup to free peer endpoint data along with the AV
+- Add support for FI_SYNC_ERR flag
+
+## RxM
+
+- Cleanup atomic buffer pool lock resources
+- Fix unexpected message handling when using multi-recv buffers
+- Handle SAR and rendezvous messages received into multi-recv buffers
+- Give application entire size of eager buffer region
+- Minor code cleanups based on static code analysis
+- Simplify rendezvous message code paths
+- Avoid passing internal errors encountered during progress directly to
+  applications
+- Limit fi_cancel to canceling at most 1 receive operation (see the sketch
+  after this list)
+- Remove incorrect handling of errors that occur when writing to a CQ
+- Only write 1 CQ entry if a SAR message fails
+- Continue processing if the receive buffer pool is full and reposting is
+  delayed
+- Add support for dynamic receive buffering when layering over tcp
+- Add support for direct send to avoid send bounce buffers in certain cases
+- Prioritize credit messages to avoid deadlock
+- Fix conversion to message provider's mr access flags
+- Reduce inject size by the minimum packet header needed by rxm
+- Fix checks to enable shared rx when creating an endpoint
+- Minor code restructuring
+- Fix trying to access freed memory in error handling case
+- Use optimized inject limits to avoid bounce buffer copies
+- Fix possible invalid pointer access handling rx errors
+- Add support for HMEM if supported by msg provider
+- Add missing locks around progress to silence thread-sanitizer
+- Support re-connecting to peers if peer disconnects (client-server model)
+- Cleanup rendezvous protocol handling
+- Add support for RMA write rendezvous protocol
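+
+A minimal sketch of canceling a single posted receive, matching the
+fi_cancel bullet above; a real application would retry fi_cq_readerr()
+until the canceled entry actually arrives:
+
+```c
+#include <rdma/fabric.h>
+#include <rdma/fi_endpoint.h>
+#include <rdma/fi_eq.h>
+#include <rdma/fi_errno.h>
+
+int cancel_recv(struct fid_ep *ep, struct fid_cq *cq, void *context)
+{
+	/* fi_cancel() identifies the operation by its context pointer. */
+	ssize_t ret = fi_cancel(&ep->fid, context);
+	if (ret)
+		return (int) ret;
+
+	/* A canceled operation completes through the CQ error path with
+	 * err set to FI_ECANCELED. */
+	struct fi_cq_err_entry err = { 0 };
+	if (fi_cq_readerr(cq, &err, 0) == 1 &&
+	    err.err == FI_ECANCELED && err.op_context == context)
+		return 0;
+	return -FI_EOTHER;
+}
+```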
+
+## SHM
+
+- Add support for Ze IPC protocol
+- Only perform IPC protocol related cleanup when using IPC
+- Disable cross-memory attach protocol when HMEM is enabled
+- Fix cross-memory attach support when running in containers
+- Always call SAR protocol's progress function
+- Enable cross-memory attach protocol when sending to self
+- Minor code cleanups and restructuring for maintenance
+
+## Sockets
+
+- Verify CM data size is less than supported value
+- Handle FI_SYNC_ERR flag on AV insert
+- Improve destination IP address checks
+- Minor coding cleanups based on static code analysis
+- Fix possible use after free access in Rx progress handling
+
+## TCP
+
+- Fix hangs on windows during connection setup
+- Relax CQ checks when enabling EP to handle send/recv only EPs
+- Fix possible use of unset return value in EP enable
+- Minor coding cleanups based on static code analysis
+- Handle EAGAIN during CM message exchanges
+- Set sockets to nonblocking on creation to avoid possible hangs at scale
+- Improve CM state tracking and optimize CM message flows
+- Make passive endpoints nonblocking to avoid hangs
+- Allow reading buffered data from disconnected endpoints
+- Implement fi_cancel for receive queues
+- Flush outstanding operations to user when an EP is disabled
+- Support dynamic receive buffering - removes need for bounce buffers
+- Add direct send feature - removes need for bounce buffers
+- Minor code cleanups and restructuring to improve maintenance
+- Add support for fi_domain_bind
+
+## Util
+
+- Improve checks that EPs are bound to necessary CQs
+- Fix confusing the AV's total size with its current count so the AV is
+  sized properly
+- Fix CQ buffer overrun protection mechanisms to avoid lost events
+
+## Verbs
+
+- Add SW credit flow control to improve performance over Ethernet
+- Skip verbs devices that report faulty information
+- Limit inline messages to iov = 1 to support more devices
+- Minor code improvements and restructuring to improve maintenance
+- Enable caching of device memory (RoCR, CUDA, Ze) registrations
+- Add HMEM support, including proprietary verbs support for P2P
+- Add support for registering device memory (see the sketch after this list)
+- Support GIDs at any GID index, not just 0
+- Fix macro definitions to clean up build warnings
+- Support GID-based connection establishment, removing the IPoIB requirement
+- Reduce per peer memory footprint for large scale fabrics
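+
+A minimal sketch of registering device (GPU) memory, assuming the domain
+reported FI_HMEM; the field names follow struct fi_mr_attr as documented
+in fi_mr(3):
+
+```c
+#include <sys/uio.h>
+#include <rdma/fabric.h>
+#include <rdma/fi_domain.h>
+
+int reg_cuda_buf(struct fid_domain *domain, void *dev_buf, size_t len,
+		 int cuda_device, struct fid_mr **mr)
+{
+	struct iovec iov = { .iov_base = dev_buf, .iov_len = len };
+	struct fi_mr_attr attr = {
+		.mr_iov = &iov,
+		.iov_count = 1,
+		.access = FI_SEND | FI_RECV,
+		.iface = FI_HMEM_CUDA,	/* buffer lives on a CUDA device */
+		.device = { .cuda = cuda_device },
+	};
+
+	return fi_mr_regattr(domain, &attr, 0, mr);
+}
+```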
+
+v1.11.2, Tue Dec 15, 2020
+=========================
+
+## Core
+
+- Handle data transfers > 4GB on OS X over tcp sockets
+- Fixed spelling and syntax in man pages
+- Fix pmem instruction checks
+
+## EFA
+
+- Use memory registration for emulated read protocol
+- Update send paths to use app memory descriptor if available
+- Remove unneeded check for local memory registration
+- Do not install fork handler if EFA is not used
+- Fix medium message RTM protocol
+- Fix memory registration leak in error path
+- Fix posting of REQ packets when using shm provider
+
+## RxM
+
+- Fix provider initialization when built as a dynamic library
+
+## SHM
+
+- Revert SAR buffer locking patch
+- Include correct header file for process_vm_readv/writev syscalls
+- Skip atomic fetch processing for non-fetch operations
+
+## TCP
+
+- Fix swapping of address and CQ data in RMA inject path
+
+## Util
+
+- Fix error code returned for invalid AV flags
+- Fix a bug finding the end of a page when the address is aligned
+
+## Verbs
+
+- Fix build warning in XRC CM log messages
+- Fix build warnings in debug macros
 
 v1.11.1, Fri Oct 9, 2020
 ========================
@@ -49,7 +687,7 @@ v1.11.1, Fri Oct 9, 2020
 - Create duplicate fi_info's when reporting FI_HMEM support
 - Handle transfers larger than 2GB
 - Register for signal using SA_ONSTACK
-- Fix segfault if peer has not been inserted intqqo local AV
+- Fix segfault if peer has not been inserted into local AV
 - Fix command/buffer tracking for sending connection requests
 - Return proper errno on AV lookup failures
 - Remove duplicate call to ofi_hmem_init
@@ -68,7 +706,7 @@ v1.11.1, Fri Oct 9, 2021
 ## TCP
 
 - Fix possible deadlock during EP shutdown due to lock inversion
-- Rework CM state maching to fix lock inversion handling disconnect
+- Rework CM state machine to fix lock inversion handling disconnect
 
 ## Util
 
diff --git a/deps/libfabric/README.md b/deps/libfabric/README.md
index 116014cceac22f36959474db01948a0092e59775..23cc0f65ed97ec0576519b452542fb7d745cf65f 100644
--- a/deps/libfabric/README.md
+++ b/deps/libfabric/README.md
@@ -1,5 +1,6 @@
 [<img alt="libfabric master branch Travis CI status" src="https://travis-ci.org/ofiwg/libfabric.svg?branch=master"/>](https://travis-ci.org/ofiwg/libfabric)
-[<img alt="libfabric Coverity scan suild status" src="https://scan.coverity.com/projects/4274/badge.svg"/>](https://scan.coverity.com/projects/4274)
+[<img alt="libfabric Coverity scan build status" src="https://scan.coverity.com/projects/4274/badge.svg"/>](https://scan.coverity.com/projects/4274)
+[<img alt="libfabric master branch AppVeyor CI status" src="https://ci.appveyor.com/api/projects/status/github/ofiwg/libfabric?svg=true"/>](https://ci.appveyor.com/api/projects/status/github/ofiwg/libfabric)
 [![libfabric release version](https://img.shields.io/github/release/ofiwg/libfabric.svg)](https://github.com/ofiwg/libfabric/releases/latest)
 
 # libfabric
@@ -140,12 +141,6 @@ See the `fi_gni(7)` man page for more details.
 
 - The `gni` provider requires `gcc` version 4.9 or higher.
 
-### mxm
-
-***
-
-The MXM provider has been deprecated and was removed after the 1.4.0 release.
-
 ### psm
 
 ***
@@ -177,8 +172,22 @@ Intel TrueScale Fabric.
 
 See the `fi_psm2(7)` man page for more details.
 
+### psm3
+
+***
+
+The `psm3` provider offers optimized performance and scalability for most
+verbs UD devices. Additional features and optimizations can be enabled when
+running over Intel's E810 Ethernet NICs and using Intel's rendezvous kernel
+module ([`rv`](https://github.com/intel/iefs-kernel-updates)). PSM 3.x fully integrates the OFI provider and the underlying
+PSM3 protocols/implementation and only exports the OFI APIs.
+
+See [`fi_psm3`(7)](https://ofiwg.github.io/libfabric/master/man/fi_psm3.7.html) for more details.
+
 ### rxm
 
+***
+
 The `ofi_rxm` provider is a utility provider that supports RDM endpoints emulated
 over MSG endpoints of a core provider.
 
@@ -336,27 +345,6 @@ See the `fi_netdir(7)` man page for more details.
   root of provider directory, i.e. \prov\netdir\NetDirect, where NetDirect contains
   the header files), specify them in the configuration properties of the VS project.
 
-### mlx
-
-***
-
-The MLX provider enables applications using OFI to be run over UCX
-communication library. It uses libucp for connections control and data transfer operations.
-Supported UCP API version: 1.2
-
-See the `fi_mlx(7)` man page for more details.
-
-#### Dependencies
-
-- The MLX provider requires UCP API 1.2 capable libucp and libucs (tested with hpcx v1.8.0, v1.9.7).
-  If you are compiling Libfabric from source and want to enable MLX
-  support, you will also need the matching header files for UCX.
-  If the libraries and header files are not in default paths, specify them using:
-
-```
---with-mlx=<path to local UCX installation>
-```
-
 ### shm
 
 ***
@@ -406,6 +394,7 @@ Even though windows isn't fully supported yet it is possible to compile and link
       1-2: Debug/Release ICC (restricted support for Intel Compiler XE 15.0 only)
       3-4: Debug/Release v140 (VS 2015 tool set)
       5-6: Debug/Release v141 (VS 2017 tool set)
+      7-8: Debug/Release v142 (VS 2019 tool set)
 
   make sure you choose the correct target fitting your compiler.
   By default the library will be compiled to `<libfabricroot>\x64\<yourconfigchoice>`
diff --git a/deps/libfabric/config/cron-run-all-md2nroff.pl b/deps/libfabric/config/cron-run-all-md2nroff.pl
deleted file mode 100755
index f1f0c78377ea4ba512b7804c41bdf999b1b86cd9..0000000000000000000000000000000000000000
--- a/deps/libfabric/config/cron-run-all-md2nroff.pl
+++ /dev/null
@@ -1,412 +0,0 @@
-#!/usr/bin/env perl
-
-# Script to pull down the latest markdown man pages from the libfabric
-# git repo.  Iterate over them, converting each to an nroff man page
-# and also copying+committing them to the gh-pages branch.  Finally,
-# git push them back upstream (so that Github will render + serve them
-# up as web pages).
-
-use strict;
-use warnings;
-
-use POSIX;
-use File::Basename;
-use Getopt::Long;
-use File::Temp;
-use JSON;
-use Data::Dumper;
-
-my $repo_arg;
-my $source_branch_arg;
-my $pages_branch_arg;
-my $logfile_dir_arg = "/tmp";
-my $pat_file_arg;
-my $verbose_arg;
-my $help_arg;
-
-my $ok = Getopt::Long::GetOptions("repo=s" => \$repo_arg,
-                                  "source-branch=s" => \$source_branch_arg,
-                                  "pages-branch=s" => \$pages_branch_arg,
-                                  "logfile-dir=s" => \$logfile_dir_arg,
-                                  "pat=s" => \$pat_file_arg,
-                                  "help|h" => \$help_arg,
-                                  "verbose" => \$verbose_arg,
-                                  );
-
-# Sanity checks
-die "Must specify a git repo"
-    if (!defined($repo_arg));
-die "Must specify a git source branch"
-    if (!defined($source_branch_arg));
-die "Must specify a Github Personal Access Token (PAT) file"
-    if (!defined($pat_file_arg));
-die "Github Personal Access Token (PAT) file unreadable"
-    if (! -r $pat_file_arg);
-
-#####################################################################
-
-open(FILE, $pat_file_arg) || die "Can't open Github Personal Access Token (PAT) file";
-my $pat = <FILE>;
-chomp($pat);
-close(FILE);
-
-$repo_arg =~ m/:(.+)\/(.+)\.git$/;
-my $gh_org = $1;
-my $gh_repo = $2;
-
-#####################################################################
-
-my $logfile_dir = $logfile_dir_arg;
-my $logfile_counter = 1;
-
-sub doit {
-    my $allowed_to_fail = shift;
-    my $cmd = shift;
-    my $stdout_file = shift;
-
-    # Redirect stdout if requested
-    if (defined $stdout_file) {
-        # Put a prefix on the logfiles so that we know that they
-        # belong to this script, and put a counter so that we know the
-        # sequence of logfiles
-        $stdout_file = "runall-md2nroff-$logfile_counter-$stdout_file";
-        ++$logfile_counter;
-
-        $stdout_file = "$logfile_dir/$stdout_file.log";
-        unlink($stdout_file);
-        $cmd .= " >$stdout_file";
-    } elsif (!$verbose_arg && $cmd !~ />/) {
-        $cmd .= " >/dev/null";
-    }
-    $cmd .= " 2>&1";
-
-    my $rc = system($cmd);
-    if (0 != $rc && !$allowed_to_fail) {
-        my_die("Command $cmd failed: exit status $rc");
-    }
-
-    system("cat $stdout_file")
-        if ($verbose_arg && defined($stdout_file) && -f $stdout_file);
-}
-
-sub verbose {
-    print @_
-        if ($verbose_arg);
-}
-
-sub my_die {
-    # Move out of our current cwd so that temp directories can be
-    # automatically cleaned up at close.
-    chdir("/");
-
-    die @_;
-}
-
-sub read_json_file {
-    my $filename = shift;
-    my $unlink_file = shift;
-
-    open(FILE, $filename);
-    my $contents;
-    while (<FILE>) {
-        $contents .= $_;
-    }
-    close(FILE);
-
-    unlink($filename)
-        if ($unlink_file);
-
-    return decode_json($contents);
-}
-
-#####################################################################
-
-# Setup a logfile dir just for this run
-my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) =
-    localtime(time);
-$logfile_dir =
-    sprintf("%s/cron-run-all-md2nroff-logs-%04d-%02d-%02d-%02d%02d",
-            $logfile_dir_arg, $year + 1900, $mon + 1, $mday,
-            $hour, $min);
-my $rc = system("mkdir $logfile_dir");
-if ($rc != 0 || ! -d $logfile_dir || ! -w $logfile_dir) {
-    my_die "mkdir of $logfile_dir failed, or can't write to it";
-}
-
-my $tmpdir = File::Temp->newdir();
-verbose("*** Working in: $tmpdir\n");
-chdir($tmpdir);
-
-# First, git clone the source branch of the repo
-verbose("*** Cloning repo: $repo_arg / $source_branch_arg...\n");
-doit(0, "git clone --single-branch --branch $source_branch_arg $repo_arg source", "git-clone-source");
-
-# Next, git clone the pages branch of repo
-if (defined($pages_branch_arg)) {
-    verbose("*** Cloning repo: $repo_arg / $pages_branch_arg...\n");
-    doit(0, "git clone --single-branch --branch $pages_branch_arg $repo_arg pages", "git-clone-pages");
-}
-
-#####################################################################
-# Look for all markdown man pages
-#####################################################################
-
-# Find all libfabric *.\d.md files
-verbose("*** Finding libfabric markdown man pages...\n");
-opendir(DIR, "source/man");
-my @libfabric_markdown_files =
-    map { "source/man/" . $_ }
-    grep { /\.\d\.md$/ && -f "source/man/$_" } readdir(DIR);
-closedir(DIR);
-verbose("Found: @libfabric_markdown_files\n");
-
-# Find all fabtests *.\d.md files
-verbose("*** Finding fabtests markdown man pages...\n");
-opendir(DIR, "source/fabtests/man");
-my @fabtests_markdown_files =
-    map { "source/fabtests/man/" . $_ }
-    grep { /\.\d\.md$/ && -f "source/fabtests/man/$_" } readdir(DIR);
-closedir(DIR);
-verbose("Found: @fabtests_markdown_files\n");
-
-#####################################################################
-# Publish any changes to man pages to the gh-pages branch
-# (only libfabric -- not fabtests)
-#####################################################################
-
-# Copy each of the markdown files to the pages branch checkout
-if (defined($pages_branch_arg)) {
-    chdir("pages/master");
-    foreach my $file (@libfabric_markdown_files) {
-        my $base = basename($file);
-        doit(0, "cp $tmpdir/$file man/$base", "loop-cp");
-
-        # Is there a new man page?  If so, we need to "git add" it.
-        my $out = `git status --porcelain man/$base`;
-        doit(0, "git add man/$base", "loop-git-add")
-            if ($out =~ /^\?\?/);
-    }
-
-    # Generate a new index.md with all the files that we just
-    # published.  First, read in the header stub.
-    open(IN, "man/index-head.txt") ||
-        my_die("failed to open index-head.txt");
-    my $str;
-    $str .= $_
-        while (<IN>);
-    close(IN);
-
-    # Write out the header stub into index.md itself
-    open(OUT, ">man/index.md") ||
-        my_die("failed to write to new index.md file");
-    print OUT $str;
-
-    # Now write out all the pages
-    my @headings;
-    push(@headings, { section=>7, title=>"General information" });
-    push(@headings, { section=>3, title=>"API documentation" });
-    foreach my $h (@headings) {
-        print OUT "\n* $h->{title}\n";
-        foreach my $file (sort(@libfabric_markdown_files)) {
-            my $base = basename($file);
-            if ($base =~ /\.$h->{section}\.md$/) {
-                $base =~ m/^(.+)\.$h->{section}\.md$/;
-                my $shortname = $1;
-                print OUT "  * [$shortname($h->{section})]($shortname.$h->{section}.html)\n";
-            }
-        }
-    }
-    close(OUT);
-
-    # Git commit those files in the pages repo and push them to the
-    # upstream repo so that they go live.  If nothing changed, the commit
-    # and push will be no-ops.
-    chdir("..");
-    doit(1, "git commit -s --no-verify -a -m \"Updated Markdown man pages from $source_branch_arg\"",
-         "git-commit-pages");
-    doit(1, "git push", "git-push-pages");
-}
-
-#####################################################################
-# Look for changes to .md files and generate nroff files on master
-#####################################################################
-
-my @markdown_files = (@libfabric_markdown_files,
-                      @fabtests_markdown_files);
-
-# Now process each of the Markdown files in the source repo and
-# generate new nroff man pages.
-chdir("$tmpdir");
-foreach my $file (@markdown_files) {
-    doit(0, "$tmpdir/source/config/md2nroff.pl --source $file", "loop2-md2nroff");
-}
-
-#####################################################################
-
-# Similar to above: commit the newly-generated nroff pages and push
-# them back upstream.  If nothing changed, these will be no-ops.  Note
-# that there are mandatory CI checks on master, which means we can't
-# push directly.  Instead, we must make a pull request.  Hence, don't
-# git commit directly to the pages branch here; make a branch and
-# commit there.
-
-# Try to delete the old pr branch first (it's ok to fail -- i.e., if
-# it wasn't there).
-chdir("$tmpdir/source");
-my $pr_branch_name = "pr/update-nroff-generated-man-pages";
-doit(1, "git branch -D $pr_branch_name");
-doit(0, "git checkout -b $pr_branch_name");
-
-# Do the commit.  Save the git HEAD hash before and after so that we
-# can tell if the "git commit" command actually resulted in a new
-# commit.
-my $old_head=`git rev-parse HEAD`;
-doit(1, "git commit -s --no-verify -a -m \"Updated nroff-generated man pages\"",
-     "git-commit-source-generated-man-pages");
-my $new_head=`git rev-parse HEAD`;
-
-# See if the commit was a no op or not.
-if ($old_head ne $new_head) {
-    chomp($new_head);
-
-    # Push the branch up to github
-    doit(0, "git push --force", "git-push-source-generated-man-pages");
-
-    # Get the list of files
-    open(GIT, 'git diff-tree --no-commit-id --name-only -r HEAD|') ||
-        my_die "Cannot git diff-tree";
-    my @files;
-    while (<GIT>) {
-        chomp;
-        push(@files, $_);
-    }
-    close(GIT);
-
-    # Create a new pull request
-    my $cmd_base = "curl --silent ";
-    $cmd_base .= "-H 'Content-Type: application/json' ";
-    $cmd_base .= "-H 'Authorization: token $pat' ";
-    $cmd_base .= "-H 'User-Agent: OFIWG-bot' ";
-
-    my $outfile = 'curl-out.json';
-    unlink($outfile);
-
-    my $body;
-    $body = "The Nroff Elves created these man pages, just for you:\n\n";
-    foreach my $f (@files) {
-        $body .= "* `$f`\n";
-    }
-
-    my $json = {
-        title => 'Update nroff-generated man pages',
-        body  => $body,
-        head  => $pr_branch_name,
-        base  => 'master',
-    };
-    my $json_encoded = encode_json($json);
-
-    my $cmd = $cmd_base;
-    $cmd .= "--request POST ";
-    $cmd .= "--data '$json_encoded' ";
-    $cmd .= "https://api.github.com/repos/$gh_org/$gh_repo/pulls ";
-    $cmd .= "-o $outfile";
-    doit(0, $cmd, "github-create-pr");
-
-    # Read the resulting file to find whether the PR creation
-    # succeeded, and if so, what the URL of the new PR is.
-    $json = read_json_file($outfile, 1);
-    if (!exists($json->{'id'}) || !exists($json->{'url'})) {
-        my_die "Failed to create PR";
-    }
-
-    my $pr_url = $json->{'url'};
-    my $pr_num = $json->{'number'};
-    verbose("Created PR #$pr_num\n");
-
-    # Wait for the required DCO check to complete on the git hash for
-    # the latest commit.
-    $outfile = "github-ci-status-check.json";
-
-    $cmd = $cmd_base;
-    $cmd .= "-o $outfile ";
-    $cmd .= "-H 'Accept: application/vnd.github.antiope-preview+json' ";
-    $cmd .= "https://api.github.com/repos/$gh_org/$gh_repo/commits/$new_head/check-runs";
-
-    my $count = 0;
-    my $max_count = 30;
-    my $happy = 0;
-    verbose("Waiting for DCO check to complete\n");
-
-    # Only wait for $max_count iterations
-    while (!$happy && $count < $max_count) {
-        # Give the DCO hook time to run
-        sleep(1);
-
-        unlink($outfile);
-        doit(0, $cmd, "github-check-run-status");
-        my $json = read_json_file($outfile, 1);
-
-        if ($json and $#{$json->{"check_runs"}} >= 0) {
-            # If we got any statuses back, check them to see if we can
-            # find a successful DCO signoff.  That would indicate that
-            # the required check test ran.
-            foreach my $j (@{$json->{"check_runs"}}) {
-                if ($j->{"name"} eq "DCO") {
-                    verbose("Found DCO status on SHA $new_head\n");
-                    if ($j->{"status"} eq "completed") {
-                        if ($j->{"conclusion"} eq "success") {
-                            verbose("DCO is happy!\n");
-                            $happy = 1;
-                            last;
-                        } else {
-                            verbose("DCO is not happy -- how did that happen?\n");
-                            $happy = 0;
-                            last;
-                        }
-                    }
-                }
-            }
-        }
-
-        $count += 1;
-    }
-
-    my_die("Could not find a happy DCO status on $new_head")
-        if (!$happy);
-
-    # If we get here, it means the DCO CI is done/happy, so we can
-    # merge the PR.
-    $json = {
-        commit_title   => "Merge pull request #$pr_num",
-        commit_message => "More tasty nroff man pages for you, fresh out of the oven!",
-        sha            => $new_head,
-        merge_method   => "merge",
-    };
-    $json_encoded = encode_json($json);
-
-    $outfile = "github-per-merge.json";
-    unlink($outfile);
-
-    $cmd = $cmd_base;
-    $cmd .= "--request PUT ";
-    $cmd .= "--data '$json_encoded' ";
-    $cmd .= "-o $outfile ";
-    $cmd .= "$pr_url/merge";
-    doit(0, $cmd, "github-create-pr");
-
-    # Remove the remote branch
-    doit(1, "git push origin --delete $pr_branch_name", 'git-remove-remote-branch');
-}
-
-# Delete the local pull request branch
-doit(0, "git checkout master");
-doit(1, "git branch -D $pr_branch_name");
-
-# chdir out of the tmpdir so that it can be removed
-chdir("/");
-
-# If we get here, we finished successfully, so there's no need to keep
-# the logfile dir around
-system("rm -rf $logfile_dir");
-
-exit(0);
diff --git a/deps/libfabric/config/distscript.pl b/deps/libfabric/config/distscript.pl
index 25b44837876182d602925c4533a564fe2b57e751..fd49943f6c341acc51de0df745305bff63610941 100755
--- a/deps/libfabric/config/distscript.pl
+++ b/deps/libfabric/config/distscript.pl
@@ -29,17 +29,8 @@ sub subst {
     close(IN);
 
     my $copy = $orig;
-    $copy =~ s/\@VERSION\@/Libfabric v$version/g;
-    $copy =~ s/\@DATE\@/$today/g;
-
-    # Note that there appears to be a bug in some versions of Pandoc
-    # that will escape the appearance of @ in generated man pages
-    # (e.g., in the "@VERSION@" that appears in the man page version
-    # field).  So rather than be clever in the regexp's above, do the
-    # simple/clear thing and repeat the same regexp's as above, but
-    # with double-escaped @'s.
-    $copy =~ s/\\\@VERSION\\\@/Libfabric v$version/g;
-    $copy =~ s/\\\@DATE\\\@/$today/g;
+    $copy =~ s/#VERSION#/Libfabric v$version/g;
+    $copy =~ s/#DATE#/$today/g;
 
     if ($copy ne $orig) {
         print "*** VERSION/DATE-ifying $file...\n";
diff --git a/deps/libfabric/config/fi_provider.m4 b/deps/libfabric/config/fi_provider.m4
index e01e3373c9f87d1abbf5b56ad30dddf53c9ece59..eae2d8a39afce3a7fa33de5ccd2d89c31435fd1b 100644
--- a/deps/libfabric/config/fi_provider.m4
+++ b/deps/libfabric/config/fi_provider.m4
@@ -68,7 +68,13 @@ dnl
 					and use $1 installed under PATH)])
 			     ],
 			     [],
-			     [enable_$1=auto])])
+			     [AS_IF([test x"$enable_only" != x"no"],
+			            [AC_MSG_NOTICE([*** Skipping $1 because $enable_only set])
+			             enable_$1=no],
+			            [enable_$1=auto])
+			    ])
+	      ])
+
 
 	# Save CPPFLAGS and LDFLAGS before they are modified by FI_CHECK_PREFIX_DIR.
 	# Provider's local macros could use the value if needed.
diff --git a/deps/libfabric/config/fi_strip_optflags.m4 b/deps/libfabric/config/fi_strip_optflags.m4
new file mode 100644
index 0000000000000000000000000000000000000000..3911b0757908f48cff6bcddebadd507401226c49
--- /dev/null
+++ b/deps/libfabric/config/fi_strip_optflags.m4
@@ -0,0 +1,62 @@
+dnl -*- shell-script -*-
+dnl
+dnl Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+dnl                         University Research and Technology
+dnl                         Corporation.  All rights reserved.
+dnl Copyright (c) 2004-2005 The University of Tennessee and The University
+dnl                         of Tennessee Research Foundation.  All rights
+dnl                         reserved.
+dnl Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+dnl                         University of Stuttgart.  All rights reserved.
+dnl Copyright (c) 2004-2005 The Regents of the University of California.
+dnl                         All rights reserved.
+dnl Copyright (c) 2008      Cisco Systems, Inc.  All rights reserved.
+dnl Copyright (c) 2008-2009 Sun Microsystems, Inc.  All rights reserved.
+dnl Copyright (c) 2014-2021 Intel, Inc. All rights reserved.
+dnl $COPYRIGHT$
+dnl
+dnl Additional copyrights may follow
+dnl
+dnl $HEADER$
+dnl
+
+dnl
+dnl This file derived from config/opal_strip_optflags.m4 in Open MPI.
+dnl
+dnl Example Usage:
+dnl      FI_STRIP_OPTFLAGS($CFLAGS)
+dnl      CFLAGS_WITHOUT_OPTFLAGS="$s_result"
+
+AC_DEFUN([FI_STRIP_OPTFLAGS],[
+
+# Process a set of flags and remove all debugging and optimization
+# flags
+
+s_arg="$1"
+s_result=
+for s_word in $s_arg; do
+    # See http://www.gnu.org/software/autoconf/manual/html_node/Quadrigraphs.html#Quadrigraphs
+    # for an explanation of @<:@ and @:>@ -- they m4 expand to [ and ]
+    case $s_word in
+    -g)                 ;;
+    -g@<:@1-3@:>@)      ;;
+    +K@<:@0-5@:>@)      ;;
+    -O)                 ;;
+    -O@<:@0-9@:>@)      ;;
+    -xO)                ;;
+    -xO@<:@0-9@:>@)     ;;
+    -fast)              ;;
+    -finline-functions) ;;
+
+    # The below Sun Studio flags require or
+    # trigger -xO optimization
+    -xvector*)          ;;
+    -xdepend=yes)       ;;
+
+    *)     s_result="$s_result $s_word"
+    esac
+done
+
+# Clean up
+
+unset s_word s_arg])
diff --git a/deps/libfabric/config/md2nroff.pl b/deps/libfabric/config/md2nroff.pl
index 18ce6671832c4e72270f0a3287fbc906c6de9e00..b0fef785a48b2d04ab52209ae2357c20cac42305 100755
--- a/deps/libfabric/config/md2nroff.pl
+++ b/deps/libfabric/config/md2nroff.pl
@@ -99,9 +99,9 @@ while ($pandoc_input =~ m/\[(.+?)\]\(.+?\)/) {
 }
 
 # Add the pandoc header
-$pandoc_input = "% $shortfile($section) Libfabric Programmer's Manual | \@VERSION\@
+$pandoc_input = "% $shortfile($section) Libfabric Programmer's Manual | #VERSION#
 % OpenFabrics
-% \@DATE\@\n\n$pandoc_input";
+% #DATE#\n\n$pandoc_input";
 
 # Generate the nroff output
 my ($fh, $temp_filename) = tempfile();
@@ -132,8 +132,8 @@ if (-r $target) {
     # compare and ignore if the date has changed.  Note that some
     # versions of pandoc render dates as xxxx\-xx\-xx, and others
     # render it as xxxx-xx-xx.  Handle both.
-    $target_nroff =~ s/\"\d\d\d\d\\\-\d\d\\\-\d\d\"/\"\\\@DATE\\\@\"/;
-    $target_nroff =~ s/\"\d\d\d\d\-\d\d\-\d\d\"/\"\\\@DATE\\\@\"/;
+    $target_nroff =~ s/\"\d\d\d\d\\\-\d\d\\\-\d\d\"/\"#DATE#\"/;
+    $target_nroff =~ s/\"\d\d\d\d\-\d\d\-\d\d\"/\"#DATE#\"/;
 
     $write_nroff = 0
         if ($pandoc_nroff eq $target_nroff);
@@ -144,7 +144,7 @@ if ($write_nroff) {
 
     # What's the date right now?
     my $now_string = strftime "%Y\\-%m\\-%d", localtime;
-    $pandoc_nroff =~ s/\\\@DATE\\\@/$now_string/g;
+    $pandoc_nroff =~ s/#DATE#/$now_string/g;
 
     # Make sure the target directory exists
     my $dirname = dirname($target);
diff --git a/deps/libfabric/configure.ac b/deps/libfabric/configure.ac
index 04ae0696f916209d8b4ffdac6028596328100914..8882478e570e4316be4fb4233435e84eb59f2c6c 100644
--- a/deps/libfabric/configure.ac
+++ b/deps/libfabric/configure.ac
@@ -1,13 +1,13 @@
 dnl
 dnl Copyright (c) 2016 Cisco Systems, Inc.  All rights reserved.
-dnl Copyright (c) 2019 Intel, Inc.  All rights reserved.
+dnl Copyright (c) 2019-2021 Intel, Inc.  All rights reserved.
 dnl Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
 dnl (C) Copyright 2020 Hewlett Packard Enterprise Development LP
 dnl
 dnl Process this file with autoconf to produce a configure script.
 
 AC_PREREQ([2.60])
-AC_INIT([libfabric], [1.11.1], [ofiwg@lists.openfabrics.org])
+AC_INIT([libfabric], [1.14.0], [ofiwg@lists.openfabrics.org])
 AC_CONFIG_SRCDIR([src/fabric.c])
 AC_CONFIG_AUX_DIR(config)
 AC_CONFIG_MACRO_DIR(config)
@@ -15,6 +15,7 @@ AC_CONFIG_HEADERS(config.h)
 AM_INIT_AUTOMAKE([1.11 dist-bzip2 foreign -Wall -Werror subdir-objects parallel-tests tar-pax])
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 m4_include(config/fi_check_package.m4)
+m4_include(config/fi_strip_optflags.m4)
 
 AC_CANONICAL_HOST
 
@@ -85,6 +86,12 @@ AC_ARG_ENABLE([direct],
 	[],
 	[enable_direct=no])
 
+AC_ARG_ENABLE([only],
+	[AS_HELP_STRING([--enable-only],
+		[Only build explicitly specified fabric providers])
+	],
+	[],
+	[enable_only=no])
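+dnl When --enable-only is given, fi_provider.m4 defaults every provider
+dnl that is not explicitly enabled to "no" instead of "auto".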
 
 AC_ARG_ENABLE([atomics],
 	[AS_HELP_STRING([--enable-atomics],
@@ -132,6 +139,16 @@ AS_IF([test x"$enable_debug" != x"no"],
 AC_DEFINE_UNQUOTED([ENABLE_DEBUG],[$dbg],
                    [defined to 1 if libfabric was configured with --enable-debug, 0 otherwise])
 
+AC_ARG_ENABLE([asan],
+	      [AS_HELP_STRING([--enable-asan],
+			      [Enable address sanitizer @<:@default=no@:>@])
+	      ],
+	      [],
+	      [enable_asan=no])
+
+AS_IF([test x"$enable_asan" != x"no"],
+      [CFLAGS="-fsanitize=address $CFLAGS"])
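+dnl A minimal usage sketch (assumed typical invocation, not upstream docs):
+dnl     ./configure --enable-asan && make
+dnl The flag is prepended, so user-supplied CFLAGS still take effect.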
+
 dnl Checks for header files.
 AC_HEADER_STDC
 
@@ -147,7 +164,7 @@ AC_ARG_WITH([dlopen],
 		       [dl-loadable provider support @<:@default=yes@:>@]),
 	)
 
-if test "$freebsd" == "0"; then
+if test "$freebsd" = "0"; then
 AS_IF([test x"$with_dlopen" != x"no"], [
 AC_CHECK_LIB(dl, dlopen, [],
     AC_MSG_ERROR([dlopen not found.  libfabric requires libdl.]))
@@ -274,6 +291,8 @@ AS_IF([test x"$enable_atomics" != x"no"],
 
 dnl Check for gcc memory model aware built-in atomics
 dnl If supported check to see if not internal to compiler
+dnl If built-in atomics present, check for 128-bit atomic support
+have_mm_atomics=0
 LIBS_save=$LIBS
 AC_SEARCH_LIBS([__atomic_load_8], [atomic])
 AS_IF([test x"$enable_atomics" != x"no"],
@@ -296,6 +315,7 @@ AS_IF([test x"$enable_atomics" != x"no"],
         [
             AC_MSG_RESULT(yes)
             AC_DEFINE(HAVE_BUILTIN_MM_ATOMICS, 1, [Set to 1 to use built-in intrinsics memory model aware atomics])
+	    have_mm_atomics=1
         ],
         [
             AC_MSG_RESULT(no)
@@ -307,6 +327,34 @@ AS_IF([test x"$enable_atomics" != x"no"],
 ])
 unset LIBS_save
 
+dnl Check for 128-bit integer support
+AC_CHECK_TYPE([__int128],
+	[AC_DEFINE(HAVE___INT128, 1, [Set to 1 to use 128-bit ints])])
+
+dnl Check for 128-bit integer built-in atomic support
+AS_IF([test "$have_mm_atomics" -eq 1 -a "$ac_cv_type___int128" = "yes"],
+    AC_MSG_CHECKING(compiler support for built-in memory model aware 128-bit atomics)
+    AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <stdint.h>]],
+        [[__int128 d;
+         __int128 s;
+         __int128 c;
+         __int128 r;
+          r = __atomic_fetch_add(&d, s, __ATOMIC_SEQ_CST);
+          __atomic_load(&d, &r, __ATOMIC_SEQ_CST);
+          __atomic_exchange(&d, &s, &r, __ATOMIC_SEQ_CST);
+          __atomic_compare_exchange(&d, &c, &s, 0, __ATOMIC_SEQ_CST,
+	  			    __ATOMIC_SEQ_CST);
+	 return 0;
+        ]])],
+        [
+            AC_MSG_RESULT(yes)
+            AC_DEFINE(HAVE_BUILTIN_MM_INT128_ATOMICS, 1, [Set to 1 to use built-in intrinsics memory model aware 128-bit integer atomics])
+        ],
+        [
+            AC_MSG_RESULT(no)
+        ]),
+)
+
 dnl Check for gcc cpuid intrinsics
 AC_MSG_CHECKING(compiler support for cpuid)
 AC_TRY_LINK([
@@ -386,7 +434,7 @@ AC_TRY_LINK(
 		int foo(int arg) { return arg + 3; };
 		int foo2(int arg) __attribute__ (( __alias__("foo")));
 	],
-	[ /* empty main */ ],
+	[ foo2(1); ],
 	[
 		AC_MSG_RESULT(yes)
 		ac_prog_cc_alias_symbols=1
@@ -500,7 +548,7 @@ AS_IF([test x"$with_cuda" != x"no"],
 			      [])],
 	    [])
 
-AS_IF([test "$with_cuda" = "yes" && test "$have_libcuda" = "0" ],
+AS_IF([test x"$with_cuda" != x"no" && test -n "$with_cuda" && test "$have_libcuda" = "0" ],
 	[AC_MSG_ERROR([CUDA support requested but CUDA runtime not available.])],
 	[])
 AC_DEFINE_UNQUOTED([HAVE_LIBCUDA], [$have_libcuda], [Whether we have CUDA runtime or not])
@@ -510,7 +558,7 @@ AC_ARG_ENABLE([cuda-dlopen],
         [Enable dlopen of CUDA libraries @<:@default=no@:>@])
     ],
     [
-        AS_IF([test "$freebsd" == "0"], [
+        AS_IF([test "$freebsd" = "0"], [
             AC_CHECK_LIB(dl, dlopen, [],
                 [AC_MSG_ERROR([dlopen not found.  libfabric requires libdl.])])
         ])
@@ -523,6 +571,7 @@ AC_ARG_WITH([ze],
 					 libraries and headers are installed.]),
 	[], [])
 
+have_ze=0
 AS_IF([test x"$with_ze" != x"no"],
       [FI_CHECK_PACKAGE([ze],
 			[level_zero/ze_api.h],
@@ -531,13 +580,40 @@ AS_IF([test x"$with_ze" != x"no"],
 			[],
 			[$with_ze],
 			[],
-			[AC_DEFINE([HAVE_LIBZE], [1],[ZE support])],
-			[], [])
-       CPPFLAGS="$CPPFLAGS $ze_CPPFLAGS"
-       LDFLAGS="$LDFLAGS $ze_LDFLAGS"
-       LIBS="$LIBS $ze_LIBS"],
+			[have_ze=1],
+			[], [])],
       [])
 
+have_drm=0
+AS_IF([test "$have_ze" = "1"],
+      [AC_CHECK_HEADER(drm/i915_drm.h, [have_drm=1], [])]
+      [])
+
+AS_IF([test x"$with_ze" != x"no" && test -n "$with_ze" && test "$have_ze" = "0" ],
+	[AC_MSG_ERROR([ZE support requested but ZE runtime not available.])],
+	[])
+
+AC_DEFINE_UNQUOTED([HAVE_LIBZE], [$have_ze], [ZE support])
+AC_DEFINE_UNQUOTED([HAVE_DRM], [$have_drm], [i915 DRM header])
+
+AC_ARG_ENABLE([ze-dlopen],
+    [AS_HELP_STRING([--enable-ze-dlopen],
+        [Enable dlopen of ZE libraries @<:@default=no@:>@])
+    ],
+    [
+        AS_IF([test "$freebsd" = "0"], [
+            AC_CHECK_LIB(dl, dlopen, [],
+                [AC_MSG_ERROR([dlopen not found.  libfabric requires libdl.])])
+        ])
+        AC_DEFINE([ENABLE_ZE_DLOPEN], [1], [dlopen ZE libraries])
+    ],
+    [enable_ze_dlopen=no])
+
+AS_IF([test x"$enable_ze_dlopen" != x"yes"], [LIBS="$LIBS $ze_LIBS"])
+AS_IF([test "$have_ze" = "1" && test x"$with_ze" != x"yes"],
+      [CPPFLAGS="$CPPFLAGS $ze_CPPFLAGS"
+       LDFLAGS="$LDFLAGS $ze_LDFLAGS"])
+
 enable_memhooks=1
 AC_ARG_ENABLE([memhooks-monitor],
               [AC_HELP_STRING([--disable-memhooks-monitor],
@@ -548,7 +624,7 @@ AC_ARG_ENABLE([memhooks-monitor],
 AC_DEFINE_UNQUOTED(ENABLE_MEMHOOKS_MONITOR, [$enable_memhooks],
 	[Define to 1 to enable memhooks memory monitor])
 
-AS_IF([test "$enable_memhooks" == "1"], [
+AS_IF([test "$enable_memhooks" = "1"], [
 	AC_CHECK_FUNCS([__curbrk __clear_cache])
 	AC_CHECK_HEADERS([linux/mman.h sys/syscall.h])
 	AC_CHECK_DECLS([__syscall], [], [], [#include <sys/syscall.h>])
@@ -585,6 +661,56 @@ LDFLAGS="$LDFLAGS $cuda_LDFLAGS"
 
 AS_IF([test x"$enable_cuda_dlopen" != x"yes"], [LIBS="$LIBS $cuda_LIBS"])
 
+#gdrcopy related configs
+AC_ARG_WITH([gdrcopy],
+	    [AC_HELP_STRING([--with-gdrcopy=DIR],
+			    [Provide path to where the gdrcopy development
+			    and runtime libraries are installed.])],
+	    [], [])
+
+AS_IF([test -n "$with_gdrcopy" && test x"$with_gdrcopy" != x"no" && test "$have_libcuda" = "0"],
+	[AC_MSG_ERROR([gdrcopy is requested but cuda is not requested or cuda runtime is not available.])],
+	[])
+
+have_gdrcopy=0
+AS_IF([test "$have_libcuda" = "1" && test x"$with_gdrcopy" != x"no"],
+	[AS_IF([test x"$with_gdrcopy" = x"yes"],[gdrcopy_dir=""],[gdrcopy_dir=$with_gdrcopy])
+	 FI_CHECK_PACKAGE([gdrcopy],
+			  [gdrapi.h],
+			  [gdrapi],
+			  [gdr_open],
+			  [],
+			  [$gdrcopy_dir],
+			  [],
+			  [have_gdrcopy=1],
+			  [],
+			  [])],
+	[])
+
+AS_IF([test x"$with_gdrcopy" != x"no" && test -n "$with_gdrcopy" && test "$have_gdrcopy" = "0" ],
+	[AC_MSG_ERROR([gdrcopy support requested but gdrcopy development library is not available.])],
+	[])
+
+AC_DEFINE_UNQUOTED([HAVE_GDRCOPY], [$have_gdrcopy], [Whether we have gdrcopy development library or not])
+
+AC_ARG_ENABLE([gdrcopy-dlopen],
+    [AS_HELP_STRING([--enable-gdrcopy-dlopen],
+        [Enable dlopen of gdrcopy libraries @<:@default=no@:>@])
+    ],
+    [
+        AS_IF([test "$freebsd" = "0"], [
+            AC_CHECK_LIB(dl, dlopen, [],
+                [AC_MSG_ERROR([dlopen not found.  libfabric requires libdl.])])
+        ])
+        AC_DEFINE([ENABLE_GDRCOPY_DLOPEN], [1], [dlopen gdrcopy libraries])
+    ],
+    [enable_gdrcopy_dlopen=no])
+
+CPPFLAGS="$CPPFLAGS $gdrcopy_CPPFLAGS"
+LDFLAGS="$LDFLAGS $gdrcopy_LDFLAGS"
+AS_IF([test x"$enable_gdrcopy_dlopen" != x"yes"], [LIBS="$LIBS $gdrcopy_LIBS"])
+#end gdrcopy configures
+
 dnl Check for ROCR runtime libraries.
 AC_ARG_WITH([rocr],
 	    [AC_HELP_STRING([--with-rocr=DIR],
@@ -597,7 +723,7 @@ AC_ARG_ENABLE([rocr-dlopen],
         [Enable dlopen of ROCR libraries @<:@default=no@:>@])
     ],
     [
-        AS_IF([test "$freebsd" == "0"], [
+        AS_IF([test "$freebsd" = "0"], [
             AC_CHECK_LIB(dl, dlopen, [],
                 [AC_MSG_ERROR([dlopen not found.  libfabric requires libdl.])])
         ])
@@ -620,10 +746,13 @@ LDFLAGS="$LDFLAGS $rocr_LDFLAGS"
 
 AS_IF([test x"$enable_rocr_dlopen" != x"yes"], [LIBS="$LIBS $rocr_LIBS"])
 
+AC_CHECK_SIZEOF([void *])
+
 dnl Provider-specific checks
 FI_PROVIDER_INIT
 FI_PROVIDER_SETUP([psm])
 FI_PROVIDER_SETUP([psm2])
+FI_PROVIDER_SETUP([psm3])
 FI_PROVIDER_SETUP([sockets])
 FI_PROVIDER_SETUP([verbs])
 FI_PROVIDER_SETUP([efa])
@@ -678,7 +807,7 @@ fi
 
 for i in $PROVIDERS_TO_BUILD; do
 	v=${i}_dl
-	if test `eval echo \\$${v}` == "1"; then
+	if test `eval echo \\$${v}` = "1"; then
 		dso="$i ${dso}"
 	else
 		builtin="$i ${builtin}"
diff --git a/deps/libfabric/contrib/buildrpm/README b/deps/libfabric/contrib/buildrpm/README
index 98fc95ac834eee8dafd3775e99cc3c562f328090..f25a544406594df5e80faf214d581e282151ae9e 100644
--- a/deps/libfabric/contrib/buildrpm/README
+++ b/deps/libfabric/contrib/buildrpm/README
@@ -47,6 +47,10 @@ Provider parameters:
 
 
 General parameters:
+-b
+    Build binary packages only
+    By default, build binary and source packages
+
 -n
     Do nothing, useful with -v option. If used with -v option,
     it will just print what would have been done.
diff --git a/deps/libfabric/contrib/buildrpm/buildrpmLibfabric.sh b/deps/libfabric/contrib/buildrpm/buildrpmLibfabric.sh
index 3e8cef4fb8625b86964666d1d96f4fcf7a4c5856..9202932e9308d4edf7d69610a412da2d706188ba 100755
--- a/deps/libfabric/contrib/buildrpm/buildrpmLibfabric.sh
+++ b/deps/libfabric/contrib/buildrpm/buildrpmLibfabric.sh
@@ -51,6 +51,7 @@ create_modulefile=""
 unpack_spec=""
 verbose=""
 verboseoption=""
+build_binary_only=""
 st=""
 version=""
 modulepath=""
@@ -105,7 +106,7 @@ error()
 # usage information
 ###################
 usage="Usage: $0 [-i provider_name] [-e provider_name]
-       [-n] [-o] [-m] [-d] [-s] [-c] [-r] [-v] [-h] tarball
+       [-b] [-n] [-o] [-l] [-m] [-d] [-s] [-c] [-r] [-v] [-h] tarball
 
  Provider options:
 
@@ -116,11 +117,17 @@ usage="Usage: $0 [-i provider_name] [-e provider_name]
              exclude 'provider_name' provider support from the build
 
  General options:
+  -b         build binary packages only
+               {default: build binary and source packages}
+
   -n         no op, do nothing (useful with -v option)
 
   -o         install under /opt/libfabric/_VERSION_
                {default: install under /usr/ }
 
+  -l         create symbolic link 'default' to _VERSION_ (requires -o option)
+              {default: link not created}
+
   -m         install modulefile
               {default: don't install modulefile}
 
@@ -160,8 +167,10 @@ usage="Usage: $0 [-i provider_name] [-e provider_name]
 # parse args
 ############
 export arguments="$@"
-while getopts DP:M:V:nomi:e:dc:r:svh flag; do
+while getopts DP:M:V:nolmi:e:dc:r:svhb flag; do
     case "$flag" in
+      b) build_binary_only="true"
+         ;;
       n) noop="true"
          ;;
       o) install_in_opt="true"
@@ -191,6 +200,8 @@ while getopts DP:M:V:nomi:e:dc:r:svh flag; do
          ;;
       v) verbose="true"
          ;;
+      l) version_symbolic_link="true"
+         ;;
       h) echo "$usage"
          exit 0
          ;;
@@ -261,6 +272,9 @@ if [[ -n "$install_in_opt" ]]; then
   if [[ -z "$prefix" ]] ; then
     prefix=$default_opt_prefix
   fi
+  if [[ -n "$version_symbolic_link" ]]; then
+    rpmbuild_options="$rpmbuild_options --define '_version_symbolic_link $prefix/libfabric/default'"
+  fi
   prefix="$prefix/libfabric/$version"
 
   if [[ -n "$modulepath" ]] ; then
@@ -327,7 +341,14 @@ if [[ -z "$verbose" ]]; then
 else
   build_opt="-v"
 fi
-cmd="rpmbuild $build_opt -bb $specfile $rpmbuild_options \
+
+if [[ -n "$build_binary_only" ]] ; then
+    rpmbuild_flag="-bb"
+else
+    rpmbuild_flag="-ba"
+fi
+
+cmd="rpmbuild $build_opt $rpmbuild_flag $specfile $rpmbuild_options \
   --define '_topdir $rpmbuilddir' \
   --define '_sourcedir $rpmbuilddir/SOURCES' \
   --define '_rpmdir $rpmbuilddir/RPMS' \
diff --git a/deps/libfabric/contrib/intel/jenkins/Jenkinsfile b/deps/libfabric/contrib/intel/jenkins/Jenkinsfile
index fe992a2f41865e99e0a56a4387e3274e2650abeb..fba9d6eac320f69b67a58795b8cec90009c94ce8 100644
--- a/deps/libfabric/contrib/intel/jenkins/Jenkinsfile
+++ b/deps/libfabric/contrib/intel/jenkins/Jenkinsfile
@@ -5,6 +5,9 @@ pipeline {
         timestamps()
         timeout(activity: true, time: 4, unit: 'HOURS')    
     }
+    environment {
+        JOB_CADENCE = 'PR'
+    }
 
     stages {
         stage ('fetch-opa-psm2')  {
@@ -202,7 +205,8 @@ pipeline {
         withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) {
             sh "rm -rf '/mpibuilddir/mpich-build-dir/${env.JOB_NAME}/${env.BUILD_NUMBER}'"
             sh "rm -rf '/mpibuilddir/ompi-build-dir/${env.JOB_NAME}/${env.BUILD_NUMBER}'"
-            dir("${env.WORKSPACE}"){
+            sh "rm -rf '/mpibuilddir/mpich-suite-build-dir/${env.JOB_NAME}/${env.BUILD_NUMBER}'"
+	    dir("${env.WORKSPACE}"){
                 deleteDir()
             }
         }
diff --git a/deps/libfabric/contrib/intel/jenkins/Jenkinsfile.daily b/deps/libfabric/contrib/intel/jenkins/Jenkinsfile.daily
index 06e7512981d13a93c2b9338a93d02c11cc61eab5..2dbabe09a5b4badac2012981a73e4a62b33dfa58 100644
--- a/deps/libfabric/contrib/intel/jenkins/Jenkinsfile.daily
+++ b/deps/libfabric/contrib/intel/jenkins/Jenkinsfile.daily
@@ -5,7 +5,9 @@ pipeline {
     timestamps()             
     timeout(activity: true, time: 4, unit: 'HOURS')
     }
-    
+    environment {
+        JOB_CADENCE = 'daily'
+    }    
     stages {
         stage ('fetch-opa-psm2')  {
              steps {
@@ -487,6 +489,7 @@ pipeline {
         withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) {
             sh "rm -rf '/mpibuilddir/mpich-build-dir/${env.JOB_NAME}/${env.BUILD_NUMBER}'"
             sh "rm -rf '/mpibuilddir/ompi-build-dir/${env.JOB_NAME}/${env.BUILD_NUMBER}'"
+            sh "rm -rf '/mpibuilddir/mpich-suite-build-dir/${env.JOB_NAME}/${env.BUILD_NUMBER}'"
             dir("${env.WORKSPACE}"){
                 deleteDir()
             }
diff --git a/deps/libfabric/contrib/intel/jenkins/build.py b/deps/libfabric/contrib/intel/jenkins/build.py
index b46e17cb205d949d2dbe187a532194b801792bd9..1f655043e1ac856ed9e0bdd6827a86dbdc02960c 100755
--- a/deps/libfabric/contrib/intel/jenkins/build.py
+++ b/deps/libfabric/contrib/intel/jenkins/build.py
@@ -148,13 +148,18 @@ def build_mpi(mpi, mpisrc, mpi_install_path, libfab_install_path,  ofi_build_mod
     common.run_command(["make", "clean"])
     common.run_command(["make", "install", "-j32"])
 
-def build_mpich_suite(mpi, mpi_install_path, libfab_install_path):
+def build_mpich_suite(mpi, mpi_install_path, libfab_install_path, ofi_build_mode):
 
-    mpich_suite_path = '{}/test/'.format(ci_site_config.mpich_src)
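+    # Build the suite from a per-job copy of the mpich sources so that
+    # concurrent builds do not collide; the Jenkinsfiles remove
+    # /mpibuilddir/mpich-suite-build-dir/<job>/<build> during cleanup.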
+    mpich_suite_build_path = "/mpibuilddir/mpich-suite-build-dir/{}/{}/{}/mpich" \
+                             .format(jobname, buildno, ofi_build_mode)
+    if not os.path.exists(mpich_suite_build_path):
+        shutil.copytree(ci_site_config.mpich_src, mpich_suite_build_path)
+
+    mpich_suite_path = '{}/test/'.format(mpich_suite_build_path)
     mpichsuite_installpath= "{}/mpichsuite/test".format(mpi_install_path)
     pwd = os.getcwd()
     if (mpi == 'impi'):
-        os.chdir("{}/mpi".format(mpich_suite_path))
+        os.chdir("{}/mpi".format(mpich_suite_path)) 
         cmd = ["./configure", "--with-mpi={}/intel64" \
                .format(ci_site_config.impi_root)]
 
@@ -270,7 +275,7 @@ if __name__ == "__main__":
             build_mpi(mpi, mpisrc, mpi_install_path, install_path, ofi_build_mode)
         
 	# build mpich_test_suite
-        build_mpich_suite(mpi, mpi_install_path, install_path)
+        build_mpich_suite(mpi, mpi_install_path, install_path, ofi_build_mode)
         # run stress and osu benchmarks for all mpitypes
         build_stress_bm(mpi, mpi_install_path, install_path)
         build_osu_bm(mpi, mpi_install_path, install_path)
diff --git a/deps/libfabric/contrib/intel/jenkins/common.py b/deps/libfabric/contrib/intel/jenkins/common.py
index 61464672273cc4ed00ec90d150bc88d549919396..7805edc22e7ba02c59acf5fbda59afcd2e3cbd9a 100755
--- a/deps/libfabric/contrib/intel/jenkins/common.py
+++ b/deps/libfabric/contrib/intel/jenkins/common.py
@@ -26,6 +26,7 @@ def run_command(command):
 Prov = collections.namedtuple('Prov', 'core util')
 prov_list = [
 
+   Prov("psm3", None),
    Prov("psm2", None),
    Prov("verbs", None),
    Prov("verbs", "rxd"),
@@ -45,6 +46,7 @@ enabled_prov_list = [
     "shm"
 ]
 disabled_prov_list = [
+    "psm3",
     'usnic',
     'psm',
     'efa',
diff --git a/deps/libfabric/contrib/intel/jenkins/run.py b/deps/libfabric/contrib/intel/jenkins/run.py
index 7873f0f1e77a2a0bc2f7072b9d651a7c0e88c1ad..8dd0d8f546e793f413f48c909903b69b67ab547b 100755
--- a/deps/libfabric/contrib/intel/jenkins/run.py
+++ b/deps/libfabric/contrib/intel/jenkins/run.py
@@ -19,17 +19,17 @@ bno = os.environ['BUILD_NUMBER']#args.buildno
 
 #run fi_info test
 def fi_info_test(core, hosts, mode,util=None):
-    
+
     fi_info_test = tests.FiInfoTest(jobname=jbname,buildno=bno,\
                     testname="fi_info", core_prov=core, fabric=fab,\
                          hosts=hosts, ofi_build_mode=mode, util_prov=util)
     print("running fi_info test for {}-{}-{}".format(core, util, fab))
     fi_info_test.execute_cmd()
-        
+
 
 #runfabtests
 def fabtests(core, hosts, mode, util=None):
-       
+
     runfabtest = tests.Fabtest(jobname=jbname,buildno=bno,\
                  testname="runfabtests", core_prov=core, fabric=fab,\
                  hosts=hosts, ofi_build_mode=mode, util_prov=util)
@@ -41,7 +41,7 @@ def fabtests(core, hosts, mode, util=None):
         print("skipping {} as execute condition fails"\
               .format(runfabtest.testname))
     print("----------------------------------------------------------------------------------------\n")
-    
+
 def shmemtest(core, hosts, mode, util=None):
     runshmemtest = tests.ShmemTest(jobname=jbname,buildno=bno,\
                  testname="shmem test", core_prov=core, fabric=fab,\
@@ -59,7 +59,7 @@ def shmemtest(core, hosts, mode, util=None):
         print("skipping {} as execute condition fails"\
               .format(runshmemtest.testname))
     print("----------------------------------------------------------------------------------------\n")
-    
+
 
 #imb-tests
 def intel_mpi_benchmark(core, hosts, mpi, mode, util=None):
@@ -67,7 +67,7 @@ def intel_mpi_benchmark(core, hosts, mpi, mode, util=None):
     imb_test = tests.MpiTestIMB(jobname=jbname,buildno=bno,\
                testname="IntelMPIbenchmark",core_prov=core, fabric=fab,\
                hosts=hosts, mpitype=mpi, ofi_build_mode=mode, util_prov=util)
-    
+
     if (imb_test.execute_condn == True  and imb_test.mpi_gen_execute_condn == True):
         print("running imb-tests for {}-{}-{}-{}".format(core, util, fab, mpi))
         imb_test.execute_cmd()
@@ -87,29 +87,15 @@ def mpich_test_suite(core, hosts, mpi, mode, util=None):
         print("Running mpich test suite: Spawn coll, comm, dt Tests for {}-{}-{}-{}".format(core, util, fab, mpi))
         os.environ["MPITEST_RETURN_WITH_CODE"] = "1"
         mpich_tests.execute_cmd("spawn")
- 
-#mpi_stress benchmark tests
-def mpistress_benchmark(core, hosts, mpi, mode, util=None):
-
-    stress_test = tests.MpiTestStress(jobname=jbname,buildno=bno,\
-                  testname="stress",core_prov=core, fabric=fab, mpitype=mpi,\
-                  hosts=hosts, ofi_build_mode=mode, util_prov=util)
- 
-    if (stress_test.execute_condn == True and stress_test.mpi_gen_execute_condn == True):
-        print("running mpistress-test for {}-{}-{}-{}".format(core, util, fab, mpi))
-        stress_test.execute_cmd()
-    else:
-        print("skipping {} as execute condition fails" \
-                    .format(stress_test.testname))
     print("----------------------------------------------------------------------------------------\n")
 
-#osu benchmark tests    
+#osu benchmark tests
 def osu_benchmark(core, hosts, mpi, mode, util=None):
 
     osu_test = tests.MpiTestOSU(jobname=jbname, buildno=bno, \
                testname="osu-benchmarks",core_prov=core, fabric=fab, mpitype=mpi, \
                hosts=hosts, ofi_build_mode=mode, util_prov=util)
-    
+
     if (osu_test.execute_condn == True and osu_test.mpi_gen_execute_condn == True):
         print("running osu-test for {}-{}-{}-{}".format(core, util, fab, mpi))
         osu_test.execute_cmd()
diff --git a/deps/libfabric/contrib/intel/jenkins/runtests.py b/deps/libfabric/contrib/intel/jenkins/runtests.py
index c3cfe285d8f13c27f9a6ffcee4cae42a295ac2d8..c950706c7ea7f47870da206fe505b1e35727bfdd 100755
--- a/deps/libfabric/contrib/intel/jenkins/runtests.py
+++ b/deps/libfabric/contrib/intel/jenkins/runtests.py
@@ -28,7 +28,7 @@ node = (os.environ['NODE_NAME']).split('-')[0]
 hosts = [node]
 # Note: Temporarily disabling all mpich testing
 # due to mpich options issues which is causing
-# multiple tests to fail. 
+# multiple tests to fail.
 #mpilist = ['impi', 'mpich', 'ompi']
 mpilist = ['impi', 'ompi']
 
@@ -36,7 +36,7 @@ mpilist = ['impi', 'ompi']
 #this is done since some mpi tests
 #look for a valid location before running
 # the test on the secondary host(client)
-# but jenkins only creates a valid path on 
+# but jenkins only creates a valid path on
 # the primary host (server/test node)
 
 os.chdir('/tmp/')
@@ -51,9 +51,8 @@ if(args_core):
         run.shmemtest(args_core, hosts, ofi_build_mode)
         for mpi in mpilist:
             run.mpich_test_suite(args_core, hosts, mpi, ofi_build_mode)
-            run.intel_mpi_benchmark(args_core, hosts, mpi, ofi_build_mode)   
-            run.mpistress_benchmark(args_core, hosts, mpi, ofi_build_mode)
-            run.osu_benchmark(args_core, hosts, mpi, ofi_build_mode)  
+            run.intel_mpi_benchmark(args_core, hosts, mpi, ofi_build_mode)
+            run.osu_benchmark(args_core, hosts, mpi, ofi_build_mode)
     else:
         run.fi_info_test(args_core, hosts, ofi_build_mode, util=args_util)
         run.fabtests(args_core, hosts, ofi_build_mode, util=args_util)
@@ -64,10 +63,8 @@ if(args_core):
 
             run.intel_mpi_benchmark(args_core, hosts, mpi, ofi_build_mode, \
                                     util=args_util)
-            run.mpistress_benchmark(args_core, hosts, mpi, ofi_build_mode, \
-                                    util=args_util)
             run.osu_benchmark(args_core, hosts, mpi, ofi_build_mode, \
                                              util=args_util)
 else:
     print("Error : Specify a core provider to run tests")
-    
+
diff --git a/deps/libfabric/contrib/intel/jenkins/tests.py b/deps/libfabric/contrib/intel/jenkins/tests.py
index c9cb310286572860fba82348a98df49ee02fab51..10a5b059c87112f4de60442a33153bd18917de30 100755
--- a/deps/libfabric/contrib/intel/jenkins/tests.py
+++ b/deps/libfabric/contrib/intel/jenkins/tests.py
@@ -11,6 +11,8 @@ import common
 import shlex
 from abc import ABC, abstractmethod # abstract base class for creating abstract classes in python
 
+job_cadence = os.environ['JOB_CADENCE']
+
 # A Jenkins env variable for job name is composed of the name of the jenkins job and the branch name
 # it is building for. for e.g. in our case jobname = 'ofi_libfabric/master'
 class Test:
@@ -20,18 +22,19 @@ class Test:
         self.buildno = buildno
         self.testname = testname
         self.core_prov = core_prov
-        self.util_prov = "ofi_{}".format(util_prov) if util_prov != None else "" 
+        self.util_prov = "ofi_{}".format(util_prov) if util_prov != None else ""
         self.fabric = fabric
         self.hosts = hosts
         self.ofi_build_mode = ofi_build_mode
+        self.job_cadence = job_cadence
         if (len(hosts) == 2):
             self.server = hosts[0]
             self.client = hosts[1]
-       
+
         self.nw_interface = ci_site_config.interface_map[self.fabric]
         self.libfab_installpath = "{}/{}/{}/{}".format(ci_site_config.install_dir,
                                   self.jobname, self.buildno, self.ofi_build_mode)
- 
+
         self.env = [("FI_VERBS_MR_CACHE_ENABLE", "1"),\
                     ("FI_VERBS_INLINE_SIZE", "256")] \
                     if self.core_prov == "verbs" else []
@@ -41,106 +44,106 @@ class FiInfoTest(Test):
 
         super().__init__(jobname, buildno, testname, core_prov, fabric,
                      hosts, ofi_build_mode, util_prov)
-     
-        self.fi_info_testpath =  "{}/bin".format(self.libfab_installpath) 
-     
+
+        self.fi_info_testpath = "{}/bin".format(self.libfab_installpath)
+
     @property
     def cmd(self):
         return "{}/fi_info ".format(self.fi_info_testpath)
 
     @property
-    def options(self):       
+    def options(self):
         if (self.util_prov):
             opts  = "-f -p {};{}".format(self.core_prov, self.util_prov)
         else:
             opts = "-f -p {}".format(self.core_prov)
-        
-        return opts 
-    
+
+        return opts
+
     def execute_cmd(self):
         command = self.cmd + self.options
         outputcmd = shlex.split(command)
-        common.run_command(outputcmd)         
-     
+        common.run_command(outputcmd)
+
 
 class Fabtest(Test):
-    
+
     def __init__(self, jobname, buildno, testname, core_prov, fabric,
                  hosts, ofi_build_mode, util_prov=None):
-        
+
         super().__init__(jobname, buildno, testname, core_prov, fabric,
                          hosts, ofi_build_mode, util_prov)
-        self.fabtestpath = "{}/bin".format(self.libfab_installpath) 
+        self.fabtestpath = "{}/bin".format(self.libfab_installpath)
         self.fabtestconfigpath = "{}/share/fabtests".format(self.libfab_installpath)
     def get_exclude_file(self):
         path = self.libfab_installpath
         efile_path = "{}/share/fabtests/test_configs".format(path)
 
         prov = self.util_prov if self.util_prov else self.core_prov
-        efile_old = "{path}/{prov}/{prov}.exclude".format(path=efile_path, 
+        efile_old = "{path}/{prov}/{prov}.exclude".format(path=efile_path,
                       prov=prov)
-        
+
         if self.util_prov:
             efile = "{path}/{util_prov}/{core_prov}/exclude".format(path=efile_path,
                       util_prov=self.util_prov, core_prov=self.core_prov)
         else:
             efile = "{path}/{prov}/exclude".format(path=efile_path,
                       prov=self.core_prov)
-           
+
         if os.path.isfile(efile):
             return efile
         elif os.path.isfile(efile_old):
             return efile_old
         else:
             print("Exclude file: {} not found!".format(efile))
-            return None  
+            return None
 
-    @property    
-    def cmd(self):    
+    @property
+    def cmd(self):
         return "{}/runfabtests.sh ".format(self.fabtestpath)
-     
+
     @property
     def options(self):
         opts = "-T 300 -vvv -p {} -S ".format(self.fabtestpath)
         if (self.core_prov == "verbs" and self.nw_interface):
-            opts = "{} -s {} ".format(opts, common.get_node_name(self.server, 
+            opts = "{} -s {} ".format(opts, common.get_node_name(self.server,
                     self.nw_interface)) # include common.py
-            opts = "{} -c {} ".format(opts, common.get_node_name(self.client, 
+            opts = "{} -c {} ".format(opts, common.get_node_name(self.client,
                     self.nw_interface)) # from common.py
-       
+
         if (self.core_prov == "shm"):
             opts = "{} -s {} ".format(opts, self.server)
             opts = "{} -c {} ".format(opts, self.client)
             opts += "-N "
-            
-        if not re.match(".*sockets|udp|tcp.*", self.core_prov):
+
+        if not re.match(".*sockets|udp.*", self.core_prov):
             opts = "{} -t all ".format(opts)
 
         efile = self.get_exclude_file()
         if efile:
             opts = "{} -R ".format(opts)
-            opts = "{} -f {} ".format(opts, efile)  
-        
+            opts = "{} -f {} ".format(opts, efile)
+
         for key,val in self.env:
-            opts = "{options} -E {key}={value} ".format(options = opts, 
+            opts = "{options} -E {key}={value} ".format(options = opts,
                     key=key, value=val)
-    
+
         if self.util_prov:
-            opts = "{options} {core};{util} ".format(options=opts, 
+            opts = "{options} {core};{util} ".format(options=opts,
                     core=self.core_prov, util=self.util_prov)
         else:
             opts = "{options} {core} ".format(options=opts,
                     core=self.core_prov)
-        
+
         if (self.core_prov == "shm"):
             opts += "{} {} ".format(self.server, self.server)
         else:
             opts += "{} {} ".format(self.server, self.client)
-             
+
         return opts
-   
+
     @property
-    def execute_condn(self):     
+    def execute_condn(self):
         return True if (self.core_prov != 'shm' or \
                         self.ofi_build_mode == 'dbg') else False
 
@@ -155,13 +158,13 @@ class Fabtest(Test):
 class ShmemTest(Test):
     def __init__(self, jobname, buildno, testname, core_prov, fabric,
                  hosts, ofi_build_mode, util_prov=None):
-        
+
         super().__init__(jobname, buildno, testname, core_prov, fabric,
                          hosts, ofi_build_mode, util_prov)
-     
+
         #self.n - number of hosts * number of processes per host
-        self.n = 4 
-        # self.ppn - number of processes per node. 
+        self.n = 4
+        # self.ppn - number of processes per node.
         self.ppn = 2
         self.shmem_dir = "{}/shmem".format(self.libfab_installpath)
 
@@ -171,13 +174,13 @@ class ShmemTest(Test):
         return "{}/run_shmem.sh ".format(ci_site_config.mpi_testpath)
 
     def options(self, shmem_testname):
-       
+
         if self.util_prov:
-            prov = "{core};{util} ".format(core=self.core_prov, 
+            prov = "{core};{util} ".format(core=self.core_prov,
                     util=self.util_prov)
         else:
             prov = self.core_prov
- 
+
         opts = "-n {n} -hosts {server},{client} -shmem_dir={shmemdir} \
                 -libfabric_path={path}/lib -prov '{provider}' -test {test} \
                 -server {server} -inf {inf}" \
@@ -189,72 +192,74 @@ class ShmemTest(Test):
 
     @property
     def execute_condn(self):
-        return True if (self.core_prov == "psm2" or self.core_prov == "sockets") \
+        return True if (self.job_cadence == 'daily' and \
+                        (self.core_prov == "psm2" or \
+                        self.core_prov == "sockets")) \
                     else False
-            
+
     def execute_cmd(self, shmem_testname):
-        command = self.cmd + self.options(shmem_testname) 
+        command = self.cmd + self.options(shmem_testname)
         outputcmd = shlex.split(command)
-        common.run_command(outputcmd)        
-    
+        common.run_command(outputcmd)
+
 
 class MpiTests(Test):
     def __init__(self, jobname, buildno, testname, core_prov, fabric,
                  mpitype, hosts, ofi_build_mode, util_prov=None):
-       
-        super().__init__(jobname, buildno, testname, core_prov, 
+
+        super().__init__(jobname, buildno, testname, core_prov,
                          fabric, hosts, ofi_build_mode, util_prov)
         self.mpi = mpitype
 
     @property
     def cmd(self):
         if (self.mpi == "impi" or self.mpi == "mpich"):
-            self.testpath = ci_site_config.mpi_testpath 
+            self.testpath = ci_site_config.mpi_testpath
             return "{}/run_{}.sh ".format(self.testpath,self.mpi)
         elif(self.mpi =="ompi"):
             self.testpath = "{}/ompi/bin".format(self.libfab_installpath)
-            return "{}/mpirun ".format(self.testpath)      
-    
+            return "{}/mpirun ".format(self.testpath)
+
     @property
     def options(self):
-        opts = [] 
+        opts = []
         if (self.mpi == "impi" or self.mpi == "mpich"):
             opts = "-n {} -ppn {} -hosts {},{} ".format(self.n,self.ppn,
                     self.server,self.client)
-                
+
             if (self.mpi == "impi"):
-                opts = "{} -mpi_root={} ".format(opts, 
+                opts = "{} -mpi_root={} ".format(opts,
                         ci_site_config.impi_root)
             else:
-                opts = "{} -mpi_root={}/mpich".format(opts, 
+                opts = "{} -mpi_root={}/mpich".format(opts,
                         self.libfab_installpath)
-            
-            opts = "{} -libfabric_path={}/lib ".format(opts, 
+
+            opts = "{} -libfabric_path={}/lib ".format(opts,
                     self.libfab_installpath)
-            
+
             if self.util_prov:
-                opts = "{options} -prov {core};{util} ".format(options=opts, 
+                opts = "{options} -prov {core};{util} ".format(options=opts,
                         core=self.core_prov, util=self.util_prov)
             else:
                 opts = "{} -prov {} ".format(opts, self.core_prov)
 
             for key, val in self.env:
                 opts = "{} -genv {} {} ".format(opts, key, val)
-            
+
         elif (self.mpi == "ompi"):
             opts = "-np {} ".format(self.n)
             hosts = ",".join([":".join([host,str(self.ppn)]) \
                     for host in self.hosts])
-            
+
             opts = "{} --host {} ".format(opts, hosts)
-            
+
             if self.util_prov:
-                opts = "{} --mca mtl_ofi_provider_include {};{} ".format(opts, 
+                opts = "{} --mca mtl_ofi_provider_include {};{} ".format(opts,
                         self.core_prov,self.util_prov)
             else:
-                opts = "{} --mca mtl_ofi_provider_include {} ".format(opts, 
+                opts = "{} --mca mtl_ofi_provider_include {} ".format(opts,
                         self.core_prov)
- 
+
             opts += "--mca orte_base_help_aggregate 0 "
             opts += "--mca mtl ofi --mca pml cm -tag-output "
             for key,val in self.env:
@@ -264,7 +269,7 @@ class MpiTests(Test):
     @property
     def mpi_gen_execute_condn(self):
         #Skip MPI tests for udp, verbs(core) providers.
-        # we would still have MPI tests runnning for 
+# we would still have MPI tests running for
         # verbs-rxd and verbs-rxm providers
         return True if (self.core_prov != "udp" and \
                         self.core_prov != "shm" and \
@@ -274,15 +279,15 @@ class MpiTests(Test):
 
 # IMBtests serves as an abstract class for different
 # types of intel MPI benchmarks. Currently we have
-# the mpi1 and rma tests enabled which are encapsulated 
-# in the IMB_mpi1 and IMB_rma classes below. 
+# the mpi1 and rma tests enabled, which are encapsulated
+# in the IMBmpi1 and IMBrma classes below.
 
 class IMBtests(ABC):
     """
-    This is an abstract class for IMB tests. 
-    currently IMB-MPI1 and IMB-RMA tests are 
+    This is an abstract class for IMB tests.
+    Currently IMB-MPI1 and IMB-RMA tests are
     supported. In future there could be more.
-    All abstract  methods must be implemented. 
+    All abstract methods must be implemented.
     """
 
     @property
@@ -296,9 +301,9 @@ class IMBtests(ABC):
         pass
 
 class IMBmpi1(IMBtests):
-    
+
     def __init__(self):
-        self.additional_tests = [ 
+        self.additional_tests = [
                                    "Biband",
                                    "Uniband",
                                    "PingPongAnySource",
@@ -327,9 +332,9 @@ class IMBrma(IMBtests):
     @property
     def execute_condn(self):
         return True if (self.core_prov != "verbs") else False
- 
+
 # MpiTestIMB class inherits from the MPITests class.
-# It uses the same options method and class variables as all MPI tests. 
+# It uses the same options method and class variables as all MPI tests.
 # It creates IMB_xxx test objects for each kind of IMB test.
 class MpiTestIMB(MpiTests):
 
@@ -337,18 +342,18 @@ class MpiTestIMB(MpiTests):
                  mpitype, hosts, ofi_build_mode, util_prov=None):
         super().__init__(jobname, buildno, testname, core_prov, fabric,
                          mpitype, hosts, ofi_build_mode, util_prov)
-       
+
         self.n = 4
         self.ppn = 1
         self.mpi1 = IMBmpi1()
-        self.rma = IMBrma(self.core_prov) 
+        self.rma = IMBrma(self.core_prov)
 
     @property
     def execute_condn(self):
         return True if (self.mpi == "impi") else False
-       
+
     def execute_cmd(self):
-        command = self.cmd + self.options 
+        command = self.cmd + self.options
         if(self.mpi1.execute_condn):
             outputcmd = shlex.split(command +  self.mpi1.imb_cmd)
             common.run_command(outputcmd)
@@ -357,8 +362,8 @@ class MpiTestIMB(MpiTests):
             common.run_command(outputcmd)
 
 class MpichTestSuite(MpiTests):
-    
-    def __init__(self, jobname, buildno, testname, core_prov, fabric, 
+
+    def __init__(self, jobname, buildno, testname, core_prov, fabric,
 		     mpitype, hosts, ofi_build_mode, util_prov=None):
             super().__init__(jobname, buildno, testname, core_prov, fabric,
 			     mpitype,  hosts, ofi_build_mode, util_prov)
@@ -367,14 +372,14 @@ class MpichTestSuite(MpiTests):
             self.pwd = os.getcwd()
 
     def testgroup(self, testgroupname):
-        
+
         testpath = "{}/{}".format(self.mpichsuitepath, testgroupname)
         tests = []
         with open("{}/testlist".format(testpath)) as file:
             for line in file:
                 if(line[0] != '#' and  line[0] != '\n'):
                     tests.append((line.rstrip('\n')).split(' '))
-	
+
         return tests
 
     def options(self, nprocs, timeout=None):
@@ -406,7 +411,7 @@ class MpichTestSuite(MpiTests):
     def execute_condn(self):
         return True if (self.mpi == 'impi' and  self.core_prov != 'psm2' \
                         and self.core_prov != 'sockets') else False
- 
+
     def execute_cmd(self, testgroupname):
         print("Running Tests: " + testgroupname)
         tests = []
@@ -427,51 +432,15 @@ class MpichTestSuite(MpiTests):
             common.run_command(outputcmd)
         os.chdir(self.pwd)
 
-class MpiTestStress(MpiTests):
-     
-    def __init__(self, jobname, buildno, testname, core_prov, fabric, 
-                 mpitype, hosts, ofi_build_mode, util_prov=None):
-        super().__init__(jobname, buildno, testname, core_prov, fabric, 
-                         mpitype,  hosts, ofi_build_mode, util_prov)
-        
-         
-        if((self.core_prov == "verbs" or self.core_prov =="psm2")):
-            self.n = 16
-            self.ppn = 8
-        else:
-            self.n = 4
-            self.ppn = 2
-      
-    @property
-    def stress_cmd(self):
-        return "{}/{}/stress/mpi_stress -dcr".format(self.libfab_installpath, self.mpi)
-
-    @property
-    def execute_condn(self):
-        # Todo : run stress test for ompi with libfabirc-dbg builds if it works
-        # in Jenkins for buildbot these ompi did not build with libfabric-dbg 
-
-        # Due to an mpich issue when the correct mpich options are enabled during
-        # mpich builds, sttress test is failing. disabling mpich + stress tests
-        # untill the mpich team fixes the issue. 
-        return True if (self.mpi != 'mpich' and (self.mpi != 'ompi' or \
-                        self.ofi_build_mode != 'dbg')) else  False
-    
-    def execute_cmd(self):
-        command = self.cmd + self.options + self.stress_cmd
-        outputcmd = shlex.split(command)
-        common.run_command(outputcmd) 
 
-         
-      
 class MpiTestOSU(MpiTests):
 
     def __init__(self, jobname, buildno, testname, core_prov, fabric,
                  mpitype, hosts, ofi_build_mode, util_prov=None):
         super().__init__(jobname, buildno, testname, core_prov, fabric,
                          mpitype, hosts, ofi_build_mode, util_prov)
-        
-        self.n = 4 
+
+        self.n = 4
         self.ppn = 2
         self.two_proc_tests = {'osu_latency',
                                'osu_bibw',
@@ -488,17 +457,18 @@ class MpiTestOSU(MpiTests):
                               }
 
         self.osu_mpi_path = "{}/{}/osu/libexec/osu-micro-benchmarks/mpi/". \
-                            format(self.libfab_installpath,mpitype) 
-    
+                            format(self.libfab_installpath,mpitype)
+
     @property
-    def execute_condn(self): 
+    def execute_condn(self):
         # sockets and psm2 have some issues with OSU benchmark testing.
-        return True if (self.mpi != "ompi" or \
-                       (self.core_prov != "sockets" and \
-                        self.core_prov != "psm2" and \
-                        self.ofi_build_mode!="dbg")) \
+        return True if ((self.job_cadence == 'daily') and \
+                        (self.mpi != "ompi" or \
+                        (self.core_prov != "sockets" and \
+                         self.core_prov != "psm2" and \
+                         self.ofi_build_mode!="dbg"))) \
                     else False
-    
+
     def execute_cmd(self):
         assert(self.osu_mpi_path)
         p = re.compile('osu_put*')
@@ -516,6 +486,6 @@ class MpiTestOSU(MpiTests):
                     osu_cmd = os.path.join(root, test)
                     command = launcher + osu_cmd
                     outputcmd = shlex.split(command)
-                    common.run_command(outputcmd) 
+                    common.run_command(outputcmd)
 
 
diff --git a/deps/libfabric/fabtests/COPYING b/deps/libfabric/fabtests/COPYING
index 14257a216349704e6b44b31137af509992e02d45..593587ef0e43b5434204902b2a841c0b1d4b5685 100644
--- a/deps/libfabric/fabtests/COPYING
+++ b/deps/libfabric/fabtests/COPYING
@@ -7,7 +7,7 @@ Some parts of the source are 3rd party code which uses MIT license.
 The description and requirements of the license are available in
 later part of this file.
 
-Copyright (c) 2015-2020 Intel Corporation.  All rights reserved.
+Copyright (c) 2015-2021 Intel Corporation.  All rights reserved.
 Copyright (c) 2016-2018 Cisco Systems, Inc.  All rights reserved.
 
 ==================================================================
diff --git a/deps/libfabric/fabtests/Makefile.am b/deps/libfabric/fabtests/Makefile.am
index f60c81e716988c3e2348337ff2ebb85b1509a0a1..e99bb39d3f999c6bca81657da6b08d5fe3be11ea 100644
--- a/deps/libfabric/fabtests/Makefile.am
+++ b/deps/libfabric/fabtests/Makefile.am
@@ -9,6 +9,7 @@ endif
 
 if FREEBSD
 os_excludes = -f ./test_configs/freebsd.exclude
+AM_CFLAGS += -I$(srcdir)/include/freebsd
 endif
 
 bin_PROGRAMS = \
@@ -17,7 +18,7 @@ bin_PROGRAMS = \
 	functional/fi_stream_msg \
 	functional/fi_msg_sockets \
 	functional/fi_rdm \
-	functional/fi_rdm_rma_simple \
+	functional/fi_rdm_rma_event \
 	functional/fi_rdm_rma_trigger \
 	functional/fi_rdm_deferred_wq \
 	functional/fi_dgram \
@@ -42,6 +43,8 @@ bin_PROGRAMS = \
 	functional/fi_rdm_atomic \
 	functional/fi_multi_recv \
 	functional/fi_bw \
+	functional/fi_rdm_multi_client \
+	functional/fi_loopback \
 	benchmarks/fi_msg_pingpong \
 	benchmarks/fi_msg_bw \
 	benchmarks/fi_rma_bw \
@@ -58,7 +61,6 @@ bin_PROGRAMS = \
 	unit/fi_av_test \
 	unit/fi_dom_test \
 	unit/fi_getinfo_test \
-	unit/fi_resource_freeing \
 	ubertest/fi_ubertest	\
 	multinode/fi_multinode	\
 	multinode/fi_multinode_coll
@@ -84,6 +86,7 @@ nobase_dist_config_DATA = \
         test_configs/udp/functional.test \
         test_configs/udp/udp.exclude \
         test_configs/tcp/tcp.exclude \
+        test_configs/tcp/all.test \
         test_configs/verbs/all.test \
         test_configs/verbs/quick.test \
 	test_configs/verbs/verbs.exclude \
@@ -93,6 +96,9 @@ nobase_dist_config_DATA = \
 	test_configs/psm2/all.test \
 	test_configs/psm2/verify.test \
 	test_configs/psm2/psm2.exclude \
+	test_configs/psm3/all.test \
+	test_configs/psm3/verify.test \
+	test_configs/psm3/psm3.exclude \
 	test_configs/ofi_rxm/tcp.test \
 	test_configs/ofi_rxm/verbs.test \
 	test_configs/ofi_rxm/ofi_rxm.exclude \
@@ -109,7 +115,6 @@ noinst_LTLIBRARIES = libfabtests.la
 
 libfabtests_la_SOURCES = \
 	common/shared.c \
-	common/jsmn.c \
 	common/hmem.c \
 	common/hmem_cuda.c \
 	common/hmem_rocr.c \
@@ -162,9 +167,9 @@ functional_fi_rdm_shared_av_SOURCES = \
 	functional/rdm_shared_av.c
 functional_fi_rdm_shared_av_LDADD = libfabtests.la
 
-functional_fi_rdm_rma_simple_SOURCES = \
-	functional/rdm_rma_simple.c
-functional_fi_rdm_rma_simple_LDADD = libfabtests.la
+functional_fi_rdm_rma_event_SOURCES = \
+	functional/rdm_rma_event.c
+functional_fi_rdm_rma_event_LDADD = libfabtests.la
 
 functional_fi_rdm_rma_trigger_SOURCES = \
 	functional/rdm_rma_trigger.c
@@ -254,6 +259,14 @@ functional_fi_bw_SOURCES = \
 	functional/bw.c
 functional_fi_bw_LDADD = libfabtests.la
 
+functional_fi_rdm_multi_client_SOURCES = \
+	functional/rdm_multi_client.c
+functional_fi_rdm_multi_client_LDADD = libfabtests.la
+
+functional_fi_loopback_SOURCES = \
+	functional/loopback.c
+functional_fi_loopback_LDADD = libfabtests.la
+
 benchmarks_fi_msg_pingpong_SOURCES = \
 	benchmarks/msg_pingpong.c \
 	$(benchmarks_srcs)
@@ -335,10 +348,6 @@ unit_fi_getinfo_test_SOURCES = \
 	$(unit_srcs)
 unit_fi_getinfo_test_LDADD = libfabtests.la
 
-unit_fi_resource_freeing_SOURCES = \
-	unit/resource_freeing.c
-unit_fi_resource_freeing_LDADD = libfabtests.la
-
 ubertest_fi_ubertest_SOURCES = \
 	ubertest/fabtest.h \
 	ubertest/ofi_atomic.h \
@@ -401,7 +410,7 @@ dummy_man_pages = \
 	man/man1/fi_rdm_deferred_wq.1 \
 	man/man1/fi_rdm_multi_domain.1 \
 	man/man1/fi_multi_recv.1 \
-	man/man1/fi_rdm_rma_simple.1 \
+	man/man1/fi_rdm_rma_event.1 \
 	man/man1/fi_rdm_rma_trigger.1 \
 	man/man1/fi_rdm_shared_av.1 \
 	man/man1/fi_rdm_tagged_peek.1 \
@@ -426,9 +435,10 @@ dummy_man_pages = \
 	man/man1/fi_eq_test.1 \
 	man/man1/fi_getinfo_test.1 \
 	man/man1/fi_mr_test.1 \
-	man/man1/fi_resource_freeing.1 \
 	man/man1/fi_bw.1 \
-	man/man1/fi_ubertest.1
+	man/man1/fi_rdm_multi_client.1 \
+	man/man1/fi_ubertest.1 \
+	man/man1/fi_efa_ep_rnr_retry.1
 
 nroff:
 	@for file in $(real_man_pages); do \
@@ -436,6 +446,7 @@ nroff:
             perl $(top_srcdir)/config/md2nroff.pl --source=$(top_srcdir)/$$source.md; \
         done
 
+include prov/efa/Makefile.include
 
 man_MANS = $(real_man_pages) $(dummy_man_pages)
 
diff --git a/deps/libfabric/fabtests/Makefile.win b/deps/libfabric/fabtests/Makefile.win
index b1ed3b978f5938b025dc79ed17f16f0b2ce798f0..0ec7f4beb4cb45450ec7e2fc7180a3325a0a2fc2 100644
--- a/deps/libfabric/fabtests/Makefile.win
+++ b/deps/libfabric/fabtests/Makefile.win
@@ -18,6 +18,10 @@ CFLAGS = $(CFLAGS) /Zi /Od /MTd
 outdir = $(output_root)$(arch)\debug-v141
 CFLAGS = $(CFLAGS) /Zi /Od /MTd
 !endif
+!if "$(config)" == "Debug-v142"
+outdir = $(output_root)$(arch)\debug-v142
+CFLAGS = $(CFLAGS) /Zi /Od /MTd
+!endif
 !if "$(config)" == "Release-v140"
 outdir = $(output_root)$(arch)\release-v140
 CFLAGS = $(CFLAGS) /O2 /MT
@@ -26,8 +30,12 @@ CFLAGS = $(CFLAGS) /O2 /MT
 outdir = $(output_root)$(arch)\release-v141
 CFLAGS = $(CFLAGS) /O2 /MT
 !endif
+!if "$(config)" == "Release-v142"
+outdir = $(output_root)$(arch)\release-v142
+CFLAGS = $(CFLAGS) /O2 /MT
+!endif
 
-basedeps = common\hmem.c common\shared.c common\jsmn.c \
+basedeps = common\hmem.c common\shared.c \
 	common\windows\getopt.c common\windows\osd.c \
 	common\hmem_cuda.c common\hmem_rocr.c common\hmem_ze.c
 
@@ -60,7 +68,7 @@ benchmarks: $(outdir)\msg_pingpong.exe $(outdir)\rdm_cntr_pingpong.exe \
 
 functional: $(outdir)\cq_data.exe $(outdir)\dgram.exe $(outdir)\dgram_waitset.exe $(outdir)\msg.exe \
 	$(outdir)\msg_epoll.exe $(outdir)\msg_sockets.exe \
-	$(outdir)\poll.exe $(outdir)\rdm.exe $(outdir)\rdm_rma_simple.exe $(outdir)\rdm_rma_trigger.exe \
+	$(outdir)\poll.exe $(outdir)\rdm.exe $(outdir)\rdm_rma_event.exe $(outdir)\rdm_rma_trigger.exe \
 	$(outdir)\rdm_tagged_peek.exe $(outdir)\scalable_ep.exe $(outdir)\inj_complete.exe $(outdir)\bw.exe
 
 unit: $(outdir)\av_test.exe $(outdir)\dom_test.exe $(outdir)\eq_test.exe
@@ -96,7 +104,7 @@ $(outdir)\poll.exe: {functional}poll.c $(basedeps)
 
 $(outdir)\rdm.exe: {functional}rdm.c $(basedeps)
 
-$(outdir)\rdm_rma_simple.exe: {functional}rdm_rma_simple.c $(basedeps)
+$(outdir)\rdm_rma_event.exe: {functional}rdm_rma_event.c $(basedeps)
 
 $(outdir)\rdm_rma_trigger.exe: {functional}rdm_rma_trigger.c $(basedeps)
 
diff --git a/deps/libfabric/fabtests/benchmarks/benchmark_shared.c b/deps/libfabric/fabtests/benchmarks/benchmark_shared.c
index 26c37dc0af42ffb1eb5c4793bacddbcd0606a33e..0f204b6a1631facd510147608ae237b0063e2aa7 100644
--- a/deps/libfabric/fabtests/benchmarks/benchmark_shared.c
+++ b/deps/libfabric/fabtests/benchmarks/benchmark_shared.c
@@ -39,6 +39,14 @@
 #include "shared.h"
 #include "benchmark_shared.h"
 
+/*
+ * When the -j option is set, the user-supplied inject_size must be
+ * honored, even if the provider would return a larger value. This flag
+ * distinguishes between the '-j 0' option and no '-j' option at all;
+ * in both cases hints->tx_attr->inject_size is 0.
+ */
+static int inject_size_set;
+
 void ft_parse_benchmark_opts(int op, char *optarg)
 {
 	switch (op) {
@@ -50,6 +58,7 @@ void ft_parse_benchmark_opts(int op, char *optarg)
 		break;
 	case 'j':
 		hints->tx_attr->inject_size = atoi(optarg);
+		inject_size_set = 1;
 		break;
 	case 'W':
 		opts.window_size = atoi(optarg);
@@ -72,7 +81,10 @@ void ft_benchmark_usage(void)
 
 int pingpong(void)
 {
-	int ret, i;
+	int ret, i, inject_size;
+
+	inject_size = inject_size_set ?
+			hints->tx_attr->inject_size : fi->tx_attr->inject_size;
 
 	ret = ft_sync();
 	if (ret)
@@ -83,7 +95,7 @@ int pingpong(void)
 			if (i == opts.warmup_iterations)
 				ft_start();
 
-			if (opts.transfer_size < fi->tx_attr->inject_size)
+			if (opts.transfer_size < inject_size)
 				ret = ft_inject(ep, remote_fi_addr, opts.transfer_size);
 			else
 				ret = ft_tx(ep, remote_fi_addr, opts.transfer_size, &tx_ctx);
@@ -103,7 +115,7 @@ int pingpong(void)
 			if (ret)
 				return ret;
 
-			if (opts.transfer_size < fi->tx_attr->inject_size)
+			if (opts.transfer_size < inject_size)
 				ret = ft_inject(ep, remote_fi_addr, opts.transfer_size);
 			else
 				ret = ft_tx(ep, remote_fi_addr, opts.transfer_size, &tx_ctx);
@@ -145,7 +157,10 @@ static int bw_rx_comp()
 
 int bandwidth(void)
 {
-	int ret, i, j;
+	int ret, i, j, inject_size;
+
+	inject_size = inject_size_set ?
+			hints->tx_attr->inject_size : fi->tx_attr->inject_size;
 
 	ret = ft_sync();
 	if (ret)
@@ -163,7 +178,7 @@ int bandwidth(void)
 			if (i == opts.warmup_iterations)
 				ft_start();
 
-			if (opts.transfer_size < fi->tx_attr->inject_size)
+			if (opts.transfer_size < inject_size)
 				ret = ft_inject(ep, remote_fi_addr, opts.transfer_size);
 			else
 				ret = ft_post_tx(ep, remote_fi_addr, opts.transfer_size,
@@ -233,7 +248,10 @@ static int bw_rma_comp(enum ft_rma_opcodes rma_op)
 
 int bandwidth_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote)
 {
-	int ret, i, j;
+	int ret, i, j, inject_size;
+
+	inject_size = inject_size_set ?
+			hints->tx_attr->inject_size : fi->tx_attr->inject_size;
 
 	ret = ft_sync();
 	if (ret)
@@ -245,7 +263,7 @@ int bandwidth_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote)
 
 		switch (rma_op) {
 		case FT_RMA_WRITE:
-			if (opts.transfer_size < fi->tx_attr->inject_size) {
+			if (opts.transfer_size < inject_size) {
 				ret = ft_post_rma_inject(FT_RMA_WRITE, ep,
 						opts.transfer_size, remote);
 			} else {
@@ -265,7 +283,7 @@ int bandwidth_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote)
 					rx_seq++;
 
 			} else {
-				if (opts.transfer_size < fi->tx_attr->inject_size) {
+				if (opts.transfer_size < inject_size) {
 					ret = ft_post_rma_inject(FT_RMA_WRITEDATA,
 							ep,
 							opts.transfer_size,
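
The new flag in this file is worth a standalone illustration: `-j 0` and "no `-j` at all" both leave `hints->tx_attr->inject_size` at 0, so only the `inject_size_set` flag can tell them apart when choosing between the user's value and the provider's. A minimal sketch of that selection logic, with stand-in structs in place of the real libfabric `fi_info` (all names here are illustrative):

```c
#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for the relevant libfabric attribute fields; the real test
 * reads hints->tx_attr and fi->tx_attr. */
struct tx_attr { size_t inject_size; };

static struct tx_attr hints_tx = { 0 };  /* user-requested value */
static struct tx_attr prov_tx = { 64 };  /* provider-negotiated value */
static int inject_size_set;              /* was -j passed at all? */

static void parse_j(const char *arg)
{
	hints_tx.inject_size = (size_t) atoi(arg);
	inject_size_set = 1;
}

int main(void)
{
	size_t inject_size;

	parse_j("0");  /* simulate '-j 0' */
	inject_size = inject_size_set ? hints_tx.inject_size
				      : prov_tx.inject_size;
	printf("effective inject size: %zu\n", inject_size);  /* prints 0 */
	return 0;
}
```

With an effective inject size of 0, `opts.transfer_size < inject_size` is never true, so every transfer takes the regular `ft_tx()`/completion path rather than `ft_inject()`.
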
diff --git a/deps/libfabric/fabtests/benchmarks/dgram_pingpong.c b/deps/libfabric/fabtests/benchmarks/dgram_pingpong.c
index 4daba11e0f0dee82ad127780d9da91da15eac99d..191f0daab7be5ef84b0bb64c379899f5b5730d10 100644
--- a/deps/libfabric/fabtests/benchmarks/dgram_pingpong.c
+++ b/deps/libfabric/fabtests/benchmarks/dgram_pingpong.c
@@ -118,6 +118,8 @@ int main(int argc, char **argv)
 	hints->mode |= FI_CONTEXT;
 	hints->domain_attr->mr_mode = opts.mr_mode;
 	hints->domain_attr->threading = FI_THREAD_DOMAIN;
+	hints->tx_attr->tclass = FI_TC_LOW_LATENCY;
+	hints->addr_format = opts.address_format;
 
 	ret = run();
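
Several benchmarks in this patch start passing a traffic-class hint: `FI_TC_LOW_LATENCY` for pingpong-style tests, `FI_TC_BULK_DATA` for bandwidth tests. A minimal sketch of feeding such a hint into `fi_getinfo()`, assuming a libfabric version recent enough to have `tx_attr->tclass` (error handling trimmed; link with `-lfabric`):

```c
#include <stdio.h>
#include <rdma/fabric.h>

int main(void)
{
	struct fi_info *hints, *info;
	int ret;

	hints = fi_allocinfo();
	if (!hints)
		return 1;

	hints->ep_attr->type = FI_EP_DGRAM;
	hints->caps = FI_MSG;
	/* Ask the provider to favor latency; a bandwidth test would
	 * request FI_TC_BULK_DATA here instead. */
	hints->tx_attr->tclass = FI_TC_LOW_LATENCY;

	ret = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, hints, &info);
	if (!ret) {
		printf("matched provider: %s\n",
		       info->fabric_attr->prov_name);
		fi_freeinfo(info);
	}
	fi_freeinfo(hints);
	return ret ? 1 : 0;
}
```
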
 
diff --git a/deps/libfabric/fabtests/benchmarks/msg_bw.c b/deps/libfabric/fabtests/benchmarks/msg_bw.c
index 2e36d0ff1d3e4fc83dcb9c8fcf5794a1a40177f8..0e90b4cfb1ac90cbb4ae9dfc8a36cfcea9de32cf 100644
--- a/deps/libfabric/fabtests/benchmarks/msg_bw.c
+++ b/deps/libfabric/fabtests/benchmarks/msg_bw.c
@@ -84,12 +84,14 @@ int main(int argc, char **argv)
 	if (!hints)
 		return EXIT_FAILURE;
 
-	while ((op = getopt(argc, argv, "h" CS_OPTS INFO_OPTS BENCHMARK_OPTS)) != -1) {
+	while ((op = getopt(argc, argv, "h" CS_OPTS INFO_OPTS
+			    API_OPTS BENCHMARK_OPTS)) != -1) {
 		switch (op) {
 		default:
 			ft_parse_benchmark_opts(op, optarg);
 			ft_parseinfo(op, optarg, hints, &opts);
 			ft_parsecsopts(op, optarg, &opts);
+			ft_parse_api_opts(op, optarg, hints, &opts);
 			break;
 		case '?':
 		case 'h':
@@ -103,11 +105,17 @@ int main(int argc, char **argv)
 		opts.dst_addr = argv[optind];
 
 	hints->ep_attr->type = FI_EP_MSG;
-	hints->caps = FI_MSG;
+	if (hints->caps & FI_TAGGED) {
+		opts.options |= FT_OPT_SRX;
+		hints->ep_attr->rx_ctx_cnt = FI_SHARED_CONTEXT;
+	} else {
+		hints->caps |= FI_MSG;
+	}
 	hints->domain_attr->resource_mgmt = FI_RM_ENABLED;
 	hints->domain_attr->mr_mode = opts.mr_mode;
 	hints->domain_attr->threading = FI_THREAD_DOMAIN;
 	hints->addr_format = opts.address_format;
+	hints->tx_attr->tclass = FI_TC_BULK_DATA;
 
 	ret = run();
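
With the new `API_OPTS` parsing, `msg_bw` (and `msg_pingpong` below) can be asked for tagged transfers, in which case the connected endpoint is given a shared receive context instead of plain `FI_MSG` capabilities. The branch in isolation, as a compilable sketch (the `want_tagged` parameter stands in for whatever the api opts set; `FT_OPT_SRX` is fabtests-internal and omitted here):

```c
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>

/* Mirror of the caps/rx-context branch added above. */
static void apply_caps(struct fi_info *hints, int want_tagged)
{
	if (want_tagged) {
		hints->caps |= FI_TAGGED;
		/* tagged messaging over FI_EP_MSG in these tests uses a
		 * shared receive context */
		hints->ep_attr->rx_ctx_cnt = FI_SHARED_CONTEXT;
	} else {
		hints->caps |= FI_MSG;
	}
}

int main(void)
{
	struct fi_info *hints = fi_allocinfo();

	if (!hints)
		return 1;
	hints->ep_attr->type = FI_EP_MSG;
	apply_caps(hints, 1);
	fi_freeinfo(hints);
	return 0;
}
```
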
 
diff --git a/deps/libfabric/fabtests/benchmarks/msg_pingpong.c b/deps/libfabric/fabtests/benchmarks/msg_pingpong.c
index a03864724290947c3e590b08576f44a026b6cf21..ec7755f8f33c66037c48159ad7541dfba85118ea 100644
--- a/deps/libfabric/fabtests/benchmarks/msg_pingpong.c
+++ b/deps/libfabric/fabtests/benchmarks/msg_pingpong.c
@@ -84,13 +84,15 @@ int main(int argc, char **argv)
 	if (!hints)
 		return EXIT_FAILURE;
 
-	while ((op = getopt(argc, argv, "h" CS_OPTS INFO_OPTS BENCHMARK_OPTS)) !=
+	while ((op = getopt(argc, argv, "h" CS_OPTS INFO_OPTS
+			    API_OPTS BENCHMARK_OPTS)) !=
 			-1) {
 		switch (op) {
 		default:
 			ft_parse_benchmark_opts(op, optarg);
 			ft_parseinfo(op, optarg, hints, &opts);
 			ft_parsecsopts(op, optarg, &opts);
+			ft_parse_api_opts(op, optarg, hints, &opts);
 			break;
 		case '?':
 		case 'h':
@@ -104,10 +106,16 @@ int main(int argc, char **argv)
 		opts.dst_addr = argv[optind];
 
 	hints->ep_attr->type = FI_EP_MSG;
-	hints->caps = FI_MSG;
+	if (hints->caps & FI_TAGGED) {
+		opts.options |= FT_OPT_SRX;
+		hints->ep_attr->rx_ctx_cnt = FI_SHARED_CONTEXT;
+	} else {
+		hints->caps |= FI_MSG;
+	}
 	hints->domain_attr->mr_mode = opts.mr_mode;
 	hints->domain_attr->threading = FI_THREAD_DOMAIN;
 	hints->addr_format = opts.address_format;
+	hints->tx_attr->tclass = FI_TC_LOW_LATENCY;
 
 	ret = run();
 
diff --git a/deps/libfabric/fabtests/benchmarks/rdm_cntr_pingpong.c b/deps/libfabric/fabtests/benchmarks/rdm_cntr_pingpong.c
index ab09dd170bfdfd203bb75d988032dd30df4411d0..a4a46fa0474a2f633fd86ef3931d44616002bdab 100644
--- a/deps/libfabric/fabtests/benchmarks/rdm_cntr_pingpong.c
+++ b/deps/libfabric/fabtests/benchmarks/rdm_cntr_pingpong.c
@@ -100,6 +100,8 @@ int main(int argc, char **argv)
 	hints->caps = FI_MSG;
 	hints->domain_attr->mr_mode = opts.mr_mode;
 	hints->domain_attr->threading = FI_THREAD_DOMAIN;
+	hints->tx_attr->tclass = FI_TC_LOW_LATENCY;
+	hints->addr_format = opts.address_format;
 
 	ret = run();
 
diff --git a/deps/libfabric/fabtests/benchmarks/rdm_pingpong.c b/deps/libfabric/fabtests/benchmarks/rdm_pingpong.c
index 800a7b63d0127b2328283a079f9d8e24073bfadc..397d5645f2359109a16864e07164ed706bf3fc79 100644
--- a/deps/libfabric/fabtests/benchmarks/rdm_pingpong.c
+++ b/deps/libfabric/fabtests/benchmarks/rdm_pingpong.c
@@ -74,7 +74,7 @@ int main(int argc, char **argv)
 	if (!hints)
 		return EXIT_FAILURE;
 
-	while ((op = getopt(argc, argv, "h" CS_OPTS INFO_OPTS BENCHMARK_OPTS)) !=
+	while ((op = getopt(argc, argv, "Uh" CS_OPTS INFO_OPTS BENCHMARK_OPTS)) !=
 			-1) {
 		switch (op) {
 		default:
@@ -82,6 +82,9 @@ int main(int argc, char **argv)
 			ft_parseinfo(op, optarg, hints, &opts);
 			ft_parsecsopts(op, optarg, &opts);
 			break;
+		case 'U':
+			hints->tx_attr->op_flags |= FI_DELIVERY_COMPLETE;
+			break;
 		case '?':
 		case 'h':
 			ft_csusage(argv[0], "Ping pong client and server using RDM.");
@@ -95,9 +98,11 @@ int main(int argc, char **argv)
 
 	hints->ep_attr->type = FI_EP_RDM;
 	hints->caps = FI_MSG;
-	hints->mode = FI_CONTEXT;
+	hints->mode |= FI_CONTEXT;
 	hints->domain_attr->mr_mode = opts.mr_mode;
 	hints->domain_attr->threading = FI_THREAD_DOMAIN;
+	hints->tx_attr->tclass = FI_TC_LOW_LATENCY;
+	hints->addr_format = opts.address_format;
 
 	ret = run();
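
The same three-line `getopt` extension recurs in the tagged and RMA benchmarks below: a new `-U` switch requests delivery-complete semantics (completion only once the peer has processed the data, not merely once it was transmitted), and `hints->mode` now ORs in `FI_CONTEXT` rather than overwriting it. Condensed into a sketch (only the option handling shown; `argv` is whatever the test was launched with):

```c
#include <unistd.h>
#include <rdma/fabric.h>

int main(int argc, char **argv)
{
	struct fi_info *hints = fi_allocinfo();
	int op;

	if (!hints)
		return 1;

	while ((op = getopt(argc, argv, "Uh")) != -1) {
		switch (op) {
		case 'U':
			hints->tx_attr->op_flags |= FI_DELIVERY_COMPLETE;
			break;
		default:
			break;
		}
	}
	/* OR rather than assign, so flags set by earlier parsing survive. */
	hints->mode |= FI_CONTEXT;
	fi_freeinfo(hints);
	return 0;
}
```
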
 
diff --git a/deps/libfabric/fabtests/benchmarks/rdm_tagged_bw.c b/deps/libfabric/fabtests/benchmarks/rdm_tagged_bw.c
index 5252d4484897c7add81c45143e190a0e35405431..932910dbabe9585d973ea7967fda0173fb114e27 100644
--- a/deps/libfabric/fabtests/benchmarks/rdm_tagged_bw.c
+++ b/deps/libfabric/fabtests/benchmarks/rdm_tagged_bw.c
@@ -77,13 +77,16 @@ int main(int argc, char **argv)
 	if (!hints)
 		return EXIT_FAILURE;
 
-	while ((op = getopt(argc, argv, "h" CS_OPTS INFO_OPTS BENCHMARK_OPTS)) != -1) {
+	while ((op = getopt(argc, argv, "Uh" CS_OPTS INFO_OPTS BENCHMARK_OPTS)) != -1) {
 		switch (op) {
 		default:
 			ft_parse_benchmark_opts(op, optarg);
 			ft_parseinfo(op, optarg, hints, &opts);
 			ft_parsecsopts(op, optarg, &opts);
 			break;
+		case 'U':
+			hints->tx_attr->op_flags |= FI_DELIVERY_COMPLETE;
+			break;
 		case '?':
 		case 'h':
 			ft_csusage(argv[0], "Bandwidth test for RDM endpoints using tagged messages.");
@@ -98,9 +101,11 @@ int main(int argc, char **argv)
 	hints->ep_attr->type = FI_EP_RDM;
 	hints->domain_attr->resource_mgmt = FI_RM_ENABLED;
 	hints->caps = FI_TAGGED;
-	hints->mode = FI_CONTEXT;
+	hints->mode |= FI_CONTEXT;
 	hints->domain_attr->mr_mode = opts.mr_mode;
 	hints->domain_attr->threading = FI_THREAD_DOMAIN;
+	hints->tx_attr->tclass = FI_TC_BULK_DATA;
+	hints->addr_format = opts.address_format;
 
 	ret = run();
 
diff --git a/deps/libfabric/fabtests/benchmarks/rdm_tagged_pingpong.c b/deps/libfabric/fabtests/benchmarks/rdm_tagged_pingpong.c
index f21216aed4fbdde16df3af95ec3bc39a4a0ac872..af94698bfa15fcc739e89b096f953a8d476aea8d 100644
--- a/deps/libfabric/fabtests/benchmarks/rdm_tagged_pingpong.c
+++ b/deps/libfabric/fabtests/benchmarks/rdm_tagged_pingpong.c
@@ -76,13 +76,16 @@ int main(int argc, char **argv)
 	if (!hints)
 		return EXIT_FAILURE;
 
-	while ((op = getopt(argc, argv, "h" CS_OPTS INFO_OPTS BENCHMARK_OPTS)) != -1) {
+	while ((op = getopt(argc, argv, "Uh" CS_OPTS INFO_OPTS BENCHMARK_OPTS)) != -1) {
 		switch (op) {
 		default:
 			ft_parse_benchmark_opts(op, optarg);
 			ft_parseinfo(op, optarg, hints, &opts);
 			ft_parsecsopts(op, optarg, &opts);
 			break;
+		case 'U':
+			hints->tx_attr->op_flags |= FI_DELIVERY_COMPLETE;
+			break;
 		case '?':
 		case 'h':
 			ft_csusage(argv[0], "Ping pong client and server using tagged messages.");
@@ -96,9 +99,11 @@ int main(int argc, char **argv)
 
 	hints->ep_attr->type = FI_EP_RDM;
 	hints->caps = FI_TAGGED;
-	hints->mode = FI_CONTEXT;
+	hints->mode |= FI_CONTEXT;
 	hints->domain_attr->mr_mode = opts.mr_mode;
 	hints->domain_attr->threading = FI_THREAD_DOMAIN;
+	hints->tx_attr->tclass = FI_TC_LOW_LATENCY;
+	hints->addr_format = opts.address_format;
 
 	ret = run();
 
diff --git a/deps/libfabric/fabtests/benchmarks/rma_bw.c b/deps/libfabric/fabtests/benchmarks/rma_bw.c
index a8ace33bcc1c81af2f085e1c7e3d4b323fecd218..6267351fde697758f30b81730ede1592331f27b4 100644
--- a/deps/libfabric/fabtests/benchmarks/rma_bw.c
+++ b/deps/libfabric/fabtests/benchmarks/rma_bw.c
@@ -97,16 +97,20 @@ int main(int argc, char **argv)
 	hints->domain_attr->threading = FI_THREAD_DOMAIN;
 	hints->addr_format = opts.address_format;
 
-	while ((op = getopt(argc, argv, "ho:" CS_OPTS INFO_OPTS BENCHMARK_OPTS)) != -1) {
+	while ((op = getopt(argc, argv, "Uh" CS_OPTS INFO_OPTS API_OPTS
+			    BENCHMARK_OPTS)) != -1) {
 		switch (op) {
 		default:
 			ft_parse_benchmark_opts(op, optarg);
 			ft_parseinfo(op, optarg, hints, &opts);
 			ft_parsecsopts(op, optarg, &opts);
-			ret = ft_parse_rma_opts(op, optarg, hints, &opts);
+			ret = ft_parse_api_opts(op, optarg, hints, &opts);
 			if (ret)
 				return ret;
 			break;
+		case 'U':
+			hints->tx_attr->op_flags |= FI_DELIVERY_COMPLETE;
+			break;
 		case '?':
 		case 'h':
 			ft_csusage(argv[0], "Bandwidth test using RMA operations.");
@@ -124,6 +128,7 @@ int main(int argc, char **argv)
 		opts.dst_addr = argv[optind];
 
 	hints->domain_attr->mr_mode = opts.mr_mode;
+	hints->tx_attr->tclass = FI_TC_BULK_DATA;
 
 	ret = run();
 
diff --git a/deps/libfabric/fabtests/common/hmem_rocr.c b/deps/libfabric/fabtests/common/hmem_rocr.c
index 58b11e6bbf8ceabb05a8ebb162ff3846691b60d2..3fed6d77e937af5f0e2dc7d973a10a98f2a4c6bc 100644
--- a/deps/libfabric/fabtests/common/hmem_rocr.c
+++ b/deps/libfabric/fabtests/common/hmem_rocr.c
@@ -61,6 +61,8 @@ struct rocr_ops {
 	hsa_status_t (*hsa_memory_allocate)(hsa_region_t region, size_t size,
 					    void **ptr);
 	hsa_status_t (*hsa_memory_free)(void *ptr);
+	hsa_status_t (*hsa_amd_memory_fill)(void *ptr, uint32_t value,
+					    size_t count);
 };
 
 static struct rocr_ops rocr_ops;
@@ -191,6 +193,13 @@ int ft_rocr_init(void)
 		goto err_dlclose_rocr;
 	}
 
+	rocr_ops.hsa_amd_memory_fill = dlsym(rocr_handle,
+					     "hsa_amd_memory_fill");
+	if (!rocr_ops.hsa_amd_memory_fill) {
+		FT_ERR("Failed to find hsa_amd_memory_fill");
+		goto err_dlclose_rocr;
+	}
+
 	hsa_ret = rocr_ops.hsa_init();
 	if (hsa_ret != HSA_STATUS_SUCCESS) {
 		ROCR_ERR(hsa_ret, "hsa_init failed");
@@ -259,17 +268,62 @@ int ft_rocr_free(void *buf)
 	return -FI_EIO;
 }
 
+#define ROCR_MEM_FILL_BYTE_ALIGNMENT 4U
+
 int ft_rocr_memset(uint64_t device, void *buf, int value, size_t size)
 {
-	unsigned char *ptr = buf;
 	unsigned char set_value = value;
+	void *mem_fill_ptr;
+	size_t mem_fill_size;
+	uint32_t mem_fill_value;
+	hsa_status_t hsa_ret;
+	unsigned char *ptr = buf;
 	int ret;
 
-	while (size-- > 0) {
+	/* Determine if ROCR memory fill can be used to set device memory. ROCR
+	 * memory fill requires 4-byte alignment.
+	 */
+	mem_fill_ptr = (void *) ALIGN((uintptr_t) buf,
+				      ROCR_MEM_FILL_BYTE_ALIGNMENT);
+
+	/* Use ROCR memory copy to fill the start of the buffer until the buffer
+	 * is correctly aligned.
+	 */
+	while (ptr != mem_fill_ptr && size > 0) {
+		ret = ft_rocr_memcpy(device, ptr, &set_value, sizeof(*ptr));
+		if (ret != FI_SUCCESS)
+			return ret;
+
+		size--;
+		ptr++;
+	}
+
+	/* Use ROCR memory fill to fill the middle of the buffer. */
+	if (size >= ROCR_MEM_FILL_BYTE_ALIGNMENT) {
+		mem_fill_size = ALIGN_DOWN(size, ROCR_MEM_FILL_BYTE_ALIGNMENT);
+
+		memset(&mem_fill_value, set_value, sizeof(mem_fill_value));
+
+		hsa_ret = rocr_ops.hsa_amd_memory_fill(mem_fill_ptr,
+						       mem_fill_value,
+						       mem_fill_size /
+						       ROCR_MEM_FILL_BYTE_ALIGNMENT);
+		if (hsa_ret != HSA_STATUS_SUCCESS) {
+			ROCR_ERR(hsa_ret, "hsa_amd_memory_fill failed");
+			return -FI_EIO;
+		}
+
+		size -= mem_fill_size;
+		ptr += mem_fill_size;
+	}
+
+	/* Use ROCR memory copy to fill the end of the buffer. */
+	while (size > 0) {
 		ret = ft_rocr_memcpy(device, ptr, &set_value, sizeof(*ptr));
 		if (ret != FI_SUCCESS)
 			return ret;
 
+		size--;
 		ptr++;
 	}
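
The rewritten `ft_rocr_memset()` is a classic head/middle/tail split: byte-wise writes up to the first 4-byte boundary, one wide fill over the aligned middle, byte-wise writes for the remainder. A host-only sketch of the same arithmetic (the `ALIGN`/`ALIGN_DOWN` macros are defined locally here; the real code uses fabtests' versions, with `ft_rocr_memcpy()` and `hsa_amd_memory_fill()` in place of the helpers below):

```c
#include <assert.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>

#define FILL_ALIGN 4U
#define ALIGN(x, a)      (((uintptr_t)(x) + ((a) - 1)) & ~(uintptr_t)((a) - 1))
#define ALIGN_DOWN(x, a) ((size_t)(x) & ~(size_t)((a) - 1))

static void slow_set(unsigned char *p, unsigned char v, size_t n)
{
	while (n--)			/* plays the ft_rocr_memcpy() role */
		*p++ = v;
}

static void fast_fill(void *p, uint32_t v, size_t nwords)
{
	uint32_t *w = p;

	while (nwords--)		/* plays hsa_amd_memory_fill() */
		*w++ = v;
}

static void aligned_memset(void *buf, unsigned char value, size_t size)
{
	unsigned char *ptr = buf;
	unsigned char *aligned = (unsigned char *) ALIGN(buf, FILL_ALIGN);
	size_t head = (size_t) (aligned - ptr);
	uint32_t word;
	size_t mid;

	if (head > size)
		head = size;
	slow_set(ptr, value, head);		/* unaligned head */
	ptr += head;
	size -= head;

	mid = ALIGN_DOWN(size, FILL_ALIGN);	/* largest aligned run */
	memset(&word, value, sizeof(word));
	fast_fill(ptr, word, mid / FILL_ALIGN);
	ptr += mid;
	size -= mid;

	slow_set(ptr, value, size);		/* leftover tail bytes */
}

int main(void)
{
	unsigned char buf[32];
	int i;

	aligned_memset(buf + 1, 0xab, 29);	/* deliberately misaligned */
	for (i = 1; i < 30; i++)
		assert(buf[i] == 0xab);
	return 0;
}
```
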
 
diff --git a/deps/libfabric/fabtests/common/hmem_ze.c b/deps/libfabric/fabtests/common/hmem_ze.c
index 8e9b73f048589e2a4a7b62d1cb57c0d96229baaf..149dcedc6c6cb1ef0dde270ff898576364cbd2f4 100644
--- a/deps/libfabric/fabtests/common/hmem_ze.c
+++ b/deps/libfabric/fabtests/common/hmem_ze.c
@@ -34,10 +34,13 @@
 #include <config.h>
 #endif
 
+#include <stdio.h>
 #include "hmem.h"
+#include "shared.h"
 
-#ifdef HAVE_LIBZE
+#if HAVE_LIBZE
 
+#include <dlfcn.h>
 #include <level_zero/ze_api.h>
 
 #define ZE_MAX_DEVICES 4
@@ -71,11 +74,180 @@ static const ze_device_mem_alloc_desc_t device_desc = {
 	.ordinal	= 0,
 };
 
-static const ze_host_mem_alloc_desc_t host_desc = {
-	.stype		= ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
-	.pNext		= NULL,
-	.flags		= 0,
-};
+static void *libze_handle;
+static struct libze_ops {
+	ze_result_t (*zeInit)(ze_init_flags_t flags);
+	ze_result_t (*zeDriverGet)(uint32_t *pCount,
+				   ze_driver_handle_t *phDrivers);
+	ze_result_t (*zeDeviceGet)(ze_driver_handle_t hDriver,
+				   uint32_t *pCount,
+				   ze_device_handle_t *phDevices);
+	ze_result_t (*zeDeviceCanAccessPeer)(ze_device_handle_t hDevice,
+					     ze_device_handle_t hPeerDevice,
+					     ze_bool_t *value);
+	ze_result_t (*zeContextCreate)(ze_driver_handle_t hDriver,
+				       const ze_context_desc_t *desc,
+				       ze_context_handle_t *phContext);
+	ze_result_t (*zeContextDestroy)(ze_context_handle_t hContext);
+	ze_result_t (*zeCommandQueueCreate)(ze_context_handle_t hContext,
+					    ze_device_handle_t hDevice,
+					    const ze_command_queue_desc_t *desc,
+					    ze_command_queue_handle_t *phCommandQueue);
+	ze_result_t (*zeCommandQueueDestroy)(ze_command_queue_handle_t hCommandQueue);
+	ze_result_t (*zeCommandQueueExecuteCommandLists)(
+					ze_command_queue_handle_t hCommandQueue,
+					uint32_t numCommandLists,
+					ze_command_list_handle_t *phCommandLists,
+					ze_fence_handle_t hFence);
+	ze_result_t (*zeCommandListCreate)(ze_context_handle_t hContext,
+					   ze_device_handle_t hDevice,
+					   const ze_command_list_desc_t *desc,
+					   ze_command_list_handle_t *phCommandList);
+	ze_result_t (*zeCommandListDestroy)(ze_command_list_handle_t hCommandList);
+	ze_result_t (*zeCommandListClose)(ze_command_list_handle_t hCommandList);
+	ze_result_t (*zeCommandListAppendMemoryCopy)(
+				ze_command_list_handle_t hCommandList,
+				void *dstptr, const void *srcptr, size_t size,
+				ze_event_handle_t hSignalEvent,
+				uint32_t numWaitEvents,
+				ze_event_handle_t *phWaitEvents);
+	ze_result_t (*zeCommandListAppendMemoryFill)(
+				ze_command_list_handle_t hCommandList,
+				void *ptr, const void *pattern,
+				size_t pattern_size, size_t size,
+				ze_event_handle_t hSignalEvent,
+				uint32_t numWaitEvents,
+				ze_event_handle_t *phWaitEvents);
+	ze_result_t (*zeMemAllocDevice)(
+				ze_context_handle_t hContext,
+				const ze_device_mem_alloc_desc_t *device_desc,
+				size_t size, size_t alignment, ze_device_handle_t hDevice,
+				void *pptr);
+	ze_result_t (*zeMemFree)(ze_context_handle_t hContext, void *ptr);
+} libze_ops;
+
+static int init_libze_ops(void)
+{
+	libze_handle = dlopen("libze_loader.so", RTLD_NOW);
+	if (!libze_handle) {
+		FT_ERR("Failed to dlopen libze_loader.so\n");
+		goto err_out;
+	}
+
+	libze_ops.zeInit = dlsym(libze_handle, "zeInit");
+	if (!libze_ops.zeInit) {
+		FT_ERR("Failed to find zeInit\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeDriverGet = dlsym(libze_handle, "zeDriverGet");
+	if (!libze_ops.zeDriverGet) {
+		FT_ERR("Failed to find zeDriverGet\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeDeviceGet = dlsym(libze_handle, "zeDeviceGet");
+	if (!libze_ops.zeDeviceGet) {
+		FT_ERR("Failed to find zeDeviceGet\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeDeviceCanAccessPeer = dlsym(libze_handle, "zeDeviceCanAccessPeer");
+	if (!libze_ops.zeDeviceCanAccessPeer) {
+		FT_ERR("Failed to find zeDeviceCanAccessPeer\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeContextCreate = dlsym(libze_handle, "zeContextCreate");
+	if (!libze_ops.zeContextCreate) {
+		FT_ERR("Failed to find zeContextCreate\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeContextDestroy = dlsym(libze_handle, "zeContextDestroy");
+	if (!libze_ops.zeContextDestroy) {
+		FT_ERR("Failed to find zeContextDestroy\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeCommandQueueCreate = dlsym(libze_handle, "zeCommandQueueCreate");
+	if (!libze_ops.zeCommandQueueCreate) {
+		FT_ERR("Failed to find zeCommandQueueCreate\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeCommandQueueDestroy = dlsym(libze_handle, "zeCommandQueueDestroy");
+	if (!libze_ops.zeCommandQueueDestroy) {
+		FT_ERR("Failed to find zeCommandQueueDestroy\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeCommandQueueExecuteCommandLists = dlsym(libze_handle, "zeCommandQueueExecuteCommandLists");
+	if (!libze_ops.zeCommandQueueExecuteCommandLists) {
+		FT_ERR("Failed to find zeCommandQueueExecuteCommandLists\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeCommandListCreate = dlsym(libze_handle, "zeCommandListCreate");
+	if (!libze_ops.zeCommandListCreate) {
+		FT_ERR("Failed to find zeCommandListCreate\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeCommandListDestroy = dlsym(libze_handle, "zeCommandListDestroy");
+	if (!libze_ops.zeCommandListDestroy) {
+		FT_ERR("Failed to find zeCommandListDestroy\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeCommandListClose = dlsym(libze_handle, "zeCommandListClose");
+	if (!libze_ops.zeCommandListClose) {
+		FT_ERR("Failed to find zeCommandListClose\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeCommandListAppendMemoryCopy = dlsym(libze_handle, "zeCommandListAppendMemoryCopy");
+	if (!libze_ops.zeCommandListAppendMemoryCopy) {
+		FT_ERR("Failed to find zeCommandListAppendMemoryCopy\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeCommandListAppendMemoryFill = dlsym(libze_handle, "zeCommandListAppendMemoryFill");
+	if (!libze_ops.zeCommandListAppendMemoryFill) {
+		FT_ERR("Failed to find zeCommandListAppendMemoryFill\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeMemAllocDevice = dlsym(libze_handle, "zeMemAllocDevice");
+	if (!libze_ops.zeMemAllocDevice) {
+		FT_ERR("Failed to find zeMemAllocDevice\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeMemFree = dlsym(libze_handle, "zeMemFree");
+	if (!libze_ops.zeMemFree) {
+		FT_ERR("Failed to find zeMemFree\n");
+		goto err_dlclose;
+	}
+	return FI_SUCCESS;
+
+err_dlclose:
+	dlclose(libze_handle);
+
+err_out:
+	return -FI_ENODATA;
+}
+
+static void cleanup_libze_ops(void)
+{
+	dlclose(libze_handle);
+}
 
 int ft_ze_init(void)
 {
@@ -84,31 +256,35 @@ int ft_ze_init(void)
 	ze_result_t ze_ret;
 	uint32_t count;
 
-	ze_ret = zeInit(ZE_INIT_FLAG_GPU_ONLY);
+	if (init_libze_ops())
+		return -FI_EIO;
+
+	ze_ret = (*libze_ops.zeInit)(ZE_INIT_FLAG_GPU_ONLY);
 	if (ze_ret)
 		return -FI_EIO;
 
 	count = 1;
-	ze_ret = zeDriverGet(&count, &driver);
+	ze_ret = (*libze_ops.zeDriverGet)(&count, &driver);
 	if (ze_ret)
 		return -FI_EIO;
 
-	ze_ret = zeContextCreate(driver, &context_desc, &context);
+	ze_ret = (*libze_ops.zeContextCreate)(driver, &context_desc, &context);
 	if (ze_ret)
 		return -FI_EIO;
 
 	count = 0;
-	ze_ret = zeDeviceGet(driver, &count, NULL);
+	ze_ret = (*libze_ops.zeDeviceGet)(driver, &count, NULL);
 	if (ze_ret || count > ZE_MAX_DEVICES)
 		goto err;
 
-	ze_ret = zeDeviceGet(driver, &count, devices);
+	ze_ret = (*libze_ops.zeDeviceGet)(driver, &count, devices);
 	if (ze_ret)
 		goto err;
 
 	for (num_devices = 0; num_devices < count; num_devices++) {
-		ze_ret = zeCommandQueueCreate(context, devices[num_devices], &cq_desc,
-					      &cmd_queue[num_devices]);
+		ze_ret = (*libze_ops.zeCommandQueueCreate)(
+					context, devices[num_devices], &cq_desc,
+					&cmd_queue[num_devices]);
 		if (ze_ret)
 			goto err;
 	}
@@ -125,26 +301,28 @@ int ft_ze_cleanup(void)
 	int i, ret = FI_SUCCESS;
 
 	for (i = 0; i < num_devices; i++) {
-		if (cmd_queue[i] && zeCommandQueueDestroy(cmd_queue[i]))
+		if (cmd_queue[i] &&
+		    (*libze_ops.zeCommandQueueDestroy)(cmd_queue[i]))
 			ret = -FI_EINVAL;
 	}
 
-	if (zeContextDestroy(context))
+	if ((*libze_ops.zeContextDestroy)(context))
 		return -FI_EINVAL;
 
+	cleanup_libze_ops();
 	return ret;
 }
 
 int ft_ze_alloc(uint64_t device, void **buf, size_t size)
 {
-	return zeMemAllocShared(context, &device_desc, &host_desc,
-				size, 16, devices[device], buf) ?
-				-FI_EINVAL : 0;
+	return (*libze_ops.zeMemAllocDevice)(context, &device_desc, size, 16,
+					     devices[device], buf) ?
+			-FI_EINVAL : 0;
 }
 
 int ft_ze_free(void *buf)
 {
-	return zeMemFree(context, buf) ? -FI_EINVAL : FI_SUCCESS;
+	return (*libze_ops.zeMemFree)(context, buf) ? -FI_EINVAL : FI_SUCCESS;
 }
 
 int ft_ze_memset(uint64_t device, void *buf, int value, size_t size)
@@ -152,24 +330,26 @@ int ft_ze_memset(uint64_t device, void *buf, int value, size_t size)
 	ze_command_list_handle_t cmd_list;
 	ze_result_t ze_ret;
 
-	ze_ret = zeCommandListCreate(context, devices[device], &cl_desc, &cmd_list);
+	ze_ret = (*libze_ops.zeCommandListCreate)(context, devices[device],
+						  &cl_desc, &cmd_list);
 	if (ze_ret)
 		return -FI_EIO;
 
-	ze_ret = zeCommandListAppendMemoryFill(cmd_list, buf, &value,
-					       sizeof(value), size, NULL, 0, NULL);
+	ze_ret = (*libze_ops.zeCommandListAppendMemoryFill)(
+					cmd_list, buf, &value, sizeof(value),
+					size, NULL, 0, NULL);
 	if (ze_ret)
 		goto free;
 
-	ze_ret = zeCommandListClose(cmd_list);
+	ze_ret = (*libze_ops.zeCommandListClose)(cmd_list);
 	if (ze_ret)
 		goto free;
 
-	ze_ret = zeCommandQueueExecuteCommandLists(cmd_queue[device], 1,
-						   &cmd_list, NULL);
+	ze_ret = (*libze_ops.zeCommandQueueExecuteCommandLists)(
+					cmd_queue[device], 1, &cmd_list, NULL);
 
 free:
-	if (!zeCommandListDestroy(cmd_list) && !ze_ret)
+	if (!(*libze_ops.zeCommandListDestroy)(cmd_list) && !ze_ret)
 		return FI_SUCCESS;
 
 	return -FI_EINVAL;
@@ -180,23 +360,25 @@ int ft_ze_copy(uint64_t device, void *dst, const void *src, size_t size)
 	ze_command_list_handle_t cmd_list;
 	ze_result_t ze_ret;
 
-	ze_ret = zeCommandListCreate(context, devices[device], &cl_desc, &cmd_list);
+	ze_ret = (*libze_ops.zeCommandListCreate)(context, devices[device],
+						  &cl_desc, &cmd_list);
 	if (ze_ret)
 		return -FI_EIO;
 
-	ze_ret = zeCommandListAppendMemoryCopy(cmd_list, dst, src, size, NULL, 0, NULL);
+	ze_ret = (*libze_ops.zeCommandListAppendMemoryCopy)(
+					cmd_list, dst, src, size, NULL, 0, NULL);
 	if (ze_ret)
 		goto free;
 
-	ze_ret = zeCommandListClose(cmd_list);
+	ze_ret = (*libze_ops.zeCommandListClose)(cmd_list);
 	if (ze_ret)
 		goto free;
 
-	ze_ret = zeCommandQueueExecuteCommandLists(cmd_queue[device], 1,
-						   &cmd_list, NULL);
+	ze_ret = (*libze_ops.zeCommandQueueExecuteCommandLists)(
+					cmd_queue[device], 1, &cmd_list, NULL);
 
 free:
-	if (!zeCommandListDestroy(cmd_list) && !ze_ret)
+	if (!(*libze_ops.zeCommandListDestroy)(cmd_list) && !ze_ret)
 		return FI_SUCCESS;
 
 	return -FI_EINVAL;
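
The Level Zero rewrite above swaps direct calls for a `dlopen`/`dlsym` function table, so the fabtests binaries no longer need `libze_loader` at link time. The same pattern in miniature, with `libm`'s `cos()` standing in for the Level Zero entry points (the `libm.so.6` name assumes a glibc system; link with `-ldl` where required):

```c
#include <dlfcn.h>
#include <stdio.h>

/* Resolve symbols at runtime into a struct of function pointers,
 * exactly as init_libze_ops() does for the ze* entry points. */
static struct math_ops {
	double (*cos)(double);
} math_ops;

static int init_math_ops(void)
{
	void *handle = dlopen("libm.so.6", RTLD_NOW);

	if (!handle)
		return -1;

	math_ops.cos = (double (*)(double)) dlsym(handle, "cos");
	if (!math_ops.cos) {
		dlclose(handle);
		return -1;
	}
	return 0;
}

int main(void)
{
	if (init_math_ops())
		return 1;
	printf("cos(0) = %f\n", (*math_ops.cos)(0.0));	/* 1.000000 */
	return 0;
}
```
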
diff --git a/deps/libfabric/fabtests/common/jsmn.c b/deps/libfabric/fabtests/common/jsmn.c
deleted file mode 100644
index 9fe0fb4d4d9d6870d299b287525277430894378b..0000000000000000000000000000000000000000
--- a/deps/libfabric/fabtests/common/jsmn.c
+++ /dev/null
@@ -1,333 +0,0 @@
-/*
- * Copyright (c) 2010 Serge A. Zaitsev
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#include <stdlib.h>
-
-#include "jsmn.h"
-
-/**
- * Allocates a fresh unused token from the token pull.
- */
-static jsmntok_t *jsmn_alloc_token(jsmn_parser *parser,
-		jsmntok_t *tokens, size_t num_tokens) {
-	jsmntok_t *tok;
-	if (parser->toknext >= num_tokens) {
-		return NULL;
-	}
-	tok = &tokens[parser->toknext++];
-	tok->start = tok->end = -1;
-	tok->size = 0;
-#ifdef JSMN_PARENT_LINKS
-	tok->parent = -1;
-#endif
-	return tok;
-}
-
-/**
- * Fills token type and boundaries.
- */
-static void jsmn_fill_token(jsmntok_t *token, jsmntype_t type,
-                            int start, int end) {
-	token->type = type;
-	token->start = start;
-	token->end = end;
-	token->size = 0;
-}
-
-/**
- * Fills next available token with JSON primitive.
- */
-static jsmnerr_t jsmn_parse_primitive(jsmn_parser *parser, const char *js,
-		size_t len, jsmntok_t *tokens, size_t num_tokens) {
-	jsmntok_t *token;
-	int start;
-
-	start = parser->pos;
-
-	for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) {
-		switch (js[parser->pos]) {
-#ifndef JSMN_STRICT
-			/* In strict mode primitive must be followed by "," or "}" or "]" */
-			case ':':
-#endif
-			case '\t' : case '\r' : case '\n' : case ' ' :
-			case ','  : case ']'  : case '}' :
-				goto found;
-		}
-		if (js[parser->pos] < 32 || js[parser->pos] >= 127) {
-			parser->pos = start;
-			return JSMN_ERROR_INVAL;
-		}
-	}
-#ifdef JSMN_STRICT
-	/* In strict mode primitive must be followed by a comma/object/array */
-	parser->pos = start;
-	return JSMN_ERROR_PART;
-#endif
-
-found:
-	if (tokens == NULL) {
-		parser->pos--;
-		return 0;
-	}
-	token = jsmn_alloc_token(parser, tokens, num_tokens);
-	if (token == NULL) {
-		parser->pos = start;
-		return JSMN_ERROR_NOMEM;
-	}
-	jsmn_fill_token(token, JSMN_PRIMITIVE, start, parser->pos);
-#ifdef JSMN_PARENT_LINKS
-	token->parent = parser->toksuper;
-#endif
-	parser->pos--;
-	return 0;
-}
-
-/**
- * Filsl next token with JSON string.
- */
-static jsmnerr_t jsmn_parse_string(jsmn_parser *parser, const char *js,
-		size_t len, jsmntok_t *tokens, size_t num_tokens) {
-	jsmntok_t *token;
-
-	int start = parser->pos;
-
-	parser->pos++;
-
-	/* Skip starting quote */
-	for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) {
-		char c = js[parser->pos];
-
-		/* Quote: end of string */
-		if (c == '\"') {
-			if (tokens == NULL) {
-				return 0;
-			}
-			token = jsmn_alloc_token(parser, tokens, num_tokens);
-			if (token == NULL) {
-				parser->pos = start;
-				return JSMN_ERROR_NOMEM;
-			}
-			jsmn_fill_token(token, JSMN_STRING, start+1, parser->pos);
-#ifdef JSMN_PARENT_LINKS
-			token->parent = parser->toksuper;
-#endif
-			return 0;
-		}
-
-		/* Backslash: Quoted symbol expected */
-		if (c == '\\' && parser->pos + 1 < len) {
-			int i;
-			parser->pos++;
-			switch (js[parser->pos]) {
-				/* Allowed escaped symbols */
-				case '\"': case '/' : case '\\' : case 'b' :
-				case 'f' : case 'r' : case 'n'  : case 't' :
-					break;
-				/* Allows escaped symbol \uXXXX */
-				case 'u':
-					parser->pos++;
-					for(i = 0; i < 4 && parser->pos < len && js[parser->pos] != '\0'; i++) {
-						/* If it isn't a hex character we have an error */
-						if(!((js[parser->pos] >= 48 && js[parser->pos] <= 57) || /* 0-9 */
-									(js[parser->pos] >= 65 && js[parser->pos] <= 70) || /* A-F */
-									(js[parser->pos] >= 97 && js[parser->pos] <= 102))) { /* a-f */
-							parser->pos = start;
-							return JSMN_ERROR_INVAL;
-						}
-						parser->pos++;
-					}
-					parser->pos--;
-					break;
-				/* Unexpected symbol */
-				default:
-					parser->pos = start;
-					return JSMN_ERROR_INVAL;
-			}
-		}
-	}
-	parser->pos = start;
-	return JSMN_ERROR_PART;
-}
-
-/**
- * Parse JSON string and fill tokens.
- */
-jsmnerr_t jsmn_parse(jsmn_parser *parser, const char *js, size_t len,
-		jsmntok_t *tokens, unsigned int num_tokens) {
-	jsmnerr_t r;
-	int i;
-	jsmntok_t *token;
-	int count = 0;
-
-	for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) {
-		char c;
-		jsmntype_t type;
-
-		c = js[parser->pos];
-		switch (c) {
-			case '{': case '[':
-				count++;
-				if (tokens == NULL) {
-					break;
-				}
-				token = jsmn_alloc_token(parser, tokens, num_tokens);
-				if (token == NULL)
-					return JSMN_ERROR_NOMEM;
-				if (parser->toksuper != -1) {
-					tokens[parser->toksuper].size++;
-#ifdef JSMN_PARENT_LINKS
-					token->parent = parser->toksuper;
-#endif
-				}
-				token->type = (c == '{' ? JSMN_OBJECT : JSMN_ARRAY);
-				token->start = parser->pos;
-				parser->toksuper = parser->toknext - 1;
-				break;
-			case '}': case ']':
-				if (tokens == NULL)
-					break;
-				type = (c == '}' ? JSMN_OBJECT : JSMN_ARRAY);
-#ifdef JSMN_PARENT_LINKS
-				if (parser->toknext < 1) {
-					return JSMN_ERROR_INVAL;
-				}
-				token = &tokens[parser->toknext - 1];
-				for (;;) {
-					if (token->start != -1 && token->end == -1) {
-						if (token->type != type) {
-							return JSMN_ERROR_INVAL;
-						}
-						token->end = parser->pos + 1;
-						parser->toksuper = token->parent;
-						break;
-					}
-					if (token->parent == -1) {
-						break;
-					}
-					token = &tokens[token->parent];
-				}
-#else
-				for (i = parser->toknext - 1; i >= 0; i--) {
-					token = &tokens[i];
-					if (token->start != -1 && token->end == -1) {
-						if (token->type != type) {
-							return JSMN_ERROR_INVAL;
-						}
-						parser->toksuper = -1;
-						token->end = parser->pos + 1;
-						break;
-					}
-				}
-				/* Error if unmatched closing bracket */
-				if (i == -1) return JSMN_ERROR_INVAL;
-				for (; i >= 0; i--) {
-					token = &tokens[i];
-					if (token->start != -1 && token->end == -1) {
-						parser->toksuper = i;
-						break;
-					}
-				}
-#endif
-				break;
-			case '\"':
-				r = jsmn_parse_string(parser, js, len, tokens, num_tokens);
-				if (r < 0) return r;
-				count++;
-				if (parser->toksuper != -1 && tokens != NULL)
-					tokens[parser->toksuper].size++;
-				break;
-			case '\t' : case '\r' : case '\n' : case ' ':
-				break;
-			case ':':
-				parser->toksuper = parser->toknext - 1;
-				break;
-			case ',':
-				if (tokens != NULL &&
-						tokens[parser->toksuper].type != JSMN_ARRAY &&
-						tokens[parser->toksuper].type != JSMN_OBJECT) {
-#ifdef JSMN_PARENT_LINKS
-					parser->toksuper = tokens[parser->toksuper].parent;
-#else
-					for (i = parser->toknext - 1; i >= 0; i--) {
-						if (tokens[i].type == JSMN_ARRAY || tokens[i].type == JSMN_OBJECT) {
-							if (tokens[i].start != -1 && tokens[i].end == -1) {
-								parser->toksuper = i;
-								break;
-							}
-						}
-					}
-#endif
-				}
-				break;
-#ifdef JSMN_STRICT
-			/* In strict mode primitives are: numbers and booleans */
-			case '-': case '0': case '1' : case '2': case '3' : case '4':
-			case '5': case '6': case '7' : case '8': case '9':
-			case 't': case 'f': case 'n' :
-				/* And they must not be keys of the object */
-				if (tokens != NULL) {
-					jsmntok_t *t = &tokens[parser->toksuper];
-					if (t->type == JSMN_OBJECT ||
-							(t->type == JSMN_STRING && t->size != 0)) {
-						return JSMN_ERROR_INVAL;
-					}
-				}
-#else
-			/* In non-strict mode every unquoted value is a primitive */
-			default:
-#endif
-				r = jsmn_parse_primitive(parser, js, len, tokens, num_tokens);
-				if (r < 0) return r;
-				count++;
-				if (parser->toksuper != -1 && tokens != NULL)
-					tokens[parser->toksuper].size++;
-				break;
-
-#ifdef JSMN_STRICT
-			/* Unexpected char in strict mode */
-			default:
-				return JSMN_ERROR_INVAL;
-#endif
-		}
-	}
-
-	for (i = parser->toknext - 1; i >= 0; i--) {
-		/* Unmatched opened object or array */
-		if (tokens && tokens[i].start != -1 && tokens[i].end == -1) {
-			return JSMN_ERROR_PART;
-		}
-	}
-
-	return count;
-}
-
-/**
- * Creates a new parser based over a given  buffer with an array of tokens
- * available.
- */
-void jsmn_init(jsmn_parser *parser) {
-	parser->pos = 0;
-	parser->toknext = 0;
-	parser->toksuper = -1;
-}
-
diff --git a/deps/libfabric/fabtests/common/shared.c b/deps/libfabric/fabtests/common/shared.c
index 95737b64094d8c276e25e4f670ae49f7f557d72d..6a0fa03ddd395bdac8369db1d9d686a2a46e2411 100644
--- a/deps/libfabric/fabtests/common/shared.c
+++ b/deps/libfabric/fabtests/common/shared.c
@@ -2,6 +2,7 @@
  * Copyright (c) 2013-2018 Intel Corporation.  All rights reserved.
  * Copyright (c) 2016 Cray Inc.  All rights reserved.
  * Copyright (c) 2014-2017, Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights reserved.
  *
  * This software is available to you under the BSD license below:
  *
@@ -29,6 +30,7 @@
  */
 
 #include <assert.h>
+#include <fcntl.h>
 #include <netdb.h>
 #include <poll.h>
 #include <stdlib.h>
@@ -60,6 +62,8 @@ struct fid_pep *pep;
 struct fid_ep *ep, *alias_ep;
 struct fid_cq *txcq, *rxcq;
 struct fid_cntr *txcntr, *rxcntr;
+struct fid_ep *srx;
+struct fid_stx *stx;
 struct fid_mr *mr;
 void *mr_desc = NULL;
 struct fid_av *av;
@@ -144,7 +148,6 @@ struct test_size_param test_size[] = {
 
 const unsigned int test_cnt = (sizeof test_size / sizeof test_size[0]);
 
-#define INTEG_SEED 7
 static const char integ_alphabet[] = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
 static const int integ_alphabet_length = (sizeof(integ_alphabet)/sizeof(*integ_alphabet)) - 1;
 
@@ -365,6 +368,7 @@ static int ft_reg_mr(void *buf, size_t size, uint64_t access,
 	struct fi_mr_attr attr = {0};
 	struct iovec iov = {0};
 	int ret;
+	uint64_t flags;
 
 	if (((!(fi->domain_attr->mr_mode & FI_MR_LOCAL) &&
 	      !(opts.options & FT_OPT_USE_DEVICE)) ||
@@ -391,7 +395,8 @@ static int ft_reg_mr(void *buf, size_t size, uint64_t access,
 		break;
 	}
 
-	ret = fi_mr_regattr(domain, &attr, 0, mr);
+	flags = (opts.iface) ? FI_HMEM_DEVICE_ONLY : 0;
+	ret = fi_mr_regattr(domain, &attr, flags, mr);
 	if (ret)
 		return ret;
 
@@ -476,7 +481,7 @@ static int ft_alloc_msgs(void)
 		ft_set_tx_rx_sizes(&tx_size, &rx_size);
 		tx_mr_size = 0;
 		rx_mr_size = 0;
-		buf_size = MAX(tx_size, FT_MAX_CTRL_MSG) * opts.window_size + 
+		buf_size = MAX(tx_size, FT_MAX_CTRL_MSG) * opts.window_size +
 			   MAX(rx_size, FT_MAX_CTRL_MSG) * opts.window_size;
 	}
 
@@ -537,6 +542,42 @@ static int ft_alloc_msgs(void)
 	return 0;
 }
 
+int ft_open_domain_res(void)
+{
+	int ret;
+
+	ret = fi_domain(fabric, fi, &domain, NULL);
+	if (ret) {
+		FT_PRINTERR("fi_domain", ret);
+		return ret;
+	}
+
+	if (opts.options & FT_OPT_DOMAIN_EQ) {
+		ret = fi_domain_bind(domain, &eq->fid, 0);
+		if (ret) {
+			FT_PRINTERR("fi_domain_bind", ret);
+			return ret;
+		}
+	}
+
+	if (opts.options & FT_OPT_STX) {
+		ret = fi_stx_context(domain, fi->tx_attr, &stx, NULL);
+		if (ret) {
+			FT_PRINTERR("fi_stx_context", ret);
+			return ret;
+		}
+	}
+
+	if (opts.options & FT_OPT_SRX) {
+		ret = fi_srx_context(domain, fi->rx_attr, &srx, NULL);
+		if (ret) {
+			FT_PRINTERR("fi_srx_context", ret);
+			return ret;
+		}
+	}
+	return 0;
+}
+
 int ft_open_fabric_res(void)
 {
 	int ret;
@@ -553,13 +594,7 @@ int ft_open_fabric_res(void)
 		return ret;
 	}
 
-	ret = fi_domain(fabric, fi, &domain, NULL);
-	if (ret) {
-		FT_PRINTERR("fi_domain", ret);
-		return ret;
-	}
-
-	return 0;
+	return ft_open_domain_res();
 }
 
 int ft_alloc_ep_res(struct fi_info *fi)
@@ -676,26 +711,37 @@ int ft_alloc_active_res(struct fi_info *fi)
 	return 0;
 }
 
-static int ft_init(void)
+int ft_init(void)
 {
 	tx_seq = 0;
 	rx_seq = 0;
 	tx_cq_cntr = 0;
 	rx_cq_cntr = 0;
 
-	//If using device memory for transfers, require OOB address
-	//exchange because extra steps are involved when passing
-	//device buffers into fi_av_insert
-	if (opts.options & FT_OPT_ENABLE_HMEM)
-		opts.options |= FT_OPT_OOB_ADDR_EXCH;
-
 	return ft_hmem_init(opts.iface);
 }
 
+int ft_sock_setup(int sock)
+{
+	int ret, op;
+
+	op = 1;
+	ret = setsockopt(sock, IPPROTO_TCP, TCP_NODELAY,
+			  (void *) &op, sizeof(op));
+	if (ret)
+		return ret;
+
+	ret = ft_fd_nonblock(sock);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
 int ft_init_oob(void)
 {
-	int ret, op, err;
 	struct addrinfo *ai = NULL;
+	int ret;
 
 	if (!(opts.options & FT_OPT_OOB_CTRL) || oob_sock != -1)
 		return 0;
@@ -717,7 +763,6 @@ int ft_init_oob(void)
 
 		close(listen_sock);
 	} else {
-
 		ret = getaddrinfo(opts.dst_addr, opts.oob_port, NULL, &ai);
 		if (ret) {
 			perror("getaddrinfo");
@@ -740,11 +785,7 @@ int ft_init_oob(void)
 		sleep(1);
 	}
 
-	op = 1;
-	err = setsockopt(oob_sock, IPPROTO_TCP, TCP_NODELAY,
-			 (void *) &op, sizeof(op));
-	if (err)
-		perror("setsockopt"); /* non-fatal error */
+	ret = ft_sock_setup(oob_sock);
 
 free:
 	if (ai)
@@ -752,6 +793,10 @@ free:
 	return ret;
 }
 
+/*
+ * Handles a persistent server communicating with multiple clients,
+ * one at a time, in sequence.
+ */
 int ft_accept_next_client() {
 	int ret;
 
@@ -762,9 +807,48 @@ int ft_accept_next_client() {
 			return ret;
 	}
 
+	/* Clients may be separate processes, so re-initialize any OOB setup. */
+	if (opts.options & FT_OPT_OOB_ADDR_EXCH) {
+		ret = ft_reset_oob();
+		if (ret)
+			return ret;
+	}
 	return ft_init_av();
 }
 
+/*
+ * Re-initialize the OOB setup.
+ */
+int ft_reset_oob()
+{
+	int ret;
+	ret = ft_close_oob();
+	if (ret) {
+		FT_PRINTERR("ft_close_oob", ret);
+		return ret;
+	}
+	ret = ft_init_oob();
+	if (ret) {
+		FT_PRINTERR("ft_init_oob", ret);
+		return ret;
+	}
+	return 0;
+}
+
+int ft_close_oob()
+{
+	int ret;
+	if (oob_sock == -1)
+		return 0;
+	ret = close(oob_sock);
+	if (ret) {
+		FT_PRINTERR("close", errno);
+		return ret;
+	}
+	oob_sock = -1;
+	return 0;
+}
+
 int ft_getinfo(struct fi_info *hints, struct fi_info **info)
 {
 	char *node, *service;
@@ -932,11 +1016,9 @@ int ft_server_connect(void)
 	if (ret)
 		goto err;
 
-	ret = fi_domain(fabric, fi, &domain, NULL);
-	if (ret) {
-		FT_PRINTERR("fi_domain", ret);
+	ret = ft_open_domain_res();
+	if (ret)
 		goto err;
-	}
 
 	ret = ft_alloc_active_res(fi);
 	if (ret)
@@ -950,6 +1032,9 @@ int ft_server_connect(void)
 	if (ret)
 		goto err;
 
+	if (ft_check_opts(FT_OPT_FORK_CHILD))
+		ft_fork_child();
+
 	return 0;
 
 err:
@@ -1008,6 +1093,9 @@ int ft_client_connect(void)
 	if (ret)
 		return ret;
 
+	if (ft_check_opts(FT_OPT_FORK_CHILD))
+		ft_fork_child();
+
 	return 0;
 }
 
@@ -1043,6 +1131,9 @@ int ft_init_fabric(void)
 	if (ret)
 		return ret;
 
+	if (ft_check_opts(FT_OPT_FORK_CHILD))
+		ft_fork_child();
+
 	return 0;
 }
 
@@ -1070,18 +1161,18 @@ int ft_init_alias_ep(uint64_t flags)
 	return 0;
 }
 
-int ft_enable_ep(struct fid_ep *ep, struct fid_eq *eq, struct fid_av *av,
-		 struct fid_cq *txcq, struct fid_cq *rxcq,
-		 struct fid_cntr *txcntr, struct fid_cntr *rxcntr)
+int ft_enable_ep(struct fid_ep *ep)
 {
 	uint64_t flags;
 	int ret;
 
-	if (fi->ep_attr->type == FI_EP_MSG || fi->caps & FI_MULTICAST ||
-	    fi->caps & FI_COLLECTIVE)
+	if ((fi->ep_attr->type == FI_EP_MSG || fi->caps & FI_MULTICAST ||
+	    fi->caps & FI_COLLECTIVE) && !(opts.options & FT_OPT_DOMAIN_EQ))
 		FT_EP_BIND(ep, eq, 0);
 
 	FT_EP_BIND(ep, av, 0);
+	FT_EP_BIND(ep, stx, 0);
+	FT_EP_BIND(ep, srx, 0);
 
 	flags = FI_TRANSMIT;
 	if (!(opts.options & FT_OPT_TX_CQ))
@@ -1135,7 +1226,7 @@ int ft_enable_ep_recv(void)
 {
 	int ret;
 
-	ret = ft_enable_ep(ep, eq, av, txcq, rxcq, txcntr, rxcntr);
+	ret = ft_enable_ep(ep);
 	if (ret)
 		return ret;
 
@@ -1235,6 +1326,7 @@ int ft_exchange_addresses_oob(struct fid_av *av_ptr, struct fid_ep *ep_ptr,
 int ft_init_av_dst_addr(struct fid_av *av_ptr, struct fid_ep *ep_ptr,
 		fi_addr_t *remote_addr)
 {
+	char temp[FT_MAX_CTRL_MSG];
 	size_t addrlen;
 	int ret;
 
@@ -1252,13 +1344,17 @@ int ft_init_av_dst_addr(struct fid_av *av_ptr, struct fid_ep *ep_ptr,
 			return ret;
 
 		addrlen = FT_MAX_CTRL_MSG;
-		ret = fi_getname(&ep_ptr->fid, (char *) tx_buf + ft_tx_prefix_size(),
-				&addrlen);
+		ret = fi_getname(&ep_ptr->fid, temp, &addrlen);
 		if (ret) {
 			FT_PRINTERR("fi_getname", ret);
 			return ret;
 		}
 
+		ret = ft_hmem_copy_to(opts.iface, opts.device,
+				      tx_buf + ft_tx_prefix_size(), temp, addrlen);
+		if (ret)
+			return ret;
+
 		ret = (int) ft_tx(ep, *remote_addr, addrlen, &tx_ctx);
 		if (ret)
 			return ret;
@@ -1267,18 +1363,28 @@ int ft_init_av_dst_addr(struct fid_av *av_ptr, struct fid_ep *ep_ptr,
 		if (ret)
 			return ret;
 	} else {
-		ret = (int) ft_rx(ep, FT_MAX_CTRL_MSG);
+		ret = ft_get_rx_comp(rx_seq);
+		if (ret)
+			return ret;
+
+		ret = ft_hmem_copy_from(opts.iface, opts.device, temp,
+					rx_buf + ft_rx_prefix_size(),
+					FT_MAX_CTRL_MSG);
 		if (ret)
 			return ret;
 
 		/* Test passing NULL fi_addr on one of the sides (server) if
 		 * AV type is FI_AV_TABLE */
-		ret = ft_av_insert(av_ptr, (char *) rx_buf + ft_rx_prefix_size(),
-				   1, ((fi->domain_attr->av_type == FI_AV_TABLE) ?
+		ret = ft_av_insert(av_ptr, temp, 1,
+				   ((fi->domain_attr->av_type == FI_AV_TABLE) ?
 				       NULL : remote_addr), 0, NULL);
 		if (ret)
 			return ret;
 
+		ret = ft_post_rx(ep, rx_size, &rx_ctx);
+		if (ret)
+			return ret;
+
 		if (fi->domain_attr->av_type == FI_AV_TABLE)
 			*remote_addr = 0;
 
@@ -1359,148 +1465,79 @@ int ft_init_av_addr(struct fid_av *av_ptr, struct fid_ep *ep_ptr,
 	return 0;
 }
 
-int ft_exchange_raw_keys(struct fi_rma_iov *peer_iov)
+int ft_exchange_keys(struct fi_rma_iov *peer_iov)
 {
-	struct fi_rma_iov *rma_iov;
-	size_t key_size;
-	size_t len;
+	char temp[FT_MAX_CTRL_MSG];
+	struct fi_rma_iov *rma_iov = (struct fi_rma_iov *) temp;
+	size_t key_size = 0, len;
 	uint64_t addr;
 	int ret;
 
-	/* Get key size */
-	key_size = 0;
-	ret = fi_mr_raw_attr(mr, &addr, NULL, &key_size, 0);
-	if (ret != -FI_ETOOSMALL) {
-		return ret;
-	}
-
-	len = sizeof(*rma_iov) + key_size - sizeof(rma_iov->key);
-	/* TODO: make sure this fits in tx_buf and rx_buf */
-
-	if (opts.dst_addr) {
-		rma_iov = (struct fi_rma_iov *) (tx_buf + ft_tx_prefix_size());
-		if ((fi->domain_attr->mr_mode == FI_MR_BASIC) ||
-		    (fi->domain_attr->mr_mode & FI_MR_VIRT_ADDR)) {
-			rma_iov->addr = (uintptr_t) rx_buf + ft_rx_prefix_size();
-		} else {
-			rma_iov->addr = 0;
-		}
-
-		/* Get raw attributes */
-		ret = fi_mr_raw_attr(mr, &addr, (uint8_t *) &rma_iov->key,
-				&key_size, 0);
-		if (ret)
-			return ret;
-
-		ret = ft_tx(ep, remote_fi_addr, len, &tx_ctx);
-		if (ret)
-			return ret;
-
-		ret = ft_get_rx_comp(rx_seq);
-		if (ret)
-			return ret;
-
-		rma_iov = (struct fi_rma_iov *) (rx_buf + ft_rx_prefix_size());
-		peer_iov->addr 	= rma_iov->addr;
-		peer_iov->len 	= rma_iov->len;
-		/* Map remote mr raw locally */
-		ret = fi_mr_map_raw(domain, rma_iov->addr,
-				(uint8_t *) &rma_iov->key, key_size,
-				&peer_iov->key, 0);
-		if (ret)
+	if (fi->domain_attr->mr_mode & FI_MR_RAW) {
+		ret = fi_mr_raw_attr(mr, &addr, NULL, &key_size, 0);
+		if (ret != -FI_ETOOSMALL)
 			return ret;
-
-		ret = ft_post_rx(ep, rx_size, &rx_ctx);
+		len = sizeof(*rma_iov) + key_size - sizeof(rma_iov->key);
+		if (len > FT_MAX_CTRL_MSG) {
+			FT_PRINTERR("Raw key too large for ctrl message",
+				    -FI_ETOOSMALL);
+			return -FI_ETOOSMALL;
+		}
 	} else {
-		ret = ft_get_rx_comp(rx_seq);
-		if (ret)
-			return ret;
-
-		rma_iov = (struct fi_rma_iov *) (rx_buf + ft_rx_prefix_size());
-		peer_iov->addr 	= rma_iov->addr;
-		peer_iov->len 	= rma_iov->len;
-		/* Map remote mr raw locally */
-		ret = fi_mr_map_raw(domain, rma_iov->addr,
-				(uint8_t *) &rma_iov->key, key_size,
-				&peer_iov->key, 0);
-		if (ret)
-			return ret;
-
-		ret = ft_post_rx(ep, rx_size, &rx_ctx);
-		if (ret)
-			return ret;
+		len = sizeof(*rma_iov);
+	}
 
-		rma_iov = (struct fi_rma_iov *) (tx_buf + ft_tx_prefix_size());
-		if ((fi->domain_attr->mr_mode == FI_MR_BASIC) ||
-		    (fi->domain_attr->mr_mode & FI_MR_VIRT_ADDR)) {
-			rma_iov->addr = (uintptr_t) rx_buf + ft_rx_prefix_size();
-		} else {
-			rma_iov->addr = 0;
-		}
+	if ((fi->domain_attr->mr_mode == FI_MR_BASIC) ||
+	    (fi->domain_attr->mr_mode & FI_MR_VIRT_ADDR)) {
+		rma_iov->addr = (uintptr_t) rx_buf + ft_rx_prefix_size();
+	} else {
+		rma_iov->addr = 0;
+	}
 
-		/* Get raw attributes */
+	if (fi->domain_attr->mr_mode & FI_MR_RAW) {
 		ret = fi_mr_raw_attr(mr, &addr, (uint8_t *) &rma_iov->key,
-				&key_size, 0);
+				     &key_size, 0);
 		if (ret)
 			return ret;
-
-		ret = ft_tx(ep, remote_fi_addr, len, &tx_ctx);
+	} else {
+		rma_iov->key = fi_mr_key(mr);
 	}
 
-	return ret;
-}
+	ret = ft_hmem_copy_to(opts.iface, opts.device,
+			      tx_buf + ft_tx_prefix_size(), temp, len);
+	if (ret)
+		return ret;
 
-int ft_exchange_keys(struct fi_rma_iov *peer_iov)
-{
-	struct fi_rma_iov *rma_iov;
-	int ret;
+	ret = ft_tx(ep, remote_fi_addr, len, &tx_ctx);
+	if (ret)
+		return ret;
 
-	if (fi->domain_attr->mr_mode & FI_MR_RAW)
-		return ft_exchange_raw_keys(peer_iov);
+	ret = ft_get_rx_comp(rx_seq);
+	if (ret)
+		return ret;
 
-	if (opts.dst_addr) {
-		rma_iov = (struct fi_rma_iov *) (tx_buf + ft_tx_prefix_size());
-		if ((fi->domain_attr->mr_mode == FI_MR_BASIC) ||
-		    (fi->domain_attr->mr_mode & FI_MR_VIRT_ADDR)) {
-			rma_iov->addr = (uintptr_t) rx_buf + ft_rx_prefix_size();
-		} else {
-			rma_iov->addr = 0;
-		}
-		rma_iov->key = fi_mr_key(mr);
-		ret = ft_tx(ep, remote_fi_addr, sizeof *rma_iov, &tx_ctx);
-		if (ret)
-			return ret;
+	ret = ft_hmem_copy_from(opts.iface, opts.device, temp,
+				rx_buf + ft_rx_prefix_size(), FT_MAX_CTRL_MSG);
+	if (ret)
+		return ret;
 
-		ret = ft_get_rx_comp(rx_seq);
+	if (fi->domain_attr->mr_mode & FI_MR_RAW) {
+		peer_iov->addr = rma_iov->addr;
+		peer_iov->len = rma_iov->len;
+		ret = fi_mr_map_raw(domain, rma_iov->addr,
+				    (uint8_t *) &rma_iov->key, key_size,
+				    &peer_iov->key, 0);
 		if (ret)
 			return ret;
-
-		rma_iov = (struct fi_rma_iov *) (rx_buf + ft_rx_prefix_size());
-		*peer_iov = *rma_iov;
-		ret = ft_post_rx(ep, rx_size, &rx_ctx);
 	} else {
-		ret = ft_get_rx_comp(rx_seq);
-		if (ret)
-			return ret;
-
-		rma_iov = (struct fi_rma_iov *) (rx_buf + ft_rx_prefix_size());
 		*peer_iov = *rma_iov;
-		ret = ft_post_rx(ep, rx_size, &rx_ctx);
-		if (ret)
-			return ret;
-
-		rma_iov = (struct fi_rma_iov *) (tx_buf + ft_tx_prefix_size());
-		if ((fi->domain_attr->mr_mode == FI_MR_BASIC) ||
-		    (fi->domain_attr->mr_mode & FI_MR_VIRT_ADDR)) {
-			rma_iov->addr = (uintptr_t) rx_buf + ft_rx_prefix_size();
-		} else {
-			rma_iov->addr = 0;
-		}
-		rma_iov->key = fi_mr_key(mr);
-		ret = ft_tx(ep, remote_fi_addr, sizeof *rma_iov, &tx_ctx);
 	}
 
-	return ret;
+	ret = ft_post_rx(ep, rx_size, &rx_ctx);
+	if (ret)
+		return ret;
+
+	return ft_sync();
 }
 
 static void ft_cleanup_mr_array(struct ft_context *ctx_arr, char **mr_bufs)
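
The consolidated ft_exchange_keys above folds the former raw/non-raw split into one path: the fi_rma_iov is staged in a host-side temp buffer (so it also works when tx_buf/rx_buf live in device memory), and FI_MR_RAW providers use the two-call sizing idiom of fi_mr_raw_attr. A sketch of that idiom in isolation, with the wrapper function being a hypothetical name:

	#include <stdlib.h>
	#include <rdma/fi_domain.h>

	/* Sketch: query a raw MR key's size before fetching it. The first
	 * fi_mr_raw_attr() call passes key_size == 0 and is expected to
	 * fail with -FI_ETOOSMALL, reporting the provider's key length. */
	static int get_raw_key(struct fid_mr *mr, uint64_t *base_addr,
			       uint8_t **raw_key, size_t *key_size)
	{
		int ret;

		*key_size = 0;
		ret = fi_mr_raw_attr(mr, base_addr, NULL, key_size, 0);
		if (ret != -FI_ETOOSMALL)
			return ret;	/* anything else is a genuine error */

		*raw_key = malloc(*key_size);
		if (!*raw_key)
			return -FI_ENOMEM;

		return fi_mr_raw_attr(mr, base_addr, *raw_key, key_size, 0);
	}
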
@@ -1520,8 +1557,6 @@ static void ft_cleanup_mr_array(struct ft_context *ctx_arr, char **mr_bufs)
 
 static void ft_close_fids(void)
 {
-	if (mr != &no_mr)
-		FT_CLOSE_FID(mr);
 	FT_CLOSE_FID(mc);
 	FT_CLOSE_FID(alias_ep);
 	FT_CLOSE_FID(ep);
@@ -1535,9 +1570,13 @@ static void ft_close_fids(void)
 	FT_CLOSE_FID(rxcntr);
 	FT_CLOSE_FID(txcntr);
 	FT_CLOSE_FID(pollset);
+	if (mr != &no_mr)
+		FT_CLOSE_FID(mr);
 	FT_CLOSE_FID(av);
-	FT_CLOSE_FID(eq);
+	FT_CLOSE_FID(srx);
+	FT_CLOSE_FID(stx);
 	FT_CLOSE_FID(domain);
+	FT_CLOSE_FID(eq);
 	FT_CLOSE_FID(waitset);
 	FT_CLOSE_FID(fabric);
 }
@@ -1575,7 +1614,7 @@ void ft_free_res(void)
 		fi_freeinfo(hints);
 		hints = NULL;
 	}
-	
+
 	ret = ft_hmem_cleanup(opts.iface);
 	if (ret)
 		FT_PRINTERR("ft_hmem_cleanup", ret);
@@ -1732,11 +1771,14 @@ static const size_t datatype_size_table[] = {
 	[FI_DOUBLE_COMPLEX] = sizeof(OFI_COMPLEX(double)),
 	[FI_LONG_DOUBLE]    = sizeof(long double),
 	[FI_LONG_DOUBLE_COMPLEX] = sizeof(OFI_COMPLEX(long_double)),
+	/* Compute 128-bit integer size, since compiler may not support type. */
+	[FI_INT128]  = sizeof(int64_t) * 2,
+	[FI_UINT128] = sizeof(uint64_t) * 2,
 };
 
 size_t datatype_to_size(enum fi_datatype datatype)
 {
-	if (datatype >= FI_DATATYPE_LAST)
+	if (datatype >= ARRAY_SIZE(datatype_size_table))
 		return 0;
 
 	return datatype_size_table[datatype];
@@ -1753,6 +1795,14 @@ void init_test(struct ft_opts *opts, char *test_name, size_t test_name_len)
 		opts->iterations = size_to_count(opts->transfer_size);
 }
 
+static void ft_force_progress(void)
+{
+	if (txcq)
+		fi_cq_read(txcq, NULL, 0);
+	if (rxcq)
+		fi_cq_read(rxcq, NULL, 0);
+}
+
 static int ft_progress(struct fid_cq *cq, uint64_t total, uint64_t *cq_cntr)
 {
 	struct fi_cq_err_entry comp;
@@ -1843,8 +1893,11 @@ ssize_t ft_tx(struct fid_ep *ep, fi_addr_t fi_addr, size_t size, void *ctx)
 {
 	ssize_t ret;
 
-	if (ft_check_opts(FT_OPT_VERIFY_DATA | FT_OPT_ACTIVE))
-		ft_fill_buf((char *) tx_buf + ft_tx_prefix_size(), size);
+	if (ft_check_opts(FT_OPT_VERIFY_DATA | FT_OPT_ACTIVE)) {
+		ret = ft_fill_buf((char *) tx_buf + ft_tx_prefix_size(), size);
+		if (ret)
+			return ret;
+	}
 
 	ret = ft_post_tx(ep, fi_addr, size, NO_CQ_DATA, ctx);
 	if (ret)
@@ -1874,8 +1927,11 @@ ssize_t ft_inject(struct fid_ep *ep, fi_addr_t fi_addr, size_t size)
 {
 	ssize_t ret;
 
-	if (ft_check_opts(FT_OPT_VERIFY_DATA | FT_OPT_ACTIVE))
-		ft_fill_buf((char *) tx_buf + ft_tx_prefix_size(), size);
+	if (ft_check_opts(FT_OPT_VERIFY_DATA | FT_OPT_ACTIVE)) {
+		ret = ft_fill_buf((char *) tx_buf + ft_tx_prefix_size(), size);
+		if (ret)
+			return ret;
+	}
 
 	ret = ft_post_inject(ep, fi_addr, size);
 	if (ret)
@@ -2071,8 +2127,8 @@ ssize_t ft_post_rx_buf(struct fid_ep *ep, size_t size, void *ctx,
 	if (hints->caps & FI_TAGGED) {
 		op_tag = op_tag ? op_tag : rx_seq;
 		FT_POST(fi_trecv, ft_progress, rxcq, rx_seq, &rx_cq_cntr,
-			"receive", ep, op_buf, size, op_mr_desc, 0, op_tag,
-			0, ctx);
+			"receive", ep, op_buf, size, op_mr_desc,
+			remote_fi_addr, op_tag, 0, ctx);
 	} else {
 		FT_POST(fi_recv, ft_progress, rxcq, rx_seq, &rx_cq_cntr,
 			"receive", ep, op_buf, size, op_mr_desc, 0, ctx);
@@ -2591,6 +2647,21 @@ int ft_fork_and_pair(void)
 	return 0;
 }
 
+int ft_fork_child(void)
+{
+	ft_child_pid = fork();
+	if (ft_child_pid < 0) {
+		FT_PRINTERR("fork", ft_child_pid);
+		return -errno;
+	}
+
+	if (ft_child_pid == 0) {
+		exit(0);
+	}
+
+	return 0;
+}
+
 int ft_wait_child(void)
 {
 	int ret;
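
The new -K option (FT_OPT_FORK_CHILD, parsed further down in ft_parseinfo) makes ft_init_fabric, ft_client_connect, and ft_server_connect fork a child right after endpoint initialization; the child exits immediately, so the test only checks that the parent's fabric resources survive a fork() — a known trouble spot for fabrics with registered memory. As an illustrative invocation (test and provider names are examples, not from this patch): fi_rdm_pingpong -K -p tcp.
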
@@ -2639,7 +2710,7 @@ int ft_finalize_ep(struct fid_ep *ep)
 
 		FT_POST(fi_tsendmsg, ft_progress, txcq, tx_seq,
 			&tx_cq_cntr, "tsendmsg", ep, &tmsg,
-			FI_INJECT | FI_TRANSMIT_COMPLETE);
+			FI_TRANSMIT_COMPLETE);
 	} else {
 		struct fi_msg msg;
 
@@ -2652,7 +2723,7 @@ int ft_finalize_ep(struct fid_ep *ep)
 
 		FT_POST(fi_sendmsg, ft_progress, txcq, tx_seq,
 			&tx_cq_cntr, "sendmsg", ep, &msg,
-			FI_INJECT | FI_TRANSMIT_COMPLETE);
+			FI_TRANSMIT_COMPLETE);
 	}
 
 	ret = ft_get_tx_comp(tx_seq);
@@ -2801,7 +2872,17 @@ void ft_usage(char *name, char *desc)
 	FT_PRINT_OPTS_USAGE("", "fi_resmgmt_test");
 	FT_PRINT_OPTS_USAGE("", "fi_inj_complete");
 	FT_PRINT_OPTS_USAGE("", "fi_bw");
+	FT_PRINT_OPTS_USAGE("-U", "run fabtests with FI_DELIVERY_COMPLETE set");
+	FT_PRINT_OPTS_USAGE("", "Only the following tests support this option for now:");
+	FT_PRINT_OPTS_USAGE("", "fi_bw");
+	FT_PRINT_OPTS_USAGE("", "fi_rdm");
+	FT_PRINT_OPTS_USAGE("", "fi_rdm_atomic");
+	FT_PRINT_OPTS_USAGE("", "fi_rdm_pingpong");
+	FT_PRINT_OPTS_USAGE("", "fi_rdm_tagged_bw");
+	FT_PRINT_OPTS_USAGE("", "fi_rdm_tagged_pingpong");
+	FT_PRINT_OPTS_USAGE("", "fi_rma_bw");
 	FT_PRINT_OPTS_USAGE("-M <mode>", "Disable mode bit from test");
+	FT_PRINT_OPTS_USAGE("-K", "fork a child process after initializing endpoint");
 	FT_PRINT_OPTS_USAGE("", "mr_local");
 	FT_PRINT_OPTS_USAGE("-a <address vector name>", "name of address vector");
 	FT_PRINT_OPTS_USAGE("-h", "display this help output");
@@ -2838,10 +2919,13 @@ void ft_csusage(char *name, char *desc)
 {
 	ft_usage(name, desc);
 	FT_PRINT_OPTS_USAGE("-I <number>", "number of iterations");
+	FT_PRINT_OPTS_USAGE("-Q", "bind EQ to domain (vs. endpoint)");
 	FT_PRINT_OPTS_USAGE("-w <number>", "number of warmup iterations");
 	FT_PRINT_OPTS_USAGE("-S <size>", "specific transfer size or 'all'");
 	FT_PRINT_OPTS_USAGE("-l", "align transmit and receive buffers to page size");
 	FT_PRINT_OPTS_USAGE("-m", "machine readable output");
+	FT_PRINT_OPTS_USAGE("-D <device_iface>", "Specify device interface: eg cuda, ze(default: None). "
+			     "Automatically enables FI_HMEM (-H)");
 	FT_PRINT_OPTS_USAGE("-t <type>", "completion type [queue, counter]");
 	FT_PRINT_OPTS_USAGE("-c <method>", "completion method [spin, sread, fd, yield]");
 	FT_PRINT_OPTS_USAGE("-h", "display this help output");
@@ -2898,6 +2982,8 @@ void ft_parseinfo(int op, char *optarg, struct fi_info *hints,
 	case 'D':
 		if (!strncasecmp("ze", optarg, 2))
 			opts->iface = FI_HMEM_ZE;
+		else if (!strncasecmp("cuda", optarg, 4))
+			opts->iface = FI_HMEM_CUDA;
 		else
 			printf("Unsupported interface\n");
 		opts->options |= FT_OPT_ENABLE_HMEM | FT_OPT_USE_DEVICE;
@@ -2908,6 +2994,9 @@ void ft_parseinfo(int op, char *optarg, struct fi_info *hints,
 	case 'H':
 		opts->options |= FT_OPT_ENABLE_HMEM;
 		break;
+	case 'K':
+		opts->options |= FT_OPT_FORK_CHILD;
+		break;
 	default:
 		/* let getopt handle unknown opts*/
 		break;
@@ -2938,10 +3027,10 @@ void ft_parse_addr_opts(int op, char *optarg, struct ft_opts *opts)
 			opts->oob_port = default_oob_port;
 		break;
 	case 'F':
-		if (!strncasecmp("fi_sockaddr_in", optarg, 14))
-			opts->address_format = FI_SOCKADDR_IN;
-		else if (!strncasecmp("fi_sockaddr_in6", optarg, 15))
+		if (!strncasecmp("fi_sockaddr_in6", optarg, 15))
 			opts->address_format = FI_SOCKADDR_IN6;
+		else if (!strncasecmp("fi_sockaddr_in", optarg, 14))
+			opts->address_format = FI_SOCKADDR_IN;
 		else if (!strncasecmp("fi_sockaddr_ib", optarg, 14))
 			opts->address_format = FI_SOCKADDR_IB;
 		else if (!strncasecmp("fi_sockaddr", optarg, 11)) /* keep me last */
@@ -2965,6 +3054,9 @@ void ft_parsecsopts(int op, char *optarg, struct ft_opts *opts)
 		opts->options |= FT_OPT_ITER;
 		opts->iterations = atoi(optarg);
 		break;
+	case 'Q':
+		opts->options |= FT_OPT_DOMAIN_EQ;
+		break;
 	case 'S':
 		if (!strncasecmp("all", optarg, 3)) {
 			opts->sizes_enabled = FT_ENABLE_ALL;
@@ -3005,27 +3097,31 @@ void ft_parsecsopts(int op, char *optarg, struct ft_opts *opts)
 	}
 }
 
-int ft_parse_rma_opts(int op, char *optarg, struct fi_info *hints,
+int ft_parse_api_opts(int op, char *optarg, struct fi_info *hints,
 		      struct ft_opts *opts)
 {
 	switch (op) {
 	case 'o':
-		if (!strcmp(optarg, "read")) {
+		if (!strcasecmp(optarg, "read")) {
 			hints->caps |= FI_READ | FI_REMOTE_READ;
 			opts->rma_op = FT_RMA_READ;
-		} else if (!strcmp(optarg, "writedata")) {
+		} else if (!strcasecmp(optarg, "writedata")) {
 			hints->caps |= FI_WRITE | FI_REMOTE_WRITE;
 			hints->mode |= FI_RX_CQ_DATA;
 			hints->domain_attr->cq_data_size = 4;
 			opts->rma_op = FT_RMA_WRITEDATA;
 			cq_attr.format = FI_CQ_FORMAT_DATA;
-		} else if (!strcmp(optarg, "write")) {
+		} else if (!strcasecmp(optarg, "write")) {
 			hints->caps |= FI_WRITE | FI_REMOTE_WRITE;
 			opts->rma_op = FT_RMA_WRITE;
+		} else if (!strcasecmp(optarg, "msg")) {
+			hints->caps |= FI_MSG;
+		} else if (!strcasecmp(optarg, "tagged")) {
+			hints->caps |= FI_TAGGED;
 		} else {
-			fprintf(stderr, "Invalid operation type: \"%s\". Usage:\n"
-					"-o <op>\trma op type: read|write|writedata "
-				       "(default:write)\n", optarg);
+			fprintf(stderr, "Invalid operation type: \"%s\"."
+				"Usage:\n-o <op>\top: "
+				"read|write|writedata|msg|tagged\n", optarg);
 			return EXIT_FAILURE;
 		}
 		break;
@@ -3036,47 +3132,76 @@ int ft_parse_rma_opts(int op, char *optarg, struct fi_info *hints,
 	return 0;
 }
 
-void ft_fill_buf(void *buf, size_t size)
+int ft_fill_buf(void *buf, size_t size)
 {
 	char *msg_buf;
-	int msg_index;
-	static unsigned int iter = 0;
+	int msg_index = 0;
 	size_t i;
+	int ret = 0;
+
+	if (opts.iface != FI_HMEM_SYSTEM) {
+		msg_buf = malloc(size);
+		if (!msg_buf)
+			return -FI_ENOMEM;
+	} else {
+		msg_buf = (char *) buf;
+	}
 
-	msg_index = ((iter++)*INTEG_SEED) % integ_alphabet_length;
-	msg_buf = (char *)buf;
 	for (i = 0; i < size; i++) {
-		msg_buf[i] = integ_alphabet[msg_index++];
-		if (msg_index >= integ_alphabet_length)
+		msg_buf[i] = integ_alphabet[msg_index];
+		if (++msg_index >= integ_alphabet_length)
 			msg_index = 0;
 	}
+
+	if (opts.iface != FI_HMEM_SYSTEM) {
+		ret = ft_hmem_copy_to(opts.iface, opts.device, buf, msg_buf, size);
+		if (ret)
+			goto out;
+	}
+out:
+	if (opts.iface != FI_HMEM_SYSTEM)
+		free(msg_buf);
+	return ret;
 }
 
 int ft_check_buf(void *buf, size_t size)
 {
 	char *recv_data;
 	char c;
-	static unsigned int iter = 0;
-	int msg_index;
+	int msg_index = 0;
 	size_t i;
+	int ret = 0;
 
-	msg_index = ((iter++)*INTEG_SEED) % integ_alphabet_length;
-	recv_data = (char *)buf;
+	if (opts.iface != FI_HMEM_SYSTEM) {
+		recv_data = malloc(size);
+		if (!recv_data)
+			return -FI_ENOMEM;
+
+		ret = ft_hmem_copy_from(opts.iface, opts.device,
+					recv_data, buf, size);
+		if (ret)
+			goto out;
+	} else {
+		recv_data = (char *)buf;
+	}
 
 	for (i = 0; i < size; i++) {
-		c = integ_alphabet[msg_index++];
-		if (msg_index >= integ_alphabet_length)
+		c = integ_alphabet[msg_index];
+		if (++msg_index >= integ_alphabet_length)
 			msg_index = 0;
 		if (c != recv_data[i])
 			break;
 	}
 	if (i != size) {
-		printf("Error at iteration=%d size=%zu byte=%zu\n",
-			iter, size, i);
-		return 1;
+		printf("Data check error (%c!=%c) at byte %zu for "
+		       "buffer size %zu\n", c, recv_data[i], i, size);
+		ret = -FI_EIO;
 	}
 
-	return 0;
+out:
+	if (opts.iface != FI_HMEM_SYSTEM)
+		free(recv_data);
+	return ret;
 }
 
 uint64_t ft_init_cq_data(struct fi_info *info)
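
ft_fill_buf and ft_check_buf now stage through a malloc'd host buffer whenever opts.iface is not FI_HMEM_SYSTEM, since device memory cannot be dereferenced from the CPU. The same bounce-buffer idiom in isolation (a sketch assuming the fabtests shared.h/hmem.h context; the function name and pattern are hypothetical):

	#include <stdlib.h>

	/* Sketch: build a pattern on the host, then move it to the
	 * device buffer with a single ft_hmem_copy_to() call. */
	static int fill_device_buf(void *dev_buf, size_t size)
	{
		char *host;
		size_t i;
		int ret;

		host = malloc(size);
		if (!host)
			return -FI_ENOMEM;
		for (i = 0; i < size; i++)
			host[i] = 'a' + (i % 26);	/* any repeatable pattern */
		ret = ft_hmem_copy_to(opts.iface, opts.device,
				      dev_buf, host, size);
		free(host);
		return ret;
	}
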
@@ -3214,17 +3339,14 @@ int ft_sock_connect(char *node, char *service)
 		goto free;
 	}
 
-	ret = 1;
-	ret = setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, (void *) &ret, sizeof(ret));
-	if (ret)
-		perror("setsockopt");
-
 	ret = connect(sock, ai->ai_addr, ai->ai_addrlen);
 	if (ret) {
 		perror("connect");
 		close(sock);
+		goto free;
 	}
 
+	ret = ft_sock_setup(sock);
 free:
 	freeaddrinfo(ai);
 	return ret;
@@ -3232,7 +3354,7 @@ free:
 
 int ft_sock_accept()
 {
-	int ret, op;
+	int ret;
 
 	sock = accept(listen_sock, NULL, 0);
         if (sock < 0) {
@@ -3241,48 +3363,51 @@ int ft_sock_accept()
 		return ret;
 	}
 
-	op = 1;
-	ret = setsockopt(sock, IPPROTO_TCP, TCP_NODELAY,
-			  (void *) &op, sizeof(op));
-	if (ret)
-		perror("setsockopt");
-
-	return 0;
+	ret = ft_sock_setup(sock);
+	return ret;
 }
 
 int ft_sock_send(int fd, void *msg, size_t len)
 {
-	int ret;
+	size_t sent;
+	ssize_t ret, err = 0;
 
-	ret = send(fd, msg, len, 0);
-	if (ret == len) {
-		return 0;
-	} else if (ret < 0) {
-		perror("send");
-		return -errno;
-	} else {
-		perror("send aborted");
-		return -FI_ECONNABORTED;
+	for (sent = 0; sent < len; ) {
+		ret = send(fd, ((char *) msg) + sent, len - sent, 0);
+		if (ret > 0) {
+			sent += ret;
+		} else if (errno == EAGAIN || errno == EWOULDBLOCK) {
+			ft_force_progress();
+		} else {
+			err = -errno;
+			break;
+		}
 	}
+
+	return err ? err : 0;
 }
 
 int ft_sock_recv(int fd, void *msg, size_t len)
 {
-	int ret;
+	size_t rcvd;
+	ssize_t ret, err = 0;
 
-	ret = recv(fd, msg, len, MSG_WAITALL);
-	if (ret == len) {
-		return 0;
-	} else if (ret == 0) {
-		return -FI_ENOTCONN;
-	} else if (ret < 0) {
-		FT_PRINTERR("ft_sock_recv", -errno);
-		perror("recv");
-		return -errno;
-	} else {
-		perror("recv aborted");
-		return -FI_ECONNABORTED;
+	for (rcvd = 0; rcvd < len; ) {
+		ret = recv(fd, ((char *) msg) + rcvd, len - rcvd, 0);
+		if (ret > 0) {
+			rcvd += ret;
+		} else if (ret == 0) {
+			err = -FI_ENOTCONN;
+			break;
+		} else if (errno == EAGAIN || errno == EWOULDBLOCK) {
+			ft_force_progress();
+		} else {
+			err = -errno;
+			break;
+		}
 	}
+
+	return err ? err : 0;
 }
 
 int ft_sock_sync(int value)
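
With ft_sock_setup the out-of-band control socket is now TCP_NODELAY and nonblocking, and ft_sock_send/ft_sock_recv loop over short transfers, calling ft_force_progress() on EAGAIN so manual-progress providers keep draining completions while the OOB exchange waits. Caller-side the helpers still behave like blocking full-length transfers; a fragment sketch (the token payload is a made-up example, and `sock` is the connected control socket from above):

	/* Sketch: a fixed-size token exchange over the OOB socket
	 * using the reworked helpers; error handling trimmed. */
	uint32_t token = 0xcafe;
	int ret = ft_sock_send(sock, &token, sizeof token);
	if (!ret)
		ret = ft_sock_recv(sock, &token, sizeof token);
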
diff --git a/deps/libfabric/fabtests/configure.ac b/deps/libfabric/fabtests/configure.ac
index eeea4baffe9086955d5dce08b2f2b3a9560a27b0..bc3b44ac208846cb90d58e9eca209857da66bf80 100644
--- a/deps/libfabric/fabtests/configure.ac
+++ b/deps/libfabric/fabtests/configure.ac
@@ -1,11 +1,11 @@
 dnl
 dnl Copyright (c) 2016-2017 Cisco Systems, Inc.  All rights reserved.
-dnl Copyright (c) 2018-2020 Intel Corporation, Inc.  All rights reserved.
+dnl Copyright (c) 2018-2021 Intel Corporation, Inc.  All rights reserved.
 dnl
 dnl Process this file with autoconf to produce a configure script.
 
 AC_PREREQ(2.57)
-AC_INIT([fabtests], [1.11.1], [ofiwg@lists.openfabrics.org])
+AC_INIT([fabtests], [1.14.0], [ofiwg@lists.openfabrics.org])
 AC_CONFIG_AUX_DIR(config)
 AC_CONFIG_MACRO_DIR(config)
 AC_CONFIG_HEADERS(config.h)
@@ -51,6 +51,16 @@ AC_ARG_ENABLE([debug],
 AC_DEFINE_UNQUOTED([ENABLE_DEBUG], [$dbg],
 	[defined to 1 if configured with --enable-debug])
 
+AC_ARG_ENABLE([asan],
+	      [AS_HELP_STRING([--enable-asan],
+			      [Enable address sanitizer @<:@default=no@:>@])
+	      ],
+	      [],
+	      [enable_asan=no])
+
+AS_IF([test x"$enable_asan" != x"no"],
+      [CFLAGS="-fsanitize=address $CFLAGS"])
+
 dnl Fix autoconf's habit of adding -g -O2 by default
 AS_IF([test -z "$CFLAGS"],
       [CFLAGS="-O2 -DNDEBUG ${base_c_warn_flags}"])
@@ -137,17 +147,20 @@ AC_CHECK_HEADER([rdma/fabric.h], [],
 
 AC_ARG_WITH([ze],
             AC_HELP_STRING([--with-ze], [Use non-default ZE location - default NO]),
-            [CPPFLAGS="-I$withval/include $CPPFLAGS"
-             LDFLAGS="-L$withval/$lib $LDFLAGS"],
-            [])
+            AS_IF([test x"$withval" != x"no"],
+		  [CPPFLAGS="-I$withval/include $CPPFLAGS"
+		   LDFLAGS="-L$withval/$lib $LDFLAGS"]))
 
-dnl Checks for ZE libraries
+dnl Checks for ZE support. Requires fabtests to dlopen the ZE libraries.
+have_ze=0
 AS_IF([test x"$with_ze" != x"no"],
-      [AC_CHECK_LIB([ze_loader], zeInit,
-       AC_CHECK_HEADER([level_zero/ze_api.h],
-			AC_DEFINE([HAVE_LIBZE], 1, [ZE support])),
-			[])]
-      [])
+      [AC_CHECK_HEADER([level_zero/ze_api.h], [have_ze=1])])
+
+AS_IF([test x"$with_ze" != x"no" && test -n "$with_ze" && test "$have_ze" = "0" ],
+	[AC_MSG_ERROR([ZE support requested but ZE runtime not available.])],
+	[])
+
+AC_DEFINE_UNQUOTED([HAVE_LIBZE], [$have_ze], [ZE support])
 
 AC_MSG_CHECKING([for fi_trywait support])
 AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <rdma/fi_eq.h>]],
@@ -165,6 +178,10 @@ AC_CHECK_FUNC([epoll_create1], [have_epoll=1], [have_epoll=0])
 AC_DEFINE_UNQUOTED([HAVE_EPOLL], [$have_epoll],
 		   [Defined to 1 if Linux epoll is available])
 
+dnl Check for 128-bit integer support
+AC_CHECK_TYPE([__int128],
+	[AC_DEFINE(HAVE___INT128, 1, [Set to 1 to use 128-bit ints])])
+
 AC_CONFIG_FILES([Makefile fabtests.spec])
 
 AC_OUTPUT
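
Two smaller configure additions: --enable-asan simply prepends -fsanitize=address to CFLAGS, and the AC_CHECK_TYPE probe defines HAVE___INT128 when the compiler offers a native 128-bit integer. Code can then fall back to a two-word representation, which is why the datatype size table in shared.c sizes FI_INT128 as sizeof(int64_t) * 2. A sketch of such a fallback (the struct layout and typedef name are assumptions for illustration):

	#include <stdint.h>

	#ifdef HAVE___INT128
	typedef __int128 ft_int128_t;
	#else
	typedef struct { int64_t lo; int64_t hi; } ft_int128_t;	/* same size */
	#endif
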
diff --git a/deps/libfabric/fabtests/fabtests.sln b/deps/libfabric/fabtests/fabtests.sln
index 2e460cb0283eedcadf2157e8fdcbb6170e7c8497..1735e56da5f828aa18aa631a2b084a22630907eb 100644
--- a/deps/libfabric/fabtests/fabtests.sln
+++ b/deps/libfabric/fabtests/fabtests.sln
@@ -9,18 +9,24 @@ Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug-v140|x64 = Debug-v140|x64
 		Debug-v141|x64 = Debug-v141|x64
+		Debug-v142|x64 = Debug-v142|x64
 		Release-v140|x64 = Release-v140|x64
 		Release-v141|x64 = Release-v141|x64
+		Release-v142|x64 = Release-v142|x64
 	EndGlobalSection
 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
 		{076F757A-8827-4D3C-A87F-6E49623C16E1}.Debug-v140|x64.ActiveCfg = Debug-v140|x64
 		{076F757A-8827-4D3C-A87F-6E49623C16E1}.Debug-v140|x64.Build.0 = Debug-v140|x64
 		{076F757A-8827-4D3C-A87F-6E49623C16E1}.Debug-v141|x64.ActiveCfg = Debug-v141|x64
 		{076F757A-8827-4D3C-A87F-6E49623C16E1}.Debug-v141|x64.Build.0 = Debug-v141|x64
+		{076F757A-8827-4D3C-A87F-6E49623C16E1}.Debug-v142|x64.ActiveCfg = Debug-v142|x64
+		{076F757A-8827-4D3C-A87F-6E49623C16E1}.Debug-v142|x64.Build.0 = Debug-v142|x64
 		{076F757A-8827-4D3C-A87F-6E49623C16E1}.Release-v140|x64.ActiveCfg = Release-v140|x64
 		{076F757A-8827-4D3C-A87F-6E49623C16E1}.Release-v140|x64.Build.0 = Release-v140|x64
 		{076F757A-8827-4D3C-A87F-6E49623C16E1}.Release-v141|x64.ActiveCfg = Release-v141|x64
 		{076F757A-8827-4D3C-A87F-6E49623C16E1}.Release-v141|x64.Build.0 = Release-v141|x64
+		{076F757A-8827-4D3C-A87F-6E49623C16E1}.Release-v142|x64.ActiveCfg = Release-v142|x64
+		{076F757A-8827-4D3C-A87F-6E49623C16E1}.Release-v142|x64.Build.0 = Release-v142|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
diff --git a/deps/libfabric/fabtests/fabtests.vcxproj b/deps/libfabric/fabtests/fabtests.vcxproj
index ad3cd3e3e9bc085fa0f2eee1041977af811e0d53..d7bd0feba3b4b5df2c64252f7301a45b2cf6ae2a 100644
--- a/deps/libfabric/fabtests/fabtests.vcxproj
+++ b/deps/libfabric/fabtests/fabtests.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <ItemGroup Label="ProjectConfigurations">
     <ProjectConfiguration Include="Debug-v140|x64">
@@ -9,6 +9,10 @@
       <Configuration>Debug-v141</Configuration>
       <Platform>x64</Platform>
     </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug-v142|x64">
+      <Configuration>Debug-v142</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
     <ProjectConfiguration Include="Release-v140|x64">
       <Configuration>Release-v140</Configuration>
       <Platform>x64</Platform>
@@ -17,6 +21,10 @@
       <Configuration>Release-v141</Configuration>
       <Platform>x64</Platform>
     </ProjectConfiguration>
+    <ProjectConfiguration Include="Release-v142|x64">
+      <Configuration>Release-v142</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
   </ItemGroup>
   <PropertyGroup Label="Globals">
     <ProjectGuid>{076F757A-8827-4D3C-A87F-6E49623C16E1}</ProjectGuid>
@@ -37,6 +45,13 @@
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>MultiByte</CharacterSet>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'" Label="Configuration">
+    <ConfigurationType>Makefile</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v142</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'" Label="Configuration">
     <ConfigurationType>Makefile</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
@@ -51,6 +66,13 @@
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>MultiByte</CharacterSet>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'" Label="Configuration">
+    <ConfigurationType>Makefile</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v142</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
   </ImportGroup>
@@ -62,12 +84,18 @@
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
   <PropertyGroup Label="UserMacros" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">
     <NMakePreprocessorDefinitions>WIN32;_DEBUG;$(NMakePreprocessorDefinitions)</NMakePreprocessorDefinitions>
@@ -83,6 +111,13 @@
     <NMakeReBuildCommandLine>nmake /F Makefile.win config=$(Configuration) arch=x$(PlatformArchitecture) clean all</NMakeReBuildCommandLine>
     <NMakeCleanCommandLine>nmake /F Makefile.win config=$(Configuration) arch=x$(PlatformArchitecture) clean</NMakeCleanCommandLine>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">
+    <NMakePreprocessorDefinitions>WIN32;_DEBUG;$(NMakePreprocessorDefinitions)</NMakePreprocessorDefinitions>
+    <ExecutablePath>$(ProjectDir)Include;$(ExecutablePath)</ExecutablePath>
+    <NMakeBuildCommandLine>nmake /F Makefile.win config=$(Configuration) arch=x$(PlatformArchitecture) all</NMakeBuildCommandLine>
+    <NMakeReBuildCommandLine>nmake /F Makefile.win config=$(Configuration) arch=x$(PlatformArchitecture) clean all</NMakeReBuildCommandLine>
+    <NMakeCleanCommandLine>nmake /F Makefile.win config=$(Configuration) arch=x$(PlatformArchitecture) clean</NMakeCleanCommandLine>
+  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">
     <NMakePreprocessorDefinitions>WIN32;NDEBUG;$(NMakePreprocessorDefinitions)</NMakePreprocessorDefinitions>
     <ExecutablePath>$(ProjectDir)Include;$(ExecutablePath)</ExecutablePath>
@@ -97,6 +132,13 @@
     <NMakeReBuildCommandLine>nmake /F Makefile.win config=$(Configuration) arch=x$(PlatformArchitecture) clean all</NMakeReBuildCommandLine>
     <NMakeCleanCommandLine>nmake /F Makefile.win config=$(Configuration) arch=x$(PlatformArchitecture) clean</NMakeCleanCommandLine>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">
+    <NMakePreprocessorDefinitions>WIN32;NDEBUG;$(NMakePreprocessorDefinitions)</NMakePreprocessorDefinitions>
+    <ExecutablePath>$(ProjectDir)Include;$(ExecutablePath)</ExecutablePath>
+    <NMakeBuildCommandLine>nmake /F Makefile.win config=$(Configuration) arch=x$(PlatformArchitecture) all</NMakeBuildCommandLine>
+    <NMakeReBuildCommandLine>nmake /F Makefile.win config=$(Configuration) arch=x$(PlatformArchitecture) clean all</NMakeReBuildCommandLine>
+    <NMakeCleanCommandLine>nmake /F Makefile.win config=$(Configuration) arch=x$(PlatformArchitecture) clean</NMakeCleanCommandLine>
+  </PropertyGroup>
   <ItemGroup>
     <ClCompile Include="benchmarks\benchmark_shared.c" />
     <ClCompile Include="benchmarks\dgram_pingpong.c" />
@@ -107,7 +149,6 @@
     <ClCompile Include="benchmarks\rdm_tagged_bw.c" />
     <ClCompile Include="benchmarks\rdm_tagged_pingpong.c" />
     <ClCompile Include="benchmarks\rma_bw.c" />
-    <ClCompile Include="common\jsmn.c" />
     <ClCompile Include="common\hmem.c" />
     <ClCompile Include="common\hmem_cuda.c" />
     <ClCompile Include="common\hmem_rocr.c" />
@@ -132,7 +173,7 @@
     <ClCompile Include="functional\msg_sockets.c" />
     <ClCompile Include="functional\poll.c" />
     <ClCompile Include="functional\rdm.c" />
-    <ClCompile Include="functional\rdm_rma_simple.c" />
+    <ClCompile Include="functional\rdm_rma_event.c" />
     <ClCompile Include="functional\rdm_rma_trigger.c" />
     <ClCompile Include="functional\rdm_shared_ctx.c" />
     <ClCompile Include="functional\rdm_tagged_peek.c" />
diff --git a/deps/libfabric/fabtests/fabtests.vcxproj.filters b/deps/libfabric/fabtests/fabtests.vcxproj.filters
index 571896ccee19f9a2ef12eb74fd13474be80af5ee..207c5cb789204df9b072d2d8ce008c1f9779da02 100644
--- a/deps/libfabric/fabtests/fabtests.vcxproj.filters
+++ b/deps/libfabric/fabtests/fabtests.vcxproj.filters
@@ -87,7 +87,7 @@
     <ClCompile Include="functional\rdm.c">
       <Filter>Source Files\functional</Filter>
     </ClCompile>
-    <ClCompile Include="functional\rdm_rma_simple.c">
+    <ClCompile Include="functional\rdm_rma_event.c">
       <Filter>Source Files\functional</Filter>
     </ClCompile>
     <ClCompile Include="functional\rdm_rma_trigger.c">
diff --git a/deps/libfabric/fabtests/functional/av_xfer.c b/deps/libfabric/fabtests/functional/av_xfer.c
index f8a6646403b2a8d22f0fa9efde93a026f959bdf9..1a912d7897b63555fbe7e9e189bc87c07949a89e 100644
--- a/deps/libfabric/fabtests/functional/av_xfer.c
+++ b/deps/libfabric/fabtests/functional/av_xfer.c
@@ -109,9 +109,9 @@ static int av_removal_test(void)
 		}
 	}
 
-	fprintf(stdout, "PASS\n");
 	(void) ft_sync();
 out:
+	fprintf(stdout, "%s\n", ret ? "FAIL" : "PASS");
 	ft_free_res();
 	return ret;
 }
@@ -127,7 +127,7 @@ static int av_reinsert_test(void)
 
 	ret = ft_init_fabric();
 	if (ret)
-		return ret;
+		goto out;
 
 	if (opts.dst_addr) {
 		ret = ft_tx(ep, remote_fi_addr, opts.transfer_size, &tx_ctx);
@@ -171,9 +171,9 @@ static int av_reinsert_test(void)
 		}
 	}
 
-	fprintf(stdout, "PASS\n");
 	(void) ft_sync();
 out:
+	fprintf(stdout, "%s\n", ret ? "FAIL" : "PASS");
 	ft_free_res();
 	return ret;
 }
diff --git a/deps/libfabric/fabtests/functional/bw.c b/deps/libfabric/fabtests/functional/bw.c
index 4ba676907b9ada35f12ed4df2c25e5436e8f5827..7be849eae562ce565f29ec7a80e9f61ec7673886 100644
--- a/deps/libfabric/fabtests/functional/bw.c
+++ b/deps/libfabric/fabtests/functional/bw.c
@@ -38,9 +38,14 @@ int sleep_time = 0;
 
 static ssize_t post_one_tx(struct ft_context *msg)
 {
-	if (ft_check_opts(FT_OPT_VERIFY_DATA | FT_OPT_ACTIVE))
-		ft_fill_buf(msg->buf + ft_tx_prefix_size(),
-			    opts.transfer_size);
+	ssize_t ret;
+
+	if (ft_check_opts(FT_OPT_VERIFY_DATA | FT_OPT_ACTIVE)) {
+		ret = ft_fill_buf(msg->buf + ft_tx_prefix_size(),
+				  opts.transfer_size);
+		if (ret)
+			return ret;
+	}
 
 	return ft_post_tx_buf(ep, remote_fi_addr, opts.transfer_size,
 			      NO_CQ_DATA, &msg->context, msg->buf,
@@ -199,7 +204,7 @@ int main(int argc, char **argv)
 
 	hints->ep_attr->type = FI_EP_RDM;
 
-	while ((op = getopt(argc, argv, "W:vT:h" CS_OPTS ADDR_OPTS INFO_OPTS)) != -1) {
+	while ((op = getopt(argc, argv, "UW:vT:h" CS_OPTS ADDR_OPTS INFO_OPTS)) != -1) {
 		switch (op) {
 		default:
 			ft_parse_addr_opts(op, optarg, &opts);
@@ -209,6 +214,9 @@ int main(int argc, char **argv)
 		case 'W':
 			opts.window_size = atoi(optarg);
 			break;
+		case 'U':
+			hints->tx_attr->op_flags |= FI_DELIVERY_COMPLETE;
+			break;
 		case 'v':
 			opts.options |= FT_OPT_VERIFY_DATA;
 			break;
diff --git a/deps/libfabric/fabtests/functional/cm_data.c b/deps/libfabric/fabtests/functional/cm_data.c
index ba92370b4324a22baf6319eec84697c290c5e112..68b91a578a6f78bd8d0566b48663233a23fea9c6 100644
--- a/deps/libfabric/fabtests/functional/cm_data.c
+++ b/deps/libfabric/fabtests/functional/cm_data.c
@@ -150,7 +150,10 @@ static int server_reject(size_t paramlen)
 		return ret;
 
 	/* Data will appear in error event generated on remote end. */
-	ft_fill_buf(cm_data, paramlen);
+	ret = ft_fill_buf(cm_data, paramlen);
+	if (ret)
+		return ret;
+
 	ret = fi_reject(pep, fi->handle, cm_data, paramlen);
 	if (ret)
 		FT_PRINTERR("fi_reject", ret);
@@ -187,7 +190,9 @@ static int server_accept(size_t paramlen)
 		goto err;
 	}
 	/* Data will appear on accept event on remote end. */
-	ft_fill_buf(cm_data, paramlen);
+	ret = ft_fill_buf(cm_data, paramlen);
+	if (ret)
+		return ret;
 
 	/* Accept the incoming connection. Also transitions endpoint to active
 	 * state.
@@ -254,7 +259,11 @@ static int server(size_t paramlen)
 
 static int client_connect(size_t paramlen)
 {
-	ft_fill_buf(cm_data, paramlen);
+	int ret;
+
+	ret = ft_fill_buf(cm_data, paramlen);
+	if (ret)
+		return ret;
 
 	/* Connect to server */
 	return fi_connect(ep, fi->dest_addr, cm_data, paramlen);
@@ -448,6 +457,7 @@ err1:
 	ft_sock_shutdown(sock);
 err2:
 	free(entry);
+	free(cm_data);
 	return ret;
 }
 
@@ -456,7 +466,7 @@ int main(int argc, char **argv)
 	int op, ret;
 
 	opts = INIT_OPTS;
-	opts.options |= FT_OPT_SIZE | FT_OPT_SKIP_REG_MR;
+	opts.options |= FT_OPT_SIZE | FT_OPT_SKIP_REG_MR | FT_OPT_SKIP_MSG_ALLOC;
 
 	hints = fi_allocinfo();
 	if (!hints)
diff --git a/deps/libfabric/fabtests/functional/inj_complete.c b/deps/libfabric/fabtests/functional/inj_complete.c
index 0980508ef3b97bc49962e5b8c974cec99af7958c..ce27ec4ca8caa3ff06ca2c2b2b0ed16afaf63f2c 100644
--- a/deps/libfabric/fabtests/functional/inj_complete.c
+++ b/deps/libfabric/fabtests/functional/inj_complete.c
@@ -44,8 +44,11 @@ static int send_msg(int sendmsg, size_t size)
 	int ret;
 	ft_tag = 0xabcd;
 
-	if (ft_check_opts(FT_OPT_VERIFY_DATA))
-		ft_fill_buf(tx_buf, size);
+	if (ft_check_opts(FT_OPT_VERIFY_DATA)) {
+		ret = ft_fill_buf(tx_buf, size);
+		if (ret)
+			return ret;
+	}
 
 	if (sendmsg) {
 		ret = ft_sendmsg(ep, remote_fi_addr, size,
diff --git a/deps/libfabric/fabtests/functional/loopback.c b/deps/libfabric/fabtests/functional/loopback.c
new file mode 100644
index 0000000000000000000000000000000000000000..e15e4e3750e3254aa390d9d886fb1a88b6ad6678
--- /dev/null
+++ b/deps/libfabric/fabtests/functional/loopback.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2021 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under the BSD license
+ * below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <unistd.h>
+
+#include <shared.h>
+
+
+static int run(void)
+{
+	int ret;
+
+	ret = ft_getinfo(hints, &fi);
+	if (ret)
+		return ret;
+
+	ret = ft_open_fabric_res();
+	if (ret)
+		return ret;
+
+	ret = ft_alloc_active_res(fi);
+	if (ret)
+		return ret;
+
+	ret = ft_enable_ep_recv();
+	if (ret)
+		return ret;
+
+	opts.dst_addr = fi->src_addr;
+	fi->dest_addr = fi->src_addr;
+	fi->dest_addrlen = fi->src_addrlen;
+
+	ret = ft_init_av();
+	if (ret)
+		goto out;
+
+	ret = ft_send_greeting(ep);
+	if (ret)
+		goto out;
+
+	ret = ft_recv_greeting(ep);
+	if (ret)
+		goto out;
+
+out:
+	fi->dest_addr = NULL;
+	fi->dest_addrlen = 0;
+	return ret;
+}
+
+int main(int argc, char **argv)
+{
+	int op, ret;
+
+	opts = INIT_OPTS;
+
+	hints = fi_allocinfo();
+	if (!hints)
+		return EXIT_FAILURE;
+
+	hints->caps |= FI_LOCAL_COMM;
+	hints->ep_attr->type = FI_EP_RDM;
+
+	while ((op = getopt(argc, argv, "h" INFO_OPTS)) != -1) {
+		switch (op) {
+		default:
+			ft_parseinfo(op, optarg, hints, &opts);
+			break;
+		case '?':
+		case 'h':
+			ft_usage(argv[0], "A loopback communication test.");
+			return EXIT_FAILURE;
+		}
+	}
+
+	hints->caps = FI_MSG;
+	hints->mode = FI_CONTEXT;
+	hints->domain_attr->mr_mode = opts.mr_mode;
+
+	ret = run();
+
+	ft_free_res();
+	return ft_exit_code(ret);
+}
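
loopback.c points the endpoint at itself by aliasing fi->dest_addr to fi->src_addr before ft_init_av(), then clears the alias on exit so fi_freeinfo() does not free the same pointer twice. The borrow-and-restore idiom in isolation (helper names are hypothetical):

	#include <rdma/fabric.h>

	/* Sketch of the idiom used in run() above: dest_addr briefly
	 * borrows src_addr and must be cleared before fi_freeinfo(). */
	static void borrow_self_addr(struct fi_info *info)
	{
		info->dest_addr = info->src_addr;
		info->dest_addrlen = info->src_addrlen;
	}

	static void restore_self_addr(struct fi_info *info)
	{
		info->dest_addr = NULL;
		info->dest_addrlen = 0;
	}
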
diff --git a/deps/libfabric/fabtests/functional/msg_sockets.c b/deps/libfabric/fabtests/functional/msg_sockets.c
index a4a7da8a072997395cb228a74fa5a26bc7ff8583..95551c0ebe8bcfa53f688b8c33ca822e8a0ac34f 100644
--- a/deps/libfabric/fabtests/functional/msg_sockets.c
+++ b/deps/libfabric/fabtests/functional/msg_sockets.c
@@ -54,6 +54,9 @@ union sockaddr_any {
 static union sockaddr_any bound_addr;
 static size_t bound_addr_len = sizeof bound_addr;
 
+/* string format is [%s]:%s */
+#define MAXADDRSTR	((BUFSIZ * 2) + 4)
+
 
 /* Wrapper for memcmp for sockaddr.  Note that the sockaddr structure may
  * contain holes, so sockaddr's are expected to have been initialized to all
@@ -113,7 +116,7 @@ sockaddrstr(const union sockaddr_any *addr, socklen_t len, char *buf, size_t buf
 
 static int check_address(struct fid *fid, const char *message)
 {
-	char buf1[BUFSIZ], buf2[BUFSIZ];
+	char buf1[MAXADDRSTR], buf2[MAXADDRSTR];
 	union sockaddr_any tmp;
 	size_t tmplen;
 	const char *ep_addr, *addr_expected;
@@ -127,13 +130,14 @@ static int check_address(struct fid *fid, const char *message)
 	}
 
 	if (sockaddrcmp(&tmp, tmplen, &bound_addr, bound_addr_len)) {
-		ep_addr = sockaddrstr(&tmp, tmplen, buf1, BUFSIZ);
+		ep_addr = sockaddrstr(&tmp, tmplen, buf1, sizeof buf1);
 		if (!ep_addr) {
 			FT_ERR("Unable to get ep_addr as string!");
 			return -FI_EINVAL;
 		}
 
-		addr_expected = sockaddrstr(&bound_addr, bound_addr_len, buf2, BUFSIZ);
+		addr_expected = sockaddrstr(&bound_addr, bound_addr_len, buf2,
+					    sizeof buf2);
 		if (!addr_expected) {
 			FT_ERR("Unable to get addr_expected as string!");
 			return -FI_EINVAL;
@@ -302,7 +306,7 @@ static int client_connect(void)
 
 static int setup_handle(void)
 {
-	static char buf[BUFSIZ];
+	static char buf[MAXADDRSTR];
 	struct addrinfo *ai, aihints;
 	const char *bound_addr_str;
 	char *saved_addr;
@@ -398,7 +402,8 @@ static int setup_handle(void)
 		break;
 	}
 
-	bound_addr_str = sockaddrstr(&bound_addr, bound_addr_len, buf, BUFSIZ);
+	bound_addr_str = sockaddrstr(&bound_addr, bound_addr_len, buf,
+				     sizeof buf);
 	if (!bound_addr_str) {
 		FT_ERR("Unable to get bound_addr as string!");
 		ret = -FI_EINVAL;
diff --git a/deps/libfabric/fabtests/functional/multi_ep.c b/deps/libfabric/fabtests/functional/multi_ep.c
index 3e122cadbfe952d5dadb26053972f742965c88aa..fd7fad767bcc41f32a2b0850722d304bcac04703 100644
--- a/deps/libfabric/fabtests/functional/multi_ep.c
+++ b/deps/libfabric/fabtests/functional/multi_ep.c
@@ -111,8 +111,11 @@ static int do_transfers(void)
 	}
 
 	for (i = 0; i < num_eps; i++) {
-		if (ft_check_opts(FT_OPT_VERIFY_DATA))
-			ft_fill_buf(send_bufs[i], opts.transfer_size);
+		if (ft_check_opts(FT_OPT_VERIFY_DATA)) {
+			ret = ft_fill_buf(send_bufs[i], opts.transfer_size);
+			if (ret)
+				return ret;
+		}
 
 		tx_buf = send_bufs[i];
 		ret = ft_post_tx(eps[i], remote_addr[i], opts.transfer_size, NO_CQ_DATA, &send_ctx[i]);
@@ -153,7 +156,7 @@ static int setup_client_ep(struct fid_ep **ep)
 		return ret;
 	}
 
-	ret = ft_enable_ep(*ep, eq, av, txcq, rxcq, txcntr, rxcntr);
+	ret = ft_enable_ep(*ep);
 	if (ret)
 		return ret;
 
@@ -178,7 +181,7 @@ static int setup_server_ep(struct fid_ep **ep)
 		goto failed_accept;
 	}
 
-	ret = ft_enable_ep(*ep, eq, av, txcq, rxcq, txcntr, rxcntr);
+	ret = ft_enable_ep(*ep);
 	if (ret)
 		goto failed_accept;
 
@@ -212,7 +215,7 @@ static int setup_av_ep(struct fid_ep **ep, fi_addr_t *remote_addr)
 		return ret;
 	}
 
-	ret = ft_enable_ep(*ep, eq, av, txcq, rxcq, txcntr, rxcntr);
+	ret = ft_enable_ep(*ep);
 	if (ret)
 		return ret;
 
diff --git a/deps/libfabric/fabtests/functional/multi_mr.c b/deps/libfabric/fabtests/functional/multi_mr.c
index 6a814fcaeb35c93d19f0e942d59366ad041c0e28..500e25f3efcb68bcfa3b1e43881b9c29b83d64a7 100644
--- a/deps/libfabric/fabtests/functional/multi_mr.c
+++ b/deps/libfabric/fabtests/functional/multi_mr.c
@@ -184,8 +184,10 @@ static int mr_key_test()
 		tx_buf = (char *)mr_res_array[i].buf;
 
 		if (opts.dst_addr) {
-			ft_fill_buf(mr_res_array[i].buf,
-					opts.transfer_size);
+			ret = ft_fill_buf(mr_res_array[i].buf,
+					  opts.transfer_size);
+			if (ret)
+				return ret;
 
 			if (verbose)
 				printf("write to host's key %lx\n",
@@ -231,8 +233,10 @@ static int mr_key_test()
 					return ret;
 			}
 
-			ft_fill_buf(mr_res_array[i].buf,
-					opts.transfer_size);
+			ret = ft_fill_buf(mr_res_array[i].buf,
+					  opts.transfer_size);
+			if (ret)
+				return ret;
 
 			if (verbose)
 				printf("write to client's key %lx\n",
diff --git a/deps/libfabric/fabtests/functional/multi_recv.c b/deps/libfabric/fabtests/functional/multi_recv.c
index 93d11fe8fc5722d94ba920c56a21930902ea02aa..c01a575196b3e46e73747a46f980fd8b666933ba 100644
--- a/deps/libfabric/fabtests/functional/multi_recv.c
+++ b/deps/libfabric/fabtests/functional/multi_recv.c
@@ -210,6 +210,11 @@ static int alloc_ep_res(struct fi_info *fi)
 		return ret;
 	}
 
+	/* We only use the common code to send messages, so
+	 * set mr_desc to the tx buffer's region.
+	 */
+	mr_desc = fi_mr_desc(mr);
+
 	//Each multi recv buffer will be able to hold at least 2 and
 	//up to 64 messages, allowing proper testing of multi recv
 	//completions and reposting
diff --git a/deps/libfabric/fabtests/functional/rdm.c b/deps/libfabric/fabtests/functional/rdm.c
index 8456a0231157a4142c0748cd5119dafa5939da25..6fe12c860e5553ea031b93f97248e81002f7e64d 100644
--- a/deps/libfabric/fabtests/functional/rdm.c
+++ b/deps/libfabric/fabtests/functional/rdm.c
@@ -69,12 +69,15 @@ int main(int argc, char **argv)
 	if (!hints)
 		return EXIT_FAILURE;
 
-	while ((op = getopt(argc, argv, "h" ADDR_OPTS INFO_OPTS)) != -1) {
+	while ((op = getopt(argc, argv, "Uh" ADDR_OPTS INFO_OPTS)) != -1) {
 		switch (op) {
 		default:
 			ft_parse_addr_opts(op, optarg, &opts);
 			ft_parseinfo(op, optarg, hints, &opts);
 			break;
+		case 'U':
+			hints->tx_attr->op_flags |= FI_DELIVERY_COMPLETE;
+			break;
 		case '?':
 		case 'h':
 			ft_usage(argv[0], "A simple RDM client-sever example.");
diff --git a/deps/libfabric/fabtests/functional/rdm_atomic.c b/deps/libfabric/fabtests/functional/rdm_atomic.c
index 86e46d20af49f8b56af133e5241c1f668499b7f6..1ab88fb2ecc19bc0b66a3d0904c1e9f4c7051949 100644
--- a/deps/libfabric/fabtests/functional/rdm_atomic.c
+++ b/deps/libfabric/fabtests/functional/rdm_atomic.c
@@ -112,6 +112,10 @@ static enum fi_datatype get_fi_datatype(char *op)
 		return FI_INT64;
 	else if (!strcmp(op, "uint64"))
 		return FI_UINT64;
+	else if (!strcmp(op, "int128"))
+		return FI_INT128;
+	else if (!strcmp(op, "uint128"))
+		return FI_UINT128;
 	else if (!strcmp(op, "float"))
 		return FI_FLOAT;
 	else if (!strcmp(op, "double"))
@@ -126,7 +130,7 @@ static enum fi_datatype get_fi_datatype(char *op)
 		return FI_LONG_DOUBLE_COMPLEX;
 	else {
 		fprintf(stderr, "Not a valid atomic operation\n");
-		return FI_DATATYPE_LAST;
+		return OFI_DATATYPE_CNT;
 	}
 }
 
@@ -140,8 +144,8 @@ static void print_opts_usage(char *name)
 	FT_PRINT_OPTS_USAGE("", "cswap_ge|cswap_gt|mswap (default: all)");
 	/* Atomic datatype */
 	FT_PRINT_OPTS_USAGE("-z <datatype>", "atomic datatype: int8|uint8|int16|uint16|");
-	FT_PRINT_OPTS_USAGE("", "int32|uint32|int64|uint64|float|double|"
-				"float_complex|double_complex|");
+	FT_PRINT_OPTS_USAGE("", "int32|uint32|int64|uint64|int128|uint128|"
+			    "float|double|float_complex|double_complex|");
 	FT_PRINT_OPTS_USAGE("", "long_double|long_double_complex (default: all)");
 }
 
@@ -177,7 +181,7 @@ static inline int handle_atomic_ ## type ## _op(int run_all_datatypes,		\
 	int ret = FI_SUCCESS;							\
 										\
 	if (run_all_datatypes) {						\
-		for (datatype = 0; datatype < FI_DATATYPE_LAST; datatype++) {	\
+		for (datatype = 0; datatype < OFI_DATATYPE_CNT; datatype++) {	\
 			ret = check_ ## type ## _atomic_op(ep, op_type,		\
 							   datatype, count);	\
 			if (ret == -FI_ENOSYS || ret == -FI_EOPNOTSUPP) {	\
@@ -435,6 +439,10 @@ static int init_fabric(void)
 {
 	int ret;
 
+	ret = ft_init();
+	if (ret)
+		return ret;
+
 	ret  = ft_init_oob();
 	if (ret)
 		return ret;
@@ -495,7 +503,7 @@ int main(int argc, char **argv)
 	if (!hints)
 		return EXIT_FAILURE;
 
-	while ((op = getopt(argc, argv, "ho:z:" CS_OPTS INFO_OPTS)) != -1) {
+	while ((op = getopt(argc, argv, "ho:Uz:" CS_OPTS INFO_OPTS)) != -1) {
 		switch (op) {
 		case 'o':
 			if (!strncasecmp("all", optarg, 3)) {
@@ -509,13 +517,16 @@ int main(int argc, char **argv)
 				}
 			}
 			break;
+		case 'U':
+			hints->tx_attr->op_flags |= FI_DELIVERY_COMPLETE;
+			break;
 		case 'z':
 			if (!strncasecmp("all", optarg, 3)) {
 				run_all_datatypes = 1;
 			} else {
 				run_all_datatypes = 0;
 				datatype = get_fi_datatype(optarg);
-				if (datatype == FI_DATATYPE_LAST) {
+				if (datatype == OFI_DATATYPE_CNT) {
 					print_opts_usage(argv[0]);
 					return EXIT_FAILURE;
 				}
diff --git a/deps/libfabric/fabtests/functional/rdm_multi_client.c b/deps/libfabric/fabtests/functional/rdm_multi_client.c
new file mode 100644
index 0000000000000000000000000000000000000000..a19b79458d737465c91d888e2c6a432c17f25009
--- /dev/null
+++ b/deps/libfabric/fabtests/functional/rdm_multi_client.c
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2021, Amazon.com, Inc.  All rights reserved.
+ *
+ * This software is available to you under the BSD license
+ * below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * This program tests the functionality of an RDM endpoint when a
+ * persistent server does ping-pong with multiple clients that come
+ * and go in sequence. Each client connects to the server, runs the
+ * ping-pong exchange, disconnects from the server by cleaning up all
+ * fabric resources, and repeats.
+ * If the `-R` option is specified, the first client's address is
+ * reused for subsequent clients by setting the src_addr of endpoints
+ * 2..n to the output of fi_getname() on the first client.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+
+#include <shared.h>
+#include <rdma/fi_cm.h>
+
+static int run_pingpong(void)
+{
+	int ret, i;
+
+	fprintf(stdout, "Start ping-pong.\n");
+	for (i = 0; i < opts.iterations; i++) {
+		if (opts.dst_addr) {
+			ret = ft_tx(ep, remote_fi_addr, opts.transfer_size, &tx_ctx);
+			if (ret) {
+				FT_PRINTERR("ft_tx", -ret);
+				return ret;
+			}
+			ret = ft_rx(ep, opts.transfer_size);
+			if (ret) {
+				FT_PRINTERR("ft_rx", -ret);
+				return ret;
+			}
+		} else {
+			ret = ft_rx(ep, opts.transfer_size);
+			if (ret) {
+				FT_PRINTERR("ft_rx", -ret);
+				return ret;
+			}
+			ret = ft_tx(ep, remote_fi_addr, opts.transfer_size, &tx_ctx);
+			if (ret) {
+				FT_PRINTERR("ft_tx", -ret);
+				return ret;
+			}
+		}
+	}
+
+	fprintf(stdout, "Ping-pong succeeds.\n");
+	return 0;
+}
+
+static int run_server(void)
+{
+	int nconn, ret;
+
+	ret = ft_init_fabric();
+	if (ret) {
+		FT_PRINTERR("ft_init_fabric", -ret);
+		return ret;
+	}
+
+	nconn = opts.num_connections;
+
+	while (nconn) {
+		ret = run_pingpong();
+		if (ret) {
+			FT_PRINTERR("run_pingpong", -ret);
+			return ret;
+		}
+		if (--nconn) {
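+			/* Exchange addresses with the next client before
+			 * the next ping-pong round. */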
+			ret = ft_init_av();
+			if (ret) {
+				FT_PRINTERR("ft_init_av", -ret);
+				return ret;
+			}
+		}
+	}
+	return 0;
+}
+
+static int run_client(int client_id, bool address_reuse)
+{
+	static char name[256];
+	static size_t size = sizeof(name);
+	int ret;
+
+	ret = ft_init();
+	if (ret) {
+		FT_PRINTERR("ft_init", -ret);
+		return ret;
+	}
+
+	ret = ft_init_oob();
+	if (ret) {
+		FT_PRINTERR("ft_init_oob", -ret);
+		return ret;
+	}
+
+	ret = ft_getinfo(hints, &fi);
+	if (ret) {
+		FT_PRINTERR("ft_getinfo", -ret);
+		return ret;
+	}
+
+	ret = ft_open_fabric_res();
+	if (ret) {
+		FT_PRINTERR("ft_open_fabric_res", -ret);
+		return ret;
+	}
+
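+	/* With -R, later clients bind to the first client's source address,
+	 * captured via fi_getname() on client 0 below. */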
+	if (client_id > 0 && address_reuse) {
+		memcpy(fi->src_addr, name, size);
+		fi->src_addrlen = size;
+	}
+
+	ret = ft_alloc_active_res(fi);
+	if (ret) {
+		FT_PRINTERR("ft_alloc_active_res", -ret);
+		return ret;
+	}
+
+	ret = ft_enable_ep_recv();
+	if (ret) {
+		FT_PRINTERR("ft_enable_ep_recv", -ret);
+		return ret;
+	}
+
+	ret = ft_init_av();
+	if (ret) {
+		FT_PRINTERR("ft_init_av", -ret);
+		return ret;
+	}
+
+	if (client_id == 0) {
+		ret = fi_getname(&ep->fid, name, &size);
+		if (ret) {
+			FT_PRINTERR("fi_getname", -ret);
+			return ret;
+		}
+	}
+
+	return run_pingpong();
+}
+
+static void print_opts_usage(char *name, char *desc)
+{
+	ft_usage(name, desc);
+	/* rdm_multi_client test op type */
+	FT_PRINT_OPTS_USAGE("-R", "Reuse the address of the first client for subsequent clients");
+}
+
+int main(int argc, char **argv)
+{
+	int op, ret, i;
+	struct fi_info *save;
+	bool address_reuse = false;
+
+	opts = INIT_OPTS;
+	opts.options |= FT_OPT_SIZE;
+
+	hints = fi_allocinfo();
+	if (!hints)
+		return EXIT_FAILURE;
+
+	while ((op = getopt(argc, argv, "URh" ADDR_OPTS INFO_OPTS CS_OPTS)) != -1) {
+		switch (op) {
+		default:
+			ft_parse_addr_opts(op, optarg, &opts);
+			ft_parseinfo(op, optarg, hints, &opts);
+			ft_parsecsopts(op, optarg, &opts);
+			break;
+		case 'U':
+			hints->tx_attr->op_flags |= FI_DELIVERY_COMPLETE;
+			break;
+		case 'R':
+			address_reuse = true;
+			break;
+		case '?':
+		case 'h':
+			print_opts_usage(argv[0], "RDM multi-client test");
+			return EXIT_FAILURE;
+		}
+	}
+
+	if (optind < argc)
+		opts.dst_addr = argv[optind];
+
+	hints->ep_attr->type = FI_EP_RDM;
+	hints->caps = FI_MSG;
+	hints->mode = FI_CONTEXT;
+	hints->domain_attr->mr_mode = opts.mr_mode;
+
+	if (opts.dst_addr) {
+		for (i = 0; i < opts.num_connections; i++) {
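+			/* ft_free_res() below frees hints, so duplicate it
+			 * for the next client iteration. */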
+			save = fi_dupinfo(hints);
+			printf("Starting client: %d\n", i);
+			ret = run_client(i, address_reuse);
+			if (ret) {
+				FT_PRINTERR("run_client", -ret);
+				goto out;
+			}
+			ft_free_res();
+			hints = save;
+		}
+	} else {
+		ret = run_server();
+		if (ret)
+			FT_PRINTERR("run_server", -ret);
+	}
+out:
+	ft_free_res();
+	return ft_exit_code(ret);
+}
diff --git a/deps/libfabric/fabtests/functional/rdm_rma_simple.c b/deps/libfabric/fabtests/functional/rdm_rma_event.c
similarity index 100%
rename from deps/libfabric/fabtests/functional/rdm_rma_simple.c
rename to deps/libfabric/fabtests/functional/rdm_rma_event.c
diff --git a/deps/libfabric/fabtests/functional/rdm_rma_trigger.c b/deps/libfabric/fabtests/functional/rdm_rma_trigger.c
index a09fba22d8d687c5ac4825339be1791177f6cbe5..4b994eeebe92ebd79bfd4f576d956bc4e1730567 100644
--- a/deps/libfabric/fabtests/functional/rdm_rma_trigger.c
+++ b/deps/libfabric/fabtests/functional/rdm_rma_trigger.c
@@ -62,6 +62,7 @@ static int rma_write_trigger(void *src, size_t size,
 static int run_test(void)
 {
 	int ret = 0;
+	uint64_t start_tx, start_rx;
 
 	ret = ft_init_fabric();
 	if (ret)
@@ -75,12 +76,14 @@ static int run_test(void)
 	if (ret)
 		return ret;
 
+	start_tx = fi_cntr_read(txcntr);
+	start_rx = fi_cntr_read(rxcntr);
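+	/* These baselines make the counter waits below relative, so they do
+	 * not depend on transfers made during init_av/key exchange. */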
 	if (opts.dst_addr) {
 		sprintf(tx_buf, "%s%s", welcome_text1, welcome_text2);
 
 		fprintf(stdout, "Triggered RMA write to server\n");
 		ret = rma_write_trigger((char *) tx_buf + strlen(welcome_text1),
-					strlen(welcome_text2), txcntr, 3);
+					strlen(welcome_text2), txcntr, start_tx + 1);
 		if (ret)
 			goto out;
 
@@ -92,9 +95,9 @@ static int run_test(void)
  			FT_PRINTERR("fi_write", ret);
  			goto out;
 		}
-		/* The value of the counter is 4 including a transfer during
-		 * init_av and ft_exchange_keys() */
-		ret = fi_cntr_wait(txcntr, 4, -1);
+		/* The value of the tx counter should have increased by 2
+		 * for both operations (write and triggered) */
+		ret = fi_cntr_wait(txcntr, start_tx + 2, -1);
 		if (ret < 0) {
 			FT_PRINTERR("fi_cntr_wait", ret);
 			goto out;
@@ -102,9 +105,9 @@ static int run_test(void)
 
 		fprintf(stdout, "Received completion events for RMA write operations\n");
 	} else {
-		/* The value of the counter is 4 including a transfer during
-		 * init_av and ft_exchange_keys() */
-		ret = fi_cntr_wait(rxcntr, 4, -1);
+		/* The value of the rx counter should have increased by 2
+		 * for both operations (write and triggered) */
+		ret = fi_cntr_wait(rxcntr, start_rx + 2, -1);
 		if (ret < 0) {
 			FT_PRINTERR("fi_cntr_wait", ret);
 			goto out;
diff --git a/deps/libfabric/fabtests/functional/recv_cancel.c b/deps/libfabric/fabtests/functional/recv_cancel.c
index 35bcd500b4fa80b63ce179357efedf811f2dc3a5..eb7c5c82dd6dcd022893016d5bbd127f67d1974f 100644
--- a/deps/libfabric/fabtests/functional/recv_cancel.c
+++ b/deps/libfabric/fabtests/functional/recv_cancel.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2017 Intel Corporation. All rights reserved.
+ * Copyright (c) 2013-2020 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -47,7 +47,8 @@ static int recv_cancel_client(void)
 		return ret;
 
 	ft_tag = CANCEL_TAG;
-	ret = ft_post_tx(ep, remote_fi_addr, opts.transfer_size, NO_CQ_DATA, &tx_ctx);
+	ret = ft_post_tx(ep, remote_fi_addr, opts.transfer_size, NO_CQ_DATA,
+			 &tx_ctx);
 	if (ret)
 		return ret;
 
@@ -55,7 +56,8 @@ static int recv_cancel_client(void)
 		fprintf(stdout, "CANCEL msg posted to server\n");
 
 	ft_tag = STANDARD_TAG;
-	ret = ft_post_tx(ep, remote_fi_addr, opts.transfer_size, NO_CQ_DATA, &tx_ctx);
+	ret = ft_post_tx(ep, remote_fi_addr, opts.transfer_size, NO_CQ_DATA,
+			 &tx_ctx);
 	if (ret)
 		return ret;
 
@@ -110,16 +112,18 @@ static int recv_cancel_host(void)
 		usleep(1000);
 	} while ((ret == -FI_EAGAIN) && (retries < 5000));
 	if (retries >= 5000) {
-		FT_PRINTERR("ERROR: failed to detect error CQ entry in cq_read", -FI_EOTHER);
+		FT_PRINTERR("ERROR: no error CQ entry in cq_read deteceted",
+			    -FI_EOTHER);
 		return -FI_EOTHER;
 	} else {
 		if (opts.verbose)
-			fprintf(stdout, "GOOD: detected error cq entry in cq_read\n");
+			fprintf(stdout, "GOOD: detected error cq entry\n");
 	}
 
 	/* Verify the error CQ has been populated */
 	if (fi_cq_readerr(rxcq, &cancel_error_entry, 0) != 1) {
-		FT_PRINTERR("ERROR: No cancel CQ error entry was populated", -FI_EOTHER);
+		FT_PRINTERR("ERROR: No cancel CQ error entry was populated",
+			    -FI_EOTHER);
 		return -FI_EOTHER;
 	}
 
@@ -129,7 +133,8 @@ static int recv_cancel_host(void)
 	}
 
 	if (!(cancel_error_entry.flags & FI_RECV)) {
-		FT_PRINTERR("ERROR: cancelled completion flags is incorrect", -FI_EOTHER);
+		FT_PRINTERR("ERROR: cancelled completion flags are incorrect",
+			    -FI_EOTHER);
 		return -FI_EOTHER;
 	}
 
@@ -138,19 +143,21 @@ static int recv_cancel_host(void)
 
 	/* Verify only one CQ err entry can be read */
 	if (fi_cq_readerr(rxcq, &cancel_error_entry, 0) != -FI_EAGAIN) {
-		FT_PRINTERR("ERROR: Another CQ error entry was populated", -FI_EOTHER);
+		FT_PRINTERR("ERROR: Another CQ error entry was populated",
+			    -FI_EOTHER);
 		return -FI_EOTHER;
 	}
 
 	if (opts.verbose)
-		fprintf(stdout, "GOOD: no additional error entries have been detected\n");
+		fprintf(stdout, "GOOD: no extra error entries detected\n");
 
 	/* Check for second recv completion*/
 	do {
 		ret = fi_cq_read(rxcq, &recv_completion, 1);
 		if (ret > 0) {
 			if (recv_completion.op_context != &standard_recv_ctx) {
-				FT_PRINTERR("ERROR: op_context does not match recv ctx", -FI_EOTHER);
+				FT_PRINTERR("ERROR: op_context does not match",
+					    -FI_EOTHER);
 				return -FI_EOTHER;
 			}
 		} else if ((ret <= 0) && (ret != -FI_EAGAIN)) {
diff --git a/deps/libfabric/fabtests/functional/shared_ctx.c b/deps/libfabric/fabtests/functional/shared_ctx.c
index 7aa04e791f7079b9af77d2c9206e9f6913db30ee..185079e8fc191c1ba3ebf88d8c88af442616d138 100644
--- a/deps/libfabric/fabtests/functional/shared_ctx.c
+++ b/deps/libfabric/fabtests/functional/shared_ctx.c
@@ -58,11 +58,8 @@ struct ep_info {
 };
 
 static struct fi_info *fi_dup;
-static int tx_shared_ctx = 1;
-static int rx_shared_ctx = 1;
 static int ep_cnt = 4;
-static struct fid_ep **ep_array, *srx_ctx;
-static struct fid_stx *stx_ctx;
+static struct fid_ep **ep_array;
 static size_t addrlen = 0;
 static fi_addr_t *addr_array;
 
@@ -102,7 +99,7 @@ static int get_dupinfo(void)
 	return ret;
 }
 
-static int alloc_ep(void)
+static int alloc_eps(void)
 {
 	int i, ret;
 
@@ -130,62 +127,11 @@ static int alloc_ep(void)
 	return 0;
 }
 
-static int alloc_ep_res(struct fi_info *fi)
-{
-	int ret;
-
-	ret = ft_alloc_ep_res(fi);
-	if (ret)
-		return ret;
-
-	if (tx_shared_ctx) {
-		ret = fi_stx_context(domain, fi->tx_attr, &stx_ctx, NULL);
-		if (ret) {
-			FT_PRINTERR("fi_stx_context", ret);
-			return ret;
-		}
-	}
-
-	if (rx_shared_ctx) {
-		ret = fi_srx_context(domain, fi->rx_attr, &srx_ctx, NULL);
-		if (ret) {
-			FT_PRINTERR("fi_srx_context", ret);
-			return ret;
-		}
-	}
-	return 0;
-}
-
-static int bind_ep_res(struct fid_ep *ep)
-{
-	int ret;
-
-	if (hints->ep_attr->type == FI_EP_MSG)
-		FT_EP_BIND(ep, eq, 0);
-
-	if (tx_shared_ctx)
-		FT_EP_BIND(ep, stx_ctx, 0);
-
-	if (rx_shared_ctx)
-		FT_EP_BIND(ep, srx_ctx, 0);
-
-	FT_EP_BIND(ep, txcq, FI_SEND);
-	FT_EP_BIND(ep, rxcq, FI_RECV);
-	FT_EP_BIND(ep, av, 0);
-
-	ret = fi_enable(ep);
-	if (ret) {
-		FT_PRINTERR("fi_enable", ret);
-		return ret;
-	}
-	return 0;
-}
-
-static int bind_ep_array_res(void)
+static int enable_eps(void)
 {
 	int i, ret;
 	for (i = 0; i < ep_cnt; i++) {
-		ret = bind_ep_res(ep_array[i]);
+		ret = ft_enable_ep(ep_array[i]);
 		if (ret)
 			return ret;
 	}
@@ -198,9 +144,9 @@ static int run_test()
 
 	/* Post recvs */
 	for (i = 0; i < ep_cnt; i++) {
-		if (rx_shared_ctx) {
+		if (srx) {
 			fprintf(stdout, "Posting recv #%d for shared rx ctx\n", i);
-			ret = ft_post_rx(srx_ctx, rx_size, &rx_ctx_arr[i].context);
+			ret = ft_post_rx(srx, rx_size, &rx_ctx_arr[i].context);
 		 } else {
 			fprintf(stdout, "Posting recv for endpoint #%d\n", i);
 			ret = ft_post_rx(ep_array[i], rx_size, &rx_ctx_arr[i].context);
@@ -212,7 +158,7 @@ static int run_test()
 	if (opts.dst_addr) {
 		/* Post sends addressed to remote EPs */
 		for (i = 0; i < ep_cnt; i++) {
-			if (tx_shared_ctx)
+			if (stx)
 				fprintf(stdout, "Posting send #%d to shared tx ctx\n", i);
 			else
 				fprintf(stdout, "Posting send to endpoint #%d\n", i);
@@ -230,7 +176,7 @@ static int run_test()
 	if (!opts.dst_addr) {
 		/* Post sends addressed to remote EPs */
 		for (i = 0; i < ep_cnt; i++) {
-			if (tx_shared_ctx)
+			if (stx)
 				fprintf(stdout, "Posting send #%d to shared tx ctx\n", i);
 			else
 				fprintf(stdout, "Posting send to endpoint #%d\n", i);
@@ -268,8 +214,8 @@ static int init_av(void)
 			if (ret)
 				return ret;
 
-			if (rx_shared_ctx)
-				ret = ft_rx(srx_ctx, rx_size);
+			if (srx)
+				ret = ft_rx(srx, rx_size);
 			else
 				ret = ft_rx(ep_array[0], rx_size);
 			if (ret)
@@ -283,8 +229,8 @@ static int init_av(void)
 					return ret;
 			}
 		} else {
-			if (rx_shared_ctx)
-				ret = ft_rx(srx_ctx, rx_size);
+			if (srx)
+				ret = ft_rx(srx, rx_size);
 			else
 				ret = ft_rx(ep_array[0], rx_size);
 			if (ret)
@@ -306,8 +252,8 @@ static int init_av(void)
 	if (opts.dst_addr) {
 		ret = ft_tx(ep_array[0], addr_array[0], 1, &tx_ctx);
 	} else {
-		if (rx_shared_ctx)
-			ret = ft_rx(srx_ctx, rx_size);
+		if (srx)
+			ret = ft_rx(srx, rx_size);
 		else
 			ret = ft_rx(ep_array[0], rx_size);
 	}
@@ -333,21 +279,21 @@ static int init_fabric(void)
 
 	av_attr.count = ep_cnt;
 
-	ret = alloc_ep_res(fi);
+	ret = ft_alloc_ep_res(fi);
 	if (ret)
 		return ret;
 
-	ret = alloc_ep();
+	ret = alloc_eps();
 	if (ret)
 		return ret;
 
-	ret = bind_ep_array_res();
+	ret = enable_eps();
 	if (ret)
 		return ret;
 
 	/* Post recv */
-	if (rx_shared_ctx)
-		ret = ft_post_rx(srx_ctx, MAX(rx_size, FT_MAX_CTRL_MSG), &rx_ctx);
+	if (srx)
+		ret = ft_post_rx(srx, MAX(rx_size, FT_MAX_CTRL_MSG), &rx_ctx);
 	else
 		ret = ft_post_rx(ep_array[0], MAX(rx_size, FT_MAX_CTRL_MSG), &rx_ctx);
 	if (ret)
@@ -376,15 +322,15 @@ static int client_connect(void)
 	if (ret)
 		return ret;
 
-	ret = alloc_ep_res(fi);
+	ret = ft_alloc_ep_res(fi);
 	if (ret)
 		return ret;
 
-	ret = alloc_ep();
+	ret = alloc_eps();
 	if (ret)
 		return ret;
 
-	ret = bind_ep_array_res();
+	ret = enable_eps();
 	if (ret)
 		return ret;
 
@@ -411,8 +357,8 @@ static int client_connect(void)
 	}
 
 	/* Post recv */
-	if (rx_shared_ctx)
-		ret = ft_post_rx(srx_ctx, MAX(rx_size, FT_MAX_CTRL_MSG), &rx_ctx);
+	if (srx)
+		ret = ft_post_rx(srx, MAX(rx_size, FT_MAX_CTRL_MSG), &rx_ctx);
 	else
 		ret = ft_post_rx(ep_array[0], MAX(rx_size, FT_MAX_CTRL_MSG), &rx_ctx);
 	if (ret)
@@ -457,13 +403,11 @@ static int server_connect(void)
 			ep_state_array[num_conn_reqs].state = FT_EP_CONNECT_RCVD;
 
 			if (num_conn_reqs == 0) {
-				ret = fi_domain(fabric, fi, &domain, NULL);
-				if (ret) {
-					FT_PRINTERR("fi_domain", ret);
+				ret = ft_open_domain_res();
+				if (ret)
 					goto err;
-				}
 
-				ret = alloc_ep_res(fi);
+				ret = ft_alloc_ep_res(fi);
 				if (ret)
 					goto err;
 			}
@@ -475,7 +419,7 @@ static int server_connect(void)
 			}
 
 			ep_state_array[num_conn_reqs].ep = ep_array[num_conn_reqs];
-			ret = bind_ep_res(ep_array[num_conn_reqs]);
+			ret = ft_enable_ep(ep_array[num_conn_reqs]);
 			if (ret)
 				goto err;
 
@@ -522,8 +466,8 @@ static int server_connect(void)
 	}
 
 	/* Post recv */
-	if (rx_shared_ctx)
-		ret = ft_post_rx(srx_ctx, MAX(rx_size, FT_MAX_CTRL_MSG), &rx_ctx);
+	if (srx)
+		ret = ft_post_rx(srx, MAX(rx_size, FT_MAX_CTRL_MSG), &rx_ctx);
 	else
 		ret = ft_post_rx(ep_array[0], MAX(rx_size, FT_MAX_CTRL_MSG), &rx_ctx);
 	if (ret)
@@ -588,10 +532,11 @@ int main(int argc, char **argv)
 {
 	int op, ret;
 	int option_index = 0;
+	int use_stx = 1, use_srx = 1;
 
 	struct option long_options[] = {
-		{"no-tx-shared-ctx", no_argument, &tx_shared_ctx, 0},
-		{"no-rx-shared-ctx", no_argument, &rx_shared_ctx, 0},
+		{"no-tx-shared-ctx", no_argument, &use_stx, 0},
+		{"no-rx-shared-ctx", no_argument, &use_srx, 0},
 		{"ep-count", required_argument, 0, FT_EP_CNT},
 		{0, 0, 0, 0},
 	};
@@ -603,8 +548,8 @@ int main(int argc, char **argv)
 	if (!hints)
 		return EXIT_FAILURE;
 
-	while ((op = getopt_long(argc, argv, "h" ADDR_OPTS INFO_OPTS,
-					long_options, &option_index)) != -1) {
+	while ((op = getopt_long(argc, argv, "h" ADDR_OPTS INFO_OPTS API_OPTS,
+				 long_options, &option_index)) != -1) {
 		switch (op) {
 		case FT_EP_CNT:
 			ep_cnt = atoi(optarg);
@@ -617,6 +562,7 @@ int main(int argc, char **argv)
 		default:
 			ft_parse_addr_opts(op, optarg, &opts);
 			ft_parseinfo(op, optarg, hints, &opts);
+			ft_parse_api_opts(op, optarg, hints, &opts);
 			break;
 		case '?':
 		case 'h':
@@ -635,22 +581,23 @@ int main(int argc, char **argv)
 	if (optind < argc)
 		opts.dst_addr = argv[optind];
 
-	hints->caps = FI_MSG;
+	if (!(hints->caps & FI_TAGGED))
+		hints->caps = FI_MSG;
 	hints->mode = FI_CONTEXT;
 	hints->domain_attr->mr_mode = opts.mr_mode;
 
-	if (tx_shared_ctx)
+	if (use_stx) {
+		opts.options |= FT_OPT_STX;
 		hints->ep_attr->tx_ctx_cnt = FI_SHARED_CONTEXT;
-	if (rx_shared_ctx)
+	}
+	if (use_srx) {
+		opts.options |= FT_OPT_SRX;
 		hints->ep_attr->rx_ctx_cnt = FI_SHARED_CONTEXT;
+	}
 
 	ret = run();
 
 	FT_CLOSEV_FID(ep_array, ep_cnt);
-	if (rx_shared_ctx)
-		FT_CLOSE_FID(srx_ctx);
-	if (tx_shared_ctx)
-		FT_CLOSE_FID(stx_ctx);
 	ft_free_res();
 	free(addr_array);
 	free(ep_array);
diff --git a/deps/libfabric/fabtests/functional/unexpected_msg.c b/deps/libfabric/fabtests/functional/unexpected_msg.c
index 006ba4baeb1f0c7ef60a2a7abb475803d26b0f1d..c180fc09de5cbb84ebbc6f1903f03e567ac9c361 100644
--- a/deps/libfabric/fabtests/functional/unexpected_msg.c
+++ b/deps/libfabric/fabtests/functional/unexpected_msg.c
@@ -94,7 +94,7 @@ static char *get_rx_buf(int index)
 	return rx_buf + rx_size * index;
 }
 
-static int wait_recvs()
+static int wait_recv(void)
 {
 	struct fi_cq_tagged_entry entry;
 	int ret;
@@ -132,18 +132,24 @@ static int run_test_loop(void)
 	for (i = 0; i < num_iters; i++) {
 		for (j = 0; j < concurrent_msgs; j++) {
 			op_buf = get_tx_buf(j);
-			if (ft_check_opts(FT_OPT_VERIFY_DATA))
-				ft_fill_buf(op_buf + ft_tx_prefix_size(),
-					    opts.transfer_size);
+			if (ft_check_opts(FT_OPT_VERIFY_DATA)) {
+				ret = ft_fill_buf(op_buf + ft_tx_prefix_size(),
+						  opts.transfer_size);
+				if (ret)
+					return ret;
+			}
 
 			ret = ft_post_tx_buf(ep, remote_fi_addr,
 					     opts.transfer_size,
 					     op_data, &tx_ctx_arr[j].context,
-					     op_buf, mr_desc, op_tag);
+					     op_buf, mr_desc, op_tag + j);
 			if (ret) {
 				printf("ERROR send_msg returned %d\n", ret);
 				return ret;
 			}
+
+			/* Request send progress */
+			(void) fi_cq_read(txcq, NULL, 0);
 		}
 
 		ret = ft_sync();
@@ -154,15 +160,17 @@ static int run_test_loop(void)
 			op_buf = get_rx_buf(j);
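+			/* Post receives in the reverse tag order of the
+			 * sends above, so unexpected messages are matched
+			 * out of order. */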
 			ret = ft_post_rx_buf(ep, opts.transfer_size,
 					     &rx_ctx_arr[j].context, op_buf,
-					     mr_desc, op_tag);
+					     mr_desc,
+					     op_tag + (concurrent_msgs - 1) - j);
 			if (ret) {
 				printf("ERROR recv_msg returned %d\n", ret);
 				return ret;
 			}
-		}
 
-		for (j = 0; j < concurrent_msgs; j++) {
-			ret = wait_recvs();
+			/* Progress sends */
+			(void) fi_cq_read(txcq, NULL, 0);
+
+			ret = wait_recv();
 			if (ret < 1)
 				return ret;
 		}
@@ -222,42 +230,24 @@ int main(int argc, char **argv)
 	if (!hints)
 		return EXIT_FAILURE;
 
-	while ((op = getopt(argc, argv, "m:i:c:vdSh" ADDR_OPTS INFO_OPTS)) != -1) {
+	while ((op = getopt(argc, argv, "CM:h" CS_OPTS INFO_OPTS)) != -1) {
 		switch (op) {
 		default:
+			ft_parsecsopts(op, optarg, &opts);
 			ft_parse_addr_opts(op, optarg, &opts);
 			ft_parseinfo(op, optarg, hints, &opts);
 			break;
-		case 'c':
-			concurrent_msgs = strtoul(optarg, NULL, 0);
-			break;
-		case 'i':
-			num_iters = strtoul(optarg, NULL, 0);
-			break;
-		case 'S':
-			opts.comp_method = FT_COMP_SREAD;
-			break;
-		case 'v':
-			opts.options |= FT_OPT_VERIFY_DATA;
-			break;
-		case 'm':
-			opts.transfer_size = strtoul(optarg, NULL, 0);
-			break;
-		case 'd':
+		case 'C':
 			send_data = true;
 			break;
+		case 'M':
+			concurrent_msgs = strtoul(optarg, NULL, 0);
+			break;
 		case '?':
 		case 'h':
-			ft_usage(argv[0], "Unexpected message functional test");
-			FT_PRINT_OPTS_USAGE("-c <int>",
-				"Concurrent messages per iteration ");
-			FT_PRINT_OPTS_USAGE("-v", "Enable data verification");
-			FT_PRINT_OPTS_USAGE("-i <int>", "Number of iterations");
-			FT_PRINT_OPTS_USAGE("-S",
-				"Use fi_cq_sread instead of polling fi_cq_read");
-			FT_PRINT_OPTS_USAGE("-m <size>",
-				"Size of unexpected messages");
-			FT_PRINT_OPTS_USAGE("-d", "Send remote CQ data");
+			ft_csusage(argv[0], "Unexpected message handling test.");
+			FT_PRINT_OPTS_USAGE("-C", "transfer remote CQ data");
+			FT_PRINT_OPTS_USAGE("-M <count>", "number of concurrent msgs");
 			return EXIT_FAILURE;
 		}
 	}
diff --git a/deps/libfabric/fabtests/include/freebsd/malloc.h b/deps/libfabric/fabtests/include/freebsd/malloc.h
new file mode 100644
index 0000000000000000000000000000000000000000..30abe10b2f695a2d818237a87185d2eeb159440f
--- /dev/null
+++ b/deps/libfabric/fabtests/include/freebsd/malloc.h
@@ -0,0 +1,44 @@
+/*
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _FABTESTS_FREEBSD_MALLOC_H_
+#define _FABTESTS_FREEBSD_MALLOC_H_
+
+#define M_MMAP_THRESHOLD    -3
+
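+/* mallopt() and M_MMAP_THRESHOLD are glibc extensions; this no-op stub
+ * lets the common fabtests code build unchanged on FreeBSD. */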
+int mallopt(int param, int value)
+{
+	/* Not supported. */
+	return 0;
+}
+
+#endif /* _FABTESTS_FREEBSD_MALLOC_H_ */
diff --git a/deps/libfabric/fabtests/include/ft_osd.h b/deps/libfabric/fabtests/include/ft_osd.h
index a71c1c9d3b41fbe01e15913d5ca28257ede15e85..6ea377941cd26efe1ec901f3583f0b43127987eb 100644
--- a/deps/libfabric/fabtests/include/ft_osd.h
+++ b/deps/libfabric/fabtests/include/ft_osd.h
@@ -45,4 +45,11 @@
 #include <unix/osd.h>
 #endif
 
+#define OFI_DATATYPE_CNT	(FI_UINT128 + 1)
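+/* FI_INT128/FI_UINT128 were added after FI_DATATYPE_LAST to preserve the
+ * ABI, so the datatype count is derived from FI_UINT128 instead. */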
+
+#ifdef HAVE___INT128
+typedef __int128 ofi_int128_t;
+typedef unsigned __int128 ofi_uint128_t;
+#endif
+
 #endif /* _FT_OSD_H_ */
diff --git a/deps/libfabric/fabtests/include/hmem.h b/deps/libfabric/fabtests/include/hmem.h
index c12fff7c03a246705a4f18a0dbac0589f7f72b3a..813e1b0932a563fbee7c1b16ff090e346cdf6b8f 100644
--- a/deps/libfabric/fabtests/include/hmem.h
+++ b/deps/libfabric/fabtests/include/hmem.h
@@ -42,12 +42,12 @@ int ft_ze_free(void *buf);
 int ft_ze_memset(uint64_t device, void *buf, int value, size_t size);
 int ft_ze_copy(uint64_t device, void *dst, const void *src, size_t size);
 
-static inline int ft_host_init()
+static inline int ft_host_init(void)
 {
 	return FI_SUCCESS;
 }
 
-static inline int ft_host_cleanup()
+static inline int ft_host_cleanup(void)
 {
 	return FI_SUCCESS;
 }
diff --git a/deps/libfabric/fabtests/include/jsmn.h b/deps/libfabric/fabtests/include/jsmn.h
index 48a07c1d2000da501eb089112182b42485bc95a4..b95368a2061ed52e817f8dbd885d3a1f477efccc 100644
--- a/deps/libfabric/fabtests/include/jsmn.h
+++ b/deps/libfabric/fabtests/include/jsmn.h
@@ -1,5 +1,7 @@
 /*
- * Copyright (c) 2010 Serge A. Zaitsev
+ * MIT License
+ *
+ * Copyright (c) 2010 Serge Zaitsev
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -16,12 +18,11 @@
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
-
-#ifndef __JSMN_H_
-#define __JSMN_H_
+#ifndef JSMN_H
+#define JSMN_H
 
 #include <stddef.h>
 
@@ -29,6 +30,12 @@
 extern "C" {
 #endif
 
+#ifdef JSMN_STATIC
+#define JSMN_API static
+#else
+#define JSMN_API extern
+#endif
+
 /**
  * JSON type identifier. Basic types are:
  * 	o Object
@@ -37,61 +44,425 @@ extern "C" {
  * 	o Other primitive: number, boolean (true/false) or null
  */
 typedef enum {
-	JSMN_PRIMITIVE = 0,
-	JSMN_OBJECT = 1,
-	JSMN_ARRAY = 2,
-	JSMN_STRING = 3
+  JSMN_UNDEFINED = 0,
+  JSMN_OBJECT = 1,
+  JSMN_ARRAY = 2,
+  JSMN_STRING = 3,
+  JSMN_PRIMITIVE = 4
 } jsmntype_t;
 
-typedef enum {
-	/* Not enough tokens were provided */
-	JSMN_ERROR_NOMEM = -1,
-	/* Invalid character inside JSON string */
-	JSMN_ERROR_INVAL = -2,
-	/* The string is not a full JSON packet, more bytes expected */
-	JSMN_ERROR_PART = -3
-} jsmnerr_t;
+enum jsmnerr {
+  /* Not enough tokens were provided */
+  JSMN_ERROR_NOMEM = -1,
+  /* Invalid character inside JSON string */
+  JSMN_ERROR_INVAL = -2,
+  /* The string is not a full JSON packet, more bytes expected */
+  JSMN_ERROR_PART = -3
+};
 
 /**
  * JSON token description.
- * @param		type	type (object, array, string etc.)
- * @param		start	start position in JSON data string
- * @param		end		end position in JSON data string
+ * type		type (object, array, string etc.)
+ * start	start position in JSON data string
+ * end		end position in JSON data string
  */
 typedef struct {
-	jsmntype_t type;
-	int start;
-	int end;
-	int size;
+  jsmntype_t type;
+  int start;
+  int end;
+  int size;
 #ifdef JSMN_PARENT_LINKS
-	int parent;
+  int parent;
 #endif
 } jsmntok_t;
 
 /**
  * JSON parser. Contains an array of token blocks available. Also stores
- * the string being parsed now and current position in that string
+ * the string being parsed now and current position in that string.
  */
 typedef struct {
-	unsigned int pos; /* offset in the JSON string */
-	unsigned int toknext; /* next token to allocate */
-	int toksuper; /* superior token node, e.g parent object or array */
+  unsigned int pos;     /* offset in the JSON string */
+  unsigned int toknext; /* next token to allocate */
+  int toksuper;         /* superior token node, e.g. parent object or array */
 } jsmn_parser;
 
 /**
  * Create JSON parser over an array of tokens
  */
-void jsmn_init(jsmn_parser *parser);
+JSMN_API void jsmn_init(jsmn_parser *parser);
 
 /**
- * Run JSON parser. It parses a JSON data string into and array of tokens, each describing
+ * Run JSON parser. It parses a JSON data string into an array of
+ * tokens, each describing
  * a single JSON object.
  */
-jsmnerr_t jsmn_parse(jsmn_parser *parser, const char *js, size_t len,
-		jsmntok_t *tokens, unsigned int num_tokens);
+JSMN_API int jsmn_parse(jsmn_parser *parser, const char *js, const size_t len,
+                        jsmntok_t *tokens, const unsigned int num_tokens);
+
+#ifndef JSMN_HEADER
+/**
+ * Allocates a fresh unused token from the token pool.
+ */
+static jsmntok_t *jsmn_alloc_token(jsmn_parser *parser, jsmntok_t *tokens,
+                                   const size_t num_tokens) {
+  jsmntok_t *tok;
+  if (parser->toknext >= num_tokens) {
+    return NULL;
+  }
+  tok = &tokens[parser->toknext++];
+  tok->start = tok->end = -1;
+  tok->size = 0;
+#ifdef JSMN_PARENT_LINKS
+  tok->parent = -1;
+#endif
+  return tok;
+}
+
+/**
+ * Fills token type and boundaries.
+ */
+static void jsmn_fill_token(jsmntok_t *token, const jsmntype_t type,
+                            const int start, const int end) {
+  token->type = type;
+  token->start = start;
+  token->end = end;
+  token->size = 0;
+}
+
+/**
+ * Fills next available token with JSON primitive.
+ */
+static int jsmn_parse_primitive(jsmn_parser *parser, const char *js,
+                                const size_t len, jsmntok_t *tokens,
+                                const size_t num_tokens) {
+  jsmntok_t *token;
+  int start;
+
+  start = parser->pos;
+
+  for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) {
+    switch (js[parser->pos]) {
+#ifndef JSMN_STRICT
+    /* In strict mode primitive must be followed by "," or "}" or "]" */
+    case ':':
+#endif
+    case '\t':
+    case '\r':
+    case '\n':
+    case ' ':
+    case ',':
+    case ']':
+    case '}':
+      goto found;
+    }
+    if (js[parser->pos] < 32 || js[parser->pos] >= 127) {
+      parser->pos = start;
+      return JSMN_ERROR_INVAL;
+    }
+  }
+#ifdef JSMN_STRICT
+  /* In strict mode primitive must be followed by a comma/object/array */
+  parser->pos = start;
+  return JSMN_ERROR_PART;
+#endif
+
+found:
+  if (tokens == NULL) {
+    parser->pos--;
+    return 0;
+  }
+  token = jsmn_alloc_token(parser, tokens, num_tokens);
+  if (token == NULL) {
+    parser->pos = start;
+    return JSMN_ERROR_NOMEM;
+  }
+  jsmn_fill_token(token, JSMN_PRIMITIVE, start, parser->pos);
+#ifdef JSMN_PARENT_LINKS
+  token->parent = parser->toksuper;
+#endif
+  parser->pos--;
+  return 0;
+}
+
+/**
+ * Fills next token with JSON string.
+ */
+static int jsmn_parse_string(jsmn_parser *parser, const char *js,
+                             const size_t len, jsmntok_t *tokens,
+                             const size_t num_tokens) {
+  jsmntok_t *token;
+
+  int start = parser->pos;
+
+  parser->pos++;
+
+  /* Skip starting quote */
+  for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) {
+    char c = js[parser->pos];
+
+    /* Quote: end of string */
+    if (c == '\"') {
+      if (tokens == NULL) {
+        return 0;
+      }
+      token = jsmn_alloc_token(parser, tokens, num_tokens);
+      if (token == NULL) {
+        parser->pos = start;
+        return JSMN_ERROR_NOMEM;
+      }
+      jsmn_fill_token(token, JSMN_STRING, start + 1, parser->pos);
+#ifdef JSMN_PARENT_LINKS
+      token->parent = parser->toksuper;
+#endif
+      return 0;
+    }
+
+    /* Backslash: Quoted symbol expected */
+    if (c == '\\' && parser->pos + 1 < len) {
+      int i;
+      parser->pos++;
+      switch (js[parser->pos]) {
+      /* Allowed escaped symbols */
+      case '\"':
+      case '/':
+      case '\\':
+      case 'b':
+      case 'f':
+      case 'r':
+      case 'n':
+      case 't':
+        break;
+      /* Allows escaped symbol \uXXXX */
+      case 'u':
+        parser->pos++;
+        for (i = 0; i < 4 && parser->pos < len && js[parser->pos] != '\0';
+             i++) {
+          /* If it isn't a hex character we have an error */
+          if (!((js[parser->pos] >= 48 && js[parser->pos] <= 57) ||   /* 0-9 */
+                (js[parser->pos] >= 65 && js[parser->pos] <= 70) ||   /* A-F */
+                (js[parser->pos] >= 97 && js[parser->pos] <= 102))) { /* a-f */
+            parser->pos = start;
+            return JSMN_ERROR_INVAL;
+          }
+          parser->pos++;
+        }
+        parser->pos--;
+        break;
+      /* Unexpected symbol */
+      default:
+        parser->pos = start;
+        return JSMN_ERROR_INVAL;
+      }
+    }
+  }
+  parser->pos = start;
+  return JSMN_ERROR_PART;
+}
+
+/**
+ * Parse JSON string and fill tokens.
+ */
+JSMN_API int jsmn_parse(jsmn_parser *parser, const char *js, const size_t len,
+                        jsmntok_t *tokens, const unsigned int num_tokens) {
+  int r;
+  int i;
+  jsmntok_t *token;
+  int count = parser->toknext;
+
+  for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) {
+    char c;
+    jsmntype_t type;
+
+    c = js[parser->pos];
+    switch (c) {
+    case '{':
+    case '[':
+      count++;
+      if (tokens == NULL) {
+        break;
+      }
+      token = jsmn_alloc_token(parser, tokens, num_tokens);
+      if (token == NULL) {
+        return JSMN_ERROR_NOMEM;
+      }
+      if (parser->toksuper != -1) {
+        jsmntok_t *t = &tokens[parser->toksuper];
+#ifdef JSMN_STRICT
+        /* In strict mode an object or array can't become a key */
+        if (t->type == JSMN_OBJECT) {
+          return JSMN_ERROR_INVAL;
+        }
+#endif
+        t->size++;
+#ifdef JSMN_PARENT_LINKS
+        token->parent = parser->toksuper;
+#endif
+      }
+      token->type = (c == '{' ? JSMN_OBJECT : JSMN_ARRAY);
+      token->start = parser->pos;
+      parser->toksuper = parser->toknext - 1;
+      break;
+    case '}':
+    case ']':
+      if (tokens == NULL) {
+        break;
+      }
+      type = (c == '}' ? JSMN_OBJECT : JSMN_ARRAY);
+#ifdef JSMN_PARENT_LINKS
+      if (parser->toknext < 1) {
+        return JSMN_ERROR_INVAL;
+      }
+      token = &tokens[parser->toknext - 1];
+      for (;;) {
+        if (token->start != -1 && token->end == -1) {
+          if (token->type != type) {
+            return JSMN_ERROR_INVAL;
+          }
+          token->end = parser->pos + 1;
+          parser->toksuper = token->parent;
+          break;
+        }
+        if (token->parent == -1) {
+          if (token->type != type || parser->toksuper == -1) {
+            return JSMN_ERROR_INVAL;
+          }
+          break;
+        }
+        token = &tokens[token->parent];
+      }
+#else
+      for (i = parser->toknext - 1; i >= 0; i--) {
+        token = &tokens[i];
+        if (token->start != -1 && token->end == -1) {
+          if (token->type != type) {
+            return JSMN_ERROR_INVAL;
+          }
+          parser->toksuper = -1;
+          token->end = parser->pos + 1;
+          break;
+        }
+      }
+      /* Error if unmatched closing bracket */
+      if (i == -1) {
+        return JSMN_ERROR_INVAL;
+      }
+      for (; i >= 0; i--) {
+        token = &tokens[i];
+        if (token->start != -1 && token->end == -1) {
+          parser->toksuper = i;
+          break;
+        }
+      }
+#endif
+      break;
+    case '\"':
+      r = jsmn_parse_string(parser, js, len, tokens, num_tokens);
+      if (r < 0) {
+        return r;
+      }
+      count++;
+      if (parser->toksuper != -1 && tokens != NULL) {
+        tokens[parser->toksuper].size++;
+      }
+      break;
+    case '\t':
+    case '\r':
+    case '\n':
+    case ' ':
+      break;
+    case ':':
+      parser->toksuper = parser->toknext - 1;
+      break;
+    case ',':
+      if (tokens != NULL && parser->toksuper != -1 &&
+          tokens[parser->toksuper].type != JSMN_ARRAY &&
+          tokens[parser->toksuper].type != JSMN_OBJECT) {
+#ifdef JSMN_PARENT_LINKS
+        parser->toksuper = tokens[parser->toksuper].parent;
+#else
+        for (i = parser->toknext - 1; i >= 0; i--) {
+          if (tokens[i].type == JSMN_ARRAY || tokens[i].type == JSMN_OBJECT) {
+            if (tokens[i].start != -1 && tokens[i].end == -1) {
+              parser->toksuper = i;
+              break;
+            }
+          }
+        }
+#endif
+      }
+      break;
+#ifdef JSMN_STRICT
+    /* In strict mode primitives are: numbers and booleans */
+    case '-':
+    case '0':
+    case '1':
+    case '2':
+    case '3':
+    case '4':
+    case '5':
+    case '6':
+    case '7':
+    case '8':
+    case '9':
+    case 't':
+    case 'f':
+    case 'n':
+      /* And they must not be keys of the object */
+      if (tokens != NULL && parser->toksuper != -1) {
+        const jsmntok_t *t = &tokens[parser->toksuper];
+        if (t->type == JSMN_OBJECT ||
+            (t->type == JSMN_STRING && t->size != 0)) {
+          return JSMN_ERROR_INVAL;
+        }
+      }
+#else
+    /* In non-strict mode every unquoted value is a primitive */
+    default:
+#endif
+      r = jsmn_parse_primitive(parser, js, len, tokens, num_tokens);
+      if (r < 0) {
+        return r;
+      }
+      count++;
+      if (parser->toksuper != -1 && tokens != NULL) {
+        tokens[parser->toksuper].size++;
+      }
+      break;
+
+#ifdef JSMN_STRICT
+    /* Unexpected char in strict mode */
+    default:
+      return JSMN_ERROR_INVAL;
+#endif
+    }
+  }
+
+  if (tokens != NULL) {
+    for (i = parser->toknext - 1; i >= 0; i--) {
+      /* Unmatched opened object or array */
+      if (tokens[i].start != -1 && tokens[i].end == -1) {
+        return JSMN_ERROR_PART;
+      }
+    }
+  }
+
+  return count;
+}
+
+/**
+ * Creates a new parser over a given buffer with an array of tokens
+ * available.
+ */
+JSMN_API void jsmn_init(jsmn_parser *parser) {
+  parser->pos = 0;
+  parser->toknext = 0;
+  parser->toksuper = -1;
+}
+
+#endif /* JSMN_HEADER */
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif /* __JSMN_H_ */
+#endif /* JSMN_H */
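
The rewritten jsmn.h above is a single-header library: one include provides both the declarations and the implementation, with JSMN_HEADER yielding declarations only and JSMN_STATIC giving the functions internal linkage. A minimal sketch of driving the parser, with an illustrative input string and token-array size:

```c
#include <stdio.h>
#include <string.h>
#include "jsmn.h"

int main(void)
{
	const char *js = "{\"name\": \"maestro\", \"rank\": 3}";
	jsmn_parser parser;
	jsmntok_t tokens[16];
	int i, count;

	jsmn_init(&parser);
	count = jsmn_parse(&parser, js, strlen(js), tokens, 16);
	if (count < 0)	/* JSMN_ERROR_NOMEM, _INVAL or _PART */
		return 1;

	/* tokens[0] is the enclosing object; start/end index into js,
	 * so this prints: name, maestro, rank, 3 */
	for (i = 1; i < count; i++)
		printf("%.*s\n", tokens[i].end - tokens[i].start,
		       js + tokens[i].start);
	return 0;
}
```

Since jsmn_parse() returns the number of tokens consumed, the same call with tokens set to NULL can be used first to size the token array.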
diff --git a/deps/libfabric/fabtests/include/shared.h b/deps/libfabric/fabtests/include/shared.h
index 709537924aa90a15505f1c2ad5b2c322dba7aace..1d30919ae334af3979108d916a267cd846d22c7a 100644
--- a/deps/libfabric/fabtests/include/shared.h
+++ b/deps/libfabric/fabtests/include/shared.h
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2013-2017 Intel Corporation.  All rights reserved.
  * Copyright (c) 2014-2017, Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights reserved.
  *
  * This software is available to you under the BSD license below:
  *
@@ -56,6 +57,10 @@ extern "C" {
 #define OFI_UTIL_PREFIX "ofi_"
 #define OFI_NAME_DELIM ';'
 
+#define ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask))
+#define ALIGN(x, a) ALIGN_MASK(x, (typeof(x))(a) - 1)
+#define ALIGN_DOWN(x, a) ALIGN((x) - ((a) - 1), (a))
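+/* For a power-of-two alignment a, these round x up or down to a multiple
+ * of a, e.g. ALIGN(13, 8) == (13 + 7) & ~7 == 16, ALIGN_DOWN(13, 8) == 8. */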
+
 #define OFI_MR_BASIC_MAP (FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR)
 
 /* exit codes must be 0-255 */
@@ -113,6 +118,10 @@ enum {
 	FT_OPT_SERVER_PERSIST	= 1 << 16,
 	FT_OPT_ENABLE_HMEM	= 1 << 17,
 	FT_OPT_USE_DEVICE	= 1 << 18,
+	FT_OPT_DOMAIN_EQ	= 1 << 19,
+	FT_OPT_FORK_CHILD	= 1 << 20,
+	FT_OPT_SRX		= 1 << 21,
+	FT_OPT_STX		= 1 << 22,
 	FT_OPT_OOB_CTRL		= FT_OPT_OOB_SYNC | FT_OPT_OOB_ADDR_EXCH,
 };
 
@@ -185,6 +194,8 @@ extern struct fid_pep *pep;
 extern struct fid_ep *ep, *alias_ep;
 extern struct fid_cq *txcq, *rxcq;
 extern struct fid_cntr *txcntr, *rxcntr;
+extern struct fid_ep *srx;
+extern struct fid_stx *stx;
 extern struct fid_mr *mr, no_mr;
 extern void *mr_desc;
 extern struct fid_av *av;
@@ -218,14 +229,14 @@ void ft_parseinfo(int op, char *optarg, struct fi_info *hints,
 		  struct ft_opts *opts);
 void ft_parse_addr_opts(int op, char *optarg, struct ft_opts *opts);
 void ft_parsecsopts(int op, char *optarg, struct ft_opts *opts);
-int ft_parse_rma_opts(int op, char *optarg, struct fi_info *hints,
+int ft_parse_api_opts(int op, char *optarg, struct fi_info *hints,
 		      struct ft_opts *opts);
 void ft_addr_usage();
 void ft_usage(char *name, char *desc);
 void ft_mcusage(char *name, char *desc);
 void ft_csusage(char *name, char *desc);
 
-void ft_fill_buf(void *buf, size_t size);
+int ft_fill_buf(void *buf, size_t size);
 int ft_check_buf(void *buf, size_t size);
 int ft_check_opts(uint64_t flags);
 uint64_t ft_init_cq_data(struct fi_info *info);
@@ -243,9 +254,10 @@ extern int ft_socket_pair[2];
 extern int sock;
 extern int listen_sock;
 #define ADDR_OPTS "B:P:s:a:b::E::C:F:"
-#define FAB_OPTS "f:d:p:D:i:H"
+#define FAB_OPTS "f:d:p:D:i:HK"
 #define INFO_OPTS FAB_OPTS "e:M:"
-#define CS_OPTS ADDR_OPTS "I:S:mc:t:w:l"
+#define CS_OPTS ADDR_OPTS "I:QS:mc:t:w:l"
+#define API_OPTS "o:"
 #define NO_CQ_DATA 0
 
 extern char default_port[8];
@@ -254,6 +266,7 @@ extern char default_port[8];
 	{	.options = FT_OPT_RX_CQ | FT_OPT_TX_CQ, \
 		.iterations = 1000, \
 		.warmup_iterations = 10, \
+		.num_connections = 1, \
 		.transfer_size = 1024, \
 		.window_size = 64, \
 		.av_size = 1, \
@@ -358,11 +371,15 @@ static inline int ft_use_size(int index, int enable_flags)
 		}							\
 	} while (0)
 
+int ft_init();
 int ft_alloc_bufs();
 int ft_open_fabric_res();
+int ft_open_domain_res();
 int ft_getinfo(struct fi_info *hints, struct fi_info **info);
 int ft_init_fabric();
 int ft_init_oob();
+int ft_close_oob();
+int ft_reset_oob();
 int ft_start_server();
 int ft_server_connect();
 int ft_client_connect();
@@ -375,9 +392,7 @@ int ft_connect_ep(struct fid_ep *ep,
 int ft_alloc_ep_res(struct fi_info *fi);
 int ft_alloc_active_res(struct fi_info *fi);
 int ft_enable_ep_recv(void);
-int ft_enable_ep(struct fid_ep *ep, struct fid_eq *eq, struct fid_av *av,
-		 struct fid_cq *txcq, struct fid_cq *rxcq,
-		 struct fid_cntr *txcntr, struct fid_cntr *rxcntr);
+int ft_enable_ep(struct fid_ep *ep);
 int ft_init_alias_ep(uint64_t flags);
 int ft_av_insert(struct fid_av *av, void *addr, size_t count, fi_addr_t *fi_addr,
 		uint64_t flags, void *context);
@@ -430,6 +445,7 @@ static inline bool ft_check_prefix_forced(struct fi_info *info,
 int ft_sync(void);
 int ft_sync_pair(int status);
 int ft_fork_and_pair(void);
+int ft_fork_child(void);
 int ft_wait_child(void);
 int ft_finalize(void);
 int ft_finalize_ep(struct fid_ep *ep);
diff --git a/deps/libfabric/fabtests/include/unix/osd.h b/deps/libfabric/fabtests/include/unix/osd.h
index 3d7b415af75100e93d64b9e7bbdfca86792dbebb..4e85ca95adf300761ed4d56bccdd92c05c5dec05 100644
--- a/deps/libfabric/fabtests/include/unix/osd.h
+++ b/deps/libfabric/fabtests/include/unix/osd.h
@@ -34,12 +34,28 @@
 #define _FABTESTS_UNIX_OSD_H_
 
 #include <complex.h>
+#include <unistd.h>
+#include <fcntl.h>
 
 static inline int ft_startup(void)
 {
 	return 0;
 }
 
+static inline int ft_fd_nonblock(int fd)
+{
+	long flags;
+
+	flags = fcntl(fd, F_GETFL);
+	if (flags < 0)
+		return -errno;
+
+	if (fcntl(fd, F_SETFL, flags | O_NONBLOCK))
+		return -errno;
+
+	return 0;
+}
+
 /* complex operations implementation */
 #define OFI_COMPLEX(name) ofi_##name##_complex
 #define OFI_COMPLEX_OP(name, op) ofi_complex_##name##_##op
diff --git a/deps/libfabric/fabtests/include/windows/osd.h b/deps/libfabric/fabtests/include/windows/osd.h
index bf7a114507792ad14981ed465dbd5948b265a3be..ff3e5db689ef6c4780f0487515446c70611dc3b2 100644
--- a/deps/libfabric/fabtests/include/windows/osd.h
+++ b/deps/libfabric/fabtests/include/windows/osd.h
@@ -44,6 +44,7 @@ struct iovec
 };
 
 #define strdup _strdup
+#define strcasecmp _stricmp
 #define strncasecmp _strnicmp
 #define SHUT_RDWR SD_BOTH
 #define CLOCK_MONOTONIC	1
@@ -131,6 +132,12 @@ static long int sysconf(int name)
 
 int socketpair(int af, int type, int protocol, int socks[2]);
 
+static inline int ft_fd_nonblock(int fd)
+{
+	u_long argp = 1;
+	return ioctlsocket(fd, FIONBIO, &argp) ? -WSAGetLastError() : 0;
+}
+
 /* Bits in the fourth argument to `waitid'.  */
 #define WSTOPPED	2	/* Report stopped child (same as WUNTRACED). */
 #define WEXITED		4	/* Report dead child. */
diff --git a/deps/libfabric/fabtests/man/fabtests.7.md b/deps/libfabric/fabtests/man/fabtests.7.md
index f64b2353900fcba58236ab5e16b1705ab048cb4d..84b8d4f2d8047db4c6b2c5fb0ba2a0f462c48961 100644
--- a/deps/libfabric/fabtests/man/fabtests.7.md
+++ b/deps/libfabric/fabtests/man/fabtests.7.md
@@ -38,7 +38,7 @@ These tests are a mix of very basic functionality tests that show major
 features of libfabric.
 
 *fi_av_xfer*
-: Tests communication for unconnected endpoints, as addresses
+: Tests communication for connectionless endpoints, as addresses
   are inserted and removed from the local address vector.
 
 *fi_cm_data*
@@ -51,7 +51,7 @@ features of libfabric.
 : A basic datagram endpoint example.
 
 *fi_dgram_waitset*
-: Transfers datagrams using waitsets for completion notifcation.
+: Transfers datagrams using waitsets for completion notification.
 
 *fi_inj_complete*
 : Sends messages using the FI_INJECT_COMPLETE operation flag.
@@ -64,7 +64,7 @@ features of libfabric.
 
 *fi_msg_epoll*
 : Transfers messages with completion queues configured to use file
-  descriptors as wait objetcts.  The file descriptors are retrieved
+  descriptors as wait objects.  The file descriptors are retrieved
   by the program and used directly with the Linux epoll API.
 
 *fi_msg_sockets*
@@ -101,8 +101,9 @@ features of libfabric.
 : Transfers multiple messages over an RDM endpoint that are received
   into a single buffer, posted using the FI_MULTI_RECV flag.
 
-*fi_rdm_rma_simple*
-: A simple RMA write example over an RDM endpoint.
+*fi_rdm_rma_event*
+: An RMA write example over an RDM endpoint that uses RMA events
+  to notify the peer that the RMA transfer has completed.
 
 *fi_rdm_rma_trigger*
 : A basic example of queuing an RMA write operation that is initiated
@@ -121,7 +122,7 @@ features of libfabric.
 
 *fi_resmgmt_test*
 : Tests the resource management enabled feature.  This verifies that the
-  provider prevents applications from overruning local and remote command
+  provider prevents applications from overrunning local and remote command
   queues and completion queues.  This corresponds to setting the domain
   attribute resource_mgmt to FI_RM_ENABLED.
 
@@ -147,6 +148,10 @@ features of libfabric.
   A sleep time on the receiving side can be enabled in order to allow
   the sender to get ahead of the receiver.
 
+*fi_rdm_multi_client*
+: Tests a persistent server communicating with multiple clients, one at a
+  time, in sequence.
+
 # Benchmarks
 
 The client and the server exchange messages in either a ping-pong manner,
@@ -212,17 +217,14 @@ testing scope is limited.
 *fi_mr_cache_evict*
 : Tests provider MR cache eviction capabilities.
 
-*fi_resource_freeing*
-: Allocates and closes fabric resources to check for proper cleanup.
-
 # Multinode
 
 This test runs a series of tests over multiple formats and patterns to help
 validate at scale. The patterns are an all to all, one to all, all to one and
-a ring. The tests also run accross multiple capabilites, such as messages, rma,
-atomics, and tagged messages. Currently, there is no option to run these 
+a ring. The tests also run across multiple capabilities, such as messages, rma,
+atomics, and tagged messages. Currently, there is no option to run these
 capabilities and patterns independently, however the test is short enough to be
-all run at once.   
+all run at once.
 
 # Ubertest
 
@@ -232,7 +234,7 @@ number of tests by iterating over a large number of test variables.  As a
 result, a full ubertest run can take a significant amount of time.  Because
 ubertest iterates over input variables, it relies on a test configuration
 file for control, rather than extensive command line options that are used
-by other fabtests.  A configuration file must be constructured for each
+by other fabtests.  A configuration file must be constructed for each
 provider.  Example test configurations are at test_configs.
 
 *fi_ubertest*
@@ -244,6 +246,19 @@ provider.  Example test configurations are at test_configs.
   values and 2 having 3 possible values, ubertest will execute 576 total
   iterations of each test.
 
+# EFA provider specific tests
+
+Beyond the functionality defined by libfabric, the EFA provider defines
+provider-specific features. These EFA provider-specific fabtests show
+users how to use them correctly.
+
+*fi_efa_ep_rnr_retry*
+: Tests modifying the RNR retry count (rnr_retry) via fi_setopt, then
+  runs a simple program to verify that an error CQ entry (with error
+  FI_ENORX) is written to the application when RNR occurs.
+  Use the `-R` option to specify the RNR retry count. The valid values
+  are 0-7, where 7 indicates infinite retry in firmware.
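+  For example, start `fi_efa_ep_rnr_retry -R 0` on the server node, then
+  run `fi_efa_ep_rnr_retry -R 0 <server_addr>` on the client node.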
+
 ### Config file options
 
 The following keys and respective key values may be used in the config file.
@@ -263,7 +278,7 @@ The following keys and respective key values may be used in the config file.
   FT_FUNC_INJECT, FT_FUNC_INJECTDATA, FT_FUNC_SENDDATA
 
   For FT_CAP_RMA: FT_FUNC_WRITE, FT_FUNC_WRITEV, FT_FUNC_WRITEMSG,
-  FT_FUNC_WRITEDATA, FT_FUNC_INJECT_WRITE, FT_FUNC_INJECT_WRITEDATA
+  FT_FUNC_WRITEDATA, FT_FUNC_INJECT_WRITE, FT_FUNC_INJECT_WRITEDATA,
   FT_FUNC_READ, FT_FUNC_READV, FT_FUNC_READMSG
 
   For FT_CAP_ATOMIC: FT_FUNC_ATOMIC, FT_FUNC_ATOMICV, FT_FUNC_ATOMICMSG,
@@ -315,10 +330,10 @@ The following keys and respective key values may be used in the config file.
 *datatype*
 : For FT_CAP_ATOMIC: FI_INT8, FI_UINT8, FI_INT16, FI_UINT16, FI_INT32,
   FI_UINT32, FI_INT64, FI_UINT64, FI_FLOAT, FI_DOUBLE, FI_FLOAT_COMPLEX,
-  FI_DOUBLE_COMPLEX, FI_LONG_DOUBLE, FI_LONG_DOUBLE_COMPLE
+  FI_DOUBLE_COMPLEX, FI_LONG_DOUBLE, FI_LONG_DOUBLE_COMPLEX
 
 *msg_flags - values OR'ed together*
-: For FT_FUNC_XXXMSG: FI_REMOTE_CQ_DATA, FI_COMPLETION
+: For FT_FUNC_[SEND,WRITE,READ,ATOMIC]MSG: FI_REMOTE_CQ_DATA, FI_COMPLETION
 
 *rx_cq_bind_flags - values OR'ed together*
 : FI_SELECTIVE_COMPLETION
@@ -369,6 +384,10 @@ the list available for that test.
 : Use the specified endpoint type for the test.  Valid options are msg,
   dgram, and rdm.  The default endpoint type is rdm.
 
+*-D <device_name>*
+: Allocate data buffers on the specified device, rather than in host
+  memory.  Valid options are ze and cuda.
+
 *-a <address vector name>*
 : The name of a shared address vector.  This option only applies to tests
   that support shared address vectors.
@@ -385,6 +404,9 @@ the list available for that test.
 *-F <address_format>
 : Specifies the address format.
 
+*-K*
+: Fork a child process after initializing the endpoint.
+
 *-b[=oob_port]*
 : Enables out-of-band (via sockets) address exchange and test
   synchronization.  A port for the out-of-band connection may be specified
@@ -395,9 +417,15 @@ the list available for that test.
   out-of-band connection may be specified as part of this option to override
   the default. Cannot be used together with the '-b' option.
 
+*-U*
+: Run fabtests with FI_DELIVERY_COMPLETE.
+
 *-I <number>*
 : Number of data transfer iterations.
 
+*-Q*
+: Associate any EQ with the domain, rather than directly with the EP.
+
 *-w <number>*
 : Number of warm-up data transfer iterations.
 
@@ -427,9 +455,11 @@ the list available for that test.
   select() to block until the fd has been signaled, prior to checking for
   completions.
 
-*-o <rma_op>*
+*-o <op>*
 : For RMA based tests, specify the type of RMA operation to perform.  Valid
   values are read, write, and writedata.  Write operations are the default.
+  For message based tests, specify whether msg (default) or tagged transfers
+  will be used.
 
 *-M <mcast_addr>*
 : For multicast tests, specifies the address of the multicast group to join.
@@ -460,12 +490,12 @@ This will run "fi_rdm_atomic" for all atomic operations with
 
 ## Run multinode tests
 
-	Server and clients are invoked with the same command: 
+	Server and clients are invoked with the same command:
 		fi_multinode -n <number of processes> -s <server_addr> -C <mode>
-	
-	A process on the server must be started before any of the clients can be started 
+
+	A process on the server must be started before any of the clients can be started
 	succesfully. -C lists the mode that the tests will run in. Currently the options are
-  for rma and msg. If not provided, the test will default to msg. 
+  for rma and msg. If not provided, the test will default to msg.
 
 ## Run fi_ubertest
 
diff --git a/deps/libfabric/fabtests/man/man1/fi_rdm_rma_simple.1 b/deps/libfabric/fabtests/man/man1/fi_efa_ep_rnr_retry.1
similarity index 100%
rename from deps/libfabric/fabtests/man/man1/fi_rdm_rma_simple.1
rename to deps/libfabric/fabtests/man/man1/fi_efa_ep_rnr_retry.1
diff --git a/deps/libfabric/fabtests/man/man1/fi_rdm_multi_client.1 b/deps/libfabric/fabtests/man/man1/fi_rdm_multi_client.1
new file mode 100644
index 0000000000000000000000000000000000000000..3f6ccf96f11bf1fcd23a27226a50166c3f6b3201
--- /dev/null
+++ b/deps/libfabric/fabtests/man/man1/fi_rdm_multi_client.1
@@ -0,0 +1 @@
+.so man7/fabtests.7
diff --git a/deps/libfabric/fabtests/man/man1/fi_rdm_rma_event.1 b/deps/libfabric/fabtests/man/man1/fi_rdm_rma_event.1
new file mode 100644
index 0000000000000000000000000000000000000000..3f6ccf96f11bf1fcd23a27226a50166c3f6b3201
--- /dev/null
+++ b/deps/libfabric/fabtests/man/man1/fi_rdm_rma_event.1
@@ -0,0 +1 @@
+.so man7/fabtests.7
diff --git a/deps/libfabric/fabtests/man/man7/fabtests.7 b/deps/libfabric/fabtests/man/man7/fabtests.7
index 25644a35e7c296f0b129b93ce64fe2b18b590534..165187c9dc796df3dfcd1558886b7dd4c2101625 100644
--- a/deps/libfabric/fabtests/man/man7/fabtests.7
+++ b/deps/libfabric/fabtests/man/man7/fabtests.7
@@ -1,6 +1,6 @@
 .\" Automatically generated by Pandoc 1.19.2.4
 .\"
-.TH "fabtests" "7" "2020\-07\-27" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fabtests" "7" "2020\-12\-01" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
 .hy
 .SH NAME
 .PP
@@ -37,8 +37,8 @@ These tests are a mix of very basic functionality tests that show major
 features of libfabric.
 .TP
 .B \f[I]fi_av_xfer\f[]
-Tests communication for unconnected endpoints, as addresses are inserted
-and removed from the local address vector.
+Tests communication for connectionless endpoints, as addresses are
+inserted and removed from the local address vector.
 .RS
 .RE
 .TP
@@ -58,7 +58,7 @@ A basic datagram endpoint example.
 .RE
 .TP
 .B \f[I]fi_dgram_waitset\f[]
-Transfers datagrams using waitsets for completion notifcation.
+Transfers datagrams using waitsets for completion notification.
 .RS
 .RE
 .TP
@@ -79,7 +79,7 @@ A basic message endpoint example.
 .TP
 .B \f[I]fi_msg_epoll\f[]
 Transfers messages with completion queues configured to use file
-descriptors as wait objetcts.
+descriptors as wait objects.
 The file descriptors are retrieved by the program and used directly with
 the Linux epoll API.
 .RS
@@ -138,8 +138,9 @@ a single buffer, posted using the FI_MULTI_RECV flag.
 .RS
 .RE
 .TP
-.B \f[I]fi_rdm_rma_simple\f[]
-A simple RMA write example over an RDM endpoint.
+.B \f[I]fi_rdm_rma_event\f[]
+An RMA write example over an RDM endpoint that uses RMA events to notify
+the peer that the RMA transfer has completed.
 .RS
 .RE
 .TP
@@ -169,7 +170,7 @@ Tests canceling posted receives for tagged messages.
 .TP
 .B \f[I]fi_resmgmt_test\f[]
 Tests the resource management enabled feature.
-This verifies that the provider prevents applications from overruning
+This verifies that the provider prevents applications from overrunning
 local and remote command queues and completion queues.
 This corresponds to setting the domain attribute resource_mgmt to
 FI_RM_ENABLED.
@@ -307,17 +308,12 @@ Tests memory registration.
 Tests provider MR cache eviction capabilities.
 .RS
 .RE
-.TP
-.B \f[I]fi_resource_freeing\f[]
-Allocates and closes fabric resources to check for proper cleanup.
-.RS
-.RE
 .SH Multinode
 .PP
 This test runs a series of tests over multiple formats and patterns to
 help validate at scale.
 The patterns are an all to all, one to all, all to one and a ring.
-The tests also run accross multiple capabilites, such as messages, rma,
+The tests also run across multiple capabilities, such as messages, rma,
 atomics, and tagged messages.
 Currently, there is no option to run these capabilities and patterns
 independently, however the test is short enough to be all run at once.
@@ -331,7 +327,7 @@ As a result, a full ubertest run can take a significant amount of time.
 Because ubertest iterates over input variables, it relies on a test
 configuration file for control, rather than extensive command line
 options that are used by other fabtests.
-A configuration file must be constructured for each provider.
+A configuration file must be constructed for each provider.
 Example test configurations are at test_configs.
 .TP
 .B \f[I]fi_ubertest\f[]
@@ -452,12 +448,13 @@ FI_CSWAP_GT, FI_MSWAP
 .B \f[I]datatype\f[]
 For FT_CAP_ATOMIC: FI_INT8, FI_UINT8, FI_INT16, FI_UINT16, FI_INT32,
 FI_UINT32, FI_INT64, FI_UINT64, FI_FLOAT, FI_DOUBLE, FI_FLOAT_COMPLEX,
-FI_DOUBLE_COMPLEX, FI_LONG_DOUBLE, FI_LONG_DOUBLE_COMPLE
+FI_DOUBLE_COMPLEX, FI_LONG_DOUBLE, FI_LONG_DOUBLE_COMPLEX
 .RS
 .RE
 .TP
 .B \f[I]msg_flags \- values OR\[aq]ed together\f[]
-For FT_FUNC_XXXMSG: FI_REMOTE_CQ_DATA, FI_COMPLETION
+For FT_FUNC_[SEND,WRITE,READ,ATOMIC]MSG: FI_REMOTE_CQ_DATA,
+FI_COMPLETION
 .RS
 .RE
 .TP
@@ -532,6 +529,13 @@ Valid options are msg, dgram, and rdm.
 The default endpoint type is rdm.
 .RS
 .RE
+.TP
+.B \f[I]\-D \f[]
+Allocate data buffers on the specified device, rather than in host
+memory.
+Valid options are ze and cuda.
+.RS
+.RE
 *\-a
 .IP \[bu] 2
 : The name of a shared address vector.
@@ -571,11 +575,21 @@ Cannot be used together with the \[aq]\-b\[aq] option.
 .RS
 .RE
 .TP
+.B \f[I]\-U\f[]
+Run fabtests with FI_DELIVERY_COMPLETE.
+.RS
+.RE
+.TP
 .B \f[I]\-I \f[]
 Number of data transfer iterations.
 .RS
 .RE
 .TP
+.B \f[I]\-Q\f[]
+Associate any EQ with the domain, rather than directly with the EP.
+.RS
+.RE
+.TP
 .B \f[I]\-w \f[]
 Number of warm\-up data transfer iterations.
 .RS
@@ -672,10 +686,10 @@ This will run "fi_rdm_atomic" for all atomic operations with
 .IP
 .nf
 \f[C]
-Server\ and\ clients\ are\ invoked\ with\ the\ same\ command:\ 
+Server\ and\ clients\ are\ invoked\ with\ the\ same\ command:
 \ \ \ \ fi_multinode\ \-n\ <number\ of\ processes>\ \-s\ <server_addr>\ \-C\ <mode>
 
-A\ process\ on\ the\ server\ must\ be\ started\ before\ any\ of\ the\ clients\ can\ be\ started\ 
+A\ process\ on\ the\ server\ must\ be\ started\ before\ any\ of\ the\ clients\ can\ be\ started
 succesfully.\ \-C\ lists\ the\ mode\ that\ the\ tests\ will\ run\ in.\ Currently\ the\ options\ are
 \f[]
 .fi
diff --git a/deps/libfabric/fabtests/multinode/src/core.c b/deps/libfabric/fabtests/multinode/src/core.c
index 8a1511d688d39afd6d7b8e72bb07c72a6725dc5a..cc04271adcb7439c27858ef97e308c653e74ee26 100644
--- a/deps/libfabric/fabtests/multinode/src/core.c
+++ b/deps/libfabric/fabtests/multinode/src/core.c
@@ -52,6 +52,7 @@
 #include <netinet/in.h>
 #include <arpa/inet.h>
 #include <assert.h>
+#include <hmem.h>
 
 char *tx_barrier;
 char *rx_barrier;
@@ -81,7 +82,7 @@ static int multi_setup_fabric(int argc, char **argv)
 	char my_name[FT_MAX_CTRL_MSG];
 	size_t len;
 	int i, ret;
-	struct fi_rma_iov *remote = malloc(sizeof(*remote));
+	struct fi_rma_iov remote;
 
 	hints->ep_attr->type = FI_EP_RDM;
 	hints->mode = FI_CONTEXT;
@@ -103,6 +104,10 @@ static int multi_setup_fabric(int argc, char **argv)
 	tx_cq_cntr = 0;
 	rx_cq_cntr = 0;
 
+	ret = ft_hmem_init(opts.iface);
+	if (ret)
+		return ret;
+
 	if (pm_job.my_rank != 0)
 		pm_barrier();
 
@@ -119,7 +124,7 @@ static int multi_setup_fabric(int argc, char **argv)
 	if (ret)
 		return ret;
 
-	ret = ft_enable_ep(ep, eq, av, txcq, rxcq, txcntr, rxcntr);
+	ret = ft_enable_ep(ep);
 	if (ret)
 		return ret;
 
@@ -170,15 +175,15 @@ static int multi_setup_fabric(int argc, char **argv)
 		goto err;
 	}
 
-	if (fi->domain_attr->mr_mode & FI_MR_VIRT_ADDR) 
-		remote->addr = (uintptr_t) rx_buf;
+	if (fi->domain_attr->mr_mode & FI_MR_VIRT_ADDR)
+		remote.addr = (uintptr_t) rx_buf;
 	else
-		remote->addr = 0;
+		remote.addr = 0;
 
-	remote->key = fi_mr_key(mr);
-	remote->len = rx_size;
+	remote.key = fi_mr_key(mr);
+	remote.len = rx_size;
 
-	ret = pm_allgather(remote, pm_job.multi_iovs, sizeof(*remote));
+	ret = pm_allgather(&remote, pm_job.multi_iovs, sizeof(remote));
 	if (ret) {
 		FT_ERR("error exchanging rma_iovs\n");
 		goto err;
@@ -264,7 +269,6 @@ int multi_msg_send()
 		offset = state.sends_posted % opts.window_size;
 		assert(tx_ctx_arr[offset].state == OP_DONE);
 
-		tx_ctx_arr[offset].buf[0] = offset;
 		dest = pm_job.fi_addrs[state.cur_target];
 		ret = ft_post_tx_buf(ep, dest, opts.transfer_size,
 				     NO_CQ_DATA,
@@ -323,20 +327,20 @@ int multi_rma_write()
 
 		snprintf((char*) tx_buf + tx_size * state.cur_target, tx_size,
 		        "Hello World! from %zu to %i on the %zuth iteration, %s test",
-		        pm_job.my_rank, state.cur_target, 
+		        pm_job.my_rank, state.cur_target,
 		        (size_t) tx_seq, pattern->name);
 
 		while (1) {
-			ret = fi_write(ep, 
+			ret = fi_write(ep,
 				tx_buf + tx_size * state.cur_target,
-				opts.transfer_size, mr_desc, 
-				pm_job.fi_addrs[state.cur_target], 
+				opts.transfer_size, mr_desc,
+				pm_job.fi_addrs[state.cur_target],
 				pm_job.multi_iovs[state.cur_target].addr,
-				pm_job.multi_iovs[state.cur_target].key, 
+				pm_job.multi_iovs[state.cur_target].key,
 				&tx_ctx_arr[state.tx_window].context);
 			if (!ret)
 				break;
-		
+
 			if (ret != -FI_EAGAIN) {
 				printf("RMA write failed");
 				return ret;
@@ -349,7 +353,7 @@ int multi_rma_write()
 			}
 		}
 		tx_seq++;
-	
+
 		state.sends_posted++;
 		state.tx_window--;
 	}
@@ -393,7 +397,7 @@ int send_recv_barrier(int sync)
 	}
 
 	for (i = 0; i < pm_job.num_ranks; i++) {
-		ret = ft_post_tx_buf(ep, pm_job.fi_addrs[i], 0, 
+		ret = ft_post_tx_buf(ep, pm_job.fi_addrs[i], 0,
 				     NO_CQ_DATA, &barrier_tx_ctx[i],
 		                     tx_buf, mr_desc, 0);
 		if (ret)
@@ -404,7 +408,7 @@ int send_recv_barrier(int sync)
 	if (ret)
 		return ret;
 
-	ret = ft_get_rx_comp(rx_seq);	
+	ret = ft_get_rx_comp(rx_seq);
 
 	return ret;
 }
@@ -482,7 +486,7 @@ int multinode_run_tests(int argc, char **argv)
 	ret = multi_setup_fabric(argc, argv);
 	if (ret)
 		return ret;
-	
+
 
 	for (i = 0; i < NUM_TESTS && !ret; i++) {
 		printf("starting %s... ", patterns[i].name);
diff --git a/deps/libfabric/fabtests/multinode/src/core_coll.c b/deps/libfabric/fabtests/multinode/src/core_coll.c
index 29fcc9aefa72d843a589b165496e99ed7bd4aa52..016d98a35a830dc5afc7a66729d37d163dba7b28 100644
--- a/deps/libfabric/fabtests/multinode/src/core_coll.c
+++ b/deps/libfabric/fabtests/multinode/src/core_coll.c
@@ -59,6 +59,20 @@ fi_addr_t world_addr;
 fi_addr_t coll_addr;
 struct fid_mc *coll_mc;
 
+// AV set attributes; kept global so the tests can check which ranks participate.
+struct fi_av_set_attr av_set_attr;
+
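+// A rank participates in the collective if it lies within
+// [start_addr, end_addr] and falls on the configured stride.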
+static bool is_my_rank_participating()
+{
+	size_t rank = pm_job.my_rank;
+	if (rank < av_set_attr.start_addr)
+		return false;
+	if (rank > av_set_attr.end_addr)
+		return false;
+	if ((rank - av_set_attr.start_addr) % av_set_attr.stride != 0)
+		return false;
+	return true;
+}
 
 static int wait_for_event(uint32_t event)
 {
@@ -119,15 +133,17 @@ static int wait_for_comp(void *ctx)
 	return err;
 }
 
-static int coll_setup()
+static int coll_setup_w_start_addr_stride(int start_addr, int stride)
 {
 	int err;
-	struct fi_av_set_attr av_set_attr;
 
-	av_set_attr.count = pm_job.num_ranks;
-	av_set_attr.start_addr = 0;
+	av_set_attr.count = 0;
+	av_set_attr.start_addr = start_addr;
 	av_set_attr.end_addr = pm_job.num_ranks - 1;
-	av_set_attr.stride = 1;
+	av_set_attr.stride = stride;
+
+	if (!is_my_rank_participating())
+		return FI_SUCCESS;
 
 	err = fi_av_set(av, &av_set_attr, &av_set, NULL);
 	if (err) {
@@ -150,8 +166,20 @@ static int coll_setup()
 	return wait_for_event(FI_JOIN_COMPLETE);
 }
 
+static int coll_setup()
+{
+	return coll_setup_w_start_addr_stride(/*start_addr=*/0, /*stride=*/1);
+}
+
+static int coll_setup_w_stride()
+{
+	return coll_setup_w_start_addr_stride(/*start_addr=*/1, /*stride=*/2);
+}
+
 static void coll_teardown()
 {
+	if (!is_my_rank_participating())
+		return;
 	fi_close(&coll_mc->fid);
 	fi_close(&av_set->fid);
 }
@@ -193,11 +221,19 @@ static int sum_all_reduce_test_run()
 	uint64_t done_flag;
 	uint64_t result = 0;
 	uint64_t expect_result = 0;
-	uint64_t data = pm_job.my_rank;
+	uint64_t data;
+	const uint64_t base_data_value = 1234; /* any arbitrary value != 0 */
 	size_t count = 1;
 	uint64_t i;
 	struct fi_collective_attr attr;
 
+	if (!is_my_rank_participating())
+		return FI_SUCCESS;
+
+	// Set to rank + base_data_value to make the participation of rank 0
+	// verifiable
+	data = base_data_value + pm_job.my_rank;
+
 	attr.op = FI_SUM;
 	attr.datatype = FI_UINT64;
 	attr.mode = 0;
@@ -208,8 +244,10 @@ static int sum_all_reduce_test_run()
 		return err;
 	}
 
-	for (i = 0; i < pm_job.num_ranks; i++) {
-		expect_result += i;
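+	// Only the ranks in the AV set contribute to the expected sum.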
+	for (i = av_set_attr.start_addr;
+	     i <= av_set_attr.end_addr;
+	     i += av_set_attr.stride) {
+		expect_result += base_data_value + i;
 	}
 
 	coll_addr = fi_mc_addr(coll_mc);
@@ -233,7 +271,7 @@ static int sum_all_reduce_test_run()
 
 static int all_gather_test_run()
 {
-	int err;
+	int ret;
 	uint64_t done_flag;
 	uint64_t *result;
 	uint64_t *expect_result;
@@ -245,50 +283,58 @@ static int all_gather_test_run()
 	attr.op = FI_NOOP;
 	attr.datatype = FI_UINT64;
 	attr.mode = 0;
-	err = fi_query_collective(domain, FI_ALLGATHER, &attr, 0);
-	if (err) {
-		FT_DEBUG("SUM AllReduce collective not supported: %d (%s)\n", err,
-			 fi_strerror(err));
-		return err;
+	ret = fi_query_collective(domain, FI_ALLGATHER, &attr, 0);
+	if (ret) {
+		FT_DEBUG("SUM AllReduce collective not supported: %d (%s)\n", ret,
+			 fi_strerror(ret));
+		return ret;
 	}
 
 	result = malloc(pm_job.num_ranks * sizeof(*expect_result));
+	if (!result)
+		return -FI_ENOMEM;
 	expect_result = malloc(pm_job.num_ranks * sizeof(*expect_result));
+	if (!expect_result) {
+		free(result);
+		return -FI_ENOMEM;
+	}
+
 	for (i = 0; i < pm_job.num_ranks; i++) {
 		expect_result[i] = i;
 	}
 
 	coll_addr = fi_mc_addr(coll_mc);
-	err = fi_allgather(ep, &data, count, NULL, result, NULL, coll_addr, FI_UINT64, 0,
+	ret = fi_allgather(ep, &data, count, NULL, result, NULL, coll_addr, FI_UINT64, 0,
 			   &done_flag);
-	if (err) {
-		FT_DEBUG("collective allreduce failed: %d (%s)\n", err, fi_strerror(err));
-		goto errout;
+	if (ret) {
+		FT_DEBUG("collective allreduce failed: %d (%s)\n", ret, fi_strerror(ret));
+		goto out;
 	}
 
-	err = wait_for_comp(&done_flag);
-	if (err)
-		goto errout;
+	ret = wait_for_comp(&done_flag);
+	if (ret)
+		goto out;
 
 	for (i = 0; i < pm_job.num_ranks; i++) {
 		if ((expect_result[i]) != result[i]) {
 			FT_DEBUG("allgather failed; expect[%ld]: %ld, actual[%ld]: %ld\n",
 				 i, expect_result[i], i, result[i]);
-			err = -1;
-			goto errout;
+			ret = -1;
+			goto out;
 		}
 	}
-	return FI_SUCCESS;
 
-errout:
+	ret = FI_SUCCESS;
+
+out:
 	free(expect_result);
 	free(result);
-	return err;
+	return ret;
 }
 
 static int scatter_test_run()
 {
-	int err;
+	int ret;
 	uint64_t done_flag;
 	uint64_t result;
 	uint64_t *data;
@@ -300,11 +346,11 @@ static int scatter_test_run()
 	attr.op = FI_NOOP;
 	attr.datatype = FI_UINT64;
 	attr.mode = 0;
-	err = fi_query_collective(domain, FI_SCATTER, &attr, 0);
-	if (err) {
-		FT_DEBUG("Scatter collective not supported: %d (%s)\n", err,
-			 fi_strerror(err));
-		return err;
+	ret = fi_query_collective(domain, FI_SCATTER, &attr, 0);
+	if (ret) {
+		FT_DEBUG("Scatter collective not supported: %d (%s)\n", ret,
+			 fi_strerror(ret));
+		return ret;
 	}
 
 	data = malloc(data_size);
@@ -317,32 +363,33 @@ static int scatter_test_run()
 
 	coll_addr = fi_mc_addr(coll_mc);
 	if (pm_job.my_rank == root)
-		err = fi_scatter(ep, data, 1, NULL, &result, NULL, coll_addr, root,
+		ret = fi_scatter(ep, data, 1, NULL, &result, NULL, coll_addr, root,
 				 FI_UINT64, 0, &done_flag);
 	else
-		err = fi_scatter(ep, NULL, 1, NULL, &result, NULL, coll_addr, root,
+		ret = fi_scatter(ep, NULL, 1, NULL, &result, NULL, coll_addr, root,
 				 FI_UINT64, 0, &done_flag);
 
-	if (err) {
-		FT_DEBUG("collective scatter failed: %d (%s)\n", err, fi_strerror(err));
-		goto errout;
+	if (ret) {
+		FT_DEBUG("collective scatter failed: %d (%s)\n", ret, fi_strerror(ret));
+		goto out;
 	}
 
-	err = wait_for_comp(&done_flag);
-	if (err)
-		goto errout;
+	ret = wait_for_comp(&done_flag);
+	if (ret)
+		goto out;
 
 	if (data[pm_job.my_rank] != result) {
 		FT_DEBUG("scatter failed; expect: %ld, actual: %ld\n",
 			 data[pm_job.my_rank], result);
-		err = -1;
-		goto errout;
+		ret = -1;
+		goto out;
 	}
-	return FI_SUCCESS;
 
-errout:
+	ret = FI_SUCCESS;
+
+out:
 	free(data);
-	return err;
+	return ret;
 }
 
 static int broadcast_test_run()
@@ -370,8 +417,10 @@ static int broadcast_test_run()
 		return -FI_ENOMEM;
 
 	data = malloc(data_cnt * sizeof(*data));
-	if (!data)
+	if (!data) {
+		free(result);
 		return -FI_ENOMEM;
+	}
 
 	for (i = 0; i < pm_job.num_ranks; ++i) {
 		data[i] = pm_job.num_ranks - 1 - i;
@@ -434,6 +483,12 @@ struct coll_test tests[] = {
 		.run = sum_all_reduce_test_run,
 		.teardown = coll_teardown
 	},
+	{
+		.name = "sum_all_reduce_w_stride_test",
+		.setup = coll_setup_w_stride,
+		.run = sum_all_reduce_test_run,
+		.teardown = coll_teardown
+	},
 	{
 		.name = "all_gather_test",
 		.setup = coll_setup,
@@ -490,7 +545,7 @@ static int multinode_setup_fabric(int argc, char **argv)
 	if (err)
 		return err;
 
-	err = ft_enable_ep(ep, eq, av, txcq, rxcq, txcntr, rxcntr);
+	err = ft_enable_ep(ep);
 	if (err)
 		return err;
 
diff --git a/deps/libfabric/fabtests/prov/efa/Makefile.include b/deps/libfabric/fabtests/prov/efa/Makefile.include
new file mode 100644
index 0000000000000000000000000000000000000000..5dbc39f53bd527da311ce33d247f57a1ac4ee3f5
--- /dev/null
+++ b/deps/libfabric/fabtests/prov/efa/Makefile.include
@@ -0,0 +1,36 @@
+#
+# Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights reserved.
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+
+bin_PROGRAMS += prov/efa/src/fi_efa_ep_rnr_retry
+
+prov_efa_src_fi_efa_ep_rnr_retry_SOURCES = prov/efa/src/rdm_ep_rnr_retry.c
+prov_efa_src_fi_efa_ep_rnr_retry_LDADD =  libfabtests.la
diff --git a/deps/libfabric/fabtests/prov/efa/src/rdm_ep_rnr_retry.c b/deps/libfabric/fabtests/prov/efa/src/rdm_ep_rnr_retry.c
new file mode 100644
index 0000000000000000000000000000000000000000..a242584996d4692b3e356289f8d759a26d4f1a5e
--- /dev/null
+++ b/deps/libfabric/fabtests/prov/efa/src/rdm_ep_rnr_retry.c
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2021, Amazon.com, Inc.  All rights reserved.
+ *
+ * This software is available to you under the BSD license
+ * below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * This program tests the RNR retry counter reset via fi_setopt.
+ * When running the test, use the `-R` option to specify the RNR retry counter.
+ * The valid values are 0 - 7 (7 indicates infinite retry in firmware).
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+
+#include <shared.h>
+#include <rdma/fi_cm.h>
+#include <rdma/fi_ext.h>
+
+static int poll_rnr_cq_error(void)
+{
+	struct fi_cq_data_entry comp;
+	struct fi_cq_err_entry comp_err;
+	int total_send, expected_rnr_error;
+	int ret, i, cnt, rnr_flag;
+
+	rnr_flag = 0;
+	expected_rnr_error = 1;
+	/*
+	 * For the sender to get an RNR error, we must first consume all
+	 * pre-posted receive buffers on the receiver side (the efa provider
+	 * pre-posts fi->rx_attr->size receive buffers); the subsequent
+	 * sends (expected_rnr_error of them) will then get RNR errors.
+	 */
+	total_send = fi->rx_attr->size + expected_rnr_error;
+
+	for (i = 0; i < total_send; i++) {
+		do {
+			ret = fi_send(ep, tx_buf, 32, mr_desc, remote_fi_addr, &tx_ctx);
+			if (ret < 0 && ret != -FI_EAGAIN) {
+				FT_PRINTERR("fi_send", -ret);
+				return ret;
+			}
+		} while (ret == -FI_EAGAIN);
+	}
+
+	cnt = total_send;
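+	/* Drain all send completions; error completions are fetched via fi_cq_readerr(). */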
+	do {
+		ret = fi_cq_read(txcq, &comp, 1);
+		if (ret == 1) {
+			cnt--;
+		} else if (ret == -FI_EAVAIL) {
+			ret = fi_cq_readerr(txcq, &comp_err, FI_SEND);
+			if (ret < 0 && ret != -FI_EAGAIN) {
+				FT_PRINTERR("fi_cq_readerr", -ret);
+				return ret;
+			} else if (ret == 1) {
+				cnt--;
+				if (comp_err.err == FI_ENORX) {
+					rnr_flag = 1;
+					printf("Got RNR error CQ entry as expected: %d, %s\n",
+						comp_err.err, fi_strerror(comp_err.err));
+				} else {
+					printf("Got non-RNR error CQ entry: %d, %s\n",
+						comp_err.err, fi_strerror(comp_err.err));
+					return comp_err.err;
+				}
+			}
+		} else if (ret < 0 && ret != -FI_EAGAIN) {
+			FT_PRINTERR("fi_cq_read", -ret);
+			return ret;
+		}
+	} while (cnt);
+
+	return (rnr_flag) ? 0 : -FI_EINVAL;
+}
+
+static int run(size_t rnr_retry)
+{
+	int ret;
+
+	ret = ft_init();
+	if (ret) {
+		FT_PRINTERR("ft_init", -ret);
+		return ret;
+	}
+
+	ret = ft_init_oob();
+	if (ret) {
+		FT_PRINTERR("ft_init_oob", -ret);
+		return ret;
+	}
+
+	ret = ft_getinfo(hints, &fi);
+	if (ret) {
+		FT_PRINTERR("ft_getinfo", -ret);
+		return ret;
+	}
+
+	ret = ft_open_fabric_res();
+	if (ret) {
+		FT_PRINTERR("ft_open_fabric_res", -ret);
+		return ret;
+	}
+
+	ret = ft_alloc_active_res(fi);
+	if (ret) {
+		FT_PRINTERR("ft_alloc_active_res", -ret);
+		return ret;
+	}
+
+	fprintf(stdout, "Setting RNR retry count to %zu ...\n", rnr_retry);
+	ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_EFA_RNR_RETRY, &rnr_retry, sizeof(rnr_retry));
+	if (ret) {
+		FT_PRINTERR("fi_setopt", -ret);
+		return ret;
+	}
+	fprintf(stdout, "RNR retry count has been set to %zu.\n", rnr_retry);
+
+	ret = ft_enable_ep(ep);
+	if (ret) {
+		FT_PRINTERR("ft_enable_ep_recv", -ret);
+		return ret;
+	}
+
+	ret = ft_init_av();
+	if (ret) {
+		FT_PRINTERR("ft_init_av", -ret);
+		return ret;
+	}
+	/* The client does fi_send and then polls the CQ for the error (FI_ENORX) CQ entry. */
+	if (opts.dst_addr) {
+		ret = poll_rnr_cq_error();
+		if (ret) {
+			FT_PRINTERR("pingpong", -ret);
+			return ret;
+		}
+	}
+
+	/*
+	 * To get an RNR error on the client side, the server must not close its
+	 * endpoint while the client is still sending.
+	 * ft_reset_oob() re-initializes the OOB sync between server and client;
+	 * calling it here ensures the client has finished sending and that both
+	 * sides are ready to close their endpoints and free resources.
+	 */
+	ret = ft_reset_oob();
+	if (ret) {
+		FT_PRINTERR("ft_reset_oob", -ret);
+		return ret;
+	}
+	ret = ft_close_oob();
+	if (ret) {
+		FT_PRINTERR("ft_close_oob", -ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void print_opts_usage(char *name, char *desc)
+{
+	ft_usage(name, desc);
+	/* rdm_ep_rnr_retry test usage */
+	FT_PRINT_OPTS_USAGE("-R <number>", "RNR retry count (valid values: 0-7, default: 1)");
+}
+
+int main(int argc, char **argv)
+{
+	int op, ret;
+	size_t rnr_retry;
+
+	rnr_retry = 1;
+	opts = INIT_OPTS;
+	opts.options |= FT_OPT_SIZE;
+
+	hints = fi_allocinfo();
+	if (!hints)
+		return EXIT_FAILURE;
+
+	while ((op = getopt(argc, argv, "R:h" ADDR_OPTS INFO_OPTS CS_OPTS)) != -1) {
+		switch (op) {
+		default:
+			ft_parse_addr_opts(op, optarg, &opts);
+			ft_parseinfo(op, optarg, hints, &opts);
+			ft_parsecsopts(op, optarg, &opts);
+			break;
+		case 'R':
+			rnr_retry = atoi(optarg);
+			if (rnr_retry > 7) {
+				fprintf(stdout, "RNR retry count invalid, it must be 0-7.\n");
+				return EXIT_FAILURE;
+			}
+			break;
+		case '?':
+		case 'h':
+			print_opts_usage(argv[0], "RDM RNR retry counter reset test");
+			return EXIT_FAILURE;
+		}
+	}
+
+	if (optind < argc)
+		opts.dst_addr = argv[optind];
+
+	hints->ep_attr->type = FI_EP_RDM;
+	hints->caps = FI_MSG;
+	/*
+	 * The RNR error is generated by the EFA device, so disable shm transfer
+	 * by setting FI_REMOTE_COMM and unsetting FI_LOCAL_COMM, ensuring the
+	 * EFA device is used even when running this test on a single node.
+	 */
+	hints->caps |= FI_REMOTE_COMM;
+	hints->caps &= ~FI_LOCAL_COMM;
+	hints->mode |= FI_CONTEXT;
+	hints->domain_attr->mr_mode = opts.mr_mode;
+	/*
+	 * FI_RM_DISABLED must be set for the RNR error CQ entry to be written
+	 * to the application; otherwise, packets that fail with an RNR error
+	 * are queued and resent.
+	 */
+	hints->domain_attr->resource_mgmt = FI_RM_DISABLED;
+
+	ret = run(rnr_retry);
+	if (ret)
+		FT_PRINTERR("run", -ret);
+
+	ft_free_res();
+	return ft_exit_code(ret);
+}
diff --git a/deps/libfabric/fabtests/scripts/runfabtests.cmd b/deps/libfabric/fabtests/scripts/runfabtests.cmd
index eb74c5215997669785999fce9776f9929eaefa82..d6f0a957be1e1a2895a57036b56d8bfc17216f0c 100644
--- a/deps/libfabric/fabtests/scripts/runfabtests.cmd
+++ b/deps/libfabric/fabtests/scripts/runfabtests.cmd
@@ -24,7 +24,7 @@ set functional_tests=^
 	"poll -t queue"^
 	"poll -t counter"^
 	"rdm"^
-	"rdm_rma_simple"^
+	"rdm_rma_event"^
 	"rdm_rma_trigger"^
 	"rdm_tagged_peek"^
 	"bw -e rdm -v -T 1"^
diff --git a/deps/libfabric/fabtests/scripts/runfabtests.sh b/deps/libfabric/fabtests/scripts/runfabtests.sh
index 6f74d53ab926e2cd07eb3260529efc5ae16161f8..dafd831fa0ac52b3a4790146228609d41e47cef0 100755
--- a/deps/libfabric/fabtests/scripts/runfabtests.sh
+++ b/deps/libfabric/fabtests/scripts/runfabtests.sh
@@ -61,6 +61,7 @@ declare FORK=0
 declare OOB=0
 declare C_ARGS=""
 declare S_ARGS=""
+declare PROVIDER_TESTS=0
 
 declare cur_excludes=""
 declare file_excludes=""
@@ -109,7 +110,8 @@ functional_tests=(
 	"fi_poll -t queue"
 	"fi_poll -t counter"
 	"fi_rdm"
-	"fi_rdm_rma_simple"
+	"fi_rdm -U"
+	"fi_rdm_rma_event"
 	"fi_rdm_rma_trigger"
 	"fi_shared_ctx"
 	"fi_shared_ctx --no-tx-shared-ctx"
@@ -128,8 +130,6 @@ functional_tests=(
 	"fi_recv_cancel -e rdm -V"
 	"fi_unexpected_msg -e msg -i 10"
 	"fi_unexpected_msg -e rdm -i 10"
-	"fi_unexpected_msg -e msg -S -i 10"
-	"fi_unexpected_msg -e rdm -S -i 10"
 	"fi_inj_complete -e msg"
 	"fi_inj_complete -e rdm"
 	"fi_inj_complete -e dgram"
@@ -137,7 +137,10 @@ functional_tests=(
 	"fi_inj_complete -e rdm -SR"
 	"fi_inj_complete -e dgram -SR"
 	"fi_bw -e rdm -v -T 1"
+	"fi_bw -e rdm -v -T 1 -U"
 	"fi_bw -e msg -v -T 1"
+	"fi_rdm_multi_client -C 10 -I 5"
+	"fi_rdm_multi_client -C 10 -I 5 -U"
 )
 
 short_tests=(
@@ -149,18 +152,28 @@ short_tests=(
 	"fi_rma_bw -e msg -o read -I 5"
 	"fi_rma_bw -e msg -o writedata -I 5"
 	"fi_rma_bw -e rdm -o write -I 5"
+	"fi_rma_bw -e rdm -o write -I 5 -U"
 	"fi_rma_bw -e rdm -o read -I 5"
+	"fi_rma_bw -e rdm -o read -I 5 -U"
 	"fi_rma_bw -e rdm -o writedata -I 5"
+	"fi_rma_bw -e rdm -o writedata -I 5 -U"
 	"fi_rdm_atomic -I 5 -o all"
+	"fi_rdm_atomic -I 5 -o all -U"
 	"fi_rdm_cntr_pingpong -I 5"
 	"fi_multi_recv -e rdm -I 5"
 	"fi_multi_recv -e msg -I 5"
 	"fi_rdm_pingpong -I 5"
+	"fi_rdm_pingpong -I 5 -U"
 	"fi_rdm_pingpong -I 5 -v"
+	"fi_rdm_pingpong -I 5 -v -U"
 	"fi_rdm_tagged_pingpong -I 5"
+	"fi_rdm_tagged_pingpong -I 5 -U"
 	"fi_rdm_tagged_pingpong -I 5 -v"
+	"fi_rdm_tagged_pingpong -I 5 -v -U"
 	"fi_rdm_tagged_bw -I 5"
+	"fi_rdm_tagged_bw -I 5 -U"
 	"fi_rdm_tagged_bw -I 5 -v"
+	"fi_rdm_tagged_bw -I 5 -v -U"
 	"fi_dgram_pingpong -I 5"
 )
 
@@ -175,20 +188,32 @@ standard_tests=(
 	"fi_rma_bw -e msg -o read"
 	"fi_rma_bw -e msg -o writedata"
 	"fi_rma_bw -e rdm -o write"
+	"fi_rma_bw -e rdm -o write -U"
 	"fi_rma_bw -e rdm -o read"
+	"fi_rma_bw -e rdm -o read -U"
 	"fi_rma_bw -e rdm -o writedata"
+	"fi_rma_bw -e rdm -o writedata -U"
 	"fi_rdm_atomic -o all -I 1000"
+	"fi_rdm_atomic -o all -I 1000 -U"
 	"fi_rdm_cntr_pingpong"
 	"fi_multi_recv -e rdm"
 	"fi_multi_recv -e msg"
 	"fi_rdm_pingpong"
+	"fi_rdm_pingpong -U"
 	"fi_rdm_pingpong -v"
+	"fi_rdm_pingpong -v -U"
 	"fi_rdm_pingpong -k"
+	"fi_rdm_pingpong -k -U"
 	"fi_rdm_pingpong -k -v"
+	"fi_rdm_pingpong -k -v -U"
 	"fi_rdm_tagged_pingpong"
+	"fi_rdm_tagged_pingpong -U"
 	"fi_rdm_tagged_pingpong -v"
+	"fi_rdm_tagged_pingpong -v -U"
 	"fi_rdm_tagged_bw"
+	"fi_rdm_tagged_bw -U"
 	"fi_rdm_tagged_bw -v"
+	"fi_rdm_tagged_bw -v -U"
 	"fi_dgram_pingpong"
 	"fi_dgram_pingpong -k"
 )
@@ -213,6 +238,10 @@ multinode_tests=(
 	"fi_multinode_coll"
 )
 
+prov_efa_tests=(
+	"fi_efa_ep_rnr_retry -R 0"
+)
+
 function errcho {
 	>&2 echo $*
 }
@@ -454,7 +483,12 @@ function cs_test {
 	wait $c_pid
 	c_ret=$?
 
-	[[ c_ret -ne 0 ]] && kill -9 $s_pid 2> /dev/null
+	if [[ $c_ret -ne 0 ]] && ps -p $s_pid > /dev/null; then
+	    if [[ $STRICT_MODE -eq 0 ]]; then
+	        sleep 2
+	    fi
+	    kill -9 $s_pid 2> /dev/null
+	fi
 
 	wait $s_pid
 	s_ret=$?
@@ -483,6 +517,10 @@ function set_cfg_file {
 	local parent=$UTIL
 	local name=$CORE
 
+	if [[ ! -z "$COMPLEX_CFG" ]]; then
+		return
+	fi
+
 	if [ -z $UTIL ]; then
 		parent=$CORE
 		name=$1
@@ -510,9 +548,9 @@ function complex_test {
 	local end_time
 	local test_time
 
-	is_excluded "$test" && return
+	set_cfg_file $config
 	if [[ -z "$COMPLEX_CFG" ]]; then
-		set_cfg_file $config
+		is_excluded "$test" && return
 	fi
 
 	start_time=$(date '+%s')
@@ -527,12 +565,12 @@ function complex_test {
 		opts+=" -E"
 	fi
 
-	s_cmd="${BIN_PATH}${test_exe} -x $opts"
+	s_cmd="${BIN_PATH}${test_exe} ${S_ARGS} -x $opts"
 	FI_LOG_LEVEL=error ${SERVER_CMD} "${EXPORT_ENV} $s_cmd" &> $s_outp &
 	s_pid=$!
 	sleep 1
 
-	c_cmd="${BIN_PATH}${test_exe} -u "${COMPLEX_CFG}" $S_INTERFACE $opts"
+	c_cmd="${BIN_PATH}${test_exe} ${C_ARGS} -u "${COMPLEX_CFG}" $S_INTERFACE $opts"
 	FI_LOG_LEVEL=error ${CLIENT_CMD} "${EXPORT_ENV} $c_cmd" &> $c_outp &
 	c_pid=$!
 
@@ -580,7 +618,7 @@ function multinode_test {
 	local c_ret=0
 	local c_out_arr=()
 	local num_procs=$2
-	local test_exe="${test} -n $num_procs -p \"${PROV}\"" 	
+	local test_exe="${test} -n $num_procs -p \"${PROV}\""
 	local c_out
 	local start_time
 	local end_time
@@ -589,18 +627,18 @@ function multinode_test {
 	is_excluded "$test" && return
 
 	start_time=$(date '+%s')
-	
+
 	s_cmd="${BIN_PATH}${test_exe} ${S_ARGS} -s ${S_INTERFACE}"
 	${SERVER_CMD} "${EXPORT_ENV} $s_cmd" &> $s_outp &
 	s_pid=$!
 	sleep 1
-	
-	c_pid_arr=()	
+
+	c_pid_arr=()
 	for ((i=1; i<num_procs; i++))
 	do
 		local c_out=$(mktemp fabtests.c_outp${i}.XXXXXX)
 		c_cmd="${BIN_PATH}${test_exe} ${S_ARGS} -s ${S_INTERFACE}"
-		${CLIENT_CMD} "${EXPORT_ENV} $c_cmd" &> $c_out & 
+		${CLIENT_CMD} "${EXPORT_ENV} $c_cmd" &> $c_out &
 		c_pid_arr+=($!)
 		c_out_arr+=($c_out)
 	done
@@ -609,21 +647,26 @@ function multinode_test {
 		wait $pid
 		c_ret=($?)||$c_ret
 	done
-	
-	[[ c_ret -ne 0 ]] && kill -9 $s_pid 2> /dev/null
+
+	if [[ $c_ret -ne 0 ]] && ps -p $s_pid > /dev/null; then
+	    if [[ $STRICT_MODE -eq 0 ]]; then
+	        sleep 2
+	    fi
+	    kill -9 $s_pid 2> /dev/null
+	fi
 
 	wait $s_pid
 	s_ret=$?
 	echo "server finished"
-	
+
 	end_time=$(date '+%s')
 	test_time=$(compute_duration "$start_time" "$end_time")
-	
+
 	pe=1
 	if [[ $STRICT_MODE -eq 0 && $s_ret -eq $FI_ENODATA && $c_ret -eq $FI_ENODATA ]] ||
 	   [[ $STRICT_MODE -eq 0 && $s_ret -eq $FI_ENOSYS && $c_ret -eq $FI_ENOSYS ]]; then
 		print_results "$test_exe" "Notrun" "$test_time" "$s_outp" "$s_cmd" "" "$c_cmd"
-		for c_out in "${c_out_arr[@]}" 
+		for c_out in "${c_out_arr[@]}"
 		do
 			printf -- "  client_stdout $pe: |\n"
 			sed -e 's/^/    /' < $c_out
@@ -632,7 +675,7 @@ function multinode_test {
 		skip_count+=1
 	elif [ $s_ret -ne 0 -o $c_ret -ne 0 ]; then
 		print_results "$test_exe" "Fail" "$test_time" "$s_outp" "$s_cmd" "" "$c_cmd"
-		for c_out in "${c_out_arr[@]}" 
+		for c_out in "${c_out_arr[@]}"
 		do
 			printf -- "  client_stdout $pe: |\n"
 			sed -e 's/^/    /' < $c_out
@@ -644,7 +687,7 @@ function multinode_test {
 		fail_count+=1
 	else
 		print_results "$test_exe" "Pass" "$test_time" "$s_outp" "$s_cmd" "" "$c_cmd"
-		for c_out in "${c_out_arr[@]}" 
+		for c_out in "${c_out_arr[@]}"
 		do
 			printf -- "  client_stdout $pe: |\n"
 			sed -e 's/^/    /' < $c_out
@@ -654,6 +697,12 @@ function multinode_test {
 	fi
 }
 
+function prov_efa_test {
+	for test in "${prov_efa_tests[@]}"; do
+		cs_test "$test"
+	done
+}
+
 function set_core_util {
 	prov_arr=$(echo $PROV | tr ";" " ")
 	CORE=""
@@ -675,7 +724,7 @@ function main {
 
 	set_core_util
 	set_excludes
-	
+
 
 	if [[ $1 == "quick" ]]; then
 		local -r tests="unit functional short"
@@ -739,6 +788,10 @@ function main {
 	esac
 	done
 
+	if [[ $PROVIDER_TESTS -eq 1 ]]; then
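+		# -P enables provider-specific tests, dispatched by provider name.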
+		prov_${PROV}_test
+	fi
+
 	total=$(( $pass_count + $fail_count ))
 
 	print_border
@@ -774,6 +827,7 @@ function usage {
 			 regex patterns e.g. \"dgram,rma.*write\""
 	errcho -e " -E\texport provided variable name and value to ssh client and server processes.
 			 options must of of the form '-E var=value'"
+	errcho -e " -U\trun fabtests with FI_DELIVERY_COMPLETE set"
 	errcho -e " -f\texclude tests file: File containing list of test names /
 			 regex patterns to exclude (one per line)"
 	errcho -e " -N\tskip negative unit tests"
@@ -786,10 +840,11 @@ function usage {
 	errcho -e " -C\tAdditional client test arguments: Parameters to pass to client fabtests"
 	errcho -e " -L\tAdditional server test arguments: Parameters to pass to server fabtests"
 	errcho -e " -b\tenable out-of-band address exchange over the default port"
+	errcho -e " -P\tRun provider specific tests"
 	exit 1
 }
 
-while getopts ":vt:p:g:e:f:c:s:u:T:C:L:NRSbkE:" opt; do
+while getopts ":vt:p:g:e:f:c:s:u:T:C:L:NRSbkPE:" opt; do
 case ${opt} in
 	t) TEST_TYPE=$OPTARG
 	;;
@@ -814,6 +869,8 @@ case ${opt} in
 	;;
 	N) SKIP_NEG+=1
 	;;
+	P) PROVIDER_TESTS=1
+	;;
 	R)
 	;;
 	S) STRICT_MODE=1
diff --git a/deps/libfabric/fabtests/test_configs/efa/efa.exclude b/deps/libfabric/fabtests/test_configs/efa/efa.exclude
index 8ec42b4c28a320d01b0c1153eecc24df7dc69e3d..ddf05a611631415d0770dae74c7f5c4c58c2ad2f 100644
--- a/deps/libfabric/fabtests/test_configs/efa/efa.exclude
+++ b/deps/libfabric/fabtests/test_configs/efa/efa.exclude
@@ -38,9 +38,6 @@
 # Exclude cq_data test until fi_pool/fi_wait is supported
 cq_data
 
-# Exclude all rdm prefix tests
-rdm.* -k
-
 multi_mr
 rdm_rma_trigger
 
@@ -61,9 +58,6 @@ trigger
 #rdm_cntr_pingpong
 
 
-# This test requires ENA IPs for the OOB sync
-av_xfer
-
 # Connection manager isn't supported
 cm_data
 
@@ -96,3 +90,4 @@ dgram_bw
 
 # Multinode tests failing with an unsupported address format
 multinode
+
diff --git a/deps/libfabric/fabtests/test_configs/ofi_rxm/ofi_rxm.exclude b/deps/libfabric/fabtests/test_configs/ofi_rxm/ofi_rxm.exclude
index 648ca0043335bd7f799120ff08119da6655632fc..ec8121b07847fbe964a12e806d89aced1b3386ad 100644
--- a/deps/libfabric/fabtests/test_configs/ofi_rxm/ofi_rxm.exclude
+++ b/deps/libfabric/fabtests/test_configs/ofi_rxm/ofi_rxm.exclude
@@ -9,7 +9,7 @@
 -e dgram
 
 cm_data
-rdm_rma_simple
+rdm_rma_event
 trigger
 shared_ctx
 scalable_ep
diff --git a/deps/libfabric/fabtests/test_configs/ofi_rxm/tcp.test b/deps/libfabric/fabtests/test_configs/ofi_rxm/tcp.test
index baec15f19728cd14abf7cf24afcb1aa90ce6e2b7..6087f7ed58817338bcd2e0a26b011ab4f781efd1 100644
--- a/deps/libfabric/fabtests/test_configs/ofi_rxm/tcp.test
+++ b/deps/libfabric/fabtests/test_configs/ofi_rxm/tcp.test
@@ -1,26 +1,190 @@
 {
 	prov_name: tcp;ofi_rxm,
 	test_type: [
+		FT_TEST_UNIT,
 		FT_TEST_LATENCY,
 		FT_TEST_BANDWIDTH,
 	],
+	test_class: [
+		FT_CAP_MSG,
+		FT_CAP_TAGGED,
+	],
 	class_function: [
 		FT_FUNC_SEND,
 		FT_FUNC_SENDV,
-		FT_FUNC_SENDDATA,
+		FT_FUNC_SENDMSG,
 		FT_FUNC_INJECT,
 		FT_FUNC_INJECTDATA,
+		FT_FUNC_SENDDATA,
 	],
 	ep_type: [
 		FI_EP_RDM,
 	],
 	comp_type: [
 		FT_COMP_QUEUE,
+		FT_COMP_CNTR,
+	],
+	mr_mode: [],
+	progress: [
+		FI_PROGRESS_MANUAL,
+		FI_PROGRESS_AUTO
+	],
+	test_flags: [
+		FT_FLAG_QUICKTEST
+	],
+},
+{
+	prov_name: tcp;ofi_rxm,
+	test_type: [
+		FT_TEST_UNIT,
+		FT_TEST_LATENCY,
+		FT_TEST_BANDWIDTH
+	],
+	test_class: [
+		FT_CAP_RMA,
+	],
+	class_function: [
+		FT_FUNC_WRITE,
+		FT_FUNC_WRITEV,
+		FT_FUNC_WRITEMSG,
+		FT_FUNC_WRITEDATA,
+		FT_FUNC_INJECT_WRITE,
+		FT_FUNC_INJECT_WRITEDATA,
+		FT_FUNC_READ,
+		FT_FUNC_READV,
+		FT_FUNC_READMSG,
+	],
+	ep_type: [
+		FI_EP_RDM,
+	],
+	comp_type: [
+		FT_COMP_QUEUE,
+		FT_COMP_CNTR,
+	],
+	mr_mode: [],
+	progress: [
+		FI_PROGRESS_MANUAL,
+		FI_PROGRESS_AUTO
+	],
+	test_flags: [
+		FT_FLAG_QUICKTEST
+	],
+},
+{
+	prov_name: tcp;ofi_rxm,
+	test_type: [
+		FT_TEST_UNIT,
+		FT_TEST_LATENCY,
+		FT_TEST_BANDWIDTH
+	],
+	test_class: [
+		FT_CAP_ATOMIC,
+	],
+	class_function: [
+		FT_FUNC_ATOMIC,
+		FT_FUNC_ATOMICV,
+		FT_FUNC_ATOMICMSG,
+		FT_FUNC_INJECT_ATOMIC,
+		FT_FUNC_FETCH_ATOMIC,
+		FT_FUNC_FETCH_ATOMICV,
+		FT_FUNC_FETCH_ATOMICMSG,
+		FT_FUNC_COMPARE_ATOMIC,
+		FT_FUNC_COMPARE_ATOMICV,
+		FT_FUNC_COMPARE_ATOMICMSG
+	],
+	ep_type: [
+		FI_EP_RDM,
+	],
+	comp_type: [
+		FT_COMP_QUEUE,
+	],
+	mr_mode: [],
+	progress: [
+		FI_PROGRESS_MANUAL,
+		FI_PROGRESS_AUTO
+	],
+	test_flags: [
+		FT_FLAG_QUICKTEST
+	],
+},
+{
+	prov_name: tcp;ofi_rxm,
+	test_type: [
+		FT_TEST_LATENCY,
+	],
+	test_class: [
+		FT_CAP_TAGGED,
+	],
+	ep_type: [
+		FI_EP_RDM,
+	],
+	comp_type: [
+		FT_COMP_QUEUE,
+	],
+	eq_wait_obj: [
+		FI_WAIT_NONE,
+		FI_WAIT_UNSPEC,
+		FI_WAIT_FD
+	],
+	mr_mode: [],
+	progress: [
+		FI_PROGRESS_MANUAL
+	],
+	test_flags: [
+		FT_FLAG_QUICKTEST
+	],
+},
+{
+	prov_name: tcp;ofi_rxm,
+	test_type: [
+		FT_TEST_LATENCY,
 	],
 	test_class: [
-		FT_CAP_MSG,
 		FT_CAP_TAGGED,
 	],
-	mr_mode: [FI_MR_LOCAL, FI_MR_VIRT_ADDR, FI_MR_ALLOCATED, FI_MR_PROV_KEY],
-	progress: [FI_PROGRESS_MANUAL, FI_PROGRESS_AUTO],
+	ep_type: [
+		FI_EP_RDM,
+	],
+	comp_type: [
+		FT_COMP_QUEUE,
+	],
+	cq_wait_obj: [
+		FI_WAIT_NONE,
+		FI_WAIT_UNSPEC,
+		FI_WAIT_FD
+	],
+	mr_mode: [],
+	progress: [
+		FI_PROGRESS_MANUAL
+	],
+	test_flags: [
+		FT_FLAG_QUICKTEST
+	],
+},
+{
+	prov_name: tcp;ofi_rxm,
+	test_type: [
+		FT_TEST_LATENCY,
+	],
+	test_class: [
+		FT_CAP_TAGGED,
+	],
+	ep_type: [
+		FI_EP_RDM,
+	],
+	comp_type: [
+		FT_COMP_CNTR,
+	],
+	cntr_wait_obj: [
+		FI_WAIT_NONE,
+		FI_WAIT_UNSPEC,
+		FI_WAIT_FD
+	],
+	mr_mode: [],
+	progress: [
+		FI_PROGRESS_MANUAL
+	],
+	test_flags: [
+		FT_FLAG_QUICKTEST
+	],
 },
diff --git a/deps/libfabric/fabtests/test_configs/psm2/psm2.exclude b/deps/libfabric/fabtests/test_configs/psm2/psm2.exclude
index 30303e022739e0b9ce883463ccd73b9b6748267f..36ec1495927f8f3fbe306894e3cabec2e8dba489 100644
--- a/deps/libfabric/fabtests/test_configs/psm2/psm2.exclude
+++ b/deps/libfabric/fabtests/test_configs/psm2/psm2.exclude
@@ -15,3 +15,4 @@ scalable_ep
 shared_av
 rdm_cntr_pingpong
 multi_recv
+rdm_multi_client
diff --git a/deps/libfabric/fabtests/test_configs/psm3/all.test b/deps/libfabric/fabtests/test_configs/psm3/all.test
new file mode 100644
index 0000000000000000000000000000000000000000..f18a9da9306d7785595a22cb5d30dfd723501e4b
--- /dev/null
+++ b/deps/libfabric/fabtests/test_configs/psm3/all.test
@@ -0,0 +1,120 @@
+{
+	prov_name: psm3,
+	test_type: [
+		FT_TEST_LATENCY,
+	],
+	class_function: [
+		FT_FUNC_SEND,
+		FT_FUNC_SENDV,
+		FT_FUNC_SENDMSG,
+		FT_FUNC_INJECT,
+	],
+	ep_type: [
+		FI_EP_RDM
+	],
+	av_type: [
+		FI_AV_TABLE,
+		FI_AV_MAP,
+	],
+	comp_type: [
+		FT_COMP_QUEUE,
+	],
+	mode: [
+		FI_CONTEXT,
+	],
+	test_class: [
+		FT_CAP_MSG,
+		FT_CAP_TAGGED,
+	],
+	test_flags: FT_FLAG_QUICKTEST
+},
+{
+	prov_name: psm3,
+	test_type: [
+		FT_TEST_BANDWIDTH,
+	],
+	class_function: [
+		FT_FUNC_SEND,
+		FT_FUNC_INJECT,
+	],
+	ep_type: [
+		FI_EP_RDM
+	],
+	av_type: [
+		FI_AV_TABLE,
+		FI_AV_MAP,
+	],
+	comp_type: [
+		FT_COMP_QUEUE,
+	],
+	mode: [
+		FI_CONTEXT,
+	],
+	test_class: [
+		FT_CAP_MSG,
+		FT_CAP_TAGGED,
+	],
+	test_flags: FT_FLAG_QUICKTEST
+},
+{
+	prov_name: psm3,
+	test_type: [
+		FT_TEST_LATENCY,
+		FT_TEST_BANDWIDTH,
+	],
+	class_function: [
+		FT_FUNC_SENDDATA,
+		FT_FUNC_INJECTDATA,
+	],
+	ep_type: [
+		FI_EP_RDM
+	],
+	av_type: [
+		FI_AV_TABLE,
+		FI_AV_MAP,
+	],
+	comp_type: [
+		FT_COMP_QUEUE,
+	],
+	mode: [
+		FI_CONTEXT,
+	],
+	test_class: [
+		FT_CAP_MSG,
+	],
+	test_flags: FT_FLAG_QUICKTEST
+},
+{
+	prov_name: psm3,
+	test_type: [
+		FT_TEST_LATENCY,
+		FT_TEST_BANDWIDTH,
+	],
+	class_function: [
+		FT_FUNC_WRITE,
+		FT_FUNC_WRITEV,
+		FT_FUNC_WRITEMSG,
+		FT_FUNC_INJECT_WRITE,
+		FT_FUNC_WRITEDATA,
+		FT_FUNC_READ,
+		FT_FUNC_READV,
+		FT_FUNC_READMSG,
+	],
+	ep_type: [
+		FI_EP_RDM
+	],
+	av_type: [
+		FI_AV_TABLE,
+		FI_AV_MAP,
+	],
+	comp_type: [
+		FT_COMP_QUEUE,
+	],
+	mode: [
+		FI_CONTEXT,
+	],
+	test_class: [
+		FT_CAP_RMA,
+	],
+	test_flags: FT_FLAG_QUICKTEST
+},
diff --git a/deps/libfabric/fabtests/test_configs/psm3/psm3.exclude b/deps/libfabric/fabtests/test_configs/psm3/psm3.exclude
new file mode 100644
index 0000000000000000000000000000000000000000..30303e022739e0b9ce883463ccd73b9b6748267f
--- /dev/null
+++ b/deps/libfabric/fabtests/test_configs/psm3/psm3.exclude
@@ -0,0 +1,17 @@
+# Regex patterns of tests to exclude in runfabtests.sh
+
+# Exclude all prefix tests
+-k
+
+# av_test supports only FI_SOCKADDR
+av_test
+
+^fi_msg
+-e msg
+
+cm_data
+shared_ctx
+scalable_ep
+shared_av
+rdm_cntr_pingpong
+multi_recv
diff --git a/deps/libfabric/fabtests/test_configs/psm3/verify.test b/deps/libfabric/fabtests/test_configs/psm3/verify.test
new file mode 100644
index 0000000000000000000000000000000000000000..eb2d45261e450ce506223a2c90d95f2f72e3f24e
--- /dev/null
+++ b/deps/libfabric/fabtests/test_configs/psm3/verify.test
@@ -0,0 +1,246 @@
+{
+	prov_name: psm3,
+	test_type: [
+		FT_TEST_UNIT,
+	],
+	class_function: [
+		FT_FUNC_SEND,
+		FT_FUNC_SENDV,
+		FT_FUNC_SENDMSG,
+		FT_FUNC_INJECT,
+	],
+	ep_type: [
+		FI_EP_RDM
+	],
+	av_type: [
+		FI_AV_TABLE
+	],
+	comp_type: [
+		FT_COMP_QUEUE,
+	],
+	mode: [
+		FI_CONTEXT,
+	],
+	test_class: [
+		FT_CAP_MSG,
+		FT_CAP_TAGGED,
+	],
+	test_flags: FT_FLAG_QUICKTEST
+},
+{
+	prov_name: psm3,
+	test_type: [
+		FT_TEST_UNIT,
+	],
+	class_function: [
+		FT_FUNC_SENDDATA,
+		FT_FUNC_INJECTDATA,
+	],
+	ep_type: [
+		FI_EP_RDM
+	],
+	av_type: [
+		FI_AV_TABLE
+	],
+	comp_type: [
+		FT_COMP_QUEUE,
+	],
+	mode: [
+		FI_CONTEXT,
+	],
+	test_class: [
+		FT_CAP_MSG,
+	],
+	test_flags: FT_FLAG_QUICKTEST
+},
+{
+	prov_name: psm3,
+	test_type: [
+		FT_TEST_UNIT,
+	],
+	class_function: [
+		FT_FUNC_WRITE,
+		FT_FUNC_WRITEV,
+		FT_FUNC_WRITEMSG,
+		FT_FUNC_INJECT_WRITE,
+		FT_FUNC_WRITEDATA,
+		FT_FUNC_READ,
+		FT_FUNC_READV,
+		FT_FUNC_READMSG,
+	],
+	ep_type: [
+		FI_EP_RDM
+	],
+	av_type: [
+		FI_AV_TABLE
+	],
+	comp_type: [
+		FT_COMP_QUEUE,
+	],
+	mode: [
+		FI_CONTEXT,
+	],
+	test_class: [
+		FT_CAP_RMA,
+	],
+	test_flags: FT_FLAG_QUICKTEST
+},
+{
+	prov_name: psm3,
+	test_type: [
+		FT_TEST_UNIT,
+	],
+	class_function: [
+		FT_FUNC_ATOMIC,
+		FT_FUNC_ATOMICV,
+		FT_FUNC_ATOMICMSG,
+		FT_FUNC_FETCH_ATOMIC,
+		FT_FUNC_FETCH_ATOMICV,
+		FT_FUNC_FETCH_ATOMICMSG,
+		FT_FUNC_INJECT_ATOMIC,
+	],
+	op:[
+		FI_MIN,
+		FI_MAX,
+		FI_SUM,
+		FI_PROD,
+		FI_LOR,
+		FI_LAND,
+		FI_BOR,
+		FI_BAND,
+		FI_LXOR,
+		FI_BXOR,
+		FI_ATOMIC_WRITE,
+	],
+	datatype:[
+		FI_INT8,
+		FI_UINT8,
+		FI_INT16,
+		FI_UINT16,
+		FI_INT32,
+		FI_UINT32,
+		FI_INT64,
+		FI_UINT64,
+		FI_FLOAT,
+		FI_DOUBLE,
+		FI_LONG_DOUBLE,
+		FI_FLOAT_COMPLEX,
+		FI_DOUBLE_COMPLEX,
+		FI_LONG_DOUBLE_COMPLEX,
+	],
+	ep_type: [
+		FI_EP_RDM,
+	],
+	av_type: [
+		FI_AV_TABLE,
+	],
+	comp_type: [
+		FT_COMP_QUEUE,
+	],
+	mode: [
+		FI_CONTEXT,
+	],
+	test_class: [
+		FT_CAP_ATOMIC,
+	],
+	test_flags: FT_FLAG_QUICKTEST
+},
+{
+	prov_name: psm3,
+	test_type: [
+		FT_TEST_UNIT,
+	],
+	class_function: [
+		FT_FUNC_FETCH_ATOMIC,
+		FT_FUNC_FETCH_ATOMICV,
+		FT_FUNC_FETCH_ATOMICMSG,
+	],
+	op:[
+		FI_ATOMIC_READ,
+	],
+	datatype:[
+		FI_INT8,
+		FI_UINT8,
+		FI_INT16,
+		FI_UINT16,
+		FI_INT32,
+		FI_UINT32,
+		FI_INT64,
+		FI_UINT64,
+		FI_FLOAT,
+		FI_DOUBLE,
+		FI_LONG_DOUBLE,
+		FI_FLOAT_COMPLEX,
+		FI_DOUBLE_COMPLEX,
+		FI_LONG_DOUBLE_COMPLEX,
+	],
+	ep_type: [
+		FI_EP_RDM,
+	],
+	av_type: [
+		FI_AV_TABLE,
+	],
+	comp_type: [
+		FT_COMP_QUEUE,
+		FT_COMP_CNTR,
+	],
+	mode: [
+		FI_CONTEXT,
+	],
+	test_class: [
+		FT_CAP_ATOMIC,
+	],
+	test_flags: FT_FLAG_QUICKTEST
+},
+{
+	prov_name: psm3,
+	test_type: [
+		FT_TEST_UNIT,
+	],
+	class_function: [
+		FT_FUNC_COMPARE_ATOMIC,
+		FT_FUNC_COMPARE_ATOMICV,
+		FT_FUNC_COMPARE_ATOMICMSG,
+	],
+	op:[
+		FI_CSWAP,
+		FI_CSWAP_NE,
+		FI_CSWAP_LE,
+		FI_CSWAP_LT,
+		FI_CSWAP_GE,
+		FI_CSWAP_GT,
+		FI_MSWAP,
+	],
+	datatype:[
+		FI_INT8,
+		FI_UINT8,
+		FI_INT16,
+		FI_UINT16,
+		FI_INT32,
+		FI_UINT32,
+		FI_INT64,
+		FI_UINT64,
+		FI_FLOAT,
+		FI_DOUBLE,
+		FI_LONG_DOUBLE,
+		FI_FLOAT_COMPLEX,
+		FI_DOUBLE_COMPLEX,
+		FI_LONG_DOUBLE_COMPLEX,
+	],
+	ep_type: [
+		FI_EP_RDM,
+	],
+	av_type: [
+		FI_AV_TABLE,
+	],
+	comp_type: [
+		FT_COMP_QUEUE,
+	],
+	mode: [
+		FI_CONTEXT,
+	],
+	test_class: [
+		FT_CAP_ATOMIC,
+	],
+	test_flags: FT_FLAG_QUICKTEST
+},
diff --git a/deps/libfabric/fabtests/test_configs/shm/all.test b/deps/libfabric/fabtests/test_configs/shm/all.test
index a975a7a75d9dfd7c6e83e7c24e23b39bd2a8840f..72c210d3e41dde33670eabd4a922367307f19f2a 100644
--- a/deps/libfabric/fabtests/test_configs/shm/all.test
+++ b/deps/libfabric/fabtests/test_configs/shm/all.test
@@ -191,6 +191,7 @@
 		FI_INT16,
 		FI_INT32,
 		FI_INT64,
+		FI_INT128,
 		FI_FLOAT,
 		FI_LONG_DOUBLE,
 		FI_DOUBLE_COMPLEX,
@@ -228,6 +229,7 @@
 		FI_INT16,
 		FI_INT32,
 		FI_INT64,
+		FI_INT128,
 		FI_FLOAT,
 		FI_LONG_DOUBLE,
 		FI_DOUBLE_COMPLEX,
@@ -269,6 +271,7 @@
 		FI_INT16,
 		FI_INT32,
 		FI_INT64,
+		FI_INT128,
 		FI_FLOAT,
 		FI_LONG_DOUBLE,
 		FI_DOUBLE_COMPLEX,
@@ -310,6 +313,7 @@
 		FI_INT16,
 		FI_INT32,
 		FI_INT64,
+		FI_INT128,
 		FI_FLOAT,
 		FI_LONG_DOUBLE,
 		FI_DOUBLE_COMPLEX,
@@ -342,6 +346,7 @@
 		FI_INT16,
 		FI_INT32,
 		FI_INT64,
+		FI_INT128,
 		FI_FLOAT,
 		FI_LONG_DOUBLE,
 		FI_DOUBLE_COMPLEX,
@@ -380,6 +385,7 @@
 		FI_INT16,
 		FI_INT32,
 		FI_INT64,
+		FI_INT128,
 		FI_FLOAT,
 		FI_LONG_DOUBLE,
 		FI_DOUBLE_COMPLEX,
diff --git a/deps/libfabric/fabtests/test_configs/shm/quick.test b/deps/libfabric/fabtests/test_configs/shm/quick.test
index 94c98b6558d751d77d559880b110f7b5de62b106..c7a07f7b1651cf3738686062cc7fda91cd5cf6a6 100644
--- a/deps/libfabric/fabtests/test_configs/shm/quick.test
+++ b/deps/libfabric/fabtests/test_configs/shm/quick.test
@@ -199,6 +199,7 @@
 		FI_INT16,
 		FI_INT32,
 		FI_INT64,
+		FI_INT128,
 		FI_FLOAT,
 		FI_LONG_DOUBLE,
 		FI_DOUBLE_COMPLEX,
@@ -237,6 +238,7 @@
 		FI_INT16,
 		FI_INT32,
 		FI_INT64,
+		FI_INT128,
 		FI_FLOAT,
 		FI_LONG_DOUBLE,
 		FI_DOUBLE_COMPLEX,
@@ -279,6 +281,7 @@
 		FI_INT16,
 		FI_INT32,
 		FI_INT64,
+		FI_INT128,
 		FI_FLOAT,
 		FI_LONG_DOUBLE,
 		FI_DOUBLE_COMPLEX,
@@ -321,6 +324,7 @@
 		FI_INT16,
 		FI_INT32,
 		FI_INT64,
+		FI_INT128,
 		FI_FLOAT,
 		FI_LONG_DOUBLE,
 		FI_DOUBLE_COMPLEX,
@@ -354,6 +358,7 @@
 		FI_INT16,
 		FI_INT32,
 		FI_INT64,
+		FI_INT128,
 		FI_FLOAT,
 		FI_LONG_DOUBLE,
 		FI_DOUBLE_COMPLEX,
@@ -393,6 +398,7 @@
 		FI_INT16,
 		FI_INT32,
 		FI_INT64,
+		FI_INT128,
 		FI_FLOAT,
 		FI_LONG_DOUBLE,
 		FI_DOUBLE_COMPLEX,
diff --git a/deps/libfabric/fabtests/test_configs/shm/verify.test b/deps/libfabric/fabtests/test_configs/shm/verify.test
index d94394be585afe175dc0f5147dbb98442c203131..cb52daab5d1fad3f014d7b7aa1e207a6ec95cac5 100644
--- a/deps/libfabric/fabtests/test_configs/shm/verify.test
+++ b/deps/libfabric/fabtests/test_configs/shm/verify.test
@@ -119,6 +119,8 @@
 		FI_UINT32,
 		FI_INT64,
 		FI_UINT64,
+		FI_INT128,
+		FI_UINT128,
 		FI_FLOAT,
 		FI_DOUBLE,
 		FI_LONG_DOUBLE,
@@ -162,6 +164,8 @@
 		FI_UINT32,
 		FI_INT64,
 		FI_UINT64,
+		FI_INT128,
+		FI_UINT128,
 		FI_FLOAT,
 		FI_DOUBLE,
 		FI_LONG_DOUBLE,
@@ -211,6 +215,8 @@
 		FI_UINT32,
 		FI_INT64,
 		FI_UINT64,
+		FI_INT128,
+		FI_UINT128,
 		FI_FLOAT,
 		FI_DOUBLE,
 		FI_LONG_DOUBLE,
@@ -268,6 +274,8 @@
 		FI_UINT32,
 		FI_INT64,
 		FI_UINT64,
+		FI_INT128,
+		FI_UINT128,
 		FI_FLOAT,
 		FI_DOUBLE,
 		FI_LONG_DOUBLE,
@@ -308,6 +316,8 @@
 		FI_UINT32,
 		FI_INT64,
 		FI_UINT64,
+		FI_INT128,
+		FI_UINT128,
 		FI_FLOAT,
 		FI_DOUBLE,
 		FI_LONG_DOUBLE,
@@ -354,6 +364,8 @@
 		FI_UINT32,
 		FI_INT64,
 		FI_UINT64,
+		FI_INT128,
+		FI_UINT128,
 		FI_FLOAT,
 		FI_DOUBLE,
 		FI_LONG_DOUBLE,
diff --git a/deps/libfabric/fabtests/test_configs/tcp/all.test b/deps/libfabric/fabtests/test_configs/tcp/all.test
new file mode 100644
index 0000000000000000000000000000000000000000..58474947b2542d478d60fb9941408956eb49a725
--- /dev/null
+++ b/deps/libfabric/fabtests/test_configs/tcp/all.test
@@ -0,0 +1,185 @@
+#: "Suite of tests for the tcp provider"
+{
+	prov_name: tcp,
+	test_type: [
+		FT_TEST_LATENCY,
+		FT_TEST_BANDWIDTH,
+		FT_TEST_UNIT
+	],
+	test_class: [
+		FT_CAP_MSG,
+	],
+	class_function: [
+		FT_FUNC_SEND,
+		FT_FUNC_SENDV,
+		FT_FUNC_SENDMSG,
+		FT_FUNC_INJECT,
+		FT_FUNC_INJECTDATA,
+		FT_FUNC_SENDDATA,
+	],
+	ep_type: [
+		FI_EP_MSG,
+	],
+	comp_type: [
+		FT_COMP_QUEUE,
+	],
+	progress: [
+		FI_PROGRESS_MANUAL,
+		FI_PROGRESS_AUTO,
+	],
+	test_flags: [
+		FT_FLAG_QUICKTEST,
+	],
+},
+{
+	prov_name: tcp,
+	test_type: [
+		FT_TEST_LATENCY,
+		FT_TEST_BANDWIDTH,
+		FT_TEST_UNIT,
+	],
+	test_class: [
+		FT_CAP_RMA,
+	],
+	class_function: [
+		FT_FUNC_WRITE,
+		FT_FUNC_WRITEV,
+		FT_FUNC_WRITEMSG,
+		FT_FUNC_WRITEDATA,
+		FT_FUNC_INJECT_WRITE,
+		FT_FUNC_INJECT_WRITEDATA,
+		FT_FUNC_READ,
+		FT_FUNC_READV,
+		FT_FUNC_READMSG,
+	],
+	ep_type: [
+		FI_EP_MSG,
+	],
+	comp_type: [
+		FT_COMP_QUEUE,
+	],
+	progress: [
+		FI_PROGRESS_MANUAL,
+		FI_PROGRESS_AUTO,
+	],
+	test_flags: [
+		FT_FLAG_QUICKTEST,
+	],
+},
+{
+	prov_name: tcp,
+	test_type: [
+		FT_TEST_LATENCY,
+	],
+	test_class: [
+		FT_CAP_MSG,
+	],
+	ep_type: [
+		FI_EP_MSG,
+	],
+	comp_type: [
+		FT_COMP_QUEUE,
+	],
+	eq_wait_obj: [
+		FI_WAIT_NONE,
+		FI_WAIT_UNSPEC,
+		FI_WAIT_FD
+	],
+	mr_mode: [],
+	progress: [
+		FI_PROGRESS_MANUAL,
+	],
+	test_flags: [
+		FT_FLAG_QUICKTEST,
+	],
+},
+{
+	prov_name: tcp,
+	test_type: [
+		FT_TEST_LATENCY,
+	],
+	test_class: [
+		FT_CAP_MSG,
+	],
+	ep_type: [
+		FI_EP_MSG,
+	],
+	comp_type: [
+		FT_COMP_QUEUE,
+	],
+	cq_wait_obj: [
+		FI_WAIT_NONE,
+		FI_WAIT_UNSPEC,
+		FI_WAIT_FD
+	],
+	mr_mode: [],
+	progress: [
+		FI_PROGRESS_MANUAL,
+	],
+	test_flags: [
+		FT_FLAG_QUICKTEST,
+	],
+},
+{
+	prov_name: tcp,
+	test_type: [
+		FT_TEST_UNIT,
+	],
+	test_class: [
+		FT_CAP_MSG,
+	],
+	class_function: [
+		FT_FUNC_SENDMSG,
+		FT_FUNC_INJECTDATA,
+		FT_FUNC_SENDDATA,
+	],
+	ep_type: [
+		FI_EP_MSG,
+	],
+	comp_type: [
+		FT_COMP_QUEUE,
+	],
+	progress: [
+		FI_PROGRESS_MANUAL,
+	],
+	cq_format: [
+		FI_CQ_FORMAT_CONTEXT,
+		FI_CQ_FORMAT_DATA,
+		FI_CQ_FORMAT_MSG,
+	],
+	test_flags: [
+		FT_FLAG_QUICKTEST,
+	],
+},
+{
+	prov_name: tcp,
+	test_type: [
+		FT_TEST_UNIT,
+	],
+	test_class: [
+		FT_CAP_RMA,
+	],
+	class_function: [
+		FT_FUNC_WRITEMSG,
+		FT_FUNC_WRITEDATA,
+		FT_FUNC_INJECT_WRITEDATA,
+		FT_FUNC_READMSG,
+	],
+	ep_type: [
+		FI_EP_MSG,
+	],
+	comp_type: [
+		FT_COMP_QUEUE,
+	],
+	progress: [
+		FI_PROGRESS_MANUAL,
+	],
+	cq_format: [
+		FI_CQ_FORMAT_CONTEXT,
+		FI_CQ_FORMAT_DATA,
+		FI_CQ_FORMAT_MSG,
+	],
+	test_flags: [
+		FT_FLAG_QUICKTEST,
+	],
+}
diff --git a/deps/libfabric/fabtests/test_configs/tcp/quick.test b/deps/libfabric/fabtests/test_configs/tcp/quick.test
deleted file mode 100644
index 34b323e958e581dd3df99a6d2b7442eb84cb8fe3..0000000000000000000000000000000000000000
--- a/deps/libfabric/fabtests/test_configs/tcp/quick.test
+++ /dev/null
@@ -1,54 +0,0 @@
-#: "Suite of tests for the tcp provider"
-{
-	prov_name: tcp,
-	test_type: [
-		FT_TEST_LATENCY,
-		FT_TEST_BANDWIDTH,
-		FT_TEST_UNIT
-	],
-	class_function: [
-		FT_FUNC_SEND,
-		FT_FUNC_SENDV,
-		FT_FUNC_SENDMSG,
-		FT_FUNC_INJECT,
-		FT_FUNC_INJECTDATA,
-		FT_FUNC_SENDDATA,
-	],
-	ep_type: [
-		FI_EP_MSG,
-	],
-	comp_type: [
-		FT_COMP_QUEUE
-	],
-	test_class: [
-		FT_CAP_MSG,
-	],
-	test_flags: FT_FLAG_QUICKTEST
-},
-{
-	prov_name: tcp,
-	test_type: [
-		FT_TEST_LATENCY,
-		FT_TEST_BANDWIDTH,
-		FT_TEST_UNIT
-	],
-	class_function: [
-		FT_FUNC_READ,
-		FT_FUNC_READV,
-		FT_FUNC_READMSG,
-		FT_FUNC_WRITE,
-		FT_FUNC_WRITEV,
-		FT_FUNC_WRITEMSG,
-		FT_FUNC_WRITEDATA
-	],
-	ep_type: [
-		FI_EP_MSG,
-	],
-	comp_type: [
-		FT_COMP_QUEUE
-	],
-	test_class: [
-		FT_CAP_RMA,
-	],
-	test_flags: FT_FLAG_QUICKTEST
-}
diff --git a/deps/libfabric/fabtests/test_configs/tcp/tcp.exclude b/deps/libfabric/fabtests/test_configs/tcp/tcp.exclude
index 88a0dada3d520b82758033fbaba929a0f56ca969..be4eb2436533f9017b343e75f511ba64832e3450 100644
--- a/deps/libfabric/fabtests/test_configs/tcp/tcp.exclude
+++ b/deps/libfabric/fabtests/test_configs/tcp/tcp.exclude
@@ -3,7 +3,7 @@
 ^fi_dgram
 -e dgram
 
-rdm_rma_simple
+rdm_rma_event
 rdm_rma_trigger
 shared_ctx
 scalable_ep
@@ -13,6 +13,7 @@ atomic
 inj_complete -e msg
 unexpected_msg -e msg
 multi_recv
+-k
 
 # TODO. Following fails with macOS. will fix them later
 cq_data -e rdm
diff --git a/deps/libfabric/fabtests/ubertest/config.c b/deps/libfabric/fabtests/ubertest/config.c
index d274b65767044c0857ea8c6a29723c6cac793d7f..e5950e8f40f2dba46a25975cd5333cb5ff3a26be 100644
--- a/deps/libfabric/fabtests/ubertest/config.c
+++ b/deps/libfabric/fabtests/ubertest/config.c
@@ -32,7 +32,6 @@
 #include "fabtest.h"
 #include "jsmn.h"
 
-
 #define FT_CAP_MSG	FI_MSG | FI_SEND | FI_RECV
 #define FT_CAP_TAGGED	FI_TAGGED | FI_SEND | FI_RECV
 #define FT_CAP_RMA	FI_RMA | FI_READ | FI_WRITE | FI_REMOTE_READ | FI_REMOTE_WRITE
@@ -173,7 +172,7 @@ static struct key_t keys[] = {
 		.str = "datatype",
 		.offset = offsetof(struct ft_set, datatype),
 		.val_type = VAL_NUM,
-		.val_size = sizeof(((struct ft_set *)0)->datatype) / FI_DATATYPE_LAST,
+		.val_size = sizeof(((struct ft_set *)0)->datatype) / OFI_DATATYPE_CNT,
 	},
 	{
 		.str = "mode",
@@ -247,6 +246,12 @@ static struct key_t keys[] = {
 		.val_type = VAL_NUM,
 		.val_size = sizeof(((struct ft_set *)0)->threading) / FT_MAX_THREADING,
 	},
+	{
+		.str = "cq_format",
+		.offset = offsetof(struct ft_set, cq_format),
+		.val_type = VAL_NUM,
+		.val_size = sizeof(((struct ft_set *)0)->cq_format) / FT_MAX_CQ_FORMAT,
+	},
 };
 
 static int ft_parse_num(char *str, int len, struct key_t *key, void *buf)
@@ -265,12 +270,12 @@ static int ft_parse_num(char *str, int len, struct key_t *key, void *buf)
 		TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_SENDDATA, enum ft_class_function, buf);
 
 		TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_WRITE, enum ft_class_function, buf);
-		TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_WRITEV, enum ft_class_function, buf);	
+		TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_WRITEV, enum ft_class_function, buf);
 		TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_WRITEMSG, enum ft_class_function, buf);
 		TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_WRITEDATA, enum ft_class_function, buf);
 		TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_INJECT_WRITE, enum ft_class_function, buf);
 		TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_INJECT_WRITEDATA, enum ft_class_function, buf);
-		
+
 		TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_READ, enum ft_class_function, buf);
 		TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_READV, enum ft_class_function, buf);
 		TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_READMSG, enum ft_class_function, buf);
@@ -340,6 +345,8 @@ static int ft_parse_num(char *str, int len, struct key_t *key, void *buf)
 		TEST_ENUM_SET_N_RETURN(str, len, FI_UINT32, enum fi_datatype, buf);
 		TEST_ENUM_SET_N_RETURN(str, len, FI_INT64, enum fi_datatype, buf);
 		TEST_ENUM_SET_N_RETURN(str, len, FI_UINT64, enum fi_datatype, buf);
+		TEST_ENUM_SET_N_RETURN(str, len, FI_INT128, enum fi_datatype, buf);
+		TEST_ENUM_SET_N_RETURN(str, len, FI_UINT128, enum fi_datatype, buf);
 		TEST_ENUM_SET_N_RETURN(str, len, FI_FLOAT, enum fi_datatype, buf);
 		TEST_ENUM_SET_N_RETURN(str, len, FI_DOUBLE, enum fi_datatype, buf);
 		TEST_ENUM_SET_N_RETURN(str, len, FI_FLOAT_COMPLEX, enum fi_datatype, buf);
@@ -405,6 +412,11 @@ static int ft_parse_num(char *str, int len, struct key_t *key, void *buf)
 		FT_ERR("Unsupported mode bit");
 	} else if (!strncmp(key->str, "test_flags", strlen("test_flags"))) {
 		TEST_SET_N_RETURN(str, len, "FT_FLAG_QUICKTEST", FT_FLAG_QUICKTEST, uint64_t, buf);
+	} else if (!strncmp(key->str, "cq_format", strlen("cq_format"))) {
+		TEST_ENUM_SET_N_RETURN(str, len, FI_CQ_FORMAT_CONTEXT, uint64_t, buf);
+		TEST_ENUM_SET_N_RETURN(str, len, FI_CQ_FORMAT_MSG, uint64_t, buf);
+		TEST_ENUM_SET_N_RETURN(str, len, FI_CQ_FORMAT_DATA, uint64_t, buf);
+		TEST_ENUM_SET_N_RETURN(str, len, FI_CQ_FORMAT_TAGGED, uint64_t, buf);
 	} else {
 		FT_ERR("Unknown test configuration key");
 	}
@@ -504,8 +516,8 @@ static int ft_parse_config(char *config, int size,
 	 * 	JSMN_STRING
 	 * 	JSMN_STRING : <key>
 	 * 	JSMN_STRING : <value>
-	 * In our case, JSMN_OBJECT would represent a ft_set structure. The rest 
-	 * of the tokens would be treated as key-value pairs. The first JSMN_STRING 
+	 * In our case, JSMN_OBJECT would represent a ft_set structure. The rest
+	 * of the tokens would be treated as key-value pairs. The first JSMN_STRING
 	 * would represent a key and the next would represent a value. A value
 	 * can also be an array. jsmntok_t.size would represent the length of
 	 * the array.
@@ -661,6 +673,7 @@ void fts_start(struct ft_series *series, int index)
 	series->cur_class = 0;
 	series->cur_progress = 0;
 	series->cur_threading = 0;
+	series->cur_cq_format = 0;
 
 	series->test_index = 1;
 	if (index > 1) {
@@ -686,7 +699,17 @@ int fts_info_is_valid(void)
 		if (!ft_use_comp_cntr(test_info.comp_type))
 			return 0;
 	}
-
+	if (test_info.test_class & FI_TAGGED) {
+		if (test_info.cq_format != FI_CQ_FORMAT_TAGGED)
+			return 0;
+	} else if (test_info.cq_format == FI_CQ_FORMAT_TAGGED) {
+		return 0;
+	}
+	if (test_info.msg_flags & FI_REMOTE_CQ_DATA ||
+	    is_data_func(test_info.class_function)) {
+		if (test_info.cq_format < FI_CQ_FORMAT_DATA)
+			return 0;
+	}
 	if (test_info.test_class & (FI_MSG | FI_TAGGED) &&
 	    !ft_check_rx_completion(test_info) &&
 	    !ft_use_comp_cntr(test_info.comp_type))
@@ -767,6 +790,10 @@ void fts_next(struct ft_series *series)
 		return;
 	series->cur_threading = 0;
 
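+	/* cq_format is the innermost iteration dimension; once it wraps,
+	 * advance to the next test set. */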
+	if (set->cq_format[++series->cur_cq_format])
+		return;
+	series->cur_cq_format = 0;
+
 	series->cur_set++;
 }
 
@@ -839,6 +866,20 @@ void fts_cur_info(struct ft_series *series, struct ft_info *info)
 			info->mode |= set->mode[i++];
 	}
 
+	if (set->cq_format[0]) {
+		info->cq_format = set->cq_format[series->cur_cq_format];
+	} else {
+		if (info->test_class & FI_TAGGED)
+			info->cq_format = FI_CQ_FORMAT_TAGGED;
+		else if (info->test_class & FI_MSG)
+			info->cq_format = FI_CQ_FORMAT_MSG;
+		else if (info->msg_flags & FI_REMOTE_CQ_DATA ||
+		    is_data_func(info->class_function))
+			info->cq_format = FI_CQ_FORMAT_DATA;
+		else
+			info->cq_format = FI_CQ_FORMAT_CONTEXT;
+	}
+
 	info->ep_type = set->ep_type[series->cur_ep];
 	info->av_type = set->av_type[series->cur_av];
 	if (set->comp_type[0])
@@ -853,16 +894,17 @@ void fts_cur_info(struct ft_series *series, struct ft_info *info)
 	info->cntr_wait_obj = set->cntr_wait_obj[series->cur_cntr_wait_obj];
 
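+	/* Copy up to the full buffer, then NUL-terminate explicitly;
+	 * strncpy does not guarantee termination. */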
 	if (set->node[0])
-		strncpy(info->node, set->node, sizeof(info->node) - 1);
+		strncpy(info->node, set->node, sizeof(info->node));
 	else if (opts.dst_addr)
-		strncpy(info->node, opts.dst_addr, sizeof(info->node) - 1);
+		strncpy(info->node, opts.dst_addr, sizeof(info->node));
+	info->node[sizeof(info->node) - 1] = '\0';
+
 	if (set->service[0])
-		strncpy(info->service, set->service, sizeof(info->service) - 1);
+		strncpy(info->service, set->service, sizeof(info->service));
 	else if (opts.dst_port)
-		strncpy(info->service, opts.dst_port, sizeof(info->service) - 1);
-	strncpy(info->prov_name, set->prov_name, sizeof(info->prov_name) - 1);
-
-	info->node[sizeof(info->node) - 1] = '\0';
+		strncpy(info->service, opts.dst_port, sizeof(info->service));
 	info->service[sizeof(info->service) - 1] = '\0';
+
+	strncpy(info->prov_name, set->prov_name, sizeof(info->prov_name));
 	info->prov_name[sizeof(info->prov_name) - 1] = '\0';
 }
diff --git a/deps/libfabric/fabtests/ubertest/fabtest.h b/deps/libfabric/fabtests/ubertest/fabtest.h
index 0cc83ea9ff7f23af78eec431045868cf4ecc9fe8..e4335cc441f5da9b7c759e7ca2d06a50c830e163 100644
--- a/deps/libfabric/fabtests/ubertest/fabtest.h
+++ b/deps/libfabric/fabtests/ubertest/fabtest.h
@@ -86,12 +86,14 @@ struct ft_xcontrol {
 	size_t			max_credits;
 	fi_addr_t		addr;
 	uint64_t		tag;
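+	/* next expected tag when verifying tagged completions */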
+	uint64_t		check_tag;
 	uint8_t			seqno;
 	uint64_t		total_comp;
 	enum fi_cq_format	cq_format;
 	uint64_t		remote_cq_data;
 	struct fi_context	*ctx;
 	int			curr_ctx;
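+	/* index of the next rx context expected in a completion */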
+	int			check_ctx;
 };
 
 struct ft_atomic_control {
@@ -149,6 +151,7 @@ enum {
 	FT_MAX_FLAGS		= 64,
 	FT_MAX_PROGRESS		= 3,
 	FT_MAX_THREADING	= 6,
+	FT_MAX_CQ_FORMAT	= 4,
 };
 
 enum ft_comp_type {
@@ -237,7 +240,7 @@ struct ft_set {
 	enum ft_class_function	class_function[FT_MAX_FUNCTIONS];
 	uint64_t		msg_flags;
 	enum fi_op		op[FI_ATOMIC_OP_LAST];
-	enum fi_datatype	datatype[FI_DATATYPE_LAST];
+	enum fi_datatype	datatype[OFI_DATATYPE_CNT];
 	enum fi_ep_type		ep_type[FT_MAX_EP_TYPES];
 	enum fi_av_type		av_type[FT_MAX_AV_TYPES];
 	enum ft_comp_type	comp_type[FT_MAX_COMP];
@@ -255,6 +258,7 @@ struct ft_set {
 	uint64_t 		tx_cq_bind_flags[FT_MAX_FLAGS];
 	uint64_t 		rx_op_flags[FT_MAX_FLAGS];
 	uint64_t 		tx_op_flags[FT_MAX_FLAGS];
+	enum fi_cq_format	cq_format[FT_MAX_CQ_FORMAT];
 };
 
 struct ft_series {
@@ -277,6 +281,7 @@ struct ft_series {
 	int			cur_class;
 	int			cur_progress;
 	int			cur_threading;
+	int			cur_cq_format;
 };
 
 struct ft_info {
@@ -310,6 +315,7 @@ struct ft_info {
 	uint64_t 		tx_cq_bind_flags;
 	uint64_t 		rx_op_flags;
 	uint64_t 		tx_op_flags;
+	enum fi_cq_format	cq_format;
 };
 
 
diff --git a/deps/libfabric/fabtests/ubertest/ofi_atomic.c b/deps/libfabric/fabtests/ubertest/ofi_atomic.c
index 1737a4981d4bd8a42eb0456cbc09c1b36b5676dc..311747175d5bc533a1933af569f95678c9164f17 100644
--- a/deps/libfabric/fabtests/ubertest/ofi_atomic.c
+++ b/deps/libfabric/fabtests/ubertest/ofi_atomic.c
@@ -217,6 +217,33 @@
 	}
 
 
+#ifdef HAVE___INT128
+
+/* If __int128 is supported, the existing macros work. */
+#define OFI_DEF_WRITE_INT128_NAME(op, type) OFI_DEF_WRITE_NAME(op, type)
+#define OFI_DEF_WRITE_INT128_FUNC(op, type) OFI_DEF_WRITE_FUNC(op, type)
+#define OFI_DEF_READ_INT128_NAME(op, type) OFI_DEF_READ_NAME(op, type)
+#define OFI_DEF_READ_INT128_FUNC(op, type) OFI_DEF_READ_FUNC(op, type)
+#define OFI_DEF_READWRITE_INT128_NAME(op, type) OFI_DEF_READWRITE_NAME(op, type)
+#define OFI_DEF_READWRITE_INT128_FUNC(op, type) OFI_DEF_READWRITE_FUNC(op, type)
+#define OFI_DEF_CSWAP_INT128_NAME(op, type) OFI_DEF_CSWAP_NAME(op, type)
+#define OFI_DEF_CSWAP_INT128_FUNC(op, type) OFI_DEF_CSWAP_FUNC(op, type)
+
+#else /* HAVE___INT128 */
+
+/* If __int128 is not supported, verification is not done. */
+
+#define OFI_DEF_WRITE_INT128_NAME(op, type) NULL,
+#define OFI_DEF_WRITE_INT128_FUNC(op, type)
+#define OFI_DEF_READ_INT128_NAME(op, type) NULL,
+#define OFI_DEF_READ_INT128_FUNC(op, type)
+#define OFI_DEF_READWRITE_INT128_NAME(op, type) NULL,
+#define OFI_DEF_READWRITE_INT128_FUNC(op, type)
+#define OFI_DEF_CSWAP_INT128_NAME(op, type) NULL,
+#define OFI_DEF_CSWAP_INT128_FUNC(op, type)
+
+#endif /* HAVE___INT128 */
+
 /*********************************************************************
  * Macros create atomic functions for each operation for each datatype
  *********************************************************************/
@@ -243,7 +270,9 @@
 	OFI_DEF_##ATOMICTYPE##_COMPLEX_##FUNCNAME(op ##_COMPLEX, float)	\
 	OFI_DEF_##ATOMICTYPE##_COMPLEX_##FUNCNAME(op ##_COMPLEX, double)\
 	OFI_DEF_##ATOMICTYPE##_##FUNCNAME(op, long_double)		\
-	OFI_DEF_##ATOMICTYPE##_COMPLEX_##FUNCNAME(op ##_COMPLEX, long_double)
+	OFI_DEF_##ATOMICTYPE##_COMPLEX_##FUNCNAME(op ##_COMPLEX, long_double) \
+	OFI_DEF_##ATOMICTYPE##_INT128_##FUNCNAME(op, ofi_int128_t)	\
+	OFI_DEF_##ATOMICTYPE##_INT128_##FUNCNAME(op, ofi_uint128_t)
 
 #define OFI_DEFINE_REALNO_HANDLERS(ATOMICTYPE, FUNCNAME, op)		\
 	OFI_DEF_##ATOMICTYPE##_##FUNCNAME(op, int8_t)			\
@@ -259,7 +288,9 @@
 	OFI_DEF_NOOP_##FUNCNAME						\
 	OFI_DEF_NOOP_##FUNCNAME						\
 	OFI_DEF_##ATOMICTYPE##_##FUNCNAME(op, long_double)		\
-	OFI_DEF_NOOP_##FUNCNAME
+	OFI_DEF_NOOP_##FUNCNAME						\
+	OFI_DEF_##ATOMICTYPE##_INT128_##FUNCNAME(op, ofi_int128_t)	\
+	OFI_DEF_##ATOMICTYPE##_INT128_##FUNCNAME(op, ofi_uint128_t)
 
 #define OFI_DEFINE_INT_HANDLERS(ATOMICTYPE, FUNCNAME, op)		\
 	OFI_DEF_##ATOMICTYPE##_##FUNCNAME(op, int8_t)			\
@@ -275,8 +306,9 @@
 	OFI_DEF_NOOP_##FUNCNAME						\
 	OFI_DEF_NOOP_##FUNCNAME						\
 	OFI_DEF_NOOP_##FUNCNAME						\
-	OFI_DEF_NOOP_##FUNCNAME
-
+	OFI_DEF_NOOP_##FUNCNAME						\
+	OFI_DEF_##ATOMICTYPE##_INT128_##FUNCNAME(op, ofi_int128_t)	\
+	OFI_DEF_##ATOMICTYPE##_INT128_##FUNCNAME(op, ofi_uint128_t)
 
 /**********************
  * Write dispatch table
@@ -294,7 +326,14 @@ OFI_DEFINE_ALL_HANDLERS(WRITE, FUNC, OFI_OP_LXOR)
 OFI_DEFINE_INT_HANDLERS(WRITE, FUNC, OFI_OP_BXOR)
 OFI_DEFINE_ALL_HANDLERS(WRITE, FUNC, OFI_OP_WRITE)
 
-void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][FI_DATATYPE_LAST])
+/* 5 per line to be easily counted by inspection. */
+#define OFI_OP_NOT_SUPPORTED(op)		\
+	NULL, NULL, NULL, NULL, NULL,		\
+	NULL, NULL, NULL, NULL, NULL,		\
+	NULL, NULL, NULL, NULL, NULL,		\
+	NULL
+
+void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][OFI_DATATYPE_CNT])
 	(void *dst, const void *src, size_t cnt) =
 {
 	{ OFI_DEFINE_REALNO_HANDLERS(WRITE, NAME, OFI_OP_MIN) },
@@ -308,7 +347,7 @@ void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][FI_DATATYPE_LAST])
 	{ OFI_DEFINE_ALL_HANDLERS(WRITE, NAME, OFI_OP_LXOR) },
 	{ OFI_DEFINE_INT_HANDLERS(WRITE, NAME, OFI_OP_BXOR) },
 	 /* no-op: FI_ATOMIC_READ */
-	{ NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL},
+	{ OFI_OP_NOT_SUPPORTED(READ) },
 	{ OFI_DEFINE_ALL_HANDLERS(WRITE, NAME, OFI_OP_WRITE) },
 };
 
@@ -330,7 +369,7 @@ OFI_DEFINE_INT_HANDLERS(READWRITE, FUNC, OFI_OP_BXOR)
 OFI_DEFINE_ALL_HANDLERS(READ, FUNC, OFI_OP_READ)
 OFI_DEFINE_ALL_HANDLERS(READWRITE, FUNC, OFI_OP_WRITE)
 
-void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_CNT][FI_DATATYPE_LAST])
+void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_CNT][OFI_DATATYPE_CNT])
 	(void *dst, const void *src, void *res, size_t cnt) =
 {
 	{ OFI_DEFINE_REALNO_HANDLERS(READWRITE, NAME, OFI_OP_MIN) },
@@ -360,7 +399,7 @@ OFI_DEFINE_REALNO_HANDLERS(CSWAP, FUNC, OFI_OP_CSWAP_GE)
 OFI_DEFINE_REALNO_HANDLERS(CSWAP, FUNC, OFI_OP_CSWAP_GT)
 OFI_DEFINE_INT_HANDLERS(CSWAP, FUNC, OFI_OP_MSWAP)
 
-void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][FI_DATATYPE_LAST])
+void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][OFI_DATATYPE_CNT])
 	(void *dst, const void *src, const void *cmp, void *res, size_t cnt) =
 {
 	{ OFI_DEFINE_ALL_HANDLERS(CSWAP, NAME, OFI_OP_CSWAP_EQ) },
diff --git a/deps/libfabric/fabtests/ubertest/ofi_atomic.h b/deps/libfabric/fabtests/ubertest/ofi_atomic.h
index aec830b728f791a18f7aa6c94724e9350dc2521a..a61a7bae4325d1e3e4c0c62ed11bcec9e01a0060 100644
--- a/deps/libfabric/fabtests/ubertest/ofi_atomic.h
+++ b/deps/libfabric/fabtests/ubertest/ofi_atomic.h
@@ -61,21 +61,27 @@ typedef long double complex ofi_complex_long_double;
 #define ofi_atomic_isswap_op(op) \
 	(op >= OFI_SWAP_OP_START && op < OFI_SWAP_OP_LAST)
 
-extern void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][FI_DATATYPE_LAST])
+extern void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][OFI_DATATYPE_CNT])
 			(void *dst, const void *src, size_t cnt);
-extern void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_CNT][FI_DATATYPE_LAST])
+extern void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_CNT][OFI_DATATYPE_CNT])
 			(void *dst, const void *src, void *res, size_t cnt);
-extern void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][FI_DATATYPE_LAST])
+extern void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][OFI_DATATYPE_CNT])
 			(void *dst, const void *src, const void *cmp,
 			 void *res, size_t cnt);
 
-#define ofi_atomic_write_handler(op, datatype, dst, src, cnt) \
-	ofi_atomic_write_handlers[op][datatype](dst, src, cnt)
-#define ofi_atomic_readwrite_handler(op, datatype, dst, src, res, cnt) \
-	ofi_atomic_readwrite_handlers[op][datatype](dst, src, res, cnt)
-#define ofi_atomic_swap_handler(op, datatype, dst, src, cmp, res, cnt) \
-	ofi_atomic_swap_handlers[op - OFI_SWAP_OP_START][datatype](dst, src, \
-								cmp, res, cnt)
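+/* Handler lookup is split from invocation so callers can check for a NULL
+ * entry (unsupported op/datatype combination) before calling it.
+ */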
+#define ofi_atomic_write_handler(op, datatype)				\
+	ofi_atomic_write_handlers[op][datatype]
+#define ofi_atomic_readwrite_handler(op, datatype)			\
+	ofi_atomic_readwrite_handlers[op][datatype]
+#define ofi_atomic_swap_handler(op, datatype)				\
+	ofi_atomic_swap_handlers[op - OFI_SWAP_OP_START][datatype]
+
+#define ofi_atomic_write_op(op, datatype, dst, src, cnt)		\
+	ofi_atomic_write_handler(op, datatype)(dst, src, cnt)
+#define ofi_atomic_readwrite_op(op, datatype, dst, src, res, cnt)	\
+	ofi_atomic_readwrite_handler(op, datatype)(dst, src, res, cnt)
+#define ofi_atomic_swap_op(op, datatype, dst, src, cmp, res, cnt)	\
+	ofi_atomic_swap_handler(op, datatype)(dst, src, cmp, res, cnt)
 
 #define OFI_DEF_COMPLEX_OPS(type)				\
 static inline int ofi_complex_eq_## type			\
diff --git a/deps/libfabric/fabtests/ubertest/test_ctrl.c b/deps/libfabric/fabtests/ubertest/test_ctrl.c
index dac2a7e090228f77067e1ba9560a603588c4cfae..1caf781bca03809d8c8124c3822aeb3f66deb2a8 100644
--- a/deps/libfabric/fabtests/ubertest/test_ctrl.c
+++ b/deps/libfabric/fabtests/ubertest/test_ctrl.c
@@ -67,7 +67,7 @@ static int ft_init_rx_control(void)
 	if (ret)
 		return ret;
 
-	ft_rx_ctrl.cq_format = FI_CQ_FORMAT_DATA;
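+	/* Use the cq_format selected for this test rather than a
+	 * hard-coded FI_CQ_FORMAT_DATA. */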
+	ft_rx_ctrl.cq_format = test_info.cq_format;
 	ft_rx_ctrl.addr = FI_ADDR_UNSPEC;
 
 	ft_rx_ctrl.msg_size = ft_ctrl.size_array[ft_ctrl.size_cnt - 1];
@@ -418,8 +418,7 @@ static int ft_sync_msg_needed(void)
 
 static int ft_check_verify_cnt()
 {
-	if (test_info.msg_flags == FI_REMOTE_CQ_DATA &&
-	    ft_ctrl.verify_cnt != ft_ctrl.xfer_iter)
+	if (ft_generates_rx_comp() && ft_ctrl.verify_cnt != ft_ctrl.xfer_iter)
 		return -FI_EIO;
 	return 0;
 }
@@ -862,8 +861,12 @@ static int ft_unit_atomic(void)
 			return ret;
 
 		ret = ft_verify_bufs();
-		if (ret)
-			fail = -FI_EIO;
+		if (ret) {
+			if (ret < 0)
+				fail = -FI_EIO;
+			else
+				fail = 1;
+		}
 	}
 
 	ret = ft_check_verify_cnt();
@@ -947,12 +950,19 @@ static int ft_run_unit(void)
 
 		ret = ft_unit();
 		if (ret) {
-			if (ret != -FI_EIO)
-				return ret;
-			fail = -FI_EIO;
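+			/* ret < 0: hard failure; ret > 0: verification
+			 * unsupported, reported as UNVERIFIED below. */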
+			if (ret < 0) {
+				if (ret != -FI_EIO)
+					return ret;
+				fail = -FI_EIO;
+			} else if (!fail)
+				fail = ret;
 		}
 	}
-	if (fail)
+	if (fail > 0) {
+		printf("unit test UNVERIFIED\n");
+		/* Allow testing to continue. */
+		fail = 0;
+	} else if (fail < 0)
 		printf("unit test FAILED\n");
 	else
 		printf("unit test PASSED\n");
diff --git a/deps/libfabric/fabtests/ubertest/uber.c b/deps/libfabric/fabtests/ubertest/uber.c
index eb6ef3b29d5ffeb6f89df6f2ce027a4a62b70ff6..eec905201f832d3a491a7713b300930186050578 100644
--- a/deps/libfabric/fabtests/ubertest/uber.c
+++ b/deps/libfabric/fabtests/ubertest/uber.c
@@ -68,7 +68,7 @@ enum {
 
 static int results[FT_MAX_RESULT];
 static char *filename = NULL;
-
+static char *domain_name = NULL;
 
 static int ft_nullstr(char *str)
 {
@@ -188,6 +188,7 @@ static void ft_print_comp(struct ft_info *test)
 	printf(", rx: ");
 	ft_print_comp_flag(test->rx_cq_bind_flags, test->rx_op_flags);
 	printf(", ");
+	printf("[%s], ", fi_tostr(&test->cq_format, FI_TYPE_CQ_FORMAT));
 }
 
 static void ft_show_test_info(void)
@@ -374,6 +375,8 @@ static int ft_server_setup(struct fi_info *hints, struct fi_info *info)
 	}
 
 	ft_fw_convert_info(hints, &test_info);
+	if (domain_name)
+		hints->domain_attr->name = domain_name;
 
 	ret = fi_getinfo(FT_FIVERSION, ft_strptr(test_info.node),
 			 ft_strptr(test_info.service), FI_SOURCE, hints, &info);
@@ -446,7 +449,7 @@ static int ft_server_child()
 	}
 
 	printf("Ending test %d, result: %s\n", test_info.test_index,
-		fi_strerror(-ret));
+		fi_strerror(-result));
 
 	return result;
 }
@@ -501,6 +504,8 @@ static int ft_client_setup(struct fi_info *hints, struct fi_info *info)
 		goto err;
 
 	ft_fw_convert_info(hints, &test_info);
+	if (domain_name)
+		hints->domain_attr->name = domain_name;
 
 	ft_show_test_info();
 
@@ -518,9 +523,9 @@ static int ft_client_setup(struct fi_info *hints, struct fi_info *info)
 	ft_fw_update_info(&test_info, fabric_info);
 
 	ret = ft_open_res();
-	
+
 	return 0;
-	
+
 err:
 	ft_send_result(ret, info);
 	return ret;
@@ -579,7 +584,7 @@ static int ft_client_child(void)
 	fi_freeinfo(hints);
 	ft_cleanup();
 
-	return 0;
+	return result;
 
 err:
 	ft_send_result(ret, info);
@@ -649,6 +654,7 @@ static void ft_fw_usage(char *program)
 	FT_PRINT_OPTS_USAGE("-B <src_port>", "non default source port number");
 	FT_PRINT_OPTS_USAGE("-P <dst_port>", "non default destination port number "
 		"(config file service parameter will override this)");
+	FT_PRINT_OPTS_USAGE("-d <domain>", "domain name");
 }
 
 void ft_free()
@@ -663,7 +669,7 @@ int main(int argc, char **argv)
 	opts = INIT_OPTS;
 	int ret, op;
 
-	while ((op = getopt(argc, argv, "u:q:xy:z:hf" ADDR_OPTS)) != -1) {
+	while ((op = getopt(argc, argv, "u:q:xy:z:hfd:" ADDR_OPTS)) != -1) {
 		switch (op) {
 		case 'u':
 			filename = strdup(optarg);
@@ -683,6 +689,9 @@ int main(int argc, char **argv)
 		case 'f':
 			do_fork = 1;
 			break;
+		case 'd':
+			domain_name = strdup(optarg);
+			break;
 		default:
 			ft_parse_addr_opts(op, optarg, &opts);
 			break;
@@ -745,5 +754,13 @@ out:
 	if (opts.dst_addr)
 		fts_close(series);
 	ft_free();
+
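+	/* Fold aggregated per-test results into the exit code,
+	 * most severe first. */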
+	if (results[FT_EIO])
+		ret = -FI_EIO;
+	else if (results[FT_ENOSYS])
+		ret = -FI_ENOSYS;
+	else if (results[FT_ENODATA])
+		ret = -FI_ENODATA;
+
 	return ft_exit_code(ret);
 }
diff --git a/deps/libfabric/fabtests/ubertest/verify.c b/deps/libfabric/fabtests/ubertest/verify.c
index b58fa927461ee876b7c823450295d6c6522b2781..bbac29cf8af181f10781d0da10218c4378e7fd42 100644
--- a/deps/libfabric/fabtests/ubertest/verify.c
+++ b/deps/libfabric/fabtests/ubertest/verify.c
@@ -35,7 +35,6 @@
 #include "ofi_atomic.h"
 #include "fabtest.h"
 
-static int alph_index = 0;
 static const char integ_alphabet[] = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
 static const int integ_alphabet_length = (sizeof(integ_alphabet)/sizeof(*integ_alphabet)) - 1;
 
@@ -53,17 +52,31 @@ static const int integ_alphabet_length = (sizeof(integ_alphabet)/sizeof(*integ_a
 	} while (0)				\
 
 
-#define FT_FILL(dst,cnt,TYPE)				\
-	do {								\
-		int i;							\
-		TYPE *d = (dst);					\
-		for (i = 0; i < cnt; i++) {				\
-			d[i] = (TYPE) (integ_alphabet[alph_index++]);	\
-			if (alph_index >= integ_alphabet_length)	\
-				alph_index = 0;				\
-		}							\
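+/* Reset the alphabet index on every fill so both sides generate identical,
+ * deterministic patterns without shared state. */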
+#define FT_FILL(dst,cnt,TYPE)					\
+	do {							\
+		int i, a = 0;					\
+		TYPE *d = (dst);				\
+		for (i = 0; i < cnt; i++) {			\
+			d[i] = (TYPE) (integ_alphabet[a]);	\
+			if (++a >= integ_alphabet_length)	\
+				a = 0;				\
+		}						\
 	} while (0)
 
+#ifdef  HAVE___INT128
+
+/* If __int128 is supported, the generic macros work unchanged. */
+#define FT_FILL_INT128(...)	FT_FILL(__VA_ARGS__)
+#define CHECK_LOCAL_INT128(...)	CHECK_LOCAL(__VA_ARGS__)
+
+#else
+
+/* If __int128 is not supported, skip fill/verify for 128-bit types. */
+#define FT_FILL_INT128(...)
+#define CHECK_LOCAL_INT128(...)
+
+#endif
+
 #define SWITCH_TYPES(type,FUNC,...)				\
 	switch (type) {						\
 	case FI_INT8:	FUNC(__VA_ARGS__,int8_t); break;	\
@@ -74,6 +87,8 @@ static const int integ_alphabet_length = (sizeof(integ_alphabet)/sizeof(*integ_a
 	case FI_UINT32: FUNC(__VA_ARGS__,uint32_t); break;	\
 	case FI_INT64:	FUNC(__VA_ARGS__,int64_t); break;	\
 	case FI_UINT64: FUNC(__VA_ARGS__,uint64_t); break;	\
+	case FI_INT128:	FUNC##_INT128(__VA_ARGS__,ofi_int128_t); break;	\
+	case FI_UINT128: FUNC##_INT128(__VA_ARGS__,ofi_uint128_t); break; \
 	case FI_FLOAT:	FUNC(__VA_ARGS__,float); break;		\
 	case FI_DOUBLE:	FUNC(__VA_ARGS__,double); break;	\
 	case FI_LONG_DOUBLE: FUNC(__VA_ARGS__,long_double); break;		\
@@ -85,6 +100,7 @@ static const int integ_alphabet_length = (sizeof(integ_alphabet)/sizeof(*integ_a
 
 int ft_sync_fill_bufs(size_t size)
 {
+	int ret;
 	ft_sock_sync(0);
 
 	if (test_info.caps & FI_ATOMIC) {
@@ -95,9 +111,14 @@ int ft_sync_fill_bufs(size_t size)
 		memcpy(ft_atom_ctrl.orig_buf, ft_mr_ctrl.buf, size);
 		memcpy(ft_tx_ctrl.cpy_buf, ft_tx_ctrl.buf, size);
 	} else if (is_read_func(test_info.class_function)) {
-		ft_fill_buf(ft_mr_ctrl.buf, size);
+		ret = ft_fill_buf(ft_mr_ctrl.buf, size);
+		if (ret)
+			return ret;
 	} else {
-		ft_fill_buf(ft_tx_ctrl.buf, size);
+		ret = ft_fill_buf(ft_tx_ctrl.buf, size);
+		if (ret)
+			return ret;
+
 		memcpy(ft_tx_ctrl.cpy_buf, ft_tx_ctrl.buf, size);
 	}
 
@@ -116,6 +137,7 @@ static int verify_atomic(void)
 
 	dst = ft_atom_ctrl.orig_buf;
 	src = ft_tx_ctrl.cpy_buf;
+
 	cmp = ft_atom_ctrl.comp_buf;
 	tmp = ft_rx_ctrl.buf;
 	res = ft_atom_ctrl.res_buf;
@@ -124,6 +146,21 @@ static int verify_atomic(void)
 	op = ft_atom_ctrl.op;
 	count = ft_atom_ctrl.count;
 
+	/*
+	 * If we don't have the test function, return > 0 to indicate
+	 * verification is unsupported.
+	 */
+	if (is_compare_func(test_info.class_function)) {
+		if (!ofi_atomic_swap_handler(op, type))
+			return 1;
+	} else if (is_fetch_func(test_info.class_function)) {
+		if (!ofi_atomic_readwrite_handler(op, type))
+			return 1;
+	} else {
+		if (!ofi_atomic_write_handler(op, type))
+			return 1;
+	}
+
 	if (is_fetch_func(test_info.class_function) ||
 	    is_compare_func(test_info.class_function)) {
 		SWITCH_TYPES(type, CHECK_LOCAL, dst, res, count, ret);
@@ -132,11 +169,11 @@ static int verify_atomic(void)
 	}
 
 	if (is_compare_func(test_info.class_function)) {
-		ofi_atomic_swap_handler(op, type, dst, src, cmp, tmp, count);
+		ofi_atomic_swap_op(op, type, dst, src, cmp, tmp, count);
 	} else if (is_fetch_func(test_info.class_function)) {
-		ofi_atomic_readwrite_handler(op, type, dst, src, tmp, count);
+		ofi_atomic_readwrite_op(op, type, dst, src, tmp, count);
 	} else {
-		ofi_atomic_write_handler(op, type, dst, src, count);
+		ofi_atomic_write_op(op, type, dst, src, count);
 	}
 
 	SWITCH_TYPES(type, CHECK_LOCAL, dst, ft_mr_ctrl.buf, count, ret);
@@ -163,20 +200,49 @@ int ft_verify_bufs()
 		compare_buf = (char *) ft_rx_ctrl.buf;
 	}
 
-	return ft_check_buf(compare_buf, compare_size) ? -FI_EIO : 0;
+	return ft_check_buf(compare_buf, compare_size);
 }
 
 void ft_verify_comp(void *buf)
 {
-	struct fi_cq_data_entry *comp;
-
-	if (ft_rx_ctrl.cq_format != FI_CQ_FORMAT_DATA)
+	struct fi_cq_err_entry *comp = (struct fi_cq_err_entry *) buf;
+
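+	/* fi_cq_err_entry is a superset of every CQ format. Each case checks
+	 * the fields its format adds, then falls through to the checks shared
+	 * with the smaller formats. */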
+	switch (ft_rx_ctrl.cq_format) {
+	case FI_CQ_FORMAT_TAGGED:
+		if ((test_info.test_class & FI_TAGGED) &&
+		    (comp->tag != ft_tx_ctrl.check_tag++))
+			return;
+		/* fall through */
+	case FI_CQ_FORMAT_DATA:
+		if (test_info.msg_flags & FI_REMOTE_CQ_DATA ||
+		    is_data_func(test_info.class_function)) {
+			if (!(comp->flags & FI_REMOTE_CQ_DATA))
+				return;
+			comp->flags &= ~FI_REMOTE_CQ_DATA;
+			if (comp->data != ft_tx_ctrl.remote_cq_data)
+				return;
+		}
+		/* fall through */
+	case FI_CQ_FORMAT_MSG:
+		if (((test_info.test_class & FI_MSG) &&
+		    (comp->flags != (FI_MSG | FI_RECV))) ||
+		    ((test_info.test_class & FI_TAGGED) &&
+		    (comp->flags != (FI_TAGGED | FI_RECV))))
+			return;
+		if ((test_info.test_class & (FI_MSG | FI_TAGGED)) &&
+		    (comp->len != ft_tx_ctrl.msg_size))
+			return;
+		/* fall through */
+	case FI_CQ_FORMAT_CONTEXT:
+		if (test_info.test_class & (FI_MSG | FI_TAGGED)) {
+			ft_rx_ctrl.check_ctx = (++ft_rx_ctrl.check_ctx >=
+			    ft_rx_ctrl.max_credits) ? 0 : ft_rx_ctrl.check_ctx;
+			if (comp->op_context != &(ft_rx_ctrl.ctx[ft_rx_ctrl.check_ctx]))
+				return;
+		}
+		break;
+	default:
 		return;
-
-	comp = (struct fi_cq_data_entry *) buf;
-
-	if (comp->flags & FI_REMOTE_CQ_DATA) {
-		if (comp->data == ft_tx_ctrl.remote_cq_data)
-			ft_ctrl.verify_cnt++;
 	}
+	ft_ctrl.verify_cnt++;
 }
diff --git a/deps/libfabric/fabtests/unit/av_test.c b/deps/libfabric/fabtests/unit/av_test.c
index 81fd4d5ad696b3b5d92b4c80361897f518544eab..edcd2bd8c97a0fff22d7a0c71d5fe46036d34409 100644
--- a/deps/libfabric/fabtests/unit/av_test.c
+++ b/deps/libfabric/fabtests/unit/av_test.c
@@ -515,6 +515,82 @@ fail:
 	return TEST_RET_VAL(ret, testret);
 }
 
+/*
+ * Tests:
+ * - sync vector with 1 good and 1 bad using FI_SYNC_ERR
+ */
+static int
+av_goodbad_vector_sync_err()
+{
+	int testret, ret;
+	struct fid_av *av;
+	struct fi_av_attr attr;
+	uint8_t addrbuf[4096];
+	int buflen;
+	int sync_err[2];
+
+	if (av_type != FI_AV_TABLE) {
+		ret = 0;
+		testret = SKIPPED;
+		sprintf(err_buf, "test not valid for AV type FI_AV_MAP");
+		goto out;
+	}
+
+	testret = FAIL;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.type = av_type;
+	attr.count = 32;
+
+	av = NULL;
+	ret = fi_av_open(domain, &attr, &av, NULL);
+	if (ret != 0) {
+		sprintf(err_buf, "fi_av_open(%s) = %d, %s",
+				fi_tostr(&av_type, FI_TYPE_AV_TYPE),
+				ret, fi_strerror(-ret));
+		goto fail;
+	}
+
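+	/* Prime with wrong values to confirm fi_av_insert writes
+	 * per-entry status. */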
+	sync_err[0] = -1;
+	sync_err[1] = 0;
+
+	buflen = sizeof(addrbuf);
+
+	/* vector is good address + bad address */
+	ret = av_create_address_list(good_address, 0, 1, addrbuf, 0, buflen);
+	if (ret < 0) {
+		goto fail;		// av_create_address_list filled err_buf
+	}
+	ret = av_create_address_list(bad_address, 0, 1, addrbuf, 1, buflen);
+	if (ret < 0) {
+		goto fail;		// av_create_address_list filled err_buf
+	}
+	ret = fi_av_insert(av, addrbuf, 2, NULL, FI_SYNC_ERR, sync_err);
+	if (ret != 1) {
+		if (ret == -FI_EBADFLAGS) {
+			sprintf(err_buf, "FI_SYNC_ERR not supported");
+			ret = -FI_ENOSYS;
+		} else {
+			sprintf(err_buf, "fi_av_insert ret=%d, should be 1", ret);
+		}
+		goto fail;
+	}
+
+	if (sync_err[0] != 0) {
+		sprintf(err_buf, "sync_err[0] != 0");
+		goto fail;
+	}
+	if (sync_err[1] == 0) {
+		sprintf(err_buf, "sync_err[1] = 0");
+		goto fail;
+	}
+
+	testret = PASS;
+fail:
+	FT_CLOSE_FID(av);
+out:
+	return TEST_RET_VAL(ret, testret);
+}
+
 /*
  * Tests:
  * - async good vector
@@ -958,10 +1034,10 @@ struct test_entry test_array_good[] = {
 	TEST_ENTRY(av_good_sync, "Test sync AV insert with good address"),
 	TEST_ENTRY(av_null_fi_addr, "Test AV insert without specifying fi_addr"),
 	TEST_ENTRY(av_good_vector_async,
-			"Test async AV insert with vector of good addresses"),
+		   "Test async AV insert with vector of good addresses"),
 	TEST_ENTRY(av_zero_async, "Test async insert AV insert of zero addresses"),
 	TEST_ENTRY(av_good_2vector_async,
-			"Test async AV inserts with two address vectors"),
+		   "Test async AV inserts with two address vectors"),
 	TEST_ENTRY(av_insert_stages, "Test AV insert at various stages"),
 	{ NULL, "" }
 };
@@ -969,9 +1045,11 @@ struct test_entry test_array_good[] = {
 struct test_entry test_array_bad[] = {
 	TEST_ENTRY(av_bad_sync, "Test sync AV insert of bad address"),
 	TEST_ENTRY(av_goodbad_vector_sync,
-			"Test sync AV insert of 1 good and 1 bad address"),
+		   "Test sync AV insert of 1 good and 1 bad address"),
 	TEST_ENTRY(av_goodbad_vector_async,
-			"Test async AV insert with good and bad address"),
+		   "Test async AV insert with good and bad address"),
+	TEST_ENTRY(av_goodbad_vector_sync_err,
+		   "Test AV insert of 1 good, 1 bad address using FI_SYNC_ERR"),
 	{ NULL, "" }
 };
 
diff --git a/deps/libfabric/fabtests/unit/getinfo_test.c b/deps/libfabric/fabtests/unit/getinfo_test.c
index e02138c4310efd8b5344f47b4340ede21c90c461..31bf406ca3e6b5d44acd549c8b2e4191bd3c8cda 100644
--- a/deps/libfabric/fabtests/unit/getinfo_test.c
+++ b/deps/libfabric/fabtests/unit/getinfo_test.c
@@ -45,7 +45,8 @@
 					    getinfo_ ## name ## _desc)
 
 typedef int (*ft_getinfo_init)(struct fi_info *);
-typedef int (*ft_getinfo_test)(char *, char *, uint64_t, struct fi_info *, struct fi_info **);
+typedef int (*ft_getinfo_test)(char *, char *, uint64_t, struct fi_info *,
+				struct fi_info **);
 typedef int (*ft_getinfo_check)(struct fi_info *);
 typedef int (*ft_getinfo_init_val)(struct fi_info *, uint64_t);
 typedef int (*ft_getinfo_check_val)(struct fi_info *, uint64_t);
@@ -142,7 +143,8 @@ static int validate_bit_combos(char *node, char *service, uint64_t flags,
 			if (check && check(fi, combinations[i])) {
 				FT_DEBUG("%s:failed check for caps [%s]\n",
 					 fi->fabric_attr->prov_name,
-					 fi_tostr(&combinations[i], FI_TYPE_CAPS));
+					 fi_tostr(&combinations[i],
+					 FI_TYPE_CAPS));
 				ret = -FI_EIO;
 			}
 		}
@@ -336,8 +338,10 @@ static int init_valid_rma_RAW_ordering_set_size(struct fi_info *hints)
 			-ret, fi_strerror(-ret));
 		return ret;
 	}
-	if (fi->ep_attr->max_order_raw_size > 0)
-		hints->ep_attr->max_order_raw_size = fi->ep_attr->max_order_raw_size - 1;
+	if (fi->ep_attr->max_order_raw_size > 0) {
+		hints->ep_attr->max_order_raw_size =
+				fi->ep_attr->max_order_raw_size - 1;
+	}
 
 	fi_freeinfo(fi);
 
@@ -369,8 +373,10 @@ static int init_valid_rma_WAR_ordering_set_size(struct fi_info *hints)
 			-ret, fi_strerror(-ret));
 		return ret;
 	}
-	if (fi->ep_attr->max_order_war_size > 0)
-		hints->ep_attr->max_order_war_size = fi->ep_attr->max_order_war_size - 1;
+	if (fi->ep_attr->max_order_war_size > 0) {
+		hints->ep_attr->max_order_war_size =
+				fi->ep_attr->max_order_war_size - 1;
+	}
 
 	fi_freeinfo(fi);
 
@@ -401,8 +407,10 @@ static int init_valid_rma_WAW_ordering_set_size(struct fi_info *hints)
 			-ret, fi_strerror(-ret));
 		return ret;
 	}
-	if (fi->ep_attr->max_order_waw_size > 0)
-		hints->ep_attr->max_order_waw_size = fi->ep_attr->max_order_waw_size - 1;
+	if (fi->ep_attr->max_order_waw_size > 0) {
+		hints->ep_attr->max_order_waw_size =
+				fi->ep_attr->max_order_waw_size - 1;
+	}
 
 	fi_freeinfo(fi);
 
@@ -416,7 +424,8 @@ static int check_valid_rma_ordering_sizes(struct fi_info *info)
 		if (info->ep_attr->max_order_raw_size <= 0)
 			return EXIT_FAILURE;
 		if (hints->ep_attr->max_order_raw_size) {
-			if (info->ep_attr->max_order_raw_size < hints->ep_attr->max_order_raw_size)
+			if (info->ep_attr->max_order_raw_size <
+			    hints->ep_attr->max_order_raw_size)
 				return EXIT_FAILURE;
 		}
 	}
@@ -425,7 +434,8 @@ static int check_valid_rma_ordering_sizes(struct fi_info *info)
 		if (info->ep_attr->max_order_war_size <= 0)
 			return EXIT_FAILURE;
 		if (hints->ep_attr->max_order_war_size) {
-			if (info->ep_attr->max_order_war_size < hints->ep_attr->max_order_war_size)
+			if (info->ep_attr->max_order_war_size <
+			    hints->ep_attr->max_order_war_size)
 				return EXIT_FAILURE;
 		}
 	}
@@ -434,7 +444,8 @@ static int check_valid_rma_ordering_sizes(struct fi_info *info)
 		if (info->ep_attr->max_order_waw_size <= 0)
 			return EXIT_FAILURE;
 		if (hints->ep_attr->max_order_waw_size) {
-			if (info->ep_attr->max_order_waw_size < hints->ep_attr->max_order_waw_size)
+			if (info->ep_attr->max_order_waw_size <
+			    hints->ep_attr->max_order_waw_size)
 				return EXIT_FAILURE;
 		}
 	}
@@ -460,8 +471,10 @@ static int init_invalid_rma_RAW_ordering_size(struct fi_info *hints)
 		return ret;
 	}
 
-	if (fi->ep_attr->max_order_raw_size)
-		hints->ep_attr->max_order_raw_size = fi->ep_attr->max_order_raw_size + 1;
+	if (fi->ep_attr->max_order_raw_size) {
+		hints->ep_attr->max_order_raw_size =
+				fi->ep_attr->max_order_raw_size + 1;
+	}
 
 	fi_freeinfo(fi);
 
@@ -486,8 +499,10 @@ static int init_invalid_rma_WAR_ordering_size(struct fi_info *hints)
 		return ret;
 	}
 
-	if (fi->ep_attr->max_order_war_size)
-		hints->ep_attr->max_order_war_size = fi->ep_attr->max_order_war_size + 1;
+	if (fi->ep_attr->max_order_war_size) {
+		hints->ep_attr->max_order_war_size =
+				fi->ep_attr->max_order_war_size + 1;
+	}
 
 	fi_freeinfo(fi);
 
@@ -512,8 +527,10 @@ static int init_invalid_rma_WAW_ordering_size(struct fi_info *hints)
 		return ret;
 	}
 
-	if (fi->ep_attr->max_order_waw_size)
-		hints->ep_attr->max_order_waw_size = fi->ep_attr->max_order_waw_size + 1;
+	if (fi->ep_attr->max_order_waw_size) {
+		hints->ep_attr->max_order_waw_size =
+				fi->ep_attr->max_order_waw_size + 1;
+	}
 
 	fi_freeinfo(fi);
 
@@ -560,7 +577,8 @@ static int init_mr_unspec(struct fi_info *hints)
 static int test_mr_v1_0(char *node, char *service, uint64_t flags,
 			struct fi_info *test_hints, struct fi_info **info)
 {
-	return fi_getinfo(FI_VERSION(1, 0), node, service, flags, test_hints, info);
+	return fi_getinfo(FI_VERSION(1, 0), node, service, flags,
+			  test_hints, info);
 }
 
 static int check_mr_unspec(struct fi_info *info)
@@ -663,12 +681,61 @@ static int validate_domain_caps(char *node, char *service, uint64_t flags,
 				   init_domain_caps, check_domain_caps);
 }
 
+/* Some apps (MPI) request all fi_info structures, and use the output to
+ * form the hints for a second call.  This usage breaks if the provider
+ * adds a new capability bit that also requires setting a mode or mr_mode
+ * bit (new or otherwise), which the app does not set.
+ * This is really a problem with the app, but avoid a regression
+ * by verifying that providers do not add new requirements for apps that
+ * inadvertently pick up a new capability bit.
+ */
+static int test_caps_regression(char *node, char *service, uint64_t flags,
+		struct fi_info *hints, struct fi_info **info)
+{
+	struct fi_info *fi;
+	int ret;
+
+	ret = fi_getinfo(FT_FIVERSION, node, service, flags, NULL, info);
+	if (ret)
+		return ret;
+
+	if (!hints || !hints->fabric_attr || !hints->fabric_attr->prov_name) {
+		fi = *info;
+	} else {
+		for (fi = *info; fi; fi = fi->next) {
+			if (!strcasecmp(hints->fabric_attr->prov_name,
+					fi->fabric_attr->prov_name))
+				break;
+		}
+	}
+
+	if (!fi)
+		return 0;
+
+	/* Limit mode bits to common, older options only */
+	hints->caps |= fi->caps;
+	hints->mode = FI_CONTEXT;
+	hints->domain_attr->mr_mode = FI_MR_LOCAL | OFI_MR_BASIC_MAP;
+
+	fi_freeinfo(*info);
+	*info = NULL;
+
+	ret = fi_getinfo(FT_FIVERSION, node, service, flags, hints, info);
+	if (ret) {
+		printf("regression: new mode/mr_mode bits required...");
+		return -FI_EINVAL;
+	}
+
+	return 0;
+}
+
 /*
  * getinfo test
  */
 static int getinfo_unit_test(char *node, char *service, uint64_t flags,
-		struct fi_info *base_hints, ft_getinfo_init init, ft_getinfo_test test,
-		ft_getinfo_check check, int ret_exp)
+		struct fi_info *base_hints, ft_getinfo_init init,
+		ft_getinfo_test test, ft_getinfo_check check, int ret_exp)
 {
 	struct fi_info *info = NULL, *fi, *test_hints = NULL;
 	int ret;
@@ -685,10 +752,12 @@ static int getinfo_unit_test(char *node, char *service, uint64_t flags,
 			goto out;
 	}
 
-	if (test)
+	if (test) {
 		ret = test(node, service, flags, test_hints, &info);
-	else
-		ret = fi_getinfo(FT_FIVERSION, node, service, flags, test_hints, &info);
+	} else {
+		ret = fi_getinfo(FT_FIVERSION, node, service, flags,
+				 test_hints, &info);
+	}
 	if (ret) {
 		if (ret == ret_exp) {
 			ret = 0;
@@ -704,8 +773,8 @@ static int getinfo_unit_test(char *node, char *service, uint64_t flags,
 
 	for (fi = info; fi; fi = fi->next) {
 		FT_DEBUG("\nTesting for fabric: %s, domain: %s, endpoint type: %d",
-				fi->fabric_attr->prov_name, fi->domain_attr->name,
-				fi->ep_attr->type);
+			 fi->fabric_attr->prov_name, fi->domain_attr->name,
+			 fi->ep_attr->type);
 		ret = check(fi);
 		if (ret)
 			break;
@@ -716,19 +785,19 @@ out:
 	return ret;
 }
 
-#define getinfo_test(name, num, desc, node, service, flags, hints, init, test, check,	\
-		ret_exp)							\
-char *getinfo_ ## name ## num ## _desc = desc;					\
-static int getinfo_ ## name ## num(void)					\
-{										\
-	int ret, testret = FAIL;						\
-	ret = getinfo_unit_test(node, service, flags, hints, init, test, check,	\
-			ret_exp);						\
-	if (ret)								\
-		goto fail;							\
-	testret = PASS;								\
-fail:										\
-	return TEST_RET_VAL(ret, testret);					\
+#define getinfo_test(name, num, desc, node, service, flags, hints, 	\
+		     init, test, check, ret_exp)			\
+char *getinfo_ ## name ## num ## _desc = desc;				\
+static int getinfo_ ## name ## num(void)				\
+{									\
+	int ret, testret = FAIL;					\
+	ret = getinfo_unit_test(node, service, flags, hints, init,	\
+				test, check, ret_exp);			\
+	if (ret)							\
+		goto fail;						\
+	testret = PASS;							\
+fail:									\
+	return TEST_RET_VAL(ret, testret);				\
 }
 
 /*
@@ -744,85 +813,85 @@ fail:										\
 
 /* 1.1 Source address only tests */
 getinfo_test(no_hints, 1, "Test with no node, service, flags or hints",
-		NULL, NULL, 0, NULL, NULL, NULL, check_srcaddr, 0)
-getinfo_test(no_hints, 2, "Test with node, no service, FI_SOURCE flag and no hints",
-		opts.src_addr ? opts.src_addr : "localhost", NULL, FI_SOURCE,
-		NULL, NULL, NULL, check_srcaddr, 0)
-getinfo_test(no_hints, 3, "Test with service, FI_SOURCE flag and no node or hints",
+	     NULL, NULL, 0, NULL, NULL, NULL, check_srcaddr, 0)
+getinfo_test(no_hints, 2, "Test with node, no service, FI_SOURCE flag, no hints",
+	     opts.src_addr ? opts.src_addr : "localhost", NULL, FI_SOURCE,
+	     NULL, NULL, NULL, check_srcaddr, 0)
+getinfo_test(no_hints, 3, "Test with service, FI_SOURCE flag, no node, no hints",
 		 NULL, opts.src_port, FI_SOURCE, NULL, NULL,
-		 NULL, check_srcaddr, 0)	// TODO should we check for wildcard addr?
-getinfo_test(no_hints, 4, "Test with node, service, FI_SOURCE flags and no hints",
-		opts.src_addr ? opts.src_addr : "localhost", opts.src_port,
-		FI_SOURCE, NULL, NULL, NULL, check_srcaddr, 0)
+		 NULL, check_srcaddr, 0)
+getinfo_test(no_hints, 4, "Test with node, service, FI_SOURCE flag, no hints",
+	     opts.src_addr ? opts.src_addr : "localhost", opts.src_port,
+	     FI_SOURCE, NULL, NULL, NULL, check_srcaddr, 0)
 
 /* 1.2 Source and destination address tests */
-getinfo_test(no_hints, 5, "Test with node, service and no hints",
-		opts.dst_addr ? opts.dst_addr : "localhost", opts.dst_port,
-		0, NULL, NULL, NULL, check_src_dest_addr, 0)
+getinfo_test(no_hints, 5, "Test with node, service, no hints",
+	     opts.dst_addr ? opts.dst_addr : "localhost", opts.dst_port,
+	     0, NULL, NULL, NULL, check_src_dest_addr, 0)
 
-/* 2. Test with hints */
+/* 2. Tests, most with hints */
 /* 2.1 Source address only tests */
 getinfo_test(src, 1, "Test with no node, service, or flags",
-		NULL, NULL, 0, hints, NULL, NULL, check_srcaddr, 0)
+	     NULL, NULL, 0, hints, NULL, NULL, check_srcaddr, 0)
 getinfo_test(src, 2, "Test with node, no service, FI_SOURCE flag",
-		opts.src_addr ? opts.src_addr : "localhost", NULL, FI_SOURCE,
-		hints, NULL, NULL, check_srcaddr, 0)
-getinfo_test(src, 3, "Test with service, FI_SOURCE flag and no node",
-		 NULL, opts.src_port, FI_SOURCE, hints, NULL,
-		 NULL, check_srcaddr, 0)	// TODO should we check for wildcard addr?
-getinfo_test(src, 4, "Test with node, service, FI_SOURCE flags",
-		opts.src_addr ? opts.src_addr : "localhost", opts.src_port,
-		FI_SOURCE, hints, NULL, NULL, check_srcaddr, 0)
+	     opts.src_addr ? opts.src_addr : "localhost", NULL, FI_SOURCE,
+	     hints, NULL, NULL, check_srcaddr, 0)
+getinfo_test(src, 3, "Test with service, FI_SOURCE flag, no node",
+	     NULL, opts.src_port, FI_SOURCE, hints, NULL,
+	     NULL, check_srcaddr, 0)
+getinfo_test(src, 4, "Test with node, service, FI_SOURCE flag",
+	     opts.src_addr ? opts.src_addr : "localhost", opts.src_port,
+	     FI_SOURCE, hints, NULL, NULL, check_srcaddr, 0)
 
 /* 2.2 Source and destination address tests */
 getinfo_test(src_dest, 1, "Test with node, service",
-		opts.dst_addr ? opts.dst_addr : "localhost", opts.dst_port,
-		0, hints, NULL, NULL, check_src_dest_addr, 0)
+	     opts.dst_addr ? opts.dst_addr : "localhost", opts.dst_port,
+	     0, hints, NULL, NULL, check_src_dest_addr, 0)
 
 getinfo_test(src_dest, 2, "Test API version",
-		NULL, NULL, 0, hints, NULL, NULL, check_api_version , 0)
+	     NULL, NULL, 0, hints, NULL, NULL, check_api_version , 0)
 
 /* Negative tests */
 getinfo_test(neg, 1, "Test with non-existent domain name",
-		NULL, NULL, 0, hints, invalid_dom, NULL, NULL, -FI_ENODATA)
+	     NULL, NULL, 0, hints, invalid_dom, NULL, NULL, -FI_ENODATA)
 
 /* Utility provider tests */
 getinfo_test(util, 1, "Test if we get utility provider when requested",
-		NULL, NULL, 0, hints, NULL, NULL, check_util_prov, 0)
+	     NULL, NULL, 0, hints, NULL, NULL, check_util_prov, 0)
 
 /* Message Ordering Tests */
 getinfo_test(msg_ordering, 1, "Test tx ordering bits supported are set",
-		NULL, NULL, 0, hints, NULL, validate_tx_ordering_bits, NULL, 0)
+	     NULL, NULL, 0, hints, NULL, validate_tx_ordering_bits, NULL, 0)
 getinfo_test(msg_ordering, 2, "Test rx ordering bits supported are set",
-		NULL, NULL, 0, hints, NULL, validate_rx_ordering_bits, NULL, 0)
+	     NULL, NULL, 0, hints, NULL, validate_rx_ordering_bits, NULL, 0)
 
 getinfo_test(raw_ordering, 1, "Test rma RAW ordering size is set",
-		NULL, NULL, 0, hints, init_valid_rma_RAW_ordering_no_set_size,
-		NULL, check_valid_rma_ordering_sizes, 0)
+	     NULL, NULL, 0, hints, init_valid_rma_RAW_ordering_no_set_size,
+	     NULL, check_valid_rma_ordering_sizes, 0)
 getinfo_test(raw_ordering, 2, "Test rma RAW ordering size is set to hints",
-		NULL, NULL, 0, hints, init_valid_rma_RAW_ordering_set_size,
-		NULL, check_valid_rma_ordering_sizes, 0)
+	     NULL, NULL, 0, hints, init_valid_rma_RAW_ordering_set_size,
+	     NULL, check_valid_rma_ordering_sizes, 0)
 getinfo_test(war_ordering, 1, "Test rma WAR ordering size is set",
-		NULL, NULL, 0, hints, init_valid_rma_WAR_ordering_no_set_size,
-		NULL, check_valid_rma_ordering_sizes, 0)
+	     NULL, NULL, 0, hints, init_valid_rma_WAR_ordering_no_set_size,
+	     NULL, check_valid_rma_ordering_sizes, 0)
 getinfo_test(war_ordering, 2, "Test rma WAR ordering size is set to hints",
-		NULL, NULL, 0, hints, init_valid_rma_WAR_ordering_set_size,
-		NULL, check_valid_rma_ordering_sizes, 0)
+	     NULL, NULL, 0, hints, init_valid_rma_WAR_ordering_set_size,
+	     NULL, check_valid_rma_ordering_sizes, 0)
 getinfo_test(waw_ordering, 1, "Test rma WAW ordering size is set",
-		NULL, NULL, 0, hints, init_valid_rma_WAW_ordering_no_set_size,
-		NULL, check_valid_rma_ordering_sizes, 0)
+	     NULL, NULL, 0, hints, init_valid_rma_WAW_ordering_no_set_size,
+	     NULL, check_valid_rma_ordering_sizes, 0)
 getinfo_test(waw_ordering, 2, "Test rma WAW ordering size is set to hints",
-		NULL, NULL, 0, hints, init_valid_rma_WAW_ordering_set_size,
-		NULL, check_valid_rma_ordering_sizes, 0)
+	     NULL, NULL, 0, hints, init_valid_rma_WAW_ordering_set_size,
+	     NULL, check_valid_rma_ordering_sizes, 0)
 getinfo_test(bad_raw_ordering, 1, "Test invalid rma RAW ordering size",
-		NULL, NULL, 0, hints, init_invalid_rma_RAW_ordering_size,
-		NULL, NULL, -FI_ENODATA)
+	     NULL, NULL, 0, hints, init_invalid_rma_RAW_ordering_size,
+	     NULL, NULL, -FI_ENODATA)
 getinfo_test(bad_war_ordering, 1, "Test invalid rma WAR ordering size",
-		NULL, NULL, 0, hints, init_invalid_rma_WAR_ordering_size,
-		NULL, NULL, -FI_ENODATA)
+	     NULL, NULL, 0, hints, init_invalid_rma_WAR_ordering_size,
+	     NULL, NULL, -FI_ENODATA)
 getinfo_test(bad_waw_ordering, 1, "Test invalid rma WAW ordering size",
-		NULL, NULL, 0, hints, init_invalid_rma_WAW_ordering_size,
-		NULL, NULL, -FI_ENODATA)
+	     NULL, NULL, 0, hints, init_invalid_rma_WAW_ordering_size,
+	     NULL, NULL, -FI_ENODATA)
 
 /* MR mode tests */
 getinfo_test(mr_mode, 1, "Test FI_MR_BASIC", NULL, NULL, 0,
@@ -834,7 +903,8 @@ getinfo_test(mr_mode, 3, "Test FI_MR_UNSPEC (v1.0)", NULL, NULL, 0,
 getinfo_test(mr_mode, 4, "Test FI_MR_BASIC (v1.0)", NULL, NULL, 0,
 	     hints, init_mr_basic, test_mr_v1_0, check_mr_basic, -FI_ENODATA)
 getinfo_test(mr_mode, 5, "Test FI_MR_SCALABLE (v1.0)", NULL, NULL, 0,
-     	     hints, init_mr_scalable, test_mr_v1_0, check_mr_scalable, -FI_ENODATA)
+     	     hints, init_mr_scalable, test_mr_v1_0, check_mr_scalable,
+	     -FI_ENODATA)
 getinfo_test(mr_mode, 6, "Test mr_mode bits", NULL, NULL, 0,
 	     hints, NULL, validate_mr_modes, NULL, 0)
 
@@ -850,17 +920,20 @@ getinfo_test(progress, 4, "Test ctrl auto progress", NULL, NULL, 0,
 
 /* Capability test */
 getinfo_test(caps, 1, "Test capability bits supported are set",
-		NULL, NULL, 0, hints, NULL, validate_primary_caps, NULL, 0)
+	     NULL, NULL, 0, hints, NULL, validate_primary_caps, NULL, 0)
 getinfo_test(caps, 2, "Test capability with no hints",
-		NULL, NULL, 0, NULL, NULL, NULL, test_null_hints_caps, 0)
+	     NULL, NULL, 0, NULL, NULL, NULL, test_null_hints_caps, 0)
 getinfo_test(caps, 3, "Test domain capabilities", NULL, NULL, 0,
 	     hints, NULL, validate_domain_caps, NULL, 0)
+getinfo_test(caps, 4, "Test for capability bit regression",
+	     NULL, NULL, 0, hints, NULL, test_caps_regression, NULL, 0)
 
 
 static void usage(void)
 {
 	ft_unit_usage("getinfo_test", "Unit tests for fi_getinfo");
-	FT_PRINT_OPTS_USAGE("-e <ep_type>", "Endpoint type: msg|rdm|dgram (default:rdm)");
+	FT_PRINT_OPTS_USAGE("-e <ep_type>",
+			    "Endpoint type: msg|rdm|dgram (default:rdm)");
 	ft_addr_usage();
 }
 
@@ -937,6 +1010,7 @@ int main(int argc, char **argv)
 		TEST_ENTRY_GETINFO(caps1),
 		TEST_ENTRY_GETINFO(caps2),
 		TEST_ENTRY_GETINFO(caps3),
+		TEST_ENTRY_GETINFO(caps4),
 		{ NULL, "" }
 	};
 
@@ -980,10 +1054,10 @@ int main(int argc, char **argv)
 		if (set_prov(hints->fabric_attr->prov_name))
 			return EXIT_FAILURE;
 	} else {
-	       FT_WARN("\nTests getinfo1 to getinfo5 may not run exclusively "
-		       "for a particular provider since we don't pass hints.\n"
-		       "So the failures in any of those tests may not be "
-		       "attributable to a single provider.\n");
+	       FT_WARN("\nSome tests do not pass in hints, and may not run "
+		       "exclusively for a particular provider.\n"
+		       "Failures in any of those tests may not be "
+		       "attributable to a specific provider.\n");
 	}
 
 	failed = run_tests(no_hint_tests, err_buf);
diff --git a/deps/libfabric/fabtests/unit/resource_freeing.c b/deps/libfabric/fabtests/unit/resource_freeing.c
deleted file mode 100644
index 21dbe6ce0db1e6baadf876ef3bb360649595f46c..0000000000000000000000000000000000000000
--- a/deps/libfabric/fabtests/unit/resource_freeing.c
+++ /dev/null
@@ -1,303 +0,0 @@
-/*
- * Copyright (c) 2017 Intel Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <getopt.h>
-#include "shared.h"
-
-#define lengthof(arr) (sizeof(arr) / sizeof(*arr))
-
-enum test_depth {
-	DEPTH_FABRIC,
-	DEPTH_DOMAIN,
-	DEPTH_ENABLE_ENDPOINT
-};
-
-int test_resource_freeing(enum test_depth test_depth,
-		const char *fabric_service)
-{
-	int our_ret = FI_SUCCESS;
-	int ret;
-	uint64_t flags;
-	struct fi_info *info;
-
-	/* Setup fabric */
-
-	hints = fi_allocinfo();
-	if (!hints) {
-		our_ret = -FI_ENOMEM;
-		goto error_return;
-	}
-
-	flags = FI_SOURCE;
-	hints->caps = FI_RMA;
-	hints->ep_attr->type = FI_EP_RDM;
-
-	ret = fi_getinfo(FT_FIVERSION, NULL, fabric_service, flags,
-			 hints, &info);
-	if (ret) {
-		FT_PRINTERR("fi_getinfo", ret);
-		our_ret = ret;
-		goto free_hints;
-	}
-
-	ret = fi_fabric(info->fabric_attr, &fabric, NULL);
-	if (ret) {
-		FT_PRINTERR("fi_fabric", ret);
-		our_ret = ret;
-		goto free_info;
-	}
-
-	if (test_depth == DEPTH_FABRIC) {
-		goto close_fabric;
-	}
-
-	ret = fi_domain(fabric, info, &domain, NULL);
-	if (ret) {
-		FT_PRINTERR("fi_domain", ret);
-		our_ret = ret;
-		goto close_fabric;
-	}
-
-	if (test_depth == DEPTH_DOMAIN) {
-		goto close_domain;
-	}
-
-	/* Create pre-endpoint resources */
-
-	av_attr.type = info->domain_attr->av_type;
-	av_attr.count = 0;
-	av_attr.name = NULL;
-	ret = fi_av_open(domain, &av_attr, &av, NULL);
-	if (ret) {
-		FT_PRINTERR("fi_av_open", ret);
-		our_ret = ret;
-		goto close_domain;
-	}
-
-	cntr_attr.events = FI_CNTR_EVENTS_COMP;
-	cntr_attr.wait_obj = FI_WAIT_UNSPEC;
-	ret = fi_cntr_open(domain, &cntr_attr, &txcntr, NULL);
-	if (ret) {
-		FT_PRINTERR("fi_cntr_open", ret);
-		our_ret = ret;
-		goto close_av;
-	}
-
-	ret = fi_cq_open(domain, &cq_attr, &txcq, NULL);
-	if (ret) {
-		FT_PRINTERR("fi_cq_open", ret);
-		our_ret = ret;
-		goto close_txcntr;
-	}
-
-	ret = fi_endpoint(domain, info, &ep, NULL);
-	if (ret) {
-		FT_PRINTERR("fi_endpoint", ret);
-		our_ret = ret;
-		goto close_txcq;
-	}
-
-	/* Bind pre-endpoint resources to ep */
-
-	ret = fi_ep_bind(ep, &txcntr->fid, FI_WRITE);
-	if (ret) {
-		FT_PRINTERR("fi_ep_bind", ret);
-		our_ret = ret;
-		goto close_ep;
-	}
-
-	ret = fi_ep_bind(ep, &av->fid, 0);
-	if (ret) {
-		FT_PRINTERR("fi_ep_bind", ret);
-		our_ret = ret;
-		goto close_ep;
-	}
-
-	ret = fi_ep_bind(ep, &txcq->fid, FI_TRANSMIT);
-	if (ret) {
-		FT_PRINTERR("fi_ep_bind", ret);
-		our_ret = ret;
-		goto close_ep;
-	}
-
-	/* Enable ep */
-
-	ret = fi_enable(ep);
-	if (ret) {
-		FT_PRINTERR("fi_enable", ret);
-		our_ret = ret;
-		goto close_ep;
-	}
-
-	if (test_depth == DEPTH_ENABLE_ENDPOINT) {
-		goto close_ep;
-	}
-
-close_ep:
-	ret = fi_close(&ep->fid);
-	if (ret) {
-		FT_PRINTERR("fi_close", ret);
-		our_ret = our_ret ? our_ret : ret;
-	}
-
-close_txcq:
-	ret = fi_close(&txcq->fid);
-	if (ret) {
-		FT_PRINTERR("fi_close", ret);
-		our_ret = our_ret ? our_ret : ret;
-	}
-
-close_txcntr:
-	ret = fi_close(&txcntr->fid);
-	if (ret) {
-		FT_PRINTERR("fi_close", ret);
-		our_ret = our_ret ? our_ret : ret;
-	}
-
-close_av:
-	ret = fi_close(&av->fid);
-	if (ret) {
-		FT_PRINTERR("fi_close", ret);
-		our_ret = our_ret ? our_ret : ret;
-	}
-
-close_domain:
-	ret = fi_close(&domain->fid);
-	if (ret) {
-		FT_PRINTERR("fi_close", ret);
-		our_ret = our_ret ? our_ret : ret;
-	}
-
-close_fabric:
-	ret = fi_close(&fabric->fid);
-	if (ret) {
-		FT_PRINTERR("fi_close", ret);
-		our_ret = our_ret ? our_ret : ret;
-	}
-
-free_info:
-	fi_freeinfo(info);
-
-free_hints:
-	fi_freeinfo(hints);
-
-error_return:
-	return our_ret;
-}
-
-void print_test_resource_freeing_call(enum test_depth test_depth, int iter)
-{
-	fprintf(stdout,
-		"Running test_resource_freeing with "
-		"[%s] for %d iterations\n",
-		(test_depth == DEPTH_FABRIC) ? "DEPTH_FABRIC"
-		: (test_depth == DEPTH_DOMAIN) ? "DEPTH_DOMAIN"
-		: (test_depth == DEPTH_ENABLE_ENDPOINT) ? "DEPTH_ENABLE_ENDPOINT"
-		: "(unknown test depth)",
-		iter
-	);
-
-	fflush(stderr);
-	fflush(stdout);
-}
-
-void print_test_resource_freeing_result_call(int success,
-		enum test_depth test_depth,
-		int iter)
-{
-	fprintf(success ? stdout : stderr,
-		"%s: test_resource_freeing %s with "
-		"[%s]\n",
-		success ? "GOOD" : "ERROR",
-		success ? "succeeded" : "failed",
-		(test_depth == DEPTH_FABRIC) ? "DEPTH_FABRIC"
-		: (test_depth == DEPTH_DOMAIN) ? "DEPTH_DOMAIN"
-		: (test_depth == DEPTH_ENABLE_ENDPOINT) ? "DEPTH_ENABLE_ENDPOINT"
-		: "(unknown test depth)"
-	);
-
-	fflush(stderr);
-	fflush(stdout);
-}
-
-int main(int argc, char **argv)
-{
-	int op, i, td_idx, ret = 0, iters = 2, exit_code = 0;
-
-	opts = INIT_OPTS;
-
-	hints = fi_allocinfo();
-	if (!hints)
-		return EXIT_FAILURE;
-
-	while ((op = getopt(argc, argv, "i:h" ADDR_OPTS INFO_OPTS)) != -1) {
-		switch (op) {
-		default:
-			ft_parse_addr_opts(op, optarg, &opts);
-			ft_parseinfo(op, optarg, hints, &opts);
-			break;
-		case 'i':
-			iters = atoi(optarg);
-			break;
-		case '?':
-		case 'h':
-			ft_usage(argv[0], "Test which exercises resource freeing in a provider\n");
-			FT_PRINT_OPTS_USAGE("-i <int>", "number of iterations to test");
-			return EXIT_FAILURE;
-		}
-	}
-
-	enum test_depth test_depth[] = {
-		DEPTH_FABRIC, DEPTH_DOMAIN, DEPTH_ENABLE_ENDPOINT};
-
-	for (td_idx = 0; td_idx < lengthof(test_depth); td_idx += 1) {
-		print_test_resource_freeing_call(
-			test_depth[td_idx], iters);
-		for (i = 0; i < iters; i += 1) {
-			ret = test_resource_freeing(
-				test_depth[td_idx], default_port);
-			if (ret) {
-				exit_code = EXIT_FAILURE;
-				break;
-			}
-		}
-		print_test_resource_freeing_result_call(
-			!ret, /* int success */
-			test_depth[td_idx],
-			i);
-	}
-
-	return ft_exit_code(exit_code);
-}
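The test removed above exercises staged setup and teardown through the classic cascading-goto idiom: each successful setup step gains a matching cleanup label, deeper labels fall through to shallower ones, and the first error code seen is the one reported while later cleanup still runs. A minimal standalone sketch of that idiom, with illustrative stand-in functions rather than real libfabric calls:

```
/* Illustrative stand-ins for fi_fabric()/fi_domain() and their fi_close()
 * counterparts -- not real libfabric calls. */
static int open_a(void)  { return 0; }
static int open_b(void)  { return 0; }
static int close_a(void) { return 0; }
static int close_b(void) { return 0; }

/* Cascading-goto teardown: deeper cleanup falls through to shallower
 * cleanup, and the first error encountered is preserved. */
int setup_use_teardown(void)
{
	int first_err = 0, err;

	err = open_a();
	if (err)
		return err;

	err = open_b();
	if (err) {
		first_err = err;
		goto close_a_only;
	}

	/* ... exercise the resources here ... */

	err = close_b();		/* falls through to close_a_only */
	if (err && !first_err)
		first_err = err;

close_a_only:
	err = close_a();
	if (err && !first_err)
		first_err = err;

	return first_err;
}
```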
diff --git a/deps/libfabric/include/freebsd/osd.h b/deps/libfabric/include/freebsd/osd.h
index e49722846722567f587cc4592490fb46b390d855..c185d90e545d318ffa975d2a3fcca8c9158159e9 100644
--- a/deps/libfabric/include/freebsd/osd.h
+++ b/deps/libfabric/include/freebsd/osd.h
@@ -95,6 +95,62 @@ static inline size_t ofi_process_vm_writev(pid_t pid,
 	return -FI_ENOSYS;
 }
 
+static inline ssize_t ofi_read_socket(SOCKET fd, void *buf, size_t count)
+{
+	return read(fd, buf, count);
+}
+
+static inline ssize_t ofi_write_socket(SOCKET fd, const void *buf, size_t count)
+{
+	return write(fd, buf, count);
+}
+
+static inline ssize_t ofi_recv_socket(SOCKET fd, void *buf, size_t count,
+				      int flags)
+{
+	return recv(fd, buf, count, flags);
+}
+
+static inline ssize_t ofi_recvfrom_socket(SOCKET fd, void *buf, size_t count, int flags,
+					  struct sockaddr *from, socklen_t *fromlen)
+{
+	return recvfrom(fd, buf, count, flags, from, fromlen);
+}
+
+static inline ssize_t ofi_send_socket(SOCKET fd, const void *buf, size_t count,
+				      int flags)
+{
+	return send(fd, buf, count, flags);
+}
+
+static inline ssize_t ofi_sendto_socket(SOCKET fd, const void *buf, size_t count, int flags,
+					const struct sockaddr *to, socklen_t tolen)
+{
+	return sendto(fd, buf, count, flags, to, tolen);
+}
+
+static inline ssize_t ofi_writev_socket(SOCKET fd, struct iovec *iov, size_t iov_cnt)
+{
+	return writev(fd, iov, iov_cnt);
+}
+
+static inline ssize_t ofi_readv_socket(SOCKET fd, struct iovec *iov, int iov_cnt)
+{
+	return readv(fd, iov, iov_cnt);
+}
+
+static inline ssize_t
+ofi_sendmsg_tcp(SOCKET fd, const struct msghdr *msg, int flags)
+{
+	return sendmsg(fd, msg, flags);
+}
+
+static inline ssize_t
+ofi_recvmsg_tcp(SOCKET fd, struct msghdr *msg, int flags)
+{
+	return recvmsg(fd, msg, flags);
+}
+
 #endif /* _FREEBSD_OSD_H_ */
 
 
diff --git a/deps/libfabric/include/linux/osd.h b/deps/libfabric/include/linux/osd.h
index b491108c1d1146a48448c1f84b07b5192fc02e98..1c61993a1dd58a16a678798f0dd2e54771cdbe81 100644
--- a/deps/libfabric/include/linux/osd.h
+++ b/deps/libfabric/include/linux/osd.h
@@ -41,11 +41,17 @@
 #include <sys/mman.h>
 #include <string.h>
 #include <assert.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/socket.h>
 
+#include <linux/errqueue.h>
 #include <ifaddrs.h>
 #include "unix/osd.h"
 #include "rdma/fi_errno.h"
 
+
 static inline int ofi_shm_remap(struct util_shm *shm,
 				size_t newsize, void **mapped)
 {
@@ -125,4 +131,60 @@ static inline size_t ofi_process_vm_writev(pid_t pid,
 		       remote_iov, riovcnt, flags);
 }
 
+static inline ssize_t ofi_read_socket(SOCKET fd, void *buf, size_t count)
+{
+	return read(fd, buf, count);
+}
+
+static inline ssize_t ofi_write_socket(SOCKET fd, const void *buf, size_t count)
+{
+	return write(fd, buf, count);
+}
+
+static inline ssize_t ofi_recv_socket(SOCKET fd, void *buf, size_t count,
+				      int flags)
+{
+	return recv(fd, buf, count, flags);
+}
+
+static inline ssize_t ofi_recvfrom_socket(SOCKET fd, void *buf, size_t count, int flags,
+					  struct sockaddr *from, socklen_t *fromlen)
+{
+	return recvfrom(fd, buf, count, flags, from, fromlen);
+}
+
+static inline ssize_t ofi_send_socket(SOCKET fd, const void *buf, size_t count,
+				      int flags)
+{
+	return send(fd, buf, count, flags);
+}
+
+static inline ssize_t ofi_sendto_socket(SOCKET fd, const void *buf, size_t count, int flags,
+					const struct sockaddr *to, socklen_t tolen)
+{
+	return sendto(fd, buf, count, flags, to, tolen);
+}
+
+static inline ssize_t ofi_writev_socket(SOCKET fd, struct iovec *iov, size_t iov_cnt)
+{
+	return writev(fd, iov, iov_cnt);
+}
+
+static inline ssize_t ofi_readv_socket(SOCKET fd, struct iovec *iov, int iov_cnt)
+{
+	return readv(fd, iov, iov_cnt);
+}
+
+static inline ssize_t
+ofi_sendmsg_tcp(SOCKET fd, const struct msghdr *msg, int flags)
+{
+	return sendmsg(fd, msg, flags);
+}
+
+static inline ssize_t
+ofi_recvmsg_tcp(SOCKET fd, struct msghdr *msg, int flags)
+{
+	return recvmsg(fd, msg, flags);
+}
+
 #endif /* _LINUX_OSD_H_ */
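The FreeBSD and Linux OSD headers above gain identical one-line shims over the POSIX socket calls. The indirection exists so callers compile unchanged on platforms where SOCKET is not a plain file descriptor (notably Windows, which has its own OSD header). A sketch of a caller written against such a shim; `recv_socket()` and `recv_all()` here are illustrative stand-ins, not part of the libfabric API:

```
#include <sys/types.h>
#include <sys/socket.h>

/* Thin shim: on a POSIX build this is just recv(); a Windows build would
 * substitute an implementation with the platform's own types. */
static ssize_t recv_socket(int fd, void *buf, size_t len, int flags)
{
	return recv(fd, buf, len, flags);
}

/* Platform-neutral caller: loops until the full message has arrived. */
static int recv_all(int fd, void *buf, size_t len)
{
	size_t done = 0;
	ssize_t ret;

	while (done < len) {
		ret = recv_socket(fd, (char *) buf + done, len - done, 0);
		if (ret <= 0)
			return -1;	/* error or peer closed */
		done += (size_t) ret;
	}
	return 0;
}
```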
diff --git a/deps/libfabric/include/ofi.h b/deps/libfabric/include/ofi.h
index ee9ee6476849d70225c59cd05ee609d1b4c54256..46d61940724478052f466ec213370dded321d78d 100644
--- a/deps/libfabric/include/ofi.h
+++ b/deps/libfabric/include/ofi.h
@@ -133,8 +133,6 @@ static inline int ofi_val32_ge(uint32_t x, uint32_t y) {
 #define ofi_val64_inrange(start, length, value) \
     ofi_val64_ge(value, start) && ofi_val64_lt(value, start + length)
 
-#define OFI_MAGIC_64 (0x0F1C0DE0F1C0DE64)
-
 #ifndef BIT
 #define BIT(nr) (1UL << (nr))
 #endif
@@ -157,28 +155,24 @@ static inline int ofi_val32_ge(uint32_t x, uint32_t y) {
 
 #define TAB "    "
 
-#define CASEENUMSTR(SYM) \
-	case SYM: { ofi_strcatf(buf, #SYM); break; }
-#define IFFLAGSTR(flags, SYM) \
-	do { if (flags & SYM) ofi_strcatf(buf, #SYM ", "); } while(0)
 #define CASEENUMSTRN(SYM, N) \
 	case SYM: { ofi_strncatf(buf, N, #SYM); break; }
 #define IFFLAGSTRN(flags, SYM, N) \
 	do { if (flags & SYM) ofi_strncatf(buf, N, #SYM ", "); } while(0)
 
-#define ofi_strcatf(dest, ...) \
-	ofi_strncatf(dest, OFI_BUFSIZ, __VA_ARGS__)
 
 /*
  * CPU specific features
  */
+
+/* X86_64 */
 enum {
-	OFI_CLWB_REG		= 2,
+	OFI_CLWB_REG		= 1,
 	OFI_CLWB_BIT		= (1 << 24),
 	OFI_CLFLUSHOPT_REG	= 1,
-	OFI_CLFLUSHOPT_BIT	= (1 << 24),
+	OFI_CLFLUSHOPT_BIT	= (1 << 23),
 	OFI_CLFLUSH_REG		= 3,
-	OFI_CLFLUSH_BIT		= (1 << 23),
+	OFI_CLFLUSH_BIT		= (1 << 19),
 };
 
 int ofi_cpu_supports(unsigned func, unsigned reg, unsigned bit);
@@ -263,14 +257,20 @@ static inline void *ofi_get_page_start(const void *addr, size_t page_size)
 
 static inline void *ofi_get_page_end(const void *addr, size_t page_size)
 {
-	return ofi_get_page_start((const char *) addr + page_size -1, page_size);
+	return (void *)((uintptr_t)ofi_get_page_start((const char *)addr
+			+ page_size, page_size) - 1);
 }
 
 static inline size_t
 ofi_get_page_bytes(const void *addr, size_t len, size_t page_size)
 {
-	return (char *)ofi_get_page_end((const char *) addr + len, page_size) -
-	       (char *)ofi_get_page_start(addr, page_size);
+	char *start = ofi_get_page_start(addr, page_size);
+	char *end = (char *)ofi_get_page_start((const char*)addr + len - 1, page_size)
+		    + page_size;
+	size_t result = end - start;
+
+	assert(result % page_size == 0);
+	return result;
 }
 
 #define FI_TAG_GENERIC	0xAAAAAAAAAAAAAAAAULL
@@ -283,10 +283,13 @@ uint8_t ofi_lsb(uint64_t num);
 
 extern size_t ofi_universe_size;
 
-int ofi_send_allowed(uint64_t caps);
-int ofi_recv_allowed(uint64_t caps);
-int ofi_rma_initiate_allowed(uint64_t caps);
-int ofi_rma_target_allowed(uint64_t caps);
+bool ofi_send_allowed(uint64_t caps);
+bool ofi_recv_allowed(uint64_t caps);
+bool ofi_rma_initiate_allowed(uint64_t caps);
+bool ofi_rma_target_allowed(uint64_t caps);
+bool ofi_needs_tx(uint64_t caps);
+bool ofi_needs_rx(uint64_t caps);
+
 int ofi_ep_bind_valid(const struct fi_provider *prov, struct fid *bfid,
 		      uint64_t flags);
 int ofi_check_rx_mode(const struct fi_info *info, uint64_t flags);
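The rewritten `ofi_get_page_end()`/`ofi_get_page_bytes()` round a byte range outward to page boundaries: the end is now the last byte of its page rather than the start of the next one, and the byte count is the distance between the enclosing page boundaries. A worked, self-checking example of the same arithmetic, assuming 4 KiB pages (helper names are illustrative):

```
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uintptr_t page_start(uintptr_t addr, uintptr_t psz)
{
	return addr & ~(psz - 1);	/* round down to page boundary */
}

static uintptr_t page_end(uintptr_t addr, uintptr_t psz)
{
	return page_start(addr + psz, psz) - 1;	/* last byte of addr's page */
}

static uintptr_t page_bytes(uintptr_t addr, size_t len, uintptr_t psz)
{
	uintptr_t start = page_start(addr, psz);
	uintptr_t end = page_start(addr + len - 1, psz) + psz;

	return end - start;		/* always a page-size multiple */
}

int main(void)
{
	assert(page_start(0x1234, 0x1000) == 0x1000);
	assert(page_end(0x1234, 0x1000) == 0x1fff);
	/* 0x1234..0x3233 touches pages 0x1000, 0x2000, 0x3000 -> 3 pages */
	assert(page_bytes(0x1234, 0x2000, 0x1000) == 0x3000);
	puts("page math ok");
	return 0;
}
```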
diff --git a/deps/libfabric/include/ofi_abi.h b/deps/libfabric/include/ofi_abi.h
index 368109a44d69cc6d9485e68f169b47cf2f693b55..b9008f544fab17fcef1b0dd8e3223d556e14777e 100644
--- a/deps/libfabric/include/ofi_abi.h
+++ b/deps/libfabric/include/ofi_abi.h
@@ -90,7 +90,7 @@ extern "C" {
  * {
  *    ...
  * }
- * DEFAULT_SYMVER(bar_, bar, "MYLIB_1.0");
+ * DEFAULT_SYMVER(bar_, bar, MYLIB_1.0);
  *
  * This function is the main entry point for function foo.
  * int DEFAULT_SYMVER_PRE(foo)(void)
@@ -105,13 +105,13 @@ extern "C" {
  * {
  *    ...
  * }
- * COMPAT_SYMVER(foo_1_0, foo, "MYLIB_1.0");
+ * COMPAT_SYMVER(foo_1_0, foo, MYLIB_1.0);
  *
  * By convention, the name of compatibility functions is the exported function
  * name appended with the ABI version that it is compatible with.
  */
 
-#define CURRENT_ABI "FABRIC_1.3"
+#define CURRENT_ABI "FABRIC_1.6"
 
 #if  HAVE_ALIAS_ATTRIBUTE == 1
 #define DEFAULT_SYMVER_PRE(a) a##_
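The documentation comments above now pass the version tag unquoted, matching how the macros paste it. Underneath, `DEFAULT_SYMVER`/`COMPAT_SYMVER` emit GNU `.symver` directives. A minimal sketch of the raw mechanism, assuming GCC on an ELF target; `bar` and `MYLIB_1.0` are illustrative, and a linker version script defining `MYLIB_1.0` is also required:

```
/* Exports bar_() as the versioned default symbol bar@@MYLIB_1.0; '@@'
 * marks the default version, '@' would mark a compat version. */
int bar_(void)
{
	return 42;
}
__asm__(".symver bar_, bar@@MYLIB_1.0");
```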
diff --git a/deps/libfabric/include/ofi_atom.h b/deps/libfabric/include/ofi_atom.h
index 46d6eb98dec01ce71f5892307045de295546a5b9..0e1af6aa5ef0b82d609e7ed6d43a1251afb53919 100644
--- a/deps/libfabric/include/ofi_atom.h
+++ b/deps/libfabric/include/ofi_atom.h
@@ -159,6 +159,13 @@ typedef atomic_long	ofi_atomic_int64_t;
 							     &expected, desired,			\
 							     memory_order_acq_rel,			\
 							     memory_order_relaxed);			\
+	}												\
+	static inline											\
+	bool ofi_atomic_cas_bool##radix(ofi_atomic##radix##_t *atomic, 					\
+					int##radix##_t expected, 					\
+					int##radix##_t desired)						\
+	{												\
+		return ofi_atomic_cas_bool_strong##radix(atomic, expected, desired);			\
 	}
 
 #elif defined HAVE_BUILTIN_ATOMICS
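The new `ofi_atomic_cas_bool##radix` simply forwards to the strong compare-and-swap, letting providers ask "did the exchange happen?" without tracking the updated expected value. With C11 atomics, the generated 32-bit variant boils down to roughly the following (a sketch, not the literal macro expansion):

```
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static inline bool cas_bool32(_Atomic int32_t *atomic, int32_t expected,
			      int32_t desired)
{
	/* atomic_compare_exchange_strong_explicit rewrites 'expected' on
	 * failure; this wrapper only surfaces the boolean result. */
	return atomic_compare_exchange_strong_explicit(atomic, &expected,
						       desired,
						       memory_order_acq_rel,
						       memory_order_relaxed);
}
```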
diff --git a/deps/libfabric/include/ofi_atomic.h b/deps/libfabric/include/ofi_atomic.h
index fc956945fd3388c818f388714348cf318884f0ea..abff3c2b01ed6bdbc7d43f6e1394b2e800f8df63 100644
--- a/deps/libfabric/include/ofi_atomic.h
+++ b/deps/libfabric/include/ofi_atomic.h
@@ -63,11 +63,18 @@ size_t ofi_datatype_size(enum fi_datatype datatype);
 #define ofi_atomic_isswap_op(op) \
 	(op >= OFI_SWAP_OP_START && op < OFI_SWAP_OP_LAST)
 
-extern void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][FI_DATATYPE_LAST])
+#define OFI_DATATYPE_CNT	(FI_UINT128 + 1)
+
+#ifdef HAVE___INT128
+typedef __int128 ofi_int128_t;
+typedef unsigned __int128 ofi_uint128_t;
+#endif
+
+extern void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][OFI_DATATYPE_CNT])
 			(void *dst, const void *src, size_t cnt);
-extern void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_CNT][FI_DATATYPE_LAST])
+extern void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_CNT][OFI_DATATYPE_CNT])
 			(void *dst, const void *src, void *res, size_t cnt);
-extern void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][FI_DATATYPE_LAST])
+extern void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][OFI_DATATYPE_CNT])
 			(void *dst, const void *src, const void *cmp,
 			 void *res, size_t cnt);
 
diff --git a/deps/libfabric/include/ofi_bitmask.h b/deps/libfabric/include/ofi_bitmask.h
index 067fa38994f305fc53d57a2a1cdbb7bb9a891541..624792b5ad85c624a1f096bc1af5fa8bc55bcde6 100644
--- a/deps/libfabric/include/ofi_bitmask.h
+++ b/deps/libfabric/include/ofi_bitmask.h
@@ -58,35 +58,35 @@ static inline int ofi_bitmask_create(struct bitmask *mask, size_t size)
 	mask->size = size;
 
 	return FI_SUCCESS;
-};
+}
 
 static inline void ofi_bitmask_free(struct bitmask *mask)
 {
 	free(mask->bytes);
 	mask->bytes = NULL;
-};
+}
 
 static inline size_t ofi_bitmask_bytesize(struct bitmask *mask)
 {
 	return (mask->size % 8) ? (mask->size / 8 + 1) : (mask->size / 8);
-};
+}
 
 static inline void ofi_bitmask_unset(struct bitmask *mask, size_t idx)
 {
 	assert(idx <= mask->size);
 	mask->bytes[idx / 8] &= ~(0x01 << (idx % 8));
-};
+}
 
 static inline void ofi_bitmask_set(struct bitmask *mask, size_t idx)
 {
 	assert(idx <= mask->size);
 	mask->bytes[idx / 8] |= (0x01 << (idx % 8));
-};
+}
 
 static inline void ofi_bitmask_set_all(struct bitmask *mask)
 {
 	memset(mask->bytes, 0xff, ofi_bitmask_bytesize(mask));
-};
+}
 
 static inline size_t ofi_bitmask_get_lsbset(struct bitmask mask)
 {
@@ -109,6 +109,6 @@ static inline size_t ofi_bitmask_get_lsbset(struct bitmask mask)
 
 	assert(ret <= (mask.size));
 	return ret;
-};
+}
 
 #endif
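Dropping the stray semicolons after the function bodies silences pedantic-compiler warnings without changing behavior. The set/unset helpers themselves use the usual byte/bit split: bit `idx` lives in byte `idx/8` at position `idx%8`. Shown standalone below (helper names are illustrative):

```
#include <assert.h>
#include <stdint.h>

static void bit_set(uint8_t *bytes, size_t idx)
{
	bytes[idx / 8] |= (uint8_t) (1u << (idx % 8));
}

static void bit_unset(uint8_t *bytes, size_t idx)
{
	bytes[idx / 8] &= (uint8_t) ~(1u << (idx % 8));
}

static int bit_test(const uint8_t *bytes, size_t idx)
{
	return (bytes[idx / 8] >> (idx % 8)) & 1;
}

int main(void)
{
	uint8_t mask[2] = { 0 };

	bit_set(mask, 10);		/* byte 1, bit 2 */
	assert(mask[1] == 0x04 && bit_test(mask, 10));
	bit_unset(mask, 10);
	assert(!bit_test(mask, 10));
	return 0;
}
```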
diff --git a/deps/libfabric/include/ofi_enosys.h b/deps/libfabric/include/ofi_enosys.h
index 5f7148becdf4c430d9b61e7c583761b5806fbd75..671de283e4160a4be76766ffbf2d80216eaaa679 100644
--- a/deps/libfabric/include/ofi_enosys.h
+++ b/deps/libfabric/include/ofi_enosys.h
@@ -56,12 +56,17 @@ static struct fi_ops X = {
 	.bind = fi_no_bind,
 	.control = fi_no_control,
 	.ops_open = fi_no_ops_open,
+	.tostr = fi_no_tostr,
+	.ops_set = fi_no_ops_set,
 };
  */
 int fi_no_bind(struct fid *fid, struct fid *bfid, uint64_t flags);
 int fi_no_control(struct fid *fid, int command, void *arg);
 int fi_no_ops_open(struct fid *fid, const char *name,
 		uint64_t flags, void **ops, void *context);
+int fi_no_tostr(const struct fid *fid, char *buf, size_t len);
+int fi_no_ops_set(struct fid *fid, const char *name, uint64_t flags,
+		  void *ops, void *context);
 
 /*
 static struct fi_ops_fabric X = {
diff --git a/deps/libfabric/include/ofi_epoll.h b/deps/libfabric/include/ofi_epoll.h
index 532b7ae37f471d2bc2d55f68fa60f83a91140286..de50d847004d65489a3f8d36ce04f062032ba14a 100644
--- a/deps/libfabric/include/ofi_epoll.h
+++ b/deps/libfabric/include/ofi_epoll.h
@@ -44,6 +44,19 @@
 #include <ofi_list.h>
 #include <ofi_signal.h>
 
+
+#ifdef HAVE_EPOLL
+#include <sys/epoll.h>
+#define ofi_epollfds_event epoll_event
+#else
+struct ofi_epollfds_event {
+	uint32_t events;
+	union {
+		void *ptr;
+	} data;
+};
+#endif
+
 enum ofi_pollfds_ctl {
 	POLLFDS_CTL_ADD,
 	POLLFDS_CTL_DEL,
@@ -63,28 +76,38 @@ struct ofi_pollfds {
 	int		nfds;
 	struct pollfd	*fds;
 	void		**context;
-	int		index;
 	struct fd_signal signal;
 	struct slist	work_item_list;
 	fastlock_t	lock;
 };
 
 int ofi_pollfds_create(struct ofi_pollfds **pfds);
+int ofi_pollfds_grow(struct ofi_pollfds *pfds, int max_size);
 int ofi_pollfds_add(struct ofi_pollfds *pfds, int fd, uint32_t events,
 		    void *context);
 int ofi_pollfds_mod(struct ofi_pollfds *pfds, int fd, uint32_t events,
 		    void *context);
 int ofi_pollfds_del(struct ofi_pollfds *pfds, int fd);
-int ofi_pollfds_wait(struct ofi_pollfds *pfds, void **contexts,
-		     int max_contexts, int timeout);
+int ofi_pollfds_wait(struct ofi_pollfds *pfds,
+		     struct ofi_epollfds_event *events,
+		     int maxevents, int timeout);
 void ofi_pollfds_close(struct ofi_pollfds *pfds);
 
+/* OS specific */
+void ofi_pollfds_do_add(struct ofi_pollfds *pfds,
+			struct ofi_pollfds_work_item *item);
+int ofi_pollfds_do_mod(struct ofi_pollfds *pfds, int fd, uint32_t events,
+		       void *context);
+void ofi_pollfds_do_del(struct ofi_pollfds *pfds,
+			struct ofi_pollfds_work_item *item);
+
 
 #ifdef HAVE_EPOLL
 #include <sys/epoll.h>
 
 #define OFI_EPOLL_IN  EPOLLIN
 #define OFI_EPOLL_OUT EPOLLOUT
+#define OFI_EPOLL_ERR EPOLLERR
 
 typedef int ofi_epoll_t;
 #define OFI_EPOLL_INVALID -1
@@ -122,19 +145,17 @@ static inline int ofi_epoll_del(int ep, int fd)
 	return epoll_ctl(ep, EPOLL_CTL_DEL, fd, NULL) ? -ofi_syserr() : 0;
 }
 
-static inline int ofi_epoll_wait(int ep, void **contexts, int max_contexts,
-                                int timeout)
+static inline int
+ofi_epoll_wait(int ep, struct ofi_epollfds_event *events,
+	       int maxevents, int timeout)
 {
-	struct epoll_event events[max_contexts];
 	int ret;
-	int i;
 
-	ret = epoll_wait(ep, events, max_contexts, timeout);
+	ret = epoll_wait(ep, (struct epoll_event *) events, maxevents,
+			 timeout);
 	if (ret == -1)
 		return -ofi_syserr();
 
-	for (i = 0; i < ret; i++)
-		contexts[i] = events[i].data.ptr;
 	return ret;
 }
 
@@ -147,6 +168,7 @@ static inline void ofi_epoll_close(int ep)
 
 #define OFI_EPOLL_IN  POLLIN
 #define OFI_EPOLL_OUT POLLOUT
+#define OFI_EPOLL_ERR POLLERR
 
 typedef struct ofi_pollfds *ofi_epoll_t;
 #define OFI_EPOLL_INVALID NULL
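Replacing the bare context array with `struct ofi_epollfds_event` lets `ofi_pollfds_wait()`/`ofi_epoll_wait()` report per-fd event bits, so callers can now distinguish `OFI_EPOLL_ERR` from ordinary readiness instead of receiving only opaque contexts. A Linux-only sketch of the caller-side pattern, using raw epoll:

```
#include <stdio.h>
#include <sys/epoll.h>

#define MAX_EVENTS 8

static void poll_once(int epfd, int timeout_ms)
{
	struct epoll_event events[MAX_EVENTS];
	int n, i;

	n = epoll_wait(epfd, events, MAX_EVENTS, timeout_ms);
	for (i = 0; i < n; i++) {
		void *context = events[i].data.ptr;

		if (events[i].events & EPOLLERR)
			fprintf(stderr, "error on fd for %p\n", context);
		else if (events[i].events & (EPOLLIN | EPOLLOUT))
			printf("ready: %p\n", context);
	}
}
```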
diff --git a/deps/libfabric/include/ofi_hmem.h b/deps/libfabric/include/ofi_hmem.h
index 789b261c376b6997ca8b39d46244c2e1fd81940a..798b261e295e5e25ec7a2d6a208de0ba0f2bf095 100644
--- a/deps/libfabric/include/ofi_hmem.h
+++ b/deps/libfabric/include/ofi_hmem.h
@@ -1,5 +1,6 @@
 /*
  * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ * (C) Copyright 2020-2021 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -40,6 +41,8 @@
 #include <rdma/fi_domain.h>
 #include <stdbool.h>
 
+extern bool ofi_hmem_disable_p2p;
+
 #if HAVE_LIBCUDA
 
 #include <cuda.h>
@@ -52,7 +55,8 @@ const char *ofi_cudaGetErrorName(cudaError_t error);
 const char *ofi_cudaGetErrorString(cudaError_t error);
 CUresult ofi_cuPointerGetAttribute(void *data, CUpointer_attribute attribute,
 				   CUdeviceptr ptr);
-CUresult ofi_cuInit(unsigned int flags);
+cudaError_t ofi_cudaHostRegister(void *ptr, size_t size, unsigned int flags);
+cudaError_t ofi_cudaHostUnregister(void *ptr);
 
 #endif /* HAVE_LIBCUDA */
 
@@ -79,26 +83,83 @@ hsa_status_t ofi_hsa_amd_reg_dealloc_cb(void *ptr,
 					hsa_amd_deallocation_callback_t cb,
 					void *user_data);
 
+hsa_status_t ofi_hsa_amd_memory_lock(void *host_ptr, size_t size,
+				     hsa_agent_t *agents, int num_agents,
+				     void **agent_ptr);
+hsa_status_t ofi_hsa_amd_memory_unlock(void *host_ptr);
+
 #endif /* HAVE_ROCR */
 
-int rocr_memcpy(uint64_t device, void *dest, const void *src, size_t size);
+struct ofi_hmem_ops {
+	bool initialized;
+	int (*init)(void);
+	int (*cleanup)(void);
+	int (*copy_to_hmem)(uint64_t device, void *dest, const void *src,
+			    size_t size);
+	int (*copy_from_hmem)(uint64_t device, void *dest, const void *src,
+			      size_t size);
+	bool (*is_addr_valid)(const void *addr, uint64_t *device, uint64_t *flags);
+	int (*get_handle)(void *dev_buf, void **handle);
+	int (*open_handle)(void **handle, uint64_t device, void **ipc_ptr);
+	int (*close_handle)(void *ipc_ptr);
+	int (*host_register)(void *ptr, size_t size);
+	int (*host_unregister)(void *ptr);
+	int (*get_base_addr)(const void *ptr, void **base, size_t *size);
+	bool (*is_ipc_enabled)(void);
+};
+
+extern struct ofi_hmem_ops hmem_ops[];
+
+int rocr_copy_from_dev(uint64_t device, void *dest, const void *src,
+		       size_t size);
+int rocr_copy_to_dev(uint64_t device, void *dest, const void *src,
+		     size_t size);
 int rocr_hmem_init(void);
 int rocr_hmem_cleanup(void);
-bool rocr_is_addr_valid(const void *addr);
+bool rocr_is_addr_valid(const void *addr, uint64_t *device, uint64_t *flags);
+int rocr_host_register(void *ptr, size_t size);
+int rocr_host_unregister(void *ptr);
 
 int cuda_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size);
 int cuda_copy_from_dev(uint64_t device, void *host, const void *dev, size_t size);
 int cuda_hmem_init(void);
 int cuda_hmem_cleanup(void);
-bool cuda_is_addr_valid(const void *addr);
-
+bool cuda_is_addr_valid(const void *addr, uint64_t *device, uint64_t *flags);
+int cuda_host_register(void *ptr, size_t size);
+int cuda_host_unregister(void *ptr);
+int cuda_dev_register(struct fi_mr_attr *mr_attr, uint64_t *handle);
+int cuda_dev_unregister(uint64_t handle);
+int cuda_get_handle(void *dev_buf, void **handle);
+int cuda_open_handle(void **handle, uint64_t device, void **ipc_ptr);
+int cuda_close_handle(void *ipc_ptr);
+bool cuda_is_ipc_enabled(void);
+bool cuda_is_gdrcopy_enabled(void);
+
+void cuda_gdrcopy_to_dev(uint64_t handle, void *dev,
+			 const void *host, size_t size);
+void cuda_gdrcopy_from_dev(uint64_t handle, void *host,
+			   const void *dev, size_t size);
+int cuda_gdrcopy_hmem_init(void);
+int cuda_gdrcopy_hmem_cleanup(void);
+int cuda_gdrcopy_dev_register(struct fi_mr_attr *mr_attr, uint64_t *handle);
+int cuda_gdrcopy_dev_unregister(uint64_t handle);
+
+#define ZE_MAX_DEVICES 4
 int ze_hmem_copy(uint64_t device, void *dst, const void *src, size_t size);
 int ze_hmem_init(void);
 int ze_hmem_cleanup(void);
-bool ze_is_addr_valid(const void *addr);
+bool ze_is_addr_valid(const void *addr, uint64_t *device, uint64_t *flags);
 int ze_hmem_get_handle(void *dev_buf, void **handle);
 int ze_hmem_open_handle(void **handle, uint64_t device, void **ipc_ptr);
+int ze_hmem_get_shared_handle(int dev_fd, void *dev_buf, int *ze_fd,
+			      void **handle);
+int ze_hmem_open_shared_handle(int dev_fd, void **handle, int *ze_fd,
+			       uint64_t device, void **ipc_ptr);
 int ze_hmem_close_handle(void *ipc_ptr);
+bool ze_hmem_p2p_enabled(void);
+int ze_hmem_get_base_addr(const void *ptr, void **base, size_t *size);
+int ze_hmem_get_id(const void *ptr, uint64_t *id);
+int *ze_hmem_get_dev_fds(int *nfds);
 
 static inline int ofi_memcpy(uint64_t device, void *dest, const void *src,
 			     size_t size)
@@ -132,6 +193,31 @@ static inline int ofi_hmem_no_close_handle(void *ipc_ptr)
 	return -FI_ENOSYS;
 }
 
+static inline int ofi_hmem_register_noop(void *ptr, size_t size)
+{
+	return FI_SUCCESS;
+}
+
+static inline int ofi_hmem_host_unregister_noop(void *ptr)
+{
+	return FI_SUCCESS;
+}
+
+static inline int ofi_hmem_no_base_addr(const void *ptr, void **base, size_t *size)
+{
+	return -FI_ENOSYS;
+}
+
+static inline bool ofi_hmem_no_is_ipc_enabled(void)
+{
+	return false;
+}
+
+static inline bool ofi_hmem_p2p_disabled(void)
+{
+	return ofi_hmem_disable_p2p;
+}
+
 ssize_t ofi_copy_from_hmem_iov(void *dest, size_t size,
 			       enum fi_hmem_iface hmem_iface, uint64_t device,
 			       const struct iovec *hmem_iov,
@@ -140,15 +226,21 @@ ssize_t ofi_copy_from_hmem_iov(void *dest, size_t size,
 ssize_t ofi_copy_to_hmem_iov(enum fi_hmem_iface hmem_iface, uint64_t device,
 			     const struct iovec *hmem_iov,
 			     size_t hmem_iov_count, uint64_t hmem_iov_offset,
-			     void *src, size_t size);
+			     const void *src, size_t size);
 
 int ofi_hmem_get_handle(enum fi_hmem_iface iface, void *dev_buf, void **handle);
 int ofi_hmem_open_handle(enum fi_hmem_iface iface, void **handle,
 			 uint64_t device, void **ipc_ptr);
 int ofi_hmem_close_handle(enum fi_hmem_iface iface, void *ipc_ptr);
+int ofi_hmem_get_base_addr(enum fi_hmem_iface iface, const void *ptr,
+			   void **base, size_t *size);
+bool ofi_hmem_is_initialized(enum fi_hmem_iface iface);
 
 void ofi_hmem_init(void);
 void ofi_hmem_cleanup(void);
 enum fi_hmem_iface ofi_get_hmem_iface(const void *addr);
+int ofi_hmem_host_register(void *ptr, size_t size);
+int ofi_hmem_host_unregister(void *ptr);
+bool ofi_hmem_is_ipc_enabled(enum fi_hmem_iface iface);
 
 #endif /* _OFI_HMEM_H_ */
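Collecting the per-interface entry points into `struct ofi_hmem_ops` turns the copy, registration, and IPC paths into a table dispatch indexed by the memory interface. A reduced sketch of that shape; the struct, table, and function names here are illustrative stand-ins, not the libfabric definitions:

```
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

enum iface { IFACE_SYSTEM, IFACE_GPU, IFACE_CNT };

struct mem_ops {
	bool initialized;
	int (*copy_to)(uint64_t device, void *dest, const void *src,
		       size_t size);
};

static int sys_copy_to(uint64_t device, void *dest, const void *src,
		       size_t size)
{
	(void) device;			/* host memory: device id unused */
	memcpy(dest, src, size);
	return 0;
}

static struct mem_ops mem_ops_table[IFACE_CNT] = {
	[IFACE_SYSTEM] = { .initialized = true, .copy_to = sys_copy_to },
	/* GPU entry would be filled in by its init hook */
};

static int copy_to_mem(enum iface iface, uint64_t device, void *dest,
		       const void *src, size_t size)
{
	if (!mem_ops_table[iface].initialized)
		return -1;		/* interface not available */
	return mem_ops_table[iface].copy_to(device, dest, src, size);
}
```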
diff --git a/deps/libfabric/include/ofi_hook.h b/deps/libfabric/include/ofi_hook.h
index d7a33cb7013b363d10466d1ac59dae213f04cf14..652c9137893a58a7d92a2d82a3db41e62ab3638a 100644
--- a/deps/libfabric/include/ofi_hook.h
+++ b/deps/libfabric/include/ofi_hook.h
@@ -64,6 +64,15 @@ enum ofi_hook_class {
 };
 
 
+/*
+ * Default fi_ops members, can be used to construct custom fi_ops
+ */
+int hook_close(struct fid *fid);
+int hook_bind(struct fid *fid, struct fid *bfid, uint64_t flags);
+int hook_control(struct fid *fid, int command, void *arg);
+int hook_ops_open(struct fid *fid, const char *name,
+		  uint64_t flags, void **ops, void *context);
+
 /*
  * Define hook structs so we can cast from fid to parent using simple cast.
  * This lets us have a single close() call.
@@ -146,10 +155,14 @@ static inline struct fi_provider *hook_to_hprov(const struct fid *fid)
 	return hook_fabric_to_hprov(hook_to_fabric(fid));
 }
 
+struct ofi_ops_flow_ctrl;
+
 struct hook_domain {
 	struct fid_domain domain;
 	struct fid_domain *hdomain;
 	struct hook_fabric *fabric;
+	struct ofi_ops_flow_ctrl *base_ops_flow_ctrl;
+	ssize_t (*base_credit_handler)(struct fid_ep *ep_fid, size_t credits);
 };
 
 int hook_domain(struct fid_fabric *fabric, struct fi_info *info,
@@ -257,6 +270,7 @@ int hook_query_atomic(struct fid_domain *domain, enum fi_datatype datatype,
 		  enum fi_op op, struct fi_atomic_attr *attr, uint64_t flags);
 
 extern struct fi_ops hook_fabric_fid_ops;
+extern struct fi_ops hook_domain_fid_ops;
 extern struct fi_ops_fabric hook_fabric_ops;
 extern struct fi_ops_domain hook_domain_ops;
 extern struct fi_ops_cq hook_cq_ops;
diff --git a/deps/libfabric/include/ofi_indexer.h b/deps/libfabric/include/ofi_indexer.h
index 250f95644f2475366944a5b798b8a1da7f4fdd31..450324c0cea430df9be831de213dfb808d509fb0 100644
--- a/deps/libfabric/include/ofi_indexer.h
+++ b/deps/libfabric/include/ofi_indexer.h
@@ -126,7 +126,7 @@ struct index_map
 
 int ofi_idm_set(struct index_map *idm, int index, void *item);
 void *ofi_idm_clear(struct index_map *idm, int index);
-void ofi_idm_reset(struct index_map *idm);
+void ofi_idm_reset(struct index_map *idm, void (*callback)(void *item));
 
 static inline void *ofi_idm_at(struct index_map *idm, int index)
 {
diff --git a/deps/libfabric/include/ofi_list.h b/deps/libfabric/include/ofi_list.h
index 1b79d0a2d2c871c5c68f592518421786d1cc0cd2..51120000e53c0ba86302dbd02b57bb6ee8da014a 100644
--- a/deps/libfabric/include/ofi_list.h
+++ b/deps/libfabric/include/ofi_list.h
@@ -45,6 +45,12 @@
 #include <ofi_signal.h>
 #include <ofi_lock.h>
 
+
+enum ofi_list_end {
+	OFI_LIST_TAIL,
+	OFI_LIST_HEAD
+};
+
 /*
  * Double-linked list
  */
diff --git a/deps/libfabric/include/ofi_lock.h b/deps/libfabric/include/ofi_lock.h
index 91b8dfadf76bf6997e35945d9c0a2f9cb58281a9..c11ad18f00fb90870745ddd738b62cd9943106a3 100644
--- a/deps/libfabric/include/ofi_lock.h
+++ b/deps/libfabric/include/ofi_lock.h
@@ -133,6 +133,11 @@ static inline void fastlock_release(fastlock_t *lock)
 	assert(!ret);
 }
 
+static inline int fastlock_held(fastlock_t *lock)
+{
+	return lock->in_use;
+}
+
 #else /* !ENABLE_DEBUG */
 
 #  define fastlock_t fastlock_t_
@@ -141,6 +146,7 @@ static inline void fastlock_release(fastlock_t *lock)
 #  define fastlock_acquire(lock) fastlock_acquire_(lock)
 #  define fastlock_tryacquire(lock) fastlock_tryacquire_(lock)
 #  define fastlock_release(lock) fastlock_release_(lock)
+#  define fastlock_held(lock) true
 
 #endif
 
@@ -162,7 +168,7 @@ static inline void ofi_fastlock_acquire_noop(fastlock_t *lock)
 	assert(!lock->in_use);
 	lock->in_use = 1;
 #else
-    (void)lock;
+	(void) lock;
 #endif
 }
 static inline void ofi_fastlock_release_noop(fastlock_t *lock)
@@ -171,7 +177,7 @@ static inline void ofi_fastlock_release_noop(fastlock_t *lock)
 	assert(lock->in_use);
 	lock->in_use = 0;
 #else
-    (void)lock;
+	(void) lock;
 #endif
 }
 
diff --git a/deps/libfabric/include/ofi_mem.h b/deps/libfabric/include/ofi_mem.h
index b3bd951018d7e9cc49b5124462280fc34ded2134..03c79fc637c7d432211b7790e6566978693ed013 100644
--- a/deps/libfabric/include/ofi_mem.h
+++ b/deps/libfabric/include/ofi_mem.h
@@ -66,7 +66,7 @@ enum {
 extern size_t *page_sizes;
 extern size_t num_page_sizes;
 
-static inline long ofi_get_page_size()
+static inline long ofi_get_page_size(void)
 {
 	return ofi_sysconf(_SC_PAGESIZE);
 }
@@ -100,50 +100,50 @@ static inline int ofi_str_dup(const char *src, char **dst)
 /*
  * Buffer pool (free stack) template
  */
-#define FREESTACK_EMPTY	NULL
+#define OFI_FREESTACK_EMPTY	NULL
 
-#define freestack_get_next(user_buf)	((char *)user_buf - sizeof(void *))
-#define freestack_get_user_buf(entry)	((char *)entry + sizeof(void *))
+#define ofi_freestack_get_next(user_buf) ((char *)user_buf - sizeof(void *))
+#define ofi_freestack_get_user_buf(entry) ((char *)entry + sizeof(void *))
 
 #if ENABLE_DEBUG
-#define freestack_init_next(entry)	*((void **)entry) = NULL
-#define freestack_check_next(entry)	assert(*((void **)entry) == NULL)
+#define ofi_freestack_init_next(entry)	*((void **)entry) = NULL
+#define ofi_freestack_check_next(entry)	assert(*((void **)entry) == NULL)
 #else
-#define freestack_init_next(entry)
-#define freestack_check_next(entry)
+#define ofi_freestack_init_next(entry)
+#define ofi_freestack_check_next(entry)
 #endif
 
-#define FREESTACK_HEADER 					\
+#define OFI_FREESTACK_HEADER 					\
 	size_t		size;					\
 	void		*next;					\
 
-#define freestack_isempty(fs)	((fs)->next == FREESTACK_EMPTY)
-#define freestack_push(fs, p)					\
+#define ofi_freestack_isempty(fs) ((fs)->next == OFI_FREESTACK_EMPTY)
+#define ofi_freestack_push(fs, p)				\
 do {								\
-	freestack_check_next(freestack_get_next(p));		\
-	*(void **) (freestack_get_next(p)) = (fs)->next;	\
-	(fs)->next = (freestack_get_next(p));			\
+	ofi_freestack_check_next(ofi_freestack_get_next(p));	\
+	*(void **) (ofi_freestack_get_next(p)) = (fs)->next;	\
+	(fs)->next = (ofi_freestack_get_next(p));		\
 } while (0)
-#define freestack_pop(fs) freestack_pop_impl(fs, (fs)->next)
+#define ofi_freestack_pop(fs) ofi_freestack_pop_impl(fs, (fs)->next)
 
-static inline void* freestack_pop_impl(void *fs, void *fs_next)
+static inline void* ofi_freestack_pop_impl(void *fs, void *fs_next)
 {
 	struct _freestack {
-		FREESTACK_HEADER
+		OFI_FREESTACK_HEADER
 	} *freestack = (struct _freestack *)fs;
-	assert(!freestack_isempty(freestack));
+	assert(!ofi_freestack_isempty(freestack));
 	freestack->next = *((void **)fs_next);
-	freestack_init_next(fs_next);
-	return freestack_get_user_buf(fs_next);
+	ofi_freestack_init_next(fs_next);
+	return ofi_freestack_get_user_buf(fs_next);
 }
 
-#define DECLARE_FREESTACK(entrytype, name)			\
+#define OFI_DECLARE_FREESTACK(entrytype, name)			\
 struct name ## _entry {						\
 	void		*next;					\
 	entrytype	buf;					\
 };								\
 struct name {							\
-	FREESTACK_HEADER					\
+	OFI_FREESTACK_HEADER					\
 	struct name ## _entry	entry[];			\
 };								\
 								\
@@ -158,11 +158,11 @@ name ## _init(struct name *fs, size_t size,			\
 	assert(size == roundup_power_of_two(size));		\
 	assert(sizeof(fs->entry[0].buf) >= sizeof(void *));	\
 	fs->size = size;					\
-	fs->next = FREESTACK_EMPTY;				\
+	fs->next = OFI_FREESTACK_EMPTY;				\
 	for (i = size - 1; i >= 0; i--) {			\
 		if (init)					\
 			init(&fs->entry[i].buf, arg);		\
-		freestack_push(fs, &fs->entry[i].buf);		\
+		ofi_freestack_push(fs, &fs->entry[i].buf);	\
 	}							\
 }								\
 								\
@@ -184,14 +184,16 @@ static inline int name ## _index(struct name *fs,		\
 				 entrytype *entry)		\
 {								\
 	return (int)((struct name ## _entry *)			\
-			(freestack_get_next(entry))		\
+			(ofi_freestack_get_next(entry))		\
 			- (struct name ## _entry *)fs->entry);	\
 }								\
 								\
 static inline void name ## _free(struct name *fs)		\
 {								\
 	free(fs);						\
-}
+}								\
+void dummy ## name (void) /* work-around: consume the trailing ';' at global scope */
+
 
 /*
  * Buffer pool (free stack) template for shared memory regions
@@ -207,9 +209,9 @@ static inline void name ## _free(struct name *fs)		\
 #define smr_freestack_push(fs, local_p)				\
 do {								\
 	void *p = (char **) fs->base_addr +			\
-	    ((char **) freestack_get_next(local_p) -		\
+	    ((char **) ofi_freestack_get_next(local_p) -	\
 		(char **) fs);					\
-	*(void **) freestack_get_next(local_p) = (fs)->next;	\
+	*(void **) ofi_freestack_get_next(local_p) = (fs)->next;\
 	(fs)->next = p;						\
 } while (0)
 #define smr_freestack_pop(fs) smr_freestack_pop_impl(fs, fs->next)
@@ -227,12 +229,12 @@ static inline void* smr_freestack_pop_impl(void *fs, void *next)
 		(char **) freestack->base_addr);
 
 	freestack->next = *((void **)local);
-	freestack_init_next(local);
+	ofi_freestack_init_next(local);
 
-	return freestack_get_user_buf(local);
+	return ofi_freestack_get_user_buf(local);
 }
 
-#define DECLARE_SMR_FREESTACK(entrytype, name)			\
+#define SMR_DECLARE_FREESTACK(entrytype, name)			\
 struct name ## _entry {						\
 	void		*next;					\
 	entrytype	buf;					\
@@ -268,14 +270,15 @@ static inline int name ## _index(struct name *fs,		\
 		entrytype *entry)				\
 {								\
 	return (int)((struct name ## _entry *)			\
-			(freestack_get_next(entry))		\
+			(ofi_freestack_get_next(entry))		\
 			- (struct name ## _entry *)fs->entry);	\
 }								\
 								\
 static inline void name ## _free(struct name *fs)		\
 {								\
 	free(fs);						\
-}
+}								\
+void dummy ## name (void) /* work-around: consume the trailing ';' at global scope */
 
 
 /*
@@ -327,9 +330,11 @@ struct ofi_bufpool_region {
 	void 				*context;
 	struct ofi_bufpool 		*pool;
 	int				flags;
-#ifndef NDEBUG
-	size_t 				use_cnt;
-#endif
+	OFI_DBG_VAR(size_t,		use_cnt)
+};
+
+struct ofi_bufpool_ftr {
+	size_t				magic;
 };
 
 struct ofi_bufpool_hdr {
@@ -339,6 +344,9 @@ struct ofi_bufpool_hdr {
 	} entry;
 	struct ofi_bufpool_region	*region;
 	size_t 				index;
+
+	OFI_DBG_VAR(struct ofi_bufpool_ftr *, ftr)
+	OFI_DBG_VAR(size_t,		magic)
 };
 
 int ofi_bufpool_create_attr(struct ofi_bufpool_attr *attr,
@@ -390,6 +398,9 @@ static inline void ofi_buf_free(void *buf)
 {
 	assert(ofi_buf_region(buf)->use_cnt--);
 	assert(!(ofi_buf_pool(buf)->attr.flags & OFI_BUFPOOL_INDEXED));
+	assert(ofi_buf_hdr(buf)->magic == OFI_MAGIC_SIZE_T);
+	assert(ofi_buf_hdr(buf)->ftr->magic == OFI_MAGIC_SIZE_T);
+
 	slist_insert_head(&ofi_buf_hdr(buf)->entry.slist,
 			  &ofi_buf_pool(buf)->free_list.entries);
 }
@@ -401,13 +412,15 @@ static inline void ofi_ibuf_free(void *buf)
 {
 	struct ofi_bufpool_hdr *buf_hdr;
 
-	assert(ofi_buf_pool(buf)->attr.flags & OFI_BUFPOOL_INDEXED);
-	assert(ofi_buf_region(buf)->use_cnt--);
 	buf_hdr = ofi_buf_hdr(buf);
 
+	assert(ofi_buf_region(buf)->use_cnt--);
+	assert(ofi_buf_pool(buf)->attr.flags & OFI_BUFPOOL_INDEXED);
+	assert(buf_hdr->magic == OFI_MAGIC_SIZE_T);
+	assert(buf_hdr->ftr->magic == OFI_MAGIC_SIZE_T);
+
 	dlist_insert_order(&buf_hdr->region->free_list,
 			   ofi_ibuf_is_lower, &buf_hdr->entry.dlist);
-
 	if (dlist_empty(&buf_hdr->region->entry)) {
 		dlist_insert_order(&buf_hdr->region->pool->free_list.regions,
 				   ofi_ibufpool_region_is_lower,
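The `OFI_FREESTACK_*` rename keeps the same intrusive free-stack layout: each slot stores a `next` pointer immediately ahead of the user-visible buffer, and push/pop just rethread those pointers. A simplified, non-macro rendering of the idea:

```
#include <assert.h>
#include <stddef.h>

struct entry {
	void *next;	/* what ofi_freestack_get_next() locates */
	int buf;	/* user-visible payload */
};

struct stack {
	void *next;	/* top of the free list; NULL == empty */
};

static void fs_push(struct stack *fs, struct entry *e)
{
	e->next = fs->next;
	fs->next = e;
}

static int *fs_pop(struct stack *fs)
{
	struct entry *e = fs->next;

	assert(e);			/* caller checks for empty first */
	fs->next = e->next;
	return &e->buf;			/* hand back only the payload */
}

int main(void)
{
	struct stack fs = { NULL };
	struct entry slots[4];

	for (int i = 0; i < 4; i++)
		fs_push(&fs, &slots[i]);
	*fs_pop(&fs) = 7;		/* pops slots[3], returns its buf */
	return 0;
}
```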
diff --git a/deps/libfabric/include/ofi_mr.h b/deps/libfabric/include/ofi_mr.h
index 149b534611f2db17880db01688fb718e8ef2869e..1dba72204f6330099a14b7860956b685c123ad0d 100644
--- a/deps/libfabric/include/ofi_mr.h
+++ b/deps/libfabric/include/ofi_mr.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2017-2019 Intel Corporation, Inc. All rights reserved.
- * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates.
+ * Copyright (c) 2019-2021 Amazon.com, Inc. or its affiliates.
  *                         All rights reserved.
  * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
@@ -50,6 +50,10 @@
 #include <ofi_tree.h>
 #include <ofi_hmem.h>
 
+
+int ofi_open_mr_cache(uint32_t version, void *attr, size_t attr_len,
+		      uint64_t flags, struct fid **fid, void *context);
+
 struct ofi_mr_info {
 	struct iovec iov;
 	enum fi_hmem_iface iface;
@@ -120,11 +124,21 @@ struct ofi_mr_cache;
 
 union ofi_mr_hmem_info {
 	uint64_t cuda_id;
+	uint64_t ze_id;
+};
+
+enum fi_mm_state {
+	FI_MM_STATE_UNSPEC = 0,
+	FI_MM_STATE_IDLE,
+	FI_MM_STATE_STARTING,
+	FI_MM_STATE_RUNNING,
+	FI_MM_STATE_STOPPING,
 };
 
 struct ofi_mem_monitor {
 	struct dlist_entry		list;
 	enum fi_hmem_iface		iface;
+	enum fi_mm_state                state;
 
 	void (*init)(struct ofi_mem_monitor *monitor);
 	void (*cleanup)(struct ofi_mem_monitor *monitor);
@@ -151,6 +165,8 @@ void ofi_monitor_init(struct ofi_mem_monitor *monitor);
 void ofi_monitor_cleanup(struct ofi_mem_monitor *monitor);
 void ofi_monitors_init(void);
 void ofi_monitors_cleanup(void);
+int ofi_monitor_import(struct fid *fid);
+
 int ofi_monitors_add_cache(struct ofi_mem_monitor **monitors,
 			   struct ofi_mr_cache *cache);
 void ofi_monitors_del_cache(struct ofi_mr_cache *cache);
@@ -168,6 +184,7 @@ void ofi_monitor_unsubscribe(struct ofi_mem_monitor *monitor,
 extern struct ofi_mem_monitor *default_monitor;
 extern struct ofi_mem_monitor *default_cuda_monitor;
 extern struct ofi_mem_monitor *default_rocr_monitor;
+extern struct ofi_mem_monitor *default_ze_monitor;
 
 /*
  * Userfault fd memory monitor
@@ -191,8 +208,9 @@ struct ofi_memhooks {
 extern struct ofi_mem_monitor *memhooks_monitor;
 
 extern struct ofi_mem_monitor *cuda_monitor;
-
 extern struct ofi_mem_monitor *rocr_monitor;
+extern struct ofi_mem_monitor *ze_monitor;
+extern struct ofi_mem_monitor *import_monitor;
 
 /*
  * Used to store registered memory regions into a lookup map.  This
@@ -262,41 +280,20 @@ struct ofi_mr_cache_params {
 	char *				monitor;
 	int				cuda_monitor_enabled;
 	int				rocr_monitor_enabled;
+	int				ze_monitor_enabled;
 };
 
 extern struct ofi_mr_cache_params	cache_params;
 
 struct ofi_mr_entry {
 	struct ofi_mr_info		info;
-	void				*storage_context;
+	struct ofi_rbnode		*node;
 	int				use_cnt;
 	struct dlist_entry		list_entry;
 	union ofi_mr_hmem_info		hmem_info;
 	uint8_t				data[];
 };
 
-enum ofi_mr_storage_type {
-	OFI_MR_STORAGE_DEFAULT = 0,
-	OFI_MR_STORAGE_RBT,
-	OFI_MR_STORAGE_USER,
-};
-
-struct ofi_mr_storage {
-	enum ofi_mr_storage_type	type;
-	void				*storage;
-
-	struct ofi_mr_entry *		(*find)(struct ofi_mr_storage *storage,
-						const struct ofi_mr_info *key);
-	struct ofi_mr_entry *		(*overlap)(struct ofi_mr_storage *storage,
-						const struct iovec *key);
-	int				(*insert)(struct ofi_mr_storage *storage,
-						struct ofi_mr_info *key,
-						struct ofi_mr_entry *entry);
-	int				(*erase)(struct ofi_mr_storage *storage,
-						struct ofi_mr_entry *entry);
-	void				(*destroy)(struct ofi_mr_storage *storage);
-};
-
 #define OFI_HMEM_MAX 4
 
 struct ofi_mr_cache {
@@ -305,9 +302,9 @@ struct ofi_mr_cache {
 	struct dlist_entry		notify_entries[OFI_HMEM_MAX];
 	size_t				entry_data_size;
 
-	struct ofi_mr_storage		storage;
+	struct ofi_rbmap		tree;
 	struct dlist_entry		lru_list;
-	struct dlist_entry		flush_list;
+	struct dlist_entry		dead_region_list;
 	pthread_mutex_t 		lock;
 
 	size_t				cached_cnt;
@@ -333,10 +330,17 @@ void ofi_mr_cache_cleanup(struct ofi_mr_cache *cache);
 
 void ofi_mr_cache_notify(struct ofi_mr_cache *cache, const void *addr, size_t len);
 
+static inline bool ofi_mr_cache_full(struct ofi_mr_cache *cache)
+{
+	return (cache->cached_cnt >= cache_params.max_cnt) ||
+	       (cache->cached_size >= cache_params.max_size);
+}
+
 bool ofi_mr_cache_flush(struct ofi_mr_cache *cache, bool flush_lru);
 
 int ofi_mr_cache_search(struct ofi_mr_cache *cache, const struct fi_mr_attr *attr,
 			struct ofi_mr_entry **entry);
+
 /**
  * Given an attr (with an iov range), if the iov range is already registered,
  * return the corresponding ofi_mr_entry. Otherwise, return NULL.
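With the pluggable `ofi_mr_storage` gone, the cache keys entries in a fixed red-black map and tracks each entry's tree node directly; `ofi_mr_cache_full()` gates insertion on the configured count and size limits. A sketch of the resulting insert-with-eviction control flow, using illustrative stand-ins for the real cache and LRU helpers:

```
#include <stdbool.h>
#include <stddef.h>

struct cache {
	size_t cached_cnt, cached_size;
	size_t max_cnt, max_size;
};

static bool cache_full(struct cache *c)
{
	return (c->cached_cnt >= c->max_cnt) ||
	       (c->cached_size >= c->max_size);
}

/* Illustrative stand-in for evicting one least-recently-used entry;
 * returns false when nothing is evictable. */
static bool cache_flush_lru(struct cache *c)
{
	if (!c->cached_cnt)
		return false;
	c->cached_cnt--;
	c->cached_size = c->cached_size > 4096 ? c->cached_size - 4096 : 0;
	return true;
}

static int cache_insert(struct cache *c, size_t len)
{
	while (cache_full(c)) {		/* evict until under the limits */
		if (!cache_flush_lru(c))
			return -1;	/* over limit, nothing evictable */
	}
	c->cached_cnt++;
	c->cached_size += len;
	return 0;
}
```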
diff --git a/deps/libfabric/include/ofi_net.h b/deps/libfabric/include/ofi_net.h
index c37805cd765a08f6205944c42ae114191711c02b..df36b9ee1bd12a1e3dfedaf5e3827afd83c22d70 100644
--- a/deps/libfabric/include/ofi_net.h
+++ b/deps/libfabric/include/ofi_net.h
@@ -92,6 +92,14 @@ static inline uint64_t ntohll(uint64_t x) { return x; }
 #endif
 #endif
 
+#ifdef MSG_ZEROCOPY
+#define OFI_ZEROCOPY MSG_ZEROCOPY
+#define OFI_ZEROCOPY_SIZE 9000 /* arbitrary threshold, based on documentation */
+#else
+#define OFI_ZEROCOPY 0
+#define OFI_ZEROCOPY_SIZE SIZE_MAX
+#endif
+
 
 static inline int ofi_recvall_socket(SOCKET sock, void *buf, size_t len)
 {
@@ -117,6 +125,170 @@ static inline int ofi_sendall_socket(SOCKET sock, const void *buf, size_t len)
 
 int ofi_discard_socket(SOCKET sock, size_t len);
 
+/*
+ * Byte queue - streaming socket staging buffer
+ */
+enum {
+	OFI_BYTEQ_SIZE = 9000, /* Hard-coded max, good for 6 1500B buffers */
+};
+
+struct ofi_byteq {
+	size_t size;
+	unsigned int head;
+	unsigned int tail;
+	uint8_t data[OFI_BYTEQ_SIZE];
+};
+
+static inline void ofi_byteq_init(struct ofi_byteq *byteq, ssize_t size)
+{
+	memset(byteq, 0, sizeof *byteq);
+	if (size > OFI_BYTEQ_SIZE)
+		byteq->size = OFI_BYTEQ_SIZE;
+	else if (size >= 0)
+		byteq->size = size;
+	else
+		byteq->size = 0;
+}
+
+static inline void ofi_byteq_discard(struct ofi_byteq *byteq)
+{
+	byteq->head = 0;
+	byteq->tail = 0;
+}
+
+static inline size_t ofi_byteq_readable(struct ofi_byteq *byteq)
+{
+	return byteq->tail - byteq->head;
+}
+
+static inline size_t ofi_byteq_writeable(struct ofi_byteq *byteq)
+{
+	return byteq->size - byteq->tail;
+}
+
+static inline size_t
+ofi_byteq_read(struct ofi_byteq *byteq, void *buf, size_t len)
+{
+	size_t avail;
+
+	avail = ofi_byteq_readable(byteq);
+	if (!avail)
+		return 0;
+
+	if (len < avail) {
+		memcpy(buf, &byteq->data[byteq->head], len);
+		byteq->head += len;
+		return len;
+	}
+
+	memcpy(buf, &byteq->data[byteq->head], avail);
+	byteq->head = 0;
+	byteq->tail = 0;
+	return avail;
+}
+
+static inline void
+ofi_byteq_write(struct ofi_byteq *byteq, const void *buf, size_t len)
+{
+	assert(len <= ofi_byteq_writeable(byteq));
+	memcpy(&byteq->data[byteq->tail], buf, len);
+	byteq->tail += len;
+}
+
+void ofi_byteq_writev(struct ofi_byteq *byteq, const struct iovec *iov,
+		      size_t cnt);
+
+static inline ssize_t ofi_byteq_recv(struct ofi_byteq *byteq, SOCKET sock)
+{
+	size_t avail;
+	ssize_t ret;
+
+	avail = ofi_byteq_writeable(byteq);
+	assert(avail);
+	ret = ofi_recv_socket(sock, &byteq->data[byteq->tail], avail,
+			      MSG_NOSIGNAL);
+	if (ret > 0)
+		byteq->tail += ret;
+	return ret;
+}
+
+size_t ofi_byteq_readv(struct ofi_byteq *byteq, struct iovec *iov,
+		       size_t cnt, size_t offset);
+
+static inline ssize_t ofi_byteq_send(struct ofi_byteq *byteq, SOCKET sock)
+{
+	size_t avail;
+	ssize_t ret;
+
+	avail = ofi_byteq_readable(byteq);
+	assert(avail);
+	ret = ofi_send_socket(sock, &byteq->data[byteq->head], avail,
+			      MSG_NOSIGNAL);
+	if (ret == avail) {
+		byteq->head = 0;
+		byteq->tail = 0;
+	} else if (ret > 0) {
+		byteq->head += ret;
+	}
+	return ret;
+}
+
+
+/*
+ * Buffered socket - socket with send/receive staging buffers.
+ */
+struct ofi_bsock {
+	SOCKET sock;
+	struct ofi_byteq sq;
+	struct ofi_byteq rq;
+	size_t zerocopy_size;
+	uint32_t async_index;
+	uint32_t done_index;
+};
+
+static inline void
+ofi_bsock_init(struct ofi_bsock *bsock, ssize_t sbuf_size, ssize_t rbuf_size)
+{
+	bsock->sock = INVALID_SOCKET;
+	ofi_byteq_init(&bsock->sq, sbuf_size);
+	ofi_byteq_init(&bsock->rq, rbuf_size);
+	bsock->zerocopy_size = SIZE_MAX;
+
+	/* first async op will wrap back to 0 as the starting index */
+	bsock->async_index = UINT32_MAX;
+	bsock->done_index = UINT32_MAX;
+}
+
+static inline void ofi_bsock_discard(struct ofi_bsock *bsock)
+{
+	ofi_byteq_discard(&bsock->rq);
+	ofi_byteq_discard(&bsock->sq);
+}
+
+static inline size_t ofi_bsock_readable(struct ofi_bsock *bsock)
+{
+	return ofi_byteq_readable(&bsock->rq);
+}
+
+static inline size_t ofi_bsock_tosend(struct ofi_bsock *bsock)
+{
+	return ofi_byteq_readable(&bsock->sq);
+}
+
+ssize_t ofi_bsock_flush(struct ofi_bsock *bsock);
+/* For sends started asynchronously, the return value will be -EINPROGRESS,
+ * and len will be set to the number of bytes that were queued.
+ */
+ssize_t ofi_bsock_send(struct ofi_bsock *bsock, const void *buf, size_t *len);
+ssize_t ofi_bsock_sendv(struct ofi_bsock *bsock, const struct iovec *iov,
+			size_t cnt, size_t *len);
+ssize_t ofi_bsock_recv(struct ofi_bsock *bsock, void *buf, size_t len);
+ssize_t ofi_bsock_recvv(struct ofi_bsock *bsock, struct iovec *iov,
+			size_t cnt);
+uint32_t ofi_bsock_async_done(const struct fi_provider *prov,
+			      struct ofi_bsock *bsock);
+
+
 /*
  * Address utility functions
  */
@@ -229,45 +401,60 @@ static inline int ofi_translate_addr_format(int family)
 	}
 }
 
+static inline size_t ofi_sizeof_addr_format(int format)
+{
+	switch (format) {
+	case FI_SOCKADDR_IN:
+		return sizeof(struct sockaddr_in);
+	case FI_SOCKADDR_IN6:
+		return sizeof(struct sockaddr_in6);
+	case FI_SOCKADDR_IB:
+		return sizeof(struct ofi_sockaddr_ib);
+	default:
+		FI_WARN(&core_prov, FI_LOG_CORE, "Unsupported address format\n");
+		return 0;
+	}
+}
+
 uint16_t ofi_get_sa_family(const struct fi_info *info);
 
-static inline int ofi_sin_is_any_addr(struct sockaddr *sa)
+static inline bool ofi_sin_is_any_addr(const struct sockaddr *sa)
 {
 	struct in_addr ia_any = {
 		.s_addr = INADDR_ANY,
 	};
 
 	if (!sa)
-		return 0;
+		return false;
 
 	return !memcmp(&ofi_sin_addr(sa).s_addr, &ia_any, sizeof(ia_any));
 
 }
 
-static inline int ofi_sin6_is_any_addr(struct sockaddr *sa)
+static inline bool ofi_sin6_is_any_addr(const struct sockaddr *sa)
 {
 	struct in6_addr ia6_any = IN6ADDR_ANY_INIT;
 
 	if (!sa)
-		return 0;
+		return false;
 
 	return !memcmp(&ofi_sin6_addr(sa), &ia6_any, sizeof(ia6_any));
 }
 
-static inline int ofi_sib_is_any_addr(struct sockaddr *sa)
+static inline bool ofi_sib_is_any_addr(const struct sockaddr *sa)
 {
 	struct in6_addr ia6_any = IN6ADDR_ANY_INIT;
 
 	if (!sa)
-		return 0;
+		return false;
 
 	return !memcmp(&ofi_sib_addr(sa), &ia6_any, sizeof(ia6_any));
 }
 
-static inline int ofi_is_any_addr(struct sockaddr *sa)
+static inline bool ofi_is_any_addr(const struct sockaddr *sa)
 {
 	if (!sa)
-		return 0;
+		return false;
 
 	switch(sa->sa_family) {
 	case AF_INET:
@@ -278,7 +465,7 @@ static inline int ofi_is_any_addr(struct sockaddr *sa)
 		return ofi_sib_is_any_addr(sa);
 	default:
 		FI_WARN(&core_prov, FI_LOG_CORE, "Unknown address format!\n");
-		return 0;
+		return false;
 	}
 }
 
@@ -296,7 +483,6 @@ static inline uint16_t ofi_addr_get_port(const struct sockaddr *addr)
 		return (uint16_t)ntohll(((const struct ofi_sockaddr_ib *)addr)->sib_sid);
 	default:
 		FI_WARN(&core_prov, FI_LOG_FABRIC, "Unknown address format\n");
-		assert(0);
 		return 0;
 	}
 }
@@ -337,11 +523,16 @@ static inline void * ofi_get_ipaddr(const struct sockaddr *addr)
 	}
 }
 
-static inline int ofi_equals_ipaddr(const struct sockaddr *addr1,
+static inline bool ofi_valid_dest_ipaddr(const struct sockaddr *addr)
+{
+	return ofi_addr_get_port(addr) && !ofi_is_any_addr(addr);
+}
+
+static inline bool ofi_equals_ipaddr(const struct sockaddr *addr1,
 				    const struct sockaddr *addr2)
 {
 	if (addr1->sa_family != addr2->sa_family)
-		return 0;
+		return false;
 
 	switch (addr1->sa_family) {
 	case AF_INET:
@@ -354,19 +545,19 @@ static inline int ofi_equals_ipaddr(const struct sockaddr *addr1,
 	        return !memcmp(&ofi_sib_addr(addr1), &ofi_sib_addr(addr2),
 				sizeof(ofi_sib_addr(addr1)));
 	default:
-		return 0;
+		return false;
 	}
 }
 
-static inline int ofi_equals_sockaddr(const struct sockaddr *addr1,
-				      const struct sockaddr *addr2)
+static inline bool ofi_equals_sockaddr(const struct sockaddr *addr1,
+				       const struct sockaddr *addr2)
 {
         return (ofi_addr_get_port(addr1) == ofi_addr_get_port(addr2)) &&
 		ofi_equals_ipaddr(addr1, addr2);
 }
 
-int ofi_is_wildcard_listen_addr(const char *node, const char *service,
-				uint64_t flags, const struct fi_info *hints);
+bool ofi_is_wildcard_listen_addr(const char *node, const char *service,
+				 uint64_t flags, const struct fi_info *hints);
 
 size_t ofi_mask_addr(struct sockaddr *maskaddr, const struct sockaddr *srcaddr,
 		     const struct sockaddr *netmask);
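The new `ofi_byteq` is deliberately not a ring: writes append at the tail, reads consume from the head, and a fully drained queue snaps both indices back to zero. Data therefore never wraps, and every socket send or receive can stage through one contiguous memcpy. A standalone, self-checking sketch of that discipline:

```
#include <assert.h>
#include <stdint.h>
#include <string.h>

#define Q_SIZE 16

struct byteq {
	unsigned int head, tail;
	uint8_t data[Q_SIZE];
};

static size_t q_write(struct byteq *q, const void *buf, size_t len)
{
	size_t room = Q_SIZE - q->tail;	/* writeable = size - tail */

	if (len > room)
		len = room;
	memcpy(&q->data[q->tail], buf, len);
	q->tail += len;
	return len;
}

static size_t q_read(struct byteq *q, void *buf, size_t len)
{
	size_t avail = q->tail - q->head;	/* readable = tail - head */

	if (len > avail)
		len = avail;
	memcpy(buf, &q->data[q->head], len);
	q->head += len;
	if (q->head == q->tail)		/* drained: reset, never wrap */
		q->head = q->tail = 0;
	return len;
}

int main(void)
{
	struct byteq q = { 0 };
	char out[8];

	q_write(&q, "abcdef", 6);
	assert(q_read(&q, out, 4) == 4);	/* "abcd", head = 4 */
	assert(q_read(&q, out, 4) == 2);	/* "ef", queue resets */
	assert(q.head == 0 && q.tail == 0);
	return 0;
}
```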
diff --git a/deps/libfabric/include/ofi_osd.h b/deps/libfabric/include/ofi_osd.h
index 97be08b7bdb5b571bb111464f334ee20c2e8b7c0..31bbc08da3033f8c1d3a58b25fd1d7379b6b02d8 100644
--- a/deps/libfabric/include/ofi_osd.h
+++ b/deps/libfabric/include/ofi_osd.h
@@ -107,4 +107,18 @@ static inline int ofi_detect_endianness(void)
 	}
 }
 
+#define OFI_MAGIC_64 (0x0F1C0DE0F1C0DE64)
+#define OFI_MAGIC_PTR ((void *) (uintptr_t) OFI_MAGIC_64)
+#define OFI_MAGIC_SIZE_T ((size_t) OFI_MAGIC_64)
+
+#ifndef NDEBUG
+#define OFI_DBG_VAR(type, name) type name;
+#define OFI_DBG_SET(name, val) name = val
+#define OFI_DBG_ADD(name, val) name += val
+#else
+#define OFI_DBG_VAR(type, name)
+#define OFI_DBG_SET(name, val)
+#define OFI_DBG_ADD(name, val)
+#endif
+
 #endif /* _OFI_OSD_H_ */
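`OFI_DBG_VAR`/`OFI_DBG_SET`/`OFI_DBG_ADD` let structs carry debug-only bookkeeping (like the bufpool magic values and use counts elsewhere in this patch) that compiles away entirely under NDEBUG. The pattern in isolation, with illustrative names:

```
#include <stddef.h>

#ifndef NDEBUG
#define DBG_VAR(type, name)	type name;
#define DBG_SET(name, val)	name = val
#else
#define DBG_VAR(type, name)
#define DBG_SET(name, val)
#endif

struct buffer {
	void *data;
	DBG_VAR(size_t, use_cnt)	/* exists only in debug builds */
};

static void buffer_reset(struct buffer *buf)
{
	buf->data = NULL;
	DBG_SET(buf->use_cnt, 0);	/* expands to nothing under NDEBUG */
}
```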
diff --git a/deps/libfabric/include/ofi_prov.h b/deps/libfabric/include/ofi_prov.h
index c4a63cbad202e6c4eaa425144c92e37f13e8034f..ff9c1fbe1bcdce57b17ad0aa693747b2e41f21ac 100644
--- a/deps/libfabric/include/ofi_prov.h
+++ b/deps/libfabric/include/ofi_prov.h
@@ -103,6 +103,17 @@ PSM2_INI ;
 #  define PSM2_INIT NULL
 #endif
 
+#if (HAVE_PSM3) && (HAVE_PSM3_DL)
+#  define PSM3_INI FI_EXT_INI
+#  define PSM3_INIT NULL
+#elif (HAVE_PSM3)
+#  define PSM3_INI INI_SIG(fi_psm3_ini)
+#  define PSM3_INIT fi_psm3_ini()
+PSM3_INI ;
+#else
+#  define PSM3_INIT NULL
+#endif
+
 #if (HAVE_SOCKETS) && (HAVE_SOCKETS_DL)
 #  define SOCKETS_INI FI_EXT_INI
 #  define SOCKETS_INIT NULL
diff --git a/deps/libfabric/include/ofi_rbuf.h b/deps/libfabric/include/ofi_rbuf.h
index 5f2e6e38becdaf3af5bf5ab57372c1a22348dddc..3d8e1c77cdb1344dcb5f6eaebe7bae4ae0f55ebd 100644
--- a/deps/libfabric/include/ofi_rbuf.h
+++ b/deps/libfabric/include/ofi_rbuf.h
@@ -82,7 +82,8 @@ static inline struct name * name ## _create(size_t size)	\
 static inline void name ## _free(struct name *cq)		\
 {								\
 	free(cq);						\
-}
+}								\
+void dummy ## name (void) /* work-around: consume the trailing ';' at global scope */
 
 #define ofi_cirque_isempty(cq)		((cq)->wcnt == (cq)->rcnt)
 #define ofi_cirque_usedcnt(cq)		((cq)->wcnt - (cq)->rcnt)
@@ -91,8 +92,10 @@ static inline void name ## _free(struct name *cq)		\
 
 #define ofi_cirque_rindex(cq)		((cq)->rcnt & (cq)->size_mask)
 #define ofi_cirque_windex(cq)		((cq)->wcnt & (cq)->size_mask)
+#define ofi_cirque_tindex(cq)		(((cq)->wcnt - 1) & (cq)->size_mask)
 #define ofi_cirque_head(cq)		(&(cq)->buf[ofi_cirque_rindex(cq)])
-#define ofi_cirque_tail(cq)		(&(cq)->buf[ofi_cirque_windex(cq)])
+#define ofi_cirque_tail(cq)		(&(cq)->buf[ofi_cirque_tindex(cq)])
+#define ofi_cirque_next(cq)		(&(cq)->buf[ofi_cirque_windex(cq)])
 #define ofi_cirque_insert(cq, x)	(cq)->buf[(cq)->wcnt++ & (cq)->size_mask] = x
 #define ofi_cirque_remove(cq)		(&(cq)->buf[(cq)->rcnt++ & (cq)->size_mask])
 #define ofi_cirque_discard(cq)		((cq)->rcnt++)
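The new `tindex`/`next` split distinguishes the most recently written slot from the next free one. All of these macros depend on power-of-two sizing: monotonically increasing read/write counters are masked into the buffer, and the used count stays correct even across counter wraparound. A minimal sketch:

```
#include <assert.h>
#include <stdint.h>

#define CQ_SIZE 8			/* must be a power of two */
#define CQ_MASK (CQ_SIZE - 1)

struct cirque {
	uint32_t rcnt, wcnt;		/* free-running counters */
	int buf[CQ_SIZE];
};

static void cq_insert(struct cirque *cq, int x)
{
	assert(cq->wcnt - cq->rcnt < CQ_SIZE);	/* not full */
	cq->buf[cq->wcnt++ & CQ_MASK] = x;	/* mask maps into buffer */
}

static int cq_remove(struct cirque *cq)
{
	assert(cq->wcnt != cq->rcnt);		/* not empty */
	return cq->buf[cq->rcnt++ & CQ_MASK];
}
```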
diff --git a/deps/libfabric/include/ofi_recvwin.h b/deps/libfabric/include/ofi_recvwin.h
index 468c657f8b4b62b318ead2be6629eeb5002ae7d5..2d1073a8addcb8d84dfbba8db5d8823e3493e3ba 100644
--- a/deps/libfabric/include/ofi_recvwin.h
+++ b/deps/libfabric/include/ofi_recvwin.h
@@ -80,6 +80,12 @@ ofi_recvwin_id_valid(struct name *recvq, id_type id)			\
 }									\
 									\
 static inline int							\
+ofi_recvwin_id_processed(struct name *recvq, id_type id)		\
+{									\
+	return ofi_recvwin_id_processed_ ## id_type (recvq, id);	\
+}									\
+									\
+static inline int							\
 ofi_recvwin_queue_msg(struct name *recvq, entrytype * msg, id_type id)	\
 {									\
 	size_t write_idx;						\
@@ -139,4 +145,9 @@ ofi_recvwin_slide(struct name *recvq)					\
 #define ofi_recvwin_id_valid_uint64_t(rq, id) \
 	ofi_val64_inrange(rq->exp_msg_id, rq->win_size, id)
 
+#define ofi_recvwin_id_processed_uint32_t(rq, id) \
+	ofi_val32_gt(rq->exp_msg_id, id)
+#define ofi_recvwin_id_processed_uint64_t(rq, id) \
+	ofi_val64_gt(rq->exp_msg_id, id)
+
 #endif /* FI_RECVWIN_H */
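`ofi_recvwin_id_processed` lets a receiver tell duplicates apart from merely out-of-window messages by asking whether the expected id has already moved past the incoming one; the `ofi_val*_gt` comparisons are wraparound-safe serial-number tests. A self-checking sketch of the 32-bit case, assuming the usual signed-difference formulation:

```
#include <assert.h>
#include <stdint.h>

/* Serial-number greater-than: the signed difference tells which side is
 * ahead, even across 32-bit wraparound. */
static int id_gt(uint32_t a, uint32_t b)
{
	return (int32_t) (a - b) > 0;
}

/* A message id was already consumed if the window's next expected id has
 * moved past it. */
static int id_processed(uint32_t exp_msg_id, uint32_t id)
{
	return id_gt(exp_msg_id, id);
}

int main(void)
{
	assert(id_processed(10, 5));
	assert(!id_processed(10, 10));
	assert(id_processed(5, UINT32_MAX));	/* holds across wraparound */
	return 0;
}
```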
diff --git a/deps/libfabric/include/ofi_shm.h b/deps/libfabric/include/ofi_shm.h
index db388488ae1b43baf8cb45c14b9eeb747ca8fffc..838f255ac1d1a933c9160e15b53f0201f6fa2a05 100644
--- a/deps/libfabric/include/ofi_shm.h
+++ b/deps/libfabric/include/ofi_shm.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 Intel Corporation. All rights reserved.
+ * Copyright (c) 2016-2021 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -37,6 +37,7 @@
 
 #include <stdint.h>
 #include <stddef.h>
+#include <sys/un.h>
 
 #include <ofi_atom.h>
 #include <ofi_proto.h>
@@ -51,7 +52,7 @@ extern "C" {
 #endif
 
 
-#define SMR_VERSION	1
+#define SMR_VERSION	2
 
 #ifdef HAVE_ATOMICS
 #define SMR_FLAG_ATOMIC	(1 << 0)
@@ -65,6 +66,7 @@ extern "C" {
 #define SMR_FLAG_DEBUG	(0 << 1)
 #endif
 
+#define SMR_FLAG_IPC_SOCK (1 << 2)
 
 #define SMR_CMD_SIZE		128	/* align with 64-byte cache line */
 
@@ -80,7 +82,7 @@ enum {
 
 //reserves 0-255 for defined ops and room for new ops
 //256 and beyond reserved for ctrl ops
-#define SMR_OP_MAX (1 << 8) 
+#define SMR_OP_MAX (1 << 8)
 
 #define SMR_REMOTE_CQ_DATA	(1 << 0)
 #define SMR_RMA_REQ		(1 << 1)
@@ -95,7 +97,7 @@ enum {
 	SMR_CMA_CAP_OFF,
 };
 
-/* 
+/*
  * Unique smr_op_hdr for smr message protocol:
  * 	addr - local shm_id of peer sending msg (for shm lookup)
  * 	op - type of op (ex. ofi_op_msg, defined in ofi_proto.h)
@@ -121,7 +123,7 @@ struct smr_msg_hdr {
 			uint8_t	atomic_op;
 		};
 	};
-};
+} __attribute__ ((aligned(16)));
 
 #define SMR_MSG_DATA_LEN	(SMR_CMD_SIZE - sizeof(struct smr_msg_hdr))
 #define SMR_COMP_DATA_LEN	(SMR_MSG_DATA_LEN / 2)
@@ -130,7 +132,12 @@ struct smr_msg_hdr {
 struct smr_ipc_info {
 	uint64_t	iface;
 	union {
-		uint8_t	ipc_handle[IPC_HANDLE_SIZE];
+		uint8_t		ipc_handle[IPC_HANDLE_SIZE];
+		struct {
+			uint64_t	device;
+			uint64_t	offset;
+			uint64_t	fd_handle;
+		};
 	};
 };
 
@@ -178,7 +185,10 @@ struct smr_cmd {
 #define SMR_COMP_INJECT_SIZE	(SMR_INJECT_SIZE / 2)
 #define SMR_SAR_SIZE		16384
 
-#define SMR_NAME_MAX		256
+#define SMR_DIR "/dev/shm/"
+#define SMR_NAME_MAX	256
+#define SMR_PATH_MAX	(SMR_NAME_MAX + sizeof(SMR_DIR))
+#define SMR_SOCK_NAME_MAX sizeof(((struct sockaddr_un *)0)->sun_path)
 
 struct smr_addr {
 	char		name[SMR_NAME_MAX];
@@ -202,6 +212,13 @@ struct smr_ep_name {
 	struct dlist_entry entry;
 };
 
+static inline const char *smr_no_prefix(const char *addr)
+{
+	char *start;
+
+	return (start = strstr(addr, "://")) ? start + 3 : addr;
+}
+
 struct smr_peer {
 	struct smr_addr		peer;
 	fi_addr_t		fiaddr;
@@ -222,11 +239,14 @@ struct smr_region {
 	uint8_t		resv;
 	uint16_t	flags;
 	int		pid;
-	uint8_t		cma_cap;
+	uint8_t		cma_cap_peer;
+	uint8_t		cma_cap_self;
 	void		*base_addr;
 	fastlock_t	lock; /* lock for shm access
 				 Must hold smr->lock before tx/rx cq locks
 				 in order to progress or post recv */
+	ofi_atomic32_t	signal;
+
 	struct smr_map	*map;
 
 	size_t		total_size;
@@ -245,6 +265,7 @@ struct smr_region {
 	size_t		sar_pool_offset;
 	size_t		peer_data_offset;
 	size_t		name_offset;
+	size_t		sock_name_offset;
 };
 
 struct smr_resp {
@@ -278,8 +299,8 @@ struct smr_sar_msg {
 
 OFI_DECLARE_CIRQUE(struct smr_cmd, smr_cmd_queue);
 OFI_DECLARE_CIRQUE(struct smr_resp, smr_resp_queue);
-DECLARE_SMR_FREESTACK(struct smr_inject_buf, smr_inject_pool);
-DECLARE_SMR_FREESTACK(struct smr_sar_msg, smr_sar_pool);
+SMR_DECLARE_FREESTACK(struct smr_inject_buf, smr_inject_pool);
+SMR_DECLARE_FREESTACK(struct smr_sar_msg, smr_sar_pool);
 
 static inline struct smr_region *smr_peer_region(struct smr_region *smr, int i)
 {
@@ -299,17 +320,22 @@ static inline struct smr_inject_pool *smr_inject_pool(struct smr_region *smr)
 }
 static inline struct smr_peer_data *smr_peer_data(struct smr_region *smr)
 {
-	return (struct smr_peer_data *) ((char *) smr + smr->peer_data_offset); 
+	return (struct smr_peer_data *) ((char *) smr + smr->peer_data_offset);
 }
 static inline struct smr_sar_pool *smr_sar_pool(struct smr_region *smr)
 {
-	return (struct smr_sar_pool *) ((char *) smr + smr->sar_pool_offset); 
+	return (struct smr_sar_pool *) ((char *) smr + smr->sar_pool_offset);
 }
 static inline const char *smr_name(struct smr_region *smr)
 {
 	return (const char *) smr + smr->name_offset;
 }
 
+static inline char *smr_sock_name(struct smr_region *smr)
+{
+	return (char *) smr + smr->sock_name_offset;
+}
+
 static inline void smr_set_map(struct smr_region *smr, struct smr_map *map)
 {
 	smr->map = map;
@@ -324,7 +350,8 @@ struct smr_attr {
 size_t smr_calculate_size_offsets(size_t tx_count, size_t rx_count,
 				  size_t *cmd_offset, size_t *resp_offset,
 				  size_t *inject_offset, size_t *sar_offset,
-				  size_t *peer_offset, size_t *name_offset);
+				  size_t *peer_offset, size_t *name_offset,
+				  size_t *sock_offset);
 void	smr_cma_check(struct smr_region *region, struct smr_region *peer_region);
 void	smr_cleanup(void);
 int	smr_map_create(const struct fi_provider *prov, int peer_count,
@@ -345,6 +372,11 @@ int	smr_create(const struct fi_provider *prov, struct smr_map *map,
 		   const struct smr_attr *attr, struct smr_region *volatile *smr);
 void	smr_free(struct smr_region *smr);
 
+static inline void smr_signal(struct smr_region *smr)
+{
+	ofi_atomic_set32(&smr->signal, 1);
+}
+
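The header only adds the setter side; a hedged sketch of how the progress path might consume the wake-up flag with the matching atomics (smr_check_signal is hypothetical, not part of this patch):

	static inline int smr_check_signal(struct smr_region *smr)
	{
		int sig = ofi_atomic_get32(&smr->signal);

		if (sig)
			ofi_atomic_set32(&smr->signal, 0);	/* consume */
		return sig;
	}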
 #ifdef __cplusplus
 }
 #endif
diff --git a/deps/libfabric/include/ofi_util.h b/deps/libfabric/include/ofi_util.h
index 3929b0bc968ff5c4ccc666efe67a216b1977e709..11cc208163d6b18d7276512ba5a704733363ab0a 100644
--- a/deps/libfabric/include/ofi_util.h
+++ b/deps/libfabric/include/ofi_util.h
@@ -74,11 +74,13 @@ extern "C" {
 #endif
 
 /* EQ / CQ flags
- * ERROR: The added entry was the result of an error completion
- * OVERFLOW: The CQ has overflowed, and events have been lost
+ * ERROR: EQ entry was the result of a failed operation,
+ *        or the caller is trying to read the next entry
+ *        if it is an error.
+ * AUX: CQ entries are stored in the auxiliary queue
  */
 #define UTIL_FLAG_ERROR		(1ULL << 60)
-#define UTIL_FLAG_OVERFLOW	(1ULL << 61)
+#define UTIL_FLAG_AUX		(1ULL << 61)
 
 /* Indicates that an EP has been bound to a counter */
 #define OFI_CNTR_ENABLED	(1ULL << 61)
@@ -99,41 +101,48 @@ extern "C" {
 #define OFI_EQ_STRERROR(prov, level, subsys, eq, entry) \
 	OFI_Q_STRERROR(prov, level, subsys, eq, "eq", entry, fi_eq_strerror)
 
-#define FI_INFO_FIELD(provider, prov_attr, user_attr, prov_str, user_str, type)	\
-	do {										\
-		FI_INFO(provider, FI_LOG_CORE, prov_str ": %s\n",			\
-				fi_tostr(&prov_attr, type));				\
-		FI_INFO(provider, FI_LOG_CORE, user_str ": %s\n",			\
-				fi_tostr(&user_attr, type));				\
+#define OFI_INFO_FIELD(provider, prov_attr, user_attr, prov_str, user_str, type) \
+	do {									\
+		FI_INFO(provider, FI_LOG_CORE, prov_str ": %s\n",		\
+				fi_tostr(&prov_attr, type));			\
+		FI_INFO(provider, FI_LOG_CORE, user_str ": %s\n",		\
+				fi_tostr(&user_attr, type));			\
 	} while (0)
 
-#define FI_INFO_STRING(provider, prov_attr, user_attr, prov_str, user_str)	\
+#define OFI_INFO_STR(provider, prov_attr, user_attr, prov_str, user_str)	\
 	do {									\
 		FI_INFO(provider, FI_LOG_CORE, prov_str ": %s\n", prov_attr);	\
 		FI_INFO(provider, FI_LOG_CORE, user_str ": %s\n", user_attr);	\
 	} while (0)
 
-#define FI_INFO_CHECK(provider, prov, user, field, type)		\
-	FI_INFO_FIELD(provider, prov->field, user->field, "Supported",	\
+#define OFI_INFO_CHECK(provider, prov, user, field, type)		\
+	OFI_INFO_FIELD(provider, prov->field, user->field, "Supported",	\
 		      "Requested", type)
 
-#define FI_INFO_CHECK_VAL(provider, prov, user, field)					\
-	do {										\
-		FI_INFO(provider, FI_LOG_CORE, "Supported: %zd\n", prov->field);	\
-		FI_INFO(provider, FI_LOG_CORE, "Requested: %zd\n", user->field);	\
+#define OFI_INFO_CHECK_SIZE(provider, prov, user, field)			\
+	do {									\
+		FI_INFO(provider, FI_LOG_CORE, "Supported: %zd\n", prov->field);\
+		FI_INFO(provider, FI_LOG_CORE, "Requested: %zd\n", user->field);\
 	} while (0)
 
-#define FI_INFO_MODE(provider, prov_mode, user_mode)				\
-	FI_INFO_FIELD(provider, prov_mode, user_mode, "Expected", "Given",	\
+#define OFI_INFO_CHECK_U64(provider, prov, user, field)			\
+	do {								\
+		FI_INFO(provider, FI_LOG_CORE,				\
+			"Supported: %" PRIu64 "\n", prov->field);	\
+		FI_INFO(provider, FI_LOG_CORE,				\
+			"Requested: %" PRIu64 "\n", user->field);	\
+	} while (0)
+
+#define OFI_INFO_MODE(provider, prov_mode, user_mode)				\
+	OFI_INFO_FIELD(provider, prov_mode, user_mode, "Expected", "Given",	\
 		      FI_TYPE_MODE)
 
-#define FI_INFO_MR_MODE(provider, prov_mode, user_mode)			\
-	FI_INFO_FIELD(provider, prov_mode, user_mode, "Expected", "Given",	\
+#define OFI_INFO_MR_MODE(provider, prov_mode, user_mode)			\
+	OFI_INFO_FIELD(provider, prov_mode, user_mode, "Expected", "Given",	\
 		      FI_TYPE_MR_MODE)
 
-#define FI_INFO_NAME(provider, prov, user)				\
-	FI_INFO_STRING(provider, prov->name, user->name, "Supported",	\
-		       "Requested")
+#define OFI_INFO_NAME(provider, prov, user)				\
+	OFI_INFO_STR(provider, prov->name, user->name, "Supported", "Requested")
 
 #define ofi_after_eq(a,b)	((long)((a) - (b)) >= 0)
 #define ofi_before(a,b)		((long)((a) - (b)) < 0)
@@ -151,9 +160,14 @@ struct ofi_common_locks {
 /*
  * Provider details
  */
+typedef int (*ofi_alter_info_t)(uint32_t version, const struct fi_info *src_info,
+				const struct fi_info *base_info,
+				struct fi_info *dest_info);
+
 struct util_prov {
 	const struct fi_provider	*prov;
 	const struct fi_info		*info;
+	ofi_alter_info_t		alter_defaults;
 	const int			flags;
 };
 
@@ -209,7 +223,7 @@ struct util_domain {
 
 int ofi_domain_init(struct fid_fabric *fabric_fid, const struct fi_info *info,
 		     struct util_domain *domain, void *context);
-int ofi_domain_bind_eq(struct util_domain *domain, struct util_eq *eq);
+int ofi_domain_bind(struct fid *fid, struct fid *bfid, uint64_t flags);
 int ofi_domain_close(struct util_domain *domain);
 
 static const uint64_t ofi_rx_mr_flags[] = {
@@ -328,6 +342,12 @@ static inline void ofi_ep_lock_release(struct util_ep *ep)
 	ep->lock_release(&ep->lock);
 }
 
+static inline bool ofi_ep_lock_held(struct util_ep *ep)
+{
+	return (ep->lock_acquire == ofi_fastlock_acquire_noop) ||
+		fastlock_held(&ep->lock);
+}
+
 static inline void ofi_ep_tx_cntr_inc(struct util_ep *ep)
 {
 	ep->tx_cntr_inc(ep->tx_cntr);
@@ -479,8 +499,8 @@ int ofi_wait_yield_open(struct fid_fabric *fabric, struct fi_wait_attr *attr,
 
 typedef void (*fi_cq_read_func)(void **dst, void *src);
 
-struct util_cq_oflow_err_entry {
-	struct fi_cq_tagged_entry	*parent_comp;
+struct util_cq_aux_entry {
+	struct fi_cq_tagged_entry	*cq_slot;
 	struct fi_cq_err_entry		comp;
 	fi_addr_t			src;
 	struct slist_entry		list_entry;
@@ -504,7 +524,7 @@ struct util_cq {
 	struct util_comp_cirq	*cirq;
 	fi_addr_t		*src;
 
-	struct slist		oflow_err_list;
+	struct slist		aux_queue;
 	fi_cq_read_func		read_entry;
 	int			internal_wait;
 	ofi_atomic32_t		signaled;
@@ -530,8 +550,9 @@ ssize_t ofi_cq_sreadfrom(struct fid_cq *cq_fid, void *buf, size_t count,
 		fi_addr_t *src_addr, const void *cond, int timeout);
 int ofi_cq_signal(struct fid_cq *cq_fid);
 
-int ofi_cq_write_overflow(struct util_cq *cq, void *context, uint64_t flags, size_t len,
-			  void *buf, uint64_t data, uint64_t tag, fi_addr_t src);
+int ofi_cq_write_overflow(struct util_cq *cq, void *context, uint64_t flags,
+			  size_t len, void *buf, uint64_t data, uint64_t tag,
+			  fi_addr_t src);
 
 static inline void util_cq_signal(struct util_cq *cq)
 {
@@ -540,10 +561,10 @@ static inline void util_cq_signal(struct util_cq *cq)
 }
 
 static inline void
-ofi_cq_write_comp_entry(struct util_cq *cq, void *context, uint64_t flags,
-			size_t len, void *buf, uint64_t data, uint64_t tag)
+ofi_cq_write_entry(struct util_cq *cq, void *context, uint64_t flags,
+		   size_t len, void *buf, uint64_t data, uint64_t tag)
 {
-	struct fi_cq_tagged_entry *comp = ofi_cirque_tail(cq->cirq);
+	struct fi_cq_tagged_entry *comp = ofi_cirque_next(cq->cirq);
 	comp->op_context = context;
 	comp->flags = flags;
 	comp->len = len;
@@ -553,18 +574,13 @@ ofi_cq_write_comp_entry(struct util_cq *cq, void *context, uint64_t flags,
 	ofi_cirque_commit(cq->cirq);
 }
 
-static inline int
-ofi_cq_write_thread_unsafe(struct util_cq *cq, void *context, uint64_t flags,
-			   size_t len, void *buf, uint64_t data, uint64_t tag)
+static inline void
+ofi_cq_write_src_entry(struct util_cq *cq, void *context, uint64_t flags,
+		       size_t len, void *buf, uint64_t data, uint64_t tag,
+		       fi_addr_t src)
 {
-	if (OFI_UNLIKELY(ofi_cirque_isfull(cq->cirq))) {
-		FI_DBG(cq->domain->prov, FI_LOG_CQ,
-		       "util_cq cirq is full!\n");
-		return ofi_cq_write_overflow(cq, context, flags, len,
-					     buf, data, tag, 0);
-	}
-	ofi_cq_write_comp_entry(cq, context, flags, len, buf, data, tag);
-	return 0;
+	cq->src[ofi_cirque_windex(cq->cirq)] = src;
+	ofi_cq_write_entry(cq, context, flags, len, buf, data, tag);
 }
 
 static inline int
@@ -572,39 +588,40 @@ ofi_cq_write(struct util_cq *cq, void *context, uint64_t flags, size_t len,
 	     void *buf, uint64_t data, uint64_t tag)
 {
 	int ret;
+
 	cq->cq_fastlock_acquire(&cq->cq_lock);
-	ret = ofi_cq_write_thread_unsafe(cq, context, flags, len, buf, data, tag);
+	if (ofi_cirque_freecnt(cq->cirq) > 1) {
+		ofi_cq_write_entry(cq, context, flags, len, buf, data, tag);
+		ret = 0;
+	} else {
+		ret = ofi_cq_write_overflow(cq, context, flags, len,
+					    buf, data, tag, FI_ADDR_NOTAVAIL);
+	}
 	cq->cq_fastlock_release(&cq->cq_lock);
 	return ret;
 }
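With the rewrite above, a full circular queue no longer drops the completion: once fewer than two slots remain free, the entry is diverted to the auxiliary queue via ofi_cq_write_overflow(). A hedged caller sketch (report_recv is hypothetical):

	static int report_recv(struct util_cq *cq, void *ctx, void *buf,
			       size_t len, uint64_t tag)
	{
		return ofi_cq_write(cq, ctx, FI_RECV | FI_TAGGED, len, buf,
				    0 /* data */, tag);
	}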
 
-static inline int
-ofi_cq_write_src_thread_unsafe(struct util_cq *cq, void *context, uint64_t flags, size_t len,
-			       void *buf, uint64_t data, uint64_t tag, fi_addr_t src)
-{
-	if (OFI_UNLIKELY(ofi_cirque_isfull(cq->cirq))) {
-		FI_DBG(cq->domain->prov, FI_LOG_CQ,
-		       "util_cq cirq is full!\n");
-		return ofi_cq_write_overflow(cq, context, flags, len,
-					     buf, data, tag, src);
-	}
-	cq->src[ofi_cirque_windex(cq->cirq)] = src;
-	ofi_cq_write_comp_entry(cq, context, flags, len, buf, data, tag);
-	return 0;
-}
-
 static inline int
 ofi_cq_write_src(struct util_cq *cq, void *context, uint64_t flags, size_t len,
 		 void *buf, uint64_t data, uint64_t tag, fi_addr_t src)
 {
 	int ret;
+
 	cq->cq_fastlock_acquire(&cq->cq_lock);
-	ret = ofi_cq_write_src_thread_unsafe(cq, context, flags, len,
-					     buf, data, tag, src);
+	if (ofi_cirque_freecnt(cq->cirq) > 1) {
+		ofi_cq_write_src_entry(cq, context, flags, len, buf, data,
+				       tag, src);
+		ret = 0;
+	} else {
+		ret = ofi_cq_write_overflow(cq, context, flags, len,
+					    buf, data, tag, src);
+	}
 	cq->cq_fastlock_release(&cq->cq_lock);
 	return ret;
 }
 
+int ofi_cq_insert_error(struct util_cq *cq,
+			const struct fi_cq_err_entry *err_entry);
 int ofi_cq_write_error(struct util_cq *cq,
 		       const struct fi_cq_err_entry *err_entry);
 int ofi_cq_write_error_peek(struct util_cq *cq, uint64_t tag, void *context);
@@ -690,7 +707,7 @@ struct util_av_entry {
 	 * field in 'data' and addr length should be a multiple
 	 * of 8 bytes to ensure alignment of additional fields
 	 */
-	char		data[0];
+	char		data[];
 };
 
 struct util_av {
@@ -707,7 +724,6 @@ struct util_av {
 	struct util_coll_mc	*coll_mc;
 	void			*context;
 	uint64_t		flags;
-	size_t			count;
 	size_t			addrlen;
 	/*
 	 * context_offset is addrlen + offset (required for alignment),
@@ -740,6 +756,7 @@ int ofi_av_init_lightweight(struct util_domain *domain, const struct fi_av_attr
 int ofi_av_close(struct util_av *av);
 int ofi_av_close_lightweight(struct util_av *av);
 
+size_t ofi_av_size(struct util_av *av);
 int ofi_av_insert_addr(struct util_av *av, const void *addr, fi_addr_t *fi_addr);
 int ofi_av_remove_addr(struct util_av *av, fi_addr_t fi_addr);
 fi_addr_t ofi_av_lookup_fi_addr_unsafe(struct util_av *av, const void *addr);
@@ -756,6 +773,8 @@ int ofi_ip_av_create_flags(struct fid_domain *domain_fid, struct fi_av_attr *att
 
 void *ofi_av_get_addr(struct util_av *av, fi_addr_t fi_addr);
 #define ofi_ip_av_get_addr ofi_av_get_addr
+void *ofi_av_addr_context(struct util_av *av, fi_addr_t fi_addr);
+
 fi_addr_t ofi_ip_av_get_fi_addr(struct util_av *av, const void *addr);
 
 int ofi_get_addr(uint32_t *addr_format, uint64_t flags,
@@ -767,9 +786,10 @@ int ofi_get_src_addr(uint32_t addr_format,
 void ofi_getnodename(uint16_t sa_family, char *buf, int buflen);
 int ofi_av_get_index(struct util_av *av, const void *addr);
 
-int ofi_verify_av_insert(struct util_av *av, uint64_t flags);
+int ofi_verify_av_insert(struct util_av *av, uint64_t flags, void *context);
 int ofi_ip_av_insertv(struct util_av *av, const void *addr, size_t addrlen,
-		      size_t count, fi_addr_t *fi_addr, void *context);
+		      size_t count, fi_addr_t *fi_addr, uint64_t flags,
+		      void *context);
 /* Caller should free *addr */
 int ofi_ip_av_sym_getaddr(struct util_av *av, const char *node,
 			  size_t nodecnt, const char *service,
@@ -823,7 +843,7 @@ struct util_event {
 	ssize_t			size;
 	int			event;
 	int			err;
-	uint8_t			data[0]; /* offset should be 8-byte aligned */
+	uint8_t			data[]; /* offset should be 8-byte aligned */
 };
 
 int ofi_eq_create(struct fid_fabric *fabric, struct fi_eq_attr *attr,
@@ -949,10 +969,6 @@ static inline int ofi_has_util_prefix(const char *str)
 	return !strncasecmp(str, OFI_UTIL_PREFIX, strlen(OFI_UTIL_PREFIX));
 }
 
-typedef int (*ofi_alter_info_t)(uint32_t version, const struct fi_info *src_info,
-				const struct fi_info *base_info,
-				struct fi_info *dest_info);
-
 int ofi_get_core_info(uint32_t version, const char *node, const char *service,
 		      uint64_t flags, const struct util_prov *util_prov,
 		      const struct fi_info *util_hints,
@@ -1035,6 +1051,25 @@ struct ofi_ops_flow_ctrl {
 };
 
 
+/* Dynamic receive buffering support. */
+#define OFI_OPS_DYNAMIC_RBUF "ofix_dynamic_rbuf_v2"
+
+struct ofi_cq_rbuf_entry {
+	void			*op_context;
+	uint64_t		flags;
+	size_t			len;
+	void			*buf;
+	uint64_t		data;
+	uint64_t		tag;
+	void			*ep_context;
+};
+
+struct ofi_ops_dynamic_rbuf {
+	size_t	size;
+	ssize_t	(*get_rbuf)(struct ofi_cq_rbuf_entry *entry, struct iovec *iov,
+			    size_t *count);
+};
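A hedged sketch of a get_rbuf callback for the extension above; apart from the declarations in this header, the buffer pool and sizing policy are hypothetical:

	static ssize_t my_get_rbuf(struct ofi_cq_rbuf_entry *entry,
				   struct iovec *iov, size_t *count)
	{
		static char pool[16384];	/* placeholder pool */

		if (entry->len > sizeof(pool))
			return -FI_ENORX;	/* no buffer large enough */

		iov[0].iov_base = pool;
		iov[0].iov_len = entry->len;
		*count = 1;
		return 0;
	}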
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/deps/libfabric/include/osx/osd.h b/deps/libfabric/include/osx/osd.h
index c0f54d133f6965d77be94bb2293422bff245b0f7..6d671b9f3611e21c38607e34add9947e47d0dec4 100644
--- a/deps/libfabric/include/osx/osd.h
+++ b/deps/libfabric/include/osx/osd.h
@@ -47,6 +47,8 @@
 
 #include <ifaddrs.h>
 
+#include <limits.h>
+
 #include "unix/osd.h"
 #include "rdma/fi_errno.h"
 #include "config.h"
@@ -115,6 +117,51 @@ static inline size_t ofi_process_vm_writev(pid_t pid,
 	return -FI_ENOSYS;
 }
 
+static inline ssize_t
+ofi_recv_socket(SOCKET fd, void *buf, size_t count, int flags)
+{
+	size_t len = count > INT_MAX ? INT_MAX : count;
+	return recv(fd, buf, len, flags);
+}
+
+static inline ssize_t
+ofi_send_socket(SOCKET fd, const void *buf, size_t count, int flags)
+{
+	size_t len = count > INT_MAX ? INT_MAX : count;
+	return send(fd, buf, len, flags);
+}
+
+static inline ssize_t ofi_read_socket(SOCKET fd, void *buf, size_t count)
+{
+	return ofi_recv_socket(fd, buf, count, 0);
+}
+
+static inline ssize_t ofi_write_socket(SOCKET fd, const void *buf, size_t count)
+{
+	return ofi_send_socket(fd, buf, count, 0);
+}
+
+static inline ssize_t
+ofi_recvfrom_socket(SOCKET fd, void *buf, size_t count, int flags,
+		    struct sockaddr *from, socklen_t *fromlen)
+{
+	size_t len = count > INT_MAX ? INT_MAX : count;
+	return recvfrom(fd, buf, len, flags, from, fromlen);
+}
+
+static inline ssize_t
+ofi_sendto_socket(SOCKET fd, const void *buf, size_t count, int flags,
+		  const struct sockaddr *to, socklen_t tolen)
+{
+	size_t len = count > INT_MAX ? INT_MAX : count;
+	return sendto(fd, buf, len, flags, to, tolen);
+}
+
+ssize_t ofi_writev_socket(SOCKET fd, const struct iovec *iovec, size_t iov_cnt);
+ssize_t ofi_readv_socket(SOCKET fd, const struct iovec *iovec, size_t iov_cnt);
+ssize_t ofi_sendmsg_tcp(SOCKET fd, const struct msghdr *msg, int flags);
+ssize_t ofi_recvmsg_tcp(SOCKET fd, struct msghdr *msg, int flags);
+
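Because of the INT_MAX clamp, one call moves at most INT_MAX bytes, so larger transfers must loop. A hedged sketch, assuming fd, buf and len are in scope:

	size_t off = 0;
	ssize_t sent;

	while (off < len) {
		sent = ofi_write_socket(fd, (const char *) buf + off,
					len - off);
		if (sent < 0)
			break;		/* caller consults ofi_sockerr() */
		off += (size_t) sent;
	}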
 #ifdef __cplusplus
 }
 #endif
diff --git a/deps/libfabric/include/rdma/fabric.h b/deps/libfabric/include/rdma/fabric.h
index bf1ccf0c4180fe4d19a267b9505f6ac7ec025c4f..62764806536d9042d70873bd6ac4560b3550c5c3 100644
--- a/deps/libfabric/include/rdma/fabric.h
+++ b/deps/libfabric/include/rdma/fabric.h
@@ -79,8 +79,8 @@ extern "C" {
 #endif
 
 #define FI_MAJOR_VERSION 1
-#define FI_MINOR_VERSION 11
-#define FI_REVISION_VERSION 1
+#define FI_MINOR_VERSION 14
+#define FI_REVISION_VERSION 0
 
 enum {
 	FI_PATH_MAX		= 256,
@@ -166,6 +166,8 @@ typedef struct fid *fid_t;
 #define FI_COMMIT_COMPLETE	(1ULL << 30)
 #define FI_MATCH_COMPLETE	(1ULL << 31)
 
+#define FI_HMEM_HOST_ALLOC	(1ULL << 45)
+#define FI_HMEM_DEVICE_ONLY	(1ULL << 46)
 #define FI_HMEM			(1ULL << 47)
 #define FI_VARIABLE_MSG		(1ULL << 48)
 #define FI_RMA_PMEM		(1ULL << 49)
@@ -208,6 +210,7 @@ enum {
 	FI_ADDR_PSMX2,		/* uint64_t[2] */
 	FI_ADDR_IB_UD,		/* uint64_t[4] */
 	FI_ADDR_EFA,
+	FI_ADDR_PSMX3,		/* uint64_t[2] */
 };
 
 #define FI_ADDR_UNSPEC		((uint64_t) -1)
@@ -237,6 +240,7 @@ enum fi_mr_mode {
 #define FI_MR_RMA_EVENT		(1 << 8)
 #define FI_MR_ENDPOINT		(1 << 9)
 #define FI_MR_HMEM		(1 << 10)
+#define FI_MR_COLLECTIVE	(1 << 11)
 
 enum fi_progress {
 	FI_PROGRESS_UNSPEC,
@@ -319,7 +323,8 @@ enum {
 	FI_PROTO_MRAIL,
 	FI_PROTO_RSTREAM,
 	FI_PROTO_RDMA_CM_IB_XRC,
-	FI_PROTO_EFA
+	FI_PROTO_EFA,
+	FI_PROTO_PSMX3
 };
 
 enum {
@@ -517,6 +522,8 @@ enum {
 	FI_CLASS_MC,
 	FI_CLASS_NIC,
 	FI_CLASS_AV_SET,
+	FI_CLASS_MR_CACHE,
+	FI_CLASS_MEM_MONITOR,
 };
 
 struct fi_eq_attr;
@@ -575,7 +582,10 @@ struct fid_fabric {
 	uint32_t		api_version;
 };
 
-int fi_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, void *context);
+int fi_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric,
+	      void *context);
+int fi_open(uint32_t version, const char *name, void *attr, size_t attr_len,
+	    uint64_t flags, struct fid **fid, void *context);
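A hedged sketch of the new fi_open() entry point. The object name string is an assumption; FI_CLASS_MR_CACHE above suggests importable cache/monitor objects, but the valid names are defined elsewhere:

	struct fid *fid;
	int ret;

	ret = fi_open(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION),
		      "mr_cache" /* assumed name */, NULL, 0, 0, &fid, NULL);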
 
 struct fid_nic {
 	struct fid		fid;
@@ -598,6 +608,11 @@ struct fi_alias {
 	uint64_t		flags;
 };
 
+struct fi_fid_var {
+	int		name;
+	void		*val;
+};
+
 struct fi_mr_raw_attr {
 	uint64_t	flags;
 	uint64_t	*base_addr;
@@ -632,6 +647,9 @@ enum {
 	FI_REFRESH,		/* mr: fi_mr_modify */
 	FI_DUP,			/* struct fid ** */
 	FI_GETWAITOBJ,		/*enum fi_wait_obj * */
+	FI_GET_VAL,		/* struct fi_fid_var */
+	FI_SET_VAL,		/* struct fi_fid_var */
+	FI_EXPORT_FID,		/* struct fi_fid_export */
 };
 
 static inline int fi_control(struct fid *fid, int command, void *arg)
@@ -647,6 +665,28 @@ static inline int fi_alias(struct fid *fid, struct fid **alias_fid, uint64_t fla
 	return fi_control(fid, FI_ALIAS, &alias);
 }
 
+/* fid value names */
+/*
+ * Currently no common name is defined. Provider specific names should
+ * have the FI_PROV_SPECIFIC bit set.
+ */
+
+static inline int fi_get_val(struct fid *fid, int name, void *val)
+{
+	struct fi_fid_var var;
+	var.name = name;
+	var.val = val;
+	return fi_control(fid, FI_GET_VAL, &var);
+}
+
+static inline int fi_set_val(struct fid *fid, int name, void *val)
+{
+	struct fi_fid_var var;
+	var.name = name;
+	var.val = val;
+	return fi_control(fid, FI_SET_VAL, &var);
+}
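A hedged round trip through the new accessors; MY_PROV_VAR is a hypothetical provider-defined name (real names carry the FI_PROV_SPECIFIC bit):

	uint64_t v = 1;
	int ret;

	ret = fi_set_val(fid, MY_PROV_VAR, &v);
	if (!ret)
		ret = fi_get_val(fid, MY_PROV_VAR, &v);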
+
 static inline int
 fi_open_ops(struct fid *fid, const char *name, uint64_t flags,
 	    void **ops, void *context)
@@ -689,9 +729,12 @@ enum fi_type {
 	FI_TYPE_FID,
 	FI_TYPE_COLLECTIVE_OP,
 	FI_TYPE_HMEM_IFACE,
+	FI_TYPE_CQ_FORMAT,
 };
 
 char *fi_tostr(const void *data, enum fi_type datatype);
+char *fi_tostr_r(char *buf, size_t len, const void *data,
+		 enum fi_type datatype);
 
 enum fi_param_type {
 	FI_PARAM_STRING,
diff --git a/deps/libfabric/include/rdma/fi_domain.h b/deps/libfabric/include/rdma/fi_domain.h
index 27d6dd398b28157383f6e0953f0f731cf29fa1ab..99f29c9fa32c7e34e9f577378fc97dabaafe603e 100644
--- a/deps/libfabric/include/rdma/fi_domain.h
+++ b/deps/libfabric/include/rdma/fi_domain.h
@@ -52,7 +52,15 @@ extern "C" {
 #define FI_SYMMETRIC		(1ULL << 59)
 #define FI_SYNC_ERR		(1ULL << 58)
 #define FI_UNIVERSE		(1ULL << 57)
-
+#define FI_BARRIER_SET		(1ULL << 40)
+#define FI_BROADCAST_SET	(1ULL << 41)
+#define FI_ALLTOALL_SET		(1ULL << 42)
+#define FI_ALLREDUCE_SET	(1ULL << 43)
+#define FI_ALLGATHER_SET	(1ULL << 44)
+#define FI_REDUCE_SCATTER_SET	(1ULL << 45)
+#define FI_REDUCE_SET		(1ULL << 46)
+#define FI_SCATTER_SET		(1ULL << 47)
+#define FI_GATHER_SET		(1ULL << 48)
 
 struct fi_av_attr {
 	enum fi_av_type		type;
@@ -185,6 +193,12 @@ enum fi_datatype {
 	FI_LONG_DOUBLE_COMPLEX,
 	/* End of point to point atomic datatypes */
 	FI_DATATYPE_LAST,
+	/*
+	 * enums for 128-bit integer atomics, existing ordering and
+	 * FI_DATATYPE_LAST preserved for compatibility.
+	 */
+	FI_INT128 = FI_DATATYPE_LAST,
+	FI_UINT128,
 
 	/* Collective datatypes */
 	FI_VOID = FI_COLLECTIVE_OFFSET,
diff --git a/deps/libfabric/include/rdma/fi_endpoint.h b/deps/libfabric/include/rdma/fi_endpoint.h
index 7f7a4c81437124874e28f6a995f553aa29a5bce0..2cc15839c81b39a7eb7ce5d760decc73520dd5ee 100644
--- a/deps/libfabric/include/rdma/fi_endpoint.h
+++ b/deps/libfabric/include/rdma/fi_endpoint.h
@@ -66,6 +66,18 @@ enum {
 	FI_OPT_RECV_BUF_SIZE,
 	FI_OPT_TX_SIZE,
 	FI_OPT_RX_SIZE,
+	FI_OPT_FI_HMEM_P2P,		/* int */
+};
+
+/*
+ * Parameters for FI_OPT_FI_HMEM_P2P to allow endpoint control over
+ * peer-to-peer support and FI_HMEM.
+ */
+enum {
+	FI_HMEM_P2P_ENABLED,	/* Provider decides when to use P2P, default. */
+	FI_HMEM_P2P_REQUIRED,	/* Must use P2P for all transfers */
+	FI_HMEM_P2P_PREFERRED,	/* Should use P2P for all transfers if available */
+	FI_HMEM_P2P_DISABLED	/* Do not use P2P */
 };
 
 struct fi_ops_ep {
diff --git a/deps/libfabric/include/rdma/fi_errno.h b/deps/libfabric/include/rdma/fi_errno.h
index 63a6acbfd127a41f7d384000e808341354bece3d..fee1046e81b8a0a71003f41dbe53199a3f99592f 100644
--- a/deps/libfabric/include/rdma/fi_errno.h
+++ b/deps/libfabric/include/rdma/fi_errno.h
@@ -193,6 +193,7 @@ enum {
 	FI_ENOKEY        = 266, /* Required key not available */
 	FI_ENOAV	 = 267, /* Missing or unavailable address vector */
 	FI_EOVERRUN	 = 268, /* Queue has been overrun */
+	FI_ENORX	 = 269, /* Receiver not ready, no receive buffers available */
 	FI_ERRNO_MAX
 };
 
diff --git a/deps/libfabric/include/rdma/fi_ext.h b/deps/libfabric/include/rdma/fi_ext.h
new file mode 100644
index 0000000000000000000000000000000000000000..8bf7e089b1a8b898b8ee0fa967e92e0f70180908
--- /dev/null
+++ b/deps/libfabric/include/rdma/fi_ext.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2021 Intel Corporation. All rights reserved.
+ * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef FI_EXT_H
+#define FI_EXT_H
+
+#include <stdbool.h>
+#include <rdma/fabric.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Each provider needs to define a unique 12-bit provider
+ * specific code to avoid overlapping with other providers,
+ * then left-shift the code by 16 bits. Note that the
+ * highest 4 bits are untouched and remain 0. The lowest
+ * 16 bits can be used to define provider specific values. E.g.,
+ * specific values. E.g.,
+ *
+ * define FI_PROV_SPECIFIC_XXX    (0xabc << 16)
+ *
+ * enum {
+ *        FI_PROV_XXX_FOO = -(FI_PROV_SPECIFIC_XXX),
+ *        FI_PROV_XXX_BAR,
+ * }
+ */
+
+#define FI_PROV_SPECIFIC_EFA   (0xefa << 16)
+
+/* negative options are provider specific */
+enum {
+       FI_OPT_EFA_RNR_RETRY = -FI_PROV_SPECIFIC_EFA,
+};
+
+struct fi_fid_export {
+	struct fid **fid;
+	uint64_t flags;
+	void *context;
+};
+
+static inline int
+fi_export_fid(struct fid *fid, uint64_t flags,
+	      struct fid **expfid, void *context)
+{
+	struct fi_fid_export exp;
+
+	exp.fid = expfid;
+	exp.flags = flags;
+	exp.context = context;
+	return fi_control(fid, FI_EXPORT_FID, &exp);
+}
+
+static inline int
+fi_import_fid(struct fid *fid, struct fid *expfid, uint64_t flags)
+{
+	return fid->ops->bind(fid, expfid, flags);
+}
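A hedged sketch of the export/import pairing; src_fid and dst_fid are assumed to be already-opened fids:

	struct fid *exp_fid;
	int ret;

	ret = fi_export_fid(src_fid, 0, &exp_fid, NULL);
	if (!ret)
		ret = fi_import_fid(dst_fid, exp_fid, 0);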
+
+
+/*
+ * System memory monitor import extension:
+ * To use, open mr_cache fid and import.
+ */
+
+struct fid_mem_monitor;
+
+struct fi_ops_mem_monitor {
+	size_t	size;
+	int	(*start)(struct fid_mem_monitor *monitor);
+	void	(*stop)(struct fid_mem_monitor *monitor);
+	int	(*subscribe)(struct fid_mem_monitor *monitor,
+			const void *addr, size_t len);
+	void	(*unsubscribe)(struct fid_mem_monitor *monitor,
+			const void *addr, size_t len);
+	bool	(*valid)(struct fid_mem_monitor *monitor,
+			const void *addr, size_t len);
+};
+
+struct fi_ops_mem_notify {
+	size_t	size;
+	void	(*notify)(struct fid_mem_monitor *monitor, const void *addr,
+			size_t len);
+};
+
+struct fid_mem_monitor {
+	struct fid fid;
+	struct fi_ops_mem_monitor *export_ops;
+	struct fi_ops_mem_notify *import_ops;
+};
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* FI_EXT_H */
diff --git a/deps/libfabric/include/rdma/providers/fi_log.h b/deps/libfabric/include/rdma/providers/fi_log.h
index acf6f246e559394cc9955d0cdd29ed8a50c6c754..fca69104104225c37e7903745a18a0fa3e43139b 100644
--- a/deps/libfabric/include/rdma/providers/fi_log.h
+++ b/deps/libfabric/include/rdma/providers/fi_log.h
@@ -68,6 +68,8 @@ enum fi_log_level {
 
 int fi_log_enabled(const struct fi_provider *prov, enum fi_log_level level,
 		   enum fi_log_subsys subsys);
+int fi_log_ready(const struct fi_provider *prov, enum fi_log_level level,
+		 enum fi_log_subsys subsys, uint64_t *showtime);
 void fi_log(const struct fi_provider *prov, enum fi_log_level level,
 	    enum fi_log_subsys subsys, const char *func, int line,
 	    const char *fmt, ...) __attribute__ ((__format__ (__printf__, 6, 7)));
@@ -82,8 +84,21 @@ void fi_log(const struct fi_provider *prov, enum fi_log_level level,
 		}							\
 	} while (0)
 
+#define FI_LOG_SPARSE(prov, level, subsystem, ...)			\
+	do {								\
+		static uint64_t showtime;				\
+		if (fi_log_ready(prov, level, subsystem, &showtime)) {	\
+			int saved_errno = errno;			\
+			fi_log(prov, level, subsystem,			\
+				__func__, __LINE__, __VA_ARGS__);	\
+			errno = saved_errno;				\
+		}							\
+	} while (0)
+
 #define FI_WARN(prov, subsystem, ...)					\
 	FI_LOG(prov, FI_LOG_WARN, subsystem, __VA_ARGS__)
+#define FI_WARN_SPARSE(prov, subsystem, ...)				\
+	FI_LOG_SPARSE(prov, FI_LOG_WARN, subsystem, __VA_ARGS__)
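Usage matches FI_WARN; the static showtime stamp inside FI_LOG_SPARSE rate-limits repeats so a hot path cannot flood the log. A hedged example, with my_prov assumed to be the provider's struct fi_provider:

	FI_WARN_SPARSE(&my_prov, FI_LOG_CQ,
		       "cq overrun, diverting entry\n");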
 
 #define FI_TRACE(prov, subsystem, ...)					\
 	FI_LOG(prov, FI_LOG_TRACE, subsystem, __VA_ARGS__)
diff --git a/deps/libfabric/include/rdma/providers/fi_prov.h b/deps/libfabric/include/rdma/providers/fi_prov.h
index 104bf78af371e29a1cb92e2e1e6d080868054b11..ab8858d8f9d3279f21e46cf2e7e21235a17c914b 100644
--- a/deps/libfabric/include/rdma/providers/fi_prov.h
+++ b/deps/libfabric/include/rdma/providers/fi_prov.h
@@ -76,14 +76,6 @@ struct fi_provider {
 
 /*
  * Defines a configuration parameter for use with libfabric.
- *
- * This registers the configuration variable "foo" in the specified
- * provider.
- *
- * The help string cannot be NULL or empty.
- *
- * The param_name and help_string parameters will be copied internally;
- * they can be freed upon return from fi_param_define().
  */
 int fi_param_define(const struct fi_provider *provider, const char *param_name,
 		    enum fi_param_type type, const char *help_string_fmt, ...);
@@ -92,22 +84,8 @@ int fi_param_define(const struct fi_provider *provider, const char *param_name,
  * Get the value of a configuration variable.
  *
  * Currently, configuration parameter will only be read from the
- * environment.  The environment variable names will be of the form
- * upper_case(FI_<provider_name>_<param_name>).
- *
- * Someday this call could be expanded to also check config files.
- *
- * If the parameter was previously defined and the user set a value,
- * FI_SUCCESS is returned and (*value) points to the retrieved
- * value.
- *
- * If the parameter name was previously defined, but the user did
- * not set a value, -FI_ENODATA is returned and the value of (*value)
- * is unchanged.
- *
- * If the variable name was not previously defined via
- * fi_param_define(), -FI_ENOENT will be returned and the value of
- * (*value) is unchanged.
+ * environment. Someday this call could be expanded to also check
+ * config files.
  */
 int fi_param_get(struct fi_provider *provider, const char *param_name,
 		 void *value);
diff --git a/deps/libfabric/include/unix/osd.h b/deps/libfabric/include/unix/osd.h
index f8b02b87614b4f5a92837875e9abc14e869adc0a..7078b19e170274489ffdd9d48fc83dbc3d2e90c8 100644
--- a/deps/libfabric/include/unix/osd.h
+++ b/deps/libfabric/include/unix/osd.h
@@ -73,6 +73,10 @@
 	(((err) == EAGAIN)	||		\
 	 ((err) == EWOULDBLOCK))
 
+#define OFI_SOCK_TRY_ACCEPT_AGAIN(err)		\
+	(((err) == EAGAIN)	||		\
+	 ((err) == EWOULDBLOCK))
+
 #define OFI_SOCK_TRY_CONN_AGAIN(err)	\
 	((err) == EINPROGRESS)
 
@@ -119,68 +123,12 @@ static inline SOCKET ofi_socket(int domain, int type, int protocol)
 	return socket(domain, type, protocol);
 }
 
-static inline ssize_t ofi_read_socket(SOCKET fd, void *buf, size_t count)
-{
-	return read(fd, buf, count);
-}
-
-static inline ssize_t ofi_write_socket(SOCKET fd, const void *buf, size_t count)
-{
-	return write(fd, buf, count);
-}
-
-static inline ssize_t ofi_recv_socket(SOCKET fd, void *buf, size_t count,
-				      int flags)
-{
-	return recv(fd, buf, count, flags);
-}
-
-static inline ssize_t ofi_recvfrom_socket(SOCKET fd, void *buf, size_t count, int flags,
-					  struct sockaddr *from, socklen_t *fromlen)
-{
-	return recvfrom(fd, buf, count, flags, from, fromlen);
-}
-
-static inline ssize_t ofi_send_socket(SOCKET fd, const void *buf, size_t count,
-				      int flags)
-{
-	return send(fd, buf, count, flags);
-}
-
-static inline ssize_t ofi_sendto_socket(SOCKET fd, const void *buf, size_t count, int flags,
-					const struct sockaddr *to, socklen_t tolen)
-{
-	return sendto(fd, buf, count, flags, to, tolen);
-}
-
-static inline ssize_t ofi_writev_socket(SOCKET fd, struct iovec *iov, size_t iov_cnt)
-{
-	return writev(fd, iov, iov_cnt);
-}
-
-static inline ssize_t ofi_readv_socket(SOCKET fd, struct iovec *iov, int iov_cnt)
-{
-	return readv(fd, iov, iov_cnt);
-}
-
-static inline ssize_t
-ofi_sendmsg_tcp(SOCKET fd, const struct msghdr *msg, int flags)
-{
-	return sendmsg(fd, msg, flags);
-}
-
 static inline ssize_t
 ofi_sendmsg_udp(SOCKET fd, const struct msghdr *msg, int flags)
 {
 	return sendmsg(fd, msg, flags);
 }
 
-static inline ssize_t
-ofi_recvmsg_tcp(SOCKET fd, struct msghdr *msg, int flags)
-{
-	return recvmsg(fd, msg, flags);
-}
-
 static inline ssize_t
 ofi_recvmsg_udp(SOCKET fd, struct msghdr *msg, int flags)
 {
@@ -199,6 +147,8 @@ static inline int ofi_close_socket(SOCKET socket)
 
 int fi_fd_nonblock(int fd);
 
+int fi_fd_block(int fd);
+
 static inline int ofi_sockerr(void)
 {
 	return errno;
diff --git a/deps/libfabric/include/windows/config.h b/deps/libfabric/include/windows/config.h
index 912858778ef2f4b64f3964964676cfb85f41b11a..7b53b8f7d38536c329198e1f98156e4fbd964484 100644
--- a/deps/libfabric/include/windows/config.h
+++ b/deps/libfabric/include/windows/config.h
@@ -165,7 +165,7 @@
 #define PACKAGE_TARNAME PACKAGE
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "1.11.1"
+#define PACKAGE_VERSION "1.14.0"
 
 /* Define to the full name and version of this package. */
 #define PACKAGE_STRING PACKAGE_NAME " " PACKAGE_VERSION
@@ -195,7 +195,7 @@
 /* Version number of package */
 #define _FI_EXP(s) #s
 #define _FI_TO_STRING(s) _FI_EXP(s)
-#define VERSION _FI_TO_STRING(FI_MAJOR_VERSION) "." _FI_TO_STRING(FI_MINOR_VERSION) ".1a1"
+#define VERSION _FI_TO_STRING(FI_MAJOR_VERSION) "." _FI_TO_STRING(FI_MINOR_VERSION) "." _FI_TO_STRING(FI_REVISION_VERSION)
 
 #ifndef BUILD_ID
 #define BUILD_ID ""
diff --git a/deps/libfabric/include/windows/osd.h b/deps/libfabric/include/windows/osd.h
index d3cabcebb659df82185af455138ddb9a921dae3e..2cb9bbcdb09b2bb38701dc01f416ee494681b4b0 100644
--- a/deps/libfabric/include/windows/osd.h
+++ b/deps/libfabric/include/windows/osd.h
@@ -226,6 +226,10 @@ extern "C" {
 	 ((err) == EWOULDBLOCK)		||	\
 	 ((err) == EAGAIN))
 
+#define OFI_SOCK_TRY_ACCEPT_AGAIN(err)		\
+	(((err) == EAGAIN)		||	\
+	 ((err) == EWOULDBLOCK))
+
 #define OFI_SOCK_TRY_CONN_AGAIN(err)		\
 	(((err) == EWOULDBLOCK)		||	\
 	 ((err) == EINPROGRESS))
@@ -792,6 +796,12 @@ static inline int fi_fd_nonblock(SOCKET fd)
 	return ioctlsocket(fd, FIONBIO, &argp) ? -WSAGetLastError() : 0;
 }
 
+static inline int fi_fd_block(SOCKET fd)
+{
+	u_long argp = 0;
+	return ioctlsocket(fd, FIONBIO, &argp) ? -WSAGetLastError() : 0;
+}
+
 /* Note: Use static variable `errno` for libc routines
  * (such as fopen, lseek and etc)
  * If you need to define which function/variable is needed
diff --git a/deps/libfabric/include/windows/sched.h b/deps/libfabric/include/windows/sched.h
new file mode 100644
index 0000000000000000000000000000000000000000..b9360e499e63b55b1906bc95cb25a2f0abe02dcb
--- /dev/null
+++ b/deps/libfabric/include/windows/sched.h
@@ -0,0 +1,13 @@
+
+#pragma once
+
+
+#include "osd.h"
+#include <processthreadsapi.h>
+
+
+static inline int sched_yield(void)
+{
+	(void) SwitchToThread();
+	return 0;
+}
diff --git a/deps/libfabric/info.vcxproj b/deps/libfabric/info.vcxproj
index 33b5ead27af0aaa42dbc9f1ee2157e1260385f2b..ec60d27d1eb76b9ab0316736701db8b07f586da7 100644
--- a/deps/libfabric/info.vcxproj
+++ b/deps/libfabric/info.vcxproj
@@ -13,6 +13,10 @@
       <Configuration>Debug-v140</Configuration>
       <Platform>x64</Platform>
     </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug-v142|x64">
+      <Configuration>Debug-v142</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
     <ProjectConfiguration Include="Release-ICC|x64">
       <Configuration>Release-ICC</Configuration>
       <Platform>x64</Platform>
@@ -25,6 +29,10 @@
       <Configuration>Release-v140</Configuration>
       <Platform>x64</Platform>
     </ProjectConfiguration>
+    <ProjectConfiguration Include="Release-v142|x64">
+      <Configuration>Release-v142</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
   </ItemGroup>
   <PropertyGroup Label="Globals">
     <ProjectGuid>{90850937-D15C-491D-B294-66DCA165254D}</ProjectGuid>
@@ -45,6 +53,12 @@
     <PlatformToolset>v141</PlatformToolset>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v142</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
@@ -65,6 +79,13 @@
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v142</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
@@ -83,6 +104,9 @@
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
@@ -92,6 +116,9 @@
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
@@ -106,6 +133,11 @@
     <IntDir>$(Platform)\$(Configuration)\info\</IntDir>
     <TargetName>fi_$(ProjectName)</TargetName>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <IntDir>$(Platform)\$(Configuration)\info\</IntDir>
+    <TargetName>fi_$(ProjectName)</TargetName>
+  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">
     <LinkIncremental>true</LinkIncremental>
     <IntDir>$(Platform)\$(Configuration)\info\</IntDir>
@@ -121,6 +153,11 @@
     <IntDir>$(Platform)\$(Configuration)\info\</IntDir>
     <TargetName>fi_$(ProjectName)</TargetName>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <IntDir>$(Platform)\$(Configuration)\info\</IntDir>
+    <TargetName>fi_$(ProjectName)</TargetName>
+  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">
     <LinkIncremental>false</LinkIncremental>
     <IntDir>$(Platform)\$(Configuration)\info\</IntDir>
@@ -156,6 +193,21 @@
       <GenerateDebugInformation>true</GenerateDebugInformation>
     </Link>
   </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(SolutionDir)util\windows\getopt;$(SolutionDir)include;$(SolutionDir)include\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">
     <ClCompile>
       <PrecompiledHeader>
@@ -209,6 +261,25 @@
       <GenerateDebugInformation>true</GenerateDebugInformation>
     </Link>
   </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
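A hedged sketch of the intended use: asserting the locking contract in debug builds (with a no-op lock, e.g. under FI_THREAD_DOMAIN, the check always passes). progress_locked is hypothetical and <assert.h> is assumed:

	static void progress_locked(struct util_ep *ep)
	{
		assert(ofi_ep_lock_held(ep));	/* caller holds ep->lock */
		/* ... safe to touch CQ/counter state here ... */
	}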
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(SolutionDir)util\windows\getopt;$(SolutionDir)include;$(SolutionDir)include\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">
     <ClCompile>
       <WarningLevel>Level3</WarningLevel>
@@ -232,9 +303,11 @@
     <ClCompile Include="util\info.c">
       <C99Support Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">true</C99Support>
       <C99Support Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">true</C99Support>
+      <C99Support Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">true</C99Support>
       <C99Support Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">true</C99Support>
       <C99Support Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">true</C99Support>
       <C99Support Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">true</C99Support>
+      <C99Support Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">true</C99Support>
       <C99Support Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">true</C99Support>
     </ClCompile>
     <ClCompile Include="util\windows\getopt\getopt.cpp" />
@@ -251,4 +324,4 @@
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/deps/libfabric/libfabric.map.in b/deps/libfabric/libfabric.map.in
index d3815c8d22037679266dbb1e1fb3bbfbc314d04b..342216c57bf3ddb7a92b2c5af7318bbffb52d5f6 100644
--- a/deps/libfabric/libfabric.map.in
+++ b/deps/libfabric/libfabric.map.in
@@ -38,3 +38,18 @@ FABRIC_1.3 {
 		fi_freeinfo;
 		fi_dupinfo;
 } FABRIC_1.2;
+
+FABRIC_1.4 {
+	global:
+		fi_tostr_r;
+} FABRIC_1.3;
+
+FABRIC_1.5 {
+	global:
+		fi_open;
+} FABRIC_1.4;
+
+FABRIC_1.6 {
+	global:
+		fi_log_ready;
+} FABRIC_1.5;
\ No newline at end of file
diff --git a/deps/libfabric/libfabric.sln b/deps/libfabric/libfabric.sln
index 6258724c334c5dfab558844022a5ba7fb9edaa4e..39734ce17d60ac1324652c8502743971b1c2aceb 100644
--- a/deps/libfabric/libfabric.sln
+++ b/deps/libfabric/libfabric.sln
@@ -1,7 +1,7 @@
 
 Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio 14
-VisualStudioVersion = 14.0.25420.1
+# Visual Studio Version 16
+VisualStudioVersion = 16.0.29709.97
 MinimumVisualStudioVersion = 10.0.40219.1
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libfabric", "libfabric.vcxproj", "{6B3A874F-B14C-4F16-B7C3-31E94859AE3E}"
 EndProject
@@ -25,9 +25,11 @@ Global
 		Debug-ICC|x64 = Debug-ICC|x64
 		Debug-v140|x64 = Debug-v140|x64
 		Debug-v141|x64 = Debug-v141|x64
+		Debug-v142|x64 = Debug-v142|x64
 		Release-ICC|x64 = Release-ICC|x64
 		Release-v140|x64 = Release-v140|x64
 		Release-v141|x64 = Release-v141|x64
+		Release-v142|x64 = Release-v142|x64
 	EndGlobalSection
 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
 		{6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Debug-ICC|x64.ActiveCfg = Debug-ICC|x64
@@ -36,50 +38,69 @@ Global
 		{6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Debug-v140|x64.Build.0 = Debug-v140|x64
 		{6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Debug-v141|x64.ActiveCfg = Debug-v141|x64
 		{6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Debug-v141|x64.Build.0 = Debug-v141|x64
+		{6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Debug-v142|x64.ActiveCfg = Debug-v142|x64
+		{6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Debug-v142|x64.Build.0 = Debug-v142|x64
 		{6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Release-ICC|x64.ActiveCfg = Release-ICC|x64
 		{6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Release-ICC|x64.Build.0 = Release-ICC|x64
 		{6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Release-v140|x64.ActiveCfg = Release-v140|x64
 		{6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Release-v140|x64.Build.0 = Release-v140|x64
 		{6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Release-v141|x64.ActiveCfg = Release-v141|x64
 		{6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Release-v141|x64.Build.0 = Release-v141|x64
+		{6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Release-v142|x64.ActiveCfg = Release-v142|x64
+		{6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Release-v142|x64.Build.0 = Release-v142|x64
 		{90850937-D15C-491D-B294-66DCA165254D}.Debug-ICC|x64.ActiveCfg = Debug-ICC|x64
 		{90850937-D15C-491D-B294-66DCA165254D}.Debug-ICC|x64.Build.0 = Debug-ICC|x64
 		{90850937-D15C-491D-B294-66DCA165254D}.Debug-v140|x64.ActiveCfg = Debug-v140|x64
 		{90850937-D15C-491D-B294-66DCA165254D}.Debug-v140|x64.Build.0 = Debug-v140|x64
 		{90850937-D15C-491D-B294-66DCA165254D}.Debug-v141|x64.ActiveCfg = Debug-v141|x64
 		{90850937-D15C-491D-B294-66DCA165254D}.Debug-v141|x64.Build.0 = Debug-v141|x64
+		{90850937-D15C-491D-B294-66DCA165254D}.Debug-v142|x64.ActiveCfg = Debug-v142|x64
+		{90850937-D15C-491D-B294-66DCA165254D}.Debug-v142|x64.Build.0 = Debug-v142|x64
 		{90850937-D15C-491D-B294-66DCA165254D}.Release-ICC|x64.ActiveCfg = Release-ICC|x64
 		{90850937-D15C-491D-B294-66DCA165254D}.Release-ICC|x64.Build.0 = Release-ICC|x64
 		{90850937-D15C-491D-B294-66DCA165254D}.Release-v140|x64.ActiveCfg = Release-v140|x64
 		{90850937-D15C-491D-B294-66DCA165254D}.Release-v140|x64.Build.0 = Release-v140|x64
 		{90850937-D15C-491D-B294-66DCA165254D}.Release-v141|x64.ActiveCfg = Release-v141|x64
 		{90850937-D15C-491D-B294-66DCA165254D}.Release-v141|x64.Build.0 = Release-v141|x64
+		{90850937-D15C-491D-B294-66DCA165254D}.Release-v142|x64.ActiveCfg = Release-v142|x64
+		{90850937-D15C-491D-B294-66DCA165254D}.Release-v142|x64.Build.0 = Release-v142|x64
 		{DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Debug-ICC|x64.ActiveCfg = Debug-ICC|x64
 		{DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Debug-ICC|x64.Build.0 = Debug-ICC|x64
 		{DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Debug-v140|x64.ActiveCfg = Debug-v140|x64
 		{DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Debug-v140|x64.Build.0 = Debug-v140|x64
 		{DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Debug-v141|x64.ActiveCfg = Debug-v141|x64
 		{DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Debug-v141|x64.Build.0 = Debug-v141|x64
+		{DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Debug-v142|x64.ActiveCfg = Debug-v142|x64
+		{DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Debug-v142|x64.Build.0 = Debug-v142|x64
 		{DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Release-ICC|x64.ActiveCfg = Release-ICC|x64
 		{DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Release-ICC|x64.Build.0 = Release-ICC|x64
 		{DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Release-v140|x64.ActiveCfg = Release-v140|x64
 		{DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Release-v140|x64.Build.0 = Release-v140|x64
 		{DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Release-v141|x64.ActiveCfg = Release-v141|x64
 		{DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Release-v141|x64.Build.0 = Release-v141|x64
+		{DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Release-v142|x64.ActiveCfg = Release-v142|x64
+		{DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Release-v142|x64.Build.0 = Release-v142|x64
 		{C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Debug-ICC|x64.ActiveCfg = Debug-ICC|x64
 		{C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Debug-ICC|x64.Build.0 = Debug-ICC|x64
 		{C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Debug-v140|x64.ActiveCfg = Debug-v140|x64
 		{C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Debug-v140|x64.Build.0 = Debug-v140|x64
 		{C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Debug-v141|x64.ActiveCfg = Debug-v141|x64
 		{C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Debug-v141|x64.Build.0 = Debug-v141|x64
+		{C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Debug-v142|x64.ActiveCfg = Debug-v142|x64
+		{C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Debug-v142|x64.Build.0 = Debug-v142|x64
 		{C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Release-ICC|x64.ActiveCfg = Release-ICC|x64
 		{C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Release-ICC|x64.Build.0 = Release-ICC|x64
 		{C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Release-v140|x64.ActiveCfg = Release-v140|x64
 		{C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Release-v140|x64.Build.0 = Release-v140|x64
 		{C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Release-v141|x64.ActiveCfg = Release-v141|x64
 		{C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Release-v141|x64.Build.0 = Release-v141|x64
+		{C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Release-v142|x64.ActiveCfg = Release-v142|x64
+		{C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Release-v142|x64.Build.0 = Release-v142|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
 	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {081E384D-462B-4FB7-AB58-B39108563DB3}
+	EndGlobalSection
 EndGlobal
diff --git a/deps/libfabric/libfabric.spec.in b/deps/libfabric/libfabric.spec.in
index f7680f651553d0b173065d04258bb30d641790e2..a1d5e1ff2b91d2b690b48aa2d6ba68449c4a9095 100644
--- a/deps/libfabric/libfabric.spec.in
+++ b/deps/libfabric/libfabric.spec.in
@@ -78,9 +78,12 @@ EOF
 %endif
 %endif
 
-%makeinstall installdirs
+%make_install installdirs
 # remove unpackaged files from the buildroot
 rm -f %{buildroot}%{_libdir}/*.la
+%if 0%{?_version_symbolic_link:1}
+%{__ln_s} %{version} %{buildroot}/%{_version_symbolic_link}
+%endif
 
 %clean
 rm -rf %{buildroot}
@@ -94,6 +97,9 @@ rm -rf %{buildroot}
 %{_bindir}/fi_info
 %{_bindir}/fi_strerror
 %{_bindir}/fi_pingpong
+%if 0%{?_version_symbolic_link:1}
+%{_version_symbolic_link}
+%endif
 %dir %{_libdir}/libfabric/
 %doc AUTHORS COPYING README
 
diff --git a/deps/libfabric/libfabric.vcxproj b/deps/libfabric/libfabric.vcxproj
index ce41ee23d252bf1564e89155b9fc0703fbdde038..d327f18f8fef524c9feb1ad62121b476747861c3 100644
--- a/deps/libfabric/libfabric.vcxproj
+++ b/deps/libfabric/libfabric.vcxproj
@@ -13,6 +13,10 @@
       <Configuration>Debug-v141</Configuration>
       <Platform>x64</Platform>
     </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug-v142|x64">
+      <Configuration>Debug-v142</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
     <ProjectConfiguration Include="Release-ICC|x64">
       <Configuration>Release-ICC</Configuration>
       <Platform>x64</Platform>
@@ -25,6 +29,10 @@
       <Configuration>Release-v141</Configuration>
       <Platform>x64</Platform>
     </ProjectConfiguration>
+    <ProjectConfiguration Include="Release-v142|x64">
+      <Configuration>Release-v142</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
   </ItemGroup>
   <PropertyGroup Label="Globals">
     <ProjectGuid>{6B3A874F-B14C-4F16-B7C3-31E94859AE3E}</ProjectGuid>
@@ -45,6 +53,12 @@
     <PlatformToolset>v141</PlatformToolset>
     <CharacterSet>MultiByte</CharacterSet>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v142</PlatformToolset>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'" Label="Configuration">
     <ConfigurationType>DynamicLibrary</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
@@ -65,6 +79,13 @@
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>MultiByte</CharacterSet>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v142</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'" Label="Configuration">
     <ConfigurationType>DynamicLibrary</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
@@ -81,6 +102,9 @@
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
@@ -90,6 +114,9 @@
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
@@ -102,6 +129,10 @@
     <LinkIncremental>true</LinkIncremental>
     <IntDir>$(Platform)\$(Configuration)\libfabric\</IntDir>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <IntDir>$(Platform)\$(Configuration)\libfabric\</IntDir>
+  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">
     <LinkIncremental>true</LinkIncremental>
     <IntDir>$(Platform)\$(Configuration)\libfabric\</IntDir>
@@ -114,25 +145,22 @@
     <LinkIncremental>false</LinkIncremental>
     <IntDir>$(Platform)\$(Configuration)\libfabric\</IntDir>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <IntDir>$(Platform)\$(Configuration)\libfabric\</IntDir>
+  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">
     <LinkIncremental>false</LinkIncremental>
     <IntDir>$(Platform)\$(Configuration)\libfabric\</IntDir>
   </PropertyGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">
+  <ItemDefinitionGroup>
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level4</WarningLevel>
-      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>WIN32;_WINSOCKAPI_=;_CRT_SECURE_NO_WARNINGS;_WINSOCK_DEPRECATED_NO_WARNINGS;_WINDOWS;_USRDLL;LIBFABRIC_EXPORTS;HAVE_CONFIG_H;ENABLE_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(ProjectDir)include;$(ProjectDir)include\windows;$(ProjectDir)prov\netdir\NetDirect;$(ProjectDir)prov\hook\src;$(ProjectDir)prov\hook\include;$(ProjectDir)prov\hook\perf\include;$(ProjectDir)prov\efa\src;$(ProjectDir)prov\efa\include;$(ProjectDir)prov\efa\src\rxr;$(ProjectDir)prov\efa\src\efa_verbs;$(ProjectDir)prov\efa\src\efa_verbs\plat</AdditionalIncludeDirectories>
       <SDLCheck>true</SDLCheck>
-      <AdditionalIncludeDirectories>$(ProjectDir)include;$(ProjectDir)include\windows;$(ProjectDir)prov\netdir\NetDirect;$(ProjectDir)prov\hook\src;$(ProjectDir)prov\hook\include;$(ProjectDir)prov\hook\perf\include</AdditionalIncludeDirectories>
       <CompileAs>CompileAsC</CompileAs>
-      <DisableSpecificWarnings>4127;4200;4204;4221;4115;4201;4100</DisableSpecificWarnings>
-      <C99Support>true</C99Support>
-      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
       <ExceptionHandling>false</ExceptionHandling>
-      <UseMSVC>false</UseMSVC>
-      <MultiProcessorCompilation>true</MultiProcessorCompilation>
     </ClCompile>
     <Link>
       <SubSystem>Windows</SubSystem>
@@ -141,6 +169,17 @@
       <ModuleDefinitionFile>libfabric.def</ModuleDefinitionFile>
     </Link>
   </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_WINSOCKAPI_=;_CRT_SECURE_NO_WARNINGS;_WINSOCK_DEPRECATED_NO_WARNINGS;_WINDOWS;_USRDLL;LIBFABRIC_EXPORTS;HAVE_CONFIG_H;ENABLE_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <DisableSpecificWarnings>4127;4200;4204;4221;4115;4201;4100</DisableSpecificWarnings>
+      <C99Support>true</C99Support>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+      <UseMSVC>false</UseMSVC>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+    </ClCompile>
+  </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
@@ -157,12 +196,17 @@
       <UseMSVC>false</UseMSVC>
       <MultiProcessorCompilation>true</MultiProcessorCompilation>
     </ClCompile>
-    <Link>
-      <SubSystem>Windows</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>Synchronization.lib;Ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;iphlpapi.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <ModuleDefinitionFile>libfabric.def</ModuleDefinitionFile>
-    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_WINSOCKAPI_=;_CRT_SECURE_NO_WARNINGS;_WINSOCK_DEPRECATED_NO_WARNINGS;_WINDOWS;_USRDLL;LIBFABRIC_EXPORTS;HAVE_CONFIG_H;ENABLE_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <DisableSpecificWarnings>4127;4200;4204;4221;4115;4201;4100</DisableSpecificWarnings>
+      <C99Support>true</C99Support>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+      <UseMSVC>false</UseMSVC>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+    </ClCompile>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">
     <ClCompile>
@@ -201,6 +245,7 @@
       <ShowIncludes>false</ShowIncludes>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
       <ExceptionHandling>false</ExceptionHandling>
+      <AdditionalOptions>/DNDEBUG %(AdditionalOptions)</AdditionalOptions>
     </ClCompile>
     <Link>
       <SubSystem>Windows</SubSystem>
@@ -225,15 +270,32 @@
       <C99Support>true</C99Support>
       <ShowIncludes>false</ShowIncludes>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
-      <ExceptionHandling>false</ExceptionHandling>
+      <AdditionalOptions>/DNDEBUG %(AdditionalOptions)</AdditionalOptions>
+    </ClCompile>
+    <Link>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">
+    <ClCompile>
+      <WarningLevel>Level4</WarningLevel>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;_WINSOCKAPI_=;_CRT_SECURE_NO_WARNINGS;_WINSOCK_DEPRECATED_NO_WARNINGS;_WINDOWS;_USRDLL;LIBFABRIC_EXPORTS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(ProjectDir)include;$(ProjectDir)include\windows;$(ProjectDir)prov\netdir\NetDirect;$(ProjectDir)prov\hook\src;$(ProjectDir)prov\hook\include;$(ProjectDir)prov\hook\perf\include;</AdditionalIncludeDirectories>
+      <DisableSpecificWarnings>4127;4200;4204;4221;4115;4201;4100</DisableSpecificWarnings>
+      <SDLCheck>true</SDLCheck>
+      <C99Support>true</C99Support>
+      <ShowIncludes>false</ShowIncludes>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+      <AdditionalOptions>/DNDEBUG %(AdditionalOptions)</AdditionalOptions>
     </ClCompile>
     <Link>
-      <SubSystem>Windows</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>Synchronization.lib;Ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;iphlpapi.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <ModuleDefinitionFile>libfabric.def</ModuleDefinitionFile>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">
@@ -295,12 +357,16 @@
       </ForcedIncludeFiles>
       <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">
       </ForcedIncludeFiles>
+      <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">
+      </ForcedIncludeFiles>
       <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">
       </ForcedIncludeFiles>
       <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">
       </ForcedIncludeFiles>
       <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">
       </ForcedIncludeFiles>
+      <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">
+      </ForcedIncludeFiles>
       <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">
       </ForcedIncludeFiles>
     </ClCompile>
@@ -318,12 +384,16 @@
       </ForcedIncludeFiles>
       <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">
       </ForcedIncludeFiles>
+      <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">
+      </ForcedIncludeFiles>
       <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">
       </ForcedIncludeFiles>
       <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">
       </ForcedIncludeFiles>
       <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">
       </ForcedIncludeFiles>
+      <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">
+      </ForcedIncludeFiles>
       <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">
       </ForcedIncludeFiles>
     </ClCompile>
@@ -341,189 +411,234 @@
     <ClCompile Include="prov\sockets\src\sock_attr.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_atomic.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_av.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_cntr.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_comm.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_conn.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_cq.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_ctx.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_dom.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_ep.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_ep_dgram.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_ep_msg.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_ep_rdm.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_eq.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_fabric.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_mr.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_msg.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_poll.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_progress.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_rma.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_rx_entry.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_trigger.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\sockets\src\sock_wait.c">
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
     <ClCompile Include="prov\tcp\src\tcpx_attr.c" />
-    <ClCompile Include="prov\tcp\src\tcpx_comm.c" />
     <ClCompile Include="prov\tcp\src\tcpx_conn_mgr.c" />
     <ClCompile Include="prov\tcp\src\tcpx_shared_ctx.c" />
     <ClCompile Include="prov\tcp\src\tcpx_cq.c" />
@@ -543,9 +658,11 @@
     <ClCompile Include="prov\udp\src\udpx_init.c">
       <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">ofi_osd.h</ForcedIncludeFiles>
       <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">ofi_osd.h</ForcedIncludeFiles>
+      <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">ofi_osd.h</ForcedIncludeFiles>
       <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">ofi_osd.h</ForcedIncludeFiles>
       <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">ofi_osd.h</ForcedIncludeFiles>
       <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">ofi_osd.h</ForcedIncludeFiles>
+      <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">ofi_osd.h</ForcedIncludeFiles>
       <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">ofi_osd.h</ForcedIncludeFiles>
     </ClCompile>
     <ClCompile Include="prov\util\src\util_attr.c" />
@@ -570,6 +687,7 @@
     <ClCompile Include="prov\util\src\util_mr_cache.c" />
     <ClCompile Include="prov\util\src\cuda_mem_monitor.c" />
     <ClCompile Include="prov\util\src\rocr_mem_monitor.c" />
+    <ClCompile Include="prov\util\src\ze_mem_monitor.c" />
     <ClCompile Include="src\common.c" />
     <ClCompile Include="src\enosys.c">
       <DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">4127;869</DisableSpecificWarnings>
@@ -628,6 +746,7 @@
     <ClInclude Include="include\rdma\fi_endpoint.h" />
     <ClInclude Include="include\rdma\fi_eq.h" />
     <ClInclude Include="include\rdma\fi_errno.h" />
+    <ClInclude Include="include\rdma\fi_ext.h" />
     <ClInclude Include="include\rdma\fi_rma.h" />
     <ClInclude Include="include\rdma\fi_tagged.h" />
     <ClInclude Include="include\rdma\fi_trigger.h" />
@@ -647,6 +766,7 @@
     <ClInclude Include="include\windows\osd.h" />
     <ClInclude Include="include\windows\poll.h" />
     <ClInclude Include="include\windows\pthread.h" />
+    <ClInclude Include="include\windows\sched.h" />
     <ClInclude Include="include\windows\sys\ipc.h" />
     <ClInclude Include="include\windows\sys\mman.h" />
     <ClInclude Include="include\windows\sys\param.h" />
diff --git a/deps/libfabric/libfabric.vcxproj.filters b/deps/libfabric/libfabric.vcxproj.filters
index 55f0e0440a08bb854ddb86f6d09f9150a2e68048..87029bbb57b5235b81596335c0793631d7a8b048 100644
--- a/deps/libfabric/libfabric.vcxproj.filters
+++ b/deps/libfabric/libfabric.vcxproj.filters
@@ -201,6 +201,9 @@
     <ClCompile Include="prov\util\src\rocr_mem_monitor.c">
       <Filter>Source Files\prov\util</Filter>
     </ClCompile>
+    <ClCompile Include="prov\util\src\ze_mem_monitor.c">
+      <Filter>Source Files\prov\util</Filter>
+    </ClCompile>
     <ClCompile Include="src\windows\osd.c">
       <Filter>Source Files\src\windows</Filter>
     </ClCompile>
@@ -429,9 +432,6 @@
     <ClCompile Include="prov\tcp\src\tcpx_attr.c">
       <Filter>Source Files\prov\tcp\src</Filter>
     </ClCompile>
-    <ClCompile Include="prov\tcp\src\tcpx_comm.c">
-      <Filter>Source Files\prov\tcp\src</Filter>
-    </ClCompile>
     <ClCompile Include="prov\tcp\src\tcpx_conn_mgr.c">
       <Filter>Source Files\prov\tcp\src</Filter>
     </ClCompile>
@@ -566,6 +566,9 @@
     <ClInclude Include="include\rdma\fi_errno.h">
       <Filter>Header Files\rdma</Filter>
     </ClInclude>
+    <ClInclude Include="include\rdma\fi_ext.h">
+      <Filter>Header Files\rdma</Filter>
+    </ClInclude>
     <ClInclude Include="include\rdma\fi_rma.h">
       <Filter>Header Files\rdma</Filter>
     </ClInclude>
@@ -620,6 +623,9 @@
     <ClInclude Include="include\windows\pthread.h">
       <Filter>Header Files\windows</Filter>
     </ClInclude>
+    <ClInclude Include="include\windows\sched.h">
+      <Filter>Header Files\windows</Filter>
+    </ClInclude>
     <ClInclude Include="include\windows\unistd.h">
       <Filter>Header Files\windows</Filter>
     </ClInclude>
diff --git a/deps/libfabric/man/fabric.7.md b/deps/libfabric/man/fabric.7.md
index 29f76f17ed8b746ac7078a85af709a7bc11af4d0..6fef65d954ecb3d1dca8e8740b2d40035d0a076e 100644
--- a/deps/libfabric/man/fabric.7.md
+++ b/deps/libfabric/man/fabric.7.md
@@ -281,6 +281,7 @@ application, with the -e or --env command line option.
 
 # NOTES
 
+## System Calls
 Because libfabric is designed to provide applications direct access to
 fabric hardware, there are limits on how libfabric resources may be used
 in conjunction with system calls.  These limitations are notable for
@@ -297,6 +298,25 @@ portability across providers.
   fabric domain may not be available in a child process because of copy
   on write restrictions.
 
+## CUDA deadlock
+In some cases, calls to `cudaMemcpy` within libfabric may result in a
+deadlock. This typically occurs when a CUDA kernel blocks until a
+`cudaMemcpy` on the host completes.  To avoid this deadlock,
+`cudaMemcpy` may be disabled by setting
+`FI_HMEM_CUDA_ENABLE_XFER=0`. If this environment variable is set and
+there is a call to `cudaMemcpy` within libfabric, a warning will be
+emitted and no copy will occur. Note that not all providers support
+this option.
+
+Another mechanism that can be used to avoid deadlock is NVIDIA's
+gdrcopy. Using gdrcopy requires an external library and kernel module
+available at https://github.com/NVIDIA/gdrcopy. Libfabric must be
+configured with gdrcopy support using the `--with-gdrcopy` option, and
+be run with `FI_HMEM_CUDA_USE_GDRCOPY=1`. This may be used in
+conjunction with the above option to provide a method for copying
+to/from CUDA device memory when `cudaMemcpy` cannot be used. Again,
+this may not be supported by all providers.
+
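+As an illustration only (this sketch is not part of the upstream
+manual), both variables can be set from application code, provided this
+happens before libfabric reads its environment during initialization,
+e.g. before the first call to fi_getinfo; the function name below is
+hypothetical:
+
+```c
+#include <stdlib.h>
+#include <rdma/fabric.h>
+
+/* Sketch: disable cudaMemcpy inside libfabric and request gdrcopy
+ * instead (assumes libfabric was configured --with-gdrcopy). */
+int open_fabric_without_cudamemcpy(struct fi_info **info)
+{
+    setenv("FI_HMEM_CUDA_ENABLE_XFER", "0", 1);
+    setenv("FI_HMEM_CUDA_USE_GDRCOPY", "1", 1);
+
+    /* The environment is sampled when libfabric initializes. */
+    return fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0, NULL, info);
+}
+```
+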
 # ABI CHANGES
 
 libfabric releases maintain compatibility with older releases, so that
@@ -350,8 +370,8 @@ expanded the following structure.
 
 ## ABI 1.3
 
-The 1.3 ABI is also the current ABI version.  All libfabric releases
-starting at 1.9 export this ABI.
+The 1.3 ABI version was exported by libfabric versions 1.9, 1.10, and
+1.11.  Added new fields to the following attributes:
 
 *fi_domain_attr*
 : Added tclass
@@ -359,6 +379,20 @@ starting at 1.9 export this ABI.
 *fi_tx_attr*
 : Added tclass
 
+## ABI 1.4
+
+The 1.4 ABI version was exported by libfabric 1.12.  Added fi_tostr_r, a
+thread-safe (re-entrant) version of fi_tostr.
+
+## ABI 1.5
+
+ABI version starting with libfabric 1.13.  Added new fi_open API
+call.
+
+## ABI 1.6
+
+ABI version starting with libfabric 1.14.  Added fi_log_ready for providers.
+
 # SEE ALSO
 
 [`fi_info`(1)](fi_info.1.html),
diff --git a/deps/libfabric/man/fi_atomic.3.md b/deps/libfabric/man/fi_atomic.3.md
index 455dbd120a09c4ca5592150273c43bfc60eb7175..b7d5d72ed50463a57d47c5fe2b3df5635ccf8810 100644
--- a/deps/libfabric/man/fi_atomic.3.md
+++ b/deps/libfabric/man/fi_atomic.3.md
@@ -190,6 +190,12 @@ provider implementation constraints.
 *FI_UINT64*
 : Unsigned 64-bit integer.
 
+*FI_INT128*
+: Signed 128-bit integer.
+
+*FI_UINT128*
+: Unsigned 128-bit integer.
+
 *FI_FLOAT*
 : A single-precision floating point value (IEEE 754).
 
@@ -382,7 +388,7 @@ and type of parameters that they accept as input.  Otherwise, they
 perform the same general function.
 
 The call fi_atomic transfers the data contained in the user-specified
-data buffer to a remote node.  For unconnected endpoints, the destination
+data buffer to a remote node.  For connectionless endpoints, the destination
 endpoint is specified through the dest_addr parameter.  Unless
 the endpoint has been configured differently, the data buffer passed
 into fi_atomic must not be touched by the application until the
@@ -405,7 +411,7 @@ discussion below for more details. The requested message size that
 can be used with fi_inject_atomic is limited by inject_size.
 
 The fi_atomicmsg call supports atomic functions over both connected
-and unconnected endpoints, with the ability to control the atomic
+and connectionless endpoints, with the ability to control the atomic
 operation per call through the use of flags.  The fi_atomicmsg
 function takes a struct fi_msg_atomic as input.
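+
+As an illustration only (this sketch is not in the upstream page), a
+single-element atomic sum over a connectionless endpoint could look as
+follows; the endpoint, destination address, and remote address/key are
+assumed to come from earlier setup and memory registration:
+
+```c
+#include <rdma/fi_atomic.h>
+
+/* Sketch: atomically add one uint64_t to a remote location.  All
+ * handles are assumed to be initialized elsewhere. */
+ssize_t atomic_add_u64(struct fid_ep *ep, const uint64_t *value,
+                       fi_addr_t dest_addr, uint64_t addr, uint64_t key)
+{
+    /* count is given in datatype units (1 element), not bytes. */
+    return fi_atomic(ep, value, 1, NULL, dest_addr, addr, key,
+                     FI_UINT64, FI_SUM, NULL);
+}
+```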
 
@@ -600,7 +606,7 @@ with atomic message calls.
   targeting the same peer endpoint have completed.  Operations posted
   after the fencing will see and/or replace the results of any
   operations initiated prior to the fenced operation.
-  
+
   The ordering of operations starting at the posting of the fenced
   operation (inclusive) to the posting of a subsequent fenced operation
   (exclusive) is controlled by the endpoint's ordering semantics.
diff --git a/deps/libfabric/man/fi_av_set.3.md b/deps/libfabric/man/fi_av_set.3.md
index 38f439d5fd45dfebc7384d742ae237e878be7389..c2c250557cd6792bf8e9e763c871cba558d6d4b8 100644
--- a/deps/libfabric/man/fi_av_set.3.md
+++ b/deps/libfabric/man/fi_av_set.3.md
@@ -159,8 +159,10 @@ struct fi_av_set_attr {
   communication key is fabric provider specific.
 
 *flags*
-: If the flag FI_UNIVERSE is set, then the AV set will be created
-  containing all addresses stored in the AV.
+: Flags may be used to configure the AV set, including restricting which
+  collective operations the AV set needs to support.  See the flags section
+  for a list of flags that may be specified when creating the AV
+  set.
 
 ## fi_av_set_union
 
@@ -204,11 +206,54 @@ Closes an AV set and releases all resources associated with it.  Any
 operations active at the time an AV set is closed will be aborted, with
 the result of the collective undefined.
 
+# FLAGS
+
+The following flags may be specified as part of AV set creation.
+
+*FI_UNIVERSE*
+: When set, the AV set will be created containing all addresses stored
+  in the corresponding AV.
+
+*FI_BARRIER_SET*
+: If set, the AV set will be configured to support barrier operations.
+
+*FI_BROADCAST_SET*
+: If set, the AV set will be configured to support broadcast operations.
+
+*FI_ALLTOALL_SET*
+: If set, the AV set will be configured to support all-to-all operations.
+
+*FI_ALLREDUCE_SET*
+: If set, the AV set will be configured to support allreduce operations.
+
+*FI_ALLGATHER_SET*
+: If set, the AV set will be configured to support allgather operations.
+
+*FI_REDUCE_SCATTER_SET*
+: If set, the AV set will be configured to support reduce-scatter operations.
+
+*FI_REDUCE_SET*
+: If set, the AV set will be configured to support reduce operations.
+
+*FI_SCATTER_SET*
+: If set, the AV set will be configured to support scatter operations.
+
+*FI_GATHER_SET*
+: If set, the AV set will be configured to support gather operations.
+
 # NOTES
 
 Developers who are familiar with MPI will find that AV sets are similar to
 MPI groups, and may act as a direct mapping in some, but not all, situations.
 
+By default an AV set will be created to support all collective operations
+supported by the underlying provider (see fi_query_collective).  Users may
+reduce resource requirements by specifying only those collective operations
+needed by the AV set through the use of creation flags: FI_BARRIER_SET,
+FI_BROADCAST_SET, etc.  If no such flags are specified, the AV set will be
+configured to support every collective that the underlying provider
+supports.  It is an error for a user to request an unsupported collective.
+
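+As an illustration only (not part of the upstream page), an AV set
+restricted to barrier and broadcast support over all AV members could be
+created as follows; av is assumed to be a previously opened address
+vector on a provider with collective support:
+
+```c
+#include <rdma/fi_collective.h>
+
+/* Sketch: AV set spanning the whole AV, restricted to the barrier
+ * and broadcast collectives to reduce provider resources. */
+int make_barrier_bcast_set(struct fid_av *av, struct fid_av_set **set)
+{
+    struct fi_av_set_attr attr = {
+        .flags = FI_UNIVERSE | FI_BARRIER_SET | FI_BROADCAST_SET,
+    };
+
+    return fi_av_set(av, &attr, set, NULL);
+}
+```
+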
 # RETURN VALUES
 
 Returns 0 on success. On error, a negative value corresponding to fabric
diff --git a/deps/libfabric/man/fi_control.3.md b/deps/libfabric/man/fi_control.3.md
index 61f41a5f9dbf87b72424fb9e8a38efbb29863f5b..648f3f326ff57a14ad2632fdeae6d7c931edde4e 100644
--- a/deps/libfabric/man/fi_control.3.md
+++ b/deps/libfabric/man/fi_control.3.md
@@ -15,6 +15,9 @@ fi_control \- Perform an operation on a fabric resource.
 #include <rdma/fabric.h>
 
 int fi_control(struct fid *fid, int command, void *arg);
+int fi_alias(struct fid *fid, struct fid **alias_fid, uint64_t flags);
+int fi_get_val(struct fid *fid, int name, void *val);
+int fi_set_val(struct fid *fid, int name, void *val);
 ```
 
 
@@ -38,6 +41,15 @@ resource being operated on, the specified command, and any provided
 arguments for the command.  For specific details, see the fabric resource
 specific help pages noted below.
 
+fi_alias, fi_get_val, and fi_set_val are wrappers for fi_control with
+commands FI_ALIAS, FI_GET_VAL, FI_SET_VAL, respectively. fi_alias creates
+an alias of the specified fabric resource. fi_get_val reads the value of
+the named parameter associated with the fabric resource, while fi_set_val
+updates that value. Available parameter names depend on the type of the
+fabric resource and the provider in use. Providers may define provider
+specific names in the provider extension header files ('rdma/fi_ext_*.h').
+Please refer to the provider man pages for details.
+
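+As an illustrative sketch, aliasing a completion queue and reading a
+provider defined value (cq is assumed open; FOO_OPT_EXAMPLE is a
+hypothetical name from a provider extension header):
+
+```c
+struct fid *cq_alias;
+/* Flags are object specific; 0 here for illustration. */
+int ret = fi_alias(&cq->fid, &cq_alias, 0);
+
+int val;
+ret = fi_get_val(&cq->fid, FOO_OPT_EXAMPLE, &val);
+```
+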
 # SEE ALSO
 
 [`fi_endpoint`(3)](fi_endpoint.3.html),
diff --git a/deps/libfabric/man/fi_cq.3.md b/deps/libfabric/man/fi_cq.3.md
index e16009572a7cf434692f3414cc844a7e3c26e9ad..f29f0ff9e8d278a5ecb846c81c9739ae1d4bbafd 100644
--- a/deps/libfabric/man/fi_cq.3.md
+++ b/deps/libfabric/man/fi_cq.3.md
@@ -463,7 +463,7 @@ of these fields are the same for all CQ entry structure formats.
   was provided with an asynchronous operation.  The op_context field is
   valid for all completions that are associated with an asynchronous
   operation.
-  
+
   For completion events that are not associated with a posted operation,
   this field will be set to NULL.  This includes completions generated
   at the target in response to RMA write operations that carry CQ data
@@ -624,7 +624,7 @@ operation.  The following completion flags are defined.
   have FI_BUFFERED_RECV mode enabled.  When set to one, it indicates that
   the buffer referenced by the completion is limited by the
   FI_OPT_BUFFERED_LIMIT threshold, and additional message data must be
-  retrieved by the application using an FI_CLAIM operation.  
+  retrieved by the application using an FI_CLAIM operation.
 
 *FI_CLAIM*
 : See the 'Buffered Receives' section in `fi_msg`(3) for more details.
@@ -762,7 +762,209 @@ The operational flags for the described completion levels are defined below.
   that was originally requested has been met.  It is the completion
   of the fenced operation that guarantees that the additional
   semantics have been met.
- 
+
+The above completion semantics are defined with respect to the initiator
+of the operation.  The different semantics are useful for describing
+when the initiator may re-use a data buffer, and for guaranteeing what state
+a transfer must reach prior to a completion being generated.  This
+allows applications to determine appropriate error handling in case
+of communication failures.
+
+# TARGET COMPLETION SEMANTICS
+
+The completion semantic at the target is used to determine when data
+at the target is visible to the peer application.  Visibility
+indicates that a memory read to the same address that was
+the target of a data transfer will return the results of the transfer.
+The target of a transfer can be identified by the initiator,
+as may be the case for RMA and atomic operations, or determined by
+the target, for example by providing a matching receive buffer.
+Global visibility indicates that the results are available regardless
+of where the memory read originates.  For example, the read could come
+from a process running on a host CPU, it may be accessed by subsequent
+data transfer over the fabric, or read from a peer device such as a GPU.
+
+In terms of completion semantics, visibility usually indicates that the
+transfer meets the FI_DELIVERY_COMPLETE requirements from the
+perspective of the target.  The target completion semantic may be, but
+is not necessarily, linked with the completion semantic specified by the
+initiator of the transfer.
+
+Often, target processes do not explicitly state a desired completion
+semantic and instead rely on the default semantic.  The
+default behavior is based on several factors, including:
+
+- whether a completion event is generated at the target
+- the type of transfer involved (e.g. msg vs RMA)
+- endpoint data and message ordering guarantees
+- properties of the targeted memory buffer
+- the initiator's specified completion semantic
+
+Broadly, target completion semantics are grouped
+based on whether or not the transfer generates a completion event
+at the target.  This includes writing a CQ entry or updating a completion
+counter.  In common use cases, transfers that use a message
+interface (FI_MSG or FI_TAGGED) typically generate target events, while
+transfers involving an RMA interface (FI_RMA or FI_ATOMIC) often do not.
+There are exceptions to both these cases, depending on endpoint to CQ
+and counter bindings and operational flags.  For example, RMA writes that
+carry remote CQ data will generate a completion event at the target,
+and are frequently used to convey visibility to the target application.
+The general guidelines for target side semantics are described below,
+followed by exceptions that modify that behavior.
+
+By default, completions generated at the target indicate that the
+transferred data is immediately available to be read from the target buffer.
+That is, the target sees FI_DELIVERY_COMPLETE (or better) semantics,
+even if the initiator requested lower semantics.
+For applications using only data buffers allocated from
+host memory, this is often sufficient.
+
+For operations that do not generate a completion event at the target,
+the visibility of the data at the target may need to be inferred
+based on subsequent operations that do generate target completions.
+Absent a target completion, when a completion of an
+operation is written at the initiator, the visibility semantic
+of the operation at the target aligns with the initiator completion
+semantic.  For instance, if an RMA operation completes at the initiator
+as either FI_INJECT_COMPLETE or FI_TRANSMIT_COMPLETE, the data visibility
+at the target is not guaranteed.
+
+One or more of the following mechanisms can be used by the target process to
+guarantee that the results of a data transfer that did not generate a
+completion at the target are now visible.  This list is not exhaustive,
+but covers common uses.  In the descriptions below, the first
+transfer does not result in a completion event at the target, but is
+eventually followed by a transfer which does.
+
+- If the endpoint guarantees message ordering between two transfers, the
+  target completion of a second transfer will indicate that the data from
+  the first transfer is available.  For example, if the endpoint supports
+  send after write ordering (FI_ORDER_SAW), then a receive completion
+  corresponding to the send will indicate that the write data is available.
+  This holds independent of the initiator's completion semantic for either
+  the write or send.  When ordering is guaranteed, the second transfer
+  can be queued with the provider immediately after queuing the first.
+
+- If the endpoint does not guarantee message ordering, the initiator must take
+  additional steps to ensure visibility.  If the initiator requests
+  FI_DELIVERY_COMPLETE semantics for the first operation, it can wait
+  for the operation to complete locally.  Once the completion has been
+  read, the target completion of a second transfer will indicate that the
+  first transfer's data is visible.
+
+- Alternatively, if message ordering is not guaranteed by the endpoint, the
+  initiator can use the FI_FENCE and FI_DELIVERY_COMPLETE flags on the second
+  data transfer to force the preceding transfers to meet the
+  FI_DELIVERY_COMPLETE semantics, as sketched after this list.  If the second
+  transfer generates a completion at the target, that will indicate that the
+  data is visible.  Otherwise, a target completion for any transfer after the
+  fenced operation will indicate that the data is visible.
+
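+A hedged sketch of the fenced approach (ep, buffers, descriptors, and peer
+addresses are assumed to already exist; error handling is elided):
+
+```c
+/* RMA write that generates no completion at the target. */
+fi_write(ep, buf, len, desc, dest_addr, remote_addr, key, NULL);
+
+/* Fenced notification: FI_FENCE | FI_DELIVERY_COMPLETE forces the
+ * write to delivery-complete before the send is initiated, so the
+ * send's target completion implies the write data is visible. */
+struct iovec iov = { .iov_base = note, .iov_len = note_len };
+struct fi_msg msg = {
+	.msg_iov = &iov, .desc = &note_desc, .iov_count = 1,
+	.addr = dest_addr, .context = NULL,
+};
+fi_sendmsg(ep, &msg, FI_FENCE | FI_DELIVERY_COMPLETE);
+```
+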
+The above semantics apply for transfers targeting traditional host memory
+buffers.  However, the behavior may differ when device memory and/or
+persistent memory is involved (FI_HMEM and FI_PMEM capability bits).  When
+heterogeneous memory is involved, the concept of memory domains comes into
+play.  Memory domains identify the physical separation of memory, which
+may or may not be accessible through the same virtual address space.  See
+the [`fi_mr`(3)](fi_mr.3.html) man page for further details on memory domains.
+
+Completion ordering and data visibility are only well-defined for transfers
+that target the same memory domain.  Applications need to be aware of
+ordering and visibility differences when transfers target different memory
+domains.  Additionally, applications need to be aware of the memory
+domain to which completions themselves are written and whether it differs
+from the memory domain targeted by a transfer.  In some situations,
+either the provider or application may need to call device specific APIs
+to synchronize or flush device memory caches in order to achieve the
+desired data visibility.
+
+When heterogeneous memory is in use, the default target completion semantic
+for transfers that generate a completion at the target is still
+FI_DELIVERY_COMPLETE; however, applications should be aware that there
+may be a negative impact on overall performance for providers to meet
+this requirement.
+
+For example, a target process may be using a GPU to accelerate computations.
+A memory region mapping to memory on the GPU may be exposed to peers as
+either an RMA target or posted locally as a receive buffer.  In this case,
+the application is concerned with two memory domains -- system and GPU
+memory.  Completions are written to system memory.
+
+Continuing the example, a peer process sends a tagged message.  That message
+is matched with the receive buffer located in GPU memory.  The NIC copies
+the data from the network into the receive buffer and writes an entry into
+the completion queue.  Note that both memory domains were accessed as part
+of this transfer.  The message data was directed to the GPU memory, but the
+completion went to host memory.  Because separate memory domains may not be
+synchronized with each other, it is possible for the host CPU to see and process
+the completion entry before the transfer to the GPU memory is visible to either
+the host CPU or even software running on the GPU.  From the perspective
+of the *provider*, visibility of the completion does not imply visibility of
+data written to the GPU's memory domain.
+
+The default completion semantic at the target *application* for message
+operations is FI_DELIVERY_COMPLETE.  An anticipated provider implementation
+in this situation is for the provider software running on the host CPU to
+intercept the CQ entry, detect that the data landed in heterogeneous memory,
+and perform the necessary device synchronization or flush operation
+before reporting the completion up to the application.  This ensures that
+the data is visible to CPU _and_ GPU software prior to the application
+processing the completion.
+
+In addition to the cost of provider software intercepting completions
+and checking if a transfer targeted heterogeneous memory, device
+synchronization itself may impact performance.  As a result, applications
+can request a lower completion semantic when posting receives.  That
+indicates to the provider that the application will be responsible for
+handling any device specific flush operations that might be needed.
+See [`fi_msg`(3)](fi_msg.3.html) FLAGS.
+
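+For illustration only (a sketch assuming gpu_buf is a registered device
+buffer and that ep and desc already exist), a receive posted with relaxed
+completion semantics:
+
+```c
+struct iovec iov = { .iov_base = gpu_buf, .iov_len = len };
+struct fi_msg msg = {
+	.msg_iov = &iov, .desc = &desc, .iov_count = 1,
+	.addr = FI_ADDR_UNSPEC, .context = NULL,
+};
+/* The completion may be written before the data is visible on the
+ * device; the application performs any needed device flush itself. */
+fi_recvmsg(ep, &msg, FI_TRANSMIT_COMPLETE);
+```
+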
+For data transfers that do not generate a completion at the target,
+such as RMA or atomics, it is the application's responsibility
+to ensure that all target buffers meet the necessary visibility
+requirements.  The previously mentioned bulleted
+methods for notifying the target that the data is visible may not
+be sufficient, as the provider software at the target could lack
+the context needed to ensure visibility.  This implies that the
+application may need to call device synchronization/flush APIs
+directly.
+
+For example, a peer application could perform several RMA writes
+that target GPU memory buffers.  If the provider offloads RMA
+operations into the NIC, the provider software at the target will
+be unaware that the RMA operations have occurred.  If the peer
+sends a message to the target application that indicates that the
+RMA operations are done, the application must ensure that the RMA data
+is visible to the host CPU or GPU prior to executing code that accesses
+the data.  The target completion of having received the sent message
+is not sufficient, even if send-after-write ordering is supported.
+
+Most target heterogeneous memory completion semantics map to
+FI_TRANSMIT_COMPLETE or FI_DELIVERY_COMPLETE.  Persistent memory
+(FI_PMEM capability), however, is often used with FI_COMMIT_COMPLETE
+semantics.  Heterogeneous completion concepts still apply.
+
+For transfers flagged by the initiator with FI_COMMIT_COMPLETE,
+a completion at the target indicates that the results are visible
+and durable.  For transfers targeting persistent memory, but using
+a different completion semantic at the initiator, the visibility
+at the target is similar to that described above.  Durability is
+only associated with transfers marked with FI_COMMIT_COMPLETE.
+
+For transfers targeting persistent memory that request
+FI_DELIVERY_COMPLETE, a completion, at either the initiator or
+target, indicates that the data is visible.  Visibility at the
+target can be conveyed using one of the mechanisms described above --
+generating a target completion, sending a message from the initiator,
+etc.  Similarly, if the initiator requested FI_TRANSMIT_COMPLETE,
+then additional steps are needed to ensure visibility at the target.
+For example, the transfer can generate a completion at the target,
+which would indicate visibility, but not durability.  The initiator
+can also follow the transfer with another operation that forces
+visibility, such as using FI_FENCE in conjunction with
+FI_DELIVERY_COMPLETE.
+
 # NOTES
 
 A completion queue must be bound to at least one enabled endpoint before any
diff --git a/deps/libfabric/man/fi_domain.3.md b/deps/libfabric/man/fi_domain.3.md
index aa77ec011f50836cc11ac81aca51ecf06edea7c7..79ad47899f08ce2df512fbec8f7ea8f35233845e 100644
--- a/deps/libfabric/man/fi_domain.3.md
+++ b/deps/libfabric/man/fi_domain.3.md
@@ -446,8 +446,10 @@ transfer operation.
   seen by the initiator of a request.  For FI_EP_DGRAM endpoints, if the target EP
   queues are unable to accept incoming messages, received messages will
   be dropped.  For reliable endpoints, if RM is disabled, the transmit
-  operation will complete in error.  If RM is enabled, the provider will
-  internally retry the operation.
+  operation will complete in error. A provider may choose to return an error
+  completion with the error code FI_ENORX for that transmit operation so that
+  it can be retried. If RM is enabled, the provider will internally retry the
+  operation.
 
 *Rx Buffer Overrun*
 : This refers to buffers posted to receive incoming tagged or untagged messages,
@@ -471,7 +473,7 @@ transfer operation.
 
 When a resource management error occurs on an endpoint, the endpoint is
 transitioned into a disabled state.  Any operations which have not
-already completed will fail and be discarded.  For unconnected endpoints,
+already completed will fail and be discarded.  For connectionless endpoints,
 the endpoint must be re-enabled before it will accept new data transfer
 operations.  For connected endpoints, the connection is torn down and
 must be re-established.
@@ -525,6 +527,10 @@ The following values may be specified.
 : Indicates that memory registration occurs on allocated data buffers, and
   physical pages must back all virtual addresses being registered.
 
+*FI_MR_COLLECTIVE*
+: Requires that data buffers passed to collective operations be explicitly
+  registered for collective operations using the FI_COLLECTIVE flag.
+
 *FI_MR_ENDPOINT*
 : Memory registration occurs at the endpoint level, rather than domain.
 
diff --git a/deps/libfabric/man/fi_efa.7.md b/deps/libfabric/man/fi_efa.7.md
index 5155c00f1b121c7de8d1c30f2af61a436911896c..186799aac10f26775675d20732892cccc7433051 100644
--- a/deps/libfabric/man/fi_efa.7.md
+++ b/deps/libfabric/man/fi_efa.7.md
@@ -53,8 +53,10 @@ The following features are supported:
   registrations on the DGRAM endpoint.
 
 *Memory registration modes*
-: The RDM endpoint does not require memory registration and the
-  *FI_EP_DGRAM* endpoint only supports *FI_MR_LOCAL*.
+: The RDM endpoint does not require memory registration for send and receive
+  operations, i.e. it does not require *FI_MR_LOCAL*. Applications may specify
+  *FI_MR_LOCAL* in the MR mode flags in order to use descriptors provided by the
+  application. The *FI_EP_DGRAM* endpoint only supports *FI_MR_LOCAL*.
 
 *Progress*
 : The RDM endpoint supports both *FI_PROGRESS_AUTO* and *FI_PROGRESS_MANUAL*,
@@ -69,17 +71,25 @@ The following features are supported:
 
 # LIMITATIONS
 
-The provider does not support *FI_ATOMIC* interfaces. For RMA operations,
+The DGRAM endpoint does not support *FI_ATOMIC* interfaces. For RMA operations,
 completion events for RMA targets (*FI_RMA_EVENT*) is not supported. The DGRAM
 endpoint does not fully protect against resource overruns, so resource
 management is disabled for this endpoint (*FI_RM_DISABLED*).
 
 No support for selective completions.
 
-No support for counters.
+No support for counters for the DGRAM endpoint.
 
 No support for inject.
 
+# PROVIDER SPECIFIC ENDPOINT LEVEL OPTION
+
+*FI_OPT_EFA_RNR_RETRY*
+: Defines the number of RNR retries. The application can use it to reset the
+  RNR retry counter via a call to fi_setopt. Note that this option must be set
+  before the endpoint is enabled; otherwise, the call will fail. Also note
+  that this option only applies to the RDM endpoint.
+
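+A minimal sketch, assuming ep is an RDM endpoint that has not yet been
+enabled (the option value type is assumed to be size_t here):
+
+```c
+size_t rnr_retry = 3;	/* assumed value type */
+int ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_EFA_RNR_RETRY,
+		    &rnr_retry, sizeof(rnr_retry));
+if (!ret)
+	ret = fi_enable(ep);
+```
+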
 # RUNTIME PARAMETERS
 
 *FI_EFA_TX_SIZE*
@@ -163,10 +173,14 @@ These OFI runtime parameters apply only to the RDM endpoint.
 *FI_EFA_SHM_MAX_MEDIUM_SIZE*
 : Defines the switch point between small/medium message and large message. The message
   larger than this switch point will be transferred with large message protocol.
+  NOTE: This parameter is now deprecated.
 
 *FI_EFA_INTER_MAX_MEDIUM_MESSAGE_SIZE*
 : The maximum size for inter EFA messages to be sent by using medium message protocol. Messages which can fit in one packet will be sent as eager message. Messages whose sizes are smaller than this value will be sent using medium message protocol. Other messages will be sent using CTS based long message protocol.
 
+*FI_EFA_FORK_SAFE*
+: Enable fork() support. This may have a small performance impact and should only be set when required. Applications that need to register regions backed by huge pages and also require fork support are not supported.
+
 # SEE ALSO
 
 [`fabric`(7)](fabric.7.html),
diff --git a/deps/libfabric/man/fi_endpoint.3.md b/deps/libfabric/man/fi_endpoint.3.md
index 61794d920ceddf3db3134f28963e285c5009c6f5..33ce49eb0d92ea1670e657018ad0004f2659f3a6 100644
--- a/deps/libfabric/man/fi_endpoint.3.md
+++ b/deps/libfabric/man/fi_endpoint.3.md
@@ -198,7 +198,7 @@ Additionally, endpoints that use manual progress must be associated
 with relevant completion queues or event queues in order to drive
 progress.  For endpoints that are only used as the target of RMA or
 atomic operations, this means binding the endpoint to a completion
-queue associated with receive processing.  Unconnected endpoints must
+queue associated with receive processing.  Connectionless endpoints must
 be bound to an address vector.
 
 Once an endpoint has been activated, it may be associated with an address
@@ -520,6 +520,27 @@ The following option levels and option names and parameters are defined.
   that applications that want to override the default MIN_MULTI_RECV
   value set this option before enabling the corresponding endpoint.
 
+- *FI_OPT_FI_HMEM_P2P - int*
+: Defines how the provider should handle peer to peer FI_HMEM transfers for
+  this endpoint. By default, the provider will chose whether to use peer to peer
+  support based on the type of transfer (FI_HMEM_P2P_ENABLED). Valid values
+  defined in fi_endpoint.h are:
+	* FI_HMEM_P2P_ENABLED: Peer to peer support may be used by the provider
+	  to handle FI_HMEM transfers, and which transfers are initiated using
+	  peer to peer is subject to the provider implementation.
+	* FI_HMEM_P2P_REQUIRED: Peer to peer support must be used for
+	  transfers; transfers that cannot be performed using p2p will be
+	  reported as failing.
+	* FI_HMEM_P2P_PREFERRED: Peer to peer support should be used by the
+	  provider for all transfers if available, but the provider may choose
+	  to copy the data to initiate the transfer if peer to peer support is
+	  unavailable.
+	* FI_HMEM_P2P_DISABLED: Peer to peer support should not be used.
+: fi_setopt() will return -FI_EOPNOTSUPP if the mode requested cannot be supported
+  by the provider.
+: The FI_HMEM_DISABLE_P2P environment variable discussed in
+  [`fi_mr`(3)](fi_mr.3.html) takes precedence over this setopt option.
+
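+A minimal sketch, assuming ep has not yet been enabled and
+fall_back_to_copies is a hypothetical application fallback:
+
+```c
+int p2p = FI_HMEM_P2P_REQUIRED;
+int ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_FI_HMEM_P2P,
+		    &p2p, sizeof(p2p));
+if (ret == -FI_EOPNOTSUPP)
+	/* The provider cannot guarantee peer to peer transfers. */
+	fall_back_to_copies();
+```
+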
 ## fi_tc_dscp_set
 
 This call converts a DSCP defined value into a libfabric traffic class value.
@@ -594,7 +615,7 @@ desired.  Supported types are:
   flow control that maintains message boundaries.
 
 *FI_EP_RDM*
-: Reliable datagram message.  Provides a reliable, unconnected data
+: Reliable datagram message.  Provides a reliable, connectionless data
   transfer service with flow control that maintains message
   boundaries.
 
@@ -658,6 +679,10 @@ protocol value set to one.
   performance scaled messaging version 2.  PSMX2 is an extended version of the
   PSM2 protocol to support the libfabric interfaces.
 
+*FI_PROTO_PSMX3*
+: The protocol is Intel's protocol known as PSM3, performance scaled
+  messaging version 3.  PSMX3 is implemented over RoCEv2 and verbs.
+
 *FI_PROTO_RDMA_CM_IB_RC*
 : The protocol runs over Infiniband reliable-connected queue pairs,
   using the RDMA CM protocol for connection establishment.
@@ -714,16 +739,26 @@ data into target memory for RMA and atomic operations.  Data ordering
 is separate, but dependent on message ordering (defined below).  Data
 ordering is unspecified where message order is not defined.
 
-Data ordering refers to the access of target memory by subsequent
+Data ordering refers to the access of the same target memory by subsequent
 operations.  When back to back RMA read or write operations access the
 same registered memory location, data ordering indicates whether the
 second operation reads or writes the target memory after the first
-operation has completed.  Because RMA ordering applies between two
-operations, and not within a single data transfer, ordering is defined
-per byte-addressable memory location.  I.e.  ordering specifies
+operation has completed.  For example, will an RMA read that follows
+an RMA write read back the data that was written?  Similarly, will an
+RMA write that follows an RMA read update the target buffer after the
+read has transferred the original data?  Data ordering answers these
+questions, even in the presence of errors, such as the need to resend
+data because of lost or corrupted network traffic.
+
+RMA ordering applies between two operations, and not within a single
+data transfer.  Therefore, ordering is defined
+per byte-addressable memory location.  I.e. ordering specifies
 whether location X is accessed by the second operation after the first
 operation.  Nothing is implied about the completion of the first
-operation before the second operation is initiated.
+operation before the second operation is initiated.  For example, if
+the first operation updates locations X and Y, but the second operation
+only accesses location X, there are no guarantees defined relative to
+location Y and the second operation.
 
 In order to support large data transfers being broken into multiple packets
 and sent using multiple paths through the fabric, data ordering may be
@@ -1038,7 +1073,7 @@ message order.  Relaxed completion order may enable faster reporting of
 completed transfers, allow acknowledgments to be sent over different
 fabric paths, and support more sophisticated retry mechanisms.
 This can result in lower-latency completions, particularly when
-using unconnected endpoints.  Strict completion ordering may require
+using connectionless endpoints.  Strict completion ordering may require
 that providers queue completed operations or limit available optimizations.
 
 For transmit requests, completion ordering depends on the endpoint
@@ -1072,9 +1107,21 @@ be used with the FI_INJECT data transfer flag.
 
 ## size
 
-The size of the context.  The size is specified as the minimum number
-of transmit operations that may be posted to the endpoint without the
-operation returning -FI_EAGAIN.
+The size of the transmit context.  The mapping of the size value to resources
+is provider specific, but it is directly related to the number of command
+entries allocated for the endpoint.  A smaller size value consumes fewer
+hardware and software resources, while a larger size allows queuing more
+transmit requests.
+
+While the size attribute guides the size of the underlying endpoint transmit
+queue, there is not necessarily a one-to-one mapping between a transmit
+operation and a queue entry.  A single transmit operation may consume
+multiple queue entries; for example, one per scatter-gather entry.
+Additionally, the size field is intended to guide the allocation of the
+endpoint's transmit context.  Specifically, for connectionless endpoints,
+there may be lower-level queues used to track communication on a per peer basis.
+The sizes of any lower-level queues may be significantly smaller than
+the endpoint's transmit size, in order to reduce resource utilization.
 
 ## iov_limit
 
@@ -1254,9 +1301,21 @@ anticipate receiving unexpected messages, rather than modifying this value.
 
 ## size
 
-The size of the context.  The size is specified as the minimum number
-of receive operations that may be posted to the endpoint without the
-operation returning -FI_EAGAIN.
+The size of the receive context.  The mapping of the size value to resources
+is provider specific, but it is directly related to the number of command
+entries allocated for the endpoint.  A smaller size value consumes fewer
+hardware and software resources, while a larger size allows queuing more
+receive requests.
+
+While the size attribute guides the size of the underlying endpoint receive
+queue, there is not necessarily a one-to-one mapping between a receive
+operation and a queue entry.  A single receive operation may consume
+multiple queue entries; for example, one per scatter-gather entry.
+Additionally, the size field is intended to guide the allocation of the
+endpoint's receive context.  Specifically, for connectionless endpoints,
+there may be lower-level queues used to track communication on a per peer basis.
+The sizes of any lower-level queues may be significantly smaller than
+the endpoint's receive size, in order to reduce resource utilization.
 
 ## iov_limit
 
diff --git a/deps/libfabric/man/fi_fabric.3.md b/deps/libfabric/man/fi_fabric.3.md
index a69b0e288f2ff5579249d3f1b26418469fb051b7..852036f11a5a514387fe43ce265b047c10f708d3 100644
--- a/deps/libfabric/man/fi_fabric.3.md
+++ b/deps/libfabric/man/fi_fabric.3.md
@@ -7,12 +7,12 @@ tagline: Libfabric Programmer's Manual
 
 # NAME
 
-fi_fabric \- Fabric domain operations
+fi_fabric \- Fabric network operations
 
 fi_fabric / fi_close
-: Open / close a fabric domain
+: Open / close a fabric network
 
-fi_tostr
+fi_tostr / fi_tostr_r
 : Convert fabric attributes, flags, and capabilities to printable string
 
 # SYNOPSIS
@@ -26,6 +26,9 @@ int fi_fabric(struct fi_fabric_attr *attr,
 int fi_close(struct fid *fabric);
 
 char * fi_tostr(const void *data, enum fi_type datatype);
+
+char * fi_tostr_r(char *buf, size_t len, const void *data,
+    enum fi_type datatype);
 ```
 
 # ARGUMENTS
@@ -34,33 +37,51 @@ char * fi_tostr(const void *data, enum fi_type datatype);
 : Attributes of fabric to open.
 
 *fabric*
-: Fabric domain
+: Fabric network
 
 *context*
 : User specified context associated with the opened object.  This
   context is returned as part of any associated asynchronous event.
 
+*buf*
+: Output buffer to write string.
+
+*len*
+: Size in bytes of memory referenced by buf.
+
+*data*
+: Input data to convert into a string.  The format of data is determined
+  by the datatype parameter.
+
+*datatype*
+: Indicates the data to convert to a printable string.
+
 # DESCRIPTION
 
-A fabric domain represents a collection of hardware and software
+A fabric identifier is used to reference opened fabric resources
+and library related objects.
+
+The fabric network represents a collection of hardware and software
 resources that access a single physical or virtual network.  All
 network ports on a system that can communicate with each other through
-their attached networks belong to the same fabric domain.  A fabric
-domain shares network addresses and can span multiple providers.
+their attached networks belong to the same fabric.  A fabric
+network shares network addresses and can span multiple providers.  An
+application must open a fabric network prior to allocating other network
+resources, such as communication endpoints.
 
 ## fi_fabric
 
-Opens a fabric provider.  The attributes of the fabric provider are
+Opens a fabric network provider.  The attributes of the fabric provider are
 specified through the open call, and may be obtained by calling
 fi_getinfo.
 
 ## fi_close
 
 The fi_close call is used to release all resources associated with a
-fabric domain or interface.  All items associated with the opened
+fabric object.  All items associated with the opened
 fabric must be released prior to calling fi_close.
 
-## fi_tostr
+## fi_tostr / fi_tostr_r
 
 Converts fabric interface attributes, capabilities, flags, and enum
 values into a printable string.  The data parameter accepts a pointer
@@ -147,10 +168,17 @@ datatype or field value.
 *FI_TYPE_HMEM_IFACE*
 : enum fi_hmem_iface *
 
+*FI_TYPE_CQ_FORMAT*
+: enum fi_cq_format
+
 fi_tostr() will return a pointer to an internal libfabric buffer that
 should not be modified, and will be overwritten the next time
 fi_tostr() is invoked.  fi_tostr() is not thread safe.
 
+The fi_tostr_r() function is a re-entrant and thread safe version of
+fi_tostr().  It writes the string into a buffer provided by the caller.
+fi_tostr_r() returns the start of the caller's buffer.
+
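+For example (an illustrative sketch):
+
+```c
+char buf[256];
+uint64_t caps = FI_MSG | FI_RMA;
+
+/* Safe to call concurrently, as each thread uses its own buffer. */
+printf("%s\n", fi_tostr_r(buf, sizeof(buf), &caps, FI_TYPE_CAPS));
+```
+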
 # NOTES
 
 The following resources are associated with fabric domains: access
diff --git a/deps/libfabric/man/fi_getinfo.3.md b/deps/libfabric/man/fi_getinfo.3.md
index 5159356655bd3d3b97101b4db76219f60693eeff..e78dbd541dbf5de9bbae409e98481b88057b8040 100644
--- a/deps/libfabric/man/fi_getinfo.3.md
+++ b/deps/libfabric/man/fi_getinfo.3.md
@@ -623,6 +623,10 @@ fabric.  See [`fi_av`(3)](fi_av.3.html).
 : Address is an Intel proprietary format used with their Performance Scaled
   Messaging protocol version 2.
 
+*FI_ADDR_PSMX3*
+: Address is an Intel proprietary format used with their Performance Scaled
+  Messaging protocol version 3.
+
 *FI_ADDR_STR*
 : Address is a formatted character string.  The length and content of
   the string is address and/or provider specific, but in general follows
diff --git a/deps/libfabric/man/fi_mr.3.md b/deps/libfabric/man/fi_mr.3.md
index 1197c0dd2751a7fd50e328ce58d247d8dab27f66..578c36a411a2ee5eff52de6ad4cbb978af443da3 100644
--- a/deps/libfabric/man/fi_mr.3.md
+++ b/deps/libfabric/man/fi_mr.3.md
@@ -72,8 +72,8 @@ int fi_mr_unmap_key(struct fid_domain *domain, uint64_t key);
 
 int fi_mr_bind(struct fid_mr *mr, struct fid *bfid, uint64_t flags);
 
-int fi_mr_refresh(struct fid_mr *mr, const struct iovec *iov, size, count,
-    uint64_t flags)
+int fi_mr_refresh(struct fid_mr *mr, const struct iovec *iov,
+    size_t count, uint64_t flags);
 
 int fi_mr_enable(struct fid_mr *mr);
 ```
@@ -93,10 +93,10 @@ int fi_mr_enable(struct fid_mr *mr);
 : User specified context associated with the memory region.
 
 *buf*
-: Memory buffer to register with the fabric hardware
+: Memory buffer to register with the fabric hardware.
 
 *len*
-: Length of memory buffer to register
+: Length of memory buffer to register.  Must be > 0.
 
 *iov*
 : Vectored memory buffer.
@@ -112,7 +112,8 @@ int fi_mr_enable(struct fid_mr *mr);
   This parameter is reserved for future use and must be 0.
 
 *requested_key*
-: Optional requested remote key associated with registered buffers.
+: Requested remote key associated with registered buffers.  Parameter
+  is ignored if FI_MR_PROV_KEY flag is set in the domain mr_mode bits.
 
 *attr*
 : Memory region attributes
@@ -129,29 +130,44 @@ of a remote RMA or atomic data transfer.  Additionally, a fabric
 provider may require that data buffers be registered before being used
 in local transfers.  Memory registration restrictions are controlled
 using a separate set of mode bits, specified through the domain
-attributes (mr_mode field).
+attributes (mr_mode field).  Each mr_mode bit requires that an
+application take specific steps in order to use memory buffers with
+libfabric interfaces.
 
 The following apply to memory registration.
 
-*Scalable Memory Registration*
-: By default, memory registration is considered scalable.  (For library versions
-  1.4 and earlier, this is indicated by setting mr_mode to FI_MR_SCALABLE,
-  with the fi_info mode bit FI_LOCAL_MR set to 0).  For versions 1.5 and later,
-  scalable is implied by the lack of any mr_mode bits being set.  The setting
-  of mr_mode bits therefore adjusts application behavior as described below.
-  Default, scalable registration has several properties.
-
-  In scalable mode, registration occurs on memory address ranges.
-  Because registration refers to memory regions, versus data buffers, the
-  address ranges given for a registration request do not need to map to
+*Default Memory Registration*
+: If no mr_mode bits are set, the default behaviors described below are
+  followed.  Historically, these defaults were collectively referred to as
+  scalable memory registration.  The default requirements are outlined below,
+  followed by definitions of how each mr_mode bit alters the definition.
+
+  Compatibility: For library versions 1.4 and earlier, this was indicated by
+  setting mr_mode to FI_MR_SCALABLE and the fi_info mode bit FI_LOCAL_MR to 0.
+  FI_MR_SCALABLE and FI_LOCAL_MR were deprecated in libfabric version 1.5,
+  though they are supported for backwards compatibility purposes.
+
+  For security, memory registration is required for data buffers that are
+  accessed directly by a peer process.  For example, registration is
+  required for RMA target buffers (read or written to), and those accessed
+  by atomic or collective operations.
+
+  By default, registration occurs on virtual address ranges.
+  Because registration refers to address ranges, rather than allocated
+  data buffers, the address ranges do not need to map to
   data buffers allocated by the application at the time the registration
   call is made.  That is, an application can register any
   range of addresses in their virtual address space, whether or not those
   addresses are backed by physical pages or have been allocated.
 
-  The resulting memory regions are accessible by peers starting at a base
-  address of 0.  That is, the target address that is specified is a byte
-  offset into the registered region.
+  Note that physical pages must back addresses prior to the addresses being
+  accessed as part of a data transfer operation, or the data transfers will
+  fail.  Additionally, depending on the operation, this could result in the
+  local process receiving a segmentation fault for accessing invalid memory.
+
+  Once registered, the resulting memory regions are accessible by peers starting
+  at a base address of 0.  That is, the target address that is specified is a
+  byte offset into the registered region.
 
   The application also selects the access key associated with the MR.  The
   key size is restricted to a maximum of 8 bytes.
@@ -161,6 +177,20 @@ The following apply to memory registration.
   tagged sends, RMA, and atomics -- as well as buffers posted for receive
   and tagged receive operations.
 
+  Although the default memory registration behavior is convenient for
+  application developers, it is difficult to implement in hardware.
+  Attempts to hide the hardware requirements from the application often
+  result in significant and unacceptable impacts to performance.  The
+  following mr_mode bits are provided as input into fi_getinfo.  If a
+  provider requires the behavior defined for an mr_mode bit, it will leave
+  the bit set on output to fi_getinfo.  Otherwise, the provider can clear
+  the bit to indicate that the behavior is not needed.
+
+  By setting an mr_mode bit, the application has agreed to adjust its
+  behavior as indicated.  Importantly, applications that choose to support
+  an mr_mode must be prepared to handle the case where the mr_mode is
+  not required.  A provider will clear an mr_mode bit if it is not needed.
+
 *FI_MR_LOCAL*
 : When the FI_MR_LOCAL mode bit is set, applications must register all
   data buffers that will be accessed by the local hardware and provide
@@ -262,20 +292,28 @@ The following apply to memory registration.
   Similarly, if FI_MR_LOCAL is set, but FI_MR_HMEM is not, the desc
   parameter must either be valid or NULL.
 
+*FI_MR_COLLECTIVE*
+: This bit is associated with the FI_COLLECTIVE capability.  When set,
+  the provider requires that memory regions used in collective operations
+  be explicitly registered for use with collective calls.  This
+  requires registering regions passed to collective calls using the
+  FI_COLLECTIVE flag.
+
 *Basic Memory Registration*
-: Basic memory registration is indicated by the FI_MR_BASIC mr_mode bit.
-  FI_MR_BASIC is maintained for backwards compatibility (libfabric version
-  1.4 or earlier).  The behavior of basic registration is equivalent
-  to setting the following mr_mode bits to one: FI_MR_VIRT_ADDR,
-  FI_MR_ALLOCATED, and FI_MR_PROV_KEY.  Additionally, providers that
-  support basic registration usually required the fi_info mode bit FI_LOCAL_MR.
-  As a result, it is recommended that applications migrating from libfabric 1.4
-  or earlier or wanting to support basic memory registration set the mr_mode
-  to FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_LOCAL.
-  FI_MR_BASIC must be set alone.  Other mr_mode bit pairings are invalid.
+: Basic memory registration was deprecated in libfabric version 1.5, but
+  is supported for backwards compatibility.  Basic memory registration
+  is indicated by setting mr_mode equal to FI_MR_BASIC.
+  FI_MR_BASIC must be set alone and not paired with mr_mode bits.
   Unlike other mr_mode bits, if FI_MR_BASIC is set on input to fi_getinfo(),
-  it will not be cleared by the provider.  That is, setting FI_MR_BASIC
-  to one requests basic registration.
+  it will not be cleared by the provider.  That is, setting mr_mode equal to
+  FI_MR_BASIC forces basic registration if the provider supports it.
+
+  The behavior of basic registration is equivalent
+  to requiring the following mr_mode bits: FI_MR_VIRT_ADDR,
+  FI_MR_ALLOCATED, and FI_MR_PROV_KEY.  Additionally, providers that
+  support basic registration usually require the (deprecated) fi_info mode
+  bit FI_LOCAL_MR, which was incorporated into the FI_MR_LOCAL mr_mode
+  bit.
 
 The registrations functions -- fi_mr_reg, fi_mr_regv, and
 fi_mr_regattr -- are used to register one or more memory regions with
@@ -312,7 +350,8 @@ region must provide the key associated with the registration.
 Because MR keys must be provided by a remote process, an application
 can use the requested_key parameter to indicate that a specific key
 value be returned.  Support for user requested keys is provider
-specific and is determined by the mr_mode domain attribute.
+specific and is determined by the FI_MR_PROV_KEY flag value in the
+mr_mode domain attribute.
 
 Remote RMA and atomic operations indicate the location within a
 registered memory region by specifying an address.  The location
@@ -498,11 +537,13 @@ bitwise OR of the following flags:
 
 *FI_SEND*
 : The memory buffer may be used in outgoing message data transfers.  This
-  includes fi_msg and fi_tagged send operations.
+  includes fi_msg and fi_tagged send operations, as well as fi_collective
+  operations.
 
 *FI_RECV*
 : The memory buffer may be used to receive inbound message transfers.
-  This includes fi_msg and fi_tagged receive operations.
+  This includes fi_msg and fi_tagged receive operations, as well as
+  fi_collective operations.
 
 *FI_READ*
 : The memory buffer may be used as the result buffer for RMA read
@@ -526,6 +567,12 @@ bitwise OR of the following flags:
   or atomic operation.  The contents of the memory buffer may be
   modified as a result of such operations.
 
+*FI_COLLECTIVE*
+: This flag provides an explicit indication that the memory buffer may
+  be used with collective operations.  Use of this flag is required if
+  the FI_MR_COLLECTIVE mr_mode bit has been set on the domain.  This flag
+  should be paired with FI_SEND and/or FI_RECV.
+
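+A hedged sketch of registering a buffer for collective use when
+FI_MR_COLLECTIVE is in effect (domain, buf, and len are assumed):
+
+```c
+struct fid_mr *mr;
+int ret = fi_mr_reg(domain, buf, len,
+		    FI_COLLECTIVE | FI_SEND | FI_RECV,
+		    0 /* offset */, 0 /* requested_key */,
+		    0 /* flags */, &mr, NULL);
+```
+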
 Note that some providers may not enforce fine grained access permissions.
 For example, a memory region registered for FI_WRITE access may also
 behave as if FI_SEND were specified as well.  Relaxed enforcement of
@@ -545,7 +592,7 @@ can use the requested_key field to indicate that a specific key be
 used by the provider.  This allows applications to use well known key
 values, which can avoid applications needing to exchange and store keys.
 Support for user requested keys is provider specific and is determined
-by the mr_mode domain attribute.
+by the FI_MR_PROV_KEY flag in the mr_mode domain attribute field.
 
 ## context
 
@@ -565,8 +612,8 @@ version 1.5 or greater.
 Indicates the key to associate with this memory registration.  Authorization
 keys are used to limit communication between endpoints.  Only peer endpoints
 that are programmed to use the same authorization key may access the memory
-region.  The domain authorization key will be used if the auth_key_size 
-provided is 0.  This field is ignored unless the fabric is opened with API 
+region.  The domain authorization key will be used if the auth_key_size
+provided is 0.  This field is ignored unless the fabric is opened with API
 version 1.5 or greater.
 
 ## iface
@@ -587,7 +634,7 @@ requested the FI_HMEM capability.
 
 *FI_HMEM_ZE*
 : Uses Intel L0 ZE interfaces such as zeDriverAllocSharedMem,
-  zeDriverFreeMem. 
+  zeDriverFreeMem.
 
 ## device
 Reserved 64 bits for device identifier if using non-standard HMEM interface.
@@ -622,6 +669,13 @@ portable applications target using those interfaces; however, their use
 does carry extra message and memory footprint overhead, making it less
 desirable for highly scalable apps.
 
+There may be cases where device peer to peer support should not be used or
+cannot be used, such as when the PCIe ACS configuration does not permit the
+transfer. The FI_HMEM_DISABLE_P2P environment variable can be set to notify
+Libfabric that peer to peer transactions should not be used. The provider may
+choose to perform a copy instead, or may fail to support FI_HMEM if it
+is unable to do so.
+
 # FLAGS
 
 The follow flag may be specified to any memory registration call.
@@ -637,6 +691,60 @@ The follow flag may be specified to any memory registration call.
   specified if persistent completion semantics or persistent data transfers
   are required when accessing the registered region.
 
+*FI_HMEM_DEVICE_ONLY*
+: This flag indicates that the memory is only accessible by a device. The
+  device is specified by the fi_mr_attr fields iface and device. This refers
+  to memory regions that were allocated using a device API AllocDevice call
+  (as opposed to using the host allocation or unified/shared memory allocation).
+
+*FI_HMEM_HOST_ALLOC*
+: This flag indicates that the memory is owned by the host only. Whether it
+  can be accessed by the device is implementation dependent. The fi_mr_attr
+  field iface is still used to identify the device API, but the field device
+  is ignored. This refers to memory regions that were allocated using a device
+  API AllocHost call (as opposed to using malloc-like host allocation,
+  unified/shared memory allocation, or AllocDevice).
+
+# MEMORY DOMAINS
+
+Memory domains identify the physical separation of memory which
+may or may not be accessible through the same virtual address space.
+Traditionally, applications only dealt with a single memory domain,
+that of host memory tightly coupled with the system CPUs.  With
+the introduction of device and non-uniform memory subsystems,
+applications often need to be aware of which memory domain a particular
+virtual address maps to.
+
+As a general rule, separate physical devices can be considered to have
+their own memory domains.  For example, a NIC may have user accessible
+memory, and would be considered a separate memory domain from memory
+on a GPU.  Both the NIC and GPU memory domains are separate from host
+system memory.  Individual GPUs or computation accelerators may have
+distinct memory domains, or may be connected in such a way (e.g. a GPU
+specific fabric) that all GPUs would belong to the same memory domain.
+Unfortunately, identifying memory domains is specific to each
+system and its physical and/or virtual configuration.
+
+Understanding memory domains in heterogenous memory environments is
+important as it can impact data ordering and visibility as viewed
+by an application.  It is also important to understand which memory
+domain an application is most tightly coupled to.  In most cases,
+applications are tightly coupled to host memory.  However, an
+application running directly on a GPU or NIC may be more tightly
+coupled to memory associated with those devices.
+
+Memory regions are often associated with a single memory domain.
+The domain is typically indicated by the fi_mr_attr iface and device
+fields, though it is possible for physical pages backing a virtual
+memory region to migrate between memory domains based on access patterns.
+For example, the physical pages referenced by a virtual address range
+could migrate between host memory and GPU memory, depending on which
+computational unit is actively using it.
+
+See the [`fi_endpoint`(3)](fi_endpoint.3.html) and [`fi_cq`(3)](fi_cq.3.html)
+man pages for additional discussion on message, data, and completion ordering
+semantics, including the impact of memory domains.
+
 # RETURN VALUES
 
 Returns 0 on success.  On error, a negative value corresponding to
@@ -724,6 +832,19 @@ configure registration caches.
   are: 0 or 1. Note that the ROCR memory monitor requires a ROCR version with
   unified virtual addressing enabled.
 
+*FI_MR_ZE_CACHE_MONITOR_ENABLED*
+: The ZE cache monitor is responsible for detecting ZE device memory
+  (FI_HMEM_ZE) changes made between the device virtual addresses used by an
+  application and the underlying device physical pages. Valid monitor options
+  are: 0 or 1.
+
+More direct access to the internal registration cache is possible through the
+fi_open() call, using the "mr_cache" service name.  Once opened, custom
+memory monitors may be installed.  A memory monitor is a component of the cache
+responsible for detecting changes in virtual to physical address mappings.
+Some level of control over the cache is possible through the above mentioned
+environment variables.
+
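+A sketch of opening the cache by its well-known name (whether NULL
+attributes are accepted here is implementation specific; the attribute
+structure used to install a custom monitor is not shown):
+
+```c
+struct fid *cache;
+int ret = fi_open(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION),
+		  "mr_cache", NULL, 0, 0, &cache, NULL);
+if (!ret)
+	fi_close(cache);
+```
+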
 # SEE ALSO
 
 [`fi_getinfo`(3)](fi_getinfo.3.html),
diff --git a/deps/libfabric/man/fi_msg.3.md b/deps/libfabric/man/fi_msg.3.md
index 0d1f9bdf87ed1e58a21d4e1ead3446bf815678ad..02fc15d72faa59cf0a81f33598cd3bb3f3751ca9 100644
--- a/deps/libfabric/man/fi_msg.3.md
+++ b/deps/libfabric/man/fi_msg.3.md
@@ -12,7 +12,7 @@ fi_msg - Message data transfer operations
 fi_recv / fi_recvv / fi_recvmsg
 :   Post a buffer to receive an incoming message
 
-fi_send / fi_sendv / fi_sendmsg  
+fi_send / fi_sendv / fi_sendmsg
 fi_inject / fi_senddata
 :   Initiate an operation to send a message
 
@@ -127,11 +127,7 @@ event details.
 
 The call fi_send transfers the data contained in the user-specified
 data buffer to a remote endpoint, with message boundaries being
-maintained.  For connection based endpoints (FI_EP_MSG) the local
-endpoint must be connected to a remote endpoint or destination before
-fi_send is called.  Unless the endpoint has been configured
-differently, the data buffer passed into fi_send must not be touched
-by the application until the fi_send call completes asynchronously.
+maintained.
 
 ## fi_sendv
 
@@ -143,7 +139,7 @@ message.
 ## fi_sendmsg
 
 The fi_sendmsg call supports data transfers over both connected and
-unconnected endpoints, with the ability to control the send operation
+connectionless endpoints, with the ability to control the send operation
 per call through the use of flags.  The fi_sendmsg function takes a
 `struct fi_msg` as input.
 
@@ -192,7 +188,7 @@ corresponding endpoint.  Posted receives are searched in the order in
 which they were posted in order to match sends.
 Message boundaries are maintained.  The order in which
 the receives complete is dependent on
-the endpoint type and protocol.  For unconnected endpoints, the
+the endpoint type and protocol.  For connectionless endpoints, the
 src_addr parameter can be used to indicate that a buffer should be
 posted to receive incoming data from a specific remote endpoint.
 
@@ -205,7 +201,7 @@ parameter to a receive incoming data.
 ## fi_recvmsg
 
 The fi_recvmsg call supports posting buffers over both connected and
-unconnected endpoints, with the ability to control the receive
+connectionless endpoints, with the ability to control the receive
 operation per call through the use of flags.  The fi_recvmsg function
 takes a struct fi_msg as input.
 
@@ -276,11 +272,16 @@ fi_sendmsg.
 *FI_INJECT_COMPLETE*
 : Applies to fi_sendmsg.  Indicates that a completion should be
   generated when the source buffer(s) may be reused.
-  
+
 *FI_TRANSMIT_COMPLETE*
-: Applies to fi_sendmsg.  Indicates that a completion should not be
-  generated until the operation has been successfully transmitted and
-  is no longer being tracked by the provider.
+: Applies to fi_sendmsg and fi_recvmsg.  For sends, indicates that a
+  completion should not be generated until the operation has been
+  successfully transmitted and is no longer being tracked by the provider.
+  For receive operations, indicates that a completion may be generated
+  as soon as the message has been processed by the local provider,
+  even if the message data may not be visible to all processing
+  elements.  See [`fi_cq`(3)](fi_cq.3.html) for target side completion
+  semantics.
 
 *FI_DELIVERY_COMPLETE*
 : Applies to fi_sendmsg.  Indicates that a completion should be
@@ -293,7 +294,7 @@ fi_sendmsg.
   targeting the same peer endpoint have completed.  Operations posted
   after the fencing will see and/or replace the results of any
   operations initiated prior to the fenced operation.
-  
+
   The ordering of operations starting at the posting of the fenced
   operation (inclusive) to the posting of a subsequent fenced operation
   (exclusive) is controlled by the endpoint's ordering semantics.
diff --git a/deps/libfabric/man/fi_provider.3.md b/deps/libfabric/man/fi_provider.3.md
new file mode 100644
index 0000000000000000000000000000000000000000..f5b94b8fbb150001df27eda67bf29fb8425f6f5d
--- /dev/null
+++ b/deps/libfabric/man/fi_provider.3.md
@@ -0,0 +1,255 @@
+---
+layout: page
+title: fi_provider(3)
+tagline: Libfabric Programmer's Manual
+---
+{% include JB/setup %}
+
+# NAME
+
+fi_prov_ini \- External provider entry point
+
+fi_param_define / fi_param_get
+: Register and retrieve environment variables with the libfabric core
+
+fi_log_enabled / fi_log_ready / fi_log
+: Control and output debug logging information.
+
+fi_open / fi_close
+: Open a named library object
+
+fi_export_fid / fi_import_fid
+: Share a fabric object between different providers or resources
+
+# SYNOPSIS
+
+```c
+#include <rdma/fabric.h>
+#include <rdma/prov/fi_prov.h>
+
+struct fi_provider* fi_prov_ini(void);
+
+int fi_param_define(const struct fi_provider *provider, const char *param_name,
+	enum fi_param_type type, const char *help_string_fmt, ...);
+
+int fi_param_get_str(struct fi_provider *provider, const char *param_name,
+	char **value);
+
+int fi_param_get_int(struct fi_provider *provider, const char *param_name,
+	int *value);
+
+int fi_param_get_bool(struct fi_provider *provider, const char *param_name,
+	int *value);
+
+int fi_param_get_size_t(struct fi_provider *provider, const char *param_name,
+	size_t *value);
+```
+
+```c
+#include <rdma/fabric.h>
+#include <rdma/prov/fi_prov.h>
+#include <rdma/prov/fi_log.h>
+
+int fi_log_enabled(const struct fi_provider *prov, enum fi_log_level level,
+	enum fi_log_subsys subsys);
+
+int fi_log_ready(const struct fi_provider *prov, enum fi_log_level level,
+	enum fi_log_subsys subsys, uint64_t *showtime);
+
+void fi_log(const struct fi_provider *prov, enum fi_log_level level,
+	enum fi_log_subsys subsys, const char *func, int line,
+	const char *fmt, ...);
+```
+
+```c
+#include <rdma/fabric.h>
+
+int fi_open(uint32_t version, const char *name, void *attr,
+	size_t attr_len, uint64_t flags, struct fid **fid, void *context);
+
+int fi_close(struct fid *fid);
+```
+
+```c
+#include <rdma/fabric.h>
+#include <rdma/fi_ext.h>
+
+int fi_export_fid(struct fid *fid, uint64_t flags,
+	struct fid **expfid, void *context);
+
+int fi_import_fid(struct fid *fid, struct fid *expfid, uint64_t flags);
+```
+
+# ARGUMENTS
+
+*provider*
+: Reference to the provider.
+
+*version*
+: API version requested by application.
+
+*name*
+: Well-known name of the library object to open.
+
+*attr*
+: Optional attributes of object to open.
+
+*attr_len*
+: Size of any attribute structure passed to fi_open.  Should be 0
+  if no attributes are given.
+
+*fid*
+: Returned fabric identifier for opened object.
+
+# DESCRIPTION
+
+A fabric provider implements the application facing software
+interfaces needed to access network specific protocols,
+drivers, and hardware.  The interfaces and structures defined by
+this man page are exported by the libfabric library, but are
+targeted for provider implementations, rather than for direct
+use by most applications.
+
+Integrated providers are those built directly into the libfabric
+library itself.  External providers are loaded dynamically by
+libfabric at initialization time.  External providers must be in
+a standard library path or in the libfabric library search path
+as specified by an environment variable.  Additionally, external
+providers must be named with the suffix "-fi.so".
+
+Named objects are special purpose resources which are accessible directly
+to applications.  They may be used to enhance or modify the behavior of
+the library core.  For details, see the fi_open call below.
+
+## fi_prov_ini
+
+This entry point must be defined by external providers.  On loading,
+libfabric will invoke fi_prov_ini() to retrieve the provider's
+fi_provider structure.  Additional interactions between the libfabric
+core and the provider will be through the interfaces defined by that
+struct.
+
+## fi_param_define
+
+Defines a configuration parameter for use by a specified provider. The
+help_string and param_name arguments must be non-NULL; help_string
+must additionally be non-empty. Both are copied internally and may be
+freed after calling fi_param_define.
+
+## fi_param_get
+
+Gets the value of a configuration parameter previously defined using
+fi_param_define(). The value comes from the environment variable name of
+the form FI_<provider_name>_<param_name>, all converted to upper case.
+
+If the parameter was previously defined and the user set a value,
+FI_SUCCESS is returned and (*value) points to the retrieved
+value.
+
+If the parameter name was previously defined, but the user did
+not set a value, -FI_ENODATA is returned and the value of (*value)
+is unchanged.
+
+If the parameter name was not previously defined via
+fi_param_define(), -FI_ENOENT will be returned and the value of
+(*value) is unchanged.
+
+If the value in the environment is not valid for the parameter type,
+-FI_EINVAL will be returned and the value of (*value) is unchanged.
+
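+A short sketch of the define/get pairing is shown below; the "foo"
+provider, parameter name, and default value are illustrative
+assumptions.
+
+```c
+#include <rdma/fabric.h>
+#include <rdma/fi_errno.h>
+#include <rdma/prov/fi_prov.h>
+
+extern struct fi_provider foo_prov;
+
+static int foo_max_pending = 64;	/* built-in default */
+
+static void foo_read_params(void)
+{
+	/* Registers FI_FOO_MAX_PENDING; both strings are copied. */
+	fi_param_define(&foo_prov, "max_pending", FI_PARAM_INT,
+			"Maximum pending operations (default: 64)");
+
+	/* Overrides the default only if the user set the variable;
+	 * -FI_ENODATA leaves foo_max_pending untouched. */
+	fi_param_get_int(&foo_prov, "max_pending", &foo_max_pending);
+}
+```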
+## fi_log_enabled / fi_log_ready / fi_log
+
+These functions control debug and informational logging output.
+Providers typically access these functions through the FI_LOG and
+related macros in fi_log.h and do not call these functions directly.
+
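+As an illustration, a provider would normally log through the macros
+rather than call fi_log() itself; FI_WARN below is one such wrapper
+from fi_log.h, and foo_prov is a placeholder provider structure.
+
+```c
+#include <rdma/fabric.h>
+#include <rdma/prov/fi_prov.h>
+#include <rdma/prov/fi_log.h>
+
+extern struct fi_provider foo_prov;
+
+static void foo_warn(int ret)
+{
+	/* Expands to fi_log() with the function name and line filled in. */
+	FI_WARN(&foo_prov, FI_LOG_CORE, "operation failed: %d\n", ret);
+}
+```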
+## fi_open
+
+Open a library resource using a well-known name.  This gives
+applications and providers a mechanism to modify or enhance core
+library services and behavior.  The details are specific to the
+requested object name.  Most applications will not need this level
+of control.
+
+The library API version known to the application should be provided
+through the version parameter.  The use of attributes is object dependent.
+If required, attributes should be provided through the attr parameter,
+with attr_len set to the size of the referenced attribute structure.
+The following is a list of published names, along with descriptions
+of the service or resource to which they correspond.
+
+*mr_cache*
+: The mr_cache object references the internal memory registration cache
+  used by the different providers.  Additional information on the cache
+  is available in the `fi_mr(3)` man page.
+
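+As a hedged illustration, opening and closing the mr_cache object might
+look as follows; the API version passed here is an assumption and should
+match what the caller was built against.
+
+```c
+#include <rdma/fabric.h>
+
+int open_mr_cache(void)
+{
+	struct fid *cache;
+	int ret;
+
+	/* No attributes for this object: attr = NULL, attr_len = 0. */
+	ret = fi_open(FI_VERSION(1, 14), "mr_cache", NULL, 0, 0,
+		      &cache, NULL);
+	if (ret)
+		return ret;		/* negative fabric errno */
+
+	/* ... interact with the cache through its exported interfaces ... */
+
+	return fi_close(cache);
+}
+```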
+## fi_export_fid / fi_import_fid
+
+Generally, fabric objects are allocated and managed entirely by a single
+provider.  Typically only the application facing software interfaces of
+a fabric object are defined, for example, the message or tagged operations
+of an endpoint.  The fi_export_fid and fi_import_fid calls provide
+a mechanism by which provider facing APIs may be accessed.  This allows
+the creation of fid objects that are shareable between providers, or
+for library plug-in services.  The ability to export a shareable object
+is object and provider implementation dependent.
+
+Shareable fids typically contain at least 3 main components: a
+base fid, a set of exporter defined ops, and a set of importer defined
+ops.
+
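+A minimal sketch of the export/import pairing, assuming "ep" and
+"importer" are fids obtained elsewhere and that the provider supports
+exporting the object:
+
+```c
+#include <rdma/fabric.h>
+#include <rdma/fi_ext.h>
+
+int share_fid(struct fid *ep, struct fid *importer)
+{
+	struct fid *exp;
+	int ret;
+
+	ret = fi_export_fid(ep, 0, &exp, NULL);
+	if (ret)
+		return ret;	/* e.g. -FI_ENOSYS if not exportable */
+
+	/* Hand the shareable fid to another provider or plug-in. */
+	return fi_import_fid(importer, exp, 0);
+}
+```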
+# NOTES
+
+TODO
+
+# PROVIDER INTERFACE
+
+The fi_provider structure defines the entry points used by the libfabric
+core to access the provider.  All other calls into a provider are made
+through function pointers associated with allocated objects.
+
+```c
+struct fi_provider {
+	uint32_t version;
+	uint32_t fi_version;
+	struct fi_context context;
+	const char *name;
+	int	(*getinfo)(uint32_t version, const char *node, const char *service,
+			uint64_t flags, const struct fi_info *hints,
+			struct fi_info **info);
+	int	(*fabric)(struct fi_fabric_attr *attr, struct fid_fabric **fabric,
+			void *context);
+	void	(*cleanup)(void);
+};
+```
+
+## version
+
+The provider version.  For providers integrated with the library, this is
+often the same as the library version.
+
+## fi_version
+
+The library interface version that the provider was implemented against.
+The provider's fi_version must be greater than or equal to an application's
+requested api version for the application to use the provider.  It is a
+provider's responsibility to support older versions of the api if it
+wishes to support legacy applications.  For integrated providers, this
+is typically the interface version of the library release they ship with.
+
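+As an illustrative sketch, a provider might validate the requested api
+version as follows; the version values are assumptions, not fixed by
+the library.
+
+```c
+#include <rdma/fabric.h>
+#include <rdma/fi_errno.h>
+
+#define FOO_FI_VERSION	FI_VERSION(1, 14)	/* provider's fi_version */
+
+static int foo_check_api(uint32_t app_version)
+{
+	/* FI_VERSION packs major/minor so versions compare numerically. */
+	if (app_version > FOO_FI_VERSION)
+		return -FI_ENOSYS;
+	return FI_SUCCESS;
+}
+```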
+## TODO
+
+# RETURN VALUE
+
+Returns FI_SUCCESS on success. On error, a negative value corresponding to
+fabric errno is returned. Fabric errno values are defined in
+`rdma/fi_errno.h`.
+
+# ERRORS
+
+
+# SEE ALSO
+
+[`fabric`(7)](fabric.7.html),
+[`fi_getinfo`(3)](fi_getinfo.3.html),
+[`fi_mr`(3)](fi_mr.3.html),
diff --git a/deps/libfabric/man/fi_provider.7.md b/deps/libfabric/man/fi_provider.7.md
index 7a7134009e2f63a29465261fcde570d9a492216c..adb0f7960018cd0e437a26238ecf208632b168cf 100644
--- a/deps/libfabric/man/fi_provider.7.md
+++ b/deps/libfabric/man/fi_provider.7.md
@@ -33,6 +33,14 @@ This distribution of libfabric contains the following providers
 : High-speed InfiniBand networking from Intel.  See
   [`fi_psm`(7)](fi_psm.7.html) for more information.
 
+*PSM2*
+: High-speed Omni-Path networking from Intel.  See
+  [`fi_psm2`(7)](fi_psm2.7.html) for more information.
+
+*PSM3*
+: High-speed Ethernet networking from Intel.  See
+  [`fi_psm3`(7)](fi_psm3.7.html) for more information.
+
 *Sockets*
 : A general purpose provider that can be used on any network that
   supports TCP/UDP sockets.  This provider is not intended to provide
diff --git a/deps/libfabric/man/fi_psm.7.md b/deps/libfabric/man/fi_psm.7.md
index 80514769f31c3ef0454ed4a9dfc901af4c50dd91..961125a11db39c4bb258b134d2e98ad172b9a490 100644
--- a/deps/libfabric/man/fi_psm.7.md
+++ b/deps/libfabric/man/fi_psm.7.md
@@ -165,3 +165,4 @@ The *psm* provider checks for the following environment variables:
 [`fabric`(7)](fabric.7.html),
 [`fi_provider`(7)](fi_provider.7.html),
 [`fi_psm2`(7)](fi_psm2.7.html),
+[`fi_psm3`(7)](fi_psm3.7.html),
diff --git a/deps/libfabric/man/fi_psm2.7.md b/deps/libfabric/man/fi_psm2.7.md
index 7de5b213900f473ff07f545a8f99407209ec02c8..686122527c6602cb8d7e1db454de099cb20acc7b 100644
--- a/deps/libfabric/man/fi_psm2.7.md
+++ b/deps/libfabric/man/fi_psm2.7.md
@@ -115,6 +115,12 @@ The *psm2* provider checks for the following environment variables:
 
   The default UUID is 00FF00FF-0000-0000-0000-00FF0F0F00FF.
 
+  It is possible to create endpoints with a UUID different from the one
+  set here. To achieve that, set 'info->ep_attr->auth_key' to the uuid
+  value and 'info->ep_attr->auth_key_size' to its size (16 bytes) when
+  calling fi_endpoint() or fi_scalable_ep(). It is still true that an
+  endpoint can only communicate with endpoints with the same UUID.
+
 *FI_PSM2_NAME_SERVER*
 : The *psm2* provider has a simple built-in name server that can be used
   to resolve an IP address or host name into a transport address needed
@@ -256,8 +262,21 @@ The *psm2* provider checks for the following environment variables:
   to 1 (means *tag60*) or 2 (means *tag64*), the choice is fixed at compile time
   and this runtime option will be disabled.
 
+# PSM2 EXTENSIONS
+
+The *psm2* provider supports limited low level parameter setting through the
+fi_set_val() and fi_get_val() functions. Currently the following parameters
+can be set via the domain fid:
+
+*FI_PSM2_DISCONNECT*
+: Overwrite the global runtime parameter *FI_PSM2_DISCONNECT* for this domain.
+  See the *RUNTIME PARAMETERS* section for details.
+
+Valid parameter names are defined in the header file *rdma/fi_ext_psm2.h*.
+
 # SEE ALSO
 
 [`fabric`(7)](fabric.7.html),
 [`fi_provider`(7)](fi_provider.7.html),
 [`fi_psm`(7)](fi_psm.7.html),
+[`fi_psm3`(7)](fi_psm3.7.html),
diff --git a/deps/libfabric/man/fi_psm3.7.md b/deps/libfabric/man/fi_psm3.7.md
new file mode 100644
index 0000000000000000000000000000000000000000..23256bb8eff8eed692b564536a213cf8532887a8
--- /dev/null
+++ b/deps/libfabric/man/fi_psm3.7.md
@@ -0,0 +1,265 @@
+---
+layout: page
+title: fi_psm3(7)
+tagline: Libfabric Programmer's Manual
+---
+{% include JB/setup %}
+
+# NAME
+
+fi_psm3 \- The PSM3 Fabric Provider
+
+# OVERVIEW
+
+The *psm3* provider implements a Performance Scaled Messaging
+capability which supports Intel RoCEv2 capable NICs. PSM3 represents
+an Ethernet and standard RoCEv2 enhancement of previous PSM
+implementations.
+
+# SUPPORTED FEATURES
+
+The *psm3* provider supports a subset of all the features defined in the
+libfabric API.
+
+Endpoint types
+: Supports non-connection based types *FI_DGRAM* and *FI_RDM*.
+
+Endpoint capabilities
+: Endpoints can support any combination of data transfer capabilities
+  *FI_TAGGED*, *FI_MSG*, *FI_ATOMICS*, and *FI_RMA*. These capabilities
+  can be further refined by *FI_SEND*, *FI_RECV*, *FI_READ*, *FI_WRITE*,
+  *FI_REMOTE_READ*, and *FI_REMOTE_WRITE* to limit the direction of
+  operations.
+
+  *FI_MULTI_RECV* is supported for the non-tagged message queue only.
+
+  Scalable endpoints are supported if the underlying PSM3 library supports
+  multiple endpoints. This condition must be satisfied both when the
+  provider is built and when the provider is used. See the *Scalable
+  endpoints* section for more information.
+
+  Other supported capabilities include *FI_TRIGGER*, *FI_REMOTE_CQ_DATA*,
+  *FI_RMA_EVENT*, *FI_SOURCE*, and *FI_SOURCE_ERR*. Furthermore,
+  *FI_NAMED_RX_CTX* is supported when scalable endpoints are enabled.
+
+Modes
+: *FI_CONTEXT* is required for the *FI_TAGGED* and *FI_MSG*
+  capabilities. This means that any request belonging to these two
+  categories that generates a completion must pass as the operation
+  context a valid pointer to type *struct fi_context*, and the space
+  referenced by the pointer must remain untouched until the request
+  has completed. If neither *FI_TAGGED* nor *FI_MSG* is requested,
+  the *FI_CONTEXT* mode is not required.
+
+Progress
+: The *psm3* provider performs optimally with manual progress. By default,
+  the application is expected to call the *fi_cq_read* or *fi_cntr_read*
+  function from time to time when no other libfabric function is called,
+  to ensure progress is made in a timely manner. The provider does support
+  auto progress mode, but performance can be significantly impacted if the
+  application relies purely on the provider to make progress.
+
+Scalable endpoints
+: Scalable endpoints support depends on the multi-EP feature of the *PSM3*
+  library. If the *PSM3* library supports this feature, the availability is
+  further controlled by an environment variable *PSM3_MULTI_EP*. The *psm3*
+  provider automatically sets this variable to 1 if it is not set. The
+  feature can be disabled explicitly by setting *PSM3_MULTI_EP* to 0.
+
+  When creating a scalable endpoint, the exact number of contexts requested
+  should be set in the "fi_info" structure passed to the *fi_scalable_ep*
+  function. This number should be set in "fi_info->ep_attr->tx_ctx_cnt" or
+  "fi_info->ep_attr->rx_ctx_cnt" or both; the greater of the two is used
+  (see the sketch after this section). The *psm3* provider allocates all
+  requested contexts upfront when the scalable endpoint is created. The
+  same context is used for both Tx and Rx.
+
+  For optimal performance, it is advised to avoid having multiple threads
+  accessing the same context, either directly by posting send/recv/read/write
+  requests, or indirectly by polling associated completion queues or counters.
+
+  Using the scalable endpoint as a whole in communication functions is not
+  supported. Instead, individual tx context or rx context of the scalable
+  endpoint should be used. Similarly, using the address of the scalable
+  endpoint as the source address or destination address doesn't collectively
+  address all the tx/rx contexts; it addresses only the first tx/rx
+  context.
+
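+The following sketch shows one way to request contexts on a scalable
+endpoint; "domain" and "info" are assumed to come from the usual
+fi_getinfo()/fi_domain() sequence, and the counts are illustrative.
+
+```c
+#include <rdma/fabric.h>
+#include <rdma/fi_endpoint.h>
+
+int open_sep(struct fid_domain *domain, struct fi_info *info,
+	     struct fid_ep **sep, struct fid_ep **tx0)
+{
+	int ret;
+
+	info->ep_attr->tx_ctx_cnt = 4;	/* the greater of the two */
+	info->ep_attr->rx_ctx_cnt = 4;	/* counts is allocated     */
+
+	ret = fi_scalable_ep(domain, info, sep, NULL);
+	if (ret)
+		return ret;
+
+	/* Communicate through individual contexts, not the SEP itself. */
+	return fi_tx_context(*sep, 0, NULL, tx0, NULL);
+}
+```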
+# LIMITATIONS
+
+The *psm3* provider doesn't support all the features defined in the
+libfabric API. Here are some of the limitations not listed above:
+
+Unsupported features
+: These features are unsupported: connection management, passive endpoint,
+  and shared receive context.
+
+# RUNTIME PARAMETERS
+
+The *psm3* provider checks for the following environment variables:
+
+*FI_PSM3_UUID*
+: PSM requires that each job has a unique ID (UUID). All the processes
+  in the same job need to use the same UUID in order to be able to
+  talk to each other. The PSM reference manual advises keeping the UUID
+  unique to each job. In practice, it generally works fine to reuse a
+  UUID as long as (1) no two jobs with the same UUID are running at
+  the same time; and (2) previous jobs with the same UUID have exited
+  normally. When running into "resource busy" or "connection failure"
+  issues for no apparent reason, it is advisable to manually set the
+  UUID to a value different from the default.
+
+  The default UUID is 00FF00FF-0000-0000-0000-00FF0F0F00FF.
+
+  It is possible to create endpoints with a UUID different from the one
+  set here. To achieve that, set 'info->ep_attr->auth_key' to the uuid
+  value and 'info->ep_attr->auth_key_size' to its size (16 bytes) when
+  calling fi_endpoint() or fi_scalable_ep(), as sketched below. It is
+  still true that an endpoint can only communicate with endpoints with
+  the same UUID.
+
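+A hedged sketch of this per-endpoint override; the uuid storage and the
+surrounding objects are placeholders:
+
+```c
+#include <rdma/fabric.h>
+#include <rdma/fi_endpoint.h>
+
+static uint8_t job_uuid[16];	/* fill with the desired 16-byte UUID */
+
+int open_ep_with_uuid(struct fid_domain *domain, struct fi_info *info,
+		      struct fid_ep **ep)
+{
+	info->ep_attr->auth_key = job_uuid;
+	info->ep_attr->auth_key_size = sizeof job_uuid;
+	return fi_endpoint(domain, info, ep, NULL);
+}
+```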
+*FI_PSM3_NAME_SERVER*
+: The *psm3* provider has a simple built-in name server that can be used
+  to resolve an IP address or host name into a transport address needed
+  by the *fi_av_insert* call. The main purpose of this name server is to
+  allow simple client-server type applications (such as those in *fabtests*)
+  to be written purely with libfabric, without using any out-of-band
+  communication mechanism. For such applications, the server would run first
+  to allow endpoints to be created and registered with the name server, and
+  then the client would call *fi_getinfo* with the *node* parameter set to
+  the IP address or host name of the server. The resulting *fi_info*
+  structure would have the transport address of the endpoint created by the
+  server in the *dest_addr* field. Optionally the *service* parameter can
+  be used in addition to *node*. Notice that the *service* number is
+  interpreted by the provider and is not a TCP/IP port number.
+
+  The name server is on by default. It can be turned off by setting the
+  variable to 0. This may save a small amount of resources since a separate
+  thread is created when the name server is on.
+
+  The provider detects OpenMPI and MPICH runs and changes the default setting
+  to off.
+
+*FI_PSM3_TAGGED_RMA*
+: The RMA functions are implemented on top of the PSM Active Message
+  functions, which limit the amount of data that can be transferred in a
+  single message. Large transfers can be divided into small chunks and
+  pipelined, but the resulting bandwidth is sub-optimal.
+
+  The *psm3* provider uses PSM tag-matching message queue functions to
+  achieve higher bandwidth for large size RMA. It takes advantage of the
+  extra tag bits available in PSM3 to separate the RMA traffic from the
+  regular tagged message queue.
+
+  The option is on by default. To turn it off, set the variable to 0.
+
+*FI_PSM3_DELAY*
+: Time (seconds) to sleep before closing PSM endpoints. This is a workaround
+  for a bug in some versions of PSM library.
+
+  The default setting is 0.
+
+*FI_PSM3_TIMEOUT*
+: Timeout (seconds) for gracefully closing PSM endpoints. A forced closing
+  will be issued if the timeout expires.
+
+  The default setting is 5.
+
+*FI_PSM3_CONN_TIMEOUT*
+: Timeout (seconds) for establishing connection between two PSM endpoints.
+
+  The default setting is 5.
+
+*FI_PSM3_PROG_INTERVAL*
+: When auto progress is enabled (requested via the hints to *fi_getinfo*),
+  a progress thread is created to make progress calls from time to time.
+  This option sets the interval (microseconds) between progress calls.
+
+  The default setting is 1 if affinity is set, or 1000 if not. See
+  *FI_PSM3_PROG_AFFINITY*.
+
+*FI_PSM3_PROG_AFFINITY*
+: When set, specifies the set of CPU cores to bind the progress thread
+  to. The format is
+  `<start>[:<end>[:<stride>]][,<start>[:<end>[:<stride>]]]*`,
+  where each triplet `<start>:<end>:<stride>` defines a block of
+  core_ids. Both `<start>` and `<end>` can be either the `core_id`
+  (when >=0) or `core_id - num_cores` (when <0).
+
+  By default affinity is not set.
+
+*FI_PSM3_INJECT_SIZE*
+: Maximum message size allowed for fi_inject and fi_tinject calls. This is
+  an experimental feature to allow some applications to override the
+  default inject size limitation. When the inject size is larger than the
+  default value, some inject calls might block.
+
+  The default setting is 64.
+
+*FI_PSM3_LOCK_LEVEL*
+: When set, dictates the level of locking used by the provider. Level
+  2 means all locks are enabled. Level 1 disables some locks and is suitable
+  for runs that limit the access to each PSM3 context to a single thread.
+  Level 0 disables all locks and thus is only suitable for single threaded
+  runs.
+
+  To use level 0 or level 1, wait objects and auto progress mode cannot be
+  used because they introduce internal threads that may break the conditions
+  needed for these levels.
+
+  The default setting is 2.
+
+*FI_PSM3_LAZY_CONN*
+: There are two strategies for when to establish connections between the PSM3
+  endpoints that OFI endpoints are built on top of. In eager connection mode,
+  connections are established when addresses are inserted into the address
+  vector. In lazy connection mode, connections are established when addresses
+  are used the first time in communication. Eager connection mode has slightly
+  lower critical path overhead but lazy connection mode scales better.
+
+  This option controls how the two connection modes are used. When set to 1,
+  lazy connection mode is always used. When set to 0, eager connection mode
+  is used when required conditions are all met and lazy connection mode is
+  used otherwise. The conditions for eager connection mode are: (1) multiple
+  endpoint (and scalable endpoint) support is disabled by explicitly setting
+  PSM3_MULTI_EP=0; and (2) the address vector type is FI_AV_MAP.
+
+  The default setting is 0.
+
+*FI_PSM3_DISCONNECT*
+: The provider has a mechanism to automatically send disconnection notifications
+  to all connected peers before the local endpoint is closed. In response,
+  the peers call *psm3_ep_disconnect* to clean up the connection state on their
+  side. This allows the same PSM3 epid to be used by different dynamically
+  started processes (clients) to communicate with the same peer (server). This
+  mechanism, however, introduces extra overhead in the finalization phase. For
+  applications that never reuse epids within the same session, such overhead is
+  unnecessary.
+
+  This option controls whether the automatic disconnection notification
+  mechanism should be enabled. For the client-server applications mentioned
+  above, the client side should set this option to 1, but the server should
+  set it to 0.
+
+  The default setting is 0.
+
+*FI_PSM3_TAG_LAYOUT*
+: Select how the 96-bit PSM3 tag bits are organized. Currently three choices are
+  available: *tag60* means 32-4-60 partitioning for CQ data, internal protocol
+  flags, and application tag. *tag64* means 4-28-64 partitioning for internal
+  protocol flags, CQ data, and application tag. *auto* means to choose either
+  *tag60* or *tag64* based on the hints passed to fi_getinfo -- *tag60* is used
+  if remote CQ data support is requested explicitly, either by passing non-zero value
+  via *hints->domain_attr->cq_data_size* or by including *FI_REMOTE_CQ_DATA* in
+  *hints->caps*, otherwise *tag64* is used. If *tag64* is the result of automatic
+  selection, *fi_getinfo* also returns a second instance of the provider with
+  *tag60* layout.
+
+  The default setting is *auto*.
+
+  Notice that if the provider is compiled with macro *PSMX3_TAG_LAYOUT* defined
+  to 1 (means *tag60*) or 2 (means *tag64*), the choice is fixed at compile time
+  and this runtime option will be disabled.
+
+# SEE ALSO
+
+[`fabric`(7)](fabric.7.html),
+[`fi_provider`(7)](fi_provider.7.html),
+[`fi_psm`(7)](fi_psm.7.html),
+[`fi_psm2`(7)](fi_psm2.7.html),
diff --git a/deps/libfabric/man/fi_rma.3.md b/deps/libfabric/man/fi_rma.3.md
index 312d9defcd87200e96fbdf7b12aca00f6d951bd8..156780834c31ff7230a7630df59b5ae367c6ea19 100644
--- a/deps/libfabric/man/fi_rma.3.md
+++ b/deps/libfabric/man/fi_rma.3.md
@@ -12,7 +12,7 @@ fi_rma - Remote memory access operations
 fi_read / fi_readv / fi_readmsg
 :   Initiates a read from remote memory
 
-fi_write / fi_writev / fi_writemsg    
+fi_write / fi_writev / fi_writemsg
 fi_inject_write / fi_writedata
 :   Initiate a write to remote memory
 
@@ -141,11 +141,7 @@ may be delivered.
 ## fi_write
 
 The call fi_write transfers the data contained in the user-specified
-data buffer to a remote memory region.  The local endpoint must be
-connected to a remote endpoint or destination before fi_write is
-called.  Unless the endpoint has been configured differently, the data
-buffer passed into fi_write must not be touched by the application
-until the fi_write call completes asynchronously.
+data buffer to a remote memory region.
 
 ## fi_writev
 
@@ -156,7 +152,7 @@ referenced by the iov parameter to the remote memory region.
 ## fi_writemsg
 
 The fi_writemsg call supports data transfers over both connected and
-unconnected endpoints, with the ability to control the write operation
+connectionless endpoints, with the ability to control the write operation
 per call through the use of flags.  The fi_writemsg function takes a
 struct fi_msg_rma as input.
 
@@ -199,9 +195,7 @@ transfer.
 ## fi_read
 
 The fi_read call requests that the remote endpoint transfer data from
-the remote memory region into the local data buffer.  The local
-endpoint must be connected to a remote endpoint or destination before
-fi_read is called.
+the remote memory region into the local data buffer.
 
 ## fi_readv
 
@@ -212,7 +206,7 @@ the set of data buffers referenced by the iov parameter.
 ## fi_readmsg
 
 The fi_readmsg call supports data transfers over both connected and
-unconnected endpoints, with the ability to control the read operation
+connectionless endpoints, with the ability to control the read operation
 per call through the use of flags.  The fi_readmsg function takes a
 struct fi_msg_rma as input.
 
@@ -254,7 +248,7 @@ fi_writemsg.
 *FI_INJECT_COMPLETE*
 : Applies to fi_writemsg.  Indicates that a completion should be
   generated when the source buffer(s) may be reused.
-  
+
 *FI_TRANSMIT_COMPLETE*
 : Applies to fi_writemsg.  Indicates that a completion should not be
   generated until the operation has been successfully transmitted and
@@ -276,7 +270,7 @@ fi_writemsg.
   targeting the same peer endpoint have completed.  Operations posted
   after the fencing will see and/or replace the results of any
   operations initiated prior to the fenced operation.
-  
+
   The ordering of operations starting at the posting of the fenced
   operation (inclusive) to the posting of a subsequent fenced operation
   (exclusive) is controlled by the endpoint's ordering semantics.
diff --git a/deps/libfabric/man/fi_rxm.7.md b/deps/libfabric/man/fi_rxm.7.md
index 35c86fde42b7a92129a0aab2426abdf5378fcdf1..5d6190f2df24fffd97683af6c7c9f9a4d4900f50 100644
--- a/deps/libfabric/man/fi_rxm.7.md
+++ b/deps/libfabric/man/fi_rxm.7.md
@@ -131,6 +131,13 @@ The ofi_rxm provider checks for the following environment variables.
 : Defines the maximum number of MSG provider CQ entries (default: 1) that would
   be read per progress (RxM CQ read).
 
+*FI_OFI_RXM_ENABLE_DYN_RBUF*
+: Enables support for dynamic receive buffering, if supported by the message
+  endpoint provider.  This feature allows direct placement of received
+  message data into application buffers, bypassing RxM bounce buffers.
+  It targets providers with internal network buffering,
+  such as the tcp provider.  (default: false)
+
 *FI_OFI_RXM_SAR_LIMIT*
 : Set this environment variable to control the RxM SAR (Segmentation And Reassembly)
   protocol. Messages of size greater than this (default: 128 Kb) would be transmitted
diff --git a/deps/libfabric/man/fi_shm.7.md b/deps/libfabric/man/fi_shm.7.md
index 8e00a5907bf68efcb85fb1e373b8f84369dcd0dd..b27a57cce88abb82d3efd6eb7b82f2bd78d7817f 100644
--- a/deps/libfabric/man/fi_shm.7.md
+++ b/deps/libfabric/man/fi_shm.7.md
@@ -74,7 +74,7 @@ of operations.
   provided (and in the case of setting the src address without FI_SOURCE and
   no hints), the process ID will be used as a default address.
   On endpoint creation, if the src_addr has the "fi_shm://" prefix, the provider
-  will append ":[uid]:[dom_idx]:[ep_idx]" as a unique endpoint name (essentially,
+  will append ":[uid]:[ep_idx]" as a unique endpoint name (essentially,
   in place of a service).  In the case of the "fi_ns://" prefix (or any other
   prefix if one was provided by the application), no supplemental information
   is required to make it unique and it will remain with only the
@@ -122,6 +122,9 @@ The *shm* provider checks for the following environment variables:
 *FI_SHM_RX_SIZE*
 : Maximum number of outstanding rx operations. Default 1024
 
+*FI_SHM_DISABLE_CMA*
+: Manually disables CMA. Default false
+
 # SEE ALSO
 
 [`fabric`(7)](fabric.7.html),
diff --git a/deps/libfabric/man/fi_tagged.3.md b/deps/libfabric/man/fi_tagged.3.md
index 07eeedda591467f24dbcd9974fdef6612e689b74..badcfce252e0223b2a72503a04cd131114f58901 100644
--- a/deps/libfabric/man/fi_tagged.3.md
+++ b/deps/libfabric/man/fi_tagged.3.md
@@ -165,7 +165,7 @@ message.
 ## fi_tsendmsg
 
 The fi_tsendmsg call supports data transfers over both connected and
-unconnected endpoints, with the ability to control the send operation
+connectionless endpoints, with the ability to control the send operation
 per call through the use of flags.  The fi_tsendmsg function takes a
 struct fi_msg_tagged as input.
 
@@ -216,7 +216,7 @@ parameter to a receive incoming data.
 ## fi_trecvmsg
 
 The fi_trecvmsg call supports posting buffers over both connected and
-unconnected endpoints, with the ability to control the receive
+connectionless endpoints, with the ability to control the receive
 operation per call through the use of flags.  The fi_trecvmsg function
 takes a struct fi_msg_tagged as input.
 
@@ -258,7 +258,7 @@ and/or fi_tsendmsg.
 *FI_INJECT_COMPLETE*
 : Applies to fi_tsendmsg.  Indicates that a completion should be
   generated when the source buffer(s) may be reused.
-  
+
 *FI_TRANSMIT_COMPLETE*
 : Applies to fi_tsendmsg.  Indicates that a completion should not be
   generated until the operation has been successfully transmitted and
@@ -276,7 +276,7 @@ and/or fi_tsendmsg.
   targeting the same peer endpoint have completed.  Operations posted
   after the fencing will see and/or replace the results of any
   operations initiated prior to the fenced operation.
-  
+
   The ordering of operations starting at the posting of the fenced
   operation (inclusive) to the posting of a subsequent fenced operation
   (exclusive) is controlled by the endpoint's ordering semantics.
@@ -289,7 +289,7 @@ The following flags may be used with fi_trecvmsg.
   allocated buffering enabled (see fi_rx_attr total_buffered_recv).
   Unlike standard receive operations, a receive operation with the FI_PEEK
   flag set does not remain queued with the provider after the peek completes
-  successfully. The peek operation operates asynchronously, and the results 
+  successfully. The peek operation operates asynchronously, and the results
   of the peek operation are available in the completion queue associated with
   the endpoint. If no message is found matching the tags specified in the peek
   request, then a completion queue error entry with err field set to FI_ENOMSG
diff --git a/deps/libfabric/man/fi_tcp.7.md b/deps/libfabric/man/fi_tcp.7.md
index 4eb3e44aa1474f0b4c3b80f2d7cacae0e78dcf18..82efd96e959e4aa2d1a3425db84acd235f6c0d4a 100644
--- a/deps/libfabric/man/fi_tcp.7.md
+++ b/deps/libfabric/man/fi_tcp.7.md
@@ -53,6 +53,12 @@ The tcp provider check for the following enviroment variables -
   tcp provider for its passive endpoint creation. This is useful where
   only a range of ports are allowed by firewall for tcp connections.
 
+*FI_TCP_TX_SIZE*
+: Default tx context size (default: 256)
+
+*FI_TCP_RX_SIZE*
+: Default rx context size (default: 256)
+
 # LIMITATIONS
 
 The tcp provider is implemented over TCP sockets to emulate libfabric API.
diff --git a/deps/libfabric/man/fi_verbs.7.md b/deps/libfabric/man/fi_verbs.7.md
index 18036cdf4e7861eae0621169fccfcdf5b5eca98b..595d3dca1962a35ae733b2c11ed472ad28783c9d 100644
--- a/deps/libfabric/man/fi_verbs.7.md
+++ b/deps/libfabric/man/fi_verbs.7.md
@@ -153,6 +153,8 @@ The support for fork in the provider has the following limitations:
 ### XRC Transport
 The XRC transport is intended to be used when layered with the RXM provider and
 requires the use of shared receive contexts. See [`fi_rxm`(7)](fi_rxm.7.thml).
+To enable XRC, the following environment variables must usually be set:
+FI_VERBS_PREFER_XRC and FI_OFI_RXM_USE_SRX.
 
 # RUNTIME PARAMETERS
 
diff --git a/deps/libfabric/man/man1/fi_info.1 b/deps/libfabric/man/man1/fi_info.1
index 90c75c14dbe2f78561e3a9e374685f8e893b2a5c..2b413e576fdbad34c13db8f36ea0b98ae334c949 100644
--- a/deps/libfabric/man/man1/fi_info.1
+++ b/deps/libfabric/man/man1/fi_info.1
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_info" "1" "2020\-01\-30" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_info" "1" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -9,8 +9,8 @@ fi_info \- Simple utility to query for fabric interfaces
 .IP
 .nf
 \f[C]
-\ fi_info\ [OPTIONS]
-\f[]
+ fi_info [OPTIONS]
+\f[R]
 .fi
 .SH DESCRIPTION
 .PP
@@ -25,101 +25,73 @@ all providers and endpoint types will be returned.
 .SH OPTIONS
 .SS Filtering
 .TP
-.B \f[I]\-n, \-\-node=<NAME>\f[]
+.B \f[I]\-n, \[en]node=<NAME>\f[R]
 Node name or address used to filter interfaces.
 Only interfaces which can reach the given node or address will respond.
-.RS
-.RE
 .TP
-.B \f[I]\-P, \-\-port=<PORT>\f[]
+.B \f[I]\-P, \[en]port=<PORT>\f[R]
 Port number used to filter interfaces.
-.RS
-.RE
 .TP
-.B \f[I]\-c, \-\-caps=<CAP1|CAP2>..\f[]
+.B \f[I]\-c, \[en]caps=<CAP1|CAP2>..\f[R]
 Pipe separated list of capabilities used to filter interfaces.
 Only interfaces supporting all of the given capabilities will respond.
 For more information on capabilities, see fi_getinfo(3).
-.RS
-.RE
 .TP
-.B \f[I]\-m, \-\-mode=<MOD1|MOD2>..\f[]
+.B \f[I]\-m, \[en]mode=<MOD1|MOD2>..\f[R]
 Pipe separated list of modes used to filter interfaces.
 Only interfaces supporting all of the given modes will respond.
 For more information on, modes see fi_getinfo(3).
-.RS
-.RE
 .TP
-.B \f[I]\-t, \-\-ep_type=<EPTYPE>\f[]
+.B \f[I]\-t, \[en]ep_type=<EPTYPE>\f[R]
 Specifies the type of fabric interface communication desired.
 For example, specifying FI_EP_DGRAM would return only interfaces which
 support unreliable datagram.
 For more information on endpoint types, see fi_endpoint(3).
-.RS
-.RE
 .TP
-.B \f[I]\-a, \-\-addr_format=<FMT>\f[]
+.B \f[I]\-a, \[en]addr_format=<FMT>\f[R]
 Filter fabric interfaces by their address format.
 For example, specifying FI_SOCKADDR_IN would return only interfaces
 which use sockaddr_in structures for addressing.
 For more information on address formats, see fi_getinfo(3).
-.RS
-.RE
 .TP
-.B \f[I]\-p, \-\-provider=<PROV>\f[]
+.B \f[I]\-p, \[en]provider=<PROV>\f[R]
 Filter fabric interfaces by the provider implementation.
-For a list of providers, see the \f[C]\-\-list\f[] option.
-.RS
-.RE
+For a list of providers, see the \f[C]\-\-list\f[R] option.
 .TP
-.B \f[I]\-d, \-\-domain=<DOMAIN>\f[]
+.B \f[I]\-d, \[en]domain=<DOMAIN>\f[R]
 Filter interfaces to only those with the given domain name.
-.RS
-.RE
 .TP
-.B \f[I]\-f, \-\-fabric=<FABRIC>\f[]
+.B \f[I]\-f, \[en]fabric=<FABRIC>\f[R]
 Filter interfaces to only those with the given fabric name.
-.RS
-.RE
 .SS Discovery
 .TP
-.B \f[I]\-e, \-\-env\f[]
+.B \f[I]\-e, \[en]env\f[R]
 List libfabric related environment variables which can be used to enable
 extra configuration or tuning.
-.RS
-.RE
 .TP
 .B *\-g [filter]
 Same as \-e option, with output limited to environment variables
 containing filter as a substring.
-.RS
-.RE
 .TP
-.B \f[I]\-l, \-\-list\f[]
+.B \f[I]\-l, \[en]list\f[R]
 List available libfabric providers.
-.RS
-.RE
 .TP
-.B \f[I]\-v, \-\-verbose\f[]
+.B \f[I]\-v, \[en]verbose\f[R]
 By default, fi_info will display a summary of each of the interfaces
 discovered.
 If the verbose option is enabled, then all of the contents of the
 fi_info structure are displayed.
 For more information on the data contained in the fi_info structure, see
 fi_getinfo(3).
-.RS
-.RE
 .TP
-.B \f[I]\-\-version\f[]
+.B \f[I]\[en]version\f[R]
 Display versioning information.
-.RS
-.RE
 .SH USAGE EXAMPLES
 .IP
 .nf
 \f[C]
-$\ fi_info\ \-n\ 30.0.11.1\ \-p\ usnic\ \-t\ FI_EP_DGRAM
-\f[]
+$ fi_info \-n 30.0.11.1 \-p usnic \-t FI_EP_DGRAM
+\f[R]
 .fi
 .PP
 This will respond with all fabric interfaces that can reach address
@@ -131,19 +103,19 @@ discovered:
 .IP
 .nf
 \f[C]
-$\ ./fi_info\ \-n\ 30.0.11.1\ \-p\ usnic\ \-t\ FI_EP_DGRAM
-provider:\ usnic
-\ \ \ \ fabric:\ 30.0.11.0/24
-\ \ \ \ domain:\ usnic_2
-\ \ \ \ version:\ 1.0
-\ \ \ \ type:\ FI_EP_DGRAM
-\ \ \ \ protocol:\ FI_PROTO_UDP
-\f[]
+$ ./fi_info \-n 30.0.11.1 \-p usnic \-t FI_EP_DGRAM
+provider: usnic
+    fabric: 30.0.11.0/24
+    domain: usnic_2
+    version: 1.0
+    type: FI_EP_DGRAM
+    protocol: FI_PROTO_UDP
+\f[R]
 .fi
 .PP
-To see the full fi_info structure, specify the \f[C]\-v\f[] option.
+To see the full fi_info structure, specify the \f[C]\-v\f[R] option.
 .SH SEE ALSO
 .PP
-\f[C]fi_getinfo(3)\f[], \f[C]fi_endpoint(3)\f[]
+\f[C]fi_getinfo(3)\f[R], \f[C]fi_endpoint(3)\f[R]
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man1/fi_pingpong.1 b/deps/libfabric/man/man1/fi_pingpong.1
index c9b859c5492f7b9cac17ac46e3c20a337d30e288..49c9ffc867a0dd741dcb54c4c19b190ef58639d4 100644
--- a/deps/libfabric/man/man1/fi_pingpong.1
+++ b/deps/libfabric/man/man1/fi_pingpong.1
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_pingpong" "1" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_pingpong" "1" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -9,9 +9,9 @@ fi_pingpong \- Quick and simple pingpong test for libfabric
 .IP
 .nf
 \f[C]
-\ fi_pingpong\ [OPTIONS]\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ start\ server
-\ fi_pingpong\ [OPTIONS]\ <server\ address>\ \ \ \ \ connect\ to\ server
-\f[]
+ fi_pingpong [OPTIONS]                      start server
+ fi_pingpong [OPTIONS] <server address>     connect to server
+\f[R]
 .fi
 .SH DESCRIPTION
 .PP
@@ -21,7 +21,7 @@ fi_pingpong also displays aggregated statistics after each test run, and
 can additionally verify data integrity upon receipt.
 .PP
 By default, the datagram (FI_EP_DGRAM) endpoint is used for the test,
-unless otherwise specified via \f[C]\-e\f[].
+unless otherwise specified via \f[C]\-e\f[R].
 .SH HOW TO RUN TESTS
 .PP
 Two copies of the program must be launched: first, one copy must be
@@ -33,112 +33,92 @@ As a client\-server test, each have the following usage model:
 .IP
 .nf
 \f[C]
-server$\ fi_pingpong
-\f[]
+server$ fi_pingpong
+\f[R]
 .fi
 .SS Start the client
 .IP
 .nf
 \f[C]
-client$\ fi_pingpong\ <server\ address>
-\f[]
+client$ fi_pingpong <server address>
+\f[R]
 .fi
 .SH OPTIONS
 .PP
 The server and client must be able to communicate properly for the
 fi_pingpong utility to function.
-If any of the \f[C]\-e\f[], \f[C]\-I\f[], \f[C]\-S\f[], or \f[C]\-p\f[]
-options are used, then they must be specified on the invocation for both
-the server and the client process.
-If the \f[C]\-d\f[] option is specified on the server, then the client
+If any of the \f[C]\-e\f[R], \f[C]\-I\f[R], \f[C]\-S\f[R], or
+\f[C]\-p\f[R] options are used, then they must be specified on the
+invocation for both the server and the client process.
+If the \f[C]\-d\f[R] option is specified on the server, then the client
 will select the appropriate domain if no hint is provided on the client
 side.
-If the \f[C]\-d\f[] option is specified on the client, then it must also
-be specified on the server.
-If both the server and client specify the \f[C]\-d\f[] option and the
+If the \f[C]\-d\f[R] option is specified on the client, then it must
+also be specified on the server.
+If both the server and client specify the \f[C]\-d\f[R] option and the
 given domains cannot communicate, then the application will fail.
 .SS Control Messaging
 .TP
-.B \f[I]\-B <src_port>\f[]
+.B \f[I]\-B <src_port>\f[R]
 The non\-default source port number of the control socket.
 If this is not provided then the server will bind to port 47592 by
 default and the client will allow the port to be selected automatically.
-.RS
-.RE
 .TP
-.B \f[I]\-P <dest_port>\f[]
+.B \f[I]\-P <dest_port>\f[R]
 The non\-default destination port number of the control socket.
 If this is not provided then the client will connect to 47592 by
 default.
 The server ignores this option.
-.RS
-.RE
 .SS Fabric Filtering
 .TP
-.B \f[I]\-p <provider_name>\f[]
+.B \f[I]\-p <provider_name>\f[R]
 The name of the underlying fabric provider (e.g., sockets, psm, usnic,
 etc.).
 If a provider is not specified via the \-p switch, the test will pick
 one from the list of available providers (as returned by fi_getinfo(3)).
-.RS
-.RE
 .TP
-.B \f[I]\-e <endpoint>\f[]
+.B \f[I]\-e <endpoint>\f[R]
 The type of endpoint to be used for data messaging between the two
 processes.
 Supported values are dgram, rdm, and msg.
 For more information on endpoint types, see fi_endpoint(3).
-.RS
-.RE
 .TP
-.B \f[I]\-d <domain>\f[]
+.B \f[I]\-d <domain>\f[R]
 The name of the specific domain to be used.
-.RS
-.RE
 .SS Test Options
 .TP
-.B \f[I]\-I <iter>\f[]
+.B \f[I]\-I <iter>\f[R]
 The number of iterations of the test will run.
-.RS
-.RE
 .TP
-.B \f[I]\-S <msg_size>\f[]
-The specific size of the message in bytes the test will use or
-\[aq]all\[aq] to run all the default sizes.
-.RS
-.RE
+.B \f[I]\-S <msg_size>\f[R]
+The specific size of the message in bytes the test will use or `all' to
+run all the default sizes.
 .TP
-.B \f[I]\-c\f[]
+.B \f[I]\-c\f[R]
 Activate data integrity checks at the receiver (note: this will degrade
 performance).
-.RS
-.RE
 .SS Utility
 .TP
-.B \f[I]\-v\f[]
+.B \f[I]\-v\f[R]
 Activate output debugging (warning: highly verbose)
-.RS
-.RE
 .TP
-.B \f[I]\-h\f[]
+.B \f[I]\-h\f[R]
 Displays help output for the pingpong test.
-.RS
-.RE
 .SH USAGE EXAMPLES
 .SS A simple example
-.SS Server: \f[C]fi_pingpong\ \-p\ <provider_name>\f[]
+.SS Server: \f[C]fi_pingpong \-p <provider_name>\f[R]
 .PP
-\f[C]server$\ fi_pingpong\ \-p\ sockets\f[]
-.SS Client: \f[C]fi_pingpong\ \-p\ <provider_name>\ <server_addr>\f[]
+\f[C]server$ fi_pingpong \-p sockets\f[R]
+.SS Client: \f[C]fi_pingpong \-p <provider_name> <server_addr>\f[R]
 .PP
-\f[C]client$\ fi_pingpong\ \-p\ sockets\ 192.168.0.123\f[]
+\f[C]client$ fi_pingpong \-p sockets 192.168.0.123\f[R]
 .SS An example with various options
 .SS Server:
 .PP
-\f[C]server$\ fi_pingpong\ \-p\ usnic\ \-I\ 1000\ \-S\ 1024\f[]
+\f[C]server$ fi_pingpong \-p usnic \-I 1000 \-S 1024\f[R]
 .SS Client:
 .PP
-\f[C]client$\ fi_pingpong\ \-p\ usnic\ \-I\ 1000\ \-S\ 1024\ 192.168.0.123\f[]
+\f[C]client$ fi_pingpong \-p usnic \-I 1000 \-S 1024 192.168.0.123\f[R]
 .PP
 Specifically, this will run a pingpong test with:
 .IP \[bu] 2
@@ -152,17 +132,17 @@ server node as 192.168.0.123
 .SS A longer test
 .SS Server:
 .PP
-\f[C]server$\ fi_pingpong\ \-p\ usnic\ \-I\ 10000\ \-S\ all\f[]
+\f[C]server$ fi_pingpong \-p usnic \-I 10000 \-S all\f[R]
 .SS Client:
 .PP
-\f[C]client$\ fi_pingpong\ \-p\ usnic\ \-I\ 10000\ \-S\ all\ 192.168.0.123\f[]
+\f[C]client$ fi_pingpong \-p usnic \-I 10000 \-S all 192.168.0.123\f[R]
 .SH DEFAULTS
 .PP
 There is no default provider; if a provider is not specified via the
-\f[C]\-p\f[] switch, the test will pick one from the list of available
+\f[C]\-p\f[R] switch, the test will pick one from the list of available
 providers (as returned by fi_getinfo(3)).
 .PP
-If no endpoint type is specified, \[aq]dgram\[aq] is used.
+If no endpoint type is specified, `dgram' is used.
 .PP
 The default tested sizes are: 64, 256, 1024, 4096, 65536, and 1048576.
 The test will only test sizes that are within the selected endpoints
@@ -172,28 +152,28 @@ maximum message size boundary.
 Each test generates data messages which are accounted for.
 Specifically, the displayed statistics at the end are :
 .IP \[bu] 2
-\f[I]bytes\f[] : number of bytes per message sent
+\f[I]bytes\f[R] : number of bytes per message sent
 .IP \[bu] 2
-\f[I]#sent\f[] : number of messages (ping) sent from the client to the
+\f[I]#sent\f[R] : number of messages (ping) sent from the client to the
 server
 .IP \[bu] 2
-\f[I]#ack\f[] : number of replies (pong) of the server received by the
+\f[I]#ack\f[R] : number of replies (pong) of the server received by the
 client
 .IP \[bu] 2
-\f[I]total\f[] : amount of memory exchanged between the processes
+\f[I]total\f[R] : amount of memory exchanged between the processes
 .IP \[bu] 2
-\f[I]time\f[] : duration of this single test
+\f[I]time\f[R] : duration of this single test
 .IP \[bu] 2
-\f[I]MB/sec\f[] : throughput computed from \f[I]total\f[] and
-\f[I]time\f[]
+\f[I]MB/sec\f[R] : throughput computed from \f[I]total\f[R] and
+\f[I]time\f[R]
 .IP \[bu] 2
-\f[I]usec/xfer\f[] : average time for transferring a message outbound
+\f[I]usec/xfer\f[R] : average time for transferring a message outbound
 (ping or pong) in microseconds
 .IP \[bu] 2
-\f[I]Mxfers/sec\f[] : average amount of transfers of message outbound
+\f[I]Mxfers/sec\f[R] : average amount of transfers of message outbound
 per second
 .SH SEE ALSO
 .PP
-\f[C]fi_getinfo\f[](3), \f[C]fi_endpoint\f[](3) \f[C]fabric\f[](7),
+\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3) \f[C]fabric\f[R](7),
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man1/fi_strerror.1 b/deps/libfabric/man/man1/fi_strerror.1
index f2f8676b31f624be915c066ce47b129c208b74bc..0a9be348c601dd13061ea7f521399886ed16ffb4 100644
--- a/deps/libfabric/man/man1/fi_strerror.1
+++ b/deps/libfabric/man/man1/fi_strerror.1
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_strerror" "1" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_strerror" "1" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -9,20 +9,21 @@ fi_strerror \- display libfabric error strings
 .IP
 .nf
 \f[C]
-fi_strerror\ FI_ERROR_CODE
-\f[]
+fi_strerror FI_ERROR_CODE
+\f[R]
 .fi
 .SH DESCRIPTION
 .PP
-Display the error string for the given numeric \f[C]FI_ERROR_CODE\f[].
-\f[C]FI_ERROR_CODE\f[] may be a hexadecimal, octal, or decimal constant.
-Although the \f[C]fi_strerror\f[](3) library function only accepts
+Display the error string for the given numeric \f[C]FI_ERROR_CODE\f[R].
+\f[C]FI_ERROR_CODE\f[R] may be a hexadecimal, octal, or decimal
+constant.
+Although the \f[C]fi_strerror\f[R](3) library function only accepts
 positive error values, for convenience this utility accepts both
 positive and negative error values.
 .PP
 This is primarily a convenience tool for developers.
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7) \f[C]fi_errno\f[](3)
+\f[C]fabric\f[R](7) \f[C]fi_errno\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_atomic.3 b/deps/libfabric/man/man3/fi_atomic.3
index 5c38a5cd526d9777c2aee6762126bee8bb825409..50aaaed46e1d1fb5bef712bb6cee032417910e4d 100644
--- a/deps/libfabric/man/man3/fi_atomic.3
+++ b/deps/libfabric/man/man3/fi_atomic.3
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_atomic" "3" "2019\-09\-27" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_atomic" "3" "2021\-06\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -8,20 +8,14 @@ fi_atomic \- Remote atomic functions
 .TP
 .B fi_atomic / fi_atomicv / fi_atomicmsg / fi_inject_atomic
 Initiates an atomic operation to remote memory
-.RS
-.RE
 .TP
 .B fi_fetch_atomic / fi_fetch_atomicv / fi_fetch_atomicmsg
 Initiates an atomic operation to remote memory, retrieving the initial
 value.
-.RS
-.RE
 .TP
 .B fi_compare_atomic / fi_compare_atomicv / fi_compare_atomicmsg
 Initiates an atomic compare\-operation to remote memory, retrieving the
 initial value.
-.RS
-.RE
 .PP
 fi_atomicvalid / fi_fetch_atomicvalid / fi_compare_atomicvalid /
 fi_query_atomic : Indicates if a provider supports a specific atomic
@@ -30,165 +24,134 @@ operation
 .IP
 .nf
 \f[C]
-#include\ <rdma/fi_atomic.h>
+#include <rdma/fi_atomic.h>
 
-ssize_t\ fi_atomic(struct\ fid_ep\ *ep,\ const\ void\ *buf,
-\ \ \ \ size_t\ count,\ void\ *desc,\ fi_addr_t\ dest_addr,
-\ \ \ \ uint64_t\ addr,\ uint64_t\ key,
-\ \ \ \ enum\ fi_datatype\ datatype,\ enum\ fi_op\ op,\ void\ *context);
+ssize_t fi_atomic(struct fid_ep *ep, const void *buf,
+    size_t count, void *desc, fi_addr_t dest_addr,
+    uint64_t addr, uint64_t key,
+    enum fi_datatype datatype, enum fi_op op, void *context);
 
-ssize_t\ fi_atomicv(struct\ fid_ep\ *ep,\ const\ struct\ fi_ioc\ *iov,
-\ \ \ \ void\ **desc,\ size_t\ count,\ fi_addr_t\ dest_addr,
-\ \ \ \ uint64_t\ addr,\ uint64_t\ key,
-\ \ \ \ enum\ fi_datatype\ datatype,\ enum\ fi_op\ op,\ void\ *context);
+ssize_t fi_atomicv(struct fid_ep *ep, const struct fi_ioc *iov,
+    void **desc, size_t count, fi_addr_t dest_addr,
+    uint64_t addr, uint64_t key,
+    enum fi_datatype datatype, enum fi_op op, void *context);
 
-ssize_t\ fi_atomicmsg(struct\ fid_ep\ *ep,\ const\ struct\ fi_msg_atomic\ *msg,
-\ \ \ \ uint64_t\ flags);
+ssize_t fi_atomicmsg(struct fid_ep *ep, const struct fi_msg_atomic *msg,
+    uint64_t flags);
 
-ssize_t\ fi_inject_atomic(struct\ fid_ep\ *ep,\ const\ void\ *buf,
-\ \ \ \ size_t\ count,\ fi_addr_t\ dest_addr,
-\ \ \ \ uint64_t\ addr,\ uint64_t\ key,
-\ \ \ \ enum\ fi_datatype\ datatype,\ enum\ fi_op\ op);
+ssize_t fi_inject_atomic(struct fid_ep *ep, const void *buf,
+    size_t count, fi_addr_t dest_addr,
+    uint64_t addr, uint64_t key,
+    enum fi_datatype datatype, enum fi_op op);
 
-ssize_t\ fi_fetch_atomic(struct\ fid_ep\ *ep,\ const\ void\ *buf,
-\ \ \ \ size_t\ count,\ void\ *desc,\ void\ *result,\ void\ *result_desc,
-\ \ \ \ fi_addr_t\ dest_addr,\ uint64_t\ addr,\ uint64_t\ key,
-\ \ \ \ enum\ fi_datatype\ datatype,\ enum\ fi_op\ op,\ void\ *context);
+ssize_t fi_fetch_atomic(struct fid_ep *ep, const void *buf,
+    size_t count, void *desc, void *result, void *result_desc,
+    fi_addr_t dest_addr, uint64_t addr, uint64_t key,
+    enum fi_datatype datatype, enum fi_op op, void *context);
 
-ssize_t\ fi_fetch_atomicv(struct\ fid_ep\ *ep,\ const\ struct\ fi_ioc\ *iov,
-\ \ \ \ void\ **desc,\ size_t\ count,\ struct\ fi_ioc\ *resultv,
-\ \ \ \ void\ **result_desc,\ size_t\ result_count,\ fi_addr_t\ dest_addr,
-\ \ \ \ uint64_t\ addr,\ uint64_t\ key,\ enum\ fi_datatype\ datatype,
-\ \ \ \ enum\ fi_op\ op,\ void\ *context);
+ssize_t fi_fetch_atomicv(struct fid_ep *ep, const struct fi_ioc *iov,
+    void **desc, size_t count, struct fi_ioc *resultv,
+    void **result_desc, size_t result_count, fi_addr_t dest_addr,
+    uint64_t addr, uint64_t key, enum fi_datatype datatype,
+    enum fi_op op, void *context);
 
-ssize_t\ fi_fetch_atomicmsg(struct\ fid_ep\ *ep,
-\ \ \ \ const\ struct\ fi_msg_atomic\ *msg,\ struct\ fi_ioc\ *resultv,
-\ \ \ \ void\ **result_desc,\ size_t\ result_count,\ uint64_t\ flags);
+ssize_t fi_fetch_atomicmsg(struct fid_ep *ep,
+    const struct fi_msg_atomic *msg, struct fi_ioc *resultv,
+    void **result_desc, size_t result_count, uint64_t flags);
 
-ssize_t\ fi_compare_atomic(struct\ fid_ep\ *ep,\ const\ void\ *buf,
-\ \ \ \ size_t\ count,\ void\ *desc,\ const\ void\ *compare,
-\ \ \ \ void\ *compare_desc,\ void\ *result,\ void\ *result_desc,
-\ \ \ \ fi_addr_t\ dest_addr,\ uint64_t\ addr,\ uint64_t\ key,
-\ \ \ \ enum\ fi_datatype\ datatype,\ enum\ fi_op\ op,\ void\ *context);
+ssize_t fi_compare_atomic(struct fid_ep *ep, const void *buf,
+    size_t count, void *desc, const void *compare,
+    void *compare_desc, void *result, void *result_desc,
+    fi_addr_t dest_addr, uint64_t addr, uint64_t key,
+    enum fi_datatype datatype, enum fi_op op, void *context);
 
-size_t\ fi_compare_atomicv(struct\ fid_ep\ *ep,\ const\ struct\ fi_ioc\ *iov,
-\ \ \ \ \ \ \ void\ **desc,\ size_t\ count,\ const\ struct\ fi_ioc\ *comparev,
-\ \ \ \ \ \ \ void\ **compare_desc,\ size_t\ compare_count,\ struct\ fi_ioc\ *resultv,
-\ \ \ \ \ \ \ void\ **result_desc,\ size_t\ result_count,\ fi_addr_t\ dest_addr,
-\ \ \ \ \ \ \ uint64_t\ addr,\ uint64_t\ key,\ enum\ fi_datatype\ datatype,
-\ \ \ \ \ \ \ enum\ fi_op\ op,\ void\ *context);
+size_t fi_compare_atomicv(struct fid_ep *ep, const struct fi_ioc *iov,
+       void **desc, size_t count, const struct fi_ioc *comparev,
+       void **compare_desc, size_t compare_count, struct fi_ioc *resultv,
+       void **result_desc, size_t result_count, fi_addr_t dest_addr,
+       uint64_t addr, uint64_t key, enum fi_datatype datatype,
+       enum fi_op op, void *context);
 
-ssize_t\ fi_compare_atomicmsg(struct\ fid_ep\ *ep,
-\ \ \ \ const\ struct\ fi_msg_atomic\ *msg,\ const\ struct\ fi_ioc\ *comparev,
-\ \ \ \ void\ **compare_desc,\ size_t\ compare_count,
-\ \ \ \ struct\ fi_ioc\ *resultv,\ void\ **result_desc,\ size_t\ result_count,
-\ \ \ \ uint64_t\ flags);
+ssize_t fi_compare_atomicmsg(struct fid_ep *ep,
+    const struct fi_msg_atomic *msg, const struct fi_ioc *comparev,
+    void **compare_desc, size_t compare_count,
+    struct fi_ioc *resultv, void **result_desc, size_t result_count,
+    uint64_t flags);
 
-int\ fi_atomicvalid(struct\ fid_ep\ *ep,\ enum\ fi_datatype\ datatype,
-\ \ \ \ enum\ fi_op\ op,\ size_t\ *count);
+int fi_atomicvalid(struct fid_ep *ep, enum fi_datatype datatype,
+    enum fi_op op, size_t *count);
 
-int\ fi_fetch_atomicvalid(struct\ fid_ep\ *ep,\ enum\ fi_datatype\ datatype,
-\ \ \ \ enum\ fi_op\ op,\ size_t\ *count);
+int fi_fetch_atomicvalid(struct fid_ep *ep, enum fi_datatype datatype,
+    enum fi_op op, size_t *count);
 
-int\ fi_compare_atomicvalid(struct\ fid_ep\ *ep,\ enum\ fi_datatype\ datatype,
-\ \ \ \ enum\ fi_op\ op,\ size_t\ *count);
+int fi_compare_atomicvalid(struct fid_ep *ep, enum fi_datatype datatype,
+    enum fi_op op, size_t *count);
 
-int\ fi_query_atomic(struct\ fid_domain\ *domain,
-\ \ \ \ enum\ fi_datatype\ datatype,\ enum\ fi_op\ op,
-\ \ \ \ struct\ fi_atomic_attr\ *attr,\ uint64_t\ flags);
-\f[]
+int fi_query_atomic(struct fid_domain *domain,
+    enum fi_datatype datatype, enum fi_op op,
+    struct fi_atomic_attr *attr, uint64_t flags);
+\f[R]
 .fi
 .SH ARGUMENTS
 .TP
-.B \f[I]ep\f[]
+.B \f[I]ep\f[R]
 Fabric endpoint on which to initiate atomic operation.
-.RS
-.RE
 .TP
-.B \f[I]buf\f[]
+.B \f[I]buf\f[R]
 Local data buffer that specifies first operand of atomic operation
-.RS
-.RE
 .TP
-.B \f[I]iov / comparev / resultv\f[]
+.B \f[I]iov / comparev / resultv\f[R]
 Vectored data buffer(s).
-.RS
-.RE
 .TP
-.B \f[I]count / compare_count / result_count\f[]
+.B \f[I]count / compare_count / result_count\f[R]
 Count of vectored data entries.
 The number of elements referenced, where each element is the indicated
 datatype.
-.RS
-.RE
 .TP
-.B \f[I]addr\f[]
+.B \f[I]addr\f[R]
 Address of remote memory to access.
-.RS
-.RE
 .TP
-.B \f[I]key\f[]
+.B \f[I]key\f[R]
 Protection key associated with the remote memory.
-.RS
-.RE
 .TP
-.B \f[I]datatype\f[]
+.B \f[I]datatype\f[R]
 Datatype associated with atomic operands
-.RS
-.RE
 .TP
-.B \f[I]op\f[]
+.B \f[I]op\f[R]
 Atomic operation to perform
-.RS
-.RE
 .TP
-.B \f[I]compare\f[]
+.B \f[I]compare\f[R]
 Local compare buffer, containing comparison data.
-.RS
-.RE
 .TP
-.B \f[I]result\f[]
+.B \f[I]result\f[R]
 Local data buffer to store initial value of remote buffer
-.RS
-.RE
 .TP
-.B \f[I]desc / compare_desc / result_desc\f[]
+.B \f[I]desc / compare_desc / result_desc\f[R]
 Data descriptor associated with the local data buffer, local compare
 buffer, and local result buffer, respectively.
-See \f[C]fi_mr\f[](3).
-.RS
-.RE
+See \f[C]fi_mr\f[R](3).
 .TP
-.B \f[I]dest_addr\f[]
+.B \f[I]dest_addr\f[R]
 Destination address for connectionless atomic operations.
 Ignored for connected endpoints.
-.RS
-.RE
 .TP
-.B \f[I]msg\f[]
+.B \f[I]msg\f[R]
 Message descriptor for atomic operations
-.RS
-.RE
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 Additional flags to apply for the atomic operation
-.RS
-.RE
 .TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
 User specified pointer to associate with the operation.
 This parameter is ignored if the operation will not generate a
 successful completion, unless an op flag specifies the context parameter
 be used for required input.
-.RS
-.RE
 .SH DESCRIPTION
 .PP
 Atomic transfers are used to read and update data located in remote
 memory regions in an atomic fashion.
 Conceptually, they are similar to local atomic operations of a similar
-nature (e.g.
-atomic increment, compare and swap, etc.).
+nature (e.g.\ atomic increment, compare and swap, etc.).
 Updates to remote data involve one of several operations on the data,
 and act on specific types of data, as listed below.
 As such, atomic transfers have knowledge of the format of the data being
@@ -203,71 +166,53 @@ types.
 A given atomic function may support any datatype, subject to provider
 implementation constraints.
 .TP
-.B \f[I]FI_INT8\f[]
+.B \f[I]FI_INT8\f[R]
 Signed 8\-bit integer.
-.RS
-.RE
 .TP
-.B \f[I]FI_UINT8\f[]
+.B \f[I]FI_UINT8\f[R]
 Unsigned 8\-bit integer.
-.RS
-.RE
 .TP
-.B \f[I]FI_INT16\f[]
+.B \f[I]FI_INT16\f[R]
 Signed 16\-bit integer.
-.RS
-.RE
 .TP
-.B \f[I]FI_UINT16\f[]
+.B \f[I]FI_UINT16\f[R]
 Unsigned 16\-bit integer.
-.RS
-.RE
 .TP
-.B \f[I]FI_INT32\f[]
+.B \f[I]FI_INT32\f[R]
 Signed 32\-bit integer.
-.RS
-.RE
 .TP
-.B \f[I]FI_UINT32\f[]
+.B \f[I]FI_UINT32\f[R]
 Unsigned 32\-bit integer.
-.RS
-.RE
 .TP
-.B \f[I]FI_INT64\f[]
+.B \f[I]FI_INT64\f[R]
 Signed 64\-bit integer.
-.RS
-.RE
 .TP
-.B \f[I]FI_UINT64\f[]
+.B \f[I]FI_UINT64\f[R]
 Unsigned 64\-bit integer.
-.RS
-.RE
 .TP
-.B \f[I]FI_FLOAT\f[]
+.B \f[I]FI_INT128\f[R]
+Signed 128\-bit integer.
+.TP
+.B \f[I]FI_UINT128\f[R]
+Unsigned 128\-bit integer.
+.TP
+.B \f[I]FI_FLOAT\f[R]
 A single\-precision floating point value (IEEE 754).
-.RS
-.RE
 .TP
-.B \f[I]FI_DOUBLE\f[]
+.B \f[I]FI_DOUBLE\f[R]
 A double\-precision floating point value (IEEE 754).
-.RS
-.RE
 .TP
-.B \f[I]FI_FLOAT_COMPLEX\f[]
+.B \f[I]FI_FLOAT_COMPLEX\f[R]
 An ordered pair of single\-precision floating point values (IEEE 754),
 with the first value representing the real portion of a complex number
 and the second representing the imaginary portion.
-.RS
-.RE
 .TP
-.B \f[I]FI_DOUBLE_COMPLEX\f[]
+.B \f[I]FI_DOUBLE_COMPLEX\f[R]
 An ordered pair of double\-precision floating point values (IEEE 754),
 with the first value representing the real portion of a complex number
 and the second representing the imaginary portion.
-.RS
-.RE
 .TP
-.B \f[I]FI_LONG_DOUBLE\f[]
+.B \f[I]FI_LONG_DOUBLE\f[R]
 A double\-extended precision floating point value (IEEE 754).
 Note that the size of a long double and number of bits used for
 precision is compiler, platform, and/or provider specific.
@@ -276,15 +221,11 @@ using a long double format that is compatible with their application,
 and that format is supported by the provider.
 The mechanism used for this validation is currently beyond the scope of
 the libfabric API.
-.RS
-.RE
 .TP
-.B \f[I]FI_LONG_DOUBLE_COMPLEX\f[]
+.B \f[I]FI_LONG_DOUBLE_COMPLEX\f[R]
 An ordered pair of double\-extended precision floating point values
 (IEEE 754), with the first value representing the real portion of a
 complex number and the second representing the imaginary portion.
-.RS
-.RE
 .SS Atomic Operations
 .PP
 The following atomic operations are defined.
@@ -294,227 +235,189 @@ It may also carry source data to replace the target value in compare and
 swap operations.
 A conceptual description of each operation is provided.
 .TP
-.B \f[I]FI_MIN\f[]
+.B \f[I]FI_MIN\f[R]
 Minimum
-.RS
-.RE
 .IP
 .nf
 \f[C]
-if\ (buf[i]\ <\ addr[i])
-\ \ \ \ addr[i]\ =\ buf[i]
-\f[]
+if (buf[i] < addr[i])
+    addr[i] = buf[i]
+\f[R]
 .fi
 .TP
-.B \f[I]FI_MAX\f[]
+.B \f[I]FI_MAX\f[R]
 Maximum
-.RS
-.RE
 .IP
 .nf
 \f[C]
-if\ (buf[i]\ >\ addr[i])
-\ \ \ \ addr[i]\ =\ buf[i]
-\f[]
+if (buf[i] > addr[i])
+    addr[i] = buf[i]
+\f[R]
 .fi
 .TP
-.B \f[I]FI_SUM\f[]
+.B \f[I]FI_SUM\f[R]
 Sum
-.RS
-.RE
 .IP
 .nf
 \f[C]
-addr[i]\ =\ addr[i]\ +\ buf[i]
-\f[]
+addr[i] = addr[i] + buf[i]
+\f[R]
 .fi
 .TP
-.B \f[I]FI_PROD\f[]
+.B \f[I]FI_PROD\f[R]
 Product
-.RS
-.RE
 .IP
 .nf
 \f[C]
-addr[i]\ =\ addr[i]\ *\ buf[i]
-\f[]
+addr[i] = addr[i] * buf[i]
+\f[R]
 .fi
 .TP
-.B \f[I]FI_LOR\f[]
+.B \f[I]FI_LOR\f[R]
 Logical OR
-.RS
-.RE
 .IP
 .nf
 \f[C]
-addr[i]\ =\ (addr[i]\ ||\ buf[i])
-\f[]
+addr[i] = (addr[i] || buf[i])
+\f[R]
 .fi
 .TP
-.B \f[I]FI_LAND\f[]
+.B \f[I]FI_LAND\f[R]
 Logical AND
-.RS
-.RE
 .IP
 .nf
 \f[C]
-addr[i]\ =\ (addr[i]\ &&\ buf[i])
-\f[]
+addr[i] = (addr[i] && buf[i])
+\f[R]
 .fi
 .TP
-.B \f[I]FI_BOR\f[]
+.B \f[I]FI_BOR\f[R]
 Bitwise OR
-.RS
-.RE
 .IP
 .nf
 \f[C]
-addr[i]\ =\ addr[i]\ |\ buf[i]
-\f[]
+addr[i] = addr[i] | buf[i]
+\f[R]
 .fi
 .TP
-.B \f[I]FI_BAND\f[]
+.B \f[I]FI_BAND\f[R]
 Bitwise AND
-.RS
-.RE
 .IP
 .nf
 \f[C]
-addr[i]\ =\ addr[i]\ &\ buf[i]
-\f[]
+addr[i] = addr[i] & buf[i]
+\f[R]
 .fi
 .TP
-.B \f[I]FI_LXOR\f[]
+.B \f[I]FI_LXOR\f[R]
 Logical exclusive\-OR (XOR)
-.RS
-.RE
 .IP
 .nf
 \f[C]
-addr[i]\ =\ ((addr[i]\ &&\ !buf[i])\ ||\ (!addr[i]\ &&\ buf[i]))
-\f[]
+addr[i] = ((addr[i] && !buf[i]) || (!addr[i] && buf[i]))
+\f[R]
 .fi
 .TP
-.B \f[I]FI_BXOR\f[]
+.B \f[I]FI_BXOR\f[R]
 Bitwise exclusive\-OR (XOR)
-.RS
-.RE
 .IP
 .nf
 \f[C]
-addr[i]\ =\ addr[i]\ ^\ buf[i]
-\f[]
+addr[i] = addr[i] \[ha] buf[i]
+\f[R]
 .fi
 .TP
-.B \f[I]FI_ATOMIC_READ\f[]
+.B \f[I]FI_ATOMIC_READ\f[R]
 Read data atomically
-.RS
-.RE
 .IP
 .nf
 \f[C]
-result[i]\ =\ addr[i]
-\f[]
+result[i] = addr[i]
+\f[R]
 .fi
 .TP
-.B \f[I]FI_ATOMIC_WRITE\f[]
+.B \f[I]FI_ATOMIC_WRITE\f[R]
 Write data atomically
-.RS
-.RE
 .IP
 .nf
 \f[C]
-addr[i]\ =\ buf[i]
-\f[]
+addr[i] = buf[i]
+\f[R]
 .fi
 .TP
-.B \f[I]FI_CSWAP\f[]
+.B \f[I]FI_CSWAP\f[R]
 Compare values and if equal swap with data
-.RS
-.RE
 .IP
 .nf
 \f[C]
-if\ (compare[i]\ ==\ addr[i])
-\ \ \ \ addr[i]\ =\ buf[i]
-\f[]
+if (compare[i] == addr[i])
+    addr[i] = buf[i]
+\f[R]
 .fi
 .TP
-.B \f[I]FI_CSWAP_NE\f[]
+.B \f[I]FI_CSWAP_NE\f[R]
 Compare values and if not equal swap with data
-.RS
-.RE
 .IP
 .nf
 \f[C]
-if\ (compare[i]\ !=\ addr[i])
-\ \ \ \ addr[i]\ =\ buf[i]
-\f[]
+if (compare[i] != addr[i])
+    addr[i] = buf[i]
+\f[R]
 .fi
 .TP
-.B \f[I]FI_CSWAP_LE\f[]
+.B \f[I]FI_CSWAP_LE\f[R]
 Compare values and if less than or equal swap with data
-.RS
-.RE
 .IP
 .nf
 \f[C]
-if\ (compare[i]\ <=\ addr[i])
-\ \ \ \ addr[i]\ =\ buf[i]
-\f[]
+if (compare[i] <= addr[i])
+    addr[i] = buf[i]
+\f[R]
 .fi
 .TP
-.B \f[I]FI_CSWAP_LT\f[]
+.B \f[I]FI_CSWAP_LT\f[R]
 Compare values and if less than swap with data
-.RS
-.RE
 .IP
 .nf
 \f[C]
-if\ (compare[i]\ <\ addr[i])
-\ \ \ \ addr[i]\ =\ buf[i]
-\f[]
+if (compare[i] < addr[i])
+    addr[i] = buf[i]
+\f[R]
 .fi
 .TP
-.B \f[I]FI_CSWAP_GE\f[]
+.B \f[I]FI_CSWAP_GE\f[R]
 Compare values and if greater than or equal swap with data
-.RS
-.RE
 .IP
 .nf
 \f[C]
-if\ (compare[i]\ >=\ addr[i])
-\ \ \ \ addr[i]\ =\ buf[i]
-\f[]
+if (compare[i] >= addr[i])
+    addr[i] = buf[i]
+\f[R]
 .fi
 .TP
-.B \f[I]FI_CSWAP_GT\f[]
+.B \f[I]FI_CSWAP_GT\f[R]
 Compare values and if greater than swap with data
-.RS
-.RE
 .IP
 .nf
 \f[C]
-if\ (compare[i]\ >\ addr[i])
-\ \ \ \ addr[i]\ =\ buf[i]
-\f[]
+if (compare[i] > addr[i])
+    addr[i] = buf[i]
+\f[R]
 .fi
 .TP
-.B \f[I]FI_MSWAP\f[]
+.B \f[I]FI_MSWAP\f[R]
 Swap masked bits with data
-.RS
-.RE
 .IP
 .nf
 \f[C]
-addr[i]\ =\ (buf[i]\ &\ compare[i])\ |\ (addr[i]\ &\ ~compare[i])
-\f[]
+addr[i] = (buf[i] & compare[i]) | (addr[i] & \[ti]compare[i])
+\f[R]
 .fi
 .SS Base Atomic Functions
 .PP
-The base atomic functions \-\- fi_atomic, fi_atomicv, fi_atomicmsg \-\-
-are used to transmit data to a remote node, where the specified atomic
-operation is performed against the target data.
+The base atomic functions \[en] fi_atomic, fi_atomicv, fi_atomicmsg
+\[en] are used to transmit data to a remote node, where the specified
+atomic operation is performed against the target data.
 The result of a base atomic function is stored at the remote memory
 region.
 The main difference between atomic functions is the number and type of
@@ -523,8 +426,8 @@ Otherwise, they perform the same general function.
 .PP
 The call fi_atomic transfers the data contained in the user\-specified
 data buffer to a remote node.
-For unconnected endpoints, the destination endpoint is specified through
-the dest_addr parameter.
+For connectionless endpoints, the destination endpoint is specified
+through the dest_addr parameter.
 Unless the endpoint has been configured differently, the data buffer
 passed into fi_atomic must not be touched by the application until the
 fi_atomic call completes asynchronously.
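+.PP
+The following non\-normative sketch issues a single 64\-bit atomic
+addition.
+It assumes an endpoint \f[C]ep\f[R], a destination address, a target
+address, and a protection key obtained during setup:
+.IP
+.nf
+\f[C]
+uint64_t operand = 1;
+ssize_t ret;
+
+/* Atomically add operand to the remote 64\-bit value. */
+ret = fi_atomic(ep, &operand, 1, NULL, dest_addr,
+                remote_addr, key, FI_UINT64, FI_SUM, NULL);
+if (ret == \-FI_EAGAIN)
+    ret = retry_after_progress();  /* hypothetical retry path */
+\f[R]
+.fi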
@@ -549,36 +452,36 @@ The requested message size that can be used with fi_inject_atomic is
 limited by inject_size.
 .PP
 The fi_atomicmsg call supports atomic functions over both connected and
-unconnected endpoints, with the ability to control the atomic operation
-per call through the use of flags.
+connectionless endpoints, with the ability to control the atomic
+operation per call through the use of flags.
 The fi_atomicmsg function takes a struct fi_msg_atomic as input.
 .IP
 .nf
 \f[C]
-struct\ fi_msg_atomic\ {
-\ \ \ \ const\ struct\ fi_ioc\ *msg_iov;\ /*\ local\ scatter\-gather\ array\ */
-\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ **desc;\ \ \ /*\ local\ access\ descriptors\ */
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ iov_count;/*\ #\ elements\ in\ ioc\ */
-\ \ \ \ const\ void\ \ \ \ \ \ \ \ \ \ *addr;\ \ \ \ /*\ optional\ endpoint\ address\ */
-\ \ \ \ const\ struct\ fi_rma_ioc\ *rma_iov;\ /*\ remote\ SGL\ */
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ rma_iov_count;/*\ #\ elements\ in\ remote\ SGL\ */
-\ \ \ \ enum\ fi_datatype\ \ \ \ datatype;\ /*\ operand\ datatype\ */
-\ \ \ \ enum\ fi_op\ \ \ \ \ \ \ \ \ \ op;\ \ \ \ \ \ \ /*\ atomic\ operation\ */
-\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ *context;\ /*\ user\-defined\ context\ */
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ \ data;\ \ \ \ \ /*\ optional\ data\ */
+struct fi_msg_atomic {
+    const struct fi_ioc *msg_iov; /* local scatter\-gather array */
+    void                **desc;   /* local access descriptors */
+    size_t              iov_count;/* # elements in ioc */
+    const void          *addr;    /* optional endpoint address */
+    const struct fi_rma_ioc *rma_iov; /* remote SGL */
+    size_t              rma_iov_count;/* # elements in remote SGL */
+    enum fi_datatype    datatype; /* operand datatype */
+    enum fi_op          op;       /* atomic operation */
+    void                *context; /* user\-defined context */
+    uint64_t            data;     /* optional data */
 };
 
-struct\ fi_ioc\ {
-\ \ \ \ void\ \ \ \ \ \ \ \ *addr;\ \ \ \ /*\ local\ address\ */
-\ \ \ \ size_t\ \ \ \ \ \ count;\ \ \ \ /*\ #\ target\ operands\ */
+struct fi_ioc {
+    void        *addr;    /* local address */
+    size_t      count;    /* # target operands */
 };
 
-struct\ fi_rma_ioc\ {
-\ \ \ \ uint64_t\ \ \ \ addr;\ \ \ \ \ /*\ target\ address\ */
-\ \ \ \ size_t\ \ \ \ \ \ count;\ \ \ \ /*\ #\ target\ operands\ */
-\ \ \ \ uint64_t\ \ \ \ key;\ \ \ \ \ \ /*\ access\ key\ */
+struct fi_rma_ioc {
+    uint64_t    addr;     /* target address */
+    size_t      count;    /* # target operands */
+    uint64_t    key;      /* access key */
 };
-\f[]
+\f[R]
 .fi
 .PP
 The following list of atomic operations is usable with base atomic
@@ -586,8 +489,8 @@ operations: FI_MIN, FI_MAX, FI_SUM, FI_PROD, FI_LOR, FI_LAND, FI_BOR,
 FI_BAND, FI_LXOR, FI_BXOR, and FI_ATOMIC_WRITE.
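+.PP
+A similar non\-normative sketch of the vector form, reusing the setup
+assumed above, with \f[C]operands\f[R] a local array of eight 64\-bit
+values:
+.IP
+.nf
+\f[C]
+struct fi_ioc iov = {
+    .addr  = operands,  /* local operand array */
+    .count = 8,
+};
+
+ret = fi_atomicv(ep, &iov, NULL, 1, dest_addr,
+                 remote_addr, key, FI_UINT64, FI_SUM, NULL);
+\f[R]
+.fi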
 .SS Fetch\-Atomic Functions
 .PP
-The fetch atomic functions \-\- fi_fetch_atomic, fi_fetch_atomicv, and
-fi_fetch atomicmsg \-\- behave similar to the equivalent base atomic
+The fetch atomic functions \[en] fi_fetch_atomic, fi_fetch_atomicv, and
+fi_fetch_atomicmsg \[en] behave similarly to the equivalent base atomic
 function.
 The difference between the fetch and base atomic calls is that the fetch
 atomic routines return the initial value that was stored at the target
@@ -605,16 +508,16 @@ fi_fetch_atomic buf parameter) is ignored and may be NULL.
 The results are written into the result buffer.
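+.PP
+Under the same assumed setup, a minimal sketch of an atomic read; as
+noted above, the source buffer is ignored for FI_ATOMIC_READ and may be
+NULL:
+.IP
+.nf
+\f[C]
+uint64_t fetched;
+
+/* Fetch the current remote 64\-bit value into fetched. */
+ret = fi_fetch_atomic(ep, NULL, 1, NULL, &fetched, NULL,
+                      dest_addr, remote_addr, key,
+                      FI_UINT64, FI_ATOMIC_READ, NULL);
+\f[R]
+.fi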
 .SS Compare\-Atomic Functions
 .PP
-The compare atomic functions \-\- fi_compare_atomic, fi_compare_atomicv,
-and fi_compare atomicmsg \-\- are used for operations that require
-comparing the target data against a value before performing a swap
-operation.
+The compare atomic functions \[en] fi_compare_atomic,
+fi_compare_atomicv, and fi_compare_atomicmsg \[en] are used for
+operations that require comparing the target data against a value before
+performing a swap operation.
 The compare atomic functions support: FI_CSWAP, FI_CSWAP_NE,
 FI_CSWAP_LE, FI_CSWAP_LT, FI_CSWAP_GE, FI_CSWAP_GT, and FI_MSWAP.
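+.PP
+Under the same assumed setup, a hedged compare\-and\-swap sketch; the
+prior target value is returned through the result buffer:
+.IP
+.nf
+\f[C]
+uint64_t desired = 1, expected = 0, previous;
+
+/* Write desired if the remote value equals expected. */
+ret = fi_compare_atomic(ep, &desired, 1, NULL,
+                        &expected, NULL, &previous, NULL,
+                        dest_addr, remote_addr, key,
+                        FI_UINT64, FI_CSWAP, NULL);
+\f[R]
+.fi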
 .SS Atomic Valid Functions
 .PP
-The atomic valid functions \-\- fi_atomicvalid, fi_fetch_atomicvalid,
-and fi_compare_atomicvalid \-\-indicate which operations the local
+The atomic valid functions \[en] fi_atomicvalid, fi_fetch_atomicvalid,
+and fi_compare_atomicvalid \[en] indicate which operations the local
 provider supports.
 Needed operations not supported by the provider must be emulated by the
 application.
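+.PP
+For example, an application may probe support before relying on a
+given combination (a sketch; the fallback routine is hypothetical):
+.IP
+.nf
+\f[C]
+size_t max_count;
+
+if (fi_atomicvalid(ep, FI_UINT64, FI_SUM, &max_count) != 0)
+    use_software_emulation();  /* hypothetical fallback */
+\f[R]
+.fi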
@@ -657,11 +560,11 @@ The output of fi_query_atomic is struct fi_atomic_attr:
 .IP
 .nf
 \f[C]
-struct\ fi_atomic_attr\ {
-\ \ \ \ size_t\ count;
-\ \ \ \ size_t\ size;
+struct fi_atomic_attr {
+    size_t count;
+    size_t size;
 };
-\f[]
+\f[R]
 .fi
 .PP
 The count attribute field is as defined for the atomic valid calls.
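+.PP
+A brief, non\-normative sketch, assuming an open \f[C]domain\f[R]; the
+flags argument selects which class of atomic call is being queried:
+.IP
+.nf
+\f[C]
+struct fi_atomic_attr attr;
+int ret;
+
+ret = fi_query_atomic(domain, FI_UINT64, FI_SUM,
+                      &attr, FI_FETCH_ATOMIC);
+if (ret == 0) {
+    /* attr.count and attr.size give the supported limits */
+}
+\f[R]
+.fi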
@@ -693,7 +596,7 @@ prior to the completion notification.
 After processing a completion for the atomic, if the initiator submits a
 transfer between the same endpoints that generates a completion at the
 target, the results will be available prior to the subsequent
-transfer\[aq]s event.
+transfer\[cq]s event.
 Or, if a fenced data transfer from the initiator follows the atomic
 request, the results will be available prior to a completion at the
 target for the fenced transfer.
@@ -703,14 +606,14 @@ guaranteed only when performed by a single actor for a given window of
 time.
 An actor is defined as a single libfabric domain (identified by the
 domain name, and not an open instance of that domain), a coherent CPU
-complex, or other device (e.g.
-GPU) capable of performing atomic operations on the target memory.
+complex, or other device (e.g.\ GPU) capable of performing atomic
+operations on the target memory.
 The results of atomic operations performed by multiple actors
 simultaneously are undefined.
 For example, issuing CPU based atomic operations to a target region
 concurrently being updated by NIC based atomics may leave the
-region\[aq]s data in an unknown state.
-The results of a first actor\[aq]s atomic operations must be visible to
+region\[cq]s data in an unknown state.
+The results of a first actor\[cq]s atomic operations must be visible to
 a second actor prior to the second actor issuing its own atomics.
 .SH FLAGS
 .PP
@@ -722,38 +625,32 @@ previously configured with the endpoint, except where noted (see
 fi_control).
 The following list of flags is usable with atomic message calls.
 .TP
-.B \f[I]FI_COMPLETION\f[]
+.B \f[I]FI_COMPLETION\f[R]
 Indicates that a completion entry should be generated for the specified
 operation.
 The endpoint must be bound to a completion queue with
 FI_SELECTIVE_COMPLETION that corresponds to the specified operation, or
 this flag is ignored.
-.RS
-.RE
 .TP
-.B \f[I]FI_MORE\f[]
+.B \f[I]FI_MORE\f[R]
 Indicates that the user has additional requests that will immediately be
 posted after the current call returns.
 Use of this flag may improve performance by enabling the provider to
 optimize its access to the fabric hardware.
-.RS
-.RE
 .TP
-.B \f[I]FI_INJECT\f[]
+.B \f[I]FI_INJECT\f[R]
 Indicates that the control of constant data buffers should be returned
 to the user immediately after the call returns, even if the operation is
 handled asynchronously.
 This may require that the underlying provider implementation copy the
 data into a local buffer and transfer out of that buffer.
 Constant data buffers refer to any data buffer or iovec used by the
-atomic APIs that are marked as \[aq]const\[aq].
+atomic APIs that are marked as `const'.
 Non\-constant or output buffers are unaffected by this flag and may be
 accessed by the provider at anytime until the operation has completed.
 This flag can only be used with messages smaller than inject_size.
-.RS
-.RE
 .TP
-.B \f[I]FI_FENCE\f[]
+.B \f[I]FI_FENCE\f[R]
 Applies to transmits.
 Indicates that the requested operation, also known as the fenced
 operation, and any operation posted after the fenced operation will be
@@ -761,43 +658,34 @@ deferred until all previous operations targeting the same peer endpoint
 have completed.
 Operations posted after the fencing will see and/or replace the results
 of any operations initiated prior to the fenced operation.
-.RS
-.RE
 .PP
 The ordering of operations starting at the posting of the fenced
 operation (inclusive) to the posting of a subsequent fenced operation
-(exclusive) is controlled by the endpoint\[aq]s ordering semantics.
+(exclusive) is controlled by the endpoint\[cq]s ordering semantics.
 .TP
-.B \f[I]FI_TAGGED\f[]
+.B \f[I]FI_TAGGED\f[R]
 Specifies that the target of the atomic operation is a tagged receive
 buffer instead of an RMA buffer.
 When a tagged buffer is the target memory region, the addr parameter is
 used as a 0\-based byte offset into the tagged buffer, with the key
 parameter specifying the tag.
-.RS
-.RE
 .SH RETURN VALUE
 .PP
 Returns 0 on success.
 On error, a negative value corresponding to fabric errno is returned.
-Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[].
+Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R].
 .SH ERRORS
 .TP
-.B \f[I]\-FI_EAGAIN\f[]
-See \f[C]fi_msg\f[](3) for a detailed description of handling FI_EAGAIN.
-.RS
-.RE
+.B \f[I]\-FI_EAGAIN\f[R]
+See \f[C]fi_msg\f[R](3) for a detailed description of handling
+FI_EAGAIN.
 .TP
-.B \f[I]\-FI_EOPNOTSUPP\f[]
+.B \f[I]\-FI_EOPNOTSUPP\f[R]
 The requested atomic operation is not supported on this endpoint.
-.RS
-.RE
 .TP
-.B \f[I]\-FI_EMSGSIZE\f[]
+.B \f[I]\-FI_EMSGSIZE\f[R]
 The number of atomic operations in a single request exceeds that
 supported by the underlying provider.
-.RS
-.RE
 .SH NOTES
 .PP
 Atomic operations operate on an array of values of a specific data type.
@@ -810,14 +698,14 @@ bytes to an aligned memory location.
 .IP
 .nf
 \f[C]
-fi_atomic(ep,\ buf,\ count,\ NULL,\ dest_addr,\ addr,\ key,
-\ \ \ \ \ \ FI_UINT64,\ FI_ATOMIC_WRITE,\ context)
+fi_atomic(ep, buf, count, NULL, dest_addr, addr, key,
+      FI_UINT64, FI_ATOMIC_WRITE, context)
 {
-\ \ \ \ for\ (i\ =\ 1;\ i\ <\ count;\ i\ ++)
-\ \ \ \ \ \ \ \ ATOMIC_WRITE_U64(((uint64_t\ *)\ addr)[i],
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ ((uint64_t\ *)\ buf)[i]);
+    for (i = 1; i < count; i ++)
+        ATOMIC_WRITE_U64(((uint64_t *) addr)[i],
+                 ((uint64_t *) buf)[i]);
 }
-\f[]
+\f[R]
 .fi
 .PP
 The number of array elements to operate on is specified through a count
@@ -833,11 +721,11 @@ assigned to the transmitting and receiving endpoints.
 Both message and data ordering are required if the results of two atomic
 operations to the same memory buffers are to reflect the second
 operation acting on the results of the first.
-See \f[C]fi_endpoint\f[](3) for further details and message size
+See \f[C]fi_endpoint\f[R](3) for further details and message size
 restrictions.
 .SH SEE ALSO
 .PP
-\f[C]fi_getinfo\f[](3), \f[C]fi_endpoint\f[](3), \f[C]fi_domain\f[](3),
-\f[C]fi_cq\f[](3), \f[C]fi_rma\f[](3)
+\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3),
+\f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3), \f[C]fi_rma\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_av.3 b/deps/libfabric/man/man3/fi_av.3
index 34cfbe939137e70aaab016b406fe1f1dd2a69e2c..5525e408a8954b91af79bfb5e04d16301395d8c5 100644
--- a/deps/libfabric/man/man3/fi_av.3
+++ b/deps/libfabric/man/man3/fi_av.3
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_av" "3" "2019\-07\-17" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_av" "3" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -8,120 +8,90 @@ fi_av \- Address vector operations
 .TP
 .B fi_av_open / fi_close
 Open or close an address vector
-.RS
-.RE
 .TP
 .B fi_av_bind
 Associate an address vector with an event queue.
-.RS
-.RE
 .TP
 .B fi_av_insert / fi_av_insertsvc / fi_av_remove
 Insert/remove an address into/from the address vector.
-.RS
-.RE
 .TP
 .B fi_av_lookup
 Retrieve an address stored in the address vector.
-.RS
-.RE
 .TP
 .B fi_av_straddr
 Convert an address into a printable string.
-.RS
-.RE
 .SH SYNOPSIS
 .IP
 .nf
 \f[C]
-#include\ <rdma/fi_domain.h>
+#include <rdma/fi_domain.h>
 
-int\ fi_av_open(struct\ fid_domain\ *domain,\ struct\ fi_av_attr\ *attr,
-\ \ \ \ struct\ fid_av\ **av,\ void\ *context);
+int fi_av_open(struct fid_domain *domain, struct fi_av_attr *attr,
+    struct fid_av **av, void *context);
 
-int\ fi_close(struct\ fid\ *av);
+int fi_close(struct fid *av);
 
-int\ fi_av_bind(struct\ fid_av\ *av,\ struct\ fid\ *eq,\ uint64_t\ flags);
+int fi_av_bind(struct fid_av *av, struct fid *eq, uint64_t flags);
 
-int\ fi_av_insert(struct\ fid_av\ *av,\ void\ *addr,\ size_t\ count,
-\ \ \ \ fi_addr_t\ *fi_addr,\ uint64_t\ flags,\ void\ *context);
+int fi_av_insert(struct fid_av *av, void *addr, size_t count,
+    fi_addr_t *fi_addr, uint64_t flags, void *context);
 
-int\ fi_av_insertsvc(struct\ fid_av\ *av,\ const\ char\ *node,
-\ \ \ \ const\ char\ *service,\ fi_addr_t\ *fi_addr,\ uint64_t\ flags,
-\ \ \ \ void\ *context);
+int fi_av_insertsvc(struct fid_av *av, const char *node,
+    const char *service, fi_addr_t *fi_addr, uint64_t flags,
+    void *context);
 
-int\ fi_av_insertsym(struct\ fid_av\ *av,\ const\ char\ *node,
-\ \ \ \ size_t\ nodecnt,\ const\ char\ *service,\ size_t\ svccnt,
-\ \ \ \ fi_addr_t\ *fi_addr,\ uint64_t\ flags,\ void\ *context);
+int fi_av_insertsym(struct fid_av *av, const char *node,
+    size_t nodecnt, const char *service, size_t svccnt,
+    fi_addr_t *fi_addr, uint64_t flags, void *context);
 
-int\ fi_av_remove(struct\ fid_av\ *av,\ fi_addr_t\ *fi_addr,\ size_t\ count,
-\ \ \ \ uint64_t\ flags);
+int fi_av_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count,
+    uint64_t flags);
 
-int\ fi_av_lookup(struct\ fid_av\ *av,\ fi_addr_t\ fi_addr,
-\ \ \ \ void\ *addr,\ size_t\ *addrlen);
+int fi_av_lookup(struct fid_av *av, fi_addr_t fi_addr,
+    void *addr, size_t *addrlen);
 
-fi_addr_t\ fi_rx_addr(fi_addr_t\ fi_addr,\ int\ rx_index,
-\ \ \ \ \ \ int\ rx_ctx_bits);
+fi_addr_t fi_rx_addr(fi_addr_t fi_addr, int rx_index,
+      int rx_ctx_bits);
 
-const\ char\ *\ fi_av_straddr(struct\ fid_av\ *av,\ const\ void\ *addr,
-\ \ \ \ \ \ char\ *buf,\ size_t\ *len);
-\f[]
+const char * fi_av_straddr(struct fid_av *av, const void *addr,
+      char *buf, size_t *len);
+\f[R]
 .fi
 .SH ARGUMENTS
 .TP
-.B \f[I]domain\f[]
+.B \f[I]domain\f[R]
 Resource domain
-.RS
-.RE
 .TP
-.B \f[I]av\f[]
+.B \f[I]av\f[R]
 Address vector
-.RS
-.RE
 .TP
-.B \f[I]eq\f[]
+.B \f[I]eq\f[R]
 Event queue
-.RS
-.RE
 .TP
-.B \f[I]attr\f[]
+.B \f[I]attr\f[R]
 Address vector attributes
-.RS
-.RE
 .TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
 User specified context associated with the address vector or insert
 operation.
-.RS
-.RE
 .TP
-.B \f[I]addr\f[]
+.B \f[I]addr\f[R]
 Buffer containing one or more addresses to insert into the address vector.
-.RS
-.RE
 .TP
-.B \f[I]addrlen\f[]
+.B \f[I]addrlen\f[R]
 On input, specifies size of addr buffer.
 On output, stores number of bytes written to addr buffer.
-.RS
-.RE
 .TP
-.B \f[I]fi_addr\f[]
+.B \f[I]fi_addr\f[R]
 For insert, a reference to an array where returned fabric addresses will
 be written.
 For remove, one or more fabric addresses to remove.
-.RS
-.RE
 .TP
-.B \f[I]count\f[]
+.B \f[I]count\f[R]
 Number of addresses to insert/remove from an AV.
-.RS
-.RE
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 Additional flags to apply to the operation.
-.RS
-.RE
 .SH DESCRIPTION
 .PP
 Address vectors are used to map higher level addresses, which may be
@@ -129,39 +99,37 @@ more natural for an application to use, into fabric specific addresses.
 The mapping of addresses is fabric and provider specific, but may
 involve lengthy address resolution and fabric management protocols.
 AV operations are synchronous by default, but may be set to operate
-asynchronously by specifying the FI_EVENT flag to \f[C]fi_av_open\f[].
+asynchronously by specifying the FI_EVENT flag to \f[C]fi_av_open\f[R].
 When requesting asynchronous operation, the application must first bind
 an event queue to the AV before inserting addresses.
 .SS fi_av_open
 .PP
 fi_av_open allocates or opens an address vector.
 The properties and behavior of the address vector are defined by
-\f[C]struct\ fi_av_attr\f[].
+\f[C]struct fi_av_attr\f[R].
 .IP
 .nf
 \f[C]
-struct\ fi_av_attr\ {
-\ \ \ \ enum\ fi_av_type\ \ type;\ \ \ \ \ \ \ \ /*\ type\ of\ AV\ */
-\ \ \ \ int\ \ \ \ \ \ \ \ \ \ \ \ \ \ rx_ctx_bits;\ /*\ address\ bits\ to\ identify\ rx\ ctx\ */
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ count;\ \ \ \ \ \ \ /*\ #\ entries\ for\ AV\ */
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ ep_per_node;\ /*\ #\ endpoints\ per\ fabric\ address\ */
-\ \ \ \ const\ char\ \ \ \ \ \ \ *name;\ \ \ \ \ \ \ /*\ system\ name\ of\ AV\ */
-\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ \ \ *map_addr;\ \ \ /*\ base\ mmap\ address\ */
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ flags;\ \ \ \ \ \ \ /*\ operation\ flags\ */
+struct fi_av_attr {
+    enum fi_av_type  type;        /* type of AV */
+    int              rx_ctx_bits; /* address bits to identify rx ctx */
+    size_t           count;       /* # entries for AV */
+    size_t           ep_per_node; /* # endpoints per fabric address */
+    const char       *name;       /* system name of AV */
+    void             *map_addr;   /* base mmap address */
+    uint64_t         flags;       /* operation flags */
 };
-\f[]
+\f[R]
 .fi
 .TP
-.B \f[I]type\f[]
+.B \f[I]type\f[R]
 An AV type corresponds to a conceptual implementation of an address
 vector.
 The type specifies how an application views data stored in the AV,
 including how it may be accessed.
 Valid values are:
-.RS
-.RE
 .TP
-.B \- \f[I]FI_AV_MAP\f[]
+.B \- \f[I]FI_AV_MAP\f[R]
 Addresses which are inserted into an AV are mapped to a native fabric
 address for use by the application.
 The use of FI_AV_MAP requires that an application store the returned
@@ -176,10 +144,8 @@ store the returned addresses.
 Addresses are stored in the AV using a provider specific mechanism,
 including, but not limited to a tree, hash table, or maintained on the
 heap.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_AV_TABLE\f[]
+.B \- \f[I]FI_AV_TABLE\f[R]
 Addresses which are inserted into an AV of type FI_AV_TABLE are
 accessible using a simple index.
 Conceptually, the AV may be treated as an array of addresses, though the
@@ -191,34 +157,26 @@ The index of the first address inserted into an FI_AV_TABLE will be 0,
 and successive insertions will be given sequential indices.
 Sequential indices will be assigned across insertion calls on the same
 AV.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_AV_UNSPEC\f[]
+.B \- \f[I]FI_AV_UNSPEC\f[R]
 Provider will choose its preferred AV type.
 The AV type used will be returned through the type field in fi_av_attr.
-.RS
-.RE
 .TP
-.B \f[I]Receive Context Bits (rx_ctx_bits)\f[]
+.B \f[I]Receive Context Bits (rx_ctx_bits)\f[R]
 The receive context bits field is only for use with scalable endpoints.
 It indicates the number of bits reserved in a returned fi_addr_t, which
 will be used to identify a specific target receive context.
 See fi_rx_addr() and fi_endpoint(3) for additional details on receive
 contexts.
-The requested number of bits should be selected such that 2 ^
+The requested number of bits should be selected such that 2 \[ha]
 rx_ctx_bits >= rx_ctx_cnt for the endpoint.
-.RS
-.RE
 .TP
-.B \f[I]count\f[]
+.B \f[I]count\f[R]
 Indicates the expected number of addresses that will be inserted into
 the AV.
 The provider uses this to optimize resource allocations.
-.RS
-.RE
 .TP
-.B \f[I]ep_per_node\f[]
+.B \f[I]ep_per_node\f[R]
 This field indicates the number of endpoints that will be associated
 with a specific fabric, or network, address.
 If the number of endpoints per node is unknown, this value should be set
@@ -227,28 +185,22 @@ The provider uses this value to optimize resource allocations.
 For example, distributed, parallel applications may set this to the
 number of processes allocated per node, times the number of endpoints
 each process will open.
-.RS
-.RE
 .TP
-.B \f[I]name\f[]
+.B \f[I]name\f[R]
 An optional system name associated with the address vector to create or
 open.
 Address vectors may be shared across multiple processes which access the
 same named domain on the same node.
 The name field allows the underlying provider to identify a shared AV.
-.RS
-.RE
 .PP
 If the name field is non\-NULL and the AV is not opened for read\-only
 access, a named AV will be created, if it does not already exist.
 .TP
-.B \f[I]map_addr\f[]
+.B \f[I]map_addr\f[R]
 The map_addr determines the base fi_addr_t address that a provider
 should use when sharing an AV of type FI_AV_MAP between processes.
 Processes that provide the same value for map_addr to a shared AV may
 use the same fi_addr_t values returned from an fi_av_insert call.
-.RS
-.RE
 .PP
 The map_addr may be used by the provider to mmap memory allocated for a
 shared AV between processes; however, the provider is not required to
@@ -263,12 +215,10 @@ If name is non\-NULL and map_addr is 0, then the map_addr used by the
 provider will be returned through the attribute structure.
 The map_addr field is ignored if name is NULL.
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 The following flags may be used when opening an AV.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_EVENT\f[]
+.B \- \f[I]FI_EVENT\f[R]
 When the flag FI_EVENT is specified, all insert operations on this AV
 will occur asynchronously.
 There will be one EQ error entry generated for each failed address
@@ -280,15 +230,13 @@ The context field in all completions will be the context specified to
 the insert call, and the data field in the final completion entry will
 report the number of addresses successfully inserted.
 If an error occurs during the asynchronous insertion, an error
-completion entry is returned (see \f[C]fi_eq\f[](3) for a discussion of
+completion entry is returned (see \f[C]fi_eq\f[R](3) for a discussion of
 the fi_eq_err_entry error completion struct).
 The context field of the error completion will be the context that was
 specified in the insert call; the data field will contain the index of
 the failed address.
 There will be one error completion returned for each address that fails
 to insert into the AV.
-.RS
-.RE
 .PP
 If an AV is opened with FI_EVENT, any insertions attempted before an EQ
 is bound to the AV will fail with \-FI_ENOEQ.
@@ -302,23 +250,19 @@ The only guarantee is that all error completions for a given call to
 fi_av_insert will precede the single associated non\-error completion.
 \[bu] .RS 2
 .TP
-.B \f[I]FI_READ\f[]
+.B \f[I]FI_READ\f[R]
 Opens an AV for read\-only access.
 An AV opened for read\-only access must be named (name attribute
 specified), and the AV must exist.
-.RS
-.RE
 .RE
 \[bu] .RS 2
 .TP
-.B \f[I]FI_SYMMETRIC\f[]
+.B \f[I]FI_SYMMETRIC\f[R]
 Indicates that each node will be associated with the same number of
 endpoints, the same transport addresses will be allocated on each node,
 and the transport addresses will be sequential.
 This feature targets distributed applications on large fabrics and
 allows for highly\-optimized storage of remote endpoint addressing.
-.RS
-.RE
 .RE
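+.PP
+A minimal sketch of opening a table\-type AV on an open
+\f[C]domain\f[R]; unspecified attribute fields are zero\-initialized:
+.IP
+.nf
+\f[C]
+struct fid_av *av;
+struct fi_av_attr av_attr = {
+    .type  = FI_AV_TABLE,  /* index\-based addressing */
+    .count = 64,           /* expected number of peers */
+};
+int ret;
+
+ret = fi_av_open(domain, &av_attr, &av, NULL);
+\f[R]
+.fi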
 .SS fi_close
 .PP
@@ -336,10 +280,10 @@ the call will return \-FI_EBUSY.
 .SS fi_av_bind
 .PP
 Associates an event queue with the AV.
-If an AV has been opened with \f[C]FI_EVENT\f[], then an event queue
+If an AV has been opened with \f[C]FI_EVENT\f[R], then an event queue
 must be bound to the AV before any insertion calls are attempted.
 Any calls to insert addresses before an event queue has been bound will
-fail with \f[C]\-FI_ENOEQ\f[].
+fail with \f[C]\-FI_ENOEQ\f[R].
 Flags are reserved for future use and must be 0.
 .SS fi_av_insert
 .PP
@@ -350,10 +294,10 @@ AV.
 Addresses inserted into an address vector must be in the same format as
 specified in the addr_format field of the fi_info struct provided when
 opening the corresponding domain.
-When using the \f[C]FI_ADDR_STR\f[] format, the \f[C]addr\f[] parameter
-should reference an array of strings (char **).
+When using the \f[C]FI_ADDR_STR\f[R] format, the \f[C]addr\f[R]
+parameter should reference an array of strings (char **).
 .PP
-For AV\[aq]s of type FI_AV_MAP, once inserted addresses have been
+For AV\[cq]s of type FI_AV_MAP, once inserted addresses have been
 mapped, the mapped values are written into the buffer referenced by
 fi_addr.
 The fi_addr buffer must remain valid until the AV insertion has
@@ -365,7 +309,7 @@ specific encoding of low\-level addressing data, for example.
 In the latter case, use of FI_AV_MAP may be able to avoid memory
 references during data transfer operations.
 .PP
-For AV\[aq]s of type FI_AV_TABLE, addresses are placed into the table in
+For AV\[cq]s of type FI_AV_TABLE, addresses are placed into the table in
 order.
 An address is inserted at the lowest index that corresponds to an unused
 table location, with indices starting at 0.
@@ -384,7 +328,7 @@ the buffer must remain valid until the insertion operation completes.
 Note that if fi_addr is NULL and synchronous operation is requested
 without using FI_SYNC_ERR flag, individual insertion failures cannot be
 reported and the application must use other calls, such as
-\f[C]fi_av_lookup\f[] to learn which specific addresses failed to
+\f[C]fi_av_lookup\f[R], to learn which specific addresses failed to
 insert.
 Since fi_av_remove is provider\-specific, it is recommended that calls
 to fi_av_insert following a call to fi_av_remove always reference a
@@ -392,13 +336,11 @@ valid buffer in the fi_addr parameter.
 Otherwise it may be difficult to determine what the next assigned index
 will be.
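+.PP
+A hedged sketch of a synchronous insertion, where \f[C]addrs\f[R]
+holds \f[C]n\f[R] addresses in the domain\[cq]s addr_format and
+\f[C]MAX_PEERS\f[R] is an illustrative bound:
+.IP
+.nf
+\f[C]
+fi_addr_t peers[MAX_PEERS];
+int ret;
+
+ret = fi_av_insert(av, addrs, n, peers, 0, NULL);
+if (ret >= 0 && (size_t) ret < n) {
+    /* ret of the n addresses were inserted; some failed */
+}
+\f[R]
+.fi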
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 The following flag may be passed to AV insertion calls: fi_av_insert,
 fi_av_insertsvc, or fi_av_insertsym.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_MORE\f[]
+.B \- \f[I]FI_MORE\f[R]
 In order to allow optimized address insertion, the application may
 specify the FI_MORE flag to the insert call to give a hint to the
 provider that more insertion requests will follow, allowing the provider
@@ -407,10 +349,8 @@ An application may make any number of insertion calls with FI_MORE set,
 provided that they are followed by an insertion call without FI_MORE.
 This signifies to the provider that the insertion list is complete.
 Providers are free to ignore FI_MORE.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_SYNC_ERR\f[]
+.B \- \f[I]FI_SYNC_ERR\f[R]
 This flag applies to synchronous insertions only, and is used to
 retrieve error details of failed insertions.
 If set, the context parameter of insertion calls references an array of
@@ -419,8 +359,6 @@ The resulting status of attempting to insert each address will be
 written to the corresponding array location.
 Successful insertions will be updated to 0.
 Failures will contain a fabric errno code.
-.RS
-.RE
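+.PP
+A sketch of retrieving per\-address status with FI_SYNC_ERR, reusing
+the illustrative names from the sketch above:
+.IP
+.nf
+\f[C]
+int status[MAX_PEERS];
+size_t i;
+
+/* With FI_SYNC_ERR, the context argument carries the array. */
+ret = fi_av_insert(av, addrs, n, peers, FI_SYNC_ERR, status);
+for (i = 0; i < n; i++) {
+    if (status[i] != 0) {
+        /* entry i failed with fabric errno status[i] */
+    }
+}
+\f[R]
+.fi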
 .SS fi_av_insertsvc
 .PP
 The fi_av_insertsvc call behaves similarly to fi_av_insert, but allows the
@@ -431,10 +369,10 @@ Node should be a string that corresponds to a hostname or network
 address.
 The service string corresponds to a textual representation of a
 transport address.
-Applications may also pass in an \f[C]FI_ADDR_STR\f[] formatted address
+Applications may also pass in an \f[C]FI_ADDR_STR\f[R] formatted address
 as the node parameter.
 In such cases, the service parameter must be NULL.
-See fi_getinfo.3 for details on using \f[C]FI_ADDR_STR\f[].
+See fi_getinfo.3 for details on using \f[C]FI_ADDR_STR\f[R].
 Supported flags are the same as for fi_av_insert.
 .SS fi_av_insertsym
 .PP
@@ -456,11 +394,12 @@ Inserted node addresses will be of the range {node, node + nodecnt \-
 If node is a non\-numeric string, such as a hostname, it must contain a
 numeric suffix if nodecnt > 1.
 .PP
-As an example, if node = "10.1.1.1", nodecnt = 2, service = "5000", and
-svccnt = 2, the following addresses will be inserted into the AV in the
-order shown: 10.1.1.1:5000, 10.1.1.1:5001, 10.1.1.2:5000, 10.1.1.2:5001.
-If node were replaced by the hostname "host10", the addresses would be:
-host10:5000, host10:5001, host11:5000, host11:5001.
+As an example, if node = \[lq]10.1.1.1\[rq], nodecnt = 2, service =
+\[lq]5000\[rq], and svccnt = 2, the following addresses will be inserted
+into the AV in the order shown: 10.1.1.1:5000, 10.1.1.1:5001,
+10.1.1.2:5000, 10.1.1.2:5001.
+If node were replaced by the hostname \[lq]host10\[rq], the addresses
+would be: host10:5000, host10:5001, host11:5000, host11:5001.
 .PP
 The total number of inserted addresses will be nodecnt x svccnt.
 .PP
@@ -522,7 +461,7 @@ If the provided buffer is too small, the results will be truncated.
 fi_av_straddr returns a pointer to buf.
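+.PP
+For example (a sketch; the buffer size chosen here is arbitrary):
+.IP
+.nf
+\f[C]
+char text[128];
+size_t len = sizeof(text);
+
+/* Render an address in printable form, e.g. for logging. */
+puts(fi_av_straddr(av, addr, text, &len));
+\f[R]
+.fi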
 .SH NOTES
 .PP
-Providers may implement AV\[aq]s using a variety of mechanisms.
+Providers may implement AV\[cq]s using a variety of mechanisms.
 Specifically, a provider may begin resolving inserted addresses as soon
 as they have been added to an AV, even if asynchronous operation has
 been specified.
@@ -549,10 +488,10 @@ FI_ADDR_NOTAVAIL.
 .PP
 All other calls return 0 on success, or a negative value corresponding
 to fabric errno on error.
-Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[].
+Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R].
 .SH SEE ALSO
 .PP
-\f[C]fi_getinfo\f[](3), \f[C]fi_endpoint\f[](3), \f[C]fi_domain\f[](3),
-\f[C]fi_eq\f[](3)
+\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3),
+\f[C]fi_domain\f[R](3), \f[C]fi_eq\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_av_set.3 b/deps/libfabric/man/man3/fi_av_set.3
index 51548a6e626bc5f2898f777d8c22411adfcb0db7..72eb3c1d1cb5175611b9a30694faa1a4390966fb 100644
--- a/deps/libfabric/man/man3/fi_av_set.3
+++ b/deps/libfabric/man/man3/fi_av_set.3
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_av_set" "3" "2020\-03\-20" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_av_set" "3" "2021\-10\-07" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -8,108 +8,76 @@ fi_av_set \- Address vector set operations
 .TP
 .B fi_av_set / fi_close
 Open or close an address vector set
-.RS
-.RE
 .TP
 .B fi_av_set_union
 Perform a set union operation on two AV sets
-.RS
-.RE
 .TP
 .B fi_av_set_intersect
 Perform a set intersect operation on two AV sets
-.RS
-.RE
 .TP
 .B fi_av_set_diff
 Perform a set difference operation on two AV sets
-.RS
-.RE
 .TP
 .B fi_av_set_insert
 Add an address to an AV set
-.RS
-.RE
 .TP
 .B fi_av_set_remove
 Remove an address from an AV set
-.RS
-.RE
 .TP
 .B fi_av_set_addr
 Obtain a collective address for current addresses in an AV set
-.RS
-.RE
 .SH SYNOPSIS
 .IP
 .nf
 \f[C]
-#include\ <rdma/fi_collective.h>
+#include <rdma/fi_collective.h>
 
-int\ fi_av_set(struct\ fid_av\ *av,\ struct\ fi_av_set_attr\ *attr,
-\ \ \ \ \ \ struct\ fid_av_set\ **set,\ void\ *\ context);
+int fi_av_set(struct fid_av *av, struct fi_av_set_attr *attr,
+      struct fid_av_set **set, void * context);
 
-int\ fi_av_set_union(struct\ fid_av_set\ *dst,\ const\ struct\ fid_av_set\ *src);
+int fi_av_set_union(struct fid_av_set *dst, const struct fid_av_set *src);
 
-int\ fi_av_set_intersect(struct\ fid_av_set\ *dst,\ const\ struct\ fid_av_set\ *src);
+int fi_av_set_intersect(struct fid_av_set *dst, const struct fid_av_set *src);
 
-int\ fi_av_set_diff(struct\ fid_av_set\ *dst,\ const\ struct\ fid_av_set\ *src);
+int fi_av_set_diff(struct fid_av_set *dst, const struct fid_av_set *src);
 
-int\ fi_av_set_insert(struct\ fid_av_set\ *set,\ fi_addr_t\ addr);
+int fi_av_set_insert(struct fid_av_set *set, fi_addr_t addr);
 
-int\ fi_av_set_remove(struct\ fid_av_set\ *set,\ fi_addr_t\ addr);
+int fi_av_set_remove(struct fid_av_set *set, fi_addr_t addr);
 
-int\ fi_av_set_addr(struct\ fid_av_set\ *set,\ fi_addr_t\ *coll_addr);
+int fi_av_set_addr(struct fid_av_set *set, fi_addr_t *coll_addr);
 
-int\ fi_close(struct\ fid\ *av_set);
-\f[]
+int fi_close(struct fid *av_set);
+\f[R]
 .fi
 .SH ARGUMENTS
 .TP
-.B \f[I]av\f[]
+.B \f[I]av\f[R]
 Address vector
-.RS
-.RE
 .TP
-.B \f[I]set\f[]
+.B \f[I]set\f[R]
 Address vector set
-.RS
-.RE
 .TP
-.B \f[I]dst\f[]
+.B \f[I]dst\f[R]
 Address vector set updated by set operation
-.RS
-.RE
 .TP
-.B \f[I]src\f[]
+.B \f[I]src\f[R]
 Address vector set providing input to a set operation
-.RS
-.RE
 .TP
-.B \f[I]attr\f[]
+.B \f[I]attr\f[R]
 Address vector set attributes
-.RS
-.RE
 .TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
 User specified context associated with the address vector set
-.RS
-.RE
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 Additional flags to apply to the operation.
-.RS
-.RE
 .TP
-.B \f[I]addr\f[]
+.B \f[I]addr\f[R]
 Destination address to insert into or remove from the AV set.
-.RS
-.RE
 .TP
-.B \f[I]coll_addr\f[]
+.B \f[I]coll_addr\f[R]
 Address identifying collective group.
-.RS
-.RE
 .SH DESCRIPTION
 .PP
 An address vector set (AV set) represents an ordered subset of addresses
@@ -124,7 +92,7 @@ The creation and manipulation of an AV set is a local operation.
 No fabric traffic is exchanged between peers.
 As a result, each peer is responsible for creating matching AV sets as
 part of their collective membership definition.
-See \f[C]fi_collective\f[](3) for a discussion of membership models.
+See \f[C]fi_collective\f[R](3) for a discussion of membership models.
 .SS fi_av_set
 .PP
 The fi_av_set call creates a new AV set.
@@ -138,28 +106,26 @@ interfaces defined below.
 .IP
 .nf
 \f[C]
-struct\ fi_av_set_attr\ {
-\ \ \ \ size_t\ count;
-\ \ \ \ fi_addr_t\ start_addr;
-\ \ \ \ fi_addr_t\ end_addr;
-\ \ \ \ uint64_t\ stride;
-\ \ \ \ size_t\ comm_key_size;
-\ \ \ \ uint8_t\ *comm_key;
-\ \ \ \ uint64_t\ flags;
+struct fi_av_set_attr {
+    size_t count;
+    fi_addr_t start_addr;
+    fi_addr_t end_addr;
+    uint64_t stride;
+    size_t comm_key_size;
+    uint8_t *comm_key;
+    uint64_t flags;
 };
-\f[]
+\f[R]
 .fi
 .TP
-.B \f[I]count\f[]
+.B \f[I]count\f[R]
 Indicates the expected number of members that will be a part of the
 AV set.
 The provider uses this to optimize resource allocations.
 If count is 0, the provider will select a size based on available system
 configuration data or underlying limitations.
-.RS
-.RE
 .TP
-.B \f[I]start_addr / end_addr\f[]
+.B \f[I]start_addr / end_addr\f[R]
 The starting and ending addresses, inclusive, to include as part of the
 AV set.
 The use of start and end addresses requires that the associated AV have
@@ -170,13 +136,11 @@ members to the AV set.
 The start_addr and end_addr must be set to FI_ADDR_NOTAVAIL if creating
 an empty AV set, a communication key is being provided, or the AV is of
 type FI_AV_MAP.
-.RS
-.RE
 .PP
 The number of addresses between start_addr and end_addr must be less
 than or equal to the specified count value.
 .TP
-.B \f[I]stride\f[]
+.B \f[I]stride\f[R]
 The number of entries between successive addresses included in the AV
 set.
 The AV set will include all addresses from start_addr + stride x i, for
@@ -185,16 +149,12 @@ A stride of 1 indicates that all addresses between start_addr and
 end_addr should be added to the AV set.
 Stride should be set to 0 unless the start_addr and end_addr fields are
 valid.
-.RS
-.RE
 .TP
-.B \f[I]comm_key_size\f[]
+.B \f[I]comm_key_size\f[R]
 The length of the communication key in bytes.
 This field should be 0 if a communication key is not available.
-.RS
-.RE
 .TP
-.B \f[I]comm_key\f[]
+.B \f[I]comm_key\f[R]
 If supported by the fabric, this represents a key associated with the AV
 set.
 The communication key is used by applications that directly manage
@@ -203,14 +163,12 @@ manager.
 The key is used to convey the results of the membership setup to the
 underlying provider.
 The use and format of a communication key is fabric provider specific.
-.RS
-.RE
-.TP
-.B \f[I]flags\f[]
-If the flag FI_UNIVERSE is set, then the AV set will be created
-containing all addresses stored in the AV.
-.RS
-.RE
+.TP
+.B \f[I]flags\f[R]
+Flags may be used to configure the AV set, including restricting which
+collective operations the AV set needs to support.
+See the flags section for a list of flags that may be specified when
+creating the AV set.
 .SS fi_av_set_union
 .PP
 The AV set union call adds all addresses in the source AV set that are
@@ -244,26 +202,70 @@ This is a local operation only that does not involve network
 communication.
 The returned address may be used as input into fi_join_collective.
 Note that attempting to use the address returned from fi_av_set_addr
-(e.g.
-passing it to fi_join_collective) while simultaneously modifying the
-addresses stored in an AV set results in undefined behavior.
+(e.g.\ passing it to fi_join_collective) while simultaneously modifying
+the addresses stored in an AV set results in undefined behavior.
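+.PP
+A hedged end\-to\-end sketch that creates an initially empty AV set,
+adds members by their fi_addr_t values, and obtains the collective
+address; \f[C]peer\f[R] is a placeholder:
+.IP
+.nf
+\f[C]
+struct fid_av_set *set;
+struct fi_av_set_attr set_attr = {
+    .count      = 16,
+    .start_addr = FI_ADDR_NOTAVAIL,  /* empty set */
+    .end_addr   = FI_ADDR_NOTAVAIL,
+};
+fi_addr_t coll_addr;
+int ret;
+
+ret = fi_av_set(av, &set_attr, &set, NULL);
+ret = fi_av_set_insert(set, peer);      /* repeat per member */
+ret = fi_av_set_addr(set, &coll_addr);  /* for fi_join_collective */
+\f[R]
+.fi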
 .SS fi_close
 .PP
 Closes an AV set and releases all resources associated with it.
 Any operations active at the time an AV set is closed will be aborted,
 with the result of the collective undefined.
+.SH FLAGS
+.PP
+The following flags may be specified as part of AV set creation.
+.TP
+.B \f[I]FI_UNIVERSE\f[R]
+When set, the AV set will be created containing all addresses
+stored in the corresponding AV.
+.TP
+.B \f[I]FI_BARRIER_SET\f[R]
+If set, the AV set will be configured to support barrier operations.
+.TP
+.B \f[I]FI_BROADCAST_SET\f[R]
+If set, the AV set will be configured to support broadcast operations.
+.TP
+.B \f[I]FI_ALLTOALL_SET\f[R]
+If set, the AV set will be configured to support all to all operations.
+.TP
+.B \f[I]FI_ALLREDUCE_SET\f[R]
+If set, the AV set will be configured to support all reduce operations.
+.TP
+.B \f[I]FI_ALLGATHER_SET\f[R]
+If set, the AV set will be configured to support all gather operations.
+.TP
+.B \f[I]FI_REDUCE_SCATTER_SET\f[R]
+If set, the AV set will be configured to support reduce scatter
+operations.
+.TP
+.B \f[I]FI_REDUCE_SET\f[R]
+If set, the AV set will be configured to support reduce operations.
+.TP
+.B \f[I]FI_SCATTER_SET\f[R]
+If set, the AV set will be configured to support scatter operations.
+.TP
+.B \f[I]FI_GATHER_SET\f[R]
+If set, the AV set will be configured to support gather operations.
 .SH NOTES
 .PP
 Developers who are familiar with MPI will find that AV sets are similar
 to MPI groups, and may act as a direct mapping in some, but not all,
 situations.
+.PP
+By default an AV set will be created to support all collective
+operations supported by the underlying provider (see
+fi_query_collective).
+Users may reduce resource requirements by specifying only those
+collective operations needed by the AV set through the use of creation
+flags: FI_BARRIER_SET, FI_BROADCAST_SET, etc.
+If no such flags are specified, the AV set will be configured to support
+every collective operation available from the provider.
+It is an error for a user to request an unsupported collective.
 .SH RETURN VALUES
 .PP
 Returns 0 on success.
 On error, a negative value corresponding to fabric errno is returned.
-Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[].
+Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R].
 .SH SEE ALSO
 .PP
-\f[C]fi_av\f[](3), \f[C]fi_collective\f[](3)
+\f[C]fi_av\f[R](3), \f[C]fi_collective\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_cm.3 b/deps/libfabric/man/man3/fi_cm.3
index d48f467edebba57a5147881d3144ddae226585f0..bd34e2ca849e83795dce264c41a9a56c7f710c43 100644
--- a/deps/libfabric/man/man3/fi_cm.3
+++ b/deps/libfabric/man/man3/fi_cm.3
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_cm" "3" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_cm" "3" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -8,60 +8,52 @@ fi_cm \- Connection management operations
 .TP
 .B fi_connect / fi_listen / fi_accept / fi_reject / fi_shutdown
 Manage endpoint connection state.
-.RS
-.RE
 .TP
 .B fi_setname / fi_getname / fi_getpeer
 Set local, or return local or peer endpoint address.
-.RS
-.RE
 .TP
 .B fi_join / fi_close / fi_mc_addr
 Join, leave, or retrieve a multicast address.
-.RS
-.RE
 .SH SYNOPSIS
 .IP
 .nf
 \f[C]
-#include\ <rdma/fi_cm.h>
+#include <rdma/fi_cm.h>
 
-int\ fi_connect(struct\ fid_ep\ *ep,\ const\ void\ *addr,
-\ \ \ \ const\ void\ *param,\ size_t\ paramlen);
+int fi_connect(struct fid_ep *ep, const void *addr,
+    const void *param, size_t paramlen);
 
-int\ fi_listen(struct\ fid_pep\ *pep);
+int fi_listen(struct fid_pep *pep);
 
-int\ fi_accept(struct\ fid_ep\ *ep,\ const\ void\ *param,\ size_t\ paramlen);
+int fi_accept(struct fid_ep *ep, const void *param, size_t paramlen);
 
-int\ fi_reject(struct\ fid_pep\ *pep,\ fid_t\ handle,
-\ \ \ \ const\ void\ *param,\ size_t\ paramlen);
+int fi_reject(struct fid_pep *pep, fid_t handle,
+    const void *param, size_t paramlen);
 
-int\ fi_shutdown(struct\ fid_ep\ *ep,\ uint64_t\ flags);
+int fi_shutdown(struct fid_ep *ep, uint64_t flags);
 
-int\ fi_setname(fid_t\ fid,\ void\ *addr,\ size_t\ addrlen);
+int fi_setname(fid_t fid, void *addr, size_t addrlen);
 
-int\ fi_getname(fid_t\ fid,\ void\ *addr,\ size_t\ *addrlen);
+int fi_getname(fid_t fid, void *addr, size_t *addrlen);
 
-int\ fi_getpeer(struct\ fid_ep\ *ep,\ void\ *addr,\ size_t\ *addrlen);
+int fi_getpeer(struct fid_ep *ep, void *addr, size_t *addrlen);
 
-int\ fi_join(struct\ fid_ep\ *ep,\ const\ void\ *addr,\ uint64_t\ flags,
-\ \ \ \ struct\ fid_mc\ **mc,\ void\ *context);
+int fi_join(struct fid_ep *ep, const void *addr, uint64_t flags,
+    struct fid_mc **mc, void *context);
 
-int\ fi_close(struct\ fid\ *mc);
+int fi_close(struct fid *mc);
 
-fi_addr_t\ fi_mc_addr(struct\ fid_mc\ *mc);
-\f[]
+fi_addr_t fi_mc_addr(struct fid_mc *mc);
+\f[R]
 .fi
 .SH ARGUMENTS
 .TP
-.B \f[I]ep / pep\f[]
+.B \f[I]ep / pep\f[R]
 Fabric endpoint on which to change connection state.
-.RS
-.RE
 .PP
-\f[I]fid\f[] Active or passive endpoint to get/set address.
+\f[I]fid\f[R] Active or passive endpoint to get/set address.
 .TP
-.B \f[I]addr\f[]
+.B \f[I]addr\f[R]
 Buffer to address.
 On a set call, the endpoint will be assigned the specified address.
 On a get, the local address will be copied into the buffer, up to the
@@ -69,44 +61,28 @@ space provided.
 For connect, this parameter indicates the peer address to connect to.
 The address must be in the same format as that specified using fi_info:
 addr_format when the endpoint was created.
-.RS
-.RE
 .TP
-.B \f[I]addrlen\f[]
+.B \f[I]addrlen\f[R]
 On input, specifies size of addr buffer.
 On output, stores number of bytes written to addr buffer.
-.RS
-.RE
 .TP
-.B \f[I]param\f[]
+.B \f[I]param\f[R]
 User\-specified data exchanged as part of the connection exchange.
-.RS
-.RE
 .TP
-.B \f[I]paramlen\f[]
+.B \f[I]paramlen\f[R]
 Size of param buffer.
-.RS
-.RE
 .TP
-.B \f[I]info\f[]
+.B \f[I]info\f[R]
 Fabric information associated with a connection request.
-.RS
-.RE
 .TP
-.B \f[I]mc\f[]
+.B \f[I]mc\f[R]
 Multicast group associated with an endpoint.
-.RS
-.RE
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 Additional flags for controlling connection operation.
-.RS
-.RE
 .TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
 User context associated with the request.
-.RS
-.RE
 .SH DESCRIPTION
 .PP
 Connection management functions are used to connect an
@@ -120,7 +96,7 @@ Connection requests against a listening endpoint are reported
 asynchronously to the user through a bound CM event queue using the
 FI_CONNREQ event type.
 The number of outstanding connection requests that can be queued at an
-endpoint is limited by the listening endpoint\[aq]s backlog parameter.
+endpoint is limited by the listening endpoint\[cq]s backlog parameter.
 The backlog is initialized based on administrative configuration values,
 but may be adjusted through the fi_control call.
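+.PP
+A hedged sketch of the passive side, assuming a passive endpoint
+\f[C]pep\f[R] bound to an event queue \f[C]eq\f[R]; fi_eq_sread and
+struct fi_eq_cm_entry are described in \f[C]fi_eq\f[R](3):
+.IP
+.nf
+\f[C]
+struct fi_eq_cm_entry entry;
+uint32_t event;
+
+fi_listen(pep);
+fi_eq_sread(eq, &event, &entry, sizeof(entry), \-1, 0);
+if (event == FI_CONNREQ) {
+    /* create an active endpoint ep from entry.info, then */
+    fi_accept(ep, NULL, 0);
+}
+\f[R]
+.fi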
 .SS fi_connect
@@ -232,7 +208,7 @@ An endpoint may be enabled explicitly through fi_enable, or implicitly,
 such as through fi_connect or fi_listen.
 An address may be assigned using fi_setname.
 fi_getpeer is not guaranteed to return a valid peer address until an
-endpoint has been completely connected \-\- an FI_CONNECTED event has
+endpoint has been completely connected \[en] an FI_CONNECTED event has
 been generated.
 .SS fi_join
 .PP
@@ -268,28 +244,24 @@ and paired with the FI_MULTICAST operation flag.
 .PP
 Except in functions noted below, flags are reserved and must be 0.
 .TP
-.B \f[I]FI_SEND\f[]
+.B \f[I]FI_SEND\f[R]
 Applies to fi_join.
 This flag indicates that the endpoint should join the multicast group as
 a send only member.
 The endpoint must be configured for transmit operations to use this
 flag, or an error will occur.
-.RS
-.RE
 .TP
-.B \f[I]FI_RECV\f[]
+.B \f[I]FI_RECV\f[R]
 Applies to fi_join.
 This flag indicates that the endpoint should join the multicast group
 with receive permissions only.
 The endpoint must be configured for receive operations to use this flag,
 or an error will occur.
-.RS
-.RE
 .SH RETURN VALUE
 .PP
 Returns 0 on success.
 On error, a negative value corresponding to fabric errno is returned.
-Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[].
+Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R].
 .SH ERRORS
 .SH NOTES
 .PP
@@ -307,7 +279,7 @@ events, or as additional err_data to fi_eq_err_entry, in the case of a
 rejected connection.
 .SH SEE ALSO
 .PP
-\f[C]fi_getinfo\f[](3), \f[C]fi_endpoint\f[](3), \f[C]fi_domain\f[](3),
-\f[C]fi_eq\f[](3)
+\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3),
+\f[C]fi_domain\f[R](3), \f[C]fi_eq\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_cntr.3 b/deps/libfabric/man/man3/fi_cntr.3
index e958718444594b1b7256fcdf4982e27734cc502e..f7da7efd7de20d26bb0be95c119ea5102a0fbf52 100644
--- a/deps/libfabric/man/man3/fi_cntr.3
+++ b/deps/libfabric/man/man3/fi_cntr.3
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_cntr" "3" "2019\-12\-13" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_cntr" "3" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -8,97 +8,71 @@ fi_cntr \- Completion and event counter operations
 .TP
 .B fi_cntr_open / fi_close
 Allocate/free a counter
-.RS
-.RE
 .TP
 .B fi_cntr_read
 Read the current value of a counter
-.RS
-.RE
 .TP
 .B fi_cntr_readerr
 Reads the number of operations which have completed in error.
-.RS
-.RE
 .TP
 .B fi_cntr_add
 Increment a counter by a specified value
-.RS
-.RE
 .TP
 .B fi_cntr_set
 Set a counter to a specified value
-.RS
-.RE
 .TP
 .B fi_cntr_wait
 Wait for a counter to be greater or equal to a threshold value
-.RS
-.RE
 .SH SYNOPSIS
 .IP
 .nf
 \f[C]
-#include\ <rdma/fi_domain.h>
+#include <rdma/fi_domain.h>
 
-int\ fi_cntr_open(struct\ fid_domain\ *domain,\ struct\ fi_cntr_attr\ *attr,
-\ \ \ \ struct\ fid_cntr\ **cntr,\ void\ *context);
+int fi_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr,
+    struct fid_cntr **cntr, void *context);
 
-int\ fi_close(struct\ fid\ *cntr);
+int fi_close(struct fid *cntr);
 
-uint64_t\ fi_cntr_read(struct\ fid_cntr\ *cntr);
+uint64_t fi_cntr_read(struct fid_cntr *cntr);
 
-uint64_t\ fi_cntr_readerr(struct\ fid_cntr\ *cntr);
+uint64_t fi_cntr_readerr(struct fid_cntr *cntr);
 
-int\ fi_cntr_add(struct\ fid_cntr\ *cntr,\ uint64_t\ value);
+int fi_cntr_add(struct fid_cntr *cntr, uint64_t value);
 
-int\ fi_cntr_adderr(struct\ fid_cntr\ *cntr,\ uint64_t\ value);
+int fi_cntr_adderr(struct fid_cntr *cntr, uint64_t value);
 
-int\ fi_cntr_set(struct\ fid_cntr\ *cntr,\ uint64_t\ value);
+int fi_cntr_set(struct fid_cntr *cntr, uint64_t value);
 
-int\ fi_cntr_seterr(struct\ fid_cntr\ *cntr,\ uint64_t\ value);
+int fi_cntr_seterr(struct fid_cntr *cntr, uint64_t value);
 
-int\ fi_cntr_wait(struct\ fid_cntr\ *cntr,\ uint64_t\ threshold,
-\ \ \ \ int\ timeout);
-\f[]
+int fi_cntr_wait(struct fid_cntr *cntr, uint64_t threshold,
+    int timeout);
+\f[R]
 .fi
 .SH ARGUMENTS
 .TP
-.B \f[I]domain\f[]
+.B \f[I]domain\f[R]
 Fabric domain
-.RS
-.RE
 .TP
-.B \f[I]cntr\f[]
+.B \f[I]cntr\f[R]
 Fabric counter
-.RS
-.RE
 .TP
-.B \f[I]attr\f[]
+.B \f[I]attr\f[R]
 Counter attributes
-.RS
-.RE
 .TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
 User specified context associated with the counter
-.RS
-.RE
 .TP
-.B \f[I]value\f[]
+.B \f[I]value\f[R]
 Value to increment or set counter
-.RS
-.RE
 .TP
-.B \f[I]threshold\f[]
+.B \f[I]threshold\f[R]
 Value to compare counter against
-.RS
-.RE
 .TP
-.B \f[I]timeout\f[]
+.B \f[I]timeout\f[R]
 Time in milliseconds to wait.
 A negative value indicates infinite timeout.
-.RS
-.RE
 .SH DESCRIPTION
 .PP
 Counters record the number of requested operations that have completed.
@@ -117,44 +91,40 @@ That is, a counter actually stores two distinct values, with error
 completions updating an error specific value.
 .PP
 Counters are updated following the completion event semantics defined in
-\f[C]fi_cq\f[](3).
+\f[C]fi_cq\f[R](3).
 The timing of the update is based on the type of transfer and any
 specified operation flags.
 .SS fi_cntr_open
 .PP
 fi_cntr_open allocates a new fabric counter.
 The properties and behavior of the counter are defined by
-\f[C]struct\ fi_cntr_attr\f[].
+\f[C]struct fi_cntr_attr\f[R].
 .IP
 .nf
 \f[C]
-struct\ fi_cntr_attr\ {
-\ \ \ \ enum\ fi_cntr_events\ \ events;\ \ \ \ /*\ type\ of\ events\ to\ count\ */
-\ \ \ \ enum\ fi_wait_obj\ \ \ \ \ wait_obj;\ \ /*\ requested\ wait\ object\ */
-\ \ \ \ struct\ fid_wait\ \ \ \ \ *wait_set;\ \ /*\ optional\ wait\ set\ */
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ \ \ flags;\ \ \ \ \ /*\ operation\ flags\ */
+struct fi_cntr_attr {
+    enum fi_cntr_events  events;    /* type of events to count */
+    enum fi_wait_obj     wait_obj;  /* requested wait object */
+    struct fid_wait     *wait_set;  /* optional wait set */
+    uint64_t             flags;     /* operation flags */
 };
-\f[]
+\f[R]
 .fi
 .TP
-.B \f[I]events\f[]
+.B \f[I]events\f[R]
 A counter captures different types of events.
 The specific type to be counted is one of the following:
-.RS
-.RE
 .TP
-.B \- \f[I]FI_CNTR_EVENTS_COMP\f[]
+.B \- \f[I]FI_CNTR_EVENTS_COMP\f[R]
 The counter increments for every successful completion that occurs on an
 associated bound endpoint.
-The type of completions \-\- sends and/or receives \-\- which are
+The type of completions \[en] sends and/or receives \[en] which are
 counted may be restricted using control flags when binding the counter
 and the endpoint.
 Counters increment on all successful completions, separately from
 whether the operation generates an entry in an event queue.
-.RS
-.RE
 .TP
-.B \f[I]wait_obj\f[]
+.B \f[I]wait_obj\f[R]
 Counters may be associated with a specific wait object.
 Wait objects allow applications to block until the wait object is
 signaled, indicating that a counter has reached a specific threshold.
@@ -164,16 +134,12 @@ The following values may be used to specify the type of wait object
 associated with a counter: FI_WAIT_NONE, FI_WAIT_UNSPEC, FI_WAIT_SET,
 FI_WAIT_FD, FI_WAIT_MUTEX_COND, and FI_WAIT_YIELD.
 The default is FI_WAIT_NONE.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_NONE\f[]
+.B \- \f[I]FI_WAIT_NONE\f[R]
 Used to indicate that the user will not block (wait) for events on the
 counter.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_UNSPEC\f[]
+.B \- \f[I]FI_WAIT_UNSPEC\f[R]
 Specifies that the user will only wait on the counter using fabric
 interface calls, such as fi_cntr_wait.
 In this case, the underlying provider may select the most appropriate or
@@ -181,41 +147,31 @@ highest performing wait object available, including custom wait
 mechanisms.
 Applications that select FI_WAIT_UNSPEC are not guaranteed to retrieve
 the underlying wait object.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_SET\f[]
+.B \- \f[I]FI_WAIT_SET\f[R]
 Indicates that the event counter should use a wait set object to wait
 for events.
 If specified, the wait_set field must reference an existing wait set
 object.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_FD\f[]
+.B \- \f[I]FI_WAIT_FD\f[R]
 Indicates that the counter should use a file descriptor as its wait
 mechanism.
 A file descriptor wait object must be usable in select, poll, and epoll
 routines.
 However, a provider may signal an FD wait object by marking it as
 readable, writable, or with an error.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_MUTEX_COND\f[]
+.B \- \f[I]FI_WAIT_MUTEX_COND\f[R]
 Specifies that the counter should use a pthread mutex and cond variable
 as a wait object.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_YIELD\f[]
+.B \- \f[I]FI_WAIT_YIELD\f[R]
 Indicates that the counter will wait without a wait object but instead
 yield on every wait.
 Allows usage of fi_cntr_wait through a spin.
-.RS
-.RE
 .TP
-.B \f[I]wait_set\f[]
+.B \f[I]wait_set\f[R]
 If wait_obj is FI_WAIT_SET, this field references a wait object to which
 the event counter should attach.
 When an event is added to the event counter, the corresponding wait set
@@ -223,13 +179,9 @@ will be signaled if all necessary conditions are met.
 The use of a wait_set enables an optimized method of waiting for events
 across multiple event counters.
 This field is ignored if wait_obj is not FI_WAIT_SET.
-.RS
-.RE
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 Flags are reserved for future use, and must be set to 0.
-.RS
-.RE
 .SS fi_close
 .PP
 The fi_close call releases all resources associated with a counter.
@@ -247,26 +199,20 @@ fi_cntr_control is invoked, as it may redirect the implementation of
 counter operations.
 The following control commands are usable with a counter:
 .TP
-.B \f[I]FI_GETOPSFLAG (uint64_t *)\f[]
+.B \f[I]FI_GETOPSFLAG (uint64_t *)\f[R]
 Returns the current default operational flags associated with the
 counter.
-.RS
-.RE
 .TP
-.B \f[I]FI_SETOPSFLAG (uint64_t *)\f[]
+.B \f[I]FI_SETOPSFLAG (uint64_t *)\f[R]
 Modifies the current default operational flags associated with the
 counter.
-.RS
-.RE
 .TP
-.B \f[I]FI_GETWAIT (void **)\f[]
+.B \f[I]FI_GETWAIT (void **)\f[R]
 This command allows the user to retrieve the low\-level wait object
 associated with the counter.
 The format of the wait\-object is specified during counter creation,
 through the counter attributes.
 See fi_eq.3 for additional details on using control with FI_GETWAIT.
-.RS
-.RE
 .SS fi_cntr_read
 .PP
 The fi_cntr_read call returns the current value of the counter.
@@ -295,7 +241,7 @@ or equal to the input threshold value.
 .PP
 If an operation associated with the counter encounters an error, it will
 increment the error value associated with the counter.
-Any change in a counter\[aq]s error value will unblock any thread inside
+Any change in a counter\[cq]s error value will unblock any thread inside
 fi_cntr_wait.
 .PP
 If the call returns due to timeout, \-FI_ETIMEDOUT will be returned.
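A minimal sketch of this wait pattern, assuming `cntr` was opened with a waitable wait_obj such as FI_WAIT_UNSPEC:

```
#include <rdma/fi_domain.h>
#include <rdma/fi_errno.h>

/* Sketch: block until the counter reaches `threshold`, with a 5 s timeout. */
static int wait_for_completions(struct fid_cntr *cntr, uint64_t threshold)
{
    int rc = fi_cntr_wait(cntr, threshold, 5000 /* ms */);

    if (rc == -FI_ETIMEDOUT) {
        /* threshold not reached within the timeout */
    } else if (rc < 0) {
        /* an associated operation may have failed; check the error count */
        uint64_t errs = fi_cntr_readerr(cntr);
        (void)errs;
    }
    return rc;
}
```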
@@ -310,17 +256,14 @@ On error, a negative value corresponding to fabric errno is returned.
 .TP
 .B fi_cntr_read / fi_cntr_readerr
 Returns the current value of the counter.
-.RS
-.RE
 .PP
-Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[].
+Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R].
 .SH NOTES
 .PP
 In order to support a variety of counter implementations, updates made
-to counter values (e.g.
-fi_cntr_set or fi_cntr_add) may not be immediately visible to counter
-read operations (i.e.
-fi_cntr_read or fi_cntr_readerr).
+to counter values (e.g.\ fi_cntr_set or fi_cntr_add) may not be
+immediately visible to counter read operations (i.e.\ fi_cntr_read or
+fi_cntr_readerr).
 A small, but undefined, delay may occur between the counter changing and
 the reported value being updated.
 However, a final updated value will eventually be reflected in the read
@@ -336,7 +279,7 @@ fi_cntr_set / fi_cntr_seterr and results of related operations are
 reflected in the observed value of the counter.
 .SH SEE ALSO
 .PP
-\f[C]fi_getinfo\f[](3), \f[C]fi_endpoint\f[](3), \f[C]fi_domain\f[](3),
-\f[C]fi_eq\f[](3), \f[C]fi_poll\f[](3)
+\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3),
+\f[C]fi_domain\f[R](3), \f[C]fi_eq\f[R](3), \f[C]fi_poll\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_collective.3 b/deps/libfabric/man/man3/fi_collective.3
index 3d000b090645172570d6a253afe72845a7a3b6b1..852ef9520c7e4da8b89599d661854f75f994a1cb 100644
--- a/deps/libfabric/man/man3/fi_collective.3
+++ b/deps/libfabric/man/man3/fi_collective.3
@@ -1,192 +1,146 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_collective" "3" "2020\-04\-13" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_collective" "3" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .TP
 .B fi_join_collective
 Operation where a subset of peers join a new collective group.
-.RS
-.RE
 .TP
 .B fi_barrier
 Collective operation that does not complete until all peers have entered
 the barrier call.
-.RS
-.RE
 .TP
 .B fi_broadcast
 A single sender transmits data to all peers, including itself.
-.RS
-.RE
 .TP
 .B fi_alltoall
 Each peer distributes a slice of its local data to all peers.
-.RS
-.RE
 .TP
 .B fi_allreduce
 Collective operation where all peers broadcast an atomic operation to
 all other peers.
-.RS
-.RE
 .TP
 .B fi_allgather
 Each peer sends a complete copy of its local data to all peers.
-.RS
-.RE
 .TP
 .B fi_reduce_scatter
 Collective call where data is collected from all peers and merged
 (reduced).
 The results of the reduction is distributed back to the peers, with each
 peer receiving a slice of the results.
-.RS
-.RE
 .TP
 .B fi_reduce
 Collective call where data is collected from all peers to a root peer
 and merged (reduced).
-.RS
-.RE
 .TP
 .B fi_scatter
 A single sender distributes (scatters) a slice of its local data to all
 peers.
-.RS
-.RE
 .TP
 .B fi_gather
 All peers send their data to a root peer.
-.RS
-.RE
 .TP
 .B fi_query_collective
 Returns information about which collective operations are supported by a
 provider, and limitations on the collective.
-.RS
-.RE
 .SH SYNOPSIS
 .IP
 .nf
 \f[C]
-#include\ <rdma/fi_collective.h>
+#include <rdma/fi_collective.h>
 
-int\ fi_join_collective(struct\ fid_ep\ *ep,\ fi_addr_t\ coll_addr,
-\ \ \ \ const\ struct\ fid_av_set\ *set,
-\ \ \ \ uint64_t\ flags,\ struct\ fid_mc\ **mc,\ void\ *context);
+int fi_join_collective(struct fid_ep *ep, fi_addr_t coll_addr,
+    const struct fid_av_set *set,
+    uint64_t flags, struct fid_mc **mc, void *context);
 
-ssize_t\ fi_barrier(struct\ fid_ep\ *ep,\ fi_addr_t\ coll_addr,
-\ \ \ \ void\ *context);
+ssize_t fi_barrier(struct fid_ep *ep, fi_addr_t coll_addr,
+    void *context);
 
-ssize_t\ fi_broadcast(struct\ fid_ep\ *ep,\ void\ *buf,\ size_t\ count,\ void\ *desc,
-\ \ \ \ fi_addr_t\ coll_addr,\ fi_addr_t\ root_addr,\ enum\ fi_datatype\ datatype,
-\ \ \ \ uint64_t\ flags,\ void\ *context);
+ssize_t fi_broadcast(struct fid_ep *ep, void *buf, size_t count, void *desc,
+    fi_addr_t coll_addr, fi_addr_t root_addr, enum fi_datatype datatype,
+    uint64_t flags, void *context);
 
-ssize_t\ fi_alltoall(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ count,
-\ \ \ \ void\ *desc,\ void\ *result,\ void\ *result_desc,
-\ \ \ \ fi_addr_t\ coll_addr,\ enum\ fi_datatype\ datatype,
-\ \ \ \ uint64_t\ flags,\ void\ *context);
+ssize_t fi_alltoall(struct fid_ep *ep, const void *buf, size_t count,
+    void *desc, void *result, void *result_desc,
+    fi_addr_t coll_addr, enum fi_datatype datatype,
+    uint64_t flags, void *context);
 
-ssize_t\ fi_allreduce(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ count,
-\ \ \ \ void\ *desc,\ void\ *result,\ void\ *result_desc,
-\ \ \ \ fi_addr_t\ coll_addr,\ enum\ fi_datatype\ datatype,\ enum\ fi_op\ op,
-\ \ \ \ uint64_t\ flags,\ void\ *context);
+ssize_t fi_allreduce(struct fid_ep *ep, const void *buf, size_t count,
+    void *desc, void *result, void *result_desc,
+    fi_addr_t coll_addr, enum fi_datatype datatype, enum fi_op op,
+    uint64_t flags, void *context);
 
-ssize_t\ fi_allgather(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ count,
-\ \ \ \ void\ *desc,\ void\ *result,\ void\ *result_desc,
-\ \ \ \ fi_addr_t\ coll_addr,\ enum\ fi_datatype\ datatype,
-\ \ \ \ uint64_t\ flags,\ void\ *context);
+ssize_t fi_allgather(struct fid_ep *ep, const void *buf, size_t count,
+    void *desc, void *result, void *result_desc,
+    fi_addr_t coll_addr, enum fi_datatype datatype,
+    uint64_t flags, void *context);
 
-ssize_t\ fi_reduce_scatter(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ count,
-\ \ \ \ void\ *desc,\ void\ *result,\ void\ *result_desc,
-\ \ \ \ fi_addr_t\ coll_addr,\ enum\ fi_datatype\ datatype,\ enum\ fi_op\ op,
-\ \ \ \ uint64_t\ flags,\ void\ *context);
+ssize_t fi_reduce_scatter(struct fid_ep *ep, const void *buf, size_t count,
+    void *desc, void *result, void *result_desc,
+    fi_addr_t coll_addr, enum fi_datatype datatype, enum fi_op op,
+    uint64_t flags, void *context);
 
-ssize_t\ fi_reduce(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ count,
-\ \ \ \ void\ *desc,\ void\ *result,\ void\ *result_desc,\ fi_addr_t\ coll_addr,
-\ \ \ \ fi_addr_t\ root_addr,\ enum\ fi_datatype\ datatype,\ enum\ fi_op\ op,
-\ \ \ \ uint64_t\ flags,\ void\ *context);
+ssize_t fi_reduce(struct fid_ep *ep, const void *buf, size_t count,
+    void *desc, void *result, void *result_desc, fi_addr_t coll_addr,
+    fi_addr_t root_addr, enum fi_datatype datatype, enum fi_op op,
+    uint64_t flags, void *context);
 
-ssize_t\ fi_scatter(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ count,
-\ \ \ \ void\ *desc,\ void\ *result,\ void\ *result_desc,\ fi_addr_t\ coll_addr,
-\ \ \ \ fi_addr_t\ root_addr,\ enum\ fi_datatype\ datatype,
-\ \ \ \ uint64_t\ flags,\ void\ *context);
+ssize_t fi_scatter(struct fid_ep *ep, const void *buf, size_t count,
+    void *desc, void *result, void *result_desc, fi_addr_t coll_addr,
+    fi_addr_t root_addr, enum fi_datatype datatype,
+    uint64_t flags, void *context);
 
-ssize_t\ fi_gather(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ count,
-\ \ \ \ void\ *desc,\ void\ *result,\ void\ *result_desc,\ fi_addr_t\ coll_addr,
-\ \ \ \ fi_addr_t\ root_addr,\ enum\ fi_datatype\ datatype,
-\ \ \ \ uint64_t\ flags,\ void\ *context);
+ssize_t fi_gather(struct fid_ep *ep, const void *buf, size_t count,
+    void *desc, void *result, void *result_desc, fi_addr_t coll_addr,
+    fi_addr_t root_addr, enum fi_datatype datatype,
+    uint64_t flags, void *context);
 
-int\ fi_query_collective(struct\ fid_domain\ *domain,
-\ \ \ \ fi_collective_op\ coll,\ struct\ fi_collective_attr\ *attr,\ uint64_t\ flags);
-\f[]
+int fi_query_collective(struct fid_domain *domain,
+    fi_collective_op coll, struct fi_collective_attr *attr, uint64_t flags);
+\f[R]
 .fi
 .SH ARGUMENTS
 .TP
-.B \f[I]ep\f[]
+.B \f[I]ep\f[R]
 Fabric endpoint on which to initiate collective operation.
-.RS
-.RE
 .TP
-.B \f[I]set\f[]
+.B \f[I]set\f[R]
 Address vector set defining the collective membership.
-.RS
-.RE
 .TP
-.B \f[I]mc\f[]
+.B \f[I]mc\f[R]
 Multicast group associated with the collective.
-.RS
-.RE
 .TP
-.B \f[I]buf\f[]
+.B \f[I]buf\f[R]
 Local data buffer that specifies the first operand of the collective operation.
-.RS
-.RE
 .TP
-.B \f[I]datatype\f[]
+.B \f[I]datatype\f[R]
 Datatype associated with atomic operands
-.RS
-.RE
 .TP
-.B \f[I]op\f[]
+.B \f[I]op\f[R]
 Atomic operation to perform
-.RS
-.RE
 .TP
-.B \f[I]result\f[]
+.B \f[I]result\f[R]
 Local data buffer to store the result of the collective operation.
-.RS
-.RE
 .TP
-.B \f[I]desc / result_desc\f[]
+.B \f[I]desc / result_desc\f[R]
 Data descriptor associated with the local data buffer and local result
 buffer, respectively.
-.RS
-.RE
 .TP
-.B \f[I]coll_addr\f[]
+.B \f[I]coll_addr\f[R]
 Address referring to the collective group of endpoints.
-.RS
-.RE
 .TP
-.B \f[I]root_addr\f[]
+.B \f[I]root_addr\f[R]
 Single endpoint that is the source or destination of collective data.
-.RS
-.RE
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 Additional flags to apply for the atomic operation
-.RS
-.RE
 .TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
 User specified pointer to associate with the operation.
 This parameter is ignored if the operation will not generate a
 successful completion, unless an op flag specifies the context parameter
 be used for required input.
-.RS
-.RE
 .SH DESCRIPTION (EXPERIMENTAL APIs)
 .PP
 The collective APIs are new to the 1.9 libfabric release.
@@ -197,7 +151,7 @@ versions of the library until the experimental tag has been removed.
 .PP
 In general collective operations can be thought of as coordinated atomic
 operations between a set of peer endpoints.
-Readers should refer to the \f[C]fi_atomic\f[](3) man page for details
+Readers should refer to the \f[C]fi_atomic\f[R](3) man page for details
 on the atomic operations and datatypes defined by libfabric.
 .PP
 A collective operation is a group communication exchange.
@@ -244,7 +198,7 @@ provider by creating and configuring an address vector set (AV set).
 An AV set represents an ordered subset of addresses in an address vector
 (AV).
 Details on creating and configuring an AV set are available in
-\f[C]fi_av_set\f[](3).
+\f[C]fi_av_set\f[R](3).
 .PP
 Once an AV set has been programmed with the collective membership
 information, an endpoint is joined to the set.
@@ -303,7 +257,7 @@ Applications must call fi_close on the collective group to disconnect
 the endpoint from the group.
 After a join operation has completed, the fi_mc_addr call may be used to
 retrieve the address associated with the multicast group.
-See \f[C]fi_cm\f[](3) for additional details on fi_mc_addr().
+See \f[C]fi_cm\f[R](3) for additional details on fi_mc_addr().
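A rough sketch of initiating a join, assuming `ep` is bound to an event queue and `avset`/`coll_addr` were prepared as described in fi_av_set(3):

```
#include <rdma/fi_collective.h>
#include <rdma/fi_cm.h>

/* Sketch: start an asynchronous join; completion is reported as an
 * FI_JOIN_COMPLETE event on the EQ bound to `ep` (not shown here). */
static int start_join(struct fid_ep *ep, fi_addr_t coll_addr,
                      const struct fid_av_set *avset, struct fid_mc **mc)
{
    int rc = fi_join_collective(ep, coll_addr, avset, 0, mc, NULL);

    if (rc == 0) {
        /* after FI_JOIN_COMPLETE: fi_addr_t group = fi_mc_addr(*mc); */
    }
    return rc;
}
```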
 .SS Barrier (fi_barrier)
 .PP
 The fi_barrier operation provides a mechanism to synchronize peers.
@@ -329,13 +283,13 @@ transfer an array of integers to a group of peers.
 .IP
 .nf
 \f[C]
-[1]\ \ [1]\ \ [1]
-[5]\ \ [5]\ \ [5]
-[9]\ \ [9]\ \ [9]
-\ |____^\ \ \ \ ^
-\ |_________|
-\ broadcast
-\f[]
+[1]  [1]  [1]
+[5]  [5]  [5]
+[9]  [9]  [9]
+ |____\[ha]    \[ha]
+ |_________|
+ broadcast
+\f[R]
 .fi
 .SS All to All (fi_alltoall)
 .PP
@@ -347,16 +301,16 @@ entries in an integer array.
 .IP
 .nf
 \f[C]
-[1]\ \ \ [2]\ \ \ [3]
-[5]\ \ \ [6]\ \ \ [7]
-[9]\ \ [10]\ \ [11]
-\ \ \ \\\ \ \ |\ \ \ /
-\ \ \ All\ to\ all
-\ \ \ /\ \ \ |\ \ \ \\
-[1]\ \ \ [5]\ \ \ [9]
-[2]\ \ \ [6]\ \ [10]
-[3]\ \ \ [7]\ \ [11]
-\f[]
+[1]   [2]   [3]
+[5]   [6]   [7]
+[9]  [10]  [11]
+   \[rs]   |   /
+   All to all
+   /   |   \[rs]
+[1]   [5]   [9]
+[2]   [6]  [10]
+[3]   [7]  [11]
+\f[R]
 .fi
 .PP
 Each peer sends a piece of its data to the other peers.
@@ -383,17 +337,17 @@ involving summing an array of integers between three peers.
 .IP
 .nf
 \f[C]
-\ [1]\ \ [1]\ \ [1]
-\ [5]\ \ [5]\ \ [5]
-\ [9]\ \ [9]\ \ [9]
-\ \ \ \\\ \ \ |\ \ \ /
-\ \ \ \ \ \ sum
-\ \ \ /\ \ \ |\ \ \ \\
-\ [3]\ \ [3]\ \ [3]
-[15]\ [15]\ [15]
-[27]\ [27]\ [27]
-\ \ All\ Reduce
-\f[]
+ [1]  [1]  [1]
+ [5]  [5]  [5]
+ [9]  [9]  [9]
+   \[rs]   |   /
+      sum
+   /   |   \[rs]
+ [3]  [3]  [3]
+[15] [15] [15]
+[27] [27] [27]
+  All Reduce
+\f[R]
 .fi
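A minimal sketch of the sum shown above, assuming `ep` has joined the collective and `coll_addr` came from fi_mc_addr():

```
#include <stdint.h>
#include <rdma/fi_collective.h>

/* Sketch: each peer contributes three values and receives element-wise sums. */
static ssize_t sum_with_peers(struct fid_ep *ep, fi_addr_t coll_addr)
{
    /* buffers must remain valid until the completion is reported */
    static uint64_t local[3] = { 1, 5, 9 };
    static uint64_t result[3];

    /* NULL descriptors assume the buffers need no memory registration */
    return fi_allreduce(ep, local, 3, NULL, result, NULL,
                        coll_addr, FI_UINT64, FI_SUM, 0, NULL);
}
```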
 .SS All Gather (fi_allgather)
 .PP
@@ -404,14 +358,14 @@ that array back to each peer.
 .IP
 .nf
 \f[C]
-[1]\ \ [5]\ \ [9]
-\ \ \\\ \ \ |\ \ \ /
-\ All\ gather
-\ \ /\ \ \ |\ \ \ \\
-[1]\ \ [1]\ \ [1]
-[5]\ \ [5]\ \ [5]
-[9]\ \ [9]\ \ [9]
-\f[]
+[1]  [5]  [9]
+  \[rs]   |   /
+ All gather
+  /   |   \[rs]
+[1]  [1]  [1]
+[5]  [5]  [5]
+[9]  [9]  [9]
+\f[R]
 .fi
 .PP
 All gather may be performed on any non\-void datatype.
@@ -430,20 +384,20 @@ This is shown by the following example:
 .IP
 .nf
 \f[C]
-[1]\ \ [1]\ \ [1]
-[5]\ \ [5]\ \ [5]
-[9]\ \ [9]\ \ [9]
-\ \ \\\ \ \ |\ \ \ /
-\ \ \ \ \ sum\ (reduce)
-\ \ \ \ \ \ |
-\ \ \ \ \ [3]
-\ \ \ \ [15]
-\ \ \ \ [27]
-\ \ \ \ \ \ |
-\ \ \ scatter
-\ \ /\ \ \ |\ \ \ \\
-[3]\ [15]\ [27]
-\f[]
+[1]  [1]  [1]
+[5]  [5]  [5]
+[9]  [9]  [9]
+  \[rs]   |   /
+     sum (reduce)
+      |
+     [3]
+    [15]
+    [27]
+      |
+   scatter
+  /   |   \[rs]
+[3] [15] [27]
+\f[R]
 .fi
 .PP
 The reduce scatter call supports the same datatype and atomic operation
@@ -452,23 +406,23 @@ as fi_allreduce.
 .PP
 The fi_reduce collective is the first half of an fi_allreduce operation.
 With reduce, all peers provide input into an atomic operation, with
-the results collected by a single \[aq]root\[aq] endpoint.
+the results collected by a single `root' endpoint.
 .PP
 This is shown by the following example, with the leftmost peer
 identified as the root:
 .IP
 .nf
 \f[C]
-[1]\ \ [1]\ \ [1]
-[5]\ \ [5]\ \ [5]
-[9]\ \ [9]\ \ [9]
-\ \ \\\ \ \ |\ \ \ /
-\ \ \ \ \ sum\ (reduce)
-\ \ \ \ /
-\ [3]
+[1]  [1]  [1]
+[5]  [5]  [5]
+[9]  [9]  [9]
+  \[rs]   |   /
+     sum (reduce)
+    /
+ [3]
 [15]
 [27]
-\f[]
+\f[R]
 .fi
 .PP
 The reduce call supports the same datatype and atomic operation as
@@ -477,21 +431,21 @@ fi_allreduce.
 .PP
 The fi_scatter collective is the second half of an fi_reduce_scatter
 operation.
-The data from a single \[aq]root\[aq] endpoint is split and distributed
-to all peers.
+The data from a single `root' endpoint is split and distributed to all
+peers.
 .PP
 This is shown by the following example:
 .IP
 .nf
 \f[C]
-\ [3]
+ [3]
 [15]
 [27]
-\ \ \ \ \\
-\ \ \ scatter
-\ \ /\ \ \ |\ \ \ \\
-[3]\ [15]\ [27]
-\f[]
+    \[rs]
+   scatter
+  /   |   \[rs]
+[3] [15] [27]
+\f[R]
 .fi
 .PP
 The scatter operation is used to distribute results to the peers.
@@ -499,21 +453,21 @@ No atomic operation is performed on the data.
 .SS Gather (fi_gather)
 .PP
 The fi_gather operation is used to collect (gather) the results from all
-peers and store them at a \[aq]root\[aq] peer.
+peers and store them at a `root' peer.
 .PP
 This is shown by the following example, with the leftmost peer
 identified as the root.
 .IP
 .nf
 \f[C]
-[1]\ \ [5]\ \ [9]
-\ \ \\\ \ \ |\ \ \ /
-\ \ \ \ gather
-\ \ \ /
+[1]  [5]  [9]
+  \[rs]   |   /
+    gather
+   /
 [1]
 [5]
 [9]
-\f[]
+\f[R]
 .fi
 .PP
 The gather operation does not perform any operation on the data itself.
@@ -540,19 +494,19 @@ must be specified through the given attributes.
 .IP
 .nf
 \f[C]
-struct\ fi_collective_attr\ {
-\ \ \ \ enum\ fi_op\ op;
-\ \ \ \ enum\ fi_datatype\ datatype;
-\ \ \ \ struct\ fi_atomic_attr\ datatype_attr;
-\ \ \ \ size_t\ max_members;
-\ \ \ \ \ \ uint64_t\ mode;
+struct fi_collective_attr {
+    enum fi_op op;
+    enum fi_datatype datatype;
+    struct fi_atomic_attr datatype_attr;
+    size_t max_members;
+      uint64_t mode;
 };
-\f[]
+\f[R]
 .fi
 .PP
-For a description of struct fi_atomic_attr, see \f[C]fi_atomic\f[](3).
+For a description of struct fi_atomic_attr, see \f[C]fi_atomic\f[R](3).
 .TP
-.B \f[I]op\f[]
+.B \f[I]op\f[R]
 On input, this specifies the atomic operation involved with the
 collective call.
 This should be set to one of the following values: FI_MIN, FI_MAX,
@@ -560,10 +514,8 @@ FI_SUM, FI_PROD, FI_LOR, FI_LAND, FI_BOR, FI_BAND, FI_LXOR, FI_BXOR,
 FI_ATOMIC_READ, FI_ATOMIC_WRITE, or FI_NOOP.
 For collectives that do not exchange application data (fi_barrier), this
 should be set to FI_NOOP.
-.RS
-.RE
 .TP
-.B \f[I]datatype\f[]
+.B \f[I]datatype\f[R]
 On input, specifies the datatype of the data being modified by the
 collective.
 This should be set to one of the following values: FI_INT8, FI_UINT8,
@@ -572,31 +524,21 @@ FI_DOUBLE, FI_FLOAT_COMPLEX, FI_DOUBLE_COMPLEX, FI_LONG_DOUBLE,
 FI_LONG_DOUBLE_COMPLEX, or FI_VOID.
 For collectives that do not exchange application data (fi_barrier), this
 should be set to FI_VOID.
-.RS
-.RE
 .TP
-.B \f[I]datatype_attr.count\f[]
+.B \f[I]datatype_attr.count\f[R]
 The maximum number of elements that may be used with the collective.
-.RS
-.RE
 .TP
-.B \f[I]datatype.size\f[]
+.B \f[I]datatype.size\f[R]
 The size of the datatype as supported by the provider.
 Applications should validate the size of datatypes that differ based on
 the platform, such as FI_LONG_DOUBLE.
-.RS
-.RE
 .TP
-.B \f[I]max_members\f[]
+.B \f[I]max_members\f[R]
 The maximum number of peers that may participate in a collective
 operation.
-.RS
-.RE
 .TP
-.B \f[I]mode\f[]
+.B \f[I]mode\f[R]
 This field is reserved and should be 0.
-.RS
-.RE
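A sketch of probing for a sum-of-doubles allreduce, assuming `domain` is an open resource domain:

```
#include <string.h>
#include <rdma/fi_collective.h>

/* Sketch: returns non-zero if the provider supports FI_SUM over FI_DOUBLE. */
static int have_double_sum(struct fid_domain *domain)
{
    struct fi_collective_attr attr;

    memset(&attr, 0, sizeof attr);
    attr.op       = FI_SUM;
    attr.datatype = FI_DOUBLE;
    attr.mode     = 0;          /* reserved, must be 0 */

    /* on success, attr.max_members and attr.datatype_attr.count
     * report the provider's limits for this collective */
    return fi_query_collective(domain, FI_ALLREDUCE, &attr, 0) == FI_SUCCESS;
}
```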
 .PP
 If a collective operation is supported, the query call will return
 FI_SUCCESS, along with attributes on the limits for using that
@@ -605,41 +547,34 @@ collective operation through the provider.
 .PP
 Collective operations map to underlying fi_atomic operations.
 For a discussion of atomic completion semantics, see
-\f[C]fi_atomic\f[](3).
+\f[C]fi_atomic\f[R](3).
 The completion, ordering, and atomicity of collective operations match
 those defined for point to point atomic operations.
 .SH FLAGS
 .PP
 The following flags are defined for the specified operations.
 .TP
-.B \f[I]FI_SCATTER\f[]
+.B \f[I]FI_SCATTER\f[R]
 Applies to fi_query_collective.
 When set, requests attribute information on the reduce\-scatter
 collective operation.
-.RS
-.RE
 .SH RETURN VALUE
 .PP
 Returns 0 on success.
 On error, a negative value corresponding to fabric errno is returned.
-Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[].
+Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R].
 .SH ERRORS
 .TP
-.B \f[I]\-FI_EAGAIN\f[]
-See \f[C]fi_msg\f[](3) for a detailed description of handling FI_EAGAIN.
-.RS
-.RE
+.B \f[I]\-FI_EAGAIN\f[R]
+See \f[C]fi_msg\f[R](3) for a detailed description of handling
+FI_EAGAIN.
 .TP
-.B \f[I]\-FI_EOPNOTSUPP\f[]
+.B \f[I]\-FI_EOPNOTSUPP\f[R]
 The requested atomic operation is not supported on this endpoint.
-.RS
-.RE
 .TP
-.B \f[I]\-FI_EMSGSIZE\f[]
+.B \f[I]\-FI_EMSGSIZE\f[R]
 The number of collective operations in a single request exceeds that
 supported by the underlying provider.
-.RS
-.RE
 .SH NOTES
 .PP
 Collective operations map to atomic operations.
@@ -647,11 +582,11 @@ As such, they follow most of the conventions and restrictions as peer to
 peer atomic operations.
 This includes data atomicity, data alignment, and message ordering
 semantics.
-See \f[C]fi_atomic\f[](3) for additional information on the datatypes
+See \f[C]fi_atomic\f[R](3) for additional information on the datatypes
 and operations defined for atomic and collective operations.
 .SH SEE ALSO
 .PP
-\f[C]fi_getinfo\f[](3), \f[C]fi_av\f[](3), \f[C]fi_atomic\f[](3),
-\f[C]fi_cm\f[](3)
+\f[C]fi_getinfo\f[R](3), \f[C]fi_av\f[R](3), \f[C]fi_atomic\f[R](3),
+\f[C]fi_cm\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_control.3 b/deps/libfabric/man/man3/fi_control.3
index c6bc5b3bb0ca49086a2999d9cf59689a7e4aa54c..82924179c7897b3fef1b85e2452e1436c1b19def 100644
--- a/deps/libfabric/man/man3/fi_control.3
+++ b/deps/libfabric/man/man3/fi_control.3
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_control" "3" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_control" "3" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -9,27 +9,24 @@ fi_control \- Perform an operation on a fabric resource.
 .IP
 .nf
 \f[C]
-#include\ <rdma/fabric.h>
+#include <rdma/fabric.h>
 
-int\ fi_control(struct\ fid\ *fid,\ int\ command,\ void\ *arg);
-\f[]
+int fi_control(struct fid *fid, int command, void *arg);
+int fi_alias(struct fid *fid, struct fid **alias_fid, uint64_t flags);
+int fi_get_val(struct fid *fid, int name, void *val);
+int fi_set_val(struct fid *fid, int name, void *val);
+\f[R]
 .fi
 .SH ARGUMENTS
 .TP
-.B \f[I]fid\f[]
+.B \f[I]fid\f[R]
 Fabric resource
-.RS
-.RE
 .TP
-.B \f[I]command\f[]
+.B \f[I]command\f[R]
 Operation to perform
-.RS
-.RE
 .TP
-.B \f[I]arg\f[]
+.B \f[I]arg\f[R]
 Optional argument to the command
-.RS
-.RE
 .SH DESCRIPTION
 .PP
 The fi_control operation is used to perform one or more operations on a
@@ -40,9 +37,20 @@ being operated on, the specified command, and any provided arguments for
 the command.
 For specific details, see the fabric resource specific help pages noted
 below.
+.PP
+fi_alias, fi_get_val, and fi_set_val are wrappers for fi_control with
+commands FI_ALIAS, FI_GET_VAL, and FI_SET_VAL, respectively.
+fi_alias creates an alias of the specified fabric resource.
+fi_get_val reads the value of the named parameter associated with the
+fabric resource, while fi_set_val updates that value.
+Available parameter names depend on the type of the fabric resource and
+the provider in use.
+Providers may define provider specific names in the provider extension
+header files (\[cq]rdma/fi_ext_*.h\[cq]).
+Please refer to the provider man pages for details.
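A sketch of one common fi_control use, assuming `cq` is a completion queue that was opened with FI_WAIT_FD:

```
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>

/* Sketch: fetch the low-level wait fd of a CQ so it can be fed to
 * select/poll/epoll.  fi_control operates on the generic fid that
 * is embedded in every fabric resource. */
static int cq_wait_fd(struct fid_cq *cq, int *fd)
{
    return fi_control(&cq->fid, FI_GETWAIT, fd);
}
```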
 .SH SEE ALSO
 .PP
-\f[C]fi_endpoint\f[](3), \f[C]fi_cm\f[](3), \f[C]fi_cntr\f[](3),
-\f[C]fi_cq\f[](3), \f[C]fi_eq\f[](3),
+\f[C]fi_endpoint\f[R](3), \f[C]fi_cm\f[R](3), \f[C]fi_cntr\f[R](3),
+\f[C]fi_cq\f[R](3), \f[C]fi_eq\f[R](3),
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_cq.3 b/deps/libfabric/man/man3/fi_cq.3
index b84308d82f74a5d27acc816b50bcc06c89ec5654..5d7201dc3b7b2c9e29883e378e7b01959d1d244c 100644
--- a/deps/libfabric/man/man3/fi_cq.3
+++ b/deps/libfabric/man/man3/fi_cq.3
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_cq" "3" "2019\-12\-13" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_cq" "3" "2021\-03\-23" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -8,147 +8,105 @@ fi_cq \- Completion queue operations
 .TP
 .B fi_cq_open / fi_close
 Open/close a completion queue
-.RS
-.RE
 .TP
 .B fi_control
 Control CQ operation or attributes.
-.RS
-.RE
 .TP
 .B fi_cq_read / fi_cq_readfrom / fi_cq_readerr
 Read a completion from a completion queue
-.RS
-.RE
 .TP
 .B fi_cq_sread / fi_cq_sreadfrom
 A synchronous (blocking) read that waits until a specified condition has
 been met before reading a completion from a completion queue.
-.RS
-.RE
 .TP
 .B fi_cq_signal
 Unblock any thread waiting in fi_cq_sread or fi_cq_sreadfrom.
-.RS
-.RE
 .TP
 .B fi_cq_strerror
 Converts provider specific error information into a printable string
-.RS
-.RE
 .SH SYNOPSIS
 .IP
 .nf
 \f[C]
-#include\ <rdma/fi_domain.h>
+#include <rdma/fi_domain.h>
 
-int\ fi_cq_open(struct\ fid_domain\ *domain,\ struct\ fi_cq_attr\ *attr,
-\ \ \ \ struct\ fid_cq\ **cq,\ void\ *context);
+int fi_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr,
+    struct fid_cq **cq, void *context);
 
-int\ fi_close(struct\ fid\ *cq);
+int fi_close(struct fid *cq);
 
-int\ fi_control(struct\ fid\ *cq,\ int\ command,\ void\ *arg);
+int fi_control(struct fid *cq, int command, void *arg);
 
-ssize_t\ fi_cq_read(struct\ fid_cq\ *cq,\ void\ *buf,\ size_t\ count);
+ssize_t fi_cq_read(struct fid_cq *cq, void *buf, size_t count);
 
-ssize_t\ fi_cq_readfrom(struct\ fid_cq\ *cq,\ void\ *buf,\ size_t\ count,
-\ \ \ \ fi_addr_t\ *src_addr);
+ssize_t fi_cq_readfrom(struct fid_cq *cq, void *buf, size_t count,
+    fi_addr_t *src_addr);
 
-ssize_t\ fi_cq_readerr(struct\ fid_cq\ *cq,\ struct\ fi_cq_err_entry\ *buf,
-\ \ \ \ uint64_t\ flags);
+ssize_t fi_cq_readerr(struct fid_cq *cq, struct fi_cq_err_entry *buf,
+    uint64_t flags);
 
-ssize_t\ fi_cq_sread(struct\ fid_cq\ *cq,\ void\ *buf,\ size_t\ count,
-\ \ \ \ const\ void\ *cond,\ int\ timeout);
+ssize_t fi_cq_sread(struct fid_cq *cq, void *buf, size_t count,
+    const void *cond, int timeout);
 
-ssize_t\ fi_cq_sreadfrom(struct\ fid_cq\ *cq,\ void\ *buf,\ size_t\ count,
-\ \ \ \ fi_addr_t\ *src_addr,\ const\ void\ *cond,\ int\ timeout);
+ssize_t fi_cq_sreadfrom(struct fid_cq *cq, void *buf, size_t count,
+    fi_addr_t *src_addr, const void *cond, int timeout);
 
-int\ fi_cq_signal(struct\ fid_cq\ *cq);
+int fi_cq_signal(struct fid_cq *cq);
 
-const\ char\ *\ fi_cq_strerror(struct\ fid_cq\ *cq,\ int\ prov_errno,
-\ \ \ \ \ \ const\ void\ *err_data,\ char\ *buf,\ size_t\ len);
-\f[]
+const char * fi_cq_strerror(struct fid_cq *cq, int prov_errno,
+      const void *err_data, char *buf, size_t len);
+\f[R]
 .fi
 .SH ARGUMENTS
 .TP
-.B \f[I]domain\f[]
+.B \f[I]domain\f[R]
 Open resource domain
-.RS
-.RE
 .TP
-.B \f[I]cq\f[]
+.B \f[I]cq\f[R]
 Completion queue
-.RS
-.RE
 .TP
-.B \f[I]attr\f[]
+.B \f[I]attr\f[R]
 Completion queue attributes
-.RS
-.RE
 .TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
 User specified context associated with the completion queue.
-.RS
-.RE
 .TP
-.B \f[I]buf\f[]
+.B \f[I]buf\f[R]
 For read calls, the data buffer to write completions into.
 For write calls, a completion to insert into the completion queue.
 For fi_cq_strerror, an optional buffer that receives printable error
 information.
-.RS
-.RE
 .TP
-.B \f[I]count\f[]
+.B \f[I]count\f[R]
 Number of CQ entries.
-.RS
-.RE
 .TP
-.B \f[I]len\f[]
+.B \f[I]len\f[R]
 Length of data buffer
-.RS
-.RE
 .TP
-.B \f[I]src_addr\f[]
+.B \f[I]src_addr\f[R]
 Source address of a completed receive operation
-.RS
-.RE
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 Additional flags to apply to the operation
-.RS
-.RE
 .TP
-.B \f[I]command\f[]
+.B \f[I]command\f[R]
 Command of control operation to perform on CQ.
-.RS
-.RE
 .TP
-.B \f[I]arg\f[]
+.B \f[I]arg\f[R]
 Optional control argument
-.RS
-.RE
 .TP
-.B \f[I]cond\f[]
+.B \f[I]cond\f[R]
 Condition that must be met before a completion is generated
-.RS
-.RE
 .TP
-.B \f[I]timeout\f[]
+.B \f[I]timeout\f[R]
 Time in milliseconds to wait.
 A negative value indicates infinite timeout.
-.RS
-.RE
 .TP
-.B \f[I]prov_errno\f[]
+.B \f[I]prov_errno\f[R]
 Provider specific error value
-.RS
-.RE
 .TP
-.B \f[I]err_data\f[]
+.B \f[I]err_data\f[R]
 Provider specific error data related to a completion
-.RS
-.RE
 .SH DESCRIPTION
 .PP
 Completion queues are used to report events associated with data
@@ -165,39 +123,33 @@ Unlike event queues, completion queues are associated with a resource
 domain and may be offloaded entirely in provider hardware.
 .PP
 The properties and behavior of a completion queue are defined by
-\f[C]struct\ fi_cq_attr\f[].
+\f[C]struct fi_cq_attr\f[R].
 .IP
 .nf
 \f[C]
-struct\ fi_cq_attr\ {
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ size;\ \ \ \ \ \ /*\ #\ entries\ for\ CQ\ */
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ \ \ flags;\ \ \ \ \ /*\ operation\ flags\ */
-\ \ \ \ enum\ fi_cq_format\ \ \ \ format;\ \ \ \ /*\ completion\ format\ */
-\ \ \ \ enum\ fi_wait_obj\ \ \ \ \ wait_obj;\ \ /*\ requested\ wait\ object\ */
-\ \ \ \ int\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ signaling_vector;\ /*\ interrupt\ affinity\ */
-\ \ \ \ enum\ fi_cq_wait_cond\ wait_cond;\ /*\ wait\ condition\ format\ */
-\ \ \ \ struct\ fid_wait\ \ \ \ \ *wait_set;\ \ /*\ optional\ wait\ set\ */
+struct fi_cq_attr {
+    size_t               size;      /* # entries for CQ */
+    uint64_t             flags;     /* operation flags */
+    enum fi_cq_format    format;    /* completion format */
+    enum fi_wait_obj     wait_obj;  /* requested wait object */
+    int                  signaling_vector; /* interrupt affinity */
+    enum fi_cq_wait_cond wait_cond; /* wait condition format */
+    struct fid_wait     *wait_set;  /* optional wait set */
 };
-\f[]
+\f[R]
 .fi
 .TP
-.B \f[I]size\f[]
+.B \f[I]size\f[R]
 Specifies the minimum size of a completion queue.
 A value of 0 indicates that the provider may choose a default value.
-.RS
-.RE
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 Flags that control the configuration of the CQ.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_AFFINITY\f[]
+.B \- \f[I]FI_AFFINITY\f[R]
 Indicates that the signaling_vector field (see below) is valid.
-.RS
-.RE
 .TP
-.B \f[I]format\f[]
+.B \f[I]format\f[R]
 Completion queues allow the application to select the amount of detail
 that it must store and report.
 The format attribute allows the application to select one of several
@@ -206,92 +158,80 @@ completion queue should return when read.
 Supported formats and the structures that correspond to each are listed
 below.
 The meaning of the CQ entry fields are defined in the \f[I]Completion
-Fields\f[] section.
-.RS
-.RE
+Fields\f[R] section.
 .TP
-.B \- \f[I]FI_CQ_FORMAT_UNSPEC\f[]
+.B \- \f[I]FI_CQ_FORMAT_UNSPEC\f[R]
 If an unspecified format is requested, then the CQ will use a provider
 selected default format.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_CQ_FORMAT_CONTEXT\f[]
+.B \- \f[I]FI_CQ_FORMAT_CONTEXT\f[R]
 Provides only user specified context that was associated with the
 completion.
-.RS
-.RE
 .IP
 .nf
 \f[C]
-struct\ fi_cq_entry\ {
-\ \ \ \ void\ \ \ \ \ *op_context;\ /*\ operation\ context\ */
+struct fi_cq_entry {
+    void     *op_context; /* operation context */
 };
-\f[]
+\f[R]
 .fi
 \[bu] .RS 2
 .TP
-.B \f[I]FI_CQ_FORMAT_MSG\f[]
+.B \f[I]FI_CQ_FORMAT_MSG\f[R]
 Provides minimal data for processing completions, with expanded support
 for reporting information about received messages.
-.RS
-.RE
 .RE
 .IP
 .nf
 \f[C]
-struct\ fi_cq_msg_entry\ {
-\ \ \ \ void\ \ \ \ \ *op_context;\ /*\ operation\ context\ */
-\ \ \ \ uint64_t\ flags;\ \ \ \ \ \ \ /*\ completion\ flags\ */
-\ \ \ \ size_t\ \ \ len;\ \ \ \ \ \ \ \ \ /*\ size\ of\ received\ data\ */
+struct fi_cq_msg_entry {
+    void     *op_context; /* operation context */
+    uint64_t flags;       /* completion flags */
+    size_t   len;         /* size of received data */
 };
-\f[]
+\f[R]
 .fi
 \[bu] .RS 2
 .TP
-.B \f[I]FI_CQ_FORMAT_DATA\f[]
+.B \f[I]FI_CQ_FORMAT_DATA\f[R]
 Provides data associated with a completion.
 Includes support for received message length, remote CQ data, and
 multi\-receive buffers.
-.RS
-.RE
 .RE
 .IP
 .nf
 \f[C]
-struct\ fi_cq_data_entry\ {
-\ \ \ \ void\ \ \ \ \ *op_context;\ /*\ operation\ context\ */
-\ \ \ \ uint64_t\ flags;\ \ \ \ \ \ \ /*\ completion\ flags\ */
-\ \ \ \ size_t\ \ \ len;\ \ \ \ \ \ \ \ \ /*\ size\ of\ received\ data\ */
-\ \ \ \ void\ \ \ \ \ *buf;\ \ \ \ \ \ \ \ /*\ receive\ data\ buffer\ */
-\ \ \ \ uint64_t\ data;\ \ \ \ \ \ \ \ /*\ completion\ data\ */
+struct fi_cq_data_entry {
+    void     *op_context; /* operation context */
+    uint64_t flags;       /* completion flags */
+    size_t   len;         /* size of received data */
+    void     *buf;        /* receive data buffer */
+    uint64_t data;        /* completion data */
 };
-\f[]
+\f[R]
 .fi
 \[bu] .RS 2
 .TP
-.B \f[I]FI_CQ_FORMAT_TAGGED\f[]
+.B \f[I]FI_CQ_FORMAT_TAGGED\f[R]
 Expands completion data to include support for the tagged message
 interfaces.
-.RS
-.RE
 .RE
 .IP
 .nf
 \f[C]
-struct\ fi_cq_tagged_entry\ {
-\ \ \ \ void\ \ \ \ \ *op_context;\ /*\ operation\ context\ */
-\ \ \ \ uint64_t\ flags;\ \ \ \ \ \ \ /*\ completion\ flags\ */
-\ \ \ \ size_t\ \ \ len;\ \ \ \ \ \ \ \ \ /*\ size\ of\ received\ data\ */
-\ \ \ \ void\ \ \ \ \ *buf;\ \ \ \ \ \ \ \ /*\ receive\ data\ buffer\ */
-\ \ \ \ uint64_t\ data;\ \ \ \ \ \ \ \ /*\ completion\ data\ */
-\ \ \ \ uint64_t\ tag;\ \ \ \ \ \ \ \ \ /*\ received\ tag\ */
+struct fi_cq_tagged_entry {
+    void     *op_context; /* operation context */
+    uint64_t flags;       /* completion flags */
+    size_t   len;         /* size of received data */
+    void     *buf;        /* receive data buffer */
+    uint64_t data;        /* completion data */
+    uint64_t tag;         /* received tag */
 };
-\f[]
+\f[R]
 .fi
 .TP
-.B \f[I]wait_obj\f[]
-CQ\[aq]s may be associated with a specific wait object.
+.B \f[I]wait_obj\f[R]
+CQs may be associated with a specific wait object.
 Wait objects allow applications to block until the wait object is
 signaled, indicating that a completion is available to be read.
 Users may use fi_control to retrieve the underlying wait object
@@ -300,18 +240,14 @@ The following values may be used to specify the type of wait object
 associated with a CQ: FI_WAIT_NONE, FI_WAIT_UNSPEC, FI_WAIT_SET,
 FI_WAIT_FD, FI_WAIT_MUTEX_COND, and FI_WAIT_YIELD.
 The default is FI_WAIT_NONE.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_NONE\f[]
+.B \- \f[I]FI_WAIT_NONE\f[R]
 Used to indicate that the user will not block (wait) for completions on
 the CQ.
 When FI_WAIT_NONE is specified, the application may not call fi_cq_sread
 or fi_cq_sreadfrom.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_UNSPEC\f[]
+.B \- \f[I]FI_WAIT_UNSPEC\f[R]
 Specifies that the user will only wait on the CQ using fabric interface
 calls, such as fi_cq_sread or fi_cq_sreadfrom.
 In this case, the underlying provider may select the most appropriate or
@@ -319,49 +255,37 @@ highest performing wait object available, including custom wait
 mechanisms.
 Applications that select FI_WAIT_UNSPEC are not guaranteed to retrieve
 the underlying wait object.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_SET\f[]
+.B \- \f[I]FI_WAIT_SET\f[R]
 Indicates that the completion queue should use a wait set object to wait
 for completions.
 If specified, the wait_set field must reference an existing wait set
 object.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_FD\f[]
+.B \- \f[I]FI_WAIT_FD\f[R]
 Indicates that the CQ should use a file descriptor as its wait
 mechanism.
 A file descriptor wait object must be usable in select, poll, and epoll
 routines.
 However, a provider may signal an FD wait object by marking it as
 readable, writable, or with an error.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_MUTEX_COND\f[]
+.B \- \f[I]FI_WAIT_MUTEX_COND\f[R]
 Specifies that the CQ should use a pthread mutex and cond variable as a
 wait object.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_YIELD\f[]
+.B \- \f[I]FI_WAIT_YIELD\f[R]
 Indicates that the CQ will wait without a wait object but instead yield
 on every wait.
 Allows usage of fi_cq_sread and fi_cq_sreadfrom through a spin.
-.RS
-.RE
 .TP
-.B \f[I]signaling_vector\f[]
+.B \f[I]signaling_vector\f[R]
 If the FI_AFFINITY flag is set, this indicates the logical cpu number
 (0..max cpu \- 1) that interrupts associated with the CQ should target.
 This field should be treated as a hint to the provider and may be
 ignored if the provider does not support interrupt affinity.
-.RS
-.RE
 .TP
-.B \f[I]wait_cond\f[]
+.B \f[I]wait_cond\f[R]
 By default, when a completion is inserted into a CQ that supports
 blocking reads (fi_cq_sread/fi_cq_sreadfrom), the corresponding wait
 object is signaled.
@@ -369,8 +293,6 @@ Users may specify a condition that must first be met before the wait is
 satisfied.
 This field indicates how the provider should interpret the cond field,
 which describes the condition needed to signal the wait object.
-.RS
-.RE
 .PP
 A wait condition should be treated as an optimization.
 Providers are not required to meet the requirements of the condition
@@ -388,7 +310,7 @@ before at the CQ before the wait is satisfied.
 .PP
 This field is ignored if wait_obj is set to FI_WAIT_NONE.
 .TP
-.B \f[I]wait_set\f[]
+.B \f[I]wait_set\f[R]
 If wait_obj is FI_WAIT_SET, this field references a wait object to which
 the completion queue should attach.
 When an event is inserted into the completion queue, the corresponding
@@ -396,8 +318,6 @@ wait set will be signaled if all necessary conditions are met.
 The use of a wait_set enables an optimized method of waiting for events
 across multiple event and completion queues.
 This field is ignored if wait_obj is not FI_WAIT_SET.
-.RS
-.RE
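A minimal sketch combining these attributes, assuming `domain` is an already-opened resource domain:

```
#include <rdma/fi_domain.h>

/* Sketch: open a CQ reporting struct fi_cq_data_entry completions. */
static int open_cq(struct fid_domain *domain, struct fid_cq **cq)
{
    struct fi_cq_attr attr = {
        .size     = 0,                 /* provider-chosen default depth */
        .format   = FI_CQ_FORMAT_DATA,
        .wait_obj = FI_WAIT_UNSPEC,    /* allow fi_cq_sread() */
    };

    return fi_cq_open(domain, &attr, cq, NULL);
}
```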
 .SS fi_close
 .PP
 The fi_close call releases all resources associated with a completion
@@ -416,7 +336,7 @@ Access to the CQ should be serialized across all calls when fi_control
 is invoked, as it may redirect the implementation of CQ operations.
 The following control commands are usable with a CQ.
 .TP
-.B \f[I]FI_GETWAIT (void **)\f[]
+.B \f[I]FI_GETWAIT (void **)\f[R]
 This command allows the user to retrieve the low\-level wait object
 associated with the CQ.
 The format of the wait\-object is specified during CQ creation, through
@@ -424,8 +344,6 @@ the CQ attributes.
 The fi_control arg parameter should be an address where a pointer to the
 returned wait object will be written.
 See fi_eq.3 for additional details on using fi_control with FI_GETWAIT.
-.RS
-.RE
 .SS fi_cq_read
 .PP
 The fi_cq_read operation performs a non\-blocking read of completion
@@ -440,7 +358,7 @@ CQ returned by the call.
 .PP
 CQs are optimized to report operations which have completed
 successfully.
-Operations which fail are reported \[aq]out of band\[aq].
+Operations which fail are reported `out of band'.
 Such operations are retrieved using the fi_cq_readerr function.
 When an operation that has completed with an unexpected error is
 encountered, it is placed into a temporary error queue.
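A sketch of the resulting polling pattern, assuming the CQ was opened with FI_CQ_FORMAT_DATA:

```
#include <string.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_errno.h>

/* Sketch: drain one completion, falling back to the error queue on
 * -FI_EAVAIL; -FI_EAGAIN simply means the CQ is currently empty. */
static void poll_cq_once(struct fid_cq *cq)
{
    struct fi_cq_data_entry entry;
    struct fi_cq_err_entry err;

    ssize_t n = fi_cq_read(cq, &entry, 1);
    if (n > 0) {
        /* entry.op_context identifies the completed operation */
    } else if (n == -FI_EAVAIL) {
        memset(&err, 0, sizeof err);
        if (fi_cq_readerr(cq, &err, 0) > 0) {
            /* err.err is a positive fabric errno;
             * fi_cq_strerror() can decode err.prov_errno */
        }
    }
}
```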
@@ -510,25 +428,25 @@ fi_cq_readerr is a non\-blocking call, returning immediately whether an
 error completion was found or not.
 .PP
 Error information is reported to the user through
-\f[C]struct\ fi_cq_err_entry\f[].
+\f[C]struct fi_cq_err_entry\f[R].
 The format of this structure is defined below.
 .IP
 .nf
 \f[C]
-struct\ fi_cq_err_entry\ {
-\ \ \ \ void\ \ \ \ \ *op_context;\ /*\ operation\ context\ */
-\ \ \ \ uint64_t\ flags;\ \ \ \ \ \ \ /*\ completion\ flags\ */
-\ \ \ \ size_t\ \ \ len;\ \ \ \ \ \ \ \ \ /*\ size\ of\ received\ data\ */
-\ \ \ \ void\ \ \ \ \ *buf;\ \ \ \ \ \ \ \ /*\ receive\ data\ buffer\ */
-\ \ \ \ uint64_t\ data;\ \ \ \ \ \ \ \ /*\ completion\ data\ */
-\ \ \ \ uint64_t\ tag;\ \ \ \ \ \ \ \ \ /*\ message\ tag\ */
-\ \ \ \ size_t\ \ \ olen;\ \ \ \ \ \ \ \ /*\ overflow\ length\ */
-\ \ \ \ int\ \ \ \ \ \ err;\ \ \ \ \ \ \ \ \ /*\ positive\ error\ code\ */
-\ \ \ \ int\ \ \ \ \ \ prov_errno;\ \ /*\ provider\ error\ code\ */
-\ \ \ \ void\ \ \ \ *err_data;\ \ \ \ /*\ \ error\ data\ */
-\ \ \ \ size_t\ \ \ err_data_size;\ /*\ size\ of\ err_data\ */
+struct fi_cq_err_entry {
+    void     *op_context; /* operation context */
+    uint64_t flags;       /* completion flags */
+    size_t   len;         /* size of received data */
+    void     *buf;        /* receive data buffer */
+    uint64_t data;        /* completion data */
+    uint64_t tag;         /* message tag */
+    size_t   olen;        /* overflow length */
+    int      err;         /* positive error code */
+    int      prov_errno;  /* provider error code */
+    void    *err_data;    /* error data */
+    size_t   err_data_size; /* size of err_data */
 };
-\f[]
+\f[R]
 .fi
 .PP
 The general reason for the error is provided through the err field.
@@ -540,9 +458,9 @@ See field details below for more information on the use of err_data and
 err_data_size.
 .PP
 Note that error completions are generated for all operations, including
-those for which a completion was not requested (e.g.
-an endpoint is configured with FI_SELECTIVE_COMPLETION, but the request
-did not have the FI_COMPLETION flag set).
+those for which a completion was not requested (e.g.\ an endpoint is
+configured with FI_SELECTIVE_COMPLETION, but the request did not have
+the FI_COMPLETION flag set).
 In such cases, providers will return as much information as is made
 available by the underlying software and hardware about the failure;
 other fields will be set to NULL or 0.
@@ -551,7 +469,7 @@ was ignored on input as part of the transfer.
 .PP
 Notable completion error codes are given below.
 .TP
-.B \f[I]FI_EADDRNOTAVAIL\f[]
+.B \f[I]FI_EADDRNOTAVAIL\f[R]
 This error code is used by CQs configured with FI_SOURCE_ERR to report
 completions for which a usable fi_addr_t source address could not be
 found.
@@ -563,8 +481,6 @@ The source address will be in the same format as specified through the
 fi_info addr_format field for the opened domain.
 This may be passed directly into an fi_av_insert call to add the source
 address to the address vector.
-.RS
-.RE
 .SS fi_cq_signal
 .PP
 The fi_cq_signal call will unblock any thread waiting in fi_cq_sread or
@@ -579,13 +495,11 @@ The CQ entry data structures share many of the same fields.
 The meanings of these fields are the same for all CQ entry structure
 formats.
 .TP
-.B \f[I]op_context\f[]
+.B \f[I]op_context\f[R]
 The operation context is the application specified context value that
 was provided with an asynchronous operation.
 The op_context field is valid for all completions that are associated
 with an asynchronous operation.
-.RS
-.RE
 .PP
 For completion events that are not associated with a posted operation,
 this field will be set to NULL.
@@ -593,76 +507,60 @@ This includes completions generated at the target in response to RMA
 write operations that carry CQ data (FI_REMOTE_WRITE | FI_REMOTE_CQ_DATA
 flags set), when the FI_RX_CQ_DATA mode bit is not required.
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 This specifies flags associated with the completed operation.
-The \f[I]Completion Flags\f[] section below lists valid flag values.
+The \f[I]Completion Flags\f[R] section below lists valid flag values.
 Flags are set for all relevant completions.
-.RS
-.RE
 .TP
-.B \f[I]len\f[]
-This len field only applies to completed receive operations (e.g.
-fi_recv, fi_trecv, etc.).
-It indicates the size of received \f[I]message\f[] data \-\- i.e.
-how many data bytes were placed into the associated receive buffer by a
+.B \f[I]len\f[R]
+This len field only applies to completed receive operations
+(e.g.\ fi_recv, fi_trecv, etc.).
+It indicates the size of received \f[I]message\f[R] data \[en] i.e.\ how
+many data bytes were placed into the associated receive buffer by a
 corresponding fi_send/fi_tsend/et al call.
 If an endpoint has been configured with the FI_MSG_PREFIX mode, the len
 also reflects the size of the prefix buffer.
-.RS
-.RE
 .TP
-.B \f[I]buf\f[]
+.B \f[I]buf\f[R]
 The buf field is only valid for completed receive operations, and only
 applies when the receive buffer was posted with the FI_MULTI_RECV flag.
 In this case, buf points to the starting location where the receive data
 was placed.
-.RS
-.RE
 .TP
-.B \f[I]data\f[]
+.B \f[I]data\f[R]
 The data field is only valid if the FI_REMOTE_CQ_DATA completion flag is
 set, and only applies to receive completions.
 If FI_REMOTE_CQ_DATA is set, this field will contain the completion data
 provided by the peer as part of their transmit request.
 The completion data will be given in host byte order.
-.RS
-.RE
 .TP
-.B \f[I]tag\f[]
+.B \f[I]tag\f[R]
 A tag applies only to received messages that occur using the tagged
 interfaces.
 This field contains the tag that was included with the received message.
 The tag will be in host byte order.
-.RS
-.RE
 .TP
-.B \f[I]olen\f[]
+.B \f[I]olen\f[R]
 The olen field applies to received messages.
 It is used to indicate that a received message has overrun the available
 buffer space and has been truncated.
 The olen specifies the amount of data that did not fit into the
 available receive buffer and was discarded.
-.RS
-.RE
 .TP
-.B \f[I]err\f[]
+.B \f[I]err\f[R]
 This err code is a positive fabric errno associated with a completion.
 The err value indicates the general reason for an error, if one
 occurred.
 See fi_errno.3 for a list of possible error codes.
-.RS
-.RE
 .TP
-.B \f[I]prov_errno\f[]
+.B \f[I]prov_errno\f[R]
 On an error, prov_errno may contain a provider specific error code.
 The use of this field and its meaning is provider specific.
 It is intended to be used as a debugging aid.
 See fi_cq_strerror for additional details on converting this error value
 into a human readable string.
-.RS
-.RE
 .TP
-.B \f[I]err_data\f[]
+.B \f[I]err_data\f[R]
 The err_data field is used to return provider specific information, if
 available, about the error.
 On input, err_data should reference a data buffer of size err_data_size.
@@ -674,18 +572,14 @@ See fi_cq_strerror for additional details on converting this error data
 into a human readable string.
 See the compatibility note below on how this field is used for older
 libfabric releases.
-.RS
-.RE
 .TP
-.B \f[I]err_data_size\f[]
+.B \f[I]err_data_size\f[R]
 On input, err_data_size indicates the size of the err_data buffer in
 bytes.
 On output, err_data_size will be set to the number of bytes copied to
 the err_data buffer.
 The err_data information is typically used with fi_cq_strerror to
 provide details about the type of error that occurred.
-.RS
-.RE
 .PP
 For compatibility purposes, the behavior of the err_data and
 err_data_size fields may be modified from that listed above.
@@ -703,87 +597,63 @@ Completion flags provide additional details regarding the completed
 operation.
 The following completion flags are defined.
 .TP
-.B \f[I]FI_SEND\f[]
+.B \f[I]FI_SEND\f[R]
 Indicates that the completion was for a send operation.
 This flag may be combined with an FI_MSG or FI_TAGGED flag.
-.RS
-.RE
 .TP
-.B \f[I]FI_RECV\f[]
+.B \f[I]FI_RECV\f[R]
 Indicates that the completion was for a receive operation.
 This flag may be combined with an FI_MSG or FI_TAGGED flag.
-.RS
-.RE
 .TP
-.B \f[I]FI_RMA\f[]
+.B \f[I]FI_RMA\f[R]
 Indicates that an RMA operation completed.
 This flag may be combined with an FI_READ, FI_WRITE, FI_REMOTE_READ, or
 FI_REMOTE_WRITE flag.
-.RS
-.RE
 .TP
-.B \f[I]FI_ATOMIC\f[]
+.B \f[I]FI_ATOMIC\f[R]
 Indicates that an atomic operation completed.
 This flag may be combined with an FI_READ, FI_WRITE, FI_REMOTE_READ, or
 FI_REMOTE_WRITE flag.
-.RS
-.RE
 .TP
-.B \f[I]FI_MSG\f[]
+.B \f[I]FI_MSG\f[R]
 Indicates that a message\-based operation completed.
 This flag may be combined with an FI_SEND or FI_RECV flag.
-.RS
-.RE
 .TP
-.B \f[I]FI_TAGGED\f[]
+.B \f[I]FI_TAGGED\f[R]
 Indicates that a tagged message operation completed.
 This flag may be combined with an FI_SEND or FI_RECV flag.
-.RS
-.RE
 .TP
-.B \f[I]FI_MULTICAST\f[]
+.B \f[I]FI_MULTICAST\f[R]
 Indicates that a multicast operation completed.
 This flag may be combined with FI_MSG and relevant flags.
 This flag is only guaranteed to be valid for received messages if the
 endpoint has been configured with FI_SOURCE.
-.RS
-.RE
 .TP
-.B \f[I]FI_READ\f[]
+.B \f[I]FI_READ\f[R]
 Indicates that a locally initiated RMA or atomic read operation has
 completed.
 This flag may be combined with an FI_RMA or FI_ATOMIC flag.
-.RS
-.RE
 .TP
-.B \f[I]FI_WRITE\f[]
+.B \f[I]FI_WRITE\f[R]
 Indicates that a locally initiated RMA or atomic write operation has
 completed.
 This flag may be combined with an FI_RMA or FI_ATOMIC flag.
-.RS
-.RE
 .TP
-.B \f[I]FI_REMOTE_READ\f[]
+.B \f[I]FI_REMOTE_READ\f[R]
 Indicates that a remotely initiated RMA or atomic read operation has
 completed.
 This flag may be combined with an FI_RMA or FI_ATOMIC flag.
-.RS
-.RE
 .TP
-.B \f[I]FI_REMOTE_WRITE\f[]
+.B \f[I]FI_REMOTE_WRITE\f[R]
 Indicates that a remotely initiated RMA or atomic write operation has
 completed.
 This flag may be combined with an FI_RMA or FI_ATOMIC flag.
-.RS
-.RE
 .TP
-.B \f[I]FI_REMOTE_CQ_DATA\f[]
+.B \f[I]FI_REMOTE_CQ_DATA\f[R]
 This indicates that remote CQ data is available as part of the
 completion.
-.RS
-.RE
 .TP
-.B \f[I]FI_MULTI_RECV\f[]
+.B \f[I]FI_MULTI_RECV\f[R]
 This flag applies to receive buffers that were posted with the
 FI_MULTI_RECV flag set.
 This completion flag indicates that the original receive buffer
@@ -792,8 +662,6 @@ provider.
 Providers may set this flag on the last message that is received into
 the multi\- recv buffer, or may generate a separate completion that
 indicates that the buffer has been released.
-.RS
-.RE
 .PP
 Applications can distinguish between these two cases by examining the
 completion entry flags field.
@@ -807,30 +675,26 @@ If other flag bits are zero, the provider is reporting that the
 multi\-recv buffer has been released, and the completion entry is not
 associated with a received message.
 .TP
-.B \f[I]FI_MORE\f[]
-See the \[aq]Buffered Receives\[aq] section in \f[C]fi_msg\f[](3) for
-more details.
+.B \f[I]FI_MORE\f[R]
+See the `Buffered Receives' section in \f[C]fi_msg\f[R](3) for more
+details.
 This flag is associated with receive completions on endpoints that have
 FI_BUFFERED_RECV mode enabled.
 When set to one, it indicates that the buffer referenced by the
 completion is limited by the FI_OPT_BUFFERED_LIMIT threshold, and
 additional message data must be retrieved by the application using an
 FI_CLAIM operation.
-.RS
-.RE
 .TP
-.B \f[I]FI_CLAIM\f[]
-See the \[aq]Buffered Receives\[aq] section in \f[C]fi_msg\f[](3) for
-more details.
+.B \f[I]FI_CLAIM\f[R]
+See the `Buffered Receives' section in \f[C]fi_msg\f[R](3) for more
+details.
 This flag is set on completions associated with receive operations that
 claim buffered receive data.
 Note that this flag only applies to endpoints configured with the
 FI_BUFFERED_RECV mode bit.
-.RS
-.RE
 .SH COMPLETION EVENT SEMANTICS
 .PP
-Libfabric defines several completion \[aq]levels\[aq], identified using
+Libfabric defines several completion `levels', identified using
 operational flags.
 Each flag indicates the soonest that a completion event may be generated
 by a provider, and the assumptions that an application may make upon
@@ -846,32 +710,30 @@ is guaranteed.
 To help understand the conceptual differences in completion levels,
 consider mailing a letter.
 Placing the letter into the local mailbox for pick\-up is similar to
-\[aq]inject complete\[aq].
+`inject complete'.
 Having the letter picked up and dropped off at the destination mailbox
-is equivalent to \[aq]transmit complete\[aq].
-The \[aq]delivery complete\[aq] semantic is a stronger guarantee, with a
-person at the destination signing for the letter.
+is equivalent to `transmit complete'.
+The `delivery complete' semantic is a stronger guarantee, with a person
+at the destination signing for the letter.
 However, the person who signed for the letter is not necessarily the
 intended recipient.
-The \[aq]match complete\[aq] option is similar to delivery complete, but
+The `match complete' option is similar to delivery complete, but
 requires the intended recipient to sign for the letter.
 .PP
-The \[aq]commit complete\[aq] level has different semantics than the
-previously mentioned levels.
+The `commit complete' level has different semantics than the previously
+mentioned levels.
 Commit complete would be closer to the letter arriving at the
 destination and being placed into a fire proof safe.
 .PP
 The operational flags for the described completion levels are defined
 below.
 .TP
-.B \f[I]FI_INJECT_COMPLETE\f[]
+.B \f[I]FI_INJECT_COMPLETE\f[R]
 Indicates that a completion should be generated when the source
 buffer(s) may be reused.
 A completion guarantees that the buffers will not be read from again and
 the application may reclaim them.
 No other guarantees are made with respect to the state of the operation.
-.RS
-.RE
 .PP
 Example: A provider may generate this completion event after copying the
 source buffer into a network buffer, either in host memory or on the
@@ -891,12 +753,10 @@ It does not apply to operations that do not generate a completion queue
 entry, such as the fi_inject operation, and is not subject to the
 inject_size message limit restriction.
 .TP
-.B \f[I]FI_TRANSMIT_COMPLETE\f[]
+.B \f[I]FI_TRANSMIT_COMPLETE\f[R]
 Indicates that a completion should be generated when the transmit
 operation has completed relative to the local provider.
 The exact behavior is dependent on the endpoint type.
-.RS
-.RE
 .PP
 For reliable endpoints:
 .PP
@@ -919,7 +779,7 @@ A completion guarantees that the operation is no longer dependent on
 local resources.
 The state of the operation within the fabric is not defined.
 .TP
-.B \f[I]FI_DELIVERY_COMPLETE\f[]
+.B \f[I]FI_DELIVERY_COMPLETE\f[R]
 Indicates that a completion should not be generated until an operation
 has been processed by the destination endpoint(s).
 A completion guarantees that the result of the operation is available;
@@ -927,14 +787,12 @@ however, additional steps may need to be taken at the destination to
 retrieve the results.
 For example, an application may need to provide receive buffers in
 order to retrieve messages that were buffered by the provider.
-.RS
-.RE
 .PP
 Delivery complete indicates that the message has been processed by the
 peer.
 If an application buffer was ready to receive the results of the message
 when it arrived, then delivery complete indicates that the data was
-placed into the application\[aq]s buffer.
+placed into the application\[cq]s buffer.
 .PP
 This completion mode applies only to reliable endpoints.
 For operations that return data to the initiator, such as RMA read or
@@ -942,7 +800,7 @@ atomic\-fetch, the source endpoint is also considered a destination
 endpoint.
 This is the default completion mode for such operations.
 .TP
-.B \f[I]FI_MATCH_COMPLETE\f[]
+.B \f[I]FI_MATCH_COMPLETE\f[R]
 Indicates that a completion should be generated only after the operation
 has been matched with an application specified buffer.
 Operations using this completion semantic are dependent on the
@@ -952,22 +810,18 @@ acknowledgements or lengthy delays.
 However, this completion model enables peer applications to synchronize
 their execution.
 Many providers may not support this semantic.
-.RS
-.RE
 .TP
-.B \f[I]FI_COMMIT_COMPLETE\f[]
+.B \f[I]FI_COMMIT_COMPLETE\f[R]
 Indicates that a completion should not be generated (locally or at the
+peer) until the result of an operation has been made persistent.
 A completion guarantees that the result is both available and durable,
 in the case of power failure.
-.RS
-.RE
 .PP
 This completion mode applies only to operations that target persistent
 memory regions over reliable endpoints.
 This completion mode is experimental.
 .TP
-.B \f[I]FI_FENCE\f[]
+.B \f[I]FI_FENCE\f[R]
 This is not a completion level, but plays a role in the completion
 ordering between operations that would not normally be ordered.
 An operation that is marked with the FI_FENCE flag and all operations
@@ -981,14 +835,242 @@ FI_FENCE, then its completion indicates prior operations have met the
 semantic required for FI_DELIVERY_COMPLETE.
 This is true even if the prior operation was posted with a lower
 completion level, such as FI_TRANSMIT_COMPLETE or FI_INJECT_COMPLETE.
-.RS
-.RE
 .PP
 Note that a completion generated for an operation posted prior to the
 fenced operation only guarantees that the completion level that was
 originally requested has been met.
 It is the completion of the fenced operation that guarantees that the
 additional semantics have been met.
+.PP
+The above completion semantics are defined with respect to the initiator
+of the operation.
+The different semantics are useful for describing when the initiator may
+re\-use a data buffer, and for guaranteeing what state a transfer must
+reach prior to a completion being generated.
+This allows applications to determine appropriate error handling in case
+of communication failures.
+.SH TARGET COMPLETION SEMANTICS
+.PP
+The completion semantic at the target is used to determine when data at
+the target is visible to the peer application.
+Visibility indicates that a memory read to the same address that was the
+target of a data transfer will return the results of the transfer.
+The target of a transfer can be identified by the initiator, as may be
+the case for RMA and atomic operations, or determined by the target, for
+example by providing a matching receive buffer.
+Global visibility indicates that the results are available regardless of
+where the memory read originates.
+For example, the read could come from a process running on a host CPU,
+it may be accessed by subsequent data transfer over the fabric, or read
+from a peer device such as a GPU.
+.PP
+In terms of completion semantics, visibility usually indicates that the
+transfer meets the FI_DELIVERY_COMPLETE requirements from the
+perspective of the target.
+The target completion semantic may be, but is not necessarily, linked
+with the completion semantic specified by the initiator of the transfer.
+.PP
+Often, target processes do not explicitly state a desired completion
+semantic and instead rely on the default semantic.
+The default behavior is based on several factors, including:
+.IP \[bu] 2
+whether a completion event is generated at the target
+.IP \[bu] 2
+the type of transfer involved (e.g.\ msg vs RMA)
+.IP \[bu] 2
+endpoint data and message ordering guarantees
+.IP \[bu] 2
+properties of the targeted memory buffer
+.IP \[bu] 2
+the initiator\[cq]s specified completion semantic
+.PP
+Broadly, target completion semantics are grouped based on whether or not
+the transfer generates a completion event at the target.
+This includes writing a CQ entry or updating a completion counter.
+In common use cases, transfers that use a message interface (FI_MSG or
+FI_TAGGED) typically generate target events, while transfers involving
+an RMA interface (FI_RMA or FI_ATOMIC) often do not.
+There are exceptions to both these cases, depending on endpoint to CQ
+and counter bindings and operational flags.
+For example, RMA writes that carry remote CQ data will generate a
+completion event at the target, and are frequently used to convey
+visibility to the target application.
+The general guidelines for target side semantics are described below,
+followed by exceptions that modify that behavior.
+.PP
+By default, completions generated at the target indicate that the
+transferred data is immediately available to be read from the target
+buffer.
+That is, the target sees FI_DELIVERY_COMPLETE (or better) semantics,
+even if the initiator requested lower semantics.
+For applications using only data buffers allocated from host memory,
+this is often sufficient.
+.PP
+For operations that do not generate a completion event at the target,
+the visibility of the data at the target may need to be inferred based
+on subsequent operations that do generate target completions.
+Absent a target completion, when a completion of an operation is written
+at the initiator, the visibility semantic of the operation at the target
+aligns with the initiator completion semantic.
+For instance, if an RMA operation completes at the initiator as either
+FI_INJECT_COMPLETE or FI_TRANSMIT_COMPLETE, the data visibility at the
+target is not guaranteed.
+.PP
+One or more of the following mechanisms can be used by the target
+process to guarantee that the results of a data transfer that did not
+generate a completion at the target are now visible.
+This list is not exhaustive, but covers common uses.
+In the descriptions below, the first transfer does not result in a
+completion event at the target, but is eventually followed by a transfer
+which does.
+.IP \[bu] 2
+If the endpoint guarantees message ordering between two transfers, the
+target completion of a second transfer will indicate that the data from
+the first transfer is available.
+For example, if the endpoint supports send after write ordering
+(FI_ORDER_SAW), then a receive completion corresponding to the send will
+indicate that the write data is available.
+This holds independent of the initiator\[cq]s completion semantic for
+either the write or send.
+When ordering is guaranteed, the second transfer can be queued with the
+provider immediately after queuing the first.
+.IP \[bu] 2
+If the endpoint does not guarantee message ordering, the initiator must
+take additional steps to ensure visibility.
+If the initiator requests FI_DELIVERY_COMPLETE semantics for the first
+operation, the initiator can wait for the operation to complete locally.
+Once the completion has been read, the target completion of a second
+transfer will indicate that the first transfer\[cq]s data is visible.
+.IP \[bu] 2
+Alternatively, if message ordering is not guaranteed by the endpoint,
+the initiator can use the FI_FENCE and FI_DELIVERY_COMPLETE flags on the
+second data transfer to force the first transfer to meet the
+FI_DELIVERY_COMPLETE semantics, as sketched in the example following
+this list.
+If the second transfer generates a completion at the target, that will
+indicate that the data is visible.
+Otherwise, a target completion for any transfer after the fenced
+operation will indicate that the data is visible.
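+.PP
+A sketch of this last mechanism (assuming \f[C]ep\f[R] is an enabled
+endpoint and \f[C]rma_msg\f[R] / \f[C]notify_msg\f[R] are populated
+\f[C]struct fi_msg_rma\f[R] and \f[C]struct fi_msg\f[R] descriptors):
+.IP
+.nf
+\f[C]
+/* First transfer: RMA write at any completion level */
+ret = fi_writemsg(ep, &rma_msg, 0);
+
+/* Second transfer: the fence forces the write to meet
+ * FI_DELIVERY_COMPLETE before the send is processed */
+ret = fi_sendmsg(ep, &notify_msg, FI_FENCE | FI_DELIVERY_COMPLETE);
+\f[R]
+.fi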
+.PP
+The above semantics apply for transfers targeting traditional host
+memory buffers.
+However, the behavior may differ when device memory and/or persistent
+memory is involved (FI_HMEM and FI_PMEM capability bits).
+When heterogeneous memory is involved, the concept of memory domains
+comes into play.
+Memory domains identify the physical separation of memory, which may or
+may not be accessible through the same virtual address space.
+See the \f[C]fi_mr\f[R](3) man page for further details on memory
+domains.
+.PP
+Completion ordering and data visibility are only well\-defined for
+transfers that target the same memory domain.
+Applications need to be aware of ordering and visibility differences
+when transfers target different memory domains.
+Additionally, applications must consider the memory domain to which
+completions themselves are written and whether it differs from the
+memory domain targeted by a transfer.
+In some situations, either the provider or application may need to call
+device specific APIs to synchronize or flush device memory caches in
+order to achieve the desired data visibility.
+.PP
+When heterogeneous memory is in use, the default target completion
+semantic for transfers that generate a completion at the target is still
+FI_DELIVERY_COMPLETE; however, applications should be aware that
+providers may incur a performance penalty in order to meet this
+requirement.
+.PP
+For example, a target process may be using a GPU to accelerate
+computations.
+A memory region mapping to memory on the GPU may be exposed to peers as
+either an RMA target or posted locally as a receive buffer.
+In this case, the application is concerned with two memory domains \[en]
+system and GPU memory.
+Completions are written to system memory.
+.PP
+Continuing the example, a peer process sends a tagged message.
+That message is matched with the receive buffer located in GPU memory.
+The NIC copies the data from the network into the receive buffer and
+writes an entry into the completion queue.
+Note that both memory domains were accessed as part of this transfer.
+The message data was directed to the GPU memory, but the completion went
+to host memory.
+Because separate memory domains may not be synchronized with each other,
+it is possible for the host CPU to see and process the completion entry
+before the transfer to the GPU memory is visible to either the host CPU
+or even software running on the GPU.
+From the perspective of the \f[I]provider\f[R], visibility of the
+completion does not imply visibility of data written to the GPU\[cq]s
+memory domain.
+.PP
+The default completion semantic at the target \f[I]application\f[R] for
+message operations is FI_DELIVERY_COMPLETE.
+An anticipated provider implementation in this situation is for the
+provider software running on the host CPU to intercept the CQ entry,
+detect that the data landed in heterogeneous memory, and perform the
+necessary device synchronization or flush operation before reporting the
+completion up to the application.
+This ensures that the data is visible to CPU \f[I]and\f[R] GPU software
+prior to the application processing the completion.
+.PP
+In addition to the cost of provider software intercepting completions
+and checking if a transfer targeted heterogeneous memory, device
+synchronization itself may impact performance.
+As a result, applications can request a lower completion semantic when
+posting receives.
+That indicates to the provider that the application will be responsible
+for handling any device specific flush operations that might be needed.
+See \f[C]fi_msg\f[R](3) FLAGS.
+.PP
+For data transfers that do not generate a completion at the target, such
+as RMA or atomics, it is the responsibility of the application to ensure
+that all target buffers meet the necessary visibility requirements of
+the application.
+The previously mentioned bulleted methods for notifying the target that
+the data is visible may not be sufficient, as the provider software at
+the target could lack the context needed to ensure visibility.
+This implies that the application may need to call device
+synchronization/flush APIs directly.
+.PP
+For example, a peer application could perform several RMA writes that
+target GPU memory buffers.
+If the provider offloads RMA operations into the NIC, the provider
+software at the target will be unaware that the RMA operations have
+occurred.
+If the peer sends a message to the target application that indicates
+that the RMA operations are done, the application must ensure that the
+RMA data is visible to the host CPU or GPU prior to executing code that
+accesses the data.
+The target completion generated for the received message is not
+sufficient, even if send\-after\-write ordering is supported.
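+.PP
+As an illustration only \[en] the correct call is device specific \[en]
+a CUDA based application might synchronize the device before reading
+the buffer:
+.IP
+.nf
+\f[C]
+/* The peer has signaled (e.g. via a received message) that its RMA
+ * writes to gpu_buf are done.  A device level synchronization such as
+ * cudaDeviceSynchronize() may be needed before host or device code
+ * reads the data; the exact mechanism depends on the device and
+ * provider. */
+cudaDeviceSynchronize();
+process_gpu_data(gpu_buf);   /* hypothetical application function */
+\f[R]
+.fi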
+.PP
+Most target heterogeneous memory completion semantics map to
+FI_TRANSMIT_COMPLETE or FI_DELIVERY_COMPLETE.
+Persistent memory (FI_PMEM capability), however, is often used with
+FI_COMMIT_COMPLETE semantics.
+Heterogeneous completion concepts still apply.
+.PP
+For transfers flagged by the initiator with FI_COMMIT_COMPLETE, a
+completion at the target indicates that the results are visible and
+durable.
+For transfers targeting persistent memory, but using a different
+completion semantic at the initiator, the visibility at the target is
+similar to that described above.
+Durability is only associated with transfers marked with
+FI_COMMIT_COMPLETE.
+.PP
+For transfers targeting persistent memory that request
+FI_DELIVERY_COMPLETE, a completion at either the initiator or the
+target indicates that the data is visible.
+Visibility at the target can be conveyed using one of the mechanisms
+described above \[en] generating a target completion, sending a message
+from the initiator, etc.
+Similarly, if the initiator requested FI_TRANSMIT_COMPLETE, then
+additional steps are needed to ensure visibility at the target.
+For example, the transfer can generate a completion at the target, which
+would indicate visibility, but not durability.
+The initiator can also follow the transfer with another operation that
+forces visibility, such as using FI_FENCE in conjunction with
+FI_DELIVERY_COMPLETE.
 .SH NOTES
 .PP
 A completion queue must be bound to at least one enabled endpoint before
@@ -1004,7 +1086,7 @@ completion data when they are valid: FI_REMOTE_READ and FI_REMOTE_WRITE
 FI_MULTI_RECV.
 .PP
 If a completion queue has been overrun, it will be placed into an
-\[aq]overrun\[aq] state.
+`overrun' state.
 Read operations will continue to return any valid, non\-corrupted
 completions, if available.
 After all valid completions have been retrieved, any attempt to read the
@@ -1016,8 +1098,6 @@ report additional completions once the overrun occurs.
 .B fi_cq_open / fi_cq_signal
 Returns 0 on success.
 On error, a negative value corresponding to fabric errno is returned.
-.RS
-.RE
 .PP
 fi_cq_read / fi_cq_readfrom / fi_cq_readerr / fi_cq_sread /
 fi_cq_sreadfrom : On success, returns the number of completion events
@@ -1032,19 +1112,16 @@ completion queue.
 On error, a negative value corresponding to fabric errno is returned.
 If the timeout expires or the calling thread is signaled and no data is
 available to be read from the completion queue, \-FI_EAGAIN is returned.
-.RS
-.RE
 .TP
 .B fi_cq_strerror
 Returns a character string interpretation of the provider specific error
 returned with a completion.
-.RS
-.RE
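+.PP
+A typical polling loop (a sketch; \f[C]cq\f[R] is an open completion
+queue):
+.IP
+.nf
+\f[C]
+struct fi_cq_entry entry;
+struct fi_cq_err_entry err;
+ssize_t ret;
+
+do {
+    ret = fi_cq_read(cq, &entry, 1);
+} while (ret == \-FI_EAGAIN);
+
+if (ret == \-FI_EAVAIL) {
+    /* an error completion is available; err.err holds the fabric
+     * errno and fi_cq_strerror() can decode err.prov_errno */
+    fi_cq_readerr(cq, &err, 0);
+}
+\f[R]
+.fi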
 .PP
-Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[].
+Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R].
 .SH SEE ALSO
 .PP
-\f[C]fi_getinfo\f[](3), \f[C]fi_endpoint\f[](3), \f[C]fi_domain\f[](3),
-\f[C]fi_eq\f[](3), \f[C]fi_cntr\f[](3), \f[C]fi_poll\f[](3)
+\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3),
+\f[C]fi_domain\f[R](3), \f[C]fi_eq\f[R](3), \f[C]fi_cntr\f[R](3),
+\f[C]fi_poll\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_domain.3 b/deps/libfabric/man/man3/fi_domain.3
index 77845c8b8a85d03560176897fa5b76f3a79c4a7c..f7048f379730cccb5149638f5b2b179d5f62c1af 100644
--- a/deps/libfabric/man/man3/fi_domain.3
+++ b/deps/libfabric/man/man3/fi_domain.3
@@ -1,7 +1,7 @@
 .\"t
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_domain" "3" "2020\-07\-30" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_domain" "3" "2021\-10\-07" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -10,63 +10,49 @@ fi_domain \- Open a fabric access domain
 .IP
 .nf
 \f[C]
-#include\ <rdma/fabric.h>
+#include <rdma/fabric.h>
 
-#include\ <rdma/fi_domain.h>
+#include <rdma/fi_domain.h>
 
-int\ fi_domain(struct\ fid_fabric\ *fabric,\ struct\ fi_info\ *info,
-\ \ \ \ struct\ fid_domain\ **domain,\ void\ *context);
+int fi_domain(struct fid_fabric *fabric, struct fi_info *info,
+    struct fid_domain **domain, void *context);
 
-int\ fi_close(struct\ fid\ *domain);
+int fi_close(struct fid *domain);
 
-int\ fi_domain_bind(struct\ fid_domain\ *domain,\ struct\ fid\ *eq,
-\ \ \ \ uint64_t\ flags);
+int fi_domain_bind(struct fid_domain *domain, struct fid *eq,
+    uint64_t flags);
 
-int\ fi_open_ops(struct\ fid\ *domain,\ const\ char\ *name,\ uint64_t\ flags,
-\ \ \ \ void\ **ops,\ void\ *context);
+int fi_open_ops(struct fid *domain, const char *name, uint64_t flags,
+    void **ops, void *context);
 
-int\ fi_set_ops(struct\ fid\ *domain,\ const\ char\ *name,\ uint64_t\ flags,
-\ \ \ \ void\ *ops,\ void\ *context);
-\f[]
+int fi_set_ops(struct fid *domain, const char *name, uint64_t flags,
+    void *ops, void *context);
+\f[R]
 .fi
 .SH ARGUMENTS
 .TP
-.B \f[I]fabric\f[]
+.B \f[I]fabric\f[R]
 Fabric domain
-.RS
-.RE
 .TP
-.B \f[I]info\f[]
+.B \f[I]info\f[R]
 Fabric information, including domain capabilities and attributes.
-.RS
-.RE
 .TP
-.B \f[I]domain\f[]
+.B \f[I]domain\f[R]
 An opened access domain.
-.RS
-.RE
 .TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
 User specified context associated with the domain.
 This context is returned as part of any asynchronous event associated
 with the domain.
-.RS
-.RE
 .TP
-.B \f[I]eq\f[]
+.B \f[I]eq\f[R]
 Event queue for asynchronous operations initiated on the domain.
-.RS
-.RE
 .TP
-.B \f[I]name\f[]
+.B \f[I]name\f[R]
 Name associated with an interface.
-.RS
-.RE
 .TP
-.B \f[I]ops\f[]
+.B \f[I]ops\f[R]
 Fabric interface operations.
-.RS
-.RE
 .SH DESCRIPTION
 .PP
 An access domain typically refers to a physical or virtual NIC or
@@ -92,7 +78,7 @@ documentation.
 .PP
 fi_set_ops assigns callbacks that a provider should invoke in place of
 performing selected tasks.
-This allows users to modify or control a provider\[aq]s default
+This allows users to modify or control a provider\[cq]s default
 behavior.
 Conceptually, it allows the user to hook specific functions used by a
 provider and replace them with their own.
@@ -101,7 +87,7 @@ The operations being modified are identified using a well\-known
 character string, passed as the name parameter.
 The format of the ops parameter is dependent upon the name value.
 The ops parameter will reference a structure containing the callbacks
-and other fields needed by the provider to invoke the user\[aq]s
+and other fields needed by the provider to invoke the user\[cq]s
 functions.
 .PP
 If a provider accepts the override, it will return FI_SUCCESS.
@@ -112,7 +98,7 @@ Overrides should be set prior to allocating resources on the domain.
 The following fi_set_ops operations and corresponding callback
 structures are defined.
 .PP
-\f[B]FI_SET_OPS_HMEM_OVERRIDE \-\- Heterogeneous Memory Overrides\f[]
+\f[B]FI_SET_OPS_HMEM_OVERRIDE \[en] Heterogeneous Memory Overrides\f[R]
 .PP
 HMEM override allows users to override HMEM related operations a
 provider may perform.
@@ -128,44 +114,38 @@ The following is the HMEM override operation name and structure.
 .IP
 .nf
 \f[C]
-#define\ FI_SET_OPS_HMEM_OVERRIDE\ "hmem_override_ops"
+#define FI_SET_OPS_HMEM_OVERRIDE \[dq]hmem_override_ops\[dq]
 
-struct\ fi_hmem_override_ops\ {
-\ \ \ \ size_t\ \ size;
+struct fi_hmem_override_ops {
+    size_t  size;
 
-\ \ \ \ ssize_t\ (*copy_from_hmem_iov)(void\ *dest,\ size_t\ size,
-\ \ \ \ \ \ \ \ enum\ fi_hmem_iface\ iface,\ uint64_t\ device,\ const\ struct\ iovec\ *hmem_iov,
-\ \ \ \ \ \ \ \ size_t\ hmem_iov_count,\ uint64_t\ hmem_iov_offset);
+    ssize_t (*copy_from_hmem_iov)(void *dest, size_t size,
+        enum fi_hmem_iface iface, uint64_t device, const struct iovec *hmem_iov,
+        size_t hmem_iov_count, uint64_t hmem_iov_offset);
 
-\ \ \ \ ssize_t\ (*copy_to_hmem_iov)(enum\ fi_hmem_iface\ iface,\ uint64_t\ device,
-\ \ \ \ const\ struct\ iovec\ *hmem_iov,\ size_t\ hmem_iov_count,
-\ \ \ \ \ \ \ \ uint64_t\ hmem_iov_offset,\ const\ void\ *src,\ size_t\ size);
+    ssize_t (*copy_to_hmem_iov)(enum fi_hmem_iface iface, uint64_t device,
+    const struct iovec *hmem_iov, size_t hmem_iov_count,
+        uint64_t hmem_iov_offset, const void *src, size_t size);
 };
-\f[]
+\f[R]
 .fi
 .PP
 All fields in struct fi_hmem_override_ops must be set (non\-null) to a
 valid value.
 .TP
-.B \f[I]size\f[]
+.B \f[I]size\f[R]
 This should be set to the sizeof(struct fi_hmem_override_ops).
 The size field is used for forward and backward compatibility purposes.
-.RS
-.RE
 .TP
-.B \f[I]copy_from_hmem_iov\f[]
+.B \f[I]copy_from_hmem_iov\f[R]
 Copy data from the device/hmem to host memory.
 This function should return a negative fi_errno on error, or the number
 of bytes copied on success.
-.RS
-.RE
 .TP
-.B \f[I]copy_to_hmem_iov\f[]
+.B \f[I]copy_to_hmem_iov\f[R]
 Copy data from host memory to the device/hmem.
 This function should return a negative fi_errno on error, or the number
 of bytes copied on success.
-.RS
-.RE
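+.PP
+A sketch of installing the override (the \f[C]my_copy_*\f[R] callbacks
+are application supplied placeholders):
+.IP
+.nf
+\f[C]
+static ssize_t my_copy_from_hmem_iov(void *dest, size_t size,
+        enum fi_hmem_iface iface, uint64_t device,
+        const struct iovec *hmem_iov, size_t hmem_iov_count,
+        uint64_t hmem_iov_offset)
+{
+    /* application specific device to host copy goes here */
+    return \-FI_ENOSYS;
+}
+
+static ssize_t my_copy_to_hmem_iov(enum fi_hmem_iface iface,
+        uint64_t device, const struct iovec *hmem_iov,
+        size_t hmem_iov_count, uint64_t hmem_iov_offset,
+        const void *src, size_t size)
+{
+    /* application specific host to device copy goes here */
+    return \-FI_ENOSYS;
+}
+
+struct fi_hmem_override_ops hmem_ops = {
+    .size               = sizeof(hmem_ops),
+    .copy_from_hmem_iov = my_copy_from_hmem_iov,
+    .copy_to_hmem_iov   = my_copy_to_hmem_iov,
+};
+
+ret = fi_set_ops(&domain\->fid, FI_SET_OPS_HMEM_OVERRIDE, 0,
+                 &hmem_ops, NULL);
+\f[R]
+.fi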
 .SS fi_domain_bind
 .PP
 Associates an event queue with the domain.
@@ -182,9 +162,9 @@ asynchronously, with the completion reported through the event queue.
 If an event queue is not bound to the domain with the FI_REG_MR flag,
 then memory registration requests complete synchronously.
 .PP
-See \f[C]fi_av_bind\f[](3), \f[C]fi_ep_bind\f[](3),
-\f[C]fi_mr_bind\f[](3), \f[C]fi_pep_bind\f[](3), and
-\f[C]fi_scalable_ep_bind\f[](3) for more information.
+See \f[C]fi_av_bind\f[R](3), \f[C]fi_ep_bind\f[R](3),
+\f[C]fi_mr_bind\f[R](3), \f[C]fi_pep_bind\f[R](3), and
+\f[C]fi_scalable_ep_bind\f[R](3) for more information.
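+.PP
+For example (a sketch; \f[C]domain\f[R] and \f[C]eq\f[R] are assumed to
+have been opened elsewhere), binding with FI_REG_MR makes registration
+requests complete asynchronously through the event queue:
+.IP
+.nf
+\f[C]
+ret = fi_domain_bind(domain, &eq\->fid, FI_REG_MR);
+\f[R]
+.fi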
 .SS fi_close
 .PP
 The fi_close call is used to release all resources associated with a
@@ -193,41 +173,41 @@ All objects associated with the opened domain must be released prior to
 calling fi_close, otherwise the call will return \-FI_EBUSY.
 .SH DOMAIN ATTRIBUTES
 .PP
-The \f[C]fi_domain_attr\f[] structure defines the set of attributes
+The \f[C]fi_domain_attr\f[R] structure defines the set of attributes
 associated with a domain.
 .IP
 .nf
 \f[C]
-struct\ fi_domain_attr\ {
-\ \ \ \ struct\ fid_domain\ \ \ \ \ *domain;
-\ \ \ \ char\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ *name;
-\ \ \ \ enum\ fi_threading\ \ \ \ \ threading;
-\ \ \ \ enum\ fi_progress\ \ \ \ \ \ control_progress;
-\ \ \ \ enum\ fi_progress\ \ \ \ \ \ data_progress;
-\ \ \ \ enum\ fi_resource_mgmt\ resource_mgmt;
-\ \ \ \ enum\ fi_av_type\ \ \ \ \ \ \ av_type;
-\ \ \ \ int\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ mr_mode;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ mr_key_size;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ cq_data_size;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ cq_cnt;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ ep_cnt;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ tx_ctx_cnt;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ rx_ctx_cnt;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ max_ep_tx_ctx;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ max_ep_rx_ctx;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ max_ep_stx_ctx;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ max_ep_srx_ctx;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ cntr_cnt;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ mr_iov_limit;
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ caps;
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ mode;
-\ \ \ \ uint8_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ *auth_key;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ auth_key_size;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ max_err_data;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ mr_cnt;
-\ \ \ \ uint32_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ tclass;
+struct fi_domain_attr {
+    struct fid_domain     *domain;
+    char                  *name;
+    enum fi_threading     threading;
+    enum fi_progress      control_progress;
+    enum fi_progress      data_progress;
+    enum fi_resource_mgmt resource_mgmt;
+    enum fi_av_type       av_type;
+    int                   mr_mode;
+    size_t                mr_key_size;
+    size_t                cq_data_size;
+    size_t                cq_cnt;
+    size_t                ep_cnt;
+    size_t                tx_ctx_cnt;
+    size_t                rx_ctx_cnt;
+    size_t                max_ep_tx_ctx;
+    size_t                max_ep_rx_ctx;
+    size_t                max_ep_stx_ctx;
+    size_t                max_ep_srx_ctx;
+    size_t                cntr_cnt;
+    size_t                mr_iov_limit;
+    uint64_t              caps;
+    uint64_t              mode;
+    uint8_t               *auth_key;
+    size_t                auth_key_size;
+    size_t                max_err_data;
+    size_t                mr_cnt;
+    uint32_t              tclass;
 };
-\f[]
+\f[R]
 .fi
 .SS domain
 .PP
@@ -254,15 +234,13 @@ Applications which can guarantee serialization in their access of
 provider allocated resources and interfaces enable a provider to
 eliminate lower\-level locks.
 .TP
-.B \f[I]FI_THREAD_COMPLETION\f[]
+.B \f[I]FI_THREAD_COMPLETION\f[R]
 The completion threading model is intended for providers that make use
 of manual progress.
 Applications must serialize access to all objects that are associated
 through a shared completion structure.
 This includes endpoint, transmit context, receive context, completion
 queue, counter, wait set, and poll set objects.
-.RS
-.RE
 .PP
 For example, threads must serialize access to an endpoint and its bound
 completion queue(s) and/or counters.
@@ -272,23 +250,19 @@ serialized.
 The use of FI_THREAD_COMPLETION can increase parallelism over
 FI_THREAD_SAFE, but requires the use of isolated resources.
 .TP
-.B \f[I]FI_THREAD_DOMAIN\f[]
+.B \f[I]FI_THREAD_DOMAIN\f[R]
 A domain serialization model requires applications to serialize access
 to all objects belonging to a domain.
-.RS
-.RE
 .TP
-.B \f[I]FI_THREAD_ENDPOINT\f[]
+.B \f[I]FI_THREAD_ENDPOINT\f[R]
 The endpoint threading model is similar to FI_THREAD_FID, but with the
 added restriction that serialization is required when accessing the same
 endpoint, even if multiple transmit and receive contexts are used.
 Conceptually, FI_THREAD_ENDPOINT maps well to providers that implement
 fabric services in hardware but use a single command queue to access
 different data flows.
-.RS
-.RE
 .TP
-.B \f[I]FI_THREAD_FID\f[]
+.B \f[I]FI_THREAD_FID\f[R]
 A fabric descriptor (FID) serialization model requires applications to
 serialize access to individual fabric resources associated with data
 transfer operations and completions.
@@ -296,8 +270,6 @@ Multiple threads must be serialized when accessing the same endpoint,
 transmit context, receive context, completion queue, counter, wait set,
 or poll set.
 Serialization is required only by threads accessing the same object.
-.RS
-.RE
 .PP
 For example, one thread may be initiating a data transfer on an
 endpoint, while another thread reads from a completion queue associated
@@ -316,21 +288,17 @@ Conceptually, FI_THREAD_FID maps well to providers that implement fabric
 services in hardware and provide separate command queues to different
 data flows.
 .TP
-.B \f[I]FI_THREAD_SAFE\f[]
+.B \f[I]FI_THREAD_SAFE\f[R]
 A thread safe serialization model allows a multi\-threaded application
 to access any allocated resources through any interface without
 restriction.
 All providers are required to support FI_THREAD_SAFE.
-.RS
-.RE
 .TP
-.B \f[I]FI_THREAD_UNSPEC\f[]
+.B \f[I]FI_THREAD_UNSPEC\f[R]
 This value indicates that no threading model has been defined.
 It may be used on input hints to the fi_getinfo call.
 When specified, providers will return a threading model that allows for
 the greatest level of parallelism.
-.RS
-.RE
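+.PP
+A sketch of requesting a specific threading model through fi_getinfo
+hints (the version numbers are illustrative):
+.IP
+.nf
+\f[C]
+struct fi_info *hints, *info;
+
+hints = fi_allocinfo();
+hints\->domain_attr\->threading = FI_THREAD_DOMAIN;
+ret = fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0, hints, &info);
+\f[R]
+.fi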
 .SS Progress Models (control_progress / data_progress)
 .PP
 Progress is the ability of the underlying implementation to complete
@@ -363,7 +331,7 @@ and acknowledgement processing.
 To balance between performance and ease of use, two progress models are
 defined.
 .TP
-.B \f[I]FI_PROGRESS_AUTO\f[]
+.B \f[I]FI_PROGRESS_AUTO\f[R]
 This progress model indicates that the provider will make forward
 progress on an asynchronous operation without further intervention by
 the application.
@@ -371,15 +339,13 @@ When FI_PROGRESS_AUTO is provided as output to fi_getinfo in the absence
 of any progress hints, it often indicates that the desired functionality
 is implemented by the provider hardware or is a standard service of the
 operating system.
-.RS
-.RE
 .PP
 All providers are required to support FI_PROGRESS_AUTO.
 However, if a provider does not natively support automatic progress,
 forcing the use of FI_PROGRESS_AUTO may result in threads being
 allocated below the fabric interfaces.
 .TP
-.B \f[I]FI_PROGRESS_MANUAL\f[]
+.B \f[I]FI_PROGRESS_MANUAL\f[R]
 This progress model indicates that the provider requires the use of an
 application thread to complete an asynchronous request.
 When manual progress is set, the provider will attempt to advance an
@@ -388,8 +354,6 @@ or read an event queue, completion queue, or counter where the completed
 operation will be reported.
 Progress also occurs when the application processes a poll or wait set
 that has been associated with the event or completion queue.
-.RS
-.RE
 .PP
 Only wait operations defined by the fabric interface will result in an
 operation progressing.
@@ -405,11 +369,9 @@ For example, an endpoint that acts purely as the target of RMA or atomic
 operations and that uses manual progress may still need application
 assistance to process received operations.
 .TP
-.B \f[I]FI_PROGRESS_UNSPEC\f[]
+.B \f[I]FI_PROGRESS_UNSPEC\f[R]
 This value indicates that no progress model has been defined.
 It may be used on input hints to the fi_getinfo call.
-.RS
-.RE
 .SS Resource Management (resource_mgmt)
 .PP
 Resource management (RM) is provider and protocol support to protect
@@ -433,23 +395,17 @@ protection against overruns.
 However, such protection is not guaranteed.
 The following values for resource management are defined.
 .TP
-.B \f[I]FI_RM_DISABLED\f[]
+.B \f[I]FI_RM_DISABLED\f[R]
 The provider is free to select an implementation and protocol that does
 not protect against resource overruns.
 The application is responsible for resource protection.
-.RS
-.RE
 .TP
-.B \f[I]FI_RM_ENABLED\f[]
+.B \f[I]FI_RM_ENABLED\f[R]
 Resource management is enabled for this provider domain.
-.RS
-.RE
 .TP
-.B \f[I]FI_RM_UNSPEC\f[]
+.B \f[I]FI_RM_UNSPEC\f[R]
 This value indicates that no resource management model has been defined.
 It may be used on input hints to the fi_getinfo call.
-.RS
-.RE
 .PP
 The behavior of the various resource management options depends on
 whether the endpoint is reliable or unreliable, as well as provider and
@@ -459,7 +415,7 @@ The table assumes that all peers enable or disable RM the same.
 .PP
 .TS
 tab(@);
-cw(8.0n) cw(16.0n) cw(16.0n) cw(15.3n) cw(14.6n).
+cw(7.7n) cw(16.2n) cw(16.2n) cw(15.4n) cw(14.6n).
 T{
 Resource
 T}@T{
@@ -576,7 +532,7 @@ T}
 The resource column indicates the resource being accessed by a data
 transfer operation.
 .TP
-.B \f[I]Tx Ctx / Rx Ctx\f[]
+.B \f[I]Tx Ctx / Rx Ctx\f[R]
 Refers to the transmit/receive contexts when a data transfer operation
 is submitted.
 When RM is enabled, attempting to submit a request will fail if the
@@ -584,10 +540,8 @@ context is full.
 If RM is disabled, an undefined error (provider specific) will occur.
 Such errors should be considered fatal to the context, and applications
 must take steps to avoid queue overruns.
-.RS
-.RE
 .TP
-.B \f[I]Tx CQ / Rx CQ\f[]
+.B \f[I]Tx CQ / Rx CQ\f[R]
 Refers to the completion queue associated with the Tx or Rx context when
 a local operation completes.
 When RM is disabled, applications must take care to ensure that
@@ -604,13 +558,11 @@ that could result in CQ overruns, or internally retrying requests (which
 will be hidden from the application).
 See notes at the end of this section regarding CQ resource management
 restrictions.
-.RS
-.RE
 .TP
-.B \f[I]Target EP / No Rx Buffer\f[]
+.B \f[I]Target EP / No Rx Buffer\f[R]
 Target EP refers to resources associated with the endpoint that is the
 target of a transmit operation.
-This includes the target endpoint\[aq]s receive queue, posted receive
+This includes the target endpoint\[cq]s receive queue, posted receive
 buffers (no Rx buffers), the receive side completion queue, and other
 related packet processing queues.
 The defined behavior is that seen by the initiator of a request.
@@ -618,11 +570,11 @@ For FI_EP_DGRAM endpoints, if the target EP queues are unable to accept
 incoming messages, received messages will be dropped.
 For reliable endpoints, if RM is disabled, the transmit operation will
 complete in error.
+A provider may choose to return an error completion with the error code
+FI_ENORX for that transmit operation so that it can be retried.
 If RM is enabled, the provider will internally retry the operation.
-.RS
-.RE
 .TP
-.B \f[I]Rx Buffer Overrun\f[]
+.B \f[I]Rx Buffer Overrun\f[R]
 This refers to buffers posted to receive incoming tagged or untagged
 messages, with the behavior defined from the viewpoint of the sender.
 The behavior for handling received messages that are larger than the
@@ -636,26 +588,22 @@ be truncated at the receive side.
 This can occur when the target side buffers received data until an
 application buffer is made available.
 The completion status may also be dependent upon the completion model
-selected byt the application (e.g.
-FI_DELIVERY_COMPLETE versus FI_TRANSMIT_COMPLETE).
-.RS
-.RE
+selected by the application (e.g.\ FI_DELIVERY_COMPLETE versus
+FI_TRANSMIT_COMPLETE).
 .TP
-.B \f[I]Unmatched RMA / RMA Overrun\f[]
+.B \f[I]Unmatched RMA / RMA Overrun\f[R]
 Unmatched RMA and RMA overruns deal with the processing of RMA and
 atomic operations.
 Unlike send operations, RMA operations that attempt to access a memory
 address that is not registered for such operations, or that attempt to
 access outside of the target memory region, will fail, resulting in a
 transmit error.
-.RS
-.RE
 .PP
 When a resource management error occurs on an endpoint, the endpoint is
 transitioned into a disabled state.
 Any operations which have not already completed will fail and be
 discarded.
-For unconnected endpoints, the endpoint must be re\-enabled before it
+For connectionless endpoints, the endpoint must be re\-enabled before it
 will accept new data transfer operations.
 For connected endpoints, the connection is torn down and must be
 re\-established.
@@ -682,23 +630,17 @@ size as the endpoint queue(s) that are bound to it.
 .SS AV Type (av_type)
 .PP
 Specifies the type of address vectors that are usable with this domain.
-For additional details on AV type, see \f[C]fi_av\f[](3).
+For additional details on AV type, see \f[C]fi_av\f[R](3).
 The following values may be specified.
 .TP
-.B \f[I]FI_AV_MAP\f[]
+.B \f[I]FI_AV_MAP\f[R]
 Only address vectors of type AV map are requested or supported.
-.RS
-.RE
 .TP
-.B \f[I]FI_AV_TABLE\f[]
+.B \f[I]FI_AV_TABLE\f[R]
 Only address vectors of type AV index are requested or supported.
-.RS
-.RE
 .TP
-.B \f[I]FI_AV_UNSPEC\f[]
+.B \f[I]FI_AV_UNSPEC\f[R]
 Any address vector format is requested and supported.
-.RS
-.RE
 .PP
 Address vectors are only used by connectionless endpoints.
 Applications that require the use of a specific type of address vector
@@ -712,87 +654,69 @@ optimal AV type supported by this domain.
 .SS Memory Registration Mode (mr_mode)
 .PP
 Defines memory registration specific mode bits used with this domain.
-Full details on MR mode options are available in \f[C]fi_mr\f[](3).
+Full details on MR mode options are available in \f[C]fi_mr\f[R](3).
 The following values may be specified.
 .TP
-.B \f[I]FI_MR_ALLOCATED\f[]
+.B \f[I]FI_MR_ALLOCATED\f[R]
 Indicates that memory registration occurs on allocated data buffers, and
 physical pages must back all virtual addresses being registered.
-.RS
-.RE
 .TP
-.B \f[I]FI_MR_ENDPOINT\f[]
+.B \f[I]FI_MR_COLLECTIVE\f[R]
+Requires that data buffers passed to collective operations be
+explicitly registered for collective operations using the FI_COLLECTIVE
+flag.
+.TP
+.B \f[I]FI_MR_ENDPOINT\f[R]
 Memory registration occurs at the endpoint level, rather than domain.
-.RS
-.RE
 .TP
-.B \f[I]FI_MR_LOCAL\f[]
+.B \f[I]FI_MR_LOCAL\f[R]
 The provider is optimized around having applications register memory for
 locally accessed data buffers.
 Data buffers used in send and receive operations and as the source
 buffer for RMA and atomic operations must be registered by the
 application for access domains opened with this capability.
-.RS
-.RE
 .TP
-.B \f[I]FI_MR_MMU_NOTIFY\f[]
+.B \f[I]FI_MR_MMU_NOTIFY\f[R]
 Indicates that the application is responsible for notifying the provider
 when the page tables referencing a registered memory region may have
 been updated.
-.RS
-.RE
 .TP
-.B \f[I]FI_MR_PROV_KEY\f[]
+.B \f[I]FI_MR_PROV_KEY\f[R]
 Memory registration keys are selected and returned by the provider.
-.RS
-.RE
 .TP
-.B \f[I]FI_MR_RAW\f[]
+.B \f[I]FI_MR_RAW\f[R]
 The provider requires additional setup as part of their memory
 registration process.
 This mode is required by providers that use a memory key that is larger
 than 64\-bits.
-.RS
-.RE
 .TP
-.B \f[I]FI_MR_RMA_EVENT\f[]
+.B \f[I]FI_MR_RMA_EVENT\f[R]
 Indicates that the memory regions associated with completion counters
 must be explicitly enabled after being bound to any counter.
-.RS
-.RE
 .TP
-.B \f[I]FI_MR_UNSPEC\f[]
-Defined for compatibility \-\- library versions 1.4 and earlier.
+.B \f[I]FI_MR_UNSPEC\f[R]
+Defined for compatibility \[en] library versions 1.4 and earlier.
 Setting mr_mode to 0 indicates that FI_MR_BASIC or FI_MR_SCALABLE are
 requested and supported.
-.RS
-.RE
 .TP
-.B \f[I]FI_MR_VIRT_ADDR\f[]
+.B \f[I]FI_MR_VIRT_ADDR\f[R]
 Registered memory regions are referenced by peers using the virtual
 address of the registered memory region, rather than a 0\-based offset.
-.RS
-.RE
 .TP
-.B \f[I]FI_MR_BASIC\f[]
-Defined for compatibility \-\- library versions 1.4 and earlier.
+.B \f[I]FI_MR_BASIC\f[R]
+Defined for compatibility \[en] library versions 1.4 and earlier.
 Only basic memory registration operations are requested or supported.
 This mode is equivalent to the FI_MR_VIRT_ADDR, FI_MR_ALLOCATED, and
 FI_MR_PROV_KEY flags being set in later library versions.
 This flag may not be used in conjunction with other mr_mode bits.
-.RS
-.RE
 .TP
-.B \f[I]FI_MR_SCALABLE\f[]
-Defined for compatibility \-\- library versions 1.4 and earlier.
+.B \f[I]FI_MR_SCALABLE\f[R]
+Defined for compatibility \[en] library versions 1.4 and earlier.
 Only scalable memory registration operations are requested or supported.
 Scalable registration uses offset based addressing, with application
 selectable memory keys.
 For library versions 1.5 and later, this is the default if no mr_mode
 bits are set.
 This flag may not be used in conjunction with other mr_mode bits.
-.RS
-.RE
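+.PP
+A sketch of honoring the returned mr_mode bits (\f[C]buf\f[R] and
+\f[C]len\f[R] are assumed; error handling omitted):
+.IP
+.nf
+\f[C]
+void *desc = NULL;
+struct fid_mr *mr = NULL;
+
+if (info\->domain_attr\->mr_mode & FI_MR_LOCAL) {
+    /* local buffers must be registered before use */
+    ret = fi_mr_reg(domain, buf, len, FI_SEND | FI_RECV,
+                    0, 0, 0, &mr, NULL);
+    desc = fi_mr_desc(mr);
+}
+\f[R]
+.fi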
 .PP
 Buffers used in data transfer operations may require notifying the
 provider of their use before a data transfer can occur.
@@ -842,8 +766,8 @@ Providers return capability limits based on configured hardware maximum
 capabilities.
 Providers cannot predict all possible system limitations without
 a posteriori knowledge acquired during runtime that will further limit
-these hardware maximums (e.g.
-application memory consumption, FD usage, etc.).
+these hardware maximums (e.g.\ application memory consumption, FD usage,
+etc.).
 .SS Transmit Context Count (tx_ctx_cnt)
 .PP
 The number of outbound command queues optimally supported by the
@@ -899,46 +823,38 @@ Domain level capabilities.
 Domain capabilities indicate domain level features that are supported by
 the provider.
 .TP
-.B \f[I]FI_LOCAL_COMM\f[]
+.B \f[I]FI_LOCAL_COMM\f[R]
 At a conceptual level, this field indicates that the underlying device
 supports loopback communication.
 More specifically, this field indicates that an endpoint may communicate
 with other endpoints that are allocated from the same underlying named
 domain.
 If this field is not set, an application may need to use an alternate
-domain or mechanism (e.g.
-shared memory) to communicate with peers that execute on the same node.
-.RS
-.RE
+domain or mechanism (e.g.\ shared memory) to communicate with peers that
+execute on the same node.
 .TP
-.B \f[I]FI_REMOTE_COMM\f[]
+.B \f[I]FI_REMOTE_COMM\f[R]
 This field indicates that the underlying provider supports communication
 with nodes that are reachable over the network.
 If this field is not set, then the provider only supports communication
-between processes that execute on the same node \-\- a shared memory
+between processes that execute on the same node \[en] a shared memory
 provider, for example.
-.RS
-.RE
 .TP
-.B \f[I]FI_SHARED_AV\f[]
+.B \f[I]FI_SHARED_AV\f[R]
 Indicates that the domain supports the ability to share address vectors
 among multiple processes using the named address vector feature.
-.RS
-.RE
 .PP
-See \f[C]fi_getinfo\f[](3) for a discussion on primary versus secondary
+See \f[C]fi_getinfo\f[R](3) for a discussion on primary versus secondary
 capabilities.
 All domain capabilities are considered secondary capabilities.
 .SS mode
 .PP
 The operational mode bit related to using the domain.
 .TP
-.B \f[I]FI_RESTRICTED_COMP\f[]
+.B \f[I]FI_RESTRICTED_COMP\f[R]
 This bit indicates that the domain limits completion queues and counters
 to only be used with endpoints, transmit contexts, and receive contexts
 that have the same set of capability flags.
-.RS
-.RE
 .SS Default authorization key (auth_key)
 .PP
 The default authorization key to associate with endpoint and memory
@@ -975,13 +891,13 @@ cache or lookup tables.
 .PP
 This specifies the default traffic class that will be associated any
 endpoints created within the domain.
-See [\f[C]fi_endpoint\f[](3)](fi_endpoint.3.html for additional
+See \f[C]fi_endpoint\f[R](3) for additional
 information.
 .SH RETURN VALUE
 .PP
 Returns 0 on success.
 On error, a negative value corresponding to fabric errno is returned.
-Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[].
+Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R].
 .SH NOTES
 .PP
 Users should call fi_close to release all resources allocated to the
@@ -1000,7 +916,7 @@ lightly loaded systems, without an administrator configuring system
 resources appropriately for the installed provider(s).
 .SH SEE ALSO
 .PP
-\f[C]fi_getinfo\f[](3), \f[C]fi_endpoint\f[](3), \f[C]fi_av\f[](3),
-\f[C]fi_ep\f[](3), \f[C]fi_eq\f[](3), \f[C]fi_mr\f[](3)
+\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3), \f[C]fi_av\f[R](3),
+\f[C]fi_ep\f[R](3), \f[C]fi_eq\f[R](3), \f[C]fi_mr\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_endpoint.3 b/deps/libfabric/man/man3/fi_endpoint.3
index 1439db87f54b00109aef9e7a6eb1d6e5d31af80f..fcf0d82f3102b9672946b48ddc4ac4250603d88c 100644
--- a/deps/libfabric/man/man3/fi_endpoint.3
+++ b/deps/libfabric/man/man3/fi_endpoint.3
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_endpoint" "3" "2020\-08\-07" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_endpoint" "3" "2021\-10\-29" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -8,212 +8,158 @@ fi_endpoint \- Fabric endpoint operations
 .TP
 .B fi_endpoint / fi_scalable_ep / fi_passive_ep / fi_close
 Allocate or close an endpoint.
-.RS
-.RE
 .TP
 .B fi_ep_bind
 Associate an endpoint with hardware resources, such as event queues,
 completion queues, counters, address vectors, or shared transmit/receive
 contexts.
-.RS
-.RE
 .TP
 .B fi_scalable_ep_bind
 Associate a scalable endpoint with an address vector
-.RS
-.RE
 .TP
 .B fi_pep_bind
 Associate a passive endpoint with an event queue
-.RS
-.RE
 .TP
 .B fi_enable
 Transitions an active endpoint into an enabled state.
-.RS
-.RE
 .TP
 .B fi_cancel
 Cancel a pending asynchronous data transfer
-.RS
-.RE
 .TP
 .B fi_ep_alias
 Create an alias to the endpoint
-.RS
-.RE
 .TP
 .B fi_control
 Control endpoint operation.
-.RS
-.RE
 .TP
 .B fi_getopt / fi_setopt
 Get or set endpoint options.
-.RS
-.RE
 .TP
 .B fi_rx_context / fi_tx_context / fi_srx_context / fi_stx_context
 Open a transmit or receive context.
-.RS
-.RE
 .TP
 .B fi_tc_dscp_set / fi_tc_dscp_get
 Convert between a DSCP value and a network traffic class
-.RS
-.RE
 .TP
 .B fi_rx_size_left / fi_tx_size_left (DEPRECATED)
 Query the lower bound on how many RX/TX operations may be posted without
 an operation returning \-FI_EAGAIN.
 These functions have been deprecated and will be removed in a future
 version of the library.
-.RS
-.RE
 .SH SYNOPSIS
 .IP
 .nf
 \f[C]
-#include\ <rdma/fabric.h>
+#include <rdma/fabric.h>
 
-#include\ <rdma/fi_endpoint.h>
+#include <rdma/fi_endpoint.h>
 
-int\ fi_endpoint(struct\ fid_domain\ *domain,\ struct\ fi_info\ *info,
-\ \ \ \ struct\ fid_ep\ **ep,\ void\ *context);
+int fi_endpoint(struct fid_domain *domain, struct fi_info *info,
+    struct fid_ep **ep, void *context);
 
-int\ fi_scalable_ep(struct\ fid_domain\ *domain,\ struct\ fi_info\ *info,
-\ \ \ \ struct\ fid_ep\ **sep,\ void\ *context);
+int fi_scalable_ep(struct fid_domain *domain, struct fi_info *info,
+    struct fid_ep **sep, void *context);
 
-int\ fi_passive_ep(struct\ fi_fabric\ *fabric,\ struct\ fi_info\ *info,
-\ \ \ \ struct\ fid_pep\ **pep,\ void\ *context);
+int fi_passive_ep(struct fi_fabric *fabric, struct fi_info *info,
+    struct fid_pep **pep, void *context);
 
-int\ fi_tx_context(struct\ fid_ep\ *sep,\ int\ index,
-\ \ \ \ struct\ fi_tx_attr\ *attr,\ struct\ fid_ep\ **tx_ep,
-\ \ \ \ void\ *context);
+int fi_tx_context(struct fid_ep *sep, int index,
+    struct fi_tx_attr *attr, struct fid_ep **tx_ep,
+    void *context);
 
-int\ fi_rx_context(struct\ fid_ep\ *sep,\ int\ index,
-\ \ \ \ struct\ fi_rx_attr\ *attr,\ struct\ fid_ep\ **rx_ep,
-\ \ \ \ void\ *context);
+int fi_rx_context(struct fid_ep *sep, int index,
+    struct fi_rx_attr *attr, struct fid_ep **rx_ep,
+    void *context);
 
-int\ fi_stx_context(struct\ fid_domain\ *domain,
-\ \ \ \ struct\ fi_tx_attr\ *attr,\ struct\ fid_stx\ **stx,
-\ \ \ \ void\ *context);
+int fi_stx_context(struct fid_domain *domain,
+    struct fi_tx_attr *attr, struct fid_stx **stx,
+    void *context);
 
-int\ fi_srx_context(struct\ fid_domain\ *domain,
-\ \ \ \ struct\ fi_rx_attr\ *attr,\ struct\ fid_ep\ **rx_ep,
-\ \ \ \ void\ *context);
+int fi_srx_context(struct fid_domain *domain,
+    struct fi_rx_attr *attr, struct fid_ep **rx_ep,
+    void *context);
 
-int\ fi_close(struct\ fid\ *ep);
+int fi_close(struct fid *ep);
 
-int\ fi_ep_bind(struct\ fid_ep\ *ep,\ struct\ fid\ *fid,\ uint64_t\ flags);
+int fi_ep_bind(struct fid_ep *ep, struct fid *fid, uint64_t flags);
 
-int\ fi_scalable_ep_bind(struct\ fid_ep\ *sep,\ struct\ fid\ *fid,\ uint64_t\ flags);
+int fi_scalable_ep_bind(struct fid_ep *sep, struct fid *fid, uint64_t flags);
 
-int\ fi_pep_bind(struct\ fid_pep\ *pep,\ struct\ fid\ *fid,\ uint64_t\ flags);
+int fi_pep_bind(struct fid_pep *pep, struct fid *fid, uint64_t flags);
 
-int\ fi_enable(struct\ fid_ep\ *ep);
+int fi_enable(struct fid_ep *ep);
 
-int\ fi_cancel(struct\ fid_ep\ *ep,\ void\ *context);
+int fi_cancel(struct fid_ep *ep, void *context);
 
-int\ fi_ep_alias(struct\ fid_ep\ *ep,\ struct\ fid_ep\ **alias_ep,\ uint64_t\ flags);
+int fi_ep_alias(struct fid_ep *ep, struct fid_ep **alias_ep, uint64_t flags);
 
-int\ fi_control(struct\ fid\ *ep,\ int\ command,\ void\ *arg);
+int fi_control(struct fid *ep, int command, void *arg);
 
-int\ fi_getopt(struct\ fid\ *ep,\ int\ level,\ int\ optname,
-\ \ \ \ void\ *optval,\ size_t\ *optlen);
+int fi_getopt(struct fid *ep, int level, int optname,
+    void *optval, size_t *optlen);
 
-int\ fi_setopt(struct\ fid\ *ep,\ int\ level,\ int\ optname,
-\ \ \ \ const\ void\ *optval,\ size_t\ optlen);
+int fi_setopt(struct fid *ep, int level, int optname,
+    const void *optval, size_t optlen);
 
-uint32_t\ fi_tc_dscp_set(uint8_t\ dscp);
+uint32_t fi_tc_dscp_set(uint8_t dscp);
 
-uint8_t\ fi_tc_dscp_get(uint32_t\ tclass);
+uint8_t fi_tc_dscp_get(uint32_t tclass);
 
-DEPRECATED\ ssize_t\ fi_rx_size_left(struct\ fid_ep\ *ep);
+DEPRECATED ssize_t fi_rx_size_left(struct fid_ep *ep);
 
-DEPRECATED\ ssize_t\ fi_tx_size_left(struct\ fid_ep\ *ep);
-\f[]
+DEPRECATED ssize_t fi_tx_size_left(struct fid_ep *ep);
+\f[R]
 .fi
 .SH ARGUMENTS
 .TP
-.B \f[I]fid\f[]
+.B \f[I]fid\f[R]
 On creation, specifies a fabric or access domain.
 On bind, identifies the event queue, completion queue, counter, or
 address vector to bind to the endpoint.
-In other cases, it\[aq]s a fabric identifier of an associated resource.
-.RS
-.RE
+In other cases, it\[cq]s a fabric identifier of an associated resource.
 .TP
-.B \f[I]info\f[]
+.B \f[I]info\f[R]
 Details about the fabric interface endpoint to be opened, obtained from
 fi_getinfo.
-.RS
-.RE
 .TP
-.B \f[I]ep\f[]
+.B \f[I]ep\f[R]
 A fabric endpoint.
-.RS
-.RE
 .TP
-.B \f[I]sep\f[]
+.B \f[I]sep\f[R]
 A scalable fabric endpoint.
-.RS
-.RE
 .TP
-.B \f[I]pep\f[]
+.B \f[I]pep\f[R]
 A passive fabric endpoint.
-.RS
-.RE
 .TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
 Context associated with the endpoint or asynchronous operation.
-.RS
-.RE
 .TP
-.B \f[I]index\f[]
+.B \f[I]index\f[R]
 Index to retrieve a specific transmit/receive context.
-.RS
-.RE
 .TP
-.B \f[I]attr\f[]
+.B \f[I]attr\f[R]
 Transmit or receive context attributes.
-.RS
-.RE
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 Additional flags to apply to the operation.
-.RS
-.RE
 .TP
-.B \f[I]command\f[]
+.B \f[I]command\f[R]
 Command of control operation to perform on endpoint.
-.RS
-.RE
 .TP
-.B \f[I]arg\f[]
+.B \f[I]arg\f[R]
 Optional control argument.
-.RS
-.RE
 .TP
-.B \f[I]level\f[]
+.B \f[I]level\f[R]
 Protocol level at which the desired option resides.
-.RS
-.RE
 .TP
-.B \f[I]optname\f[]
+.B \f[I]optname\f[R]
 The protocol option to read or set.
-.RS
-.RE
 .TP
-.B \f[I]optval\f[]
+.B \f[I]optval\f[R]
 The option value that was read or to set.
-.RS
-.RE
 .TP
-.B \f[I]optlen\f[]
+.B \f[I]optlen\f[R]
 The size of the optval buffer.
-.RS
-.RE
 .SH DESCRIPTION
 .PP
 Endpoints are transport level communication portals.
@@ -227,8 +173,8 @@ transfers.
 .PP
 Active endpoints may be connection\-oriented or connectionless, and may
 provide data reliability.
-The data transfer interfaces \-\- messages (fi_msg), tagged messages
-(fi_tagged), RMA (fi_rma), and atomics (fi_atomic) \-\- are associated
+The data transfer interfaces \[en] messages (fi_msg), tagged messages
+(fi_tagged), RMA (fi_rma), and atomics (fi_atomic) \[en] are associated
 with active endpoints.
 In basic configurations, an active endpoint has transmit and receive
 queues.
@@ -258,7 +204,7 @@ relevant completion queues or event queues in order to drive progress.
 For endpoints that are only used as the target of RMA or atomic
 operations, this means binding the endpoint to a completion queue
 associated with receive processing.
-Unconnected endpoints must be bound to an address vector.
+Connectionless endpoints must be bound to an address vector.
 .PP
 Once an endpoint has been activated, it may be associated with an
 address vector.
@@ -353,32 +299,28 @@ reported to the user.
 An active endpoint may direct asynchronous completions to different CQs,
 based on the type of operation.
 This is specified using fi_ep_bind flags.
-The following flags may be OR\[aq]ed together when binding an endpoint
+The following flags may be OR\[cq]ed together when binding an endpoint
 to a completion domain CQ.
 .TP
-.B \f[I]FI_RECV\f[]
+.B \f[I]FI_RECV\f[R]
 Directs the notification of inbound data transfers to the specified
 completion queue.
 This includes received messages.
 This binding automatically includes FI_REMOTE_WRITE, if applicable to
 the endpoint.
-.RS
-.RE
 .TP
-.B \f[I]FI_SELECTIVE_COMPLETION\f[]
+.B \f[I]FI_SELECTIVE_COMPLETION\f[R]
 By default, data transfer operations write CQ completion entries into
 the associated completion queue after they have successfully completed.
 Applications can use this bind flag to selectively enable when
 completions are generated.
 If FI_SELECTIVE_COMPLETION is specified, data transfer operations will
-not generate CQ entries for \f[I]successful\f[] completions unless
+not generate CQ entries for \f[I]successful\f[R] completions unless
 FI_COMPLETION is set as an operational flag for the given operation.
 Operations that fail asynchronously will still generate completions,
 even if a completion is not requested.
-FI_SELECTIVE_COMPLETION must be OR\[aq]ed with FI_TRANSMIT and/or
+FI_SELECTIVE_COMPLETION must be OR\[cq]ed with FI_TRANSMIT and/or
 FI_RECV flags.
-.RS
-.RE
 .PP
 When FI_SELECTIVE_COMPLETION is set, the user must determine when a
 request that does NOT have FI_COMPLETION set has completed indirectly,
@@ -390,12 +332,10 @@ avoid writing a CQ completion entry for every operation.
 See Notes section below for additional information on how this flag
 interacts with the FI_CONTEXT and FI_CONTEXT2 mode bits.
 .TP
-.B \f[I]FI_TRANSMIT\f[]
+.B \f[I]FI_TRANSMIT\f[R]
 Directs the completion of outbound data transfer requests to the
 specified completion queue.
 This includes send message, RMA, and atomic operations.
-.RS
-.RE
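+.PP
+For example (a sketch; \f[C]ep\f[R] and \f[C]cq\f[R] are allocated
+elsewhere), directing both transmit and receive completions to a single
+CQ before enabling the endpoint:
+.IP
+.nf
+\f[C]
+ret = fi_ep_bind(ep, &cq\->fid, FI_TRANSMIT | FI_RECV);
+ret = fi_enable(ep);
+\f[R]
+.fi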
 .PP
 An endpoint may optionally be bound to a completion counter.
 Associating an endpoint with a counter is in addition to binding the EP
@@ -403,51 +343,39 @@ with a CQ.
 When binding an endpoint to a counter, the following flags may be
 specified.
 .TP
-.B \f[I]FI_READ\f[]
+.B \f[I]FI_READ\f[R]
 Increments the specified counter whenever an RMA read, atomic fetch, or
 atomic compare operation initiated from the endpoint has completed
 successfully or in error.
-.RS
-.RE
 .TP
-.B \f[I]FI_RECV\f[]
+.B \f[I]FI_RECV\f[R]
 Increments the specified counter whenever a message is received over the
 endpoint.
 Received messages include both tagged and normal message operations.
-.RS
-.RE
 .TP
-.B \f[I]FI_REMOTE_READ\f[]
+.B \f[I]FI_REMOTE_READ\f[R]
 Increments the specified counter whenever an RMA read, atomic fetch, or
 atomic compare operation is initiated from a remote endpoint that
 targets the given endpoint.
 Use of this flag requires that the endpoint be created using
 FI_RMA_EVENT.
-.RS
-.RE
 .TP
-.B \f[I]FI_REMOTE_WRITE\f[]
+.B \f[I]FI_REMOTE_WRITE\f[R]
 Increments the specified counter whenever an RMA write or base atomic
 operation is initiated from a remote endpoint that targets the given
 endpoint.
 Use of this flag requires that the endpoint be created using
 FI_RMA_EVENT.
-.RS
-.RE
 .TP
-.B \f[I]FI_SEND\f[]
+.B \f[I]FI_SEND\f[R]
 Increments the specified counter whenever a message transfer initiated
 over the endpoint has completed successfully or in error.
 Sent messages include both tagged and normal message operations.
-.RS
-.RE
 .TP
-.B \f[I]FI_WRITE\f[]
+.B \f[I]FI_WRITE\f[R]
 Increments the specified counter whenever an RMA write or base atomic
 operation initiated from the endpoint has completed successfully or in
 error.
-.RS
-.RE
 .PP
 An endpoint may only be bound to a single CQ or counter for a given type
 of operation.
@@ -497,9 +425,9 @@ No specific entry related to fi_cancel itself will be posted.
 .PP
 Cancel uses the context parameter associated with an operation to
 identify the request to cancel.
-Operations posted without a valid context parameter \-\- either no
+Operations posted without a valid context parameter \[en] either no
 context parameter is specified or the context value was ignored by the
-provider \-\- cannot be canceled.
+provider \[en] cannot be canceled.
 If multiple outstanding operations match the context parameter, only one
 will be canceled.
 In this case, the operation which is canceled is provider specific.
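+.PP
+For example (a sketch), canceling a receive that was posted with
+\f[C]&rx_ctx\f[R] as its context:
+.IP
+.nf
+\f[C]
+/* a successfully canceled operation is typically reported through
+ * the bound CQ as an error completion */
+ret = fi_cancel(ep, &rx_ctx);
+\f[R]
+.fi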
@@ -523,7 +451,7 @@ When allocating an alias, an application may configure either the
 transmit or receive operational flags.
 This avoids needing a separate call to fi_control to set those flags.
 The flags passed to fi_ep_alias must include FI_TRANSMIT or FI_RECV (not
-both) with other operational flags OR\[aq]ed in.
+both) with other operational flags OR\[cq]ed in.
 This will override the transmit or receive flags, respectively, for
 operations posted through the alias endpoint.
 All allocated aliases must be closed for the underlying endpoint to be
@@ -546,19 +474,15 @@ endpoint.
 This option only applies to passive endpoints.
 It is used to set the connection request backlog for listening
 endpoints.
-.RS
-.RE
 .TP
-.B **FI_GETOPSFLAG \-\- uint64_t *flags**
+.B **FI_GETOPSFLAG \[en] uint64_t *flags**
 Used to retrieve the current value of flags associated with the data
 transfer operations initiated on the endpoint.
 The control argument must include FI_TRANSMIT or FI_RECV (not both)
 flags to indicate the type of data transfer flags to be returned.
 See below for a list of control flags.
-.RS
-.RE
 .TP
-.B \f[B]FI_GETWAIT \-\- void **\f[]
+.B \f[B]FI_GETWAIT \[en] void **\f[R]
 This command allows the user to retrieve the file descriptor associated
 with a socket endpoint.
 The fi_control arg parameter should be an address where a pointer to the
@@ -566,20 +490,16 @@ returned file descriptor will be written.
 See fi_eq.3 for additional details on using fi_control with FI_GETWAIT.
 The file descriptor may be used for notification that the endpoint is
 ready to send or receive data.
-.RS
-.RE
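+.PP
+A sketch of retrieving the descriptor and waiting on it:
+.IP
+.nf
+\f[C]
+int fd;
+struct pollfd pfd;
+
+ret = fi_control(&ep\->fid, FI_GETWAIT, &fd);
+pfd.fd = fd;
+pfd.events = POLLIN;
+poll(&pfd, 1, \-1);   /* wake when the endpoint is ready */
+\f[R]
+.fi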
 .TP
-.B **FI_SETOPSFLAG \-\- uint64_t *flags**
+.B **FI_SETOPSFLAG \[en] uint64_t *flags**
 Used to change the data transfer operation flags associated with an
 endpoint.
 The control argument must include FI_TRANSMIT or FI_RECV (not both) to
 indicate the type of data transfer that the flags should apply to, with
-other flags OR\[aq]ed in.
+other flags OR\[cq]ed in.
 The given flags will override the previous transmit and receive
 attributes that were set when the endpoint was created.
 Valid control flags are defined below.
-.RS
-.RE
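+.PP
+For example (a sketch), forcing a completion for every transmit
+operation posted on the endpoint:
+.IP
+.nf
+\f[C]
+uint64_t flags = FI_TRANSMIT | FI_COMPLETION;
+
+ret = fi_control(&ep\->fid, FI_SETOPSFLAG, &flags);
+\f[R]
+.fi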
 .SS fi_getopt / fi_setopt
 .PP
 Endpoint protocol operations may be retrieved using fi_getopt or set
@@ -591,19 +511,16 @@ and implementation specific details of an endpoint.
 .PP
 The following option levels and option names and parameters are defined.
 .PP
-\f[I]FI_OPT_ENDPOINT\f[]
+\f[I]FI_OPT_ENDPOINT\f[R]
 \[bu] .RS 2
 .TP
-.B \f[I]FI_OPT_BUFFERED_LIMIT \- size_t\f[]
+.B \f[I]FI_OPT_BUFFERED_LIMIT \- size_t\f[R]
 Defines the maximum size of a buffered message that will be reported to
 users as part of a receive completion when the FI_BUFFERED_RECV mode is
 enabled on an endpoint.
-.RS
-.RE
-.RE
 .PP
 fi_getopt() will return the currently configured threshold, or the
-provider\[aq]s default threshold if one has not be set by the
+provider\[cq]s default threshold if one has not been set by the
 application.
 fi_setopt() allows an application to configure the threshold.
 If the provider cannot support the requested threshold, it will fail the
@@ -615,11 +532,12 @@ fi_getopt() can then be used to retrieve the set size.
 In most cases, the sending and receiving endpoints must be configured to
 use the same threshold value, and the threshold must be set prior to
 enabling the endpoint.
+.RE
 \[bu] .RS 2
 .TP
-.B \f[I]FI_OPT_BUFFERED_MIN \- size_t\f[]
+.B \f[I]FI_OPT_BUFFERED_MIN \- size_t\f[R]
 Defines the minimum size of a buffered message that will be reported.
-Applications would set this to a size that\[aq]s big enough to decide
+Applications would set this to a size that\[cq]s big enough to decide
 whether to discard or claim a buffered receive or when to claim a
 buffered receive on getting a buffered receive completion.
 The value is typically used by a provider when sending a rendezvous
@@ -627,12 +545,10 @@ protocol request where it would send at least FI_OPT_BUFFERED_MIN bytes
 of application data along with it.
 A smaller sized rendezvous protocol message usually results in better
 latency for the overall transfer of a large message.
-.RS
-.RE
 .RE
 \[bu] .RS 2
 .TP
-.B \f[I]FI_OPT_CM_DATA_SIZE \- size_t\f[]
+.B \f[I]FI_OPT_CM_DATA_SIZE \- size_t\f[R]
 Defines the size of available space in CM messages for user\-defined
 data.
 This value limits the amount of data that applications can exchange
@@ -643,12 +559,10 @@ except in the case of passive endpoints, in which the size reflects the
 maximum size of the data that may be present as part of a connection
 request event.
 This option is read only.
-.RS
-.RE
 .RE
 \[bu] .RS 2
 .TP
-.B \f[I]FI_OPT_MIN_MULTI_RECV \- size_t\f[]
+.B \f[I]FI_OPT_MIN_MULTI_RECV \- size_t\f[R]
 Defines the minimum receive buffer space available when the receive
 buffer is released by the provider (see FI_MULTI_RECV).
 Modifying this value is only guaranteed to set the minimum buffer space
@@ -656,8 +570,36 @@ needed on receives posted after the value has been changed.
 It is recommended that applications that want to override the default
 MIN_MULTI_RECV value set this option before enabling the corresponding
 endpoint.
-.RS
 .RE
+\[bu] .RS 2
+.TP
+.B \f[I]FI_OPT_FI_HMEM_P2P \- int\f[R]
+Defines how the provider should handle peer to peer FI_HMEM transfers
+for this endpoint.
+By default, the provider will choose whether to use peer to peer support
+based on the type of transfer (FI_HMEM_P2P_ENABLED).
+Valid values defined in fi_endpoint.h are:
+.RS
+.IP \[bu] 2
+FI_HMEM_P2P_ENABLED: Peer to peer support may be used by the provider to
+handle FI_HMEM transfers, and which transfers are initiated using peer
+to peer is subject to the provider implementation.
+.IP \[bu] 2
+FI_HMEM_P2P_REQUIRED: Peer to peer support must be used for transfers;
+transfers that cannot be performed using p2p will be reported as
+failing.
+.IP \[bu] 2
+FI_HMEM_P2P_PREFERRED: Peer to peer support should be used by the
+provider for all transfers if available, but the provider may choose to
+copy the data to initiate the transfer if peer to peer support is
+unavailable.
+.IP \[bu] 2
+FI_HMEM_P2P_DISABLED: Peer to peer support should not be used.
+.RE
+fi_setopt() will return \-FI_EOPNOTSUPP if the mode requested cannot be
+supported by the provider.
+The FI_HMEM_DISABLE_P2P environment variable discussed in
+\f[C]fi_mr\f[R](3) takes precedence over this setopt option.
 .RE
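+.PP
+As an illustrative sketch (the endpoint variable, option values, and
+error handling are placeholders), options are queried and set as
+follows:
+.IP
+.nf
+\f[C]
+size_t limit, len = sizeof(limit);
+int p2p = FI_HMEM_P2P_REQUIRED, ret;
+
+/* read the current buffered receive threshold */
+ret = fi_getopt(&ep\->fid, FI_OPT_ENDPOINT, FI_OPT_BUFFERED_LIMIT,
+                &limit, &len);
+
+/* require peer to peer transfers for FI_HMEM buffers */
+ret = fi_setopt(&ep\->fid, FI_OPT_ENDPOINT, FI_OPT_FI_HMEM_P2P,
+                &p2p, sizeof(p2p));
+\f[R]
+.fi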
 .SS fi_tc_dscp_set
 .PP
@@ -704,22 +646,22 @@ receive context attributes as shown below.
 .IP
 .nf
 \f[C]
-struct\ fi_ep_attr\ {
-\ \ \ \ enum\ fi_ep_type\ type;
-\ \ \ \ uint32_t\ \ \ \ \ \ \ \ protocol;
-\ \ \ \ uint32_t\ \ \ \ \ \ \ \ protocol_version;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ max_msg_size;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ msg_prefix_size;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ max_order_raw_size;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ max_order_war_size;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ max_order_waw_size;
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ mem_tag_format;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ tx_ctx_cnt;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ rx_ctx_cnt;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ auth_key_size;
-\ \ \ \ uint8_t\ \ \ \ \ \ \ \ \ *auth_key;
+struct fi_ep_attr {
+    enum fi_ep_type type;
+    uint32_t        protocol;
+    uint32_t        protocol_version;
+    size_t          max_msg_size;
+    size_t          msg_prefix_size;
+    size_t          max_order_raw_size;
+    size_t          max_order_war_size;
+    size_t          max_order_waw_size;
+    uint64_t        mem_tag_format;
+    size_t          tx_ctx_cnt;
+    size_t          rx_ctx_cnt;
+    size_t          auth_key_size;
+    uint8_t         *auth_key;
 };
-\f[]
+\f[R]
 .fi
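+.PP
+These attributes are normally requested through the hints passed to
+fi_getinfo, as in this sketch (the API version and capability bits are
+illustrative only):
+.IP
+.nf
+\f[C]
+struct fi_info *hints = fi_allocinfo(), *info;
+int ret;
+
+hints\->ep_attr\->type = FI_EP_RDM;  /* reliable, connectionless */
+hints\->caps = FI_MSG;
+
+ret = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, hints, &info);
+\f[R]
+.fi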
 .SS type \- Endpoint Type
 .PP
@@ -727,38 +669,30 @@ If specified, indicates the type of fabric interface communication
 desired.
 Supported types are:
 .TP
-.B \f[I]FI_EP_DGRAM\f[]
+.B \f[I]FI_EP_DGRAM\f[R]
 Supports a connectionless, unreliable datagram communication.
 Message boundaries are maintained, but the maximum message size may be
 limited to the fabric MTU.
 Flow control is not guaranteed.
-.RS
-.RE
 .TP
-.B \f[I]FI_EP_MSG\f[]
+.B \f[I]FI_EP_MSG\f[R]
 Provides a reliable, connection\-oriented data transfer service with
 flow control that maintains message boundaries.
-.RS
-.RE
 .TP
-.B \f[I]FI_EP_RDM\f[]
+.B \f[I]FI_EP_RDM\f[R]
 Reliable datagram message.
-Provides a reliable, unconnected data transfer service with flow control
-that maintains message boundaries.
-.RS
-.RE
+Provides a reliable, connectionless data transfer service with flow
+control that maintains message boundaries.
 .TP
-.B \f[I]FI_EP_SOCK_DGRAM\f[]
+.B \f[I]FI_EP_SOCK_DGRAM\f[R]
 A connectionless, unreliable datagram endpoint with UDP socket\-like
 semantics.
 FI_EP_SOCK_DGRAM is most useful for applications designed around using
 UDP sockets.
 See the SOCKET ENDPOINT section for additional details and restrictions
 that apply to datagram socket endpoints.
-.RS
-.RE
 .TP
-.B \f[I]FI_EP_SOCK_STREAM\f[]
+.B \f[I]FI_EP_SOCK_STREAM\f[R]
 Data streaming endpoint with TCP socket\-like semantics.
 Provides a reliable, connection\-oriented data transfer service that
 does not maintain message boundaries.
@@ -766,15 +700,11 @@ FI_EP_SOCK_STREAM is most useful for applications designed around using
 TCP sockets.
 See the SOCKET ENDPOINT section for additional details and restrictions
 that apply to stream endpoints.
-.RS
-.RE
 .TP
-.B \f[I]FI_EP_UNSPEC\f[]
+.B \f[I]FI_EP_UNSPEC\f[R]
 The type of endpoint is not specified.
 This is usually provided as input, with other attributes of the endpoint
 or the provider selecting the type.
-.RS
-.RE
 .SS Protocol
 .PP
 Specifies the low\-level end to end protocol employed by the provider.
@@ -785,95 +715,72 @@ Provider specific protocols are also allowed.
 Provider specific protocols will be indicated by having the upper bit of
 the protocol value set to one.
 .TP
-.B \f[I]FI_PROTO_GNI\f[]
+.B \f[I]FI_PROTO_GNI\f[R]
 Protocol runs over Cray GNI low\-level interface.
-.RS
-.RE
 .TP
-.B \f[I]FI_PROTO_IB_RDM\f[]
+.B \f[I]FI_PROTO_IB_RDM\f[R]
 Reliable\-datagram protocol implemented over InfiniBand
 reliable\-connected queue pairs.
-.RS
-.RE
 .TP
-.B \f[I]FI_PROTO_IB_UD\f[]
+.B \f[I]FI_PROTO_IB_UD\f[R]
 The protocol runs over Infiniband unreliable datagram queue pairs.
-.RS
-.RE
 .TP
-.B \f[I]FI_PROTO_IWARP\f[]
+.B \f[I]FI_PROTO_IWARP\f[R]
 The protocol runs over the Internet wide area RDMA protocol transport.
-.RS
-.RE
 .TP
-.B \f[I]FI_PROTO_IWARP_RDM\f[]
+.B \f[I]FI_PROTO_IWARP_RDM\f[R]
 Reliable\-datagram protocol implemented over iWarp reliable\-connected
 queue pairs.
-.RS
-.RE
 .TP
-.B \f[I]FI_PROTO_NETWORKDIRECT\f[]
+.B \f[I]FI_PROTO_NETWORKDIRECT\f[R]
 Protocol runs over Microsoft NetworkDirect service provider interface.
 This adds reliable\-datagram semantics over the NetworkDirect
 connection\-oriented endpoint semantics.
-.RS
-.RE
 .TP
-.B \f[I]FI_PROTO_PSMX\f[]
+.B \f[I]FI_PROTO_PSMX\f[R]
 The protocol is based on an Intel proprietary protocol known as PSM,
 performance scaled messaging.
 PSMX is an extended version of the PSM protocol to support the libfabric
 interfaces.
-.RS
-.RE
 .TP
-.B \f[I]FI_PROTO_PSMX2\f[]
+.B \f[I]FI_PROTO_PSMX2\f[R]
 The protocol is based on an Intel proprietary protocol known as PSM2,
 performance scaled messaging version 2.
 PSMX2 is an extended version of the PSM2 protocol to support the
 libfabric interfaces.
-.RS
-.RE
 .TP
-.B \f[I]FI_PROTO_RDMA_CM_IB_RC\f[]
+.B \f[I]FI_PROTO_PSMX3\f[R]
+The protocol is Intel\[cq]s protocol known as PSM3, performance scaled
+messaging version 3.
+PSMX3 is implemented over RoCEv2 and verbs.
+.TP
+.B \f[I]FI_PROTO_RDMA_CM_IB_RC\f[R]
 The protocol runs over Infiniband reliable\-connected queue pairs, using
 the RDMA CM protocol for connection establishment.
-.RS
-.RE
 .TP
-.B \f[I]FI_PROTO_RXD\f[]
+.B \f[I]FI_PROTO_RXD\f[R]
 Reliable\-datagram protocol implemented over datagram endpoints.
 RXD is a libfabric utility component that adds RDM endpoint semantics
 over DGRAM endpoint semantics.
-.RS
-.RE
 .TP
-.B \f[I]FI_PROTO_RXM\f[]
+.B \f[I]FI_PROTO_RXM\f[R]
 Reliable\-datagram protocol implemented over message endpoints.
 RXM is a libfabric utility component that adds RDM endpoint semantics
 over MSG endpoint semantics.
-.RS
-.RE
 .TP
-.B \f[I]FI_PROTO_SOCK_TCP\f[]
+.B \f[I]FI_PROTO_SOCK_TCP\f[R]
 The protocol is layered over TCP packets.
-.RS
-.RE
 .TP
-.B \f[I]FI_PROTO_UDP\f[]
+.B \f[I]FI_PROTO_UDP\f[R]
 The protocol sends and receives UDP datagrams.
-For example, an endpoint using \f[I]FI_PROTO_UDP\f[] will be able to
+For example, an endpoint using \f[I]FI_PROTO_UDP\f[R] will be able to
 communicate with a remote peer that is using Berkeley
-\f[I]SOCK_DGRAM\f[] sockets using \f[I]IPPROTO_UDP\f[].
-.RS
-.RE
+\f[I]SOCK_DGRAM\f[R] sockets using \f[I]IPPROTO_UDP\f[R].
 .TP
-.B \f[I]FI_PROTO_UNSPEC\f[]
+.B \f[I]FI_PROTO_UNSPEC\f[R]
 The protocol is not specified.
 This is usually provided as input, with other attributes of the socket
 or the provider selecting the actual protocol.
-.RS
-.RE
 .SS protocol_version \- Protocol Version
 .PP
 Identifies which version of the protocol is employed by the provider.
@@ -901,20 +808,31 @@ Data ordering is separate, but dependent on message ordering (defined
 below).
 Data ordering is unspecified where message order is not defined.
 .PP
-Data ordering refers to the access of target memory by subsequent
-operations.
+Data ordering refers to the access of the same target memory by
+subsequent operations.
 When back to back RMA read or write operations access the same
 registered memory location, data ordering indicates whether the second
 operation reads or writes the target memory after the first operation
 has completed.
-Because RMA ordering applies between two operations, and not within a
-single data transfer, ordering is defined per byte\-addressable memory
-location.
+For example, will an RMA read that follows an RMA write read back the
+data that was written?
+Similarly, will an RMA write that follows an RMA read update the target
+buffer after the read has transferred the original data?
+Data ordering answers these questions, even in the presence of errors,
+such as the need to resend data because of lost or corrupted network
+traffic.
+.PP
+RMA ordering applies between two operations, and not within a single
+data transfer.
+Therefore, ordering is defined per byte\-addressable memory location.
 I.e.
 ordering specifies whether location X is accessed by the second
 operation after the first operation.
 Nothing is implied about the completion of the first operation before
 the second operation is initiated.
+For example, if the first operation updates locations X and Y, but the
+second operation only accesses location X, there are no guarantees
+defined relative to location Y and the second operation.
 .PP
 In order to support large data transfers being broken into multiple
 packets and sent using multiple paths through the fabric, data ordering
@@ -924,17 +842,15 @@ values.
 Note that even if data ordering is not maintained, message ordering may
 be.
 .TP
-.B \f[I]max_order_raw_size\f[]
+.B \f[I]max_order_raw_size\f[R]
 Read after write size.
 If set, an RMA or atomic read operation issued after an RMA or atomic
 write operation, both of which are smaller than the size, will be
 ordered.
 Where the target memory locations overlap, the RMA or atomic read
 operation will see the results of the previous RMA or atomic write.
-.RS
-.RE
 .TP
-.B \f[I]max_order_war_size\f[]
+.B \f[I]max_order_war_size\f[R]
 Write after read size.
 If set, an RMA or atomic write operation issued after an RMA or atomic
 read operation, both of which are smaller than the size, will be
@@ -942,18 +858,14 @@ ordered.
 The RMA or atomic read operation will see the initial value of the
 target memory location before a subsequent RMA or atomic write updates
 the value.
-.RS
-.RE
 .TP
-.B \f[I]max_order_waw_size\f[]
+.B \f[I]max_order_waw_size\f[R]
 Write after write size.
 If set, an RMA or atomic write operation issued after an RMA or atomic
 write operation, both of which are smaller than the size, will be
 ordered.
 The target memory location will reflect the results of the second RMA or
 atomic write.
-.RS
-.RE
 .PP
 An order size value of 0 indicates that ordering is not guaranteed.
 A value of \-1 guarantees ordering for any data size.
@@ -966,7 +878,7 @@ fields.
 The mem_tag_format optionally begins with a series of bits set to 0, to
 signify bits which are ignored by the provider.
 Following the initial prefix of ignored bits, the array will consist of
-alternating groups of bits set to all 1\[aq]s or all 0\[aq]s.
+alternating groups of bits set to all 1\[cq]s or all 0\[cq]s.
 Each group of bits corresponds to a tagged field.
 The implication of defining a tagged field is that when a mask is
 applied to the tagged bit array, all bits belonging to a single field
@@ -976,7 +888,7 @@ For example, a mem_tag_format of 0x30FF indicates support for 14 tagged
 bits, separated into 3 fields.
 The first field consists of 2\-bits, the second field 4\-bits, and the
 final field 8\-bits.
-Valid masks for such a tagged field would be a bitwise OR\[aq]ing of
+Valid masks for such a tagged field would be a bitwise OR\[cq]ing of
 zero or more of the following values: 0x3000, 0x0F00, and 0x00FF.
 The provider may not validate the mask provided by the application for
 performance reasons.
@@ -997,7 +909,7 @@ the tag field of the completion entry.
 .PP
 It is recommended that field sizes be ordered from smallest to largest.
 A generic, unstructured tag and mask can be achieved by requesting a bit
-array consisting of alternating 1\[aq]s and 0\[aq]s.
+array consisting of alternating 1\[cq]s and 0\[cq]s.
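+.PP
+Continuing the 0x30FF example above (the field values are illustrative),
+a tag with three fields and an ignore mask that skips the middle field
+could be built as:
+.IP
+.nf
+\f[C]
+uint64_t f1 = 2, f2 = 9, f3 = 0x42;          /* field values */
+uint64_t tag = (f1 << 12) | (f2 << 8) | f3;  /* 2\-, 4\-, 8\-bit fields */
+uint64_t ignore = 0x0F00;                    /* ignore field two */
+\f[R]
+.fi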
 .SS tx_ctx_cnt \- Transmit Context Count
 .PP
 Number of transmit contexts to associate with the endpoint.
@@ -1063,19 +975,19 @@ specified using struct fi_tx_attr.
 .IP
 .nf
 \f[C]
-struct\ fi_tx_attr\ {
-\ \ \ \ uint64_t\ \ caps;
-\ \ \ \ uint64_t\ \ mode;
-\ \ \ \ uint64_t\ \ op_flags;
-\ \ \ \ uint64_t\ \ msg_order;
-\ \ \ \ uint64_t\ \ comp_order;
-\ \ \ \ size_t\ \ \ \ inject_size;
-\ \ \ \ size_t\ \ \ \ size;
-\ \ \ \ size_t\ \ \ \ iov_limit;
-\ \ \ \ size_t\ \ \ \ rma_iov_limit;
-\ \ \ \ uint32_t\ \ tclass;
+struct fi_tx_attr {
+    uint64_t  caps;
+    uint64_t  mode;
+    uint64_t  op_flags;
+    uint64_t  msg_order;
+    uint64_t  comp_order;
+    size_t    inject_size;
+    size_t    size;
+    size_t    iov_limit;
+    size_t    rma_iov_limit;
+    uint32_t  tclass;
 };
-\f[]
+\f[R]
 .fi
 .SS caps \- Capabilities
 .PP
@@ -1138,160 +1050,124 @@ Message ordering requires matching ordering semantics on the receiving
 side of a data transfer operation in order to guarantee that ordering is
 met.
 .TP
-.B \f[I]FI_ORDER_ATOMIC_RAR\f[]
+.B \f[I]FI_ORDER_ATOMIC_RAR\f[R]
 Atomic read after read.
 If set, atomic fetch operations are transmitted in the order submitted
 relative to other atomic fetch operations.
 If not set, atomic fetches may be transmitted out of order from their
 submission.
-.RS
-.RE
 .TP
-.B \f[I]FI_ORDER_ATOMIC_RAW\f[]
+.B \f[I]FI_ORDER_ATOMIC_RAW\f[R]
 Atomic read after write.
 If set, atomic fetch operations are transmitted in the order submitted
 relative to atomic update operations.
 If not set, atomic fetches may be transmitted ahead of atomic updates.
-.RS
-.RE
 .TP
-.B \f[I]FI_ORDER_ATOMIC_WAR\f[]
+.B \f[I]FI_ORDER_ATOMIC_WAR\f[R]
 Atomic write after read.
 If set, atomic update operations are transmitted in the order submitted
 relative to atomic fetch operations.
 If not set, atomic updates may be transmitted ahead of atomic fetches.
-.RS
-.RE
 .TP
-.B \f[I]FI_ORDER_ATOMIC_WAW\f[]
+.B \f[I]FI_ORDER_ATOMIC_WAW\f[R]
 Atomic write after write.
 If set, atomic update operations are transmitted in the order submitted
 relative to other atomic update operations.
 If not set, atomic updates may be transmitted out of order from their
 submission.
-.RS
-.RE
 .TP
-.B \f[I]FI_ORDER_NONE\f[]
+.B \f[I]FI_ORDER_NONE\f[R]
 No ordering is specified.
 This value may be used as input in order to obtain the default message
 order supported by the provider.
 FI_ORDER_NONE is an alias for the value 0.
-.RS
-.RE
 .TP
-.B \f[I]FI_ORDER_RAR\f[]
+.B \f[I]FI_ORDER_RAR\f[R]
 Read after read.
 If set, RMA and atomic read operations are transmitted in the order
 submitted relative to other RMA and atomic read operations.
 If not set, RMA and atomic reads may be transmitted out of order from
 their submission.
-.RS
-.RE
 .TP
-.B \f[I]FI_ORDER_RAS\f[]
+.B \f[I]FI_ORDER_RAS\f[R]
 Read after send.
 If set, RMA and atomic read operations are transmitted in the order
 submitted relative to message send operations, including tagged sends.
 If not set, RMA and atomic reads may be transmitted ahead of sends.
-.RS
-.RE
 .TP
-.B \f[I]FI_ORDER_RAW\f[]
+.B \f[I]FI_ORDER_RAW\f[R]
 Read after write.
 If set, RMA and atomic read operations are transmitted in the order
 submitted relative to RMA and atomic write operations.
 If not set, RMA and atomic reads may be transmitted ahead of RMA and
 atomic writes.
-.RS
-.RE
 .TP
-.B \f[I]FI_ORDER_RMA_RAR\f[]
+.B \f[I]FI_ORDER_RMA_RAR\f[R]
 RMA read after read.
 If set, RMA read operations are transmitted in the order submitted
 relative to other RMA read operations.
 If not set, RMA reads may be transmitted out of order from their
 submission.
-.RS
-.RE
 .TP
-.B \f[I]FI_ORDER_RMA_RAW\f[]
+.B \f[I]FI_ORDER_RMA_RAW\f[R]
 RMA read after write.
 If set, RMA read operations are transmitted in the order submitted
 relative to RMA write operations.
 If not set, RMA reads may be transmitted ahead of RMA writes.
-.RS
-.RE
 .TP
-.B \f[I]FI_ORDER_RMA_WAR\f[]
+.B \f[I]FI_ORDER_RMA_WAR\f[R]
 RMA write after read.
 If set, RMA write operations are transmitted in the order submitted
 relative to RMA read operations.
 If not set, RMA writes may be transmitted ahead of RMA reads.
-.RS
-.RE
 .TP
-.B \f[I]FI_ORDER_RMA_WAW\f[]
+.B \f[I]FI_ORDER_RMA_WAW\f[R]
 RMA write after write.
 If set, RMA write operations are transmitted in the order submitted
 relative to other RMA write operations.
 If not set, RMA writes may be transmitted out of order from their
 submission.
-.RS
-.RE
 .TP
-.B \f[I]FI_ORDER_SAR\f[]
+.B \f[I]FI_ORDER_SAR\f[R]
 Send after read.
 If set, message send operations, including tagged sends, are transmitted
 in the order submitted relative to RMA and atomic read operations.
 If not set, message sends may be transmitted ahead of RMA and atomic
 reads.
-.RS
-.RE
 .TP
-.B \f[I]FI_ORDER_SAS\f[]
+.B \f[I]FI_ORDER_SAS\f[R]
 Send after send.
 If set, message send operations, including tagged sends, are transmitted
 in the order submitted relative to other message sends.
 If not set, message sends may be transmitted out of order from their
 submission.
-.RS
-.RE
 .TP
-.B \f[I]FI_ORDER_SAW\f[]
+.B \f[I]FI_ORDER_SAW\f[R]
 Send after write.
 If set, message send operations, including tagged sends, are transmitted
 in the order submitted relative to RMA and atomic write operations.
 If not set, message sends may be transmitted ahead of RMA and atomic
 writes.
-.RS
-.RE
 .TP
-.B \f[I]FI_ORDER_WAR\f[]
+.B \f[I]FI_ORDER_WAR\f[R]
 Write after read.
 If set, RMA and atomic write operations are transmitted in the order
 submitted relative to RMA and atomic read operations.
 If not set, RMA and atomic writes may be transmitted ahead of RMA and
 atomic reads.
-.RS
-.RE
 .TP
-.B \f[I]FI_ORDER_WAS\f[]
+.B \f[I]FI_ORDER_WAS\f[R]
 Write after send.
 If set, RMA and atomic write operations are transmitted in the order
 submitted relative to message send operations, including tagged sends.
 If not set, RMA and atomic writes may be transmitted ahead of sends.
-.RS
-.RE
 .TP
-.B \f[I]FI_ORDER_WAW\f[]
+.B \f[I]FI_ORDER_WAW\f[R]
 Write after write.
 If set, RMA and atomic write operations are transmitted in the order
 submitted relative to other RMA and atomic write operations.
 If not set, RMA and atomic writes may be transmitted out of order from
 their submission.
-.RS
-.RE
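+.PP
+As a sketch (the chosen bits are only an example, and hints is a
+placeholder), an application that needs sends ordered among themselves
+and RMA writes ordered relative to other RMA writes could request:
+.IP
+.nf
+\f[C]
+hints\->tx_attr\->msg_order = FI_ORDER_SAS | FI_ORDER_RMA_WAW;
+\f[R]
+.fi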
 .SS comp_order \- Completion Ordering
 .PP
 Completion ordering refers to the order in which completed requests are
@@ -1301,7 +1177,7 @@ Relaxed completion order may enable faster reporting of completed
 transfers, allow acknowledgments to be sent over different fabric paths,
 and support more sophisticated retry mechanisms.
 This can result in lower\-latency completions, particularly when using
-unconnected endpoints.
+connectionless endpoints.
 Strict completion ordering may require that providers queue completed
 operations or limit available optimizations.
 .PP
@@ -1321,17 +1197,13 @@ with the constraint that the returned ordering is stricter than that
 specified by the application.
 Supported completion order values are:
 .TP
-.B \f[I]FI_ORDER_NONE\f[]
+.B \f[I]FI_ORDER_NONE\f[R]
 No ordering is defined for completed operations.
 Requests submitted to the transmit context may complete in any order.
-.RS
-.RE
 .TP
-.B \f[I]FI_ORDER_STRICT\f[]
+.B \f[I]FI_ORDER_STRICT\f[R]
 Requests complete in the order in which they are submitted to the
 transmit context.
-.RS
-.RE
 .SS inject_size
 .PP
 The requested inject operation size (see the FI_INJECT flag) that the
@@ -1341,10 +1213,25 @@ inject operation (such as fi_inject) or may be used with the FI_INJECT
 data transfer flag.
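+.PP
+For example (a sketch; the destination address, info structure, and
+buffer are placeholders), a small transfer below inject_size can use
+fi_inject, after which the buffer is immediately reusable:
+.IP
+.nf
+\f[C]
+char buf[64];
+ssize_t ret;
+
+if (sizeof(buf) <= info\->tx_attr\->inject_size)
+    ret = fi_inject(ep, buf, sizeof(buf), dest_addr);
+/* buf may be reused as soon as fi_inject returns */
+\f[R]
+.fi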
 .SS size
 .PP
-The size of the context.
-The size is specified as the minimum number of transmit operations that
-may be posted to the endpoint without the operation returning
-\-FI_EAGAIN.
+The size of the transmit context.
+The mapping of the size value to resources is provider specific, but it
+is directly related to the number of command entries allocated for the
+endpoint.
+A smaller size value consumes fewer hardware and software resources,
+while a larger size allows queuing more transmit requests.
+.PP
+While the size attribute guides the size of the underlying endpoint
+transmit queue, there is not necessarily a one\-to\-one mapping between
+a transmit operation and a queue entry.
+A single transmit operation may consume multiple queue entries; for
+example, one per scatter\-gather entry.
+Additionally, the size field is intended to guide the allocation of the
+endpoint\[cq]s transmit context.
+Specifically, for connectionless endpoints, there may be lower\-level
+queues used to track communication on a per\-peer basis.
+The sizes of any lower\-level queues may be significantly smaller than
+the endpoint\[cq]s transmit size, in order to reduce resource
+utilization.
 .SS iov_limit
 .PP
 This is the maximum number of IO vectors (scatter\-gather elements) that
@@ -1369,32 +1256,26 @@ definition.
 If tclass is unset or set to FI_TC_UNSPEC, the endpoint will use the
 default traffic class associated with the domain.
 .TP
-.B \f[I]FI_TC_BEST_EFFORT\f[]
+.B \f[I]FI_TC_BEST_EFFORT\f[R]
 This is the default in the absence of any other local or fabric
 configuration.
 This class carries the traffic for a number of applications executing
 concurrently over the same network infrastructure.
 Even though it is shared, network capacity and resource allocation are
 distributed fairly across the applications.
-.RS
-.RE
 .TP
-.B \f[I]FI_TC_BULK_DATA\f[]
+.B \f[I]FI_TC_BULK_DATA\f[R]
 This class is intended for large data transfers associated with I/O and
 is present to separate sustained I/O transfers from other application
 inter\-process communications.
-.RS
-.RE
 .TP
-.B \f[I]FI_TC_DEDICATED_ACCESS\f[]
+.B \f[I]FI_TC_DEDICATED_ACCESS\f[R]
 This class operates at the highest priority, except the management
 class.
 It carries a high bandwidth allocation, minimum latency targets, and the
 highest scheduling and arbitration priority.
-.RS
-.RE
 .TP
-.B \f[I]FI_TC_LOW_LATENCY\f[]
+.B \f[I]FI_TC_LOW_LATENCY\f[R]
 This class supports low latency, low jitter data patterns typically
 caused by transactional data exchanges, barrier synchronizations, and
 collective operations that are typical of HPC applications.
@@ -1403,33 +1284,25 @@ transfers must achieve for correct or performance operations.
 Fulfillment of such requests in this class will typically require
 accompanying bandwidth and message size limitations so as not to consume
 excessive bandwidth at high priority.
-.RS
-.RE
 .TP
-.B \f[I]FI_TC_NETWORK_CTRL\f[]
+.B \f[I]FI_TC_NETWORK_CTRL\f[R]
 This class is intended for traffic directly related to fabric (network)
 management, which is critical to the correct operation of the network.
 Its use is typically restricted to privileged network management
 applications.
-.RS
-.RE
 .TP
-.B \f[I]FI_TC_SCAVENGER\f[]
+.B \f[I]FI_TC_SCAVENGER\f[R]
 This class is used for data that is desired but does not have strict
 delivery requirements, such as in\-band network or application level
 monitoring data.
 Use of this class indicates that the traffic is considered lower
 priority and should not interfere with higher priority workflows.
-.RS
-.RE
 .TP
-.B \f[I]fi_tc_dscp_set / fi_tc_dscp_get\f[]
+.B \f[I]fi_tc_dscp_set / fi_tc_dscp_get\f[R]
 DSCP values are supported via the DSCP get and set functions.
 The definitions for DSCP values are outside the scope of libfabric.
 See the fi_tc_dscp_set and fi_tc_dscp_get function definitions for
 details on their use.
-.RS
-.RE
 .SH RECEIVE CONTEXT ATTRIBUTES
 .PP
 Attributes specific to the receive capabilities of an endpoint are
@@ -1437,17 +1310,17 @@ specified using struct fi_rx_attr.
 .IP
 .nf
 \f[C]
-struct\ fi_rx_attr\ {
-\ \ \ \ uint64_t\ \ caps;
-\ \ \ \ uint64_t\ \ mode;
-\ \ \ \ uint64_t\ \ op_flags;
-\ \ \ \ uint64_t\ \ msg_order;
-\ \ \ \ uint64_t\ \ comp_order;
-\ \ \ \ size_t\ \ \ \ total_buffered_recv;
-\ \ \ \ size_t\ \ \ \ size;
-\ \ \ \ size_t\ \ \ \ iov_limit;
+struct fi_rx_attr {
+    uint64_t  caps;
+    uint64_t  mode;
+    uint64_t  op_flags;
+    uint64_t  msg_order;
+    uint64_t  comp_order;
+    size_t    total_buffered_recv;
+    size_t    size;
+    size_t    iov_limit;
 };
-\f[]
+\f[R]
 .fi
 .SS caps \- Capabilities
 .PP
@@ -1489,7 +1362,7 @@ Applicable flags are listed in the Operation Flags section.
 .SS msg_order \- Message Ordering
 .PP
 For a description of message ordering, see the msg_order field in the
-\f[I]Transmit Context Attribute\f[] section.
+\f[I]Transmit Context Attribute\f[R] section.
 Receive context message ordering defines the order in which received
 transport message headers are processed when received by an endpoint.
 When ordering is set, it indicates that message headers will be
@@ -1508,28 +1381,22 @@ FI_ORDER_ATOMIC_WAW.
 .SS comp_order \- Completion Ordering
 .PP
 For a description of completion ordering, see the comp_order field in
-the \f[I]Transmit Context Attribute\f[] section.
+the \f[I]Transmit Context Attribute\f[R] section.
 .TP
-.B \f[I]FI_ORDER_DATA\f[]
+.B \f[I]FI_ORDER_DATA\f[R]
 When set, this bit indicates that received data is written into memory
 in order.
 Data ordering applies to memory accessed as part of a single operation
 and between operations if message ordering is guaranteed.
-.RS
-.RE
 .TP
-.B \f[I]FI_ORDER_NONE\f[]
+.B \f[I]FI_ORDER_NONE\f[R]
 No ordering is defined for completed operations.
 Receive operations may complete in any order, regardless of their
 submission order.
-.RS
-.RE
 .TP
-.B \f[I]FI_ORDER_STRICT\f[]
+.B \f[I]FI_ORDER_STRICT\f[R]
 Receive operations complete in the order in which they are processed by
 the receive context, based on the receive side msg_order attribute.
-.RS
-.RE
 .SS total_buffered_recv
 .PP
 This field is supported for backwards compatibility purposes.
@@ -1553,10 +1420,25 @@ anticipate receiving unexpected messages, rather than modifying this
 value.
 .SS size
 .PP
-The size of the context.
-The size is specified as the minimum number of receive operations that
-may be posted to the endpoint without the operation returning
-\-FI_EAGAIN.
+The size of the receive context.
+The mapping of the size value to resources is provider specific, but it
+is directly related to the number of command entries allocated for the
+endpoint.
+A smaller size value consumes fewer hardware and software resources,
+while a larger size allows queuing more receive requests.
+.PP
+While the size attribute guides the size of the underlying endpoint
+receive queue, there is not necessarily a one\-to\-one mapping between a
+receive operation and a queue entry.
+A single receive operation may consume multiple queue entries; for
+example, one per scatter\-gather entry.
+Additionally, the size field is intended to guide the allocation of the
+endpoint\[cq]s receive context.
+Specifically, for connectionless endpoints, there may be lower\-level
+queues used to track communication on a per\-peer basis.
+The sizes of any lower\-level queues may be significantly smaller than
+the endpoint\[cq]s receive size, in order to reduce resource
+utilization.
 .SS iov_limit
 .PP
 This is the maximum number of IO vectors (scatter\-gather elements) that
@@ -1624,7 +1506,7 @@ Receive contexts are often associated with steering flows, that specify
 which incoming packets targeting a scalable endpoint to process.
 However, receive contexts may be targeted directly by the initiator, if
 supported by the underlying protocol.
-Such contexts are referred to as \[aq]named\[aq].
+Such contexts are referred to as `named'.
 Support for named contexts must be indicated by setting the
 FI_NAMED_RX_CTX capability when the corresponding endpoint is created.
 Support for named receive contexts is coordinated with address vectors.
@@ -1691,7 +1573,7 @@ scalable set of contexts of the alternate type.
 This call is used to open a shareable transmit context (see above for
 details on the transmit context attributes).
 Endpoints associated with a shared transmit context must use a subset of
-the transmit context\[aq]s attributes.
+the transmit context\[cq]s attributes.
 Note that this is the reverse of the requirement for transmit contexts
 for scalable endpoints.
 .SS fi_srx_context
@@ -1699,7 +1581,7 @@ for scalable endpoints.
 This allocates a shareable receive context (see above for details on the
 receive context attributes).
 Endpoints associated with a shared receive context must use a subset of
-the receive context\[aq]s attributes.
+the receive context\[cq]s attributes.
 Note that this is the reverse of the requirement for receive contexts
 for scalable endpoints.
 .SH SOCKET ENDPOINTS
@@ -1747,63 +1629,51 @@ The file descriptor may be retrieved using fi_control.
 .SH OPERATION FLAGS
 .PP
 Operation flags are obtained by OR\-ing the following flags together.
-Operation flags define the default flags applied to an endpoint\[aq]s
+Operation flags define the default flags applied to an endpoint\[cq]s
 data transfer operations, where a flags parameter is not available.
 Data transfer operations that take flags as input override the op_flags
 value of transmit or receive context attributes of an endpoint.
 .TP
-.B \f[I]FI_COMMIT_COMPLETE\f[]
+.B \f[I]FI_COMMIT_COMPLETE\f[R]
 Indicates that a completion should not be generated (locally or at the
 peer) until the result of an operation has been made persistent.
-See \f[C]fi_cq\f[](3) for additional details on completion semantics.
-.RS
-.RE
+See \f[C]fi_cq\f[R](3) for additional details on completion semantics.
 .TP
-.B \f[I]FI_COMPLETION\f[]
+.B \f[I]FI_COMPLETION\f[R]
 Indicates that a completion queue entry should be written for data
 transfer operations.
 This flag only applies to operations issued on an endpoint that was
 bound to a completion queue with the FI_SELECTIVE_COMPLETION flag set,
 otherwise, it is ignored.
 See the fi_ep_bind section above for more detail.
-.RS
-.RE
 .TP
-.B \f[I]FI_DELIVERY_COMPLETE\f[]
+.B \f[I]FI_DELIVERY_COMPLETE\f[R]
 Indicates that a completion should be generated when the operation has
 been processed by the destination endpoint(s).
-See \f[C]fi_cq\f[](3) for additional details on completion semantics.
-.RS
-.RE
+See \f[C]fi_cq\f[R](3) for additional details on completion semantics.
 .TP
-.B \f[I]FI_INJECT\f[]
+.B \f[I]FI_INJECT\f[R]
 Indicates that all outbound data buffers should be returned to the
-user\[aq]s control immediately after a data transfer call returns, even
+user\[cq]s control immediately after a data transfer call returns, even
 if the operation is handled asynchronously.
 This may require that the provider copy the data into a local buffer and
 transfer out of that buffer.
 A provider can limit the total amount of send data that may be buffered
 and/or the size of a single send that can use this flag.
 This limit is indicated using inject_size (see inject_size above).
-.RS
-.RE
 .TP
-.B \f[I]FI_INJECT_COMPLETE\f[]
+.B \f[I]FI_INJECT_COMPLETE\f[R]
 Indicates that a completion should be generated when the source
 buffer(s) may be reused.
-See \f[C]fi_cq\f[](3) for additional details on completion semantics.
-.RS
-.RE
+See \f[C]fi_cq\f[R](3) for additional details on completion semantics.
 .TP
-.B \f[I]FI_MULTICAST\f[]
+.B \f[I]FI_MULTICAST\f[R]
 Indicates that data transfers will target multicast addresses by
 default.
 Any fi_addr_t passed into a data transfer operation will be treated as a
 multicast address.
-.RS
-.RE
 .TP
-.B \f[I]FI_MULTI_RECV\f[]
+.B \f[I]FI_MULTI_RECV\f[R]
 Applies to posted receive operations.
 This flag allows the user to post a single buffer that will receive
 multiple incoming messages.
@@ -1815,15 +1685,11 @@ The placement of received data into the buffer may be subjected to
 provider specific alignment restrictions.
 The buffer will be released by the provider when the available buffer
 space falls below the specified minimum (see FI_OPT_MIN_MULTI_RECV).
-.RS
-.RE
 .TP
-.B \f[I]FI_TRANSMIT_COMPLETE\f[]
+.B \f[I]FI_TRANSMIT_COMPLETE\f[R]
 Indicates that a completion should be generated when the transmit
 operation has completed relative to the local provider.
-See \f[C]fi_cq\f[](3) for additional details on completion semantics.
-.RS
-.RE
+See \f[C]fi_cq\f[R](3) for additional details on completion semantics.
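+.PP
+As an illustrative sketch of the FI_MULTI_RECV flag described above
+(buffer size, context variable, and descriptor handling are
+placeholders):
+.IP
+.nf
+\f[C]
+static char pool[1 << 20];
+struct iovec iov = { .iov_base = pool, .iov_len = sizeof(pool) };
+struct fi_msg msg = {
+    .msg_iov = &iov, .iov_count = 1,
+    .addr = FI_ADDR_UNSPEC, .context = &ctx,
+};
+ssize_t ret;
+
+/* one posted buffer absorbs incoming messages until its free
+   space drops below FI_OPT_MIN_MULTI_RECV */
+ret = fi_recvmsg(ep, &msg, FI_MULTI_RECV);
+\f[R]
+.fi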
 .SH NOTES
 .PP
 Users should call fi_close to release all resources allocated to the
@@ -1873,27 +1739,21 @@ On error, a negative value corresponding to fabric errno is returned.
 For fi_cancel, a return value of 0 indicates that the cancel request was
 submitted for processing.
 .PP
-Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[].
+Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R].
 .SH ERRORS
 .TP
-.B \f[I]\-FI_EDOMAIN\f[]
+.B \f[I]\-FI_EDOMAIN\f[R]
 A resource domain was not bound to the endpoint or an attempt was made
 to bind multiple domains.
-.RS
-.RE
 .TP
-.B \f[I]\-FI_ENOCQ\f[]
+.B \f[I]\-FI_ENOCQ\f[R]
 The endpoint has not been configured with the necessary event queue.
-.RS
-.RE
 .TP
-.B \f[I]\-FI_EOPBADSTATE\f[]
-The endpoint\[aq]s state does not permit the requested operation.
-.RS
-.RE
+.B \f[I]\-FI_EOPBADSTATE\f[R]
+The endpoint\[cq]s state does not permit the requested operation.
 .SH SEE ALSO
 .PP
-\f[C]fi_getinfo\f[](3), \f[C]fi_domain\f[](3), \f[C]fi_cq\f[](3)
-\f[C]fi_msg\f[](3), \f[C]fi_tagged\f[](3), \f[C]fi_rma\f[](3)
+\f[C]fi_getinfo\f[R](3), \f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3)
+\f[C]fi_msg\f[R](3), \f[C]fi_tagged\f[R](3), \f[C]fi_rma\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_eq.3 b/deps/libfabric/man/man3/fi_eq.3
index 47b1bc2880ae027fdc6ccc7658d19a02866cfd49..09243d13cd8b1ff1256fe011f840bfe0ca2955e2 100644
--- a/deps/libfabric/man/man3/fi_eq.3
+++ b/deps/libfabric/man/man3/fi_eq.3
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_eq" "3" "2019\-12\-13" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_eq" "3" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -8,131 +8,93 @@ fi_eq \- Event queue operations
 .TP
 .B fi_eq_open / fi_close
 Open/close an event queue
-.RS
-.RE
 .TP
 .B fi_control
 Control operation of EQ
-.RS
-.RE
 .TP
 .B fi_eq_read / fi_eq_readerr
 Read an event from an event queue
-.RS
-.RE
 .TP
 .B fi_eq_write
 Writes an event to an event queue
-.RS
-.RE
 .TP
 .B fi_eq_sread
 A synchronous (blocking) read of an event queue
-.RS
-.RE
 .TP
 .B fi_eq_strerror
 Converts provider specific error information into a printable string
-.RS
-.RE
 .SH SYNOPSIS
 .IP
 .nf
 \f[C]
-#include\ <rdma/fi_domain.h>
+#include <rdma/fi_domain.h>
 
-int\ fi_eq_open(struct\ fid_fabric\ *fabric,\ struct\ fi_eq_attr\ *attr,
-\ \ \ \ struct\ fid_eq\ **eq,\ void\ *context);
+int fi_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr,
+    struct fid_eq **eq, void *context);
 
-int\ fi_close(struct\ fid\ *eq);
+int fi_close(struct fid *eq);
 
-int\ fi_control(struct\ fid\ *eq,\ int\ command,\ void\ *arg);
+int fi_control(struct fid *eq, int command, void *arg);
 
-ssize_t\ fi_eq_read(struct\ fid_eq\ *eq,\ uint32_t\ *event,
-\ \ \ \ void\ *buf,\ size_t\ len,\ uint64_t\ flags);
+ssize_t fi_eq_read(struct fid_eq *eq, uint32_t *event,
+    void *buf, size_t len, uint64_t flags);
 
-ssize_t\ fi_eq_readerr(struct\ fid_eq\ *eq,\ struct\ fi_eq_err_entry\ *buf,
-\ \ \ \ uint64_t\ flags);
+ssize_t fi_eq_readerr(struct fid_eq *eq, struct fi_eq_err_entry *buf,
+    uint64_t flags);
 
-ssize_t\ fi_eq_write(struct\ fid_eq\ *eq,\ uint32_t\ event,
-\ \ \ \ const\ void\ *buf,\ size_t\ len,\ uint64_t\ flags);
+ssize_t fi_eq_write(struct fid_eq *eq, uint32_t event,
+    const void *buf, size_t len, uint64_t flags);
 
-ssize_t\ fi_eq_sread(struct\ fid_eq\ *eq,\ uint32_t\ *event,
-\ \ \ \ void\ *buf,\ size_t\ len,\ int\ timeout,\ uint64_t\ flags);
+ssize_t fi_eq_sread(struct fid_eq *eq, uint32_t *event,
+    void *buf, size_t len, int timeout, uint64_t flags);
 
-const\ char\ *\ fi_eq_strerror(struct\ fid_eq\ *eq,\ int\ prov_errno,
-\ \ \ \ \ \ const\ void\ *err_data,\ char\ *buf,\ size_t\ len);
-\f[]
+const char * fi_eq_strerror(struct fid_eq *eq, int prov_errno,
+      const void *err_data, char *buf, size_t len);
+\f[R]
 .fi
 .SH ARGUMENTS
 .TP
-.B \f[I]fabric\f[]
+.B \f[I]fabric\f[R]
 Opened fabric descriptor
-.RS
-.RE
 .TP
-.B \f[I]eq\f[]
+.B \f[I]eq\f[R]
 Event queue
-.RS
-.RE
 .TP
-.B \f[I]attr\f[]
+.B \f[I]attr\f[R]
 Event queue attributes
-.RS
-.RE
 .TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
 User specified context associated with the event queue.
-.RS
-.RE
 .TP
-.B \f[I]event\f[]
+.B \f[I]event\f[R]
 Reported event
-.RS
-.RE
 .TP
-.B \f[I]buf\f[]
+.B \f[I]buf\f[R]
 For read calls, the data buffer to write events into.
 For write calls, an event to insert into the event queue.
 For fi_eq_strerror, an optional buffer that receives printable error
 information.
-.RS
-.RE
 .TP
-.B \f[I]len\f[]
+.B \f[I]len\f[R]
 Length of data buffer
-.RS
-.RE
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 Additional flags to apply to the operation
-.RS
-.RE
 .TP
-.B \f[I]command\f[]
+.B \f[I]command\f[R]
 Command of control operation to perform on EQ.
-.RS
-.RE
 .TP
-.B \f[I]arg\f[]
+.B \f[I]arg\f[R]
 Optional control argument
-.RS
-.RE
 .TP
-.B \f[I]prov_errno\f[]
+.B \f[I]prov_errno\f[R]
 Provider specific error value
-.RS
-.RE
 .TP
-.B \f[I]err_data\f[]
+.B \f[I]err_data\f[R]
 Provider specific error data related to a completion
-.RS
-.RE
 .TP
-.B \f[I]timeout\f[]
+.B \f[I]timeout\f[R]
 Timeout specified in milliseconds
-.RS
-.RE
 .SH DESCRIPTION
 .PP
 Event queues are used to report events associated with control
@@ -147,66 +109,54 @@ as listening for connection requests.
 fi_eq_open allocates a new event queue.
 .PP
 The properties and behavior of an event queue are defined by
-\f[C]struct\ fi_eq_attr\f[].
+\f[C]struct fi_eq_attr\f[R].
 .IP
 .nf
 \f[C]
-struct\ fi_eq_attr\ {
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ size;\ \ \ \ \ \ /*\ #\ entries\ for\ EQ\ */
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ \ \ flags;\ \ \ \ \ /*\ operation\ flags\ */
-\ \ \ \ enum\ fi_wait_obj\ \ \ \ \ wait_obj;\ \ /*\ requested\ wait\ object\ */
-\ \ \ \ int\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ signaling_vector;\ /*\ interrupt\ affinity\ */
-\ \ \ \ struct\ fid_wait\ \ \ \ \ *wait_set;\ \ /*\ optional\ wait\ set\ */
+struct fi_eq_attr {
+    size_t               size;      /* # entries for EQ */
+    uint64_t             flags;     /* operation flags */
+    enum fi_wait_obj     wait_obj;  /* requested wait object */
+    int                  signaling_vector; /* interrupt affinity */
+    struct fid_wait     *wait_set;  /* optional wait set */
 };
-\f[]
+\f[R]
 .fi
 .TP
-.B \f[I]size\f[]
+.B \f[I]size\f[R]
 Specifies the minimum size of an event queue.
-.RS
-.RE
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 Flags that control the configuration of the EQ.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WRITE\f[]
+.B \- \f[I]FI_WRITE\f[R]
 Indicates that the application requires support for inserting user
 events into the EQ.
 If this flag is set, then the fi_eq_write operation must be supported by
 the provider.
 If the FI_WRITE flag is not set, then the application may not invoke
 fi_eq_write.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_AFFINITY\f[]
+.B \- \f[I]FI_AFFINITY\f[R]
 Indicates that the signaling_vector field (see below) is valid.
-.RS
-.RE
 .TP
-.B \f[I]wait_obj\f[]
-EQ\[aq]s may be associated with a specific wait object.
+.B \f[I]wait_obj\f[R]
+EQ\[cq]s may be associated with a specific wait object.
 Wait objects allow applications to block until the wait object is
 signaled, indicating that an event is available to be read.
 Users may use fi_control to retrieve the underlying wait object
 associated with an EQ, in order to use it in other system calls.
 The following values may be used to specify the type of wait object
 associated with an EQ:
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_NONE\f[]
+.B \- \f[I]FI_WAIT_NONE\f[R]
 Used to indicate that the user will not block (wait) for events on the
 EQ.
 When FI_WAIT_NONE is specified, the application may not call
 fi_eq_sread.
 This is the default if no wait object is specified.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_UNSPEC\f[]
+.B \- \f[I]FI_WAIT_UNSPEC\f[R]
 Specifies that the user will only wait on the EQ using fabric interface
 calls, such as fi_eq_sread.
 In this case, the underlying provider may select the most appropriate or
@@ -214,49 +164,37 @@ highest performing wait object available, including custom wait
 mechanisms.
 Applications that select FI_WAIT_UNSPEC are not guaranteed to retrieve
 the underlying wait object.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_SET\f[]
+.B \- \f[I]FI_WAIT_SET\f[R]
 Indicates that the event queue should use a wait set object to wait for
 events.
 If specified, the wait_set field must reference an existing wait set
 object.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_FD\f[]
+.B \- \f[I]FI_WAIT_FD\f[R]
 Indicates that the EQ should use a file descriptor as its wait
 mechanism.
 A file descriptor wait object must be usable in select, poll, and epoll
 routines.
 However, a provider may signal an FD wait object by marking it as
 readable or with an error.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_MUTEX_COND\f[]
+.B \- \f[I]FI_WAIT_MUTEX_COND\f[R]
 Specifies that the EQ should use a pthread mutex and cond variable as a
 wait object.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_YIELD\f[]
+.B \- \f[I]FI_WAIT_YIELD\f[R]
 Indicates that the EQ will wait without a wait object but instead yield
 on every wait.
 Allows usage of fi_eq_sread through a spin.
-.RS
-.RE
 .TP
-.B \f[I]signaling_vector\f[]
+.B \f[I]signaling_vector\f[R]
 If the FI_AFFINITY flag is set, this indicates the logical cpu number
 (0..max cpu \- 1) that interrupts associated with the EQ should target.
 This field should be treated as a hint to the provider and may be
 ignored if the provider does not support interrupt affinity.
-.RS
-.RE
 .TP
-.B \f[I]wait_set\f[]
+.B \f[I]wait_set\f[R]
 If wait_obj is FI_WAIT_SET, this field references a wait object to which
 the event queue should attach.
 When an event is inserted into the event queue, the corresponding wait
@@ -264,8 +202,6 @@ set will be signaled if all necessary conditions are met.
 The use of a wait_set enables an optimized method of waiting for events
 across multiple event queues.
 This field is ignored if wait_obj is not FI_WAIT_SET.
-.RS
-.RE
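+.PP
+A minimal sketch of opening an EQ (the attribute values and fabric
+variable are illustrative):
+.IP
+.nf
+\f[C]
+struct fid_eq *eq;
+struct fi_eq_attr attr = {
+    .size     = 64,              /* hint: at least 64 entries */
+    .wait_obj = FI_WAIT_UNSPEC,  /* provider selects wait object */
+};
+int ret;
+
+ret = fi_eq_open(fabric, &attr, &eq, NULL);
+\f[R]
+.fi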
 .SS fi_close
 .PP
 The fi_close call releases all resources associated with an event queue.
@@ -281,25 +217,23 @@ Access to the EQ should be serialized across all calls when fi_control
 is invoked, as it may redirect the implementation of EQ operations.
 The following control commands are usable with an EQ.
 .TP
-.B \f[I]FI_GETWAIT (void **)\f[]
+.B \f[I]FI_GETWAIT (void **)\f[R]
 This command allows the user to retrieve the low\-level wait object
 associated with the EQ.
 The format of the wait\-object is specified during EQ creation, through
 the EQ attributes.
 The fi_control arg parameter should be an address where a pointer to the
 returned wait object will be written.
-This should be an \[aq]int *\[aq] for FI_WAIT_FD, or \[aq]struct
-fi_mutex_cond\[aq] for FI_WAIT_MUTEX_COND.
-.RS
-.RE
+This should be an \[cq]int *\[cq] for FI_WAIT_FD, or `struct
+fi_mutex_cond' for FI_WAIT_MUTEX_COND.
 .IP
 .nf
 \f[C]
-struct\ fi_mutex_cond\ {
-\ \ \ \ pthread_mutex_t\ \ \ \ \ *mutex;
-\ \ \ \ pthread_cond_t\ \ \ \ \ \ *cond;
+struct fi_mutex_cond {
+    pthread_mutex_t     *mutex;
+    pthread_cond_t      *cond;
 };
-\f[]
+\f[R]
 .fi
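+.PP
+For example (a sketch assuming the EQ was opened with FI_WAIT_FD and
+error handling is elided), the wait file descriptor can be retrieved
+and handed to poll():
+.IP
+.nf
+\f[C]
+int fd, ret;
+
+ret = fi_control(&eq\->fid, FI_GETWAIT, &fd);
+
+struct pollfd pfd = { .fd = fd, .events = POLLIN };
+ret = poll(&pfd, 1, \-1);  /* wake when an event may be ready */
+\f[R]
+.fi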
 .SS fi_eq_read
 .PP
@@ -318,26 +252,24 @@ from the EQ.
 The following types of events may be reported to an EQ, along with
 information regarding the format associated with each event.
 .TP
-.B \f[I]Asynchronous Control Operations\f[]
+.B \f[I]Asynchronous Control Operations\f[R]
 Asynchronous control operations are basic requests that simply need to
 generate an event to indicate that they have completed.
 These include the following types of events: memory registration,
 address vector resolution, and multicast joins.
-.RS
-.RE
 .PP
 Control requests report their completion by inserting a
-\f[C]struct\ \ \ fi_eq_entry\f[] into the EQ.
+\f[C]struct   fi_eq_entry\f[R] into the EQ.
 The format of this structure is:
 .IP
 .nf
 \f[C]
-struct\ fi_eq_entry\ {
-\ \ \ \ fid_t\ \ \ \ \ \ \ \ \ \ \ \ fid;\ \ \ \ \ \ \ \ /*\ fid\ associated\ with\ request\ */
-\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ \ *context;\ \ \ \ /*\ operation\ context\ */
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ data;\ \ \ \ \ \ \ /*\ completion\-specific\ data\ */
+struct fi_eq_entry {
+    fid_t            fid;        /* fid associated with request */
+    void            *context;    /* operation context */
+    uint64_t         data;       /* completion\-specific data */
 };
-\f[]
+\f[R]
 .fi
 .PP
 For the completion of basic asynchronous control operations, the
@@ -351,34 +283,33 @@ The context field will be set to the context specified as part of the
 operation, if available, otherwise the context will be associated with
 the fabric descriptor.
 The data field will be set as described in the man page for the
-corresponding object type (e.g., see \f[C]fi_av\f[](3) for a description
-of how asynchronous address vector insertions are completed).
+corresponding object type (e.g., see \f[C]fi_av\f[R](3) for a
+description of how asynchronous address vector insertions are
+completed).
 .TP
-.B \f[I]Connection Notification\f[]
+.B \f[I]Connection Notification\f[R]
 Connection notifications are connection management notifications used to
 set up or tear down connections between endpoints.
 There are three connection notification events: FI_CONNREQ,
 FI_CONNECTED, and FI_SHUTDOWN.
 Connection notifications are reported using
-\f[C]struct\ \ \ fi_eq_cm_entry\f[]:
-.RS
-.RE
+\f[C]struct   fi_eq_cm_entry\f[R]:
 .IP
 .nf
 \f[C]
-struct\ fi_eq_cm_entry\ {
-\ \ \ \ fid_t\ \ \ \ \ \ \ \ \ \ \ \ fid;\ \ \ \ \ \ \ \ /*\ fid\ associated\ with\ request\ */
-\ \ \ \ struct\ fi_info\ \ *info;\ \ \ \ \ \ \ /*\ endpoint\ information\ */
-\ \ \ \ uint8_t\ \ \ \ \ \ \ \ \ data[];\ \ \ \ \ /*\ app\ connection\ data\ */
+struct fi_eq_cm_entry {
+    fid_t            fid;        /* fid associated with request */
+    struct fi_info  *info;       /* endpoint information */
+    uint8_t          data[];     /* app connection data */
 };
-\f[]
+\f[R]
 .fi
 .PP
 A connection request (FI_CONNREQ) event indicates that a remote endpoint
 wishes to establish a new connection to a listening, or passive,
 endpoint.
 The fid is the passive endpoint.
-Information regarding the requested, active endpoint\[aq]s capabilities
+Information regarding the requested, active endpoint\[cq]s capabilities
 and attributes are available from the info field.
 The application is responsible for freeing this structure by calling
 fi_freeinfo when it is no longer needed.
@@ -407,8 +338,8 @@ the connecting peer.
 .PP
 If a connection request has been accepted, an FI_CONNECTED event will be
 generated on both sides of the connection.
-The active side \-\- one that called fi_connect() \-\- may receive user
-data as part of the FI_CONNECTED event.
+The active side \[en] one that called fi_connect() \[en] may receive
+user data as part of the FI_CONNECTED event.
 The user data is passed to the connection manager on the passive side
 through the fi_accept call.
 User data is not provided with an FI_CONNECTED event on the listening
@@ -418,16 +349,14 @@ Notification that a remote peer has disconnected from an active endpoint
 is done through the FI_SHUTDOWN event.
 Shutdown notification uses struct fi_eq_cm_entry as declared above.
 The fid field for a shutdown notification refers to the active
-endpoint\[aq]s fid_ep.
+endpoint\[cq]s fid_ep.
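+.PP
+Tying the connection events together, a passive side might process them
+as in this sketch (the domain and eq variables, CQ bindings, and error
+handling are elided placeholders):
+.IP
+.nf
+\f[C]
+struct fi_eq_cm_entry entry;
+struct fid_ep *ep;
+uint32_t event;
+ssize_t ret;
+
+ret = fi_eq_sread(eq, &event, &entry, sizeof(entry), \-1, 0);
+if (ret >= 0 && event == FI_CONNREQ) {
+    fi_endpoint(domain, entry.info, &ep, NULL);
+    fi_ep_bind(ep, &eq\->fid, 0);
+    fi_enable(ep);
+    fi_accept(ep, NULL, 0);   /* FI_CONNECTED follows on success */
+    fi_freeinfo(entry.info);
+}
+\f[R]
+.fi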
 .TP
-.B \f[I]Asynchronous Error Notification\f[]
+.B \f[I]Asynchronous Error Notification\f[R]
 Asynchronous errors are used to report problems with fabric resources.
 Reported errors may be fatal or transient, based on the error, and
 result in the resource becoming disabled.
 Disabled resources will fail operations submitted against them until
 they are explicitly re\-enabled by the application.
-.RS
-.RE
 .PP
 Asynchronous errors may be reported for completion queues and endpoints
 of all types.
@@ -469,7 +398,7 @@ error completion was found or not.
 .PP
 EQs are optimized to report operations which have completed
 successfully.
-Operations which fail are reported \[aq]out of band\[aq].
+Operations which fail are reported `out of band'.
 Such operations are retrieved using the fi_eq_readerr function.
 When an operation that completes with an unexpected error is inserted
 into an EQ, it is placed into a temporary error queue.
@@ -484,16 +413,16 @@ The format of this structure is defined below.
 .IP
 .nf
 \f[C]
-struct\ fi_eq_err_entry\ {
-\ \ \ \ fid_t\ \ \ \ \ \ \ \ \ \ \ \ fid;\ \ \ \ \ \ \ \ /*\ fid\ associated\ with\ error\ */
-\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ \ *context;\ \ \ \ /*\ operation\ context\ */
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ data;\ \ \ \ \ \ \ /*\ completion\-specific\ data\ */
-\ \ \ \ int\ \ \ \ \ \ \ \ \ \ \ \ \ \ err;\ \ \ \ \ \ \ \ /*\ positive\ error\ code\ */
-\ \ \ \ int\ \ \ \ \ \ \ \ \ \ \ \ \ \ prov_errno;\ /*\ provider\ error\ code\ */
-\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ \ *err_data;\ \ \ /*\ additional\ error\ data\ */
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ err_data_size;\ /*\ size\ of\ err_data\ */
+struct fi_eq_err_entry {
+    fid_t            fid;        /* fid associated with error */
+    void            *context;    /* operation context */
+    uint64_t         data;       /* completion\-specific data */
+    int              err;        /* positive error code */
+    int              prov_errno; /* provider error code */
+    void            *err_data;   /* additional error data */
+    size_t           err_data_size; /* size of err_data */
 };
-\f[]
+\f[R]
 .fi
 .PP
 The fid will reference the fabric descriptor associated with the event.
@@ -503,8 +432,9 @@ The context field will be set to the context specified as part of the
 operation.
 .PP
 The data field will be set as described in the man page for the
-corresponding object type (e.g., see \f[C]fi_av\f[](3) for a description
-of how asynchronous address vector insertions are completed).
+corresponding object type (e.g., see \f[C]fi_av\f[R](3) for a
+description of how asynchronous address vector insertions are
+completed).
 .PP
 The general reason for the error is provided through the err field.
 Provider or operational specific error information may also be available
@@ -532,7 +462,7 @@ The EQ entry data structures share many of the same fields.
 The meanings are the same or similar for all EQ structure formats, with
 specific details described below.
 .TP
-.B \f[I]fid\f[]
+.B \f[I]fid\f[R]
 This corresponds to the fabric descriptor associated with the event.
 The type of fid depends on the event being reported.
 For FI_CONNREQ this will be the fid of the passive endpoint.
@@ -543,60 +473,46 @@ FI_JOIN_COMPLETE will point to the multicast descriptor returned as part
 of the join operation.
 Applications can use the fid\->context value to retrieve the context
 associated with the fabric descriptor.
-.RS
-.RE
 .TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
 The context value is set to the context parameter specified with the
 operation that generated the event.
 If no context parameter is associated with the operation, this field
 will be NULL.
-.RS
-.RE
 .TP
-.B \f[I]data\f[]
+.B \f[I]data\f[R]
 Data is an operation specific value or set of bytes.
 For connection events, data is application data exchanged as part of the
 connection protocol.
-.RS
-.RE
 .TP
-.B \f[I]err\f[]
+.B \f[I]err\f[R]
 This err code is a positive fabric errno associated with an event.
 The err value indicates the general reason for an error, if one
 occurred.
 See fi_errno.3 for a list of possible error codes.
-.RS
-.RE
 .TP
-.B \f[I]prov_errno\f[]
+.B \f[I]prov_errno\f[R]
 On an error, prov_errno may contain a provider specific error code.
 The use of this field and its meaning is provider specific.
 It is intended to be used as a debugging aid.
 See fi_eq_strerror for additional details on converting this error value
 into a human readable string.
-.RS
-.RE
 .TP
-.B \f[I]err_data\f[]
+.B \f[I]err_data\f[R]
 On an error, err_data may reference a provider specific amount of data
 associated with an error.
 The use of this field and its meaning is provider specific.
 It is intended to be used as a debugging aid.
 See fi_eq_strerror for additional details on converting this error data
 into a human readable string.
-.RS
-.RE
 .TP
-.B \f[I]err_data_size\f[]
+.B \f[I]err_data_size\f[R]
 On input, err_data_size indicates the size of the err_data buffer in
 bytes.
 On output, err_data_size will be set to the number of bytes copied to
 the err_data buffer.
 The err_data information is typically used with fi_eq_strerror to
 provide details about the type of error that occurred.
-.RS
-.RE
 .PP
 For compatibility purposes, if err_data_size is 0 on input, or the
 fabric was opened with release < 1.5, err_data will be set to a data
@@ -607,8 +523,8 @@ Applications must serialize access to the EQ when processing errors to
 ensure that the buffer referenced by err_data does not change.
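+.PP
+As a sketch of the error path (the entry types and buffer size are
+illustrative), after an EQ read reports \-FI_EAVAIL:
+.IP
+.nf
+\f[C]
+struct fi_eq_cm_entry entry;
+struct fi_eq_err_entry err = { 0 };
+uint32_t event;
+char msg[128];
+
+if (fi_eq_read(eq, &event, &entry, sizeof(entry), 0) == \-FI_EAVAIL) {
+    fi_eq_readerr(eq, &err, 0);
+    fputs(fi_eq_strerror(eq, err.prov_errno, err.err_data,
+                         msg, sizeof(msg)), stderr);
+}
+\f[R]
+.fi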
 .SH NOTES
 .PP
-If an event queue has been overrun, it will be placed into an
-\[aq]overrun\[aq] state.
+If an event queue has been overrun, it will be placed into an `overrun'
+state.
 Write operations against an overrun EQ will fail with \-FI_EOVERRUN.
 Read operations will continue to return any valid, non\-corrupted
 events, if available.
@@ -621,41 +537,31 @@ additional events once the overrun occurs.
 .B fi_eq_open
 Returns 0 on success.
 On error, a negative value corresponding to fabric errno is returned.
-.RS
-.RE
 .TP
 .B fi_eq_read / fi_eq_readerr
 On success, returns the number of bytes read from the event queue.
 On error, a negative value corresponding to fabric errno is returned.
 If no data is available to be read from the event queue, \-FI_EAGAIN is
 returned.
-.RS
-.RE
 .TP
 .B fi_eq_sread
 On success, returns the number of bytes read from the event queue.
 On error, a negative value corresponding to fabric errno is returned.
 If the timeout expires or the calling thread is signaled and no data is
 available to be read from the event queue, \-FI_EAGAIN is returned.
-.RS
-.RE
 .TP
 .B fi_eq_write
 On success, returns the number of bytes written to the event queue.
 On error, a negative value corresponding to fabric errno is returned.
-.RS
-.RE
 .TP
 .B fi_eq_strerror
 Returns a character string interpretation of the provider specific error
 returned with a completion.
-.RS
-.RE
 .PP
-Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[].
+Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R].
 .SH SEE ALSO
 .PP
-\f[C]fi_getinfo\f[](3), \f[C]fi_endpoint\f[](3), \f[C]fi_domain\f[](3),
-\f[C]fi_cntr\f[](3), \f[C]fi_poll\f[](3)
+\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3),
+\f[C]fi_domain\f[R](3), \f[C]fi_cntr\f[R](3), \f[C]fi_poll\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_errno.3 b/deps/libfabric/man/man3/fi_errno.3
index 1b1c2f7012e9afea98ec5704f03bb07813471a61..919e25481b57efa75926b2cf3aca219c0c92a559 100644
--- a/deps/libfabric/man/man3/fi_errno.3
+++ b/deps/libfabric/man/man3/fi_errno.3
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_errno" "3" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_errno" "3" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -11,234 +11,146 @@ fi_strerror \- Convert fabric error into a printable string
 .IP
 .nf
 \f[C]
-#include\ <rdma/fi_errno.h>
+#include <rdma/fi_errno.h>
 
-const\ char\ *fi_strerror(int\ errno);
-\f[]
+const char *fi_strerror(int errno);
+\f[R]
 .fi
 .SH ERRORS
 .TP
-.B \f[I]FI_ENOENT\f[]
+.B \f[I]FI_ENOENT\f[R]
 No such file or directory.
-.RS
-.RE
 .TP
-.B \f[I]FI_EIO\f[]
+.B \f[I]FI_EIO\f[R]
 I/O error
-.RS
-.RE
 .TP
-.B \f[I]FI_E2BIG\f[]
+.B \f[I]FI_E2BIG\f[R]
 Argument list too long.
-.RS
-.RE
 .TP
-.B \f[I]FI_EBADF\f[]
+.B \f[I]FI_EBADF\f[R]
 Bad file number.
-.RS
-.RE
 .TP
-.B \f[I]FI_EAGAIN\f[]
+.B \f[I]FI_EAGAIN\f[R]
 Try again.
-.RS
-.RE
 .TP
-.B \f[I]FI_ENOMEM\f[]
+.B \f[I]FI_ENOMEM\f[R]
 Out of memory.
-.RS
-.RE
 .TP
-.B \f[I]FI_EACCES\f[]
+.B \f[I]FI_EACCES\f[R]
 Permission denied.
-.RS
-.RE
 .TP
-.B \f[I]FI_EBUSY\f[]
+.B \f[I]FI_EBUSY\f[R]
 Device or resource busy
-.RS
-.RE
 .TP
-.B \f[I]FI_ENODEV\f[]
+.B \f[I]FI_ENODEV\f[R]
 No such device
-.RS
-.RE
 .TP
-.B \f[I]FI_EINVAL\f[]
+.B \f[I]FI_EINVAL\f[R]
 Invalid argument
-.RS
-.RE
 .TP
-.B \f[I]FI_EMFILE\f[]
+.B \f[I]FI_EMFILE\f[R]
 Too many open files
-.RS
-.RE
 .TP
-.B \f[I]FI_ENOSPC\f[]
+.B \f[I]FI_ENOSPC\f[R]
 No space left on device
-.RS
-.RE
 .TP
-.B \f[I]FI_ENOSYS\f[]
+.B \f[I]FI_ENOSYS\f[R]
 Function not implemented
-.RS
-.RE
 .TP
-.B \f[I]FI_ENOMSG\f[]
+.B \f[I]FI_ENOMSG\f[R]
 No message of desired type
-.RS
-.RE
 .TP
-.B \f[I]FI_ENODATA\f[]
+.B \f[I]FI_ENODATA\f[R]
 No data available
-.RS
-.RE
 .TP
-.B \f[I]FI_EMSGSIZE\f[]
+.B \f[I]FI_EMSGSIZE\f[R]
 Message too long
-.RS
-.RE
 .TP
-.B \f[I]FI_ENOPROTOOPT\f[]
+.B \f[I]FI_ENOPROTOOPT\f[R]
 Protocol not available
-.RS
-.RE
 .TP
-.B \f[I]FI_EOPNOTSUPP\f[]
+.B \f[I]FI_EOPNOTSUPP\f[R]
 Operation not supported on transport endpoint
-.RS
-.RE
 .TP
-.B \f[I]FI_EADDRINUSE\f[]
+.B \f[I]FI_EADDRINUSE\f[R]
 Address already in use
-.RS
-.RE
 .TP
-.B \f[I]FI_EADDRNOTAVAIL\f[]
+.B \f[I]FI_EADDRNOTAVAIL\f[R]
 Cannot assign requested address
-.RS
-.RE
 .TP
-.B \f[I]FI_ENETDOWN\f[]
+.B \f[I]FI_ENETDOWN\f[R]
 Network is down
-.RS
-.RE
 .TP
-.B \f[I]FI_ENETUNREACH\f[]
+.B \f[I]FI_ENETUNREACH\f[R]
 Network is unreachable
-.RS
-.RE
 .TP
-.B \f[I]FI_ECONNABORTED\f[]
+.B \f[I]FI_ECONNABORTED\f[R]
 Software caused connection abort
-.RS
-.RE
 .TP
-.B \f[I]FI_ECONNRESET\f[]
+.B \f[I]FI_ECONNRESET\f[R]
 Connection reset by peer
-.RS
-.RE
 .TP
-.B \f[I]FI_EISCONN\f[]
+.B \f[I]FI_EISCONN\f[R]
 Transport endpoint is already connected
-.RS
-.RE
 .TP
-.B \f[I]FI_ENOTCONN\f[]
+.B \f[I]FI_ENOTCONN\f[R]
 Transport endpoint is not connected
-.RS
-.RE
 .TP
-.B \f[I]FI_ESHUTDOWN\f[]
+.B \f[I]FI_ESHUTDOWN\f[R]
 Cannot send after transport endpoint shutdown
-.RS
-.RE
 .TP
-.B \f[I]FI_ETIMEDOUT\f[]
+.B \f[I]FI_ETIMEDOUT\f[R]
 Operation timed out
-.RS
-.RE
 .TP
-.B \f[I]FI_ECONNREFUSED\f[]
+.B \f[I]FI_ECONNREFUSED\f[R]
 Connection refused
-.RS
-.RE
 .TP
-.B \f[I]FI_EHOSTUNREACH\f[]
+.B \f[I]FI_EHOSTUNREACH\f[R]
 No route to host
-.RS
-.RE
 .TP
-.B \f[I]FI_EALREADY\f[]
+.B \f[I]FI_EALREADY\f[R]
 Operation already in progress
-.RS
-.RE
 .TP
-.B \f[I]FI_EINPROGRESS\f[]
+.B \f[I]FI_EINPROGRESS\f[R]
 Operation now in progress
-.RS
-.RE
 .TP
-.B \f[I]FI_EREMOTEIO\f[]
+.B \f[I]FI_EREMOTEIO\f[R]
 Remote I/O error
-.RS
-.RE
 .TP
-.B \f[I]FI_ECANCELED\f[]
+.B \f[I]FI_ECANCELED\f[R]
 Operation Canceled
-.RS
-.RE
 .TP
-.B \f[I]FI_ENOKEY\f[]
+.B \f[I]FI_ENOKEY\f[R]
 Required key not available
-.RS
-.RE
 .TP
-.B \f[I]FI_EKEYREJECTED\f[]
+.B \f[I]FI_EKEYREJECTED\f[R]
 Key was rejected by service
-.RS
-.RE
 .TP
-.B \f[I]FI_EOTHER\f[]
+.B \f[I]FI_EOTHER\f[R]
 Unspecified error
-.RS
-.RE
 .TP
-.B \f[I]FI_ETOOSMALL\f[]
+.B \f[I]FI_ETOOSMALL\f[R]
 Provided buffer is too small
-.RS
-.RE
 .TP
-.B \f[I]FI_EOPBADSTATE\f[]
+.B \f[I]FI_EOPBADSTATE\f[R]
 Operation not permitted in current state
-.RS
-.RE
 .TP
-.B \f[I]FI_EAVAIL\f[]
+.B \f[I]FI_EAVAIL\f[R]
 Error available
-.RS
-.RE
 .TP
-.B \f[I]FI_EBADFLAGS\f[]
+.B \f[I]FI_EBADFLAGS\f[R]
 Flags not supported
-.RS
-.RE
 .TP
-.B \f[I]FI_ENOEQ\f[]
+.B \f[I]FI_ENOEQ\f[R]
 Missing or unavailable event queue
-.RS
-.RE
 .TP
-.B \f[I]FI_EDOMAIN\f[]
+.B \f[I]FI_EDOMAIN\f[R]
 Invalid resource domain
-.RS
-.RE
 .TP
-.B \f[I]FI_ENOCQ\f[]
+.B \f[I]FI_ENOCQ\f[R]
 Missing or unavailable completion queue
-.RS
-.RE
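+.PP
+A brief usage sketch follows; \f[C]warn_fi\f[R] is a hypothetical
+helper name.
+Fabric calls return \-FI_xxx values, while fi_strerror expects the
+positive errno, hence the negation:
+.IP
+.nf
+\f[C]
+#include <rdma/fi_errno.h>
+#include <stdio.h>
+
+/* report a libfabric failure; ret is the negative return value */
+static void warn_fi(const char *what, int ret)
+{
+    fprintf(stderr, "%s: %s\[rs]n", what, fi_strerror(-ret));
+}
+\f[R]
+.fi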
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7)
+\f[C]fabric\f[R](7)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_fabric.3 b/deps/libfabric/man/man3/fi_fabric.3
index 02ca38954d4b9caa420d0aa660edf50cb5aceae0..e35fbda29821b10ebd0c4d4babb21e2427953429 100644
--- a/deps/libfabric/man/man3/fi_fabric.3
+++ b/deps/libfabric/man/man3/fi_fabric.3
@@ -1,71 +1,82 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_fabric" "3" "2020\-06\-02" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_fabric" "3" "2021\-10\-06" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
-fi_fabric \- Fabric domain operations
+fi_fabric \- Fabric network operations
 .TP
 .B fi_fabric / fi_close
-Open / close a fabric domain
-.RS
-.RE
+Open / close a fabric network
 .TP
-.B fi_tostr
+.B fi_tostr / fi_tostr_r
 Convert fabric attributes, flags, and capabilities to printable string
-.RS
-.RE
 .SH SYNOPSIS
 .IP
 .nf
 \f[C]
-#include\ <rdma/fabric.h>
+#include <rdma/fabric.h>
 
-int\ fi_fabric(struct\ fi_fabric_attr\ *attr,
-\ \ \ \ struct\ fid_fabric\ **fabric,\ void\ *context);
+int fi_fabric(struct fi_fabric_attr *attr,
+    struct fid_fabric **fabric, void *context);
 
-int\ fi_close(struct\ fid\ *fabric);
+int fi_close(struct fid *fabric);
 
-char\ *\ fi_tostr(const\ void\ *data,\ enum\ fi_type\ datatype);
-\f[]
+char * fi_tostr(const void *data, enum fi_type datatype);
+
+char * fi_tostr_r(char *buf, size_t len, const void *data,
+    enum fi_type datatype);
+\f[R]
 .fi
 .SH ARGUMENTS
 .TP
-.B \f[I]attr\f[]
+.B \f[I]attr\f[R]
 Attributes of fabric to open.
-.RS
-.RE
 .TP
-.B \f[I]fabric\f[]
-Fabric domain
-.RS
-.RE
+.B \f[I]fabric\f[R]
+Fabric network
 .TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
 User specified context associated with the opened object.
 This context is returned as part of any associated asynchronous event.
-.RS
-.RE
+.TP
+.B \f[I]buf\f[R]
+Output buffer to write string.
+.TP
+.B \f[I]len\f[R]
+Size in bytes of memory referenced by buf.
+.TP
+.B \f[I]data\f[R]
+Input data to convert into a string.
+The format of data is determined by the datatype parameter.
+.TP
+.B \f[I]datatype\f[R]
+Indicates the data to convert to a printable string.
 .SH DESCRIPTION
 .PP
-A fabric domain represents a collection of hardware and software
+A fabric identifier is used to reference opened fabric resources and
+library related objects.
+.PP
+The fabric network represents a collection of hardware and software
 resources that access a single physical or virtual network.
 All network ports on a system that can communicate with each other
-through their attached networks belong to the same fabric domain.
-A fabric domain shares network addresses and can span multiple
+through their attached networks belong to the same fabric.
+A fabric network shares network addresses and can span multiple
 providers.
+An application must open a fabric network prior to allocating other
+network resources, such as communication endpoints.
 .SS fi_fabric
 .PP
-Opens a fabric provider.
+Opens a fabric network provider.
 The attributes of the fabric provider are specified through the open
 call, and may be obtained by calling fi_getinfo.
 .SS fi_close
 .PP
 The fi_close call is used to release all resources associated with a
-fabric domain or interface.
+fabric object.
 All items associated with the opened fabric must be released prior to
 calling fi_close.
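+.PP
+A minimal lifecycle sketch (\f[C]info\f[R] is assumed to come from a
+successful fi_getinfo call; error paths abbreviated):
+.IP
+.nf
+\f[C]
+#include <rdma/fabric.h>
+
+static int open_and_close(struct fi_info *info)
+{
+    struct fid_fabric *fabric;
+    int ret = fi_fabric(info->fabric_attr, &fabric, NULL);
+
+    if (ret)
+        return ret;
+    /* ... open domains, event queues, endpoints here ... */
+    /* all child resources must be closed before the fabric */
+    return fi_close(&fabric->fid);
+}
+\f[R]
+.fi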
-.SS fi_tostr
+.SS fi_tostr / fi_tostr_r
 .PP
 Converts fabric interface attributes, capabilities, flags, and enum
 values into a printable string.
@@ -75,138 +86,96 @@ referenced by the data parameter.
 Valid values for the datatype are listed below, along with the
 corresponding datatype or field value.
 .TP
-.B \f[I]FI_TYPE_INFO\f[]
+.B \f[I]FI_TYPE_INFO\f[R]
 struct fi_info, including all substructures and fields
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_EP_TYPE\f[]
+.B \f[I]FI_TYPE_EP_TYPE\f[R]
 struct fi_info::type field
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_EP_CAP\f[]
+.B \f[I]FI_TYPE_EP_CAP\f[R]
 struct fi_info::ep_cap field
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_OP_FLAGS\f[]
+.B \f[I]FI_TYPE_OP_FLAGS\f[R]
 struct fi_info::op_flags field, or general uint64_t flags
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_ADDR_FORMAT\f[]
+.B \f[I]FI_TYPE_ADDR_FORMAT\f[R]
 struct fi_info::addr_format field
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_TX_ATTR\f[]
+.B \f[I]FI_TYPE_TX_ATTR\f[R]
 struct fi_tx_attr
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_RX_ATTR\f[]
+.B \f[I]FI_TYPE_RX_ATTR\f[R]
 struct fi_rx_attr
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_EP_ATTR\f[]
+.B \f[I]FI_TYPE_EP_ATTR\f[R]
 struct fi_ep_attr
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_DOMAIN_ATTR\f[]
+.B \f[I]FI_TYPE_DOMAIN_ATTR\f[R]
 struct fi_domain_attr
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_FABRIC_ATTR\f[]
+.B \f[I]FI_TYPE_FABRIC_ATTR\f[R]
 struct fi_fabric_attr
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_THREADING\f[]
+.B \f[I]FI_TYPE_THREADING\f[R]
 enum fi_threading
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_PROGRESS\f[]
+.B \f[I]FI_TYPE_PROGRESS\f[R]
 enum fi_progress
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_PROTOCOL\f[]
+.B \f[I]FI_TYPE_PROTOCOL\f[R]
 struct fi_ep_attr::protocol field
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_MSG_ORDER\f[]
+.B \f[I]FI_TYPE_MSG_ORDER\f[R]
 struct fi_ep_attr::msg_order field
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_MODE\f[]
+.B \f[I]FI_TYPE_MODE\f[R]
 struct fi_info::mode field
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_AV_TYPE\f[]
+.B \f[I]FI_TYPE_AV_TYPE\f[R]
 enum fi_av_type
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_ATOMIC_TYPE\f[]
+.B \f[I]FI_TYPE_ATOMIC_TYPE\f[R]
 enum fi_datatype
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_ATOMIC_OP\f[]
+.B \f[I]FI_TYPE_ATOMIC_OP\f[R]
 enum fi_op
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_VERSION\f[]
+.B \f[I]FI_TYPE_VERSION\f[R]
 Returns the library version of libfabric in string form.
 The data parameter is ignored.
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_EQ_EVENT\f[]
+.B \f[I]FI_TYPE_EQ_EVENT\f[R]
 uint32_t event parameter returned from fi_eq_read().
-See \f[C]fi_eq(3)\f[] for a list of known values.
-.RS
-.RE
+See \f[C]fi_eq(3)\f[R] for a list of known values.
 .TP
-.B \f[I]FI_TYPE_CQ_EVENT_FLAGS\f[]
+.B \f[I]FI_TYPE_CQ_EVENT_FLAGS\f[R]
 uint64_t flags field in fi_cq_xxx_entry structures.
-See \f[C]fi_cq(3)\f[] for valid flags.
-.RS
-.RE
+See \f[C]fi_cq(3)\f[R] for valid flags.
 .TP
-.B \f[I]FI_TYPE_MR_MODE\f[]
+.B \f[I]FI_TYPE_MR_MODE\f[R]
 struct fi_domain_attr::mr_mode flags
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_OP_TYPE\f[]
+.B \f[I]FI_TYPE_OP_TYPE\f[R]
 enum fi_op_type
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_FID\f[]
+.B \f[I]FI_TYPE_FID\f[R]
 struct fid *
-.RS
-.RE
 .TP
-.B \f[I]FI_TYPE_HMEM_IFACE\f[]
+.B \f[I]FI_TYPE_HMEM_IFACE\f[R]
 enum fi_hmem_iface *
-.RS
-.RE
+.TP
+.B \f[I]FI_TYPE_CQ_FORMAT\f[R]
+enum fi_cq_format
 .PP
 fi_tostr() will return a pointer to an internal libfabric buffer that
 should not be modified, and will be overwritten the next time fi_tostr()
 is invoked.
 fi_tostr() is not thread safe.
+.PP
+The fi_tostr_r() function is a re\-entrant and thread safe version of
+fi_tostr().
+It writes the string into a buffer provided by the caller.
+fi_tostr_r() returns the start of the caller\[cq]s buffer.
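+.PP
+For illustration, a sketch printing an fi_info structure both ways
+(the buffer size is an arbitrary choice):
+.IP
+.nf
+\f[C]
+#include <rdma/fabric.h>
+#include <stdio.h>
+
+static void dump_info(const struct fi_info *info)
+{
+    char buf[4096];   /* arbitrary size */
+
+    /* shared internal buffer: single-threaded debug output only */
+    puts(fi_tostr(info, FI_TYPE_INFO));
+
+    /* caller-provided buffer: safe to use from multiple threads */
+    puts(fi_tostr_r(buf, sizeof buf, info, FI_TYPE_INFO));
+}
+\f[R]
+.fi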
 .SH NOTES
 .PP
 The following resources are associated with fabric domains: access
@@ -218,14 +187,14 @@ with a fabric and a fabric provider.
 .IP
 .nf
 \f[C]
-struct\ fi_fabric_attr\ {
-\ \ \ \ struct\ fid_fabric\ *fabric;
-\ \ \ \ char\ \ \ \ \ \ \ \ \ \ \ \ \ \ *name;
-\ \ \ \ char\ \ \ \ \ \ \ \ \ \ \ \ \ \ *prov_name;
-\ \ \ \ uint32_t\ \ \ \ \ \ \ \ \ \ prov_version;
-\ \ \ \ uint32_t\ \ \ \ \ \ \ \ \ \ api_version;
+struct fi_fabric_attr {
+    struct fid_fabric *fabric;
+    char              *name;
+    char              *prov_name;
+    uint32_t          prov_version;
+    uint32_t          api_version;
 };
-\f[]
+\f[R]
 .fi
 .SS fabric
 .PP
@@ -247,29 +216,30 @@ A fabric identifier.
 The name of the underlying fabric provider.
 .PP
 To request a utility provider layered over a specific core provider,
-both the provider names have to be specified using ";" as delimiter.
+both provider names must be specified, using \[lq];\[rq] as the
+delimiter.
 .PP
-e.g.
-"ofi_rxm;verbs" or "verbs;ofi_rxm"
+e.g.\ \[lq]ofi_rxm;verbs\[rq] or \[lq]verbs;ofi_rxm\[rq]
 .PP
 For debugging and administrative purposes, environment variables can be
 used to control which fabric providers will be registered with
 libfabric.
-Specifying "FI_PROVIDER=foo,bar" will allow any providers with the names
-"foo" or "bar" to be registered.
-Similarly, specifying "FI_PROVIDER=^foo,bar" will prevent any providers
-with the names "foo" or "bar" from being registered.
+Specifying \[lq]FI_PROVIDER=foo,bar\[rq] will allow any providers with
+the names \[lq]foo\[rq] or \[lq]bar\[rq] to be registered.
+Similarly, specifying \[lq]FI_PROVIDER=\[ha]foo,bar\[rq] will prevent
+any providers with the names \[lq]foo\[rq] or \[lq]bar\[rq] from being
+registered.
 Providers which are not registered will not appear in fi_getinfo
 results.
 Applications which need a specific set of providers should implement
-their own filtering of fi_getinfo\[aq]s results rather than relying on
+their own filtering of fi_getinfo\[cq]s results rather than relying on
 these environment variables in a production setting.
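+.PP
+A sketch of such filtering (\[lq]verbs\[rq] is an arbitrary example;
+strstr is used because layered utility providers report joined names
+such as \[lq]ofi_rxm;verbs\[rq]):
+.IP
+.nf
+\f[C]
+#include <rdma/fabric.h>
+#include <string.h>
+
+static struct fi_info *pick_provider(struct fi_info *info,
+        const char *name)
+{
+    struct fi_info *cur;
+
+    for (cur = info; cur; cur = cur->next)
+        if (strstr(cur->fabric_attr->prov_name, name))
+            return cur;   /* first entry naming the provider */
+    return NULL;
+}
+\f[R]
+.fi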
 .SS prov_version \- Provider Version
 .PP
 Version information for the fabric provider, in a major.minor format.
 The use of the FI_MAJOR() and FI_MINOR() version macros may be used to
 extract the major and minor version data.
-See \f[C]fi_version(3)\f[].
+See \f[C]fi_version(3)\f[R].
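+.PP
+e.g.\ (a sketch, with \f[C]info\f[R] as returned by fi_getinfo):
+.IP
+.nf
+\f[C]
+uint32_t ver = info->fabric_attr->prov_version;
+
+printf("provider %s %u.%u\[rs]n", info->fabric_attr->prov_name,
+       FI_MAJOR(ver), FI_MINOR(ver));
+\f[R]
+.fi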
 .PP
 In the case of a utility provider layered over a core provider, the version
 would always refer to that of the utility provider.
@@ -277,16 +247,16 @@ would always refer to that of the utility provider.
 .PP
 The interface version requested by the application.
 This value corresponds to the version parameter passed into
-\f[C]fi_getinfo(3)\f[].
+\f[C]fi_getinfo(3)\f[R].
 .SH RETURN VALUE
 .PP
 Returns FI_SUCCESS on success.
 On error, a negative value corresponding to fabric errno is returned.
-Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[].
+Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R].
 .SH ERRORS
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7), \f[C]fi_getinfo\f[](3), \f[C]fi_domain\f[](3),
-\f[C]fi_eq\f[](3), \f[C]fi_endpoint\f[](3)
+\f[C]fabric\f[R](7), \f[C]fi_getinfo\f[R](3), \f[C]fi_domain\f[R](3),
+\f[C]fi_eq\f[R](3), \f[C]fi_endpoint\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_getinfo.3 b/deps/libfabric/man/man3/fi_getinfo.3
index 7a7b24c522574a69f4803e818a0415896d2efb47..d62f365bca77f213ff0bf0404aa86d13e6db8575 100644
--- a/deps/libfabric/man/man3/fi_getinfo.3
+++ b/deps/libfabric/man/man3/fi_getinfo.3
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_getinfo" "3" "2020\-08\-07" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_getinfo" "3" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -11,51 +11,39 @@ fi_allocinfo, fi_dupinfo \- Allocate / duplicate an fi_info structure
 .IP
 .nf
 \f[C]
-#include\ <rdma/fabric.h>
+#include <rdma/fabric.h>
 
-int\ fi_getinfo(int\ version,\ const\ char\ *node,\ const\ char\ *service,
-\ \ \ \ \ \ \ \ uint64_t\ flags,\ const\ struct\ fi_info\ *hints,\ struct\ fi_info\ **info);
+int fi_getinfo(int version, const char *node, const char *service,
+        uint64_t flags, const struct fi_info *hints, struct fi_info **info);
 
-void\ fi_freeinfo(struct\ fi_info\ *info);
+void fi_freeinfo(struct fi_info *info);
 
-struct\ fi_info\ *fi_allocinfo(void);
+struct fi_info *fi_allocinfo(void);
 
-struct\ fi_info\ *fi_dupinfo(const\ struct\ fi_info\ *info);
-\f[]
+struct fi_info *fi_dupinfo(const struct fi_info *info);
+\f[R]
 .fi
 .SH ARGUMENTS
 .TP
-.B \f[I]version\f[]
+.B \f[I]version\f[R]
 Interface version requested by application.
-.RS
-.RE
 .TP
-.B \f[I]node\f[]
+.B \f[I]node\f[R]
 Optional, name or fabric address to resolve.
-.RS
-.RE
 .TP
-.B \f[I]service\f[]
+.B \f[I]service\f[R]
 Optional, service name or port number of address.
-.RS
-.RE
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 Operation flags for the fi_getinfo call.
-.RS
-.RE
 .TP
-.B \f[I]hints\f[]
+.B \f[I]hints\f[R]
 Reference to an fi_info structure that specifies criteria for selecting
 the returned fabric information.
-.RS
-.RE
 .TP
-.B \f[I]info\f[]
+.B \f[I]info\f[R]
 A pointer to a linked list of fi_info structures containing response
 information.
-.RS
-.RE
 .SH DESCRIPTION
 .PP
 fi_getinfo returns information about available fabric services for
@@ -69,7 +57,7 @@ and the call will return \-FI_ENODATA.
 Based on the input hints, node, and service parameters, a list of fabric
 domains and endpoints will be returned.
 Each fi_info structure will describe an endpoint that meets the
-application\[aq]s specified communication criteria.
+application\[cq]s specified communication criteria.
 Each endpoint will be associated with a domain.
 Applications can restrict the number of returned endpoints by including
 additional criteria in their search hints.
@@ -136,85 +124,71 @@ substructures within it.
 .IP
 .nf
 \f[C]
-struct\ fi_info\ {
-\ \ \ \ struct\ fi_info\ \ \ \ \ \ \ \ *next;
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ caps;
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ mode;
-\ \ \ \ uint32_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ addr_format;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ src_addrlen;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ dest_addrlen;
-\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ *src_addr;
-\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ *dest_addr;
-\ \ \ \ fid_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ handle;
-\ \ \ \ struct\ fi_tx_attr\ \ \ \ \ *tx_attr;
-\ \ \ \ struct\ fi_rx_attr\ \ \ \ \ *rx_attr;
-\ \ \ \ struct\ fi_ep_attr\ \ \ \ \ *ep_attr;
-\ \ \ \ struct\ fi_domain_attr\ *domain_attr;
-\ \ \ \ struct\ fi_fabric_attr\ *fabric_attr;
-\ \ \ \ struct\ fid_nic\ \ \ \ \ \ \ \ *nic;
+struct fi_info {
+    struct fi_info        *next;
+    uint64_t              caps;
+    uint64_t              mode;
+    uint32_t              addr_format;
+    size_t                src_addrlen;
+    size_t                dest_addrlen;
+    void                  *src_addr;
+    void                  *dest_addr;
+    fid_t                 handle;
+    struct fi_tx_attr     *tx_attr;
+    struct fi_rx_attr     *rx_attr;
+    struct fi_ep_attr     *ep_attr;
+    struct fi_domain_attr *domain_attr;
+    struct fi_fabric_attr *fabric_attr;
+    struct fid_nic        *nic;
 };
-\f[]
+\f[R]
 .fi
 .TP
-.B \f[I]next\f[]
+.B \f[I]next\f[R]
 Pointer to the next fi_info structure in the list.
 Will be NULL if no more structures exist.
-.RS
-.RE
 .TP
-.B \f[I]caps \- fabric interface capabilities\f[]
+.B \f[I]caps \- fabric interface capabilities\f[R]
 If specified, indicates the desired capabilities of the fabric
 interfaces.
-Supported capabilities are listed in the \f[I]Capabilities\f[] section
+Supported capabilities are listed in the \f[I]Capabilities\f[R] section
 below.
-.RS
-.RE
 .TP
-.B \f[I]mode\f[]
+.B \f[I]mode\f[R]
 Operational modes supported by the application.
-See the \f[I]Mode\f[] section below.
-.RS
-.RE
+See the \f[I]Mode\f[R] section below.
 .TP
-.B \f[I]addr_format \- address format\f[]
+.B \f[I]addr_format \- address format\f[R]
 If specified, indicates the format of addresses referenced by the fabric
 interfaces and data structures.
-Supported formats are listed in the \f[I]Addressing formats\f[] section
+Supported formats are listed in the \f[I]Addressing formats\f[R] section
 below.
-.RS
-.RE
 .TP
-.B \f[I]src_addrlen \- source address length\f[]
+.B \f[I]src_addrlen \- source address length\f[R]
 Indicates the length of the source address.
-This value must be > 0 if \f[I]src_addr\f[] is non\-NULL.
+This value must be > 0 if \f[I]src_addr\f[R] is non\-NULL.
 This field will be ignored in hints if FI_SOURCE flag is set, or
-\f[I]src_addr\f[] is NULL.
-.RS
-.RE
+\f[I]src_addr\f[R] is NULL.
 .TP
-.B \f[I]dest_addrlen \- destination address length\f[]
+.B \f[I]dest_addrlen \- destination address length\f[R]
 Indicates the length of the destination address.
-This value must be > 0 if \f[I]dest_addr\f[] is non\-NULL.
+This value must be > 0 if \f[I]dest_addr\f[R] is non\-NULL.
 This field will be ignored in hints unless the node and service
-parameters are NULL or FI_SOURCE flag is set, or if \f[I]dst_addr\f[] is
-NULL.
-.RS
-.RE
+parameters are NULL or FI_SOURCE flag is set, or if \f[I]dst_addr\f[R]
+is NULL.
 .TP
-.B \f[I]src_addr \- source address\f[]
+.B \f[I]src_addr \- source address\f[R]
 If specified, indicates the source address.
 This field will be ignored in hints if FI_SOURCE flag is set.
 On output a provider shall return an address that corresponds to the
 indicated fabric, domain, node, and/or service fields.
 The format of the address is indicated by the returned
-\f[I]addr_format\f[] field.
+\f[I]addr_format\f[R] field.
 Note that any returned address is only used when opening a local
 endpoint.
 The address is not guaranteed to be usable by a peer process.
-.RS
-.RE
 .TP
-.B \f[I]dest_addr \- destination address\f[]
+.B \f[I]dest_addr \- destination address\f[R]
 If specified, indicates the destination address.
 This field will be ignored in hints unless the node and service
 parameters are NULL or FI_SOURCE flag is set.
@@ -222,10 +196,8 @@ If FI_SOURCE is not specified, on output a provider shall return an
 address that corresponds to the indicated node and/or service fields,
 relative to the fabric and domain.
 Note that any returned address is only usable locally.
-.RS
-.RE
 .TP
-.B \f[I]handle \- provider context handle\f[]
+.B \f[I]handle \- provider context handle\f[R]
 The use of this field is operation specific.
 If hints\->handle is set to struct fid_pep, the hints\->handle will be
 copied to info\->handle on output from fi_getinfo.
@@ -233,14 +205,12 @@ Other values of hints\->handle will be handled in a provider specific
 manner.
 The fi_info::handle field is also used by fi_endpoint() and fi_reject()
 calls when processing connection requests or to inherit another
-endpoint\[aq]s attributes.
-See \f[C]fi_eq\f[](3), \f[C]fi_reject\f[](3), and
-\f[C]fi_endpoint\f[](3).
+endpoint\[cq]s attributes.
+See \f[C]fi_eq\f[R](3), \f[C]fi_reject\f[R](3), and
+\f[C]fi_endpoint\f[R](3).
 The info\->handle field will be ignored by fi_dupinfo and fi_freeinfo.
-.RS
-.RE
 .TP
-.B \f[I]tx_attr \- transmit context attributes\f[]
+.B \f[I]tx_attr \- transmit context attributes\f[R]
 Optionally supplied transmit context attributes.
 Transmit context attributes may be specified and returned as part of
 fi_getinfo.
@@ -250,10 +220,8 @@ On output, the actual transmit context attributes that can be provided
 will be returned.
 Output values will be greater than or equal to the requested input
 values.
-.RS
-.RE
 .TP
-.B \f[I]rx_attr \- receive context attributes\f[]
+.B \f[I]rx_attr \- receive context attributes\f[R]
 Optionally supplied receive context attributes.
 Receive context attributes may be specified and returned as part of
 fi_getinfo.
@@ -263,10 +231,8 @@ On output, the actual receive context attributes that can be provided
 will be returned.
 Output values will be greater than or equal to the requested input
 values.
-.RS
-.RE
 .TP
-.B \f[I]ep_attr \- endpoint attributes\f[]
+.B \f[I]ep_attr \- endpoint attributes\f[R]
 Optionally supplied endpoint attributes.
 Endpoint attributes may be specified and returned as part of fi_getinfo.
 When provided as hints, requested values of struct fi_ep_attr should be
@@ -274,11 +240,9 @@ set.
 On output, the actual endpoint attributes that can be provided will be
 returned.
 Output values will be greater than or equal to requested input values.
-See \f[C]fi_endpoint\f[](3) for details.
-.RS
-.RE
+See \f[C]fi_endpoint\f[R](3) for details.
 .TP
-.B \f[I]domain_attr \- domain attributes\f[]
+.B \f[I]domain_attr \- domain attributes\f[R]
 Optionally supplied domain attributes.
 Domain attributes may be specified and returned as part of fi_getinfo.
 When provided as hints, requested values of struct fi_domain_attr should
@@ -286,29 +250,23 @@ be set.
 On output, the actual domain attributes that can be provided will be
 returned.
 Output values will be greater than or equal to requested input values.
-See \f[C]fi_domain\f[](3) for details.
-.RS
-.RE
+See \f[C]fi_domain\f[R](3) for details.
 .TP
-.B \f[I]fabric_attr \- fabric attributes\f[]
+.B \f[I]fabric_attr \- fabric attributes\f[R]
 Optionally supplied fabric attributes.
 Fabric attributes may be specified and returned as part of fi_getinfo.
 When provided as hints, requested values of struct fi_fabric_attr should
 be set.
 On output, the actual fabric attributes that can be provided will be
 returned.
-See \f[C]fi_fabric\f[](3) for details.
-.RS
-.RE
+See \f[C]fi_fabric\f[R](3) for details.
 .TP
-.B \f[I]nic \- network interface details\f[]
+.B \f[I]nic \- network interface details\f[R]
 Optional attributes related to the hardware NIC associated with the
 specified fabric, domain, and endpoint data.
 This field is only valid for providers where the corresponding
 attributes are closely associated with a hardware NIC.
-See \f[C]fi_nic\f[](3) for details.
-.RS
-.RE
+See \f[C]fi_nic\f[R](3) for details.
 .SH CAPABILITIES
 .PP
 Interface capabilities are obtained by OR\-ing the following flags
@@ -325,7 +283,7 @@ Applications may use this feature to request a minimal set of
 requirements, then check the returned capabilities to enable additional
 optimizations.
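+.PP
+A sketch of that pattern, using FI_DIRECTED_RECV (described below) as
+the optional capability; version and capability choices are
+illustrative:
+.IP
+.nf
+\f[C]
+#include <rdma/fabric.h>
+
+static void probe_caps(void)
+{
+    struct fi_info *hints = fi_allocinfo(), *info;
+
+    hints->caps = FI_MSG;   /* minimal requirement */
+    if (!fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0, hints, &info)) {
+        if (info->caps & FI_DIRECTED_RECV) {
+            /* provider offers more than requested: enable
+             * source-matched receives as an optimization */
+        }
+        fi_freeinfo(info);
+    }
+    fi_freeinfo(hints);
+}
+\f[R]
+.fi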
 .TP
-.B \f[I]FI_ATOMIC\f[]
+.B \f[I]FI_ATOMIC\f[R]
 Specifies that the endpoint supports some set of atomic operations.
 Endpoints supporting this capability support operations defined by
 struct fi_ops_atomic.
@@ -334,25 +292,19 @@ initiate and be the target of remote atomic reads and writes.
 Applications can use the FI_READ, FI_WRITE, FI_REMOTE_READ, and
 FI_REMOTE_WRITE flags to restrict the types of atomic operations
 supported by an endpoint.
-.RS
-.RE
 .TP
-.B \f[I]FI_COLLECTIVE\f[]
+.B \f[I]FI_COLLECTIVE\f[R]
 Requests support for collective operations.
 Endpoints that support this capability support the collective operations
-defined in \f[C]fi_collective\f[](3).
-.RS
-.RE
+defined in \f[C]fi_collective\f[R](3).
 .TP
-.B \f[I]FI_DIRECTED_RECV\f[]
+.B \f[I]FI_DIRECTED_RECV\f[R]
 Requests that the communication endpoint use the source address of an
 incoming message when matching it with a receive buffer.
 If this capability is not set, then the src_addr parameter for msg and
 tagged receive operations is ignored.
-.RS
-.RE
 .TP
-.B \f[I]FI_FENCE\f[]
+.B \f[I]FI_FENCE\f[R]
 Indicates that the endpoint supports the FI_FENCE flag on data transfer
 operations.
 Support requires tracking that all previous transmit requests to a
@@ -361,16 +313,12 @@ operation.
 Fenced operations are often used to enforce ordering between operations
 that are not otherwise guaranteed by the underlying provider or
 protocol.
-.RS
-.RE
 .TP
-.B \f[I]FI_HMEM\f[]
+.B \f[I]FI_HMEM\f[R]
 Specifies that the endpoint should support transfers to and from device
 memory.
-.RS
-.RE
 .TP
-.B \f[I]FI_LOCAL_COMM\f[]
+.B \f[I]FI_LOCAL_COMM\f[R]
 Indicates that the endpoint supports host local communication.
 This flag may be used in conjunction with FI_REMOTE_COMM to indicate
 that local and remote communication are required.
@@ -380,17 +328,13 @@ affects performance.
 Providers that set FI_LOCAL_COMM but not FI_REMOTE_COMM, for example a
 shared memory provider, may only be used for communication between
 processes on the same system.
-.RS
-.RE
 .TP
-.B \f[I]FI_MSG\f[]
+.B \f[I]FI_MSG\f[R]
 Specifies that an endpoint should support sending and receiving messages
 or datagrams.
 Message capabilities imply support for send and/or receive queues.
 Endpoints supporting this capability support operations defined by
 struct fi_ops_msg.
-.RS
-.RE
 .PP
 The caps may be used to specify or restrict the type of messaging
 operations that are supported.
@@ -399,67 +343,51 @@ and receive messages.
 Applications can use the FI_SEND and FI_RECV flags to optimize an
 endpoint as send\-only or receive\-only.
 .TP
-.B \f[I]FI_MULTICAST\f[]
+.B \f[I]FI_MULTICAST\f[R]
 Indicates that the endpoint supports multicast data transfers.
 This capability must be paired with FI_MSG.
 Applications can use FI_SEND and FI_RECV to optimize multicast as
 send\-only or receive\-only.
-.RS
-.RE
 .TP
-.B \f[I]FI_MULTI_RECV\f[]
+.B \f[I]FI_MULTI_RECV\f[R]
 Specifies that the endpoint must support the FI_MULTI_RECV flag when
 posting receive buffers.
-.RS
-.RE
 .TP
-.B \f[I]FI_NAMED_RX_CTX\f[]
+.B \f[I]FI_NAMED_RX_CTX\f[R]
 Requests that endpoints which support multiple receive contexts allow an
 initiator to target (or name) a specific receive context as part of a
 data transfer operation.
-.RS
-.RE
 .TP
-.B \f[I]FI_READ\f[]
+.B \f[I]FI_READ\f[R]
 Indicates that the user requires an endpoint capable of initiating reads
 against remote memory regions.
 This flag requires that FI_RMA and/or FI_ATOMIC be set.
-.RS
-.RE
 .TP
-.B \f[I]FI_RECV\f[]
+.B \f[I]FI_RECV\f[R]
 Indicates that the user requires an endpoint capable of receiving
 message data transfers.
 Message transfers include base message operations as well as tagged
 message functionality.
-.RS
-.RE
 .TP
-.B \f[I]FI_REMOTE_COMM\f[]
+.B \f[I]FI_REMOTE_COMM\f[R]
 Indicates that the endpoint supports communication with endpoints located
 at remote nodes (across the fabric).
 See FI_LOCAL_COMM for additional details.
 Providers that set FI_REMOTE_COMM but not FI_LOCAL_COMM, for example
 NICs that lack loopback support, cannot be used to communicate with
 processes on the same system.
-.RS
-.RE
 .TP
-.B \f[I]FI_REMOTE_READ\f[]
+.B \f[I]FI_REMOTE_READ\f[R]
 Indicates that the user requires an endpoint capable of receiving read
 memory operations from remote endpoints.
 This flag requires that FI_RMA and/or FI_ATOMIC be set.
-.RS
-.RE
 .TP
-.B \f[I]FI_REMOTE_WRITE\f[]
+.B \f[I]FI_REMOTE_WRITE\f[R]
 Indicates that the user requires an endpoint capable of receiving write
 memory operations from remote endpoints.
 This flag requires that FI_RMA and/or FI_ATOMIC be set.
-.RS
-.RE
 .TP
-.B \f[I]FI_RMA\f[]
+.B \f[I]FI_RMA\f[R]
 Specifies that the endpoint should support RMA read and write
 operations.
 Endpoints supporting this capability support operations defined by
@@ -469,43 +397,33 @@ initiate and be the target of remote memory reads and writes.
 Applications can use the FI_READ, FI_WRITE, FI_REMOTE_READ, and
 FI_REMOTE_WRITE flags to restrict the types of RMA operations supported
 by an endpoint.
-.RS
-.RE
 .TP
-.B \f[I]FI_RMA_EVENT\f[]
+.B \f[I]FI_RMA_EVENT\f[R]
 Requests that an endpoint support the generation of completion events
 when it is the target of an RMA and/or atomic operation.
 This flag requires that FI_REMOTE_READ and/or FI_REMOTE_WRITE be enabled
 on the endpoint.
-.RS
-.RE
 .TP
-.B \f[I]FI_RMA_PMEM\f[]
-Indicates that the provider is \[aq]persistent memory aware\[aq] and
-supports RMA operations to and from persistent memory.
+.B \f[I]FI_RMA_PMEM\f[R]
+Indicates that the provider is `persistent memory aware' and supports
+RMA operations to and from persistent memory.
 Persistent memory aware providers must support registration of memory
 that is backed by non\-volatile memory, RMA transfers to/from
 persistent memory, and enhanced completion semantics.
 This flag requires that FI_RMA be set.
 This capability is experimental.
-.RS
-.RE
 .TP
-.B \f[I]FI_SEND\f[]
+.B \f[I]FI_SEND\f[R]
 Indicates that the user requires an endpoint capable of sending message
 data transfers.
 Message transfers include base message operations as well as tagged
 message functionality.
-.RS
-.RE
 .TP
-.B \f[I]FI_SHARED_AV\f[]
+.B \f[I]FI_SHARED_AV\f[R]
 Requests or indicates support for address vectors which may be shared
 among multiple processes.
-.RS
-.RE
 .TP
-.B \f[I]FI_SOURCE\f[]
+.B \f[I]FI_SOURCE\f[R]
 Requests that the endpoint return source addressing data as part of its
 completion data.
 This capability only applies to connectionless endpoints.
@@ -515,10 +433,8 @@ available in the underlying protocol in order to provide the requested
 data, which may adversely affect performance.
 The performance impact may be greater for address vectors of type
 FI_AV_TABLE.
-.RS
-.RE
 .TP
-.B \f[I]FI_SOURCE_ERR\f[]
+.B \f[I]FI_SOURCE_ERR\f[R]
 Must be paired with FI_SOURCE.
 When specified, this requests that raw source addressing data be
 returned as part of completion data for any address that has not been
@@ -526,10 +442,8 @@ inserted into the local address vector.
 Use of this capability may require the provider to validate incoming
 source address data against addresses stored in the local address
 vector, which may adversely affect performance.
-.RS
-.RE
 .TP
-.B \f[I]FI_TAGGED\f[]
+.B \f[I]FI_TAGGED\f[R]
 Specifies that the endpoint should handle tagged message transfers.
 Tagged message transfers associate a user\-specified key or tag with
 each message that is used for matching purposes at the remote side.
@@ -539,34 +453,26 @@ In the absence of any relevant flags, FI_TAGGED implies the ability to
 send and receive tagged messages.
 Applications can use the FI_SEND and FI_RECV flags to optimize an
 endpoint as send\-only or receive\-only.
-.RS
-.RE
 .TP
-.B \f[I]FI_TRIGGER\f[]
+.B \f[I]FI_TRIGGER\f[R]
 Indicates that the endpoint should support triggered operations.
 Endpoints supporting this capability must meet the usage model as described
 by fi_trigger.3.
-.RS
-.RE
 .TP
-.B \f[I]FI_VARIABLE_MSG\f[]
+.B \f[I]FI_VARIABLE_MSG\f[R]
 Requests that the provider must notify a receiver when a variable length
 message is ready to be received prior to attempting to place the data.
 Such notification will include the size of the message and any
 associated message tag (for FI_TAGGED).
-See \[aq]Variable Length Messages\[aq] in fi_msg.3 for full details.
+See `Variable Length Messages' in fi_msg.3 for full details.
 Variable length messages are any messages larger than an endpoint
 configurable size.
 This flag requires that FI_MSG and/or FI_TAGGED be set.
-.RS
-.RE
 .TP
-.B \f[I]FI_WRITE\f[]
+.B \f[I]FI_WRITE\f[R]
 Indicates that the user requires an endpoint capable of initiating
 writes against remote memory regions.
 This flag requires that FI_RMA and/or FI_ATOMIC be set.
-.RS
-.RE
 .PP
 Capabilities may be grouped into three general categories: primary,
 secondary, and primary modifiers.
@@ -612,10 +518,10 @@ achieve high\-performance.
 Mode bits that remain set indicate application requirements for using
 the fabric interfaces created using the returned fi_info.
 The set of modes are listed below.
-If a NULL hints structure is provided, then the provider\[aq]s supported
+If a NULL hints structure is provided, then the provider\[cq]s supported
 set of modes will be returned in the info structure(s).
 .TP
-.B \f[I]FI_ASYNC_IOV\f[]
+.B \f[I]FI_ASYNC_IOV\f[R]
 Applications can reference multiple data buffers as part of a single
 operation through the use of IO vectors (SGEs).
 Typically, the contents of an IO vector are copied by the provider into
@@ -627,10 +533,8 @@ buffering needed for the IO vectors.
 When set, an application must not modify an IO vector of length > 1,
 including any related memory descriptor array, until the associated
 operation has completed.
-.RS
-.RE
 .TP
-.B \f[I]FI_BUFFERED_RECV\f[]
+.B \f[I]FI_BUFFERED_RECV\f[R]
 The buffered receive mode bit indicates that the provider owns the data
 buffer(s) that are accessed by the networking layer for received
 messages.
@@ -639,12 +543,10 @@ buffer into the application buffer.
 Applications that can handle message processing from network allocated
 data buffers can set this mode bit to avoid copies.
 For full details on application requirements to support this mode, see
-the \[aq]Buffered Receives\[aq] section in \f[C]fi_msg\f[](3).
+the `Buffered Receives' section in \f[C]fi_msg\f[R](3).
 This mode bit applies to FI_MSG and FI_TAGGED receive operations.
-.RS
-.RE
 .TP
-.B \f[I]FI_CONTEXT\f[]
+.B \f[I]FI_CONTEXT\f[R]
 Specifies that the provider requires that applications use struct
 fi_context as their per operation context parameter for operations that
 generate full completions.
@@ -660,28 +562,23 @@ Doing so is likely to result in stack corruption that will be difficult
 to debug.
 Users should not update or interpret the fields in this structure, or
 reuse it until the original operation has completed.
-If an operation does not generate a completion (i.e.
-the endpoint was configured with FI_SELECTIVE_COMPLETION and the
-operation was not initiated with the FI_COMPLETION flag) then the
-context parameter is ignored by the fabric provider.
+If an operation does not generate a completion (i.e.\ the endpoint was
+configured with FI_SELECTIVE_COMPLETION and the operation was not
+initiated with the FI_COMPLETION flag) then the context parameter is
+ignored by the fabric provider.
 The structure is specified in rdma/fabric.h.
-.RS
-.RE
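+.PP
+A sketch of the required usage; \f[C]post_send\f[R] is a hypothetical
+wrapper, and the context must remain allocated and untouched until the
+matching completion has been read:
+.IP
+.nf
+\f[C]
+#include <rdma/fi_endpoint.h>
+
+static struct fi_context tx_ctx;   /* one per outstanding operation */
+
+static ssize_t post_send(struct fid_ep *ep, const void *buf,
+        size_t len, void *desc, fi_addr_t dest)
+{
+    /* tx_ctx must stay untouched until its completion is read */
+    return fi_send(ep, buf, len, desc, dest, &tx_ctx);
+}
+\f[R]
+.fi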
 .TP
-.B \f[I]FI_CONTEXT2\f[]
-This bit is similar to FI_CONTEXT, but doubles the provider\[aq]s
+.B \f[I]FI_CONTEXT2\f[R]
+This bit is similar to FI_CONTEXT, but doubles the provider\[cq]s
 requirement on the size of the per context structure.
 When set, this specifies that the provider requires that applications
 use struct fi_context2 as their per operation context parameter.
 Or, optionally, an application can provide an array of two fi_context
-structures (e.g.
-struct fi_context[2]) instead.
+structures (e.g.\ struct fi_context[2]) instead.
 The requirements for using struct fi_context2 are identical as defined
 for FI_CONTEXT above.
-.RS
-.RE
 .TP
-.B \f[I]FI_LOCAL_MR\f[]
+.B \f[I]FI_LOCAL_MR\f[R]
 The provider is optimized around having applications register memory for
 locally accessed data buffers.
 Data buffers used in send and receive operations and as the source
@@ -690,12 +587,10 @@ application for access domains opened with this capability.
 This flag is defined for compatibility and is ignored if the application
 version is 1.5 or later and the domain mr_mode is set to anything other
 than FI_MR_BASIC or FI_MR_SCALABLE.
-See the domain attribute mr_mode \f[C]fi_domain\f[](3) and
-\f[C]fi_mr\f[](3).
-.RS
-.RE
+See the domain attribute mr_mode \f[C]fi_domain\f[R](3) and
+\f[C]fi_mr\f[R](3).
 .TP
-.B \f[I]FI_MSG_PREFIX\f[]
+.B \f[I]FI_MSG_PREFIX\f[R]
 Message prefix mode indicates that an application will provide buffer
 space in front of all message send and receive buffers for use by the
 provider.
@@ -705,8 +600,6 @@ The contents of the prefix space should be treated as opaque.
 The use of FI_MSG_PREFIX may improve application performance over
 certain providers by reducing the number of IO vectors referenced by
 underlying hardware and eliminating provider buffer allocation.
-.RS
-.RE
 .PP
 FI_MSG_PREFIX only applies to send and receive operations, including
 tagged sends and receives.
@@ -736,34 +629,27 @@ For scatter\-gather send/recv operations, the prefix buffer must be a
 contiguous region, though it may or may not be directly adjacent to the
 payload portion of the buffer.
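+.PP
+A send\-side sketch of prefix handling; \f[C]send_with_prefix\f[R] is
+a hypothetical wrapper, the copy is shown for clarity, and the buffer
+must remain valid until the operation completes (freeing is omitted):
+.IP
+.nf
+\f[C]
+#include <rdma/fi_endpoint.h>
+#include <rdma/fi_errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+static ssize_t send_with_prefix(struct fid_ep *ep,
+        const struct fi_info *info, const void *payload, size_t len,
+        void *desc, fi_addr_t dest, void *ctx)
+{
+    size_t prefix = info->ep_attr->msg_prefix_size;
+    char *buf = malloc(prefix + len);
+
+    if (!buf)
+        return -FI_ENOMEM;
+    memcpy(buf + prefix, payload, len);  /* data follows the prefix */
+    return fi_send(ep, buf, prefix + len, desc, dest, ctx);
+}
+\f[R]
+.fi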
 .TP
-.B \f[I]FI_NOTIFY_FLAGS_ONLY\f[]
+.B \f[I]FI_NOTIFY_FLAGS_ONLY\f[R]
 This bit indicates that general completion flags may not be set by the
 provider, and are not needed by the application.
 If specified, completion flags which simply report the type of operation
-that completed (e.g.
-send or receive) may not be set.
+that completed (e.g.\ send or receive) may not be set.
 However, completion flags that are used for remote notifications will
 still be set when applicable.
-See \f[C]fi_cq\f[](3) for details on which completion flags are valid
+See \f[C]fi_cq\f[R](3) for details on which completion flags are valid
 when this mode bit is enabled.
-.RS
-.RE
 .TP
-.B \f[I]FI_RESTRICTED_COMP\f[]
+.B \f[I]FI_RESTRICTED_COMP\f[R]
 This bit indicates that the application will only share completion
 queues and counters among endpoints, transmit contexts, and receive
 contexts that have the same set of capability flags.
-.RS
-.RE
 .TP
-.B \f[I]FI_RX_CQ_DATA\f[]
+.B \f[I]FI_RX_CQ_DATA\f[R]
 This mode bit only applies to data transfers that set FI_REMOTE_CQ_DATA.
 When set, a data transfer that carries remote CQ data will consume a
 receive buffer at the target.
 This is true even for operations that would normally not consume posted
 receive buffers, such as RMA write operations.
-.RS
-.RE
 .SH ADDRESSING FORMATS
 .PP
 Multiple fabric interfaces take as input either a source or destination
@@ -777,48 +663,40 @@ these operations.
 A provider may support one or more of the following addressing formats.
 In some cases, a selected addressing format may need to be translated or
 mapped into an address which is native to the fabric.
-See \f[C]fi_av\f[](3).
+See \f[C]fi_av\f[R](3).
 .TP
-.B \f[I]FI_ADDR_BGQ\f[]
+.B \f[I]FI_ADDR_BGQ\f[R]
 Address is an IBM proprietary format that is used with their Blue Gene Q
 systems.
-.RS
-.RE
 .TP
-.B \f[I]FI_ADDR_EFA\f[]
+.B \f[I]FI_ADDR_EFA\f[R]
 Address is an Amazon Elastic Fabric Adapter (EFA) proprietary format.
-.RS
-.RE
 .TP
-.B \f[I]FI_ADDR_GNI\f[]
+.B \f[I]FI_ADDR_GNI\f[R]
 Address is a Cray proprietary format that is used with their GNI
 protocol.
-.RS
-.RE
 .TP
-.B \f[I]FI_ADDR_PSMX\f[]
+.B \f[I]FI_ADDR_PSMX\f[R]
 Address is an Intel proprietary format used with their Performance
 Scaled Messaging protocol.
-.RS
-.RE
 .TP
-.B \f[I]FI_ADDR_PSMX2\f[]
+.B \f[I]FI_ADDR_PSMX2\f[R]
 Address is an Intel proprietary format used with their Performance
 Scaled Messaging protocol version 2.
-.RS
-.RE
 .TP
-.B \f[I]FI_ADDR_STR\f[]
+.B \f[I]FI_ADDR_PSMX3\f[R]
+Address is an Intel proprietary format used with their Performance
+Scaled Messaging protocol version 3.
+.TP
+.B \f[I]FI_ADDR_STR\f[R]
 Address is a formatted character string.
 The length and content of the string are address and/or provider
 specific, but in general follow a URI model:
-.RS
-.RE
 .IP
 .nf
 \f[C]
 address_format[://[node][:[service][/[field3]...][?[key=value][&k2=v2]...]]]
-\f[]
+\f[R]
 .fi
 .PP
 Examples: \- fi_sockaddr://10.31.6.12:7471 \-
@@ -829,7 +707,7 @@ Since the string formatted address does not contain any provider
 information, the prov_name field of the fabric attribute structure
 should be used to filter by provider if necessary.
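+.PP
+e.g.\ resolving a string\-formatted destination (a sketch; the address
+literal is a placeholder):
+.IP
+.nf
+\f[C]
+#include <rdma/fabric.h>
+
+static int lookup_str_addr(struct fi_info **info)
+{
+    struct fi_info *hints = fi_allocinfo();
+    int ret;
+
+    hints->addr_format = FI_ADDR_STR;
+    ret = fi_getinfo(FI_VERSION(1, 14),
+                     "fi_sockaddr://10.31.6.12:7471", NULL, 0,
+                     hints, info);
+    fi_freeinfo(hints);
+    return ret;
+}
+\f[R]
+.fi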
 .TP
-.B \f[I]FI_FORMAT_UNSPEC\f[]
+.B \f[I]FI_FORMAT_UNSPEC\f[R]
 FI_FORMAT_UNSPEC indicates that a provider specific address format
 should be selected.
 Provider specific addresses may be protocol specific or a vendor
@@ -840,50 +718,36 @@ FI_FORMAT_UNSPEC targets apps which make use of an out of band address
 exchange.
 Applications which use FI_FORMAT_UNSPEC may use fi_getname() to obtain a
 provider specific address assigned to an allocated endpoint.
-.RS
-.RE
 .TP
-.B \f[I]FI_SOCKADDR\f[]
+.B \f[I]FI_SOCKADDR\f[R]
 Address is of type sockaddr.
 The specific socket address format will be determined at run time by
 interfaces examining the sa_family field.
-.RS
-.RE
 .TP
-.B \f[I]FI_SOCKADDR_IB\f[]
+.B \f[I]FI_SOCKADDR_IB\f[R]
 Address is of type sockaddr_ib (defined in Linux kernel source)
-.RS
-.RE
 .TP
-.B \f[I]FI_SOCKADDR_IN\f[]
+.B \f[I]FI_SOCKADDR_IN\f[R]
 Address is of type sockaddr_in (IPv4).
-.RS
-.RE
 .TP
-.B \f[I]FI_SOCKADDR_IN6\f[]
+.B \f[I]FI_SOCKADDR_IN6\f[R]
 Address is of type sockaddr_in6 (IPv6).
-.RS
-.RE
 .TP
-.B \f[I]FI_ADDR_PSMX\f[]
+.B \f[I]FI_ADDR_PSMX\f[R]
 Address is an Intel proprietary format that is used with their PSMX
 (extended performance scaled messaging) protocol.
-.RS
-.RE
 .SH FLAGS
 .PP
 The operation of the fi_getinfo call may be controlled through the use
 of input flags.
 Valid flags include the following.
 .TP
-.B \f[I]FI_NUMERICHOST\f[]
+.B \f[I]FI_NUMERICHOST\f[R]
 Indicates that the node parameter is a numeric string representation of
 a fabric address, such as a dotted decimal IP address.
 Use of this flag will suppress any lengthy name resolution protocol.
-.RS
-.RE
 .TP
-.B \f[I]FI_PROV_ATTR_ONLY\f[]
+.B \f[I]FI_PROV_ATTR_ONLY\f[R]
 Indicates that the caller is only querying for what providers are
 potentially available.
 All providers will return exactly one fi_info struct, regardless of
@@ -892,23 +756,19 @@ The returned fi_info struct will contain default values for all members,
 with the exception of fabric_attr.
 The fabric_attr member will have the prov_name and prov_version values
 filled in.
-.RS
-.RE
 .TP
-.B \f[I]FI_SOURCE\f[]
+.B \f[I]FI_SOURCE\f[R]
 Indicates that the node and service parameters specify the local source
 address to associate with an endpoint.
 If specified, either the node and/or service parameter must be
 non\-NULL.
 This flag is often used with passive endpoints.
-.RS
-.RE
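+.PP
+e.g.\ (a sketch; the node and service literals are placeholders, with
+\f[C]hints\f[R] and \f[C]info\f[R] as in the preceding examples):
+.IP
+.nf
+\f[C]
+/* node/service name the local address to bind to */
+ret = fi_getinfo(FI_VERSION(1, 14), "192.168.0.1", "7471",
+                 FI_SOURCE, hints, &info);
+\f[R]
+.fi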
 .SH RETURN VALUE
 .PP
 fi_getinfo() returns 0 on success.
 On error, fi_getinfo() returns a negative value corresponding to fabric
 errno.
-Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[].
+Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R].
 .PP
 fi_allocinfo() returns a pointer to a new fi_info structure on success,
 or NULL on error.
@@ -919,34 +779,28 @@ Both calls require that the returned fi_info structure be freed via
 fi_freeinfo().
 .SH ERRORS
 .TP
-.B \f[I]FI_EBADFLAGS\f[]
+.B \f[I]FI_EBADFLAGS\f[R]
 The specified endpoint or domain capability or operation flags are
 invalid.
-.RS
-.RE
 .TP
-.B \f[I]FI_ENODATA\f[]
+.B \f[I]FI_ENODATA\f[R]
 Indicates that no providers could be found which support the requested
 fabric information.
-.RS
-.RE
 .TP
-.B \f[I]FI_ENOMEM\f[]
+.B \f[I]FI_ENOMEM\f[R]
 Indicates that there was insufficient memory to complete the operation.
-.RS
-.RE
 .SH NOTES
 .PP
 If hints are provided, the operation will be controlled by the values
 that are supplied in the various fields (see section on
-\f[I]fi_info\f[]).
+\f[I]fi_info\f[R]).
 Applications that require specific communication interfaces, domains,
 capabilities or other requirements, can specify them using fields in
-\f[I]hints\f[].
-Libfabric returns a linked list in \f[I]info\f[] that points to a list
+\f[I]hints\f[R].
+Libfabric returns a linked list in \f[I]info\f[R] that points to a list
 of matching interfaces.
-\f[I]info\f[] is set to NULL if there are no communication interfaces or
-none match the input hints.
+\f[I]info\f[R] is set to NULL if there are no communication interfaces
+or none match the input hints.
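+.PP
+Putting these pieces together, a typical query sequence looks as
+follows (a sketch; the criteria are illustrative and NULL checks are
+omitted):
+.IP
+.nf
+\f[C]
+#include <rdma/fabric.h>
+
+static struct fi_info *query(void)
+{
+    struct fi_info *hints = fi_allocinfo(), *info = NULL;
+
+    hints->ep_attr->type = FI_EP_RDM;   /* example criteria */
+    hints->caps = FI_MSG;
+    if (fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0, hints, &info))
+        info = NULL;   /* no matching interfaces */
+    fi_freeinfo(hints);
+    return info;       /* caller walks ->next, then fi_freeinfo() */
+}
+\f[R]
+.fi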
 .PP
 If node is provided, fi_getinfo will attempt to resolve the fabric
 address to the given node.
@@ -958,11 +812,11 @@ by fi_getinfo.
 If neither node, service, nor hints are provided, then fi_getinfo simply
 returns the list of all available communication interfaces.
 .PP
-Multiple threads may call \f[C]fi_getinfo\f[] simultaneously, without
+Multiple threads may call \f[C]fi_getinfo\f[R] simultaneously, without
 any requirement for serialization.
 .SH SEE ALSO
 .PP
-\f[C]fi_open\f[](3), \f[C]fi_endpoint\f[](3), \f[C]fi_domain\f[](3),
-\f[C]fi_nic\f[](3)
+\f[C]fi_open\f[R](3), \f[C]fi_endpoint\f[R](3), \f[C]fi_domain\f[R](3),
+\f[C]fi_nic\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_mr.3 b/deps/libfabric/man/man3/fi_mr.3
index afcc664dd1a4805922202c3e174edc17084ef7c6..2f54aba8d5870de7e286610aac7d962171eb329b 100644
--- a/deps/libfabric/man/man3/fi_mr.3
+++ b/deps/libfabric/man/man3/fi_mr.3
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_mr" "3" "2020\-08\-11" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_mr" "3" "2021\-11\-12" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -8,161 +8,118 @@ fi_mr \- Memory region operations
 .TP
 .B fi_mr_reg / fi_mr_regv / fi_mr_regattr
 Register local memory buffers for direct fabric access
-.RS
-.RE
 .TP
 .B fi_close
 Deregister registered memory buffers.
-.RS
-.RE
 .TP
 .B fi_mr_desc
 Return a local descriptor associated with a registered memory region
-.RS
-.RE
 .TP
 .B fi_mr_key
 Return the remote key needed to access a registered memory region
-.RS
-.RE
 .TP
 .B fi_mr_raw_attr
 Return raw memory region attributes.
-.RS
-.RE
 .TP
 .B fi_mr_map_raw
 Converts a raw memory region key into a key that is usable for data
 transfer operations.
-.RS
-.RE
 .TP
 .B fi_mr_unmap_key
 Releases a previously mapped raw memory region key.
-.RS
-.RE
 .TP
 .B fi_mr_bind
 Associate a registered memory region with a completion counter or an
 endpoint.
-.RS
-.RE
 .TP
 .B fi_mr_refresh
 Updates the memory pages associated with a memory region.
-.RS
-.RE
 .TP
 .B fi_mr_enable
 Enables a memory region for use.
-.RS
-.RE
 .SH SYNOPSIS
 .IP
 .nf
 \f[C]
-#include\ <rdma/fi_domain.h>
+#include <rdma/fi_domain.h>
 
-int\ fi_mr_reg(struct\ fid_domain\ *domain,\ const\ void\ *buf,\ size_t\ len,
-\ \ \ \ uint64_t\ access,\ uint64_t\ offset,\ uint64_t\ requested_key,
-\ \ \ \ uint64_t\ flags,\ struct\ fid_mr\ **mr,\ void\ *context);
+int fi_mr_reg(struct fid_domain *domain, const void *buf, size_t len,
+    uint64_t access, uint64_t offset, uint64_t requested_key,
+    uint64_t flags, struct fid_mr **mr, void *context);
 
-int\ fi_mr_regv(struct\ fid_domain\ *domain,\ const\ struct\ iovec\ *\ iov,
-\ \ \ \ size_t\ count,\ uint64_t\ access,\ uint64_t\ offset,\ uint64_t\ requested_key,
-\ \ \ \ uint64_t\ flags,\ struct\ fid_mr\ **mr,\ void\ *context);
+int fi_mr_regv(struct fid_domain *domain, const struct iovec * iov,
+    size_t count, uint64_t access, uint64_t offset, uint64_t requested_key,
+    uint64_t flags, struct fid_mr **mr, void *context);
 
-int\ fi_mr_regattr(struct\ fid_domain\ *domain,\ const\ struct\ fi_mr_attr\ *attr,
-\ \ \ \ uint64_t\ flags,\ struct\ fid_mr\ **mr);
+int fi_mr_regattr(struct fid_domain *domain, const struct fi_mr_attr *attr,
+    uint64_t flags, struct fid_mr **mr);
 
-int\ fi_close(struct\ fid\ *mr);
+int fi_close(struct fid *mr);
 
-void\ *\ fi_mr_desc(struct\ fid_mr\ *mr);
+void * fi_mr_desc(struct fid_mr *mr);
 
-uint64_t\ fi_mr_key(struct\ fid_mr\ *mr);
+uint64_t fi_mr_key(struct fid_mr *mr);
 
-int\ fi_mr_raw_attr(struct\ fid_mr\ *mr,\ uint64_t\ *base_addr,
-\ \ \ \ uint8_t\ *raw_key,\ size_t\ *key_size,\ uint64_t\ flags);
+int fi_mr_raw_attr(struct fid_mr *mr, uint64_t *base_addr,
+    uint8_t *raw_key, size_t *key_size, uint64_t flags);
 
-int\ fi_mr_map_raw(struct\ fid_domain\ *domain,\ uint64_t\ base_addr,
-\ \ \ \ uint8_t\ *raw_key,\ size_t\ key_size,\ uint64_t\ *key,\ uint64_t\ flags);
+int fi_mr_map_raw(struct fid_domain *domain, uint64_t base_addr,
+    uint8_t *raw_key, size_t key_size, uint64_t *key, uint64_t flags);
 
-int\ fi_mr_unmap_key(struct\ fid_domain\ *domain,\ uint64_t\ key);
+int fi_mr_unmap_key(struct fid_domain *domain, uint64_t key);
 
-int\ fi_mr_bind(struct\ fid_mr\ *mr,\ struct\ fid\ *bfid,\ uint64_t\ flags);
+int fi_mr_bind(struct fid_mr *mr, struct fid *bfid, uint64_t flags);
 
-int\ fi_mr_refresh(struct\ fid_mr\ *mr,\ const\ struct\ iovec\ *iov,\ size,\ count,
-\ \ \ \ uint64_t\ flags)
+int fi_mr_refresh(struct fid_mr *mr, const struct iovec *iov,
+    size_t count, uint64_t flags);
 
-int\ fi_mr_enable(struct\ fid_mr\ *mr);
-\f[]
+int fi_mr_enable(struct fid_mr *mr);
+\f[R]
 .fi
 .SH ARGUMENTS
 .TP
-.B \f[I]domain\f[]
+.B \f[I]domain\f[R]
 Resource domain
-.RS
-.RE
 .TP
-.B \f[I]mr\f[]
+.B \f[I]mr\f[R]
 Memory region
-.RS
-.RE
 .TP
-.B \f[I]bfid\f[]
+.B \f[I]bfid\f[R]
 Fabric identifier of an associated resource.
-.RS
-.RE
 .TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
 User specified context associated with the memory region.
-.RS
-.RE
 .TP
-.B \f[I]buf\f[]
-Memory buffer to register with the fabric hardware
-.RS
-.RE
+.B \f[I]buf\f[R]
+Memory buffer to register with the fabric hardware.
 .TP
-.B \f[I]len\f[]
-Length of memory buffer to register
-.RS
-.RE
+.B \f[I]len\f[R]
+Length of memory buffer to register.
+Must be > 0.
 .TP
-.B \f[I]iov\f[]
+.B \f[I]iov\f[R]
 Vectored memory buffer.
-.RS
-.RE
 .TP
-.B \f[I]count\f[]
+.B \f[I]count\f[R]
 Count of vectored buffer entries.
-.RS
-.RE
 .TP
-.B \f[I]access\f[]
+.B \f[I]access\f[R]
 Memory access permissions associated with registration
-.RS
-.RE
 .TP
-.B \f[I]offset\f[]
+.B \f[I]offset\f[R]
 Optional specified offset for accessing specified registered buffers.
 This parameter is reserved for future use and must be 0.
-.RS
-.RE
 .TP
-.B \f[I]requested_key\f[]
-Optional requested remote key associated with registered buffers.
-.RS
-.RE
+.B \f[I]requested_key\f[R]
+Requested remote key associated with registered buffers.
+Parameter is ignored if FI_MR_PROV_KEY flag is set in the domain mr_mode
+bits.
 .TP
-.B \f[I]attr\f[]
+.B \f[I]attr\f[R]
 Memory region attributes
-.RS
-.RE
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 Additional flags to apply to the operation.
-.RS
-.RE
 .SH DESCRIPTION
 .PP
 Registered memory regions associate memory buffers with permissions
@@ -173,33 +130,46 @@ Additionally, a fabric provider may require that data buffers be
 registered before being used in local transfers.
 Memory registration restrictions are controlled using a separate set of
 mode bits, specified through the domain attributes (mr_mode field).
+Each mr_mode bit requires that an application take specific steps in
+order to use memory buffers with libfabric interfaces.
 .PP
 The following apply to memory registration.
 .TP
-.B \f[I]Scalable Memory Registration\f[]
-By default, memory registration is considered scalable.
-(For library versions 1.4 and earlier, this is indicated by setting
-mr_mode to FI_MR_SCALABLE, with the fi_info mode bit FI_LOCAL_MR set to
-0).
-For versions 1.5 and later, scalable is implied by the lack of any
-mr_mode bits being set.
-The setting of mr_mode bits therefore adjusts application behavior as
-described below.
-Default, scalable registration has several properties.
-.RS
-.RE
-.PP
-In scalable mode, registration occurs on memory address ranges.
-Because registration refers to memory regions, versus data buffers, the
-address ranges given for a registration request do not need to map to
-data buffers allocated by the application at the time the registration
-call is made.
+.B \f[I]Default Memory Registration\f[R]
+If no mr_mode bits are set, the default behaviors described below are
+followed.
+Historically, these defaults were collectively referred to as scalable
+memory registration.
+The default requirements are outlined below, followed by a description
+of how each mr_mode bit alters them.
+.PP
+Compatibility: For library versions 1.4 and earlier, this was indicated
+by setting mr_mode to FI_MR_SCALABLE and the fi_info mode bit
+FI_LOCAL_MR to 0.
+FI_MR_SCALABLE and FI_LOCAL_MR were deprecated in libfabric version 1.5,
+though they are supported for backwards compatibility purposes.
+.PP
+For security, memory registration is required for data buffers that are
+accessed directly by a peer process.
+For example, registration is required for RMA target buffers (read or
+written to), and those accessed by atomic or collective operations.
+.PP
+By default, registration occurs on virtual address ranges.
+Because registration refers to address ranges, rather than allocated
+data buffers, the address ranges do not need to map to data buffers
+allocated by the application at the time the registration call is made.
 That is, an application can register any range of addresses in their
 virtual address space, whether or not those addresses are backed by
 physical pages or have been allocated.
 .PP
-The resulting memory regions are accessible by peers starting at a base
-address of 0.
+Note that physical pages must back addresses prior to the addresses
+being accessed as part of a data transfer operation, or the data
+transfers will fail.
+Additionally, depending on the operation, this could result in the local
+process receiving a segmentation fault for accessing invalid memory.
+.PP
+Once registered, the resulting memory regions are accessible by peers
+starting at a base address of 0.
 That is, the target address that is specified is a byte offset into the
 registered region.
 .PP
@@ -208,24 +178,37 @@ The key size is restricted to a maximum of 8 bytes.
 .PP
 With scalable registration, locally accessed data buffers are not
 registered.
-This includes source buffers for all transmit operations \-\- sends,
-tagged sends, RMA, and atomics \-\- as well as buffers posted for
+This includes source buffers for all transmit operations \[en] sends,
+tagged sends, RMA, and atomics \[en] as well as buffers posted for
 receive and tagged receive operations.
-.TP
-.B \f[I]FI_MR_LOCAL\f[]
+.PP
+Although the default memory registration behavior is convenient for
+application developers, it is difficult to implement in hardware.
+Attempts to hide the hardware requirements from the application often
+result in significant and unacceptable impacts on performance.
+The following mr_mode bits are provided as input into fi_getinfo.
+If a provider requires the behavior defined for an mr_mode bit, it will
+leave the bit set on output to fi_getinfo.
+Otherwise, the provider can clear the bit to indicate that the behavior
+is not needed.
+.PP
+By setting an mr_mode bit, the application has agreed to adjust its
+behavior as indicated.
+Importantly, applications that choose to support an mr_mode must be
+prepared to handle the case where the mr_mode is not required.
+A provider will clear an mr_mode bit if it is not needed.
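+.PP
+A minimal sketch of this negotiation (error handling is omitted and
+the version value is illustrative):
+.IP
+.nf
+\f[C]
+#include <rdma/fi_domain.h>
+
+struct fi_info *hints, *info;
+
+hints = fi_allocinfo();
+/* offer the registration behaviors this application can support */
+hints\->domain_attr\->mr_mode = FI_MR_LOCAL | FI_MR_VIRT_ADDR |
+    FI_MR_ALLOCATED | FI_MR_PROV_KEY;
+fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0, hints, &info);
+
+/* any bit still set on output is required by the provider */
+if (info\->domain_attr\->mr_mode & FI_MR_LOCAL) {
+    /* local buffers must be registered, see FI_MR_LOCAL below */
+}
+\f[R]
+.fi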
+.TP
+.B \f[I]FI_MR_LOCAL\f[R]
 When the FI_MR_LOCAL mode bit is set, applications must register all
 data buffers that will be accessed by the local hardware and provide a
 valid desc parameter into applicable data transfer operations.
 When FI_MR_LOCAL is zero, applications are not required to register data
-buffers before using them for local operations (e.g.
-send and receive data buffers).
+buffers before using them for local operations (e.g.\ send and receive
+data buffers).
 The desc parameter into data transfer operations will be ignored in this
-case, unless otherwise required (e.g.
-se FI_MR_HMEM).
+case, unless otherwise required (e.g.\ see FI_MR_HMEM).
 It is recommended that applications pass in NULL for desc when not
 required.
-.RS
-.RE
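+.PP
+A minimal sketch (the domain, endpoint, buffer, and peer address are
+assumed to exist):
+.IP
+.nf
+\f[C]
+struct fid_mr *mr;
+void *desc;
+
+/* register the local send buffer, required when FI_MR_LOCAL is set */
+fi_mr_reg(domain, buf, len, FI_SEND, 0, 0, 0, &mr, NULL);
+desc = fi_mr_desc(mr);
+
+/* the descriptor accompanies any transfer that uses the buffer */
+fi_send(ep, buf, len, desc, dest_addr, NULL);
+\f[R]
+.fi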
 .PP
 A provider may hide local registration requirements from applications by
 making use of an internal registration cache or similar mechanisms.
@@ -240,33 +223,27 @@ Note: the FI_MR_LOCAL mr_mode bit replaces the FI_LOCAL_MR fi_info mode
 bit.
 When FI_MR_LOCAL is set, FI_LOCAL_MR is ignored.
 .TP
-.B \f[I]FI_MR_RAW\f[]
+.B \f[I]FI_MR_RAW\f[R]
 Raw memory regions are used to support providers with keys larger than
 64\-bits or require setup at the peer.
 When the FI_MR_RAW bit is set, applications must use fi_mr_raw_attr()
 locally and fi_mr_map_raw() at the peer before targeting a memory region
 as part of any data transfer request.
-.RS
-.RE
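+.PP
+A hedged sketch of that flow (the out\-of\-band exchange of base_addr,
+raw_key, and key_size between the peers is assumed):
+.IP
+.nf
+\f[C]
+uint8_t raw_key[64];    /* provider keys may exceed 8 bytes */
+size_t key_size = sizeof(raw_key);
+uint64_t base_addr, key;
+
+/* initiator: fetch the raw key and send it to the peer */
+fi_mr_raw_attr(mr, &base_addr, raw_key, &key_size, 0);
+
+/* peer: map the raw key into a locally usable 64\-bit key */
+fi_mr_map_raw(domain, base_addr, raw_key, key_size, &key, 0);
+\f[R]
+.fi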
 .TP
-.B \f[I]FI_MR_VIRT_ADDR\f[]
+.B \f[I]FI_MR_VIRT_ADDR\f[R]
 The FI_MR_VIRT_ADDR bit indicates that the provider references memory
 regions by virtual address, rather than a 0\-based offset.
 Peers that target memory regions registered with FI_MR_VIRT_ADDR specify
-the destination memory buffer using the target\[aq]s virtual address,
+the destination memory buffer using the target\[cq]s virtual address,
 with any offset into the region specified as virtual address + offset.
 Support of this bit typically implies that peers must exchange
 addressing data prior to initiating any RMA or atomic operation.
-.RS
-.RE
 .TP
-.B \f[I]FI_MR_ALLOCATED\f[]
+.B \f[I]FI_MR_ALLOCATED\f[R]
 When set, all registered memory regions must be backed by physical
 memory pages at the time the registration call is made.
-.RS
-.RE
 .TP
-.B \f[I]FI_MR_PROV_KEY\f[]
+.B \f[I]FI_MR_PROV_KEY\f[R]
 This memory region mode indicates that the provider does not support
 application requested MR keys.
 MR keys are returned by the provider.
@@ -274,10 +251,8 @@ Applications that support FI_MR_PROV_KEY can obtain the provider key
 using fi_mr_key(), unless FI_MR_RAW is also set.
 The returned key should then be exchanged with peers prior to initiating
 an RMA or atomic operation.
-.RS
-.RE
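+.PP
+For example (mr is a registered region and FI_MR_RAW is not set):
+.IP
+.nf
+\f[C]
+/* the provider chose the key; retrieve and exchange it with peers */
+uint64_t key = fi_mr_key(mr);
+\f[R]
+.fi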
 .TP
-.B \f[I]FI_MR_MMU_NOTIFY\f[]
+.B \f[I]FI_MR_MMU_NOTIFY\f[R]
 FI_MR_MMU_NOTIFY is typically set by providers that support memory
 registration against memory regions that are not necessarily backed by
 allocated physical pages at the time the memory registration occurs.
@@ -289,21 +264,17 @@ now back the region.
 The notification is necessary for providers that cannot hook directly
 into the operating system page tables or memory management unit.
 See fi_mr_refresh() for notification details.
-.RS
-.RE
 .TP
-.B \f[I]FI_MR_RMA_EVENT\f[]
+.B \f[I]FI_MR_RMA_EVENT\f[R]
 This mode bit indicates that the provider must configure memory regions
 that are associated with RMA events prior to their use.
 This includes all memory regions that are associated with completion
 counters.
 When set, applications must indicate if a memory region will be
-associated with a completion counter as part of the region\[aq]s
+associated with a completion counter as part of the region\[cq]s
 creation.
 This is done by passing in the FI_RMA_EVENT flag to the memory
 registration call.
-.RS
-.RE
 .PP
 Such memory regions will be created in a disabled state and must be
 associated with all completion counters prior to being enabled.
@@ -311,27 +282,22 @@ To enable a memory region, the application must call fi_mr_enable().
 After calling fi_mr_enable(), no further resource bindings may be made
 to the memory region.
 .TP
-.B \f[I]FI_MR_ENDPOINT\f[]
+.B \f[I]FI_MR_ENDPOINT\f[R]
 This mode bit indicates that the provider associates memory regions with
 endpoints rather than domains.
 Memory regions that are registered with the provider are created in a
 disabled state and must be bound to an endpoint prior to being enabled.
 To bind the MR with an endpoint, the application must use fi_mr_bind().
 To enable the memory region, the application must call fi_mr_enable().
-.RS
-.RE
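+.PP
+A minimal sketch (error checking omitted):
+.IP
+.nf
+\f[C]
+/* associate the region with the endpoint, then enable it */
+fi_mr_bind(mr, &ep\->fid, 0);
+fi_mr_enable(mr);
+\f[R]
+.fi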
 .TP
-.B \f[I]FI_MR_HMEM\f[]
+.B \f[I]FI_MR_HMEM\f[R]
 This mode bit is associated with the FI_HMEM capability.
 If FI_MR_HMEM is set, the application must register buffers that were
 allocated using a device call and provide a valid desc parameter into
 applicable data transfer operations even if they are only used for local
-operations (e.g.
-send and receive data buffers).
+operations (e.g.\ send and receive data buffers).
 Device memory must be registered using the fi_mr_regattr call, with the
 iface and device fields filled out.
-.RS
-.RE
 .PP
 If FI_MR_HMEM is set, but FI_MR_LOCAL is unset, only device buffers must
 be registered when used locally.
@@ -340,29 +306,33 @@ must either be valid or NULL.
 Similarly, if FI_MR_LOCAL is set, but FI_MR_HMEM is not, the desc
 parameter must either be valid or NULL.
 .TP
-.B \f[I]Basic Memory Registration\f[]
-Basic memory registration is indicated by the FI_MR_BASIC mr_mode bit.
-FI_MR_BASIC is maintained for backwards compatibility (libfabric version
-1.4 or earlier).
-The behavior of basic registration is equivalent to setting the
-following mr_mode bits to one: FI_MR_VIRT_ADDR, FI_MR_ALLOCATED, and
-FI_MR_PROV_KEY.
-Additionally, providers that support basic registration usually required
-the fi_info mode bit FI_LOCAL_MR.
-As a result, it is recommended that applications migrating from
-libfabric 1.4 or earlier or wanting to support basic memory registration
-set the mr_mode to FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY |
-FI_MR_LOCAL.
-FI_MR_BASIC must be set alone.
-Other mr_mode bit pairings are invalid.
+.B \f[I]FI_MR_COLLECTIVE\f[R]
+This bit is associated with the FI_COLLECTIVE capability.
+When set, the provider requires that memory regions used in collective
+operations be explicitly registered for use with collective calls.
+This requires registering regions passed to collective calls using the
+FI_COLLECTIVE flag.
+.TP
+.B \f[I]Basic Memory Registration\f[R]
+Basic memory registration was deprecated in libfabric version 1.5, but
+is supported for backwards compatibility.
+Basic memory registration is indicated by setting mr_mode equal to
+FI_MR_BASIC.
+FI_MR_BASIC must be set alone and not paired with other mr_mode bits.
 Unlike other mr_mode bits, if FI_MR_BASIC is set on input to
 fi_getinfo(), it will not be cleared by the provider.
-That is, setting FI_MR_BASIC to one requests basic registration.
-.RS
-.RE
+That is, setting mr_mode equal to FI_MR_BASIC forces basic registration
+if the provider supports it.
 .PP
-The registrations functions \-\- fi_mr_reg, fi_mr_regv, and
-fi_mr_regattr \-\- are used to register one or more memory regions with
+The behavior of basic registration is equivalent to requiring the
+following mr_mode bits: FI_MR_VIRT_ADDR, FI_MR_ALLOCATED, and
+FI_MR_PROV_KEY.
+Additionally, providers that support basic registration usually require
+the (deprecated) fi_info mode bit FI_LOCAL_MR, which was incorporated
+into the FI_MR_LOCAL mr_mode bit.
+.PP
+The registration functions \[en] fi_mr_reg, fi_mr_regv, and
+fi_mr_regattr \[en] are used to register one or more memory regions with
 fabric resources.
 The main difference between registration functions is the number and
 type of parameters that they accept as input.
@@ -405,7 +375,7 @@ Because MR keys must be provided by a remote process, an application can
 use the requested_key parameter to indicate that a specific key value be
 returned.
 Support for user requested keys is provider specific and is determined
-by the mr_mode domain attribute.
+by the FI_MR_PROV_KEY flag value in the mr_mode domain attribute.
 .PP
 Remote RMA and atomic operations indicate the location within a
 registered memory region by specifying an address.
@@ -518,13 +488,11 @@ with endpoints (see FI_MR_ENDPOINT).
 When binding with a counter, the type of events tracked against the
 memory region is based on the bitwise OR of the following flags.
 .TP
-.B \f[I]FI_REMOTE_WRITE\f[]
+.B \f[I]FI_REMOTE_WRITE\f[R]
 Generates an event whenever a remote RMA write or atomic operation
 modifies the memory region.
 Use of this flag requires that the endpoint through which the MR is
 accessed be created with the FI_RMA_EVENT capability.
-.RS
-.RE
 .PP
 When binding the memory region to an endpoint, flags should be 0.
 .SS fi_mr_refresh
@@ -564,23 +532,23 @@ passed directly into calls as function parameters.
 .IP
 .nf
 \f[C]
-struct\ fi_mr_attr\ {
-\ \ \ \ const\ struct\ iovec\ *mr_iov;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ iov_count;
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ access;
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ offset;
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ requested_key;
-\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ *context;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ auth_key_size;
-\ \ \ \ uint8_t\ \ \ \ \ \ \ \ \ \ \ \ *auth_key;
-\ \ \ \ enum\ fi_hmem_iface\ iface;
-\ \ \ \ union\ {
-\ \ \ \ \ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ reserved;
-\ \ \ \ \ \ \ \ int\ \ \ \ \ \ \ \ \ \ \ \ \ \ cuda;
-\ \ \ \ \ \ \ \ int\ \ \ \ \ \ ze
-\ \ \ \ }\ device;
+struct fi_mr_attr {
+    const struct iovec *mr_iov;
+    size_t             iov_count;
+    uint64_t           access;
+    uint64_t           offset;
+    uint64_t           requested_key;
+    void               *context;
+    size_t             auth_key_size;
+    uint8_t            *auth_key;
+    enum fi_hmem_iface iface;
+    union {
+        uint64_t         reserved;
+        int              cuda;
+        int              ze;
+    } device;
 };
-\f[]
+\f[R]
 .fi
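+.PP
+As a hedged example, a host memory RMA target could be registered
+through this structure as follows (fields are described below; the
+requested key value is arbitrary):
+.IP
+.nf
+\f[C]
+struct iovec iov = { .iov_base = buf, .iov_len = len };
+struct fi_mr_attr attr = {
+    .mr_iov        = &iov,
+    .iov_count     = 1,
+    .access        = FI_REMOTE_WRITE,
+    .requested_key = 1,    /* ignored under FI_MR_PROV_KEY */
+    .iface         = FI_HMEM_SYSTEM,
+};
+struct fid_mr *mr;
+
+fi_mr_regattr(domain, &attr, 0, &mr);
+\f[R]
+.fi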
 .SS mr_iov
 .PP
@@ -592,55 +560,52 @@ The number of entries in the iovec is specified by iov_count.
 The number of entries in the mr_iov array.
 The maximum number of memory buffers that may be associated with a
 single memory region is specified as the mr_iov_limit domain attribute.
-See \f[C]fi_domain(3)\f[].
+See \f[C]fi_domain(3)\f[R].
 .SS access
 .PP
-Indicates the type of \f[I]operations\f[] that the local or a peer
+Indicates the type of \f[I]operations\f[R] that the local or a peer
 endpoint may perform on registered memory region.
 Supported access permissions are the bitwise OR of the following flags:
 .TP
-.B \f[I]FI_SEND\f[]
+.B \f[I]FI_SEND\f[R]
 The memory buffer may be used in outgoing message data transfers.
-This includes fi_msg and fi_tagged send operations.
-.RS
-.RE
+This includes fi_msg and fi_tagged send operations, as well as
+fi_collective operations.
 .TP
-.B \f[I]FI_RECV\f[]
+.B \f[I]FI_RECV\f[R]
 The memory buffer may be used to receive inbound message transfers.
-This includes fi_msg and fi_tagged receive operations.
-.RS
-.RE
+This includes fi_msg and fi_tagged receive operations, as well as
+fi_collective operations.
 .TP
-.B \f[I]FI_READ\f[]
+.B \f[I]FI_READ\f[R]
 The memory buffer may be used as the result buffer for RMA read and
 atomic operations on the initiator side.
 Note that from the viewpoint of the application, the memory buffer is
 being written into by the network.
-.RS
-.RE
 .TP
-.B \f[I]FI_WRITE\f[]
+.B \f[I]FI_WRITE\f[R]
 The memory buffer may be used as the source buffer for RMA write and
 atomic operations on the initiator side.
 Note that from the viewpoint of the application, the endpoint is reading
 from the memory buffer and copying the data onto the network.
-.RS
-.RE
 .TP
-.B \f[I]FI_REMOTE_READ\f[]
+.B \f[I]FI_REMOTE_READ\f[R]
 The memory buffer may be used as the source buffer of an RMA read
 operation on the target side.
 The contents of the memory buffer are not modified by such operations.
-.RS
-.RE
 .TP
-.B \f[I]FI_REMOTE_WRITE\f[]
+.B \f[I]FI_REMOTE_WRITE\f[R]
 The memory buffer may be used as the target buffer of an RMA write or
 atomic operation.
 The contents of the memory buffer may be modified as a result of such
 operations.
-.RS
-.RE
+.TP
+.B \f[I]FI_COLLECTIVE\f[R]
+This flag provides an explicit indication that the memory buffer may be
+used with collective operations.
+Use of this flag is required if the FI_MR_COLLECTIVE mr_mode bit has
+been set on the domain.
+This flag should be paired with FI_SEND and/or FI_RECV.
 .PP
 Note that some providers may not enforce fine grained access
 permissions.
@@ -661,7 +626,7 @@ key be used by the provider.
 This allows applications to use well known key values, which can avoid
 applications needing to exchange and store keys.
 Support for user requested keys is provider specific and is determined
-by the mr_mode domain attribute.
+by the FI_MR_PROV_KEY flag in the mr_mode domain attribute field.
 .SS context
 .PP
 Application context associated with asynchronous memory registration
@@ -692,48 +657,36 @@ and manage the memory region.
 This field is ignored unless the application has requested the FI_HMEM
 capability.
 .TP
-.B \f[I]FI_HMEM_SYSTEM\f[]
+.B \f[I]FI_HMEM_SYSTEM\f[R]
 Uses standard operating system calls and libraries, such as malloc,
 calloc, realloc, mmap, and free.
-.RS
-.RE
 .TP
-.B \f[I]FI_HMEM_CUDA\f[]
+.B \f[I]FI_HMEM_CUDA\f[R]
 Uses Nvidia CUDA interfaces such as cuMemAlloc, cuMemAllocHost,
 cuMemAllocManaged, cuMemFree, cudaMalloc, cudaFree.
-.RS
-.RE
 .TP
-.B \f[I]FI_HMEM_ROCR\f[]
+.B \f[I]FI_HMEM_ROCR\f[R]
 Uses AMD ROCR interfaces such as hsa_memory_allocate and
 hsa_memory_free.
-.RS
-.RE
 .TP
-.B \f[I]FI_HMEM_ZE\f[]
+.B \f[I]FI_HMEM_ZE\f[R]
 Uses Intel L0 ZE interfaces such as zeDriverAllocSharedMem,
 zeDriverFreeMem.
-.RS
-.RE
 .SS device
 .PP
 Reserved 64 bits for device identifier if using non\-standard HMEM
 interface.
 This field is ignored unless the iface field is valid.
 .TP
-.B \f[I]cuda\f[]
+.B \f[I]cuda\f[R]
 For FI_HMEM_CUDA, this is equivalent to CUdevice (int).
-.RS
-.RE
 .TP
-.B \f[I]ze\f[]
+.B \f[I]ze\f[R]
 For FI_HMEM_ZE, this is equivalent to the ze_device_handle_t index
 (int).
-.RS
-.RE
 .SH NOTES
 .PP
-Direct access to an application\[aq]s memory by a remote peer requires
+Direct access to an application\[cq]s memory by a remote peer requires
 that the application register the targeted memory buffer(s).
 This is typically done by calling one of the fi_mr_reg* routines.
 For FI_MR_PROV_KEY, the provider will return a key that must be used by
@@ -756,54 +709,108 @@ regions, even if FI_MR_RAW has not been set.
 It is recommended that portable applications use those
 interfaces; however, their use does carry extra message and memory
 footprint overhead, making it less desirable for highly scalable apps.
+.PP
+There may be cases where device peer to peer support should not be used
+or cannot be used, such as when the PCIe ACS configuration does not
+permit the transfer.
+The FI_HMEM_DISABLE_P2P environment variable can be set to notify
+Libfabric that peer to peer transactions should not be used.
+The provider may choose to perform a copy instead, or will fail to
+support FI_HMEM if it is unable to do so.
 .SH FLAGS
 .PP
 The following flags may be specified with any memory registration call.
 .TP
-.B \f[I]FI_RMA_EVENT\f[]
+.B \f[I]FI_RMA_EVENT\f[R]
 This flag indicates that the specified memory region will be associated
 with a completion counter used to count RMA operations that access the
 MR.
-.RS
-.RE
 .TP
-.B \f[I]FI_RMA_PMEM\f[]
+.B \f[I]FI_RMA_PMEM\f[R]
 This flag indicates that the underlying memory region is backed by
 persistent memory and will be used in RMA operations.
 It must be specified if persistent completion semantics or persistent
 data transfers are required when accessing the registered region.
-.RS
-.RE
+.TP
+.B \f[I]FI_HMEM_DEVICE_ONLY\f[R]
+This flag indicates that the memory is only accessible by a device.
+The specific device is given by the fi_mr_attr fields iface and device.
+This refers to memory regions that were allocated using a device API
+AllocDevice call (as opposed to using the host allocation or
+unified/shared memory allocation).
+.TP
+.B \f[I]FI_HMEM_HOST_ALLOC\f[R]
+This flag indicates that the memory is owned by the host only.
+Whether it can be accessed by the device is implementation dependent.
+The fi_mr_attr field iface is still used to identify the device API, but
+the field device is ignored.
+This refers to memory regions that were allocated using a device API
+AllocHost call (as opposed to using malloc\-like host allocation,
+unified/shared memory allocation, or AllocDevice).
+.SH MEMORY DOMAINS
+.PP
+Memory domains identify the physical separation of memory which may or
+may not be accessible through the same virtual address space.
+Traditionally, applications only dealt with a single memory domain, that
+of host memory tightly coupled with the system CPUs.
+With the introduction of device and non\-uniform memory subsystems,
+applications often need to be aware of which memory domain a particular
+virtual address maps to.
+.PP
+As a general rule, separate physical devices can be considered to have
+their own memory domains.
+For example, a NIC may have user accessible memory, and would be
+considered a separate memory domain from memory on a GPU.
+Both the NIC and GPU memory domains are separate from host system
+memory.
+Individual GPUs or computation accelerators may have distinct memory
+domains, or may be connected in such a way (e.g.\ a GPU specific fabric)
+that all GPUs would belong to the same memory domain.
+Unfortunately, identifying memory domains is specific to each system and
+its physical and/or virtual configuration.
+.PP
+Understanding memory domains in heterogeneous memory environments is
+important as it can impact data ordering and visibility as viewed by an
+application.
+It is also important to understand which memory domain an application is
+most tightly coupled to.
+In most cases, applications are tightly coupled to host memory.
+However, an application running directly on a GPU or NIC may be more
+tightly coupled to memory associated with those devices.
+.PP
+Memory regions are often associated with a single memory domain.
+The domain is often indicated by the fi_mr_attr iface and device fields.
+It is possible, however, for physical pages backing a virtual memory
+region to migrate between memory domains based on access patterns.
+For example, the physical pages referenced by a virtual address range
+could migrate between host memory and GPU memory, depending on which
+computational unit is actively using it.
+.PP
+See the \f[C]fi_endpoint\f[R](3) and \f[C]fi_cq\f[R](3) man pages for
+additional discussion on message, data, and completion ordering
+including the impact of memory domains.
 .SH RETURN VALUES
 .PP
 Returns 0 on success.
 On error, a negative value corresponding to fabric errno is returned.
 .PP
-Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[].
+Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R].
 .SH ERRORS
 .TP
-.B \f[I]\-FI_ENOKEY\f[]
+.B \f[I]\-FI_ENOKEY\f[R]
 The requested_key is already in use.
-.RS
-.RE
 .TP
-.B \f[I]\-FI_EKEYREJECTED\f[]
+.B \f[I]\-FI_EKEYREJECTED\f[R]
 The requested_key is not available.
 The key may be out of the range supported by the provider, or the
 provider may not support user\-requested memory registration keys.
-.RS
-.RE
 .TP
-.B \f[I]\-FI_ENOSYS\f[]
+.B \f[I]\-FI_ENOSYS\f[R]
 Returned by fi_mr_bind if the provider does not support reporting events
 based on access to registered memory regions.
-.RS
-.RE
 .TP
-.B \f[I]\-FI_EBADFLAGS\f[]
+.B \f[I]\-FI_EBADFLAGS\f[R]
 Returned if the specified flags are not supported by the provider.
-.RS
-.RE
 .SH MEMORY REGISTRATION CACHE
 .PP
 Many hardware NICs accessed by libfabric require that data buffers be
@@ -825,11 +832,11 @@ or device memory cannot be cached.
 .PP
 As a general rule, if hardware requires the FI_MR_LOCAL mode bit
 described above, but this is not supported by the application, a memory
-registration cache \f[I]may\f[] be in use.
+registration cache \f[I]may\f[R] be in use.
 The following environment variables may be used to configure
 registration caches.
 .TP
-.B \f[I]FI_MR_CACHE_MAX_SIZE\f[]
+.B \f[I]FI_MR_CACHE_MAX_SIZE\f[R]
 This defines the total number of bytes for all memory regions that may
 be tracked by the cache.
 If not set, the cache has no limit on how many bytes may be registered
@@ -837,10 +844,8 @@ and cached.
 Setting this will reduce the amount of memory that is not actively being
 used as part of a data transfer that is registered with a provider.
 By default, the cache size is unlimited.
-.RS
-.RE
 .TP
-.B \f[I]FI_MR_CACHE_MAX_COUNT\f[]
+.B \f[I]FI_MR_CACHE_MAX_COUNT\f[R]
 This defines the total number of memory regions that may be registered
 with the cache.
 If not set, a default limit is chosen.
@@ -848,10 +853,8 @@ Setting this will reduce the number of regions that are registered,
 regardless of their size, which are not actively being used as part of a
 data transfer.
 Setting this to zero will disable registration caching.
-.RS
-.RE
 .TP
-.B \f[I]FI_MR_CACHE_MONITOR\f[]
+.B \f[I]FI_MR_CACHE_MONITOR\f[R]
 The cache monitor is responsible for detecting system memory
 (FI_HMEM_SYSTEM) changes made between the virtual addresses used by an
 application and the underlying physical pages.
@@ -864,31 +867,40 @@ deallocation calls which may result in the mappings changing, such as
 malloc, mmap, free, etc.
 Note that memhooks operates at the elf linker layer, and does not use
 glibc memory hooks.
-.RS
-.RE
 .TP
-.B \f[I]FI_MR_CUDA_CACHE_MONITOR_ENABLED\f[]
+.B \f[I]FI_MR_CUDA_CACHE_MONITOR_ENABLED\f[R]
 The CUDA cache monitor is responsible for detecting CUDA device memory
 (FI_HMEM_CUDA) changes made between the device virtual addresses used by
 an application and the underlying device physical pages.
 Valid monitor options are: 0 or 1.
 Note that the CUDA memory monitor requires a CUDA toolkit version with
 unified virtual addressing enabled.
-.RS
-.RE
 .TP
-.B \f[I]FI_MR_ROCR_CACHE_MONITOR_ENABLED\f[]
+.B \f[I]FI_MR_ROCR_CACHE_MONITOR_ENABLED\f[R]
 The ROCR cache monitor is responsible for detecting ROCR device memory
 (FI_HMEM_ROCR) changes made between the device virtual addresses used by
 an application and the underlying device physical pages.
 Valid monitor options are: 0 or 1.
 Note that the ROCR memory monitor requires a ROCR version with unified
 virtual addressing enabled.
-.RS
-.RE
+.TP
+.B \f[I]FI_MR_ZE_CACHE_MONITOR_ENABLED\f[R]
+The ZE cache monitor is responsible for detecting ZE device memory
+(FI_HMEM_ZE) changes made between the device virtual addresses used by
+an application and the underlying device physical pages.
+Valid monitor options are: 0 or 1.
+.PP
+More direct access to the internal registration cache is possible
+through the fi_open() call, using the \[lq]mr_cache\[rq] service name.
+Once opened, custom memory monitors may be installed.
+A memory monitor is a component of the cache responsible for detecting
+changes in virtual to physical address mappings.
+Some level of control over the cache is possible through the above
+mentioned environment variables.
 .SH SEE ALSO
 .PP
-\f[C]fi_getinfo\f[](3), \f[C]fi_endpoint\f[](3), \f[C]fi_domain\f[](3),
-\f[C]fi_rma\f[](3), \f[C]fi_msg\f[](3), \f[C]fi_atomic\f[](3)
+\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3),
+\f[C]fi_domain\f[R](3), \f[C]fi_rma\f[R](3), \f[C]fi_msg\f[R](3),
+\f[C]fi_atomic\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_msg.3 b/deps/libfabric/man/man3/fi_msg.3
index 4f9394661d447a1c6f474501042be6756d7a7d96..68159a820404e09a63984a821b0042369c5dfaf2 100644
--- a/deps/libfabric/man/man3/fi_msg.3
+++ b/deps/libfabric/man/man3/fi_msg.3
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_msg" "3" "2019\-09\-27" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_msg" "3" "2021\-03\-23" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -8,122 +8,93 @@ fi_msg \- Message data transfer operations
 .TP
 .B fi_recv / fi_recvv / fi_recvmsg
 Post a buffer to receive an incoming message
-.RS
-.RE
-.PP
-fi_send / fi_sendv / fi_sendmsg
-.PD 0
-.P
-.PD
-fi_inject / fi_senddata : Initiate an operation to send a message
+.PP
+fi_send / fi_sendv / fi_sendmsg / fi_inject / fi_senddata : Initiate an
+operation to send a message
 .SH SYNOPSIS
 .IP
 .nf
 \f[C]
-#include\ <rdma/fi_endpoint.h>
+#include <rdma/fi_endpoint.h>
 
-ssize_t\ fi_recv(struct\ fid_ep\ *ep,\ void\ *\ buf,\ size_t\ len,
-\ \ \ \ void\ *desc,\ fi_addr_t\ src_addr,\ void\ *context);
+ssize_t fi_recv(struct fid_ep *ep, void * buf, size_t len,
+    void *desc, fi_addr_t src_addr, void *context);
 
-ssize_t\ fi_recvv(struct\ fid_ep\ *ep,\ const\ struct\ iovec\ *iov,\ void\ **desc,
-\ \ \ \ size_t\ count,\ fi_addr_t\ src_addr,\ void\ *context);
+ssize_t fi_recvv(struct fid_ep *ep, const struct iovec *iov, void **desc,
+    size_t count, fi_addr_t src_addr, void *context);
 
-ssize_t\ fi_recvmsg(struct\ fid_ep\ *ep,\ const\ struct\ fi_msg\ *msg,
-\ \ \ \ uint64_t\ flags);
+ssize_t fi_recvmsg(struct fid_ep *ep, const struct fi_msg *msg,
+    uint64_t flags);
 
-ssize_t\ fi_send(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ len,
-\ \ \ \ void\ *desc,\ fi_addr_t\ dest_addr,\ void\ *context);
+ssize_t fi_send(struct fid_ep *ep, const void *buf, size_t len,
+    void *desc, fi_addr_t dest_addr, void *context);
 
-ssize_t\ fi_sendv(struct\ fid_ep\ *ep,\ const\ struct\ iovec\ *iov,
-\ \ \ \ void\ **desc,\ size_t\ count,\ fi_addr_t\ dest_addr,\ void\ *context);
+ssize_t fi_sendv(struct fid_ep *ep, const struct iovec *iov,
+    void **desc, size_t count, fi_addr_t dest_addr, void *context);
 
-ssize_t\ fi_sendmsg(struct\ fid_ep\ *ep,\ const\ struct\ fi_msg\ *msg,
-\ \ \ \ uint64_t\ flags);
+ssize_t fi_sendmsg(struct fid_ep *ep, const struct fi_msg *msg,
+    uint64_t flags);
 
-ssize_t\ fi_inject(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ len,
-\ \ \ \ fi_addr_t\ dest_addr);
+ssize_t fi_inject(struct fid_ep *ep, const void *buf, size_t len,
+    fi_addr_t dest_addr);
 
-ssize_t\ fi_senddata(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ len,
-\ \ \ \ void\ *desc,\ uint64_t\ data,\ fi_addr_t\ dest_addr,\ void\ *context);
+ssize_t fi_senddata(struct fid_ep *ep, const void *buf, size_t len,
+    void *desc, uint64_t data, fi_addr_t dest_addr, void *context);
 
-ssize_t\ fi_injectdata(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ len,
-\ \ \ \ uint64_t\ data,\ fi_addr_t\ dest_addr);
-\f[]
+ssize_t fi_injectdata(struct fid_ep *ep, const void *buf, size_t len,
+    uint64_t data, fi_addr_t dest_addr);
+\f[R]
 .fi
 .SH ARGUMENTS
 .TP
-.B \f[I]ep\f[]
+.B \f[I]ep\f[R]
 Fabric endpoint on which to initiate send or post receive buffer.
-.RS
-.RE
 .TP
-.B \f[I]buf\f[]
+.B \f[I]buf\f[R]
 Data buffer to send or receive.
-.RS
-.RE
 .TP
-.B \f[I]len\f[]
+.B \f[I]len\f[R]
 Length of data buffer to send or receive, specified in bytes.
-Valid transfers are from 0 bytes up to the endpoint\[aq]s max_msg_size.
-.RS
-.RE
+Valid transfers are from 0 bytes up to the endpoint\[cq]s max_msg_size.
 .TP
-.B \f[I]iov\f[]
+.B \f[I]iov\f[R]
 Vectored data buffer.
-.RS
-.RE
 .TP
-.B \f[I]count\f[]
+.B \f[I]count\f[R]
 Count of vectored data entries.
-.RS
-.RE
 .TP
-.B \f[I]desc\f[]
+.B \f[I]desc\f[R]
 Descriptor associated with the data buffer.
-See \f[C]fi_mr\f[](3).
-.RS
-.RE
+See \f[C]fi_mr\f[R](3).
 .TP
-.B \f[I]data\f[]
+.B \f[I]data\f[R]
 Remote CQ data to transfer with the sent message.
-.RS
-.RE
 .TP
-.B \f[I]dest_addr\f[]
+.B \f[I]dest_addr\f[R]
 Destination address for connectionless transfers.
 Ignored for connected endpoints.
-.RS
-.RE
 .TP
-.B \f[I]src_addr\f[]
+.B \f[I]src_addr\f[R]
 Source address to receive from for connectionless transfers.
 Applies only to connectionless endpoints with the FI_DIRECTED_RECV
 capability enabled, otherwise this field is ignored.
 If set to FI_ADDR_UNSPEC, any source address may match.
-.RS
-.RE
 .TP
-.B \f[I]msg\f[]
+.B \f[I]msg\f[R]
 Message descriptor for send and receive operations.
-.RS
-.RE
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 Additional flags to apply for the send or receive operation.
-.RS
-.RE
 .TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
 User specified pointer to associate with the operation.
 This parameter is ignored if the operation will not generate a
 successful completion, unless an op flag specifies the context parameter
 be used for required input.
-.RS
-.RE
 .SH DESCRIPTION
 .PP
-The send functions \-\- fi_send, fi_sendv, fi_sendmsg, fi_inject, and
-fi_senddata \-\- are used to transmit a message from one endpoint to
+The send functions \[en] fi_send, fi_sendv, fi_sendmsg, fi_inject, and
+fi_senddata \[en] are used to transmit a message from one endpoint to
 another endpoint.
 The main difference between send functions is the number and type of
 parameters that they accept as input.
@@ -131,7 +102,7 @@ Otherwise, they perform the same general function.
 Messages sent using fi_msg operations are received by a remote endpoint
 into a buffer posted to receive such messages.
 .PP
-The receive functions \-\- fi_recv, fi_recvv, fi_recvmsg \-\- post a
+The receive functions \[en] fi_recv, fi_recvv, fi_recvmsg \[en] post a
 data buffer to an endpoint to receive inbound messages.
 Similar to the send operations, receive operations operate
 asynchronously.
@@ -155,11 +126,6 @@ See fi_cq for completion event details.
 The call fi_send transfers the data contained in the user\-specified
 data buffer to a remote endpoint, with message boundaries being
 maintained.
-For connection based endpoints (FI_EP_MSG) the local endpoint must be
-connected to a remote endpoint or destination before fi_send is called.
-Unless the endpoint has been configured differently, the data buffer
-passed into fi_send must not be touched by the application until the
-fi_send call completes asynchronously.
 .SS fi_sendv
 .PP
 The fi_sendv call adds support for a scatter\-gather list to fi_send.
@@ -168,21 +134,21 @@ parameter to a remote endpoint as a single message.
 .SS fi_sendmsg
 .PP
 The fi_sendmsg call supports data transfers over both connected and
-unconnected endpoints, with the ability to control the send operation
+connectionless endpoints, with the ability to control the send operation
 per call through the use of flags.
-The fi_sendmsg function takes a \f[C]struct\ fi_msg\f[] as input.
+The fi_sendmsg function takes a \f[C]struct fi_msg\f[R] as input.
 .IP
 .nf
 \f[C]
-struct\ fi_msg\ {
-\ \ \ \ const\ struct\ iovec\ *msg_iov;\ /*\ scatter\-gather\ array\ */
-\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ **desc;\ \ \ /*\ local\ request\ descriptors\ */
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ iov_count;/*\ #\ elements\ in\ iov\ */
-\ \ \ \ fi_addr_t\ \ \ \ \ \ \ \ \ \ addr;\ \ \ \ \ /*\ optional\ endpoint\ address\ */
-\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ *context;\ /*\ user\-defined\ context\ */
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ data;\ \ \ \ \ /*\ optional\ message\ data\ */
+struct fi_msg {
+    const struct iovec *msg_iov; /* scatter\-gather array */
+    void               **desc;   /* local request descriptors */
+    size_t             iov_count;/* # elements in iov */
+    fi_addr_t          addr;     /* optional endpoint address */
+    void               *context; /* user\-defined context */
+    uint64_t           data;     /* optional message data */
 };
-\f[]
+\f[R]
 .fi
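+.PP
+A minimal sketch of filling the structure for a single buffer send
+(op_ctx is an application variable):
+.IP
+.nf
+\f[C]
+struct iovec iov = { .iov_base = buf, .iov_len = len };
+void *desc = NULL;    /* or fi_mr_desc(mr) under FI_MR_LOCAL */
+struct fi_msg msg = {
+    .msg_iov   = &iov,
+    .desc      = &desc,
+    .iov_count = 1,
+    .addr      = dest_addr,
+    .context   = &op_ctx,
+};
+
+/* request a completion for this operation only */
+fi_sendmsg(ep, &msg, FI_COMPLETION);
+\f[R]
+.fi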
 .SS fi_inject
 .PP
@@ -217,7 +183,7 @@ order to match sends.
 Message boundaries are maintained.
 The order in which the receives complete is dependent on the endpoint
 type and protocol.
-For unconnected endpoints, the src_addr parameter can be used to
+For connectionless endpoints, the src_addr parameter can be used to
 indicate that a buffer should be posted to receive incoming data from a
 specific remote endpoint.
 .SS fi_recvv
@@ -228,8 +194,8 @@ parameter to a receive incoming data.
 .SS fi_recvmsg
 .PP
 The fi_recvmsg call supports posting buffers over both connected and
-unconnected endpoints, with the ability to control the receive operation
-per call through the use of flags.
+connectionless endpoints, with the ability to control the receive
+operation per call through the use of flags.
 The fi_recvmsg function takes a struct fi_msg as input.
 .SH FLAGS
 .PP
@@ -241,49 +207,39 @@ fi_endpoint.3).
 The following list of flags are usable with fi_recvmsg and/or
 fi_sendmsg.
 .TP
-.B \f[I]FI_REMOTE_CQ_DATA\f[]
+.B \f[I]FI_REMOTE_CQ_DATA\f[R]
 Applies to fi_sendmsg and fi_senddata.
 Indicates that remote CQ data is available and should be sent as part of
 the request.
 See fi_getinfo for additional details on FI_REMOTE_CQ_DATA.
-.RS
-.RE
 .TP
-.B \f[I]FI_CLAIM\f[]
+.B \f[I]FI_CLAIM\f[R]
 Applies to posted receive operations for endpoints configured for
 FI_BUFFERED_RECV or FI_VARIABLE_MSG.
 This flag is used to retrieve a message that was buffered by the
 provider.
 See the Buffered Receives section for details.
-.RS
-.RE
 .TP
-.B \f[I]FI_COMPLETION\f[]
+.B \f[I]FI_COMPLETION\f[R]
 Indicates that a completion entry should be generated for the specified
 operation.
 The endpoint must be bound to a completion queue with
 FI_SELECTIVE_COMPLETION that corresponds to the specified operation, or
 this flag is ignored.
-.RS
-.RE
 .TP
-.B \f[I]FI_DISCARD\f[]
+.B \f[I]FI_DISCARD\f[R]
 Applies to posted receive operations for endpoints configured for
 FI_BUFFERED_RECV or FI_VARIABLE_MSG.
 This flag is used to free a message that was buffered by the provider.
 See the Buffered Receives section for details.
-.RS
-.RE
 .TP
-.B \f[I]FI_MORE\f[]
+.B \f[I]FI_MORE\f[R]
 Indicates that the user has additional requests that will immediately be
 posted after the current call returns.
 Use of this flag may improve performance by enabling the provider to
 optimize its access to the fabric hardware.
-.RS
-.RE
 .TP
-.B \f[I]FI_INJECT\f[]
+.B \f[I]FI_INJECT\f[R]
 Applies to fi_sendmsg.
 Indicates that the outbound data buffer should be returned to user
 immediately after the send call returns, even if the operation is
@@ -291,10 +247,8 @@ handled asynchronously.
 This may require that the underlying provider implementation copy the
 data into a local buffer and transfer out of that buffer.
 This flag can only be used with messages smaller than inject_size.
-.RS
-.RE
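+.PP
+For example (the message must fit within the inject_size attribute):
+.IP
+.nf
+\f[C]
+char ping[] = "ping";
+
+/* the buffer is reusable immediately after the call returns */
+fi_inject(ep, ping, sizeof(ping), dest_addr);
+\f[R]
+.fi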
 .TP
-.B \f[I]FI_MULTI_RECV\f[]
+.B \f[I]FI_MULTI_RECV\f[R]
 Applies to posted receive operations.
 This flag allows the user to post a single buffer that will receive
 multiple incoming messages.
@@ -304,40 +258,36 @@ Use of this flag may cause a single posted receive operation to generate
 multiple events as messages are placed into the buffer.
 The placement of received data into the buffer may be subjected to
 provider specific alignment restrictions.
-.RS
-.RE
 .PP
 The buffer will be released by the provider when the available buffer
 space falls below the specified minimum (see FI_OPT_MIN_MULTI_RECV).
 Note that an entry to the associated receive completion queue will
 always be generated when the buffer has been consumed, even if other
-receive completions have been suppressed (i.e.
-the Rx context has been configured for FI_SELECTIVE_COMPLETION).
-See the FI_MULTI_RECV completion flag \f[C]fi_cq\f[](3).
+receive completions have been suppressed (i.e.\ the Rx context has been
+configured for FI_SELECTIVE_COMPLETION).
+See the FI_MULTI_RECV completion flag \f[C]fi_cq\f[R](3).
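+.PP
+A hedged sketch of posting such a buffer (big_buf, desc, and recv_ctx
+are application variables):
+.IP
+.nf
+\f[C]
+struct iovec iov = { .iov_base = big_buf, .iov_len = big_len };
+struct fi_msg msg = {
+    .msg_iov   = &iov,
+    .desc      = &desc,
+    .iov_count = 1,
+    .addr      = FI_ADDR_UNSPEC,
+    .context   = &recv_ctx,
+};
+
+/* one large buffer absorbs multiple incoming messages */
+fi_recvmsg(ep, &msg, FI_MULTI_RECV);
+\f[R]
+.fi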
 .TP
-.B \f[I]FI_INJECT_COMPLETE\f[]
+.B \f[I]FI_INJECT_COMPLETE\f[R]
 Applies to fi_sendmsg.
 Indicates that a completion should be generated when the source
 buffer(s) may be reused.
-.RS
-.RE
 .TP
-.B \f[I]FI_TRANSMIT_COMPLETE\f[]
-Applies to fi_sendmsg.
-Indicates that a completion should not be generated until the operation
-has been successfully transmitted and is no longer being tracked by the
-provider.
-.RS
-.RE
+.B \f[I]FI_TRANSMIT_COMPLETE\f[R]
+Applies to fi_sendmsg and fi_recvmsg.
+For sends, indicates that a completion should not be generated until the
+operation has been successfully transmitted and is no longer being
+tracked by the provider.
+For receive operations, indicates that a completion may be generated as
+soon as the message has been processed by the local provider, even if
+the message data may not be visible to all processing elements.
+See \f[C]fi_cq\f[R](3) for target side completion semantics.
 .TP
-.B \f[I]FI_DELIVERY_COMPLETE\f[]
+.B \f[I]FI_DELIVERY_COMPLETE\f[R]
 Applies to fi_sendmsg.
 Indicates that a completion should be generated when the operation has
 been processed by the destination.
-.RS
-.RE
 .TP
-.B \f[I]FI_FENCE\f[]
+.B \f[I]FI_FENCE\f[R]
 Applies to transmits.
 Indicates that the requested operation, also known as the fenced
 operation, and any operation posted after the fenced operation will be
@@ -345,21 +295,17 @@ deferred until all previous operations targeting the same peer endpoint
 have completed.
 Operations posted after the fencing will see and/or replace the results
 of any operations initiated prior to the fenced operation.
-.RS
-.RE
 .PP
 The ordering of operations starting at the posting of the fenced
 operation (inclusive) to the posting of a subsequent fenced operation
-(exclusive) is controlled by the endpoint\[aq]s ordering semantics.
+(exclusive) is controlled by the endpoint\[cq]s ordering semantics.
 .TP
-.B \f[I]FI_MULTICAST\f[]
+.B \f[I]FI_MULTICAST\f[R]
 Applies to transmits.
 This flag indicates that the address specified as the data transfer
 destination is a multicast address.
 This flag must be used in all multicast transfers, in conjunction with a
 multicast fi_addr_t.
-.RS
-.RE
 .SH Buffered Receives
 .PP
 Buffered receives indicate that the networking layer allocates and
@@ -387,28 +333,28 @@ buffers, the CQ entry op_context will point to a struct fi_recv_context.
 .IP
 .nf
 \f[C]
-struct\ fi_recv_context\ {
-\ \ \ \ struct\ fid_ep\ *ep;
-\ \ \ \ void\ *context;
+struct fi_recv_context {
+    struct fid_ep *ep;
+    void *context;
 };
-\f[]
+\f[R]
 .fi
 .PP
-The \[aq]ep\[aq] field will point to the receiving endpoint or Rx
-context, and \[aq]context\[aq] will be NULL.
-The CQ entry\[aq]s \[aq]buf\[aq] will point to a provider managed buffer
-where the start of the received message is located, and \[aq]len\[aq]
-will be set to the total size of the message.
+The `ep' field will point to the receiving endpoint or Rx context, and
+`context' will be NULL.
+The CQ entry\[cq]s `buf' will point to a provider managed buffer where
+the start of the received message is located, and `len' will be set to
+the total size of the message.
 .PP
 The maximum sized message that a provider can buffer is limited by an
 FI_OPT_BUFFERED_LIMIT.
 This threshold can be obtained and may be adjusted by the application
 using the fi_getopt and fi_setopt calls, respectively.
 Any adjustments must be made prior to enabling the endpoint.
-The CQ entry \[aq]buf\[aq] will point to a buffer of received data.
+The CQ entry `buf' will point to a buffer of received data.
 If the sent message is larger than the buffered amount, the CQ entry
-\[aq]flags\[aq] will have the FI_MORE bit set.
-When the FI_MORE bit is set, \[aq]buf\[aq] will reference at least
+`flags' will have the FI_MORE bit set.
+When the FI_MORE bit is set, `buf' will reference at least
 FI_OPT_BUFFERED_MIN bytes of data (see fi_endpoint.3 for more info).
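+.PP
+The limit adjustment mentioned above might look as follows (the
+threshold value is arbitrary):
+.IP
+.nf
+\f[C]
+size_t limit = 16384;
+
+/* must be adjusted before the endpoint is enabled */
+fi_setopt(&ep\->fid, FI_OPT_ENDPOINT, FI_OPT_BUFFERED_LIMIT,
+    &limit, sizeof(limit));
+\f[R]
+.fi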
 .PP
 After being notified that a buffered receive has arrived, applications
@@ -421,21 +367,21 @@ regardless of message size.
 To claim a message, an application must post a receive operation with
 the FI_CLAIM flag set.
 The struct fi_recv_context returned as part of the notification must be
-provided as the receive operation\[aq]s context.
-The struct fi_recv_context contains a \[aq]context\[aq] field.
+provided as the receive operation\[cq]s context.
+The struct fi_recv_context contains a `context' field.
 Applications may modify this field prior to claiming the message.
 When the claim operation completes, a standard receive completion entry
 will be generated on the completion queue.
-The \[aq]context\[aq] of the associated CQ entry will be set to the
-\[aq]context\[aq] value passed in through the fi_recv_context structure,
-and the CQ entry flags will have the FI_CLAIM bit set.
+The `context' of the associated CQ entry will be set to the `context'
+value passed in through the fi_recv_context structure, and the CQ entry
+flags will have the FI_CLAIM bit set.
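+.PP
+A hedged sketch of the claim sequence (comp is the CQ entry from the
+buffered receive notification; app_buf and app_ctx are application
+variables):
+.IP
+.nf
+\f[C]
+/* comp.op_context points at the provider supplied fi_recv_context */
+struct fi_recv_context *rctx = comp.op_context;
+struct iovec iov = { .iov_base = app_buf, .iov_len = app_len };
+struct fi_msg msg = {
+    .msg_iov   = &iov,
+    .iov_count = 1,
+    .addr      = FI_ADDR_UNSPEC,
+    .context   = rctx,    /* must be the fi_recv_context */
+};
+
+rctx\->context = &app_ctx;    /* echoed in the claim completion */
+fi_recvmsg(ep, &msg, FI_CLAIM);
+\f[R]
+.fi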
 .PP
 Buffered receives that are not claimed must be discarded by the
 application when it is done processing the CQ entry data.
 To discard a message, an application must post a receive operation with
 the FI_DISCARD flag set.
 The struct fi_recv_context returned as part of the notification must be
-provided as the receive operation\[aq]s context.
+provided as the receive operation\[cq]s context.
 When the FI_DISCARD flag is set for a receive operation, the receive
 input buffer(s) and length parameters are ignored.
 .PP
@@ -444,8 +390,8 @@ manner.
 Failure to do so may result in increased memory usage for network
 buffering or communication stalls.
 Once a buffered receive has been claimed or discarded, the original CQ
-entry \[aq]buf\[aq] or struct fi_recv_context data may no longer be
-accessed by the application.
+entry `buf' or struct fi_recv_context data may no longer be accessed by
+the application.
 .PP
 The use of the FI_CLAIM and FI_DISCARD operation flags is also described
 with respect to tagged message transfers in fi_tagged.3.
@@ -467,10 +413,8 @@ It is most commonly used when the size of message transfers varies
 greatly, with very large messages interspersed with much smaller
 messages, making receive side message buffering difficult to manage.
 Variable messages are not subject to max message length restrictions
-(i.e.
-struct fi_ep_attr::max_msg_size limits), and may be up to the maximum
-value of size_t (e.g.
-SIZE_MAX) in length.
+(i.e.\ struct fi_ep_attr::max_msg_size limits), and may be up to the
+maximum value of size_t (e.g.\ SIZE_MAX) in length.
 .PP
 Variable length messages support requests that the provider allocate and
 manage the network message buffers.
@@ -497,19 +441,17 @@ buffer length.
 .PP
 Returns 0 on success.
 On error, a negative value corresponding to fabric errno is returned.
-Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[].
+Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R].
 .PP
 See the discussion below for details handling FI_EAGAIN.
 .SH ERRORS
 .TP
-.B \f[I]\-FI_EAGAIN\f[]
+.B \f[I]\-FI_EAGAIN\f[R]
 Indicates that the underlying provider currently lacks the resources
 needed to initiate the requested operation.
 The reasons for a provider returning FI_EAGAIN are varied.
 However, common reasons include insufficient internal buffering or full
 processing queues.
-.RS
-.RE
 .PP
 Insufficient internal buffering is often associated with operations that
 use FI_INJECT.
@@ -532,7 +474,7 @@ employed, as acknowledgements or flow control messages may need to be
 processed in order to resume execution.
 .SH SEE ALSO
 .PP
-\f[C]fi_getinfo\f[](3), \f[C]fi_endpoint\f[](3), \f[C]fi_domain\f[](3),
-\f[C]fi_cq\f[](3)
+\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3),
+\f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_nic.3 b/deps/libfabric/man/man3/fi_nic.3
index 8e4bfc2a9a36427c491d569c3ccb96eb519a60ba..897c14bf6b00b858dbb690e99a0fe2b37d0691b9 100644
--- a/deps/libfabric/man/man3/fi_nic.3
+++ b/deps/libfabric/man/man3/fi_nic.3
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_nic" "3" "2020\-04\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_nic" "3" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -9,7 +9,7 @@ fi_nic \- Fabric network interface card attributes
 .PP
 The fid_nic structure defines attributes for a struct fi_info that is
 directly associated with underlying networking hardware and may be
-returned directly from calling \f[C]fi_getinfo\f[](3).
+returned directly from calling \f[C]fi_getinfo\f[R](3).
 The format of fid_nic and the related substructures are defined below.
 .PP
 Note that not all fields of all structures may be available.
@@ -18,156 +18,124 @@ will be set to NULL or 0.
 .IP
 .nf
 \f[C]
-struct\ fid_nic\ {
-\ \ \ \ struct\ fid\ \ \ \ \ \ \ \ \ \ \ \ \ fid;
-\ \ \ \ struct\ fi_device_attr\ *device_attr;
-\ \ \ \ struct\ fi_bus_attr\ \ \ \ *bus_attr;
-\ \ \ \ struct\ fi_link_attr\ \ \ *link_attr;
-\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ *prov_attr;
+struct fid_nic {
+    struct fid             fid;
+    struct fi_device_attr *device_attr;
+    struct fi_bus_attr    *bus_attr;
+    struct fi_link_attr   *link_attr;
+    void                  *prov_attr;
 };
 
-struct\ fi_device_attr\ {
-\ \ \ \ char\ *name;
-\ \ \ \ char\ *device_id;
-\ \ \ \ char\ *device_version;
-\ \ \ \ char\ *vendor_id;
-\ \ \ \ char\ *driver;
-\ \ \ \ char\ *firmware;
+struct fi_device_attr {
+    char *name;
+    char *device_id;
+    char *device_version;
+    char *vendor_id;
+    char *driver;
+    char *firmware;
 };
 
-struct\ fi_pci_attr\ {
-\ \ \ \ uint16_t\ domain_id;
-\ \ \ \ uint8_t\ \ bus_id;
-\ \ \ \ uint8_t\ \ device_id;
-\ \ \ \ uint8_t\ \ function_id;
+struct fi_pci_attr {
+    uint16_t domain_id;
+    uint8_t  bus_id;
+    uint8_t  device_id;
+    uint8_t  function_id;
 };
 
-struct\ fi_bus_attr\ {
-\ \ \ \ enum\ fi_bus_type\ \ \ \ \ \ \ bus_type;
-\ \ \ \ union\ {
-\ \ \ \ \ \ \ \ struct\ fi_pci_attr\ pci;
-\ \ \ \ }\ attr;
+struct fi_bus_attr {
+    enum fi_bus_type       bus_type;
+    union {
+        struct fi_pci_attr pci;
+    } attr;
 };
 
-struct\ fi_link_attr\ {
-\ \ \ \ char\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ *address;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ mtu;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ speed;
-\ \ \ \ enum\ fi_link_state\ state;
-\ \ \ \ char\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ *network_type;
+struct fi_link_attr {
+    char               *address;
+    size_t             mtu;
+    size_t             speed;
+    enum fi_link_state state;
+    char               *network_type;
 };
-\f[]
+\f[R]
 .fi
 .SS Device Attributes
 .PP
 Device attributes are used to identify the specific virtual or hardware
 NIC associated with an fi_info structure.
 .TP
-.B \f[I]name\f[]
+.B \f[I]name\f[R]
 The operating system name associated with the device.
-This may be a logical network interface name (e.g.
-eth0 or eno1) or an absolute filename.
-.RS
-.RE
+This may be a logical network interface name (e.g.\ eth0 or eno1) or an
+absolute filename.
 .TP
-.B \f[I]device_id\f[]
+.B \f[I]device_id\f[R]
 This is a vendor specific identifier for the device or product.
-.RS
-.RE
 .TP
-.B \f[I]device_version\f[]
+.B \f[I]device_version\f[R]
 Indicates the version of the device.
-.RS
-.RE
 .TP
-.B \f[I]vendor_id\f[]
+.B \f[I]vendor_id\f[R]
 Indicates the name of the vendor that distributes the NIC.
-.RS
-.RE
 .TP
-.B \f[I]driver\f[]
+.B \f[I]driver\f[R]
 The name of the driver associated with the device
-.RS
-.RE
 .TP
-.B \f[I]firmware\f[]
-The device\[aq]s firmware version.
-.RS
-.RE
+.B \f[I]firmware\f[R]
+The device\[cq]s firmware version.
 .SS Bus Attributes
 .PP
 The bus attributes are used to identify the physical location of the NIC
 in the system.
 .TP
-.B \f[I]bus_type\f[]
+.B \f[I]bus_type\f[R]
 Indicates the type of system bus where the NIC is located.
 Valid values are FI_BUS_PCI or FI_BUS_UNKNOWN.
-.RS
-.RE
 .TP
-.B \f[I]attr.pci.domain_id\f[]
+.B \f[I]attr.pci.domain_id\f[R]
 The domain where the PCI bus is located.
 Valid only if bus_type is FI_BUS_PCI.
-.RS
-.RE
 .TP
-.B \f[I]attr.pci.bus_id\f[]
+.B \f[I]attr.pci.bus_id\f[R]
 The PCI bus identifier where the device is located.
 Valid only if bus_type is FI_BUS_PCI.
-.RS
-.RE
 .TP
-.B \f[I]attr.pci.device_id\f[]
+.B \f[I]attr.pci.device_id\f[R]
 The identifier on the PCI bus where the device is located.
 Valid only if bus_type is FI_BUS_PCI.
-.RS
-.RE
 .TP
-.B \f[I]attr.pci.function_id\f[]
+.B \f[I]attr.pci.function_id\f[R]
 The function on the device being referenced.
 Valid only if bus_type is FI_BUS_PCI.
-.RS
-.RE
 .SS Link Attributes
 .PP
 Link attributes describe low\-level details about the network connection
 into the fabric.
 .TP
-.B \f[I]address\f[]
+.B \f[I]address\f[R]
 The primary link\-level address associated with the NIC, such as a MAC
 address.
 If multiple addresses are available, only one will be reported.
-.RS
-.RE
 .TP
-.B \f[I]mtu\f[]
+.B \f[I]mtu\f[R]
 The maximum transfer unit of link level frames or packets, in bytes.
-.RS
-.RE
 .TP
-.B \f[I]speed\f[]
+.B \f[I]speed\f[R]
 The active link data rate, given in bits per second.
-.RS
-.RE
 .TP
-.B \f[I]state\f[]
+.B \f[I]state\f[R]
 The current physical port state.
 Possible values are FI_LINK_UNKNOWN, FI_LINK_DOWN, and FI_LINK_UP, to
 indicate if the port state is unknown or not applicable (unknown),
 inactive (down), or active (up).
-.RS
-.RE
 .TP
-.B \f[I]network_type\f[]
+.B \f[I]network_type\f[R]
 Specifies the type of network interface currently active, such as
 Ethernet or InfiniBand.
-.RS
-.RE
 .SS Provider Attributes
 .PP
 Provider attributes reference provider specific details of the device.
 These attributes are both provider and device specific.
-The attributes can be interpreted by \f[C]fi_tostr\f[](3).
+The attributes can be interpreted by \f[C]fi_tostr\f[R](3).
 Applications may also use the other attribute fields, such as related
 fi_fabric_attr: prov_name field, to determine an appropriate structure
 to cast the attributes.
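+.PP
+A minimal sketch of inspecting these attributes (any field may be NULL
+or 0, as noted above):
+.IP
+.nf
+\f[C]
+/* after fi_getinfo(), report NIC details when present */
+if (info\->nic && info\->nic\->device_attr)
+    printf("device %s, driver %s\[rs]n",
+        info\->nic\->device_attr\->name,
+        info\->nic\->device_attr\->driver);
+\f[R]
+.fi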
@@ -177,10 +145,10 @@ specific header file included with libfabric package.
 .SH NOTES
 .PP
 The fid_nic structure is returned as part of a call to
-\f[C]fi_getinfo\f[](3).
-It is automatically freed as part of calling \f[C]fi_freeinfo\f[](3)
+\f[C]fi_getinfo\f[R](3).
+It is automatically freed as part of calling \f[C]fi_freeinfo\f[R](3)
 .SH SEE ALSO
 .PP
-\f[C]fi_getinfo\f[](3)
+\f[C]fi_getinfo\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_open.3 b/deps/libfabric/man/man3/fi_open.3
index b356493b4b51a9cab29dcb57288c73cfe8aae7b5..f72423bcc564bc03bb4630a692cbee47f8781874 100644
--- a/deps/libfabric/man/man3/fi_open.3
+++ b/deps/libfabric/man/man3/fi_open.3
@@ -1 +1 @@
-.so man3/fi_domain.3
+.so man3/fi_provider.3
diff --git a/deps/libfabric/man/man3/fi_poll.3 b/deps/libfabric/man/man3/fi_poll.3
index 787c139501059b7f7fa14f74d40fa5b632135035..8d41f7a0aa78637e58588f3bafecfaae428cc165 100644
--- a/deps/libfabric/man/man3/fi_poll.3
+++ b/deps/libfabric/man/man3/fi_poll.3
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_poll" "3" "2020\-04\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_poll" "3" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -8,128 +8,92 @@ fi_poll \- Polling and wait set operations
 .TP
 .B fi_poll_open / fi_close
 Open/close a polling set
-.RS
-.RE
 .TP
 .B fi_poll_add / fi_poll_del
 Add/remove a completion queue or counter to/from a poll set.
-.RS
-.RE
 .TP
 .B fi_poll
 Poll for progress and events across multiple completion queues and
 counters.
-.RS
-.RE
 .TP
 .B fi_wait_open / fi_close
 Open/close a wait set
-.RS
-.RE
 .TP
 .B fi_wait
 Waits for one or more wait objects in a set to be signaled.
-.RS
-.RE
 .TP
 .B fi_trywait
 Indicate when it is safe to block on wait objects using native OS calls.
-.RS
-.RE
 .TP
 .B fi_control
 Control wait set operation or attributes.
-.RS
-.RE
 .SH SYNOPSIS
 .IP
 .nf
 \f[C]
-#include\ <rdma/fi_domain.h>
+#include <rdma/fi_domain.h>
 
-int\ fi_poll_open(struct\ fid_domain\ *domain,\ struct\ fi_poll_attr\ *attr,
-\ \ \ \ struct\ fid_poll\ **pollset);
+int fi_poll_open(struct fid_domain *domain, struct fi_poll_attr *attr,
+    struct fid_poll **pollset);
 
-int\ fi_close(struct\ fid\ *pollset);
+int fi_close(struct fid *pollset);
 
-int\ fi_poll_add(struct\ fid_poll\ *pollset,\ struct\ fid\ *event_fid,
-\ \ \ \ uint64_t\ flags);
+int fi_poll_add(struct fid_poll *pollset, struct fid *event_fid,
+    uint64_t flags);
 
-int\ fi_poll_del(struct\ fid_poll\ *pollset,\ struct\ fid\ *event_fid,
-\ \ \ \ uint64_t\ flags);
+int fi_poll_del(struct fid_poll *pollset, struct fid *event_fid,
+    uint64_t flags);
 
-int\ fi_poll(struct\ fid_poll\ *pollset,\ void\ **context,\ int\ count);
+int fi_poll(struct fid_poll *pollset, void **context, int count);
 
-int\ fi_wait_open(struct\ fid_fabric\ *fabric,\ struct\ fi_wait_attr\ *attr,
-\ \ \ \ struct\ fid_wait\ **waitset);
+int fi_wait_open(struct fid_fabric *fabric, struct fi_wait_attr *attr,
+    struct fid_wait **waitset);
 
-int\ fi_close(struct\ fid\ *waitset);
+int fi_close(struct fid *waitset);
 
-int\ fi_wait(struct\ fid_wait\ *waitset,\ int\ timeout);
+int fi_wait(struct fid_wait *waitset, int timeout);
 
-int\ fi_trywait(struct\ fid_fabric\ *fabric,\ struct\ fid\ **fids,\ size_t\ count);
+int fi_trywait(struct fid_fabric *fabric, struct fid **fids, size_t count);
 
-int\ fi_control(struct\ fid\ *waitset,\ int\ command,\ void\ *arg);
-\f[]
+int fi_control(struct fid *waitset, int command, void *arg);
+\f[R]
 .fi
 .SH ARGUMENTS
 .TP
-.B \f[I]fabric\f[]
+.B \f[I]fabric\f[R]
 Fabric provider
-.RS
-.RE
 .TP
-.B \f[I]domain\f[]
+.B \f[I]domain\f[R]
 Resource domain
-.RS
-.RE
 .TP
-.B \f[I]pollset\f[]
+.B \f[I]pollset\f[R]
 Event poll set
-.RS
-.RE
 .TP
-.B \f[I]waitset\f[]
+.B \f[I]waitset\f[R]
 Wait object set
-.RS
-.RE
 .TP
-.B \f[I]attr\f[]
+.B \f[I]attr\f[R]
 Poll or wait set attributes
-.RS
-.RE
 .TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
 On success, an array of user context values associated with completion
 queues or counters.
-.RS
-.RE
 .TP
-.B \f[I]fids\f[]
+.B \f[I]fids\f[R]
 An array of fabric descriptors, each one associated with a native wait
 object.
-.RS
-.RE
 .TP
-.B \f[I]count\f[]
+.B \f[I]count\f[R]
 Number of entries in context or fids array.
-.RS
-.RE
 .TP
-.B \f[I]timeout\f[]
+.B \f[I]timeout\f[R]
 Time to wait for a signal, in milliseconds.
-.RS
-.RE
 .TP
-.B \f[I]command\f[]
+.B \f[I]command\f[R]
 Command of control operation to perform on the wait set.
-.RS
-.RE
 .TP
-.B \f[I]arg\f[]
+.B \f[I]arg\f[R]
 Optional control argument.
-.RS
-.RE
 .SH DESCRIPTION
 .SS fi_poll_open
 .PP
@@ -142,17 +106,15 @@ A poll set is defined with the following attributes.
 .IP
 .nf
 \f[C]
-struct\ fi_poll_attr\ {
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ \ \ flags;\ \ \ \ \ /*\ operation\ flags\ */
+struct fi_poll_attr {
+    uint64_t             flags;     /* operation flags */
 };
-\f[]
+\f[R]
 .fi
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 Flags that set the default operation of the poll set.
 The use of this field is reserved and must be set to 0 by the caller.
-.RS
-.RE
 .SS fi_close
 .PP
 The fi_close call releases all resources associated with a poll set.
@@ -172,7 +134,7 @@ If events might have occurred, contexts associated with the completion
 queues and/or counters are returned.
 Completion queues will return their context if they are not empty.
 The context associated with a counter will be returned if the
-counter\[aq]s success value or error value have changed since the last
+counter\[cq]s success value or error value have changed since the last
 time fi_poll, fi_cntr_set, or fi_cntr_add were called.
 The number of contexts is limited to the size of the context array,
 indicated by the count parameter.
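+.PP
+A minimal sketch of the calls above, assuming \f[C]domain\f[R],
+\f[C]cq1\f[R], and \f[C]cq2\f[R] were opened earlier:
+.IP
+.nf
+\f[C]
+struct fi_poll_attr attr = { 0 };  /* flags are reserved, must be 0 */
+struct fid_poll *ps;
+void *ctx[2];
+
+fi_poll_open(domain, &attr, &ps);
+fi_poll_add(ps, &cq1\->fid, 0);
+fi_poll_add(ps, &cq2\->fid, 0);
+
+/* returns the contexts of members that may have events */
+int n = fi_poll(ps, ctx, 2);
+\f[R]
+.fi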
@@ -199,24 +161,22 @@ fi_wait_attr.
 .IP
 .nf
 \f[C]
-struct\ fi_wait_attr\ {
-\ \ \ \ enum\ fi_wait_obj\ \ \ \ \ wait_obj;\ \ /*\ requested\ wait\ object\ */
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ \ \ flags;\ \ \ \ \ /*\ operation\ flags\ */
+struct fi_wait_attr {
+    enum fi_wait_obj     wait_obj;  /* requested wait object */
+    uint64_t             flags;     /* operation flags */
 };
-\f[]
+\f[R]
 .fi
 .TP
-.B \f[I]wait_obj\f[]
+.B \f[I]wait_obj\f[R]
 Wait sets are associated with specific wait object(s).
 Wait objects allow applications to block until the wait object is
 signaled, indicating that an event is available to be read.
 The following values may be used to specify the type of wait object
 associated with a wait set: FI_WAIT_UNSPEC, FI_WAIT_FD,
 FI_WAIT_MUTEX_COND, and FI_WAIT_YIELD.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_UNSPEC\f[]
+.B \- \f[I]FI_WAIT_UNSPEC\f[R]
 Specifies that the user will only wait on the wait set using fabric
 interface calls, such as fi_wait.
 In this case, the underlying provider may select the most appropriate or
@@ -224,10 +184,8 @@ highest performing wait object available, including custom wait
 mechanisms.
 Applications that select FI_WAIT_UNSPEC are not guaranteed to retrieve
 the underlying wait object.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_FD\f[]
+.B \- \f[I]FI_WAIT_FD\f[R]
 Indicates that the wait set should use a single file descriptor as its
 wait mechanism, as exposed to the application.
 Internally, this may require the use of epoll in order to support
@@ -236,16 +194,12 @@ File descriptor wait objects must be usable in the POSIX select(2) and
 poll(2), and Linux epoll(7) routines (if available).
 Provider signal an FD wait object by marking it as readable or with an
 error.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_MUTEX_COND\f[]
+.B \- \f[I]FI_WAIT_MUTEX_COND\f[R]
 Specifies that the wait set should use a pthread mutex and cond variable
 as a wait object.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_POLLFD\f[]
+.B \- \f[I]FI_WAIT_POLLFD\f[R]
 This option is similar to FI_WAIT_FD, but allows the wait mechanism to
 use multiple file descriptors as its wait mechanism, as viewed by the
 application.
@@ -255,20 +209,14 @@ for events.
 The file descriptors must be usable in the POSIX select(2) and poll(2)
 routines, and match directly to being used with poll.
 See the NOTES section below for details on using pollfd.
-.RS
-.RE
 .TP
-.B \- \f[I]FI_WAIT_YIELD\f[]
+.B \- \f[I]FI_WAIT_YIELD\f[R]
 Indicates that the wait set will wait without a wait object but instead
 yield on every wait.
-.RS
-.RE
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 Flags that set the default operation of the wait set.
 The use of this field is reserved and must be set to 0 by the caller.
-.RS
-.RE
 .SS fi_close
 .PP
 The fi_close call releases all resources associated with a wait set.
@@ -295,19 +243,19 @@ conjunction with the OS select(2) call.
 .IP
 .nf
 \f[C]
-fi_control(&cq\->fid,\ FI_GETWAIT,\ (void\ *)\ &fd);
+fi_control(&cq\->fid, FI_GETWAIT, (void *) &fd);
 FD_ZERO(&fds);
-FD_SET(fd,\ &fds);
+FD_SET(fd, &fds);
 
-while\ (1)\ {
-\ \ \ \ if\ (fi_trywait(&cq,\ 1)\ ==\ FI_SUCCESS)
-\ \ \ \ \ \ \ \ select(fd\ +\ 1,\ &fds,\ NULL,\ &fds,\ &timeout);
+while (1) {
+    if (fi_trywait(&cq, 1) == FI_SUCCESS)
+        select(fd + 1, &fds, NULL, &fds, &timeout);
 
-\ \ \ \ do\ {
-\ \ \ \ \ \ \ \ ret\ =\ fi_cq_read(cq,\ &comp,\ 1);
-\ \ \ \ }\ while\ (ret\ >\ 0);
+    do {
+        ret = fi_cq_read(cq, &comp, 1);
+    } while (ret > 0);
 }
-\f[]
+\f[R]
 .fi
 .PP
 fi_trywait() will return FI_SUCCESS if it is safe to block on the wait
@@ -325,7 +273,7 @@ The following types of fabric descriptors may be passed into fi_trywait:
 event queues, completion queues, counters, and wait sets.
 Applications that wish to use native wait calls should select specific
 wait objects when allocating such resources.
-For example, by setting the item\[aq]s creation attribute wait_obj value
+For example, by setting the item\[cq]s creation attribute wait_obj value
 to FI_WAIT_FD.
 .PP
 In the case the wait object to check belongs to a wait set, only the
@@ -347,37 +295,30 @@ fi_control is invoked, as it may redirect the implementation of wait set
 operations.
 The following control commands are usable with a wait set or fid.
 .TP
-.B \f[I]FI_GETWAIT (void **)\f[]
+.B \f[I]FI_GETWAIT (void **)\f[R]
 This command allows the user to retrieve the low\-level wait object
 associated with a wait set or fid.
 The format of the wait set is specified during wait set creation,
 through the wait set attributes.
 The fi_control arg parameter should be an address where a pointer to the
 returned wait object will be written.
-This should be an \[aq]int *\[aq] for FI_WAIT_FD, \[aq]struct
-fi_mutex_cond\[aq] for FI_WAIT_MUTEX_COND, or \[aq]struct
-fi_wait_pollfd\[aq] for FI_WAIT_POLLFD.
+This should be an `int *' for FI_WAIT_FD, `struct fi_mutex_cond'
+for FI_WAIT_MUTEX_COND, or `struct fi_wait_pollfd' for FI_WAIT_POLLFD.
 Support for FI_GETWAIT is provider specific.
-.RS
-.RE
 .TP
-.B \f[I]FI_GETWAITOBJ (enum fi_wait_obj *)\f[]
+.B \f[I]FI_GETWAITOBJ (enum fi_wait_obj *)\f[R]
 This command returns the type of wait object associated with a wait set
 or fid.
-.RS
-.RE
 .SH RETURN VALUES
 .PP
 Returns FI_SUCCESS on success.
 On error, a negative value corresponding to fabric errno is returned.
 .PP
-Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[].
+Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R].
 .TP
 .B fi_poll
 On success, if events are available, returns the number of entries
 written to the context array.
-.RS
-.RE
 .SH NOTES
 .PP
 In many situations, blocking calls may need to wait on signals sent to a
@@ -395,7 +336,7 @@ mechanism.
 A significant different between using POLLFD versus FD wait objects is
 that with FI_WAIT_POLLFD, the file descriptors may change dynamically.
 As an example, the file descriptors associated with a completion
-queues\[aq] wait set may change as endpoint associations with the CQ are
+queues\[cq] wait set may change as endpoint associations with the CQ are
 added and removed.
 .PP
 Struct fi_wait_pollfd is used to retrieve all file descriptors for fids
@@ -403,15 +344,15 @@ using FI_WAIT_POLLFD to support blocking calls.
 .IP
 .nf
 \f[C]
-struct\ fi_wait_pollfd\ {
-\ \ \ \ uint64_t\ \ \ \ \ \ change_index;
-\ \ \ \ size_t\ \ \ \ \ \ \ \ nfds;
-\ \ \ \ struct\ pollfd\ *fd;
+struct fi_wait_pollfd {
+    uint64_t      change_index;
+    size_t        nfds;
+    struct pollfd *fd;
 };
-\f[]
+\f[R]
 .fi
 .TP
-.B \f[I]change_index\f[]
+.B \f[I]change_index\f[R]
 The change_index may be used to determine if there have been any changes
 to the file descriptor list.
 Anytime a file descriptor is added, removed, or its events are updated,
@@ -423,10 +364,8 @@ fi_control() to retrieve the current change_index and compare that
 against its cached value.
 If the values differ, then the app should update its file descriptor
 list prior to blocking.
-.RS
-.RE
 .TP
-.B \f[I]nfds\f[]
+.B \f[I]nfds\f[R]
 On input to fi_control(), this indicates the number of entries in the
 struct pollfd * array.
 On output, this will be set to the number of entries needed to store the
@@ -435,18 +374,14 @@ If the input value is smaller than the output value, fi_control() will
 return the error \-FI_ETOOSMALL.
 Note that setting nfds = 0 allows an efficient way of checking the
 change_index.
-.RS
-.RE
 .TP
-.B \f[I]fd\f[]
+.B \f[I]fd\f[R]
 This points to an array of struct pollfd entries.
 The number of entries is specified through the nfds field.
 If the number of needed entries is less than or equal to the number of
 entries available, the struct pollfd array will be filled out with a
 list of file descriptors and corresponding events that can be used in
 the select(2) and poll(2) calls.
-.RS
-.RE
 .PP
 The change_index is updated only when the file descriptors associated
 with the pollfd file set has changed.
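+.PP
+A minimal sketch of the two\-step retrieval described above, assuming
+\f[C]waitset\f[R] was opened with FI_WAIT_POLLFD:
+.IP
+.nf
+\f[C]
+struct fi_wait_pollfd pf = { .nfds = 0 };
+
+/* nfds = 0: fails with \-FI_ETOOSMALL, returns count and change_index */
+fi_control(&waitset\->fid, FI_GETWAIT, &pf);
+pf.fd = calloc(pf.nfds, sizeof(struct pollfd));
+
+/* second call fills the pollfd array */
+fi_control(&waitset\->fid, FI_GETWAIT, &pf);
+poll(pf.fd, pf.nfds, timeout);  /* timeout in milliseconds */
+\f[R]
+.fi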
@@ -456,7 +391,7 @@ The use of the fi_trywait() function is still required if accessing wait
 objects directly.
 .SH SEE ALSO
 .PP
-\f[C]fi_getinfo\f[](3), \f[C]fi_domain\f[](3), \f[C]fi_cntr\f[](3),
-\f[C]fi_eq\f[](3)
+\f[C]fi_getinfo\f[R](3), \f[C]fi_domain\f[R](3), \f[C]fi_cntr\f[R](3),
+\f[C]fi_eq\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_provider.3 b/deps/libfabric/man/man3/fi_provider.3
new file mode 100644
index 0000000000000000000000000000000000000000..1ae81a885f8fe9050fa831affbb340e56e6b2206
--- /dev/null
+++ b/deps/libfabric/man/man3/fi_provider.3
@@ -0,0 +1,252 @@
+.\" Automatically generated by Pandoc 2.5
+.\"
+.TH "fi_provider" "3" "2021\-09\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
+.hy
+.SH NAME
+.PP
+fi_prov_ini \- External provider entry point
+.TP
+.B fi_param_define / fi_param_get
+Register and retrieve environment variables with the libfabric core
+.TP
+.B fi_log_enabled / fi_log_ready / fi_log
+Control and output debug logging information.
+.TP
+.B fi_open / fi_close
+Open a named library object
+.TP
+.B fi_export_fid / fi_import_fid
+Share a fabric object between different providers or resources
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include <rdma/fabric.h>
+#include <rdma/prov/fi_prov.h>
+
+struct fi_provider* fi_prov_ini(void);
+
+int fi_param_define(const struct fi_provider *provider, const char *param_name,
+    enum fi_param_type type, const char *help_string_fmt, ...);
+
+int fi_param_get_str(struct fi_provider *provider, const char *param_name,
+    char **value);
+
+int fi_param_get_int(struct fi_provider *provider, const char *param_name,
+    int *value);
+
+int fi_param_get_bool(struct fi_provider *provider, const char *param_name,
+    int *value);
+
+int fi_param_get_size_t(struct fi_provider *provider, const char *param_name,
+    size_t *value);
+\f[R]
+.fi
+.IP
+.nf
+\f[C]
+#include <rdma/fabric.h>
+#include <rdma/prov/fi_prov.h>
+#include <rdma/prov/fi_log.h>
+
+int fi_log_enabled(const struct fi_provider *prov, enum fi_log_level level,
+    enum fi_log_subsys subsys);
+
+int fi_log_ready(const struct fi_provider *prov, enum fi_log_level level,
+    enum fi_log_subsys subsys, uint64_t *showtime);
+
+void fi_log(const struct fi_provider *prov, enum fi_log_level level,
+    enum fi_log_subsys subsys, const char *func, int line,
+    const char *fmt, ...);
+\f[R]
+.fi
+.IP
+.nf
+\f[C]
+#include <rdma/fabric.h>
+
+int fi_open(uint32_t version, const char *name, void *attr,
+    size_t attr_len, uint64_t flags, struct fid **fid, void *context);
+
+int fi_close(struct fid *fid);
+\f[R]
+.fi
+.IP
+.nf
+\f[C]
+#include <rdma/fabric.h>
+#include <rdma/fi_ext.h>
+
+int fi_export_fid(struct fid *fid, uint64_t flags,
+    struct fid **expfid, void *context);
+
+int fi_import_fid(struct fid *fid, struct fid *expfid, uint64_t flags);
+\f[R]
+.fi
+.SH ARGUMENTS
+.TP
+.B \f[I]provider\f[R]
+Reference to the provider.
+.TP
+.B \f[I]version\f[R]
+API version requested by application.
+.TP
+.B \f[I]name\f[R]
+Well\-known name of the library object to open.
+.TP
+.B \f[I]attr\f[R]
+Optional attributes of object to open.
+.TP
+.B \f[I]attr_len\f[R]
+Size of any attribute structure passed to fi_open.
+Should be 0 if no attributes are given.
+.TP
+.B \f[I]fid\f[R]
+Returned fabric identifier for opened object.
+.SH DESCRIPTION
+.PP
+A fabric provider implements the application facing software interfaces
+needed to access network specific protocols, drivers, and hardware.
+The interfaces and structures defined by this man page are exported by
+the libfabric library, but are targeted for provider implementations,
+rather than for direct use by most applications.
+.PP
+Integrated providers are those built directly into the libfabric library
+itself.
+External providers are loaded dynamically by libfabric at initialization
+time.
+External providers must be in a standard library path or in the
+libfabric library search path as specified by environment variable.
+Additionally, external providers must be named with the suffix
+\[lq]\-fi.so\[rq] at the end of the name.
+.PP
+Named objects are special purpose resources which are accessible
+directly to applications.
+They may be used to enhance or modify the behavior of library core.
+For details, see the fi_open call below.
+.SS fi_prov_ini
+.PP
+This entry point must be defined by external providers.
+On loading, libfabric will invoke fi_prov_ini() to retrieve the
+provider\[cq]s fi_provider structure.
+Additional interactions between the libfabric core and the provider will
+be through the interfaces defined by that struct.
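+.PP
+A minimal sketch of such an entry point; the \f[C]example_*\f[R] names
+are placeholders for provider\-supplied callbacks:
+.IP
+.nf
+\f[C]
+static struct fi_provider example_prov = {
+    .name       = "example",
+    .version    = FI_VERSION(1, 0),
+    .fi_version = FI_VERSION(1, 14),
+    .getinfo    = example_getinfo,
+    .fabric     = example_fabric,
+    .cleanup    = example_cleanup,
+};
+
+struct fi_provider *fi_prov_ini(void)
+{
+    return &example_prov;
+}
+\f[R]
+.fi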
+.SS fi_param_define
+.PP
+Defines a configuration parameter for use by a specified provider.
+The help_string and param_name arguments must be non\-NULL, help_string
+must additionally be non\-empty.
+They are copied internally and may be freed after calling
+fi_param_define.
+.SS fi_param_get
+.PP
+Gets the value of a configuration parameter previously defined using
+fi_param_define().
+The value comes from the environment variable name of the form
+FI_<provider_name>_<param_name>, all converted to upper case.
+.PP
+If the parameter was previously defined and the user set a value,
+FI_SUCCESS is returned and (*value) points to the retrieved value.
+.PP
+If the parameter name was previously defined, but the user did not set a
+value, \-FI_ENODATA is returned and the value of (*value) is unchanged.
+.PP
+If the parameter name was not previously defined via fi_param_define(),
+\-FI_ENOENT will be returned and the value of (*value) is unchanged.
+.PP
+If the value in the environment is not valid for the parameter type,
+\-FI_EINVAL will be returned and the value of (*value) is unchanged.
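+.PP
+As a sketch, with \f[C]example_prov\f[R] standing in for the
+provider\[cq]s fi_provider structure, a parameter defined as
+.IP
+.nf
+\f[C]
+fi_param_define(&example_prov, "spin_count", FI_PARAM_INT,
+    "Number of iterations to spin before yielding");
+
+int spin = 100;  /* default used when the variable is unset */
+fi_param_get_int(&example_prov, "spin_count", &spin);
+\f[R]
+.fi
+.PP
+would be read from the environment variable FI_EXAMPLE_SPIN_COUNT.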
+.SS fi_log_enabled / fi_log_ready / fi_log
+.PP
+These functions control debug and informational logging output.
+Providers typically access these functions through the FI_LOG and
+related macros in fi_log.h and do not call these functions directly.
+.SS fi_open
+.PP
+Open a library resource using a well\-known name.
+This feature allows applications and providers a mechanism which can be
+used to modify or enhance core library services and behavior.
+The details are specific to the requested object name.
+Most applications will not need this level of control.
+.PP
+The library API version known to the application should be provided
+through the version parameter.
+The use of attributes is object dependent.
+If required, attributes should be provided through the attr parameter,
+with attr_len set to the size of the referenced attribute structure.
+The following is a list of published names, along with descriptions of
+the service or resource to which they correspond.
+.TP
+.B \f[I]mr_cache\f[R]
+The mr_cache object references the internal memory registration cache
+used by the different providers.
+Additional information on the cache is available in the
+\f[C]fi_mr(3)\f[R] man page.
+.SS fi_export_fid / fi_import_fid
+.PP
+Generally, fabric objects are allocated and managed entirely by a single
+provider.
+Typically only the application facing software interfaces of a fabric
+object are defined, for example, the message or tagged operations of an
+endpoint.
+The fi_export_fid and fi_import_fid calls provide a mechanism by which
+provider facing APIs may be accessed.
+This allows the creation of fid objects that are shareable between
+providers, or for library plug\-in services.
+The ability to export a shareable object is object and provider
+implementation dependent.
+.PP
+Shareable fids typically contain at least 3 main components: a base fid,
+a set of exporter defined ops, and a set of importer defined ops.
+.SH NOTES
+.PP
+TODO
+.SH PROVIDER INTERFACE
+.PP
+The fi_provider structure defines entry points for the libfabric core to
+use to access the provider.
+All other calls into a provider are through function pointers associated
+with allocated objects.
+.IP
+.nf
+\f[C]
+struct fi_provider {
+    uint32_t version;
+    uint32_t fi_version;
+    struct fi_context context;
+    const char *name;
+    int (*getinfo)(uint32_t version, const char *node, const char *service,
+            uint64_t flags, const struct fi_info *hints,
+            struct fi_info **info);
+    int (*fabric)(struct fi_fabric_attr *attr, struct fid_fabric **fabric,
+            void *context);
+    void    (*cleanup)(void);
+};
+\f[R]
+.fi
+.SS version
+.PP
+The provider version.
+For providers integrated with the library, this is often the same as the
+library version.
+.SS fi_version
+.PP
+The library interface version that the provider was implemented against.
+The provider\[cq]s fi_version must be greater than or equal to an
+application\[cq]s requested api version for the application to use the
+provider.
+It is a provider\[cq]s responsibility to support older versions of the
+api if it wishes to support legacy applications.
+For integrated providers
+.SS TODO
+.SH RETURN VALUE
+.PP
+Returns FI_SUCCESS on success.
+On error, a negative value corresponding to fabric errno is returned.
+Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R].
+.SH ERRORS
+.SH SEE ALSO
+.PP
+\f[C]fabric\f[R](7), \f[C]fi_getinfo\f[R](3), \f[C]fi_mr\f[R](3)
+.SH AUTHORS
+OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_rma.3 b/deps/libfabric/man/man3/fi_rma.3
index 9b16a50ac81e9e4accdf76c03641e06c20e86af5..bf059370e656f9ff967f767ea175ae57ad4637a0 100644
--- a/deps/libfabric/man/man3/fi_rma.3
+++ b/deps/libfabric/man/man3/fi_rma.3
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_rma" "3" "2019\-09\-27" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_rma" "3" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -8,133 +8,100 @@ fi_rma \- Remote memory access operations
 .TP
 .B fi_read / fi_readv / fi_readmsg
 Initiates a read from remote memory
-.RS
-.RE
 .PP
-fi_write / fi_writev / fi_writemsg
-.PD 0
-.P
-.PD
-fi_inject_write / fi_writedata : Initiate a write to remote memory
+fi_write / fi_writev / fi_writemsg / fi_inject_write / fi_writedata :
+Initiate a write to remote memory
 .SH SYNOPSIS
 .IP
 .nf
 \f[C]
-#include\ <rdma/fi_rma.h>
+#include <rdma/fi_rma.h>
 
-ssize_t\ fi_read(struct\ fid_ep\ *ep,\ void\ *buf,\ size_t\ len,\ void\ *desc,
-\ \ \ \ fi_addr_t\ src_addr,\ uint64_t\ addr,\ uint64_t\ key,\ void\ *context);
+ssize_t fi_read(struct fid_ep *ep, void *buf, size_t len, void *desc,
+    fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context);
 
-ssize_t\ fi_readv(struct\ fid_ep\ *ep,\ const\ struct\ iovec\ *iov,\ void\ **desc,
-\ \ \ \ size_t\ count,\ fi_addr_t\ src_addr,\ uint64_t\ addr,\ uint64_t\ key,
-\ \ \ \ void\ *context);
+ssize_t fi_readv(struct fid_ep *ep, const struct iovec *iov, void **desc,
+    size_t count, fi_addr_t src_addr, uint64_t addr, uint64_t key,
+    void *context);
 
-ssize_t\ fi_readmsg(struct\ fid_ep\ *ep,\ const\ struct\ fi_msg_rma\ *msg,
-\ \ \ \ uint64_t\ flags);
+ssize_t fi_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg,
+    uint64_t flags);
 
-ssize_t\ fi_write(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ len,
-\ \ \ \ void\ *desc,\ fi_addr_t\ dest_addr,\ uint64_t\ addr,\ uint64_t\ key,
-\ \ \ \ void\ *context);
+ssize_t fi_write(struct fid_ep *ep, const void *buf, size_t len,
+    void *desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key,
+    void *context);
 
-ssize_t\ fi_writev(struct\ fid_ep\ *ep,\ const\ struct\ iovec\ *iov,\ void\ **desc,
-\ \ \ \ size_t\ count,\ fi_addr_t\ dest_addr,\ uint64_t\ addr,\ uint64_t\ key,
-\ \ \ \ void\ *context);
+ssize_t fi_writev(struct fid_ep *ep, const struct iovec *iov, void **desc,
+    size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key,
+    void *context);
 
-ssize_t\ fi_writemsg(struct\ fid_ep\ *ep,\ const\ struct\ fi_msg_rma\ *msg,
-\ \ \ \ uint64_t\ flags);
+ssize_t fi_writemsg(struct fid_ep *ep, const struct fi_msg_rma *msg,
+    uint64_t flags);
 
-ssize_t\ fi_inject_write(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ len,
-\ \ \ \ fi_addr_t\ dest_addr,\ uint64_t\ addr,\ uint64_t\ key);
+ssize_t fi_inject_write(struct fid_ep *ep, const void *buf, size_t len,
+    fi_addr_t dest_addr, uint64_t addr, uint64_t key);
 
-ssize_t\ fi_writedata(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ len,
-\ \ \ \ void\ *desc,\ uint64_t\ data,\ fi_addr_t\ dest_addr,\ uint64_t\ addr,
-\ \ \ \ uint64_t\ key,\ void\ *context);
+ssize_t fi_writedata(struct fid_ep *ep, const void *buf, size_t len,
+    void *desc, uint64_t data, fi_addr_t dest_addr, uint64_t addr,
+    uint64_t key, void *context);
 
-ssize_t\ fi_inject_writedata(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ len,
-\ \ \ \ uint64_t\ data,\ fi_addr_t\ dest_addr,\ uint64_t\ addr,\ uint64_t\ key);
-\f[]
+ssize_t fi_inject_writedata(struct fid_ep *ep, const void *buf, size_t len,
+    uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key);
+\f[R]
 .fi
 .SH ARGUMENTS
 .TP
-.B \f[I]ep\f[]
+.B \f[I]ep\f[R]
 Fabric endpoint on which to initiate read or write operation.
-.RS
-.RE
 .TP
-.B \f[I]buf\f[]
+.B \f[I]buf\f[R]
 Local data buffer to read into (read target) or write from (write
 source)
-.RS
-.RE
 .TP
-.B \f[I]len\f[]
+.B \f[I]len\f[R]
 Length of data to read or write, specified in bytes.
-Valid transfers are from 0 bytes up to the endpoint\[aq]s max_msg_size.
-.RS
-.RE
+Valid transfers are from 0 bytes up to the endpoint\[cq]s max_msg_size.
 .TP
-.B \f[I]iov\f[]
+.B \f[I]iov\f[R]
 Vectored data buffer.
-.RS
-.RE
 .TP
-.B \f[I]count\f[]
+.B \f[I]count\f[R]
 Count of vectored data entries.
-.RS
-.RE
 .TP
-.B \f[I]addr\f[]
+.B \f[I]addr\f[R]
 Address of remote memory to access.
 This will be the virtual address of the remote region in the case of
 FI_MR_BASIC, or the offset from the starting address in the case of
 FI_MR_SCALABLE.
-.RS
-.RE
 .TP
-.B \f[I]key\f[]
+.B \f[I]key\f[R]
 Protection key associated with the remote memory.
-.RS
-.RE
 .TP
-.B \f[I]desc\f[]
-Descriptor associated with the local data buffer See \f[C]fi_mr\f[](3).
-.RS
-.RE
+.B \f[I]desc\f[R]
+Descriptor associated with the local data buffer See \f[C]fi_mr\f[R](3).
 .TP
-.B \f[I]data\f[]
+.B \f[I]data\f[R]
 Remote CQ data to transfer with the operation.
-.RS
-.RE
 .TP
-.B \f[I]dest_addr\f[]
+.B \f[I]dest_addr\f[R]
 Destination address for connectionless write transfers.
 Ignored for connected endpoints.
-.RS
-.RE
 .TP
-.B \f[I]src_addr\f[]
+.B \f[I]src_addr\f[R]
 Source address to read from for connectionless transfers.
 Ignored for connected endpoints.
-.RS
-.RE
 .TP
-.B \f[I]msg\f[]
+.B \f[I]msg\f[R]
 Message descriptor for read and write operations.
-.RS
-.RE
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 Additional flags to apply for the read or write operation.
-.RS
-.RE
 .TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
 User specified pointer to associate with the operation.
 This parameter is ignored if the operation will not generate a
 successful completion, unless an op flag specifies the context parameter
 be used for required input.
-.RS
-.RE
 .SH DESCRIPTION
 .PP
 RMA (remote memory access) operations are used to transfer data directly
@@ -142,15 +109,16 @@ between a local data buffer and a remote data buffer.
 RMA transfers occur on a byte level granularity, and no message
 boundaries are maintained.
 .PP
-The write functions \-\- fi_write, fi_writev, fi_writemsg,
-fi_inject_write, and fi_writedata \-\- are used to transmit data into a
+The write functions \[en] fi_write, fi_writev, fi_writemsg,
+fi_inject_write, and fi_writedata \[en] are used to transmit data into a
 remote memory buffer.
 The main difference between write functions are the number and type of
 parameters that they accept as input.
 Otherwise, they perform the same general function.
 .PP
-The read functions \-\- fi_read, fi_readv, and fi_readmsg \-\- are used
-to transfer data from a remote memory region into local data buffer(s).
+The read functions \[en] fi_read, fi_readv, and fi_readmsg \[en] are
+used to transfer data from a remote memory region into local data
+buffer(s).
 Similar to the write operations, read operations operate asynchronously.
 Users should not touch the posted data buffer(s) until the read
 operation has completed.
@@ -171,11 +139,6 @@ remote endpoint, so that the immediate data may be delivered.
 .PP
 The call fi_write transfers the data contained in the user\-specified
 data buffer to a remote memory region.
-The local endpoint must be connected to a remote endpoint or destination
-before fi_write is called.
-Unless the endpoint has been configured differently, the data buffer
-passed into fi_write must not be touched by the application until the
-fi_write call completes asynchronously.
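+.PP
+A minimal sketch, assuming \f[C]mr\f[R] registers \f[C]buf\f[R] and
+that the target address and key were obtained from the peer:
+.IP
+.nf
+\f[C]
+ret = fi_write(ep, buf, len, fi_mr_desc(mr),
+               dest_addr, remote_addr, remote_key, &ctx);
+\f[R]
+.fi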
 .SS fi_writev
 .PP
 The fi_writev call adds support for a scatter\-gather list to fi_write.
@@ -184,35 +147,35 @@ parameter to the remote memory region.
 .SS fi_writemsg
 .PP
 The fi_writemsg call supports data transfers over both connected and
-unconnected endpoints, with the ability to control the write operation
-per call through the use of flags.
+connectionless endpoints, with the ability to control the write
+operation per call through the use of flags.
 The fi_writemsg function takes a struct fi_msg_rma as input.
 .IP
 .nf
 \f[C]
-struct\ fi_msg_rma\ {
-\ \ \ \ const\ struct\ iovec\ *msg_iov;\ \ \ \ \ /*\ local\ scatter\-gather\ array\ */
-\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ **desc;\ \ \ \ \ \ \ /*\ operation\ descriptor\ */
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ iov_count;\ \ \ \ /*\ #\ elements\ in\ msg_iov\ */
-\ \ \ \ fi_addr_t\ \ \ \ \ \ \ \ \ \ addr;\ \ \ \ \ \ \ \ /*\ optional\ endpoint\ address\ */
-\ \ \ \ const\ struct\ fi_rma_iov\ *rma_iov;/*\ remote\ SGL\ */
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ rma_iov_count;/*\ #\ elements\ in\ rma_iov\ */
-\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ *context;\ \ \ \ \ /*\ user\-defined\ context\ */
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ data;\ \ \ \ \ \ \ \ \ /*\ optional\ immediate\ data\ */
+struct fi_msg_rma {
+    const struct iovec *msg_iov;     /* local scatter\-gather array */
+    void               **desc;       /* operation descriptor */
+    size_t             iov_count;    /* # elements in msg_iov */
+    fi_addr_t          addr;        /* optional endpoint address */
+    const struct fi_rma_iov *rma_iov;/* remote SGL */
+    size_t             rma_iov_count;/* # elements in rma_iov */
+    void               *context;     /* user\-defined context */
+    uint64_t           data;         /* optional immediate data */
 };
 
-struct\ fi_rma_iov\ {
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ addr;\ \ \ \ \ \ \ \ \ /*\ target\ RMA\ address\ */
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ len;\ \ \ \ \ \ \ \ \ \ /*\ size\ of\ target\ buffer\ */
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ key;\ \ \ \ \ \ \ \ \ \ /*\ access\ key\ */
+struct fi_rma_iov {
+    uint64_t           addr;         /* target RMA address */
+    size_t             len;          /* size of target buffer */
+    uint64_t           key;          /* access key */
 };
-\f[]
+\f[R]
 .fi
 .SS fi_inject_write
 .PP
 The write inject call is an optimized version of fi_write.
 It provides similar completion semantics as fi_inject
-\f[C]fi_msg\f[](3).
+\f[C]fi_msg\f[R](3).
 .SS fi_writedata
 .PP
 The write data call is similar to fi_write, but allows for the sending
@@ -226,8 +189,6 @@ the transfer.
 .PP
 The fi_read call requests that the remote endpoint transfer data from
 the remote memory region into the local data buffer.
-The local endpoint must be connected to a remote endpoint or destination
-before fi_read is called.
 .SS fi_readv
 .PP
 The fi_readv call adds support for a scatter\-gather list to fi_read.
@@ -236,7 +197,7 @@ of data buffers referenced by the iov parameter.
 .SS fi_readmsg
 .PP
 The fi_readmsg call supports data transfers over both connected and
-unconnected endpoints, with the ability to control the read operation
+connectionless endpoints, with the ability to control the read operation
 per call through the use of flags.
 The fi_readmsg function takes a struct fi_msg_rma as input.
 .SH FLAGS
@@ -249,32 +210,26 @@ fi_endpoint.3).
 The following list of flags are usable with fi_readmsg and/or
 fi_writemsg.
 .TP
-.B \f[I]FI_REMOTE_CQ_DATA\f[]
+.B \f[I]FI_REMOTE_CQ_DATA\f[R]
 Applies to fi_writemsg and fi_writedata.
 Indicates that remote CQ data is available and should be sent as part of
 the request.
 See fi_getinfo for additional details on FI_REMOTE_CQ_DATA.
-.RS
-.RE
 .TP
-.B \f[I]FI_COMPLETION\f[]
+.B \f[I]FI_COMPLETION\f[R]
 Indicates that a completion entry should be generated for the specified
 operation.
 The endpoint must be bound to a completion queue with
 FI_SELECTIVE_COMPLETION that corresponds to the specified operation, or
 this flag is ignored.
-.RS
-.RE
 .TP
-.B \f[I]FI_MORE\f[]
+.B \f[I]FI_MORE\f[R]
 Indicates that the user has additional requests that will immediately be
 posted after the current call returns.
 Use of this flag may improve performance by enabling the provider to
 optimize its access to the fabric hardware.
-.RS
-.RE
 .TP
-.B \f[I]FI_INJECT\f[]
+.B \f[I]FI_INJECT\f[R]
 Applies to fi_writemsg.
 Indicates that the outbound data buffer should be returned to user
 immediately after the write call returns, even if the operation is
@@ -282,39 +237,29 @@ handled asynchronously.
 This may require that the underlying provider implementation copy the
 data into a local buffer and transfer out of that buffer.
 This flag can only be used with messages smaller than inject_size.
-.RS
-.RE
 .TP
-.B \f[I]FI_INJECT_COMPLETE\f[]
+.B \f[I]FI_INJECT_COMPLETE\f[R]
 Applies to fi_writemsg.
 Indicates that a completion should be generated when the source
 buffer(s) may be reused.
-.RS
-.RE
 .TP
-.B \f[I]FI_TRANSMIT_COMPLETE\f[]
+.B \f[I]FI_TRANSMIT_COMPLETE\f[R]
 Applies to fi_writemsg.
 Indicates that a completion should not be generated until the operation
 has been successfully transmitted and is no longer being tracked by the
 provider.
-.RS
-.RE
 .TP
-.B \f[I]FI_DELIVERY_COMPLETE\f[]
+.B \f[I]FI_DELIVERY_COMPLETE\f[R]
 Applies to fi_writemsg.
 Indicates that a completion should be generated when the operation has
 been processed by the destination.
-.RS
-.RE
 .TP
-.B \f[I]FI_COMMIT_COMPLETE\f[]
+.B \f[I]FI_COMMIT_COMPLETE\f[R]
 Applies to fi_writemsg when targeting persistent memory regions.
 Indicates that a completion should be generated only after the result of
 the operation has been made durable.
-.RS
-.RE
 .TP
-.B \f[I]FI_FENCE\f[]
+.B \f[I]FI_FENCE\f[R]
 Applies to transmits.
 Indicates that the requested operation, also known as the fenced
 operation, and any operation posted after the fenced operation will be
@@ -322,26 +267,23 @@ deferred until all previous operations targeting the same peer endpoint
 have completed.
 Operations posted after the fencing will see and/or replace the results
 of any operations initiated prior to the fenced operation.
-.RS
-.RE
 .PP
 The ordering of operations starting at the posting of the fenced
 operation (inclusive) to the posting of a subsequent fenced operation
-(exclusive) is controlled by the endpoint\[aq]s ordering semantics.
+(exclusive) is controlled by the endpoint\[cq]s ordering semantics.
 .SH RETURN VALUE
 .PP
 Returns 0 on success.
 On error, a negative value corresponding to fabric errno is returned.
-Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[].
+Fabric errno values are defined in \f[C]rdma/fi_errno.h\f[R].
 .SH ERRORS
 .TP
-.B \f[I]\-FI_EAGAIN\f[]
-See \f[C]fi_msg\f[](3) for a detailed description of handling FI_EAGAIN.
-.RS
-.RE
+.B \f[I]\-FI_EAGAIN\f[R]
+See \f[C]fi_msg\f[R](3) for a detailed description of handling
+FI_EAGAIN.
 .SH SEE ALSO
 .PP
-\f[C]fi_getinfo\f[](3), \f[C]fi_endpoint\f[](3), \f[C]fi_domain\f[](3),
-\f[C]fi_cq\f[](3)
+\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3),
+\f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_tagged.3 b/deps/libfabric/man/man3/fi_tagged.3
index 49743a84fb12e068310a460388dfc02ed0989ea7..e7f0f063a4f659a4f088ccb01257a8825fe739df 100644
--- a/deps/libfabric/man/man3/fi_tagged.3
+++ b/deps/libfabric/man/man3/fi_tagged.3
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_tagged" "3" "2020\-04\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_tagged" "3" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -8,129 +8,97 @@ fi_tagged \- Tagged data transfer operations
 .TP
 .B fi_trecv / fi_trecvv / fi_trecvmsg
 Post a buffer to receive an incoming message
-.RS
-.RE
 .TP
 .B fi_tsend / fi_tsendv / fi_tsendmsg / fi_tinject / fi_tsenddata
 Initiate an operation to send a message
-.RS
-.RE
 .SH SYNOPSIS
 .IP
 .nf
 \f[C]
-#include\ <rdma/fi_tagged.h>
+#include <rdma/fi_tagged.h>
 
-ssize_t\ fi_trecv(struct\ fid_ep\ *ep,\ void\ *buf,\ size_t\ len,\ void\ *desc,
-\ \ \ \ fi_addr_t\ src_addr,\ uint64_t\ tag,\ uint64_t\ ignore,\ void\ *context);
+ssize_t fi_trecv(struct fid_ep *ep, void *buf, size_t len, void *desc,
+    fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context);
 
-ssize_t\ fi_trecvv(struct\ fid_ep\ *ep,\ const\ struct\ iovec\ *iov,\ void\ **desc,
-\ \ \ \ size_t\ count,\ fi_addr_t\ src_addr,\ uint64_t\ tag,\ uint64_t\ ignore,
-\ \ \ \ void\ *context);
+ssize_t fi_trecvv(struct fid_ep *ep, const struct iovec *iov, void **desc,
+    size_t count, fi_addr_t src_addr, uint64_t tag, uint64_t ignore,
+    void *context);
 
-ssize_t\ fi_trecvmsg(struct\ fid_ep\ *ep,\ const\ struct\ fi_msg_tagged\ *msg,
-\ \ \ \ uint64_t\ flags);
+ssize_t fi_trecvmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg,
+    uint64_t flags);
 
-ssize_t\ fi_tsend(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ len,
-\ \ \ \ void\ *desc,\ fi_addr_t\ dest_addr,\ uint64_t\ tag,\ void\ *context);
+ssize_t fi_tsend(struct fid_ep *ep, const void *buf, size_t len,
+    void *desc, fi_addr_t dest_addr, uint64_t tag, void *context);
 
-ssize_t\ fi_tsendv(struct\ fid_ep\ *ep,\ const\ struct\ iovec\ *iov,
-\ \ \ \ void\ **desc,\ size_t\ count,\ fi_addr_t\ dest_addr,\ uint64_t\ tag,
-\ \ \ \ void\ *context);
+ssize_t fi_tsendv(struct fid_ep *ep, const struct iovec *iov,
+    void **desc, size_t count, fi_addr_t dest_addr, uint64_t tag,
+    void *context);
 
-ssize_t\ fi_tsendmsg(struct\ fid_ep\ *ep,\ const\ struct\ fi_msg_tagged\ *msg,
-\ \ \ \ uint64_t\ flags);
+ssize_t fi_tsendmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg,
+    uint64_t flags);
 
-ssize_t\ fi_tinject(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ len,
-\ \ \ \ fi_addr_t\ dest_addr,\ uint64_t\ tag);
+ssize_t fi_tinject(struct fid_ep *ep, const void *buf, size_t len,
+    fi_addr_t dest_addr, uint64_t tag);
 
-ssize_t\ fi_tsenddata(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ len,
-\ \ \ \ void\ *desc,\ uint64_t\ data,\ fi_addr_t\ dest_addr,\ uint64_t\ tag,
-\ \ \ \ void\ *context);
+ssize_t fi_tsenddata(struct fid_ep *ep, const void *buf, size_t len,
+    void *desc, uint64_t data, fi_addr_t dest_addr, uint64_t tag,
+    void *context);
 
-ssize_t\ fi_tinjectdata(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ len,
-\ \ \ \ uint64_t\ data,\ fi_addr_t\ dest_addr,\ uint64_t\ tag);
-\f[]
+ssize_t fi_tinjectdata(struct fid_ep *ep, const void *buf, size_t len,
+    uint64_t data, fi_addr_t dest_addr, uint64_t tag);
+\f[R]
 .fi
 .SH ARGUMENTS
 .TP
-.B \f[I]fid\f[]
+.B \f[I]fid\f[R]
 Fabric endpoint on which to initiate tagged communication operation.
-.RS
-.RE
 .TP
-.B \f[I]buf\f[]
+.B \f[I]buf\f[R]
 Data buffer to send or receive.
-.RS
-.RE
 .TP
-.B \f[I]len\f[]
+.B \f[I]len\f[R]
 Length of data buffer to send or receive.
-.RS
-.RE
 .TP
-.B \f[I]iov\f[]
+.B \f[I]iov\f[R]
 Vectored data buffer.
-.RS
-.RE
 .TP
-.B \f[I]count\f[]
+.B \f[I]count\f[R]
 Count of vectored data entries.
-.RS
-.RE
 .TP
-.B \f[I]tag\f[]
+.B \f[I]tag\f[R]
 Tag associated with the message.
-.RS
-.RE
 .TP
-.B \f[I]ignore\f[]
+.B \f[I]ignore\f[R]
 Mask of bits to ignore applied to the tag for receive operations.
-.RS
-.RE
 .TP
-.B \f[I]desc\f[]
+.B \f[I]desc\f[R]
 Memory descriptor associated with the data buffer.
-See \f[C]fi_mr\f[](3).
-.RS
-.RE
+See \f[C]fi_mr\f[R](3).
 .TP
-.B \f[I]data\f[]
+.B \f[I]data\f[R]
 Remote CQ data to transfer with the sent data.
-.RS
-.RE
 .TP
-.B \f[I]dest_addr\f[]
+.B \f[I]dest_addr\f[R]
 Destination address for connectionless transfers.
 Ignored for connected endpoints.
-.RS
-.RE
 .TP
-.B \f[I]src_addr\f[]
+.B \f[I]src_addr\f[R]
 Source address to receive from for connectionless transfers.
 Applies only to connectionless endpoints with the FI_DIRECTED_RECV
 capability enabled, otherwise this field is ignored.
 If set to FI_ADDR_UNSPEC, any source address may match.
-.RS
-.RE
 .TP
-.B \f[I]msg\f[]
+.B \f[I]msg\f[R]
 Message descriptor for send and receive operations.
-.RS
-.RE
 .TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
 Additional flags to apply for the send or receive operation.
-.RS
-.RE
 .TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
 User specified pointer to associate with the operation.
 This parameter is ignored if the operation will not generate a
 successful completion, unless an op flag specifies the context parameter
 be used for required input.
-.RS
-.RE
 .SH DESCRIPTION
 .PP
 Tagged messages are data transfers which carry a key or tag with the
@@ -143,22 +111,22 @@ This can be stated as:
 .IP
 .nf
 \f[C]
-send_tag\ &\ ~ignore\ ==\ recv_tag\ &\ ~ignore
-\f[]
+send_tag & \[ti]ignore == recv_tag & \[ti]ignore
+\f[R]
 .fi
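+.PP
+For example, a receive posted with the values below matches any send
+whose upper 32 tag bits equal 0x5, regardless of the lower bits (a
+sketch; the endpoint and buffers are assumed set up):
+.IP
+.nf
+\f[C]
+uint64_t tag    = 0x5ULL << 32;
+uint64_t ignore = 0xffffffffULL;  /* low 32 bits are wildcards */
+
+fi_trecv(ep, buf, len, desc, FI_ADDR_UNSPEC, tag, ignore, &ctx);
+\f[R]
+.fi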
 .PP
 In general, message tags are checked against receive buffers in the
 order in which messages have been posted to the endpoint.
 See the ordering discussion below for more details.
 .PP
-The send functions \-\- fi_tsend, fi_tsendv, fi_tsendmsg, fi_tinject,
-and fi_tsenddata \-\- are used to transmit a tagged message from one
+The send functions \[en] fi_tsend, fi_tsendv, fi_tsendmsg, fi_tinject,
+and fi_tsenddata \[en] are used to transmit a tagged message from one
 endpoint to another endpoint.
 The main difference between send functions are the number and type of
 parameters that they accept as input.
 Otherwise, they perform the same general function.
 .PP
-The receive functions \-\- fi_trecv, fi_trecvv, fi_recvmsg \-\- post a
+The receive functions \[en] fi_trecv, fi_trecvv, fi_trecvmsg \[en] post a
 data buffer to an endpoint to receive inbound tagged messages.
 Similar to the send operations, receive operations operate
 asynchronously.
@@ -197,29 +165,29 @@ parameter to a remote endpoint as a single message.
 .SS fi_tsendmsg
 .PP
 The fi_tsendmsg call supports data transfers over both connected and
-unconnected endpoints, with the ability to control the send operation
+connectionless endpoints, with the ability to control the send operation
 per call through the use of flags.
 The fi_tsendmsg function takes a struct fi_msg_tagged as input.
 .IP
 .nf
 \f[C]
-struct\ fi_msg_tagged\ {
-\ \ \ \ const\ struct\ iovec\ *msg_iov;\ /*\ scatter\-gather\ array\ */
-\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ *desc;\ \ \ \ /*\ data\ descriptor\ */
-\ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ iov_count;/*\ #\ elements\ in\ msg_iov\ */
-\ \ \ \ fi_addr_t\ \ \ \ \ \ \ \ \ \ addr;\ \ \ \ /*\ optional\ endpoint\ address\ */
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ tag;\ \ \ \ \ \ /*\ tag\ associated\ with\ message\ */
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ ignore;\ \ \ /*\ mask\ applied\ to\ tag\ for\ receives\ */
-\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ *context;\ /*\ user\-defined\ context\ */
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ data;\ \ \ \ \ /*\ optional\ immediate\ data\ */
+struct fi_msg_tagged {
+    const struct iovec *msg_iov; /* scatter\-gather array */
+    void               *desc;    /* data descriptor */
+    size_t             iov_count;/* # elements in msg_iov */
+    fi_addr_t          addr;    /* optional endpoint address */
+    uint64_t           tag;      /* tag associated with message */
+    uint64_t           ignore;   /* mask applied to tag for receives */
+    void               *context; /* user\-defined context */
+    uint64_t           data;     /* optional immediate data */
 };
-\f[]
+\f[R]
 .fi
 .SS fi_tinject
 .PP
 The tagged inject call is an optimized version of fi_tsend.
 It provides similar completion semantics as fi_inject
-\f[C]fi_msg\f[](3).
+\f[C]fi_msg\f[R](3).
 .SS fi_tsenddata
 .PP
 The tagged send data call is similar to fi_tsend, but allows for the
@@ -247,8 +215,8 @@ parameter to a receive incoming data.
 .SS fi_trecvmsg
 .PP
 The fi_trecvmsg call supports posting buffers over both connected and
-unconnected endpoints, with the ability to control the receive operation
-per call through the use of flags.
+connectionless endpoints, with the ability to control the receive
+operation per call through the use of flags.
 The fi_trecvmsg function takes a struct fi_msg_tagged as input.
 .SH FLAGS
 .PP
@@ -260,32 +228,26 @@ fi_endpoint).
 The following list of flags are usable with fi_trecvmsg and/or
 fi_tsendmsg.
 .TP
-.B \f[I]FI_REMOTE_CQ_DATA\f[]
+.B \f[I]FI_REMOTE_CQ_DATA\f[R]
 Applies to fi_tsendmsg and fi_tsenddata.
 Indicates that remote CQ data is available and should be sent as part of
 the request.
 See fi_getinfo for additional details on FI_REMOTE_CQ_DATA.
-.RS
-.RE
 .TP
-.B \f[I]FI_COMPLETION\f[]
+.B \f[I]FI_COMPLETION\f[R]
 Indicates that a completion entry should be generated for the specified
 operation.
 The endpoint must be bound to a completion queue with
 FI_SELECTIVE_COMPLETION that corresponds to the specified operation, or
 this flag is ignored.
-.RS
-.RE
 .TP
-.B \f[I]FI_MORE\f[]
+.B \f[I]FI_MORE\f[R]
 Indicates that the user has additional requests that will immediately be
 posted after the current call returns.
 Use of this flag may improve performance by enabling the provider to
 optimize its access to the fabric hardware.
-.RS
-.RE
 .TP
-.B \f[I]FI_INJECT\f[]
+.B \f[I]FI_INJECT\f[R]
 Applies to fi_tsendmsg.
 Indicates that the outbound data buffer should be returned to user
 immediately after the send call returns, even if the operation is
@@ -293,33 +255,25 @@ handled asynchronously.
 This may require that the underlying provider implementation copy the
 data into a local buffer and transfer out of that buffer.
 This flag can only be used with messages smaller than inject_size.
-.RS
-.RE
 .TP
-.B \f[I]FI_INJECT_COMPLETE\f[]
+.B \f[I]FI_INJECT_COMPLETE\f[R]
 Applies to fi_tsendmsg.
 Indicates that a completion should be generated when the source
 buffer(s) may be reused.
-.RS
-.RE
 .TP
-.B \f[I]FI_TRANSMIT_COMPLETE\f[]
+.B \f[I]FI_TRANSMIT_COMPLETE\f[R]
 Applies to fi_tsendmsg.
 Indicates that a completion should not be generated until the operation
 has been successfully transmitted and is no longer being tracked by the
 provider.
-.RS
-.RE
 .TP
-.B \f[I]FI_MATCH_COMPLETE\f[]
+.B \f[I]FI_MATCH_COMPLETE\f[R]
 Applies to fi_tsendmsg.
 Indicates that a completion should be generated only after the message
 has either been matched with a tagged buffer or was discarded by the
 target application.
-.RS
-.RE
 .TP
-.B \f[I]FI_FENCE\f[]
+.B \f[I]FI_FENCE\f[R]
 Applies to transmits.
 Indicates that the requested operation, also known as the fenced
 operation, and any operation posted after the fenced operation will be
@@ -327,16 +281,14 @@ deferred until all previous operations targeting the same peer endpoint
 have completed.
 Operations posted after the fencing will see and/or replace the results
 of any operations initiated prior to the fenced operation.
-.RS
-.RE
 .PP
 The ordering of operations starting at the posting of the fenced
 operation (inclusive) to the posting of a subsequent fenced operation
-(exclusive) is controlled by the endpoint\[aq]s ordering semantics.
+(exclusive) is controlled by the endpoint\[cq]s ordering semantics.
 .PP
 The following flags may be used with fi_trecvmsg.
 .TP
-.B \f[I]FI_PEEK\f[]
+.B \f[I]FI_PEEK\f[R]
 The peek flag may be used to see if a specified message has arrived.
 A peek request is often useful on endpoints that have provider allocated
 buffering enabled (see fi_rx_attr total_buffered_recv).
@@ -349,16 +301,14 @@ endpoint.
 If no message is found matching the tags specified in the peek request,
 then a completion queue error entry with err field set to FI_ENOMSG will
 be available.
-.RS
-.RE
 .PP
 If a peek request locates a matching message, the operation will
 complete successfully.
 The returned completion data will indicate the meta\-data associated
 with the message, such as the message length, completion flags,
 available CQ data, tag, and source address.
-The data available is subject to the completion entry format (e.g.
-struct fi_cq_tagged_entry).
+The data available is subject to the completion entry format
+(e.g.\ struct fi_cq_tagged_entry).
 .PP
 An application may supply a buffer if it desires to receive data as a
 part of the peek operation.
@@ -369,23 +319,21 @@ if peek operations desire to obtain a copy of the data.
 The returned data is limited to the size of the input buffer(s) or the
 message size, if smaller.
 A provider indicates if data is available by setting the buf field of
-the CQ entry to the user\[aq]s first input buffer.
+the CQ entry to the user\[cq]s first input buffer.
 If buf is NULL, no data was available to return.
 A provider may return NULL even if the peek operation completes
 successfully.
 Note that the CQ entry len field will reference the size of the message,
 not necessarily the size of the returned data.
 .TP
-.B \f[I]FI_CLAIM\f[]
+.B \f[I]FI_CLAIM\f[R]
 If this flag is used in conjunction with FI_PEEK, it indicates if the
-peek request completes successfully \-\- indicating that a matching
-message was located \-\- the message is claimed by caller.
+peek request completes successfully \[en] indicating that a matching
+message was located \[en] the message is claimed by caller.
 Claimed messages can only be retrieved using a subsequent, paired
 receive operation with the FI_CLAIM flag set.
 A receive operation with the FI_CLAIM flag set, but FI_PEEK not set is
 used to retrieve a previously claimed message.
-.RS
-.RE
 .PP
 In order to use the FI_CLAIM flag, an application must supply a struct
 fi_context structure as the context for the receive operation, or a
@@ -399,17 +347,15 @@ When set, it is used to retrieve a tagged message that was buffered by
 the provider.
 See Buffered Tagged Receives section for details.
 .TP
-.B \f[I]FI_DISCARD\f[]
+.B \f[I]FI_DISCARD\f[R]
 This flag may be used in conjunction with either FI_PEEK or FI_CLAIM.
 If this flag is used in conjunction with FI_PEEK, it indicates if the
-peek request completes successfully \-\- indicating that a matching
-message was located \-\- the message is discarded by the provider, as
+peek request completes successfully \[en] indicating that a matching
+message was located \[en] the message is discarded by the provider, as
 the data is not needed by the application.
 This flag may also be used in conjunction with FI_CLAIM in order to
 discard a message previously claimed using an FI_PEEK + FI_CLAIM
 request.
-.RS
-.RE
 .PP
 This flag also applies to endpoints configured for FI_BUFFERED_RECV or
 FI_VARIABLE_MSG.
@@ -421,7 +367,7 @@ If this flag is set, the input buffer(s) and length parameters are
 ignored.
 .SH Buffered Tagged Receives
 .PP
-See \f[C]fi_msg\f[](3) for an introduction to buffered receives.
+See \f[C]fi_msg\f[R](3) for an introduction to buffered receives.
 The handling of buffered receives differs between fi_msg operations and
 fi_tagged.
 Although the provider is responsible for allocating and managing network
@@ -433,8 +379,8 @@ specified tags.
 When FI_BUFFERED_RECV is enabled, the application posts the tags that
 will be used for matching purposes.
 Tags are posted using fi_trecv, fi_trecvv, and fi_trecvmsg; however,
-parameters related to the input buffers are ignored (e.g.
-buf, len, iov, desc).
+parameters related to the input buffers are ignored (e.g.\ buf, len,
+iov, desc).
 When a provider receives a message for which there is a matching tag, it
 will write an entry to the completion queue associated with the
 receiving endpoint.
@@ -445,51 +391,46 @@ The op_context field will point to a struct fi_recv_context.
 .IP
 .nf
 \f[C]
-struct\ fi_recv_context\ {
-\ \ \ \ struct\ fid_ep\ *ep;
-\ \ \ \ void\ *context;
+struct fi_recv_context {
+    struct fid_ep *ep;
+    void *context;
 };
-\f[]
+\f[R]
 .fi
 .PP
-The \[aq]ep\[aq] field will be NULL.
-The \[aq]context\[aq] field will match the application context specified
-when posting the tag.
-Other fields are set as defined in \f[C]fi_msg\f[](3).
+The `ep' field will be NULL.
+The `context' field will match the application context specified when
+posting the tag.
+Other fields are set as defined in \f[C]fi_msg\f[R](3).
 .PP
 After being notified that a buffered receive has arrived, applications
 must either claim or discard the message as described in
-\f[C]fi_msg\f[](3).
+\f[C]fi_msg\f[R](3).
 .SH Variable Length Tagged Messages
 .PP
-Variable length messages are defined in \f[C]fi_msg\f[](3).
+Variable length messages are defined in \f[C]fi_msg\f[R](3).
 The requirements for handling variable length tagged messages is
 identical to those defined above for buffered tagged receives.
 .SH RETURN VALUE
 .PP
 The tagged send and receive calls return 0 on success.
-On error, a negative value corresponding to fabric \f[I]errno \f[] is
+On error, a negative value corresponding to fabric \f[I]errno \f[R] is
 returned.
-Fabric errno values are defined in \f[C]fi_errno.h\f[].
+Fabric errno values are defined in \f[C]fi_errno.h\f[R].
 .SH ERRORS
 .TP
-.B \f[I]\-FI_EAGAIN\f[]
-See \f[C]fi_msg\f[](3) for a detailed description of handling FI_EAGAIN.
-.RS
-.RE
+.B \f[I]\-FI_EAGAIN\f[R]
+See \f[C]fi_msg\f[R](3) for a detailed description of handling
+FI_EAGAIN.
 .TP
-.B \f[I]\-FI_EINVAL\f[]
+.B \f[I]\-FI_EINVAL\f[R]
 Indicates that an invalid argument was supplied by the user.
-.RS
-.RE
 .TP
-.B \f[I]\-FI_EOTHER\f[]
+.B \f[I]\-FI_EOTHER\f[R]
 Indicates that an unspecified error occurred.
-.RS
-.RE
 .SH SEE ALSO
 .PP
-\f[C]fi_getinfo\f[](3), \f[C]fi_endpoint\f[](3), \f[C]fi_domain\f[](3),
-\f[C]fi_cq\f[](3)
+\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3),
+\f[C]fi_domain\f[R](3), \f[C]fi_cq\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_trigger.3 b/deps/libfabric/man/man3/fi_trigger.3
index 590b077c0b8a800c08786194a29fae452bc91bd7..ee22d7c600d960b5f1fa66ecd0ba7a60a97dfea4 100644
--- a/deps/libfabric/man/man3/fi_trigger.3
+++ b/deps/libfabric/man/man3/fi_trigger.3
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_trigger" "3" "2019\-09\-17" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_trigger" "3" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -9,8 +9,8 @@ fi_trigger \- Triggered operations
 .IP
 .nf
 \f[C]
-#include\ <rdma/fi_trigger.h>
-\f[]
+#include <rdma/fi_trigger.h>
+\f[R]
 .fi
 .SH DESCRIPTION
 .PP
@@ -50,22 +50,22 @@ The format of struct fi_triggered_context[2] is described below.
 .IP
 .nf
 \f[C]
-struct\ fi_triggered_context\ {
-\ \ \ \ enum\ fi_trigger_event\ \ \ \ \ \ \ \ \ event_type;\ \ \ /*\ trigger\ type\ */
-\ \ \ \ union\ {
-\ \ \ \ \ \ \ \ struct\ fi_trigger_threshold\ threshold;
-\ \ \ \ \ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ *internal[3];\ /*\ reserved\ */
-\ \ \ \ }\ trigger;
+struct fi_triggered_context {
+    enum fi_trigger_event         event_type;   /* trigger type */
+    union {
+        struct fi_trigger_threshold threshold;
+        void                        *internal[3]; /* reserved */
+    } trigger;
 };
 
-struct\ fi_triggered_context2\ {
-\ \ \ \ enum\ fi_trigger_event\ \ \ \ \ \ \ \ \ event_type;\ \ \ /*\ trigger\ type\ */
-\ \ \ \ union\ {
-\ \ \ \ \ \ \ \ struct\ fi_trigger_threshold\ threshold;
-\ \ \ \ \ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ *internal[7];\ /*\ reserved\ */
-\ \ \ \ }\ trigger;
+struct fi_triggered_context2 {
+    enum fi_trigger_event         event_type;   /* trigger type */
+    union {
+        struct fi_trigger_threshold threshold;
+        void                        *internal[7]; /* reserved */
+    } trigger;
 };
-\f[]
+\f[R]
 .fi
 .PP
 The triggered context indicates the type of event assigned to the
@@ -75,20 +75,18 @@ event type.
 .PP
 The following trigger events are defined.
 .TP
-.B \f[I]FI_TRIGGER_THRESHOLD\f[]
+.B \f[I]FI_TRIGGER_THRESHOLD\f[R]
 This indicates that the data transfer operation will be deferred until
 an event counter crosses an application specified threshold value.
 The threshold is specified using struct fi_trigger_threshold:
-.RS
-.RE
 .IP
 .nf
 \f[C]
-struct\ fi_trigger_threshold\ {
-\ \ \ \ struct\ fid_cntr\ *cntr;\ /*\ event\ counter\ to\ check\ */
-\ \ \ \ size_t\ threshold;\ \ \ \ \ \ /*\ threshold\ value\ */
+struct fi_trigger_threshold {
+    struct fid_cntr *cntr; /* event counter to check */
+    size_t threshold;      /* threshold value */
 };
-\f[]
+\f[R]
 .fi
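+.PP
+A minimal sketch of a threshold\-triggered send, assuming the endpoint
+was opened with the FI_TRIGGER capability; the triggered context is
+passed as the operation\[cq]s context parameter:
+.IP
+.nf
+\f[C]
+struct fi_triggered_context tc = {
+    .event_type = FI_TRIGGER_THRESHOLD,
+    .trigger = {
+        .threshold = { .cntr = cntr, .threshold = 10 },
+    },
+};
+
+/* deferred until cntr crosses 10 */
+fi_send(ep, buf, len, desc, dest_addr, &tc);
+\f[R]
+.fi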
 .PP
 Threshold operations are triggered in the order of the threshold values.
@@ -121,26 +119,26 @@ The format of the deferred work request is as follows:
 .IP
 .nf
 \f[C]
-struct\ fi_deferred_work\ {
-\ \ \ \ struct\ fi_context2\ \ \ \ context;
+struct fi_deferred_work {
+    struct fi_context2    context;
 
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ threshold;
-\ \ \ \ struct\ fid_cntr\ \ \ \ \ \ \ *triggering_cntr;
-\ \ \ \ struct\ fid_cntr\ \ \ \ \ \ \ *completion_cntr;
+    uint64_t              threshold;
+    struct fid_cntr       *triggering_cntr;
+    struct fid_cntr       *completion_cntr;
 
-\ \ \ \ enum\ fi_trigger_op\ \ \ \ op_type;
+    enum fi_trigger_op    op_type;
 
-\ \ \ \ union\ {
-\ \ \ \ \ \ \ \ struct\ fi_op_msg\ \ \ \ \ \ \ \ \ \ \ \ *msg;
-\ \ \ \ \ \ \ \ struct\ fi_op_tagged\ \ \ \ \ \ \ \ \ *tagged;
-\ \ \ \ \ \ \ \ struct\ fi_op_rma\ \ \ \ \ \ \ \ \ \ \ \ *rma;
-\ \ \ \ \ \ \ \ struct\ fi_op_atomic\ \ \ \ \ \ \ \ \ *atomic;
-\ \ \ \ \ \ \ \ struct\ fi_op_fetch_atomic\ \ \ *fetch_atomic;
-\ \ \ \ \ \ \ \ struct\ fi_op_compare_atomic\ *compare_atomic;
-\ \ \ \ \ \ \ \ struct\ fi_op_cntr\ \ \ \ \ \ \ \ \ \ \ *cntr;
-\ \ \ \ }\ op;
+    union {
+        struct fi_op_msg            *msg;
+        struct fi_op_tagged         *tagged;
+        struct fi_op_rma            *rma;
+        struct fi_op_atomic         *atomic;
+        struct fi_op_fetch_atomic   *fetch_atomic;
+        struct fi_op_compare_atomic *compare_atomic;
+        struct fi_op_cntr           *cntr;
+    } op;
 };
-\f[]
+\f[R]
 .fi
 .PP
 Once a work request has been posted to the deferred work queue, it will
@@ -158,7 +156,7 @@ The completion_cntr field must be NULL for counter operations.
 Because deferred work targets support of collective communication
 operations, posted work requests do not generate any completions at the
 endpoint by default.
-For example, completed operations are not written to the EP\[aq]s
+For example, completed operations are not written to the EP\[cq]s
 completion queue and do not update the EP counter (unless the EP counter is
 explicitly referenced as the completion_cntr).
 An application may request EP completions by specifying the
@@ -166,13 +164,13 @@ FI_COMPLETION flag as part of the operation.
 .PP
 It is the responsibility of the application to detect and handle
 situations that occur which could result in a deferred work
-request\[aq]s condition not being met.
+request\[cq]s condition not being met.
 For example, if a work request is dependent upon the successful
 completion of a data transfer operation, which fails, then the
 application must cancel the work request.
 .PP
 To submit a deferred work request, applications should use the
-domain\[aq]s fi_control function with command FI_QUEUE_WORK and struct
+domain\[cq]s fi_control function with command FI_QUEUE_WORK and struct
 fi_deferred_work as the fi_control arg parameter.
 To cancel a deferred work request, use fi_control with command
 FI_CANCEL_WORK and the corresponding struct fi_deferred_work to cancel.
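As an illustration (not taken from the page), posting and cancelling a deferred send might look like the following sketch; `domain`, `trig_cntr`, and a prepared `struct fi_op_msg *msg_op` are assumed to exist.

```
/* Hypothetical sketch: queue a deferred send via the domain's
 * fi_control, using the fi_deferred_work layout shown above. */
#include <rdma/fi_domain.h>
#include <rdma/fi_trigger.h>

static int queue_deferred_send(struct fid_domain *domain,
                               struct fid_cntr *trig_cntr,
                               struct fi_op_msg *msg_op)
{
    /* The request must remain valid until it completes or is cancelled,
     * hence static storage here. */
    static struct fi_deferred_work work;
    work.threshold       = 10;        /* fire once trig_cntr reaches 10 */
    work.triggering_cntr = trig_cntr;
    work.completion_cntr = NULL;      /* must stay NULL for counter ops */
    work.op_type         = FI_OP_SEND;
    work.op.msg          = msg_op;

    /* fi_control with FI_CANCEL_WORK and the same struct would cancel
     * the request again, e.g. after a failed dependency. */
    return fi_control(&domain->fid, FI_QUEUE_WORK, &work);
}
```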
@@ -191,7 +189,7 @@ If a specific request is not supported by the provider, it will fail the
 operation with \-FI_ENOSYS.
 .SH SEE ALSO
 .PP
-\f[C]fi_getinfo\f[](3), \f[C]fi_endpoint\f[](3), \f[C]fi_alias\f[](3),
-\f[C]fi_cntr\f[](3)
+\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3),
+\f[C]fi_alias\f[R](3), \f[C]fi_cntr\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man3/fi_version.3 b/deps/libfabric/man/man3/fi_version.3
index 27e5080975a082495f5c1ce16496eae21b149c55..f36afdc10529f1f9554d91473d17adbe5299794c 100644
--- a/deps/libfabric/man/man3/fi_version.3
+++ b/deps/libfabric/man/man3/fi_version.3
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_version" "3" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_version" "3" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -9,14 +9,14 @@ fi_version \- Version of the library interfaces
 .IP
 .nf
 \f[C]
-#include\ <rdma/fabric.h>
+#include <rdma/fabric.h>
 
-uint32_t\ fi_version(void);
+uint32_t fi_version(void);
 
 FI_MAJOR(version)
 
 FI_MINOR(version)
-\f[]
+\f[R]
 .fi
 .SH DESCRIPTION
 .PP
@@ -34,6 +34,6 @@ The upper 16\-bits of the version correspond to the major number, and
 the lower 16\-bits correspond with the minor number.
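A minimal usage sketch of these macros:

```
/* Sketch: print the interface version of the linked libfabric. */
#include <stdio.h>
#include <rdma/fabric.h>

int main(void)
{
    uint32_t ver = fi_version();
    printf("libfabric interface version %u.%u\n",
           FI_MAJOR(ver), FI_MINOR(ver));
    return 0;
}
```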
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7), \f[C]fi_getinfo\f[](3)
+\f[C]fabric\f[R](7), \f[C]fi_getinfo\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fabric.7 b/deps/libfabric/man/man7/fabric.7
index c4b812bfc72a3ff051607ea7cd0e0a4f89b6148a..32c996cec50b7f075b30485b5e7838ed626651cb 100644
--- a/deps/libfabric/man/man7/fabric.7
+++ b/deps/libfabric/man/man7/fabric.7
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fabric" "7" "2020\-07\-21" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fabric" "7" "2021\-09\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -9,15 +9,15 @@ fabric \- Fabric Interface Library
 .IP
 .nf
 \f[C]
-#include\ <rdma/fabric.h>
-\f[]
+#include <rdma/fabric.h>
+\f[R]
 .fi
 .PP
 Libfabric is a high\-performance fabric software library designed to
 provide low\-latency interfaces to fabric hardware.
 .SH OVERVIEW
 .PP
-Libfabric provides \[aq]process direct I/O\[aq] to application software
+Libfabric provides `process direct I/O' to application software
 communicating across fabric software and hardware.
 Process direct I/O, historically referred to as RDMA, allows an
 application to directly access network resources without operating
@@ -26,7 +26,7 @@ Data transfers can occur directly to and from application memory.
 .PP
 There are two components to the libfabric software:
 .TP
-.B \f[I]Fabric Providers\f[]
+.B \f[I]Fabric Providers\f[R]
 Conceptually, a fabric provider may be viewed as a local hardware NIC
 driver, though a provider is not limited by this definition.
 The first component of libfabric is a general purpose framework that is
@@ -35,19 +35,15 @@ All fabric hardware devices and their software drivers are required to
 support this framework.
 Devices and the drivers that plug into the libfabric framework are
 referred to as fabric providers, or simply providers.
-Provider details may be found in \f[C]fi_provider\f[](7).
-.RS
-.RE
+Provider details may be found in \f[C]fi_provider\f[R](7).
 .TP
-.B \f[I]Fabric Interfaces\f[]
+.B \f[I]Fabric Interfaces\f[R]
 The second component is a set of communication operations.
 Libfabric defines several sets of communication functions that providers
 can support.
 It is not required that providers implement all the interfaces that are
 defined; however, providers clearly indicate which interfaces they do
 support.
-.RS
-.RE
 .SH FABRIC INTERFACES
 .PP
 The fabric interfaces are designed such that they are cohesive and not
@@ -74,15 +70,13 @@ resources.
 This involves listing all the interfaces available, obtaining the
 capabilities of the interfaces and opening a provider.
 .TP
-.B \f[I]fi_getinfo \- Fabric Information\f[]
+.B \f[I]fi_getinfo \- Fabric Information\f[R]
 The fi_getinfo call is the base call used to discover and request fabric
 services offered by the system.
 Applications can use this call to indicate the type of communication
 that they desire.
 The results from fi_getinfo, fi_info, are used to reserve and configure
 fabric resources.
-.RS
-.RE
 .PP
 fi_getinfo returns a list of fi_info structures.
 Each structure references a single fabric provider, indicating the
@@ -91,7 +85,7 @@ resources.
 A fabric provider may include multiple fi_info structures in the
 returned list.
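As a sketch of this discovery flow (hints are omitted here for brevity; a real application would usually pass them to narrow the results):

```
/* Sketch: enumerate the fi_info list returned by fi_getinfo. */
#include <stdio.h>
#include <rdma/fabric.h>

int main(void)
{
    struct fi_info *info, *cur;
    int ret = fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0, NULL, &info);
    if (ret)
        return 1;
    /* Each entry references a single provider; one provider may
     * contribute several entries. */
    for (cur = info; cur; cur = cur->next)
        printf("provider %s on fabric %s\n",
               cur->fabric_attr->prov_name, cur->fabric_attr->name);
    fi_freeinfo(info);
    return 0;
}
```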
 .TP
-.B \f[I]fi_fabric \- Fabric Domain\f[]
+.B \f[I]fi_fabric \- Fabric Domain\f[R]
 A fabric domain represents a collection of hardware and software
 resources that access a single physical or virtual network.
 All network ports on a system that can communicate with each other
@@ -99,61 +93,47 @@ through the fabric belong to the same fabric domain.
 A fabric domain shares network addresses and can span multiple
 providers.
 libfabric supports systems connected to multiple fabrics.
-.RS
-.RE
 .TP
-.B \f[I]fi_domain \- Access Domains\f[]
+.B \f[I]fi_domain \- Access Domains\f[R]
 An access domain represents a single logical connection into a fabric.
 It may map to a single physical or virtual NIC or a port.
 An access domain defines the boundary across which fabric resources may
 be associated.
 Each access domain belongs to a single fabric domain.
-.RS
-.RE
 .TP
-.B \f[I]fi_endpoint \- Fabric Endpoint\f[]
+.B \f[I]fi_endpoint \- Fabric Endpoint\f[R]
 A fabric endpoint is a communication portal.
 An endpoint may be either active or passive.
 Passive endpoints are used to listen for connection requests.
 Active endpoints can perform data transfers.
 Endpoints are configured with specific communication capabilities and
 data transfer interfaces.
-.RS
-.RE
 .TP
-.B \f[I]fi_eq \- Event Queue\f[]
+.B \f[I]fi_eq \- Event Queue\f[R]
 Event queues are used to collect and report the completion of
 asynchronous operations and events.
 Event queues report events that are not directly associated with data
 transfer operations.
-.RS
-.RE
 .TP
-.B \f[I]fi_cq \- Completion Queue\f[]
+.B \f[I]fi_cq \- Completion Queue\f[R]
 Completion queues are high\-performance event queues used to report the
 completion of data transfer operations.
-.RS
-.RE
 .TP
-.B \f[I]fi_cntr \- Event Counters\f[]
+.B \f[I]fi_cntr \- Event Counters\f[R]
 Event counters are used to report the number of completed asynchronous
 operations.
 Event counters are considered light\-weight, in that a completion simply
 increments a counter, rather than placing an entry into an event queue.
-.RS
-.RE
 .TP
-.B \f[I]fi_mr \- Memory Region\f[]
+.B \f[I]fi_mr \- Memory Region\f[R]
 Memory regions describe application local memory buffers.
 In order for fabric resources to access application memory, the
 application must first grant permission to the fabric provider by
 constructing a memory region.
 Memory regions are required for specific types of data transfer
 operations, such as RMA transfers (see below).
-.RS
-.RE
 .TP
-.B \f[I]fi_av \- Address Vector\f[]
+.B \f[I]fi_av \- Address Vector\f[R]
 Address vectors are used to map higher level addresses, such as IP
 addresses, which may be more natural for an application to use, into
 fabric specific addresses.
@@ -161,8 +141,6 @@ The use of address vectors allows providers to reduce the amount of
 memory required to maintain large address look\-up tables, and eliminate
 expensive address resolution and look\-up methods during data transfer
 operations.
-.RS
-.RE
 .SH DATA TRANSFER INTERFACES
 .PP
 Fabric endpoints are associated with multiple data transfer interfaces.
@@ -171,150 +149,110 @@ communication, with an endpoint allowing the different interfaces to be
 used in conjunction.
 The following data transfer interfaces are defined by libfabric.
 .TP
-.B \f[I]fi_msg \- Message Queue\f[]
+.B \f[I]fi_msg \- Message Queue\f[R]
 Message queues expose a simple, message\-based FIFO queue interface to
 the application.
 Message data transfers allow applications to send and receive data with
 message boundaries being maintained.
-.RS
-.RE
 .TP
-.B \f[I]fi_tagged \- Tagged Message Queues\f[]
+.B \f[I]fi_tagged \- Tagged Message Queues\f[R]
 Tagged message lists expose send/receive data transfer operations built
 on the concept of tagged messaging.
 The tagged message queue is conceptually similar to standard message
 queues, but with the addition of 64\-bit tags for each message.
 Sent messages are matched with receive buffers that are tagged with a
 similar value.
-.RS
-.RE
 .TP
-.B \f[I]fi_rma \- Remote Memory Access\f[]
+.B \f[I]fi_rma \- Remote Memory Access\f[R]
 RMA transfers are one\-sided operations that read or write data directly
 to a remote memory region.
 Other than defining the appropriate memory region, RMA operations do not
 require interaction at the target side for the data transfer to
 complete.
-.RS
-.RE
 .TP
-.B \f[I]fi_atomic \- Atomic\f[]
+.B \f[I]fi_atomic \- Atomic\f[R]
 Atomic operations can perform one of several operations on a remote
 memory region.
 Atomic operations include well\-known functionality, such as atomic\-add
 and compare\-and\-swap, plus several other pre\-defined calls.
 Unlike other data transfer interfaces, atomic operations are aware of
 the data formatting at the target memory region.
-.RS
-.RE
 .SH LOGGING INTERFACE
 .PP
 Logging can be controlled using the FI_LOG_LEVEL, FI_LOG_PROV, and
 FI_LOG_SUBSYS environment variables.
 .TP
-.B \f[I]FI_LOG_LEVEL\f[]
+.B \f[I]FI_LOG_LEVEL\f[R]
 FI_LOG_LEVEL controls the amount of logging data that is output.
 The following log levels are defined.
-.RS
-.RE
 .TP
-.B \- \f[I]Warn\f[]
+.B \- \f[I]Warn\f[R]
 Warn is the least verbose setting and is intended for reporting errors
 or warnings.
-.RS
-.RE
 .TP
-.B \- \f[I]Trace\f[]
+.B \- \f[I]Trace\f[R]
 Trace is more verbose and is meant to include non\-detailed output
 helpful for tracing program execution.
-.RS
-.RE
 .TP
-.B \- \f[I]Info\f[]
+.B \- \f[I]Info\f[R]
 Info is high traffic and meant for detailed output.
-.RS
-.RE
 .TP
-.B \- \f[I]Debug\f[]
+.B \- \f[I]Debug\f[R]
 Debug is high traffic and is likely to impact application performance.
 Debug output is only available if the library has been compiled with
 debugging enabled.
-.RS
-.RE
 .TP
-.B \f[I]FI_LOG_PROV\f[]
+.B \f[I]FI_LOG_PROV\f[R]
 The FI_LOG_PROV environment variable enables or disables logging from
 specific providers.
 Providers can be enabled by listing them in a comma separated fashion.
-If the list begins with the \[aq]^\[aq] symbol, then the list will be
+If the list begins with the `\[ha]' symbol, then the list will be
 negated.
 By default all providers are enabled.
-.RS
-.RE
 .PP
 Example: To enable logging from the psm and sockets providers:
-FI_LOG_PROV="psm,sockets"
+FI_LOG_PROV=\[lq]psm,sockets\[rq]
 .PP
 Example: To enable logging from providers other than psm:
-FI_LOG_PROV="^psm"
+FI_LOG_PROV=\[lq]\[ha]psm\[rq]
 .TP
-.B \f[I]FI_LOG_SUBSYS\f[]
+.B \f[I]FI_LOG_SUBSYS\f[R]
 The FI_LOG_SUBSYS environment variable enables or disables logging at
 the subsystem level.
 The syntax for enabling or disabling subsystems is similar to that used
 for FI_LOG_PROV.
 The following subsystems are defined.
-.RS
-.RE
 .TP
-.B \- \f[I]core\f[]
+.B \- \f[I]core\f[R]
 Provides output related to the core framework and its management of
 providers.
-.RS
-.RE
 .TP
-.B \- \f[I]fabric\f[]
+.B \- \f[I]fabric\f[R]
 Provides output specific to interactions associated with the fabric
 object.
-.RS
-.RE
 .TP
-.B \- \f[I]domain\f[]
+.B \- \f[I]domain\f[R]
 Provides output specific to interactions associated with the domain
 object.
-.RS
-.RE
 .TP
-.B \- \f[I]ep_ctrl\f[]
+.B \- \f[I]ep_ctrl\f[R]
 Provides output specific to endpoint non\-data transfer operations, such
 as CM operations.
-.RS
-.RE
 .TP
-.B \- \f[I]ep_data\f[]
+.B \- \f[I]ep_data\f[R]
 Provides output specific to endpoint data transfer operations.
-.RS
-.RE
 .TP
-.B \- \f[I]av\f[]
+.B \- \f[I]av\f[R]
 Provides output specific to address vector operations.
-.RS
-.RE
 .TP
-.B \- \f[I]cq\f[]
+.B \- \f[I]cq\f[R]
 Provides output specific to completion queue operations.
-.RS
-.RE
 .TP
-.B \- \f[I]eq\f[]
+.B \- \f[I]eq\f[R]
 Provides output specific to event queue operations.
-.RS
-.RE
 .TP
-.B \- \f[I]mr\f[]
+.B \- \f[I]mr\f[R]
 Provides output specific to memory registration.
-.RS
-.RE
 .SH PROVIDER INSTALLATION AND SELECTION
 .PP
 The libfabric build scripts will install all providers that are
@@ -325,29 +263,29 @@ library initialization and respond appropriately to application queries.
 .PP
 Users can enable or disable available providers through build
 configuration options.
-See \[aq]configure \-\-help\[aq] for details.
+See `configure \[en]help' for details.
 In general, a specific provider can be controlled using the configure
-option \[aq]\-\-enable\-\[aq].
-For example, \[aq]\-\-enable\-udp\[aq] (or
-\[aq]\-\-enable\-udp=yes\[aq]) will add the udp provider to the build.
-To disable the provider, \[aq]\-\-enable\-udp=no\[aq] can be used.
+option `\[en]enable\-'.
+For example, `\[en]enable\-udp' (or `\[en]enable\-udp=yes') will add the
+udp provider to the build.
+To disable the provider, `\[en]enable\-udp=no' can be used.
 .PP
 Providers can also be enabled or disabled at run time using the
 FI_PROVIDER environment variable.
 The FI_PROVIDER variable is set to a comma separated list of providers
 to include.
-If the list begins with the \[aq]^\[aq] symbol, then the list will be
+If the list begins with the `\[ha]' symbol, then the list will be
 negated.
 .PP
 Example: To enable the udp and tcp providers only, set:
-FI_PROVIDER="udp,tcp"
+FI_PROVIDER=\[lq]udp,tcp\[rq]
 .PP
 The fi_info utility, which is included as part of the libfabric package,
 can be used to retrieve information about which providers are available
 in the system.
 Additionally, it can retrieve a list of all environment variables that
 may be used to configure libfabric and each provider.
-See \f[C]fi_info\f[](1) for more details.
+See \f[C]fi_info\f[R](1) for more details.
 .SH ENVIRONMENT VARIABLE CONTROLS
 .PP
 Core features of libfabric and its providers may be configured by an
@@ -361,8 +299,9 @@ obtain the full list of variables that may be set, along with a brief
 description of their use.
 .PP
 A full list of variables available may be obtained by running the
-fi_info application, with the \-e or \-\-env command line option.
+fi_info application, with the \-e or \[en]env command line option.
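Because these variables are read when the library initializes, they can be set in the launching shell or programmatically before the first libfabric call, as in this sketch (the values shown are examples only):

```
/* Sketch: configure libfabric via environment variables before use. */
#include <stdlib.h>
#include <rdma/fabric.h>

void configure_fabric_env(void)
{
    setenv("FI_LOG_LEVEL", "warn", 1);    /* least verbose log level */
    setenv("FI_PROVIDER", "udp,tcp", 1);  /* restrict provider list */
    /* fi_getinfo()/fi_fabric() calls made after this point will see
     * the settings above. */
}
```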
 .SH NOTES
+.SS System Calls
 .PP
 Because libfabric is designed to provide applications direct access to
 fabric hardware, there are limits on how libfabric resources may be used
@@ -373,7 +312,7 @@ Although limits are provider specific, the following restrictions apply
 to many providers and should be adhered to by applications desiring
 portability across providers.
 .TP
-.B \f[I]fork\f[]
+.B \f[I]fork\f[R]
 Fabric resources are not guaranteed to be available to child processes.
 This includes objects, such as endpoints and completion queues, as well
 as application controlled data buffers which have been assigned to the
@@ -381,8 +320,30 @@ network.
 For example, data buffers that have been registered with a fabric domain
 may not be available in a child process because of copy on write
 restrictions.
-.RS
-.RE
+.SS CUDA deadlock
+.PP
+In some cases, calls to \f[C]cudaMemcpy\f[R] within libfabric may result
+in a deadlock.
+This typically occurs when a CUDA kernel blocks until a
+\f[C]cudaMemcpy\f[R] on the host completes.
+To avoid this deadlock, \f[C]cudaMemcpy\f[R] may be disabled by setting
+\f[C]FI_HMEM_CUDA_ENABLE_XFER=0\f[R].
+If this environment variable is set and there is a call to
+\f[C]cudaMemcpy\f[R] within libfabric, a warning will be emitted and no
+copy will occur.
+Note that not all providers support this option.
+.PP
+Another mechanism which can be used to avoid deadlock is Nvidia\[cq]s
+gdrcopy.
+Using gdrcopy requires an external library and kernel module available
+at https://github.com/NVIDIA/gdrcopy.
+Libfabric must be configured with gdrcopy support using the
+\f[C]\-\-with\-gdrcopy\f[R] option, and be run with
+\f[C]FI_HMEM_CUDA_USE_GDRCOPY=1\f[R].
+This may be used in conjunction with the above option to provide a
+method for copying to/from CUDA device memory when \f[C]cudaMemcpy\f[R]
+cannot be used.
+Again, this may not be supported by all providers.
 .SH ABI CHANGES
 .PP
 libfabric releases maintain compatibility with older releases, so that
@@ -414,54 +375,55 @@ These changes included adding the fields to the following data
 structures.
 The 1.1 ABI was exported by libfabric versions 1.5 and 1.6.
 .TP
-.B \f[I]fi_fabric_attr\f[]
+.B \f[I]fi_fabric_attr\f[R]
 Added api_version
-.RS
-.RE
 .TP
-.B \f[I]fi_domain_attr\f[]
+.B \f[I]fi_domain_attr\f[R]
 Added cntr_cnt, mr_iov_limit, caps, mode, auth_key, auth_key_size,
 max_err_data, and mr_cnt fields.
 The mr_mode field was also changed from an enum to an integer flag
 field.
-.RS
-.RE
 .TP
-.B \f[I]fi_ep_attr\f[]
+.B \f[I]fi_ep_attr\f[R]
 Added auth_key_size and auth_key fields.
-.RS
-.RE
 .SS ABI 1.2
 .PP
 The 1.2 ABI version was exported by libfabric versions 1.7 and 1.8, and
 expanded the following structure.
 .TP
-.B \f[I]fi_info\f[]
+.B \f[I]fi_info\f[R]
 The fi_info structure was expanded to reference a new fabric object,
 fid_nic.
 When available, the fid_nic references a new set of attributes related
 to network hardware details.
-.RS
-.RE
 .SS ABI 1.3
 .PP
-The 1.3 ABI is also the current ABI version.
-All libfabric releases starting at 1.9 export this ABI.
+The 1.3 ABI version was exported by libfabric versions 1.9, 1.10, and
+1.11.
+Added new fields to the following attributes:
 .TP
-.B \f[I]fi_domain_attr\f[]
+.B \f[I]fi_domain_attr\f[R]
 Added tclass
-.RS
-.RE
 .TP
-.B \f[I]fi_tx_attr\f[]
+.B \f[I]fi_tx_attr\f[R]
 Added tclass
-.RS
-.RE
+.SS ABI 1.4
+.PP
+The 1.4 ABI version was exported by libfabric 1.12.
+Added fi_tostr_r, a thread\-safe (re\-entrant) version of fi_tostr.
+.SS ABI 1.5
+.PP
+ABI version starting with libfabric 1.13.
+Added new fi_open API call.
+.SS ABI 1.6
+.PP
+ABI version starting with libfabric 1.14.
+Added fi_log_ready for providers.
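Since interfaces only accumulate across these ABI versions, an application can guard its use of a newer call at run time. A sketch, on the assumption that fi_version() tracks the exported interface level closely enough for this purpose:

```
/* Sketch: only use fi_open (added with libfabric 1.13 / ABI 1.5) when
 * the linked library is new enough. */
#include <rdma/fabric.h>

static int have_fi_open(void)
{
    /* FI_VERSION_GE compares packed major/minor version values. */
    return FI_VERSION_GE(fi_version(), FI_VERSION(1, 13));
}
```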
 .SH SEE ALSO
 .PP
-\f[C]fi_info\f[](1), \f[C]fi_provider\f[](7), \f[C]fi_getinfo\f[](3),
-\f[C]fi_endpoint\f[](3), \f[C]fi_domain\f[](3), \f[C]fi_av\f[](3),
-\f[C]fi_eq\f[](3), \f[C]fi_cq\f[](3), \f[C]fi_cntr\f[](3),
-\f[C]fi_mr\f[](3)
+\f[C]fi_info\f[R](1), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3),
+\f[C]fi_endpoint\f[R](3), \f[C]fi_domain\f[R](3), \f[C]fi_av\f[R](3),
+\f[C]fi_eq\f[R](3), \f[C]fi_cq\f[R](3), \f[C]fi_cntr\f[R](3),
+\f[C]fi_mr\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_bgq.7 b/deps/libfabric/man/man7/fi_bgq.7
index 33e5156a3eb5a0ab48a2714c231a860479c9bc30..8ea28e71fdd26c194cfd7074be18693fec996fe4 100644
--- a/deps/libfabric/man/man7/fi_bgq.7
+++ b/deps/libfabric/man/man7/fi_bgq.7
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_bgq" "7" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_bgq" "7" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -12,48 +12,44 @@ that makes direct use of the unique hardware features such as the
 Messaging Unit (MU), Base Address Table (BAT), and L2 Atomics.
 .PP
 The purpose of this provider is to demonstrate the scalability and
-performance of libfabric, providing an "extreme scale" development
-environment for applications and middleware using the libfabric API, and
-to support a functional and performant version of MPI3 on Blue Gene/Q
-via MPICH CH4.
+performance of libfabric, providing an \[lq]extreme scale\[rq]
+development environment for applications and middleware using the
+libfabric API, and to support a functional and performant version of
+MPI3 on Blue Gene/Q via MPICH CH4.
 .SH SUPPORTED FEATURES
 .PP
 The bgq provider supports most features defined for the libfabric API.
 Key features include:
 .TP
-.B \f[I]Endpoint types\f[]
+.B \f[I]Endpoint types\f[R]
 The Blue Gene/Q hardware is connectionless and reliable.
-Therefore, the bgq provider only supports the \f[I]FI_EP_RDM\f[]
+Therefore, the bgq provider only supports the \f[I]FI_EP_RDM\f[R]
 endpoint type.
-.RS
-.RE
 .TP
-.B \f[I]Capabilities\f[]
-Supported capabilities include \f[I]FI_MSG\f[], \f[I]FI_RMA\f[],
-\f[I]FI_TAGGED\f[], \f[I]FI_ATOMIC\f[], \f[I]FI_NAMED_RX_CTX\f[],
-\f[I]FI_READ\f[], \f[I]FI_WRITE\f[], \f[I]FI_SEND\f[], \f[I]FI_RECV\f[],
-\f[I]FI_REMOTE_READ\f[], \f[I]FI_REMOTE_WRITE\f[],
-\f[I]FI_MULTI_RECV\f[], \f[I]FI_DIRECTED_RECV\f[], \f[I]FI_SOURCE\f[]
-and \f[I]FI_FENCE\f[].
-.RS
-.RE
+.B \f[I]Capabilities\f[R]
+Supported capabilities include \f[I]FI_MSG\f[R], \f[I]FI_RMA\f[R],
+\f[I]FI_TAGGED\f[R], \f[I]FI_ATOMIC\f[R], \f[I]FI_NAMED_RX_CTX\f[R],
+\f[I]FI_READ\f[R], \f[I]FI_WRITE\f[R], \f[I]FI_SEND\f[R],
+\f[I]FI_RECV\f[R], \f[I]FI_REMOTE_READ\f[R], \f[I]FI_REMOTE_WRITE\f[R],
+\f[I]FI_MULTI_RECV\f[R], \f[I]FI_DIRECTED_RECV\f[R], \f[I]FI_SOURCE\f[R]
+and \f[I]FI_FENCE\f[R].
 .PP
 Notes on FI_DIRECTED_RECV capability: The immediate data which is sent
-within the \f[I]senddata\f[] call to support FI_DIRECTED_RECV for BGQ
+within the \f[I]senddata\f[R] call to support FI_DIRECTED_RECV for BGQ
 must be exactly 4 bytes.
 BGQ uses these bytes to completely identify the source address for tag
 matching on the recv across an exascale\-level number of ranks, and they
 can be managed within the MU packet.
 Therefore the domain attribute cq_data_size is set to 4, which is the OFI
 standard minimum.
 .TP
-.B \f[I]Modes\f[]
-The bgq provider requires \f[I]FI_CONTEXT\f[] and \f[I]FI_ASYNC_IOV\f[]
-.RS
-.RE
+.B \f[I]Modes\f[R]
+The bgq provider requires \f[I]FI_CONTEXT\f[R] and
+\f[I]FI_ASYNC_IOV\f[R]
 .TP
-.B \f[I]Memory registration modes\f[]
+.B \f[I]Memory registration modes\f[R]
 Both FI_MR_SCALABLE and FI_MR_BASIC are supported, specified at
-configuration time with the "\-\-with\-bgq\-mr" configure option.
+configuration time with the \[lq]\[en]with\-bgq\-mr\[rq] configure
+option.
 The base address table utilized by FI_MR_SCALABLE for rdma transfers is
 completely software emulated, supporting FI_ATOMIC, FI_READ, FI_WRITE,
 FI_REMOTE_READ, and FI_REMOTE_WRITE capabilities.
@@ -62,57 +58,43 @@ other rdma transfers are still software emulated but the use of a base
 address table is no longer required as the offset is now the virtual
 address of the memory from the application and the key is the delta from
 which the physical address can be computed if necessary.
-.RS
-.RE
 .TP
-.B \f[I]Additional features\f[]
-Supported additional features include \f[I]FABRIC_DIRECT\f[],
-\f[I]scalable endpoints\f[], and \f[I]counters\f[].
-.RS
-.RE
+.B \f[I]Additional features\f[R]
+Supported additional features include \f[I]FABRIC_DIRECT\f[R],
+\f[I]scalable endpoints\f[R], and \f[I]counters\f[R].
 .TP
-.B \f[I]Progress\f[]
-Both progress modes, \f[I]FI_PROGRESS_AUTO\f[] and
-\f[I]FI_PROGRESS_MANUAL\f[], are supported.
-The progress mode may be specified via the "\-\-with\-bgq\-progress"
-configure option.
-.RS
-.RE
+.B \f[I]Progress\f[R]
+Both progress modes, \f[I]FI_PROGRESS_AUTO\f[R] and
+\f[I]FI_PROGRESS_MANUAL\f[R], are supported.
+The progress mode may be specified via the
+\[lq]\[en]with\-bgq\-progress\[rq] configure option.
 .TP
-.B \f[I]Address vector\f[]
-Only the \f[I]FI_AV_MAP\f[] address vector format is supported.
-.RS
-.RE
+.B \f[I]Address vector\f[R]
+Only the \f[I]FI_AV_MAP\f[R] address vector format is supported.
 .SH UNSUPPORTED FEATURES
 .TP
-.B \f[I]Endpoint types\f[]
-Unsupported endpoint types include \f[I]FI_EP_DGRAM\f[] and
-\f[I]FI_EP_MSG\f[]
-.RS
-.RE
+.B \f[I]Endpoint types\f[R]
+Unsupported endpoint types include \f[I]FI_EP_DGRAM\f[R] and
+\f[I]FI_EP_MSG\f[R]
 .TP
-.B \f[I]Capabilities\f[]
-The bgq provider does not support the \f[I]FI_RMA_EVENT\f[], and
-\f[I]FI_TRIGGER\f[] capabilities.
-.RS
-.RE
+.B \f[I]Capabilities\f[R]
+The bgq provider does not support the \f[I]FI_RMA_EVENT\f[R], and
+\f[I]FI_TRIGGER\f[R] capabilities.
 .TP
-.B \f[I]Address vector\f[]
-The bgq provider does not support the \f[I]FI_AV_TABLE\f[] address
+.B \f[I]Address vector\f[R]
+The bgq provider does not support the \f[I]FI_AV_TABLE\f[R] address
 vector format.
-Support for \f[I]FI_AV_TABLE\f[] may be added in the future.
-.RS
-.RE
+Support for \f[I]FI_AV_TABLE\f[R] may be added in the future.
 .SH LIMITATIONS
 .PP
-The bgq provider only supports \f[I]FABRIC_DIRECT\f[].
-The size of the fi_context structure for \f[I]FI_CONTEXT\f[] is too
+The bgq provider only supports \f[I]FABRIC_DIRECT\f[R].
+The size of the fi_context structure for \f[I]FI_CONTEXT\f[R] is too
 small to be useful.
-In the \[aq]direct\[aq] mode the bgq provider can re\-define the struct
+In the `direct' mode the bgq provider can re\-define the struct
 fi_context to a larger size \- currently 64 bytes, which is the L1 cache
 size.
 .PP
-The fi_context structure for \f[I]FI_CONTEXT\f[] must be aligned to 8
+The fi_context structure for \f[I]FI_CONTEXT\f[R] must be aligned to 8
 bytes.
 This requirement is because the bgq provider will use MU network atomics
 to track completions and the memory used with MU atomic operations must
@@ -120,18 +102,18 @@ be aligned to 8 bytes.
 Unfortunately, the libfabric API has no mechanism for applications to
 programmatically determine these alignment requirements.
 Because unaligned MU atomics operations are a fatal error, the bgq
-provider will assert on the alignment for "debug" builds (i.e., the
-\[aq]\-DNDEBUG\[aq] pre\-processor flag is not specified).
+provider will assert on the alignment for \[lq]debug\[rq] builds (i.e.,
+the `\-DNDEBUG' pre\-processor flag is not specified).
 .PP
-The progress thread used for \f[I]FI_PROGRESS_AUTO\f[] effectively
+The progress thread used for \f[I]FI_PROGRESS_AUTO\f[R] effectively
 limits the maximum number of ranks\-per\-node to 32.
 However, for FI_PROGRESS_MANUAL the maximum is 64.
 .PP
 For FI_MR_SCALABLE mr mode the memory region key size (mr_key_size) is 2
-\f[I]bytes\f[]; Valid key values are 0..2^16\-1.
+\f[I]bytes\f[R]; valid key values are 0..2\[ha]16\-1.
 .PP
-It is invalid to register memory at the base virtual address "0" with a
-length of "UINTPTR_MAX" (or equivalent).
+It is invalid to register memory at the base virtual address \[lq]0\[rq]
+with a length of \[lq]UINTPTR_MAX\[rq] (or equivalent).
 The Blue Gene/Q hardware operates on 37\-bit physical addresses and all
 virtual addresses specified in the libfabric API, such as the location
 of source/destination data and remote memory locations, must be
@@ -147,6 +129,6 @@ The fi_trecvv() and fi_recvv() functions are currently not supported.
 No runtime parameters are currently defined.
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7), \f[C]fi_provider\f[](7), \f[C]fi_getinfo\f[](3)
+\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_direct.7 b/deps/libfabric/man/man7/fi_direct.7
index d73cba49bd57611922cfc1cbb20a89ce6da14c85..90637aef8152ee7f72d82c43779348f57fc4a98b 100644
--- a/deps/libfabric/man/man7/fi_direct.7
+++ b/deps/libfabric/man/man7/fi_direct.7
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_direct" "7" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_direct" "7" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -11,8 +11,8 @@ fi_direct \- Direct fabric provider access
 \f[C]
 \-DFABRIC_DIRECT
 
-#define\ FABRIC_DIRECT
-\f[]
+#define FABRIC_DIRECT
+\f[R]
 .fi
 .PP
 Fabric direct provides a mechanism for applications to compile against a
@@ -37,12 +37,12 @@ part of their build.
 In general, the use of fabric direct does not require application source
 code changes, and, instead, is limited to the build process.
 .PP
-Providers supporting fabric direct must install \[aq]direct\[aq]
-versions of all libfabric header files.
+Providers supporting fabric direct must install `direct' versions of all
+libfabric header files.
 For convenience, the libfabric sources contain sample header files that
 may be modified by a provider.
-The \[aq]direct\[aq] header file names have \[aq]fi_direct\[aq] as their
-prefix: fi_direct.h, fi_direct_endpoint.h, etc.
+The `direct' header file names have `fi_direct' as their prefix:
+fi_direct.h, fi_direct_endpoint.h, etc.
 .PP
 Direct providers are prohibited from overriding or modifying existing
 data structures.
@@ -60,21 +60,18 @@ modes, if those capabilities are supported.
 The following #define values may be used by an application to test for
 provider support of specific features.
 .TP
-.B \f[I]FI_DIRECT_CONTEXT\f[]
+.B \f[I]FI_DIRECT_CONTEXT\f[R]
 The provider sets FI_CONTEXT or FI_CONTEXT2 for fi_info:mode.
 See fi_getinfo for additional details.
 When FI_DIRECT_CONTEXT is defined, applications should use struct
 fi_context in their definitions, even if FI_CONTEXT2 is set.
-.RS
-.RE
 .TP
-.B \f[I]FI_DIRECT_LOCAL_MR\f[]
+.B \f[I]FI_DIRECT_LOCAL_MR\f[R]
 The provider sets FI_LOCAL_MR for fi_info:mode.
 See fi_getinfo for additional details.
-.RS
-.RE
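A compile-time sketch of how an application might react to these defines when built with -DFABRIC_DIRECT against a provider's direct headers; the structure names follow the descriptions above:

```
/* Sketch: adapt to fabric-direct feature defines at compile time. */
#include <rdma/fabric.h>

#if defined(FI_DIRECT_CONTEXT)
/* Provider sets FI_CONTEXT/FI_CONTEXT2: reserve per-operation context
 * space using struct fi_context, as this page recommends. */
static struct fi_context send_ctx;
#endif

#if defined(FI_DIRECT_LOCAL_MR)
/* Provider sets FI_LOCAL_MR: local buffers must be registered before
 * being used in data transfer operations. */
#endif
```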
 .SH SEE ALSO
 .PP
-\f[C]fi_getinfo\f[](3), \f[C]fi_endpoint\f[](3), \f[C]fi_domain\f[](3)
+\f[C]fi_getinfo\f[R](3), \f[C]fi_endpoint\f[R](3),
+\f[C]fi_domain\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_efa.7 b/deps/libfabric/man/man7/fi_efa.7
index ca374c3b335ec2a81d4daff33016e7d03b3c91ca..7b773b50c62cd0cd8fa4862cc49023f624c121a6 100644
--- a/deps/libfabric/man/man7/fi_efa.7
+++ b/deps/libfabric/man/man7/fi_efa.7
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_efa" "7" "2020\-04\-22" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_efa" "7" "2021\-09\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -15,236 +15,196 @@ hardware access from userspace (OS bypass).
 .PP
 The following features are supported:
 .TP
-.B \f[I]Endpoint types\f[]
-The provider supports endpoint type \f[I]FI_EP_DGRAM\f[], and
-\f[I]FI_EP_RDM\f[] on a new Scalable (unordered) Reliable Datagram
+.B \f[I]Endpoint types\f[R]
+The provider supports endpoint type \f[I]FI_EP_DGRAM\f[R], and
+\f[I]FI_EP_RDM\f[R] on a new Scalable (unordered) Reliable Datagram
 protocol (SRD).
 SRD provides support for reliable datagrams and more complete error
 handling than typically seen with other Reliable Datagram (RD)
 implementations.
 The EFA provider performs segmentation and reassembly of out\-of\-order
 packets to give send\-after\-send ordering guarantees to applications
-via its \f[I]FI_EP_RDM\f[] endpoint.
-.RS
-.RE
+via its \f[I]FI_EP_RDM\f[R] endpoint.
 .TP
-.B \f[I]RDM Endpoint capabilities\f[]
+.B \f[I]RDM Endpoint capabilities\f[R]
 The following data transfer interfaces are supported via the
-\f[I]FI_EP_RDM\f[] endpoint: \f[I]FI_MSG\f[], \f[I]FI_TAGGED\f[], and
-\f[I]FI_RMA\f[].
-\f[I]FI_SEND\f[], \f[I]FI_RECV\f[], \f[I]FI_DIRECTED_RECV\f[],
-\f[I]FI_MULTI_RECV\f[], and \f[I]FI_SOURCE\f[] capabilities are
+\f[I]FI_EP_RDM\f[R] endpoint: \f[I]FI_MSG\f[R], \f[I]FI_TAGGED\f[R], and
+\f[I]FI_RMA\f[R].
+\f[I]FI_SEND\f[R], \f[I]FI_RECV\f[R], \f[I]FI_DIRECTED_RECV\f[R],
+\f[I]FI_MULTI_RECV\f[R], and \f[I]FI_SOURCE\f[R] capabilities are
 supported.
 The endpoint provides send\-after\-send guarantees for data operations.
-The \f[I]FI_EP_RDM\f[] endpoint does not have a maximum message size.
-.RS
-.RE
+The \f[I]FI_EP_RDM\f[R] endpoint does not have a maximum message size.
 .TP
-.B \f[I]DGRAM Endpoint capabilities\f[]
-The DGRAM endpoint only supports \f[I]FI_MSG\f[] capability with a
+.B \f[I]DGRAM Endpoint capabilities\f[R]
+The DGRAM endpoint only supports \f[I]FI_MSG\f[R] capability with a
 maximum message size of the MTU of the underlying hardware
 (approximately 8 KiB).
-.RS
-.RE
 .TP
-.B \f[I]Address vectors\f[]
-The provider supports \f[I]FI_AV_TABLE\f[] and \f[I]FI_AV_MAP\f[]
+.B \f[I]Address vectors\f[R]
+The provider supports \f[I]FI_AV_TABLE\f[R] and \f[I]FI_AV_MAP\f[R]
 address vector types.
-\f[I]FI_EVENT\f[] is unsupported.
-.RS
-.RE
-.TP
-.B \f[I]Completion events\f[]
-The provider supports \f[I]FI_CQ_FORMAT_CONTEXT\f[],
-\f[I]FI_CQ_FORMAT_MSG\f[], and \f[I]FI_CQ_FORMAT_DATA\f[].
-\f[I]FI_CQ_FORMAT_TAGGED\f[] is supported on the RDM endpoint.
+\f[I]FI_EVENT\f[R] is unsupported.
+.TP
+.B \f[I]Completion events\f[R]
+The provider supports \f[I]FI_CQ_FORMAT_CONTEXT\f[R],
+\f[I]FI_CQ_FORMAT_MSG\f[R], and \f[I]FI_CQ_FORMAT_DATA\f[R].
+\f[I]FI_CQ_FORMAT_TAGGED\f[R] is supported on the RDM endpoint.
 Wait objects are not currently supported.
-.RS
-.RE
 .TP
-.B \f[I]Modes\f[]
-The provider requires the use of \f[I]FI_MSG_PREFIX\f[] when running
-over the DGRAM endpoint, and requires \f[I]FI_MR_LOCAL\f[] for all
+.B \f[I]Modes\f[R]
+The provider requires the use of \f[I]FI_MSG_PREFIX\f[R] when running
+over the DGRAM endpoint, and requires \f[I]FI_MR_LOCAL\f[R] for all
 memory registrations on the DGRAM endpoint.
-.RS
-.RE
-.TP
-.B \f[I]Memory registration modes\f[]
-The RDM endpoint does not require memory registration and the
-\f[I]FI_EP_DGRAM\f[] endpoint only supports \f[I]FI_MR_LOCAL\f[].
-.RS
-.RE
-.TP
-.B \f[I]Progress\f[]
-The RDM endpoint supports both \f[I]FI_PROGRESS_AUTO\f[] and
-\f[I]FI_PROGRESS_MANUAL\f[], with the default set to auto.
+.TP
+.B \f[I]Memory registration modes\f[R]
+The RDM endpoint does not require memory registration for send and
+receive operations, i.e.\ it does not require \f[I]FI_MR_LOCAL\f[R].
+Applications may specify \f[I]FI_MR_LOCAL\f[R] in the MR mode flags in
+order to use descriptors provided by the application.
+The \f[I]FI_EP_DGRAM\f[R] endpoint only supports \f[I]FI_MR_LOCAL\f[R].
+.TP
+.B \f[I]Progress\f[R]
+The RDM endpoint supports both \f[I]FI_PROGRESS_AUTO\f[R] and
+\f[I]FI_PROGRESS_MANUAL\f[R], with the default set to auto.
 However, receive side data buffers are not modified outside of
 completion processing routines.
-The DGRAM endpoint only supports \f[I]FI_PROGRESS_MANUAL\f[].
-.RS
-.RE
-.TP
-.B \f[I]Threading\f[]
-The RDM endpoint supports \f[I]FI_THREAD_SAFE\f[], the DGRAM endpoint
-supports \f[I]FI_THREAD_DOMAIN\f[], i.e.
-the provider is not thread safe when using the DGRAM endpoint.
-.RS
-.RE
+The DGRAM endpoint only supports \f[I]FI_PROGRESS_MANUAL\f[R].
+.TP
+.B \f[I]Threading\f[R]
+The RDM endpoint supports \f[I]FI_THREAD_SAFE\f[R]; the DGRAM endpoint
+supports \f[I]FI_THREAD_DOMAIN\f[R], i.e.\ the provider is not thread
+safe when using the DGRAM endpoint.
 .SH LIMITATIONS
 .PP
-The provider does not support \f[I]FI_ATOMIC\f[] interfaces.
+The DGRAM endpoint does not support \f[I]FI_ATOMIC\f[R] interfaces.
 For RMA operations, completion events for RMA targets
-(\f[I]FI_RMA_EVENT\f[]) is not supported.
+(\f[I]FI_RMA_EVENT\f[R]) are not supported.
 The DGRAM endpoint does not fully protect against resource overruns, so
 resource management is disabled for this endpoint
-(\f[I]FI_RM_DISABLED\f[]).
+(\f[I]FI_RM_DISABLED\f[R]).
 .PP
 No support for selective completions.
 .PP
-No support for counters.
+No support for counters for the DGRAM endpoint.
 .PP
 No support for inject.
+.SH PROVIDER SPECIFIC ENDPOINT LEVEL OPTION
+.TP
+.B \f[I]FI_OPT_EFA_RNR_RETRY\f[R]
+Defines the number of RNR retries.
+The application can use it to reset the RNR retry counter via a call to
+fi_setopt.
+Note that this option must be set before the endpoint is enabled.
+Otherwise, the call will fail.
+Also note that this option only applies to the RDM endpoint.
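A hypothetical sketch of resetting the counter on an RDM endpoint before enabling it; the option name comes from this page, while the example value, its size_t type, and the provider header supplying the macro are assumptions:

```
/* Hypothetical sketch: set the EFA RNR retry count, then enable. */
#include <rdma/fi_endpoint.h>

static int set_rnr_retry(struct fid_ep *ep)
{
    size_t rnr_retry = 4;   /* example value; the type is an assumption */
    int ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_EFA_RNR_RETRY,
                        &rnr_retry, sizeof rnr_retry);
    /* The option must be applied before the endpoint is enabled. */
    return ret ? ret : fi_enable(ep);
}
```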
 .SH RUNTIME PARAMETERS
 .TP
-.B \f[I]FI_EFA_TX_SIZE\f[]
+.B \f[I]FI_EFA_TX_SIZE\f[R]
 Maximum number of transmit operations before the provider returns
 \-FI_EAGAIN.
 For the RDM endpoint only, this parameter will cause transmit operations
 to be queued when this value is set higher than the default and the
 transmit queue is full.
-.RS
-.RE
 .TP
-.B \f[I]FI_EFA_RX_SIZE\f[]
+.B \f[I]FI_EFA_RX_SIZE\f[R]
 Maximum number of receive operations before the provider returns
 \-FI_EAGAIN.
-.RS
-.RE
 .TP
-.B \f[I]FI_EFA_TX_IOV_LIMIT\f[]
+.B \f[I]FI_EFA_TX_IOV_LIMIT\f[R]
 Maximum number of IOVs for a transmit operation.
-.RS
-.RE
 .TP
-.B \f[I]FI_EFA_RX_IOV_LIMIT\f[]
+.B \f[I]FI_EFA_RX_IOV_LIMIT\f[R]
 Maximum number of IOVs for a receive operation.
-.RS
-.RE
 .SH RUNTIME PARAMETERS SPECIFIC TO RDM ENDPOINT
 .PP
 These OFI runtime parameters apply only to the RDM endpoint.
 .TP
-.B \f[I]FI_EFA_RX_WINDOW_SIZE\f[]
+.B \f[I]FI_EFA_RX_WINDOW_SIZE\f[R]
 Maximum number of MTU\-sized messages that can be in flight from any
 single endpoint as part of long message data transfer.
-.RS
-.RE
 .TP
-.B \f[I]FI_EFA_TX_QUEUE_SIZE\f[]
+.B \f[I]FI_EFA_TX_QUEUE_SIZE\f[R]
 Depth of transmit queue opened with the NIC.
 This may not be set to a value greater than what the NIC supports.
-.RS
-.RE
 .TP
-.B \f[I]FI_EFA_RECVWIN_SIZE\f[]
+.B \f[I]FI_EFA_RECVWIN_SIZE\f[R]
 Size of the out\-of\-order reorder buffer (in messages).
 Messages received out of this window will result in an error.
-.RS
-.RE
 .TP
-.B \f[I]FI_EFA_CQ_SIZE\f[]
+.B \f[I]FI_EFA_CQ_SIZE\f[R]
 Size of any cq created, in number of entries.
-.RS
-.RE
 .TP
-.B \f[I]FI_EFA_MR_CACHE_ENABLE\f[]
+.B \f[I]FI_EFA_MR_CACHE_ENABLE\f[R]
 Enables using the mr cache and in\-line registration instead of a bounce
-buffer for iov\[aq]s larger than max_memcpy_size.
+buffer for iov\[cq]s larger than max_memcpy_size.
 Defaults to true.
 When disabled, only a bounce buffer is used.
-.RS
-.RE
 .TP
-.B \f[I]FI_EFA_MR_MAX_CACHED_COUNT\f[]
+.B \f[I]FI_EFA_MR_MAX_CACHED_COUNT\f[R]
 Sets the maximum number of memory registrations that can be cached at
 any time.
-.RS
-.RE
 .TP
-.B \f[I]FI_EFA_MR_MAX_CACHED_SIZE\f[]
+.B \f[I]FI_EFA_MR_MAX_CACHED_SIZE\f[R]
 Sets the maximum amount of memory that cached memory registrations can
 hold onto at any time.
-.RS
-.RE
 .TP
-.B \f[I]FI_EFA_MAX_MEMCPY_SIZE\f[]
+.B \f[I]FI_EFA_MAX_MEMCPY_SIZE\f[R]
 Threshold size for switching between memory copy into a pre\-registered
 bounce buffer and memory registration on the user buffer.
-.RS
-.RE
 .TP
-.B \f[I]FI_EFA_MTU_SIZE\f[]
+.B \f[I]FI_EFA_MTU_SIZE\f[R]
 Overrides the default MTU size of the device.
-.RS
-.RE
 .TP
-.B \f[I]FI_EFA_RX_COPY_UNEXP\f[]
+.B \f[I]FI_EFA_RX_COPY_UNEXP\f[R]
 Enables the use of a separate pool of bounce\-buffers to copy unexpected
 messages out of the pre\-posted receive buffers.
-.RS
-.RE
 .TP
-.B \f[I]FI_EFA_RX_COPY_OOO\f[]
+.B \f[I]FI_EFA_RX_COPY_OOO\f[R]
 Enables the use of a separate pool of bounce\-buffers to copy
 out\-of\-order RTS packets out of the pre\-posted receive buffers.
-.RS
-.RE
 .TP
-.B \f[I]FI_EFA_MAX_TIMEOUT\f[]
+.B \f[I]FI_EFA_MAX_TIMEOUT\f[R]
 Maximum timeout (us) for backoff to a peer after a receiver not ready
 error.
-.RS
-.RE
 .TP
-.B \f[I]FI_EFA_TIMEOUT_INTERVAL\f[]
+.B \f[I]FI_EFA_TIMEOUT_INTERVAL\f[R]
 Time interval (us) for the base timeout to use for exponential backoff
 to a peer after a receiver not ready error.
-.RS
-.RE
 .TP
-.B \f[I]FI_EFA_ENABLE_SHM_TRANSFER\f[]
+.B \f[I]FI_EFA_ENABLE_SHM_TRANSFER\f[R]
 Enable the SHM provider to handle communication across all intra\-node
 processes.
 SHM transfer will be disabled in the case where
-\f[C]ptrace\ protection\f[] is turned on.
+\f[C]ptrace protection\f[R] is turned on.
 You can turn ptrace protection off to enable shm transfer.
-.RS
-.RE
 .TP
-.B \f[I]FI_EFA_SHM_AV_SIZE\f[]
-Defines the maximum number of entries in SHM provider\[aq]s address
+.B \f[I]FI_EFA_SHM_AV_SIZE\f[R]
+Defines the maximum number of entries in SHM provider\[cq]s address
 vector.
-.RS
-.RE
 .TP
-.B \f[I]FI_EFA_SHM_MAX_MEDIUM_SIZE\f[]
+.B \f[I]FI_EFA_SHM_MAX_MEDIUM_SIZE\f[R]
 Defines the switch point between small/medium messages and large messages.
 Messages larger than this switch point will be transferred with the large
 message protocol.
-.RS
-.RE
+NOTE: This parameter is now deprecated.
 .TP
-.B \f[I]FI_EFA_INTER_MAX_MEDIUM_MESSAGE_SIZE\f[]
+.B \f[I]FI_EFA_INTER_MAX_MEDIUM_MESSAGE_SIZE\f[R]
 The maximum size for inter\-EFA messages sent using the medium message
 protocol.
 Messages which can fit in one packet will be sent as eager messages.
 Messages whose sizes are smaller than this value will be sent using the
 medium message protocol.
 Other messages will be sent using the CTS\-based long message protocol.
-.RS
-.RE
+.TP
+.B \f[I]FI_EFA_FORK_SAFE\f[R]
+Enable fork() support.
+This may have a small performance impact and should only be set when
+required.
+Applications that need to register regions backed by huge pages and
+also require fork support are not supported.
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7), \f[C]fi_provider\f[](7), \f[C]fi_getinfo\f[](3)
+\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_gni.7 b/deps/libfabric/man/man7/fi_gni.7
index d25f7d82f76b71cec8772ff852c6bb3c18c6f911..06573a513b7c4c76cfe48447304a6064ee3d3591 100644
--- a/deps/libfabric/man/man7/fi_gni.7
+++ b/deps/libfabric/man/man7/fi_gni.7
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_gni" "7" "2019\-04\-29" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_gni" "7" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -32,72 +32,54 @@ Any other value will result in a return value of \-FI_EINVAL.
 The GNI provider supports the following features defined for the
 libfabric API:
 .TP
-.B \f[I]Endpoint types\f[]
-The provider supports the \f[I]FI_EP_RDM\f[], \f[I]FI_EP_DGRAM\f[],
-\f[I]FI_EP_MSG\f[] endpoint types, including scalable endpoints.
-.RS
-.RE
+.B \f[I]Endpoint types\f[R]
+The provider supports the \f[I]FI_EP_RDM\f[R], \f[I]FI_EP_DGRAM\f[R],
+\f[I]FI_EP_MSG\f[R] endpoint types, including scalable endpoints.
 .TP
-.B \f[I]Address vectors\f[]
-The provider implements both the \f[I]FI_AV_MAP\f[] and
-\f[I]FI_AV_TABLE\f[] address vector types.
+.B \f[I]Address vectors\f[R]
+The provider implements both the \f[I]FI_AV_MAP\f[R] and
+\f[I]FI_AV_TABLE\f[R] address vector types.
 FI_EVENT is unsupported.
-.RS
-.RE
 .TP
-.B \f[I]Memory registration modes\f[]
+.B \f[I]Memory registration modes\f[R]
 The provider implements basic and scalable memory registration modes.
-.RS
-.RE
 .TP
-.B \f[I]Data transfer operations\f[]
+.B \f[I]Data transfer operations\f[R]
 The following data transfer interfaces are supported for all endpoint
-types: \f[I]FI_ATOMIC\f[], \f[I]FI_MSG\f[], \f[I]FI_RMA\f[],
-\f[I]FI_TAGGED\f[].
+types: \f[I]FI_ATOMIC\f[R], \f[I]FI_MSG\f[R], \f[I]FI_RMA\f[R],
+\f[I]FI_TAGGED\f[R].
 See DATA TRANSFER OPERATIONS below for more details.
-.RS
-.RE
-.TP
-.B \f[I]Completion events\f[]
-The GNI provider supports \f[I]FI_CQ_FORMAT_CONTEXT\f[],
-\f[I]FI_CQ_FORMAT_MSG\f[], \f[I]FI_CQ_FORMAT_DATA\f[] and
-\f[I]FI_CQ_FORMAT_TAGGED\f[] with wait objects of type
-\f[I]FI_WAIT_NONE\f[], \f[I]FI_WAIT_UNSPEC\f[], \f[I]FI_WAIT_SET\f[].
-.RS
-.RE
-.TP
-.B \f[I]Modes\f[]
+.TP
+.B \f[I]Completion events\f[R]
+The GNI provider supports \f[I]FI_CQ_FORMAT_CONTEXT\f[R],
+\f[I]FI_CQ_FORMAT_MSG\f[R], \f[I]FI_CQ_FORMAT_DATA\f[R] and
+\f[I]FI_CQ_FORMAT_TAGGED\f[R] with wait objects of type
+\f[I]FI_WAIT_NONE\f[R], \f[I]FI_WAIT_UNSPEC\f[R], \f[I]FI_WAIT_SET\f[R].
+.TP
+.B \f[I]Modes\f[R]
 The GNI provider does not require any operation modes.
-.RS
-.RE
 .TP
-.B \f[I]Progress\f[]
+.B \f[I]Progress\f[R]
 For both control and data progress, the GNI provider supports both
-\f[I]FI_PROGRESS_AUTO\f[] and \f[I]FI_PROGRESS_MANUAL\f[], with a
-default set to \f[I]FI_PROGRESS_AUTO\f[].
+\f[I]FI_PROGRESS_AUTO\f[R] and \f[I]FI_PROGRESS_MANUAL\f[R], with a
+default set to \f[I]FI_PROGRESS_AUTO\f[R].
 Note that for data progress, progression is only performed when data
 transfers use the rendezvous protocol.
-.RS
-.RE
 .TP
-.B \f[I]Wait Objects\f[]
+.B \f[I]Wait Objects\f[R]
 The GNI provider specifically supports wait object types
-\f[I]FI_WAIT_UNSPEC\f[], and \f[I]FI_WAIT_SET\f[].
+\f[I]FI_WAIT_UNSPEC\f[R], and \f[I]FI_WAIT_SET\f[R].
 A wait object must be used when calling fi_cntr_wait, fi_cq_sread/from,
 fi_eq_sread/from, fi_wait.
 The GNI provider spawns an internal wait progress thread that is woken
 up when clients utilize the wait system (e.g., calling fi_wait).
-.RS
-.RE
 .TP
-.B \f[I]Additional Features\f[]
+.B \f[I]Additional Features\f[R]
 The GNI provider also supports the following capabilities and features:
-\- \f[I]FI_MULTI_RECV\f[] \- \f[I]FI_SOURCE\f[] \- \f[I]FI_FENCE\f[] \-
-\f[I]FI_RM_ENABLED\f[] \- \f[I]FI_RMA_EVENT\f[] \-
-\f[I]FI_REMOTE_CQ_DATA\f[] \- \f[I]FABRIC_DIRECT\f[] compilation mode \-
-\f[I]FI_MORE\f[] (For FI_RMA)
-.RS
-.RE
+\- \f[I]FI_MULTI_RECV\f[R] \- \f[I]FI_SOURCE\f[R] \- \f[I]FI_FENCE\f[R]
+\- \f[I]FI_RM_ENABLED\f[R] \- \f[I]FI_RMA_EVENT\f[R] \-
+\f[I]FI_REMOTE_CQ_DATA\f[R] \- \f[I]FABRIC_DIRECT\f[R] compilation mode
+\- \f[I]FI_MORE\f[R] (For FI_RMA)
 .SH DATA TRANSFER OPERATIONS
 .SS FI_ATOMIC
 .PP
@@ -108,13 +90,14 @@ integer and floating point values.
 Specifically,
 .SS Basic (fi_atomic, etc.)
 .IP \[bu] 2
-\f[I]FI_MIN\f[], \f[I]FI_MAX\f[] (no unsigned)
+\f[I]FI_MIN\f[R], \f[I]FI_MAX\f[R] (no unsigned)
 .IP \[bu] 2
-\f[I]FI_SUM\f[] (no 64\-bit floating point)
+\f[I]FI_SUM\f[R] (no 64\-bit floating point)
 .IP \[bu] 2
-\f[I]FI_BOR\f[], \f[I]FI_BAND\f[], \f[I]FI_BXOR\f[] (no floating point)
+\f[I]FI_BOR\f[R], \f[I]FI_BAND\f[R], \f[I]FI_BXOR\f[R] (no floating
+point)
 .IP \[bu] 2
-\f[I]FI_ATOMIC_WRITE\f[]
+\f[I]FI_ATOMIC_WRITE\f[R]
 .SS Fetching (fi_fetch_atomic, etc.)
 .IP \[bu] 2
 All of the basic operations as above
@@ -127,255 +110,199 @@ FI_CSWAP
 FI_MSWAP
 .SS FI_MSG
 .PP
-All \f[I]FI_MSG\f[] operations are supported.
+All \f[I]FI_MSG\f[R] operations are supported.
 .SS FI_RMA
 .PP
-All \f[I]FI_RMA\f[] operations are supported.
+All \f[I]FI_RMA\f[R] operations are supported.
 .SS FI_TAGGED
 .PP
-All \f[I]FI_TAGGED\f[] operations are supported except
-\f[C]fi_tinjectdata\f[].
+All \f[I]FI_TAGGED\f[R] operations are supported except
+\f[C]fi_tinjectdata\f[R].
 .SH GNI EXTENSIONS
 .PP
 The GNI provider exposes low\-level tuning parameters via domain,
-endpoint and fabric level \f[C]fi_open_ops\f[] interfaces.
-The domain extensions have been named \f[I]FI_GNI_DOMAIN_OPS_1\f[].
-The endpoint extensions have been named \f[I]FI_GNI_EP_OPS_1\f[].
-The fabric extensions have been named \f[I]FI_GNI_FABRIC_OPS_1\f[] and
-\f[I]FI_GNI_FABRIC_OPS_2\f[].
+endpoint and fabric level \f[C]fi_open_ops\f[R] interfaces.
+The domain extensions have been named \f[I]FI_GNI_DOMAIN_OPS_1\f[R].
+The endpoint extensions have been named \f[I]FI_GNI_EP_OPS_1\f[R].
+The fabric extensions have been named \f[I]FI_GNI_FABRIC_OPS_1\f[R] and
+\f[I]FI_GNI_FABRIC_OPS_2\f[R].
 The flags parameter is currently ignored.
-The fi_open_ops function takes a \f[C]struct\ fi_gni_ops_domain\f[] or a
-\f[C]struct\ fi_gni_ops_ep\f[] parameter respectively and populates it
+The fi_open_ops function takes a \f[C]struct fi_gni_ops_domain\f[R] or a
+\f[C]struct fi_gni_ops_ep\f[R] parameter respectively and populates it
 with the following:
 .IP
 .nf
 \f[C]
-struct\ fi_gni_ops_fab\ {
-\ \ \ \ int\ (*set_val)(struct\ fid\ *fid,\ fab_ops_val_t\ t,\ void\ *val);
-\ \ \ \ int\ (*get_val)(struct\ fid\ *fid,\ fab_ops_val_t\ t,\ void\ *val);
+struct fi_gni_ops_fab {
+    int (*set_val)(struct fid *fid, fab_ops_val_t t, void *val);
+    int (*get_val)(struct fid *fid, fab_ops_val_t t, void *val);
 };
 
-struct\ fi_gni_auth_key_ops_fab\ {
-\ \ \ \ int\ (*set_val)(uint8_t\ *auth_key,\ size_t\ auth_keylen,\ gnix_auth_key_opt_t\ opt,\ void\ *val);
-\ \ \ \ int\ (*get_val)(uint8_t\ *auth_key,\ size_t\ auth_keylen,\ gnix_auth_key_opt_t\ opt,\ void\ *val);
+struct fi_gni_auth_key_ops_fab {
+    int (*set_val)(uint8_t *auth_key, size_t auth_keylen, gnix_auth_key_opt_t opt, void *val);
+    int (*get_val)(uint8_t *auth_key, size_t auth_keylen, gnix_auth_key_opt_t opt, void *val);
 };
 
-struct\ fi_gni_ops_domain\ {
-\ \ \ \ int\ (*set_val)(struct\ fid\ *fid,\ dom_ops_val_t\ t,\ void\ *val);
-\ \ \ \ int\ (*get_val)(struct\ fid\ *fid,\ dom_ops_val_t\ t,\ void\ *val);
-\ \ \ \ int\ (*flush_cache)(struct\ fid\ *fid);
+struct fi_gni_ops_domain {
+    int (*set_val)(struct fid *fid, dom_ops_val_t t, void *val);
+    int (*get_val)(struct fid *fid, dom_ops_val_t t, void *val);
+    int (*flush_cache)(struct fid *fid);
 };
 
-struct\ fi_gni_ops_ep\ {
-\ \ \ \ int\ (*set_val)(struct\ fid\ *fid,\ dom_ops_val_t\ t,\ void\ *val);
-\ \ \ \ int\ (*get_val)(struct\ fid\ *fid,\ dom_ops_val_t\ t,\ void\ *val);
-\ \ \ \ \ \ \ \ size_t\ (*native_amo)(struct\ fid_ep\ *ep,\ const\ void\ *buf,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ size_t\ count,void\ *desc,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ fi_addr_t\ dest_addr,\ uint64_t\ addr,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ key,\ enum\ fi_datatype\ datatype,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ enum\ gnix_fab_req_type\ req_type,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *context);
+struct fi_gni_ops_ep {
+    int (*set_val)(struct fid *fid, dom_ops_val_t t, void *val);
+    int (*get_val)(struct fid *fid, dom_ops_val_t t, void *val);
+        size_t (*native_amo)(struct fid_ep *ep, const void *buf,
+                             size_t count,void *desc,
+                             fi_addr_t dest_addr, uint64_t addr,
+                             uint64_t key, enum fi_datatype datatype,
+                             enum gnix_fab_req_type req_type,
+                             void *context);
 };
-\f[]
+\f[R]
 .fi
 .PP
-The \f[C]set_val\f[] function sets the value of a given parameter; the
-\f[C]get_val\f[] function returns the current value.
+The \f[C]set_val\f[R] function sets the value of a given parameter; the
+\f[C]get_val\f[R] function returns the current value.
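A hypothetical sketch of fetching the domain extensions and using them; it assumes rdma/fi_ext_gni.h supplies the names shown and that `domain` is an open fid_domain:

```
/* Hypothetical sketch: tune a GNI domain via the extension ops. */
#include <rdma/fi_ext_gni.h>

static int tune_gni_domain(struct fid_domain *domain)
{
    struct fi_gni_ops_domain *gni_ops;
    int ret = fi_open_ops(&domain->fid, FI_GNI_DOMAIN_OPS_1, 0,
                          (void **)&gni_ops, NULL);
    if (ret)
        return ret;
    uint32_t thresh = 16384;   /* example rendezvous threshold */
    ret = gni_ops->set_val(&domain->fid, GNI_MSG_RENDEZVOUS_THRESHOLD,
                           &thresh);
    /* Drop any stale registrations held by the cache (see below). */
    return ret ? ret : gni_ops->flush_cache(&domain->fid);
}
```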
 .PP
-For \f[I]FI_GNI_FABRIC_OPS_1\f[], the currently supported values are:
+For \f[I]FI_GNI_FABRIC_OPS_1\f[R], the currently supported values are:
 .TP
-.B \f[I]GNI_WAIT_THREAD_SLEEP\f[]
+.B \f[I]GNI_WAIT_THREAD_SLEEP\f[R]
 Time in seconds for which the progress thread will sleep between periods
 of inactivity.
-.RS
-.RE
 .TP
-.B \f[I]GNI_DEFAULT_USER_REGISTRATION_LIMIT\f[]
+.B \f[I]GNI_DEFAULT_USER_REGISTRATION_LIMIT\f[R]
 The number of user registrations that an authorization key is limited to
 when using the scalable memory mode, if not specified by the user during
 init.
-.RS
-.RE
 .TP
-.B \f[I]GNI_DEFAULT_PROV_REGISTRATION_LIMIT\f[]
+.B \f[I]GNI_DEFAULT_PROV_REGISTRATION_LIMIT\f[R]
 The number of provider registrations that an authorization key is limited
 to when using the scalable memory mode, if not specified by the user
 during init.
-.RS
-.RE
 .TP
-.B \f[I]GNI_WAIT_SHARED_MEMORY_TIMEOUT\f[]
+.B \f[I]GNI_WAIT_SHARED_MEMORY_TIMEOUT\f[R]
 The number of seconds that the provider should wait when attempting to
-open mmap\[aq]d shared memory files for internal mappings.
-.RS
-.RE
+open mmap\[cq]d shared memory files for internal mappings.
 .PP
-For \f[I]FI_GNI_FABRIC_OPS_2\f[], the currently supported values are:
+For \f[I]FI_GNI_FABRIC_OPS_2\f[R], the currently supported values are:
 .TP
-.B \f[I]GNIX_USER_KEY_LIMIT\f[]
+.B \f[I]GNIX_USER_KEY_LIMIT\f[R]
 The number of user registrations that an authorization key is limited to
 when using the scalable memory mode.
 This may only be set prior to the first use of an authorization key in
 the initialization of a domain, endpoint, or memory registration.
-.RS
-.RE
 .TP
-.B \f[I]GNIX_PROV_KEY_LIMIT\f[]
+.B \f[I]GNIX_PROV_KEY_LIMIT\f[R]
 The number of provider registrations that an authorization key is
 limited to when using the scalable memory mode.
 This may only be set prior to the first use of an authorization key in
 the initialization of a domain, endpoint, or memory registration.
-.RS
-.RE
 .PP
-For \f[I]FI_GNI_DOMAIN_OPS_1\f[], the currently supported values are:
+For \f[I]FI_GNI_DOMAIN_OPS_1\f[R], the currently supported values are:
 .TP
-.B \f[I]GNI_MSG_RENDEZVOUS_THRESHOLD\f[]
+.B \f[I]GNI_MSG_RENDEZVOUS_THRESHOLD\f[R]
 Threshold message size at which a rendezvous protocol is used for
-\f[I]FI_MSG\f[] data transfers.
+\f[I]FI_MSG\f[R] data transfers.
 The value is of type uint32_t.
-.RS
-.RE
 .TP
-.B \f[I]GNI_RMA_RDMA_THRESHOLD\f[]
-Threshold message size at which RDMA is used for \f[I]FI_RMA\f[] data
+.B \f[I]GNI_RMA_RDMA_THRESHOLD\f[R]
+Threshold message size at which RDMA is used for \f[I]FI_RMA\f[R] data
 transfers.
 The value is of type uint32_t.
-.RS
-.RE
 .TP
-.B \f[I]GNI_CONN_TABLE_INITIAL_SIZE\f[]
+.B \f[I]GNI_CONN_TABLE_INITIAL_SIZE\f[R]
 Initial size of the internal table data structure used to manage
 connections.
 The value is of type uint32_t.
-.RS
-.RE
 .TP
-.B \f[I]GNI_CONN_TABLE_MAX_SIZE\f[]
+.B \f[I]GNI_CONN_TABLE_MAX_SIZE\f[R]
 Maximum size of the internal table data structure used to manage
 connections.
 The value is of type uint32_t.
-.RS
-.RE
 .TP
-.B \f[I]GNI_CONN_TABLE_STEP_SIZE\f[]
+.B \f[I]GNI_CONN_TABLE_STEP_SIZE\f[R]
 Step size for increasing the size of the internal table data structure
 used to manage internal GNI connections.
 The value is of type uint32_t.
-.RS
-.RE
 .TP
-.B \f[I]GNI_VC_ID_TABLE_CAPACITY\f[]
+.B \f[I]GNI_VC_ID_TABLE_CAPACITY\f[R]
 Size of the virtual channel (VC) table used for managing remote
 connections.
 The value is of type uint32_t.
-.RS
-.RE
 .TP
-.B \f[I]GNI_MBOX_PAGE_SIZE\f[]
+.B \f[I]GNI_MBOX_PAGE_SIZE\f[R]
 Page size for GNI SMSG mailbox allocations.
 The value is of type uint32_t.
-.RS
-.RE
 .TP
-.B \f[I]GNI_MBOX_NUM_PER_SLAB\f[]
+.B \f[I]GNI_MBOX_NUM_PER_SLAB\f[R]
 Number of GNI SMSG mailboxes per allocation slab.
 The value is of type uint32_t.
-.RS
-.RE
 .TP
-.B \f[I]GNI_MBOX_MAX_CREDIT\f[]
+.B \f[I]GNI_MBOX_MAX_CREDIT\f[R]
 Maximum number of credits per GNI SMSG mailbox.
 The value is of type uint32_t.
-.RS
-.RE
 .TP
-.B \f[I]GNI_MBOX_MSG_MAX_SIZE\f[]
+.B \f[I]GNI_MBOX_MSG_MAX_SIZE\f[R]
 Maximum size of GNI SMSG messages.
 The value is of type uint32_t.
-.RS
-.RE
 .TP
-.B \f[I]GNI_RX_CQ_SIZE\f[]
+.B \f[I]GNI_RX_CQ_SIZE\f[R]
 Recommended GNI receive CQ size.
 The value is of type uint32_t.
-.RS
-.RE
 .TP
-.B \f[I]GNI_TX_CQ_SIZE\f[]
+.B \f[I]GNI_TX_CQ_SIZE\f[R]
 Recommended GNI transmit CQ size.
 The value is of type uint32_t.
-.RS
-.RE
 .TP
-.B \f[I]GNI_MAX_RETRANSMITS\f[]
+.B \f[I]GNI_MAX_RETRANSMITS\f[R]
 Maximum number of message retransmits before failure.
 The value is of type uint32_t.
-.RS
-.RE
 .TP
-.B \f[I]GNI_MR_CACHE_LAZY_DEREG\f[]
+.B \f[I]GNI_MR_CACHE_LAZY_DEREG\f[R]
 Enable or disable lazy deregistration of memory.
 The value is of type int32_t.
-.RS
-.RE
 .TP
-.B \f[I]GNI_MR_CACHE\f[]
+.B \f[I]GNI_MR_CACHE\f[R]
 Select the type of cache that the domain will use.
-Valid choices are the following: \[aq]internal\[aq], \[aq]udreg\[aq], or
-\[aq]none\[aq].
-\[aq]internal\[aq] refers to the GNI provider internal registration
+Valid choices are the following: `internal', `udreg', or `none'.
+`internal' refers to the GNI provider internal registration cache.
+`udreg' refers to a user level dreg library based cache.
+Lastly, `none' refers to device direct registration without a provider
 cache.
-\[aq]udreg\[aq] refers to a user level dreg library based cache.
-Lastly, \[aq]none\[aq] refers to device direct registration without a
-provider cache.
-.RS
-.RE
 .TP
-.B \f[I]GNI_MR_HARD_REG_LIMIT\f[]
+.B \f[I]GNI_MR_HARD_REG_LIMIT\f[R]
 Maximum number of registrations.
 Applies only to the GNI provider cache.
 The value is of type int32_t (\-1 for no limit).
-.RS
-.RE
 .TP
-.B \f[I]GNI_MR_SOFT_REG_LIMIT\f[]
+.B \f[I]GNI_MR_SOFT_REG_LIMIT\f[R]
 Soft cap on the registration limit.
 Applies only to the GNI provider cache.
 The value is of type int32_t (\-1 for no limit).
-.RS
-.RE
 .TP
-.B \f[I]GNI_MR_HARD_STALE_REG_LIMIT\f[]
+.B \f[I]GNI_MR_HARD_STALE_REG_LIMIT\f[R]
 Maximum number of stale registrations to be held in cache.
 This applies to the GNI provider cache and the udreg cache.
 The value is of type int32_t (\-1 for no limit for the GNI provider
 cache; udreg cache values must be greater than 0).
-.RS
-.RE
 .TP
-.B \f[I]GNI_MR_UDREG_LIMIT\f[]
+.B \f[I]GNI_MR_UDREG_LIMIT\f[R]
 Maximum number of registrations.
 Applies only to the udreg cache.
 The value is of type int32_t.
 The value must be greater than 0.
-.RS
-.RE
 .TP
-.B \f[I]GNI_XPMEM_ENABLE\f[]
+.B \f[I]GNI_XPMEM_ENABLE\f[R]
 Enable or disable use of XPMEM for on node messages using the GNI
 provider internal rendezvous protocol.
 The value is of type bool.
-.RS
-.RE
 .TP
-.B \f[I]GNI_DGRAM_PROGRESS_TIMEOUT\f[]
+.B \f[I]GNI_DGRAM_PROGRESS_TIMEOUT\f[R]
 Controls the timeout value in milliseconds for the control progress thread.
 The value is of type uint32_t.
-.RS
-.RE
 .PP
-The \f[C]flush_cache\f[] function allows the user to flush any stale
+The \f[C]flush_cache\f[R] function allows the user to flush any stale
 registration cache entries from the cache.
 This has the effect of removing registrations from the cache that have
 been deregistered with the provider, but still exist in case they
@@ -385,29 +312,27 @@ of the stale memory registrations and frees any memory related to those
 stale registrations.
 Only the provider\-level registration struct is freed, not the user
 buffer associated with the registration.
-The parameter for \f[C]flush_cache\f[] is a struct fid pointer to a
+The parameter for \f[C]flush_cache\f[R] is a struct fid pointer to a
 fi_domain.
 The memory registration cache is tied to the domain, so issuing a
-\f[C]flush_cache\f[] to the domain will flush the registration cache of
+\f[C]flush_cache\f[R] to the domain will flush the registration cache of
 the domain.
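To make the flow concrete, here is a minimal C sketch of driving these domain ops, assuming the GNI extension header `rdma/fi_ext_gni.h` and the `fi_gni_ops_domain` structure it defines; error handling is abbreviated:

```
#include <stdint.h>
#include <rdma/fabric.h>
#include <rdma/fi_ext_gni.h>   /* GNI extension ops (assumed header) */

/* Sketch: set one GNI_* key via set_val(), then flush stale MR cache
 * entries for the domain, as described above. */
int tune_and_flush(struct fid_domain *domain)
{
	struct fi_gni_ops_domain *gni_ops;
	uint32_t threshold = 16384;   /* illustrative rendezvous threshold */
	int ret;

	ret = fi_open_ops(&domain->fid, FI_GNI_DOMAIN_OPS_1, 0,
			  (void **)&gni_ops, NULL);
	if (ret)
		return ret;

	ret = gni_ops->set_val(&domain->fid, GNI_MSG_RENDEZVOUS_THRESHOLD,
			       &threshold);
	if (ret)
		return ret;

	/* flush_cache() takes the fid of the fi_domain */
	return gni_ops->flush_cache(&domain->fid);
}
```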
 .PP
-For \f[I]FI_GNI_EP_OPS_1\f[], the currently supported values are:
-\f[I]GNI_HASH_TAG_IMPL\f[] : Use a hashlist for the tag list
+For \f[I]FI_GNI_EP_OPS_1\f[R], the currently supported values are:
+\f[I]GNI_HASH_TAG_IMPL\f[R] : Use a hashlist for the tag list
 implementation.
 The value is of type uint32_t.
 .PP
-The \f[C]native_amo\f[] function allows the user to call GNI native
+The \f[C]native_amo\f[R] function allows the user to call GNI native
 atomics that are not implemented in the libfabric API.
 The parameters for native_amo are the same as for the fi_atomic
 function, with the following additional parameter:
 .TP
-.B \f[I]enum gnix_fab_req_type req_type\f[]
-The req_type\[aq]s supported with this call are GNIX_FAB_RQ_NAMO_AX (AND
+.B \f[I]enum gnix_fab_req_type req_type\f[R]
+The req_type values supported with this call are GNIX_FAB_RQ_NAMO_AX (AND
 and XOR), GNIX_FAB_RQ_NAMO_AX_S (AND and XOR, 32 bit),
 GNIX_FAB_RQ_NAMO_FAX (Fetch AND and XOR), and GNIX_FAB_RQ_NAMO_FAX_S
 (Fetch AND and XOR, 32 bit).
-.RS
-.RE
 .SH NOTES
 .PP
 The default address format is FI_ADDR_GNI.
@@ -416,25 +341,25 @@ passing.
 FI_ADDR_STR is always parsed and converted to FI_ADDR_GNI for use within
 the GNI provider.
 .PP
-\f[I]FI_ADDR_STR\f[] is formatted as follows:
+\f[I]FI_ADDR_STR\f[R] is formatted as follows:
 gni;node;service;GNIX_AV_STR_ADDR_VERSION;device_addr;cdm_id;name_type;cm_nic_cdm_id;cookie;rx_ctx_cnt;key_offset
 .PP
-The GNI provider sets the domain attribute \f[I]cntr_cnt\f[] to the CQ
+The GNI provider sets the domain attribute \f[I]cntr_cnt\f[R] to the CQ
 limit divided by 2.
 .PP
-The GNI provider sets the domain attribute \f[I]cq_cnt\f[] to the CQ
+The GNI provider sets the domain attribute \f[I]cq_cnt\f[R] to the CQ
 limit divided by 2.
 .PP
-The GNI provider sets the domain attribute \f[I]ep_cnt\f[] to SIZE_MAX.
+The GNI provider sets the domain attribute \f[I]ep_cnt\f[R] to SIZE_MAX.
 .PP
 Completion queue events may report unknown source address information
-when using \f[I]FI_SOURCE\f[].
-If \f[I]FI_SOURCE_ERR\f[] is also specified, the source address
+when using \f[I]FI_SOURCE\f[R].
+If \f[I]FI_SOURCE_ERR\f[R] is also specified, the source address
 information will be reported in the err_data member of the struct
 fi_cq_err_entry populated by fi_cq_readerr.
 The err_data member will contain the source address information in the
 FI_ADDR_GNI address format.
-In order to populate the remote peer\[aq]s address vector with this
+In order to populate the remote peer\[cq]s address vector with this
 mechanism, the application must call fi_cq_readerr to get the source
 address followed by fi_av_insert on the populated err_data member.
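A hedged C sketch of that two-step recovery (the error-buffer sizing is illustrative):

```
#include <stdint.h>
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_eq.h>
#include <rdma/fi_errno.h>

/* Sketch: read the failed completion, then insert the reported source
 * address (FI_ADDR_GNI format) into the address vector. */
int insert_unknown_source(struct fid_cq *cq, struct fid_av *av,
			  fi_addr_t *src_addr)
{
	struct fi_cq_err_entry err = { 0 };
	uint8_t addr_buf[512];          /* illustrative sizing */
	ssize_t ret;

	err.err_data = addr_buf;
	err.err_data_size = sizeof(addr_buf);

	ret = fi_cq_readerr(cq, &err, 0);
	if (ret < 0)
		return (int)ret;

	/* err_data now holds the peer address; insert it into the AV */
	if (fi_av_insert(av, err.err_data, 1, src_addr, 0, NULL) != 1)
		return -FI_EINVAL;
	return 0;
}
```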
 .PP
@@ -446,7 +371,7 @@ buffer may be generated out of order with respect to the offset into the
 buffer into which the messages were received.
 .PP
 The GNI provider can use a maximum of 4K memory registrations per
-\f[I]node\f[] when using scalable memory registration.
+\f[I]node\f[R] when using scalable memory registration.
 Please consider this limitation when placing multiple processes on each
 node.
 .PP
 Setting TMPDIR to a non\-NULL value will change the directory for the
 authorization key mapping file, and setting GNIX_AK_FILENAME to a
 non\-NULL value will change the filename.
 The default path for the authorization key mapping file is
-\[aq]/tmp/gnix_vmdh_info\[aq].
+`/tmp/gnix_vmdh_info'.
 The recommendation is that the user should not change these environment
 variables unless necessary.
 .SH KNOWN BUGS
@@ -510,13 +435,13 @@ FI_OPT_MULTI_RECV is set to 0 and will return \-FI_EINVAL if an
 application attempts to set this value to zero.
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7), \f[C]fi_open_ops\f[](3), \f[C]fi_provider\f[](7),
-\f[C]fi_getinfo\f[](3) \f[C]fi_atomic\f[](3)
+\f[C]fabric\f[R](7), \f[C]fi_open_ops\f[R](3), \f[C]fi_provider\f[R](7),
+\f[C]fi_getinfo\f[R](3), \f[C]fi_atomic\f[R](3)
 .PP
-For more information on uGNI, see \f[I]Using the GNI and DMAPP APIs\f[]
+For more information on uGNI, see \f[I]Using the GNI and DMAPP APIs\f[R]
 (S\-2446\-3103, Cray Inc.).
 For more information on the GNI provider, see \f[I]An Implementation of
-OFI libfabric in Support of Multithreaded PGAS Solutions\f[] (PGAS
-\[aq]15).
+OFI libfabric in Support of Multithreaded PGAS Solutions\f[R] (PGAS
+\[cq]15).
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_hook.7 b/deps/libfabric/man/man7/fi_hook.7
index c01a43c716f90a86e83de8d8df1bc84833257693..c87554c43ca80557a6ac12287a606fb85a4f0367 100644
--- a/deps/libfabric/man/man7/fi_hook.7
+++ b/deps/libfabric/man/man7/fi_hook.7
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_hook" "7" "2019\-07\-19" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_hook" "7" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -21,20 +21,17 @@ available hooking providers.
 When multiple hooks are specified, the names must be separated by a
 semi\-colon.
 To obtain a list of hooking providers available on the current system,
-one can use the fi_info utility with the \[aq]\-\-env\[aq] command line
-option.
-Hooking providers are usually identified by \[aq]hook\[aq] appearing in
-the provider name.
+one can use the fi_info utility with the `\-\-env' command line option.
+Hooking providers are usually identified by `hook' appearing in the
+provider name.
 .PP
 Known hooking providers include the following:
 .TP
-.B \f[I]ofi_hook_perf\f[]
-This hooks \[aq]fast path\[aq] data operation calls.
+.B \f[I]ofi_hook_perf\f[R]
+This hooks `fast path' data operation calls.
 Performance data is captured on call entrance and exit, in order to
 provide an average of how long each call takes to complete.
 See the PERFORMANCE HOOKS section for available performance data.
-.RS
-.RE
 .SH PERFORMANCE HOOKS
 .PP
 The hook provider allows capturing inline performance data by accessing
@@ -53,16 +50,12 @@ The environment variable FI_PERF_CNTR is used to identify which
 performance counter is tracked.
 The following counters are available:
 .TP
-.B \f[I]cpu_cycles\f[]
+.B \f[I]cpu_cycles\f[R]
 Counts the number of CPU cycles each function takes to complete.
-.RS
-.RE
 .TP
-.B \f[I]cpu_instr\f[]
+.B \f[I]cpu_instr\f[R]
 Counts the number of CPU instructions each function takes to complete.
 This is the default performance counter if none is specified.
-.RS
-.RE
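For illustration, a small sketch of selecting the hook and counter from application code, before any libfabric call reads the environment:

```
#include <stdlib.h>

/* Sketch: layer ofi_hook_perf and track CPU cycles instead of the
 * default instruction counter. Call before fi_getinfo()/fi_fabric(). */
void enable_perf_hook(void)
{
	setenv("FI_HOOK", "perf", 1);
	setenv("FI_PERF_CNTR", "cpu_cycles", 1);
}
```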
 .SH LIMITATIONS
 .PP
 Hooking functionality is not available for providers built using the
@@ -74,6 +67,6 @@ Application that use FI_TRIGGER operations that attempt to hook calls
 will likely crash.
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7), \f[C]fi_provider\f[](7)
+\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_mlx.7 b/deps/libfabric/man/man7/fi_mlx.7
index 1a3df046f40eb1e9843f6ef981936bc7bc4f6921..3f80d238a175a940d8cf63bde7b6030db0b6b648 100644
--- a/deps/libfabric/man/man7/fi_mlx.7
+++ b/deps/libfabric/man/man7/fi_mlx.7
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_mlx" "7" "2019\-09\-17" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_mlx" "7" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -11,6 +11,6 @@ The mlx provider was deprecated and removed in libfabric 1.9 due to a
 lack of a maintainer.
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7), \f[C]fi_provider\f[](7),
+\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_mrail.7 b/deps/libfabric/man/man7/fi_mrail.7
index 4b82177bf3aa61ba834373254270f95f35f18c27..f3cc2344502b3dba2ea775dfc0b0ea141261b144 100644
--- a/deps/libfabric/man/man7/fi_mrail.7
+++ b/deps/libfabric/man/man7/fi_mrail.7
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_mrail" "7" "2020\-04\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_mrail" "7" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -32,31 +32,23 @@ FI_RMA capability.
 below).
 .SH SUPPORTED FEATURES
 .TP
-.B \f[I]Endpoint types\f[]
-The provider supports only \f[I]FI_EP_RDM\f[].
-.RS
-.RE
+.B \f[I]Endpoint types\f[R]
+The provider supports only \f[I]FI_EP_RDM\f[R].
 .TP
-.B \f[I]Endpoint capabilities\f[]
-The following data transfer interface is supported: \f[I]FI_MSG\f[],
-\f[I]FI_TAGGED\f[], \f[I]FI_RMA\f[].
-.RS
-.RE
+.B \f[I]Endpoint capabilities\f[R]
+The following data transfer interfaces are supported: \f[I]FI_MSG\f[R],
+\f[I]FI_TAGGED\f[R], \f[I]FI_RMA\f[R].
 .SH LIMITATIONS
 .PP
 Limitations of the underlying provider may show up as limitations of the
 mrail provider.
-.RS
-.RE
-mrail provider doesn\[aq]t allow pass\-through of any mode bits to the
+.PP
+The mrail provider doesn\[cq]t allow pass\-through of any mode bits to the
 underlying provider.
-.RS
-.RE
 .SS Unsupported features
 .PP
 The following are the major libfabric features that are not supported.
-Any other feature not listed in "Supported features" can be assumed as
-unsupported.
+Any other feature not listed in \[lq]Supported features\[rq] can be
+assumed as unsupported.
 .IP \[bu] 2
 FI_ATOMIC
 .IP \[bu] 2
@@ -73,7 +65,7 @@ Triggered operations
 .PP
 For messages (FI_MSG, FI_TAGGED), the provider uses different policies
 to send messages over one or more rails based on message size (See
+\f[I]FI_OFI_MRAIL_CONFIG\f[R] in the RUNTIME PARAMETERS section).
+\f[I]FI_OFI_MRIAL_CONFIG\f[R] in the RUNTIME PARAMETERS section).
 Ordering is guaranteed through the use of sequence numbers.
 .PP
 For RMA, the data is striped equally across all rails.
@@ -81,34 +73,28 @@ For RMA, the data is striped equally across all rails.
 .PP
 The ofi_mrail provider checks for the following environment variables.
 .TP
-.B \f[I]FI_OFI_MRAIL_ADDR\f[]
+.B \f[I]FI_OFI_MRAIL_ADDR\f[R]
 Comma delimited list of individual rail addresses.
 Each address can be an address in FI_ADDR_STR format, a host name, an IP
 address, or a netdev interface name.
-.RS
-.RE
 .TP
-.B \f[I]FI_OFI_MRAIL_ADDR_STRC\f[]
+.B \f[I]FI_OFI_MRAIL_ADDR_STRC\f[R]
 Deprecated.
-Replaced by \f[I]FI_OFI_MRAIL_ADDR\f[].
-.RS
-.RE
+Replaced by \f[I]FI_OFI_MRAIL_ADDR\f[R].
 .TP
-.B \f[I]FI_OFI_MRAIL_CONFIG\f[]
-Comma separated list of \f[C]<max_size>:<policy>\f[] pairs, sorted in
-ascending order of \f[C]<max_size>\f[].
+.B \f[I]FI_OFI_MRAIL_CONFIG\f[R]
+Comma separated list of \f[C]<max_size>:<policy>\f[R] pairs, sorted in
+ascending order of \f[C]<max_size>\f[R].
 Each pair indicates the rail sharing policy to be used for messages up
-to the size \f[C]<max_size>\f[] and not covered by all previous pairs.
-The value of \f[C]<policy>\f[] can be \f[I]fixed\f[] (a fixed rail is
-used), \f[I]round\-robin\f[] (one rail per message, selected in
-round\-robin fashion), or \f[I]striping\f[] (striping across all the
+to the size \f[C]<max_size>\f[R] and not covered by all previous pairs.
+The value of \f[C]<policy>\f[R] can be \f[I]fixed\f[R] (a fixed rail is
+used), \f[I]round\-robin\f[R] (one rail per message, selected in
+round\-robin fashion), or \f[I]striping\f[R] (striping across all the
 rails).
-The default configuration is \f[C]16384:fixed,ULONG_MAX:striping\f[].
+The default configuration is \f[C]16384:fixed,ULONG_MAX:striping\f[R].
 The value ULONG_MAX can be input as \-1.
-.RS
-.RE
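As a concrete sketch (interface names are hypothetical), the two variables could be set from code before opening the fabric:

```
#include <stdlib.h>

/* Sketch: two rails over hypothetical netdev interfaces; fixed rail up
 * to 16384 bytes, striping beyond (-1 stands for ULONG_MAX, as noted). */
void configure_mrail(void)
{
	setenv("FI_OFI_MRAIL_ADDR", "eth0,eth1", 1);
	setenv("FI_OFI_MRAIL_CONFIG", "16384:fixed,-1:striping", 1);
}
```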
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7), \f[C]fi_provider\f[](7), \f[C]fi_getinfo\f[](3)
+\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_netdir.7 b/deps/libfabric/man/man7/fi_netdir.7
index 15031b255367cf3cda5c8ba1befef491210255ac..f2473814c7f5a87ac6dda9811625a870bd255a0b 100644
--- a/deps/libfabric/man/man7/fi_netdir.7
+++ b/deps/libfabric/man/man7/fi_netdir.7
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_netdir" "7" "2019\-11\-20" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_netdir" "7" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -25,124 +25,88 @@ service provider interface (SPI) for their hardware.
 The Network Direct provider supports the following features defined for
 the libfabric API:
 .TP
-.B \f[I]Endpoint types\f[]
+.B \f[I]Endpoint types\f[R]
 The provider supports the FI_EP_MSG endpoint type.
-.RS
-.RE
 .TP
-.B \f[I]Memory registration modes\f[]
-The provider implements the \f[I]FI_MR_BASIC\f[] memory registration
+.B \f[I]Memory registration modes\f[R]
+The provider implements the \f[I]FI_MR_BASIC\f[R] memory registration
 mode.
-.RS
-.RE
 .TP
-.B \f[I]Data transfer operations\f[]
+.B \f[I]Data transfer operations\f[R]
 The following data transfer interfaces are supported for the following
-endpoint types: \f[I]FI_MSG\f[], \f[I]FI_RMA\f[].
+endpoint types: \f[I]FI_MSG\f[R], \f[I]FI_RMA\f[R].
 See DATA TRANSFER OPERATIONS below for more details.
-.RS
-.RE
 .TP
-.B \f[I]Modes\f[]
+.B \f[I]Modes\f[R]
 The Network Direct provider requires applications to support the
 following mode: FI_LOCAL_MR for all applications.
-.RS
-.RE
 .TP
-.B \f[I]Addressing Formats\f[]
+.B \f[I]Addressing Formats\f[R]
 Supported addressing formats include FI_SOCKADDR, FI_SOCKADDR_IN,
 FI_SOCKADDR_IN6
-.RS
-.RE
 .TP
-.B \f[I]Progress\f[]
+.B \f[I]Progress\f[R]
 The Network Direct provider supports FI_PROGRESS_AUTO: Asynchronous
 operations make forward progress automatically.
-.RS
-.RE
 .TP
-.B \f[I]Operation flags\f[]
+.B \f[I]Operation flags\f[R]
 The provider supports FI_INJECT, FI_COMPLETION, FI_TRANSMIT_COMPLETE,
 FI_INJECT_COMPLETE, FI_DELIVERY_COMPLETE, FI_SELECTIVE_COMPLETION
-.RS
-.RE
 .TP
-.B \f[I]Completion ordering\f[]
+.B \f[I]Completion ordering\f[R]
 RX/TX contexts: FI_ORDER_STRICT
-.RS
-.RE
 .TP
-.B \f[I]Other supported features\f[]
+.B \f[I]Other supported features\f[R]
 Multiple input/output vectors (IOV) are supported for FI_RMA read/write
 and FI_MSG receive/transmit operations.
-.RS
-.RE
 .SH LIMITATIONS
 .TP
-.B \f[I]Memory Regions\f[]
+.B \f[I]Memory Regions\f[R]
 Only FI_MR_BASIC mode is supported.
 Adding regions via s/g list is supported only up to a s/g list size of
 1.
 No support for binding memory regions to a counter.
-.RS
-.RE
 .TP
-.B \f[I]Wait objects\f[]
+.B \f[I]Wait objects\f[R]
 Wait object and wait sets are not supported.
-.RS
-.RE
 .TP
-.B \f[I]Resource Management\f[]
+.B \f[I]Resource Management\f[R]
 The application has to make sure CQs are not overrun, as this cannot be
 detected by the provider.
-.RS
-.RE
 .TP
-.B \f[I]Unsupported Endpoint types\f[]
+.B \f[I]Unsupported Endpoint types\f[R]
 FI_EP_DGRAM, FI_EP_RDM
-.RS
-.RE
 .TP
-.B \f[I]Other unsupported features\f[]
+.B \f[I]Other unsupported features\f[R]
 Scalable endpoints, FABRIC_DIRECT
-.RS
-.RE
 .TP
-.B \f[I]Unsupported features specific to MSG endpoints\f[]
+.B \f[I]Unsupported features specific to MSG endpoints\f[R]
 FI_SOURCE, FI_TAGGED, FI_CLAIM, fi_ep_alias, shared TX context,
 operations.
-.RS
-.RE
 .SH RUNTIME PARAMETERS
 .PP
 The Network Direct provider checks for the following environment
 variables.
 .SS Variables specific to RDM endpoints
 .TP
-.B \f[I]FI_NETDIR_INLINETHR\f[]
+.B \f[I]FI_NETDIR_INLINETHR\f[R]
 The size (default: 8 Kbyte) of transmitted data that can be inlined and
 of preposted data for the unexpected receive queue.
-.RS
-.RE
 .TP
-.B \f[I]FI_NETDIR_PREPOSTCNT\f[]
+.B \f[I]FI_NETDIR_PREPOSTCNT\f[R]
 The number of pre\-registered buffers between the endpoints that do not
 require internal ACK messages; must be a power of 2 (default: 8).
-.RS
-.RE
 .TP
-.B \f[I]FI_NETDIR_PREPOSTBUFCNT\f[]
+.B \f[I]FI_NETDIR_PREPOSTBUFCNT\f[R]
 The number of preposted arrays of buffers; must be a power of 2
 (default: 1).
-.RS
-.RE
 .SS Environment variables notes
 .PP
 The fi_info utility gives up\-to\-date information on
 environment variables: fi_info \-p netdir \-e
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7), \f[C]fi_open_ops\f[](3), \f[C]fi_provider\f[](7),
-\f[C]fi_getinfo\f[](3) \f[C]fi_atomic\f[](3)
+\f[C]fabric\f[R](7), \f[C]fi_open_ops\f[R](3), \f[C]fi_provider\f[R](7),
+\f[C]fi_getinfo\f[R](3), \f[C]fi_atomic\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_provider.7 b/deps/libfabric/man/man7/fi_provider.7
index 0b1c126145ff0287db3df62a48a7b42325e80bf2..ef050cfe6b3c8664c5b4bbc8661b03d7ba0a85fa 100644
--- a/deps/libfabric/man/man7/fi_provider.7
+++ b/deps/libfabric/man/man7/fi_provider.7
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_provider" "7" "2020\-02\-13" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_provider" "7" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -20,86 +20,74 @@ This distribution of libfabric contains the following providers
 (although more may be available via run\-time plug\-ins):
 .SS Core providers
 .TP
-.B \f[I]GNI\f[]
+.B \f[I]GNI\f[R]
 A provider for the Aries interconnect in Cray XC(TM) systems utilizing
-the user\-space \f[I]Generic Networking Interface\f[].
-See \f[C]fi_gni\f[](7) for more information.
-.RS
-.RE
+the user\-space \f[I]Generic Networking Interface\f[R].
+See \f[C]fi_gni\f[R](7) for more information.
 .TP
-.B \f[I]PSM\f[]
+.B \f[I]PSM\f[R]
 High\-speed InfiniBand networking from Intel.
-See \f[C]fi_psm\f[](7) for more information.
-.RS
-.RE
+See \f[C]fi_psm\f[R](7) for more information.
+.TP
+.B \f[I]PSM2\f[R]
+High\-speed Omni\-Path networking from Intel.
+See \f[C]fi_psm2\f[R](7) for more information.
+.TP
+.B \f[I]PSM3\f[R]
+High\-speed Ethernet networking from Intel.
+See \f[C]fi_psm3\f[R](7) for more information.
 .TP
-.B \f[I]Sockets\f[]
+.B \f[I]Sockets\f[R]
 A general purpose provider that can be used on any network that supports
 TCP/UDP sockets.
 This provider is not intended to provide performance improvements over
 regular TCP/UDP sockets, but rather to allow developers to write, test,
 and debug application code even on platforms that do not have
 high\-speed networking.
-See \f[C]fi_sockets\f[](7) for more information.
-.RS
-.RE
+See \f[C]fi_sockets\f[R](7) for more information.
 .TP
-.B \f[I]usNIC\f[]
+.B \f[I]usNIC\f[R]
 Ultra low latency Ethernet networking over Cisco userspace VIC adapters.
-See \f[C]fi_usnic\f[](7) for more information.
-.RS
-.RE
+See \f[C]fi_usnic\f[R](7) for more information.
 .TP
-.B \f[I]Verbs\f[]
+.B \f[I]Verbs\f[R]
 This provider uses the Linux Verbs API for network transport.
 Application performance is expected to be similar to that of
 the native Linux Verbs API.
 Analogous to the Sockets provider, the Verbs provider is intended to
 enable developers to write, test, and debug application code on
 platforms that only have Linux Verbs\-based networking.
-See \f[C]fi_verbs\f[](7) for more information.
-.RS
-.RE
+See \f[C]fi_verbs\f[R](7) for more information.
 .TP
-.B \f[I]Blue Gene/Q\f[]
-See \f[C]fi_bgq\f[](7) for more information.
-.RS
-.RE
+.B \f[I]Blue Gene/Q\f[R]
+See \f[C]fi_bgq\f[R](7) for more information.
 .TP
-.B \f[I]EFA\f[]
+.B \f[I]EFA\f[R]
 A provider for the Amazon EC2 Elastic Fabric Adapter
 (EFA) (https://aws.amazon.com/hpc/efa/), a custom\-built OS bypass
 hardware interface for inter\-instance communication on EC2.
-See \f[C]fi_efa\f[](7) for more information.
-.RS
-.RE
+See \f[C]fi_efa\f[R](7) for more information.
 .TP
-.B \f[I]SHM\f[]
+.B \f[I]SHM\f[R]
 A provider for intranode communication using shared memory.
 The provider makes use of the Linux kernel feature Cross Memory Attach
-(CMA) which allows processes to have full access to another process\[aq]
+(CMA) which allows processes to have full access to another process\[cq]s
 address space.
-See \f[C]fi_shm\f[](7) for more information.
-.RS
-.RE
+See \f[C]fi_shm\f[R](7) for more information.
 .SS Utility providers
 .TP
-.B \f[I]RxM\f[]
+.B \f[I]RxM\f[R]
 The RxM provider (ofi_rxm) is a utility provider that supports RDM
 endpoints emulated over MSG endpoints of a core provider.
-See \f[C]fi_rxm\f[](7) for more information.
-.RS
-.RE
+See \f[C]fi_rxm\f[R](7) for more information.
 .TP
-.B \f[I]RxD\f[]
+.B \f[I]RxD\f[R]
 The RxD provider (ofi_rxd) is a utility provider that supports RDM
 endpoints emulated over DGRAM endpoints of a core provider.
-See \f[C]fi_rxd\f[](7) for more information.
-.RS
-.RE
+See \f[C]fi_rxd\f[R](7) for more information.
 .SS Special providers
 .TP
-.B \f[I]Hook\f[]
+.B \f[I]Hook\f[R]
 The hook provider is a special type of provider that can layer over any
 other provider, unless FI_FABRIC_DIRECT is used.
 The hook provider is always available, but has no impact unless enabled.
@@ -107,9 +95,7 @@ When enabled, the hook provider will intercept all calls to the
 underlying core or utility provider(s).
 The hook provider is useful for capturing performance data or providing
 debugging information, even in release builds of the library.
-See \f[C]fi_hook\f[](7) for more information.
-.RS
-.RE
+See \f[C]fi_hook\f[R](7) for more information.
 .SH CORE VERSUS UTILITY PROVIDERS
 .PP
 Core providers implement the libfabric interfaces directly over
@@ -130,9 +116,9 @@ datagram endpoints.
 The utility providers will not layer over the sockets provider unless it
 is explicitly requested.
 .PP
-Utility providers show up as a component in the core provider\[aq]s
+Utility providers show up as a component in the core provider\[cq]s
 component list.
-See \f[C]fi_fabric\f[](3).
+See \f[C]fi_fabric\f[R](3).
 Utility providers are enabled automatically for core providers that do
 not support the feature set requested by an application.
 .SH PROVIDER REQUIREMENTS
@@ -160,15 +146,18 @@ All endpoints must support the message queue data transfer interface
 .IP \[bu] 2
 An endpoint that advertises support for a specific endpoint capability
 must support the corresponding data transfer interface.
+.RS 2
 .IP \[bu] 2
 FI_ATOMIC \- fi_ops_atomic
 .IP \[bu] 2
 FI_RMA \- fi_ops_rma
 .IP \[bu] 2
 FI_TAGGED \- fi_ops_tagged
+.RE
 .IP \[bu] 2
 Endpoints must support all transmit and receive operations for any data
 transfer interface that they support.
+.RS 2
 .IP \[bu] 2
 Exception: If an operation is only usable for an operation that the
 provider does not support, and support for that operation is conveyed
@@ -180,12 +169,13 @@ For example, if the provider does not support injected data, it can set
 the attribute inject_size = 0, and fail all fi_inject operations.
 .RE
 .IP \[bu] 2
-The framework supplies wrappers around the \[aq]msg\[aq] operations that
-can be used.
+The framework supplies wrappers around the `msg' operations that can be
+used.
 For example, the framework implements the sendv() msg operation by
 calling sendmsg().
 Providers may reference the general operation, and supply only the
 sendmsg() implementation.
+.RE
 .IP \[bu] 2
 Providers must set all operations to an implementation.
 Function pointers may not be left NULL or uninitialized.
@@ -193,12 +183,14 @@ The framework supplies empty functions that return \-FI_ENOSYS which can
 be used for this purpose.
 .IP \[bu] 2
 Endpoints must support the CM interface as follows:
+.RS 2
 .IP \[bu] 2
 FI_EP_MSG endpoints must support all CM operations.
 .IP \[bu] 2
 FI_EP_DGRAM endpoints must support CM getname and setname.
 .IP \[bu] 2
 FI_EP_RDM endpoints must support CM getname and setname.
+.RE
 .IP \[bu] 2
 Providers that support connectionless endpoints must support all AV
 operations (fi_ops_av).
@@ -207,6 +199,7 @@ Providers that support memory registration, must support all MR
 operations (fi_ops_mr).
 .IP \[bu] 2
 Providers should support both completion queues and counters.
+.RS 2
 .IP \[bu] 2
 If FI_RMA_EVENT is not supported, counter support is limited to local
 events only.
@@ -218,10 +211,11 @@ Providers that support FI_REMOTE_CQ_DATA shall support
 FI_CQ_FORMAT_DATA.
 .IP \[bu] 2
 Providers that support FI_TAGGED shall support FI_CQ_FORMAT_TAGGED.
+.RE
 .IP \[bu] 2
 A provider is expected to be forward compatible, and must be able to be
-compiled against expanded \f[C]fi_xxx_ops\f[] structures that define new
-functions added after the provider was written.
+compiled against expanded \f[C]fi_xxx_ops\f[R] structures that define
+new functions added after the provider was written.
 Any unknown functions must be set to NULL.
 .IP \[bu] 2
 Providers shall document in their man page which features they support,
@@ -237,55 +231,41 @@ Logging is performed using the FI_ERR, FI_LOG, and FI_DEBUG macros.
 .IP
 .nf
 \f[C]
-#define\ FI_ERR(prov_name,\ subsystem,\ ...)
+#define FI_ERR(prov_name, subsystem, ...)
 
-#define\ FI_LOG(prov_name,\ prov,\ level,\ subsystem,\ ...)
+#define FI_LOG(prov_name, prov, level, subsystem, ...)
 
-#define\ FI_DEBUG(prov_name,\ subsystem,\ ...)
-\f[]
+#define FI_DEBUG(prov_name, subsystem, ...)
+\f[R]
 .fi
 .SS ARGUMENTS
 .TP
-.B \f[I]prov_name\f[]
+.B \f[I]prov_name\f[R]
 String representing the provider name.
-.RS
-.RE
 .TP
-.B \f[I]prov\f[]
+.B \f[I]prov\f[R]
 Provider context structure.
-.RS
-.RE
 .TP
-.B \f[I]level\f[]
+.B \f[I]level\f[R]
 Log level associated with log statement.
-.RS
-.RE
 .TP
-.B \f[I]subsystem\f[]
+.B \f[I]subsystem\f[R]
 Subsystem being logged from.
-.RS
-.RE
 .SS DESCRIPTION
 .TP
-.B \f[I]FI_ERR\f[]
+.B \f[I]FI_ERR\f[R]
 Always logged.
-.RS
-.RE
 .TP
-.B \f[I]FI_LOG\f[]
+.B \f[I]FI_LOG\f[R]
 Logged if the intended provider, log level, and subsystem parameters
 match the user supplied values.
-.RS
-.RE
 .TP
-.B \f[I]FI_DEBUG\f[]
-Logged if configured with the \-\-enable\-debug flag.
-.RS
-.RE
+.B \f[I]FI_DEBUG\f[R]
+Logged if configured with the \-\-enable\-debug flag.
 .SH SEE ALSO
 .PP
-\f[C]fi_gni\f[](7), \f[C]fi_hook\f[](7), \f[C]fi_psm\f[](7),
-\f[C]fi_sockets\f[](7), \f[C]fi_usnic\f[](7), \f[C]fi_verbs\f[](7),
-\f[C]fi_bgq\f[](7),
+\f[C]fi_gni\f[R](7), \f[C]fi_hook\f[R](7), \f[C]fi_psm\f[R](7),
+\f[C]fi_sockets\f[R](7), \f[C]fi_usnic\f[R](7), \f[C]fi_verbs\f[R](7),
+\f[C]fi_bgq\f[R](7)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_psm.7 b/deps/libfabric/man/man7/fi_psm.7
index 513355278be047e2a8876637303f6ecb45b7a62d..7746ccc4fa2bcb622827305df8b5244a89482019 100644
--- a/deps/libfabric/man/man7/fi_psm.7
+++ b/deps/libfabric/man/man7/fi_psm.7
@@ -1,96 +1,87 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_psm" "7" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_psm" "7" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
 fi_psm \- The PSM Fabric Provider
 .SH OVERVIEW
 .PP
-The \f[I]psm\f[] provider runs over the PSM 1.x interface that is
+The \f[I]psm\f[R] provider runs over the PSM 1.x interface that is
 currently supported by the Intel TrueScale Fabric.
 PSM provides tag\-matching message queue functions that are optimized
 for MPI implementations.
 PSM also has limited Active Message support, which is not officially
 published but is quite stable and well documented in the source code
 (part of the OFED release).
-The \f[I]psm\f[] provider makes use of both the tag\-matching message
+The \f[I]psm\f[R] provider makes use of both the tag\-matching message
 queue functions and the Active Message functions to support a variety of
 libfabric data transfer APIs, including tagged message queue, message
 queue, RMA, and atomic operations.
 .PP
-The \f[I]psm\f[] provider can work with the psm2\-compat library, which
+The \f[I]psm\f[R] provider can work with the psm2\-compat library, which
 exposes a PSM 1.x interface over the Intel Omni\-Path Fabric.
 .SH LIMITATIONS
 .PP
-The \f[I]psm\f[] provider doesn\[aq]t support all the features defined
+The \f[I]psm\f[R] provider doesn\[cq]t support all the features defined
 in the libfabric API.
 Here are some of the limitations:
 .TP
 .B Endpoint types
-Only support non\-connection based types \f[I]FI_DGRAM\f[] and
-\f[I]FI_RDM\f[]
-.RS
-.RE
+Only supports the non\-connection based types \f[I]FI_DGRAM\f[R] and
+\f[I]FI_RDM\f[R].
 .TP
 .B Endpoint capabilities
 Endpoints can support any combination of data transfer capabilities
-\f[I]FI_TAGGED\f[], \f[I]FI_MSG\f[], \f[I]FI_ATOMICS\f[], and
-\f[I]FI_RMA\f[].
-These capabilities can be further refined by \f[I]FI_SEND\f[],
-\f[I]FI_RECV\f[], \f[I]FI_READ\f[], \f[I]FI_WRITE\f[],
-\f[I]FI_REMOTE_READ\f[], and \f[I]FI_REMOTE_WRITE\f[] to limit the
+\f[I]FI_TAGGED\f[R], \f[I]FI_MSG\f[R], \f[I]FI_ATOMICS\f[R], and
+\f[I]FI_RMA\f[R].
+These capabilities can be further refined by \f[I]FI_SEND\f[R],
+\f[I]FI_RECV\f[R], \f[I]FI_READ\f[R], \f[I]FI_WRITE\f[R],
+\f[I]FI_REMOTE_READ\f[R], and \f[I]FI_REMOTE_WRITE\f[R] to limit the
 direction of operations.
 The limitation is that no two endpoints can have overlapping receive or
 RMA target capabilities in any of the above categories.
-For example it is fine to have two endpoints with \f[I]FI_TAGGED\f[] |
-\f[I]FI_SEND\f[], one endpoint with \f[I]FI_TAGGED\f[] |
-\f[I]FI_RECV\f[], one endpoint with \f[I]FI_MSG\f[], one endpoint with
-\f[I]FI_RMA\f[] | \f[I]FI_ATOMICS\f[].
-But it is not allowed to have two endpoints with \f[I]FI_TAGGED\f[], or
-two endpoints with \f[I]FI_RMA\f[].
-.RS
-.RE
-.PP
-\f[I]FI_MULTI_RECV\f[] is supported for non\-tagged message queue only.
-.PP
-Other supported capabilities include \f[I]FI_TRIGGER\f[].
+For example it is fine to have two endpoints with \f[I]FI_TAGGED\f[R] |
+\f[I]FI_SEND\f[R], one endpoint with \f[I]FI_TAGGED\f[R] |
+\f[I]FI_RECV\f[R], one endpoint with \f[I]FI_MSG\f[R], one endpoint with
+\f[I]FI_RMA\f[R] | \f[I]FI_ATOMICS\f[R].
+But it is not allowed to have two endpoints with \f[I]FI_TAGGED\f[R], or
+two endpoints with \f[I]FI_RMA\f[R].
+.PP
+\f[I]FI_MULTI_RECV\f[R] is supported for non\-tagged message queue only.
+.PP
+Other supported capabilities include \f[I]FI_TRIGGER\f[R].
 .TP
 .B Modes
-\f[I]FI_CONTEXT\f[] is required for the \f[I]FI_TAGGED\f[] and
-\f[I]FI_MSG\f[] capabilities.
+\f[I]FI_CONTEXT\f[R] is required for the \f[I]FI_TAGGED\f[R] and
+\f[I]FI_MSG\f[R] capabilities.
 That means, any request belonging to these two categories that generates
 a completion must pass as the operation context a valid pointer to type
-\f[I]struct fi_context\f[], and the space referenced by the pointer must
-remain untouched until the request has completed.
-If none of \f[I]FI_TAGGED\f[] and \f[I]FI_MSG\f[] is asked for, the
-\f[I]FI_CONTEXT\f[] mode is not required.
-.RS
-.RE
+\f[I]struct fi_context\f[R], and the space referenced by the pointer
+must remain untouched until the request has completed.
+If none of \f[I]FI_TAGGED\f[R] and \f[I]FI_MSG\f[R] is asked for, the
+\f[I]FI_CONTEXT\f[R] mode is not required.
 .TP
 .B Progress
-The \f[I]psm\f[] provider requires manual progress.
-The application is expected to call \f[I]fi_cq_read\f[] or
-\f[I]fi_cntr_read\f[] function from time to time when no other libfabric
-function is called to ensure progress is made in a timely manner.
+The \f[I]psm\f[R] provider requires manual progress.
+The application is expected to call the \f[I]fi_cq_read\f[R] or
+\f[I]fi_cntr_read\f[R] function from time to time when no other
+libfabric function is called to ensure progress is made in a timely
+manner.
 The provider does support auto progress mode.
 However, the performance can be significantly impacted if the
 application purely depends on the provider to make auto progress.
-.RS
-.RE
 .TP
 .B Unsupported features
 These features are unsupported: connection management, scalable
 endpoint, passive endpoint, shared receive context, send/inject with
 immediate data.
-.RS
-.RE
 .SH RUNTIME PARAMETERS
 .PP
-The \f[I]psm\f[] provider checks for the following environment
+The \f[I]psm\f[R] provider checks for the following environment
 variables:
 .TP
-.B \f[I]FI_PSM_UUID\f[]
+.B \f[I]FI_PSM_UUID\f[R]
 PSM requires that each job has a unique ID (UUID).
 All the processes in the same job need to use the same UUID in order to
 be able to talk to each other.
@@ -98,35 +89,31 @@ The PSM reference manual advises to keep UUID unique to each job.
 In practice, it generally works fine to reuse a UUID as long as (1) no two
 jobs with the same UUID are running at the same time; and (2) previous
 jobs with the same UUID have exited normally.
-If running into "resource busy" or "connection failure" issues with
-unknown reason, it is advisable to manually set the UUID to a value
-different from the default.
-.RS
-.RE
+If running into \[lq]resource busy\[rq] or \[lq]connection failure\[rq]
+issues for an unknown reason, it is advisable to manually set the UUID to
+a value different from the default.
 .PP
 The default UUID is 0FFF0FFF\-0000\-0000\-0000\-0FFF0FFF0FFF.
 .TP
-.B \f[I]FI_PSM_NAME_SERVER\f[]
-The \f[I]psm\f[] provider has a simple built\-in name server that can be
-used to resolve an IP address or host name into a transport address
-needed by the \f[I]fi_av_insert\f[] call.
+.B \f[I]FI_PSM_NAME_SERVER\f[R]
+The \f[I]psm\f[R] provider has a simple built\-in name server that can
+be used to resolve an IP address or host name into a transport address
+needed by the \f[I]fi_av_insert\f[R] call.
 The main purpose of this name server is to allow simple client\-server
-type applications (such as those in \f[I]fabtests\f[]) to be written
+type applications (such as those in \f[I]fabtests\f[R]) to be written
 purely with libfabric, without using any out\-of\-band communication
 mechanism.
 For such applications, the server would run first to allow endpoints to be
 created and registered with the name server, and then the client would
-call \f[I]fi_getinfo\f[] with the \f[I]node\f[] parameter set to the IP
-address or host name of the server.
-The resulting \f[I]fi_info\f[] structure would have the transport
-address of the endpoint created by the server in the \f[I]dest_addr\f[]
+call \f[I]fi_getinfo\f[R] with the \f[I]node\f[R] parameter set to the
+IP address or host name of the server.
+The resulting \f[I]fi_info\f[R] structure would have the transport
+address of the endpoint created by the server in the \f[I]dest_addr\f[R]
 field.
-Optionally the \f[I]service\f[] parameter can be used in addition to
-\f[I]node\f[].
-Notice that the \f[I]service\f[] number is interpreted by the provider
+Optionally the \f[I]service\f[R] parameter can be used in addition to
+\f[I]node\f[R].
+Notice that the \f[I]service\f[R] number is interpreted by the provider
 and is not a TCP/IP port number.
-.RS
-.RE
 .PP
 The name server is on by default.
 It can be turned off by setting the variable to 0.
@@ -136,17 +123,15 @@ created when the name server is on.
 The provider detects OpenMPI and MPICH runs and changes the default
 setting to off.
 .TP
-.B \f[I]FI_PSM_TAGGED_RMA\f[]
+.B \f[I]FI_PSM_TAGGED_RMA\f[R]
 The RMA functions are implemented on top of the PSM Active Message
 functions.
 The Active Message functions have a limit on the size of data that can
 be transferred in a single message.
 Large transfers can be divided into small chunks and be pipe\-lined.
 However, the bandwidth is sub\-optimal this way.
-.RS
-.RE
 .PP
-The \f[I]psm\f[] provider use PSM tag\-matching message queue functions
+The \f[I]psm\f[R] provider uses PSM tag\-matching message queue functions
 to achieve higher bandwidth for large size RMA.
 For this purpose, a bit is reserved from the tag space to separate the
 RMA traffic from the regular tagged message queue.
@@ -154,62 +139,53 @@ RMA traffic from the regular tagged message queue.
 The option is on by default.
 To turn it off set the variable to 0.
 .TP
-.B \f[I]FI_PSM_AM_MSG\f[]
-The \f[I]psm\f[] provider implements the non\-tagged message queue over
+.B \f[I]FI_PSM_AM_MSG\f[R]
+The \f[I]psm\f[R] provider implements the non\-tagged message queue over
 the PSM tag\-matching message queue.
 One tag bit is reserved for this purpose.
 Alternatively, the non\-tagged message queue can be implemented over
 Active Message.
 This experimental feature has slightly larger latency.
-.RS
-.RE
 .PP
 This option is off by default.
 To turn it on set the variable to 1.
 .TP
-.B \f[I]FI_PSM_DELAY\f[]
+.B \f[I]FI_PSM_DELAY\f[R]
 Time (seconds) to sleep before closing PSM endpoints.
 This is a workaround for a bug in some versions of the PSM library.
-.RS
-.RE
 .PP
 The default setting is 1.
 .TP
-.B \f[I]FI_PSM_TIMEOUT\f[]
+.B \f[I]FI_PSM_TIMEOUT\f[R]
 Timeout (seconds) for gracefully closing PSM endpoints.
 A forced closing will be issued if timeout expires.
-.RS
-.RE
 .PP
 The default setting is 5.
 .TP
-.B \f[I]FI_PSM_PROG_INTERVAL\f[]
+.B \f[I]FI_PSM_PROG_INTERVAL\f[R]
 When auto progress is enabled (asked via the hints to
-\f[I]fi_getinfo\f[]), a progress thread is created to make progress
+\f[I]fi_getinfo\f[R]), a progress thread is created to make progress
 calls from time to time.
 This option sets the interval (microseconds) between progress calls.
-.RS
-.RE
 .PP
 The default setting is 1 if affinity is set, or 1000 if not.
-See \f[I]FI_PSM_PROG_AFFINITY\f[].
+See \f[I]FI_PSM_PROG_AFFINITY\f[R].
 .TP
-.B \f[I]FI_PSM_PROG_AFFINITY\f[]
+.B \f[I]FI_PSM_PROG_AFFINITY\f[R]
 When set, specifies the set of CPU cores to pin the progress thread
 to.
 The format is
-\f[C]<start>[:<end>[:<stride>]][,<start>[:<end>[:<stride>]]]*\f[], where
-each triplet \f[C]<start>:<end>:<stride>\f[] defines a block of
+\f[C]<start>[:<end>[:<stride>]][,<start>[:<end>[:<stride>]]]*\f[R],
+where each triplet \f[C]<start>:<end>:<stride>\f[R] defines a block of
 core_ids.
-Both \f[C]<start>\f[] and \f[C]<end>\f[] can be either the
-\f[C]core_id\f[] (when >=0) or \f[C]core_id\ \-\ num_cores\f[] (when
+Both \f[C]<start>\f[R] and \f[C]<end>\f[R] can be either the
+\f[C]core_id\f[R] (when >=0) or \f[C]core_id \- num_cores\f[R] (when
 <0).
-.RS
-.RE
 .PP
 By default affinity is not set.
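A worked example of the triplet syntax (core numbers are illustrative):

```
#include <stdlib.h>

/* Sketch: pin the progress thread to cores 0, 2, 4 plus the
 * second-to-last core (-2 means core_id - num_cores, per the format). */
void set_psm_affinity(void)
{
	setenv("FI_PSM_PROG_AFFINITY", "0:4:2,-2", 1);
}
```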
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7), \f[C]fi_provider\f[](7), \f[C]fi_psm2\f[](7),
+\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_psm2\f[R](7),
+\f[C]fi_psm3\f[R](7)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_psm2.7 b/deps/libfabric/man/man7/fi_psm2.7
index 0cee4e13c653425837ab74ca994bcf25279f5468..e2983c04759537d2090bcad3597a43588e64c1c1 100644
--- a/deps/libfabric/man/man7/fi_psm2.7
+++ b/deps/libfabric/man/man7/fi_psm2.7
@@ -1,98 +1,90 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_psm2" "7" "2019\-04\-09" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_psm2" "7" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
 fi_psm2 \- The PSM2 Fabric Provider
 .SH OVERVIEW
 .PP
-The \f[I]psm2\f[] provider runs over the PSM 2.x interface that is
+The \f[I]psm2\f[R] provider runs over the PSM 2.x interface that is
 supported by the Intel Omni\-Path Fabric.
 PSM 2.x has all the PSM 1.x features plus a set of new functions with
 enhanced capabilities.
-Since PSM 1.x and PSM 2.x are not ABI compatible the \f[I]psm2\f[]
-provider only works with PSM 2.x and doesn\[aq]t support Intel TrueScale
+Since PSM 1.x and PSM 2.x are not ABI compatible the \f[I]psm2\f[R]
+provider only works with PSM 2.x and doesn\[cq]t support Intel TrueScale
 Fabric.
 .SH LIMITATIONS
 .PP
-The \f[I]psm2\f[] provider doesn\[aq]t support all the features defined
+The \f[I]psm2\f[R] provider doesn\[cq]t support all the features defined
 in the libfabric API.
 Here are some of the limitations:
 .TP
 .B Endpoint types
-Only support non\-connection based types \f[I]FI_DGRAM\f[] and
-\f[I]FI_RDM\f[]
-.RS
-.RE
+Only supports the non\-connection based types \f[I]FI_DGRAM\f[R] and
+\f[I]FI_RDM\f[R].
 .TP
 .B Endpoint capabilities
 Endpoints can support any combination of data transfer capabilities
-\f[I]FI_TAGGED\f[], \f[I]FI_MSG\f[], \f[I]FI_ATOMICS\f[], and
-\f[I]FI_RMA\f[].
-These capabilities can be further refined by \f[I]FI_SEND\f[],
-\f[I]FI_RECV\f[], \f[I]FI_READ\f[], \f[I]FI_WRITE\f[],
-\f[I]FI_REMOTE_READ\f[], and \f[I]FI_REMOTE_WRITE\f[] to limit the
+\f[I]FI_TAGGED\f[R], \f[I]FI_MSG\f[R], \f[I]FI_ATOMICS\f[R], and
+\f[I]FI_RMA\f[R].
+These capabilities can be further refined by \f[I]FI_SEND\f[R],
+\f[I]FI_RECV\f[R], \f[I]FI_READ\f[R], \f[I]FI_WRITE\f[R],
+\f[I]FI_REMOTE_READ\f[R], and \f[I]FI_REMOTE_WRITE\f[R] to limit the
 direction of operations.
-.RS
-.RE
 .PP
-\f[I]FI_MULTI_RECV\f[] is supported for non\-tagged message queue only.
+\f[I]FI_MULTI_RECV\f[R] is supported for non\-tagged message queue only.
 .PP
 Scalable endpoints are supported if the underlying PSM2 library supports
 multiple endpoints.
 This condition must be satisfied both when the provider is built and
 when the provider is used.
-See the \f[I]Scalable endpoints\f[] section for more information.
+See the \f[I]Scalable endpoints\f[R] section for more information.
 .PP
-Other supported capabilities include \f[I]FI_TRIGGER\f[],
-\f[I]FI_REMOTE_CQ_DATA\f[], \f[I]FI_RMA_EVENT\f[], \f[I]FI_SOURCE\f[],
-and \f[I]FI_SOURCE_ERR\f[].
-Furthermore, \f[I]FI_NAMED_RX_CTX\f[] is supported when scalable
+Other supported capabilities include \f[I]FI_TRIGGER\f[R],
+\f[I]FI_REMOTE_CQ_DATA\f[R], \f[I]FI_RMA_EVENT\f[R],
+\f[I]FI_SOURCE\f[R], and \f[I]FI_SOURCE_ERR\f[R].
+Furthermore, \f[I]FI_NAMED_RX_CTX\f[R] is supported when scalable
 endpoints are enabled.
 .TP
 .B Modes
-\f[I]FI_CONTEXT\f[] is required for the \f[I]FI_TAGGED\f[] and
-\f[I]FI_MSG\f[] capabilities.
+\f[I]FI_CONTEXT\f[R] is required for the \f[I]FI_TAGGED\f[R] and
+\f[I]FI_MSG\f[R] capabilities.
 That means, any request belonging to these two categories that generates
 a completion must pass as the operation context a valid pointer to type
-\f[I]struct fi_context\f[], and the space referenced by the pointer must
-remain untouched until the request has completed.
-If none of \f[I]FI_TAGGED\f[] and \f[I]FI_MSG\f[] is asked for, the
-\f[I]FI_CONTEXT\f[] mode is not required.
-.RS
-.RE
+\f[I]struct fi_context\f[R], and the space referenced by the pointer
+must remain untouched until the request has completed.
+If none of \f[I]FI_TAGGED\f[R] and \f[I]FI_MSG\f[R] is asked for, the
+\f[I]FI_CONTEXT\f[R] mode is not required.
 .TP
 .B Progress
-The \f[I]psm2\f[] provider requires manual progress.
-The application is expected to call \f[I]fi_cq_read\f[] or
-\f[I]fi_cntr_read\f[] function from time to time when no other libfabric
-function is called to ensure progress is made in a timely manner.
+The \f[I]psm2\f[R] provider requires manual progress.
+The application is expected to call the \f[I]fi_cq_read\f[R] or
+\f[I]fi_cntr_read\f[R] function from time to time when no other
+libfabric function is called to ensure progress is made in a timely
+manner.
 The provider does support auto progress mode.
 However, the performance can be significantly impacted if the
 application purely depends on the provider to make auto progress.
-.RS
-.RE
 .TP
 .B Scalable endpoints
 Scalable endpoints support depends on the multi\-EP feature of the
-\f[I]PSM2\f[] library.
-If the \f[I]PSM2\f[] library supports this feature, the availability is
-further controlled by an environment variable \f[I]PSM2_MULTI_EP\f[].
-The \f[I]psm2\f[] provider automatically sets this variable to 1 if it
+\f[I]PSM2\f[R] library.
+If the \f[I]PSM2\f[R] library supports this feature, the availability is
+further controlled by an environment variable \f[I]PSM2_MULTI_EP\f[R].
+The \f[I]psm2\f[R] provider automatically sets this variable to 1 if it
 is not set.
-The feature can be disabled explicitly by setting \f[I]PSM2_MULTI_EP\f[]
-to 0.
-.RS
-.RE
+The feature can be disabled explicitly by setting
+\f[I]PSM2_MULTI_EP\f[R] to 0.
 .PP
 When creating a scalable endpoint, the exact number of contexts
-requested should be set in the "fi_info" structure passed to the
-\f[I]fi_scalable_ep\f[] function.
-This number should be set in "fi_info\->ep_attr\->tx_ctx_cnt" or
-"fi_info\->ep_attr\->rx_ctx_cnt" or both, whichever greater is used.
-The \f[I]psm2\f[] provider allocates all requested contexts upfront when
-the scalable endpoint is created.
+requested should be set in the \[lq]fi_info\[rq] structure passed to the
+\f[I]fi_scalable_ep\f[R] function.
+This number should be set in \[lq]fi_info\->ep_attr\->tx_ctx_cnt\[rq] or
+\[lq]fi_info\->ep_attr\->rx_ctx_cnt\[rq] or both; the greater of the two
+is used.
+The \f[I]psm2\f[R] provider allocates all requested contexts upfront
+when the scalable endpoint is created.
 The same context is used for both Tx and Rx.
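A minimal sketch of that call sequence, assuming `domain` and a hints-derived `info` already exist (error handling elided):

```
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>

/* Sketch: ask for 4 contexts up front; the psm2 provider allocates them
 * when the scalable endpoint is created, as described above. */
int open_sep(struct fid_domain *domain, struct fi_info *info,
	     struct fid_ep **sep)
{
	info->ep_attr->tx_ctx_cnt = 4;
	info->ep_attr->rx_ctx_cnt = 4;   /* the greater of the two is used */
	return fi_scalable_ep(domain, info, sep, NULL);
}
```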
 .PP
 For optimal performance, it is advised to avoid having multiple threads
@@ -105,7 +97,7 @@ supported.
 Instead, individual tx context or rx context of the scalable endpoint
 should be used.
 Similarly, using the address of the scalable endpoint as the source
-address or destination address doesn\[aq]t collectively address all the
+address or destination address doesn\[cq]t collectively address all the
 tx/rx contexts.
 It addresses only the first tx/rx context, instead.
 .TP
@@ -117,20 +109,16 @@ The reason is that Rx capability always requires a PSM context, which
 can also be automatically used for Tx.
 As a result, allocating a shared Tx context for Rx capable endpoints
 actually consumes one extra context instead of saving some.
-.RS
-.RE
 .TP
 .B Unsupported features
 These features are unsupported: connection management, passive endpoint,
 and shared receive context.
-.RS
-.RE
 .SH RUNTIME PARAMETERS
 .PP
-The \f[I]psm2\f[] provider checks for the following environment
+The \f[I]psm2\f[R] provider checks for the following environment
 variables:
 .TP
-.B \f[I]FI_PSM2_UUID\f[]
+.B \f[I]FI_PSM2_UUID\f[R]
 PSM requires that each job has a unique ID (UUID).
 All the processes in the same job need to use the same UUID in order to
 be able to talk to each other.
@@ -138,35 +126,39 @@ The PSM reference manual advises to keep UUID unique to each job.
 In practice, it generally works fine to reuse a UUID as long as (1) no two
 jobs with the same UUID are running at the same time; and (2) previous
 jobs with the same UUID have exited normally.
-If running into "resource busy" or "connection failure" issues with
-unknown reason, it is advisable to manually set the UUID to a value
-different from the default.
-.RS
-.RE
+If running into \[lq]resource busy\[rq] or \[lq]connection failure\[rq]
+issues for an unknown reason, it is advisable to manually set the UUID to
+a value different from the default.
 .PP
 The default UUID is 00FF00FF\-0000\-0000\-0000\-00FF0F0F00FF.
+.PP
+It is possible to create endpoints with a UUID different from the one set
+here.
+To achieve that, set `info\->ep_attr\->auth_key' to the uuid value and
+`info\->ep_attr\->auth_key_size' to its size (16 bytes) when calling
+fi_endpoint() or fi_scalable_ep().
+It is still true that an endpoint can only communicate with endpoints
+with the same UUID.
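A hedged sketch of that override (the UUID value is caller-supplied and illustrative; the buffer is heap-allocated since fi_info owns and frees its auth_key):

```
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>

/* Sketch: open an endpoint with a per-endpoint UUID instead of the
 * global FI_PSM2_UUID setting. */
int open_ep_with_uuid(struct fid_domain *domain, struct fi_info *info,
		      const uint8_t uuid[16], struct fid_ep **ep)
{
	info->ep_attr->auth_key = malloc(16);
	if (!info->ep_attr->auth_key)
		return -1;
	memcpy(info->ep_attr->auth_key, uuid, 16);
	info->ep_attr->auth_key_size = 16;   /* size in bytes, per the text */
	return fi_endpoint(domain, info, ep, NULL);
}
```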
 .TP
-.B \f[I]FI_PSM2_NAME_SERVER\f[]
-The \f[I]psm2\f[] provider has a simple built\-in name server that can
+.B \f[I]FI_PSM2_NAME_SERVER\f[R]
+The \f[I]psm2\f[R] provider has a simple built\-in name server that can
 be used to resolve an IP address or host name into a transport address
-needed by the \f[I]fi_av_insert\f[] call.
+needed by the \f[I]fi_av_insert\f[R] call.
 The main purpose of this name server is to allow simple client\-server
-type applications (such as those in \f[I]fabtests\f[]) to be written
+type applications (such as those in \f[I]fabtests\f[R]) to be written
 purely with libfabric, without using any out\-of\-band communication
 mechanism.
 For such applications, the server would run first to allow endpoints to be
 created and registered with the name server, and then the client would
-call \f[I]fi_getinfo\f[] with the \f[I]node\f[] parameter set to the IP
-address or host name of the server.
-The resulting \f[I]fi_info\f[] structure would have the transport
-address of the endpoint created by the server in the \f[I]dest_addr\f[]
+call \f[I]fi_getinfo\f[R] with the \f[I]node\f[R] parameter set to the
+IP address or host name of the server.
+The resulting \f[I]fi_info\f[R] structure would have the transport
+address of the endpoint created by the server in the \f[I]dest_addr\f[R]
 field.
-Optionally the \f[I]service\f[] parameter can be used in addition to
-\f[I]node\f[].
-Notice that the \f[I]service\f[] number is interpreted by the provider
+Optionally the \f[I]service\f[R] parameter can be used in addition to
+\f[I]node\f[R].
+Notice that the \f[I]service\f[R] number is interpreted by the provider
 and is not a TCP/IP port number.
-.RS
-.RE
 .PP
 The name server is on by default.
 It can be turned off by setting the variable to 0.
@@ -176,93 +168,77 @@ created when the name server is on.
 The provider detects OpenMPI and MPICH runs and changes the default
 setting to off.
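A client-side sketch of that lookup; the host name, service number, and API version are hypothetical:

```
#include <string.h>
#include <rdma/fabric.h>
#include <rdma/fi_errno.h>

/* Sketch: resolve a server's transport address through the psm2
 * built-in name server; on success, (*info)->dest_addr holds it. */
int resolve_server(struct fi_info **info)
{
	struct fi_info *hints = fi_allocinfo();
	int ret;

	if (!hints)
		return -FI_ENOMEM;
	hints->fabric_attr->prov_name = strdup("psm2");
	ret = fi_getinfo(FI_VERSION(1, 9), "server.example.com", "7710",
			 0, hints, info);
	fi_freeinfo(hints);
	return ret;
}
```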
 .TP
-.B \f[I]FI_PSM2_TAGGED_RMA\f[]
+.B \f[I]FI_PSM2_TAGGED_RMA\f[R]
 The RMA functions are implemented on top of the PSM Active Message
 functions.
 The Active Message functions have a limit on the size of data that can
 be transferred in a single message.
 Large transfers can be divided into small chunks and be pipe\-lined.
 However, the bandwidth is sub\-optimal this way.
-.RS
-.RE
 .PP
-The \f[I]psm2\f[] provider use PSM tag\-matching message queue functions
-to achieve higher bandwidth for large size RMA.
+The \f[I]psm2\f[R] provider uses PSM tag\-matching message queue
+functions to achieve higher bandwidth for large size RMA.
 It takes advantage of the extra tag bits available in PSM2 to separate
 the RMA traffic from the regular tagged message queue.
 .PP
 The option is on by default.
 To turn it off set the variable to 0.
 .TP
-.B \f[I]FI_PSM2_DELAY\f[]
+.B \f[I]FI_PSM2_DELAY\f[R]
 Time (seconds) to sleep before closing PSM endpoints.
 This is a workaround for a bug in some versions of the PSM library.
-.RS
-.RE
 .PP
 The default setting is 0.
 .TP
-.B \f[I]FI_PSM2_TIMEOUT\f[]
+.B \f[I]FI_PSM2_TIMEOUT\f[R]
 Timeout (seconds) for gracefully closing PSM endpoints.
 A forced closing will be issued if timeout expires.
-.RS
-.RE
 .PP
 The default setting is 5.
 .TP
-.B \f[I]FI_PSM2_CONN_TIMEOUT\f[]
+.B \f[I]FI_PSM2_CONN_TIMEOUT\f[R]
 Timeout (seconds) for establishing connection between two PSM endpoints.
-.RS
-.RE
 .PP
 The default setting is 5.
 .TP
-.B \f[I]FI_PSM2_PROG_INTERVAL\f[]
+.B \f[I]FI_PSM2_PROG_INTERVAL\f[R]
 When auto progress is enabled (asked via the hints to
-\f[I]fi_getinfo\f[]), a progress thread is created to make progress
+\f[I]fi_getinfo\f[R]), a progress thread is created to make progress
 calls from time to time.
 This option sets the interval (microseconds) between progress calls.
-.RS
-.RE
 .PP
 The default setting is 1 if affinity is set, or 1000 if not.
-See \f[I]FI_PSM2_PROG_AFFINITY\f[].
+See \f[I]FI_PSM2_PROG_AFFINITY\f[R].
 .TP
-.B \f[I]FI_PSM2_PROG_AFFINITY\f[]
+.B \f[I]FI_PSM2_PROG_AFFINITY\f[R]
 When set, specifies the set of CPU cores to pin the progress thread
 to.
 The format is
-\f[C]<start>[:<end>[:<stride>]][,<start>[:<end>[:<stride>]]]*\f[], where
-each triplet \f[C]<start>:<end>:<stride>\f[] defines a block of
+\f[C]<start>[:<end>[:<stride>]][,<start>[:<end>[:<stride>]]]*\f[R],
+where each triplet \f[C]<start>:<end>:<stride>\f[R] defines a block of
 core_ids.
-Both \f[C]<start>\f[] and \f[C]<end>\f[] can be either the
-\f[C]core_id\f[] (when >=0) or \f[C]core_id\ \-\ num_cores\f[] (when
+Both \f[C]<start>\f[R] and \f[C]<end>\f[R] can be either the
+\f[C]core_id\f[R] (when >=0) or \f[C]core_id \- num_cores\f[R] (when
 <0).
-.RS
-.RE
 .PP
 By default affinity is not set.
 .TP
-.B \f[I]FI_PSM2_INJECT_SIZE\f[]
+.B \f[I]FI_PSM2_INJECT_SIZE\f[R]
 Maximum message size allowed for fi_inject and fi_tinject calls.
 This is an experimental feature to allow some applications to override
 the default inject size limitation.
 When the inject size is larger than the default value, some inject calls
 might block.
-.RS
-.RE
 .PP
 The default setting is 64.
 .TP
-.B \f[I]FI_PSM2_LOCK_LEVEL\f[]
+.B \f[I]FI_PSM2_LOCK_LEVEL\f[R]
 When set, dictates the level of locking used by the provider.
 Level 2 means all locks are enabled.
 Level 1 disables some locks and is suitable for runs that limit the
 access to each PSM2 context to a single thread.
 Level 0 disables all locks and thus is only suitable for single threaded
 runs.
-.RS
-.RE
 .PP
 To use level 0 or level 1, wait object and auto progress mode cannot be
 used because they introduce internal threads that may break the
@@ -270,7 +246,7 @@ conditions needed for these levels.
 .PP
 The default setting is 2.
 .TP
-.B \f[I]FI_PSM2_LAZY_CONN\f[]
+.B \f[I]FI_PSM2_LAZY_CONN\f[R]
 There are two strategies on when to establish connections between the
 PSM2 endpoints that OFI endpoints are built on top of.
 In eager connection mode, connections are established when addresses are
@@ -279,8 +255,6 @@ In lazy connection mode, connections are established when addresses are
 used the first time in communication.
 Eager connection mode has slightly lower critical path overhead but lazy
 connection mode scales better.
-.RS
-.RE
 .PP
 This option controls how the two connection modes are used.
 When set to 1, lazy connection mode is always used.
@@ -292,11 +266,11 @@ PSM2_MULTI_EP=0; and (2) the address vector type is FI_AV_MAP.
 .PP
 The default setting is 0.
 .TP
-.B \f[I]FI_PSM2_DISCONNECT\f[]
+.B \f[I]FI_PSM2_DISCONNECT\f[R]
 The provider has a mechanism to automatically send disconnection
 notifications to all connected peers before the local endpoint is
 closed.
-As the response, the peers call \f[I]psm2_ep_disconnect\f[] to clean up
+In response, the peers call \f[I]psm2_ep_disconnect\f[R] to clean up
 the connection state at their side.
 This allows the same PSM2 epid to be used by different dynamically started
 processes (clients) to communicate with the same peer (server).
@@ -304,8 +278,6 @@ This mechanism, however, introduce extra overhead to the finalization
 phase.
 For applications that never reuse epids within the same session such
 overhead is unnecessary.
-.RS
-.RE
 .PP
 This option controls whether the automatic disconnection notification
 mechanism should be enabled.
@@ -314,32 +286,46 @@ set this option to 1, but the server should set it to 0.
 .PP
 The default setting is 0.
 .TP
-.B \f[I]FI_PSM2_TAG_LAYOUT\f[]
+.B \f[I]FI_PSM2_TAG_LAYOUT\f[R]
 Select how the 96\-bit PSM2 tag bits are organized.
-Currently three choices are available: \f[I]tag60\f[] means 32\-4\-60
+Currently three choices are available: \f[I]tag60\f[R] means 32\-4\-60
 partitioning for CQ data, internal protocol flags, and application tag.
-\f[I]tag64\f[] means 4\-28\-64 partitioning for internal protocol flags,
-CQ data, and application tag.
-\f[I]auto\f[] means to choose either \f[I]tag60\f[] or \f[I]tag64\f[]
-based on the hints passed to fi_getinfo \-\- \f[I]tag60\f[] is used if
+\f[I]tag64\f[R] means 4\-28\-64 partitioning for internal protocol
+flags, CQ data, and application tag.
+\f[I]auto\f[R] means to choose either \f[I]tag60\f[R] or \f[I]tag64\f[R]
+based on the hints passed to fi_getinfo \[en] \f[I]tag60\f[R] is used if
 remote CQ data support is requested explicitly, either by passing
-non\-zero value via \f[I]hints\->domain_attr\->cq_data_size\f[] or by
-including \f[I]FI_REMOTE_CQ_DATA\f[] in \f[I]hints\->caps\f[], otherwise
-\f[I]tag64\f[] is used.
-If \f[I]tag64\f[] is the result of automatic selection,
-\f[I]fi_getinfo\f[] also returns a second instance of the provider with
-\f[I]tag60\f[] layout.
-.RS
-.RE
+non\-zero value via \f[I]hints\->domain_attr\->cq_data_size\f[R] or by
+including \f[I]FI_REMOTE_CQ_DATA\f[R] in \f[I]hints\->caps\f[R],
+otherwise \f[I]tag64\f[R] is used.
+If \f[I]tag64\f[R] is the result of automatic selection,
+\f[I]fi_getinfo\f[R] also returns a second instance of the provider with
+\f[I]tag60\f[R] layout.
 .PP
-The default setting is \f[I]auto\f[].
+The default setting is \f[I]auto\f[R].
 .PP
 Notice that if the provider is compiled with macro
-\f[I]PSMX2_TAG_LAYOUT\f[] defined to 1 (means \f[I]tag60\f[]) or 2
-(means \f[I]tag64\f[]), the choice is fixed at compile time and this
+\f[I]PSMX2_TAG_LAYOUT\f[R] defined to 1 (means \f[I]tag60\f[R]) or 2
+(means \f[I]tag64\f[R]), the choice is fixed at compile time and this
 runtime option will be disabled.
+.SH PSM2 EXTENSIONS
+.PP
+The \f[I]psm2\f[R] provider supports limited low\-level parameter
+setting through the fi_set_val() and fi_get_val() functions.
+Currently the following parameters can be set via the domain fid:
+.TP
+.B \f[I]FI_PSM2_DISCONNECT\f[R]
+Overwrite the global runtime parameter \f[I]FI_PSM2_DISCONNECT\f[R] for
+this domain.
+See the \f[I]RUNTIME PARAMETERS\f[R] section for details.
+.PP
+Valid parameter names are defined in the header file
+\f[I]rdma/fi_ext_psm2.h\f[R].
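+.PP
+As an illustration, a minimal sketch of overriding the parameter on one
+domain (assuming fi_set_val() takes the fid, a parameter name from
+\f[I]rdma/fi_ext_psm2.h\f[R], and a pointer to the value):
+.IP
+.nf
+\f[C]
+#include <rdma/fabric.h>
+#include <rdma/fi_domain.h>
+#include <rdma/fi_ext_psm2.h>
+
+/* Enable disconnect notification for this domain only, overriding
+ * the global FI_PSM2_DISCONNECT environment setting. */
+int set_psm2_disconnect(struct fid_domain *domain)
+{
+    int val = 1;
+    return fi_set_val(&domain\->fid, FI_PSM2_DISCONNECT, &val);
+}
+\f[R]
+.fi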
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7), \f[C]fi_provider\f[](7), \f[C]fi_psm\f[](7),
+\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_psm\f[R](7),
+\f[C]fi_psm3\f[R](7)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_psm3.7 b/deps/libfabric/man/man7/fi_psm3.7
new file mode 100644
index 0000000000000000000000000000000000000000..1bf0914116063fc7f1f81d9970abd86464ee655b
--- /dev/null
+++ b/deps/libfabric/man/man7/fi_psm3.7
@@ -0,0 +1,308 @@
+.\" Automatically generated by Pandoc 2.5
+.\"
+.TH "fi_psm3" "7" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
+.hy
+.SH NAME
+.PP
+fi_psm3 \- The PSM3 Fabric Provider
+.SH OVERVIEW
+.PP
+The \f[I]psm3\f[R] provider implements a Performance Scaled Messaging
+capability which supports Intel RoCEv2 capable NICs.
+PSM3 represents an Ethernet and standard RoCEv2 enhancement of previous
+PSM implementations.
+.SH SUPPORTED FEATURES
+.PP
+The \f[I]psm3\f[R] provider supports a subset of all the features
+defined in the libfabric API.
+.TP
+.B Endpoint types
+Supports non\-connection based types \f[I]FI_DGRAM\f[R] and
+\f[I]FI_RDM\f[R].
+.TP
+.B Endpoint capabilities
+Endpoints can support any combination of data transfer capabilities
+\f[I]FI_TAGGED\f[R], \f[I]FI_MSG\f[R], \f[I]FI_ATOMICS\f[R], and
+\f[I]FI_RMA\f[R].
+These capabilities can be further refined by \f[I]FI_SEND\f[R],
+\f[I]FI_RECV\f[R], \f[I]FI_READ\f[R], \f[I]FI_WRITE\f[R],
+\f[I]FI_REMOTE_READ\f[R], and \f[I]FI_REMOTE_WRITE\f[R] to limit the
+direction of operations.
+.PP
+\f[I]FI_MULTI_RECV\f[R] is supported for non\-tagged message queue only.
+.PP
+Scalable endpoints are supported if the underlying PSM3 library supports
+multiple endpoints.
+This condition must be satisfied both when the provider is built and
+when the provider is used.
+See the \f[I]Scalable endpoints\f[R] section for more information.
+.PP
+Other supported capabilities include \f[I]FI_TRIGGER\f[R],
+\f[I]FI_REMOTE_CQ_DATA\f[R], \f[I]FI_RMA_EVENT\f[R],
+\f[I]FI_SOURCE\f[R], and \f[I]FI_SOURCE_ERR\f[R].
+Furthermore, \f[I]FI_NAMED_RX_CTX\f[R] is supported when scalable
+endpoints are enabled.
+.TP
+.B Modes
+\f[I]FI_CONTEXT\f[R] is required for the \f[I]FI_TAGGED\f[R] and
+\f[I]FI_MSG\f[R] capabilities.
+This means that any request belonging to these two categories that generates
+a completion must pass as the operation context a valid pointer to type
+\f[I]struct fi_context\f[R], and the space referenced by the pointer
+must remain untouched until the request has completed.
+If neither \f[I]FI_TAGGED\f[R] nor \f[I]FI_MSG\f[R] is requested, the
+\f[I]FI_CONTEXT\f[R] mode is not required.
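+.PP
+As an illustration, a sketch of a tagged send under \f[I]FI_CONTEXT\f[R]
+mode (endpoint, buffer, and address declarations omitted):
+.IP
+.nf
+\f[C]
+/* The context object must stay valid and untouched until the
+ * corresponding completion has been retrieved. */
+static struct fi_context send_ctx;
+
+ret = fi_tsend(ep, buf, len, NULL, dest_addr, tag, &send_ctx);
+/* A later fi_cq_read() completion carries &send_ctx as op_context;
+ * only then may the context object be reused. */
+\f[R]
+.fi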
+.TP
+.B Progress
+The \f[I]psm3\f[R] provider performs optimally with manual progress.
+By default, the application is expected to call the \f[I]fi_cq_read\f[R]
+or \f[I]fi_cntr_read\f[R] function from time to time when no other
+libfabric function is called to ensure progress is made in a timely
+manner.
+The provider does support auto progress mode.
+However, the performance can be significantly impacted if the
+application purely depends on the provider to make auto progress.
+.TP
+.B Scalable endpoints
+Scalable endpoints support depends on the multi\-EP feature of the
+\f[I]PSM3\f[R] library.
+If the \f[I]PSM3\f[R] library supports this feature, the availability is
+further controlled by an environment variable \f[I]PSM3_MULTI_EP\f[R].
+The \f[I]psm3\f[R] provider automatically sets this variable to 1 if it
+is not set.
+The feature can be disabled explicitly by setting
+\f[I]PSM3_MULTI_EP\f[R] to 0.
+.PP
+When creating a scalable endpoint, the exact number of contexts
+requested should be set in the \[lq]fi_info\[rq] structure passed to the
+\f[I]fi_scalable_ep\f[R] function.
+This number should be set in \[lq]fi_info\->ep_attr\->tx_ctx_cnt\[rq] or
+\[lq]fi_info\->ep_attr\->rx_ctx_cnt\[rq] or both; the greater of the two
+is used.
+The \f[I]psm3\f[R] provider allocates all requested contexts upfront
+when the scalable endpoint is created.
+The same context is used for both Tx and Rx.
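+.PP
+A sketch of creating and using a scalable endpoint (declarations and
+error handling omitted; the context count of 4 is an arbitrary example):
+.IP
+.nf
+\f[C]
+info\->ep_attr\->tx_ctx_cnt = 4;  /* max(tx, rx) contexts are */
+info\->ep_attr\->rx_ctx_cnt = 4;  /* allocated upfront */
+
+ret = fi_scalable_ep(domain, info, &sep, NULL);
+ret = fi_tx_context(sep, 0, NULL, &tx_ctx, NULL); /* one per thread */
+\f[R]
+.fi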
+.PP
+For optimal performance, it is advised to avoid having multiple threads
+accessing the same context, either directly by posting
+send/recv/read/write requests, or indirectly by polling associated
+completion queues or counters.
+.PP
+Using the scalable endpoint as a whole in communication functions is not
+supported.
+Instead, an individual tx or rx context of the scalable endpoint
+should be used.
+Similarly, using the address of the scalable endpoint as the source
+address or destination address doesn\[cq]t collectively address all the
+tx/rx contexts.
+It addresses only the first tx/rx context, instead.
+.SH LIMITATIONS
+.PP
+The \f[I]psm3\f[R] provider doesn\[cq]t support all the features defined
+in the libfabric API.
+Here are some of the limitations not listed above:
+.TP
+.B Unsupported features
+These features are unsupported: connection management, passive endpoint,
+and shared receive context.
+.SH RUNTIME PARAMETERS
+.PP
+The \f[I]psm3\f[R] provider checks for the following environment
+variables:
+.TP
+.B \f[I]FI_PSM3_UUID\f[R]
+PSM requires that each job has a unique ID (UUID).
+All the processes in the same job need to use the same UUID in order to
+be able to talk to each other.
+The PSM reference manual advises keeping the UUID unique to each job.
+In practice, it generally works fine to reuse a UUID as long as (1) no
+two jobs with the same UUID are running at the same time; and (2)
+previous jobs with the same UUID have exited normally.
+If running into \[lq]resource busy\[rq] or \[lq]connection failure\[rq]
+issues for an unknown reason, it is advisable to manually set the UUID
+to a value different from the default.
+.PP
+The default UUID is 00FF00FF\-0000\-0000\-0000\-00FF0F0F00FF.
+.PP
+It is possible to create endpoints with UUID different from the one set
+here.
+To achieve that, set `info\->ep_attr\->auth_key' to the uuid value and
+`info\->ep_attr\->auth_key_size' to its size (16 bytes) when calling
+fi_endpoint() or fi_scalable_ep().
+It is still true that an endpoint can only communicate with endpoints
+with the same UUID.
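+.PP
+For example (a sketch; the 16 UUID bytes shown are arbitrary
+application\-chosen placeholders):
+.IP
+.nf
+\f[C]
+/* All processes of the job must agree on these 16 bytes. */
+static uint8_t job_uuid[16] = {
+    0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc, 0xde, 0xf0,
+    0x0f, 0x1e, 0x2d, 0x3c, 0x4b, 0x5a, 0x69, 0x78,
+};
+
+info\->ep_attr\->auth_key = job_uuid;
+info\->ep_attr\->auth_key_size = sizeof(job_uuid); /* 16 bytes */
+ret = fi_endpoint(domain, info, &ep, NULL);
+\f[R]
+.fi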
+.TP
+.B \f[I]FI_PSM3_NAME_SERVER\f[R]
+The \f[I]psm3\f[R] provider has a simple built\-in name server that can
+be used to resolve an IP address or host name into a transport address
+needed by the \f[I]fi_av_insert\f[R] call.
+The main purpose of this name server is to allow simple client\-server
+type applications (such as those in \f[I]fabtests\f[R]) to be written
+purely with libfabric, without using any out\-of\-band communication
+mechanism.
+For such applications, the server would run first to allow endpoints to
+be created and registered with the name server, and then the client would
+call \f[I]fi_getinfo\f[R] with the \f[I]node\f[R] parameter set to the
+IP address or host name of the server.
+The resulting \f[I]fi_info\f[R] structure would have the transport
+address of the endpoint created by the server in the \f[I]dest_addr\f[R]
+field.
+Optionally the \f[I]service\f[R] parameter can be used in addition to
+\f[I]node\f[R].
+Notice that the \f[I]service\f[R] number is interpreted by the provider
+and is not a TCP/IP port number.
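+.PP
+A client\-side sketch (the host name and API version are placeholders;
+declarations and error handling omitted):
+.IP
+.nf
+\f[C]
+hints = fi_allocinfo();
+hints\->fabric_attr\->prov_name = strdup("psm3");
+
+/* Resolve the server's endpoint through the built\-in name server. */
+ret = fi_getinfo(FI_VERSION(1, 14), "server\-host", NULL, 0,
+                 hints, &info);
+/* info\->dest_addr now holds the server endpoint's address. */
+\f[R]
+.fi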
+.PP
+The name server is on by default.
+It can be turned off by setting the variable to 0.
+This may save a small amount of resources since a separate thread is
+created when the name server is on.
+.PP
+The provider detects OpenMPI and MPICH runs and changes the default
+setting to off.
+.TP
+.B \f[I]FI_PSM3_TAGGED_RMA\f[R]
+The RMA functions are implemented on top of the PSM Active Message
+functions.
+The Active Message functions have a limit on the size of data that can
+be transferred in a single message.
+Large transfers can be divided into small chunks and pipelined.
+However, the bandwidth is sub\-optimal this way.
+.PP
+The \f[I]psm3\f[R] provider uses PSM tag\-matching message queue
+functions to achieve higher bandwidth for large\-size RMA.
+It takes advantage of the extra tag bits available in PSM3 to separate
+the RMA traffic from the regular tagged message queue.
+.PP
+The option is on by default.
+To turn it off set the variable to 0.
+.TP
+.B \f[I]FI_PSM3_DELAY\f[R]
+Time (seconds) to sleep before closing PSM endpoints.
+This is a workaround for a bug in some versions of PSM library.
+.PP
+The default setting is 0.
+.TP
+.B \f[I]FI_PSM3_TIMEOUT\f[R]
+Timeout (seconds) for gracefully closing PSM endpoints.
+A forced closing will be issued if timeout expires.
+.PP
+The default setting is 5.
+.TP
+.B \f[I]FI_PSM3_CONN_TIMEOUT\f[R]
+Timeout (seconds) for establishing connection between two PSM endpoints.
+.PP
+The default setting is 5.
+.TP
+.B \f[I]FI_PSM3_PROG_INTERVAL\f[R]
+When auto progress is enabled (asked via the hints to
+\f[I]fi_getinfo\f[R]), a progress thread is created to make progress
+calls from time to time.
+This option sets the interval (microseconds) between progress calls.
+.PP
+The default setting is 1 if affinity is set, or 1000 if not.
+See \f[I]FI_PSM3_PROG_AFFINITY\f[R].
+.TP
+.B \f[I]FI_PSM3_PROG_AFFINITY\f[R]
+When set, specifies the set of CPU cores the progress thread is bound
+to.
+The format is
+\f[C]<start>[:<end>[:<stride>]][,<start>[:<end>[:<stride>]]]*\f[R],
+where each triplet \f[C]<start>:<end>:<stride>\f[R] defines a block of
+core_ids.
+Both \f[C]<start>\f[R] and \f[C]<end>\f[R] can be either the
+\f[C]core_id\f[R] (when >=0) or \f[C]core_id \- num_cores\f[R] (when
+<0).
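+.PP
+For example, to pin the progress thread to cores 0\-3 plus the last
+core (a sketch; the core numbers are arbitrary):
+.IP
+.nf
+\f[C]
+#include <stdlib.h>
+
+/* "0:3" selects cores 0\-3; \-1 counts back from num_cores, i.e.,
+ * the last core.  Set before libfabric reads its environment. */
+setenv("FI_PSM3_PROG_AFFINITY", "0:3,\-1", 1);
+\f[R]
+.fi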
+.PP
+By default affinity is not set.
+.TP
+.B \f[I]FI_PSM3_INJECT_SIZE\f[R]
+Maximum message size allowed for fi_inject and fi_tinject calls.
+This is an experimental feature to allow some applications to override
+default inject size limitation.
+When the inject size is larger than the default value, some inject calls
+might block.
+.PP
+The default setting is 64.
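+.PP
+For reference, a sketch of an inject call bounded by this size
+(declarations omitted):
+.IP
+.nf
+\f[C]
+/* Completes immediately and generates no completion entry; len must
+ * not exceed the inject size. */
+ret = fi_inject(ep, buf, len, dest_addr);
+\f[R]
+.fi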
+.TP
+.B \f[I]FI_PSM3_LOCK_LEVEL\f[R]
+When set, dictates the level of locking used by the provider.
+Level 2 means all locks are enabled.
+Level 1 disables some locks and is suitable for runs that limit the
+access to each PSM3 context to a single thread.
+Level 0 disables all locks and thus is only suitable for single threaded
+runs.
+.PP
+To use level 0 or level 1, wait object and auto progress mode cannot be
+used because they introduce internal threads that may break the
+conditions needed for these levels.
+.PP
+The default setting is 2.
+.TP
+.B \f[I]FI_PSM3_LAZY_CONN\f[R]
+There are two strategies for when to establish connections between the
+PSM3 endpoints that OFI endpoints are built on top of.
+In eager connection mode, connections are established when addresses are
+inserted into the address vector.
+In lazy connection mode, connections are established when addresses are
+used the first time in communication.
+Eager connection mode has slightly lower critical path overhead but lazy
+connection mode scales better.
+.PP
+This option controls how the two connection modes are used.
+When set to 1, lazy connection mode is always used.
+When set to 0, eager connection mode is used when required conditions
+are all met and lazy connection mode is used otherwise.
+The conditions for eager connection mode are: (1) multiple endpoint (and
+scalable endpoint) support is disabled by explicitly setting
+PSM3_MULTI_EP=0; and (2) the address vector type is FI_AV_MAP.
+.PP
+The default setting is 0.
+.TP
+.B \f[I]FI_PSM3_DISCONNECT\f[R]
+The provider has a mechanism to automatically send disconnection
+notifications to all connected peers before the local endpoint is
+closed.
+In response, the peers call \f[I]psm3_ep_disconnect\f[R] to clean up
+the connection state at their side.
+This allows the same PSM3 epid to be used by different dynamically
+started processes (clients) to communicate with the same peer (server).
+This mechanism, however, introduces extra overhead to the finalization
+phase.
+For applications that never reuse epids within the same session, such
+overhead is unnecessary.
+.PP
+This option controls whether the automatic disconnection notification
+mechanism should be enabled.
+For the client\-server application mentioned above, the client side should
+set this option to 1, but the server should set it to 0.
+.PP
+The default setting is 0.
+.TP
+.B \f[I]FI_PSM3_TAG_LAYOUT\f[R]
+Select how the 96\-bit PSM3 tag bits are organized.
+Currently three choices are available: \f[I]tag60\f[R] means 32\-4\-60
+partitioning for CQ data, internal protocol flags, and application tag.
+\f[I]tag64\f[R] means 4\-28\-64 partitioning for internal protocol
+flags, CQ data, and application tag.
+\f[I]auto\f[R] means to choose either \f[I]tag60\f[R] or \f[I]tag64\f[R]
+based on the hints passed to fi_getinfo \[en] \f[I]tag60\f[R] is used if
+remote CQ data support is requested explicitly, either by passing
+non\-zero value via \f[I]hints\->domain_attr\->cq_data_size\f[R] or by
+including \f[I]FI_REMOTE_CQ_DATA\f[R] in \f[I]hints\->caps\f[R],
+otherwise \f[I]tag64\f[R] is used.
+If \f[I]tag64\f[R] is the result of automatic selection,
+\f[I]fi_getinfo\f[R] also returns a second instance of the provider with
+\f[I]tag60\f[R] layout.
+.PP
+The default setting is \f[I]auto\f[R].
+.PP
+Notice that if the provider is compiled with macro
+\f[I]PSMX3_TAG_LAYOUT\f[R] defined to 1 (means \f[I]tag60\f[R]) or 2
+(means \f[I]tag64\f[R]), the choice is fixed at compile time and this
+runtime option will be disabled.
+.SH SEE ALSO
+.PP
+\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_psm\f[R](7),
+\f[C]fi_psm2\f[R](7)
+.SH AUTHORS
+OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_rstream.7 b/deps/libfabric/man/man7/fi_rstream.7
index ac3f94cd5eaeac235a90671035d643ce77dbf80c..562267886fc6045699a97e74004aef3f07994056 100644
--- a/deps/libfabric/man/man7/fi_rstream.7
+++ b/deps/libfabric/man/man7/fi_rstream.7
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_rstream" "7" "2020\-04\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_rstream" "7" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -19,38 +19,28 @@ For messaging completions, use FI_PEEK on send/recv after poll to see
 what type of transaction has transpired.
 .SH SUPPORTED FEATURES
 .PP
-The rstream provider currently supports \f[I]FI_MSG\f[] capabilities.
+The rstream provider currently supports \f[I]FI_MSG\f[R] capabilities.
 .TP
-.B \f[I]Endpoint types\f[]
-The provider supports only endpoint type \f[I]FI_EP_SOCK_STREAM\f[].
-.RS
-.RE
+.B \f[I]Endpoint types\f[R]
+The provider supports only endpoint type \f[I]FI_EP_SOCK_STREAM\f[R].
 .PP
-\f[I]Endpoint capabilities\f[] : The following data transfer interface
-is supported: \f[I]fi_msg\f[].
+\f[I]Endpoint capabilities\f[R] : The following data transfer interface
+is supported: \f[I]fi_msg\f[R].
 .TP
-.B \f[I]Modes\f[]
+.B \f[I]Modes\f[R]
 The provider does not require the use of any mode bits but supports core
 providers that require FI_CONTEXT and FI_RX_CQ_DATA.
-.RS
-.RE
 .TP
-.B \f[I]Progress\f[]
-The rstream provider only supports \f[I]FI_PROGRESS_MANUAL\f[].
-.RS
-.RE
+.B \f[I]Progress\f[R]
+The rstream provider only supports \f[I]FI_PROGRESS_MANUAL\f[R].
 .TP
-.B \f[I]Threading Model\f[]
+.B \f[I]Threading Model\f[R]
 The provider supports FI_THREAD_SAFE.
-.RS
-.RE
 .TP
-.B \f[I]Verbs\-iWarp\f[]
+.B \f[I]Verbs\-iWarp\f[R]
 The provider has added features to enable iWarp.
 To use this feature, the ep protocol iWarp must be requested in an
 fi_getinfo call.
-.RS
-.RE
 .SH LIMITATIONS
 .PP
 The rstream provider is experimental and lacks performance validation
@@ -64,32 +54,24 @@ memory region size and CQ size).
 These can be modified by fi_setopt.
 .SH SETTINGS
 .PP
-The \f[I]rstream\f[] provider settings can be modified via fi_setopt on
+The \f[I]rstream\f[R] provider settings can be modified via fi_setopt on
 the endpoint (FI_OPT_ENDPOINT) along with the following parameters:
 .TP
-.B \f[I]FI_OPT_SEND_BUF_SIZE\f[]
+.B \f[I]FI_OPT_SEND_BUF_SIZE\f[R]
 Size of the send buffer.
 Default is 32KB.
-.RS
-.RE
 .TP
-.B \f[I]FI_OPT_RECV_BUF_SIZE\f[]
+.B \f[I]FI_OPT_RECV_BUF_SIZE\f[R]
 Size of the recv buffer.
 Default is 32KB.
-.RS
-.RE
 .TP
-.B \f[I]FI_OPT_TX_SIZE\f[]
+.B \f[I]FI_OPT_TX_SIZE\f[R]
 Size of the send queue.
 Default is 384.
-.RS
-.RE
 .TP
-.B \f[I]FI_OPT_RX_SIZE\f[]
+.B \f[I]FI_OPT_RX_SIZE\f[R]
 Size of the recv queue.
 Default is 384.
-.RS
-.RE
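+.PP
+A sketch of adjusting one of the settings above (the 64 KB value is an
+arbitrary example, and the size_t optval type is an assumption):
+.IP
+.nf
+\f[C]
+size_t buf_size = 64 * 1024;
+
+ret = fi_setopt(&ep\->fid, FI_OPT_ENDPOINT, FI_OPT_SEND_BUF_SIZE,
+                &buf_size, sizeof(buf_size));
+\f[R]
+.fi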
 .SH OFI EXTENSIONS
 .PP
 The rstream provider has extended the current OFI API set in order to
@@ -98,6 +80,6 @@ Specifically sendmsg(FI_PEEK) is supported which replicates the behavior
 of the recvmsg(FI_PEEK) feature.
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7), \f[C]fi_provider\f[](7), \f[C]fi_getinfo\f[](3)
+\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_rxd.7 b/deps/libfabric/man/man7/fi_rxd.7
index d4723ec0cfc017a08506632a192a27d898353833..273269311b4c6a3453c4ab4c4fcc0698402d94b3 100644
--- a/deps/libfabric/man/man7/fi_rxd.7
+++ b/deps/libfabric/man/man7/fi_rxd.7
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_rxd" "7" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_rxd" "7" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -11,26 +11,20 @@ The RxD provider is a utility provider that supports RDM endpoints
 emulated over a base DGRAM provider.
 .SH SUPPORTED FEATURES
 .PP
-The RxD provider currently supports \f[I]FI_MSG\f[] capabilities.
+The RxD provider currently supports \f[I]FI_MSG\f[R] capabilities.
 .TP
-.B \f[I]Endpoint types\f[]
-The provider supports only endpoint type \f[I]FI_EP_RDM\f[].
-.RS
-.RE
+.B \f[I]Endpoint types\f[R]
+The provider supports only endpoint type \f[I]FI_EP_RDM\f[R].
 .PP
-\f[I]Endpoint capabilities\f[] : The following data transfer interface
-is supported: \f[I]fi_msg\f[].
+\f[I]Endpoint capabilities\f[R] : The following data transfer interface
+is supported: \f[I]fi_msg\f[R].
 .TP
-.B \f[I]Modes\f[]
+.B \f[I]Modes\f[R]
 The provider does not require the use of any mode bits but supports core
 DGRAM providers that require FI_CONTEXT and FI_MSG_PREFIX.
-.RS
-.RE
 .TP
-.B \f[I]Progress\f[]
-The RxD provider only supports \f[I]FI_PROGRESS_MANUAL\f[].
-.RS
-.RE
+.B \f[I]Progress\f[R]
+The RxD provider only supports \f[I]FI_PROGRESS_MANUAL\f[R].
 .SH LIMITATIONS
 .PP
 The RxD provider has hard\-coded maximums for supported queue sizes and
@@ -45,36 +39,28 @@ The RxD provider is still under development and is not extensively
 tested.
 .SH RUNTIME PARAMETERS
 .PP
-The \f[I]rxd\f[] provider checks for the following environment
+The \f[I]rxd\f[R] provider checks for the following environment
 variables:
 .TP
-.B \f[I]FI_OFI_RXD_SPIN_COUNT\f[]
-Number of times to read the core provider\[aq]s CQ for a segment
+.B \f[I]FI_OFI_RXD_SPIN_COUNT\f[R]
+Number of times to read the core provider\[cq]s CQ for a segment
 completion before trying to progress sends.
 Default is 1000.
-.RS
-.RE
 .TP
-.B \f[I]FI_OFI_RXD_RETRY\f[]
+.B \f[I]FI_OFI_RXD_RETRY\f[R]
 Toggles retrying of packets and assumes reliability of individual
 packets and will reassemble all received packets.
 Retrying is turned on by default.
-.RS
-.RE
 .TP
-.B \f[I]FI_OFI_RXD_MAX_PEERS\f[]
+.B \f[I]FI_OFI_RXD_MAX_PEERS\f[R]
 Maximum number of peers the provider should prepare to track.
 Default: 1024
-.RS
-.RE
 .TP
-.B \f[I]FI_OFI_RXD_MAX_UNACKED\f[]
+.B \f[I]FI_OFI_RXD_MAX_UNACKED\f[R]
 Maximum number of packets (per peer) to send at a time.
 Default: 128
-.RS
-.RE
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7), \f[C]fi_provider\f[](7), \f[C]fi_getinfo\f[](3)
+\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_rxm.7 b/deps/libfabric/man/man7/fi_rxm.7
index 3883ac2fdc0598e002951e60a36774ac23af9dab..209b64400a220205f1ba8b48cd93ab7a96e2c73c 100644
--- a/deps/libfabric/man/man7/fi_rxm.7
+++ b/deps/libfabric/man/man7/fi_rxm.7
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_rxm" "7" "2020\-06\-06" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_rxm" "7" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -14,7 +14,7 @@ FI_EP_RDM endpoints have a reliable datagram interface and RxM emulates
 this by hiding the connection management of underlying FI_EP_MSG
 endpoints from the user.
 Additionally, RxM can hide memory registration requirement from a core
-provider like verbs if the apps don\[aq]t support it.
+provider like verbs if the apps don\[cq]t support it.
 .SH REQUIREMENTS
 .SS Requirements for core provider
 .PP
@@ -41,38 +41,29 @@ Not doing so would result in a stall.
 See also the ERRORS section in fi_msg(3).
 .SH SUPPORTED FEATURES
 .PP
-The RxM provider currently supports \f[I]FI_MSG\f[], \f[I]FI_TAGGED\f[],
-\f[I]FI_RMA\f[] and \f[I]FI_ATOMIC\f[] capabilities.
+The RxM provider currently supports \f[I]FI_MSG\f[R],
+\f[I]FI_TAGGED\f[R], \f[I]FI_RMA\f[R] and \f[I]FI_ATOMIC\f[R]
+capabilities.
 .TP
-.B \f[I]Endpoint types\f[]
-The provider supports only \f[I]FI_EP_RDM\f[].
-.RS
-.RE
+.B \f[I]Endpoint types\f[R]
+The provider supports only \f[I]FI_EP_RDM\f[R].
 .TP
-.B \f[I]Endpoint capabilities\f[]
-The following data transfer interface is supported: \f[I]FI_MSG\f[],
-\f[I]FI_TAGGED\f[], \f[I]FI_RMA\f[], \f[I]FI_ATOMIC\f[].
-.RS
-.RE
+.B \f[I]Endpoint capabilities\f[R]
+The following data transfer interface is supported: \f[I]FI_MSG\f[R],
+\f[I]FI_TAGGED\f[R], \f[I]FI_RMA\f[R], \f[I]FI_ATOMIC\f[R].
 .TP
-.B \f[I]Progress\f[]
-The RxM provider supports both \f[I]FI_PROGRESS_MANUAL\f[] and
-\f[I]FI_PROGRESS_AUTO\f[].
+.B \f[I]Progress\f[R]
+The RxM provider supports both \f[I]FI_PROGRESS_MANUAL\f[R] and
+\f[I]FI_PROGRESS_AUTO\f[R].
 Manual progress in general has better connection scale\-up and lower CPU
-utilization since there\[aq]s no separate auto\-progress thread.
-.RS
-.RE
+utilization since there\[cq]s no separate auto\-progress thread.
 .TP
-.B \f[I]Addressing Formats\f[]
+.B \f[I]Addressing Formats\f[R]
 FI_SOCKADDR, FI_SOCKADDR_IN
-.RS
-.RE
 .TP
-.B \f[I]Memory Region\f[]
+.B \f[I]Memory Region\f[R]
 FI_MR_VIRT_ADDR, FI_MR_ALLOCATED, FI_MR_PROV_KEY MR mode bits would be
 required from the app in case the core provider requires it.
-.RS
-.RE
 .SH LIMITATIONS
 .PP
 When using RxM provider, some limitations from the underlying MSG
@@ -126,86 +117,73 @@ supported.
 .SS Miscellaneous limitations
 .IP \[bu] 2
 RxM protocol peers should have same endian\-ness otherwise connections
-won\[aq]t successfully complete.
+won\[cq]t successfully complete.
 This enables better performance at run\-time as byte order translations
 are avoided.
 .SH RUNTIME PARAMETERS
 .PP
 The ofi_rxm provider checks for the following environment variables.
 .TP
-.B \f[I]FI_OFI_RXM_BUFFER_SIZE\f[]
+.B \f[I]FI_OFI_RXM_BUFFER_SIZE\f[R]
 Defines the transmit buffer size / inject size.
 Messages of size less than this would be transmitted via an eager
 protocol and those above would be transmitted via a rendezvous or SAR
 (Segmentation And Reassembly) protocol.
-Transmit data would be copied up to this size (default: ~16k).
-.RS
-.RE
+Transmit data would be copied up to this size (default: \[ti]16k).
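+.PP
+For example, to raise this threshold to 64 KB (a sketch; the value is
+arbitrary):
+.IP
+.nf
+\f[C]
+#include <stdlib.h>
+
+/* Messages up to 64 KB now take the eager path (copied into bounce
+ * buffers); larger ones use the rendezvous or SAR protocol. */
+setenv("FI_OFI_RXM_BUFFER_SIZE", "65536", 1);
+\f[R]
+.fi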
 .TP
-.B \f[I]FI_OFI_RXM_COMP_PER_PROGRESS\f[]
+.B \f[I]FI_OFI_RXM_COMP_PER_PROGRESS\f[R]
 Defines the maximum number of MSG provider CQ entries (default: 1) that
 would be read per progress (RxM CQ read).
-.RS
-.RE
 .TP
-.B \f[I]FI_OFI_RXM_SAR_LIMIT\f[]
+.B \f[I]FI_OFI_RXM_ENABLE_DYN_RBUF\f[R]
+Enables support for dynamic receive buffering, if supported by the
+message endpoint provider.
+This feature allows direct placement of received message data into
+application buffers, bypassing RxM bounce buffers.
+This feature targets providers that provide internal network buffering,
+such as the tcp provider.
+(default: false)
+.TP
+.B \f[I]FI_OFI_RXM_SAR_LIMIT\f[R]
 Set this environment variable to control the RxM SAR (Segmentation And
 Reassembly) protocol.
 Messages of size greater than this (default: 128 Kb) would be
 transmitted via rendezvous protocol.
-.RS
-.RE
 .TP
-.B \f[I]FI_OFI_RXM_USE_SRX\f[]
+.B \f[I]FI_OFI_RXM_USE_SRX\f[R]
 Set this to 1 to use shared receive context from MSG provider, or 0 to
 disable using shared receive context.
 Shared receive contexts reduce overall memory usage, but may increase
 message latency.
 If not set, verbs will not use shared receive contexts by default, but
 the tcp provider will.
-.RS
-.RE
 .TP
-.B \f[I]FI_OFI_RXM_TX_SIZE\f[]
+.B \f[I]FI_OFI_RXM_TX_SIZE\f[R]
 Defines default TX context size (default: 1024)
-.RS
-.RE
 .TP
-.B \f[I]FI_OFI_RXM_RX_SIZE\f[]
+.B \f[I]FI_OFI_RXM_RX_SIZE\f[R]
 Defines default RX context size (default: 1024)
-.RS
-.RE
 .TP
-.B \f[I]FI_OFI_RXM_MSG_TX_SIZE\f[]
+.B \f[I]FI_OFI_RXM_MSG_TX_SIZE\f[R]
 Defines FI_EP_MSG TX size that would be requested (default: 128).
-.RS
-.RE
 .TP
-.B \f[I]FI_OFI_RXM_MSG_RX_SIZE\f[]
+.B \f[I]FI_OFI_RXM_MSG_RX_SIZE\f[R]
 Defines FI_EP_MSG RX size that would be requested (default: 128).
-.RS
-.RE
 .TP
-.B \f[I]FI_UNIVERSE_SIZE\f[]
+.B \f[I]FI_UNIVERSE_SIZE\f[R]
 Defines the expected number of ranks / peers an endpoint would
 communicate with (default: 256).
-.RS
-.RE
 .TP
-.B \f[I]FI_OFI_RXM_CM_PROGRESS_INTERVAL\f[]
+.B \f[I]FI_OFI_RXM_CM_PROGRESS_INTERVAL\f[R]
 Defines the duration of time in microseconds between calls to RxM CM
 progression functions when using manual progress.
 Higher values may provide less noise for calls to fi_cq read functions,
 but may increase connection setup time (default: 10000)
-.RS
-.RE
 .TP
-.B \f[I]FI_OFI_RXM_CQ_EQ_FAIRNESS\f[]
+.B \f[I]FI_OFI_RXM_CQ_EQ_FAIRNESS\f[R]
 Defines the maximum number of message provider CQ entries that can be
 consecutively read across progress calls without checking to see if the
 CM progress interval has been reached (default: 128)
-.RS
-.RE
 .SH Tuning
 .SS Bandwidth
 .PP
@@ -226,7 +204,8 @@ to only required values.
 .PP
 The data transfer API may return \-FI_EAGAIN during on\-demand
 connection setup of the core provider FI_MSG_EP.
-See \f[C]fi_msg\f[](3) for a detailed description of handling FI_EAGAIN.
+See \f[C]fi_msg\f[R](3) for a detailed description of handling
+FI_EAGAIN.
 .SH Troubleshooting / Known issues
 .PP
 If an RxM endpoint is expected to communicate with more peers than the
@@ -242,6 +221,6 @@ The workaround is to use shared receive contexts for the MSG provider
 (FI_OFI_RXM_MSG_TX_SIZE / FI_OFI_RXM_MSG_RX_SIZE).
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7), \f[C]fi_provider\f[](7), \f[C]fi_getinfo\f[](3)
+\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_shm.7 b/deps/libfabric/man/man7/fi_shm.7
index ed04e5a7b2db61915e01da429f6e6bc2369ed121..43eede7399559d7bd1eb6e27670a6041b555ec27 100644
--- a/deps/libfabric/man/man7/fi_shm.7
+++ b/deps/libfabric/man/man7/fi_shm.7
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_shm" "7" "2020\-04\-17" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_shm" "7" "2021\-04\-20" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -17,29 +17,23 @@ between processes on the same system.
 This release contains an initial implementation of the SHM provider that
 offers the following support:
 .TP
-.B \f[I]Endpoint types\f[]
-The provider supports only endpoint type \f[I]FI_EP_RDM\f[].
-.RS
-.RE
+.B \f[I]Endpoint types\f[R]
+The provider supports only endpoint type \f[I]FI_EP_RDM\f[R].
 .TP
-.B \f[I]Endpoint capabilities\f[]
+.B \f[I]Endpoint capabilities\f[R]
 Endpoints can support any combination of the following data transfer
-capabilities: \f[I]FI_MSG\f[], \f[I]FI_TAGGED\f[], \f[I]FI_RMA\f[], amd
-\f[I]FI_ATOMICS\f[].
-These capabilities can be further defined by \f[I]FI_SEND\f[],
-\f[I]FI_RECV\f[], \f[I]FI_READ\f[], \f[I]FI_WRITE\f[],
-\f[I]FI_REMOTE_READ\f[], and \f[I]FI_REMOTE_WRITE\f[] to limit the
+capabilities: \f[I]FI_MSG\f[R], \f[I]FI_TAGGED\f[R], \f[I]FI_RMA\f[R],
+and \f[I]FI_ATOMICS\f[R].
+These capabilities can be further defined by \f[I]FI_SEND\f[R],
+\f[I]FI_RECV\f[R], \f[I]FI_READ\f[R], \f[I]FI_WRITE\f[R],
+\f[I]FI_REMOTE_READ\f[R], and \f[I]FI_REMOTE_WRITE\f[R] to limit the
 direction of operations.
-.RS
-.RE
 .TP
-.B \f[I]Modes\f[]
+.B \f[I]Modes\f[R]
 The provider does not require the use of any mode bits.
-.RS
-.RE
 .TP
-.B \f[I]Progress\f[]
-The SHM provider supports \f[I]FI_PROGRESS_MANUAL\f[].
+.B \f[I]Progress\f[R]
+The SHM provider supports \f[I]FI_PROGRESS_MANUAL\f[R].
 Receive side data buffers are not modified outside of completion
 processing routines.
 The provider processes messages using three different methods, based on
@@ -48,70 +42,69 @@ For messages smaller than 4096 bytes, tx completions are generated
 immediately after the send.
 For larger messages, tx completions are not generated until the
 receiving side has processed the message.
-.RS
-.RE
 .TP
-.B \f[I]Address Format\f[]
+.B \f[I]Address Format\f[R]
 The SHM provider uses the address format FI_ADDR_STR, which follows the
-general format pattern "[prefix]://[addr]".
+general format pattern \[lq][prefix]://[addr]\[rq].
 The application can provide addresses through the node or hints
 parameter.
 As long as the address is in a valid FI_ADDR_STR format (contains
-"://"), the address will be used as is.
+\[lq]://\[rq]), the address will be used as is.
 If the application input is incorrectly formatted or no input was
 provided, the SHM provider will resolve it according to the following
 SHM provider standards:
-.RS
-.RE
 .PP
 (flags & FI_SOURCE) ?
-src_addr : dest_addr = \- if (node && service) : "fi_ns://node:service"
-\- if (service) : "fi_ns://service" \- if (node && !service) :
-"fi_shm://node" \- if (!node && !service) : "fi_shm://PID"
+src_addr : dest_addr =
+.IP \[bu] 2
+if (node && service) : \[lq]fi_ns://node:service\[rq]
+.IP \[bu] 2
+if (service) : \[lq]fi_ns://service\[rq]
+.IP \[bu] 2
+if (node && !service) : \[lq]fi_shm://node\[rq]
+.IP \[bu] 2
+if (!node && !service) : \[lq]fi_shm://PID\[rq]
 .PP
-!(flags & FI_SOURCE) \- src_addr = "fi_shm://PID"
+!(flags & FI_SOURCE) : src_addr = \[lq]fi_shm://PID\[rq]
 .PP
 In other words, if the application provides a source and/or destination
-address in an acceptable FI_ADDR_STR format (contains "://"), the call
-to util_getinfo will successfully fill in src_addr and dest_addr with
-the provided input.
+address in an acceptable FI_ADDR_STR format (contains \[lq]://\[rq]),
+the call to util_getinfo will successfully fill in src_addr and
+dest_addr with the provided input.
 If the input is not in an ADDR_STR format, the shared memory provider
-will then create a proper FI_ADDR_STR address with either the "fi_ns://"
-(node/service format) or "fi_shm://" (shm format) prefixes signaling
-whether the addr is a "unique" address and does or does not need an
-extra endpoint name identifier appended in order to make it unique.
+will then create a proper FI_ADDR_STR address with either the
+\[lq]fi_ns://\[rq] (node/service format) or \[lq]fi_shm://\[rq] (shm
+format) prefixes signaling whether the addr is a \[lq]unique\[rq]
+address and does or does not need an extra endpoint name identifier
+appended in order to make it unique.
 For the shared memory provider, we assume that the service (with or
 without a node) is enough to make it unique, but a node alone is not
 sufficient.
-If only a node is provided, the "fi_shm://" prefix is used to signify
-that it is not a unique address.
+If only a node is provided, the \[lq]fi_shm://\[rq] prefix is used to
+signify that it is not a unique address.
 If no node or service are provided (and in the case of setting the src
 address without FI_SOURCE and no hints), the process ID will be used as
 a default address.
-On endpoint creation, if the src_addr has the "fi_shm://" prefix, the
-provider will append ":[uid]:[dom_idx]:[ep_idx]" as a unique endpoint
-name (essentially, in place of a service).
-In the case of the "fi_ns://" prefix (or any other prefix if one was
-provided by the application), no supplemental information is required to
-make it unique and it will remain with only the application\-defined
-address.
+On endpoint creation, if the src_addr has the \[lq]fi_shm://\[rq]
+prefix, the provider will append \[lq]:[uid]:[ep_idx]\[rq] as a unique
+endpoint name (essentially, in place of a service).
+In the case of the \[lq]fi_ns://\[rq] prefix (or any other prefix if one
+was provided by the application), no supplemental information is
+required to make it unique and it will remain with only the
+application\-defined address.
 Note that the actual endpoint name will not include the FI_ADDR_STR
-"*://" prefix since it cannot be included in any shared memory region
-names.
+\[dq]*://\[dq] prefix since it cannot be included in any shared memory
+region names.
 The provider will strip off the prefix before setting the endpoint name.
-As a result, the addresses "fi_prefix1://my_node:my_service" and
-"fi_prefix2://my_node:my_service" would result in endpoints and regions
-of the same name.
+As a result, the addresses \[lq]fi_prefix1://my_node:my_service\[rq] and
+\[lq]fi_prefix2://my_node:my_service\[rq] would result in endpoints and
+regions of the same name.
 The application can also override the endpoint name after creating an
 endpoint using setname() without any address format restrictions.
 .PP
-\f[I]Msg flags\f[] The provider currently only supports the
+\f[I]Msg flags\f[R] The provider currently only supports the
 FI_REMOTE_CQ_DATA msg flag.
 .PP
-\f[I]MR registration mode\f[] The provider implements FI_MR_VIRT_ADDR
+\f[I]MR registration mode\f[R] The provider implements FI_MR_VIRT_ADDR
 memory mode.
 .PP
-\f[I]Atomic operations\f[] The provider supports all combinations of
+\f[I]Atomic operations\f[R] The provider supports all combinations of
 datatype and operations as long as the message is less than 4096 bytes
 (or 2048 for compare operations).
 .SH LIMITATIONS
@@ -125,29 +118,27 @@ EPs must be bound to both RX and TX CQs.
 No support for counters.
 .SH RUNTIME PARAMETERS
 .PP
-The \f[I]shm\f[] provider checks for the following environment
+The \f[I]shm\f[R] provider checks for the following environment
 variables:
 .TP
-.B \f[I]FI_SHM_SAR_THRESHOLD\f[]
+.B \f[I]FI_SHM_SAR_THRESHOLD\f[R]
 Maximum message size to use segmentation protocol before switching to
 mmap (only valid when CMA is not available).
 Default: SIZE_MAX (18446744073709551615)
-.RS
-.RE
 .TP
-.B \f[I]FI_SHM_TX_SIZE\f[]
+.B \f[I]FI_SHM_TX_SIZE\f[R]
 Maximum number of outstanding tx operations.
 Default 1024
-.RS
-.RE
 .TP
-.B \f[I]FI_SHM_RX_SIZE\f[]
+.B \f[I]FI_SHM_RX_SIZE\f[R]
 Maximum number of outstanding rx operations.
 Default 1024
-.RS
-.RE
+.TP
+.B \f[I]FI_SHM_DISABLE_CMA\f[R]
+Manually disables CMA.
+Default false
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7), \f[C]fi_provider\f[](7), \f[C]fi_getinfo\f[](3)
+\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_sockets.7 b/deps/libfabric/man/man7/fi_sockets.7
index 68c4cb45ea5771b5bc9275d3253d9a8f86601a39..574e4f5bd5a6694881dfb51247e43199e0ee6b7d 100644
--- a/deps/libfabric/man/man7/fi_sockets.7
+++ b/deps/libfabric/man/man7/fi_sockets.7
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_sockets" "7" "2019\-05\-30" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_sockets" "7" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -26,34 +26,26 @@ The sockets provider supports all the features defined for the libfabric
 API.
 Key features include:
 .TP
-.B \f[I]Endpoint types\f[]
-The provider supports all endpoint types: \f[I]FI_EP_MSG\f[],
-\f[I]FI_EP_RDM\f[], and \f[I]FI_EP_DGRAM\f[].
-.RS
-.RE
+.B \f[I]Endpoint types\f[R]
+The provider supports all endpoint types: \f[I]FI_EP_MSG\f[R],
+\f[I]FI_EP_RDM\f[R], and \f[I]FI_EP_DGRAM\f[R].
 .TP
-.B \f[I]Endpoint capabilities\f[]
+.B \f[I]Endpoint capabilities\f[R]
 The following data transfer interface is supported for all endpoint
-types: \f[I]fi_msg\f[].
+types: \f[I]fi_msg\f[R].
 Additionally, these interfaces are supported for reliable endpoints
-(\f[I]FI_EP_MSG\f[] and \f[I]FI_EP_RDM\f[]): \f[I]fi_tagged\f[],
-\f[I]fi_atomic\f[], and \f[I]fi_rma\f[].
-.RS
-.RE
+(\f[I]FI_EP_MSG\f[R] and \f[I]FI_EP_RDM\f[R]): \f[I]fi_tagged\f[R],
+\f[I]fi_atomic\f[R], and \f[I]fi_rma\f[R].
 .TP
-.B \f[I]Modes\f[]
+.B \f[I]Modes\f[R]
 The sockets provider supports all operational modes including
-\f[I]FI_CONTEXT\f[] and \f[I]FI_MSG_PREFIX\f[].
-.RS
-.RE
+\f[I]FI_CONTEXT\f[R] and \f[I]FI_MSG_PREFIX\f[R].
 .TP
-.B \f[I]Progress\f[]
-Sockets provider supports both \f[I]FI_PROGRESS_AUTO\f[] and
-\f[I]FI_PROGRESS_MANUAL\f[], with a default set to auto.
+.B \f[I]Progress\f[R]
+Sockets provider supports both \f[I]FI_PROGRESS_AUTO\f[R] and
+\f[I]FI_PROGRESS_MANUAL\f[R], with a default set to auto.
 When progress is set to auto, a background thread runs to ensure that
 progress is made for asynchronous requests.
-.RS
-.RE
 .SH LIMITATIONS
 .PP
 Sockets provider attempts to emulate the entire API set, including all
@@ -69,102 +61,73 @@ Does not support FI_ADDR_STR address format.
 .PP
 The sockets provider checks for the following environment variables \-
 .TP
-.B \f[I]FI_SOCKETS_PE_WAITTIME\f[]
+.B \f[I]FI_SOCKETS_PE_WAITTIME\f[R]
 An integer value that specifies how many milliseconds to spin while
-waiting for progress in \f[I]FI_PROGRESS_AUTO\f[] mode.
-.RS
-.RE
+waiting for progress in \f[I]FI_PROGRESS_AUTO\f[R] mode.
 .TP
-.B \f[I]FI_SOCKETS_CONN_TIMEOUT\f[]
+.B \f[I]FI_SOCKETS_CONN_TIMEOUT\f[R]
 An integer value that specifies how many milliseconds to wait for one
 connection establishment.
-.RS
-.RE
 .TP
-.B \f[I]FI_SOCKETS_MAX_CONN_RETRY\f[]
+.B \f[I]FI_SOCKETS_MAX_CONN_RETRY\f[R]
 An integer value that specifies the number of socket connection retries
 before reporting as failure.
-.RS
-.RE
 .TP
-.B \f[I]FI_SOCKETS_DEF_CONN_MAP_SZ\f[]
+.B \f[I]FI_SOCKETS_DEF_CONN_MAP_SZ\f[R]
 An integer to specify the default connection map size.
-.RS
-.RE
 .TP
-.B \f[I]FI_SOCKETS_DEF_AV_SZ\f[]
+.B \f[I]FI_SOCKETS_DEF_AV_SZ\f[R]
 An integer to specify the default address vector size.
-.RS
-.RE
 .TP
-.B \f[I]FI_SOCKETS_DEF_CQ_SZ\f[]
+.B \f[I]FI_SOCKETS_DEF_CQ_SZ\f[R]
 An integer to specify the default completion queue size.
-.RS
-.RE
 .TP
-.B \f[I]FI_SOCKETS_DEF_EQ_SZ\f[]
+.B \f[I]FI_SOCKETS_DEF_EQ_SZ\f[R]
 An integer to specify the default event queue size.
-.RS
-.RE
 .TP
-.B \f[I]FI_SOCKETS_DGRAM_DROP_RATE\f[]
+.B \f[I]FI_SOCKETS_DGRAM_DROP_RATE\f[R]
 An integer value to specify the drop rate of dgram frames when the endpoint
-is \f[I]FI_EP_DGRAM\f[].
+is \f[I]FI_EP_DGRAM\f[R].
 This is for debugging purposes only.
-.RS
-.RE
 .TP
-.B \f[I]FI_SOCKETS_PE_AFFINITY\f[]
+.B \f[I]FI_SOCKETS_PE_AFFINITY\f[R]
 If specified, progress thread is bound to the indicated range(s) of
 Linux virtual processor ID(s).
 This option is currently not supported on OS X.
 The usage is \- id_start[\-id_end[:stride]][,].
-.RS
-.RE
 .TP
-.B \f[I]FI_SOCKETS_KEEPALIVE_ENABLE\f[]
+.B \f[I]FI_SOCKETS_KEEPALIVE_ENABLE\f[R]
 A boolean to enable the keepalive support.
-.RS
-.RE
 .TP
-.B \f[I]FI_SOCKETS_KEEPALIVE_TIME\f[]
+.B \f[I]FI_SOCKETS_KEEPALIVE_TIME\f[R]
 An integer to specify the idle time in seconds before sending the first
 keepalive probe.
-Only relevant if \f[I]FI_SOCKETS_KEEPALIVE_ENABLE\f[] is enabled.
-.RS
-.RE
+Only relevant if \f[I]FI_SOCKETS_KEEPALIVE_ENABLE\f[R] is enabled.
 .TP
-.B \f[I]FI_SOCKETS_KEEPALIVE_INTVL\f[]
+.B \f[I]FI_SOCKETS_KEEPALIVE_INTVL\f[R]
 An integer to specify the time in seconds between individual keepalive
 probes.
-Only relevant if \f[I]FI_SOCKETS_KEEPALIVE_ENABLE\f[] is enabled.
-.RS
-.RE
+Only relevant if \f[I]FI_SOCKETS_KEEPALIVE_ENABLE\f[R] is enabled.
 .TP
-.B \f[I]FI_SOCKETS_KEEPALIVE_PROBES\f[]
+.B \f[I]FI_SOCKETS_KEEPALIVE_PROBES\f[R]
 An integer to specify the maximum number of keepalive probes sent before
 dropping the connection.
-Only relevant if \f[I]FI_SOCKETS_KEEPALIVE_ENABLE\f[] is enabled.
-.RS
-.RE
+Only relevant if \f[I]FI_SOCKETS_KEEPALIVE_ENABLE\f[R] is enabled.
 .TP
-.B \f[I]FI_SOCKETS_IFACE\f[]
+.B \f[I]FI_SOCKETS_IFACE\f[R]
 The prefix or the name of the network interface (default: any)
-.RS
-.RE
 .SH LARGE SCALE JOBS
 .PP
 For large scale runs one can use these environment variables to set the
-default parameters e.g.
-size of the address vector(AV), completion queue (CQ), connection map
-etc.
+default parameters, e.g.\ the size of the address vector (AV), the
+completion queue (CQ), the connection map, etc.,
 that satisfy the requirements of the particular benchmark.
 The recommended parameters for large scale runs are
-\f[I]FI_SOCKETS_MAX_CONN_RETRY\f[], \f[I]FI_SOCKETS_DEF_CONN_MAP_SZ\f[],
-\f[I]FI_SOCKETS_DEF_AV_SZ\f[], \f[I]FI_SOCKETS_DEF_CQ_SZ\f[],
-\f[I]FI_SOCKETS_DEF_EQ_SZ\f[].
+\f[I]FI_SOCKETS_MAX_CONN_RETRY\f[R],
+\f[I]FI_SOCKETS_DEF_CONN_MAP_SZ\f[R], \f[I]FI_SOCKETS_DEF_AV_SZ\f[R],
+\f[I]FI_SOCKETS_DEF_CQ_SZ\f[R], \f[I]FI_SOCKETS_DEF_EQ_SZ\f[R].
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7), \f[C]fi_provider\f[](7), \f[C]fi_getinfo\f[](3)
+\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_tcp.7 b/deps/libfabric/man/man7/fi_tcp.7
index cc8b07285bc33a7229125f13dd5f0ae1adeb7d55..b1d60b3e027cb892467430eed2980e27ef3718de 100644
--- a/deps/libfabric/man/man7/fi_tcp.7
+++ b/deps/libfabric/man/man7/fi_tcp.7
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_tcp" "7" "2020\-04\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_tcp" "7" "2021\-05\-20" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -16,52 +16,42 @@ high\-performance fabric hardware.
 .PP
 The following features are supported
 .TP
-.B \f[I]Endpoint types\f[]
-\f[I]FI_EP_MSG\f[] is the only supported endpoint type.
+.B \f[I]Endpoint types\f[R]
+\f[I]FI_EP_MSG\f[R] is the only supported endpoint type.
 Reliable datagram endpoint over TCP sockets can be achieved by layering
 RxM over tcp provider.
-.RS
-.RE
-\f[I]FI_EP_RDM\f[] is supported by layering ofi_rxm provider on top of
+\f[I]FI_EP_RDM\f[R] is supported by layering ofi_rxm provider on top of
 the tcp provider.
-.RS
-.RE
 .TP
-.B \f[I]Endpoint capabilities\f[]
-The tcp provider currently supports \f[I]FI_MSG\f[], \f[I]FI_RMA\f[]
-.RS
-.RE
+.B \f[I]Endpoint capabilities\f[R]
+The tcp provider currently supports \f[I]FI_MSG\f[R], \f[I]FI_RMA\f[R]
 .TP
-.B \f[I]Progress\f[]
-Currently tcp provider supports only \f[I]FI_PROGRESS_MANUAL\f[]
-.RS
-.RE
+.B \f[I]Progress\f[R]
+Currently tcp provider supports only \f[I]FI_PROGRESS_MANUAL\f[R]
 .TP
-.B \f[I]Shared Rx Context\f[]
+.B \f[I]Shared Rx Context\f[R]
 The tcp provider supports shared receive context
-.RS
-.RE
 .TP
-.B \f[I]Multi recv buffers\f[]
+.B \f[I]Multi recv buffers\f[R]
 The tcp provider supports multi recv buffers
-.RS
-.RE
 .SH RUNTIME PARAMETERS
 .PP
 The tcp provider checks for the following environment variables \-
 .TP
-.B \f[I]FI_TCP_IFACE\f[]
+.B \f[I]FI_TCP_IFACE\f[R]
 A specific network interface can be requested with this variable.
-.RS
-.RE
 .TP
-.B \f[I]FI_TCP_PORT_LOW_RANGE/FI_TCP_PORT_HIGH_RANGE\f[]
+.B \f[I]FI_TCP_PORT_LOW_RANGE/FI_TCP_PORT_HIGH_RANGE\f[R]
 These variables are used to set the range of ports to be used by the tcp
 provider for its passive endpoint creation.
 This is useful where only a range of ports is allowed by the firewall
 for tcp connections.
-.RS
-.RE
+.TP
+.B \f[I]FI_TCP_TX_SIZE\f[R]
+Default tx context size (default: 256)
+.TP
+.B \f[I]FI_TCP_RX_SIZE\f[R]
+Default rx context size (default: 256)
 .SH LIMITATIONS
 .PP
 The tcp provider is implemented over TCP sockets to emulate libfabric
@@ -71,6 +61,6 @@ implementing to sockets directly, depending on the types of data
 transfers the application is trying to achieve.
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7), \f[C]fi_provider\f[](7), \f[C]fi_getinfo\f[](3)
+\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_udp.7 b/deps/libfabric/man/man7/fi_udp.7
index 5da716a7b85ba4848fbf7d04a09ab8422204270a..d0a8fca421c842064383f32925ba3d7d029b4e66 100644
--- a/deps/libfabric/man/man7/fi_udp.7
+++ b/deps/libfabric/man/man7/fi_udp.7
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_udp" "7" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_udp" "7" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -19,30 +19,22 @@ the implementation of libfabric features over any hardware.
 The UDP provider supports a minimal set of features useful for sending
 and receiving datagram messages over an unreliable endpoint.
 .TP
-.B \f[I]Endpoint types\f[]
-The provider supports only endpoint type \f[I]FI_EP_DGRAM\f[].
-.RS
-.RE
+.B \f[I]Endpoint types\f[R]
+The provider supports only endpoint type \f[I]FI_EP_DGRAM\f[R].
 .TP
-.B \f[I]Endpoint capabilities\f[]
-The following data transfer interface is supported: \f[I]fi_msg\f[].
+.B \f[I]Endpoint capabilities\f[R]
+The following data transfer interface is supported: \f[I]fi_msg\f[R].
 The provider supports standard unicast datagram transfers, as well as
 multicast operations.
-.RS
-.RE
 .TP
-.B \f[I]Modes\f[]
+.B \f[I]Modes\f[R]
 The provider does not require the use of any mode bits.
-.RS
-.RE
 .TP
-.B \f[I]Progress\f[]
-The UDP provider supports both \f[I]FI_PROGRESS_AUTO\f[] and
-\f[I]FI_PROGRESS_MANUAL\f[], with a default set to auto.
+.B \f[I]Progress\f[R]
+The UDP provider supports both \f[I]FI_PROGRESS_AUTO\f[R] and
+\f[I]FI_PROGRESS_MANUAL\f[R], with a default set to auto.
 However, receive side data buffers are not modified outside of
 completion processing routines.
-.RS
-.RE
 .SH LIMITATIONS
 .PP
 The UDP provider has hard\-coded maximums for supported queue sizes and
@@ -59,6 +51,6 @@ No support for counters.
 No runtime parameters are currently defined.
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7), \f[C]fi_provider\f[](7), \f[C]fi_getinfo\f[](3)
+\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), \f[C]fi_getinfo\f[R](3)
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_usnic.7 b/deps/libfabric/man/man7/fi_usnic.7
index c75126f9c19b303d9a99f094d945df5f3ce35bb1..ea1fdefdba91ac8acc8e49203bab4700165b3094 100644
--- a/deps/libfabric/man/man7/fi_usnic.7
+++ b/deps/libfabric/man/man7/fi_usnic.7
@@ -1,21 +1,21 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_usnic" "7" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_usnic" "7" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
 fi_usnic \- The usNIC Fabric Provider
 .SH OVERVIEW
 .PP
-The \f[I]usnic\f[] provider is designed to run over the Cisco VIC
+The \f[I]usnic\f[R] provider is designed to run over the Cisco VIC
 (virtualized NIC) hardware on Cisco UCS servers.
 It utilizes the Cisco usNIC (userspace NIC) capabilities of the VIC to
 enable ultra low latency and other offload capabilities on Ethernet
 networks.
 .SH RELEASE NOTES
 .IP \[bu] 2
-The \f[I]usnic\f[] libfabric provider requires the use of the "libnl"
-library.
+The \f[I]usnic\f[R] libfabric provider requires the use of the
+\[lq]libnl\[rq] library.
 .RS 2
 .IP \[bu] 2
 There are two versions of libnl generally available: v1 and v3; the
@@ -24,12 +24,12 @@ usnic provider can use either version.
 If you are building libfabric/the usnic provider from source, you will
 need to have the libnl header files available (e.g., if you are
 installing libnl from RPM or other packaging system, install the
-"\-devel" versions of the package).
+\[lq]\-devel\[rq] versions of the package).
 .IP \[bu] 2
 If you have libnl (either v1 or v3) installed in a non\-standard
 location (e.g., not in /usr/lib or /usr/lib64), you may need to tell
-libfabric\[aq]s configure where to find libnl via the
-\f[C]\-\-with\-libnl=DIR\f[] command line option (where DIR is the
+libfabric\[cq]s configure where to find libnl via the
+\f[C]\-\-with\-libnl=DIR\f[R] command line option (where DIR is the
 installation prefix of the libnl package).
 .RE
 .IP \[bu] 2
@@ -37,51 +37,51 @@ The most common way to use the libfabric usnic provider is via an MPI
 implementation that uses libfabric (and the usnic provider) as a lower
 layer transport.
 MPI applications do not need to know anything about libfabric or usnic
-in this use case \-\- the MPI implementation hides all these details
+in this use case \[en] the MPI implementation hides all these details
 from the application.
 .IP \[bu] 2
 If you are writing applications directly to the libfabric API:
 .RS 2
 .IP \[bu] 2
-\f[I]FI_EP_DGRAM\f[] endpoints are the best supported method of
+\f[I]FI_EP_DGRAM\f[R] endpoints are the best supported method of
 utilizing the usNIC interface.
-Specifically, the \f[I]FI_EP_DGRAM\f[] endpoint type has been
-extensively tested as the underlying layer for Open MPI\[aq]s
-\f[I]usnic\f[] BTL.
+Specifically, the \f[I]FI_EP_DGRAM\f[R] endpoint type has been
+extensively tested as the underlying layer for Open MPI\[cq]s
+\f[I]usnic\f[R] BTL.
 .IP \[bu] 2
-\f[I]FI_EP_MSG\f[] and \f[I]FI_EP_RDM\f[] endpoints are implemented, but
-are only lightly tested.
+\f[I]FI_EP_MSG\f[R] and \f[I]FI_EP_RDM\f[R] endpoints are implemented,
+but are only lightly tested.
 It is likely that there are still some bugs in these endpoint types.
 In particular, there are known bugs in RDM support in the presence of
 congestion or packet loss (issue 1621).
 RMA is not yet supported.
 .IP \[bu] 2
-\f[C]fi_provider\f[](7) lists requirements for all providers.
-The following limitations exist in the \f[I]usnic\f[] provider:
+\f[C]fi_provider\f[R](7) lists requirements for all providers.
+The following limitations exist in the \f[I]usnic\f[R] provider:
 .RS 2
 .IP \[bu] 2
-multicast operations are not supported on \f[I]FI_EP_DGRAM\f[] and
-\f[I]FI_EP_RDM\f[] endpoints.
+multicast operations are not supported on \f[I]FI_EP_DGRAM\f[R] and
+\f[I]FI_EP_RDM\f[R] endpoints.
 .IP \[bu] 2
-\f[I]FI_EP_MSG\f[] endpoints only support connect, accept, and getname
+\f[I]FI_EP_MSG\f[R] endpoints only support connect, accept, and getname
 CM operations.
 .IP \[bu] 2
 Passive endpoints only support listen, setname, and getname CM
 operations.
 .IP \[bu] 2
-\f[I]FI_EP_DGRAM\f[] endpoints support \f[C]fi_sendmsg()\f[] and
-\f[C]fi_recvmsg()\f[], but some flags are ignored.
-\f[C]fi_sendmsg()\f[] supports \f[C]FI_INJECT\f[] and
-\f[C]FI_COMPLETION\f[].
-\f[C]fi_recvmsg()\f[] supports \f[C]FI_MORE\f[].
+\f[I]FI_EP_DGRAM\f[R] endpoints support \f[C]fi_sendmsg()\f[R] and
+\f[C]fi_recvmsg()\f[R], but some flags are ignored.
+\f[C]fi_sendmsg()\f[R] supports \f[C]FI_INJECT\f[R] and
+\f[C]FI_COMPLETION\f[R].
+\f[C]fi_recvmsg()\f[R] supports \f[C]FI_MORE\f[R].
 .IP \[bu] 2
-Address vectors only support \f[C]FI_AV_MAP\f[].
+Address vectors only support \f[C]FI_AV_MAP\f[R].
 .IP \[bu] 2
 No counters are supported.
 .IP \[bu] 2
 The tag matching interface is not supported.
 .IP \[bu] 2
-\f[I]FI_MSG_PREFIX\f[] is only supported on \f[I]FI_EP_DGRAM\f[] and
+\f[I]FI_MSG_PREFIX\f[R] is only supported on \f[I]FI_EP_DGRAM\f[R] and
 usage is limited to releases 1.1 and beyond.
 .IP \[bu] 2
 fi_control with FI_GETWAIT may only be used on CQs that have been bound
@@ -104,7 +104,7 @@ The application is responsible for resource protection.
 .IP \[bu] 2
 The usnic libfabric provider supports extensions that provide
 information and functionality beyond the standard libfabric interface.
-See the "USNIC EXTENSIONS" section, below.
+See the \[lq]USNIC EXTENSIONS\[rq] section, below.
 .RE
 .SH USNIC EXTENSIONS
 .PP
@@ -112,24 +112,26 @@ The usnic libfabric provider exports extensions for additional VIC,
 usNIC, and Ethernet capabilities not provided by the standard libfabric
 interface.
 .PP
-These extensions are available via the "fi_ext_usnic.h" header file.
+These extensions are available via the \[lq]fi_ext_usnic.h\[rq] header
+file.
 .SS Fabric Extension: getinfo
 .PP
-Version 2 of the "fabric getinfo" extension was introduced in Libfabric
-release v1.3.0 and can be used to retrieve IP and SR\-IOV information
-about a usNIC device obtained from the \f[C]fi_getinfo\f[](3) function.
+Version 2 of the \[lq]fabric getinfo\[rq] extension was introduced in
+Libfabric release v1.3.0 and can be used to retrieve IP and SR\-IOV
+information about a usNIC device obtained from the
+\f[C]fi_getinfo\f[R](3) function.
 .PP
-The "fabric getinfo" extension is obtained by calling
-\f[C]fi_open_ops\f[] and requesting \f[C]FI_USNIC_FABRIC_OPS_1\f[] to
+The \[lq]fabric getinfo\[rq] extension is obtained by calling
+\f[C]fi_open_ops\f[R] and requesting \f[C]FI_USNIC_FABRIC_OPS_1\f[R] to
 get the usNIC fabric extension operations.
-The \f[C]getinfo\f[] function accepts a version parameter that can be
+The \f[C]getinfo\f[R] function accepts a version parameter that can be
 used to select different versions of the extension.
-The information returned by the "fabric getinfo" extension is accessible
-through a \f[C]fi_usnic_info\f[] struct that uses a version tagged
-union.
+The information returned by the \[lq]fabric getinfo\[rq] extension is
+accessible through a \f[C]fi_usnic_info\f[R] struct that uses a version
+tagged union.
 The accessed union member must correspond with the requested version.
 It is recommended that applications explicitly request a version rather
-than using the header provided \f[C]FI_EXT_USNIC_INFO_VERSION\f[].
+than using the header provided \f[C]FI_EXT_USNIC_INFO_VERSION\f[R].
 Although there is a version 1 of the extension, its use is discouraged,
 and it may not be available in future releases.
 .SS Compatibility issues
@@ -149,213 +151,201 @@ patched version of an older release.
 .IP
 .nf
 \f[C]
-#include\ <rdma/fi_ext_usnic.h>
+#include <rdma/fi_ext_usnic.h>
 
-struct\ fi_usnic_info\ {
-\ \ \ \ uint32_t\ ui_version;
-\ \ \ \ uint8_t\ ui_pad0[4];
-\ \ \ \ union\ {
-\ \ \ \ \ \ \ \ struct\ fi_usnic_info_v1\ v1;
-\ \ \ \ \ \ \ \ struct\ fi_usnic_info_v2\ v2;
-\ \ \ \ }\ ui;
-}\ __attribute__((packed));
+struct fi_usnic_info {
+    uint32_t ui_version;
+    uint8_t ui_pad0[4];
+    union {
+        struct fi_usnic_info_v1 v1;
+        struct fi_usnic_info_v2 v2;
+    } ui;
+} __attribute__((packed));
 
-int\ getinfo(uint32_t\ version,\ struct\ fid_fabric\ *fabric,
-\ \ \ \ \ \ \ \ struct\ fi_usnic_info\ *info);
-\f[]
+int getinfo(uint32_t version, struct fid_fabric *fabric,
+        struct fi_usnic_info *info);
+\f[R]
 .fi
 .TP
-.B \f[I]version\f[]
+.B \f[I]version\f[R]
 Version of getinfo to be used
-.RS
-.RE
 .TP
-.B \f[I]fabric\f[]
+.B \f[I]fabric\f[R]
 Fabric descriptor
-.RS
-.RE
 .TP
-.B \f[I]info\f[]
+.B \f[I]info\f[R]
 Upon successful return, this parameter will contain information about
 the fabric.
-.RS
-.RE
 .IP \[bu] 2
 Version 2
 .IP
 .nf
 \f[C]
-struct\ fi_usnic_cap\ {
-\ \ \ \ const\ char\ *uc_capability;
-\ \ \ \ int\ uc_present;
-}\ __attribute__((packed));
+struct fi_usnic_cap {
+    const char *uc_capability;
+    int uc_present;
+} __attribute__((packed));
 
-struct\ fi_usnic_info_v2\ {
-\ \ \ \ uint32_t\ \ \ \ \ \ \ \ ui_link_speed;
-\ \ \ \ uint32_t\ \ \ \ \ \ \ \ ui_netmask_be;
-\ \ \ \ char\ \ \ \ \ \ \ \ \ \ \ \ ui_ifname[IFNAMSIZ];
-\ \ \ \ unsigned\ \ \ \ \ \ \ \ ui_num_vf;
-\ \ \ \ unsigned\ \ \ \ \ \ \ \ ui_qp_per_vf;
-\ \ \ \ unsigned\ \ \ \ \ \ \ \ ui_cq_per_vf;
+struct fi_usnic_info_v2 {
+    uint32_t        ui_link_speed;
+    uint32_t        ui_netmask_be;
+    char            ui_ifname[IFNAMSIZ];
+    unsigned        ui_num_vf;
+    unsigned        ui_qp_per_vf;
+    unsigned        ui_cq_per_vf;
 
-\ \ \ \ char\ \ \ \ \ \ \ \ \ \ \ \ ui_devname[FI_EXT_USNIC_MAX_DEVNAME];
-\ \ \ \ uint8_t\ \ \ \ \ \ \ \ \ ui_mac_addr[6];
+    char            ui_devname[FI_EXT_USNIC_MAX_DEVNAME];
+    uint8_t         ui_mac_addr[6];
 
-\ \ \ \ uint8_t\ \ \ \ \ \ \ \ \ ui_pad0[2];
+    uint8_t         ui_pad0[2];
 
-\ \ \ \ uint32_t\ \ \ \ \ \ \ \ ui_ipaddr_be;
-\ \ \ \ uint32_t\ \ \ \ \ \ \ \ ui_prefixlen;
-\ \ \ \ uint32_t\ \ \ \ \ \ \ \ ui_mtu;
-\ \ \ \ uint8_t\ \ \ \ \ \ \ \ \ ui_link_up;
+    uint32_t        ui_ipaddr_be;
+    uint32_t        ui_prefixlen;
+    uint32_t        ui_mtu;
+    uint8_t         ui_link_up;
 
-\ \ \ \ uint8_t\ \ \ \ \ \ \ \ \ ui_pad1[3];
+    uint8_t         ui_pad1[3];
 
-\ \ \ \ uint32_t\ \ \ \ \ \ \ \ ui_vendor_id;
-\ \ \ \ uint32_t\ \ \ \ \ \ \ \ ui_vendor_part_id;
-\ \ \ \ uint32_t\ \ \ \ \ \ \ \ ui_device_id;
-\ \ \ \ char\ \ \ \ \ \ \ \ \ \ \ \ ui_firmware[64];
+    uint32_t        ui_vendor_id;
+    uint32_t        ui_vendor_part_id;
+    uint32_t        ui_device_id;
+    char            ui_firmware[64];
 
-\ \ \ \ unsigned\ \ \ \ \ \ \ \ ui_intr_per_vf;
-\ \ \ \ unsigned\ \ \ \ \ \ \ \ ui_max_cq;
-\ \ \ \ unsigned\ \ \ \ \ \ \ \ ui_max_qp;
+    unsigned        ui_intr_per_vf;
+    unsigned        ui_max_cq;
+    unsigned        ui_max_qp;
 
-\ \ \ \ unsigned\ \ \ \ \ \ \ \ ui_max_cqe;
-\ \ \ \ unsigned\ \ \ \ \ \ \ \ ui_max_send_credits;
-\ \ \ \ unsigned\ \ \ \ \ \ \ \ ui_max_recv_credits;
+    unsigned        ui_max_cqe;
+    unsigned        ui_max_send_credits;
+    unsigned        ui_max_recv_credits;
 
-\ \ \ \ const\ char\ \ \ \ \ \ *ui_nicname;
-\ \ \ \ const\ char\ \ \ \ \ \ *ui_pid;
+    const char      *ui_nicname;
+    const char      *ui_pid;
 
-\ \ \ \ struct\ fi_usnic_cap\ **ui_caps;
-}\ __attribute__((packed));
-\f[]
+    struct fi_usnic_cap **ui_caps;
+} __attribute__((packed));
+\f[R]
 .fi
 .IP \[bu] 2
 Version 1
 .IP
 .nf
 \f[C]
-struct\ fi_usnic_info_v1\ {
-\ \ \ \ uint32_t\ ui_link_speed;
-\ \ \ \ uint32_t\ ui_netmask_be;
-\ \ \ \ char\ ui_ifname[IFNAMSIZ];
+struct fi_usnic_info_v1 {
+    uint32_t ui_link_speed;
+    uint32_t ui_netmask_be;
+    char ui_ifname[IFNAMSIZ];
 
-\ \ \ \ uint32_t\ ui_num_vf;
-\ \ \ \ uint32_t\ ui_qp_per_vf;
-\ \ \ \ uint32_t\ ui_cq_per_vf;
-}\ __attribute__((packed));
-\f[]
+    uint32_t ui_num_vf;
+    uint32_t ui_qp_per_vf;
+    uint32_t ui_cq_per_vf;
+} __attribute__((packed));
+\f[R]
 .fi
 .PP
-Version 1 of the "fabric getinfo" extension can be used by explicitly
-requesting it in the call to \f[C]getinfo\f[] and accessing the
-\f[C]v1\f[] portion of the \f[C]fi_usnic_info.ui\f[] union.
+Version 1 of the \[lq]fabric getinfo\[rq] extension can be used by
+explicitly requesting it in the call to \f[C]getinfo\f[R] and accessing
+the \f[C]v1\f[R] portion of the \f[C]fi_usnic_info.ui\f[R] union.
 Use of version 1 is not recommended and it may be removed from future
 releases.
 .PP
 The following is an example of how to utilize version 2 of the usnic
-"fabric getinfo" extension.
+\[lq]fabric getinfo\[rq] extension.
 .IP
 .nf
 \f[C]
-#include\ <stdio.h>
-#include\ <rdma/fabric.h>
+#include <stdio.h>
+#include <rdma/fabric.h>
 
-/*\ The\ usNIC\ extensions\ are\ all\ in\ the
-\ \ \ rdma/fi_ext_usnic.h\ header\ */
-#include\ <rdma/fi_ext_usnic.h>
+/* The usNIC extensions are all in the
+   rdma/fi_ext_usnic.h header */
+#include <rdma/fi_ext_usnic.h>
 
-int\ main(int\ argc,\ char\ *argv[])\ {
-\ \ \ \ struct\ fi_info\ *info;
-\ \ \ \ struct\ fi_info\ *info_list;
-\ \ \ \ struct\ fi_info\ hints\ =\ {0};
-\ \ \ \ struct\ fi_ep_attr\ ep_attr\ =\ {0};
-\ \ \ \ struct\ fi_fabric_attr\ fabric_attr\ =\ {0};
+int main(int argc, char *argv[]) {
+    struct fi_info *info;
+    struct fi_info *info_list;
+    struct fi_info hints = {0};
+    struct fi_ep_attr ep_attr = {0};
+    struct fi_fabric_attr fabric_attr = {0};
 
-\ \ \ \ fabric_attr.prov_name\ =\ "usnic";
-\ \ \ \ ep_attr.type\ =\ FI_EP_DGRAM;
+    fabric_attr.prov_name = \[dq]usnic\[dq];
+    ep_attr.type = FI_EP_DGRAM;
 
-\ \ \ \ hints.caps\ =\ FI_MSG;
-\ \ \ \ hints.mode\ =\ FI_LOCAL_MR\ |\ FI_MSG_PREFIX;
-\ \ \ \ hints.addr_format\ =\ FI_SOCKADDR;
-\ \ \ \ hints.ep_attr\ =\ &ep_attr;
-\ \ \ \ hints.fabric_attr\ =\ &fabric_attr;
+    hints.caps = FI_MSG;
+    hints.mode = FI_LOCAL_MR | FI_MSG_PREFIX;
+    hints.addr_format = FI_SOCKADDR;
+    hints.ep_attr = &ep_attr;
+    hints.fabric_attr = &fabric_attr;
 
-\ \ \ \ /*\ Find\ all\ usnic\ providers\ */
-\ \ \ \ fi_getinfo(FI_VERSION(1,\ 0),\ NULL,\ 0,\ 0,\ &hints,\ &info_list);
+    /* Find all usnic providers */
+    fi_getinfo(FI_VERSION(1, 0), NULL, 0, 0, &hints, &info_list);
 
-\ \ \ \ for\ (info\ =\ info_list;\ NULL\ !=\ info;\ info\ =\ info\->next)\ {
-\ \ \ \ \ \ \ \ /*\ Open\ the\ fabric\ on\ the\ interface\ */
-\ \ \ \ \ \ \ \ struct\ fid_fabric\ *fabric;
-\ \ \ \ \ \ \ \ fi_fabric(info\->fabric_attr,\ &fabric,\ NULL);
+    for (info = info_list; NULL != info; info = info\->next) {
+        /* Open the fabric on the interface */
+        struct fid_fabric *fabric;
+        fi_fabric(info\->fabric_attr, &fabric, NULL);
 
-\ \ \ \ \ \ \ \ /*\ Pass\ FI_USNIC_FABRIC_OPS_1\ to\ get\ usnic\ ops
-\ \ \ \ \ \ \ \ \ \ \ on\ the\ fabric\ */
-\ \ \ \ \ \ \ \ struct\ fi_usnic_ops_fabric\ *usnic_fabric_ops;
-\ \ \ \ \ \ \ \ fi_open_ops(&fabric\->fid,\ FI_USNIC_FABRIC_OPS_1,\ 0,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ (void\ **)\ &usnic_fabric_ops,\ NULL);
+        /* Pass FI_USNIC_FABRIC_OPS_1 to get usnic ops
+           on the fabric */
+        struct fi_usnic_ops_fabric *usnic_fabric_ops;
+        fi_open_ops(&fabric\->fid, FI_USNIC_FABRIC_OPS_1, 0,
+                (void **) &usnic_fabric_ops, NULL);
 
-\ \ \ \ \ \ \ \ /*\ Now\ use\ the\ returned\ usnic\ ops\ structure\ to\ call
-\ \ \ \ \ \ \ \ \ \ \ usnic\ extensions.\ \ The\ following\ extension\ queries
-\ \ \ \ \ \ \ \ \ \ \ some\ IP\ and\ SR\-IOV\ characteristics\ about\ the
-\ \ \ \ \ \ \ \ \ \ \ usNIC\ device.\ */
-\ \ \ \ \ \ \ \ struct\ fi_usnic_info\ usnic_info;
+        /* Now use the returned usnic ops structure to call
+           usnic extensions.  The following extension queries
+           some IP and SR\-IOV characteristics about the
+           usNIC device. */
+        struct fi_usnic_info usnic_info;
 
-\ \ \ \ \ \ \ \ /*\ Explicitly\ request\ version\ 2.\ */
-\ \ \ \ \ \ \ \ usnic_fabric_ops\->getinfo(2,\ fabric,\ &usnic_info);
+        /* Explicitly request version 2. */
+        usnic_fabric_ops\->getinfo(2, fabric, &usnic_info);
 
-\ \ \ \ \ \ \ \ printf("Fabric\ interface\ %s\ is\ %s:\\n"
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ "\\tNetmask:\ \ 0x%08x\\n\\tLink\ speed:\ %d\\n"
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ "\\tSR\-IOV\ VFs:\ %d\\n\\tQPs\ per\ SR\-IOV\ VF:\ %d\\n"
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ "\\tCQs\ per\ SR\-IOV\ VF:\ %d\\n",
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ info\->fabric_attr\->name,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ usnic_info.ui.v2.ui_ifname,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ usnic_info.ui.v2.ui_netmask_be,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ usnic_info.ui.v2.ui_link_speed,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ usnic_info.ui.v2.ui_num_vf,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ usnic_info.ui.v2.ui_qp_per_vf,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ usnic_info.ui.v2.ui_cq_per_vf);
+        printf(\[dq]Fabric interface %s is %s:\[rs]n\[dq]
+               \[dq]\[rs]tNetmask:  0x%08x\[rs]n\[rs]tLink speed: %d\[rs]n\[dq]
+               \[dq]\[rs]tSR\-IOV VFs: %d\[rs]n\[rs]tQPs per SR\-IOV VF: %d\[rs]n\[dq]
+               \[dq]\[rs]tCQs per SR\-IOV VF: %d\[rs]n\[dq],
+               info\->fabric_attr\->name,
+               usnic_info.ui.v2.ui_ifname,
+               usnic_info.ui.v2.ui_netmask_be,
+               usnic_info.ui.v2.ui_link_speed,
+               usnic_info.ui.v2.ui_num_vf,
+               usnic_info.ui.v2.ui_qp_per_vf,
+               usnic_info.ui.v2.ui_cq_per_vf);
 
-\ \ \ \ \ \ \ \ fi_close(&fabric\->fid);
-\ \ \ \ }
+        fi_close(&fabric\->fid);
+    }
 
-\ \ \ \ fi_freeinfo(info_list);
-\ \ \ \ return\ 0;
+    fi_freeinfo(info_list);
+    return 0;
 }
-\f[]
+\f[R]
 .fi
 .SS Address Vector Extension: get_distance
 .PP
-The "address vector get_distance" extension was introduced in Libfabric
-release v1.0.0 and can be used to retrieve the network distance of an
-address.
+The \[lq]address vector get_distance\[rq] extension was introduced in
+Libfabric release v1.0.0 and can be used to retrieve the network
+distance of an address.
 .PP
-The "get_distance" extension is obtained by calling \f[C]fi_open_ops\f[]
-and requesting \f[C]FI_USNIC_AV_OPS_1\f[] to get the usNIC address
-vector extension operations.
+The \[lq]get_distance\[rq] extension is obtained by calling
+\f[C]fi_open_ops\f[R] and requesting \f[C]FI_USNIC_AV_OPS_1\f[R] to get
+the usNIC address vector extension operations.
 .IP
 .nf
 \f[C]
-int\ get_distance(struct\ fid_av\ *av,\ void\ *addr,\ int\ *metric);
-\f[]
+int get_distance(struct fid_av *av, void *addr, int *metric);
+\f[R]
 .fi
 .TP
-.B \f[I]av\f[]
+.B \f[I]av\f[R]
 Address vector
-.RS
-.RE
 .TP
-.B \f[I]addr\f[]
+.B \f[I]addr\f[R]
 Destination address
-.RS
-.RE
 .TP
-.B \f[I]metric\f[]
-On output this will contain \f[C]\-1\f[] if the destination host is
-unreachable, \f[C]0\f[] is the destination host is locally connected,
-and \f[C]1\f[] otherwise.
-.RS
-.RE
+.B \f[I]metric\f[R]
+On output this will contain \f[C]\-1\f[R] if the destination host is
+unreachable, \f[C]0\f[R] if the destination host is locally connected,
+and \f[C]1\f[R] otherwise.
 .PP
 See fi_ext_usnic.h for more details.
 .SH VERSION DIFFERENCES
@@ -365,28 +355,28 @@ The release of libfabric v1.4 introduced a new naming convention for
 fabric and domain.
 However the usNIC provider remains backward compatible with applications
 supporting the old scheme and decides which one to use based on the
-version passed to \f[C]fi_getinfo\f[]:
+version passed to \f[C]fi_getinfo\f[R]:
 .IP \[bu] 2
-When \f[C]FI_VERSION(1,4)\f[] or higher is used:
+When \f[C]FI_VERSION(1,4)\f[R] or higher is used:
 .RS 2
 .IP \[bu] 2
 fabric name is the network address with the CIDR notation (i.e.,
-\f[C]a.b.c.d/e\f[])
+\f[C]a.b.c.d/e\f[R])
 .IP \[bu] 2
-domain name is the usNIC Linux interface name (i.e., \f[C]usnic_X\f[])
+domain name is the usNIC Linux interface name (i.e., \f[C]usnic_X\f[R])
 .RE
 .IP \[bu] 2
-When a lower version number is used, like \f[C]FI_VERSION(1,\ 3)\f[], it
+When a lower version number is used, like \f[C]FI_VERSION(1, 3)\f[R], it
 follows the same behavior the usNIC provider exhibited in libfabric <=
 v1.3:
 .RS 2
 .IP \[bu] 2
-fabric name is the usNIC Linux interface name (i.e., \f[C]usnic_X\f[])
+fabric name is the usNIC Linux interface name (i.e., \f[C]usnic_X\f[R])
 .IP \[bu] 2
-domain name is \f[C]NULL\f[]
+domain name is \f[C]NULL\f[R]
 .RE
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7), \f[C]fi_open_ops\f[](3), \f[C]fi_provider\f[](7),
+\f[C]fabric\f[R](7), \f[C]fi_open_ops\f[R](3), \f[C]fi_provider\f[R](7),
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/man/man7/fi_verbs.7 b/deps/libfabric/man/man7/fi_verbs.7
index c40759bfb25935aea12155528b5b93ade30d5aa6..91954a319b708d4c52af7ded72374a551871abff 100644
--- a/deps/libfabric/man/man7/fi_verbs.7
+++ b/deps/libfabric/man/man7/fi_verbs.7
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "fi_verbs" "7" "2020\-04\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_verbs" "7" "2021\-03\-22" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -21,8 +21,7 @@ librdmacm * librdmacm\-devel
 .PP
 You may also want to look into any OS specific instructions for enabling
 RDMA.
-e.g.
-RHEL has instructions on their documentation for enabling RDMA.
+e.g.\ RHEL has instructions in its documentation for enabling RDMA.
 .PP
 The IPoIB interface should be configured with a valid IP address.
 This is a requirement from librdmacm.
@@ -35,8 +34,8 @@ FI_EP_MSG, FI_EP_DGRAM (beta), FI_EP_RDM.
 .PP
 FI_EP_RDM is supported via OFI RxM and RxD utility providers which are
 layered on top of verbs.
-To the app, the provider name string would appear as "verbs;ofi_rxm" or
-"verbs;ofi_rxd".
+To the app, the provider name string would appear as
+\[lq]verbs;ofi_rxm\[rq] or \[lq]verbs;ofi_rxd\[rq].
 Please refer the man pages for RxM (fi_rxm.7) and RxD (fi_rxd.7) to know
 about the capabilities and limitations for the FI_EP_RDM endpoint.
 .SS Endpoint capabilities and features
@@ -103,7 +102,7 @@ See ibv_fork_init(3) for additional details.
 .SS Memory Registration Cache
 .PP
 The verbs provider uses the common memory registration cache
-functionality that\[aq]s part of libfabric utility code.
+functionality that\[cq]s part of libfabric utility code.
 This speeds up memory registration calls from applications by caching
 registrations of frequently used memory regions.
 Please refer to fi_mr(3): Memory Registration Cache section for more
@@ -154,88 +153,64 @@ to be re\-mapped when the process is forked (MADV_DONTFORK).
 .PP
 The XRC transport is intended to be used when layered with the RXM
 provider and requires the use of shared receive contexts.
-See \f[C]fi_rxm\f[](7).
+See \f[C]fi_rxm\f[R](7).
+To enable XRC, the following environment variables must usually be set:
+FI_VERBS_PREFER_XRC and FI_OFI_RXM_USE_SRX.
 .SH RUNTIME PARAMETERS
 .PP
 The verbs provider checks for the following environment variables.
 .SS Common variables:
 .TP
-.B \f[I]FI_VERBS_TX_SIZE\f[]
+.B \f[I]FI_VERBS_TX_SIZE\f[R]
 Default maximum tx context size (default: 384)
-.RS
-.RE
 .TP
-.B \f[I]FI_VERBS_RX_SIZE\f[]
+.B \f[I]FI_VERBS_RX_SIZE\f[R]
 Default maximum rx context size (default: 384)
-.RS
-.RE
 .TP
-.B \f[I]FI_VERBS_TX_IOV_LIMIT\f[]
+.B \f[I]FI_VERBS_TX_IOV_LIMIT\f[R]
 Default maximum tx iov_limit (default: 4).
 Note: RDM (internal \- deprecated) EP type supports only 1
-.RS
-.RE
 .TP
-.B \f[I]FI_VERBS_RX_IOV_LIMIT\f[]
+.B \f[I]FI_VERBS_RX_IOV_LIMIT\f[R]
 Default maximum rx iov_limit (default: 4).
 Note: RDM (internal \- deprecated) EP type supports only 1
-.RS
-.RE
 .TP
-.B \f[I]FI_VERBS_INLINE_SIZE\f[]
+.B \f[I]FI_VERBS_INLINE_SIZE\f[R]
 Default maximum inline size.
 Actual inject size returned in fi_info may be greater (default: 64)
-.RS
-.RE
 .TP
-.B \f[I]FI_VERBS_MIN_RNR_TIMER\f[]
+.B \f[I]FI_VERBS_MIN_RNR_TIMER\f[R]
 Set min_rnr_timer QP attribute (0 \- 31) (default: 12)
-.RS
-.RE
 .TP
-.B \f[I]FI_VERBS_CQREAD_BUNCH_SIZE\f[]
+.B \f[I]FI_VERBS_CQREAD_BUNCH_SIZE\f[R]
 The number of entries to be read from the verbs completion queue at a
 time (default: 8).
-.RS
-.RE
 .TP
-.B \f[I]FI_VERBS_PREFER_XRC\f[]
+.B \f[I]FI_VERBS_PREFER_XRC\f[R]
 Prioritize XRC transport fi_info before RC transport fi_info (default:
 0, RC fi_info will be before XRC fi_info)
-.RS
-.RE
 .TP
-.B \f[I]FI_VERBS_GID_IDX\f[]
+.B \f[I]FI_VERBS_GID_IDX\f[R]
 The GID index to use (default: 0)
-.RS
-.RE
 .TP
-.B \f[I]FI_VERBS_DEVICE_NAME\f[]
+.B \f[I]FI_VERBS_DEVICE_NAME\f[R]
 Specify a specific verbs device to use by name
-.RS
-.RE
 .SS Variables specific to MSG endpoints
 .TP
-.B \f[I]FI_VERBS_IFACE\f[]
+.B \f[I]FI_VERBS_IFACE\f[R]
 The prefix or the full name of the network interface associated with the
 verbs device (default: ib)
-.RS
-.RE
 .SS Variables specific to DGRAM endpoints
 .TP
-.B \f[I]FI_VERBS_DGRAM_USE_NAME_SERVER\f[]
+.B \f[I]FI_VERBS_DGRAM_USE_NAME_SERVER\f[R]
 The option that enables/disables OFI Name Server thread.
 The NS thread is used to resolve IP\-addresses to provider specific
-addresses (default: 1, if "OMPI_COMM_WORLD_RANK" and "PMI_RANK"
-environment variables aren\[aq]t defined)
-.RS
-.RE
+addresses (default: 1, if \[lq]OMPI_COMM_WORLD_RANK\[rq] and
+\[lq]PMI_RANK\[rq] environment variables aren\[cq]t defined)
 .TP
-.B \f[I]FI_VERBS_NAME_SERVER_PORT\f[]
+.B \f[I]FI_VERBS_NAME_SERVER_PORT\f[R]
 The port on which Name Server thread listens incoming connections and
 requests (default: 5678)
-.RS
-.RE
 .SS Environment variables notes
 .PP
 The fi_info utility would give the up\-to\-date information on
@@ -247,18 +222,20 @@ Set FI_LOG_LEVEL=info or FI_LOG_LEVEL=debug (if debug build of libfabric
 is available) and check if there any errors because of incorrect input
 parameters to fi_getinfo.
 .IP \[bu] 2
-Check if "fi_info \-p verbs" is successful.
+Check if \[lq]fi_info \-p verbs\[rq] is successful.
 If that fails the following checklist may help in ensuring that the RDMA
 verbs stack is functional:
+.RS 2
 .IP \[bu] 2
 If libfabric was compiled, check if verbs provider was built.
 Building verbs provider would be skipped if its dependencies (listed in
-requirements) aren\[aq]t available on the system.
+requirements) aren\[cq]t available on the system.
 .IP \[bu] 2
 Verify verbs device is functional:
 .RS 2
 .IP \[bu] 2
 Does ibv_rc_pingpong (available in libibverbs) test work?
+.RS 2
 .IP \[bu] 2
 Does ibv_devinfo (available in libibverbs) show the device with
 PORT_ACTIVE status?
@@ -270,30 +247,32 @@ nodes in the cluster.
 Is the cable connected?
 .RE
 .RE
+.RE
 .IP \[bu] 2
 Verify librdmacm is functional:
 .RS 2
 .IP \[bu] 2
 Does ucmatose test (available in librdmacm) work?
 .IP \[bu] 2
-Is the IPoIB interface (e.g.
-ib0) up and configured with a valid IP address?
+Is the IPoIB interface (e.g.\ ib0) up and configured with a valid IP
+address?
+.RE
 .RE
 .SS Other issues
 .PP
 When running an app over verbs provider with Valgrind, there may be
-reports of memory leak in functions from dependent libraries (e.g.
-libibverbs, librdmacm).
+reports of memory leak in functions from dependent libraries
+(e.g.\ libibverbs, librdmacm).
 These leaks are safe to ignore.
 .PP
 The provider protects CQ overruns that may happen because more TX
 operations were posted to endpoints than CQ size.
-On the receive side, it isn\[aq]t expected to overrun the CQ.
+On the receive side, it isn\[cq]t expected to overrun the CQ.
 In case it happens the application developer should take care not to
 post excess receives without draining the CQ.
 CQ overruns can make the MSG endpoints unusable.
 .SH SEE ALSO
 .PP
-\f[C]fabric\f[](7), \f[C]fi_provider\f[](7),
+\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7),
 .SH AUTHORS
 OpenFabrics.
diff --git a/deps/libfabric/pingpong.vcxproj b/deps/libfabric/pingpong.vcxproj
index 2e65c22b83f078d20e5b80d6bf0a5273218b9340..8b6846d7c3a43c6b4671ac946c88b6db0451674d 100755
--- a/deps/libfabric/pingpong.vcxproj
+++ b/deps/libfabric/pingpong.vcxproj
@@ -13,6 +13,10 @@
       <Configuration>Debug-v140</Configuration>
       <Platform>x64</Platform>
     </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug-v142|x64">
+      <Configuration>Debug-v142</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
     <ProjectConfiguration Include="Release-ICC|x64">
       <Configuration>Release-ICC</Configuration>
       <Platform>x64</Platform>
@@ -25,6 +29,10 @@
       <Configuration>Release-v140</Configuration>
       <Platform>x64</Platform>
     </ProjectConfiguration>
+    <ProjectConfiguration Include="Release-v142|x64">
+      <Configuration>Release-v142</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
   </ItemGroup>
   <PropertyGroup Label="Globals">
     <ProjectGuid>{DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}</ProjectGuid>
@@ -45,6 +53,12 @@
     <PlatformToolset>v141</PlatformToolset>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v142</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
@@ -65,6 +79,13 @@
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v142</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
@@ -83,6 +104,9 @@
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
@@ -92,6 +116,9 @@
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
@@ -106,6 +133,11 @@
     <IntDir>$(Platform)\$(Configuration)\pingpong\</IntDir>
     <TargetName>fi_$(ProjectName)</TargetName>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <IntDir>$(Platform)\$(Configuration)\pingpong\</IntDir>
+    <TargetName>fi_$(ProjectName)</TargetName>
+  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">
     <LinkIncremental>true</LinkIncremental>
     <IntDir>$(Platform)\$(Configuration)\pingpong\</IntDir>
@@ -121,6 +153,11 @@
     <IntDir>$(Platform)\$(Configuration)\pingpong\</IntDir>
     <TargetName>fi_$(ProjectName)</TargetName>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <IntDir>$(Platform)\$(Configuration)\pingpong\</IntDir>
+    <TargetName>fi_$(ProjectName)</TargetName>
+  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">
     <LinkIncremental>true</LinkIncremental>
     <IntDir>$(Platform)\$(Configuration)\pingpong\</IntDir>
@@ -158,6 +195,22 @@
       <AdditionalDependencies>Synchronization.lib;Ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_WINSOCKAPI_=;_CRT_SECURE_NO_WARNINGS;_WINDOWS;_USRDLL;LIBFABRIC_EXPORTS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(SolutionDir)util\windows\getopt;$(SolutionDir)include;$(SolutionDir)include\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>Synchronization.lib;Ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">
     <ClCompile>
       <PrecompiledHeader>
@@ -214,6 +267,26 @@
       <AdditionalDependencies>Synchronization.lib;Ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>false</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;_WINSOCKAPI_=;_CRT_SECURE_NO_WARNINGS;_WINDOWS;_USRDLL;LIBFABRIC_EXPORTS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(SolutionDir)util\windows\getopt;$(SolutionDir)include;$(SolutionDir)include\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>Synchronization.lib;Ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">
     <ClCompile>
       <WarningLevel>Level3</WarningLevel>
@@ -238,9 +311,11 @@
     <ClCompile Include="util\pingpong.c">
       <C99Support Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">true</C99Support>
       <C99Support Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">true</C99Support>
+      <C99Support Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">true</C99Support>
       <C99Support Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">true</C99Support>
       <C99Support Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">true</C99Support>
       <C99Support Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">true</C99Support>
+      <C99Support Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">true</C99Support>
       <C99Support Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">true</C99Support>
     </ClCompile>
     <ClCompile Include="util\windows\getopt\getopt.cpp" />
@@ -256,4 +331,4 @@
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/deps/libfabric/prov/bgq/configure.m4 b/deps/libfabric/prov/bgq/configure.m4
index 4a0205983a0c5dcc10364d20f752942b0dab4cc9..5f88db683f54cc52ed422d597a2ae73af5ede032 100644
--- a/deps/libfabric/prov/bgq/configure.m4
+++ b/deps/libfabric/prov/bgq/configure.m4
@@ -72,7 +72,7 @@ AC_DEFUN([FI_BGQ_CONFIGURE],[
 				],
 				[bgq_external_source=$with_bgq_src])
 
-			AS_IF([test x"$bgq_external_source" == x"auto"], [
+			AS_IF([test x"$bgq_external_source" = x"auto"], [
 				for bgq_dir in `ls -r /bgsys/source`; do
 					AC_MSG_CHECKING([for bgq opensource distribution])
 					AS_IF([test -f /bgsys/source/$bgq_dir/spi/src/kernel/cnk/memory_impl.c],
@@ -80,7 +80,7 @@ AC_DEFUN([FI_BGQ_CONFIGURE],[
 						AC_MSG_RESULT([$bgq_external_source])
 						break)
 				done
-				AS_IF([test x"$bgq_external_source" == x"auto"], [
+				AS_IF([test x"$bgq_external_source" = x"auto"], [
 					bgq_happy=0
 					AC_MSG_RESULT([no])])
 			])
diff --git a/deps/libfabric/prov/efa/Makefile.include b/deps/libfabric/prov/efa/Makefile.include
index a4d027f0e77cb4ad98e4fdc13f7886645b28803a..da3fa73aa0d1acbb0ba001b9e9c9f19e664c9fb4 100644
--- a/deps/libfabric/prov/efa/Makefile.include
+++ b/deps/libfabric/prov/efa/Makefile.include
@@ -43,7 +43,6 @@ _efa_files = \
 	prov/efa/src/efa_rma.c \
 	prov/efa/src/rxr/rxr_attr.c	\
 	prov/efa/src/rxr/rxr_init.c	\
-	prov/efa/src/rxr/rxr_fabric.c	\
 	prov/efa/src/rxr/rxr_domain.c	\
 	prov/efa/src/rxr/rxr_cq.c	\
 	prov/efa/src/rxr/rxr_ep.c	\
@@ -52,6 +51,7 @@ _efa_files = \
 	prov/efa/src/rxr/rxr_msg.c	\
 	prov/efa/src/rxr/rxr_pkt_entry.c \
 	prov/efa/src/rxr/rxr_pkt_type_req.c \
+	prov/efa/src/rxr/rxr_pkt_type_base.c \
 	prov/efa/src/rxr/rxr_pkt_type_data.c \
 	prov/efa/src/rxr/rxr_pkt_type_misc.c \
 	prov/efa/src/rxr/rxr_pkt_cmd.c \
@@ -67,9 +67,11 @@ _efa_headers = \
 	prov/efa/src/rxr/rxr_pkt_entry.h \
 	prov/efa/src/rxr/rxr_pkt_type.h \
 	prov/efa/src/rxr/rxr_pkt_type_req.h \
+	prov/efa/src/rxr/rxr_pkt_type_base.h \
 	prov/efa/src/rxr/rxr_pkt_cmd.h \
 	prov/efa/src/rxr/rxr_read.h \
-	prov/efa/src/rxr/rxr_atomic.h
+	prov/efa/src/rxr/rxr_atomic.h \
+	prov/efa/src/rxr/rdm_proto_v4.h
 
 efa_CPPFLAGS += \
 	-I$(top_srcdir)/prov/efa/src/ \
diff --git a/deps/libfabric/prov/efa/configure.m4 b/deps/libfabric/prov/efa/configure.m4
index a61e6f08f850fbf2a2330c9f21beef3020ea5235..9bc7bf495b1f879b703c8d15019598e051aabc13 100644
--- a/deps/libfabric/prov/efa/configure.m4
+++ b/deps/libfabric/prov/efa/configure.m4
@@ -29,7 +29,7 @@ AC_DEFUN([FI_EFA_CONFIGURE],[
 		],
 		[efa_h_enable_poisoning=$enableval],
 		[efa_h_enable_poisoning=no])
-	AS_IF([test x"$efa_h_enable_poisoning" == x"yes"],
+	AS_IF([test x"$efa_h_enable_poisoning" = x"yes"],
 		[AC_DEFINE([ENABLE_EFA_POISONING], [1],
 			[EFA memory poisoning support for debugging])],
 		[])
@@ -75,8 +75,36 @@ AC_DEFUN([FI_EFA_CONFIGURE],[
 			      [],
 			      [[#include <infiniband/efadv.h>]])
 	      ])
+
+	AS_IF([test x"$enable_efa" != x"no"],
+	      [AC_CHECK_DECL(EFADV_DEVICE_ATTR_CAPS_RNR_RETRY,
+			    [AC_DEFINE([HAVE_CAPS_RNR_RETRY], [1], [EFADV_DEVICE_ATTR_CAPS_RNR_RETRY is defined])],
+			    [],
+			    [[#include <infiniband/efadv.h>]])
+	      ])
 	CPPFLAGS=$save_CPPFLAGS
 
+	dnl Check for ibv_is_fork_initialized() in libibverbs
+	have_ibv_is_fork_initialized=0
+	AS_IF([test $efa_happy -eq 1],
+		[AC_CHECK_DECL([ibv_is_fork_initialized],
+			[have_ibv_is_fork_initialized=1],
+			[],
+			[[#include <infiniband/verbs.h>]])
+		])
+
+	AC_DEFINE_UNQUOTED([HAVE_IBV_IS_FORK_INITIALIZED],
+		[$have_ibv_is_fork_initialized],
+		[Define to 1 if libibverbs has ibv_is_fork_initialized])
+
+	AS_IF([test "$enable_efa" = "no"], [efa_happy=0])
+
+	AS_IF([test $ac_cv_sizeof_void_p -eq 4],
+		[
+			efa_happy=0
+			AC_MSG_WARN([The EFA provider is not supported on 32-bit systems.])
+		])
+
 	AS_IF([test $efa_happy -eq 1 ], [$1], [$2])
 
 	efa_CPPFLAGS="$efa_ibverbs_CPPFLAGS $efadv_CPPFLAGS"
diff --git a/deps/libfabric/prov/efa/docs/atomic_fetch_compare.drawio b/deps/libfabric/prov/efa/docs/atomic_fetch_compare.drawio
new file mode 100644
index 0000000000000000000000000000000000000000..e6592ff66a773ce602a927d5a6d4874fb9bec3d1
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/atomic_fetch_compare.drawio
@@ -0,0 +1 @@
+<mxfile host="drawio.corp.amazon.com" modified="2021-07-25T17:06:42.692Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0" etag="w2Sv1vzeACmmFzqIVc2Y" version="12.4.8" type="device"><diagram id="y0qt14K1OZjQ2kAYbhKE" name="Page-1">3Zhfd5owGMY/jWdX2xGpgpdorb2Ytkfbs+7KE+EVchYIC1Fhn36JCULA9qw7tnW7Mu+TkD+/502IdOxxnE8ZSqMZDYB0et0g79jXnV7P6lqu+JFKoRTHcpQQMhzoRpWwxL+gfFKrWxxAZjTklBKOU1P0aZKAzw0NMUb3ZrMNJeaoKQqhJSx9RNrqNxzwSKluv1vpt4DDqBzZ6uqaGJWNtZBFKKD7mmRPOvaYUcpVKc7HQCS8kot67uaZ2uPEGCT8Tx6IZsPocT1y89XnJ/zorvNN9Pi5p3rZIbLVC/bSlGAfcUwTiRURkolfgtcbtMbM/ySjDXA/EiP5NJbjS9ScxtjXC+VFSY/RbRKAnEC3Y4/2EeawTJEva/ciX4QW8ZiIyBLF9oL0GnfAOOQ1SS9wCjQGzgrRRNeWrAsz3FfOWaUW1VwbaA3pZAmPHVc8RUEjfQVeu4V3AVlKBRN2AJkyGjLIZBGSECfw8QQvDeGghbDFCJLAk1tdRD5BWSYS0cASoCw6IJSBQMOKJ8nzS78Mv2u8h+A6N6JCR8+y5YiFwF9YwVC1g8A4adoO1BD3TxAuNQZE7M6deT6dwq5HuKdYzPhosDMU66477Dacy+iW+aCfqp8njY7cRqL0G/0oLq1+DklwXPXf54Xzz+dF+Yq7kMQ4X2a0U2P4rqnhnjk1nrX4Qpxr4La6g/P4Zg0G7+rbsOWbeRkRY1d3EYbVXUTeQQioFh/97myk/dVHvzvLE6ZGdE4vjNnlQbtqQRvfze69xWS1ePBElzeTh/GtKjfgiTVzk1DGGf0BY0ooE0pCxQ3PHm0wIQ0JERwm8jASJEHoI0lQZD7xdEWMg0AOc9IS0zRGudoxYgrOeVyyBo2TofwXV3Pp6oRJvTOYNJ+u5h5fFc7X6c/Ntrd7mi9vTlys/+8jvnU0n+vWZr3Zre2kb+295T3czRbL+8vfS3b/xeveKzZTv+Gl82abSYTVBwblYfWZxp78Bg==</diagram></mxfile>
\ No newline at end of file
diff --git a/deps/libfabric/prov/efa/docs/atomic_fetch_compare.png b/deps/libfabric/prov/efa/docs/atomic_fetch_compare.png
new file mode 100644
index 0000000000000000000000000000000000000000..4ea72243d2e45a6fbf080adafac9cffc37e493d2
Binary files /dev/null and b/deps/libfabric/prov/efa/docs/atomic_fetch_compare.png differ
diff --git a/deps/libfabric/prov/efa/docs/atomic_write.drawio b/deps/libfabric/prov/efa/docs/atomic_write.drawio
new file mode 100644
index 0000000000000000000000000000000000000000..a34ae324cce029600ac8d0d878dafa4fd350e997
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/atomic_write.drawio
@@ -0,0 +1 @@
+<mxfile host="drawio.corp.amazon.com" modified="2021-07-25T17:00:30.288Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0" etag="9_luOnClqkntQMJL5NXL" version="12.4.8" type="device"><diagram id="ukeF6noGK0I00wAWyjMB" name="Page-1">5Zddk5owFIZ/jdOr7YCsiJfq2m0vttPqOru9ciIcIdNAmBAV+uubSCKEaKfbcVZneqOcN1+c5z2E0POmafnIUJ480QhIr+9EZc976PX7ruMG4k8qVa0M3WEtxAxHqlMjLPAv0COVusURFEZHTinhODfFkGYZhNzQEGN0b3bbUGKumqMYLGERImKrLzjiSa0GA6fRPwOOE72y66iWFOnOSigSFNF9S/JmPW/KKOX1VVpOgUh4mks97tOZ1uONMcj43wxInkbJcj0JytXdK14G63KTLO/69Sw7RLYq4XGeExwijmkmsaLDHASvN2jNcPihkBDxCnGa4lAlxitNi9FtFoFc0Ol5k32COSxyFMrWvagPoSU8JSJyxaWdgMppB4xD2ZJUQo9AU+CsEl1Uq2ZbmeG+ccrVWtJyyVcaUsURHydu+IkLhfANOD0L5xyKnAom7EAuZzRmUMhLyGKcwfUJ3hpC30JoMYIsGstHW0QhQUUhCtHAEqEiOSCUgUDDqlfJ8+NAhz8U3kPwUBpRpaKzbDliMfA/ZDCq+0Fk7Cy2Ay3EgxOEtcaAiKdxZ+5Hp7CrFb5RLO74aPBwJPJuOxx0nCvoloWgRrX3j85EQadQBp15ai7WPIciOGb973UxvHBdnPX3Rny7nHG2c6N3dS74z5zr4HYd/zK+ub7/rr6NLN/Ms4HcBbtHg5CmOYG6w7XfbJ2q96/9ZtNH2hbQr1TE0+/iZ5bpJW7qNHB9aPcWtJf5l+fZav48tnCJLLnJpOCM/oQpJZQJJaPixOVNNpiQjoQIjjO5+wh2IPSJZCZKnYxVQ4qjSC5z0gTTJkZ5/YiIWxhexhfX72wF+iuq5cv9CVv6b7dFhM03Sb2VNF923uw3</diagram></mxfile>
\ No newline at end of file
diff --git a/deps/libfabric/prov/efa/docs/atomic_write.png b/deps/libfabric/prov/efa/docs/atomic_write.png
new file mode 100644
index 0000000000000000000000000000000000000000..5148840fe93c62ebc496e9a92825fafcfe8e5edc
Binary files /dev/null and b/deps/libfabric/prov/efa/docs/atomic_write.png differ
diff --git a/deps/libfabric/prov/efa/docs/building.md b/deps/libfabric/prov/efa/docs/building.md
new file mode 100644
index 0000000000000000000000000000000000000000..1eb0991fdaa1dfa9e57129ec9eb7723513527f4a
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/building.md
@@ -0,0 +1,55 @@
+## Building the EFA Libfabric Provider
+
+This document describes how to build the EFA Libfabric provider once you've
+followed the prerequisite steps to install the required software; see the
+overview doc if you are unsure what's needed.
+
+An example of building and installing Libfabric and verifying that the EFA
+device is available via libfabric:
+```
+$ ./autogen.sh
+$ ./configure --enable-efa=<path to rdma-core install> --prefix=$PWD/install
+$ make -j install
+$ ./install/bin/fi_info -p efa
+provider: efa
+    fabric: EFA-fe80::df:57ff:fe1a:beb3
+    domain: efa_0-rdm
+    version: 112.0
+    type: FI_EP_RDM
+    protocol: FI_PROTO_EFA
+provider: efa
+    fabric: EFA-fe80::df:57ff:fe1a:beb3
+    domain: efa_0-dgrm
+    version: 112.0
+    type: FI_EP_DGRAM
+    protocol: FI_PROTO_EFA
+```
+
+Configure flags that may be useful in the context of the EFA provider:
+
+* `--enable-debug`: will turn on `FI_LOG_LEVEL=debug`, add `-g` among other
+flags to CFLAGS (see configure.ac for the full list), and compile in some extra
+data structures that may be helpful for debugging. Note that debug will likely
+impact performance. See `ENABLE_DEBUG` in the code.
+* `--enable-efa`: allows you to specify the rdma-core install path, which is
+needed if rdma-core is not in the default paths. Also allows you to compile the
+provider as a shared library.
+* `--enable-efa-mem-poisoning`: Write a poison value into memory structures after
+they are freed. This has a performance overhead like debug. See
+`ENABLE_EFA_POISONING` in the code.
+* `--with-cuda`: Build Libfabric with cuda support (if cuda libraries are not in
+the default path). The EFA provider supports sends/RDMA reads with GPUDirect
+via FI_HMEM when Libfabric has CUDA support enabled.
+* `--with-gdrcopy`: Build Libfabric with the NVIDIA GDRCopy library enabled. If
+not enabled, the EFA provider has to use the EFA device (via a loopback read)
+to copy receives that land in the bounce buffers (host memory) but match
+GPU memory.
+
+CFLAGS that might be useful:
+
+* `EFA_PERF_ENABLED`: enable the perf hooks to determine cycle/instruction count
+for functions in the send/receive/completion paths. See fi_hook(7) and the
+Linux perf documentation for more information.
+* `ENABLE_RXR_PKT_DUMP`: turn on packet dump prints (very verbose). These
+functions haven't been kept up to date with recent protocol changes, so this
+might not be useful until fixed.
diff --git a/deps/libfabric/prov/efa/docs/efa_rdm_protocol_v4.md b/deps/libfabric/prov/efa/docs/efa_rdm_protocol_v4.md
new file mode 100644
index 0000000000000000000000000000000000000000..359cdc90e1375ff1e888abd13704ebabebd5d006
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/efa_rdm_protocol_v4.md
@@ -0,0 +1,1420 @@
+# EFA RDM Communication Protocol version 4
+
+## 0. Overview
+
+This document describes version 4 of the EFA RDM communication protocol (protocol v4),
+which has been used by the libfabric EFA provider's RDM endpoint since the libfabric
+1.10.0 release.
+
+The purpose of this document is to provide a definition of the protocol that is
+not tied to a specific implementation. It is useful to distinguish the protocol
+from its implementation, because a protocol change can cause backward compatibility
+issues and therefore needs to be handled with extra care.
+
+It is organized as follows:
+
+Chapter 1 "Basics" introduces some basic facts/concepts of EFA RDM protocol, including:
+
+ * Section 1.1 Why is EFA RDM protocol needed?
+
+ * Section 1.2 A list of features/sub-protocols.
+
+ * Section 1.3 packet, packet base header and a list of packet types.
+
+Chapter 2 "Handshake sub-protocol" describes the handshake sub-protocol, including:
+
+ * Section 2.1 "Handshake sub-protocol and backward compatibility" describes how to introduce
+   backward compatible changes to protocol v4, and how handshake sub-protocol is used to
+   facilitate the process.
+
+ * Section 2.2 "Handshake sub-protocol and raw address exchange" describes how handshake sub-protocol
+   impacts the behavior of including raw address in packet header.
+
+ * Section 2.3 "Implementation tips" include tips when implementing handshake sub-protocol.
+
+Chapter 3 "baseline features" describes the baseline features of protocol v4.
+
+ *  Section 3.1 "REQ packets" introduces the binary format of REQ packets, which all baseline features
+    use to initialize the communication.
+
+ *  Section 3.2 "baseline features for two-sided communications" describes 3 two-sided communication baseline features:
+
+    - eager message transfer,
+    - medium message transfer and
+    - long-cts message transfer.
+
+ *  Section 3.3 "baseline features for one-sided communications" describes 7 one-sided communication baseline features:
+
+    - emulated eager write,
+    - emulated long-cts write,
+    - emulated short read,
+    - emulated long-cts read,
+    - emulated write atomic,
+    - emulated fetch atomic and
+    - emulated compare atomic.
+
+Chapter 4 "extra features/requests" describes the extra features/requests defined in version 4.
+
+ *  Section 4.1 describes the extra feature: RDMA read based message transfer.
+
+ *  Section 4.2 describes the extra feature: delivery complete.
+
+ *  Section 4.3 describes the extra request: constant header length.
+
+ *  Section 4.4 describes the extra request: connid (connection ID) header.
+
+Chapter 5 "what's not covered?" describes the contents that are intentionally left out of
+this document because they are considered "implementation details".
+
+## 1. Basics
+
+The EFA RDM communication protocol allows two libfabric endpoints to use the EFA
+device to communicate with each other.
+
+### 1.1 Why is EFA RDM communication protocol needed?
+
+The reason we need an EFA RDM communication protocol is to support features that
+the EFA device does not directly support. Currently, the EFA device supports the
+following two types of communications:
+
+ 1. send/receive a message up to EFA device's Maximum Transmission Unit (MTU) size.
+ 2. RDMA read of a memory buffer up to 1GB (if both endpoints' software stacks support RDMA read).
+
+Moreover, for send/receive, the EFA device does not guarantee ordered delivery; e.g., when a
+sender sends multiple messages to a receiver, the receiver may receive the packets in a
+different order than they were sent.
+
+Protocol v4 defines how two endpoints can use the EFA device's capabilities to achieve:
+
+ * send/receive up to 2^64-1 bytes,
+ * read up to 2^64-1 bytes,
+ * write up to 2^64-1 bytes,
+ * atomics up to MTU size.
+
+Moreover, protocol v4 provides mechanisms to meet extra application requirements that the EFA
+device does not support directly, such as ordered send/receive (`FI_ORDER_SAS`) and delivery
+complete (DC).
+
+### 1.2 a list of sub-protocols
+
+To meet applications' specific needs, protocol v4 defines a set of sub-protocols,
+as listed in table 1.1:
+
+Table: 1.1 a list of sub-protocols
+
+| Sub Protocol Name             | Used For  | Definition in |
+|-|-|-|
+| Eager message                 | Two sided | Section 3.2   |
+| Medium message                | Two sided | Section 3.2   |
+| Long-CTS message              | Two sided | Section 3.2   |
+| Long READ message             | Two sided | Section 4.1   |
+| DC Eager message              | Two sided | Section 4.2   |
+| DC Medium message             | Two sided | Section 4.2   |
+| DC Long-CTS message           | Two sided | Section 4.2   |
+| Emulated eager write          | One sided | Section 3.3   |
+| Emulated long-CTS write       | One sided | Section 3.3   |
+| Emulated long-read write      | One sided | Section 4.1   |
+| Emulated DC eager write       | One sided | Section 4.2   |
+| Emulated DC long-CTS write    | One sided | Section 4.2   |
+| Emulated short read           | One sided | Section 3.3   |
+| Emulated long-CTS read        | One sided | Section 3.3   |
+| Direct read                   | One sided | Section 4.1   |
+| Emulated atomic               | One sided | Section 3.3   |
+| Emulated fetch atomic         | One sided | Section 3.3   |
+| Emulated compare atomic       | One sided | Section 3.3   |
+| Handshake                     | Backward compatibility | Chapter 2 |
+
+### 1.3 packet, packet base header and a list of packets
+
+All the sub-protocols (except the Direct Read protocol) use packet(s) to exchange
+information between two endpoints.
+
+A packet is a message that does not exceed the MTU size, and is exchanged between
+two endpoints using the EFA device's send/receive capability.
+
+Protocol v4 defines a set of packet types. They can be split into two categories:
+REQ packet types and non-REQ packet types.
+
+A REQ packet is the first packet the sender/requester sends to the receiver/responder
+in the workflow of a sub-protocol. Each sub-protocol is unique; thus each
+sub-protocol defines its own REQ packet type.
+
+A non-REQ packet is used by some sub-protocols to transfer additional
+information that is not covered in the REQ packet.
+
+To distinguish the various types of packets sent/received between two endpoints,
+each packet type is assigned a unique packet type ID. Table 1.2 lists
+all the packet types in protocol v4 and the sub-protocol(s) that use them:
+
+Table: 1.2 a list of packet type IDs
+
+| Packet Type ID  | Nick Name         | Full Name                 | Category | Used by                       |
+|-|-|-|-|-|
+| 1               | RTS               | Request To Send           | non-REQ  | Deprecated                    |
+| 2               | CONNACK           | CONNection ACKnowlegement | non-REQ  | Deprecated                    |
+| 3               | CTS               | Clear To Send             | non-REQ  | long-CTS message/read/write |
+| 4               | DATA              | Data                      | non-REQ  | long-CTS message/read/write |
+| 5               | READRSP           | READ ReSPonse             | non-REQ  | emulated short/long-read      |
+| 6               | reserved          | N/A                       | non-REQ  | reserved for internal use      |
+| 7               | EOR               | End Of Read               | non-REQ  | long-read message/write     |
+| 8               | ATOMRSP           | ATOMic ReSPonse           | non-REQ  | emulated write/fetch/compare atomic |
+| 9               | HANDSHAKE         | Handshake                 | non-REQ  | handshake                     |
+| 10              | RECEIPT           | Receipt                   | non-REQ  | delivery complete (DC)         |
+| 64              | EAGER_MSGRTM      | Eager non-tagged Request To Message       | REQ  | eager message |
+| 65              | EAGER_TAGRTM      | Eager tagged Request To Message           | REQ  | eager message |
+| 66              | MEDIUM_MSGRTM     | Medium non-tagged Request To Message      | REQ  | medium message |
+| 67              | MEDIUM_TAGRTM     | Medium tagged Request To Message          | REQ  | medium message |
+| 68              | LONGCTS_MSGRTM    | Long-CTS non-tagged Request To Message    | REQ  | long-CTS message |
+| 69              | LONGCTS_TAGRTM    | Long-CTS tagged Request To Message        | REQ  | long-CTS message |
+| 70              | EAGER_RTW         | Eager Request To Write                    | REQ  | emulated eager write |
+| 71              | LONGCTS_RTW       | Long-CTS Request To Write                 | REQ  | emulated long-CTS write |
+| 72              | SHORT_RTR         | Eager Request To Read                     | REQ  | emulated short read |
+| 73              | LONGCTS_RTR       | Long-CTS Request To Read                  | REQ  | emulated long-CTS read |
+| 74              | WRITE_RTA         | Write Request To Atomic                   | REQ  | emulated write atomic |
+| 75              | FETCH_RTA         | Fetch Request To Atomic                   | REQ  | emulated fetch atomic |
+| 76              | COMPARE_RTA       | Compare Request To Atomic                 | REQ  | emulated compare atomic |
+| 128             | LONGREAD_MSGRTM   | Long-read non-tagged Request To Message   | REQ  | Long-read message |
+| 129             | LONGREAD_TAGRTM   | Long-read tagged Request To Message       | REQ  | Long-read message |
+| 130             | LONGREAD_RTW      | Long-read Request To Write                | REQ  | Long-read message |
+| 131             | reserved          | N/A                                       | N/A  | N/A               |
+| 132             | reserved          | N/A                                       | N/A  | N/A               |
+| 133             | DC_EAGER_MSGRTM   | DC Eager non-tagged Request To Message    | REQ  | DC eager message |
+| 134             | DC_EAGER_TAGRTM   | DC Eager tagged Request To Message        | REQ  | DC eager message |
+| 135             | DC_MEDIUM_MSGRTM  | DC Medium non-tagged Request To Message   | REQ  | DC medium message |
+| 136             | DC_MEDIUM_TAGRTM  | DC Medium tagged Request To Message       | REQ  | DC medium message |
+| 137             | DC_LONGCTS_MSGRTM | DC long-CTS non-tagged Request To Message | REQ  | DC long-CTS message |
+| 138             | DC_LONGCTS_TAGRTM | DC long-CTS tagged Request To Message     | REQ  | DC long-CTS message |
+| 139             | DC_EAGER_RTW      | DC Eager Request To Write                 | REQ  | DC emulated eager write |
+| 140             | DC_LONGCTS_RTW    | DC long-CTS Request To Write              | REQ  | DC emulated long-CTS write |
+| 141             | DC_WRITE_RTA      | DC Write Request To Atomic                | REQ  | DC emulated write atomic |
+
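+For illustration, the packet type IDs in table 1.2 map naturally onto a C
+enumeration. The following is a sketch covering a few entries; the names are
+hypothetical, and the provider's actual identifiers may differ:
+
+```
+/* a few of the packet type IDs from table 1.2 (hypothetical names) */
+enum efa_rdm_pkt_type_example {
+    EFA_RDM_CTS_PKT          = 3,  /* Clear To Send */
+    EFA_RDM_DATA_PKT         = 4,  /* Data */
+    EFA_RDM_HANDSHAKE_PKT    = 9,  /* Handshake */
+    EFA_RDM_RECEIPT_PKT      = 10, /* Receipt, used by delivery complete */
+    EFA_RDM_EAGER_MSGRTM_PKT = 64, /* first REQ packet type ID */
+};
+```
+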
+The packet type ID is included in the 4-byte EFA RDM base header, with which every packet
+must start. The format of the EFA RDM base header is listed in table 1.3:
+
+Table: 1.3 format of EFA RDM base header
+
+| Name | Length (bytes) | type | C language type |
+|-|-|-|-|
+| `type`    | 1 | integer | `uint8_t` |
+| `version` | 1 | integer | `uint8_t` |
+| `flags`   | 2 | integer | `uint16_t` |
+
+In the table, `type` is the packet type ID.
+
+`version` is the EFA RDM protocol version, which is 4 for protocol v4.
+
+`flags` is a set of flags each packet type uses to customize its behavior. Typically, it is used
+to indicate the existence of optional header(s) in the packet header. Each packet type defines its own flags.
+
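+As an illustration, the base header in table 1.3 corresponds to a packed C
+struct along the following lines (a sketch for clarity; the provider's actual
+definition may differ):
+
+```
+#include <stdint.h>
+
+/* sketch of the 4-byte EFA RDM base header from table 1.3 */
+struct efa_rdm_base_hdr_example {
+    uint8_t  type;    /* packet type ID, see table 1.2 */
+    uint8_t  version; /* protocol version, 4 for protocol v4 */
+    uint16_t flags;   /* per-packet-type flags, see table 1.4 below */
+} __attribute__((packed));
+```
+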
+Protocol v4 defines the following universal flag, which every packet type should use:
+
+Table: 1.4 a list of universal flags
+
+| Bit ID | Value | Name | Description | Used by |
+|-|-|-|-|-|
+| 15     | 0x8000 | CONNID_HDR | This packet has "connid" in header | extra request "connid header" (section 4.4) |
+
+Note that the flag `CONNID_HDR` only indicates the presence of `connid` in the header;
+the exact location of `connid` differs for each packet type.
+
+Other than the universal flags, each packet type defines its own flags.
+
+The format of each packet type is introduced in the sections where the sub-protocols are introduced.
+
+### 1.4 raw address
+
+A raw address is the ID of an EFA RDM endpoint.
+
+To send a message to an EFA endpoint, one needs to know the endpoint's raw address, and calls
+`fi_av_insert` to insert the raw address into the local address vector. `fi_av_insert` returns
+a libfabric internal address, which is then used to send messages.
+(see [fi_av](https://ofiwg.github.io/libfabric/v1.1.1/man/fi_av.3.html) for more details)
+
+Interestingly, to receive a message from an EFA endpoint, one does not need to know the
+endpoint's raw address. See section 2.3 for more discussion on this topic.
+
+Each provider defines its own address format; the raw address of an EFA RDM endpoint uses
+the format in the following table 1.5.
+
+Table: 1.5 binary format of EFA RDM raw address
+
+| Name | Length (bytes) | type | C language type | Notes |
+|-|-|-|-|-|
+| `gid`  | 16 | array   | `uint8_t[16]` | ipv6 format |
+| `qpn`  |  2 | integer | `uint16_t`    | queue pair number |
+| `pad`  |  2 | integer | `uint16_t`    | pad to 4 bytes |
+| `connid` | 4 | integer | `uint32_t`   | connection ID |
+| `reserved` | 8 | integer | `uint64_t` | reserved for internal use |
+
+The field `connid` warrants extra explanation: it is a 4-byte random integer generated
+during endpoint initialization, which can be used to identify the endpoint. When protocol v4
+was initially introduced, the field `connid` was named `qkey`, which is an EFA device
+concept. It was later realized that this is in fact a connection ID, for which we happen
+to use the EFA device's Q-Key.
+
+Currently, the raw address of EFA is 32 bytes, but it can be expanded in the future without
+breaking backward compatibility.
+
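+For illustration, the raw address format in table 1.5 corresponds to a packed
+C struct along the following lines (a sketch; the provider's actual definition
+may differ):
+
+```
+#include <stdint.h>
+
+/* sketch of the 32-byte EFA RDM raw address from table 1.5 */
+struct efa_rdm_raw_addr_example {
+    uint8_t  gid[16];  /* ipv6-format GID */
+    uint16_t qpn;      /* queue pair number */
+    uint16_t pad;      /* pad to 4 bytes */
+    uint32_t connid;   /* connection ID, random per endpoint */
+    uint64_t reserved; /* reserved for internal use */
+} __attribute__((packed));
+```
+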
+## 2. Handshake sub-protocol
+
+The handshake sub-protocol serves two purposes in protocol v4.
+
+First, it is used to exchange the two endpoints' capability information, which makes it
+possible to introduce changes to protocol v4 without breaking backward compatibility
+(section 2.1).
+
+Second, it is used to adjust the behavior of including the EFA raw address in the REQ
+packet header (section 2.2).
+
+### 2.1 Handshake sub-protocol and backward compatibility
+
+The biggest problem when designing a communication protocol is how to maintain backward
+compatibility when introducing changes to the protocol. Imagine the following scenario:
+there are endpoints using protocol v4 in its current form. If a change is made to the
+protocol, how do we make sure that the existing endpoints are still able to communicate
+with endpoints that have adopted the change?
+
+To tackle this issue, protocol v4 first introduced the concepts of "feature" and "request".
+
+- A feature is a functionality that an endpoint can support. Typically, a feature
+is the support of a set of sub-protocols.
+
+- A request is an expectation an endpoint has of its peer. Typically, a request is
+for its peer to include some extra information in the packet header.
+
+Protocol v4 defines the following 10 features as baseline features:
+
+- Eager message (send/receive)
+- Medium message (send/receive)
+- Long-CTS message (send/receive)
+- Emulated eager write
+- Emulated long-CTS write
+- Emulated short read
+- Emulated long-CTS read
+- Emulated write atomic
+- Emulated fetch atomic
+- Emulated compare atomic
+
+The definitions of these baseline features are in chapter 3. Any endpoint that adopts
+protocol v4 must support these baseline features.
+
+Protocol v4 then allows changes to be introduced as an "extra feature" or "extra request".
+Each extra feature/request is assigned an ID when it is introduced to protocol v4.
+The IDs start from 0 and increase by 1 for each extra feature/request. Typically, a
+new extra feature/request is introduced with a libfabric minor release, and will NOT be
+backported.
+
+Currently there are 4 such extra features/requests, as listed in table 2.1:
+
+Table: 2.1 a list of extra features/requests
+
+| ID | Name              |  Type    | Introduced since | Described in |
+|-|-|-|-|-|
+| 0  | RDMA read based data transfer    | extra feature | libfabric 1.10.0 | Section 4.1 |
+| 1  | delivery complete                | extra feature | libfabric 1.12.0 | Section 4.2 |
+| 2  | keep packet header length constant | extra request | libfabric 1.13.0 | Section 4.3 |
+| 3  | sender connection id in packet header  | extra request | libfabric 1.14.0 | Section 4.4 |
+
+How does protocol v4 maintain backward compatibility when extra features/requests are introduced?
+
+First, protocol v4 states that an endpoint's support of an extra feature/request is optional,
+and therefore cannot be assumed.
+
+Second, protocol v4 defines the handshake sub-protocol for two endpoints to exchange their extra
+feature/request status. Its workflow is:
+
+1. If an endpoint has never communicated with a peer, it does not know the peer's
+   extra feature/request status. Therefore, it can only use the baseline features to
+   communicate with the peer, which means it will send REQ packets (section 3.1) to the peer
+   to initialize a communication.
+2. Upon receiving the 1st packet from a peer, an endpoint must send back a handshake
+   packet, which contains the endpoint's capability information.
+3. Upon receiving the handshake packet, an endpoint will know the peer's extra feature/request
+   status.
+
+Regarding extra features: if the peer supports the extra feature the endpoint wants to use,
+the endpoint can start using it. Otherwise, one of the following should happen:
+
+- a. the communication continues without using the extra feature/request, though
+   the performance may be sub-optimal. For example, if the peer does not support
+   the extra feature "RDMA read based data transfer", the endpoint can choose to
+   use baseline features to carry on the communication, though the performance will
+   be sub-optimal (section 4.1).
+
+- b. the requester of the communication aborts the communication and returns an error
+   to the application. For example, if the application requires delivery complete, but
+   the peer does not support it (this can happen when the endpoint is using libfabric 1.12,
+   but the peer is using libfabric 1.10), the requester needs to return an error to the
+   application (section 4.2)
+
+Regarding extra requests: if an endpoint can support an extra request the peer has imposed,
+it should comply with the request. Otherwise, it can ignore the request. The peer should then do
+one of the following:
+
+- a. carry on the communication without using the extra request.
+
+- b. abort the communication and return an error to the application. (see section 4.3
+     for an example)
+
+For example, suppose the sender is using libfabric 1.10, and the receiver is using libfabric 1.13.
+If the receiver is in zero copy receive mode, it will impose the extra request
+"constant header length", which the sender does not support. In this case, it is OK
+for the sender to ignore the request, and send packets with varying header lengths.
+It is the receiver's responsibility to react accordingly. (section 4.3)
+
+This concludes the workflow of the handshake sub-protocol.
+
+The binary format of a HANDSHAKE packet is listed in table 2.2.
+
+Table: 2.2 binary format of the HANDSHAKE packet
+
+| Name      | Length (bytes) | type | C language type | Notes |
+|-|-|-|-|-|
+| `type`    | 1 | integer | `uint8_t`  | part of base header |
+| `version` | 1 | integer | `uint8_t`  | part of base header |
+| `flags`   | 2 | integer | `uint16_t` | part of base header |
+| `nextra_p3`  | 4 | integer | `uint32_t` | number of `extra_info` flags plus 3 |
+| `extra_info`  | `8 * (nextra_p3 - 3)` | integer array | `uint64_t[]` | capability bits |
+| `connid`  | 4 | integer | `uint32_t` | sender connection ID; optional, present when the CONNID_HDR flag is set in `flags` |
+| `padding` | 4 | integer | `uint32_t` | padding for `connid`; optional, present when the CONNID_HDR flag is set in `flags` |
+
+The first 4 bytes (3 fields: `type`, `version`, `flags`) are the EFA RDM base header (section 1.3).
+
+Immediately after the base header, there are 2 fields, `nextra_p3` and `extra_info`.
+
+The field `extra_info` is an array of 8-byte integers, which stores the capabilities of an endpoint.
+
+As mentioned before, each extra feature/request was assigned an ID when it was introduced to protocol v4.
+When constructing the handshake packet, for each extra feature it supports (or extra request it wants to impose),
+an endpoint needs to toggle on a corresponding bit in the `extra_info` array. Specifically, if an endpoint supports
+the extra feature with ID `i` (or wants to impose the extra request with ID `i`), it needs to toggle on
+bit No. `i % 64` of member No. `i / 64` of the `extra_info` array.
+
+For example, if an endpoint supports the extra feature "RDMA read based data transfer" (ID 0), it needs to
+toggle on the No. 0 (the first) bit of `extra_info[0]`. (section 4.1)
+
+If an endpoint wants to impose the "constant header length" extra request, it needs to toggle on bit No. 2
+of `extra_info[0]`. (section 4.3)
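+
+The bit manipulation described above could look like the following minimal C sketch
+(illustrative only; the helper names are not the EFA provider's actual code):
+
+```c
+#include <stdint.h>
+
+/* Toggle on the bit for extra feature/request `id` (table 2.1). */
+static void extra_info_set(uint64_t *extra_info, unsigned id)
+{
+    extra_info[id / 64] |= UINT64_C(1) << (id % 64);
+}
+
+/* Test whether the peer advertised extra feature/request `id`. */
+static int extra_info_test(const uint64_t *extra_info, unsigned id)
+{
+    return (extra_info[id / 64] >> (id % 64)) & 1;
+}
+```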
+
+Note, the field `extra_info` was named `features` when protocol v4 was initially introduced; at that time we
+only planned for extra features. Later, we discovered that the handshake sub-protocol can also be used to pass
+additional request information, so we introduced the concept of "extra request" and renamed this field `extra_info`.
+
+`nextra_p3` is the number of `extra_info` flags of the endpoint plus 3. The "plus 3" is for historical reasons.
+When protocol v4 was initially introduced, this field was named `maxproto`. The original plan was that protocol
+v4 could only have 64 extra features/requests. If the number of extra features/requests ever exceeded 64, the next
+feature/request would be defined as a version 5 feature/request (version 6 if the number exceeded 128, and so
+forth). The field `maxproto` meant the maximum protocol version supported by an endpoint. The recipient of the
+HANDSHAKE packet used `maxproto` to calculate how many members `extra_info` has, which is `maxproto - 4 + 1`.
+(Starting from v4, each version has 1 flag, so if `maxproto` is 5, there are 2 members in `extra_info`: one
+for v4, the other for v5. Therefore the formula to compute the number of members is `maxproto - 4 + 1`.)
+
+However, it was later realized that the original plan was overly complicated and could cause a lot of confusion.
+For example, if an endpoint supports a feature defined in version 5, what version number should it put in
+the base header? Given that the sole purpose of the field `maxproto` is to provide a way to calculate
+how many members the `extra_info` array has, the protocol is much easier to understand if we re-interpret
+the field `maxproto` as `nextra_p3` and allow protocol v4 to have more than 64 extra features/requests.
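+
+In code, both interpretations yield the same member count; a small sketch
+(hypothetical helper name):
+
+```c
+#include <stddef.h>
+#include <stdint.h>
+
+/* Number of uint64_t members in the extra_info array of a received
+ * HANDSHAKE packet. The historical formula `maxproto - 4 + 1` is the
+ * same computation, since maxproto and nextra_p3 share the field. */
+static size_t extra_info_count(uint32_t nextra_p3)
+{
+    return nextra_p3 - 3;
+}
+```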
+
+After `extra_info`, there are two optional fields, `connid` and `padding`:
+
+`connid` is the sender's connection ID (4 bytes); `padding` is a 4-byte space that makes the packet align
+to an 8-byte boundary.
+
+These two fields were introduced with the extra request "connid in header". They are optional,
+therefore an implementation is not required to set them. (see section 4.4 for more details) If an implementation
+does set the `connid`, it needs to toggle on the CONNID_HDR flag in `flags` (table 1.4).
+
+### 2.2 Handshake sub-protocol and raw address exchange
+
+Another functionality of the handshake sub-protocol is to adjust the behavior of including the raw address in the packet header.
+
+Currently, if an endpoint is communicating with a peer for the first time, it will include its raw address
+in the REQ packets it sends.
+
+After the endpoint has received the HANDSHAKE packet from the peer, it will stop including its raw address
+in the header (see section 4.3 for an exception).
+
+This behavior compensates for a limitation of the EFA device: it cannot report
+the address of a packet from an unknown sender.
+
+The EFA device keeps an address book, which contains a list of raw addresses of its peers. Each address is assigned
+an address handle number (AHN). When the EFA device receives a message, it reports the AHN of the sender's address.
+
+However, if the address of a received message is not in the address book, there is no AHN assigned to
+the address. In this case, the EFA device will not be able to report an AHN.
+
+For the communication to proceed, an endpoint needs to know the address of any received packet, because
+it needs to send packets back.
+
+Therefore, if an endpoint is communicating with a peer for the 1st time, it has to include its raw
+address in the header of the REQ packets it sends to the peer. (see section 3.1 for the details of REQ packets)
+
+Upon receiving the packet, if the peer does not have the endpoint's address in its address book,
+the peer can get the endpoint's raw address from the REQ packet header, then insert the raw address into its address book.
+
+This insertion only needs to happen once; for subsequent packets from the endpoint, the EFA device will be able
+to report the AHN. Therefore, it is desirable to have a mechanism for an endpoint to stop including the raw address
+in the packet header, to reduce the packet header length.
+
+As it turns out, the handshake sub-protocol is the perfect mechanism for that.
+
+In the handshake sub-protocol, an endpoint sends a HANDSHAKE packet upon receiving the 1st REQ packet from
+a peer. At that point, the peer's raw address must have been inserted into its address book.
+
+Hence, if an endpoint receives a HANDSHAKE packet from a peer, the peer must know the endpoint's address, and
+the endpoint can stop including its raw address in the packet header.
+
+This concludes the discussion of the workflow.
+
+
+### 2.3 Implementation tips
+
+When implementing the handshake sub-protocol, keep in mind that the application
+does not know about the existence of HANDSHAKE packets, and therefore will not wait
+for their completion.
+
+For example, it is normal for a HANDSHAKE packet to encounter a send error
+because the peer has already been closed: the application might just send
+1 message and close the endpoint.
+
+It is also possible for an endpoint to be closed while there are in-flight HANDSHAKE
+packets, because the application might just want to receive 1 message, then
+close the endpoint. However, the action of receiving a message will cause
+a HANDSHAKE packet to be sent.
+
+## 3. Baseline features
+
+This part describes the 10 baseline features in protocol v4, which use only the send/receive
+functionality of the EFA device, and should be supported by any endpoint that implements protocol v4.
+
+### 3.1 REQ packet types
+
+Before getting into details of each baseline feature, we give a general introduction to
+the REQ packet types, which all these baseline features use to initialize the communication.
+
+REQ packets are not a single packet type but a category of packet types. In this chapter, 10 REQ packet types will be
+covered, as each baseline feature has its own REQ packet type.
+
+According to the type of communication they are used for, REQ packet types can be further divided into
+4 categories:
+
+RTM (Request To Message) is used by message sub-protocols (for two-sided communication). RTM can be
+further divided into MSGRTM and TAGRTM. TAGRTM is used when the application calls libfabric's tagged
+send/receive API (such as `fi_tsend` and `fi_trecv`); MSGRTM is used by the non-tagged send/receive
+API (such as `fi_send` and `fi_recv`).
+
+RTW (Request To Write) is used by emulated write sub-protocols.
+
+RTR (Request To Read) is used by emulated read sub-protocols.
+
+RTA (Request To Atomic) is used by emulated atomic sub-protocols.
+
+Regardless of category, all REQ packets consist of 3 parts: the REQ mandatory header, the REQ optional header and
+application data (optional).
+
+**REQ mandatory header** is unique for each individual REQ packet type. However, they all must start with
+the same 4-byte EFA RDM base header (section 1.3). Recall that a base header consists of 3 fields:
+`type`, `version` and `flags`. Among them, `flags` warrants more discussion here, as all REQ packets share
+the same set of flags, which is listed in table 3.1:
+
+Table: 3.1 a list of REQ packet flags
+
+| Bit Id | Value | Name | meaning |
+|-|-|-|-|
+|  0     | 0x1    | REQ_OPT_RAW_ADDR_HDR | This REQ packet has the optional raw address header |
+|  1     | 0x2    | REQ_OPT_CQ_DATA_HDR  | This REQ packet has the optional CQ data header |
+|  2     | 0x4    | REQ_MSG              | This REQ packet is used by two-sided communication |
+|  3     | 0x8    | REQ_TAGGED           | This REQ packet is used by tagged two-sided communication |
+|  4     | 0x10   | REQ_RMA              | This REQ packet is used by an emulated RMA (read or write) communication |
+|  5     | 0x20   | REQ_ATOMIC           | This REQ packet is used by an emulated atomic (write,fetch or compare) communication |
+| 15     | 0x8000 | CONNID_HDR           | This REQ packet has the optional connid header |
+
+Note, the CONNID_HDR flag is a universal flag (table 1.4), and is listed here for completeness.
+
+**REQ optional headers** contain additional information needed by the receiver of the REQ packets.
+As mentioned earlier, the existence of an optional header in a REQ packet is indicated by bits in the `flags`
+field of the base header. There are currently 3 REQ optional headers defined:
+
+1. the raw address header, which has the following format:
+
+Table: 3.2 format of REQ optional raw address header
+
+| Name | Length (bytes) | type | C language type |
+|-|-|-|-|
+| `size`  | 4      | integer | `uint32_t` |
+| `addr`  | `size` | array   | `uint8_t[]` |
+
+As can be seen, the optional raw address header consists of two fields, `size` and `addr`. The field `size` describes
+the number of bytes in the `addr` array. The field `addr` contains the raw address. The `size` field is necessary because
+the raw address format of EFA can be expanded in the future.
+
+As mentioned before, an endpoint will include its raw address in REQ packets before it receives a handshake packet back
+from a peer. This is because the peer might not have the endpoint's raw address in its address vector, and thus cannot
+communicate with the endpoint.
+
+2. the CQ data header, which is an 8-byte integer. The CQ data header is used when the application calls libfabric's
+CQ data send/write API (such as `fi_senddata`, `fi_tsenddata` and `fi_writedata`), which includes extra
+data in the RX completion entry written to the application.
+
+3. the connid (connection ID) header, which is a 4-byte integer. It is used when the peer has imposed the "connid header"
+extra request and the endpoint can support it. More information about this header is in section 4.4.
+
+Note, it is possible to have multiple optional REQ headers in one REQ packet. In this case, the order they appear
+in the REQ packet must be the same as the order their bits appear in the `flags` field, e.g. the raw address header
+must precede the CQ data header, and the CQ data header must precede the connid header.
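+
+As a sketch of this ordering rule, a receiver could skip the optional headers as
+follows (hypothetical helper; flag values taken from tables 1.4 and 3.1):
+
+```c
+#include <stdint.h>
+#include <string.h>
+
+#define REQ_OPT_RAW_ADDR_HDR 0x1
+#define REQ_OPT_CQ_DATA_HDR  0x2
+#define CONNID_HDR           0x8000
+
+/* Given a pointer just past the REQ mandatory header, return a pointer
+ * to the application data, honoring the flag-bit order. */
+static const uint8_t *skip_req_opt_headers(const uint8_t *p, uint16_t flags)
+{
+    if (flags & REQ_OPT_RAW_ADDR_HDR) {
+        uint32_t size;
+        memcpy(&size, p, sizeof(size));  /* `size` field (table 3.2) */
+        p += sizeof(uint32_t) + size;    /* skip `size` and `addr[]` */
+    }
+    if (flags & REQ_OPT_CQ_DATA_HDR)
+        p += sizeof(uint64_t);           /* 8-byte CQ data */
+    if (flags & CONNID_HDR)
+        p += sizeof(uint32_t);           /* 4-byte connid */
+    return p;
+}
+```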
+
+**Application data** follows immediately after the optional header. Note that not all REQ packet types contain
+application data. For example, the RTR (Request To Read) packet type does not contain application data.
+
+### 3.2 Baseline features for two-sided communication
+
+This section describes the 3 baseline features for two-sided communication: eager message, medium message, and long-CTS message.
+Each of them corresponds to a same-named sub-protocol. When describing a sub-protocol, we always follow
+the same structure: workflow, packet format and implementation tips.
+
+#### Eager message feature/sub-protocol
+
+The eager message feature/sub-protocol is used when the application's send buffer is small enough to fit in one packet.
+This protocol works in the following order:
+
+1. On the sender side, the application calls libfabric's send API, providing a send buffer.
+2. On the receiver side, the application calls libfabric's receive API, providing a receive buffer.
+3. The sender sends an EAGER_RTM (EAGER_MSGRTM or EAGER_TAGRTM) packet, which contains the application data.
+4. Upon receiving the packet, the receiver will process the received packet, and make sure the received
+   data is in the application's receive buffer.
+
+The following diagram illustrates the workflow:
+
+![eager message](message_eager.png)
+
+The mandatory header of an EAGER_RTM packet is described in table 3.3:
+
+Table: 3.3 format of an EAGER_RTM packet
+
+| Name | Length (bytes) | type | C language type | Note |
+|-|-|-|-|-|
+| `type`      | 1 | integer | `uint8_t`  | part of base header |
+| `version`   | 1 | integer | `uint8_t`  | part of base header|
+| `flags`     | 2 | integer | `uint16_t` | part of base header |
+| `msg_id`    | 4 | integer | `uint32_t` | message ID |
+| `tag`       | 8 | integer | `uint64_t` | for eager TAGRTM only |
+
+The field `msg_id` records the sending order of all RTM packets between two endpoints.
+The receiver can use it to re-order the received RTM packets from the endpoint.
+
+When implementing the eager message sub-protocol, a few points are worth attention:
+
+1. Notice that `msg_id` is a 4-byte integer, which means its maximum value is 4,294,967,295.
+After it reaches the maximum value, the next message's `msg_id` becomes 0. This "wrap around" of
+the message ID can happen when two endpoints communicate for an extended period of time. Implementations
+must be able to handle it (see the comparison sketch after this list).
+
+2. The receiver can either use the application buffer to receive data directly (such an implementation is called zero copy receive),
+or it can use a bounce buffer to temporarily hold the application data and copy the data to the application's receive buffer
+later. The difficulty of implementing zero copy receive is that the EFA device does not guarantee ordered delivery (see Part 0);
+therefore, if the application wants ordered send (`FI_ORDER_SAS`), using a bounce buffer might be the only choice.
+
+3. If a bounce buffer is to be used to receive packets, the receiver needs to be able to handle an "unexpected message", which
+is an eager RTM packet that arrives before the application has called libfabric's receive API.
+
+4. If the application does not require ordered send, it is possible to use the application's receive buffer to receive data
+directly. In this case, the receiver might need the sender to keep the packet header length constant throughout the communication.
+The extra request "constant header length" is designed for this use case; see chapter 4.3 for more discussion on this topic.
+
+5. One might notice that there is no application data length in the header, so how can the receiver of an eager RTM packet
+   know how much application data is in the packet? The answer is to use the following formula:
+
+        application_data_length = total_packet_size - RTM mandatory header length - REQ optional header length
+
+   The total packet size is reported by the EFA device when a packet is received, and the REQ optional header length can be derived from
+   the `flags` field in the base header. The data length is not included in the header because eager messages
+   are the most sensitive to header length, and we want the header to be as compact as possible.
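+
+Regarding point 1 above, a wrap-around-safe comparison can be done with serial-number
+style arithmetic, e.g. (a sketch, not the EFA provider's actual code):
+
+```c
+#include <stdint.h>
+
+/* Returns <0 if a logically precedes b, 0 if equal, >0 otherwise,
+ * remaining correct across the 32-bit wrap-around of msg_id. */
+static int msg_id_cmp(uint32_t a, uint32_t b)
+{
+    return (int32_t)(a - b);
+}
+```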
+
+#### Medium message feature/sub-protocol
+
+The medium message protocol splits application data into multiple MEDIUM_RTM (either MEDIUM_MSGRTM or
+MEDIUM_TAGRTM) packets, and the sender will try to send them all at once.
+
+In principle, the medium message sub-protocol can be used on messages of any size. However, it is not
+recommended to use the medium message sub-protocol for long messages, because it has no flow
+control and thus can overwhelm the receiver and cause network congestion. The exact size boundary for
+the medium message protocol to be used is up to the implementation to decide.
+
+The following diagram illustrates its workflow:
+
+![medium message](message_medium.png)
+
+Table 3.4 describes the binary structure of a MEDIUM_RTM packet's mandatory header:
+
+Table: 3.4 the format of a MEDIUM_RTM packet's mandatory header
+
+| Name | Length (bytes) | type | C language type | Note |
+|-|-|-|-|-|
+| `type`        | 1 | integer | `uint8_t`  | part of base header |
+| `version`     | 1 | integer | `uint8_t`  | part of base header|
+| `flags`       | 2 | integer | `uint16_t` | part of base header |
+| `msg_id`      | 4 | integer | `uint32_t` | message ID |
+| `seg_length` | 8 | integer | `uint64_t` | application data length |
+| `seg_offset` | 8 | integer | `uint64_t` | application data offset |
+| `tag`         | 8 | integer | `uint64_t` | for medium TAGRTM only |
+
+Most of the fields have been introduced before, and their meaning does not change.
+The two new fields are `seg_length` and `seg_offset`. (`seg` means segment, which
+refers to the segment of data in the packet)
+
+`seg_length` is the length of data segment in the medium RTM packet.
+
+`seg_offset` is the offset of data segment in the original send buffer.
+
+`seg_offset` seems redundant at first glance, as it can be deduced
+from the `seg_length` of the other packets.
+
+However, because the EFA device does not guarantee ordered delivery,
+the MEDIUM_RTM packets of the same message can arrive in a different order.
+Therefore, the recipient of MEDIUM_RTM packets needs `seg_offset` to
+put the data in the correct location in the receive buffer.
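+
+The segmentation itself is straightforward; a minimal sketch (the transmit hook
+is a hypothetical stand-in for building and posting one packet):
+
+```c
+#include <stdint.h>
+
+/* Hypothetical transmit hook; a real implementation would construct the
+ * MEDIUM_RTM header (table 3.4) around the segment and post the packet. */
+static void send_one_medium_rtm(const uint8_t *seg, uint64_t seg_offset,
+                                uint64_t seg_length)
+{
+    (void)seg; (void)seg_offset; (void)seg_length;
+}
+
+/* Split a send buffer into MEDIUM_RTM segments. `max_seg` would be the
+ * MTU size minus the MEDIUM_RTM header length. */
+static void send_medium_rtm_segments(const uint8_t *buf, uint64_t len,
+                                     uint64_t max_seg)
+{
+    for (uint64_t off = 0; off < len; off += max_seg) {
+        uint64_t seg_length = (len - off < max_seg) ? len - off : max_seg;
+        /* each packet carries seg_offset = off, so the receiver can
+         * place the data even if packets arrive out of order */
+        send_one_medium_rtm(buf + off, off, seg_length);
+    }
+}
+```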
+
+When implementing the medium message protocol, please keep in mind
+that because the EFA device has a limited TX queue (i.e. it can only send a
+limited number of packets at a time), it is possible that when
+sending multiple medium RTM packets, some of them are sent successfully while
+others are not, due to a temporary lack of resources. Implementations need
+to be able to handle this case.
+
+Note, this "partial send" situation is unique to the medium message sub-protocol,
+because the medium message sub-protocol is the only one that sends multiple
+REQ packets. In all other protocols, only 1 REQ packet is sent to initialize
+the communication; if the REQ fails to send, the whole communication is
+cancelled.
+
+#### Long-CTS message feature/sub-protocol
+
+The long-CTS message protocol is designed for long messages, because it supports flow control.
+
+In the long-CTS message protocol, the sender will send a LONGCTS_RTM (either LONGCTS_MSGRTM or LONGCTS_TAGRTM)
+packet to the receiver.
+
+Upon receiving the LONGCTS_RTM, the receiver will match it with the application's call to
+libfabric's receive API. The receiver will then calculate how much data it can handle,
+and include that information in a CTS packet it sends back to the sender.
+
+Upon receiving the CTS packet, the sender will send multiple DATA packets according to the
+information in the CTS packet.
+
+After receiving all the DATA packets it was expecting, the receiver will calculate again and
+send another CTS packet.
+
+The above process repeats until all data has been sent/received.
+
+The workflow of long-CTS protocol is demonstrated in the following diagram:
+
+![long-CTS message](message_longcts.png)
+
+There are 3 packet types involved in the long-CTS message sub-protocol: LONGCTS_RTM, CTS
+and DATA.
+
+A LONGCTS_RTM packet, like any REQ packet, consists of 3 parts: the LONGCTS_RTM mandatory
+header, the REQ optional header and application data.
+
+The format of the LONGCTS_RTM mandatory header is listed in table 3.5:
+
+Table: 3.5 The format of a LONGCTS_RTM packet's mandatory header
+
+| Name | Length (bytes) | type | C language type | Note |
+|-|-|-|-|-|
+| `type`           | 1 | integer | `uint8_t`  | part of base header |
+| `version`        | 1 | integer | `uint8_t`  | part of base header|
+| `flags`          | 2 | integer | `uint16_t` | part of base header |
+| `msg_id`         | 4 | integer | `uint32_t` | message ID |
+| `msg_length`     | 8 | integer | `uint64_t` | total length of the whole message |
+| `send_id`        | 4 | integer | `uint32_t` | ID of the ongoing TX operation |
+| `credit_request` | 4 | integer | `uint32_t` | number of data packets preferred to send |
+| `tag`            | 8 | integer | `uint64_t` | for LONGCTS TAGRTM only |
+
+There are 3 new fields:
+
+`msg_length` is the length of the whole application message.
+
+`send_id` is an ID the sending endpoint assigns to the send operation, and the receiver should include
+`send_id` in the CTS packet. An endpoint can have multiple send operations ongoing at the same time; thus,
+when processing a CTS packet from a receiver, it needs a way to locate the send operation the
+CTS packet is referring to.
+
+Admittedly, the introduction of `send_id` is not absolutely necessary, because the receiver could have
+included `msg_id` in the CTS header, and the sender should be able to locate the send operation using
+the combination of the receiver's address and the message ID. However, that approach would require
+the sending endpoint to set up a map between (address + `msg_id`) and send operations, and to look up the map
+every time it receives a CTS packet. We considered that approach too burdensome for an endpoint to implement,
+and decided to introduce a 4-byte `send_id` in the LONGCTS_RTM header to eliminate the cost.
+
+Another note about `send_id` is that it can be reused between messages. Because `send_id` is only used to
+distinguish in-flight TX operations, a send operation may have the same `send_id` as a previous
+one that has already finished.
+
+The field `send_id` was named `tx_id` when the protocol was initially introduced. It is renamed
+because the new name is clearer.
+
+The field `credit_request` is the number of DATA packets the sender wishes to send to the receiver.
+The receiver will try to honor the request, but is not obligated to. However, the receiver must allow
+the sender to send at least 1 DATA packet, to keep the communication moving forward.
+
+Besides the LONGCTS_RTM packet, there are two other packet types used by the long-CTS message protocol:
+CTS and DATA.
+
+The binary format of a CTS packet is listed in table 3.6:
+
+Table: 3.6 the binary format of a CTS packet
+
+| Name | Length (bytes) | type | C language type | Note |
+|-|-|-|-|-|
+| `type`           | 1 | integer | `uint8_t`  | part of base header |
+| `version`        | 1 | integer | `uint8_t`  | part of base header|
+| `flags`          | 2 | integer | `uint16_t` | part of base header |
+| `multiuse(connid/padding)`  | 4 | integer | `uint32_t` | `connid` if CONNID_HDR flag is set, otherwise `padding` |
+| `send_id`        | 4 | integer | `uint32_t` | send id from LONGCTS_RTM |
+| `recv_id`        | 4 | integer | `uint32_t` | receive id to be used in DATA packet |
+| `recv_length`    | 8 | integer | `uint64_t` | number of bytes the receiver is ready to receive |
+
+The 3 new fields in the header are `multiuse`, `recv_id` and `recv_length`.
+
+The field `multiuse` is a 4-byte integer. As the name indicates, it is a multi-purpose field.
+Its exact usage is determined by the `flags` field.
+
+If the CONNID_HDR universal flag is toggled in `flags`, this field is the sender's connection ID (connid).
+Otherwise, it is a padding space.
+
+An implementation is free to choose how to use this field.
+
+Note, when protocol v4 was originally introduced, this field was simply a 4-byte padding space.
+Later, when we introduced the "connid header" extra request, we re-purposed this field to store the
+connid. Because "connid header" is an extra request, an endpoint is not obligated to comply.
+In practice, if an endpoint is using libfabric 1.10 to 1.13, it uses this field as padding.
+If an endpoint is using libfabric 1.14 and above, it uses this field to store `connid`.
+
+The field `recv_id` is similar to `send_id` introduced earlier, but for an ongoing receive operation.
+The sender should include `recv_id` in the DATA packets.
+
+The field `recv_length` is the number of bytes the receiver is ready to receive for this operation;
+it must be > 0 to keep the communication moving forward.
+
+The CTS packet header has 1 flag, `CTS_EMULATED_READ`, that can be set in the `flags` field. This flag
+indicates the CTS packet is used by the long-CTS emulated read protocol.
+The bit ID of this flag is 7, and its value is 0x80.
+
+A CTS packet does not contain application data.
+
+A DATA packet consists of two parts: the DATA packet header and application data.
+Table 3.7 shows the binary format of DATA packet header:
+
+Table: 3.7 the binary format of DATA packet header
+
+| Name | Length (bytes) | type | C language type | Note |
+|-|-|-|-|-|
+| `type`           | 1 | integer | `uint8_t`  | part of base header |
+| `version`        | 1 | integer | `uint8_t`  | part of base header|
+| `flags`          | 2 | integer | `uint16_t` | part of base header |
+| `recv_id`        | 4 | integer | `uint32_t` | `recv_id` from the CTS packet |
+| `seg_length`     | 8 | integer | `uint64_t` | length of the application data in the packet |
+| `seg_offset`     | 8 | integer | `uint64_t` | offset of the application data in the original send buffer |
+| `connid`         | 4 | integer | `uint32_t` | sender connection id, optional, |
+| `padding`        | 4 | integer | `uint32_t` | padding for connid, optional |
+
+The last two fields, `connid` and `padding`, were introduced with the extra request "connid in header".
+They are optional, which means an implementation is not required to include them in the DATA
+packet header. If an implementation does include them in the DATA packet header, it needs
+to toggle on the CONNID_HDR flag in the `flags` field (table 1.4).
+
+When implementing the long-CTS protocol, please keep in mind that although each implementation is allowed
+to choose its own flow control algorithm, it must allow some data to be sent in each CTS packet, i.e.
+the `recv_length` field in a CTS packet must be > 0. This is to avoid an infinite loop.
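+
+As an illustration, a trivial flow control step could look like this (a sketch only;
+real flow control algorithms are implementation-specific):
+
+```c
+#include <stdint.h>
+
+/* Compute `recv_length` for the next CTS packet: grant up to `window`
+ * bytes, but never 0 while data remains, so the transfer always makes
+ * progress. */
+static uint64_t next_recv_length(uint64_t bytes_remaining, uint64_t window)
+{
+    uint64_t grant = bytes_remaining < window ? bytes_remaining : window;
+    return grant > 0 ? grant : (bytes_remaining > 0 ? 1 : 0);
+}
+```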
+
+### 3.3 Baseline features for one-sided communication
+
+This section explains the 7 baseline features for one-sided communication. These features/sub-protocols
+emulate one-sided operations by using the send/receive functionality of the device. The 7 features are:
+emulated eager write, emulated long-CTS write, emulated short read, emulated long-CTS read, emulated write
+atomic, emulated fetch atomic and emulated compare atomic.
+
+Before getting into the details of each feature, we want to discuss some topics related to one-sided operations.
+
+There are 3 types of one-sided operations: write, read and atomic.
+
+Like two-sided communication, one-sided communication also involves two endpoints.
+However, only on one side will the application call libfabric's one-sided API (such as `fi_write`,
+`fi_read` and `fi_atomic`). In protocol v4, this side is called the requester.
+
+On the other side (called the responder), the application does not make libfabric API calls,
+but the EFA provider requires the application to keep the progress engine running on the responder
+to facilitate the communication. This is because the EFA provider only supports `FI_PROGRESS_MANUAL`.
+
+Generally, in one-sided communication, only on the requester side will libfabric write a completion
+to notify the application that a one-sided communication is finished. The only exception to this
+rule is when the application adds the `FI_REMOTE_CQ_DATA` flag when calling libfabric's write API;
+in this case, the provider is required to write a CQ entry on the responder with the CQ data in it.
+
+(In fact, there is another exception to this rule: if a provider claims support for
+the `FI_RMA_EVENT` capability, it needs to write a CQ entry for any one-sided operation
+on the responder. However, this exception does not apply to the EFA provider, because the EFA provider
+does not support the `FI_RMA_EVENT` capability.)
+
+One key difference between one-sided and two-sided communication is that in one-sided communication, the
+requester must know the remote buffer's information when submitting the request.
+
+In protocol v4, because one-sided operations are emulated, the remote buffer's information is stored
+in the REQ packet header. For that, protocol v4 defines a data type, `efa_rma_iov`, which is used by
+all REQ packets for one-sided communication.
+
+An `efa_rma_iov` struct consists of 3 members: `addr`, `len` and `key`. Each member is an 8-byte integer.
+`addr` is the remote buffer address, `len` is the remote buffer length, and `key` is the memory registration
+key of the remote buffer, which is provided by the responder through prior communication.
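+
+In C, this data type could be declared as follows (a sketch matching the description
+above; the actual definition in the EFA provider may differ):
+
+```c
+#include <stdint.h>
+
+struct efa_rma_iov {
+    uint64_t addr; /* remote buffer address */
+    uint64_t len;  /* remote buffer length */
+    uint64_t key;  /* memory registration key of the remote buffer */
+};
+```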
+
+Another difference is that one-sided operation does not support tag matching, thus each one-sided
+sub-protocol only needs to define 1 REQ packet type.
+
+#### Emulated eager write feature/sub-protocol
+
+The emulated eager write sub-protocol is used when the buffer size is small enough to fit in one
+packet.
+
+The workflow of the emulated eager write protocol is shown in the following diagram:
+
+![eager write](write_eager.png)
+
+The emulated eager write protocol is similar to the eager message protocol, except that an EAGER_RTW packet
+is used to initiate the communication. Like other REQ packets, an EAGER_RTW packet consists of the EAGER_RTW mandatory header,
+the REQ optional header and application data. The binary format of the EAGER_RTW mandatory header is listed
+in table 3.8:
+
+Table: 3.8 the binary format of EAGER_RTW packet's mandatory header
+
+| Name | Length (bytes) | type | C language type | Note |
+|-|-|-|-|-|
+| `type`           | 1 | integer | `uint8_t`  | part of base header |
+| `version`        | 1 | integer | `uint8_t`  | part of base header|
+| `flags`          | 2 | integer | `uint16_t` | part of base header |
+| `rma_iov_count`  | 4 | integer | `uint32_t` | number of RMA iov structures |
+| `rma_iov`        | `rma_iov_count` * 24 | array of `efa_rma_iov` | `efa_rma_iov[]` | remote buffer information |
+
+One thing worth noting is that there is no `msg_id` in the EAGER_RTW header, because the EFA provider does not support
+ordered write operations.
+
+#### Emulated long-CTS write feature/sub-protocol
+
+The emulated long-CTS write sub-protocol is used when the buffer size is too big to fit in one packet.
+
+The workflow of emulated long-CTS write generally follows the long-CTS message sub-protocol, as illustrated
+in the following diagram:
+
+![emulated long-CTS write](write_longcts.png)
+
+The main difference between the two protocols is that a LONGCTS_RTW packet is used instead of the
+LONGCTS_RTM packet. The binary format of the LONGCTS_RTW packet's mandatory header is listed in table 3.9:
+
+Table: 3.9 the format of LONGCTS_RTW packet's mandatory header
+
+| Name | Length (bytes) | type | C language type | Note |
+|-|-|-|-|-|
+| `type`           | 1 | integer | `uint8_t`  | part of base header |
+| `version`        | 1 | integer | `uint8_t`  | part of base header|
+| `flags`          | 2 | integer | `uint16_t` | part of base header |
+| `rma_iov_count`  | 4 | integer | `uint32_t` | number of RMA iov structure |
+| `msg_length`     | 8 | integer | `uint64_t` | total length of the application buffer |
+| `send_id`        | 4 | integer | `uint32_t` | ID of send operation |
+| `credit_request` | 4 | integer | `uint32_t` | number of packets requester is ready to send |
+| `rma_iov`        | `rma_iov_count` * 24 | array of `efa_rma_iov` | `efa_rma_iov[]` | remote buffer information |
+
+All fields have been described before, but some explanation is warranted for the `send_id` field. It is not
+named `write_id` because this protocol uses send/receive to emulate write; it is implied that
+the requester treats this communication as a send operation internally, and this communication is subject
+to the same flow control as a long-CTS message communication.
+
+#### Emulated read features/sub-protocols
+
+This section describes two emulated read sub-protocols: emulated short read and emulated long-CTS read. Both
+sub-protocols use send/receive to emulate read. The interesting part is that in an emulated read communication,
+the responder is the sender and the requester is the receiver.
+
+The workflow of the emulated short read protocol is illustrated in the following diagram:
+
+![emulated short read](read_short.png)
+
+As can be seen, in this protocol, the requester sends a short RTR packet to the responder, and the responder sends
+a READRSP packet back to the requester.
+
+The binary format of a SHORT_RTR mandatory header is listed in table 3.10:
+
+Table: 3.10 the format of a SHORT_RTR packet's mandatory header
+
+| Name | Length (bytes) | type | C language type | Note |
+|-|-|-|-|-|
+| `type`           | 1 | integer | `uint8_t`  | part of base header |
+| `version`        | 1 | integer | `uint8_t`  | part of base header|
+| `flags`          | 2 | integer | `uint16_t` | part of base header |
+| `rma_iov_count`  | 4 | integer | `uint32_t` | number of RMA iov structures |
+| `msg_length`     | 8 | integer | `uint64_t` | total length of the application buffer |
+| `recv_id`        | 4 | integer | `uint32_t` | ID of the receive operation, to be included in the READRSP packet |
+| `padding`        | 4 | integer | `uint32_t` | alignment for 8 bytes |
+| `rma_iov`        | `rma_iov_count` * 24 | array of `efa_rma_iov` | `efa_rma_iov[]` | remote buffer information |
+
+Among the fields, `recv_id` is the most interesting. As mentioned before, in an emulated read protocol, the requester is the
+receiver, so it is necessary to include `recv_id` in the request. The responder needs to include this `recv_id` in
+the READRSP packet, for the requester to properly process it.
+
+A READRSP (READ ReSPonse) packet consists of two parts: READRSP header and application data. The binary format
+of the READRSP header is in table 3.11:
+
+Table: 3.11 the format of a READRSP packet's header
+
+| Name | Length (bytes) | type | C language type | Note |
+|-|-|-|-|-|
+| `type`           | 1 | integer | `uint8_t`  | part of base header |
+| `version`        | 1 | integer | `uint8_t`  | part of base header|
+| `flags`          | 2 | integer | `uint16_t` | part of base header |
+| `multiuse(padding/connid)`         | 4 | integer | `uint32_t` | `connid` if CONNID_HDR flag is set, otherwise `padding` |
+| `send_id`        | 4 | integer | `uint32_t` | ID of the send operation, to be included in the CTS header |
+| `recv_id`        | 4 | integer | `uint32_t` | ID of the receive operation  |
+| `recv_length`    | 8 | integer | `uint64_t` | length of the application data in the packet |
+
+The field `multiuse` was introduced earlier with the CTS packet (table 3.6).
+It is a multi-purpose field, which can be used to store `connid` or as a padding
+space, depending on whether the CONNID_HDR universal flag is toggled in `flags`. See section 4.4
+for more information about the field `connid`.
+
+The workflow of the emulated long-CTS read sub-protocol is illustrated in the following diagram:
+
+![emulated long-CTS read](read_longcts.png)
+
+The protocol is started by the requester sending a LONGCTS_RTR packet. After that, the workflow generally follows that of the
+long-CTS message sub-protocol, except that the responder is the sender and the requester is the receiver.
+
+The mandatory header of LONGCTS_RTR packet is listed in table 3.12:
+
+Table: 3.12 the format of a LONGCTS_RTR packet's mandatory header
+
+| Name | Length (bytes) | type | C language type | Note |
+|-|-|-|-|-|
+| `type`           | 1 | integer | `uint8_t`  | part of base header |
+| `version`        | 1 | integer | `uint8_t`  | part of base header|
+| `flags`          | 2 | integer | `uint16_t` | part of base header |
+| `rma_iov_count`  | 4 | integer | `uint32_t` | number of RMA iov structures |
+| `msg_length`     | 8 | integer | `uint64_t` | total length of the application buffer |
+| `recv_id`        | 4 | integer | `uint32_t` | ID of the receive operation, to be included in the READRSP packet |
+| `recv_length`    | 4 | integer | `uint32_t` | number of bytes the requester is ready to receive |
+| `rma_iov`        | `rma_iov_count` * 24 | array of `efa_rma_iov` | `efa_rma_iov[]` | remote buffer information |
+
+The only difference between LONGCTS_RTR and SHORT_RTR is that the field `padding` in SHORT_RTR is replaced by the field `recv_length`.
+Here, the LONGCTS_RTR packet serves the same function as the first CTS packet in the long-CTS message sub-protocol. The reason
+is that when the endpoint is preparing the LONGCTS_RTR, it already knows it is going to receive some data; thus it should calculate
+how many bytes it is ready to receive using the flow control algorithm, and put that number in the packet.
+
+The short RTR protocol can only be used if the read buffer fits in one READRSP packet, so the maximum size for the short emulated
+read protocol is (MTU size - READRSP header size). For larger messages, the emulated long-CTS read protocol has
+to be used.
+
+#### Emulated atomic protocols
+
+This section describes the 3 emulated atomic protocols: emulated write atomic, emulated fetch atomic and emulated compare atomic.
+
+The workflow of emulated write atomic is illustrated in the following diagram:
+
+![atomic_write](atomic_write.png)
+
+It is similar to the emulated eager write sub-protocol, except that a WRITE_RTA packet is
+sent. Table 3.13 lists the binary structure of a WRITE_RTA packet's mandatory
+header:
+
+Table: 3.13 the format of a WRITE_RTA packet's mandatory header
+
+| Name | Length (bytes) | type | C language type | Note |
+|-|-|-|-|-|
+| `type`           | 1 | integer | `uint8_t`  | part of base header |
+| `version`        | 1 | integer | `uint8_t`  | part of base header|
+| `flags`          | 2 | integer | `uint16_t` | part of base header |
+| `msg_id`         | 4 | integer | `uint32_t` | message ID |
+| `rma_iov_count`  | 4 | integer | `uint32_t` | number of RMA iov structures |
+| `atomic_datatype`| 4 | integer | `uint32_t` | atomic data type |
+| `atomic_op`      | 4 | integer | `uint32_t` | atomic operation ID |
+| `pad`            | 4 | integer | `uint32_t` | alignment for 8 bytes |
+| `rma_iov`        | `rma_iov_count` * 24 | array of `efa_rma_iov` | `efa_rma_iov[]` | remote buffer information |
+
+The two new fields introduced are `atomic_datatype` and `atomic_op`. Atomic data types and atomic operations
+are defined in the libfabric standard. A list of atomic data types can be found in the libfabric [fi_atomic](https://ofiwg.github.io/libfabric/v1.4.0/man/fi_atomic.3.html) man page.
+
+The field `msg_id` provides the message ID. It is used to implement ordered atomic operations, which are supported by the libfabric EFA provider,
+and are required by some applications, such as MPICH.
+
+The workflows of emulated fetch/compare atomic are the same, as illustrated in the following diagram:
+
+![atomic_fetch_compare](atomic_fetch_compare.png)
+
+Compared to write atomic, the differences are:
+
+First, a FETCH_RTA/COMPARE_RTA packet is used to initiate the communication.
+Second, the responder will send an ATOMRSP (atomic response) packet back.
+
+The binary formats of FETCH_RTA and COMPARE_RTA are the same.
+Table 3.14 shows the format of the mandatory header of a FETCH_RTA/COMPARE_RTA packet:
+
+Table: 3.14 the format of a FETCH_RTA/COMPARE_RTA packet's mandatory header
+
+| Name | Length (bytes) | type | C language type | Note |
+|-|-|-|-|-|
+| `type`           | 1 | integer | `uint8_t`  | part of base header |
+| `version`        | 1 | integer | `uint8_t`  | part of base header|
+| `flags`          | 2 | integer | `uint16_t` | part of base header |
+| `msg_id`         | 4 | integer | `uint32_t` | message ID |
+| `rma_iov_count`  | 4 | integer | `uint32_t` | number of RMA iov structures |
+| `atomic_datatype`| 4 | integer | `uint32_t` | atomic data type |
+| `atomic_op`      | 4 | integer | `uint32_t` | atomic operation ID |
+| `recv_id`        | 4 | integer | `uint32_t` | ID of the receive operation on the requester side |
+| `rma_iov`        | `rma_iov_count` * 24 | array of `efa_rma_iov` | `efa_rma_iov[]` | remote buffer information |
+
+The differences between a FETCH_RTA and a COMPARE_RTA are:
+
+First, the value of `atomic_op` is different between FETCH_RTA and COMPARE_RTA.
+
+Second, the application data part of a COMPARE_RTA packet contains two segments of data: `buf` and `compare`.
+(see [fi_atomic](https://ofiwg.github.io/libfabric/v1.4.0/man/fi_atomic.3.html))
+
+The difference between a WRITE_RTA and a FETCH_RTA/COMPARE_RTA is that the field `pad` is replaced by `recv_id`.
+Because send/receive is used to emulate a fetch/compare atomic operation, the requester is going to receive
+data from the responder. The field `recv_id` is the ID of the receive operation on the requester side, which is to
+be included in the header of the ATOMRSP packet.
+
+An ATOMRSP packet consists of two parts: header and application data. Table 3.15 shows the format of the header of an ATOMRSP packet:
+
+Table: 3.15 the binary format of an ATOMRSP packet header.
+
+| Name | Length (bytes) | type | C language type | Note |
+|-|-|-|-|-|
+| `type`           | 1 | integer | `uint8_t`  | part of base header |
+| `version`        | 1 | integer | `uint8_t`  | part of base header|
+| `flags`          | 2 | integer | `uint16_t` | part of base header |
+| `multiuse(connid/padding)` | 4 | integer | `uint32_t` | `connid` if CONNID_HDR is set, otherwise `padding` |
+| `reserved`       | 4 | integer | `uint32_t` | reserved for future use |
+| `recv_id`        | 4 | integer | `uint32_t` | ID of the receive operation on the requester side |
+| `seg_length`     | 8 | integer | `uint64_t` | length of the application data in the packet |
+
+The field `multiuse` was introduced earlier with the CTS packet (table 3.6).
+It is a multi-purpose field, which can be used to store `connid` or as a padding
+space, depending on whether the CONNID_HDR universal flag is toggled in `flags`. See section 4.4
+for more information about the field `connid`.
+
+## 4. Extra features and requests
+
+This chapter describes the extra features and requests of protocol v4.
+
+### 4.1 RDMA read based data transfer (RDMA read)
+
+The extra feature "RDMA read based data transfer" (RDMA read) was introduced together
+with protocol v4, when libfabric 1.10 was released. It was assigned ID 0.
+
+It is defined as an extra feature because there is a set of requirements (firmware,
+EFA kernel module and rdma-core) to be met before an endpoint can use the RDMA
+read capability; therefore, an endpoint cannot assume that the other party supports RDMA read.
+
+The "RDMA read" extra feature corresponds to the following sub-protocols:
+long-read message, emulated long-read write, direct read.
+
+#### Long-read message sub-protocol
+
+The long-read message sub-protocol uses RDMA read to implement two-sided communication.
+
+The workflow of the long-read message sub-protocol is illustrated in the following diagram:
+
+![long-read message](message_longread.png)
+
+There are two packet types involved in this protocol: LONGREAD_RTM and EOR (End Of Read).
+
+LONGREAD_RTM is sent by the sender to initiate the communication.
+
+Like all REQ packets, a LONGREAD_RTM consists of 3 parts: the mandatory header, the REQ optional
+header and application data. However, the application data part of a LONGREAD_RTM is
+special: it is not the data in the application's send buffer, but information about the
+send buffer.
+
+In the long-read message sub-protocol, the sender needs to construct a `read_iov`, which is
+an array of `efa_rma_iov` describing the application's send buffer. The `read_iov` is used
+as the application data in the LONGREAD_RTM packet.
+
+The binary format of a LONGREAD_RTM packet's mandatory header is listed in table 4.1
+
+Table: 4.1 the binary format of a LONGREAD_RTM packet's mandatory header
+
+| Name | Length (bytes) | type | C language type | Note |
+|-|-|-|-|-|
+| `type`           | 1 | integer | `uint8_t`  | part of base header |
+| `version`        | 1 | integer | `uint8_t`  | part of base header|
+| `flags`          | 2 | integer | `uint16_t` | part of base header |
+| `msg_id`         | 4 | integer | `uint32_t` | message ID |
+| `msg_length`     | 8 | integer | `uint64_t` | total length of the message |
+| `send_id`        | 4 | integer | `uint32_t` | ID of the send operation |
+| `read_iov_count` | 4 | integer | `uint32_t` | number of iov to read |
+
+Notice the new field `read_iov_count`, which is the number of `struct efa_rma_iov` entries in `read_iov`.
+
+To construct `read_iov`, the sender needs to make sure the send buffer is registered with the EFA device, and fill
+the registration key in `read_iov`.
+
+There are two ways to achieve that:
+
+First, if the buffer has already been registered with the device, the application will provide
+a memory descriptor along with the send buffer, and the registration key can be extracted from the descriptor.
+
+Second, if the buffer has not been registered with the EFA device, the sender needs to register the buffer,
+and can get the key from the registration. Note that because memory registration is a limited resource,
+it is possible for memory registration to fail, and the sender needs to be able to handle that case.
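+
+The second path could look like the following sketch, using libfabric's `fi_mr_reg`
+and `fi_mr_key` (assumptions: `domain` is an open domain, `struct efa_rma_iov` is as
+in section 3.3, and error handling is minimal):
+
+```c
+#include <rdma/fabric.h>
+#include <rdma/fi_domain.h>
+
+struct efa_rma_iov {
+    uint64_t addr, len, key;
+};
+
+/* Register the send buffer and fill one read_iov entry with its
+ * address, length and registration key. */
+static int fill_read_iov(struct fid_domain *domain, void *buf, size_t len,
+                         struct efa_rma_iov *iov)
+{
+    struct fid_mr *mr;
+    int ret = fi_mr_reg(domain, buf, len, FI_REMOTE_READ,
+                        0, 0, 0, &mr, NULL);
+    if (ret)
+        return ret; /* registration can fail; the caller must handle it */
+
+    iov->addr = (uint64_t)(uintptr_t)buf;
+    iov->len  = len;
+    iov->key  = fi_mr_key(mr);
+    return 0;
+}
+```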
+
+Upon receiving a long-read RTM, the receiver will use RDMA read to copy data from the application's
+send buffer to the application's receive buffer directly (avoiding a bounce-buffer copy). That is why this protocol is
+sometimes referred to as zero-copy.
+
+After all reads are finished, the receiver will send an EOR packet to the sender to notify it
+that the work is done.
+
+The binary format of the EOR packet is listed in table 4.2
+
+Table: 4.2 the format of an EOR packet
+
+| Name | Length (bytes) | type | C language type | Note |
+|-|-|-|-|-|
+| `type`           | 1 | integer | `uint8_t`  | part of base header |
+| `version`        | 1 | integer | `uint8_t`  | part of base header|
+| `flags`          | 2 | integer | `uint16_t` | part of base header |
+| `send_id`        | 4 | integer | `uint32_t` | ID of the send operation |
+| `recv_id`        | 4 | integer | `uint32_t` | ID of the receive operation |
+| `multiuse(connid/padding)`  | 4 | integer | `uint32_t` | `connid` if CONNID_HDR is set, otherwise `padding` |
+
+The field `multiuse` was introduced earlier with the CTS packet (table 3.6).
+It is a multi-purpose field, which can be used to store `connid` or as a padding
+space, depending on whether the CONNID_HDR universal flag is toggled in `flags`. See section 4.4
+for more information about the field `connid`.
+
+#### Emulated long-read write sub-protocol
+
+The emulated long-read write sub-protocol uses RDMA read to emulate a write operation.
+
+The workflow of this protocol is illustrated in the following diagram:
+
+![long-read write](write_longread.png)
+
+The workflow is similar to that of the long-read message sub-protocol. One key difference is that
+a LONGREAD_RTW packet is used to initiate the communication. The binary format of the LONGREAD_RTW
+packet's mandatory header is listed in table 4.3.
+
+Table: 4.3 the format of a LONGREAD_RTW packet's mandatory header
+
+| Name | Length (bytes) | type | C language type | Note |
+|-|-|-|-|-|
+| `type`           | 1 | integer | `uint8_t`  | part of base header |
+| `version`        | 1 | integer | `uint8_t`  | part of base header|
+| `flags`          | 2 | integer | `uint16_t` | part of base header |
+| `rma_iov_count`  | 4 | integer | `uint32_t` | number of RMA iov on the responder |
+| `msg_length`     | 8 | integer | `uint64_t` | total length of the message |
+| `send_id`        | 4 | integer | `uint32_t` | ID of the send operation |
+| `read_iov_count` | 4 | integer | `uint32_t` | number of iov on requester (to be read by responder) |
+| `rma_iov`        | `rma_iov_count` * 24 | array | `efa_rma_iov[]` | write iov information |
+
+One thing worth noting is the existence of both `rma_iov_count` and `read_iov_count`.
+
+Though both have been explained before, this is the first time they appear in the same header, so it might
+be helpful to revisit them.
+
+The fields `rma_iov_count` (and `rma_iov`) are provided by the application, which called libfabric's write API.
+They contain information about the target buffer (of the write) on the responder side.
+
+The fields `read_iov_count` (and `read_iov`) are constructed by the write requester,
+and contain information about the source buffer (of the write) on the requester side.
+The `read_iov` is not part of the mandatory header, because it is considered
+application data, which is located right after the REQ optional header.
+
+#### Direct read sub-protocol
+
+The direct read sub-protocol is the simplest sub-protocol in protocol v4. It does not involve a REQ packet:
+the read requester simply keeps using RDMA read on the responder. For this protocol, it is
+not necessary for the responder to keep the progress engine running.
+
+### 4.2 delivery complete
+
+The extra feature "delivery complete" was introduced with libfabric 1.12.0, and was assigned ID 1.
+
+Delivery complete is a requirement an application can impose on an endpoint when opening the endpoint.
+It requires that when the application gets the send/write completion, the application data must have
+been delivered to the application's target buffer.
+
+It is implemented as an extra feature because not all sub-protocols in the baseline
+features support delivery complete. Specifically, the following 6 sub-protocols do NOT:
+
+* eager message,
+* medium message,
+* long-CTS message,
+* eager write,
+* long-CTS write and
+* write atomic.
+
+These sub-protocols are designed to support a weaker completion model: transmit complete.
+Transmit complete requires that when the send/write completion is written, the data has been transmitted
+to the receiver/responder.
+
+The difference between transmit complete and delivery complete is that transmit complete indicates
+that the data has arrived at *a* buffer on the receiver/responder, but that buffer is not necessarily the application's
+target buffer. In fact, because of the limitations of the EFA device (no ordering guarantee) and the nature
+of the communications (emulated write), for some protocols the implementation has to use a temporary
+buffer to receive data, and copy the data to the application buffer later, and the time difference can be indefinite.
+
+The "delivery complete" extra feature was introduced to support applications with such requirements.
+It comes with 6 sub-protocols:
+
+* DC eager message,
+* DC medium message,
+* DC long-CTS message,
+* DC eager write,
+* DC long-CTS write and
+* DC write atomic.
+
+The workflows of these sub-protocols are the same as those of their non-DC counterparts, with 3 differences:
+
+First, each DC capable sub-protocol defines its own REQ packet type.
+
+Second, after the data is delivered to the application buffer, the receiver/responder will send a RECEIPT
+packet back to the sender/requester.
+
+Third, the sender/requester will not write a completion until it has received the RECEIPT packet.
+
+The binary format of a RECEIPT packet is as follows:
+
+| Name | Length (bytes) | type | C language type | Note |
+|-|-|-|-|-|
+| `type`           | 1 | integer | `uint8_t`  | part of base header |
+| `version`        | 1 | integer | `uint8_t`  | part of base header|
+| `flags`          | 2 | integer | `uint16_t` | part of base header |
+| `send_id`        | 4 | integer | `uint32_t` | ID of the send operation |
+| `msg_id`         | 4 | integer | `uint32_t` | message ID |
+| `multiuse(connid/padding)`  | 4 | integer | `uint32_t` | `connid` if CONNID_HDR is set in `flags`, otherwise `padding` |
+
+The field `multiuse` was introduced earlier with the CTS packet (table 3.6).
+It is a multi-purpose field, which can be used to store `connid` or as a padding
+space, depending on whether the CONNID_HDR universal flag is toggled in `flags`. See section 4.4
+for more information about the field `connid`.
+
+### 4.3 keep packet header length constant (constant header length) and zero-copy receive
+
+The extra request "keep packet header length constant" (constant header length) was introduced in libfabric 1.13.0
+release, and was assigned the ID 2.
+
+This extra request is useful if an endpoint wants to implement the "zero copy receive" optimization.
+
+As can be seen from previous discussions, because the EFA device does not support ordered delivery, an endpoint
+usually needs to use a temporary buffer to receive incoming packets, and copy the data to the application's receive buffer
+later. However, if an application has the following set of requirements:
+
+   1. does not need ordered send/receive (`FI_ORDER_SAS`).
+   2. only sends/receives eager messages.
+   3. does not use tagged send.
+   4. does not require `FI_DIRECTED_RECV` (the ability to receive only from a certain address).
+
+then it should be possible to receive data using the application's buffer directly, because under such conditions
+the receiver has no special requirements on the data it is going to receive, and thus will accept any message from
+the sender.
+
+However, there is one more hurdle to overcome in implementing "zero copy receive", which is the packet header
+length.
+
+Under the condition that endpoints "will only send eager messages" and "do not use tag matching", a sender will
+send data in an EAGER_MSGRTM packet. An EAGER_MSGRTM packet consists of the packet header and application data.
+However, we cannot put the packet header in the application's receive buffer. Therefore, for "zero copy receive" to work,
+the receiver needs:
+
+   a. to be able to predict the packet header length of an incoming EAGER_MSGRTM packet, and
+   b. the packet header length of EAGER_MSGRTM to not change throughout the communication.
+
+However, there is no guarantee in the base protocol that the packet header length of EAGER_MSGRTM will not
+change.
+
+In fact, because of the existence of the handshake sub-protocol, the packet header length of an EAGER_MSGRTM
+will definitely change. Recall the handshake sub-protocol's workflow:
+
+Before receiving a handshake packet, an endpoint will always include the optional raw address header in REQ packets.
+
+After receiving a handshake packet, an endpoint will stop including the optional raw address header in REQ packets.
+
+The extra feature "keep packet header length constant" (constant header length) is designed to solve this problem.
+
+When an endpoint toggles on this extra request, its peer will try to satisfy it by keeping the header length constant.
+Exactly how to achieve that is up to the implementation to decide; the easiest way is to keep including the
+raw address header in the EAGER_MSGRTM even after receiving the handshake packet, as the sketch below illustrates.
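+
+A minimal sender-side sketch of that approach (hypothetical names; it assumes the peer's request has been
+recorded on a local peer structure):
+
+```c
+#include <stdbool.h>
+
+/* Sketch: include the optional raw address header if no handshake has been
+ * received yet, or if the peer requested a constant header length. */
+static bool include_raw_addr_hdr(bool handshake_received,
+				 bool peer_requested_const_hdr_len)
+{
+	return !handshake_received || peer_requested_const_hdr_len;
+}
+```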
+
+Note that, because this is an extra request, an endpoint cannot assume its peer will comply with it. Therefore,
+the receiving endpoint must be able to handle the situation in which a received packet does not have the expected header length.
+
+In that case, an implementation has two choices:
+
+1. write a truncated-message error completion entry, or
+2. move the application data to the right place.
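+
+As a minimal sketch of the second choice (all names here are illustrative, not any provider's actual API),
+the receiver can shift the payload into the position the application expects:
+
+```c
+#include <string.h>
+
+/* Sketch: the payload landed at buf + actual_hdr_len, while the receiver,
+ * assuming a constant header length, expects it at buf + expected_hdr_len.
+ * A real implementation must also respect the buffer bounds. */
+static void fixup_recv_payload(char *buf, size_t expected_hdr_len,
+			       size_t actual_hdr_len, size_t data_len)
+{
+	if (actual_hdr_len != expected_hdr_len)
+		memmove(buf + expected_hdr_len, buf + actual_hdr_len, data_len);
+}
+```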
+
+Note that this extra request was initially introduced as an extra feature named "zero copy receive", but it was later
+realized that it is not a feature, because the peer does not do anything different; rather, it is an expectation the
+receiving endpoint has of the sender. Therefore, it was re-interpreted as an extra request named "constant header length".
+This re-interpretation does not change the implementation, and thus does not cause backward incompatibility.
+
+### 4.4 have connection ID in packet header (connid header)
+
+The "have connection ID in packet header" extra request was introduced with libfabric 1.14.0 release, and was
+assigned the ID 3.
+
+This extra request is designed to solve the "QP collision" problem, which is commonly experienced in
+client-server types of applications.
+
+The "QP collision" problem arise from fact that the EFA device uses the Device ID (GID)
++ QP number (QPN) as the unique
+identifier of a peer. Recall that raw address of EFA endpoint is consisted of 3 parts:
+GID + QPN + Connection ID (CONNID). EFA device only recognizes GID and QPN.
+The connection ID was generated by the endpoint itself during its initialization.
+
+Because of that, it is possible for an endpoint to receive packets from a destroyed QP that was used
+by a previous process with the same QPN. As can be seen throughout this document, the packets in the
+EFA RDM communication protocol are not independent; correctly processing a packet requires prior knowledge.
+
+For example, there is a `recv_id` in the header of a DATA packet (section 3.2), which assumes that
+the receiver maintains a list of receive operations and can find the operation corresponding to
+the message using `recv_id`.
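+
+A self-contained sketch of such a lookup (the array-based table and all names are illustrative, not any
+provider's actual data structures):
+
+```c
+#include <stddef.h>
+#include <stdint.h>
+
+struct rx_op {
+	int in_use;
+	/* ... receive operation state ... */
+};
+
+/* Sketch: resolve a DATA packet to its in-flight receive operation via
+ * the recv_id carried in the packet header. */
+static struct rx_op *lookup_rx_op(struct rx_op *table, size_t n, uint32_t recv_id)
+{
+	if (recv_id >= n || !table[recv_id].in_use)
+		return NULL; /* unknown receive: the packet may be stale */
+	return &table[recv_id];
+}
+```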
+
+To solve this problem, the receiver needs to know the full address of the sender. As shown in table
+1.5, EFA's full address consists of GID, QPN and CONNID. Currently, the EFA device reports the
+Address Handle Number (AHN) and QPN of a received packet. Because the GID can be obtained from the AHN,
+the only unknown part is the CONNID.
+
+The extra request "connid header" was introduced to address the issue. An endpoint can flag
+the bit correspond to 
+Also, because this is an extra request, an endpoint cannot assume that the peer supports it, and thus needs to be able
+to handle the case in which incoming packets do not carry the sender's connection ID. It is up to the
+implementation to decide whether, in this case, the endpoint should abort the communication or continue without
+using the extra request.
+
+A universal flag, CONNID_HDR (table 1.4), was designated for carrying the CONNID in the packet header. An implementation
+is not required to set connid. However, when it does include connid in a packet header, it needs to toggle on the
+CONNID_HDR flag in the `flags` field of the base header. The exact location of connid is different for each packet
+type; the sketch below illustrates it for the RECEIPT header.
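+
+For the RECEIPT header sketched in section 4.2, setting and reading connid might look as follows
+(CONNID_HDR's bit value is defined by table 1.4 and is not reproduced here; the helper names are illustrative):
+
+```c
+/* Sketch: store connid in the multiuse field and flag its presence. */
+static void set_connid(struct receipt_hdr *hdr, uint32_t connid)
+{
+	hdr->multiuse = connid;
+	hdr->flags |= CONNID_HDR;
+}
+
+/* Sketch: read connid back, if present. Returns 0 when the peer did not
+ * include one (it may not support this extra request). */
+static int get_connid(const struct receipt_hdr *hdr, uint32_t *connid)
+{
+	if (!(hdr->flags & CONNID_HDR))
+		return 0;
+	*connid = hdr->multiuse;
+	return 1;
+}
+```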
+
+## 5. What's not covered?
+
+The purpose of this document is to define the communication protocol, therefore it is intentionally written
+as "implementation neutral". It should be possible to rewrite a libfabric EFA provider from scratch by
+following this document, and the newly written EFA provider should be able to interoperate with any
+libfabric EFA provider since libfabric 1.10.0.
+
+Because of this, in various places the document provides tips about how to implement certain
+aspects of the protocol, but does not give specific instructions about what an implementation should do.
+
+There are a few things that this document considers implementation specific and thus does not cover.
+
+For example, this document does not specify the selection logic among protocols. There are
+three message sub-protocols, medium message, long-CTS message and long-read message, that can be used for
+a long message (though using the medium message sub-protocol for a long message is not recommended), and an
+implementation can choose the switch points between them without breaking the protocol.
+
+On a similar note, this document does not describe the shared memory (SHM) implementation of the EFA provider, which is
+an optimization an endpoint can use to speed up intra-instance communication. The idea is that if two endpoints are on
+the same instance, and opened by the same user, they can use the SHM mechanism on the instance to communicate,
+which has higher bandwidth and lower latency.
+
+It is not covered in this document because the EFA device is not used, and because two endpoints opened
+on the same instance by the same user should be using the same libfabric library, so the concern of backward
+compatibility does not apply in this case.
+
diff --git a/deps/libfabric/prov/efa/docs/handshake.drawio b/deps/libfabric/prov/efa/docs/handshake.drawio
new file mode 100644
index 0000000000000000000000000000000000000000..826b1b012507344c5a3c6621d4b689bce88a356e
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/handshake.drawio
@@ -0,0 +1 @@
+<mxfile host="drawio.corp.amazon.com" modified="2020-02-28T22:28:00.796Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:68.0) Gecko/20100101 Firefox/68.0" version="12.4.8" etag="AZ6Ufx_XuPE9K7quufH2" pages="2"><diagram id="Sn-nw_Uit1KFLxTq_Swt" name="Page-1">7Vpdb6M4FP010e4+ZIQxEHhsPraV+qFts9lp92XkgANWHZwxTgP769duTBIwI3Uq0kRl8hJ8MTY+x+fea5seHC3zS45WyS2LMO3ZVpT34Lhn23bgePJPWYqtBVi+v7XEnETatjdMyX+4rKitaxLhrFJRMEYFWVWNIUtTHIqKDXHONtVqC0arva5QjA3DNETUtH4lkUi2Vt+19vYrTOKk7BlY+s4SlZW1IUtQxDYHJjjpwRFnTGyvlvkIU4Veicv2uT9/cHf3Yhyn4i0PJLdBMpsP/fxb/5HM/Hm+SGZ9e9vKC6JrPeCH/KFne1Q2OZxzeRWrqwf8fY0zgbkeiihKfDhbpxFWXViy3iYhAk9XKFR3N3JKSFsillSWgLw0X1mP4gVzgfMDkx7CJWZLLHghq5RTyh5sH9ETCmp0N3tyQIl4ckCMp21Iz4d41/IeMnmhUfsJBOFbEcxWTCJ1egSdc0PQMxA0MMJpdKHELEshRVlGwiosEcqSVwhVQULDi0eF5xe3LD5peF8L47xSKnTph9jiqOIjTGQPoHMbkCttHFMkyEvVszTBqXv4ixH5Jvup73tyQIfUBTVKMrbmIdaPHboCo6XqFHBhrSGBeIyF0dArvbtxv5/xQcuMnztzTmvMqZYCz7d8F1ge9O3gpDz6HeOxrhtQx/vtNNYaGnwsccAxo9bkXhp+3xCJowQKbdTbRBHHWfaHwaoMNaLKYyY4e8YjRhmXlpSlsuZwQSitmRAlcaomg6RYBkM4VIGLyHzrQt9YkihS3TRGwmqs5ExINpl6CrQUHCG0mxk+jI6wYYbZLUTHdHrZn/4zGi2L0B9/8xy6jhb9oGMaM6ThtxTkDJ/bnsYaiStXMAfMXV3cjadXF9eT8xcUhLAlRQU1GoCpKKdhOoFjCQqYa57PrSgjarlBS1ErqDXUnqImNzfLGOZ39B7c8WeneP7X+7tJUZ+dOa+qHei+l7laQwP3WMzdD8aWfe3e387610+Dm9C+fIwbEv4vr7/z94OtJRZOXYQDww16R8orGikxc/euUyKTv9NS0kH/VmPAamk9tcsoPsi/AWAw13U1geDEDq7zeR5sa3cCHm93opk512BuiDJMidTBeeqpBf3UDzZ2e/An04+5L99BFpxTezEzde4gC/6pWTCz5e6xYJ/cI5l7oXdM7VM/p+qcX/VbP3bl5bHrb/o+Wiqs0nmm/kK0QnNCiSg+L4ugzC5KFhu0BJqykqPRaJvh/eDwga1VJtLZ84cySa3ncKdj69dGAbRqpAzeSAqARyIFdn2nwFiSvHdt49TPGdpb24zBQ0y+zhZPxfd5kaf31Ov3G3ZCu5BI1PTzgYlEIwlmHtEFEqpfoEB3901K+zTI4v7Dxq109t+Hwsn/</diagram><diagram id="w6Jld2eqyOy6yJ187mOw" name="Page-2">ldFPD4IgFADwT8OxTaU2PZvVxbVlf1w3JiRs6HOI0/z06cSMdakT8OPx4D0QDotur0jFY6BMIs+hHcJb5HlusA6GYZTnJL7vT5ArQU3QAonomUHHaCMoq61ADSC1qGzMoCxZpi0jSkFrhz1A2rdWJGdfkGREfutNUM1NFRtn8QMTOZ9vdh2zU5A52EDNCYX2g3CEcKgA9DQrupDJsXlzX3SV9peUtCeZXe9eEx/PBaymZLt/jrxLUKzUv6YeJsvThoX1wTh6AQ==</diagram></mxfile>
\ No newline at end of file
diff --git a/deps/libfabric/prov/efa/docs/message_dc_eager.drawio b/deps/libfabric/prov/efa/docs/message_dc_eager.drawio
new file mode 100644
index 0000000000000000000000000000000000000000..d843f02dea23e1947027785336091278cb4c25ba
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/message_dc_eager.drawio
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<mxfile host="drawio.corp.amazon.com" modified="2021-07-25T18:20:04.067Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15" etag="J5Nr3i0nlfrDectmQK_h" version="12.4.8" type="device"><diagram id="aD31mCK-H1K8Hn6eNp0I" name="Page-1">5Zhbb5swGIZ/TbSrTgESklwmlGXV0i1Lj+tN5GAHvBmMjFOgv352MIRTq66iTaRdxX5tbL7n/XwgPcPykzkDoXdJISI9vQ+TnnHe03VNn5jiRypppow0PRNchqHqdBCu8BNSYl+pOwxRVOnIKSUch1XRoUGAHF7RAGM0rnbbUlKdNQQuaghXDiBN9Q5D7mXqeNg/6F8Rdr18Zq2vWnyQd1ZC5AFI45Jk2D3DYpTyrOQnFiISXs4le+7LM63FizEU8Nc84F1OvJvNbJysz+7xzXiTbL2bM2XGIyA7FfA0DAl2AMc0kFjBfgyCN1uwYdj5FMlIUADFz3R5oSLjaY6L0V0AkZyx3zNmsYc5ugqBI1tjkSBC87hPRE0TxWYEKqhHxDhKSpKKaI6ojzhLRRfVmsNNq9X4YJWWa17JJlNpQGWHWwx8ACgKiuE/8DQaPFfIQVgEtCcXMuoyFMkiClwcoOMDPDWCZoNgg5FIv6lc2qLmEBBF2KligSDy9ghlRaBh6b3k+XmYV38pvPvKeVKppar2LFsOmIv4CxFMsn4IVnaWpgMlxMMWwrnGEBGr8bG6H7VhVzMsKRZvXBg8moi4yw6Pa85FdMccpJ4q7x+1gca1RBnWxsm4NMbZJ0ER9dvzYtRxXjzr74n41p1xTecmH+rc+D9zroZb65vd+KaZ5of6Nmn4Vr0buJQXlwGTCIazDRMlV5Ys6ocEZR2PfcLVst889gmXX21fBlsnyrJ7xGmjPj3Wgwbrc2ttT+f2ar26vhQMgS85BJsoLBiUCIrAeRVTxBn9gyxKKBNKQMUdzphtMSE1CRDsBnI/EziR0GcSo/CYTFWDjyGU07T6UnWOUZ7lhniFUTdWaWZtc9FGDasGLU7pHTi10G3j5zw1sbm43U7tuzgePLztS6RYFCfxMWKY1QNbM5pM3y39l/CbtV5Q42F7u/j9/cfonl6sWqB2cfTWztkWake7NNWPzK4uu/W07+7obfWt5TPStuyL5fXpb1BnmvzMglgsTaXEKOJF7jRW06uyJ39gOKjZq7/XpiWqh79mMlsPf3AZ9l8=</diagram></mxfile>
diff --git a/deps/libfabric/prov/efa/docs/message_dc_eager.png b/deps/libfabric/prov/efa/docs/message_dc_eager.png
new file mode 100644
index 0000000000000000000000000000000000000000..90aeee1e978fe438f1bd39fd0297061ec9347db5
Binary files /dev/null and b/deps/libfabric/prov/efa/docs/message_dc_eager.png differ
diff --git a/deps/libfabric/prov/efa/docs/message_dc_longcts.drawio b/deps/libfabric/prov/efa/docs/message_dc_longcts.drawio
new file mode 100644
index 0000000000000000000000000000000000000000..34fe6b4f8c93d94f2a64352c40786b9eea8d566d
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/message_dc_longcts.drawio
@@ -0,0 +1 @@
+<mxfile host="drawio.corp.amazon.com" modified="2021-07-25T18:29:53.714Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0" etag="9H__y9q8GqsaccNC4onW" version="12.4.8" type="device"><diagram id="OzKOerL5p1M4QQdfBGPd" name="Page-1">5ZpRd5owFMc/jWdP7QECiI8Wnds5bddVu3V76UGIkNNAXMCK+/QLEqok6dZ5wHLqk+QSgfx/917ujfaAG+cT6i2jKxJA3DO0IO+BUc8wdE132Edh2ZSWvt4vDSFFAZ+0M0zRb1h9k1tXKIBpbWJGCM7Qsm70SZJAP6vZPErJuj5tQXD9rksvhJJh6ntYtn5HQRaVVsfSdvZPEIVRdWdd42dir5rMDWnkBWS9ZwLjHnApIVl5FOcuxIV4lS7l9z6+cPb5wShMstd8IboaRHfzCyd/OLtHd848X0R3Z0Z5lScPr/iCh8slRr6XIZIUsnrba1yi+cKbU+R/SIuVwCRgH8Obz3xl2aaSi5JVEsDijloPXKwjlMHp0vOLs2vmIMwWZTFmI50dyivgi3qCNIP5nomvaAJJDDO6YVOqs5W63L0AH653rJ6nRHucbG7zuHuEz1feKcgOuIj/ISiQBL2FPkRsRVvplpSEFKbFIUxClMA3VxB0TUFbUlDSiPnfsIhtNvKxl6bIr8sSeGm0lbAYMGno5r7Q89yqhj+4vNvBKK+NNnz0oraZR0OY/WUFg3IeDGqpRSawJ7GlULiyUYhZOD7VE5JKdn6HG4LYE+9CpD9gC99HPBDQpWRFfci/tp9BxCs5dVeRfKBURrrQ1g2e1324Z/Qb9ozDCVdvp44gBo0hBhLiwVERO8dC3BFyYkjpoCFwev+4sTmQwNULiZAUD4uLOkIoI3wSLzEsp711NaGb545R09EavPX7sMo1/5C2XqLRsujolrxiqdEBbU1J25H7cPnleuLOpg+3sytJM7bUrC5MmlHyCF2CCWWWhLCaDlwsEMaCycMoTIqkxQSEzH5RCMco4iE/EaMgKG6jJFFnRUlW0meP0BAcA0i+r6tqQU07r3q6fURGA4iS6eRs+s11443vjB5sE6+ChSKxvO83gpTInaaqNRFRc28EJThF3mJB1f2AAuagoYiyHIGkLYWTqXAkva1Q0nWJyGg4G3YfSWM5zqhXycpmV1cgaS276fIGzPtOb1JWaqrgNdoreMeXl3EI8mv8Vb+mj+bm8ac9U+ycnRg4qylwYq/aMjh5h+7EwImBcig4IL6oWgYn1+onBq6pTR0g5tyWwVkSuFf8yLDrYFW/M3SjRHnRg/6jJxbLfVtRktgtlSRKWk3vv3c9zKToMA8MMzFegdi5tRxm8t7pifVbtgiyL4VSW/2WEoi8dXHa7RZQ7fe11W4piSj2JN53cpNyUlM1hHnkGkKxd3Fa5EwxTR1MTmzc2iZ36p2yFCoHkzvyFod+6q2yFCoHk2tvjyO3J97ihwYef83vJzOAZ3dJcNZOshQoKUh2pYa3NHAYONEDLBMcFZzcfN2O3fHnm5nEr3NVY1FuV54i1Yiv8pWqbNRFmPLvJk2VjWy4+wdmCXH3P1Yw/gM=</diagram></mxfile>
\ No newline at end of file
diff --git a/deps/libfabric/prov/efa/docs/message_dc_longcts.png b/deps/libfabric/prov/efa/docs/message_dc_longcts.png
new file mode 100644
index 0000000000000000000000000000000000000000..e08402099be4ad41e85402e9857335094465441f
Binary files /dev/null and b/deps/libfabric/prov/efa/docs/message_dc_longcts.png differ
diff --git a/deps/libfabric/prov/efa/docs/message_dc_medium.drawio b/deps/libfabric/prov/efa/docs/message_dc_medium.drawio
new file mode 100644
index 0000000000000000000000000000000000000000..6f5d9287629cff4280ca70c196d5573486c7a606
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/message_dc_medium.drawio
@@ -0,0 +1 @@
+<mxfile host="drawio.corp.amazon.com" modified="2021-07-25T18:25:50.342Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0" etag="2xACA87PeaCZUWKGTXLx" version="12.4.8" type="device"><diagram id="dTPj6XVY3x3ysBFnl7aE" name="Page-1">5VnbbuIwEP0atE+tciXJI7de1KJladltnyqTmMSqiVPHFNivX4c4QOxQ0W4KSPCC5/jGnDP2TELD7EwX1xQkUZ8EEDcMLVg0zG7DMHRNd/lXhixzxNGdHAgpCsSgDfCA/sJipkBnKIBpaSAjBDOUlEGfxDH0WQkDlJJ5ediE4PKuCQihAjz4AKvoHxSwKEddW9vgNxCFUbGzromeKSgGCyCNQEDmW5DZa5gdSgjLW9NFB+KMvIKXfN7Vjt71D6MwZvtMiPpeNBq33cXLxRMauePFJBpdGPkq7wDPhMOtJMHIBwyROKMVrNbAaDwBY4r8H2nmCYwD/tUa3ArP2LKgi5JZHMBsR61htucRYvAhAX7WO+cBwrGITTG3dN5UPRBOvUPK4GILEh5dQzKFjC75ENFbkLssm/ONVHqBRVsyNQUGRHSE64U3BPKG4PATfJoKn0PoQ8QdWjGXUBJSmGZNGIcohscn8NQYbCoMKhzx8GtlR5tbPgZpivwyLQFIoxWFmcGpocunjM9LuzCfBb0ro7soWUth7eSWARpC9oEHXj4OBqWbRVVgi2K7guECoxDz0/hevo+qaBc7DAjiv3gtsONxv7cVdiXlUjKjPhSztu8PaSFXChRbWifnRVlnFQRrr78eF86px4Wk9x6BomvHDIz6IkMNDe+goeHWHBr7Snws5SS6da1Zj266d9gj7Sm6lYuPkLAdtUcTc0bbY8pbYdbyyTTBMJ927IQqnQXv2Am1qKQ/Q7PML82LmNMm/vSYtxTmu52Xfq97O+q/DB/7CmXcU1bmJWWUvMIOwYRyJCa8YjTbE4SxBAGMwji73Dh/kOPtjDcuMW6JjikKgmybSiHKUlHC8tAwu8aHGW9/bfSmdNMYjqKNVSGNUYM0zE7eru/mOrOMeHgzvB+kDr7QFWXOK2UU/P93qpcVqi9lVOqmPq+el25y0f1l3bTD6qY+F5+Xbk5dupnfphu9e30z9Yth8jM1W7HXuR+7dsVT1+Xqc/qpS3dqSl22W5bS1Q+WuiolUZ929nhlty7gTuKtnWnvOB6HKNUqSdXVxHJ2gd6UAt06XI327P3q/7zx8NC1mo7V+x358VXtbwIrE0QFaUd7ISPf9dYXk4a8jnxq6ssZlbqpF9Sw1+ndDh5P/yyZtrUOFOXk7BUqO7KGYX5b1uDm5k+kXMPNX3Fm7x8=</diagram></mxfile>
\ No newline at end of file
diff --git a/deps/libfabric/prov/efa/docs/message_dc_medium.png b/deps/libfabric/prov/efa/docs/message_dc_medium.png
new file mode 100644
index 0000000000000000000000000000000000000000..9769883a3388aca866320f62eaa9b4da55f3cdbe
Binary files /dev/null and b/deps/libfabric/prov/efa/docs/message_dc_medium.png differ
diff --git a/deps/libfabric/prov/efa/docs/message_eager.drawio b/deps/libfabric/prov/efa/docs/message_eager.drawio
new file mode 100644
index 0000000000000000000000000000000000000000..56d85d550507de553df5e3bcc4ba06b8c1f1bf3c
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/message_eager.drawio
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<mxfile host="drawio.corp.amazon.com" modified="2021-07-19T13:47:34.573Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15" etag="WZxVpdtQeOrHKn9q1XDl" version="12.4.8" type="browser">
+  <diagram id="aD31mCK-H1K8Hn6eNp0I" name="Page-1">
+    <mxGraphModel dx="935" dy="627" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
+      <root>
+        <mxCell id="0"/>
+        <mxCell id="1" parent="0"/>
+        <mxCell id="hM9hUbB8x_-XiU8bxfhU-2" value="Application call libfabric&#39;s send API" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
+          <mxGeometry x="20" y="20" width="120" height="60" as="geometry"/>
+        </mxCell>
+        <mxCell id="hM9hUbB8x_-XiU8bxfhU-3" value="Receiver&#39;s progress engine" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
+          <mxGeometry x="220" y="20" width="120" height="60" as="geometry"/>
+        </mxCell>
+        <mxCell id="hM9hUbB8x_-XiU8bxfhU-6" value="" style="endArrow=classic;html=1;dashed=1;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" parent="1" target="hM9hUbB8x_-XiU8bxfhU-9" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="79.5" y="80" as="sourcePoint"/>
+            <mxPoint x="80" y="250" as="targetPoint"/>
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hM9hUbB8x_-XiU8bxfhU-7" value="" style="endArrow=classic;html=1;dashed=1;" parent="1" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="279.5" y="80" as="sourcePoint"/>
+            <mxPoint x="280" y="259" as="targetPoint"/>
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hM9hUbB8x_-XiU8bxfhU-8" value="" style="endArrow=classic;html=1;dashed=1;" parent="1" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="80" y="106" as="sourcePoint"/>
+            <mxPoint x="280" y="166" as="targetPoint"/>
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hM9hUbB8x_-XiU8bxfhU-9" value="Application got send&lt;br&gt;Completion" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
+          <mxGeometry x="20" y="260" width="120" height="60" as="geometry"/>
+        </mxCell>
+        <mxCell id="hM9hUbB8x_-XiU8bxfhU-10" value="Application got&lt;br&gt;receive&lt;br&gt;Completion" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
+          <mxGeometry x="220" y="260" width="120" height="60" as="geometry"/>
+        </mxCell>
+        <mxCell id="hM9hUbB8x_-XiU8bxfhU-14" value="EAGER_RTM&amp;nbsp;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;rotation=17;" parent="1" vertex="1">
+          <mxGeometry x="160" y="117" width="40" height="20" as="geometry"/>
+        </mxCell>
+        <mxCell id="L2E3QGy6i6LVfAEWww4Z-2" value="Application call libfabric&#39;s receive API" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
+          <mxGeometry x="365" y="137" width="120" height="60" as="geometry"/>
+        </mxCell>
+      </root>
+    </mxGraphModel>
+  </diagram>
+</mxfile>
diff --git a/deps/libfabric/prov/efa/docs/message_eager.png b/deps/libfabric/prov/efa/docs/message_eager.png
new file mode 100644
index 0000000000000000000000000000000000000000..978c8d19cd934a3eed5286fab8c0277780956cbe
Binary files /dev/null and b/deps/libfabric/prov/efa/docs/message_eager.png differ
diff --git a/deps/libfabric/prov/efa/docs/message_longcts.drawio b/deps/libfabric/prov/efa/docs/message_longcts.drawio
new file mode 100644
index 0000000000000000000000000000000000000000..0da67a90038e9213d5dfcf3a84584b8dc544301d
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/message_longcts.drawio
@@ -0,0 +1,119 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<mxfile host="drawio.corp.amazon.com" modified="2021-07-20T15:06:47.205Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15" etag="2F3kZEbBkw2vVyTIGyXy" version="12.4.8" type="browser">
+  <diagram id="OzKOerL5p1M4QQdfBGPd" name="Page-1">
+    <mxGraphModel dx="1418" dy="627" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
+      <root>
+        <mxCell id="0"/>
+        <mxCell id="1" parent="0"/>
+        <mxCell id="hM9hUbB8x_-XiU8bxfhU-2" value="Application call Libfabric&#39;s send API" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
+          <mxGeometry x="120" y="30" width="120" height="60" as="geometry"/>
+        </mxCell>
+        <mxCell id="hM9hUbB8x_-XiU8bxfhU-3" value="Receiver&#39;s progress engine" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
+          <mxGeometry x="320" y="30" width="120" height="60" as="geometry"/>
+        </mxCell>
+        <mxCell id="hM9hUbB8x_-XiU8bxfhU-6" value="" style="endArrow=classic;html=1;dashed=1;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" parent="1" target="hM9hUbB8x_-XiU8bxfhU-9" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="179.5" y="90" as="sourcePoint"/>
+            <mxPoint x="180" y="260" as="targetPoint"/>
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hM9hUbB8x_-XiU8bxfhU-7" value="" style="endArrow=classic;html=1;dashed=1;" parent="1" target="hM9hUbB8x_-XiU8bxfhU-10" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="379.5" y="90" as="sourcePoint"/>
+            <mxPoint x="380" y="269" as="targetPoint"/>
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hM9hUbB8x_-XiU8bxfhU-8" value="" style="endArrow=classic;html=1;dashed=1;" parent="1" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="180" y="130" as="sourcePoint"/>
+            <mxPoint x="380" y="170" as="targetPoint"/>
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hM9hUbB8x_-XiU8bxfhU-9" value="Application got libfaric&#39;s send completion" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
+          <mxGeometry x="120" y="530" width="120" height="60" as="geometry"/>
+        </mxCell>
+        <mxCell id="hM9hUbB8x_-XiU8bxfhU-10" value="Application got Libfabric&#39;s receive completion" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
+          <mxGeometry x="320" y="530" width="120" height="60" as="geometry"/>
+        </mxCell>
+        <mxCell id="hM9hUbB8x_-XiU8bxfhU-14" value="LONGCTS RTM" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;rotation=11;" parent="1" vertex="1">
+          <mxGeometry x="234.82" y="130" width="100.18" height="20" as="geometry"/>
+        </mxCell>
+        <mxCell id="nSG-SVCCmyc8D_64ludf-9" value="" style="endArrow=classic;html=1;dashed=1;" parent="1" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="380" y="180" as="sourcePoint"/>
+            <mxPoint x="180" y="220" as="targetPoint"/>
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="nSG-SVCCmyc8D_64ludf-10" value="CTS" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;rotation=349;" parent="1" vertex="1">
+          <mxGeometry x="258" y="186" width="40" height="10" as="geometry"/>
+        </mxCell>
+        <mxCell id="nSG-SVCCmyc8D_64ludf-11" value="DATA" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;rotation=11;" parent="1" vertex="1">
+          <mxGeometry x="225" y="230" width="110" height="20" as="geometry"/>
+        </mxCell>
+        <mxCell id="nSG-SVCCmyc8D_64ludf-12" value="" style="endArrow=classic;html=1;dashed=1;" parent="1" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="180" y="230" as="sourcePoint"/>
+            <mxPoint x="380" y="270" as="targetPoint"/>
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="ELLmg3xNlQ1Nrk4ykZ6T-2" value="" style="endArrow=classic;html=1;dashed=1;" parent="1" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="180" y="250" as="sourcePoint"/>
+            <mxPoint x="380" y="290" as="targetPoint"/>
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="ELLmg3xNlQ1Nrk4ykZ6T-3" value="" style="endArrow=classic;html=1;dashed=1;" parent="1" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="180" y="270" as="sourcePoint"/>
+            <mxPoint x="380" y="310" as="targetPoint"/>
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="ELLmg3xNlQ1Nrk4ykZ6T-4" value="" style="endArrow=classic;html=1;dashed=1;" parent="1" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="180" y="290" as="sourcePoint"/>
+            <mxPoint x="380" y="330" as="targetPoint"/>
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="ELLmg3xNlQ1Nrk4ykZ6T-5" value="Application call Libfabric&#39;s receive API" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
+          <mxGeometry x="380" y="160" width="160" height="20" as="geometry"/>
+        </mxCell>
+        <mxCell id="ELLmg3xNlQ1Nrk4ykZ6T-6" value="" style="endArrow=classic;html=1;dashed=1;" parent="1" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="380" y="340" as="sourcePoint"/>
+            <mxPoint x="180" y="380" as="targetPoint"/>
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="ELLmg3xNlQ1Nrk4ykZ6T-8" value="CTS" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;rotation=349;" parent="1" vertex="1">
+          <mxGeometry x="260" y="347" width="40" height="10" as="geometry"/>
+        </mxCell>
+        <mxCell id="ELLmg3xNlQ1Nrk4ykZ6T-9" value="DATA" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;rotation=11;" parent="1" vertex="1">
+          <mxGeometry x="225" y="390" width="110" height="20" as="geometry"/>
+        </mxCell>
+        <mxCell id="ELLmg3xNlQ1Nrk4ykZ6T-10" value="" style="endArrow=classic;html=1;dashed=1;" parent="1" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="180" y="390" as="sourcePoint"/>
+            <mxPoint x="380" y="430" as="targetPoint"/>
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="ELLmg3xNlQ1Nrk4ykZ6T-11" value="" style="endArrow=classic;html=1;dashed=1;" parent="1" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="180" y="410" as="sourcePoint"/>
+            <mxPoint x="380" y="450" as="targetPoint"/>
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="ELLmg3xNlQ1Nrk4ykZ6T-12" value="" style="endArrow=classic;html=1;dashed=1;" parent="1" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="180" y="430" as="sourcePoint"/>
+            <mxPoint x="380" y="470" as="targetPoint"/>
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="ELLmg3xNlQ1Nrk4ykZ6T-13" value="" style="endArrow=classic;html=1;dashed=1;" parent="1" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="180" y="450" as="sourcePoint"/>
+            <mxPoint x="380" y="490" as="targetPoint"/>
+          </mxGeometry>
+        </mxCell>
+      </root>
+    </mxGraphModel>
+  </diagram>
+</mxfile>
diff --git a/deps/libfabric/prov/efa/docs/message_longcts.png b/deps/libfabric/prov/efa/docs/message_longcts.png
new file mode 100644
index 0000000000000000000000000000000000000000..b6f33c1e36878947a0036c6b1c5b9cc804b73867
Binary files /dev/null and b/deps/libfabric/prov/efa/docs/message_longcts.png differ
diff --git a/deps/libfabric/prov/efa/docs/message_longread.drawio b/deps/libfabric/prov/efa/docs/message_longread.drawio
new file mode 100644
index 0000000000000000000000000000000000000000..a6994ac90f42202fa9dd99d6aaba2725ce8fb244
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/message_longread.drawio
@@ -0,0 +1 @@
+<mxfile host="drawio.corp.amazon.com" modified="2021-07-21T03:58:26.253Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0" etag="bT_koqUHzDPkJrmW13qh" version="12.4.8" type="device"><diagram id="APAEDZxGAzosg-hluIWG" name="Page-1">3Vhdc6IwFP01zj51B0QUH/3a7s6s046tu21fnAgBMhsIG6Li/vpNICAQbF1rrdsnkpOQ5J5zc3OTljEKkmsKIn9KHIhbbc1JWsa41W7rnXaXfwSyzZCebmWAR5EjO+2AO/QHSlCT6Ao5MK50ZIRghqIqaJMwhDarYIBSsql2cwmuzhoBDyrAnQ2wiv5EDvMz1DK1Hf4VIs/PZ9Y12RKAvLMEYh84ZFOCjEnLGFFCWFYKkhHEgrycl+y/L3tai4VRGLJDfvCnfX++HFrJ4uoBza1l4vrzq3Y2yhrglTR4EEUY2YAhEgpaQToGRksXLCmyP8XCEhg6/DO4/SYtY9ucLkpWoQPFjFrLGG58xOBdBGzRuuEOwjGfBZjXdF5ULZBGrSFlMClB0qJrSALI6JZ3ka05udtqdbOTSs8xvyRTV2JAeodXDLwjkBckh//Ap6HwOYM2RNyglLmIEo/CWBRh6KEQvj+Bl8ZgV2FQ4Yi730BsbV6zMYhjZFdpcUDspxSKCqeGbh8En5/NvPoo6U0r46RS28raXm4ZoB5kz1jQz/pBpxJZVAVKFJsNDOcYhZjvxnU1HjXRLme4JYivuBC41+d2lxW2asrFZEVtKP8qx4/aQFbNUczaOBkvyjipExRWH+8XvRP7xfH65kfThQh8OoVViftnldg6l8QXolyN7iLOvlY3vR6c31i3vqJbNYnwCCvnEGm2FkQYZs3vfQDWfN567wMwDy+H0pkmFpdEqN55mVHjrIx2FEZnk8F4MbufKmxxI1mVkphR8guOCCaUIyHhSZsxdBHGNQhg5IUiLnHqIMeHgjKuGh7IhgA5jpimUYOqSpSwTG2+hBPJUgSFItroiiydBlXqQel0qbKqyjzmCbE2G08HaXAFTnrbS8uMIrgWrQ5gQFzkKAnkVYSTfZkqnkI1raZar2Ezdc8pW0e94nzsU1rJivT/M4HuqPttcjO70K1TCoBGp/82EbCtny8CPoVPv0BMxu7NqPd4f6u5G/K94fFFUcNe0XXpKvuBt1UR2V59a9EO21ecSrAtdYtEh/iZBZvNC967rlr3/gvdzWYzdg6WLfjYmHAzjqNkgYfd+8XGv47dH7+Bd2UqHnjA8x/NXrRazS+AlxE/9sSGhg2xP4+tO6jZcPQ2bZAj4gWv7h6CM3F3z+nG5C8=</diagram></mxfile>
\ No newline at end of file
diff --git a/deps/libfabric/prov/efa/docs/message_longread.png b/deps/libfabric/prov/efa/docs/message_longread.png
new file mode 100644
index 0000000000000000000000000000000000000000..b6b882f937677e75d693e782b101b0f76191f0d2
Binary files /dev/null and b/deps/libfabric/prov/efa/docs/message_longread.png differ
diff --git a/deps/libfabric/prov/efa/docs/message_medium.drawio b/deps/libfabric/prov/efa/docs/message_medium.drawio
new file mode 100644
index 0000000000000000000000000000000000000000..267daff7032f1d62732dd8024a34fc06bdd99f1c
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/message_medium.drawio
@@ -0,0 +1 @@
+<mxfile host="drawio.corp.amazon.com" modified="2021-07-20T12:12:56.946Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0" etag="jeIVPnwt3zu9YySMpD0F" version="12.4.8" type="device"><diagram id="dTPj6XVY3x3ysBFnl7aE" name="Page-1">5VjbbuIwEP0atE9UuZDbI5duW7VoK7pou0+VSUxi1YlTx5SwX782cYDEoaJtSpHgBc/xLXPO2DNJxxzG+RUFaTQmAcQdQwvyjjnqGIbeM2z+J5BVgTi6WwAhRYEctAUe0D8oQU2iCxTArDKQEYIZSqugT5IE+qyCAUrJsjpsTnB11xSEUAEefIBV9A8KWFSgrqVt8WuIwqjcWddkTwzKwRLIIhCQ5Q5kXnbMISWEFa04H0IsyCt5Keb93NO7eTAKE3bIhGjsRdPZwM2fuo9o6s7yeTTtGsUqrwAvpMP9NMXIBwyRRNAK1mtgNJuDGUX+j0x4ApOA//Xvb6RnbFXSRckiCaDYUeuYg2WEGHxIgS96lzxAOBaxGHNL503VA+nUK6QM5juQ9OgKkhgyuuJDZG9J7qpqLrdS6SUW7chkSwzI6Ag3C28J5A3J4Tv4NBU+J9CHiDu0Zi6lJKQwE02YhCiB30/gqTFoKwwqHPHw64ujzS0fgyxDfpWWAGTRmkJhcGro6lHweWGV5l9J79oY5RVrJa293DJAQ8je8MArxsGgcrOoCuxQbDUwXGIUYn4aX6v3URPtcod7gvgTbwR2PO73rsJuTbmMLKgP5azd+6O2kFsLFKu2TsGLss46CDZefzwunJbjYq++J6Jbe8KpynlHVc49M+VqdOua3Y5uunfcE+cpulVrg5CwPaWBjTmjgxnlrVC0fBKnGBbTvjvf1c6C/d35rix030NznV9a1BinTfzpMd9TmB9fjm6m46fJ77HCF3eTVUnJGCXPcEgwoRxJCK/mzMEcYVyDAEZhIm42Th7k+ECQxvXFfdkRoyAQ2zSqUNWJElbEhTky3qxSDhdGt2vXjOEowvQadDFa0IVZ6cvV7VJnPSOZXE/u7jMHd3VFlvPKFyX/n87zdYXayxeNuqnvkuelW70g/rBu2nF1U99Zz0s3py3dzC/Tjd4+v5h6d5L+ysx+4g3vZq7V8EZ0sf6dfurSnU2YKHmqIXL2py7LrUrp6kdLXY2SqK86B3xO21RvbX1R+xSnprXneByjTmskVVcTy9kFul0L9N6X1Wjc3H6TLq6q7Zd98/I/</diagram></mxfile>
\ No newline at end of file
diff --git a/deps/libfabric/prov/efa/docs/message_medium.png b/deps/libfabric/prov/efa/docs/message_medium.png
new file mode 100644
index 0000000000000000000000000000000000000000..465f717432f724e224a2bb0e6ac31af4c89ebb97
Binary files /dev/null and b/deps/libfabric/prov/efa/docs/message_medium.png differ
diff --git a/deps/libfabric/prov/efa/docs/overview.md b/deps/libfabric/prov/efa/docs/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..5dc4588624d75be3e8e784aeff4daa18400e3cf0
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/overview.md
@@ -0,0 +1,69 @@
+## EFA Libfabric Provider Documentation
+
+The EFA Libfabric provider supports the Amazon Elastic Fabric Adapter (EFA), an
+OS bypass network interface available on Amazon EC2 instances. The EFA device
+supports both reliable and unreliable datagram send and receive semantics; the
+EFA Libfabric provider adds additional functionality in software, such as tag
+matching, reordering, and software emulation for features the hardware does not
+support natively. EFA provides lower and more consistent latency and higher
+throughput compared to TCP transports, which translates into better application
+performance for HPC and Machine Learning applications on Amazon EC2.
+
+Please see the [fi_efa(7) man
+page](https://ofiwg.github.io/libfabric/master/man/fi_efa.7.html) for more
+information on the features and capabilities of the EFA Libfabric provider.
+
+### Background information
+
+The EFA developer documentation assumes a working knowledge of OS bypass
+networking and the Libfabric API. The [OFI Programmer's
+Guide](https://github.com/ofiwg/ofi-guide/blob/master/OFIGuide.md) provides
+motivation for Libfabric and defines the API and structures used by Libfabric
+applications.
+
+For more information on EFA, SRD and the [AWS Nitro
+System](https://aws.amazon.com/ec2/nitro/), please refer to these resources:
+
+* [A Cloud-Optimized Transport Protocol for Elastic and Scalable
+  HPC](https://ieeexplore.ieee.org/document/91673990) whitepaper
+* [AWS re:Invent 2019 - Monday Night Live with Peter
+  DeSantis](https://www.youtube.com/watch?v=GPUWATKe15E&feature=youtu.be&t=228)
+ keynote
+* [HPC Application Scaling with Elastic Fabric Adapter (EFA) and Scalable
+  Reliable Datagram
+  (SRD)](https://pages.awscloud.com/HPC-Application-Scaling-with-Elastic-Fabric-Adapter-EFA-and-Scalable-Reliable-Datagram-SRD_2020_0004-CMP_OD.html)
+  tech talk
+
+### Getting started with EFA Libfabric provider development
+
+You will need an Amazon EC2 instance which has EFA support. The [EFA getting
+started guide](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html) in
+the AWS documentation is a good reference on which Amazon EC2 instances support
+EFA and how to set up an EFA-enabled instance.
+
+To get started with EFA Libfabric provider development, you will need to either
+install pre-packaged versions of the following components or build them yourself:
+
+* [EFA kernel
+  driver](https://github.com/amzn/amzn-drivers/tree/master/kernel/linux/efa) - The
+  driver is required to enable the EFA device and is utilized by libibverbs and
+  Libfabric to set up and tear down device resources such as queue pairs,
+  completion queues, memory registration, and address handles. Some OS
+  distributions provide an up-to-date version of the EFA kernel driver such as
+  Amazon Linux 2 and Ubuntu.
+* [rdma-core](https://github.com/linux-rdma/rdma-core) - The EFA Libfabric
+  provider utilizes the libibverbs library which provides an abstraction layer
+  for the Linux kernel verbs interface. This avoids tightly coupling the
+  Libfabric provider to the EFA kernel driver and simplifies the Libfabric
+  provider. Similar to the driver, there are OS distributions that pre-package
+  rdma-core. EFA device support was added to rdma-core version 24.0. However,
+  it's best to use the latest rdma-core release for bugfixes and to support the
+  latest device features.
+* Ensure you have configured your instance to increase the locked memory limits
+  (unlimited is fine) and set aside [huge
+  pages](https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt) for the best
+  performance. The EFA Libfabric provider will utilize these for the bounce
+  buffers used for sends and matching receives.
+* `FI_HMEM` and CUDA support for NVIDIA GPUDirect + EFA is available, see the
+  `--with-cuda` configure flag. GPUDirect will be enabled by default if CUDA is
+  installed in the default system paths.
diff --git a/deps/libfabric/prov/efa/docs/pkt-processing.md b/deps/libfabric/prov/efa/docs/pkt-processing.md
new file mode 100644
index 0000000000000000000000000000000000000000..5995b28e97953bf1443548e439bb7f2bafd34c46
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/pkt-processing.md
@@ -0,0 +1,86 @@
+## EFA Libfabric Send/Receive/Completion Paths
+
+### Overview
+
+The EFA provider supports two different endpoint types, `FI_EP_RDM` and
+`FI_EP_DGRAM`. This document covers `FI_EP_RDM` as it implements a wire
+protocol and software support for some of the Libfabric API such as tag
+matching, send after send ordering guarantees, segmentation and reassembly for
+large messages, emulation for RMA and atomics, and more.
+
+There are a couple of key data structures that are used to implement these
+software-level features. The wire protocol that we implement is covered in a
+separate document.
+
+### Relevant data structures and functions
+
+`rxr_ep` contains device information and structures for the endpoint including
+the device/shm endpoints and completion queues and their state, the packet
+pools for recv/send, outstanding app receives to be matched, outstanding sends
+in progress, sends and receives queued due to resource exhaustion, unexpected
+messages, and structures to track out of order packets and remote peer
+capabilities and status.
+
+`rxr_tx_entry` contains information and structures for a send posted either
+directly by the app or indirectly such as an emulated read/write. When the send
+is completed a send completion will be written and the tx_entry will be
+released.
+
+`rxr_rx_entry` contains information and structures for a receive posted by the
+app. This structure is used for tag matching, to queue unexpected messages to
+be matched later, and to keep track of whether long message receives are
+complete. Just like the tx_entry, when done a receive completion is written to
+the app and the rx_entry is freed.
+
+`rxr_ep_progress` is the progress handler we register when the completion queue
+is created and is called via the util completion queue functions. While the EFA
+device will progress sends and receives posted to it, the Libfabric provider
+has to process those device completions, potentially copy data out of a bounce
+buffer into the application buffer, and write the application completions. This
+all happens in this function. The progress handler also progresses long
+messages and queued messages.
+
+### Dealing with device resource exhaustion
+
+The EFA device has fixed send and receive queue sizes which the Libfabric
+provider has to manage. In general, we try to write an error to the app when
+resources are exhausted as the app can manage resource exhaustion better than
+the provider. However, there are some cases where we have to queue packets or
+store state about a send or receive to be acted on later.
+
+The first case is control messages that have to be queued: for example, we may
+send parts of a message and then hit the device limit while sending a segmented,
+medium message, or we may fail to send a control packet containing information that
+can't be reconstructed in the future. `rxr_pkt_post_ctrl_or_queue` handles
+those cases.
+
+We also may queue an rx/tx entry if we're unable to continue sending segments
+or if we fail to post a control message for that entry. You'll find the lists
+where those are queued and progressed in `rxr_ep_progress_internal`.
+
+### Dealing with receiver not ready errors (RNR)
+
+Note: this functionality is currently turned off. We configure the device to do
+infinite retries as there are known bugs in the queuing/RNR logic that need to
+be resolved first.
+
+Finally, the EFA device may write an error completion for RNR, meaning there is
+no receive buffer available for the device to place the payload. This can
+happen when the application is not posting receive buffers fast enough, but for
+`FI_EP_RDM`, receive buffers are pre-posted as packets are processed. When
+we get RNR in that case, it means that the peer is overloaded. This can happen
+for any control or data packet we post, so to handle this we queue these
+packets to be sent later, after we back off for the remote peer.
+
+The occasional RNR is expected so we configure the device to retransmit a
+handful of times without writing an error to the host. This is to avoid the
+latency penalty of the device writing an error completion, the provider
+processing that completion, and trying the send again. However, once the
+Libfabric provider receives an RNR for the same packet that we already tried to
+retransmit we start random exponential backoff for that peer. We stop sending
+to that peer until the peer exits backoff, meaning we either received a
+successful send completion for that peer or the backoff timer expires.
+
+See `rxr_cq_queue_pkt` for where the packets are queued and backoff timers are
+set, and see `rxr_ep_check_peer_backoff_timer` for where those timers are
+checked and we allow sends to that remote peer again.
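+
+The backoff gate itself can be sketched as follows (a minimal illustration with hypothetical names; the
+actual peer structure and timer handling in the provider differ):
+
+```c
+#include <stdbool.h>
+#include <stdint.h>
+
+struct peer_state {
+	bool	 in_backoff;
+	uint64_t backoff_expires; /* absolute time, same clock as 'now' */
+};
+
+/* Sketch: decide whether a send to this peer is currently allowed. */
+static bool peer_send_allowed(struct peer_state *peer, uint64_t now)
+{
+	if (!peer->in_backoff)
+		return true;
+	if (now >= peer->backoff_expires) {
+		peer->in_backoff = false; /* timer expired: resume sending */
+		return true;
+	}
+	return false; /* still backing off; keep packets queued */
+}
+```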
diff --git a/deps/libfabric/prov/efa/docs/qp_collision.drawio b/deps/libfabric/prov/efa/docs/qp_collision.drawio
new file mode 100644
index 0000000000000000000000000000000000000000..1e10097f6b82784ef558392b3a327ada87075182
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/qp_collision.drawio
@@ -0,0 +1 @@
+<mxfile host="drawio.corp.amazon.com" modified="2021-08-03T20:04:57.121Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0" etag="jaKwibttJulnFHG_Ro7a" version="12.4.8" type="device"><diagram id="VktbdoG0Y93yvyAxLsci" name="Page-1">7Vpbb9owFP41qE+bcuXyWCjbKm1SJaS1e5q85EC8OjZyzK2/fjZxrg5toBR62QvEJ/aJ833fOfYxdNxRvP7K0Tz6wUIgHccK1x33quM4tud05ZeybFJLz+6nhhnHoe5UGCb4AbTR0tYFDiGpdBSMEYHnVWPAKIVAVGyIc7aqdpsyUn3qHM3AMEwCREzrLQ5FlFr7vlXYvwGeRdmTbUvfiVHWWRuSCIVsVTK544474oyJ9Cpej4Ao8DJc0nFfdtzNJ8aBijYD7sa9h4jDxBqPAeLfd7fJFH3SXpaILPQL68mKTYYA0PBSASlblFFpHIYoiUB5tWUjEjHRl+lQCA1Eiyna+YtLxQCLQfCN7LIqoM2QjUqoZjYOBAm8rLpHmuFZ7i5/wg3D8sGOpdXoZNxoLXpZO3ORsAUPQI8qQ/mEI9upORKIz0AYjuRF6bUL05apPVhzDNauaSIQDUBaLw0GOVvQMCdsFWEBkzkK1N2VjNomEpfABaz3pTGP+io8Xd0ssZwjVqa5a+1mtALhvni5j+E1PDtenvXK8PI+UlbwulX0ffvArOD5g8/+WfOC38BblwhFDl7Ky5m6HNNwriduK5Dl3HSnPzzrk1nkLEojG5wFHJCQrLf2cOZQ69USt3/uUOvuSZnz4SjzXh1nvb3SY0BQkuDgbSVFY4fT9w9MirXsKrPkSVOiPTh5TpT1iuBs84ZCrBZhjnvuCHOa6pLHWHM/XFqscebaZ+fMrErefVp0vUP3irW0aJSiL11B7rfFfxdkOQdv7OsVwqnJ8k1uJOoT3WRcRGzGKCLjwjos8pElW0Wf74zNNYF/QYiNPnJDC8Gq9MIaizs1XBY1aetX6c7VWnveNja60VYS8l222LdYuFNsn+xoiqy1ep53tGAZgXSDgntQ8plyFis4qpsLiXPFZgaeXBRElQu1obiHESOMd/KCe4oJqZkQwTOq4lXCDdI+VEsMDhC51DdiHIZbeTQtUFXJcCYkfkyNst3jrFlO7WDIHjjmotVvyAj1Ovp49JmL1vujz/OPRJ/rVRPqoHtu+syTvUfoc98mfUeLPre2eemdPfrMXQimiXzHWthdJA0bdhSGHJIk4/Ty58Fb99fB+BEY9moMO01HJb2TMtzqSLKJdPfiOdWckkUhELz92D5EVXiFYEatHXKI2RJ26vLJ8cVkdF76r1d1FlQ9L/ecBr02VQgvp9dWx0UjRClTCogQVXBJRUWwffN07WmpiQUlWp+pL3WZSH0BV8La/qQtv66v/gtlZwVWFsrgOEKRzeJH+bSEK/7a4I7/AQ==</diagram></mxfile>
\ No newline at end of file
diff --git a/deps/libfabric/prov/efa/docs/qp_collision.png b/deps/libfabric/prov/efa/docs/qp_collision.png
new file mode 100644
index 0000000000000000000000000000000000000000..4891cff0b0a9f19397558b85b66159baf693001f
Binary files /dev/null and b/deps/libfabric/prov/efa/docs/qp_collision.png differ
diff --git a/deps/libfabric/prov/efa/docs/read_longcts.drawio b/deps/libfabric/prov/efa/docs/read_longcts.drawio
new file mode 100644
index 0000000000000000000000000000000000000000..9490fb482c25e5d375ba89f8ea55dcde181368ab
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/read_longcts.drawio
@@ -0,0 +1 @@
+<mxfile host="drawio.corp.amazon.com" modified="2021-07-21T03:39:40.468Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0" etag="WTTV_khaAKcNuogEBubj" version="12.4.8" type="device"><diagram id="FbJiT4IxuIQ_kcybeP4A" name="Page-1">5Zpdb5swFIZ/TbSrVoAhkMuEdNmktauaduuuKhccsOZgZJyv/fqZYBqM6dRFhKImN8EHY+z38Tk+GAbAX25nDKbxNQ0RGVhGuB2A6cCyTNsair/csissrukVhojhUFY6GOb4D5JGQ1pXOESZUpFTSjhOVWNAkwQFXLFBxuhGrbagRL1rCiOkGeYBJLr1Jw55XFg9xzjYvyAcxeWdTUOeWcKysjRkMQzppmICVwPgM0p5cbTc+ojk4pW6FNd9fuXsS8cYSvhbLoivR/HD88TbPl084gfvebuIHy6sopU1JCs54HGaEhxAjmmSywr3bRD8vIDPDAefMlFiCIbib3z7VY6M70q5GF0lIcrvaAzAZBNjjuYpDPKzGzFBhC3mSyJKpjjURyAHtUaMo23FJEc0Q3SJONuJKuXZUl05vYAsbg6sXqrEFU5DaYNyekQvLR8UFAdSxP8QFGiC3qEspUIUttcuZTRiKMsPURLhBL27hKBvEg41CTWNUBKOc+cWpYDALMOBKksIs3gvYV4Q0rDdY67npVMWf0l594XpVintZOlVbTlkEeL/GMGoqIdCJbboBCoSOw0KlzaGiPDHtRqRmmSXd7ilWPT44CPuSAy8inhUQ5fRFQuQvKwaQuoteepU0eZAoYzW0H4avIz7+JnhtjwzjidcLk89QQxaQww0xKNOEXtdIe4JubpLmaAlcKbbrW+ONHBqJhFRriUSAV2mBBUVepZIOO++DJYhpqLoDe2XaPXUoQei2Zpo377fzPz7uTDe3d9pkomRclWXjDP6G/mUUCYsCRUpGpgsMCE1EyQ4SvIYJPRDwj7JdRPznYzliSUOw/w2jSBUVIzywk9EF1piY9nOpVH9gVqccS5dV6M1aoBlnQpWGduryfLVeHo3v+0/JuA4LXEClgrGs3QfMrvEAkwNy8deg4HrqQhay4/NbtdgoDuUv2J72XMtk3z/Yf/4HsTiX/TNz/tGCBWrtHgQNRK0qVTspQu24XE1SF7DojU8kcMl89nF/IfvL3eBN30a2mQVLpo2YQ5MGAoQXuMkOj8yZtnCu5HRd3POKxIKBC1FQlBrqL1I2AhOTwM/ODhNb/tYcGo6Yjl2p+CccwfnOC2Bc51Owem7ttPx/bina1UXibxlNKxep0rkG5HoWzIf25fqiw4wjszj604J7JPl8Y3gGrZ+9jsYffel1vYuhq+ArMwju0tPMvVH4nOLbpb6fgE4DU9NnUa3MjU9m/CmRSWn9mrm2FQBuCd7x9NM7tyzc+CBdsjZBuiW3Nml56VrlILXd/TeTs5UG6q/5zs1uba/qug9uZrP2fUc7mifa+2jB1E8fP5VVD98RAeu/gI=</diagram></mxfile>
\ No newline at end of file
diff --git a/deps/libfabric/prov/efa/docs/read_longcts.png b/deps/libfabric/prov/efa/docs/read_longcts.png
new file mode 100644
index 0000000000000000000000000000000000000000..d259c7c8917edbe6799634ccf77fa910200e62e4
Binary files /dev/null and b/deps/libfabric/prov/efa/docs/read_longcts.png differ
diff --git a/deps/libfabric/prov/efa/docs/read_short.drawio b/deps/libfabric/prov/efa/docs/read_short.drawio
new file mode 100644
index 0000000000000000000000000000000000000000..25971a173464e99f5812af9eb101700cb41190df
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/read_short.drawio
@@ -0,0 +1 @@
+<mxfile host="drawio.corp.amazon.com" modified="2021-07-21T03:42:10.864Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0" etag="IdlnvQ-IAYbCGyDJeS47" version="12.4.8" type="device"><diagram id="FbJiT4IxuIQ_kcybeP4A" name="Page-1">3ZjRbpswFIafJtpVq4BDgcsk7dpddKuSVeuuKgefgDWDkXEasqefCSbgOJ22iqVouQnnt7E5339sLEZonpa3AufJPSfARu6YlCN0PXJdZ+Jeqb9K2dWK7wS1EAtKdKdWWNKfoMWxVjeUQGF0lJwzSXNTjHiWQSQNDQvBt2a3NWfmrDmOwRKWEWa2+o0SmdRq4I1b/Q5onDQzO2PdkuKmsxaKBBO+7UjoZoTmgnNZX6XlHFgFr+FS3/fxldbDgwnI5J/ckNyHyeNqFpTPF0/0MViV6+Txwq1HecFsoxOe5jmjEZaUZxVWvB+D0dUarwSNPhQqEoCJ+ps+fNKZyV2DS/BNRqCacTxCs21CJSxzHFWtW1UgSktkylTkqEs7A53UCwgJZUfSGd0CT0GKnerStDZ0dXkhHW5brw5dko5PV1rDujziw8gtQXWhIf4FUGQBXUCRcwVF7NnlgscCiuoSsphm8O4I0dAQXlkILUaQkWm1uFUUMVwUNDKxEFwke4RVoNCI3VPF89Jrwu8a7z64Lo1op6NX2UosYpC/ySCs+wEx9hbbgQ5i7wThRhPA1Hp8MXekU9j1DA+cqidu14gfqsS7FodH1hV8IyLQt3W3kOORArNUrBqoyVgD7cvgkPfbK8PvuTLe7nDzehqIxag3i5FlcXhWi4NzWTwQ546XlIN6Ms7xz7s2Q8s48yQRc2kdJCKe5gzqDgM7SLiT934NNltMh+hnPixox0eHAUCbWNCWd18WX58XXxcWLpWlNJkUUvAfMOeMC6VkXB3P0GxNGTuSMKNxVu0/ih0ofVYxU7XOprohpYRU05w0wbRJcFmvEfUIPfniTrzLcfeHjvYY79L3LafCE0a5/8qoZl/vHpRvpteL5cPwbUKe15NPyDWNCVx7/TjntAU5li3/9/sX+YFpQW9nY6ev968K268Edff2Wwu6+QU=</diagram></mxfile>
\ No newline at end of file
diff --git a/deps/libfabric/prov/efa/docs/read_short.png b/deps/libfabric/prov/efa/docs/read_short.png
new file mode 100644
index 0000000000000000000000000000000000000000..7a88b04b0e60af6d9c84d5481fb230d20648fb65
Binary files /dev/null and b/deps/libfabric/prov/efa/docs/read_short.png differ
diff --git a/deps/libfabric/prov/efa/docs/write_dc_eager.drawio b/deps/libfabric/prov/efa/docs/write_dc_eager.drawio
new file mode 100644
index 0000000000000000000000000000000000000000..0df3b62d2546474bd3c81bc576b66bbd69aea4cf
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/write_dc_eager.drawio
@@ -0,0 +1 @@
+<mxfile host="drawio.corp.amazon.com" modified="2021-07-25T18:35:32.678Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0" etag="kGBGikaECE3CBg2lTirp" version="12.4.8" type="device"><diagram id="lewDNAd5vCCO2mxxK25Q" name="Page-1">5Zhdc6IwFIZ/jbNX3QFRxEtE7PbCbdfqtN0bJ8IRMhsIG2KF/voNEhSI7nQ7tnZmr8x5ExLO8+YLO4YTZdcMJeGU+kA6Xc3POsa40+3qmm6Jn0LJS2WgD0ohYNiXjQ7CPX6B6kmpbrAPaaMhp5RwnDRFj8YxeLyhIcbottlsTUlz1AQFoAj3HiKq+oB9Hpaq1dcO+jfAQViNrGuyJkJVYymkIfLptiYZbsdwGKW8LEWZA6SAV3Epn5ucqN2/GIOYv+aBcDoMF6uRlS2vHvHCWmXrcHHVLXt5RmQjE7aThGAPcUzjAiva9UHwao1WDHtfUhFtGeYgfu27G5kazytejG5iH4ohtY4x2oai5X2CvKJ2K2aI0EIeERHpoqimILN6BsYhq0kypWugEXCWiyaytqKbN8PtwSu90sKaT6bUkJwewb7jA0FRkBD/AaihAJ1BmlDBhO3YJYwGDNKiCHGAY7g8wc+G0FQQKowg9u1icYvIIyhNsdfE4qM03CEsAoGG5Y8Fz6/9KnySeHfBOGtEuYxOsuWIBcD/ksGwbAd+Y29RHagh7h8hXGkMiFiPz80d6Rh2OcIdxeKN9wYPhiLvusNWy7mUbpgH8qn6DtLqyGpNlH6rn5KL0s9uEuyzfvu8GJx5Xpz095P4dj7jVOeGH+qc9Z8518Kta+Z5fNNN80N9Gyq+NW8HAeXK5cCjUUKgbHDpk601681Ln2zVpbYG9DttMTOJeIvRiolSUJQWMSmvC2rV5GY5c6e3c3fp/FiO7bl9eeL6K5D3PhR5T0E+dpaufe3OlrP5g0JMJMqbWFLO6C9wKKFMKDEVVzZjtMaEtCREcBAX25fAB0IfFdjEWiG2rIiw7xfDHPWh6RSjvFxj4hUG57FGN1t7SfUhVrPmmDPdMziDfwN6yD3/xbz96WZTbT7h0ytdMeYch0LrBDgC7WLHeXszH7zxVGgf5sa7HQpHfVM/GWeu497czT//WjJ6xn6iKCvnVVPl1GKy+u+1mER4+I+g9PDwT4vh/gE=</diagram></mxfile>
\ No newline at end of file
diff --git a/deps/libfabric/prov/efa/docs/write_dc_eager.png b/deps/libfabric/prov/efa/docs/write_dc_eager.png
new file mode 100644
index 0000000000000000000000000000000000000000..cfee7b929d601cd849c12794499cfb7172bd64a1
Binary files /dev/null and b/deps/libfabric/prov/efa/docs/write_dc_eager.png differ
diff --git a/deps/libfabric/prov/efa/docs/write_eager.drawio b/deps/libfabric/prov/efa/docs/write_eager.drawio
new file mode 100644
index 0000000000000000000000000000000000000000..7d4e2e0086c2064149f628c256cb1a3b01fc335c
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/write_eager.drawio
@@ -0,0 +1 @@
+<mxfile host="drawio.corp.amazon.com" modified="2021-07-21T03:33:35.090Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0" etag="VSjpR7l5KNzOPeatHdgu" version="12.4.8" type="device"><diagram id="lewDNAd5vCCO2mxxK25Q" name="Page-1">5ZfRbtowFIafBu2qEyElhMtAadeLbh0tanuFTHJIrDlx5JgS9vQ7Jg7EMUzthEqlXcXnt2P7fL/tOB13nJY3guTJHY+AdXrdqOy4V51ez7nsefhQyqZSBo5fCbGgkW60Fx7ob9BiV6srGkFhNJScM0lzUwx5lkEoDY0IwddmsyVn5qg5icESHkLCbPWJRjKpVL/f3evfgMZJPbLT1TUpqRtroUhIxNcNyZ103LHgXFaltBwDU/BqLtV710dqdxMTkMm3vJDcDZPZYuSX84tnOvMX5TKZXfSqXl4JW+mEgzxnNCSS8kxhJds+GF0syULQ8EuB0VpQCfgM7m91anJT8xJ8lUWghux23NE6wZYPOQlV7RpXCGqJTBlGDhbtFHRWryAklA1Jp3QDPAUpNthE19Z0N2a43nvl1FrS8MnTGtHLI951vCeIBQ3xHUBdC+gUipwjE7FllwseCyhUEbKYZnB+gp8NoWchtBhBFgVqc2MUMlIUNDSxRKRItghVgGjE5lnx/NqvwxeNdxtclUa00dFRtpKIGORfMhhW7SAyzhbbgQbi/gHCtSaA4X58NU+kQ9j1CPec4ox3Bg+GmHfTYb/lXMFXIgT9VvMEaXXktxZKv9VPxcXqZ7sIdln/+7oYnHhdHPX3k/h2OuNs54Yf6pz/nznXwu10vdP45njeh/o2tHwzbwcxl9blIORpzqBqcO4vW2vVe+f+stWX2gbQ77zFzGM4i9FCYClWpVnGquuCXXV9O59O7n48Tubjn/Or4DE4P3HnDcgvPxT5pYV8EtxMpvPp45OFC7OUJpNCCv4LxpxxgUrG8b7mjpaUsZZEGI0zdXYhO0B9pJjhRmGBrkhpFKlhDppg2iS4rDYYTmFwGl8cr3WQOAPLl0O29N5vC4b7f5rqINr/GbqTPw==</diagram></mxfile>
\ No newline at end of file
diff --git a/deps/libfabric/prov/efa/docs/write_eager.png b/deps/libfabric/prov/efa/docs/write_eager.png
new file mode 100644
index 0000000000000000000000000000000000000000..fa0cffa77310d321baa9d74572cc8ecf31a2ff72
Binary files /dev/null and b/deps/libfabric/prov/efa/docs/write_eager.png differ
diff --git a/deps/libfabric/prov/efa/docs/write_longcts.drawio b/deps/libfabric/prov/efa/docs/write_longcts.drawio
new file mode 100644
index 0000000000000000000000000000000000000000..7307e2c97001229e7d971c28a8f3eac7c5174768
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/write_longcts.drawio
@@ -0,0 +1 @@
+<mxfile host="drawio.corp.amazon.com" modified="2021-07-21T04:01:36.429Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0" etag="PEkWYDq-Nzlm6bRhR7C_" version="12.4.8" type="device"><diagram id="HXWCCChorXi73rZmTyBJ" name="Page-1">5VrRkpowFP0ap0/bEYKIj8rabWd2t+3qdtu+OAgRMhsIE3DVfn0TCQpJ2mkdcJ3lieQQIDkn997cqz3gxtsb6qXRHQkg7pn9YNsD1z3TNCzTZheO7ApkaDgFEFIUiEFHYIZ+QQH2BbpGAcxqA3NCcI7SOuiTJIF+XsM8SsmmPmxFcP2rqRdCBZj5HlbRJxTkUYE6g/4R/whRGJVfNvriTuyVgwWQRV5ANhUITHvApYTkRSveuhBz8kpeiuc+/OHuYWIUJvm/PBDdjaLH5cTZLq6+o0dnuV1Fj1dm8ZYXD6/FgsdpipHv5YgknFZv/w6MlitvSZH/LmO9DUU5ZNfxl09iafmu5IuSdRJA/sl+D0w2ERs5Sz2f392wHcKwKI8x6xmsqS5BrOoF0hxuK5BY0g0kMczpjg0p75b0iv0FRHdzFOswJKoIZQvME/sjPLz5SCFrCBb/g1GgMPoAs5QwUuievJSSkMKMN2ESogS+OoXg0ii0FQoVjmASjLl1s56PvSxDfp2WwMuiPYW8w6ihu++cz/eDsvtD0LvvXG9rvZ3o/ZHb3KMhzP+yglExDgY156IqUKF4oGG4xCjEzCBf6i5JR7v4wheC2IyPNjIcsYVXJR5J0mVkTX0oHqv6EPlNTn2rKHugYEZ50X4bHNZ9+s4YNrwzTle4jE8XIjFoTGKgSDw6q8TOuSS+EOVkkzJAQ8IZw/Pa5kgRrn6U4PYknyR8EqcYFgMu7CAxePUwWLqYCqP3RCKtv07w/jBhYzadyZKyVshbHz4tHqZ3n+fThft1cT2ej1+dYGD8A8HWWQm2FIJvP9/fuPMZAx/mTwplbKV5nZcsp+QZugQTypCEsOMcmKwQxhLkYRQm3F8x/iDDJ5w3Zht4LG7EKAj4Z7RC1KWiJC9sik2hIW1MUA8ehk6bvkYbs7VjtJqZuGu6XyKfVcJTKe57PT9iVzYLl88CY8L8Dc9NEripDLxIHZuwKcnnA51udku6JbObq9k31413vnO9sC28DlaaKPC2w7cSdZ0Tw7dytJYlai58a4XTRJu9I7xI06m4QGCNGvKBA0dS0lZsSReejLZMyTAURbSB/OIkaSwqmfWopK1MGOf0boYald62e1O8UlPZidledjK9vY1DsL3HX417+mztnn/ac02hs2PCDZoSTi4stCycWk/tmHCyoZwqHJADVcvCqclVx4RrqgIHZJ/bsnADRbjxMa+i0IfoBSVhB7OrQyZ1juxKK03Tv4xcuk0ppmCdaFOycQI5TWvZptSqdseSK1sWcqiYUlvJlVYQtU7R7dwKjDS+ra3cSquIpgDxtp2b4pOaOjBYZz4waAoV3VLOkt3UycrJWVrbynU9LVZM5WTlzlzPMLqeFyumcrJyjRU0WPf4d75i+PFPkWD6Gw==</diagram></mxfile>
\ No newline at end of file
diff --git a/deps/libfabric/prov/efa/docs/write_longcts.png b/deps/libfabric/prov/efa/docs/write_longcts.png
new file mode 100644
index 0000000000000000000000000000000000000000..340307215f8cce547476cab74db8d5b595fdc1c7
Binary files /dev/null and b/deps/libfabric/prov/efa/docs/write_longcts.png differ
diff --git a/deps/libfabric/prov/efa/docs/write_longread.drawio b/deps/libfabric/prov/efa/docs/write_longread.drawio
new file mode 100644
index 0000000000000000000000000000000000000000..1a26218ccb02a0bb83cc66fbfd38aa06a19c4dd7
--- /dev/null
+++ b/deps/libfabric/prov/efa/docs/write_longread.drawio
@@ -0,0 +1 @@
+<mxfile host="drawio.corp.amazon.com" modified="2021-07-21T04:03:55.374Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0" etag="p7vFC4OL6i7cI91cCkVy" version="12.4.8" type="device"><diagram id="ZDJ_sdb5zI1M9X0xq4Zh" name="Page-1">3Vhbc6IwFP41zj7tjogXfES03c7U2lqdXl6cCBEyjYQJUWF//SYQEAi9bGut2xfN+XIIOd+XnJzQ0K11dE5B4I2JA3Gj1XSihj5stFpau9XlfwKJU6SnGSngUuRIpz1wi/5ACTYlukEODEuOjBDMUFAGbeL70GYlDFBKdmW3FcHltwbAhQpwawOsonfIYV6KGp3mHv8Nketlb9aasmcNMmcJhB5wyK4A6aOGblFCWNpaRxbEgryMl/S5s2d684lR6LO3POCN+958OTCixc97NDeW0cqb/2ylo2wB3siAzSDAyAYMEV/QCpIxMFquwJIi+0fIrR1FDPJ/8/pChsbijC9KNr4DxSubDX2w87jnbQBs0bvjK4RjHltjbmm8qYYgo9pCymBUgGRI55CsIaMxd5G9Gbtx2dzttdIyzCvo1JUYkMvDzQfeM8gbksR/IFRXCJ3CMCCcE5pwF1DiUhiKJvRd5MOvZ/DUKOwqFCocQd8xxebmlo1BGCK7TIsDQi+hUBicGhrfCz5/dTLzQdKbGMOoZMXSepZbBqgL2QsR9FM/6JRyi6pAgeJODcMZRiHm+3Fbzkh1tMs3XBPEZ5wL3OvzuIsKGxXlQrKhNpRPFTNIZSCjslA6lXFSXpRxkkWQR/3+ddE78Lp4v77Z4XQiAh9OYVXi/lElNo4l8YkoV6E7z7Mf1U2rJudP1q2vnnrRtNHqYk7XYEl5yxWtu7RwqMIWWQcYphXHVx+GlfVvfPVhmKWaArVXhNvWDf8Z+enkNz5OSooqsWcXi+loPJmNFtbNYmjOzBPgt/c6we3uMQluKwRfTq7OpyNzuJjO7hTGeKCsTEvIKHmCFsGEcsQnvKbTByuEcQUCGLm+SFucPsjxgaCNl9rYlB1r5DjiNbU6lJWihKUlOp/CgaTJc0aejDRFmnbN0q/mrMOV0qoy81DcO6bDsZnkXuAk18GkzSiCW9HrAAZOVLVDqKRXVOrVbaBjytRWrzzf+9BWiiTt/6yn2+r+Gk2mJ7p1CglPb/c/J+O1tONlvEf/8QmEZLiaWL2H2XVztSOXNV9jFDXsDd0WbrbfeFvlme3Dl5jm2/YVpxLEBbdAOIQvTLhTP+Fn51VxN15x79SHsV9g6YTfmBO4uf/WmLrvv9jqo78=</diagram></mxfile>
\ No newline at end of file
diff --git a/deps/libfabric/prov/efa/docs/write_longread.png b/deps/libfabric/prov/efa/docs/write_longread.png
new file mode 100644
index 0000000000000000000000000000000000000000..dbd0312a0441a9ebddcadfbd8605f1fe63c02c2d
Binary files /dev/null and b/deps/libfabric/prov/efa/docs/write_longread.png differ
diff --git a/deps/libfabric/prov/efa/src/efa.h b/deps/libfabric/prov/efa/src/efa.h
index e41cd38157adf527f903944449fdb59604d226e6..ce79b6f55b6f2e8f1b15623c9e1dd73db1358787 100644
--- a/deps/libfabric/prov/efa/src/efa.h
+++ b/deps/libfabric/prov/efa/src/efa.h
@@ -83,12 +83,19 @@
 #define EFA_EP_TYPE_IS_RDM(_info) \
 	(_info && _info->ep_attr && (_info->ep_attr->type == FI_EP_RDM))
 
+#define EFA_DEF_POOL_ALIGNMENT (8)
 #define EFA_MEM_ALIGNMENT (64)
 
 #define EFA_DEF_CQ_SIZE 1024
 #define EFA_MR_IOV_LIMIT 1
 #define EFA_MR_SUPPORTED_PERMISSIONS (FI_SEND | FI_RECV | FI_REMOTE_READ)
 
+/*
+ * Setting ibv_qp_attr.rnr_retry to this value when modifying the QP
+ * causes the firmware to retry indefinitely.
+ */
+#define EFA_RNR_INFINITE_RETRY 7
+
 /*
  * Multiplier to give some room in the device memory registration limits
  * to allow processes added to a running job to bootstrap.
@@ -101,6 +108,8 @@
  * Specific flags and attributes for shm provider
  */
 #define EFA_SHM_MAX_AV_COUNT       (256)
+/* maximum name length for shm endpoint */
+#define EFA_SHM_NAME_MAX	   (256)
 
 extern int efa_mr_cache_enable;
 extern size_t efa_mr_max_cached_count;
@@ -111,26 +120,29 @@ extern struct util_prov efa_util_prov;
 
 struct efa_fabric {
 	struct util_fabric	util_fabric;
+	struct fid_fabric *shm_fabric;
+#ifdef EFA_PERF_ENABLED
+	struct ofi_perfset perf_set;
+#endif
 };
 
-struct efa_ep_addr {
-	uint8_t			raw[16];
-	uint16_t		qpn;
-	uint16_t		pad;
-	uint32_t		qkey;
-	struct efa_ep_addr	*next;
-};
-
-#define EFA_EP_ADDR_LEN sizeof(struct efa_ep_addr)
 
 struct efa_ah {
-	struct ibv_ah	*ibv_ah;
-	uint16_t	ahn;
+	uint8_t		gid[EFA_GID_LEN]; /* efa device GID */
+	struct ibv_ah	*ibv_ah; /* created by ibv_create_ah() using GID */
+	uint16_t	ahn; /* address handle number */
+	int		refcnt; /* reference counter. Multiple efa_conn can share an efa_ah */
+	UT_hash_handle	hh; /* hash map handle, links all efa_ah with efa_av->ah_map */
 };
 
 struct efa_conn {
-	struct efa_ah		ah;
-	struct efa_ep_addr	ep_addr;
+	struct efa_ah		*ah;
+	struct efa_ep_addr	*ep_addr;
+	/* for FI_AV_TABLE, fi_addr is the same as util_av_fi_addr;
+	 * for FI_AV_MAP, fi_addr is a pointer to the efa_conn */
+	fi_addr_t		fi_addr;
+	fi_addr_t		util_av_fi_addr;
+	struct rdm_peer		rdm_peer;
 };
 
 /*
@@ -156,6 +168,62 @@ struct efa_domain {
 	size_t			qp_table_sz_m1;
 };
 
+/**
+ * @brief get a pointer to struct efa_domain from a domain_fid
+ *
+ * @param[in]	domain_fid	a fid to a domain
+ * @return	a pointer to the corresponding struct efa_domain
+ */
+static inline
+struct efa_domain *efa_domain_from_fid(struct fid_domain *domain_fid)
+{
+	struct util_domain *util_domain;
+	struct efa_domain_base *efa_domain_base;
+	struct rxr_domain *rxr_domain;
+	struct efa_domain *efa_domain;
+
+	util_domain = container_of(domain_fid, struct util_domain,
+				   domain_fid);
+	efa_domain_base = container_of(util_domain, struct efa_domain_base,
+				       util_domain.domain_fid);
+
+	/*
+	 * An rxr_domain fid was passed to the user if this is an RDM
+	 * endpoint, otherwise it is an efa_domain fid.  This will be
+	 * removed once the rxr and efa domain structures are combined.
+	 */
+	if (efa_domain_base->type == EFA_DOMAIN_RDM) {
+		rxr_domain = (struct rxr_domain *)efa_domain_base;
+		efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain,
+					  util_domain.domain_fid);
+	} else {
+		assert(efa_domain_base->type == EFA_DOMAIN_DGRAM);
+		efa_domain = (struct efa_domain *)efa_domain_base;
+	}
+
+	return efa_domain;
+}
+
+/**
+ * @brief get efa domain type from domain fid
+ *
+ * @param[in]	domain_fid	a fid to a domain
+ * @return	efa domain type, either EFA_DOMAIN_DGRAM or EFA_DOMAIN_RDM
+ */
+static inline
+enum efa_domain_type efa_domain_get_type(struct fid_domain *domain_fid)
+{
+	struct util_domain *util_domain;
+	struct efa_domain_base *efa_domain_base;
+
+	util_domain = container_of(domain_fid, struct util_domain,
+				   domain_fid);
+	efa_domain_base = container_of(util_domain, struct efa_domain_base,
+				       util_domain.domain_fid);
+
+	return efa_domain_base->type;
+}
+
 extern struct fi_ops_mr efa_domain_mr_ops;
 extern struct fi_ops_mr efa_domain_mr_cache_ops;
 int efa_mr_cache_entry_reg(struct ofi_mr_cache *cache,
@@ -244,6 +312,7 @@ struct efa_ep {
 	struct efa_cq		*scq;
 	struct efa_av		*av;
 	struct fi_info		*info;
+	size_t			rnr_retry;
 	void			*src_addr;
 	struct ibv_send_wr	xmit_more_wr_head;
 	struct ibv_send_wr	*xmit_more_wr_tail;
@@ -256,53 +325,63 @@ struct efa_ep {
 
 struct efa_send_wr {
 	struct ibv_send_wr wr;
-	struct ibv_sge sge[0];
+	struct ibv_sge sge[];
 };
 
 struct efa_recv_wr {
 	struct ibv_recv_wr wr;
-	struct ibv_sge sge[0];
+	struct ibv_sge sge[];
 };
 
-typedef struct efa_conn *
-	(*efa_addr_to_conn_func)
-	(struct efa_av *av, fi_addr_t addr);
-
 struct efa_av {
 	struct fid_av		*shm_rdm_av;
 	fi_addr_t		shm_rdm_addr_map[EFA_SHM_MAX_AV_COUNT];
 	struct efa_domain       *domain;
 	struct efa_ep           *ep;
 	size_t			used;
-	size_t			next;
 	size_t			shm_used;
 	enum fi_av_type		type;
-	efa_addr_to_conn_func	addr_to_conn;
-	struct efa_reverse_av	*reverse_av;
+	/* cur_reverse_av is a map from (ahn + qpn) to the current (latest) efa_conn.
+	 * prv_reverse_av is a map from (ahn + qpn + connid) to all previous efa_conns.
+	 * cur_reverse_av is faster to search because its key is smaller.
+	 */
+	struct efa_cur_reverse_av *cur_reverse_av;
+	struct efa_prv_reverse_av *prv_reverse_av;
+	struct efa_ah		*ah_map;
 	struct util_av		util_av;
 	enum fi_ep_type         ep_type;
-	/* Used only for FI_AV_TABLE */
-	struct efa_conn         **conn_table;
 };
 
 struct efa_av_entry {
 	uint8_t			ep_addr[EFA_EP_ADDR_LEN];
-	fi_addr_t		rdm_addr;
-	fi_addr_t		shm_rdm_addr;
-	bool			local_mapping;
+	struct efa_conn		conn;
+};
+
+struct efa_cur_reverse_av_key {
+	uint16_t ahn;
+	uint16_t qpn;
+};
+
+struct efa_cur_reverse_av {
+	struct efa_cur_reverse_av_key key;
+	struct efa_conn *conn;
+	UT_hash_handle hh;
 };
 
-struct efa_ah_qpn {
+struct efa_prv_reverse_av_key {
 	uint16_t ahn;
 	uint16_t qpn;
+	uint32_t connid;
 };
 
-struct efa_reverse_av {
-	struct efa_ah_qpn key;
-	fi_addr_t fi_addr;
+struct efa_prv_reverse_av {
+	struct efa_prv_reverse_av_key key;
+	struct efa_conn *conn;
 	UT_hash_handle hh;
 };
 
+#define EFA_DGRAM_CONNID (0x0)
+
 struct efa_ep_domain {
 	char		*suffix;
 	enum fi_ep_type	type;
@@ -360,24 +439,46 @@ int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
 		struct fid_av **av_fid, void *context);
 int efa_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr,
 		struct fid_cq **cq_fid, void *context);
+int efa_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric_fid,
+	       void *context);
+int efa_getinfo(uint32_t version, const char *node, const char *service,
+		uint64_t flags, const struct fi_info *hints, struct fi_info **info);
+void efa_finalize_prov(void);
 
 /* AV sub-functions */
-int efa_av_insert_addr(struct efa_av *av, struct efa_ep_addr *addr,
-		       fi_addr_t *fi_addr, uint64_t flags, void *context);
+int efa_av_insert_one(struct efa_av *av, struct efa_ep_addr *addr,
+		      fi_addr_t *fi_addr, uint64_t flags, void *context);
+
+struct efa_conn *efa_av_addr_to_conn(struct efa_av *av, fi_addr_t fi_addr);
 
 /* Caller must hold cq->inner_lock. */
 void efa_cq_inc_ref_cnt(struct efa_cq *cq, uint8_t sub_cq_idx);
 /* Caller must hold cq->inner_lock. */
 void efa_cq_dec_ref_cnt(struct efa_cq *cq, uint8_t sub_cq_idx);
 
-fi_addr_t efa_ahn_qpn_to_addr(struct efa_av *av, uint16_t ahn, uint16_t qpn);
+fi_addr_t efa_av_reverse_lookup_rdm(struct efa_av *av, uint16_t ahn, uint16_t qpn, struct rxr_pkt_entry *pkt_entry);
+
+fi_addr_t efa_av_reverse_lookup_dgram(struct efa_av *av, uint16_t ahn, uint16_t qpn);
 
-struct fi_provider *init_lower_efa_prov();
+int efa_init_prov(void);
+
+ssize_t efa_post_flush(struct efa_ep *ep, struct ibv_send_wr **bad_wr);
 
 ssize_t efa_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t count, fi_addr_t *src_addr);
 
 ssize_t efa_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *entry, uint64_t flags);
 
+/*
+ * When ON, huge pages are avoided for bounce buffers, so that the
+ * libibverbs fork support can be used safely.
+ */
+enum efa_fork_support_status {
+	EFA_FORK_SUPPORT_OFF = 0,
+	EFA_FORK_SUPPORT_ON,
+	EFA_FORK_SUPPORT_UNNEEDED,
+};
+extern enum efa_fork_support_status efa_fork_status;
+
 bool efa_device_support_rdma_read(void);
 
 static inline
@@ -390,18 +491,65 @@ bool efa_ep_support_rdma_read(struct fid_ep *ep_fid)
 }
 
 static inline
-bool efa_peer_support_rdma_read(struct rxr_peer *peer)
+bool efa_ep_support_rnr_retry_modify(struct fid_ep *ep_fid)
+{
+#ifdef HAVE_CAPS_RNR_RETRY
+	struct efa_ep *efa_ep;
+
+	efa_ep = container_of(ep_fid, struct efa_ep, util_ep.ep_fid);
+	return efa_ep->domain->ctx->device_caps & EFADV_DEVICE_ATTR_CAPS_RNR_RETRY;
+#else
+	return false;
+#endif
+}
+
+/**
+ * @brief return whether this endpoint should write an error cq entry for RNR.
+ *
+ * For an endpoint to write an RNR completion, two conditions must be met:
+ *
+ * First, the endpoint must be able to receive RNR completions from rdma-core,
+ * which means rnr_retry must be less than EFA_RNR_INFINITE_RETRY.
+ *
+ * Second, the application needs to request this feature when opening the
+ * endpoint (by setting info->domain_attr->resource_mgmt to FI_RM_DISABLED).
+ * The setting is saved as rxr_ep->handle_resource_management.
+ *
+ * @param[in]	ep	endpoint
+ */
+static inline
+bool rxr_ep_should_write_rnr_completion(struct rxr_ep *ep)
+{
+	return (rxr_env.rnr_retry < EFA_RNR_INFINITE_RETRY) &&
+		(ep->handle_resource_management == FI_RM_DISABLED);
+}
+
+static inline
+bool efa_peer_support_rdma_read(struct rdm_peer *peer)
 {
 	/* RDMA READ is an extra feature defined in version 4 (the base version).
 	 * Because it is an extra feature, an EP will assume the peer does not support
 	 * it before a handshake packet was received.
 	 */
 	return (peer->flags & RXR_PEER_HANDSHAKE_RECEIVED) &&
-	       (peer->features[0] & RXR_REQ_FEATURE_RDMA_READ);
+	       (peer->extra_info[0] & RXR_EXTRA_FEATURE_RDMA_READ);
+}
+
+static inline
+bool rxr_peer_support_delivery_complete(struct rdm_peer *peer)
+{
+	/* FI_DELIVERY_COMPLETE is an extra feature defined
+	 * in version 4 (the base version).
+	 * Because it is an extra feature,
+	 * an EP will assume the peer does not support
+	 * it before a handshake packet was received.
+	 */
+	return (peer->flags & RXR_PEER_HANDSHAKE_RECEIVED) &&
+	       (peer->extra_info[0] & RXR_EXTRA_FEATURE_DELIVERY_COMPLETE);
 }
 
 static inline
-bool efa_both_support_rdma_read(struct rxr_ep *ep, struct rxr_peer *peer)
+bool efa_both_support_rdma_read(struct rxr_ep *ep, struct rdm_peer *peer)
 {
 	if (!rxr_env.use_device_rdma)
 		return 0;
@@ -410,58 +558,82 @@ bool efa_both_support_rdma_read(struct rxr_ep *ep, struct rxr_peer *peer)
 	       (peer->is_self || efa_peer_support_rdma_read(peer));
 }
 
+/**
+ * @brief determines whether a peer needs the endpoint to include
+ * the raw address in the REQ packet header.
+ *
+ * There are two cases in which a peer needs the raw address in the REQ packet header:
+ *
+ * 1. The initial packets to a peer should include the raw address,
+ * because the peer might not have the ep's address in its address
+ * vector, leaving it unable to send packets back. Normally, once an
+ * endpoint has received a handshake packet from a peer, it can stop
+ * including the raw address in the packet header.
+ *
+ * 2. If the peer requested to keep the header length constant
+ * throughout the communication, the endpoint will include the raw
+ * address in the header even after receiving a handshake from the
+ * peer, to conform to the request. Usually a peer makes this request
+ * because it is in zero-copy receive mode, which requires the packet
+ * header size to remain the same.
+ *
+ * @param[in]	peer	pointer to rdm_peer
+ * @return	a boolean indicating whether the peer needs the raw address header
+ */
 static inline
-size_t efa_max_rdma_size(struct fid_ep *ep_fid)
+bool rxr_peer_need_raw_addr_hdr(struct rdm_peer *peer)
 {
-	struct efa_ep *efa_ep;
+	if (OFI_UNLIKELY(!(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED)))
+		return true;
 
-	efa_ep = container_of(ep_fid, struct efa_ep, util_ep.ep_fid);
-	return efa_ep->domain->ctx->max_rdma_size;
+	return peer->extra_info[0] & RXR_EXTRA_REQUEST_CONSTANT_HEADER_LENGTH;
 }
 
-static inline
-struct rxr_peer *efa_ep_get_peer(struct dlist_entry *ep_list_entry,
-				 fi_addr_t addr)
-{
-	struct util_ep *util_ep;
-	struct rxr_ep *rxr_ep;
-
-	util_ep = container_of(ep_list_entry, struct util_ep,
-			       av_entry);
-	rxr_ep = container_of(util_ep, struct rxr_ep, util_ep);
-	return rxr_ep_get_peer(rxr_ep, addr);
-}
 
+/**
+ * @brief determines whether a peer needs the endpoint to include
+ * the connection ID (connid) in the packet header.
+ *
+ * A connection ID is a 4-byte random integer that identifies an
+ * endpoint. Including the connection ID in a packet's header allows
+ * the peer to identify the sender of the packet. It is necessary
+ * because the device only reports the GID+QPN of a received packet,
+ * and a QPN may be reused across device endpoint teardown and initialization.
+ *
+ * EFA uses the qkey as the connection ID.
+ *
+ * @param[in]	peer	pointer to rdm_peer
+ * @return	a boolean indicating whether the peer needs the connection ID
+ */
 static inline
-int efa_peer_in_use(struct rxr_peer *peer)
+bool rxr_peer_need_connid(struct rdm_peer *peer)
 {
-	struct rxr_pkt_entry *pending_pkt;
-
-	if ((peer->tx_pending) || (peer->flags & RXR_PEER_IN_BACKOFF))
-		return -FI_EBUSY;
-	if (peer->rx_init) {
-		pending_pkt = *ofi_recvwin_peek(peer->robuf);
-		if (pending_pkt && pending_pkt->pkt)
-			return -FI_EBUSY;
-	}
-	return 0;
+	return (peer->flags & RXR_PEER_HANDSHAKE_RECEIVED) &&
+	       (peer->extra_info[0] & RXR_EXTRA_REQUEST_CONNID_HEADER);
 }
+
 static inline
-void efa_free_robuf(struct rxr_peer *peer)
+size_t efa_max_rdma_size(struct fid_ep *ep_fid)
 {
-	ofi_recvwin_free(peer->robuf);
-	ofi_buf_free(peer->robuf);
+	struct efa_ep *efa_ep;
+
+	efa_ep = container_of(ep_fid, struct efa_ep, util_ep.ep_fid);
+	return efa_ep->domain->ctx->max_rdma_size;
 }
 
 static inline
-void efa_peer_reset(struct rxr_peer *peer)
+struct rdm_peer *rxr_ep_get_peer(struct rxr_ep *ep, fi_addr_t addr)
 {
-	efa_free_robuf(peer);
-#ifdef ENABLE_EFA_POISONING
-	rxr_poison_mem_region((uint32_t *)peer, sizeof(struct rxr_peer));
-#endif
-	memset(peer, 0, sizeof(struct rxr_peer));
-	dlist_init(&peer->rnr_entry);
+	struct util_av_entry *util_av_entry;
+	struct efa_av_entry *av_entry;
+
+	if (OFI_UNLIKELY(addr == FI_ADDR_NOTAVAIL))
+		return NULL;
+
+	util_av_entry = ofi_bufpool_get_ibuf(ep->util_ep.av->av_entry_pool,
+	                                     addr);
+	av_entry = (struct efa_av_entry *)util_av_entry->data;
+	return av_entry->conn.ep_addr ? &av_entry->conn.rdm_peer : NULL;
 }
 
 static inline bool efa_ep_is_cuda_mr(struct efa_mr *efa_mr)
@@ -469,9 +641,64 @@ static inline bool efa_ep_is_cuda_mr(struct efa_mr *efa_mr)
 	return efa_mr ? (efa_mr->peer.iface == FI_HMEM_CUDA): false;
 }
 
+/*
+ * efa_is_cache_available() checks whether a memory registration cache
+ * is available to be used by this domain.
+ *
+ * Return value:
+ *    true if a memory registration cache exists in this domain,
+ *    false otherwise.
+ */
 static inline bool efa_is_cache_available(struct efa_domain *efa_domain)
 {
 	return efa_domain->cache;
 }
 
+#define RXR_REQ_OPT_HDR_ALIGNMENT 8
+#define RXR_REQ_OPT_RAW_ADDR_HDR_SIZE (((sizeof(struct rxr_req_opt_raw_addr_hdr) + EFA_EP_ADDR_LEN - 1)/RXR_REQ_OPT_HDR_ALIGNMENT + 1) * RXR_REQ_OPT_HDR_ALIGNMENT)
+
+/*
+ * Per the libfabric standard, the prefix must be a multiple of 8, hence the static assert.
+ */
+#define RXR_MSG_PREFIX_SIZE (sizeof(struct rxr_pkt_entry) + sizeof(struct rxr_eager_msgrtm_hdr) + RXR_REQ_OPT_RAW_ADDR_HDR_SIZE)
+
+#if defined(static_assert) && defined(__x86_64__)
+static_assert(RXR_MSG_PREFIX_SIZE % 8 == 0, "message prefix size alignment check");
+#endif
+
+/* Performance counter declarations */
+#ifdef EFA_PERF_ENABLED
+#define EFA_PERF_FOREACH(DECL)	\
+	DECL(perf_efa_tx),	\
+	DECL(perf_efa_recv),	\
+	DECL(efa_perf_size)	\
+
+enum efa_perf_counters {
+	EFA_PERF_FOREACH(OFI_ENUM_VAL)
+};
+
+extern const char *efa_perf_counters_str[];
+
+static inline void efa_perfset_start(struct rxr_ep *ep, size_t index)
+{
+	struct rxr_domain *domain = rxr_ep_domain(ep);
+	struct efa_fabric *fabric = container_of(domain->util_domain.fabric,
+						 struct efa_fabric,
+						 util_fabric);
+	ofi_perfset_start(&fabric->perf_set, index);
+}
+
+static inline void efa_perfset_end(struct rxr_ep *ep, size_t index)
+{
+	struct rxr_domain *domain = rxr_ep_domain(ep);
+	struct efa_fabric *fabric = container_of(domain->util_domain.fabric,
+						 struct efa_fabric,
+						 util_fabric);
+	ofi_perfset_end(&fabric->perf_set, index);
+}
+#else
+#define efa_perfset_start(ep, index) do {} while (0)
+#define efa_perfset_end(ep, index) do {} while (0)
+#endif
+
 #endif /* EFA_H */
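
The new `efa_domain_from_fid()` and `efa_domain_get_type()` helpers above both lean on the `container_of` idiom: recovering a pointer to the enclosing structure from a pointer to one of its embedded members. The following is a minimal, self-contained sketch of that idiom; the `demo_*` struct names are illustrative, not the provider's.

```c
/* Illustrative sketch of the container_of idiom used by
 * efa_domain_from_fid(); demo_* names are hypothetical. */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct demo_fid { int fclass; };

struct demo_domain {
	const char *name;
	struct demo_fid fid;	/* embedded member, as util_domain embeds domain_fid */
};

int main(void)
{
	struct demo_domain dom = { .name = "efa", .fid = { .fclass = 42 } };
	struct demo_fid *fid = &dom.fid;	/* all a callback receives */

	/* Recover the enclosing struct from the embedded member. */
	struct demo_domain *d = container_of(fid, struct demo_domain, fid);
	printf("%s %d\n", d->name, d->fid.fclass);
	return 0;
}
```
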
diff --git a/deps/libfabric/prov/efa/src/efa_av.c b/deps/libfabric/prov/efa/src/efa_av.c
index f1b821067b06215d0f66499e77e58eee8a1eb44d..30dcb23e8ea948e08386bfde9d449c16efe8994a 100644
--- a/deps/libfabric/prov/efa/src/efa_av.c
+++ b/deps/libfabric/prov/efa/src/efa_av.c
@@ -36,10 +36,11 @@
 #include <stdio.h>
 
 #include <infiniband/efadv.h>
-
 #include <ofi_enosys.h>
+
 #include "efa.h"
 #include "rxr.h"
+#include "rxr_pkt_type_base.h"
 
 /*
  * Local/remote peer detection by comparing peer GID with stored local GIDs
@@ -74,31 +75,198 @@ static bool efa_is_same_addr(struct efa_ep_addr *lhs, struct efa_ep_addr *rhs)
 	       lhs->qpn == rhs->qpn && lhs->qkey == rhs->qkey;
 }
 
-static inline struct efa_conn *efa_av_tbl_idx_to_conn(struct efa_av *av, fi_addr_t addr)
+/**
+ * @brief initialize an rdm peer
+ *
+ * @param[in,out]	peer	rdm peer
+ * @param[in]		ep	rdm endpoint
+ * @param[in]		conn	efa conn object
+ */
+static inline
+void efa_rdm_peer_init(struct rdm_peer *peer, struct rxr_ep *ep, struct efa_conn *conn)
 {
-	if (OFI_UNLIKELY(addr == FI_ADDR_UNSPEC))
-		return NULL;
-	return av->conn_table[addr];
+	memset(peer, 0, sizeof(struct rdm_peer));
+
+	peer->efa_fiaddr = conn->fi_addr;
+	peer->is_self = efa_is_same_addr((struct efa_ep_addr *)ep->core_addr,
+					 conn->ep_addr);
+
+	ofi_recvwin_buf_alloc(&peer->robuf, rxr_env.recvwin_size);
+	peer->rx_credits = rxr_env.rx_window_size;
+	peer->tx_credits = rxr_env.tx_max_credits;
+	dlist_init(&peer->outstanding_tx_pkts);
+	dlist_init(&peer->rx_unexp_list);
+	dlist_init(&peer->rx_unexp_tagged_list);
+	dlist_init(&peer->tx_entry_list);
+	dlist_init(&peer->rx_entry_list);
+}
+
+/**
+ * @brief clear resources associated with a peer
+ *
+ * Releases the reorder buffer, tx_entry list, and rx_entry list of a peer.
+ *
+ * @param[in,out]	peer 	rdm peer
+ */
+void efa_rdm_peer_clear(struct rxr_ep *ep, struct rdm_peer *peer)
+{
+	struct dlist_entry *tmp;
+	struct rxr_tx_entry *tx_entry;
+	struct rxr_rx_entry *rx_entry;
+	struct rxr_pkt_entry *pkt_entry;
+	/*
+	 * TODO: Add support for wait/signal until all pending messages have
+	 * been sent/received so we do not attempt to complete a data transfer
+	 * or internal transfer after the EP is shut down.
+	 */
+	if ((peer->flags & RXR_PEER_REQ_SENT) &&
+	    !(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED))
+		FI_WARN_ONCE(&rxr_prov, FI_LOG_EP_CTRL, "Closing EP with unacked CONNREQs in flight\n");
+
+	if (peer->robuf.pending)
+		ofi_recvwin_free(&peer->robuf);
+
+	if (!ep) {
+		/* A NULL ep means the endpoint has been closed.
+		 * In this case there is no need to proceed, because
+		 * all tx_entry, rx_entry, and pkt_entry objects have
+		 * already been released.
+		 */
+		return;
+	}
+
+	/* We cannot release outstanding TX packets because the device
+	 * will report their completion later. Instead, set each
+	 * packet's address to FI_ADDR_NOTAVAIL, so rxr_ep_get_peer()
+	 * returns NULL for the address and the completion is ignored.
+	 */
+	dlist_foreach_container(&peer->outstanding_tx_pkts,
+				struct rxr_pkt_entry,
+				pkt_entry, entry) {
+		pkt_entry->addr = FI_ADDR_NOTAVAIL;
+	}
+
+	dlist_foreach_container_safe(&peer->tx_entry_list,
+				     struct rxr_tx_entry,
+				     tx_entry, peer_entry, tmp) {
+		rxr_release_tx_entry(ep, tx_entry);
+	}
+
+	dlist_foreach_container_safe(&peer->rx_entry_list,
+				     struct rxr_rx_entry,
+				     rx_entry, peer_entry, tmp) {
+		rxr_release_rx_entry(ep, rx_entry);
+	}
+
+	if (peer->flags & RXR_PEER_HANDSHAKE_QUEUED)
+		dlist_remove(&peer->handshake_queued_entry);
+
+	if (peer->flags & RXR_PEER_IN_BACKOFF)
+		dlist_remove(&peer->rnr_backoff_entry);
+
+#ifdef ENABLE_EFA_POISONING
+	rxr_poison_mem_region((uint32_t *)peer, sizeof(struct rdm_peer));
+#endif
 }
 
-static inline struct efa_conn *efa_av_map_addr_to_conn(struct efa_av *av, fi_addr_t addr)
+/**
+ * @brief find efa_conn struct using fi_addr
+ *
+ * @param[in]	av	efa av
+ * @param[in]	addr	fi_addr
+ * @return	if the address is valid, return a pointer to the efa_conn struct;
+ * 		otherwise, return NULL
+ */
+struct efa_conn *efa_av_addr_to_conn(struct efa_av *av, fi_addr_t fi_addr)
 {
-	if (OFI_UNLIKELY(addr == FI_ADDR_UNSPEC))
+	struct util_av_entry *util_av_entry;
+	struct efa_av_entry *efa_av_entry;
+
+	if (OFI_UNLIKELY(fi_addr == FI_ADDR_UNSPEC))
+		return NULL;
+
+	if (av->type == FI_AV_MAP) {
+		return (struct efa_conn *)fi_addr;
+	}
+
+	assert(av->type == FI_AV_TABLE);
+	util_av_entry = ofi_bufpool_get_ibuf(av->util_av.av_entry_pool, fi_addr);
+	if (!util_av_entry)
 		return NULL;
-	return (struct efa_conn *)(void *)addr;
+
+	efa_av_entry = (struct efa_av_entry *)util_av_entry->data;
+	return efa_av_entry->conn.ep_addr ? &efa_av_entry->conn : NULL;
 }
 
-fi_addr_t efa_ahn_qpn_to_addr(struct efa_av *av, uint16_t ahn, uint16_t qpn)
+/**
+ * @brief find fi_addr for dgram endpoint
+ *
+ * @param[in]	av	address vector
+ * @param[in]	ahn	address handle number
+ * @param[in]	qpn	QP number
+ * @return	On success, return the fi_addr of the peer who sent the packet.
+ * 		If no such peer exists, return FI_ADDR_NOTAVAIL
+ */
+fi_addr_t efa_av_reverse_lookup_dgram(struct efa_av *av, uint16_t ahn, uint16_t qpn)
 {
-	struct efa_reverse_av *reverse_av;
-	struct efa_ah_qpn key = {
-		.ahn = ahn,
-		.qpn = qpn,
-	};
+	struct efa_cur_reverse_av *cur_entry;
+	struct efa_cur_reverse_av_key cur_key;
 
-	HASH_FIND(hh, av->reverse_av, &key, sizeof(key), reverse_av);
+	memset(&cur_key, 0, sizeof(cur_key));
+	cur_key.ahn = ahn;
+	cur_key.qpn = qpn;
+	HASH_FIND(hh, av->cur_reverse_av, &cur_key, sizeof(cur_key), cur_entry);
 
-	return OFI_LIKELY(!!reverse_av) ? reverse_av->fi_addr : FI_ADDR_NOTAVAIL;
+	return (OFI_LIKELY(!!cur_entry)) ? cur_entry->conn->fi_addr : FI_ADDR_NOTAVAIL;
+}
+
+/**
+ * @brief find fi_addr for rdm endpoint
+ *
+ * @param[in]	av	address vector
+ * @param[in]	ahn	address handle number
+ * @param[in]	qpn	QP number
+ * @param[in]   pkt_entry	rdm packet entry, used to extract connid
+ * @return	On success, return the fi_addr of the peer who sent the packet.
+ * 		If no such peer exists, return FI_ADDR_NOTAVAIL
+ */
+fi_addr_t efa_av_reverse_lookup_rdm(struct efa_av *av, uint16_t ahn, uint16_t qpn, struct rxr_pkt_entry *pkt_entry)
+{
+	struct efa_cur_reverse_av *cur_entry;
+	struct efa_prv_reverse_av *prv_entry;
+	struct efa_cur_reverse_av_key cur_key;
+	struct efa_prv_reverse_av_key prv_key;
+	uint32_t *connid;
+
+	memset(&cur_key, 0, sizeof(cur_key));
+	cur_key.ahn = ahn;
+	cur_key.qpn = qpn;
+	HASH_FIND(hh, av->cur_reverse_av, &cur_key, sizeof(cur_key), cur_entry);
+
+	if (OFI_UNLIKELY(!cur_entry))
+		return FI_ADDR_NOTAVAIL;
+
+	connid = rxr_pkt_connid_ptr(pkt_entry);
+	if (!connid) {
+		FI_WARN_ONCE(&rxr_prov, FI_LOG_EP_CTRL,
+			     "An incoming packet does NOT have connection ID in its header.\n"
+			     "This means the peer is using an older version of libfabric.\n"
+			     "The communication can continue but it is encouraged to use\n"
+			     "a newer version of libfabric\n");
+		return cur_entry->conn->fi_addr;
+	}
+
+	if (OFI_LIKELY(*connid == cur_entry->conn->ep_addr->qkey))
+		return cur_entry->conn->fi_addr;
+
+	/* the packet is from a previous peer; look up its address in prv_reverse_av */
+	memset(&prv_key, 0, sizeof(prv_key));
+	prv_key.ahn = ahn;
+	prv_key.qpn = qpn;
+	prv_key.connid = *connid;
+	HASH_FIND(hh, av->prv_reverse_av, &prv_key, sizeof(prv_key), prv_entry);
+
+	return OFI_LIKELY(!!prv_entry) ? prv_entry->conn->fi_addr : FI_ADDR_NOTAVAIL;
 }
 
 static inline int efa_av_is_valid_address(struct efa_ep_addr *addr)
@@ -108,305 +276,457 @@ static inline int efa_av_is_valid_address(struct efa_ep_addr *addr)
 	return memcmp(addr->raw, all_zeros.raw, sizeof(addr->raw));
 }
 
-/* Returns the first NULL index in av connection table, starting from @hint */
-static size_t efa_av_tbl_find_first_empty(struct efa_av *av, size_t hint)
+/**
+ * @brief allocate an ibv_ah object from a GID.
+ * This function uses a hash map from GID to ibv_ah,
+ * and re-uses the ibv_ah for the same GID.
+ *
+ * @param[in]	av	address vector
+ * @param[in]	gid	GID
+ */
+static
+struct efa_ah *efa_ah_alloc(struct efa_av *av, const uint8_t *gid)
 {
-	struct efa_conn **conn_table;
+	struct ibv_pd *ibv_pd = av->domain->ibv_pd;
+	struct efa_ah *efa_ah;
+	struct ibv_ah_attr ibv_ah_attr = { 0 };
+	struct efadv_ah_attr efa_ah_attr = { 0 };
+	int err;
 
-	assert(av->type == FI_AV_TABLE);
+	efa_ah = NULL;
+	HASH_FIND(hh, av->ah_map, gid, EFA_GID_LEN, efa_ah);
+	if (efa_ah) {
+		efa_ah->refcnt += 1;
+		return efa_ah;
+	}
+
+	efa_ah = malloc(sizeof(struct efa_ah));
+	if (!efa_ah) {
+		errno = FI_ENOMEM;
+		EFA_WARN(FI_LOG_AV, "cannot allocate memory for efa_ah");
+		return NULL;
+	}
 
-	conn_table = av->conn_table;
-	for (; hint < av->util_av.count; hint++) {
-		if (!conn_table[hint])
-			return hint;
+	ibv_ah_attr.port_num = 1;
+	ibv_ah_attr.is_global = 1;
+	memcpy(ibv_ah_attr.grh.dgid.raw, gid, EFA_GID_LEN);
+	efa_ah->ibv_ah = ibv_create_ah(ibv_pd, &ibv_ah_attr);
+	if (!efa_ah->ibv_ah) {
+		EFA_WARN(FI_LOG_AV, "ibv_create_ah failed! errno: %d\n", errno);
+		goto err_free_efa_ah;
+	}
+
+	err = efadv_query_ah(efa_ah->ibv_ah, &efa_ah_attr, sizeof(efa_ah_attr));
+	if (err) {
+		errno = err;
+		EFA_WARN(FI_LOG_AV, "efadv_query_ah failed! err: %d\n", err);
+		goto err_destroy_ibv_ah;
 	}
 
-	return -1;
+	efa_ah->refcnt = 1;
+	efa_ah->ahn = efa_ah_attr.ahn;
+	memcpy(efa_ah->gid, gid, EFA_GID_LEN);
+	HASH_ADD(hh, av->ah_map, gid, EFA_GID_LEN, efa_ah);
+	return efa_ah;
+
+err_destroy_ibv_ah:
+	ibv_destroy_ah(efa_ah->ibv_ah);
+err_free_efa_ah:
+	free(efa_ah);
+	return NULL;
 }
 
-static int efa_peer_resize(struct rxr_ep *ep, size_t current_count,
-			   size_t new_count)
+/**
+ * @brief release an efa_ah object
+ *
+ * @param[in]	av	address vector
+ * @param[in]	ah	efa_ah object pointer
+ */
+static
+void efa_ah_release(struct efa_av *av, struct efa_ah *ah)
 {
-	void *p = realloc(&ep->peer[0], (new_count * sizeof(struct rxr_peer)));
+	int err;
+#if ENABLE_DEBUG
+	struct efa_ah *tmp;
 
-	if (p)
-		ep->peer = p;
-	else
-		return -FI_ENOMEM;
-#ifdef ENABLE_EFA_POISONING
-	rxr_poison_mem_region((uint32_t *)&ep->peer[current_count], (new_count -
-			      current_count) * sizeof(struct rxr_peer));
+	HASH_FIND(hh, av->ah_map, ah->gid, EFA_GID_LEN, tmp);
+	assert(tmp == ah);
 #endif
-	memset(&ep->peer[current_count], 0,
-		(new_count - current_count) * sizeof(struct rxr_peer));
+	assert(ah->refcnt > 0);
+	ah->refcnt -= 1;
+	if (ah->refcnt == 0) {
+		HASH_DEL(av->ah_map, ah);
+		err = ibv_destroy_ah(ah->ibv_ah);
+		if (err)
+			EFA_WARN(FI_LOG_AV, "ibv_destroy_ah failed! err=%d\n", err);
+		free(ah);
+	}
+}
+
+static
+void efa_conn_release(struct efa_av *av, struct efa_conn *conn);
+
+/**
+ * @brief initialize the rdm related resources of an efa_conn object
+ *
+ * This function sets up the rdm_peer and shm address for an efa_conn.
+ * If shm transfer is enabled and the addr comes from a local peer:
+ *  1. convert addr to the format 'gid_qpn', which will later be set as shm's ep name.
+ *  2. insert gid_qpn into shm's av.
+ *  3. store the fi_addr returned from shm into the hash table.
+ *
+ * @param[in]	av	address vector
+ * @param[in]	conn	efa_conn object
+ * @return	On success return 0, otherwise return a negative error code
+ */
+static
+int efa_conn_rdm_init(struct efa_av *av, struct efa_conn *conn)
+{
+	int err, ret;
+	char smr_name[EFA_SHM_NAME_MAX];
+	size_t smr_name_len;
+	struct rxr_ep *rxr_ep;
+	struct rdm_peer *peer;
+
+	assert(av->ep_type == FI_EP_RDM);
+	assert(conn->ep_addr);
+
+	/* currently, binding multiple EPs to the same av is not supported */
+	assert(!dlist_empty(&av->util_av.ep_list));
+	rxr_ep = container_of(av->util_av.ep_list.next, struct rxr_ep, util_ep.av_entry);
+
+	peer = &conn->rdm_peer;
+	efa_rdm_peer_init(peer, rxr_ep, conn);
+
+	/* If peer is local, insert the address into shm provider's av */
+	if (rxr_ep->use_shm && efa_is_local_peer(av, conn->ep_addr)) {
+		if (av->shm_used >= rxr_env.shm_av_size) {
+			EFA_WARN(FI_LOG_AV,
+				 "Max number of shm AV entry (%d) has been reached.\n",
+				 rxr_env.shm_av_size);
+			return -FI_ENOMEM;
+		}
+
+		smr_name_len = EFA_SHM_NAME_MAX;
+		err = rxr_raw_addr_to_smr_name(conn->ep_addr, smr_name, &smr_name_len);
+		if (err != FI_SUCCESS) {
+			EFA_WARN(FI_LOG_AV,
+				 "rxr_ep_efa_addr_to_str() failed! err=%d\n", err);
+			return err;
+		}
+
+		ret = fi_av_insert(av->shm_rdm_av, smr_name, 1, &peer->shm_fiaddr, 0, NULL);
+		if (OFI_UNLIKELY(ret != 1)) {
+			EFA_WARN(FI_LOG_AV,
+				 "Failed to insert address to shm provider's av: %s\n",
+				 fi_strerror(-ret));
+			return ret;
+		}
+
+		EFA_INFO(FI_LOG_AV,
+			"Successfully inserted %s to shm provider's av. efa_fiaddr: %ld shm_fiaddr = %ld\n",
+			smr_name, conn->fi_addr, peer->shm_fiaddr);
+
+		assert(peer->shm_fiaddr < rxr_env.shm_av_size);
+		av->shm_used++;
+		av->shm_rdm_addr_map[peer->shm_fiaddr] = conn->fi_addr;
+		peer->is_local = 1;
+	}
+
 	return 0;
 }
 
-static int efa_av_resize(struct efa_av *av, size_t new_av_count)
+/**
+ * @brief release the rdm related resources of an efa_conn object
+ *
+ * This function releases the shm av entry and the rdm peer.
+ *
+ * @param[in]	av	address vector
+ * @param[in]	conn	efa_conn object
+ */
+static
+void efa_conn_rdm_deinit(struct efa_av *av, struct efa_conn *conn)
+{
+	int err;
+	struct rdm_peer *peer;
+	struct rxr_ep *ep;
+
+	assert(av->ep_type == FI_EP_RDM);
+
+	peer = &conn->rdm_peer;
+	if (peer->is_local) {
+		err = fi_av_remove(av->shm_rdm_av, &peer->shm_fiaddr, 1, 0);
+		if (err) {
+			EFA_WARN(FI_LOG_AV, "remove address from shm av failed! err=%d\n", err);
+		} else {
+			av->shm_used--;
+			assert(peer->shm_fiaddr < rxr_env.shm_av_size);
+			av->shm_rdm_addr_map[peer->shm_fiaddr] = FI_ADDR_UNSPEC;
+		}
+	}
+
+	/*
+	 * We need peer->shm_fiaddr to remove the shm address from the shm
+	 * av table, so efa_rdm_peer_clear() must run after the shm av removal.
+	 */
+	ep = dlist_empty(&av->util_av.ep_list) ? NULL : container_of(av->util_av.ep_list.next, struct rxr_ep, util_ep.av_entry);
+	efa_rdm_peer_clear(ep, peer);
+}
+
+/*
+ * @brief update the reverse_av when inserting a new address into the AV
+ *
+ * @param[in,out]	av		efa AV
+ * @param[in]		raw_addr	raw address
+ * @param[in]		conn		efa_conn object
+ * @return		On success, return 0.
+ * 			Otherwise, return a negative libfabric error code
+ */
+static
+int efa_av_update_reverse_av(struct efa_av *av, struct efa_ep_addr *raw_addr,
+				    struct efa_conn *conn)
 {
-	if (av->type == FI_AV_TABLE) {
-		void *p = realloc(av->conn_table,
-				  (new_av_count *
-				  sizeof(*av->conn_table)));
-
-		if (p)
-			av->conn_table = p;
-		else
+	struct efa_cur_reverse_av *cur_entry;
+	struct efa_prv_reverse_av *prv_entry;
+	struct efa_cur_reverse_av_key cur_key;
+
+	memset(&cur_key, 0, sizeof(cur_key));
+	cur_key.ahn = conn->ah->ahn;
+	cur_key.qpn = raw_addr->qpn;
+	cur_entry = NULL;
+
+	HASH_FIND(hh, av->cur_reverse_av, &cur_key, sizeof(cur_key), cur_entry);
+	if (!cur_entry) {
+		cur_entry = malloc(sizeof(*cur_entry));
+		if (!cur_entry) {
+			FI_WARN(&rxr_prov, FI_LOG_AV, "Cannot allocate memory for cur_reverse_av entry");
 			return -FI_ENOMEM;
+		}
 
-#ifdef ENABLE_EFA_POISONING
-	rxr_poison_mem_region((uint32_t *)av->conn_table + av->util_av.count,
-			      (new_av_count - av->util_av.count) *
-			      sizeof(*av->conn_table));
-#endif
+		cur_entry->key.ahn = cur_key.ahn;
+		cur_entry->key.qpn = cur_key.qpn;
+		cur_entry->conn = conn;
+		HASH_ADD(hh, av->cur_reverse_av, key, sizeof(cur_key), cur_entry);
+		return 0;
+	}
 
-		memset(av->conn_table + av->util_av.count, 0,
-		       (new_av_count - av->util_av.count) * sizeof(*av->conn_table));
+	/* We use a static connid for all dgram endpoints, therefore for
+	 * dgram cur_entry should always be NULL and only RDM endpoints
+	 * can reach here; hence the following assertion.
+	 */
+	assert(av->ep_type == FI_EP_RDM);
+	prv_entry = malloc(sizeof(*prv_entry));
+	if (!prv_entry) {
+		FI_WARN(&rxr_prov, FI_LOG_AV, "Cannot allocate memory for prv_reverse_av entry");
+		return -FI_ENOMEM;
 	}
 
-	av->util_av.count = new_av_count;
+	prv_entry->key.ahn = cur_key.ahn;
+	prv_entry->key.qpn = cur_key.qpn;
+	prv_entry->key.connid = cur_entry->conn->ep_addr->qkey;
+	prv_entry->conn = cur_entry->conn;
+	HASH_ADD(hh, av->prv_reverse_av, key, sizeof(prv_entry->key), prv_entry);
 
+	cur_entry->conn = conn;
 	return 0;
 }
 
-/* Inserts a single AH to AV. */
-static int efa_av_insert_ah(struct efa_av *av, struct efa_ep_addr *addr,
-				fi_addr_t *fi_addr, uint64_t flags, void *context)
+/**
+ * @brief allocate an efa_conn object
+ * The caller of this function must hold av->util_av.lock.
+ *
+ * @param[in]	av		efa address vector
+ * @param[in]	raw_addr	raw efa address
+ * @param[in]	flags		flags application passed to fi_av_insert
+ * @param[in]	context		context application passed to fi_av_insert
+ * @return	on success, return a pointer to an efa_conn object
+ *		otherwise, return NULL. errno will be set to a positive error code.
+ */
+static
+struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr,
+				uint64_t flags, void *context)
 {
-	struct ibv_pd *ibv_pd = av->domain->ibv_pd;
-	struct ibv_ah_attr ah_attr = { 0 };
-
-	char str[INET6_ADDRSTRLEN] = { 0 };
-	struct efadv_ah_attr attr = { 0 };
-	struct efa_reverse_av *reverse_av;
-	struct efa_ah_qpn key;
+	struct util_av_entry *util_av_entry = NULL;
+	struct efa_av_entry *efa_av_entry = NULL;
 	struct efa_conn *conn;
+	fi_addr_t util_av_fi_addr;
 	int err;
 
-	if (av->util_av.flags & FI_EVENT)
-		return -FI_ENOEQ;
-	if ((flags & FI_SYNC_ERR) && (!context || (flags & FI_EVENT)))
-		return -FI_EINVAL;
-	else if (flags & FI_SYNC_ERR)
+	if (flags & FI_SYNC_ERR)
 		memset(context, 0, sizeof(int));
 
-	memset(&ah_attr, 0, sizeof(struct ibv_ah_attr));
-	inet_ntop(AF_INET6, addr->raw, str, INET6_ADDRSTRLEN);
-	EFA_INFO(FI_LOG_AV, "Insert address: GID[%s] QP[%u] QKEY[%u]\n", str, addr->qpn, addr->qkey);
-	if (!efa_av_is_valid_address(addr)) {
+	if (!efa_av_is_valid_address(raw_addr)) {
 		EFA_WARN(FI_LOG_AV, "Failed to insert bad addr");
-		err = -FI_EADDRNOTAVAIL;
-		goto err_invalid;
+		errno = FI_EINVAL;
+		return NULL;
 	}
 
-	err = ofi_memalign((void **)&conn, EFA_MEM_ALIGNMENT, sizeof(*conn));
+	err = ofi_av_insert_addr(&av->util_av, raw_addr, &util_av_fi_addr);
 	if (err) {
-		err = -FI_ENOMEM;
-		goto err_invalid;
-	}
-
-	ah_attr.port_num = 1;
-	ah_attr.is_global = 1;
-	memcpy(ah_attr.grh.dgid.raw, addr->raw, sizeof(addr->raw));
-	conn->ah.ibv_ah = ibv_create_ah(ibv_pd, &ah_attr);
-	if (!conn->ah.ibv_ah) {
-		err = -FI_EINVAL;
-		goto err_free_conn;
-	}
-	memcpy((void *)&conn->ep_addr, addr, sizeof(*addr));
-
-	switch (av->type) {
-	case FI_AV_MAP:
-		*fi_addr = (uintptr_t)(void *)conn;
-
-		break;
-	case FI_AV_TABLE:
-		if (av->ep_type == FI_EP_DGRAM) {
-			av->next = efa_av_tbl_find_first_empty(av, av->next);
-			assert(av->next != -1);
-			*fi_addr = av->next;
+		EFA_WARN(FI_LOG_AV, "ofi_av_insert_addr failed! Error message: %s\n",
+			 fi_strerror(err));
+		return NULL;
+	}
+
+	util_av_entry = ofi_bufpool_get_ibuf(av->util_av.av_entry_pool,
+					     util_av_fi_addr);
+	efa_av_entry = (struct efa_av_entry *)util_av_entry->data;
+	assert(efa_is_same_addr(raw_addr, (struct efa_ep_addr *)efa_av_entry->ep_addr));
+
+	conn = &efa_av_entry->conn;
+	memset(conn, 0, sizeof(*conn));
+	conn->ep_addr = (struct efa_ep_addr *)efa_av_entry->ep_addr;
+	assert(av->type == FI_AV_MAP || av->type == FI_AV_TABLE);
+	conn->fi_addr = (av->type == FI_AV_MAP) ? (uintptr_t)(void *)conn : util_av_fi_addr;
+	conn->util_av_fi_addr = util_av_fi_addr;
+
+	conn->ah = efa_ah_alloc(av, raw_addr->raw);
+	if (!conn->ah)
+		goto err_release;
+
+	if (av->ep_type == FI_EP_RDM) {
+		err = efa_conn_rdm_init(av, conn);
+		if (err) {
+			errno = -err;
+			goto err_release;
 		}
+	}
 
-		av->conn_table[*fi_addr] = conn;
-		av->next++;
-		break;
-	default:
-		assert(0);
-		break;
+	err = efa_av_update_reverse_av(av, raw_addr, conn);
+	if (err) {
+		if (av->ep_type == FI_EP_RDM)
+			efa_conn_rdm_deinit(av, conn);
+		goto err_release;
 	}
 
-	err = -efadv_query_ah(conn->ah.ibv_ah, &attr, sizeof(attr));
+	av->used++;
+	return conn;
+
+err_release:
+	if (conn->ah)
+		efa_ah_release(av, conn->ah);
+
+	conn->ep_addr = NULL;
+	err = ofi_av_remove_addr(&av->util_av, util_av_fi_addr);
 	if (err)
-		goto err_destroy_ah;
-
-	conn->ah.ahn = attr.ahn;
-	key.ahn = conn->ah.ahn;
-	key.qpn = addr->qpn;
-	/* This is correct since the same address should be mapped to the same ah. */
-	HASH_FIND(hh, av->reverse_av, &key, sizeof(key), reverse_av);
-	if (!reverse_av) {
-		reverse_av = malloc(sizeof(*reverse_av));
-		if (!reverse_av) {
-			err = -FI_ENOMEM;
-			goto err_destroy_ah;
-		}
+		EFA_WARN(FI_LOG_AV, "While processing previous failure, ofi_av_remove_addr failed! err=%d\n",
+			 err);
+
+	return NULL;
+}
 
-		memcpy(&reverse_av->key, &key, sizeof(key));
-		reverse_av->fi_addr = *fi_addr;
-		HASH_ADD(hh, av->reverse_av, key,
-			 sizeof(reverse_av->key), reverse_av);
+/**
+ * @brief release an efa conn object
+ * The caller of this function must hold av->util_av.lock.
+ *
+ * @param[in]	av	address vector
+ * @param[in]	conn	efa_conn object pointer
+ */
+static
+void efa_conn_release(struct efa_av *av, struct efa_conn *conn)
+{
+	struct efa_cur_reverse_av *cur_reverse_av_entry;
+	struct efa_prv_reverse_av *prv_reverse_av_entry;
+	struct util_av_entry *util_av_entry;
+	struct efa_av_entry *efa_av_entry;
+	struct efa_cur_reverse_av_key cur_key;
+	struct efa_prv_reverse_av_key prv_key;
+	char gidstr[INET6_ADDRSTRLEN];
+
+	memset(&cur_key, 0, sizeof(cur_key));
+	cur_key.ahn = conn->ah->ahn;
+	cur_key.qpn = conn->ep_addr->qpn;
+	HASH_FIND(hh, av->cur_reverse_av, &cur_key, sizeof(cur_key), cur_reverse_av_entry);
+	if (cur_reverse_av_entry) {
+		HASH_DEL(av->cur_reverse_av, cur_reverse_av_entry);
+		free(cur_reverse_av_entry);
+	} else {
+		memset(&prv_key, 0, sizeof(prv_key));
+		prv_key.ahn = conn->ah->ahn;
+		prv_key.qpn = conn->ep_addr->qpn;
+		prv_key.connid = conn->ep_addr->qkey;
+		HASH_FIND(hh, av->prv_reverse_av, &prv_key, sizeof(prv_key), prv_reverse_av_entry);
+		assert(prv_reverse_av_entry);
+		HASH_DEL(av->prv_reverse_av, prv_reverse_av_entry);
+		free(prv_reverse_av_entry);
 	}
 
-	EFA_INFO(FI_LOG_AV, "av successfully inserted conn[%p] fi_addr[%" PRIu64 "]\n",
-		 conn, *fi_addr);
+	if (av->ep_type == FI_EP_RDM)
+		efa_conn_rdm_deinit(av, conn);
 
-	av->used++;
-	return FI_SUCCESS;
-
-err_destroy_ah:
-	ibv_destroy_ah(conn->ah.ibv_ah);
-err_free_conn:
-	ofi_freealign(conn);
-err_invalid:
-	*fi_addr = FI_ADDR_NOTAVAIL;
-	return err;
+	efa_ah_release(av, conn->ah);
+
+	util_av_entry = ofi_bufpool_get_ibuf(av->util_av.av_entry_pool, conn->util_av_fi_addr);
+	assert(util_av_entry);
+	efa_av_entry = (struct efa_av_entry *)util_av_entry->data;
+
+	ofi_av_remove_addr(&av->util_av, conn->util_av_fi_addr);
+
+	inet_ntop(AF_INET6, conn->ep_addr->raw, gidstr, INET6_ADDRSTRLEN);
+	EFA_INFO(FI_LOG_AV, "efa_conn released! conn[%p] GID[%s] QP[%u]\n",
+		 conn, gidstr, conn->ep_addr->qpn);
+
+	conn->ep_addr = NULL;
+	memset(efa_av_entry->ep_addr, 0, EFA_EP_ADDR_LEN);
+
+	av->used--;
 }
 
-/*
- * Insert address translation in core av & in hash.
+/**
+ * @brief insert one address into address vector (AV)
  *
- * If shm transfer is enabled and the addr comes from local peer,
- * 1. convert addr to format 'gid_qpn', which will be set as shm's ep name later.
- * 2. insert gid_qpn into shm's av
- * 3. store returned fi_addr from shm into the hash table
+ * @param[in]	av	address vector
+ * @param[in]	addr	raw address, in the format of gid:qpn:qkey
+ * @param[out]	fi_addr	pointer to the output fi address. This address is used by fi_send
+ * @param[in]	flags	flags the user passed to fi_av_insert
+ * @param[in]	context	context the user passed to fi_av_insert
+ * @return	0 on success, a negative error code on failure
  */
-int efa_av_insert_addr(struct efa_av *av, struct efa_ep_addr *addr,
-			   fi_addr_t *fi_addr, uint64_t flags,
-			   void *context)
+int efa_av_insert_one(struct efa_av *av, struct efa_ep_addr *addr,
+		      fi_addr_t *fi_addr, uint64_t flags, void *context)
 {
-	struct efa_av_entry *av_entry;
-	struct util_av_entry *util_av_entry;
+	struct efa_conn *conn;
+	char raw_gid_str[INET6_ADDRSTRLEN];
+	fi_addr_t efa_fiaddr;
 	int ret = 0;
-	struct rxr_peer *peer;
-	struct rxr_ep *rxr_ep;
-	struct util_ep *util_ep;
-	struct dlist_entry *ep_list_entry;
-	fi_addr_t shm_fiaddr;
-	char smr_name[NAME_MAX];
 
-	fastlock_acquire(&av->util_av.lock);
-	ret = ofi_av_insert_addr(&av->util_av, addr, fi_addr);
+	if (av->ep_type == FI_EP_DGRAM)
+		addr->qkey = EFA_DGRAM_CONNID;
 
-	if (ret) {
-		EFA_WARN(FI_LOG_AV, "Error in inserting address: %s\n",
-			 fi_strerror(ret));
+	fastlock_acquire(&av->util_av.lock);
+	memset(raw_gid_str, 0, sizeof(raw_gid_str));
+	if (!inet_ntop(AF_INET6, addr->raw, raw_gid_str, INET6_ADDRSTRLEN)) {
+		EFA_WARN(FI_LOG_AV, "cannot convert address to string. errno: %d", errno);
+		ret = -FI_EINVAL;
+		*fi_addr = FI_ADDR_NOTAVAIL;
 		goto out;
 	}
-	util_av_entry = ofi_bufpool_get_ibuf(av->util_av.av_entry_pool,
-					     *fi_addr);
-	/*
-	 * If the entry already exists then calling ofi_av_insert_addr would
-	 * increase the use_cnt by 1. For a new entry use_cnt will be 1, whereas
-	 * for a duplicate entry, use_cnt will be more that 1.
-	 */
-	if (ofi_atomic_get32(&util_av_entry->use_cnt) > 1)
-		goto find_out;
-
-	av_entry = (struct efa_av_entry *)util_av_entry->data;
-	av_entry->rdm_addr = *fi_addr;
-	av_entry->local_mapping = 0;
-
-	if (av->used + 1 > av->util_av.count) {
-		ret = efa_av_resize(av, av->util_av.count * 2);
-		if (ret)
-			goto out;
-		dlist_foreach(&av->util_av.ep_list, ep_list_entry) {
-			util_ep = container_of(ep_list_entry, struct util_ep,
-					       av_entry);
-			rxr_ep = container_of(util_ep, struct rxr_ep, util_ep);
-			ret = efa_peer_resize(rxr_ep, av->used,
-					      av->util_av.count);
-			if (ret)
-				goto out;
-		}
-	}
+
+	EFA_INFO(FI_LOG_AV, "Inserting address GID[%s] QP[%u] QKEY[%u] to AV ....\n",
+		 raw_gid_str, addr->qpn, addr->qkey);
 
 	/*
-	 * Walk through all the EPs that bound to the AV,
-	 * update is_self flag corresponding peer structure
+	 * Check if this address has already been inserted; if so, set *fi_addr to the existing address
+	 * and return 0 for success.
 	 */
-	dlist_foreach(&av->util_av.ep_list, ep_list_entry) {
-		util_ep = container_of(ep_list_entry, struct util_ep, av_entry);
-		rxr_ep = container_of(util_ep, struct rxr_ep, util_ep);
-		peer = rxr_ep_get_peer(rxr_ep, *fi_addr);
-		assert(peer);
-		peer->is_self = efa_is_same_addr((struct efa_ep_addr *)rxr_ep->core_addr,
-						 addr);
+	efa_fiaddr = ofi_av_lookup_fi_addr_unsafe(&av->util_av, addr);
+	if (efa_fiaddr != FI_ADDR_NOTAVAIL) {
+		*fi_addr = efa_fiaddr;
+		EFA_INFO(FI_LOG_AV, "Found existing AV entry pointing to this address! fi_addr: %ld\n", *fi_addr);
+		ret = 0;
+		goto out;
 	}
 
-	/* If peer is local, insert the address into shm provider's av */
-	if (rxr_env.enable_shm_transfer && efa_is_local_peer(av, addr)) {
-		if (av->shm_used >= rxr_env.shm_av_size) {
-			ret = -FI_ENOMEM;
-			EFA_WARN(FI_LOG_AV,
-				 "Max number of shm AV entry %d has been reached.\n",
-				 rxr_env.shm_av_size);
-			goto err_free_av_entry;
-		}
-		ret = rxr_ep_efa_addr_to_str(addr, smr_name);
-		if (ret != FI_SUCCESS)
-			goto err_free_av_entry;
-
-		ret = fi_av_insert(av->shm_rdm_av, smr_name, 1, &shm_fiaddr,
-					flags, context);
-		if (OFI_UNLIKELY(ret != 1)) {
-			EFA_WARN(FI_LOG_AV,
-				 "Failed to insert address to shm provider's av: %s\n",
-				 fi_strerror(-ret));
-			goto err_free_av_entry;
-		} else {
-			ret = 0;
-		}
-		EFA_INFO(FI_LOG_AV,
-			"Insert %s to shm provider's av. addr = %" PRIu64
-			" rdm_fiaddr = %" PRIu64 " shm_rdm_fiaddr = %" PRIu64
-			"\n", smr_name, *(uint64_t *)addr, *fi_addr, shm_fiaddr);
-
-		assert(shm_fiaddr < rxr_env.shm_av_size);
-		av->shm_used++;
-		av_entry->local_mapping = 1;
-		av_entry->shm_rdm_addr = shm_fiaddr;
-		av->shm_rdm_addr_map[shm_fiaddr] = av_entry->rdm_addr;
-
-		/*
-		 * Walk through all the EPs that bound to the AV,
-		 * update is_local flag and shm fi_addr_t in corresponding peer structure
-		 */
-		dlist_foreach(&av->util_av.ep_list, ep_list_entry) {
-			util_ep = container_of(ep_list_entry, struct util_ep, av_entry);
-			rxr_ep = container_of(util_ep, struct rxr_ep, util_ep);
-			if (rxr_ep->use_shm) {
-				peer = rxr_ep_get_peer(rxr_ep, *fi_addr);
-				peer->shm_fiaddr = shm_fiaddr;
-				peer->is_local = 1;
-			}
-		}
+	conn = efa_conn_alloc(av, addr, flags, context);
+	if (!conn) {
+		*fi_addr = FI_ADDR_NOTAVAIL;
+		ret = -FI_EADDRNOTAVAIL;
+		goto out;
 	}
-	ret = efa_av_insert_ah(av, addr, fi_addr,
-			       flags, context);
-	if (ret) {
-		EFA_WARN(FI_LOG_AV, "Error in inserting address: %s\n",
-			 fi_strerror(ret));
-		goto err_free_av_entry;
-	}
-
-find_out:
-	EFA_INFO(FI_LOG_AV,
-			"addr = %" PRIu64 " rdm_fiaddr =  %" PRIu64 "\n",
-			*(uint64_t *)addr, *fi_addr);
-	goto out;
-err_free_av_entry:
-	ofi_ibuf_free(util_av_entry);
+
+	*fi_addr = conn->fi_addr;
+	EFA_INFO(FI_LOG_AV, "Successfully inserted address GID[%s] QP[%u] QKEY[%u] to AV. fi_addr: %ld\n",
+		 raw_gid_str, addr->qpn, addr->qkey, *fi_addr);
+	ret = 0;
 out:
 	fastlock_release(&av->util_av.lock);
 	return ret;
@@ -422,43 +742,34 @@ int efa_av_insert(struct fid_av *av_fid, const void *addr,
 	struct efa_ep_addr *addr_i;
 	fi_addr_t fi_addr_res;
 
+	if (av->util_av.flags & FI_EVENT)
+		return -FI_ENOEQ;
+
+	if ((flags & FI_SYNC_ERR) && (!context || (flags & FI_EVENT)))
+		return -FI_EINVAL;
+
 	/*
 	 * Providers are allowed to ignore FI_MORE.
 	 */
-
 	flags &= ~FI_MORE;
 	if (flags)
 		return -FI_ENOSYS;
 
-	if (av->ep_type == FI_EP_RDM) {
-		for (i = 0; i < count; i++) {
-			addr_i = (struct efa_ep_addr *) ((uint8_t *)addr + i * EFA_EP_ADDR_LEN);
-			ret = efa_av_insert_addr(av, addr_i, &fi_addr_res,
-					flags, context);
-			if (ret)
-				break;
-			if (fi_addr)
-				fi_addr[i] = fi_addr_res;
-			success_cnt++;
-		}
-	} else {
-		if (av->used + count > av->util_av.count) {
-			ret = efa_av_resize(av, av->used + count);
-			if (ret)
-				goto out;
-		}
-		for (i = 0; i < count; i++) {
-			addr_i = (struct efa_ep_addr *) ((uint8_t *)addr + i * EFA_EP_ADDR_LEN);
-			ret = efa_av_insert_ah(av, addr_i, &fi_addr_res,
-					     flags, context);
-			if (ret)
-				break;
-			if (fi_addr)
-				fi_addr[i] = fi_addr_res;
-			success_cnt++;
+	for (i = 0; i < count; i++) {
+		addr_i = (struct efa_ep_addr *) ((uint8_t *)addr + i * EFA_EP_ADDR_LEN);
+
+		ret = efa_av_insert_one(av, addr_i, &fi_addr_res, flags, context);
+		if (ret) {
+			EFA_WARN(FI_LOG_AV, "insert raw_addr to av failed! ret=%d\n",
+				 ret);
+			break;
 		}
+
+		if (fi_addr)
+			fi_addr[i] = fi_addr_res;
+		success_cnt++;
 	}
-out:
+
 	/* cancel remaining request and log to event queue */
 	for (; i < count ; i++) {
 		if (av->util_av.eq)
@@ -476,7 +787,6 @@ out:
 }
 
 static int efa_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr,
-
 			 void *addr, size_t *addrlen)
 {
 	struct efa_av *av = container_of(av_fid, struct efa_av, util_av.av_fid);
@@ -488,156 +798,78 @@ static int efa_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr,
 	if (fi_addr == FI_ADDR_NOTAVAIL)
 		return -FI_EINVAL;
 
-	if (av->type == FI_AV_MAP) {
-		conn = (struct efa_conn *)fi_addr;
-	} else { /* (av->type == FI_AV_TABLE) */
-		if (fi_addr >= av->util_av.count)
-			return -FI_EINVAL;
-
-		conn = av->conn_table[fi_addr];
-	}
+	conn = efa_av_addr_to_conn(av, fi_addr);
 	if (!conn)
 		return -FI_EINVAL;
 
-	memcpy(addr, (void *)&conn->ep_addr, MIN(sizeof(conn->ep_addr), *addrlen));
-	*addrlen = sizeof(conn->ep_addr);
+	memcpy(addr, (void *)conn->ep_addr, MIN(EFA_EP_ADDR_LEN, *addrlen));
+	if (*addrlen > EFA_EP_ADDR_LEN)
+		*addrlen = EFA_EP_ADDR_LEN;
 	return 0;
 }
 
-static int efa_av_remove_ah(struct fid_av *av_fid, fi_addr_t *fi_addr,
-			    size_t count, uint64_t flags)
-{
-	struct efa_av *av = container_of(av_fid, struct efa_av, util_av.av_fid);
-	struct efa_conn *conn = NULL;
-	struct efa_reverse_av *reverse_av;
-	struct efa_ah_qpn key;
-	char str[INET6_ADDRSTRLEN];
-	int ret = 0;
-
-	if (!fi_addr || (av->type != FI_AV_MAP && av->type != FI_AV_TABLE))
-		return -FI_EINVAL;
-
-	if (*fi_addr == FI_ADDR_NOTAVAIL)
-		return ret;
-
-	if (av->type == FI_AV_MAP) {
-		conn = (struct efa_conn *)fi_addr;
-	} else { /* (av->type == FI_AV_TABLE) */
-		conn = av->conn_table[*fi_addr];
-		av->conn_table[*fi_addr] = NULL;
-		av->next = MIN(av->next, *fi_addr);
-	}
-	if (!conn)
-		return ret;
-
-	key.ahn = conn->ah.ahn;
-	key.qpn = conn->ep_addr.qpn;
-	HASH_FIND(hh, av->reverse_av, &key, sizeof(key), reverse_av);
-	if (OFI_LIKELY(!!reverse_av)) {
-		HASH_DEL(av->reverse_av, reverse_av);
-		free(reverse_av);
-	}
-
-	ret = -ibv_destroy_ah(conn->ah.ibv_ah);
-	if (ret)
-		goto err_free_conn;
-
-	memset(str, 0, sizeof(str));
-	inet_ntop(AF_INET6, conn->ep_addr.raw, str, INET6_ADDRSTRLEN);
-	EFA_INFO(FI_LOG_AV, "av_remove conn[%p] with GID[%s] QP[%u]\n", conn,
-			str, conn->ep_addr.qpn);
-	av->used--;
-
-err_free_conn:
-	ofi_freealign(conn);
-	return ret;
-}
-
+/*
+ * @brief remove a set of addresses from the AV and release their resources
+ *
+ * This function implements fi_av_remove() for the EFA provider.
+ *
+ * Note that even after an address has been removed from the AV, it is
+ * still possible to get TX and RX completions for the address. Per the
+ * libfabric standard, these completions should be ignored.
+ *
+ * To help the TX completion handler identify such a completion, when
+ * an address is removed, the addr of all its outstanding TX packets is
+ * set to FI_ADDR_NOTAVAIL. The TX completion handler will ignore TX
+ * packets whose address is FI_ADDR_NOTAVAIL.
+ *
+ * Meanwhile, the lower provider will set a packet's address to
+ * FI_ADDR_NOTAVAIL if it is from a removed address. The RX completion
+ * handler will ignore such packets.
+ *
+ * @param[in]	av_fid	fid of the AV (address vector)
+ * @param[in]	fi_addr	pointer to an array of libfabric addresses
+ * @param[in]	count	number of libfabric addresses in the array
+ * @param[in]	flags	flags
+ * @return	0 if all addresses have been removed successfully,
+ * 		a negative libfabric error code if an error was encountered.
+ */
 static int efa_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr,
 			 size_t count, uint64_t flags)
 {
-	int ret = 0;
+	int err = 0;
 	size_t i;
 	struct efa_av *av;
-	struct util_av_entry *util_av_entry;
-	struct efa_av_entry *av_entry;
-	struct rxr_peer *peer;
-	struct dlist_entry *ep_list_entry;
+	struct efa_conn *conn;
+
+	if (!fi_addr)
+		return -FI_EINVAL;
 
 	av = container_of(av_fid, struct efa_av, util_av.av_fid);
-	if (av->ep_type == FI_EP_DGRAM) {
-		for (i = 0; i < count; i++) {
-			ret = efa_av_remove_ah(&av->util_av.av_fid, &fi_addr[i],
-					       1, flags);
-			if (ret)
-				goto out;
-		}
-		goto out;
-	}
+	if (av->type != FI_AV_MAP && av->type != FI_AV_TABLE)
+		return -FI_EINVAL;
+
 	fastlock_acquire(&av->util_av.lock);
 	for (i = 0; i < count; i++) {
-		if (fi_addr[i] == FI_ADDR_NOTAVAIL ||
-		    fi_addr[i] > av->util_av.count) {
-			ret = -FI_ENOENT;
-			goto release_lock;
-		}
-		util_av_entry = ofi_bufpool_get_ibuf(
-						av->util_av.av_entry_pool,
-						fi_addr[i]);
-		if (!util_av_entry) {
-			ret = -FI_ENOENT;
-			goto release_lock;
-		}
-		/*
-		 * If use_cnt is greater than 1, then just decrement
-		 * the count by 1, without removing the entry.
-		 */
-		if (ofi_atomic_get32(&util_av_entry->use_cnt) > 1) {
-			ret = ofi_av_remove_addr(&av->util_av, fi_addr[i]);
-			goto release_lock;
-		}
-		av_entry = (struct efa_av_entry *)util_av_entry->data;
-
-		/* Check if the peer is in use if it is then return */
-		dlist_foreach(&av->util_av.ep_list, ep_list_entry) {
-			peer = efa_ep_get_peer(ep_list_entry, fi_addr[i]);
-			ret = efa_peer_in_use(peer);
-			if (ret)
-				goto release_lock;
+		conn = efa_av_addr_to_conn(av, fi_addr[i]);
+		if (!conn) {
+			err = -FI_EINVAL;
+			break;
 		}
 
-		/* Only if the peer is not in use reset the peer */
-		dlist_foreach(&av->util_av.ep_list, ep_list_entry) {
-			peer = efa_ep_get_peer(ep_list_entry, fi_addr[i]);
-			if (peer->rx_init)
-				efa_peer_reset(peer);
-		}
-		ret = efa_av_remove_ah(&av->util_av.av_fid, &fi_addr[i], 1,
-				       flags);
-		if (ret)
-			goto release_lock;
-		/* remove an address from shm provider's av */
-		if (rxr_env.enable_shm_transfer && av_entry->local_mapping) {
-			ret = fi_av_remove(av->shm_rdm_av, &av_entry->shm_rdm_addr, 1, flags);
-			if (ret)
-				goto err_free_av_entry;
+		efa_conn_release(av, conn);
+	}
 
-			av->shm_used--;
-			assert(av_entry->shm_rdm_addr < rxr_env.shm_av_size);
-			av->shm_rdm_addr_map[av_entry->shm_rdm_addr] = FI_ADDR_UNSPEC;
+	if (i < count) {
+		/* something went wrong, so err cannot be zero */
+		assert(err);
+		if (av->util_av.eq) {
+			for (; i < count; ++i)
+				ofi_av_write_event(&av->util_av, i, FI_ECANCELED, NULL);
 		}
-		ret = ofi_av_remove_addr(&av->util_av, *fi_addr);
-		if (ret)
-			goto err_free_av_entry;
 	}
+
 	fastlock_release(&av->util_av.lock);
-	goto out;
-err_free_av_entry:
-	ofi_ibuf_free(util_av_entry);
-release_lock:
-	fastlock_release(&av->util_av.lock);
-out:
-	return ret;
+	return err;
 }
 
 static const char *efa_av_straddr(struct fid_av *av_fid, const void *addr,
@@ -656,28 +888,44 @@ static struct fi_ops_av efa_av_ops = {
 	.straddr = efa_av_straddr
 };
 
+static void efa_av_close_reverse_av(struct efa_av *av)
+{
+	struct efa_cur_reverse_av *cur_entry, *curtmp;
+	struct efa_prv_reverse_av *prv_entry, *prvtmp;
+
+	fastlock_acquire(&av->util_av.lock);
+
+	HASH_ITER(hh, av->cur_reverse_av, cur_entry, curtmp) {
+		efa_conn_release(av, cur_entry->conn);
+	}
+
+	HASH_ITER(hh, av->prv_reverse_av, prv_entry, prvtmp) {
+		efa_conn_release(av, prv_entry->conn);
+	}
+
+	fastlock_release(&av->util_av.lock);
+}
+
 static int efa_av_close(struct fid *fid)
 {
 	struct efa_av *av;
 	int ret = 0;
 	int err = 0;
-	int i;
 
 	av = container_of(fid, struct efa_av, util_av.av_fid.fid);
-	for (i = 0; i < av->util_av.count; i++) {
-		fi_addr_t addr = i;
 
-		ret = efa_av_remove_ah(&av->util_av.av_fid, &addr, 1, 0);
-		if (ret) {
-			err = ret;
-			EFA_WARN(FI_LOG_AV, "Failed to remove ah: %s\n",
-				fi_strerror(ret));
-		}
+	efa_av_close_reverse_av(av);
+
+	ret = ofi_av_close(&av->util_av);
+	if (ret) {
+		err = ret;
+		EFA_WARN(FI_LOG_AV, "Failed to close av: %s\n",
+			fi_strerror(ret));
 	}
-	free(av->conn_table);
+
 	if (av->ep_type == FI_EP_RDM) {
-		if (rxr_env.enable_shm_transfer && av->shm_rdm_av &&
-		    &av->shm_rdm_av->fid) {
+		if (av->shm_rdm_av) {
+			assert(rxr_env.enable_shm_transfer);
 			ret = fi_close(&av->shm_rdm_av->fid);
 			if (ret) {
 				err = ret;
@@ -685,12 +933,6 @@ static int efa_av_close(struct fid *fid)
 					fi_strerror(ret));
 			}
 		}
-		ret = ofi_av_close(&av->util_av);
-		if (ret) {
-			err = ret;
-			EFA_WARN(FI_LOG_AV, "Failed to close av: %s\n",
-				fi_strerror(ret));
-		}
 	}
 	free(av);
 	return err;
@@ -709,16 +951,40 @@ static struct fi_ops efa_av_fi_ops = {
 	.ops_open = fi_no_ops_open,
 };
 
+/**
+ * @brief initialize the util_av field in efa_av
+ *
+ * @param[in]	efa_domain	efa domain that the AV belongs to
+ * @param[in]	attr		AV attr the application passed to fi_av_open
+ * @param[out]	util_av		util_av field in efa_av
+ * @param[in]	context		context the application passed to fi_av_open
+ * @return	On success, return 0.
+ *		On failure, return a negative libfabric error code.
+ */
+int efa_av_init_util_av(struct efa_domain *efa_domain,
+			struct fi_av_attr *attr,
+			struct util_av *util_av,
+			void *context)
+{
+	struct util_av_attr util_attr;
+	size_t universe_size;
+
+	if (fi_param_get_size_t(NULL, "universe_size",
+				&universe_size) == FI_SUCCESS)
+		attr->count = MAX(attr->count, universe_size);
+
+	util_attr.addrlen = EFA_EP_ADDR_LEN;
+	util_attr.context_len = sizeof(struct efa_av_entry) - EFA_EP_ADDR_LEN;
+	util_attr.flags = 0;
+	return ofi_av_init(&efa_domain->util_domain, attr, &util_attr,
+			   util_av, context);
+}
+
 int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
 		struct fid_av **av_fid, void *context)
 {
 	struct efa_domain *efa_domain;
-	struct util_domain *util_domain;
-	struct rxr_domain *rxr_domain;
-	struct efa_domain_base *efa_domain_base;
 	struct efa_av *av;
-	struct util_av_attr util_attr;
-	size_t universe_size;
 	struct fi_av_attr av_attr;
 	int i, ret, retv;
 
@@ -745,35 +1011,19 @@ int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
 	if (!av)
 		return -FI_ENOMEM;
 
-	util_domain = container_of(domain_fid, struct util_domain,
-				   domain_fid);
-	efa_domain_base = container_of(util_domain, struct efa_domain_base,
-				       util_domain.domain_fid);
 	attr->type = FI_AV_TABLE;
-	/*
-	 * An rxr_domain fid was passed to the user if this is an RDM
-	 * endpoint, otherwise it is an efa_domain fid.  This will be
-	 * removed once the rxr and efa domain structures are combined.
-	 */
-	if (efa_domain_base->type == EFA_DOMAIN_RDM) {
-		rxr_domain = (struct rxr_domain *)efa_domain_base;
-		efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain,
-						util_domain.domain_fid);
+
+	efa_domain = efa_domain_from_fid(domain_fid);
+
+	ret = efa_av_init_util_av(efa_domain, attr, &av->util_av, context);
+	if (ret)
+		goto err;
+
+	if (efa_domain_get_type(domain_fid) == EFA_DOMAIN_RDM) {
 		av->ep_type = FI_EP_RDM;
 
-		if (fi_param_get_size_t(NULL, "universe_size",
-					&universe_size) == FI_SUCCESS)
-			attr->count = MAX(attr->count, universe_size);
-
-		util_attr.addrlen = EFA_EP_ADDR_LEN;
-		util_attr.context_len = sizeof(struct efa_av_entry) - EFA_EP_ADDR_LEN;
-		util_attr.flags = 0;
-		ret = ofi_av_init(&efa_domain->util_domain, attr, &util_attr,
-					&av->util_av, context);
-		if (ret)
-			goto err;
 		av_attr = *attr;
-		if (rxr_env.enable_shm_transfer) {
+		if (efa_domain->fab && efa_domain->fab->shm_fabric) {
 			/*
 			 * shm av supports maximum 256 entries
 			 * Reset the count to 128 to reduce memory footprint and satisfy
@@ -797,7 +1047,6 @@ int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
 				av->shm_rdm_addr_map[i] = FI_ADDR_UNSPEC;
 		}
 	} else {
-		efa_domain = (struct efa_domain *)efa_domain_base;
 		av->ep_type = FI_EP_DGRAM;
 	}
 
@@ -807,25 +1056,8 @@ int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
 	av->domain = efa_domain;
 	av->type = attr->type;
 	av->used = 0;
-	av->next = 0;
 	av->shm_used = 0;
 
-	if (av->type == FI_AV_TABLE && av->util_av.count > 0) {
-		av->conn_table = calloc(av->util_av.count, sizeof(*av->conn_table));
-		if (!av->conn_table) {
-			ret = -FI_ENOMEM;
-			if (av->ep_type == FI_EP_DGRAM)
-				goto err_close_util_av;
-			else
-				goto err_close_shm_av;
-		}
-	}
-
-	if (av->type == FI_AV_MAP)
-		av->addr_to_conn = efa_av_map_addr_to_conn;
-	else /* if (av->type == FI_AV_TABLE) */
-		av->addr_to_conn = efa_av_tbl_idx_to_conn;
-
 	*av_fid = &av->util_av.av_fid;
 	(*av_fid)->fid.fclass = FI_CLASS_AV;
 	(*av_fid)->fid.context = context;
@@ -834,13 +1066,6 @@ int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
 
 	return 0;
 
-err_close_shm_av:
-	if (rxr_env.enable_shm_transfer) {
-		retv = fi_close(&av->shm_rdm_av->fid);
-		if (retv)
-			EFA_WARN(FI_LOG_AV, "Unable to close shm av: %s\n",
-				fi_strerror(ret));
-	}
 err_close_util_av:
 	retv = ofi_av_close(&av->util_av);
 	if (retv)
diff --git a/deps/libfabric/prov/efa/src/efa_cq.c b/deps/libfabric/prov/efa/src/efa_cq.c
index 2b8fe67a9a0c28b252751f81e3a20a5011f1f38a..1d89b47f6778b3cd86f47c39b4e373f8bd94679f 100644
--- a/deps/libfabric/prov/efa/src/efa_cq.c
+++ b/deps/libfabric/prov/efa/src/efa_cq.c
@@ -171,9 +171,9 @@ ssize_t efa_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t count,
 			av = cq->domain->qp_table[wc.ibv_wc.qp_num &
 			     cq->domain->qp_table_sz_m1]->ep->av;
 
-			src_addr[i] = efa_ahn_qpn_to_addr(av,
-							  wc.ibv_wc.slid,
-							  wc.ibv_wc.src_qp);
+			src_addr[i] = efa_av_reverse_lookup_dgram(av,
+								  wc.ibv_wc.slid,
+								  wc.ibv_wc.src_qp);
 		}
 		cq->read_entry(&wc, i, buf);
 	}
diff --git a/deps/libfabric/prov/efa/src/efa_device.c b/deps/libfabric/prov/efa/src/efa_device.c
index d60da55e281bb614e4eaa502764880035a0e780e..29d952bdab4c7899068d84cb488329d855ae25d8 100644
--- a/deps/libfabric/prov/efa/src/efa_device.c
+++ b/deps/libfabric/prov/efa/src/efa_device.c
@@ -90,8 +90,12 @@ int efa_device_init(void)
 	fastlock_init(&pd_list_lock);
 
 	device_list = ibv_get_device_list(&dev_cnt);
-	if (dev_cnt <= 0)
-		return -ENODEV;
+	if (device_list == NULL)
+		return -ENOMEM;
+	if (dev_cnt <= 0) {
+		ret = -ENODEV;
+		goto err_free_dev_list;
+	}
 
 	ctx_list = calloc(dev_cnt, sizeof(*ctx_list));
 	if (!ctx_list) {
diff --git a/deps/libfabric/prov/efa/src/efa_domain.c b/deps/libfabric/prov/efa/src/efa_domain.c
index f0add07ee654ce2455a0f8e4f0c0fa27c317aa19..b4bfcf079fb3989fe896add1af024d2fe2c61ad3 100644
--- a/deps/libfabric/prov/efa/src/efa_domain.c
+++ b/deps/libfabric/prov/efa/src/efa_domain.c
@@ -40,6 +40,8 @@
 fastlock_t pd_list_lock;
 struct efa_pd *pd_list = NULL;
 
+enum efa_fork_support_status efa_fork_status = EFA_FORK_SUPPORT_OFF;
+
 static int efa_domain_close(fid_t fid)
 {
 	struct efa_domain *domain;
@@ -139,23 +141,49 @@ static int efa_open_device_by_name(struct efa_domain *domain, const char *name)
 	return ret;
 }
 
-/*
- * Register a temporary buffer and call ibv_fork_init() to determine if fork
- * support is enabled.
+/* @brief Check if rdma-core fork support is enabled and prevent fork
+ * support from being enabled later.
+ *
+ * Register a temporary buffer and call ibv_fork_init() to determine
+ * if fork support is enabled. Registering a buffer prevents future
+ * calls to ibv_fork_init() from completing successfully.
  *
  * This relies on internal behavior in rdma-core and is a temporary workaround.
+ *
+ * @param domain_fid domain fid so we can register memory
+ * @return 1 if fork support is enabled, 0 if it is not, or a negative
+ *	   error code on failure
  */
 static int efa_check_fork_enabled(struct fid_domain *domain_fid)
 {
 	struct fid_mr *mr;
 	char *buf;
 	int ret;
+	long page_size;
 
-	buf = malloc(ofi_get_page_size());
+	/* If ibv_is_fork_initialized() is available, use it to check
+	 * whether we can exit early.
+	 */
+#if HAVE_IBV_IS_FORK_INITIALIZED == 1
+	enum ibv_fork_status fork_status = ibv_is_fork_initialized();
+
+	/* If fork support is enabled or unneeded, return. */
+	if (fork_status != IBV_FORK_DISABLED)
+		return fork_status == IBV_FORK_ENABLED;
+
+#endif /* HAVE_IBV_IS_FORK_INITIALIZED */
+
+	page_size = ofi_get_page_size();
+	if (page_size <= 0) {
+		EFA_WARN(FI_LOG_DOMAIN, "Unable to determine page size %ld\n",
+			 page_size);
+		return -FI_EINVAL;
+	}
+
+	buf = malloc(page_size);
 	if (!buf)
 		return -FI_ENOMEM;
 
-	ret = fi_mr_reg(domain_fid, buf, ofi_get_page_size(),
+	ret = fi_mr_reg(domain_fid, buf, page_size,
 			FI_SEND, 0, 0, 0, &mr, NULL);
 	if (ret) {
 		free(buf);
@@ -202,19 +230,198 @@ static struct fi_ops_domain efa_domain_ops = {
 	.query_collective = fi_no_query_collective,
 };
 
+/* @brief Fork handler that is installed when EFA is loaded
+ *
+ * We register this fork handler so that users do not inadvertently trip over
+ * memory corruption when fork is called. Calling fork() without enabling fork
+ * support in rdma-core can cause corruption, even if the registered pages are
+ * not used in the child process.
+ *
+ * It is critical that this fork handler is only installed once an EFA device
+ * is present and selected. We don't want this to trigger when Libfabric is not
+ * running on an EC2 instance.
+ */
+static
+void efa_atfork_callback()
+{
+	static int visited = 0;
+
+	if (visited)
+		return;
+	visited = 1;
+
+	fprintf(stderr,
+		"A process has executed an operation involving a call\n"
+		"to the fork() system call to create a child process.\n"
+		"\n"
+		"As a result, the Libfabric EFA provider is operating in\n"
+		"a condition that could result in memory corruption or\n"
+		"other system errors.\n"
+		"\n"
+		"For the Libfabric EFA provider to work safely when fork()\n"
+		"is called please do one of the following:\n"
+		"1) Set the environment variable:\n"
+		"          FI_EFA_FORK_SAFE=1\n"
+		"and verify you are using rdma-core v31.1 or later.\n"
+		"\n"
+		"OR\n"
+		"2) Use Linux Kernel 5.13+ with rdma-core v35.0+\n"
+		"\n"
+		"Please note that enabling fork support may cause a\n"
+		"small performance impact.\n"
+		"\n"
+		"You may want to check with your application vendor to see\n"
+		"if an application-level alternative (of not using fork)\n"
+		"exists.\n"
+		"\n"
+		"Your job will now abort.\n");
+	abort();
+}
+
+/* @brief Set up the MR cache.
+ *
+ * This function enables the MR cache using the util MR cache code. Note that
+ * if the call to ofi_mr_cache_init fails, we continue but disable the cache.
+ *
+ * @param efa_domain The EFA domain where cache ops should be set
+ * @param info Validated info struct selected by the user
+ * @return 0 on success, fi_errno on failure.
+ */
+static int efa_mr_cache_init(struct efa_domain *domain, struct fi_info *info)
+{
+	struct ofi_mem_monitor *memory_monitors[OFI_HMEM_MAX] = {
+		[FI_HMEM_SYSTEM] = default_monitor,
+		[FI_HMEM_CUDA] = cuda_monitor,
+	};
+	int ret;
+
+	/* Both Open MPI (and possibly other MPI implementations) and
+	 * Libfabric use the same live binary patching to enable memory
+	 * monitoring, but the patching technique only allows a single
+	 * "winning" patch.  The Libfabric memhooks monitor will not
+	 * overwrite a previous patch, but instead return
+	 * -FI_EALREADY.  There are three cases of concern, and in all
+	 * but one of them, we can avoid changing the default monitor.
+	 *
+	 * (1) Upper layer does not patch, such as Open MPI 4.0 and
+	 * earlier.  In this case, the default monitor will be used,
+	 * as the default monitor is either not the memhooks monitor
+	 * (because the user specified a different monitor) or the
+	 * default monitor is the memhooks monitor, but we were able
+	 * to install the patches.  We will use the default monitor in
+	 * this case.
+	 *
+	 * (2) Upper layer does patch, but does not export a memory
+	 * monitor, such as Open MPI 4.1.0 and 4.1.1.  In this case,
+	 * if the default memory monitor is not memhooks, we will use
+	 * the default monitor.  If the default monitor is memhooks,
+	 * the patch will fail to apply, and we will change the
+	 * requested monitor to UFFD to avoid a broken configuration.
+	 * If the user explicitly requested memhooks, we will return
+	 * an error, as we can not satisfy that request.
+	 *
+	 * (3) Upper layer does patch and exports a memory monitor,
+	 * such as Open MPI 4.1.2 and later.  In this case, the
+	 * default monitor will have been changed from the memhooks
+	 * monitor to the imported monitor, so we will use the
+	 * imported monitor.
+	 *
+	 * The only known cases in which we will not use the default
+	 * monitor are Open MPI 4.1.0/4.1.1.
+	 *
+	 * It is possible that this could be better handled at the
+	 * mem_monitor level in Libfabric, but so far we have not
+	 * reached agreement on how that would work.
+	 */
+	if (default_monitor == memhooks_monitor) {
+		ret = memhooks_monitor->start(memhooks_monitor);
+		if (ret == -FI_EALREADY) {
+			if (cache_params.monitor) {
+				EFA_WARN(FI_LOG_DOMAIN,
+					 "Memhooks monitor requested via FI_MR_CACHE_MONITOR, but memhooks failed to\n"
+					 "install.  No working monitor availale.\n");
+				return -FI_ENOSYS;
+			}
+			EFA_INFO(FI_LOG_DOMAIN,
+				 "Detected potential memhooks monitor conflict. Switching to UFFD.\n");
+			memory_monitors[FI_HMEM_SYSTEM] = uffd_monitor;
+		}
+	} else if (default_monitor == NULL) {
+		/* TODO: Fail if we don't find a system monitor.  This
+		 * is a debatable decision, as the VERBS provider
+		 * falls back to a no-cache mode in this case.  We
+		 * fail the domain creation because the rest of the MR
+		 * code hasn't been audited to deal with a NULL
+		 * monitor.
+		 */
+		EFA_WARN(FI_LOG_DOMAIN,
+			 "No default SYSTEM monitor available.\n");
+		return -FI_ENOSYS;
+	}
+
+	domain->cache = (struct ofi_mr_cache *)calloc(1, sizeof(struct ofi_mr_cache));
+	if (!domain->cache)
+		return -FI_ENOMEM;
+
+	if (!efa_mr_max_cached_count)
+		efa_mr_max_cached_count = info->domain_attr->mr_cnt *
+					  EFA_MR_CACHE_LIMIT_MULT;
+	if (!efa_mr_max_cached_size)
+		efa_mr_max_cached_size = domain->ctx->max_mr_size *
+					 EFA_MR_CACHE_LIMIT_MULT;
+	/*
+	 * XXX: we're modifying a global in the util mr cache? do we need an
+	 * API here instead?
+	 */
+	cache_params.max_cnt = efa_mr_max_cached_count;
+	cache_params.max_size = efa_mr_max_cached_size;
+	domain->cache->entry_data_size = sizeof(struct efa_mr);
+	domain->cache->add_region = efa_mr_cache_entry_reg;
+	domain->cache->delete_region = efa_mr_cache_entry_dereg;
+	ret = ofi_mr_cache_init(&domain->util_domain, memory_monitors,
+				domain->cache);
+	if (!ret) {
+		domain->util_domain.domain_fid.mr = &efa_domain_mr_cache_ops;
+		EFA_INFO(FI_LOG_DOMAIN, "EFA MR cache enabled, max_cnt: %zu max_size: %zu\n",
+			 cache_params.max_cnt, cache_params.max_size);
+	} else {
+		EFA_WARN(FI_LOG_DOMAIN, "EFA MR cache init failed: %s\n",
+		         fi_strerror(ret));
+		free(domain->cache);
+		domain->cache = NULL;
+	}
+
+	return 0;
+}
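+
+/*
+ * Worked example of the monitor selection above, assuming no upper layer
+ * has imported a monitor and assuming the generic FI_MR_CACHE_MONITOR
+ * knob (which populates cache_params.monitor):
+ *
+ *	(default)                        -> memhooks starts and is used
+ *	FI_MR_CACHE_MONITOR=userfaultfd  -> default monitor is not memhooks,
+ *	                                    used as-is
+ *	FI_MR_CACHE_MONITOR=memhooks     -> a failed patch (-FI_EALREADY) is
+ *	                                    a hard error (-FI_ENOSYS) rather
+ *	                                    than a silent switch to UFFD
+ */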
+
+/* @brief Allocate a domain, open the device, and set it up based on the hints.
+ *
+ * This function creates a domain and uses the info struct to configure the
+ * domain based on what capabilities are set. Fork support is checked here and
+ * the MR cache is also set up here.
+ *
+ * Note the trickery with rxr_domain, where we detect whether this endpoint
+ * is RDM or DGRAM to set some state in rxr_domain. We can do this because
+ * the type field is at the beginning of both efa_domain and rxr_domain, and
+ * we know efa_domain is stored within rxr_domain. This will be removed when
+ * rxr_domain_open and efa_domain_open are combined.
+ *
+ * @param fabric_fid fabric that the domain should be tied to
+ * @param info info struct that was validated and returned by fi_getinfo
+ * @param domain_fid pointer where the newly created domain fid should be stored
+ * @param context void pointer stored with the domain fid
+ * @return 0 on success, fi_errno on error
+ */
 int efa_domain_open(struct fid_fabric *fabric_fid, struct fi_info *info,
 		    struct fid_domain **domain_fid, void *context)
 {
+	static int fork_handler_installed = 0;
 	struct efa_domain *domain;
 	struct efa_fabric *fabric;
-	struct rxr_domain *rxr_domain;
 	const struct fi_info *fi;
 	size_t qp_table_size;
 	bool app_mr_local;
-	int ret;
-	struct ofi_mem_monitor *memory_monitors[OFI_HMEM_MAX] = {
-		[FI_HMEM_SYSTEM] = memhooks_monitor,
-	};
+	int ret, err;
 
 	fi = efa_get_efa_info(info->domain_attr->name);
 	if (!fi)
@@ -251,6 +458,7 @@ int efa_domain_open(struct fid_fabric *fabric_fid, struct fi_info *info,
 	}
 
 	if (EFA_EP_TYPE_IS_RDM(info)) {
+		struct rxr_domain *rxr_domain;
 		domain->type = EFA_DOMAIN_RDM;
 		rxr_domain = container_of(domain_fid, struct rxr_domain,
 					  rdm_domain);
@@ -287,59 +495,70 @@ int efa_domain_open(struct fid_fabric *fabric_fid, struct fi_info *info,
 	domain->cache = NULL;
 
 	/*
-	 * Check whether fork support is enabled when app does not request
-	 * FI_MR_LOCAL even if the cache is disabled.
+	 * Call ibv_fork_init if the user asked for fork support.
+	 */
+	if (efa_fork_status == EFA_FORK_SUPPORT_ON) {
+		ret = -ibv_fork_init();
+		if (ret) {
+			EFA_WARN(FI_LOG_DOMAIN,
+			         "Fork support requested but ibv_fork_init failed: %s\n",
+			         strerror(-ret));
+			goto err_free_info;
+		}
+	}
+
+	/*
+	 * Check whether fork support was enabled by another library. If
+	 * one of the environment variables was set to enable fork support,
+	 * this variable was set to ON during provider init.  Huge pages for
+	 * bounce buffers will not be used if fork support is on.
 	 */
-	if (!app_mr_local && efa_check_fork_enabled(*domain_fid)) {
-		fprintf(stderr,
-		         "\nlibibverbs fork support is not supported by the EFA Libfabric\n"
-			 "provider when memory registrations are handled by the provider.\n"
-			 "\nFork support may currently be enabled via the RDMAV_FORK_SAFE\n"
-			 "or IBV_FORK_SAFE environment variable or another library in your\n"
-			 "application may be calling ibv_fork_init().\n"
-			 "\nPlease refer to https://github.com/ofiwg/libfabric/issues/6332\n"
-			 "for more information. Your job will now abort.\n");
-		abort();
+	if (efa_fork_status == EFA_FORK_SUPPORT_OFF &&
+	    efa_check_fork_enabled(*domain_fid))
+		efa_fork_status = EFA_FORK_SUPPORT_ON;
+
+	if (efa_fork_status == EFA_FORK_SUPPORT_ON &&
+	    getenv("RDMAV_HUGEPAGES_SAFE")) {
+		EFA_WARN(FI_LOG_DOMAIN,
+			 "Using libibverbs fork support and huge pages is not supported by the EFA provider.\n");
+		ret = -FI_EINVAL;
+		goto err_free_info;
 	}
 
+	/*
+	 * It'd be better to install this during provider init (since that's
+	 * only invoked once) but we need to do a memory registration for the
+	 * fork check above. This can move to the provider init once that check
+	 * is gone.
+	 */
+	if (!fork_handler_installed && efa_fork_status == EFA_FORK_SUPPORT_OFF) {
+		ret = pthread_atfork(efa_atfork_callback, NULL, NULL);
+		if (ret) {
+			EFA_WARN(FI_LOG_DOMAIN,
+				 "Unable to register atfork callback: %s\n",
+				 strerror(-ret));
+			goto err_free_info;
+		}
+		fork_handler_installed = 1;
+	}
 	/*
 	 * If FI_MR_LOCAL is set, we do not want to use the MR cache.
 	 */
 	if (!app_mr_local && efa_mr_cache_enable) {
-		domain->cache = (struct ofi_mr_cache *)calloc(1, sizeof(struct ofi_mr_cache));
-		if (!domain->cache) {
-			ret = -FI_ENOMEM;
+		ret = efa_mr_cache_init(domain, info);
+		if (ret)
 			goto err_free_info;
-		}
-
-		if (!efa_mr_max_cached_count)
-			efa_mr_max_cached_count = info->domain_attr->mr_cnt *
-			                          EFA_MR_CACHE_LIMIT_MULT;
-		if (!efa_mr_max_cached_size)
-			efa_mr_max_cached_size = domain->ctx->max_mr_size *
-			                         EFA_MR_CACHE_LIMIT_MULT;
-		cache_params.max_cnt = efa_mr_max_cached_count;
-		cache_params.max_size = efa_mr_max_cached_size;
-		domain->cache->entry_data_size = sizeof(struct efa_mr);
-		domain->cache->add_region = efa_mr_cache_entry_reg;
-		domain->cache->delete_region = efa_mr_cache_entry_dereg;
-		ret = ofi_mr_cache_init(&domain->util_domain, memory_monitors,
-					domain->cache);
-		if (!ret) {
-			domain->util_domain.domain_fid.mr = &efa_domain_mr_cache_ops;
-			EFA_INFO(FI_LOG_DOMAIN, "EFA MR cache enabled, max_cnt: %zu max_size: %zu\n",
-			         cache_params.max_cnt, cache_params.max_size);
-			return 0;
-		}
 	}
 
-	free(domain->cache);
-	domain->cache = NULL;
 	return 0;
 err_free_info:
 	fi_freeinfo(domain->info);
 err_close_domain:
-	ofi_domain_close(&domain->util_domain);
+	err = ofi_domain_close(&domain->util_domain);
+	if (err) {
+		EFA_WARN(FI_LOG_DOMAIN,
+			   "ofi_domain_close fails: %d", err);
+	}
 err_free_qp_table:
 	free(domain->qp_table);
 err_free_domain:
diff --git a/deps/libfabric/prov/efa/src/efa_ep.c b/deps/libfabric/prov/efa/src/efa_ep.c
index bbc376ea44695fc81b512206d5c8068fae109034..865a145251303b1a81f205a16efc691eb37d3ab7 100644
--- a/deps/libfabric/prov/efa/src/efa_ep.c
+++ b/deps/libfabric/prov/efa/src/efa_ep.c
@@ -38,7 +38,7 @@
 #include <infiniband/efadv.h>
 #define EFA_CQ_PROGRESS_ENTRIES 500
 
-static int efa_generate_qkey()
+static int efa_generate_rdm_connid()
 {
 	struct timeval tv;
 	struct timezone tz;
@@ -80,8 +80,8 @@ static int efa_ep_destroy_qp(struct efa_qp *qp)
 	return err;
 }
 
-static int efa_ep_modify_qp_state(struct efa_qp *qp, enum ibv_qp_state qp_state,
-				  int attr_mask)
+static int efa_ep_modify_qp_state(struct efa_ep *ep, struct efa_qp *qp,
+				  enum ibv_qp_state qp_state, int attr_mask)
 {
 	struct ibv_qp_attr attr = {};
 
@@ -93,25 +93,33 @@ static int efa_ep_modify_qp_state(struct efa_qp *qp, enum ibv_qp_state qp_state,
 	if (attr_mask & IBV_QP_QKEY)
 		attr.qkey = qp->qkey;
 
+	if (attr_mask & IBV_QP_RNR_RETRY)
+		attr.rnr_retry = ep->rnr_retry;
+
 	return -ibv_modify_qp(qp->ibv_qp, &attr, attr_mask);
 
 }
 
-static int efa_ep_modify_qp_rst2rts(struct efa_qp *qp)
+static int efa_ep_modify_qp_rst2rts(struct efa_ep *ep, struct efa_qp *qp)
 {
 	int err;
 
-	err = efa_ep_modify_qp_state(qp, IBV_QPS_INIT,
+	err = efa_ep_modify_qp_state(ep, qp, IBV_QPS_INIT,
 				     IBV_QP_STATE | IBV_QP_PKEY_INDEX |
 				     IBV_QP_PORT | IBV_QP_QKEY);
 	if (err)
 		return err;
 
-	err = efa_ep_modify_qp_state(qp, IBV_QPS_RTR, IBV_QP_STATE);
+	err = efa_ep_modify_qp_state(ep, qp, IBV_QPS_RTR, IBV_QP_STATE);
 	if (err)
 		return err;
 
-	return efa_ep_modify_qp_state(qp, IBV_QPS_RTS,
+	if (ep->util_ep.type != FI_EP_DGRAM &&
+	    efa_ep_support_rnr_retry_modify(&ep->util_ep.ep_fid))
+		return efa_ep_modify_qp_state(ep, qp, IBV_QPS_RTS,
+			IBV_QP_STATE | IBV_QP_SQ_PSN | IBV_QP_RNR_RETRY);
+
+	return efa_ep_modify_qp_state(ep, qp, IBV_QPS_RTS,
 				      IBV_QP_STATE | IBV_QP_SQ_PSN);
 }
 
@@ -145,8 +153,8 @@ static int efa_ep_create_qp_ex(struct efa_ep *ep,
 	}
 
 	qp->ibv_qp_ex = ibv_qp_to_qp_ex(qp->ibv_qp);
-	qp->qkey = efa_generate_qkey();
-	err = efa_ep_modify_qp_rst2rts(qp);
+	qp->qkey = (init_attr_ex->qp_type == IBV_QPT_UD) ? EFA_DGRAM_CONNID: efa_generate_rdm_connid();
+	err = efa_ep_modify_qp_rst2rts(ep, qp);
 	if (err)
 		goto err_destroy_qp;
 
@@ -292,6 +300,15 @@ static int efa_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
 		break;
 	case FI_CLASS_AV:
 		av = container_of(bfid, struct efa_av, util_av.av_fid.fid);
+		/*
+		 * Binding multiple endpoints to a single AV is currently not
+		 * supported.
+		 */
+		if (av->ep) {
+			EFA_WARN(FI_LOG_EP_CTRL,
+				 "Address vector already has endpoint bound to it.\n");
+			return -FI_ENOSYS;
+		}
 		if (ep->domain != av->domain) {
 			EFA_WARN(FI_LOG_EP_CTRL,
 				 "Address vector doesn't belong to same domain as EP.\n");
@@ -674,6 +691,7 @@ int efa_ep_open(struct fid_domain *domain_fid, struct fi_info *info,
 	ep->domain = domain;
 	ep->xmit_more_wr_tail = &ep->xmit_more_wr_head;
 	ep->recv_more_wr_tail = &ep->recv_more_wr_head;
+	ep->rnr_retry = rxr_env.rnr_retry;
 
 	if (info->src_addr) {
 		ep->src_addr = (void *)calloc(1, EFA_EP_ADDR_LEN);
diff --git a/deps/libfabric/prov/efa/src/efa_fabric.c b/deps/libfabric/prov/efa/src/efa_fabric.c
index 006c47f82cff56b0b1c8186baa3af68345ea5b3a..b471aea4e9e674fa7bdb0541685c7b257c7dd8af 100644
--- a/deps/libfabric/prov/efa/src/efa_fabric.c
+++ b/deps/libfabric/prov/efa/src/efa_fabric.c
@@ -80,10 +80,15 @@
 
 #define EFA_DEF_MR_CACHE_ENABLE 1
 
+#ifdef EFA_PERF_ENABLED
+const char *efa_perf_counters_str[] = {
+	EFA_PERF_FOREACH(OFI_STR)
+};
+#endif
+
 int efa_mr_cache_enable		= EFA_DEF_MR_CACHE_ENABLE;
 size_t efa_mr_max_cached_count;
 size_t efa_mr_max_cached_size;
-int efa_set_rdmav_hugepages_safe = 0;
 
 static void efa_addr_to_str(const uint8_t *raw_addr, char *str);
 static int efa_get_addr(struct efa_context *ctx, void *src_addr);
@@ -182,7 +187,7 @@ static int efa_check_hints(uint32_t version, const struct fi_info *hints,
 
 	if (hints->caps & ~(info->caps)) {
 		EFA_INFO(FI_LOG_CORE, "Unsupported capabilities\n");
-		FI_INFO_CHECK(&efa_prov, info, hints, caps, FI_TYPE_CAPS);
+		OFI_INFO_CHECK(&efa_prov, info, hints, caps, FI_TYPE_CAPS);
 		return -FI_ENODATA;
 	}
 
@@ -190,7 +195,7 @@ static int efa_check_hints(uint32_t version, const struct fi_info *hints,
 
 	if ((hints->mode & prov_mode) != prov_mode) {
 		EFA_INFO(FI_LOG_CORE, "Required hints mode bits not set\n");
-		FI_INFO_MODE(&efa_prov, prov_mode, hints->mode);
+		OFI_INFO_MODE(&efa_prov, prov_mode, hints->mode);
 		return -FI_ENODATA;
 	}
 
@@ -268,7 +273,7 @@ static char *get_sysfs_path(void)
 			sysfs_path[len] = '\0';
 		}
 	} else {
-		sysfs_path = strndup("/sys", IBV_SYSFS_PATH_MAX);
+		sysfs_path = strdup("/sys");
 	}
 
 	return sysfs_path;
@@ -468,23 +473,34 @@ err_free_nic:
 
 #if HAVE_LIBCUDA
 /*
- * efa_get_gdr_support() check if GPUDirect RDMA is supported by
- * reading from sysfs file "class/infiniband/<device_name>/gdr"
- * and set content of gdr_support accordingly.
+ * efa_get_gdr_support() checks if the provider can support GPUDirect RDMA. It
+ * checks whether the hmem initialization succeeded and also reads from the EFA
+ * driver sysfs file "class/infiniband/<device_name>/gdr" to verify the EFA
+ * driver was able to successfully load p2p device support.
+ *
+ * TODO: the gdr sysfs file does not necessarily mean a specific p2p transfer
+ * will succeed; more work is needed here.
  *
  * Return value:
- *   return 1 if sysfs file exist and has 1 in it.
- *   return 0 if sysfs file does not exist or has 0 in it.
- *   return a negatie value if error happened.
+ *   return 1 if gdr is supported
+ *   return 0 if it is not
+ *   return a negative value on error
  */
-static int efa_get_gdr_support(char *device_name)
+static int efa_get_gdr_support(struct efa_context *efa_context)
 {
 	static const int MAX_GDR_SUPPORT_STRLEN = 8;
 	char *gdr_path = NULL;
 	char gdr_support_str[MAX_GDR_SUPPORT_STRLEN];
 	int ret, read_len;
 
-	ret = asprintf(&gdr_path, "class/infiniband/%s/device/gdr", device_name);
+	if (!ofi_hmem_is_initialized(FI_HMEM_CUDA)) {
+		EFA_WARN(FI_LOG_MR,
+		         "FI_HMEM_CUDA is not initialized\n");
+		return 0;
+	}
+
+	ret = asprintf(&gdr_path, "class/infiniband/%s/device/gdr",
+		       efa_context->ibv_ctx->device->name);
 	if (ret < 0) {
 		EFA_INFO_ERRNO(FI_LOG_FABRIC, "asprintf to build sysfs file name failed", ret);
 		goto out;
@@ -509,6 +525,7 @@ static int efa_get_gdr_support(char *device_name)
 
 	read_len = MIN(ret, MAX_GDR_SUPPORT_STRLEN);
 	ret = (0 == strncmp(gdr_support_str, "1", read_len));
+
 out:
 	free(gdr_path);
 	return ret;
@@ -563,7 +580,7 @@ static int efa_get_device_attrs(struct efa_context *ctx, struct fi_info *info)
 
 #if HAVE_LIBCUDA
 	if (info->ep_attr->type == FI_EP_RDM) {
-		ret = efa_get_gdr_support(ctx->ibv_ctx->device->name);
+		ret = efa_get_gdr_support(ctx);
 		if (ret < 0) {
 			EFA_WARN(FI_LOG_FABRIC, "get gdr support failed!\n");
 			return ret;
@@ -741,7 +758,7 @@ static int efa_alloc_info(struct efa_context *ctx, struct fi_info **info,
 	fi->domain_attr->name = malloc(name_len + 1);
 	if (!fi->domain_attr->name) {
 		ret = -FI_ENOMEM;
-		goto err_free_fab_name;
+		goto err_free_info;
 	}
 
 	snprintf(fi->domain_attr->name, name_len + 1, "%s%s",
@@ -752,24 +769,18 @@ static int efa_alloc_info(struct efa_context *ctx, struct fi_info **info,
 	fi->src_addr = calloc(1, EFA_EP_ADDR_LEN);
 	if (!fi->src_addr) {
 		ret = -FI_ENOMEM;
-		goto err_free_dom_name;
+		goto err_free_info;
 	}
 	fi->src_addrlen = EFA_EP_ADDR_LEN;
 	ret = efa_get_addr(ctx, fi->src_addr);
 	if (ret)
-		goto err_free_src;
+		goto err_free_info;
 
 	fi->domain_attr->av_type = FI_AV_TABLE;
 
 	*info = fi;
 	return 0;
 
-err_free_src:
-	free(fi->src_addr);
-err_free_dom_name:
-	free(fi->domain_attr->name);
-err_free_fab_name:
-	free(fi->fabric_attr->name);
 err_free_info:
 	fi_freeinfo(fi);
 	return ret;
@@ -921,14 +932,33 @@ out:
 
 static int efa_fabric_close(fid_t fid)
 {
-	struct efa_fabric *fab;
+	struct efa_fabric *efa_fabric;
 	int ret;
 
-	fab = container_of(fid, struct efa_fabric, util_fabric.fabric_fid.fid);
-	ret = ofi_fabric_close(&fab->util_fabric);
-	if (ret)
+	efa_fabric = container_of(fid, struct efa_fabric, util_fabric.fabric_fid.fid);
+	ret = ofi_fabric_close(&efa_fabric->util_fabric);
+	if (ret) {
+		FI_WARN(&rxr_prov, FI_LOG_FABRIC,
+			"Unable to close fabric: %s\n",
+			fi_strerror(-ret));
 		return ret;
-	free(fab);
+	}
+
+	if (efa_fabric->shm_fabric) {
+		ret = fi_close(&efa_fabric->shm_fabric->fid);
+		if (ret) {
+			FI_WARN(&rxr_prov, FI_LOG_FABRIC,
+				"Unable to close fabric: %s\n",
+				fi_strerror(-ret));
+			return ret;
+		}
+	}
+
+#ifdef EFA_PERF_ENABLED
+	ofi_perfset_log(&efa_fabric->perf_set, efa_perf_counters_str);
+	ofi_perfset_close(&efa_fabric->perf_set);
+#endif
+	free(efa_fabric);
 
 	return 0;
 }
@@ -943,93 +973,87 @@ static struct fi_ops efa_fi_ops = {
 
 static struct fi_ops_fabric efa_ops_fabric = {
 	.size = sizeof(struct fi_ops_fabric),
-	.domain = efa_domain_open,
+	/*
+	 * We use rxr_domain_open() here because it actually handles
+	 * both RDM and DGRAM.
+	 */
+	.domain = rxr_domain_open,
 	.passive_ep = fi_no_passive_ep,
 	.eq_open = ofi_eq_create,
 	.wait_open = ofi_wait_fd_open,
 	.trywait = ofi_trywait
 };
 
-static
-void efa_atfork_callback()
-{
-	static int visited = 0;
-
-	if (visited)
-		return;
-
-	visited = 1;
-	if (getenv("RDMAV_FORK_SAFE") || getenv("IBV_FORK_SAFE") )
-		return;
-
-	fprintf(stderr,
-		"A process has executed an operation involving a call\n"
-		"to the fork() system call to create a child process.\n"
-		"\n"
-		"As a result, the libfabric EFA provider is operating in\n"
-		"a condition that could result in memory corruption or\n"
-		"other system errors.\n"
-		"\n"
-		"For the libfabric EFA provider to work safely when fork()\n"
-		"is called, the application must handle memory registrations\n"
-		"(FI_MR_LOCAL) and you will need to set the following environment\n"
-		"variables:\n"
-		"          RDMAV_FORK_SAFE=1\n"
-		"MPI applications do not support this mode.\n"
-		"\n"
-		"However, this setting can result in signficant performance\n"
-		"impact to your application due to increased cost of memory\n"
-		"registration.\n"
-		"\n"
-		"You may want to check with your application vendor to see\n"
-		"if an application-level alternative (of not using fork)\n"
-		"exists.\n"
-		"\n"
-		"Please refer to https://github.com/ofiwg/libfabric/issues/6332\n"
-		"for more information.\n"
-		"\n"
-		"Your job will now abort.\n");
-	abort();
-}
-
 int efa_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric_fid,
 	       void *context)
 {
 	const struct fi_info *info;
-	struct efa_fabric *fab;
-	int ret = 0;
-	fab = calloc(1, sizeof(*fab));
-	if (!fab)
+	struct efa_fabric *efa_fabric;
+	int ret = 0, retv;
+
+	efa_fabric = calloc(1, sizeof(*efa_fabric));
+	if (!efa_fabric)
 		return -FI_ENOMEM;
 
 	for (info = efa_util_prov.info; info; info = info->next) {
 		ret = ofi_fabric_init(&efa_prov, info->fabric_attr, attr,
-				      &fab->util_fabric, context);
+				      &efa_fabric->util_fabric, context);
 		if (ret != -FI_ENODATA)
 			break;
 	}
-	if (ret) {
-		free(fab);
-		return ret;
+
+	if (ret)
+		goto err_free_fabric;
+
+	/* Open shm provider's fabric */
+	if (rxr_env.enable_shm_transfer) {
+		assert(!strcmp(shm_info->fabric_attr->name, "shm"));
+		ret = fi_fabric(shm_info->fabric_attr,
+				    &efa_fabric->shm_fabric, context);
+		if (ret)
+			goto err_close_util_fabric;
+	} else {
+		efa_fabric->shm_fabric = NULL;
 	}
 
-	*fabric_fid = &fab->util_fabric.fabric_fid;
+
+#ifdef EFA_PERF_ENABLED
+	ret = ofi_perfset_create(&rxr_prov, &efa_fabric->perf_set,
+				 efa_perf_size, perf_domain, perf_cntr,
+				 perf_flags);
+
+	if (ret)
+		FI_WARN(&rxr_prov, FI_LOG_FABRIC,
+			"Error initializing EFA perfset: %s\n",
+			fi_strerror(-ret));
+#endif
+
+
+	*fabric_fid = &efa_fabric->util_fabric.fabric_fid;
 	(*fabric_fid)->fid.fclass = FI_CLASS_FABRIC;
 	(*fabric_fid)->fid.ops = &efa_fi_ops;
 	(*fabric_fid)->ops = &efa_ops_fabric;
 	(*fabric_fid)->api_version = attr->api_version;
 
 	return 0;
+
+err_close_util_fabric:
+	retv = ofi_fabric_close(&efa_fabric->util_fabric);
+	if (retv)
+		FI_WARN(&rxr_prov, FI_LOG_FABRIC,
+			"Unable to close fabric: %s\n",
+			fi_strerror(-retv));
+err_free_fabric:
+	free(efa_fabric);
+
+	return ret;
 }
 
-static void fi_efa_fini(void)
+void efa_finalize_prov(void)
 {
 	struct efa_context **ctx_list;
 	int num_devices;
 
-	if (efa_set_rdmav_hugepages_safe)
-		unsetenv("RDMAV_HUGEPAGES_SAFE");
-
 	fi_freeinfo((void *)efa_util_prov.info);
 	efa_util_prov.info = NULL;
 
@@ -1047,7 +1071,7 @@ struct fi_provider efa_prov = {
 	.fi_version = OFI_VERSION_LATEST,
 	.getinfo = efa_getinfo,
 	.fabric = efa_fabric,
-	.cleanup = fi_efa_fini
+	.cleanup = efa_finalize_prov
 };
 
 struct util_prov efa_util_prov = {
@@ -1068,17 +1092,23 @@ static int efa_init_info(const struct fi_info **all_infos)
 		return ret;
 
 	ctx_list = efa_device_get_context_list(&num_devices);
-	if (!num_devices)
+	if (!num_devices) {
+		if (ctx_list) {
+			free(ctx_list);
+		}
 		return -FI_ENODEV;
+	}
 
 	*all_infos = NULL;
 	for (i = 0; i < num_devices; i++) {
 		ret = efa_alloc_info(ctx_list[i], &fi, &efa_rdm_domain);
 		if (!ret) {
-			if (!*all_infos)
+			if (!*all_infos) {
 				*all_infos = fi;
-			else
+			} else {
+				assert(tail);
 				tail->next = fi;
+			}
 			tail = fi;
 			ret = efa_alloc_info(ctx_list[i], &fi, &efa_dgrm_domain);
 			if (!ret) {
@@ -1098,36 +1128,7 @@ static int efa_init_info(const struct fi_info **all_infos)
 	return retv;
 }
 
-struct fi_provider *init_lower_efa_prov()
+int efa_init_prov(void)
 {
-	int err;
-
-	if (!getenv("RDMAV_HUGEPAGES_SAFE")) {
-		/*
-		 * Setting RDMAV_HUGEPAGES_SAFE alone will not impact
-		 * application performance, because rdma-core will only
-		 * check this environment variable when either
-		 * RDMAV_FORK_SAFE or IBV_FORK_SAFE is set.
-		 */
-		err = setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
-		if (err) {
-			EFA_WARN(FI_LOG_FABRIC,
-				 "Unable to set environment variable RDMAV_HUGEPAGES_SAFE\n");
-			return NULL;
-		}
-
-		efa_set_rdmav_hugepages_safe = 1;
-	}
-
-	err = pthread_atfork(efa_atfork_callback, NULL, NULL);
-	if (err) {
-		EFA_WARN(FI_LOG_FABRIC,
-			 "Unable to register atfork callback\n");
-		return NULL;
-	}
-
-	if (efa_init_info(&efa_util_prov.info))
-		return NULL;
-
-	return &efa_prov;
+	return efa_init_info(&efa_util_prov.info);
 }
diff --git a/deps/libfabric/prov/efa/src/efa_mr.c b/deps/libfabric/prov/efa/src/efa_mr.c
index f89eb34bc34b6d3551cd845dde52a6f2aeb8c7e1..46238a44b0e895c30519744447a6c5265cd042de 100644
--- a/deps/libfabric/prov/efa/src/efa_mr.c
+++ b/deps/libfabric/prov/efa/src/efa_mr.c
@@ -57,6 +57,58 @@ static struct fi_ops efa_mr_cache_ops = {
 	.ops_open = fi_no_ops_open,
 };
 
+/*
+ * @brief Validate HMEM attributes and populate efa_mr struct
+ *
+ * Check if FI_HMEM is enabled for the domain, validate whether the specific
+ * device type requested is currently supported by the provider, and update the
+ * efa_mr structure based on the attributes requested by the user.
+ *
+ * @param[in]	efa_mr	efa_mr structure to be updated
+ * @param[in]	attr	fi_mr_attr from the user's registration call
+ *
+ * @return FI_SUCCESS or negative FI error code
+ */
+static int efa_mr_hmem_setup(struct efa_mr *efa_mr,
+                             const struct fi_mr_attr *attr)
+{
+	if (attr->iface == FI_HMEM_SYSTEM) {
+		efa_mr->peer.iface = FI_HMEM_SYSTEM;
+		return FI_SUCCESS;
+	} else if (efa_mr->domain->util_domain.info_domain_caps & FI_HMEM) {
+		/*
+		 * Skipping the domain type check above is okay here since
+		 * util_domain is at the beginning of both efa_domain and
+		 * rxr_domain.
+		 */
+		if (ofi_hmem_is_initialized(attr->iface)) {
+			efa_mr->peer.iface = attr->iface;
+		} else {
+			EFA_WARN(FI_LOG_MR,
+				 "FI_HMEM is not initialized for device type %d\n",
+				 attr->iface);
+			return -FI_ENOSYS;
+		}
+	} else {
+		/*
+		 * It's possible that attr->iface is not initialized when
+		 * FI_HMEM is off, so this can't be a fatal error. Print a
+		 * warning in case this value is not FI_HMEM_SYSTEM for
+		 * whatever reason.
+		 */
+		FI_WARN_ONCE(&efa_prov, FI_LOG_MR,
+		             "FI_HMEM support is disabled, assuming FI_HMEM_SYSTEM not type: %d.\n",
+		             attr->iface);
+		efa_mr->peer.iface = FI_HMEM_SYSTEM;
+	}
+
+	if (efa_mr->peer.iface == FI_HMEM_CUDA)
+		efa_mr->peer.device.cuda = attr->device.cuda;
+
+	return FI_SUCCESS;
+}
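+
+/*
+ * Illustrative sketch of a registration that takes the CUDA path above
+ * (the device ordinal 0 is a hypothetical value):
+ *
+ *	struct fi_mr_attr attr = {0};
+ *	attr.iface = FI_HMEM_CUDA;
+ *	attr.device.cuda = 0;
+ *	// ... mr_iov, iov_count, access, etc. ...
+ *	fi_mr_regattr(domain, &attr, 0, &mr);
+ */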
+
+
 int efa_mr_cache_entry_reg(struct ofi_mr_cache *cache,
 			   struct ofi_mr_entry *entry)
 {
@@ -79,12 +131,16 @@ int efa_mr_cache_entry_reg(struct ofi_mr_cache *cache,
 	efa_mr->mr_fid.fid.context = NULL;
 
 	attr.mr_iov = &entry->info.iov;
+	/* ofi_mr_info only stores one iov */
 	attr.iov_count = 1;
 	attr.access = access;
 	attr.offset = 0;
 	attr.requested_key = 0;
 	attr.context = NULL;
-	attr.iface = FI_HMEM_SYSTEM;
+	attr.iface = entry->info.iface;
+
+	if (attr.iface == FI_HMEM_CUDA)
+		attr.device.cuda = entry->info.device;
 
 	ret = efa_mr_reg_impl(efa_mr, 0, (void *)&attr);
 	return ret;
@@ -151,7 +207,6 @@ static int efa_mr_cache_regattr(struct fid *fid, const struct fi_mr_attr *attr,
 	struct efa_mr *efa_mr;
 	struct ofi_mr_entry *entry;
 	int ret;
-	static const int EFA_MR_CACHE_FLUSH_CHECK = 512;
 
 	if (flags & OFI_MR_NOCACHE) {
 		ret = efa_mr_regattr(fid, attr, flags, mr_fid);
@@ -167,10 +222,6 @@ static int efa_mr_cache_regattr(struct fid *fid, const struct fi_mr_attr *attr,
 	domain = container_of(fid, struct efa_domain,
 			      util_domain.domain_fid.fid);
 
-	if (domain->cache->cached_cnt > 0 && domain->cache->cached_cnt % EFA_MR_CACHE_FLUSH_CHECK==0) {
-		ofi_mr_cache_flush(domain->cache, false);
-	}
-
 	ret = ofi_mr_cache_search(domain->cache, attr, &entry);
 	if (OFI_UNLIKELY(ret))
 		return ret;
@@ -178,12 +229,9 @@ static int efa_mr_cache_regattr(struct fid *fid, const struct fi_mr_attr *attr,
 	efa_mr = (struct efa_mr *)entry->data;
 	efa_mr->entry = entry;
 
-	if (domain->util_domain.info_domain_caps & FI_HMEM)
-		efa_mr->peer.iface = attr->iface;
-	else
-		efa_mr->peer.iface = FI_HMEM_SYSTEM;
-	if (efa_mr->peer.iface == FI_HMEM_CUDA)
-		efa_mr->peer.device.cuda = attr->device.cuda;
+	ret = efa_mr_hmem_setup(efa_mr, attr);
+	if (ret)
+		return ret;
 
 	*mr_fid = &efa_mr->mr_fid;
 	return 0;
@@ -248,7 +296,8 @@ static int efa_mr_dereg_impl(struct efa_mr *efa_mr)
 			fi_strerror(-ret));
 		ret = err;
 	}
-	if (rxr_env.enable_shm_transfer && efa_mr->shm_mr) {
+	if (efa_mr->shm_mr) {
+		assert(rxr_env.enable_shm_transfer);
 		err = fi_close(&efa_mr->shm_mr->fid);
 		if (err) {
 			EFA_WARN(FI_LOG_MR,
@@ -260,7 +309,6 @@ static int efa_mr_dereg_impl(struct efa_mr *efa_mr)
 }
 
 static int efa_mr_close(fid_t fid)
-
 {
 	struct efa_mr *efa_mr;
 	int ret;
@@ -293,6 +341,10 @@ static int efa_mr_reg_impl(struct efa_mr *efa_mr, uint64_t flags, void *attr)
 	int fi_ibv_access = 0;
 	int ret = 0;
 
+	ret = efa_mr_hmem_setup(efa_mr, mr_attr);
+	if (ret)
+		return ret;
+
 	/* To support Emulated RMA path, if the access is not supported
 	 * by EFA, modify it to FI_SEND | FI_RECV
 	 */
@@ -307,6 +359,9 @@ static int efa_mr_reg_impl(struct efa_mr *efa_mr, uint64_t flags, void *attr)
 	if (efa_mr->domain->ctx->device_caps & EFADV_DEVICE_ATTR_CAPS_RDMA_READ)
 		fi_ibv_access |= IBV_ACCESS_REMOTE_READ;
 
+	if (efa_mr->domain->cache)
+		ofi_mr_cache_flush(efa_mr->domain->cache, false);
+
 	efa_mr->ibv_mr = ibv_reg_mr(efa_mr->domain->ibv_pd, 
 				    (void *)mr_attr->mr_iov->iov_base,
 				    mr_attr->mr_iov->iov_len, fi_ibv_access);
@@ -318,16 +373,6 @@ static int efa_mr_reg_impl(struct efa_mr *efa_mr, uint64_t flags, void *attr)
 
 	efa_mr->mr_fid.mem_desc = efa_mr;
 	efa_mr->mr_fid.key = efa_mr->ibv_mr->rkey;
-	/*
-	 * Skipping the domain type check is okay here since util_domain is at
-	 * the beginning of efa_domain and rxr_domain.
-	 */
-	if (efa_mr->domain->util_domain.info_domain_caps & FI_HMEM)
-		efa_mr->peer.iface = mr_attr->iface;
-	else
-		efa_mr->peer.iface = FI_HMEM_SYSTEM;
-	if (efa_mr->peer.iface == FI_HMEM_CUDA)
-		efa_mr->peer.device.cuda = mr_attr->device.cuda;
 	assert(efa_mr->mr_fid.key != FI_KEY_NOTAVAIL);
 
 	mr_attr->requested_key = efa_mr->mr_fid.key;
@@ -341,10 +386,11 @@ static int efa_mr_reg_impl(struct efa_mr *efa_mr, uint64_t flags, void *attr)
 			mr_attr->mr_iov->iov_len);
 		return ret;
 	}
-	if (efa_mr->domain->shm_domain && rxr_env.enable_shm_transfer) {
+	if (efa_mr->domain->shm_domain) {
 		/* We need to add FI_REMOTE_READ to allow for Read implemented
 		* message protocols.
 		*/
+		assert(rxr_env.enable_shm_transfer);
 		original_access = mr_attr->access;
 		mr_attr->access |= FI_REMOTE_READ;
 		ret = fi_mr_regattr(efa_mr->domain->shm_domain, attr,
@@ -369,12 +415,32 @@ static int efa_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr,
 {
 	struct fid_domain *domain_fid;
 	struct efa_mr *efa_mr = NULL;
+	uint64_t supported_flags;
 	int ret = 0;
 
-	if (flags && flags != OFI_MR_NOCACHE) {
+	/*
+	 * Notes on supported memory registration flags:
+	 *
+	 * OFI_MR_NOCACHE:
+	 * When the MR cache is enabled, the application's call to
+	 * fi_mr_regattr is directed to efa_mr_cache_regattr(). If
+	 * OFI_MR_NOCACHE was specified, efa_mr_cache_regattr() calls this
+	 * function directly (bypassing the MR cache), so this function
+	 * does nothing special for this flag other than allow it.
+	 *
+	 * FI_HMEM_DEVICE_ONLY:
+	 * This flag is used by providers that need to distinguish whether
+	 * device memory can be accessed from the device only or from the
+	 * host as well. The EFA provider considers all device memory to be
+	 * accessible by the device only. Therefore, this function claims
+	 * support for this flag, but does not save it in efa_mr.
+	 */
+	supported_flags = OFI_MR_NOCACHE | FI_HMEM_DEVICE_ONLY;
+	if (flags & (~supported_flags)) {
 		EFA_WARN(FI_LOG_MR, "Unsupported flag type. requested"
 			 "[0x%" PRIx64 "] supported[0x%" PRIx64 "]\n",
-			 flags, (uint64_t) OFI_MR_NOCACHE);
+			 flags, supported_flags);
 		return -FI_EBADFLAGS;
 	}
 
diff --git a/deps/libfabric/prov/efa/src/efa_msg.c b/deps/libfabric/prov/efa/src/efa_msg.c
index 56c696ccb592735a3c6f2f7c01371a8e771a6240..047aa20350314df7659e50dc9466ee64e6a01285 100644
--- a/deps/libfabric/prov/efa/src/efa_msg.c
+++ b/deps/libfabric/prov/efa/src/efa_msg.c
@@ -123,6 +123,15 @@ static ssize_t efa_post_recv_validate(struct efa_ep *ep, const struct fi_msg *ms
 	return 0;
 }
 
+/**
+ * @brief post receive buffer to EFA device via ibv_post_recv
+ *
+ * @param[in]	ep	endpoint
+ * @param[in]	msg	libfabric message
+ * @param[in]	flags	libfabric flags, currently only FI_MORE is supported.
+ * @return	On success, return 0.
+ *		On failure, return a negative libfabric error code.
+ */
 static ssize_t efa_post_recv(struct efa_ep *ep, const struct fi_msg *msg, uint64_t flags)
 {
 	struct efa_mr *efa_mr;
@@ -170,6 +179,13 @@ static ssize_t efa_post_recv(struct efa_ep *ep, const struct fi_msg *msg, uint64
 		return 0;
 
 	err = ibv_post_recv(qp->ibv_qp, ep->recv_more_wr_head.next, &bad_wr);
+	if (OFI_UNLIKELY(err)) {
+		/* On failure, ibv_post_recv() returns a positive errno,
+		 * while this function returns a negative errno.
+		 * So, we do the conversion here.
+		 */
+		err = (err == ENOMEM) ? -FI_EAGAIN : -err;
+	}
 
 	free_recv_wr_list(ep->recv_more_wr_head.next);
 	ep->recv_more_wr_tail = &ep->recv_more_wr_head;
@@ -287,6 +303,16 @@ static void efa_post_send_sgl(struct efa_ep *ep, const struct fi_msg *msg,
 	}
 }
 
+ssize_t efa_post_flush(struct efa_ep *ep, struct ibv_send_wr **bad_wr)
+{
+	ssize_t ret;
+
+	ret = ibv_post_send(ep->qp->ibv_qp, ep->xmit_more_wr_head.next, bad_wr);
+	free_send_wr_list(ep->xmit_more_wr_head.next);
+	ep->xmit_more_wr_tail = &ep->xmit_more_wr_head;
+	return ret;
+}
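+
+/*
+ * Illustrative application-side sketch of the batching this enables
+ * (msg0..msg2 are hypothetical, fully initialized struct fi_msg
+ * descriptors): sends flagged FI_MORE are only chained onto
+ * xmit_more_wr_head, and the first send without FI_MORE flushes the whole
+ * chain with a single ibv_post_send() via efa_post_flush():
+ *
+ *	fi_sendmsg(ep, &msg0, FI_MORE);	// chained, nothing posted yet
+ *	fi_sendmsg(ep, &msg1, FI_MORE);	// chained
+ *	fi_sendmsg(ep, &msg2, 0);	// posts all three work requests
+ */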
+
 static ssize_t efa_post_send(struct efa_ep *ep, const struct fi_msg *msg, uint64_t flags)
 {
 	struct efa_qp *qp = ep->qp;
@@ -305,7 +331,8 @@ static ssize_t efa_post_send(struct efa_ep *ep, const struct fi_msg *msg, uint64
 
 	memset(ewr, 0, sizeof(*ewr) + sizeof(*ewr->sge) * msg->iov_count);
 	wr = &ewr->wr;
-	conn = ep->av->addr_to_conn(ep->av, msg->addr);
+	conn = efa_av_addr_to_conn(ep->av, msg->addr);
+	assert(conn && conn->ep_addr);
 
 	ret = efa_post_send_validate(ep, msg, conn, flags, &len);
 	if (OFI_UNLIKELY(ret)) {
@@ -315,14 +342,14 @@ static ssize_t efa_post_send(struct efa_ep *ep, const struct fi_msg *msg, uint64
 
 	efa_post_send_sgl(ep, msg, ewr);
 
-	if (flags & FI_INJECT)
+	if (len <= ep->domain->ctx->inline_buf_size)
 		wr->send_flags |= IBV_SEND_INLINE;
 
 	wr->opcode = IBV_WR_SEND;
 	wr->wr_id = (uintptr_t)msg->context;
-	wr->wr.ud.ah = conn->ah.ibv_ah;
-	wr->wr.ud.remote_qpn = conn->ep_addr.qpn;
-	wr->wr.ud.remote_qkey = conn->ep_addr.qkey;
+	wr->wr.ud.ah = conn->ah->ibv_ah;
+	wr->wr.ud.remote_qpn = conn->ep_addr->qpn;
+	wr->wr.ud.remote_qkey = conn->ep_addr->qkey;
 
 	ep->xmit_more_wr_tail->next = wr;
 	ep->xmit_more_wr_tail = wr;
@@ -330,10 +357,7 @@ static ssize_t efa_post_send(struct efa_ep *ep, const struct fi_msg *msg, uint64
 	if (flags & FI_MORE)
 		return 0;
 
-	ret = ibv_post_send(qp->ibv_qp, ep->xmit_more_wr_head.next, &bad_wr);
-
-	free_send_wr_list(ep->xmit_more_wr_head.next);
-	ep->xmit_more_wr_tail = &ep->xmit_more_wr_head;
+	ret = efa_post_flush(ep, &bad_wr);
 
 	return ret;
 
diff --git a/deps/libfabric/prov/efa/src/efa_rma.c b/deps/libfabric/prov/efa/src/efa_rma.c
index 97c681311c7683d5e129f3b5ff70b3e26fccddb3..32ee572e186c21d11a32671811a5f20022a459b4 100644
--- a/deps/libfabric/prov/efa/src/efa_rma.c
+++ b/deps/libfabric/prov/efa/src/efa_rma.c
@@ -101,9 +101,10 @@ ssize_t efa_rma_post_read(struct efa_ep *ep, const struct fi_msg_rma *msg,
 		ibv_wr_set_ud_addr(qp->ibv_qp_ex, ep->self_ah,
 				   qp->qp_num, qp->qkey);
 	} else {
-		conn = ep->av->addr_to_conn(ep->av, msg->addr);
-		ibv_wr_set_ud_addr(qp->ibv_qp_ex, conn->ah.ibv_ah,
-				   conn->ep_addr.qpn, conn->ep_addr.qkey);
+		conn = efa_av_addr_to_conn(ep->av, msg->addr);
+		assert(conn && conn->ep_addr);
+		ibv_wr_set_ud_addr(qp->ibv_qp_ex, conn->ah->ibv_ah,
+				   conn->ep_addr->qpn, conn->ep_addr->qkey);
 	}
 
 	return ibv_wr_complete(qp->ibv_qp_ex);
diff --git a/deps/libfabric/prov/efa/src/rxr/rdm_proto_v4.h b/deps/libfabric/prov/efa/src/rxr/rdm_proto_v4.h
new file mode 100644
index 0000000000000000000000000000000000000000..d0f73731e2b4c11eb55496b0f27938123d452488
--- /dev/null
+++ b/deps/libfabric/prov/efa/src/rxr/rdm_proto_v4.h
@@ -0,0 +1,713 @@
+/*
+ * Copyright (c) 2021 Amazon.com, Inc. or its affiliates.
+ * All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _RXR_PROTO_V4_H
+#define _RXR_PROTO_V4_H
+
+/*
+ * This header file contains constants, flags and data structures
+ * that are defined in EFA RDM protocol v4. Any change to this
+ * header file can potentially break backward compatibility and thus
+ * needs to be reviewed with extra care.
+ *
+ * Section numbers in this file refer to the sections
+ * of the EFA RDM protocol version 4 document.
+ */
+
+#define RXR_PROTOCOL_VERSION	(4)
+
+/* raw address format. (section 1.4) */
+#define EFA_GID_LEN	16
+
+struct efa_ep_addr {
+	uint8_t			raw[EFA_GID_LEN];
+	uint16_t		qpn;
+	uint16_t		pad;
+	uint32_t		qkey;
+	struct efa_ep_addr	*next;
+};
+
+#define EFA_EP_ADDR_LEN sizeof(struct efa_ep_addr)
+
+/*
+ * Extra Feature/Request Flags (section 2.1)
+ */
+#define RXR_EXTRA_FEATURE_RDMA_READ			BIT_ULL(0)
+#define RXR_EXTRA_FEATURE_DELIVERY_COMPLETE 		BIT_ULL(1)
+#define RXR_EXTRA_REQUEST_CONSTANT_HEADER_LENGTH	BIT_ULL(2)
+#define RXR_EXTRA_REQUEST_CONNID_HEADER			BIT_ULL(3)
+#define RXR_NUM_EXTRA_FEATURE_OR_REQUEST		4
+#define RXR_MAX_NUM_EXINFO	(256)
+
+/*
+ * Packet type ID of each packet type (section 1.3)
+ *
+ * Changing a packet type ID would break backward compatibility and is
+ * strictly prohibited.
+ *
+ * New packet types can be added with introduction of an extra feature
+ * (section 2.1)
+ */
+#define RXR_RETIRED_RTS_PKT		1
+#define RXR_RETIRED_CONNACK_PKT		2
+#define RXR_CTS_PKT			3
+#define RXR_DATA_PKT			4
+#define RXR_READRSP_PKT			5
+#define RXR_RMA_CONTEXT_PKT		6
+#define RXR_EOR_PKT			7
+#define RXR_ATOMRSP_PKT 	        8
+#define RXR_HANDSHAKE_PKT		9
+#define RXR_RECEIPT_PKT 		10
+
+#define RXR_REQ_PKT_BEGIN		64
+#define RXR_BASELINE_REQ_PKT_BEGIN	64
+#define RXR_EAGER_MSGRTM_PKT		64
+#define RXR_EAGER_TAGRTM_PKT		65
+#define RXR_MEDIUM_MSGRTM_PKT		66
+#define RXR_MEDIUM_TAGRTM_PKT		67
+#define RXR_LONGCTS_MSGRTM_PKT		68
+#define RXR_LONGCTS_TAGRTM_PKT		69
+#define RXR_EAGER_RTW_PKT		70
+#define RXR_LONGCTS_RTW_PKT		71
+#define RXR_SHORT_RTR_PKT		72
+#define RXR_LONGCTS_RTR_PKT		73
+#define RXR_WRITE_RTA_PKT		74
+#define RXR_FETCH_RTA_PKT		75
+#define RXR_COMPARE_RTA_PKT		76
+#define RXR_BASELINE_REQ_PKT_END	77
+
+#define RXR_EXTRA_REQ_PKT_BEGIN		128
+#define RXR_LONGREAD_MSGRTM_PKT		128
+#define RXR_LONGREAD_TAGRTM_PKT		129
+#define RXR_LONGREAD_RTW_PKT		130
+#define RXR_READ_RTR_PKT		131
+
+#define RXR_DC_REQ_PKT_BEGIN		132
+#define RXR_DC_EAGER_MSGRTM_PKT 	133
+#define RXR_DC_EAGER_TAGRTM_PKT 	134
+#define RXR_DC_MEDIUM_MSGRTM_PKT 	135
+#define RXR_DC_MEDIUM_TAGRTM_PKT 	136
+#define RXR_DC_LONGCTS_MSGRTM_PKT  	137
+#define RXR_DC_LONGCTS_TAGRTM_PKT  	138
+#define RXR_DC_EAGER_RTW_PKT    	139
+#define RXR_DC_LONGCTS_RTW_PKT     	140
+#define RXR_DC_WRITE_RTA_PKT    	141
+#define RXR_DC_REQ_PKT_END		142
+#define RXR_EXTRA_REQ_PKT_END   	142
+
+/*
+ *  Packet fields common to all rxr packets. The other packet headers below must
+ *  be changed if this is updated.
+ */
+struct rxr_base_hdr {
+	uint8_t type;
+	uint8_t version;
+	uint16_t flags;
+};
+
+#if defined(static_assert) && defined(__x86_64__)
+static_assert(sizeof(struct rxr_base_hdr) == 4, "rxr_base_hdr check");
+#endif
+
+/* Universal flags that can be applied on "rxr_base_hdr.flags".
+ *
+ * Universal flags start from the last bit and go backwards.
+ * Because "rxr_base_hdr.flags" is a 16-bits integer, the
+ * last bit is the 15th bit.
+ * Other than universal flags, each packet type defines its
+ * own set of flags, which generally starts from the 0th bit
+ * in "rxr_base_hdr.flags".
+ */
+
+/* indicate this packet has the sender connid */
+#define RXR_PKT_CONNID_HDR		BIT_ULL(15)
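+
+/*
+ * Illustrative sketch: a sender that wants to include its connid sets the
+ * universal flag on top of any per-type flags, and the receiver tests for
+ * it the same way:
+ *
+ *	base_hdr->flags |= RXR_PKT_CONNID_HDR;		// sender side
+ *	if (base_hdr->flags & RXR_PKT_CONNID_HDR)	// receiver side
+ *		// ... the optional connid header/field is present ...
+ */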
+
+struct efa_rma_iov {
+	uint64_t		addr;
+	size_t			len;
+	uint64_t		key;
+};
+
+/*
+ * @brief header format of CTS packet (Packet Type ID 3)
+ *
+ * CTS is used in long-CTS sub-protocols for flow control.
+ *
+ * It is sent from receiver to sender, and contains the number of
+ * bytes the receiver is ready to receive.
+ *
+ * long-CTS is used not only by two-sided communication but also
+ * by emulated write and emulated read protocols.
+ *
+ * In emulated write, requester is sender, and responder is receiver.
+ *
+ * In emulated read, requester is receiver, and responder is sender.
+ */
+struct rxr_cts_hdr {
+	uint8_t type;
+	uint8_t version;
+	uint16_t flags;
+	/* end of rxr_base_hdr */
+	union {
+		uint32_t connid; /* sender connection ID, set when RXR_PKT_CONNID_HDR is on */
+		uint32_t padding; /* otherwise, a padding space to 8 bytes */
+	};
+	uint32_t send_id; /* ID of the send operation on the sender side */
+	uint32_t recv_id; /* ID of the receive operation on the receiver side */
+	uint64_t recv_length; /* number of bytes receiver is ready to receive */
+};
+
+#if defined(static_assert) && defined(__x86_64__)
+static_assert(sizeof(struct rxr_cts_hdr) == 24, "rxr_cts_hdr check");
+#endif
+
+/* this flag indicates the CTS is the response to an RTR packet */
+#define RXR_CTS_READ_REQ		BIT_ULL(7)
+
+
+/*
+ * @brief optional connid header for DATA packet
+ */
+struct rxr_data_opt_connid_hdr {
+	uint32_t connid;
+	uint32_t padding;
+};
+
+/*
+ * @brief header format of DATA packet header (Packet Type ID 4)
+ *
+ * DATA is used in long-CTS sub-protocols.
+ *
+ * It is sent from sender to receiver, and contains a segment
+ * of application data.
+ *
+ * long-CTS is used not only by two-sided communication but also
+ * by emulated write and emulated read protocols.
+ *
+ * In emulated write, requester is sender, and responder is receiver.
+ *
+ * In emulated read, requester is receiver, and responder is sender.
+ */
+struct rxr_data_hdr {
+	uint8_t type;
+	uint8_t version;
+	uint16_t flags;
+	/* end of rxr_base_hdr */
+	uint32_t recv_id; /* ID of the receive operation on receiver */
+	uint64_t seg_length;
+	uint64_t seg_offset;
+	/* optional connid header, present when RXR_PKT_CONNID_HDR is on */
+	struct rxr_data_opt_connid_hdr connid_hdr[0];
+};
+
+#if defined(static_assert) && defined(__x86_64__)
+static_assert(sizeof(struct rxr_data_hdr) == 24, "rxr_data_hdr check");
+#endif
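+
+/*
+ * Worked example: because connid_hdr is a zero-length trailing member, the
+ * on-wire header length depends on the universal flag:
+ *
+ *	size_t hdr_len = sizeof(struct rxr_data_hdr);	// 24 bytes
+ *	if (data_hdr->flags & RXR_PKT_CONNID_HDR)
+ *		hdr_len += sizeof(struct rxr_data_opt_connid_hdr);	// + 8 bytes
+ */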
+
+/*
+ *  @brief READRSP packet header (Packet Type ID 5)
+ *
+ *  READRSP is sent from read responder to read requester, and it contains
+ *  application data.
+ */
+struct rxr_readrsp_hdr {
+	uint8_t type;
+	uint8_t version;
+	uint16_t flags;
+	/* end of rxr_base_hdr */
+	union {
+		uint32_t connid; /* sender connection ID, set when RXR_PKT_CONNID_HDR is on */
+		uint32_t padding; /* otherwise, a padding space to 8 bytes boundary */
+	};
+	uint32_t recv_id; /* ID of the receive operation on the read requester, from rtr packet */
+	uint32_t send_id; /* ID of the send operation on the read responder, will be included in CTS packet */
+	uint64_t seg_length;
+};
+
+#if defined(static_assert) && defined(__x86_64__)
+static_assert(sizeof(struct rxr_readrsp_hdr) == sizeof(struct rxr_data_hdr), "rxr_readrsp_hdr check");
+#endif
+
+struct rxr_readrsp_pkt {
+	struct rxr_readrsp_hdr hdr;
+	char data[];
+};
+
+/*
+ * The RMA Context packet (Packet Type ID 6) is a special type
+ * of packet. It is used as the context of an RMA operation and
+ * thus is not sent over the wire. Therefore its header format is
+ * not part of the protocol, and packet type ID 6 is marked as
+ * reserved in the protocol document.
+ */
+
+/*
+ * @brief format of the EOR packet. (Packet Type ID 7)
+ *
+ * The EOR packet is used in long-read sub-protocols, which are
+ * part of the "RDMA read based data transfer" extra request.
+ *
+ * It is sent from receiver to sender to signal that the data
+ * transfer has finished.
+ *
+ * long-read is used not only by two-sided communication but also
+ * by emulated write.
+ *
+ * In emulated write, requester is sender, and responder is receiver.
+ */
+struct rxr_eor_hdr {
+	uint8_t type;
+	uint8_t version;
+	uint16_t flags;
+	/* end of rxr_base_hdr */
+	uint32_t send_id; /* ID of the send operation on sender */
+	uint32_t recv_id; /* ID of the receive operation on receiver */
+	union {
+		uint32_t connid; /* sender connection ID, optional, set when RXR_PKT_CONNID_HDR is on */
+		uint32_t padding; /* otherwise, a padding space to 8 bytes boundary */
+	};
+};
+
+#if defined(static_assert) && defined(__x86_64__)
+static_assert(sizeof(struct rxr_eor_hdr) == 16, "rxr_eor_hdr check");
+#endif
+
+/**
+ * @brief header format of ATOMRSP packet. (Packet Type ID 8)
+ * ATOMRSP packet is used in emulated fetch/compare atomic sub-protocol.
+ *
+ * It is sent from responder to requester and contains the response
+ * to a fetch/compare atomic request.
+ */
+struct rxr_atomrsp_hdr {
+	uint8_t type;
+	uint8_t version;
+	uint16_t flags;
+	/* end of rxr_base_hdr */
+	union {
+		uint32_t connid; /* sender connid. set when RXR_PKT_CONNID_HDR is on in flags */
+		uint32_t padding; /* otherwise, a padding space to 8 bytes boundary */
+	};
+	uint32_t reserved;
+	uint32_t recv_id;
+	uint64_t seg_length;
+};
+
+#if defined(static_assert) && defined(__x86_64__)
+static_assert(sizeof(struct rxr_atomrsp_hdr) == 24, "rxr_atomrsp_hdr check");
+#endif
+
+struct rxr_atomrsp_pkt {
+	struct rxr_atomrsp_hdr hdr;
+	char data[];
+};
+
+/**
+ * @brief header format of a HANDSHAKE packet
+ *
+ * HANDSHAKE packet is used in the handshake sub-protocol.
+ *
+ * Upon receiving the 1st packet from a peer, an endpoint will
+ * send a HANDSHAKE packet back, which contains its capability bits.
+ */
+struct rxr_handshake_hdr {
+	uint8_t type;
+	uint8_t version;
+	uint16_t flags;
+	/* end of rxr_base_hdr */
+	/* nextra_p3 is the number of members in extra_info plus 3.
+	 * The "p3" part was introduced for backward compatibility.
+	 * See protocol v4 document section 2.1 for details.
+	 */
+	uint32_t nextra_p3;
+	uint64_t extra_info[0];
+};
+
+#if defined(static_assert) && defined(__x86_64__)
+static_assert(sizeof(struct rxr_handshake_hdr) == 8, "rxr_handshake_hdr check");
+#endif
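+
+/*
+ * Worked example: with RXR_NUM_EXTRA_FEATURE_OR_REQUEST = 4, a single
+ * 64-bit extra_info member is enough to hold all capability bits, so a
+ * current sender fills in nextra_p3 = 1 + 3 = 4 and the HANDSHAKE header
+ * occupies 8 + 1 * 8 = 16 bytes.
+ */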
+
+struct rxr_handshake_opt_connid_hdr {
+	uint32_t connid;
+	uint32_t padding; /* padding to 8 bytes boundary */
+};
+
+#if defined(static_assert) && defined(__x86_64__)
+static_assert(sizeof(struct rxr_handshake_opt_connid_hdr) == 8, "rxr_handshake_opt_connid_hdr check");
+#endif
+
+/* @brief header format of RECEIPT packet */
+struct rxr_receipt_hdr {
+	uint8_t type;
+	uint8_t version;
+	uint16_t flags;
+	/* end of rxr_base_hdr */
+	uint32_t tx_id;
+	uint32_t msg_id;
+	union {
+		uint32_t connid; /* sender connection ID, set when RXR_PKT_CONNID_HDR is on */
+		uint32_t padding; /* otherwise, a padding space to 8 bytes */
+	};
+};
+
+/*
+ * The following are REQ packets related constants, flags
+ * and data structures.
+ *
+ * REQ packets can be classified into 4 categories (section 3.1):
+ *    RTM (Request To Message) is used by message
+ *    RTW (Request To Write) is used by RMA write
+ *    RTR (Request To Read) is used by RMA read
+ *    RTA (Request To Atomic) is used by Atomic
+ */
+
+
+/*
+ * REQ Packets common Header Flags (section 3.1)
+ */
+#define RXR_REQ_OPT_RAW_ADDR_HDR	BIT_ULL(0)
+#define RXR_REQ_OPT_CQ_DATA_HDR		BIT_ULL(1)
+#define RXR_REQ_MSG			BIT_ULL(2)
+#define RXR_REQ_TAGGED			BIT_ULL(3)
+#define RXR_REQ_RMA			BIT_ULL(4)
+#define RXR_REQ_ATOMIC			BIT_ULL(5)
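+
+/*
+ * Illustrative only (inferred from the flag names above, not part of
+ * the protocol definition): a tagged eager message REQ would carry
+ * (RXR_REQ_MSG | RXR_REQ_TAGGED) in its flags, while an RMA write REQ
+ * would carry RXR_REQ_RMA.
+ */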
+
+/*
+ * optional headers for REQ packets
+ */
+struct rxr_req_opt_raw_addr_hdr {
+	uint32_t addr_len;
+	char raw_addr[0];
+};
+
+struct rxr_req_opt_cq_data_hdr {
+	int64_t cq_data;
+};
+
+struct rxr_req_opt_connid_hdr {
+	uint32_t connid; /* sender's connection ID */
+};
+
+#define RXR_REQ_OPT_HDR_ALIGNMENT 8
+#define RXR_REQ_OPT_RAW_ADDR_HDR_SIZE (((sizeof(struct rxr_req_opt_raw_addr_hdr) + EFA_EP_ADDR_LEN - 1)/RXR_REQ_OPT_HDR_ALIGNMENT + 1) * RXR_REQ_OPT_HDR_ALIGNMENT)
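+/*
+ * Worked example (illustrative; the value 32 for EFA_EP_ADDR_LEN is
+ * assumed purely for the sake of the arithmetic): with
+ * sizeof(struct rxr_req_opt_raw_addr_hdr) == 4 and EFA_EP_ADDR_LEN == 32,
+ * (4 + 32 - 1)/8 + 1 = 5 and 5 * 8 = 40, i.e. the optional raw-address
+ * header occupies 40 bytes, rounded up to the 8-byte alignment.
+ */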
+
+/*
+ * Base header for all RTM packets
+ */
+struct rxr_rtm_base_hdr {
+	uint8_t type;
+	uint8_t version;
+	uint16_t flags;
+	uint32_t msg_id;
+};
+
+/**
+ * @brief header format of EAGER_MSGRTM packet (Packet Type ID 64)
+ */
+struct rxr_eager_msgrtm_hdr {
+	struct rxr_rtm_base_hdr hdr;
+};
+
+
+/**
+ * @brief header format of EAGER_TAGRTM packet (Packet Type ID 65)
+ */
+struct rxr_eager_tagrtm_hdr {
+	struct rxr_rtm_base_hdr hdr;
+	uint64_t tag;
+};
+
+struct rxr_medium_rtm_base_hdr {
+	struct rxr_rtm_base_hdr hdr;
+	uint64_t msg_length;
+	uint64_t seg_offset;
+};
+
+/**
+ * @brief header format of MEDIUM_MSGRTM packet (Packet Type ID 66)
+ */
+struct rxr_medium_msgrtm_hdr {
+	struct rxr_medium_rtm_base_hdr hdr;
+};
+
+/**
+ * @brief header format of MEDIUM_TAGRTM packet (Packet Type ID 67)
+ */
+struct rxr_medium_tagrtm_hdr {
+	struct rxr_medium_rtm_base_hdr hdr;
+	uint64_t tag;
+};
+
+struct rxr_longcts_rtm_base_hdr {
+	struct rxr_rtm_base_hdr hdr;
+	uint64_t msg_length;
+	uint32_t send_id;
+	uint32_t credit_request;
+};
+
+/**
+ * @brief header format of LONGCTS_MSGRTM packet (Packet Type ID 68)
+ */
+struct rxr_longcts_msgrtm_hdr {
+	struct rxr_longcts_rtm_base_hdr hdr;
+};
+
+/**
+ * @brief header format of LONGCTS_TAGRTM packet (Packet Type ID 69)
+ */
+struct rxr_longcts_tagrtm_hdr {
+	struct rxr_longcts_rtm_base_hdr hdr;
+	uint64_t tag;
+};
+
+struct rxr_rtw_base_hdr {
+	uint8_t type;
+	uint8_t version;
+	uint16_t flags;
+	/* end of rxr_base_hdr */
+	uint32_t rma_iov_count;
+};
+
+/**
+ * @brief header format of EAGER_RTW packet (Packet Type ID 70)
+ */
+struct rxr_eager_rtw_hdr {
+	uint8_t type;
+	uint8_t version;
+	uint16_t flags;
+	/* end of rxr_base_hdr */
+	uint32_t rma_iov_count;
+	struct efa_rma_iov rma_iov[0];
+};
+
+/**
+ * @brief header format of LONGCTS_RTW packet (Packet Type ID 71)
+ */
+struct rxr_longcts_rtw_hdr {
+	uint8_t type;
+	uint8_t version;
+	uint16_t flags;
+	/* end of rxr_base_hdr */
+	uint32_t rma_iov_count;
+	uint64_t msg_length;
+	uint32_t send_id;
+	uint32_t credit_request;
+	struct efa_rma_iov rma_iov[0];
+};
+
+/*
+ * rxr_rtr_hdr is used by both SHORT_RTR (Packet Type ID 72)
+ * and LONGCTS_RTR (Packet Type ID 73)
+ */
+struct rxr_rtr_hdr {
+	uint8_t type;
+	uint8_t version;
+	uint16_t flags;
+	/* end of rxr_base_hdr */
+	uint32_t rma_iov_count;
+	uint64_t msg_length;
+	uint32_t recv_id; /* ID of the receive operation of the read requester; will be included in the DATA/READRSP header */
+	uint32_t recv_length; /* number of bytes that the read requester is ready to receive */
+	struct efa_rma_iov rma_iov[0];
+};
+
+/** @brief rxr_rta_hdr is shared by 4 types of RTA:
+ *    WRITE_RTA (Packet Type ID 74),
+ *    FETCH_RTA (Packet Type ID 75),
+ *    COMPARE_RTA (Packet Type ID 76) and
+ *    DC_WRITE_RTA (Packet Type ID 141)
+ */
+struct rxr_rta_hdr {
+	uint8_t type;
+	uint8_t version;
+	uint16_t flags;
+	uint32_t msg_id;
+	/* end of rtm_base_hdr; atomic packets need msg_id for reordering */
+	uint32_t rma_iov_count;
+	uint32_t atomic_datatype;
+	uint32_t atomic_op;
+	union {
+		/* padding is used by WRITE_RTA to align to 8 bytes */
+		uint32_t padding;
+		/* recv_id is used by FETCH_RTA and COMPARE_RTA. It is the ID of the receive
+		 * operation on the atomic requester; it will be included in the ATOMRSP
+		 * packet header.
+		 */
+		uint32_t recv_id;
+		/* send_id is used by DC_WRITE_RTA. It is the ID of the send operation on
+		 * the atomic requester. It will be included in the RECEIPT packet header.
+		 */
+		uint32_t send_id;
+	};
+
+	struct efa_rma_iov rma_iov[0];
+};
+
+/*
+ * Extra request: RDMA read based data transfer (section 4.1)
+ */
+struct rxr_longread_rtm_base_hdr {
+	struct rxr_rtm_base_hdr hdr;
+	uint64_t msg_length;
+	uint32_t send_id;
+	uint32_t read_iov_count;
+};
+
+/**
+ * @brief header format of LONGREAD_MSGRTM (Packet Type ID 128)
+ */
+struct rxr_longread_msgrtm_hdr {
+	struct rxr_longread_rtm_base_hdr hdr;
+};
+
+/**
+ * @brief header format of LONGREAD_TAGRTM (Packet Type ID 129)
+ */
+struct rxr_longread_tagrtm_hdr {
+	struct rxr_longread_rtm_base_hdr hdr;
+	uint64_t tag;
+};
+
+/**
+ * @brief header format of LONGREAD_RTW (Packet Type ID 130)
+ */
+struct rxr_longread_rtw_hdr {
+	uint8_t type;
+	uint8_t version;
+	uint16_t flags;
+	/* end of rxr_base_hdr */
+	uint32_t rma_iov_count;
+	uint64_t msg_length;
+	uint32_t send_id;
+	uint32_t read_iov_count;
+	struct efa_rma_iov rma_iov[0];
+};
+
+/*
+ * Extra request: delivery complete (section 4.2)
+ */
+
+struct rxr_dc_eager_rtm_base_hdr {
+	uint8_t type;
+	uint8_t version;
+	uint16_t flags;
+	uint32_t msg_id;
+	uint32_t send_id;
+	uint32_t padding;
+};
+
+/**
+ * @brief header format of a DC_EAGER_MSGRTM packet
+ */
+struct rxr_dc_eager_msgrtm_hdr {
+	struct rxr_dc_eager_rtm_base_hdr hdr;
+};
+
+/**
+ * @brief header format of a DC_EAGER_TAGRTM packet
+ */
+struct rxr_dc_eager_tagrtm_hdr {
+	struct rxr_dc_eager_rtm_base_hdr hdr;
+	uint64_t tag;
+};
+
+struct rxr_dc_medium_rtm_base_hdr {
+	struct rxr_rtm_base_hdr hdr;
+	uint32_t send_id;
+	uint32_t padding;
+	uint64_t msg_length;
+	uint64_t seg_offset;
+};
+
+/**
+ * @brief header format of a DC_MEDIUM_MSGRTM packet
+ */
+struct rxr_dc_medium_msgrtm_hdr {
+	struct rxr_dc_medium_rtm_base_hdr hdr;
+};
+
+/**
+ * @brief header format of a DC_MEDIUM_TAGRTM packet
+ */
+struct rxr_dc_medium_tagrtm_hdr {
+	struct rxr_dc_medium_rtm_base_hdr hdr;
+	uint64_t tag;
+};
+
+/**
+ * @brief header format of a DC_LONGCTS_MSGRTM packet
+ */
+struct rxr_dc_longcts_msgrtm_hdr {
+	struct rxr_longcts_rtm_base_hdr hdr;
+};
+
+/**
+ * @brief header format of a DC_LONGCTS_TAGRTM packet
+ */
+struct rxr_dc_longcts_tagrtm_hdr {
+	struct rxr_longcts_rtm_base_hdr hdr;
+	uint64_t tag;
+};
+
+/**
+ * @brief header format of a DC_EAGER_RTW packet
+ */
+struct rxr_dc_eager_rtw_hdr {
+	uint8_t type;
+	uint8_t version;
+	uint16_t flags;
+	/* end of rxr_base_hdr */
+	uint32_t rma_iov_count;
+	/* end of rxr_rtw_base_hdr */
+	uint32_t send_id;
+	uint32_t padding;
+	struct efa_rma_iov rma_iov[0];
+};
+
+/**
+ * @brief header format of a DC_LONGCTS_RTW packet
+ */
+struct rxr_dc_longcts_rtw_hdr {
+	uint8_t type;
+	uint8_t version;
+	uint16_t flags;
+	/* end of rxr_base_hdr */
+	uint32_t rma_iov_count;
+	uint64_t msg_length;
+	uint32_t send_id;
+	uint32_t credit_request;
+	struct efa_rma_iov rma_iov[0];
+};
+
+/* DC_WRITE_RTA header format is merged into rxr_rta_hdr */
+
+#endif
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr.h b/deps/libfabric/prov/efa/src/rxr/rxr.h
index 8397b35a94724292db1c0328928ffd15da156fe1..a0dcfa4e6710157cc09ad475368946f37df784f2 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr.h
+++ b/deps/libfabric/prov/efa/src/rxr/rxr.h
@@ -49,6 +49,7 @@
 #include <rdma/fi_rma.h>
 #include <rdma/fi_tagged.h>
 #include <rdma/fi_trigger.h>
+#include <rdma/fi_ext.h>
 
 #include <ofi.h>
 #include <ofi_iov.h>
@@ -66,15 +67,6 @@
 #include "rxr_pkt_entry.h"
 #include "rxr_pkt_type.h"
 
-/*
- * EFA support interoperability between protocol version 4 and above,
- * and version 4 is considered the base version.
- */
-#define RXR_BASE_PROTOCOL_VERSION	(4)
-#define RXR_CUR_PROTOCOL_VERSION	(4)
-#define RXR_NUM_PROTOCOL_VERSION	(RXR_CUR_PROTOCOL_VERSION - RXR_BASE_PROTOCOL_VERSION + 1)
-#define RXR_MAX_PROTOCOL_VERSION	(100)
-
 #define RXR_FI_VERSION		OFI_VERSION_LATEST
 
 #define RXR_IOV_LIMIT		(4)
@@ -102,8 +94,17 @@ static inline void rxr_poison_mem_region(uint32_t *ptr, size_t size)
 #define RXR_DEF_CQ_SIZE			(8192)
 #define RXR_REMOTE_CQ_DATA_LEN		(8)
 
-/* maximum timeout for RNR backoff (microseconds) */
-#define RXR_DEF_RNR_MAX_TIMEOUT		(1000000)
+/* the default value for rxr_env.rnr_backoff_wait_time_cap */
+#define RXR_DEFAULT_RNR_BACKOFF_WAIT_TIME_CAP	(1000000)
+
+/*
+ * The maximum value for rxr_env.rnr_backoff_wait_time_cap.
+ * Because the backoff wait time is doubled each time RNR is
+ * encountered, its value must stay below INT_MAX/2 to avoid
+ * overflow; therefore its cap must be below INT_MAX/2 too.
+ */
+#define RXR_MAX_RNR_BACKOFF_WAIT_TIME_CAP	(INT_MAX/2 - 1)
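+
+/*
+ * A minimal sketch (illustrative, not the provider code) of the
+ * doubling-with-cap update the comment above implies:
+ *
+ *   peer->rnr_backoff_wait_time =
+ *           MIN(peer->rnr_backoff_wait_time * 2,
+ *               rxr_env.rnr_backoff_wait_time_cap);
+ */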
+
 /* bounds for random RNR backoff timeout */
 #define RXR_RAND_MIN_TIMEOUT		(40)
 #define RXR_RAND_MAX_TIMEOUT		(120)
@@ -156,6 +157,29 @@ static inline void rxr_poison_mem_region(uint32_t *ptr, size_t size)
 #define RXR_MULTI_RECV_POSTED	BIT_ULL(4)
 #define RXR_MULTI_RECV_CONSUMER	BIT_ULL(5)
 
+/*
+ * Flag to tell whether the transmission is using the
+ * FI_DELIVERY_COMPLETE protocols
+ */
+#define RXR_DELIVERY_COMPLETE_REQUESTED	BIT_ULL(6)
+
+/*
+ * Flag to tell whether the sender
+ * has received the RECEIPT packet for the tx_entry.
+ */
+#define RXR_RECEIPT_RECEIVED BIT_ULL(7)
+
+/*
+ * Flag to tell that the
+ * long-CTS (long message) protocol is used
+ */
+#define RXR_LONGCTS_PROTOCOL BIT_ULL(8)
+
+#define RXR_TX_ENTRY_QUEUED_RNR BIT_ULL(9)
+
+#define RXR_RX_ENTRY_QUEUED_RNR BIT_ULL(9)
+
 /*
  * OFI flags
  * The 64-bit flag field is used as follows:
@@ -178,7 +202,6 @@ static inline void rxr_poison_mem_region(uint32_t *ptr, size_t size)
 
 extern struct fi_info *shm_info;
 
-extern struct fi_provider *lower_efa_prov;
 extern struct fi_provider rxr_prov;
 extern struct fi_info rxr_info;
 extern struct rxr_env rxr_env;
@@ -198,7 +221,10 @@ struct rxr_env {
 	int shm_av_size;
 	int shm_max_medium_size;
 	int recvwin_size;
+	int ooo_pool_chunk_size;
+	int unexp_pool_chunk_size;
 	int readcopy_pool_size;
+	int atomrsp_pool_size;
 	int cq_size;
 	size_t max_memcpy_size;
 	size_t mtu_size;
@@ -208,14 +234,25 @@ struct rxr_env {
 	size_t rx_iov_limit;
 	int rx_copy_unexp;
 	int rx_copy_ooo;
-	int max_timeout;
-	int timeout_interval;
+	int rnr_backoff_wait_time_cap; /* unit is us */
+	int rnr_backoff_initial_wait_time; /* unit is us */
 	size_t efa_cq_read_size;
 	size_t shm_cq_read_size;
 	size_t efa_max_medium_msg_size;
 	size_t efa_min_read_msg_size;
 	size_t efa_min_read_write_size;
 	size_t efa_read_segment_size;
+	/* If the first attempt to send a packet failed,
+	 * this value controls how many times the firmware
+	 * retries the send before it reports an RNR error
+	 * (via an rdma-core error cq entry).
+	 *
+	 * Valid values range from
+	 *      0 (no retry)
+	 * to
+	 *      EFA_RNR_INFINITY_RETRY (retry infinitely)
+	 */
+	int rnr_retry;
 };
 
 enum rxr_lower_ep_type {
@@ -226,6 +263,7 @@ enum rxr_lower_ep_type {
 enum rxr_x_entry_type {
 	RXR_TX_ENTRY = 1,
 	RXR_RX_ENTRY,
+	RXR_READ_ENTRY,
 };
 
 enum rxr_tx_comm_type {
@@ -234,19 +272,6 @@ enum rxr_tx_comm_type {
 	RXR_TX_SEND,		/* tx_entry sending data in progress */
 	RXR_TX_QUEUED_SHM_RMA,	/* tx_entry was unable to send RMA operations over shm provider */
 	RXR_TX_QUEUED_CTRL,	/* tx_entry was unable to send ctrl packet */
-	RXR_TX_QUEUED_REQ_RNR,  /* tx_entry RNR sending REQ packet */
-	RXR_TX_QUEUED_DATA_RNR,	/* tx_entry RNR sending data packets */
-	RXR_TX_SENT_READRSP,	/* tx_entry (on remote EP) sent
-				 * read response (FI_READ only)
-				 */
-	RXR_TX_QUEUED_READRSP, /* tx_entry (on remote EP) was
-				* unable to send read response
-				* (FI_READ only)
-				*/
-	RXR_TX_WAIT_READ_FINISH, /* tx_entry (on initiating EP) wait
-				  * for rx_entry to finish receiving
-				  * (FI_READ only)
-				  */
 };
 
 enum rxr_rx_comm_type {
@@ -255,54 +280,50 @@ enum rxr_rx_comm_type {
 	RXR_RX_UNEXP,		/* rx_entry unexp msg waiting for post recv */
 	RXR_RX_MATCHED,		/* rx_entry matched with RTM */
 	RXR_RX_RECV,		/* rx_entry large msg recv data pkts */
-	RXR_RX_QUEUED_CTRL,	/* rx_entry was unable to send ctrl packet */
-	RXR_RX_QUEUED_EOR,	/* rx_entry was unable to send EOR over shm */
-	RXR_RX_QUEUED_CTS_RNR,	/* rx_entry RNR sending CTS */
+	RXR_RX_QUEUED_CTRL,	/* rx_entry encountered an error when sending a ctrl
+				   packet; it is on rxr_ep->rx_queued_entry_list and
+				   the progress engine will resend the ctrl packet */
 	RXR_RX_WAIT_READ_FINISH, /* rx_entry wait for send to finish, FI_READ */
 	RXR_RX_WAIT_ATOMRSP_SENT, /* rx_entry wait for atomrsp packet sent completion */
 };
 
-enum rxr_rx_buf_owner {
-	RXR_RX_PROV_BUF = 0,	 /* Bounce buffers allocated and owned by provider */
-	RXR_RX_USER_BUF,	 /* Recv buffers posted by applications */
-};
-
 #define RXR_PEER_REQ_SENT BIT_ULL(0) /* sent a REQ to the peer, peer should send a handshake back */
-#define RXR_PEER_HANDSHAKE_SENT BIT_ULL(1)
+#define RXR_PEER_HANDSHAKE_SENT BIT_ULL(1) /* a handshake packet has been sent to a peer */
 #define RXR_PEER_HANDSHAKE_RECEIVED BIT_ULL(2)
 #define RXR_PEER_IN_BACKOFF BIT_ULL(3) /* peer is in backoff, not allowed to send */
-#define RXR_PEER_BACKED_OFF BIT_ULL(4) /* peer backoff was increased during this loop of the progress engine */
-
-struct rxr_fabric {
-	struct util_fabric util_fabric;
-	struct fid_fabric *lower_fabric;
-	struct fid_fabric *shm_fabric;
-#ifdef RXR_PERF_ENABLED
-	struct ofi_perfset perf_set;
-#endif
-};
-
+/*
+ * An FI_EAGAIN error was encountered when sending a handshake to this peer;
+ * the peer was put on rxr_ep->handshake_queued_peer_list.
+ * The progress engine will retry sending the handshake.
+ */
+#define RXR_PEER_HANDSHAKE_QUEUED      BIT_ULL(5)
 #define RXR_MAX_NUM_PROTOCOLS (RXR_MAX_PROTOCOL_VERSION - RXR_BASE_PROTOCOL_VERSION + 1)
 
-struct rxr_peer {
-	bool tx_init;			/* tracks initialization of tx state */
-	bool rx_init;			/* tracks initialization of rx state */
+struct rdm_peer {
 	bool is_self;			/* self flag */
 	bool is_local;			/* local/remote peer flag */
+	fi_addr_t efa_fiaddr;		/* fi_addr_t addr from efa provider */
 	fi_addr_t shm_fiaddr;		/* fi_addr_t addr from shm provider */
-	struct rxr_robuf *robuf;	/* tracks expected msg_id on rx */
+	struct rxr_robuf robuf;		/* tracks expected msg_id on rx */
+	uint32_t prev_qkey;		/* each peer has unique gid+qpn. the qkey can change */
 	uint32_t next_msg_id;		/* sender's view of msg_id */
 	uint32_t flags;
-	uint32_t maxproto;		/* maximum supported protocol version by this peer */
-	uint64_t features[RXR_MAX_NUM_PROTOCOLS]; /* the feature flag for each version */
-	size_t tx_pending;		/* tracks pending tx ops to this peer */
+	uint32_t nextra_p3;		/* number of members in extra_info plus 3 */
+	uint64_t extra_info[RXR_MAX_NUM_EXINFO]; /* the feature/request flag for each version */
+	size_t efa_outstanding_tx_ops;	/* tracks outstanding tx ops to this peer on EFA device */
+	size_t shm_outstanding_tx_ops;  /* tracks outstanding tx ops to this peer on SHM */
+	struct dlist_entry outstanding_tx_pkts; /* a list of outstanding tx pkts to the peer */
 	uint16_t tx_credits;		/* available send credits */
 	uint16_t rx_credits;		/* available credits to allocate */
-	uint64_t rnr_ts;		/* timestamp for RNR backoff tracking */
+	uint64_t rnr_backoff_begin_ts;	/* timestamp when the RNR backoff period begins */
+	uint64_t rnr_backoff_wait_time;	/* how long the RNR backoff period lasts */
 	int rnr_queued_pkt_cnt;		/* queued RNR packet count */
-	int timeout_interval;		/* initial RNR timeout value */
-	int rnr_timeout_exp;		/* RNR timeout exponentation calc val */
-	struct dlist_entry rnr_entry;	/* linked to rxr_ep peer_backoff_list */
+	struct dlist_entry rnr_backoff_entry;	/* linked to rxr_ep peer_backoff_list */
+	struct dlist_entry handshake_queued_entry; /* linked with rxr_ep->handshake_queued_peer_list */
+	struct dlist_entry rx_unexp_list; /* a list of unexpected untagged rx_entry for this peer */
+	struct dlist_entry rx_unexp_tagged_list; /* a list of unexpected tagged rx_entry for this peer */
+	struct dlist_entry tx_entry_list; /* a list of tx_entry related to this peer */
+	struct dlist_entry rx_entry_list; /* a list of rx_entry related to this peer */
 };
 
 struct rxr_queued_ctrl_info {
@@ -333,6 +354,7 @@ struct rxr_rx_entry {
 	enum rxr_x_entry_type type;
 
 	fi_addr_t addr;
+	struct rdm_peer *peer;
 
 	/*
 	 * freestack ids used to lookup rx_entry during pkt recv
@@ -373,7 +395,6 @@ struct rxr_rx_entry {
 
 	/* App-provided buffers and descriptors */
 	void *desc[RXR_IOV_LIMIT];
-	enum rxr_rx_buf_owner owner;
 	struct fi_msg *posted_recv;
 
 	/* iov_count on sender side, used for large message READ over shm */
@@ -385,8 +406,13 @@ struct rxr_rx_entry {
 	/* entry is linked with rx entry lists in rxr_ep */
 	struct dlist_entry entry;
 
-	/* queued_entry is linked with rx_queued_ctrl_list in rxr_ep */
-	struct dlist_entry queued_entry;
+	struct dlist_entry peer_unexp_entry; /* linked to peer->rx_unexp_list or peer->rx_unexp_tagged_list */
+
+	/* queued_ctrl_entry is linked with rx_queued_ctrl_list in rxr_ep */
+	struct dlist_entry queued_ctrl_entry;
+
+	/* queued_rnr_entry is linked with rx_queued_rnr_list in rxr_ep */
+	struct dlist_entry queued_rnr_entry;
 
 	/* Queued packets due to TX queue full or RNR backoff */
 	struct dlist_entry queued_pkts;
@@ -403,14 +429,16 @@ struct rxr_rx_entry {
 	struct rxr_rx_entry *master_entry;
 
 	struct rxr_pkt_entry *unexp_pkt;
-	struct rxr_pkt_entry *atomrsp_pkt;
-	char *atomrsp_buf;
+	char *atomrsp_data;
 
+	/* linked with rx_entry_list in rdm_peer */
+	struct dlist_entry peer_entry;
+
+	/* linked with rx_entry_list in rxr_ep */
+	struct dlist_entry ep_entry;
 #if ENABLE_DEBUG
 	/* linked with rx_pending_list in rxr_ep */
 	struct dlist_entry rx_pending_entry;
-	/* linked with rx_entry_list in rxr_ep */
-	struct dlist_entry rx_entry_entry;
 #endif
 };
 
@@ -420,6 +448,7 @@ struct rxr_tx_entry {
 
 	uint32_t op;
 	fi_addr_t addr;
+	struct rdm_peer *peer;
 
 	/*
 	 * freestack ids used to lookup tx_entry during ctrl pkt recv
@@ -443,7 +472,8 @@ struct rxr_tx_entry {
 	struct rxr_queued_ctrl_info queued_ctrl;
 
 	uint64_t fi_flags;
-	uint64_t send_flags;
+	uint64_t rxr_flags;
+
 	size_t iov_count;
 	size_t iov_index;
 	size_t iov_offset;
@@ -470,16 +500,20 @@ struct rxr_tx_entry {
 	/* entry is linked with tx_pending_list in rxr_ep */
 	struct dlist_entry entry;
 
-	/* queued_entry is linked with tx_queued_ctrl_list in rxr_ep */
-	struct dlist_entry queued_entry;
+	/* queued_ctrl_entry is linked with tx_queued_ctrl_list in rxr_ep */
+	struct dlist_entry queued_ctrl_entry;
+
+	/* queued_rnr_entry is linked with tx_queued_rnr_list in rxr_ep */
+	struct dlist_entry queued_rnr_entry;
 
 	/* Queued packets due to TX queue full or RNR backoff */
 	struct dlist_entry queued_pkts;
 
-#if ENABLE_DEBUG
+	/* peer_entry is linked with tx_entry_list in rdm_peer */
+	struct dlist_entry peer_entry;
+
 	/* linked with tx_entry_list in rxr_ep */
-	struct dlist_entry tx_entry_entry;
-#endif
+	struct dlist_entry ep_entry;
 };
 
 #define RXR_GET_X_ENTRY_TYPE(pkt_entry)	\
@@ -497,12 +531,10 @@ struct rxr_domain {
 	struct fid_domain *rdm_domain;
 	size_t mtu_size;
 	size_t addrlen;
-	uint8_t mr_local;
 	uint8_t rxr_mr_local;
 	uint64_t rdm_mode;
 	int do_progress;
 	size_t cq_size;
-	enum fi_resource_mgmt resource_mgmt;
 };
 
 struct rxr_ep {
@@ -511,14 +543,8 @@ struct rxr_ep {
 	uint8_t core_addr[RXR_MAX_NAME_LENGTH];
 	size_t core_addrlen;
 
-	/* per-version feature flag */
-	uint64_t features[RXR_NUM_PROTOCOL_VERSION];
-
-	/* per-peer information */
-	struct rxr_peer *peer;
-
-	/* bufpool for reorder buffer */
-	struct ofi_bufpool *robuf_pool;
+	/* per-version extra feature/request flag */
+	uint64_t extra_info[RXR_MAX_NUM_EXINFO];
 
 	/* core provider fid */
 	struct fid_ep *rdm_ep;
@@ -539,6 +565,7 @@ struct rxr_ep {
 	size_t mtu_size;
 	size_t rx_iov_limit;
 	size_t tx_iov_limit;
+	size_t inject_size;
 
 	/* core's capabilities */
 	uint64_t core_caps;
@@ -546,9 +573,12 @@ struct rxr_ep {
 	/* Endpoint's capability to support zero-copy rx */
 	bool use_zcpy_rx;
 
+	/* Application requested resource management support */
+	int handle_resource_management;
+
 	/* rx/tx queue size of core provider */
 	size_t core_rx_size;
-	size_t max_outstanding_tx;
+	size_t efa_max_outstanding_tx_ops;
 	size_t core_inject_size;
 	size_t max_data_payload_size;
 
@@ -563,6 +593,9 @@ struct rxr_ep {
 	/* Application's maximum msg size hint */
 	size_t max_msg_size;
 
+	/* Application's message prefix size. */
+	size_t msg_prefix_size;
+
 	/* RxR protocol's max header size */
 	size_t max_proto_hdr_size;
 
@@ -573,15 +606,21 @@ struct rxr_ep {
 	size_t min_multi_recv_size;
 
 	/* buffer pool for send & recv */
-	struct ofi_bufpool *tx_pkt_efa_pool;
-	struct ofi_bufpool *rx_pkt_efa_pool;
+	struct ofi_bufpool *efa_tx_pkt_pool;
+	struct ofi_bufpool *efa_rx_pkt_pool;
+
+	/*
+	 * buffer pool for rxr_pkt_sendv struct, which is used
+	 * to store iovec-related information
+	 */
+	struct ofi_bufpool *pkt_sendv_pool;
 
 	/*
 	 * buffer pool for send & recv for shm as mtu size is different from
 	 * the one of efa, and do not require local memory registration
 	 */
-	struct ofi_bufpool *tx_pkt_shm_pool;
-	struct ofi_bufpool *rx_pkt_shm_pool;
+	struct ofi_bufpool *shm_tx_pkt_pool;
+	struct ofi_bufpool *shm_rx_pkt_pool;
 
 	/* staging area for unexpected and out-of-order packets */
 	struct ofi_bufpool *rx_unexp_pkt_pool;
@@ -608,6 +647,11 @@ struct rxr_ep {
 	struct ofi_bufpool *map_entry_pool;
 	/* rxr medium message pkt_entry to rx_entry map */
 	struct rxr_pkt_rx_map *pkt_rx_map;
+	/*
+	 * buffer pool for atomic response data, used by
+	 * emulated fetch and compare atomic.
+	 */
+	struct ofi_bufpool *rx_atomrsp_pool;
 	/* rx_entries with recv buf */
 	struct dlist_entry rx_list;
 	/* rx_entries without recv buf (unexpected message) */
@@ -620,16 +664,22 @@ struct rxr_ep {
 	struct dlist_entry rx_posted_buf_list;
 	/* list of pre-posted recv buffers for shm */
 	struct dlist_entry rx_posted_buf_shm_list;
-	/* tx entries with queued messages */
-	struct dlist_entry tx_entry_queued_list;
-	/* rx entries with queued messages */
-	struct dlist_entry rx_entry_queued_list;
+	/* tx entries with queued ctrl packets */
+	struct dlist_entry tx_entry_queued_ctrl_list;
+	/* tx entries with queued rnr packets */
+	struct dlist_entry tx_entry_queued_rnr_list;
+	/* rx entries with queued ctrl packets */
+	struct dlist_entry rx_entry_queued_ctrl_list;
+	/* rx entries with queued rnr packets */
+	struct dlist_entry rx_entry_queued_rnr_list;
 	/* tx_entries with data to be sent (large messages) */
 	struct dlist_entry tx_pending_list;
 	/* read entries with data to be read */
 	struct dlist_entry read_pending_list;
 	/* rxr_peer entries that are in backoff due to RNR */
 	struct dlist_entry peer_backoff_list;
+	/* rxr_peer entries that will retry posting handshake pkt */
+	struct dlist_entry handshake_queued_peer_list;
 
 #if ENABLE_DEBUG
 	/* rx_entries waiting for data to arrive (large messages) */
@@ -643,29 +693,43 @@ struct rxr_ep {
 	/* tx packets waiting for send completion */
 	struct dlist_entry tx_pkt_list;
 
-	/* track allocated rx_entries and tx_entries for endpoint cleanup */
-	struct dlist_entry rx_entry_list;
-	struct dlist_entry tx_entry_list;
-
-	size_t sends;
+	size_t efa_total_posted_tx_ops;
+	size_t shm_total_posted_tx_ops;
 	size_t send_comps;
 	size_t failed_send_comps;
 	size_t recv_comps;
 #endif
-	/* number of posted buffer for shm */
-	size_t posted_bufs_shm;
-	size_t rx_bufs_shm_to_post;
+	/* track allocated rx_entries and tx_entries for endpoint cleanup */
+	struct dlist_entry rx_entry_list;
+	struct dlist_entry tx_entry_list;
 
-	/* number of posted buffers */
-	size_t posted_bufs_efa;
-	size_t rx_bufs_efa_to_post;
+	/*
+	 * number of posted RX packets for shm
+	 */
+	size_t shm_rx_pkts_posted;
+	/*
+	 * number of RX packets to be posted by progress engine for shm.
+	 * It exists because posting RX packets in bulk is more efficient.
+	 */
+	size_t shm_rx_pkts_to_post;
+	/*
+	 * number of posted RX packets for EFA device
+	 */
+	size_t efa_rx_pkts_posted;
+	/*
+	 * Number of RX packets to be posted by progress engine for EFA device.
+	 * It exists because posting RX packets in bulk is more efficient.
+	 */
+	size_t efa_rx_pkts_to_post;
 	/* number of buffers available for large messages */
 	size_t available_data_bufs;
 	/* Timestamp of when available_data_bufs was exhausted. */
 	uint64_t available_data_bufs_ts;
 
-	/* number of outstanding sends */
-	size_t tx_pending;
+	/* number of outstanding tx ops on efa device */
+	size_t efa_outstanding_tx_ops;
+	/* number of outstanding tx ops on shm */
+	size_t shm_outstanding_tx_ops;
 };
 
 #define rxr_rx_flags(rxr_ep) ((rxr_ep)->util_ep.rx_op_flags)
@@ -687,11 +751,6 @@ static inline void rxr_copy_shm_cq_entry(struct fi_cq_tagged_entry *cq_tagged_en
 	cq_tagged_entry->tag = 0; // No tag for RMA;
 
 }
-static inline struct rxr_peer *rxr_ep_get_peer(struct rxr_ep *ep,
-					       fi_addr_t addr)
-{
-	return &ep->peer[addr];
-}
 
 static inline void rxr_setup_msg(struct fi_msg *msg, const struct iovec *iov, void **desc,
 				 size_t count, fi_addr_t addr, void *context, uint32_t data)
@@ -704,24 +763,13 @@ static inline void rxr_setup_msg(struct fi_msg *msg, const struct iovec *iov, vo
 	msg->data = data;
 }
 
-static inline void rxr_ep_peer_init_rx(struct rxr_ep *ep, struct rxr_peer *peer)
-{
-	assert(!peer->rx_init);
-
-	peer->robuf = ofi_buf_alloc(ep->robuf_pool);
-	assert(peer->robuf);
-	peer->robuf = ofi_recvwin_buf_alloc(peer->robuf,
-					    rxr_env.recvwin_size);
-	peer->rx_credits = rxr_env.rx_window_size;
-	peer->rx_init = 1;
-}
+struct efa_ep_addr *rxr_ep_raw_addr(struct rxr_ep *ep);
 
-static inline void rxr_ep_peer_init_tx(struct rxr_peer *peer)
-{
-	assert(!peer->tx_init);
-	peer->tx_credits = rxr_env.tx_max_credits;
-	peer->tx_init = 1;
-}
+const char *rxr_ep_raw_addr_str(struct rxr_ep *ep, char *buf, size_t *buflen);
+
+struct efa_ep_addr *rxr_peer_raw_addr(struct rxr_ep *ep, fi_addr_t addr);
+
+const char *rxr_peer_raw_addr_str(struct rxr_ep *ep, fi_addr_t addr, char *buf, size_t *buflen);
 
 struct rxr_rx_entry *rxr_ep_get_rx_entry(struct rxr_ep *ep,
 					 const struct fi_msg *msg,
@@ -747,30 +795,33 @@ struct rxr_tx_entry *rxr_ep_alloc_tx_entry(struct rxr_ep *rxr_ep,
 					   uint64_t tag,
 					   uint64_t flags);
 
-int rxr_tx_entry_mr_dereg(struct rxr_tx_entry *tx_entry);
+void rxr_release_tx_entry(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry);
 
-static inline void rxr_release_tx_entry(struct rxr_ep *ep,
-					struct rxr_tx_entry *tx_entry)
-{
-#if ENABLE_DEBUG
-	dlist_remove(&tx_entry->tx_entry_entry);
-#endif
-	assert(dlist_empty(&tx_entry->queued_pkts));
-#ifdef ENABLE_EFA_POISONING
-	rxr_poison_mem_region((uint32_t *)tx_entry,
-			      sizeof(struct rxr_tx_entry));
-#endif
-	tx_entry->state = RXR_TX_FREE;
-	ofi_buf_free(tx_entry);
-}
+struct rxr_rx_entry *rxr_ep_alloc_rx_entry(struct rxr_ep *ep,
+					   fi_addr_t addr, uint32_t op);
 
 static inline void rxr_release_rx_entry(struct rxr_ep *ep,
 					struct rxr_rx_entry *rx_entry)
 {
-#if ENABLE_DEBUG
-	dlist_remove(&rx_entry->rx_entry_entry);
-#endif
-	assert(dlist_empty(&rx_entry->queued_pkts));
+	struct rxr_pkt_entry *pkt_entry;
+	struct dlist_entry *tmp;
+
+	if (rx_entry->peer)
+		dlist_remove(&rx_entry->peer_entry);
+
+	dlist_remove(&rx_entry->ep_entry);
+
+	if (!dlist_empty(&rx_entry->queued_pkts)) {
+		dlist_foreach_container_safe(&rx_entry->queued_pkts,
+					     struct rxr_pkt_entry,
+					     pkt_entry, entry, tmp) {
+			rxr_pkt_entry_release_tx(ep, pkt_entry);
+		}
+		dlist_remove(&rx_entry->queued_rnr_entry);
+	} else if (rx_entry->state == RXR_RX_QUEUED_CTRL) {
+		dlist_remove(&rx_entry->queued_ctrl_entry);
+	}
+
 #ifdef ENABLE_EFA_POISONING
 	rxr_poison_mem_region((uint32_t *)rx_entry,
 			      sizeof(struct rxr_rx_entry));
@@ -790,27 +841,9 @@ static inline int rxr_match_tag(uint64_t tag, uint64_t ignore,
 	return ((tag | ignore) == (match_tag | ignore));
 }
 
-static inline void rxr_ep_inc_tx_pending(struct rxr_ep *ep,
-					 struct rxr_peer *peer)
-{
-	ep->tx_pending++;
-	peer->tx_pending++;
-#if ENABLE_DEBUG
-	ep->sends++;
-#endif
-}
+void rxr_ep_record_tx_op_submitted(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry);
 
-static inline void rxr_ep_dec_tx_pending(struct rxr_ep *ep,
-					 struct rxr_peer *peer,
-					 int failed)
-{
-	ep->tx_pending--;
-	peer->tx_pending--;
-#if ENABLE_DEBUG
-	if (failed)
-		ep->failed_send_comps++;
-#endif
-}
+void rxr_ep_record_tx_op_completed(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry);
 
 static inline size_t rxr_get_rx_pool_chunk_cnt(struct rxr_ep *ep)
 {
@@ -819,7 +852,7 @@ static inline size_t rxr_get_rx_pool_chunk_cnt(struct rxr_ep *ep)
 
 static inline size_t rxr_get_tx_pool_chunk_cnt(struct rxr_ep *ep)
 {
-	return MIN(ep->max_outstanding_tx, ep->tx_size);
+	return MIN(ep->efa_max_outstanding_tx_ops, ep->tx_size);
 }
 
 static inline int rxr_need_sas_ordering(struct rxr_ep *ep)
@@ -845,8 +878,6 @@ int rxr_get_lower_rdm_info(uint32_t version, const char *node, const char *servi
 			   uint64_t flags, const struct util_prov *util_prov,
 			   const struct fi_info *util_hints,
 			   struct fi_info **core_info);
-int rxr_fabric(struct fi_fabric_attr *attr,
-	       struct fid_fabric **fabric, void *context);
 int rxr_domain_open(struct fid_fabric *fabric, struct fi_info *info,
 		    struct fid_domain **dom, void *context);
 int rxr_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr,
@@ -857,8 +888,9 @@ int rxr_endpoint(struct fid_domain *domain, struct fi_info *info,
 /* EP sub-functions */
 void rxr_ep_progress(struct util_ep *util_ep);
 void rxr_ep_progress_internal(struct rxr_ep *rxr_ep);
-int rxr_ep_post_buf(struct rxr_ep *ep, const struct fi_msg *posted_recv,
-		    uint64_t flags, enum rxr_lower_ep_type lower_ep);
+
+int rxr_ep_post_user_recv_buf(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry,
+			      uint64_t flags);
 
 int rxr_ep_set_tx_credit_request(struct rxr_ep *rxr_ep,
 				 struct rxr_tx_entry *tx_entry);
@@ -867,6 +899,8 @@ int rxr_ep_tx_init_mr_desc(struct rxr_domain *rxr_domain,
 			   struct rxr_tx_entry *tx_entry,
 			   int mr_iov_start, uint64_t access);
 
+void rxr_convert_desc_for_shm(int numdesc, void **desc);
+
 void rxr_prepare_desc_send(struct rxr_domain *rxr_domain,
 			   struct rxr_tx_entry *tx_entry);
 
@@ -887,20 +921,24 @@ struct rxr_rx_entry *rxr_ep_split_rx_entry(struct rxr_ep *ep,
 					   struct rxr_rx_entry *posted_entry,
 					   struct rxr_rx_entry *consumer_entry,
 					   struct rxr_pkt_entry *pkt_entry);
-int rxr_ep_efa_addr_to_str(const void *addr, char *temp_name);
+
+int rxr_raw_addr_to_smr_name(void *addr, char *smr_name, size_t *smr_name_len);
 
 /* CQ sub-functions */
-int rxr_cq_handle_rx_error(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry,
-			   ssize_t prov_errno);
-int rxr_cq_handle_tx_error(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry,
-			   ssize_t prov_errno);
-int rxr_cq_handle_cq_error(struct rxr_ep *ep, ssize_t err);
+void rxr_cq_write_rx_error(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry,
+			   int err, int prov_errno);
+
+void rxr_cq_write_tx_error(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry,
+			   int err, int prov_errno);
+
+void rxr_cq_queue_rnr_pkt(struct rxr_ep *ep,
+			  struct dlist_entry *list,
+			  struct rxr_pkt_entry *pkt_entry);
 
 void rxr_cq_write_rx_completion(struct rxr_ep *ep,
 				struct rxr_rx_entry *rx_entry);
 
 void rxr_cq_handle_rx_completion(struct rxr_ep *ep,
-				 struct rxr_pkt_entry *pkt_entry,
 				 struct rxr_rx_entry *rx_entry);
 
 void rxr_cq_write_tx_completion(struct rxr_ep *ep,
@@ -914,11 +952,11 @@ void rxr_cq_handle_shm_completion(struct rxr_ep *ep,
 				  fi_addr_t src_addr);
 
 int rxr_cq_reorder_msg(struct rxr_ep *ep,
-		       struct rxr_peer *peer,
+		       struct rdm_peer *peer,
 		       struct rxr_pkt_entry *pkt_entry);
 
 void rxr_cq_proc_pending_items_in_recvwin(struct rxr_ep *ep,
-					  struct rxr_peer *peer);
+					  struct rdm_peer *peer);
 
 void rxr_cq_handle_shm_rma_write_data(struct rxr_ep *ep,
 				      struct fi_cq_data_entry *shm_comp,
@@ -961,14 +999,6 @@ static inline struct rxr_domain *rxr_ep_domain(struct rxr_ep *ep)
 	return container_of(ep->util_ep.domain, struct rxr_domain, util_domain);
 }
 
-static inline uint8_t rxr_ep_mr_local(struct rxr_ep *ep)
-{
-	struct rxr_domain *domain = container_of(ep->util_ep.domain,
-						 struct rxr_domain,
-						 util_domain);
-	return domain->mr_local;
-}
-
 /*
  * today we have only cq res check, in future we will have ctx, and other
  * resource check as well.
@@ -1003,47 +1033,4 @@ static inline void rxr_rm_tx_cq_check(struct rxr_ep *ep, struct util_cq *tx_cq)
 	fastlock_release(&tx_cq->cq_lock);
 }
 
-static inline bool rxr_peer_timeout_expired(struct rxr_ep *ep,
-					    struct rxr_peer *peer,
-					    uint64_t ts)
-{
-	return (ts >= (peer->rnr_ts + MIN(rxr_env.max_timeout,
-					  peer->timeout_interval *
-					  (1 << peer->rnr_timeout_exp))));
-}
-
-/* Performance counter declarations */
-#ifdef RXR_PERF_ENABLED
-#define RXR_PERF_FOREACH(DECL)	\
-	DECL(perf_rxr_tx),	\
-	DECL(perf_rxr_recv),	\
-	DECL(rxr_perf_size)	\
-
-enum rxr_perf_counters {
-	RXR_PERF_FOREACH(OFI_ENUM_VAL)
-};
-
-extern const char *rxr_perf_counters_str[];
-
-static inline void rxr_perfset_start(struct rxr_ep *ep, size_t index)
-{
-	struct rxr_domain *domain = rxr_ep_domain(ep);
-	struct rxr_fabric *fabric = container_of(domain->util_domain.fabric,
-						 struct rxr_fabric,
-						 util_fabric);
-	ofi_perfset_start(&fabric->perf_set, index);
-}
-
-static inline void rxr_perfset_end(struct rxr_ep *ep, size_t index)
-{
-	struct rxr_domain *domain = rxr_ep_domain(ep);
-	struct rxr_fabric *fabric = container_of(domain->util_domain.fabric,
-						 struct rxr_fabric,
-						 util_fabric);
-	ofi_perfset_end(&fabric->perf_set, index);
-}
-#else
-#define rxr_perfset_start(ep, index) do {} while (0)
-#define rxr_perfset_end(ep, index) do {} while (0)
-#endif
 #endif
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_atomic.c b/deps/libfabric/prov/efa/src/rxr/rxr_atomic.c
index 693818d8493782f67bb376883c31e601e0ffc58a..a6ee1f1a0d1d8d53b7d90fc4dfd250cc28cda335 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_atomic.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_atomic.c
@@ -32,15 +32,17 @@
  */
 
 #include <ofi_atomic.h>
+#include "efa.h"
 #include "rxr.h"
 #include "rxr_rma.h"
 #include "rxr_cntr.h"
 #include "rxr_atomic.h"
 #include "rxr_pkt_cmd.h"
 
-static void rxr_atomic_copy_shm_msg(struct fi_msg_atomic *shm_msg,
+static void rxr_atomic_init_shm_msg(struct fi_msg_atomic *shm_msg,
 				    const struct fi_msg_atomic *msg,
-				    struct fi_rma_ioc *rma_iov)
+				    struct fi_rma_ioc *rma_iov,
+				    void **shm_desc)
 {
 	int i;
 
@@ -53,6 +55,14 @@ static void rxr_atomic_copy_shm_msg(struct fi_msg_atomic *shm_msg,
 			rma_iov[i].addr = 0;
 		shm_msg->rma_iov = rma_iov;
 	}
+
+	if (msg->desc) {
+		memcpy(shm_desc, msg->desc, msg->iov_count * sizeof(void *));
+		rxr_convert_desc_for_shm(msg->iov_count, shm_desc);
+		shm_msg->desc = shm_desc;
+	} else {
+		shm_msg->desc = NULL;
+	}
 }
 
 static
@@ -65,7 +75,12 @@ rxr_atomic_alloc_tx_entry(struct rxr_ep *rxr_ep,
 	struct rxr_tx_entry *tx_entry;
 	struct fi_msg msg;
 	struct iovec iov[RXR_IOV_LIMIT];
-	size_t datatype_size = ofi_datatype_size(msg_atomic->datatype);
+	size_t datatype_size;
+
+	datatype_size = ofi_datatype_size(msg_atomic->datatype);
+	if (OFI_UNLIKELY(!datatype_size)) {
+		return NULL;
+	}
 
 	tx_entry = ofi_buf_alloc(rxr_ep->tx_entry_pool);
 	if (OFI_UNLIKELY(!tx_entry)) {
@@ -73,9 +88,8 @@ rxr_atomic_alloc_tx_entry(struct rxr_ep *rxr_ep,
 		return NULL;
 	}
 
-#if ENABLE_DEBUG
-	dlist_insert_tail(&tx_entry->tx_entry_entry, &rxr_ep->tx_entry_list);
-#endif
+	dlist_insert_tail(&tx_entry->ep_entry, &rxr_ep->tx_entry_list);
+
 	ofi_ioc_to_iov(msg_atomic->msg_iov, iov, msg_atomic->iov_count, datatype_size);
 	msg.addr = msg_atomic->addr;
 	msg.msg_iov = iov;
@@ -111,7 +125,8 @@ ssize_t rxr_atomic_generic_efa(struct rxr_ep *rxr_ep,
 			       uint32_t op, uint64_t flags)
 {
 	struct rxr_tx_entry *tx_entry;
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
+	bool delivery_complete_requested;
 	ssize_t err;
 	static int req_pkt_type_list[] = {
 		[ofi_op_atomic] = RXR_WRITE_RTA_PKT,
@@ -120,7 +135,11 @@ ssize_t rxr_atomic_generic_efa(struct rxr_ep *rxr_ep,
 	};
 
 	assert(msg->iov_count <= rxr_ep->tx_iov_limit);
-	rxr_perfset_start(rxr_ep, perf_rxr_tx);
+	efa_perfset_start(rxr_ep, perf_efa_tx);
+
+	if (efa_ep_is_cuda_mr(msg->desc[0]))
+		return -FI_ENOSYS;
+
 	fastlock_acquire(&rxr_ep->util_ep.lock);
 
 	if (OFI_UNLIKELY(is_tx_res_full(rxr_ep))) {
@@ -129,6 +148,12 @@ ssize_t rxr_atomic_generic_efa(struct rxr_ep *rxr_ep,
 	}
 
 	peer = rxr_ep_get_peer(rxr_ep, msg->addr);
+	assert(peer);
+
+	if (peer->flags & RXR_PEER_IN_BACKOFF) {
+		err = -FI_EAGAIN;
+		goto out;
+	}
 
 	tx_entry = rxr_atomic_alloc_tx_entry(rxr_ep, msg, atomic_ex, op, flags);
 	if (OFI_UNLIKELY(!tx_entry)) {
@@ -137,12 +162,58 @@ ssize_t rxr_atomic_generic_efa(struct rxr_ep *rxr_ep,
 		goto out;
 	}
 
+	delivery_complete_requested = tx_entry->fi_flags & FI_DELIVERY_COMPLETE;
+	if (delivery_complete_requested && !(peer->is_local)) {
+		tx_entry->rxr_flags |= RXR_DELIVERY_COMPLETE_REQUESTED;
+		/*
+		 * Because delivery complete is defined as an extra
+		 * feature, the receiver might not support it.
+		 *
+		 * The sender cannot send with FI_DELIVERY_COMPLETE
+		 * if the peer is not able to handle it.
+		 *
+		 * If the sender does not know whether the peer
+		 * can handle it, it needs to trigger
+		 * a handshake packet from the peer.
+		 *
+		 * The handshake packet contains
+		 * the information on whether the peer
+		 * supports it or not.
+		 */
+		err = rxr_pkt_trigger_handshake(rxr_ep, tx_entry->addr, peer);
+		if (OFI_UNLIKELY(err))
+			goto out;
+
+		if (!(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED)) {
+			err = -FI_EAGAIN;
+			goto out;
+		} else if (!rxr_peer_support_delivery_complete(peer)) {
+			err = -FI_EOPNOTSUPP;
+			goto out;
+		}
+	}
+
 	tx_entry->msg_id = (peer->next_msg_id != ~0) ?
 			    peer->next_msg_id++ : ++peer->next_msg_id;
 
-	err = rxr_pkt_post_ctrl_or_queue(rxr_ep, RXR_TX_ENTRY,
-					tx_entry, req_pkt_type_list[op],
+	if (delivery_complete_requested && op == ofi_op_atomic) {
+		err = rxr_pkt_post_ctrl(rxr_ep, RXR_TX_ENTRY,
+					tx_entry,
+					RXR_DC_WRITE_RTA_PKT,
+					0,
 					0);
+	} else {
+		/*
+		 * Fetch atomic and compare atomic
+		 * support DELIVERY_COMPLETE
+		 * by nature
+		 */
+		err = rxr_pkt_post_ctrl(rxr_ep, RXR_TX_ENTRY,
+					tx_entry,
+					req_pkt_type_list[op],
+					0,
+					0);
+	}
 
 	if (OFI_UNLIKELY(err)) {
 		rxr_release_tx_entry(rxr_ep, tx_entry);
@@ -151,7 +222,7 @@ ssize_t rxr_atomic_generic_efa(struct rxr_ep *rxr_ep,
 
 out:
 	fastlock_release(&rxr_ep->util_ep.lock);
-	rxr_perfset_end(rxr_ep, perf_rxr_tx);
+	efa_perfset_end(rxr_ep, perf_efa_tx);
 	return err;
 }
 
@@ -166,14 +237,16 @@ rxr_atomic_inject(struct fid_ep *ep,
 	struct fi_msg_atomic msg;
 
 	struct rxr_ep *rxr_ep;
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 
 	rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid);
 	peer = rxr_ep_get_peer(rxr_ep, dest_addr);
+	assert(peer);
 	if (peer->is_local) {
 		assert(rxr_ep->use_shm);
 		if (!(shm_info->domain_attr->mr_mode & FI_MR_VIRT_ADDR))
 			remote_addr = 0;
+
 		return fi_inject_atomic(rxr_ep->shm_ep, buf, count, peer->shm_fiaddr,
 					remote_addr, remote_key, datatype, op);
 	}
@@ -207,8 +280,9 @@ rxr_atomic_writemsg(struct fid_ep *ep,
 {
 	struct fi_msg_atomic shm_msg;
 	struct rxr_ep *rxr_ep;
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 	struct fi_rma_ioc rma_iov[RXR_IOV_LIMIT];
+	void *shm_desc[RXR_IOV_LIMIT];
 
 	FI_DBG(&rxr_prov, FI_LOG_EP_DATA,
 	       "%s: iov_len: %lu flags: %lx\n",
@@ -216,9 +290,10 @@ rxr_atomic_writemsg(struct fid_ep *ep,
 
 	rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid);
 	peer = rxr_ep_get_peer(rxr_ep, msg->addr);
+	assert(peer);
 	if (peer->is_local) {
 		assert(rxr_ep->use_shm);
-		rxr_atomic_copy_shm_msg(&shm_msg, msg, rma_iov);
+		rxr_atomic_init_shm_msg(&shm_msg, msg, rma_iov, shm_desc);
 		shm_msg.addr = peer->shm_fiaddr;
 		return fi_atomicmsg(rxr_ep->shm_ep, &shm_msg, flags);
 	}
@@ -278,20 +353,27 @@ rxr_atomic_readwritemsg(struct fid_ep *ep,
 			uint64_t flags)
 {
 	struct rxr_ep *rxr_ep;
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 	struct fi_msg_atomic shm_msg;
-	struct fi_rma_ioc rma_iov[RXR_IOV_LIMIT];
+	struct fi_rma_ioc shm_rma_iov[RXR_IOV_LIMIT];
+	void *shm_desc[RXR_IOV_LIMIT];
 	struct rxr_atomic_ex atomic_ex;
-	size_t datatype_size = ofi_datatype_size(msg->datatype);
+	size_t datatype_size;
+
+	datatype_size = ofi_datatype_size(msg->datatype);
+	if (OFI_UNLIKELY(!datatype_size)) {
+		return -errno;
+	}
 
 	FI_DBG(&rxr_prov, FI_LOG_EP_DATA, "%s total_len=%ld atomic_op=%d\n", __func__,
 	       ofi_total_ioc_cnt(msg->msg_iov, msg->iov_count), msg->op);
 
 	rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid);
 	peer = rxr_ep_get_peer(rxr_ep, msg->addr);
+	assert(peer);
 	if (peer->is_local) {
 		assert(rxr_ep->use_shm);
-		rxr_atomic_copy_shm_msg(&shm_msg, msg, rma_iov);
+		rxr_atomic_init_shm_msg(&shm_msg, msg, shm_rma_iov, shm_desc);
 		shm_msg.addr = peer->shm_fiaddr;
 		return fi_fetch_atomicmsg(rxr_ep->shm_ep, &shm_msg,
 					  resultv, result_desc, result_count,
@@ -359,11 +441,17 @@ rxr_atomic_compwritemsg(struct fid_ep *ep,
 			uint64_t flags)
 {
 	struct rxr_ep *rxr_ep;
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 	struct fi_msg_atomic shm_msg;
-	struct fi_rma_ioc rma_iov[RXR_IOV_LIMIT];
+	struct fi_rma_ioc shm_rma_iov[RXR_IOV_LIMIT];
+	void *shm_desc[RXR_IOV_LIMIT];
 	struct rxr_atomic_ex atomic_ex;
-	size_t datatype_size = ofi_datatype_size(msg->datatype);
+	size_t datatype_size;
+
+	datatype_size = ofi_datatype_size(msg->datatype);
+	if (OFI_UNLIKELY(!datatype_size)) {
+		return -errno;
+	}
 
 	FI_DBG(&rxr_prov, FI_LOG_EP_DATA,
 	       "%s: iov_len: %lu flags: %lx\n",
@@ -371,9 +459,10 @@ rxr_atomic_compwritemsg(struct fid_ep *ep,
 
 	rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid);
 	peer = rxr_ep_get_peer(rxr_ep, msg->addr);
+	assert(peer);
 	if (peer->is_local) {
 		assert(rxr_ep->use_shm);
-		rxr_atomic_copy_shm_msg(&shm_msg, msg, rma_iov);
+		rxr_atomic_init_shm_msg(&shm_msg, msg, shm_rma_iov, shm_desc);
 		shm_msg.addr = peer->shm_fiaddr;
 		return fi_compare_atomicmsg(rxr_ep->shm_ep, &shm_msg,
 					    comparev, compare_desc, compare_count,
@@ -460,6 +549,12 @@ int rxr_query_atomic(struct fid_domain *domain,
 		return -FI_EINVAL;
 	}
 
+	if ((datatype == FI_INT128) || (datatype == FI_UINT128)) {
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+			"128-bit atomic integers not supported\n");
+		return -FI_EOPNOTSUPP;
+	}
+
 	ret = ofi_atomic_valid(&rxr_prov, datatype, op, flags);
 	if (ret || !attr)
 		return ret;
@@ -475,6 +570,9 @@ int rxr_query_atomic(struct fid_domain *domain,
 		max_atomic_size /= 2;
 
 	attr->size = ofi_datatype_size(datatype);
+	if (OFI_UNLIKELY(!attr->size)) {
+		return -errno;
+	}
 	attr->count = max_atomic_size / attr->size;
 	return 0;
 }
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_attr.c b/deps/libfabric/prov/efa/src/rxr/rxr_attr.c
index 77b2a9eb6d97227b3aaac77347c8162da6d2c548..9934a637b78be316a53ce1a4202b71c9ecc85966 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_attr.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_attr.c
@@ -82,7 +82,7 @@ struct fi_ep_attr rxr_ep_attr = {
 	.type = FI_EP_RDM,
 	.protocol = FI_PROTO_EFA,
 	.mem_tag_format = FI_TAG_GENERIC,
-	.protocol_version = RXR_CUR_PROTOCOL_VERSION,
+	.protocol_version = RXR_PROTOCOL_VERSION,
 	.max_msg_size = UINT64_MAX,
 	.msg_prefix_size = 0,
 	.tx_ctx_cnt = 1,
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_cq.c b/deps/libfabric/prov/efa/src/rxr/rxr_cq.c
index 36218960b07e6d13ba156f9c3cb5566b32106de3..74167aa2a97a98efa970cf0f308cc400bf13c3cf 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_cq.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_cq.c
@@ -40,6 +40,7 @@
 #include "rxr_rma.h"
 #include "rxr_msg.h"
 #include "rxr_cntr.h"
+#include "rxr_read.h"
 #include "rxr_atomic.h"
 #include "efa.h"
 
@@ -66,32 +67,42 @@ static const char *rxr_cq_strerror(struct fid_cq *cq_fid, int prov_errno,
 	return str;
 }
 
-/*
- * Teardown rx_entry and write an error cq entry. With our current protocol we
- * will only encounter an RX error when sending a queued REQ or CTS packet or
- * if we are sending a CTS message. Because of this, the sender will not send
- * any additional data packets if the receiver encounters an error. If there is
- * a scenario in the future where the sender will continue to send data packets
- * we need to prevent rx_id mismatch. Ideally, we should add a NACK message and
- * tear down both RX and TX entires although whatever caused the error may
- * prevent that.
+/**
+ * @brief handle an error that happened to an RX (receive) operation
+ *
+ * This function writes an error cq entry to notify the application that the
+ * rx operation failed. If the write fails, it will write an eq entry instead.
+ *
+ * It will also release resources owned by the RX entry, such as the
+ * unexpected packet entry, because the RX operation is aborted.
+ *
+ * It will remove the rx_entry from the queued rx_entry list for the same reason.
+ *
+ * It will NOT release the rx_entry because it is still possible to receive
+ * packets for this rx_entry.
  *
  * TODO: add a NACK message to tear down state on sender side
+ *
+ * @param[in]	ep		endpoint
+ * @param[in]	rx_entry	rx_entry that contains information of the rx operation
+ * @param[in]	err		positive libfabric error code
+ * @param[in]	prov_errno	positive provider specific error code
  */
-int rxr_cq_handle_rx_error(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry,
-			   ssize_t prov_errno)
+void rxr_cq_write_rx_error(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry,
+			   int err, int prov_errno)
 {
 	struct fi_cq_err_entry err_entry;
 	struct util_cq *util_cq;
 	struct dlist_entry *tmp;
 	struct rxr_pkt_entry *pkt_entry;
+	int write_cq_err;
 
 	memset(&err_entry, 0, sizeof(err_entry));
 
 	util_cq = ep->util_ep.rx_cq;
 
-	err_entry.err = FI_EIO;
-	err_entry.prov_errno = (int)prov_errno;
+	err_entry.err = err;
+	err_entry.prov_errno = prov_errno;
 
 	switch (rx_entry->state) {
 	case RXR_RX_INIT:
@@ -106,9 +117,7 @@ int rxr_cq_handle_rx_error(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry,
 #endif
 		break;
 	case RXR_RX_QUEUED_CTRL:
-	case RXR_RX_QUEUED_CTS_RNR:
-	case RXR_RX_QUEUED_EOR:
-		dlist_remove(&rx_entry->queued_entry);
+		dlist_remove(&rx_entry->queued_ctrl_entry);
 		break;
 	default:
 		FI_WARN(&rxr_prov, FI_LOG_CQ, "rx_entry unknown state %d\n",
@@ -116,10 +125,13 @@ int rxr_cq_handle_rx_error(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry,
 		assert(0 && "rx_entry unknown state");
 	}
 
-	dlist_foreach_container_safe(&rx_entry->queued_pkts,
-				     struct rxr_pkt_entry,
-				     pkt_entry, entry, tmp)
-		rxr_pkt_entry_release_tx(ep, pkt_entry);
+	if (rx_entry->rxr_flags & RXR_RX_ENTRY_QUEUED_RNR) {
+		dlist_foreach_container_safe(&rx_entry->queued_pkts,
+					     struct rxr_pkt_entry,
+					     pkt_entry, entry, tmp)
+			rxr_pkt_entry_release_tx(ep, pkt_entry);
+		dlist_remove(&rx_entry->queued_rnr_entry);
+	}
 
 	if (rx_entry->unexp_pkt) {
 		rxr_pkt_entry_release_rx(ep, rx_entry->unexp_pkt);
@@ -139,7 +151,7 @@ int rxr_cq_handle_rx_error(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry,
 	rxr_msg_multi_recv_free_posted_entry(ep, rx_entry);
 
         FI_WARN(&rxr_prov, FI_LOG_CQ,
-		"rxr_cq_handle_rx_error: err: %d, prov_err: %s (%d)\n",
+		"rxr_cq_write_rx_error: err: %d, prov_err: %s (%d)\n",
 		err_entry.err, fi_strerror(-err_entry.prov_errno),
 		err_entry.prov_errno);
 
@@ -151,35 +163,50 @@ int rxr_cq_handle_rx_error(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry,
 	//rxr_release_rx_entry(ep, rx_entry);
 
 	efa_cntr_report_error(&ep->util_ep, err_entry.flags);
-	return ofi_cq_write_error(util_cq, &err_entry);
+	write_cq_err = ofi_cq_write_error(util_cq, &err_entry);
+	if (write_cq_err) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ,
+			"Error writing error cq entry when handling RX error");
+		efa_eq_write_error(&ep->util_ep, err, prov_errno);
+	}
 }
 
-/*
- * Teardown tx_entry and write an error cq entry. With our current protocol the
- * receiver will only send a CTS once the window is exhausted, meaning that all
- * data packets for that window will have been received successfully. This
- * means that the receiver will not send any CTS packets if the sender
- * encounters and error sending data packets. If that changes in the future we
- * will need to be careful to prevent tx_id mismatch.
+/**
+ * @brief write error CQ entry for a TX operation.
+ *
+ * This function writes an error cq entry for a TX operation; if writing
+ * the CQ error entry fails, it will write an eq entry instead.
+ *
+ * It also removes the TX entry from ep->tx_queued_list and ep->tx_pending_list
+ * if the tx_entry is on them.
+ *
+ * It does NOT release the tx entry because it is still possible to receive
+ * a send completion for this TX entry.
  *
  * TODO: add NACK message to tear down receive side state
+ *
+ * @param[in]	ep		endpoint
+ * @param[in]	tx_entry	tx_entry that contains information of the tx operation
+ * @param[in]	err		positive libfabric error code
+ * @param[in]	prov_errno	positive EFA provider specific error code
  */
-int rxr_cq_handle_tx_error(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry,
-			   ssize_t prov_errno)
+void rxr_cq_write_tx_error(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry,
+			   int err, int prov_errno)
 {
 	struct fi_cq_err_entry err_entry;
 	struct util_cq *util_cq;
 	uint32_t api_version;
 	struct dlist_entry *tmp;
 	struct rxr_pkt_entry *pkt_entry;
+	int write_cq_err;
 
 	memset(&err_entry, 0, sizeof(err_entry));
 
 	util_cq = ep->util_ep.tx_cq;
 	api_version = util_cq->domain->fabric->fabric_fid.api_version;
 
-	err_entry.err = FI_EIO;
-	err_entry.prov_errno = (int)prov_errno;
+	err_entry.err = err;
+	err_entry.prov_errno = prov_errno;
 
 	switch (tx_entry->state) {
 	case RXR_TX_REQ:
@@ -189,12 +216,7 @@ int rxr_cq_handle_tx_error(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry,
 		break;
 	case RXR_TX_QUEUED_CTRL:
 	case RXR_TX_QUEUED_SHM_RMA:
-	case RXR_TX_QUEUED_REQ_RNR:
-	case RXR_TX_QUEUED_DATA_RNR:
-		dlist_remove(&tx_entry->queued_entry);
-		break;
-	case RXR_TX_SENT_READRSP:
-	case RXR_TX_WAIT_READ_FINISH:
+		dlist_remove(&tx_entry->queued_ctrl_entry);
 		break;
 	default:
 		FI_WARN(&rxr_prov, FI_LOG_CQ, "tx_entry unknown state %d\n",
@@ -202,6 +224,9 @@ int rxr_cq_handle_tx_error(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry,
 		assert(0 && "tx_entry unknown state");
 	}
 
+	if (tx_entry->rxr_flags & RXR_TX_ENTRY_QUEUED_RNR)
+		dlist_remove(&tx_entry->queued_rnr_entry);
+
 	dlist_foreach_container_safe(&tx_entry->queued_pkts,
 				     struct rxr_pkt_entry,
 				     pkt_entry, entry, tmp)
@@ -216,7 +241,7 @@ int rxr_cq_handle_tx_error(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry,
 		err_entry.err_data_size = 0;
 
 	FI_WARN(&rxr_prov, FI_LOG_CQ,
-		"rxr_cq_handle_tx_error: err: %d, prov_err: %s (%d)\n",
+		"rxr_cq_write_tx_error: err: %d, prov_err: %s (%d)\n",
 		err_entry.err, fi_strerror(-err_entry.prov_errno),
 		err_entry.prov_errno);
 
@@ -228,214 +253,122 @@ int rxr_cq_handle_tx_error(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry,
 	//rxr_release_tx_entry(ep, tx_entry);
 
 	efa_cntr_report_error(&ep->util_ep, tx_entry->cq_entry.flags);
-	return ofi_cq_write_error(util_cq, &err_entry);
+	write_cq_err = ofi_cq_write_error(util_cq, &err_entry);
+	if (write_cq_err) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ,
+			"Error writing error cq entry when handling TX error");
+		efa_eq_write_error(&ep->util_ep, err, prov_errno);
+	}
 }
 
-/*
- * Queue a packet on the appropriate list when an RNR error is received.
+/* @brief Queue a packet that encountered an RNR error and set up RNR backoff
+ *
+ * We use an exponential backoff strategy to handle RNR errors.
+ *
+ * `Backoff` means that if a peer encountered RNR, an endpoint will
+ * wait a period of time before sending packets to the peer again.
+ *
+ * `Exponential` means the more RNR errors are encountered, the longer
+ * the backoff wait time will be.
+ *
+ * To quantify how long a peer stays in backoff mode, two parameters
+ * are defined:
+ *
+ *    rnr_backoff_begin_ts (ts is timestamp) and rnr_backoff_wait_time.
+ *
+ * A peer stays in backoff mode until:
+ *
+ * current_timestamp >= (rnr_backoff_begin_ts + rnr_backoff_wait_time),
+ *
+ * with one exception: a peer can get out of backoff mode early if a
+ * packet's send completion to this peer was reported by the device.
+ *
+ * Specifically, the implementation of RNR backoff is:
+ *
+ * The first time RNR is encountered for a peer, the packet will
+ * be resent immediately.
+ *
+ * The second time RNR is encountered, the endpoint will put the
+ * peer in backoff mode and initialize rnr_backoff_begin_ts
+ * and rnr_backoff_wait_time.
+ *
+ * The 3rd and subsequent times RNR is encountered, it is handled
+ * as follows:
+ *
+ *     If the peer is already in backoff mode, rnr_backoff_begin_ts
+ *     will be updated.
+ *
+ *     Otherwise, the peer will be put in backoff mode again,
+ *     rnr_backoff_begin_ts will be updated and rnr_backoff_wait_time
+ *     will be doubled until it reaches the maximum wait time.
+ *
+ * @param[in]	ep		endpoint
+ * @param[in]	list		queued RNR packet list
+ * @param[in]	pkt_entry	packet entry that encountered RNR
  */
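+/*
+ * A minimal sketch (illustrative only, not the provider code) of the
+ * backoff-mode test the comment above describes, using the rdm_peer
+ * fields defined in rxr.h; the helper name is hypothetical:
+ *
+ *   static inline bool peer_in_backoff(struct rdm_peer *peer, uint64_t now_us)
+ *   {
+ *           return (peer->flags & RXR_PEER_IN_BACKOFF) &&
+ *                  now_us < peer->rnr_backoff_begin_ts +
+ *                           peer->rnr_backoff_wait_time;
+ *   }
+ */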
-static inline void rxr_cq_queue_pkt(struct rxr_ep *ep,
-				    struct dlist_entry *list,
-				    struct rxr_pkt_entry *pkt_entry)
+void rxr_cq_queue_rnr_pkt(struct rxr_ep *ep,
+			  struct dlist_entry *list,
+			  struct rxr_pkt_entry *pkt_entry)
 {
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 
-	peer = rxr_ep_get_peer(ep, pkt_entry->addr);
+#if ENABLE_DEBUG
+	dlist_remove(&pkt_entry->dbg_entry);
+#endif
+	dlist_insert_tail(&pkt_entry->entry, list);
 
-	/*
-	 * Queue the packet if it has not been retransmitted yet.
-	 */
-	if (pkt_entry->state != RXR_PKT_ENTRY_RNR_RETRANSMIT) {
-		pkt_entry->state = RXR_PKT_ENTRY_RNR_RETRANSMIT;
+	peer = rxr_ep_get_peer(ep, pkt_entry->addr);
+	assert(peer);
+	if (!(pkt_entry->flags & RXR_PKT_ENTRY_RNR_RETRANSMIT)) {
+		/* This is the first time this packet encountered RNR,
+		 * we are NOT going to put the peer in backoff mode just yet.
+		 */
+		pkt_entry->flags |= RXR_PKT_ENTRY_RNR_RETRANSMIT;
 		peer->rnr_queued_pkt_cnt++;
-		goto queue_pkt;
+		return;
 	}
 
-	/*
-	 * Otherwise, increase the backoff if the peer is already not in
-	 * backoff. Reset the timer when starting backoff or if another RNR for
-	 * a retransmitted packet is received while waiting for the timer to
-	 * expire.
+	/* This packet has encountered RNR multiple times, therefore the peer
+	 * needs to be in backoff mode.
+	 *
+	 * If the peer is already in backoff mode, we just need to update the
+	 * RNR backoff begin time.
+	 *
+	 * Otherwise, we need to put the peer in backoff mode and set up backoff
+	 * begin time and wait time.
 	 */
-	peer->rnr_ts = ofi_gettime_us();
-	if (peer->flags & RXR_PEER_IN_BACKOFF)
-		goto queue_pkt;
+	if (peer->flags & RXR_PEER_IN_BACKOFF) {
+		peer->rnr_backoff_begin_ts = ofi_gettime_us();
+		return;
+	}
 
 	peer->flags |= RXR_PEER_IN_BACKOFF;
+	dlist_insert_tail(&peer->rnr_backoff_entry,
+			  &ep->peer_backoff_list);
 
-	if (!peer->timeout_interval) {
-		if (rxr_env.timeout_interval)
-			peer->timeout_interval = rxr_env.timeout_interval;
+	peer->rnr_backoff_begin_ts = ofi_gettime_us();
+	if (peer->rnr_backoff_wait_time == 0) {
+		if (rxr_env.rnr_backoff_initial_wait_time > 0)
+			peer->rnr_backoff_wait_time = rxr_env.rnr_backoff_initial_wait_time;
 		else
-			peer->timeout_interval = MAX(RXR_RAND_MIN_TIMEOUT,
-						     rand() %
-						     RXR_RAND_MAX_TIMEOUT);
+			peer->rnr_backoff_wait_time = MAX(RXR_RAND_MIN_TIMEOUT,
+							  rand() %
+							  RXR_RAND_MAX_TIMEOUT);
 
-		peer->rnr_timeout_exp = 1;
 		FI_DBG(&rxr_prov, FI_LOG_EP_DATA,
 		       "initializing backoff timeout for peer: %" PRIu64
-		       " timeout: %d rnr_queued_pkts: %d\n",
-		       pkt_entry->addr, peer->timeout_interval,
+		       " timeout: %ld rnr_queued_pkts: %d\n",
+		       pkt_entry->addr, peer->rnr_backoff_wait_time,
 		       peer->rnr_queued_pkt_cnt);
 	} else {
-		/* Only backoff once per peer per progress thread loop. */
-		if (!(peer->flags & RXR_PEER_BACKED_OFF)) {
-			peer->flags |= RXR_PEER_BACKED_OFF;
-			peer->rnr_timeout_exp++;
-			FI_DBG(&rxr_prov, FI_LOG_EP_DATA,
-			       "increasing backoff for peer: %" PRIu64
-			       " rnr_timeout_exp: %d rnr_queued_pkts: %d\n",
-			       pkt_entry->addr, peer->rnr_timeout_exp,
-			       peer->rnr_queued_pkt_cnt);
-		}
-	}
-	dlist_insert_tail(&peer->rnr_entry,
-			  &ep->peer_backoff_list);
-
-queue_pkt:
-#if ENABLE_DEBUG
-	dlist_remove(&pkt_entry->dbg_entry);
-#endif
-	dlist_insert_tail(&pkt_entry->entry, list);
-}
-
-int rxr_cq_handle_cq_error(struct rxr_ep *ep, ssize_t err)
-{
-	struct fi_cq_err_entry err_entry;
-	struct rxr_pkt_entry *pkt_entry;
-	struct rxr_rx_entry *rx_entry;
-	struct rxr_tx_entry *tx_entry;
-	struct rxr_peer *peer;
-	ssize_t ret;
-
-	memset(&err_entry, 0, sizeof(err_entry));
-
-	/*
-	 * If the cq_read failed with another error besides -FI_EAVAIL or
-	 * the cq_readerr fails we don't know if this is an rx or tx error.
-	 * We'll write an error eq entry to the event queue instead.
-	 */
-
-	err_entry.err = FI_EIO;
-	err_entry.prov_errno = (int)err;
-
-	if (err != -FI_EAVAIL) {
-		FI_WARN(&rxr_prov, FI_LOG_CQ, "fi_cq_read: %s\n",
-			fi_strerror(-err));
-		goto write_err;
-	}
-
-	ret = fi_cq_readerr(ep->rdm_cq, &err_entry, 0);
-	if (ret != 1) {
-		if (ret < 0) {
-			FI_WARN(&rxr_prov, FI_LOG_CQ, "fi_cq_readerr: %s\n",
-				fi_strerror(-ret));
-			err_entry.prov_errno = ret;
-		} else {
-			FI_WARN(&rxr_prov, FI_LOG_CQ,
-				"fi_cq_readerr unexpected size %zu expected %zu\n",
-				ret, sizeof(err_entry));
-			err_entry.prov_errno = -FI_EIO;
-		}
-		goto write_err;
-	}
-
-	if (err_entry.err != -FI_EAGAIN)
-		OFI_CQ_STRERROR(&rxr_prov, FI_LOG_WARN, FI_LOG_CQ, ep->rdm_cq,
-				&err_entry);
-
-	pkt_entry = (struct rxr_pkt_entry *)err_entry.op_context;
-	peer = rxr_ep_get_peer(ep, pkt_entry->addr);
-
-	/*
-	 * A handshake send could fail at the core provider if the peer endpoint
-	 * is shutdown soon after it receives a send completion for the REQ
-	 * packet that included src_address. The handshake itself is irrelevant if
-	 * that happens, so just squelch this error entry and move on without
-	 * writing an error completion or event to the application.
-	 */
-	if (rxr_get_base_hdr(pkt_entry->pkt)->type == RXR_HANDSHAKE_PKT) {
-		FI_WARN(&rxr_prov, FI_LOG_CQ,
-			"Squelching error CQE for RXR_HANDSHAKE_PKT\n");
-		/*
-		 * HANDSHAKE packets do not have an associated rx/tx entry. Use
-		 * the flags instead to determine if this is a send or recv.
-		 */
-		if (err_entry.flags & FI_SEND) {
-			rxr_ep_dec_tx_pending(ep, peer, 1);
-			rxr_pkt_entry_release_tx(ep, pkt_entry);
-		} else if (err_entry.flags & FI_RECV) {
-			rxr_pkt_entry_release_rx(ep, pkt_entry);
-		} else {
-			assert(0 && "unknown err_entry flags in HANDSHAKE packet");
-		}
-		return 0;
-	}
-
-	if (!pkt_entry->x_entry) {
-		/*
-		 * A NULL x_entry means this is a recv posted buf pkt_entry.
-		 * Since we don't have any context besides the error code,
-		 * we will write to the eq instead.
-		 */
-		rxr_pkt_entry_release_rx(ep, pkt_entry);
-		goto write_err;
-	}
-
-	/*
-	 * If x_entry is set this rx or tx entry error is for a sent
-	 * packet. Decrement the tx_pending counter and fall through to
-	 * the rx or tx entry handlers.
-	 */
-	if (!peer->is_local)
-		rxr_ep_dec_tx_pending(ep, peer, 1);
-	if (RXR_GET_X_ENTRY_TYPE(pkt_entry) == RXR_TX_ENTRY) {
-		tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry;
-		if (err_entry.err != -FI_EAGAIN ||
-		    rxr_ep_domain(ep)->resource_mgmt == FI_RM_ENABLED) {
-			ret = rxr_cq_handle_tx_error(ep, tx_entry,
-						     err_entry.prov_errno);
-			rxr_pkt_entry_release_tx(ep, pkt_entry);
-			return ret;
-		}
-
-		rxr_cq_queue_pkt(ep, &tx_entry->queued_pkts, pkt_entry);
-		if (tx_entry->state == RXR_TX_SEND) {
-			dlist_remove(&tx_entry->entry);
-			tx_entry->state = RXR_TX_QUEUED_DATA_RNR;
-			dlist_insert_tail(&tx_entry->queued_entry,
-					  &ep->tx_entry_queued_list);
-		} else if (tx_entry->state == RXR_TX_REQ) {
-			tx_entry->state = RXR_TX_QUEUED_REQ_RNR;
-			dlist_insert_tail(&tx_entry->queued_entry,
-					  &ep->tx_entry_queued_list);
-		}
-		return 0;
-	} else if (RXR_GET_X_ENTRY_TYPE(pkt_entry) == RXR_RX_ENTRY) {
-		rx_entry = (struct rxr_rx_entry *)pkt_entry->x_entry;
-		if (err_entry.err != -FI_EAGAIN ||
-		    rxr_ep_domain(ep)->resource_mgmt == FI_RM_ENABLED) {
-			ret = rxr_cq_handle_rx_error(ep, rx_entry,
-						     err_entry.prov_errno);
-			rxr_pkt_entry_release_tx(ep, pkt_entry);
-			return ret;
-		}
-		rxr_cq_queue_pkt(ep, &rx_entry->queued_pkts, pkt_entry);
-		if (rx_entry->state == RXR_RX_RECV) {
-			rx_entry->state = RXR_RX_QUEUED_CTS_RNR;
-			dlist_insert_tail(&rx_entry->queued_entry,
-					  &ep->rx_entry_queued_list);
-		}
-		return 0;
+		peer->rnr_backoff_wait_time = MIN(peer->rnr_backoff_wait_time * 2,
+						  rxr_env.rnr_backoff_wait_time_cap);
+		FI_DBG(&rxr_prov, FI_LOG_EP_DATA,
+		       "increasing backoff timeout for peer: %" PRIu64
+		       "to %ld rnr_queued_pkts: %d\n",
+		       pkt_entry->addr, peer->rnr_backoff_wait_time,
+		       peer->rnr_queued_pkt_cnt);
 	}
-
-	FI_WARN(&rxr_prov, FI_LOG_CQ,
-		"%s unknown x_entry state %d\n",
-		__func__, RXR_GET_X_ENTRY_TYPE(pkt_entry));
-	assert(0 && "unknown x_entry state");
-write_err:
-	efa_eq_write_error(&ep->util_ep, err_entry.err, err_entry.prov_errno);
-	return 0;
 }
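/*
 * Editor's sketch (not part of the patch): a peer leaves backoff mode once
 * the wait time has elapsed. The endpoint progress path would test roughly:
 *
 *     if (ofi_gettime_us() >= peer->rnr_backoff_begin_ts +
 *                             peer->rnr_backoff_wait_time) {
 *             peer->flags &= ~RXR_PEER_IN_BACKOFF;
 *             dlist_remove(&peer->rnr_backoff_entry);
 *     }
 *
 * Field names are taken from this patch; the exact check lives in the
 * endpoint progress function and may differ in detail.
 */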
 
 void rxr_cq_write_rx_completion(struct rxr_ep *ep,
@@ -461,11 +394,14 @@ void rxr_cq_write_rx_completion(struct rxr_ep *ep,
 
 		rxr_rm_rx_cq_check(ep, rx_cq);
 
-		if (OFI_UNLIKELY(ret))
+		if (OFI_UNLIKELY(ret)) {
 			FI_WARN(&rxr_prov, FI_LOG_CQ,
 				"Unable to write recv error cq: %s\n",
 				fi_strerror(-ret));
+			return;
+		}
 
+		rx_entry->fi_flags |= RXR_NO_COMPLETION;
 		efa_cntr_report_error(&ep->util_ep, rx_entry->cq_entry.flags);
 		return;
 	}
@@ -504,32 +440,28 @@ void rxr_cq_write_rx_completion(struct rxr_ep *ep,
 			FI_WARN(&rxr_prov, FI_LOG_CQ,
 				"Unable to write recv completion: %s\n",
 				fi_strerror(-ret));
-			if (rxr_cq_handle_rx_error(ep, rx_entry, ret))
-				assert(0 && "failed to write err cq entry");
+			rxr_cq_write_rx_error(ep, rx_entry, -ret, -ret);
 			return;
 		}
+
+		rx_entry->fi_flags |= RXR_NO_COMPLETION;
 	}
 
 	efa_cntr_report_rx_completion(&ep->util_ep, rx_entry->cq_entry.flags);
 }
 
 void rxr_cq_handle_rx_completion(struct rxr_ep *ep,
-				 struct rxr_pkt_entry *pkt_entry,
 				 struct rxr_rx_entry *rx_entry)
 {
 	struct rxr_tx_entry *tx_entry = NULL;
 
 	if (rx_entry->cq_entry.flags & FI_WRITE) {
 		/*
-		 * must be on the remote side, notify cq/counter
-		 * if FI_RMA_EVENT is requested or REMOTE_CQ_DATA is on
+		 * must be on the remote side, notify cq if REMOTE_CQ_DATA is on
 		 */
 		if (rx_entry->cq_entry.flags & FI_REMOTE_CQ_DATA)
 			rxr_cq_write_rx_completion(ep, rx_entry);
-		else if (ep->util_ep.caps & FI_RMA_EVENT)
-			efa_cntr_report_rx_completion(&ep->util_ep, rx_entry->cq_entry.flags);
 
-		rxr_pkt_entry_release_rx(ep, pkt_entry);
 		return;
 	}
 
@@ -552,8 +484,7 @@ void rxr_cq_handle_rx_completion(struct rxr_ep *ep,
 		 * rx_entry receiving data
 		 * receive completed              send completed
 		 * handle_rx_completion()         handle_pkt_send_completion()
-		 * |->write_tx_completion()       |-> if (FI_RMA_EVENT)
-		 *                                         write_rx_completion()
+		 * |->write_tx_completion()
 		 *
 		 * As can be seen, although there is a rx_entry on remote side,
 		 * the entry will not enter into rxr_cq_handle_rx_completion
@@ -562,20 +493,18 @@ void rxr_cq_handle_rx_completion(struct rxr_ep *ep,
 		 *     2. call rxr_cq_write_tx_completion()
 		 */
 		tx_entry = ofi_bufpool_get_ibuf(ep->tx_entry_pool, rx_entry->rma_loc_tx_id);
-		assert(tx_entry->state == RXR_TX_WAIT_READ_FINISH);
+		assert(tx_entry->state == RXR_TX_REQ);
 		if (tx_entry->fi_flags & FI_COMPLETION) {
-			/* Note write_tx_completion() will release tx_entry */
 			rxr_cq_write_tx_completion(ep, tx_entry);
 		} else {
 			efa_cntr_report_tx_completion(&ep->util_ep, tx_entry->cq_entry.flags);
-			rxr_release_tx_entry(ep, tx_entry);
 		}
 
+		rxr_release_tx_entry(ep, tx_entry);
 		/*
 		 * do not call rxr_release_rx_entry here because
 		 * caller will release
 		 */
-		rxr_pkt_entry_release_rx(ep, pkt_entry);
 		return;
 	}
 
@@ -583,43 +512,53 @@ void rxr_cq_handle_rx_completion(struct rxr_ep *ep,
 		rxr_msg_multi_recv_handle_completion(ep, rx_entry);
 
 	rxr_cq_write_rx_completion(ep, rx_entry);
-	rxr_pkt_entry_release_rx(ep, pkt_entry);
 	return;
 }
 
 int rxr_cq_reorder_msg(struct rxr_ep *ep,
-		       struct rxr_peer *peer,
+		       struct rdm_peer *peer,
 		       struct rxr_pkt_entry *pkt_entry)
 {
 	struct rxr_pkt_entry *ooo_entry;
 	struct rxr_pkt_entry *cur_ooo_entry;
+	struct rxr_robuf *robuf;
 	uint32_t msg_id;
 
 	assert(rxr_get_base_hdr(pkt_entry->pkt)->type >= RXR_REQ_PKT_BEGIN);
 
 	msg_id = rxr_pkt_msg_id(pkt_entry);
-	/*
-	 * TODO: Initialize peer state  at the time of AV insertion
-	 * where duplicate detection is available.
-	 */
-	if (!peer->rx_init)
-		rxr_ep_peer_init_rx(ep, peer);
 
+	robuf = &peer->robuf;
 #if ENABLE_DEBUG
-	if (msg_id != ofi_recvwin_next_exp_id(peer->robuf))
+	if (msg_id != ofi_recvwin_next_exp_id(robuf))
 		FI_DBG(&rxr_prov, FI_LOG_EP_CTRL,
 		       "msg OOO msg_id: %" PRIu32 " expected: %"
 		       PRIu32 "\n", msg_id,
-		       ofi_recvwin_next_exp_id(peer->robuf));
+		       ofi_recvwin_next_exp_id(robuf));
 #endif
-	if (ofi_recvwin_is_exp(peer->robuf, msg_id))
+	if (ofi_recvwin_is_exp(robuf, msg_id))
 		return 0;
-	else if (!ofi_recvwin_id_valid(peer->robuf, msg_id))
-		return -FI_EALREADY;
+	else if (!ofi_recvwin_id_valid(robuf, msg_id)) {
+		if (ofi_recvwin_id_processed(robuf, msg_id)) {
+			FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+			       "Error: message id has already been processed. received: %" PRIu32 " expected: %"
+			       PRIu32 "\n", msg_id, ofi_recvwin_next_exp_id(robuf));
+			return -FI_EALREADY;
+		} else {
+			fprintf(stderr,
+				"Current receive window size (%d) is too small to hold incoming messages.\n"
+				"As a result, you application cannot proceed.\n"
+				"Receive window size can be increased by setting the environment variable:\n"
+				"              FI_EFA_RECVWIN_SIZE\n"
+				"\n"
+				"Your job will now abort.\n\n", rxr_env.recvwin_size);
+			abort();
+		}
+	}
 
 	if (OFI_LIKELY(rxr_env.rx_copy_ooo)) {
-		assert(pkt_entry->type == RXR_PKT_ENTRY_POSTED);
-		ooo_entry = rxr_pkt_entry_clone(ep, ep->rx_ooo_pkt_pool, pkt_entry, RXR_PKT_ENTRY_OOO);
+		assert(pkt_entry->alloc_type == RXR_PKT_FROM_EFA_RX_POOL);
+		ooo_entry = rxr_pkt_entry_clone(ep, ep->rx_ooo_pkt_pool, RXR_PKT_FROM_OOO_POOL, pkt_entry);
 		if (OFI_UNLIKELY(!ooo_entry)) {
 			FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
 				"Unable to allocate rx_pkt_entry for OOO msg\n");
@@ -630,30 +569,32 @@ int rxr_cq_reorder_msg(struct rxr_ep *ep,
 		ooo_entry = pkt_entry;
 	}
 
-	cur_ooo_entry = *ofi_recvwin_get_msg(peer->robuf, msg_id);
+	cur_ooo_entry = *ofi_recvwin_get_msg(robuf, msg_id);
 	if (cur_ooo_entry) {
 		assert(rxr_get_base_hdr(cur_ooo_entry->pkt)->type == RXR_MEDIUM_MSGRTM_PKT ||
-		       rxr_get_base_hdr(cur_ooo_entry->pkt)->type == RXR_MEDIUM_TAGRTM_PKT);
+		       rxr_get_base_hdr(cur_ooo_entry->pkt)->type == RXR_MEDIUM_TAGRTM_PKT ||
+		       rxr_get_base_hdr(cur_ooo_entry->pkt)->type == RXR_DC_MEDIUM_MSGRTM_PKT ||
+		       rxr_get_base_hdr(cur_ooo_entry->pkt)->type == RXR_DC_MEDIUM_TAGRTM_PKT);
 		assert(rxr_pkt_msg_id(cur_ooo_entry) == msg_id);
 		assert(rxr_pkt_rtm_total_len(cur_ooo_entry) == rxr_pkt_rtm_total_len(ooo_entry));
 		rxr_pkt_entry_append(cur_ooo_entry, ooo_entry);
 	} else {
-		ofi_recvwin_queue_msg(peer->robuf, &ooo_entry, msg_id);
+		ofi_recvwin_queue_msg(robuf, &ooo_entry, msg_id);
 	}
 
 	return 1;
 }
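/*
 * Editor's sketch (not part of the patch): the classification performed by
 * rxr_cq_reorder_msg, assuming exp_id is the next expected message id and
 * recvwin_size is rxr_env.recvwin_size (names illustrative):
 *
 *     if (msg_id == exp_id)
 *             return 0;              // expected: process in order
 *     if (msg_id < exp_id)
 *             return -FI_EALREADY;   // duplicate: already processed
 *     if (msg_id >= exp_id + recvwin_size)
 *             abort();               // window too small to make progress
 *     clone_and_queue(msg_id);       // out of order but within the window
 *     return 1;
 */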
 
 void rxr_cq_proc_pending_items_in_recvwin(struct rxr_ep *ep,
-					  struct rxr_peer *peer)
+					  struct rdm_peer *peer)
 {
 	struct rxr_pkt_entry *pending_pkt;
 	int ret = 0;
 	uint32_t msg_id;
 
 	while (1) {
-		pending_pkt = *ofi_recvwin_peek(peer->robuf);
-		if (!pending_pkt || !pending_pkt->pkt)
+		pending_pkt = *ofi_recvwin_peek((&peer->robuf));
+		if (!pending_pkt)
 			return;
 
 		msg_id = rxr_pkt_msg_id(pending_pkt);
@@ -661,7 +602,7 @@ void rxr_cq_proc_pending_items_in_recvwin(struct rxr_ep *ep,
 		       "Processing msg_id %d from robuf\n", msg_id);
 		/* rxr_pkt_proc_rtm_rta will write error cq entry if needed */
 		ret = rxr_pkt_proc_rtm_rta(ep, pending_pkt);
-		*ofi_recvwin_get_next_msg(peer->robuf) = NULL;
+		*ofi_recvwin_get_next_msg((&peer->robuf)) = NULL;
 		if (OFI_UNLIKELY(ret)) {
 			FI_WARN(&rxr_prov, FI_LOG_CQ,
 				"Error processing msg_id %d from robuf: %s\n",
@@ -744,7 +685,14 @@ bool rxr_cq_need_tx_completion(struct rxr_ep *ep,
 	       tx_entry->fi_flags & FI_COMPLETION;
 }
 
-
+/**
+ * @brief write a cq entry for a tx operation (send/read/write) if the application wants it.
+ *        Sometimes the application does not want to receive a cq entry for a tx
+ *        operation.
+ *
+ * @param[in]	ep		endpoint
+ * @param[in]	tx_entry	tx entry that contains information of the TX operation
+ */
 void rxr_cq_write_tx_completion(struct rxr_ep *ep,
 				struct rxr_tx_entry *tx_entry)
 {
@@ -784,93 +732,54 @@ void rxr_cq_write_tx_completion(struct rxr_ep *ep,
 			FI_WARN(&rxr_prov, FI_LOG_CQ,
 				"Unable to write send completion: %s\n",
 				fi_strerror(-ret));
-			if (rxr_cq_handle_tx_error(ep, tx_entry, ret))
-				assert(0 && "failed to write err cq entry");
+			rxr_cq_write_tx_error(ep, tx_entry, -ret, -ret);
 			return;
 		}
 	}
 
 	efa_cntr_report_tx_completion(&ep->util_ep, tx_entry->cq_entry.flags);
-	rxr_release_tx_entry(ep, tx_entry);
+	tx_entry->fi_flags |= RXR_NO_COMPLETION;
 	return;
 }
 
-int rxr_tx_entry_mr_dereg(struct rxr_tx_entry *tx_entry)
-{
-	int i, err = 0;
-
-	for (i = 0; i < tx_entry->iov_count; i++) {
-		if (tx_entry->mr[i]) {
-			err = fi_close((struct fid *)tx_entry->mr[i]);
-			if (OFI_UNLIKELY(err)) {
-				FI_WARN(&rxr_prov, FI_LOG_CQ, "mr dereg failed. err=%d\n", err);
-				return err;
-			}
-
-			tx_entry->mr[i] = NULL;
-		}
-	}
-
-	return 0;
-}
-
 void rxr_cq_handle_tx_completion(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry)
 {
-	int ret;
-	struct rxr_peer *peer;
-	struct efa_domain *efa_domain;
-	struct rxr_domain *rxr_domain = rxr_ep_domain(ep);
-
-	efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain,
-				  util_domain.domain_fid);
+	struct rdm_peer *peer;
 
 	if (tx_entry->state == RXR_TX_SEND)
 		dlist_remove(&tx_entry->entry);
 
-	if (efa_is_cache_available(efa_domain) && rxr_ep_mr_local(ep)) {
-		ret = rxr_tx_entry_mr_dereg(tx_entry);
-		if (OFI_UNLIKELY(ret)) {
-			FI_WARN(&rxr_prov, FI_LOG_MR,
-				"In-line memory deregistration failed with error: %s.\n",
-				fi_strerror(-ret));
-		}
-	}
-
 	peer = rxr_ep_get_peer(ep, tx_entry->addr);
+	assert(peer);
 	peer->tx_credits += tx_entry->credit_allocated;
 
 	if (tx_entry->cq_entry.flags & FI_READ) {
 		/*
-		 * this must be on remote side
-		 * see explaination on rxr_cq_handle_rx_completion
+		 * This is the responder side of an emulated read operation.
+		 * In this case, we do not write any completion:
+		 * the TX entry was allocated for the emulated read, so there is no tx completion to write,
+		 * and EFA does not support FI_RMA_EVENT, so there is no rx completion to write.
 		 */
 		struct rxr_rx_entry *rx_entry = NULL;
 
 		rx_entry = ofi_bufpool_get_ibuf(ep->rx_entry_pool, tx_entry->rma_loc_rx_id);
 		assert(rx_entry);
 		assert(rx_entry->state == RXR_RX_WAIT_READ_FINISH);
-
-		if (ep->util_ep.caps & FI_RMA_EVENT) {
-			rx_entry->cq_entry.len = rx_entry->total_len;
-			rx_entry->bytes_copied = rx_entry->total_len;
-			efa_cntr_report_rx_completion(&ep->util_ep, rx_entry->cq_entry.flags);
-		}
-
 		rxr_release_rx_entry(ep, rx_entry);
-		/* just release tx, do not write completion */
-		rxr_release_tx_entry(ep, tx_entry);
 	} else if (tx_entry->cq_entry.flags & FI_WRITE) {
 		if (tx_entry->fi_flags & FI_COMPLETION) {
 			rxr_cq_write_tx_completion(ep, tx_entry);
 		} else {
 			if (!(tx_entry->fi_flags & RXR_NO_COUNTER))
 				efa_cntr_report_tx_completion(&ep->util_ep, tx_entry->cq_entry.flags);
-			rxr_release_tx_entry(ep, tx_entry);
 		}
+
 	} else {
 		assert(tx_entry->cq_entry.flags & FI_SEND);
 		rxr_cq_write_tx_completion(ep, tx_entry);
 	}
+
+	rxr_release_tx_entry(ep, tx_entry);
 }
 
 static int rxr_cq_close(struct fid *fid)
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_domain.c b/deps/libfabric/prov/efa/src/rxr/rxr_domain.c
index 4edf617ff9e6598cacbe8ec5cf077ca8d05978f3..bdb745cc6623c8b0cd176e88b0b372285ce998b0 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_domain.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_domain.c
@@ -97,9 +97,6 @@ int rxr_mr_regattr(struct fid *domain_fid, const struct fi_mr_attr *attr,
 	rxr_domain = container_of(domain_fid, struct rxr_domain,
 				  util_domain.domain_fid.fid);
 
-	if (attr->iface == FI_HMEM_CUDA)
-		flags |= OFI_MR_NOCACHE;
-
 	ret = fi_mr_regattr(rxr_domain->rdm_domain, attr, flags, mr);
 	if (ret) {
 		FI_WARN(&rxr_prov, FI_LOG_MR,
@@ -153,14 +150,10 @@ int rxr_domain_open(struct fid_fabric *fabric, struct fi_info *info,
 	struct fi_info *rdm_info;
 	struct rxr_domain *rxr_domain;
 	struct efa_domain *efa_domain;
-	struct rxr_fabric *rxr_fabric;
-
-	rxr_fabric = container_of(fabric, struct rxr_fabric,
-				  util_fabric.fabric_fid);
+	struct efa_fabric *efa_fabric;
 
 	if (info->ep_attr->type == FI_EP_DGRAM)
-		return fi_domain(rxr_fabric->lower_fabric, info, domain,
-				 context);
+		return efa_domain_open(fabric, info, domain, context);
 
 	rxr_info.addr_format = info->addr_format;
 
@@ -188,18 +181,20 @@ int rxr_domain_open(struct fid_fabric *fabric, struct fi_info *info,
 	if (ret)
 		goto err_free_domain;
 
-	ret = fi_domain(rxr_fabric->lower_fabric, rdm_info,
+	ret = efa_domain_open(fabric, rdm_info,
 			&rxr_domain->rdm_domain, context);
 	if (ret)
 		goto err_free_core_info;
 
 	efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain,
-				  util_domain.domain_fid);
+				  	util_domain.domain_fid);
 
 	/* Open shm provider's access domain */
-	if (rxr_env.enable_shm_transfer) {
+	efa_fabric = container_of(fabric, struct efa_fabric,
+							  util_fabric.fabric_fid);
+	if (efa_fabric->shm_fabric) {
 		assert(!strcmp(shm_info->fabric_attr->name, "shm"));
-		ret = fi_domain(rxr_fabric->shm_fabric, shm_info,
+		ret = fi_domain(efa_fabric->shm_fabric, shm_info,
 				&efa_domain->shm_domain, context);
 		if (ret)
 			goto err_close_core_domain;
@@ -211,8 +206,6 @@ int rxr_domain_open(struct fid_fabric *fabric, struct fi_info *info,
 				info->src_addrlen : info->dest_addrlen;
 	rxr_domain->cq_size = MAX(info->rx_attr->size + info->tx_attr->size,
 				  rxr_env.cq_size);
-	rxr_domain->mr_local = ofi_mr_local(rdm_info);
-	rxr_domain->resource_mgmt = rdm_info->domain_attr->resource_mgmt;
 
 	ret = ofi_domain_init(fabric, info, &rxr_domain->util_domain, context);
 	if (ret)
@@ -235,7 +228,7 @@ int rxr_domain_open(struct fid_fabric *fabric, struct fi_info *info,
 	return 0;
 
 err_close_shm_domain:
-	if (rxr_env.enable_shm_transfer) {
+	if (efa_domain->shm_domain) {
 		retv = fi_close(&efa_domain->shm_domain->fid);
 		if (retv)
 			FI_WARN(&rxr_prov, FI_LOG_DOMAIN,
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_ep.c b/deps/libfabric/prov/efa/src/rxr/rxr_ep.c
index b1cecde6ec2e9b5a11fd6dfa1b928030cf96596e..978576093244f655e75d573160cbfe353fcf6f5a 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_ep.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_ep.c
@@ -37,64 +37,86 @@
 #include "ofi.h"
 #include <ofi_util.h>
 #include <ofi_iov.h>
-
 #include "rxr.h"
 #include "efa.h"
 #include "rxr_msg.h"
 #include "rxr_rma.h"
 #include "rxr_pkt_cmd.h"
+#include "rxr_pkt_type_base.h"
 #include "rxr_read.h"
 #include "rxr_atomic.h"
 
-struct rxr_rx_entry *rxr_ep_rx_entry_init(struct rxr_ep *ep,
-					  struct rxr_rx_entry *rx_entry,
-					  const struct fi_msg *msg,
-					  uint64_t tag,
-					  uint64_t ignore,
-					  uint32_t op,
-					  uint64_t flags)
+struct efa_ep_addr *rxr_ep_raw_addr(struct rxr_ep *ep)
 {
-	rx_entry->type = RXR_RX_ENTRY;
-	rx_entry->rx_id = ofi_buf_index(rx_entry);
-	rx_entry->addr = msg->addr;
-	rx_entry->fi_flags = flags;
-	rx_entry->rxr_flags = 0;
-	rx_entry->bytes_received = 0;
-	rx_entry->bytes_copied = 0;
-	rx_entry->window = 0;
-	rx_entry->iov_count = msg->iov_count;
-	rx_entry->tag = tag;
-	rx_entry->op = op;
-	rx_entry->ignore = ignore;
-	rx_entry->unexp_pkt = NULL;
-	rx_entry->rma_iov_count = 0;
-	dlist_init(&rx_entry->queued_pkts);
+	return (struct efa_ep_addr *)ep->core_addr;
+}
+
+const char *rxr_ep_raw_addr_str(struct rxr_ep *ep, char *buf, size_t *buflen)
+{
+	return ofi_straddr(buf, buflen, FI_ADDR_EFA, rxr_ep_raw_addr(ep));
+}
+
+struct efa_ep_addr *rxr_peer_raw_addr(struct rxr_ep *ep, fi_addr_t addr)
+{
+	struct efa_ep *efa_ep;
+	struct efa_av *efa_av;
+	struct efa_conn *efa_conn;
+
+	efa_ep = container_of(ep->rdm_ep, struct efa_ep, util_ep.ep_fid);
+	efa_av = efa_ep->av;
+	efa_conn = efa_av_addr_to_conn(efa_av, addr);
+	return efa_conn ? efa_conn->ep_addr : NULL;
+}
 
-	memset(&rx_entry->cq_entry, 0, sizeof(rx_entry->cq_entry));
+const char *rxr_peer_raw_addr_str(struct rxr_ep *ep, fi_addr_t addr, char *buf, size_t *buflen)
+{
+	return ofi_straddr(buf, buflen, FI_ADDR_EFA, rxr_peer_raw_addr(ep, addr));
+}
 
-	rx_entry->owner = ep->use_zcpy_rx ? RXR_RX_USER_BUF : RXR_RX_PROV_BUF;
+/**
+ * @brief allocate an rx entry for an operation
+ *
+ * @param ep[in]	endpoint
+ * @param addr[in]	fi address of the sender/requester.
+ * @param op[in]	operation type (ofi_op_msg/ofi_op_tagged/ofi_op_read/ofi_op_write/ofi_op_atomic_xxx)
+ * @return		if allocation succeeded, return pointer to rx_entry
+ * 			if allocation failed, return NULL
+ */
+struct rxr_rx_entry *rxr_ep_alloc_rx_entry(struct rxr_ep *ep, fi_addr_t addr, uint32_t op)
+{
+	struct rxr_rx_entry *rx_entry;
 
-	/* Handle case where we're allocating an unexpected rx_entry */
-	if (msg->msg_iov) {
-		memcpy(rx_entry->iov, msg->msg_iov, sizeof(*rx_entry->iov) * msg->iov_count);
-		rx_entry->cq_entry.len = ofi_total_iov_len(msg->msg_iov, msg->iov_count);
-		rx_entry->cq_entry.buf = msg->msg_iov[0].iov_base;
+	rx_entry = ofi_buf_alloc(ep->rx_entry_pool);
+	if (OFI_UNLIKELY(!rx_entry)) {
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "RX entries exhausted\n");
+		return NULL;
 	}
+	memset(rx_entry, 0, sizeof(struct rxr_rx_entry));
 
-	if (msg->desc)
-		memcpy(&rx_entry->desc[0], msg->desc, sizeof(*msg->desc) * msg->iov_count);
-	else
-		memset(&rx_entry->desc[0], 0, sizeof(rx_entry->desc));
+	dlist_insert_tail(&rx_entry->ep_entry, &ep->rx_entry_list);
+	rx_entry->type = RXR_RX_ENTRY;
+	rx_entry->rx_id = ofi_buf_index(rx_entry);
+	dlist_init(&rx_entry->queued_pkts);
 
-	rx_entry->cq_entry.op_context = msg->context;
-	rx_entry->cq_entry.tag = 0;
-	rx_entry->ignore = ~0;
+	rx_entry->state = RXR_RX_INIT;
+	rx_entry->addr = addr;
+	if (addr != FI_ADDR_UNSPEC) {
+		rx_entry->peer = rxr_ep_get_peer(ep, addr);
+		assert(rx_entry->peer);
+		dlist_insert_tail(&rx_entry->peer_entry, &rx_entry->peer->rx_entry_list);
+	} else {
+		/*
+		 * If msg->addr is not provided, rx_entry->peer will be set
+		 * after it is matched with a message.
+		 */
+		assert(op == ofi_op_msg || op == ofi_op_tagged);
+		rx_entry->peer = NULL;
+	}
 
+	rx_entry->op = op;
 	switch (op) {
 	case ofi_op_tagged:
 		rx_entry->cq_entry.flags = (FI_RECV | FI_MSG | FI_TAGGED);
-		rx_entry->cq_entry.tag = tag;
-		rx_entry->ignore = ignore;
 		break;
 	case ofi_op_msg:
 		rx_entry->cq_entry.flags = (FI_RECV | FI_MSG);
@@ -121,147 +143,88 @@ struct rxr_rx_entry *rxr_ep_rx_entry_init(struct rxr_ep *ep,
 	return rx_entry;
 }
 
-struct rxr_rx_entry *rxr_ep_get_rx_entry(struct rxr_ep *ep,
-					 const struct fi_msg *msg,
-					 uint64_t tag,
-					 uint64_t ignore,
-					 uint32_t op,
-					 uint64_t flags)
-{
-	struct rxr_rx_entry *rx_entry;
-
-	rx_entry = ofi_buf_alloc(ep->rx_entry_pool);
-	if (OFI_UNLIKELY(!rx_entry)) {
-		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "RX entries exhausted\n");
-		return NULL;
-	}
-
-#if ENABLE_DEBUG
-	dlist_insert_tail(&rx_entry->rx_entry_entry, &ep->rx_entry_list);
-#endif
-	rx_entry = rxr_ep_rx_entry_init(ep, rx_entry, msg, tag, ignore, op, flags);
-	rx_entry->state = RXR_RX_INIT;
-	rx_entry->op = op;
-	return rx_entry;
-}
-
-struct rxr_rx_entry *rxr_ep_alloc_unexp_rx_entry_for_msgrtm(struct rxr_ep *ep,
-							    struct rxr_pkt_entry **pkt_entry_ptr)
+/**
+ * @brief post a user-provided receive buffer to the device.
+ *
+ * The user receive buffer is converted to an RX packet entry, then posted to the device.
+ *
+ * @param[in]	ep		endpoint
+ * @param[in]	rx_entry	rx_entry that contains user buffer information
+ * @param[in]	flags		user supplied flags passed to fi_recv
+ */
+int rxr_ep_post_user_recv_buf(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry, uint64_t flags)
 {
-	struct rxr_rx_entry *rx_entry;
-	struct rxr_pkt_entry *unexp_pkt_entry;
+	struct rxr_pkt_entry *pkt_entry;
+	struct efa_mr *mr;
+	struct iovec msg_iov;
 	struct fi_msg msg = {0};
+	int err;
 
-	unexp_pkt_entry = rxr_pkt_get_unexp(ep, pkt_entry_ptr);
-	if (OFI_UNLIKELY(!unexp_pkt_entry)) {
-		FI_WARN(&rxr_prov, FI_LOG_CQ, "packet entries exhausted.\n");
-		return NULL;
-	}
+	assert(rx_entry->iov_count == 1);
+	assert(rx_entry->iov[0].iov_len >= ep->msg_prefix_size);
+	pkt_entry = (struct rxr_pkt_entry *)rx_entry->iov[0].iov_base;
+	assert(pkt_entry);
 
-	msg.addr = unexp_pkt_entry->addr;
-	rx_entry = rxr_ep_get_rx_entry(ep, &msg, 0, ~0, ofi_op_msg, 0);
-	if (OFI_UNLIKELY(!rx_entry)) {
-		FI_WARN(&rxr_prov, FI_LOG_CQ, "RX entries exhausted.\n");
-		return NULL;
-	}
+	/*
+	 * The ownership of the prefix buffer lies with the application, do not
+	 * put it on the dbg list for cleanup during shutdown or poison it. The
+	 * provider loses jurisdiction over it soon after writing the rx
+	 * completion.
+	 */
+	dlist_init(&pkt_entry->entry);
+	mr = (struct efa_mr *)rx_entry->desc[0];
+	pkt_entry->mr = &mr->mr_fid;
+	pkt_entry->alloc_type = RXR_PKT_FROM_USER_BUFFER;
+	pkt_entry->flags = RXR_PKT_ENTRY_IN_USE;
+	pkt_entry->next = NULL;
+	/*
+	 * The actual receiving buffer size (pkt_size) is
+	 *    rx_entry->total_len - sizeof(struct rxr_pkt_entry)
+	 * because the first part of user buffer was used to
+	 * construct pkt_entry. The actual receiving buffer
+	 * posted to device starts from pkt_entry->pkt.
+	 */
+	pkt_entry->pkt_size = rx_entry->iov[0].iov_len - sizeof(struct rxr_pkt_entry);
 
-	rx_entry->rxr_flags = 0;
-	rx_entry->state = RXR_RX_UNEXP;
-	rx_entry->unexp_pkt = unexp_pkt_entry;
-	rxr_pkt_rtm_init_rx_entry(unexp_pkt_entry, rx_entry);
-	dlist_insert_tail(&rx_entry->entry, &ep->rx_unexp_list);
-	return rx_entry;
-}
+	pkt_entry->x_entry = rx_entry;
+	rx_entry->state = RXR_RX_MATCHED;
 
-struct rxr_rx_entry *rxr_ep_alloc_unexp_rx_entry_for_tagrtm(struct rxr_ep *ep,
-							    struct rxr_pkt_entry **pkt_entry_ptr)
-{
-	uint64_t tag;
-	struct rxr_rx_entry *rx_entry;
-	struct rxr_pkt_entry *unexp_pkt_entry;
-	struct fi_msg msg = {0};
+	msg_iov.iov_base = pkt_entry->pkt;
+	msg_iov.iov_len = pkt_entry->pkt_size;
+	assert(msg_iov.iov_len <= ep->mtu_size);
 
-	unexp_pkt_entry = rxr_pkt_get_unexp(ep, pkt_entry_ptr);
-	if (OFI_UNLIKELY(!unexp_pkt_entry)) {
-		FI_WARN(&rxr_prov, FI_LOG_CQ, "packet entries exhausted.\n");
-		return NULL;
-	}
+	msg.iov_count = 1;
+	msg.msg_iov = &msg_iov;
+	msg.desc = rx_entry->desc;
+	msg.addr = FI_ADDR_UNSPEC;
+	msg.context = pkt_entry;
+	msg.data = 0;
 
-	tag = rxr_pkt_rtm_tag(unexp_pkt_entry);
-	msg.addr = unexp_pkt_entry->addr;
-	rx_entry = rxr_ep_get_rx_entry(ep, &msg, tag, ~0, ofi_op_tagged, 0);
-	if (OFI_UNLIKELY(!rx_entry)) {
-		FI_WARN(&rxr_prov, FI_LOG_CQ, "RX entries exhausted.\n");
-		return NULL;
+	err = fi_recvmsg(ep->rdm_ep, &msg, flags);
+	if (OFI_UNLIKELY(err)) {
+		rxr_pkt_entry_release_rx(ep, pkt_entry);
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+			"failed to post user supplied buffer %d (%s)\n", -err,
+			fi_strerror(-err));
+		return err;
 	}
 
-	rx_entry->rxr_flags = 0;
-	rx_entry->state = RXR_RX_UNEXP;
-	rx_entry->unexp_pkt = unexp_pkt_entry;
-	rxr_pkt_rtm_init_rx_entry(unexp_pkt_entry, rx_entry);
-	dlist_insert_tail(&rx_entry->entry, &ep->rx_unexp_tagged_list);
-	return rx_entry;
-}
-
-struct rxr_rx_entry *rxr_ep_split_rx_entry(struct rxr_ep *ep,
-					   struct rxr_rx_entry *posted_entry,
-					   struct rxr_rx_entry *consumer_entry,
-					   struct rxr_pkt_entry *pkt_entry)
-{
-	struct rxr_rx_entry *rx_entry;
-	size_t buf_len, consumed_len, data_len;
-	uint64_t tag;
-	struct fi_msg msg = {0};
-
-	assert(rxr_get_base_hdr(pkt_entry->pkt)->type >= RXR_REQ_PKT_BEGIN);
-	tag = 0;
-
-	if (!consumer_entry) {
-		msg.msg_iov = posted_entry->iov;
-		msg.iov_count = posted_entry->iov_count;
-		msg.addr = pkt_entry->addr;
-		rx_entry = rxr_ep_get_rx_entry(ep, &msg, tag, 0, ofi_op_msg,
-					       posted_entry->fi_flags);
-		if (OFI_UNLIKELY(!rx_entry))
-			return NULL;
-
-		FI_DBG(&rxr_prov, FI_LOG_EP_CTRL,
-		       "Splitting into new multi_recv consumer rx_entry %d from rx_entry %d\n",
-		       rx_entry->rx_id,
-		       posted_entry->rx_id);
-	} else {
-		rx_entry = consumer_entry;
-		memcpy(rx_entry->iov, posted_entry->iov,
-		       sizeof(*posted_entry->iov) * posted_entry->iov_count);
-		rx_entry->iov_count = posted_entry->iov_count;
-	}
-
-	rxr_pkt_rtm_init_rx_entry(pkt_entry, rx_entry);
-	data_len = rx_entry->total_len;
-	buf_len = ofi_total_iov_len(rx_entry->iov,
-				    rx_entry->iov_count);
-	consumed_len = MIN(buf_len, data_len);
-
-	rx_entry->rxr_flags |= RXR_MULTI_RECV_CONSUMER;
-	rx_entry->total_len = data_len;
-	rx_entry->fi_flags |= FI_MULTI_RECV;
-	rx_entry->master_entry = posted_entry;
-	rx_entry->cq_entry.len = consumed_len;
-	rx_entry->cq_entry.buf = rx_entry->iov[0].iov_base;
-	rx_entry->cq_entry.op_context = posted_entry->cq_entry.op_context;
-	rx_entry->cq_entry.flags = (FI_RECV | FI_MSG);
-
-	ofi_consume_iov(posted_entry->iov, &posted_entry->iov_count,
-			consumed_len);
-
-	dlist_init(&rx_entry->multi_recv_entry);
-	dlist_insert_tail(&rx_entry->multi_recv_entry,
-			  &posted_entry->multi_recv_consumers);
-	return rx_entry;
+	ep->efa_rx_pkts_posted++;
+	return 0;
 }
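/*
 * Editor's sketch (not part of the patch): with zero-copy receive the
 * endpoint operates in FI_MSG_PREFIX mode, so the application reserves
 * the advertised prefix at the front of every buffer it posts:
 *
 *     size_t prefix = info->ep_attr->msg_prefix_size;
 *     char *buf = malloc(prefix + payload_len);
 *
 *     fi_recv(ep, buf, prefix + payload_len, desc, FI_ADDR_UNSPEC, ctx);
 *
 * The provider overlays struct rxr_pkt_entry on that prefix, as the
 * function above shows.
 */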
 
-/* Post buffers as undirected recv (FI_ADDR_UNSPEC) */
-int rxr_ep_post_buf(struct rxr_ep *ep, const struct fi_msg *posted_recv, uint64_t flags, enum rxr_lower_ep_type lower_ep_type)
+/**
+ * @brief post an internal receive buffer to lower endpoint
+ *
+ * The buffer is posted as an undirected recv (address set to FI_ADDR_UNSPEC).
+ *
+ * @param[in]	ep		endpoint
+ * @param[in]	flags		flags passed to lower provider, can have FI_MORE
+ * @param[in]	lower_ep_type	lower endpoint type, can be either SHM_EP or EFA_EP
+ * @return	On success, return 0
+ * 		On failure, return a negative error code.
+ */
+int rxr_ep_post_internal_rx_pkt(struct rxr_ep *ep, uint64_t flags, enum rxr_lower_ep_type lower_ep_type)
 {
 	struct fi_msg msg = {0};
 	struct iovec msg_iov;
@@ -271,15 +234,15 @@ int rxr_ep_post_buf(struct rxr_ep *ep, const struct fi_msg *posted_recv, uint64_
 
 	switch (lower_ep_type) {
 	case SHM_EP:
-		rx_pkt_entry = rxr_pkt_entry_alloc(ep, ep->rx_pkt_shm_pool);
+		rx_pkt_entry = rxr_pkt_entry_alloc(ep, ep->shm_rx_pkt_pool, RXR_PKT_FROM_SHM_RX_POOL);
 		break;
 	case EFA_EP:
-		if (posted_recv)
-			rx_pkt_entry = rxr_pkt_entry_init_prefix(ep, posted_recv, ep->rx_pkt_efa_pool);
-		else
-			rx_pkt_entry = rxr_pkt_entry_alloc(ep, ep->rx_pkt_efa_pool);
+		rx_pkt_entry = rxr_pkt_entry_alloc(ep, ep->efa_rx_pkt_pool, RXR_PKT_FROM_EFA_RX_POOL);
 		break;
 	default:
+		/* Coverity will complain about this being a dead code segment,
+		 * but it is useful for future proofing.
+		 */
 		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
 			"invalid lower EP type %d\n", lower_ep_type);
 		assert(0 && "invalid lower EP type\n");
@@ -313,26 +276,15 @@ int rxr_ep_post_buf(struct rxr_ep *ep, const struct fi_msg *posted_recv, uint64_
 				fi_strerror(-ret));
 			return ret;
 		}
-		ep->posted_bufs_shm++;
+		ep->shm_rx_pkts_posted++;
 		break;
 	case EFA_EP:
 #if ENABLE_DEBUG
-		if (rx_pkt_entry->type != RXR_PKT_ENTRY_USER)
-			dlist_insert_tail(&rx_pkt_entry->dbg_entry,
-					  &ep->rx_posted_buf_list);
+		dlist_insert_tail(&rx_pkt_entry->dbg_entry,
+				  &ep->rx_posted_buf_list);
 #endif
-		desc = rxr_ep_mr_local(ep) ? fi_mr_desc(rx_pkt_entry->mr) : NULL;
+		desc = fi_mr_desc(rx_pkt_entry->mr);
 		msg.desc = &desc;
-		/*
-		 * Use the actual receive sizes from the application
-		 * rather than posting the full MTU size, like we do
-		 * when using the bufpool.
-		 */
-		if (posted_recv) {
-			msg_iov.iov_len = posted_recv->msg_iov->iov_len;
-			msg.data = posted_recv->data;
-			assert(msg_iov.iov_len <= ep->mtu_size);
-		}
 		ret = fi_recvmsg(ep->rdm_ep, &msg, flags);
 		if (OFI_UNLIKELY(ret)) {
 			rxr_pkt_entry_release_rx(ep, rx_pkt_entry);
@@ -341,7 +293,7 @@ int rxr_ep_post_buf(struct rxr_ep *ep, const struct fi_msg *posted_recv, uint64_
 				fi_strerror(-ret));
 			return ret;
 		}
-		ep->posted_bufs_efa++;
+		ep->efa_rx_pkts_posted++;
 		break;
 	default:
 		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
@@ -352,6 +304,39 @@ int rxr_ep_post_buf(struct rxr_ep *ep, const struct fi_msg *posted_recv, uint64_
 	return 0;
 }
 
+/**
+ * @brief bulk post internal receive buffer(s) to device
+ *
+ * When posting multiple buffers, this function will use
+ * FI_MORE flag to achieve better performance.
+ *
+ * @param[in]	ep		endpoint
+ * @param[in]	nrecv		number of receive buffers to post
+ * @param[in]	lower_ep_type	device type, can be SHM_EP or EFA_EP
+ * @return	On success, return 0
+ * 		On failure, return negative libfabric error code
+ */
+static inline
+ssize_t rxr_ep_bulk_post_internal_rx_pkts(struct rxr_ep *ep, int nrecv,
+					  enum rxr_lower_ep_type lower_ep_type)
+{
+	int i;
+	ssize_t err;
+	uint64_t flags;
+
+	flags = FI_MORE;
+	for (i = 0; i < nrecv; ++i) {
+		if (i == nrecv - 1)
+			flags = 0;
+
+		err = rxr_ep_post_internal_rx_pkt(ep, flags, lower_ep_type);
+		if (OFI_UNLIKELY(err))
+			return err;
+	}
+
+	return 0;
+}
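/*
 * Editor's note: FI_MORE hints to the lower provider that more postings
 * follow immediately, so it may batch work until a call without the flag
 * flushes it. The generic shape of the pattern (sketch):
 *
 *     for (i = 0; i < n; i++)
 *             fi_recvmsg(ep, &msg, (i == n - 1) ? 0 : FI_MORE);
 */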
+
 void rxr_tx_entry_init(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry,
 		       const struct fi_msg *msg, uint32_t op, uint64_t flags)
 {
@@ -362,12 +347,14 @@ void rxr_tx_entry_init(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry,
 	tx_entry->tx_id = ofi_buf_index(tx_entry);
 	tx_entry->state = RXR_TX_REQ;
 	tx_entry->addr = msg->addr;
+	tx_entry->peer = rxr_ep_get_peer(ep, tx_entry->addr);
+	assert(tx_entry->peer);
+	dlist_insert_tail(&tx_entry->peer_entry, &tx_entry->peer->tx_entry_list);
 
-	tx_entry->send_flags = 0;
+	tx_entry->rxr_flags = 0;
 	tx_entry->bytes_acked = 0;
 	tx_entry->bytes_sent = 0;
 	tx_entry->window = 0;
-	tx_entry->total_len = ofi_total_iov_len(msg->msg_iov, msg->iov_count);
 	tx_entry->iov_count = msg->iov_count;
 	tx_entry->iov_index = 0;
 	tx_entry->iov_mr_start = 0;
@@ -382,17 +369,14 @@ void rxr_tx_entry_init(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry,
 	else
 		memset(tx_entry->desc, 0, sizeof(tx_entry->desc));
 
-	/*
-	 * The prefix is currently not used by the sender, but needs to be
-	 * accounted for when copying the payload into the bounce-buffer.
-	 */
-	if (ep->use_zcpy_rx) {
-		assert(tx_entry->iov[0].iov_len >= sizeof(struct rxr_pkt_entry) + sizeof(struct rxr_eager_msgrtm_hdr));
-		tx_entry->iov[0].iov_base = (char *)tx_entry->iov[0].iov_base
-					     + sizeof(struct rxr_pkt_entry)
-					     + sizeof(struct rxr_eager_msgrtm_hdr);
+	if (ep->msg_prefix_size > 0) {
+		assert(tx_entry->iov[0].iov_len >= ep->msg_prefix_size);
+		tx_entry->iov[0].iov_base = (char *)tx_entry->iov[0].iov_base + ep->msg_prefix_size;
+		tx_entry->iov[0].iov_len -= ep->msg_prefix_size;
 	}
 
+	tx_entry->total_len = ofi_total_iov_len(tx_entry->iov, tx_entry->iov_count);
+
 	/* set flags */
 	assert(ep->util_ep.tx_msg_flags == 0 ||
 	       ep->util_ep.tx_msg_flags == FI_COMPLETION);
@@ -457,12 +441,53 @@ struct rxr_tx_entry *rxr_ep_alloc_tx_entry(struct rxr_ep *rxr_ep,
 		tx_entry->tag = tag;
 	}
 
-#if ENABLE_DEBUG
-	dlist_insert_tail(&tx_entry->tx_entry_entry, &rxr_ep->tx_entry_list);
-#endif
+	dlist_insert_tail(&tx_entry->ep_entry, &rxr_ep->tx_entry_list);
 	return tx_entry;
 }
 
+void rxr_release_tx_entry(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry)
+{
+	int i, err = 0;
+	struct dlist_entry *tmp;
+	struct rxr_pkt_entry *pkt_entry;
+
+	assert(tx_entry->peer);
+	dlist_remove(&tx_entry->peer_entry);
+
+	for (i = 0; i < tx_entry->iov_count; i++) {
+		if (tx_entry->mr[i]) {
+			err = fi_close((struct fid *)tx_entry->mr[i]);
+			if (OFI_UNLIKELY(err)) {
+				FI_WARN(&rxr_prov, FI_LOG_CQ, "mr dereg failed. err=%d\n", err);
+				efa_eq_write_error(&ep->util_ep, err, -err);
+			}
+
+			tx_entry->mr[i] = NULL;
+		}
+	}
+
+	dlist_remove(&tx_entry->ep_entry);
+
+	dlist_foreach_container_safe(&tx_entry->queued_pkts,
+				     struct rxr_pkt_entry,
+				     pkt_entry, entry, tmp) {
+		rxr_pkt_entry_release_tx(ep, pkt_entry);
+	}
+
+	if (tx_entry->rxr_flags & RXR_TX_ENTRY_QUEUED_RNR)
+		dlist_remove(&tx_entry->queued_rnr_entry);
+
+	if (tx_entry->state == RXR_TX_QUEUED_CTRL)
+		dlist_remove(&tx_entry->queued_ctrl_entry);
+
+#ifdef ENABLE_EFA_POISONING
+	rxr_poison_mem_region((uint32_t *)tx_entry,
+			      sizeof(struct rxr_tx_entry));
+#endif
+	tx_entry->state = RXR_TX_FREE;
+	ofi_buf_free(tx_entry);
+}
+
 int rxr_ep_tx_init_mr_desc(struct rxr_domain *rxr_domain,
 			   struct rxr_tx_entry *tx_entry,
 			   int mr_iov_start, uint64_t access)
@@ -502,6 +527,30 @@ int rxr_ep_tx_init_mr_desc(struct rxr_domain *rxr_domain,
 	return ret;
 }
 
+/**
+ * @brief convert EFA descriptors to shm descriptors.
+ *
+ * Each provider defines its own descriptor format. The descriptor for
+ * the EFA provider is a struct efa_mr *, which the shm provider cannot
+ * understand. This function converts EFA descriptors to descriptors
+ * shm can use.
+ *
+ * @param numdesc[in]       number of descriptors in the array
+ * @param desc[in,out]      descriptor array: EFA descriptors on input,
+ *                          shm descriptors on output.
+ */
+void rxr_convert_desc_for_shm(int numdesc, void **desc)
+{
+	int i;
+	struct efa_mr *efa_mr;
+
+	for (i = 0; i < numdesc; ++i) {
+		efa_mr = desc[i];
+		if (efa_mr)
+			desc[i] = fi_mr_desc(efa_mr->shm_mr);
+	}
+}
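/*
 * Editor's sketch (not part of the patch): a hypothetical call site would
 * convert a tx entry's descriptors in place before handing the operation
 * to the shm endpoint:
 *
 *     void *desc[RXR_IOV_LIMIT];
 *
 *     memcpy(desc, tx_entry->desc, tx_entry->iov_count * sizeof(void *));
 *     rxr_convert_desc_for_shm(tx_entry->iov_count, desc);
 *     // desc[] now holds descriptors the shm provider understands
 */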
+
 void rxr_prepare_desc_send(struct rxr_domain *rxr_domain,
 			   struct rxr_tx_entry *tx_entry)
 {
@@ -531,26 +580,19 @@ void rxr_prepare_desc_send(struct rxr_domain *rxr_domain,
 /* Generic send */
 int rxr_ep_set_tx_credit_request(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry)
 {
-	struct rxr_peer *peer;
-	int pending;
+	struct rdm_peer *peer;
+	int outstanding;
 
 	peer = rxr_ep_get_peer(rxr_ep, tx_entry->addr);
 	assert(peer);
-	/*
-	 * Init tx state for this peer. The rx state and reorder buffers will be
-	 * initialized on the first recv so as to not allocate resources unless
-	 * necessary.
-	 */
-	if (!peer->tx_init)
-		rxr_ep_peer_init_tx(peer);
 
 	/*
 	 * Divy up available credits to outstanding transfers and request the
 	 * minimum of that and the amount required to finish the current long
 	 * message.
 	 */
-	pending = peer->tx_pending + 1;
-	tx_entry->credit_request = MIN(ofi_div_ceil(peer->tx_credits, pending),
+	outstanding = peer->efa_outstanding_tx_ops + 1;
+	tx_entry->credit_request = MIN(ofi_div_ceil(peer->tx_credits, outstanding),
 				       ofi_div_ceil(tx_entry->total_len,
 						    rxr_ep->max_data_payload_size));
 	tx_entry->credit_request = MAX(tx_entry->credit_request,
@@ -567,104 +609,114 @@ int rxr_ep_set_tx_credit_request(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_
 
 static void rxr_ep_free_res(struct rxr_ep *rxr_ep)
 {
-	size_t i = 0;
-	struct rxr_peer *peer;
-#if ENABLE_DEBUG
-	struct dlist_entry *tmp;
-	struct dlist_entry *entry;
+	struct dlist_entry *entry, *tmp;
 	struct rxr_rx_entry *rx_entry;
 	struct rxr_tx_entry *tx_entry;
+#if ENABLE_DEBUG
 	struct rxr_pkt_entry *pkt;
 #endif
 
-	if (rxr_need_sas_ordering(rxr_ep)) {
-		for (i = 0; i < rxr_ep->util_ep.av->count; ++i) {
-			peer = rxr_ep_get_peer(rxr_ep, i);
-			if (peer->rx_init)
-				efa_free_robuf(peer);
-		}
-		if (rxr_ep->robuf_pool)
-			ofi_bufpool_destroy(rxr_ep->robuf_pool);
-	}
-
-#if ENABLE_DEBUG
-	for (i = 0; i < rxr_ep->util_ep.av->count; ++i) {
-		peer = rxr_ep_get_peer(rxr_ep, i);
-		/*
-		 * TODO: Add support for wait/signal until all pending messages
-		 * have been sent/received so the core does not attempt to
-		 * complete a data operation or an internal RxR transfer after
-		 * the EP is shutdown.
-		 */
-		if ((peer->flags & RXR_PEER_REQ_SENT) && !(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED))
-			FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "Closing EP with unacked CONNREQs in flight\n");
-	}
-
-	dlist_foreach(&rxr_ep->rx_unexp_list, entry) {
+	dlist_foreach_safe(&rxr_ep->rx_unexp_list, entry, tmp) {
 		rx_entry = container_of(entry, struct rxr_rx_entry, entry);
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+			"Closing ep with unmatched unexpected rx_entry: %p pkt_entry %p\n",
+			rx_entry, rx_entry->unexp_pkt);
 		rxr_pkt_entry_release_rx(rxr_ep, rx_entry->unexp_pkt);
+		rxr_release_rx_entry(rxr_ep, rx_entry);
 	}
 
-	dlist_foreach(&rxr_ep->rx_unexp_tagged_list, entry) {
+	dlist_foreach_safe(&rxr_ep->rx_unexp_tagged_list, entry, tmp) {
 		rx_entry = container_of(entry, struct rxr_rx_entry, entry);
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+			"Closing ep with unmatched unexpected tagged rx_entry: %p pkt_entry %p\n",
+			rx_entry, rx_entry->unexp_pkt);
 		rxr_pkt_entry_release_rx(rxr_ep, rx_entry->unexp_pkt);
+		rxr_release_rx_entry(rxr_ep, rx_entry);
+	}
+
+	dlist_foreach_safe(&rxr_ep->rx_entry_queued_rnr_list, entry, tmp) {
+		rx_entry = container_of(entry, struct rxr_rx_entry,
+					queued_rnr_entry);
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+			"Closing ep with queued rnr rx_entry: %p\n",
+			rx_entry);
+		rxr_release_rx_entry(rxr_ep, rx_entry);
 	}
 
-	dlist_foreach(&rxr_ep->rx_entry_queued_list, entry) {
+	dlist_foreach_safe(&rxr_ep->rx_entry_queued_ctrl_list, entry, tmp) {
 		rx_entry = container_of(entry, struct rxr_rx_entry,
-					queued_entry);
-		dlist_foreach_container_safe(&rx_entry->queued_pkts,
-					     struct rxr_pkt_entry,
-					     pkt, entry, tmp)
-			rxr_pkt_entry_release_tx(rxr_ep, pkt);
+					queued_ctrl_entry);
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+			"Closing ep with queued ctrl rx_entry: %p\n",
+			rx_entry);
+		rxr_release_rx_entry(rxr_ep, rx_entry);
 	}
 
-	dlist_foreach(&rxr_ep->tx_entry_queued_list, entry) {
+	dlist_foreach_safe(&rxr_ep->tx_entry_queued_rnr_list, entry, tmp) {
 		tx_entry = container_of(entry, struct rxr_tx_entry,
-					queued_entry);
-		dlist_foreach_container_safe(&tx_entry->queued_pkts,
-					     struct rxr_pkt_entry,
-					     pkt, entry, tmp)
-			rxr_pkt_entry_release_tx(rxr_ep, pkt);
+					queued_rnr_entry);
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+			"Closing ep with queued rnr tx_entry: %p\n",
+			tx_entry);
+		rxr_release_tx_entry(rxr_ep, tx_entry);
 	}
 
-	if (!rxr_ep->use_zcpy_rx) {
-		/*
-		 * The provider does not own these entries, and there's no need
-		 * to deep-free them even in a debug build.
-		 */
-		dlist_foreach_safe(&rxr_ep->rx_pkt_list, entry, tmp) {
-			pkt = container_of(entry, struct rxr_pkt_entry, dbg_entry);
-			rxr_pkt_entry_release_rx(rxr_ep, pkt);
-		}
-		dlist_foreach_safe(&rxr_ep->rx_posted_buf_list, entry, tmp) {
+	dlist_foreach_safe(&rxr_ep->tx_entry_queued_ctrl_list, entry, tmp) {
+		tx_entry = container_of(entry, struct rxr_tx_entry,
+					queued_ctrl_entry);
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+			"Closing ep with queued ctrl tx_entry: %p\n",
+			tx_entry);
+		rxr_release_tx_entry(rxr_ep, tx_entry);
+	}
+
+#if ENABLE_DEBUG
+	dlist_foreach_safe(&rxr_ep->rx_posted_buf_list, entry, tmp) {
+		pkt = container_of(entry, struct rxr_pkt_entry, dbg_entry);
+		ofi_buf_free(pkt);
+	}
+
+	if (rxr_ep->use_shm) {
+		dlist_foreach_safe(&rxr_ep->rx_posted_buf_shm_list, entry, tmp) {
 			pkt = container_of(entry, struct rxr_pkt_entry, dbg_entry);
 			ofi_buf_free(pkt);
 		}
 	}
 
+	dlist_foreach_safe(&rxr_ep->rx_pkt_list, entry, tmp) {
+		pkt = container_of(entry, struct rxr_pkt_entry, dbg_entry);
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+			"Closing ep with unreleased RX pkt_entry: %p\n",
+			pkt);
+		rxr_pkt_entry_release_rx(rxr_ep, pkt);
+	}
+
 	dlist_foreach_safe(&rxr_ep->tx_pkt_list, entry, tmp) {
 		pkt = container_of(entry, struct rxr_pkt_entry, dbg_entry);
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+			"Closing ep with unreleased TX pkt_entry: %p\n",
+			pkt);
 		rxr_pkt_entry_release_tx(rxr_ep, pkt);
 	}
+#endif
 
 	dlist_foreach_safe(&rxr_ep->rx_entry_list, entry, tmp) {
 		rx_entry = container_of(entry, struct rxr_rx_entry,
-					rx_entry_entry);
+					ep_entry);
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+			"Closing ep with unreleased rx_entry: %p\n",
+			rx_entry);
 		rxr_release_rx_entry(rxr_ep, rx_entry);
 	}
+
 	dlist_foreach_safe(&rxr_ep->tx_entry_list, entry, tmp) {
 		tx_entry = container_of(entry, struct rxr_tx_entry,
-					tx_entry_entry);
+					ep_entry);
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+			"Closing ep with unreleased tx_entry: %p\n",
+			tx_entry);
 		rxr_release_tx_entry(rxr_ep, tx_entry);
 	}
-	if (rxr_ep->use_shm) {
-		dlist_foreach_safe(&rxr_ep->rx_posted_buf_shm_list, entry, tmp) {
-			pkt = container_of(entry, struct rxr_pkt_entry, dbg_entry);
-			ofi_buf_free(pkt);
-		}
-	}
-#endif
 
 	if (rxr_ep->rx_entry_pool)
 		ofi_bufpool_destroy(rxr_ep->rx_entry_pool);
@@ -696,19 +748,63 @@ static void rxr_ep_free_res(struct rxr_ep *rxr_ep)
 	if (rxr_ep->rx_unexp_pkt_pool)
 		ofi_bufpool_destroy(rxr_ep->rx_unexp_pkt_pool);
 
-	if (rxr_ep->rx_pkt_efa_pool)
-		ofi_bufpool_destroy(rxr_ep->rx_pkt_efa_pool);
+	if (rxr_ep->efa_rx_pkt_pool)
+		ofi_bufpool_destroy(rxr_ep->efa_rx_pkt_pool);
 
-	if (rxr_ep->tx_pkt_efa_pool)
-		ofi_bufpool_destroy(rxr_ep->tx_pkt_efa_pool);
+	if (rxr_ep->efa_tx_pkt_pool)
+		ofi_bufpool_destroy(rxr_ep->efa_tx_pkt_pool);
+
+	if (rxr_ep->pkt_sendv_pool)
+		ofi_bufpool_destroy(rxr_ep->pkt_sendv_pool);
 
 	if (rxr_ep->use_shm) {
-		if (rxr_ep->rx_pkt_shm_pool)
-			ofi_bufpool_destroy(rxr_ep->rx_pkt_shm_pool);
+		if (rxr_ep->shm_rx_pkt_pool)
+			ofi_bufpool_destroy(rxr_ep->shm_rx_pkt_pool);
+
+		if (rxr_ep->shm_tx_pkt_pool)
+			ofi_bufpool_destroy(rxr_ep->shm_tx_pkt_pool);
+	}
+}
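/*
 * Editor's note: the cleanup loops above use dlist_foreach_safe rather
 * than dlist_foreach because each iteration releases the current entry;
 * the _safe variant caches the next pointer before the body runs.
 * Generic shape of the pattern (sketch):
 *
 *     dlist_foreach_safe(&head, entry, tmp) {
 *             obj = container_of(entry, struct obj_type, list_entry);
 *             release(obj);   // safe: tmp already holds entry->next
 *     }
 */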
 
-		if (rxr_ep->tx_pkt_shm_pool)
-			ofi_bufpool_destroy(rxr_ep->tx_pkt_shm_pool);
+/*
+ * @brief determine whether an endpoint has unfinished sends
+ *
+ * Unfinished sends include queued ctrl packets, queued
+ * RNR packets, and in-flight TX packets.
+ *
+ * @param[in]	rxr_ep	endpoint
+ * @return	a boolean
+ */
+static
+bool rxr_ep_has_unfinished_send(struct rxr_ep *rxr_ep)
+{
+	return !dlist_empty(&rxr_ep->rx_entry_queued_rnr_list) ||
+	       !dlist_empty(&rxr_ep->rx_entry_queued_ctrl_list) ||
+	       !dlist_empty(&rxr_ep->tx_entry_queued_rnr_list) ||
+	       !dlist_empty(&rxr_ep->tx_entry_queued_ctrl_list) ||
+	       (rxr_ep->efa_outstanding_tx_ops > 0) ||
+	       (rxr_ep->shm_outstanding_tx_ops > 0);
+}
+
+/*
+ * @brief wait for sends to finish
+ *
+ * Wait for queued packets to be sent and in-flight sends to
+ * complete.
+ *
+ * @param[in]	rxr_ep		endpoint
+ * @return 	no return
+ */
+static inline
+void rxr_ep_wait_send(struct rxr_ep *rxr_ep)
+{
+	fastlock_acquire(&rxr_ep->util_ep.lock);
+
+	while (rxr_ep_has_unfinished_send(rxr_ep)) {
+		rxr_ep_progress_internal(rxr_ep);
 	}
+
+	fastlock_release(&rxr_ep->util_ep.lock);
 }
 
 static int rxr_ep_close(struct fid *fid)
@@ -718,6 +814,8 @@ static int rxr_ep_close(struct fid *fid)
 
 	rxr_ep = container_of(fid, struct rxr_ep, util_ep.ep_fid.fid);
 
+	rxr_ep_wait_send(rxr_ep);
+
 	ret = fi_close(&rxr_ep->rdm_ep->fid);
 	if (ret) {
 		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "Unable to close EP\n");
@@ -752,7 +850,6 @@ static int rxr_ep_close(struct fid *fid)
 		retv = ret;
 	}
 	rxr_ep_free_res(rxr_ep);
-	free(rxr_ep->peer);
 	free(rxr_ep);
 	return retv;
 }
@@ -765,16 +862,21 @@ static int rxr_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags)
 	struct efa_av *av;
 	struct util_cntr *cntr;
 	struct util_eq *eq;
-	struct dlist_entry *ep_list_first_entry;
-	struct util_ep *util_ep;
-	struct rxr_ep *rxr_first_ep;
-	struct rxr_peer *first_ep_peer, *peer;
 	int ret = 0;
-	size_t i;
 
 	switch (bfid->fclass) {
 	case FI_CLASS_AV:
 		av = container_of(bfid, struct efa_av, util_av.av_fid.fid);
+		/*
+		 * Binding multiple endpoints to a single AV is currently not
+		 * supported.
+		 */
+		if (av->ep) {
+			EFA_WARN(FI_LOG_EP_CTRL,
+				 "Address vector already has endpoint bound to it.\n");
+			return -FI_ENOSYS;
+		}
+
 		/* Bind util provider endpoint and av */
 		ret = ofi_ep_bind_av(&rxr_ep->util_ep, &av->util_av);
 		if (ret)
@@ -784,48 +886,11 @@ static int rxr_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags)
 		if (ret)
 			return ret;
 
-		rxr_ep->peer = calloc(av->util_av.count,
-				      sizeof(struct rxr_peer));
-		if (!rxr_ep->peer)
-			return -FI_ENOMEM;
-
-		if (rxr_need_sas_ordering(rxr_ep)) {
-			ret = ofi_bufpool_create(&rxr_ep->robuf_pool,
-						 sizeof(struct rxr_robuf), 16,
-						 0, 0, 0);
-			if (ret)
-				return ret;
-		}
-
 		/* Bind shm provider endpoint & shm av */
 		if (rxr_ep->use_shm) {
 			ret = fi_ep_bind(rxr_ep->shm_ep, &av->shm_rdm_av->fid, flags);
 			if (ret)
 				return ret;
-
-			/*
-			 * We always update the new added EP's local information with the first
-			 * bound EP. The if (ep_list_first_entry->next) check here is to skip the
-			 * update for the first bound EP.
-			 */
-			ep_list_first_entry = av->util_av.ep_list.next;
-			if (ep_list_first_entry->next) {
-				util_ep = container_of(ep_list_first_entry, struct util_ep, av_entry);
-				rxr_first_ep = container_of(util_ep, struct rxr_ep, util_ep);
-
-				/*
-				 * Copy the entire peer array, because we may not be able to make the
-				 * assumption that insertions are always indexed in order in the future.
-				 */
-				for (i = 0; i < av->util_av.count; i++) {
-					first_ep_peer = rxr_ep_get_peer(rxr_first_ep, i);
-					if (first_ep_peer->is_local) {
-						peer = rxr_ep_get_peer(rxr_ep, i);
-						peer->shm_fiaddr = first_ep_peer->shm_fiaddr;
-						peer->is_local = 1;
-					}
-				}
-			}
 		}
 		break;
 	case FI_CLASS_CQ:
@@ -858,56 +923,47 @@ static int rxr_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags)
 }
 
 static
-void rxr_ep_set_features(struct rxr_ep *ep)
+void rxr_ep_set_extra_info(struct rxr_ep *ep)
 {
-	memset(ep->features, 0, sizeof(ep->features));
+	memset(ep->extra_info, 0, sizeof(ep->extra_info));
 
 	/* RDMA read is an extra feature defined in protocol version 4 (the base version) */
 	if (efa_ep_support_rdma_read(ep->rdm_ep))
-		ep->features[0] |= RXR_REQ_FEATURE_RDMA_READ;
+		ep->extra_info[0] |= RXR_EXTRA_FEATURE_RDMA_READ;
+
+	ep->extra_info[0] |= RXR_EXTRA_FEATURE_DELIVERY_COMPLETE;
+
+	if (ep->use_zcpy_rx) {
+		/*
+		 * zero copy receive requires that the packet header length remain
+		 * constant, so the application receive buffer can be matched with
+		 * the incoming application data.
+		 */
+		ep->extra_info[0] |= RXR_EXTRA_REQUEST_CONSTANT_HEADER_LENGTH;
+	}
+
+	ep->extra_info[0] |= RXR_EXTRA_REQUEST_CONNID_HEADER;
 }
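/*
 * Editor's sketch (not part of the patch): extra_info[0] is exchanged in
 * the handshake, so a sender can test the peer's advertised capabilities
 * before picking a protocol (peer field name hypothetical):
 *
 *     if (peer->extra_info[0] & RXR_EXTRA_FEATURE_RDMA_READ)
 *             // the peer supports the RDMA-read based long message path
 */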
 
 static int rxr_ep_ctrl(struct fid *fid, int command, void *arg)
 {
 	ssize_t ret;
-	size_t i;
 	struct rxr_ep *ep;
-	uint64_t flags = FI_MORE;
-	size_t rx_size, shm_rx_size;
-	char shm_ep_name[NAME_MAX];
+	char shm_ep_name[EFA_SHM_NAME_MAX];
+	size_t shm_ep_name_len;
 
 	switch (command) {
 	case FI_ENABLE:
 		/* Enable core endpoints & post recv buff */
 		ep = container_of(fid, struct rxr_ep, util_ep.ep_fid.fid);
 
-		/*
-		 * If the endpoint is configured for zero-copy receives, the
-		 * provider will use the application's undirected receives for
-		 * its internal control packets as well. The onus will be on the
-		 * application to ensure the receive queue is hydrated to avoid
-		 * RNRs.
-		 */
-		rx_size = ep->use_zcpy_rx ? rxr_env.zcpy_rx_seed : rxr_get_rx_pool_chunk_cnt(ep);
 		ret = fi_enable(ep->rdm_ep);
 		if (ret)
 			return ret;
 
 		fastlock_acquire(&ep->util_ep.lock);
 
-		rxr_ep_set_features(ep);
-
-		for (i = 0; i < rx_size; i++) {
-			if (i == rx_size - 1)
-				flags = 0;
-
-			ret = rxr_ep_post_buf(ep, NULL, flags, EFA_EP);
-
-			if (ret)
-				goto out;
-		}
-
-		ep->available_data_bufs = rx_size;
+		rxr_ep_set_extra_info(ep);
 
 		ep->core_addrlen = RXR_MAX_NAME_LENGTH;
 		ret = fi_getname(&ep->rdm_ep->fid,
@@ -925,25 +981,14 @@ static int rxr_ep_ctrl(struct fid *fid, int command, void *arg)
 		 * shared memory region.
 		 */
 		if (ep->use_shm) {
-			ret = rxr_ep_efa_addr_to_str(ep->core_addr, shm_ep_name);
+			shm_ep_name_len = EFA_SHM_NAME_MAX;
+			ret = rxr_raw_addr_to_smr_name(ep->core_addr, shm_ep_name, &shm_ep_name_len);
 			if (ret < 0)
 				goto out;
-
-			fi_setname(&ep->shm_ep->fid, shm_ep_name, sizeof(shm_ep_name));
-			shm_rx_size = shm_info->rx_attr->size;
+			fi_setname(&ep->shm_ep->fid, shm_ep_name, shm_ep_name_len);
 			ret = fi_enable(ep->shm_ep);
 			if (ret)
-				return ret;
-			/* Pre-post buffer to receive from shm provider */
-			for (i = 0; i < shm_rx_size; i++) {
-				if (i == shm_rx_size - 1)
-					flags = 0;
-
-				ret = rxr_ep_post_buf(ep, NULL, flags, SHM_EP);
-
-				if (ret)
-					goto out;
-			}
+				goto out;
 		}
 
 out:
@@ -1053,14 +1098,29 @@ static ssize_t rxr_ep_cancel(fid_t fid_ep, void *context)
 static int rxr_ep_getopt(fid_t fid, int level, int optname, void *optval,
 			 size_t *optlen)
 {
-	struct rxr_ep *rxr_ep = container_of(fid, struct rxr_ep,
-					     util_ep.ep_fid.fid);
+	struct rxr_ep *rxr_ep;
+	struct efa_ep *efa_ep;
+
+	rxr_ep = container_of(fid, struct rxr_ep, util_ep.ep_fid.fid);
+	efa_ep = container_of(rxr_ep->rdm_ep, struct efa_ep, util_ep.ep_fid);
 
-	if (level != FI_OPT_ENDPOINT || optname != FI_OPT_MIN_MULTI_RECV)
+	if (level != FI_OPT_ENDPOINT)
 		return -FI_ENOPROTOOPT;
 
-	*(size_t *)optval = rxr_ep->min_multi_recv_size;
-	*optlen = sizeof(size_t);
+	switch (optname) {
+	case FI_OPT_MIN_MULTI_RECV:
+		*(size_t *)optval = rxr_ep->min_multi_recv_size;
+		*optlen = sizeof(size_t);
+		break;
+	case FI_OPT_EFA_RNR_RETRY:
+		*(size_t *)optval = efa_ep->rnr_retry;
+		*optlen = sizeof(size_t);
+		break;
+	default:
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+			"Unknown endpoint option %s\n", __func__);
+		return -FI_ENOPROTOOPT;
+	}
 
 	return FI_SUCCESS;
 }
@@ -1068,16 +1128,54 @@ static int rxr_ep_getopt(fid_t fid, int level, int optname, void *optval,
 static int rxr_ep_setopt(fid_t fid, int level, int optname,
 			 const void *optval, size_t optlen)
 {
-	struct rxr_ep *rxr_ep = container_of(fid, struct rxr_ep,
-					     util_ep.ep_fid.fid);
+	struct rxr_ep *rxr_ep;
+	struct efa_ep *efa_ep;
 
-	if (level != FI_OPT_ENDPOINT || optname != FI_OPT_MIN_MULTI_RECV)
+	rxr_ep = container_of(fid, struct rxr_ep, util_ep.ep_fid.fid);
+	efa_ep = container_of(rxr_ep->rdm_ep, struct efa_ep, util_ep.ep_fid);
+
+	if (level != FI_OPT_ENDPOINT)
 		return -FI_ENOPROTOOPT;
 
-	if (optlen < sizeof(size_t))
-		return -FI_EINVAL;
+	switch (optname) {
+	case FI_OPT_MIN_MULTI_RECV:
+		if (optlen != sizeof(size_t))
+			return -FI_EINVAL;
+
+		rxr_ep->min_multi_recv_size = *(size_t *)optval;
+		break;
+	case FI_OPT_EFA_RNR_RETRY:
+		if (optlen != sizeof(size_t))
+			return -FI_EINVAL;
+
+		/*
+		 * The application is required to call fi_setopt before the EP
+		 * is enabled; if it calls fi_setopt after the EP is enabled,
+		 * fail the call.
+		 *
+		 * efa_ep->qp is NULL before the EP is enabled, so use it to
+		 * check whether the call happens before or after enablement,
+		 * for convenience, instead of calling ibv_query_qp.
+		 */
+		if (efa_ep->qp) {
+			FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+				"The option FI_OPT_EFA_RNR_RETRY is required \
+				to be set before EP enabled %s\n", __func__);
+			return -FI_EINVAL;
+		}
 
-	rxr_ep->min_multi_recv_size = *(size_t *)optval;
+		if (!efa_ep_support_rnr_retry_modify(rxr_ep->rdm_ep)) {
+			FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+				"RNR capability is not supported %s\n", __func__);
+			return -FI_ENOSYS;
+		}
+		efa_ep->rnr_retry = *(size_t *)optval;
+		break;
+	default:
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+			"Unknown endpoint option %s\n", __func__);
+		return -FI_ENOPROTOOPT;
+	}
 
 	return FI_SUCCESS;
 }
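/*
 * Editor's sketch (not part of the patch): an application would set the
 * RNR retry count before enabling the endpoint:
 *
 *     size_t rnr_retry = 3;
 *
 *     err = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_EFA_RNR_RETRY,
 *                     &rnr_retry, sizeof(rnr_retry));
 *     if (!err)
 *             err = fi_enable(ep);
 *
 * Per the checks above, calling fi_setopt after fi_enable() fails with
 * -FI_EINVAL, and -FI_ENOSYS is returned when the device cannot modify
 * the RNR retry count.
 */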
@@ -1137,10 +1235,8 @@ static int rxr_create_pkt_pool(struct rxr_ep *ep, size_t size,
 		.alignment	= RXR_BUF_POOL_ALIGNMENT,
 		.max_cnt	= chunk_count,
 		.chunk_cnt	= chunk_count,
-		.alloc_fn	= rxr_ep_mr_local(ep) ?
-					rxr_buf_region_alloc_hndlr : NULL,
-		.free_fn	= rxr_ep_mr_local(ep) ?
-					rxr_buf_region_free_hndlr : NULL,
+		.alloc_fn	= rxr_buf_region_alloc_hndlr,
+		.free_fn	= rxr_buf_region_free_hndlr,
 		.init_fn	= NULL,
 		.context	= rxr_ep_domain(ep),
 		.flags		= flags,
@@ -1149,9 +1245,18 @@ static int rxr_create_pkt_pool(struct rxr_ep *ep, size_t size,
 	return ofi_bufpool_create_attr(&attr, buf_pool);
 }
 
+/** @brief Initializes the endpoint.
+ *
+ * This function allocates the various buffer pools for the EFA and SHM
+ * providers and does other endpoint initialization.
+ *
+ * @param ep rxr_ep struct to initialize.
+ * @return 0 on success, fi_errno on error.
+ */
 int rxr_ep_init(struct rxr_ep *ep)
 {
-	size_t entry_sz;
+	size_t entry_sz, sendv_pool_size;
+	int hp_pool_flag;
 	int ret;
 
 	entry_sz = ep->mtu_size + sizeof(struct rxr_pkt_entry);
@@ -1160,34 +1265,40 @@ int rxr_ep_init(struct rxr_ep *ep)
 	ep->rx_pkt_pool_entry_sz = entry_sz;
 #endif
 
+	if (efa_fork_status == EFA_FORK_SUPPORT_ON)
+		hp_pool_flag = 0;
+	else
+		hp_pool_flag = OFI_BUFPOOL_HUGEPAGES;
+
 	ret = rxr_create_pkt_pool(ep, entry_sz, rxr_get_tx_pool_chunk_cnt(ep),
-				  OFI_BUFPOOL_HUGEPAGES,
-				  &ep->tx_pkt_efa_pool);
+				  hp_pool_flag,
+				  &ep->efa_tx_pkt_pool);
 	if (ret)
-		goto err_out;
+		goto err_free;
 
 	ret = rxr_create_pkt_pool(ep, entry_sz, rxr_get_rx_pool_chunk_cnt(ep),
-				  OFI_BUFPOOL_HUGEPAGES,
-				  &ep->rx_pkt_efa_pool);
+				  hp_pool_flag,
+				  &ep->efa_rx_pkt_pool);
 	if (ret)
-		goto err_free_tx_pool;
+		goto err_free;
 
 	if (rxr_env.rx_copy_unexp) {
 		ret = ofi_bufpool_create(&ep->rx_unexp_pkt_pool, entry_sz,
 					 RXR_BUF_POOL_ALIGNMENT, 0,
-					 rxr_get_rx_pool_chunk_cnt(ep), 0);
+					 rxr_env.unexp_pool_chunk_size, 0);
 
 		if (ret)
-			goto err_free_rx_pool;
+			goto err_free;
 	}
 
 	if (rxr_env.rx_copy_ooo) {
 		ret = ofi_bufpool_create(&ep->rx_ooo_pkt_pool, entry_sz,
 					 RXR_BUF_POOL_ALIGNMENT, 0,
-					 rxr_env.recvwin_size, 0);
+					 rxr_env.ooo_pool_chunk_size, 0);
 
 		if (ret)
-			goto err_free_rx_unexp_pool;
+			goto err_free;
+
 	}
 
 	if ((rxr_env.rx_copy_unexp || rxr_env.rx_copy_ooo) &&
@@ -1200,16 +1311,7 @@ int rxr_ep_init(struct rxr_ep *ep)
 					  0, &ep->rx_readcopy_pkt_pool);
 
 		if (ret)
-			goto err_free_rx_ooo_pool;
-
-		ret = ofi_bufpool_grow(ep->rx_readcopy_pkt_pool);
-		if (ret) {
-			FI_WARN(&rxr_prov, FI_LOG_CQ,
-				"cannot allocate and register memory for readcopy packet pool. error: %s\n",
-				strerror(-ret));
-			goto err_free_rx_readcopy_pool;
-		}
-
+			goto err_free;
 		ep->rx_readcopy_pkt_pool_used = 0;
 		ep->rx_readcopy_pkt_pool_max_used = 0;
 	}
@@ -1219,15 +1321,15 @@ int rxr_ep_init(struct rxr_ep *ep)
 				 RXR_BUF_POOL_ALIGNMENT,
 				 ep->tx_size, ep->tx_size, 0);
 	if (ret)
-		goto err_free_rx_readcopy_pool;
+		goto err_free;
 
 	ret = ofi_bufpool_create(&ep->read_entry_pool,
 				 sizeof(struct rxr_read_entry),
 				 RXR_BUF_POOL_ALIGNMENT,
-				 ep->tx_size + RXR_MAX_RX_QUEUE_SIZE, 
+				 ep->tx_size + RXR_MAX_RX_QUEUE_SIZE,
 				 ep->tx_size + ep->rx_size, 0);
 	if (ret)
-		goto err_free_tx_entry_pool;
+		goto err_free;
 
 	ret = ofi_bufpool_create(&ep->readrsp_tx_entry_pool,
 				 sizeof(struct rxr_tx_entry),
@@ -1235,7 +1337,7 @@ int rxr_ep_init(struct rxr_ep *ep)
 				 RXR_MAX_RX_QUEUE_SIZE,
 				 ep->rx_size, 0);
 	if (ret)
-		goto err_free_read_entry_pool;
+		goto err_free;
 
 	ret = ofi_bufpool_create(&ep->rx_entry_pool,
 				 sizeof(struct rxr_rx_entry),
@@ -1243,7 +1345,7 @@ int rxr_ep_init(struct rxr_ep *ep)
 				 RXR_MAX_RX_QUEUE_SIZE,
 				 ep->rx_size, 0);
 	if (ret)
-		goto err_free_readrsp_tx_entry_pool;
+		goto err_free;
 
 	ret = ofi_bufpool_create(&ep->map_entry_pool,
 				 sizeof(struct rxr_pkt_rx_map),
@@ -1252,25 +1354,44 @@ int rxr_ep_init(struct rxr_ep *ep)
 				 ep->rx_size, 0);
 
 	if (ret)
-		goto err_free_rx_entry_pool;
+		goto err_free;
+
+	ret = ofi_bufpool_create(&ep->rx_atomrsp_pool,
+				 ep->mtu_size,
+				 RXR_BUF_POOL_ALIGNMENT,
+				 RXR_MAX_RX_QUEUE_SIZE,
+				 rxr_env.atomrsp_pool_size, 0);
+	if (ret)
+		goto err_free;
+
+	sendv_pool_size = rxr_get_tx_pool_chunk_cnt(ep);
+	if (ep->use_shm)
+		sendv_pool_size += shm_info->tx_attr->size;
+	ret = ofi_bufpool_create(&ep->pkt_sendv_pool,
+				 sizeof(struct rxr_pkt_sendv),
+				 RXR_BUF_POOL_ALIGNMENT,
+				 sendv_pool_size,
+				 sendv_pool_size, 0);
+	if (ret)
+		goto err_free;
 
 	/* create pkt pool for shm */
 	if (ep->use_shm) {
-		ret = ofi_bufpool_create(&ep->tx_pkt_shm_pool,
+		ret = ofi_bufpool_create(&ep->shm_tx_pkt_pool,
 					 entry_sz,
 					 RXR_BUF_POOL_ALIGNMENT,
 					 shm_info->tx_attr->size,
 					 shm_info->tx_attr->size, 0);
 		if (ret)
-			goto err_free_map_entry_pool;
+			goto err_free;
 
-		ret = ofi_bufpool_create(&ep->rx_pkt_shm_pool,
+		ret = ofi_bufpool_create(&ep->shm_rx_pkt_pool,
 					 entry_sz,
 					 RXR_BUF_POOL_ALIGNMENT,
 					 shm_info->rx_attr->size,
 					 shm_info->rx_attr->size, 0);
 		if (ret)
-			goto err_free_tx_pkt_shm_pool;
+			goto err_free;
 
 		dlist_init(&ep->rx_posted_buf_shm_list);
 	}
@@ -1281,56 +1402,66 @@ int rxr_ep_init(struct rxr_ep *ep)
 	dlist_init(&ep->rx_tagged_list);
 	dlist_init(&ep->rx_unexp_tagged_list);
 	dlist_init(&ep->rx_posted_buf_list);
-	dlist_init(&ep->rx_entry_queued_list);
-	dlist_init(&ep->tx_entry_queued_list);
+	dlist_init(&ep->rx_entry_queued_rnr_list);
+	dlist_init(&ep->rx_entry_queued_ctrl_list);
+	dlist_init(&ep->tx_entry_queued_rnr_list);
+	dlist_init(&ep->tx_entry_queued_ctrl_list);
 	dlist_init(&ep->tx_pending_list);
 	dlist_init(&ep->read_pending_list);
 	dlist_init(&ep->peer_backoff_list);
+	dlist_init(&ep->handshake_queued_peer_list);
 #if ENABLE_DEBUG
 	dlist_init(&ep->rx_pending_list);
 	dlist_init(&ep->rx_pkt_list);
 	dlist_init(&ep->tx_pkt_list);
+#endif
 	dlist_init(&ep->rx_entry_list);
 	dlist_init(&ep->tx_entry_list);
-#endif
+
 	/* Initialize pkt to rx map */
 	ep->pkt_rx_map = NULL;
 	return 0;
 
-err_free_tx_pkt_shm_pool:
-	if (ep->tx_pkt_shm_pool)
-		ofi_bufpool_destroy(ep->tx_pkt_shm_pool);
-err_free_map_entry_pool:
+err_free:
+	if (ep->shm_tx_pkt_pool)
+		ofi_bufpool_destroy(ep->shm_tx_pkt_pool);
+
+	if (ep->pkt_sendv_pool)
+		ofi_bufpool_destroy(ep->pkt_sendv_pool);
+
+	if (ep->rx_atomrsp_pool)
+		ofi_bufpool_destroy(ep->rx_atomrsp_pool);
+
 	if (ep->map_entry_pool)
 		ofi_bufpool_destroy(ep->map_entry_pool);
-err_free_rx_entry_pool:
+
 	if (ep->rx_entry_pool)
 		ofi_bufpool_destroy(ep->rx_entry_pool);
-err_free_readrsp_tx_entry_pool:
+
 	if (ep->readrsp_tx_entry_pool)
 		ofi_bufpool_destroy(ep->readrsp_tx_entry_pool);
-err_free_read_entry_pool:
+
 	if (ep->read_entry_pool)
 		ofi_bufpool_destroy(ep->read_entry_pool);
-err_free_tx_entry_pool:
+
 	if (ep->tx_entry_pool)
 		ofi_bufpool_destroy(ep->tx_entry_pool);
-err_free_rx_readcopy_pool:
+
 	if (ep->rx_readcopy_pkt_pool)
 		ofi_bufpool_destroy(ep->rx_readcopy_pkt_pool);
-err_free_rx_ooo_pool:
+
 	if (rxr_env.rx_copy_ooo && ep->rx_ooo_pkt_pool)
 		ofi_bufpool_destroy(ep->rx_ooo_pkt_pool);
-err_free_rx_unexp_pool:
+
 	if (rxr_env.rx_copy_unexp && ep->rx_unexp_pkt_pool)
 		ofi_bufpool_destroy(ep->rx_unexp_pkt_pool);
-err_free_rx_pool:
-	if (ep->rx_pkt_efa_pool)
-		ofi_bufpool_destroy(ep->rx_pkt_efa_pool);
-err_free_tx_pool:
-	if (ep->tx_pkt_efa_pool)
-		ofi_bufpool_destroy(ep->tx_pkt_efa_pool);
-err_out:
+
+	if (ep->efa_rx_pkt_pool)
+		ofi_bufpool_destroy(ep->efa_rx_pkt_pool);
+
+	if (ep->efa_tx_pkt_pool)
+		ofi_bufpool_destroy(ep->efa_tx_pkt_pool);
+
 	return ret;
 }
 
@@ -1363,35 +1494,193 @@ struct fi_ops_cm rxr_ep_cm = {
 	.join = fi_no_join,
 };
 
-static inline int rxr_ep_bulk_post_recv(struct rxr_ep *ep)
+/*
+ * @brief explicitly allocate a chunk of memory for the 5 RX-side packet pools:
+ *     efa's receive packet pool (efa_rx_pkt_pool),
+ *     shm's receive packet pool (shm_rx_pkt_pool),
+ *     unexpected packet pool (rx_unexp_pkt_pool),
+ *     out-of-order packet pool (rx_ooo_pkt_pool), and
+ *     local read-copy packet pool (rx_readcopy_pkt_pool).
+ *
+ * @param[in] ep	endpoint
+ * @return		On success, return 0.
+ * 			On failure, return a negative error code.
+ */
+int rxr_ep_grow_rx_pkt_pools(struct rxr_ep *ep)
 {
-	uint64_t flags = FI_MORE;
-	int ret;
+	int err;
+
+	assert(ep->efa_rx_pkt_pool);
+	err = ofi_bufpool_grow(ep->efa_rx_pkt_pool);
+	if (err) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ,
+			"cannot allocate memory for EFA's RX packet pool. error: %s\n",
+			strerror(-err));
+		return err;
+	}
 
-	while (ep->rx_bufs_efa_to_post) {
-		if (ep->rx_bufs_efa_to_post == 1)
-			flags = 0;
-		ret = rxr_ep_post_buf(ep, NULL, flags, EFA_EP);
-		if (OFI_LIKELY(!ret))
-			ep->rx_bufs_efa_to_post--;
-		else
-			return ret;
+	if (ep->use_shm) {
+		assert(ep->shm_rx_pkt_pool);
+		err = ofi_bufpool_grow(ep->shm_rx_pkt_pool);
+		if (err) {
+			FI_WARN(&rxr_prov, FI_LOG_CQ,
+				"cannot allocate memory for SHM's RX packet pool. error: %s\n",
+				strerror(-err));
+			return err;
+		}
 	}
-	/* bulk post recv buf for shm provider */
-	flags = FI_MORE;
-	while (ep->use_shm && ep->rx_bufs_shm_to_post) {
-		if (ep->rx_bufs_shm_to_post == 1)
-			flags = 0;
-		ret = rxr_ep_post_buf(ep, NULL, flags, SHM_EP);
-		if (OFI_LIKELY(!ret))
-			ep->rx_bufs_shm_to_post--;
-		else
-			return ret;
+
+	if (ep->rx_unexp_pkt_pool) {
+		err = ofi_bufpool_grow(ep->rx_unexp_pkt_pool);
+		if (err) {
+			FI_WARN(&rxr_prov, FI_LOG_CQ,
+				"cannot allocate memory for unexpected packet pool. error: %s\n",
+				strerror(-err));
+			return err;
+		}
+	}
+
+	if (ep->rx_ooo_pkt_pool) {
+		err = ofi_bufpool_grow(ep->rx_ooo_pkt_pool);
+		if (err) {
+			FI_WARN(&rxr_prov, FI_LOG_CQ,
+				"cannot allocate memory for out-of-order packet pool. error: %s\n",
+				strerror(-err));
+			return err;
+		}
+	}
+
+	if (ep->rx_readcopy_pkt_pool) {
+		err = ofi_bufpool_grow(ep->rx_readcopy_pkt_pool);
+		if (err) {
+			FI_WARN(&rxr_prov, FI_LOG_CQ,
+				"cannot allocate and register memory for readcopy packet pool. error: %s\n",
+				strerror(-err));
+			return err;
+		}
 	}
 
 	return 0;
 }
 
+/**
+ * @brief post internal receive buffers for the progress engine.
+ *
+ * It is more efficient to post multiple receive buffers
+ * to the device at once than to post each receive buffer
+ * individually.
+ *
+ * Therefore, after an internal receive buffer (a packet
+ * entry) has been processed, it is not posted to the device
+ * right away.
+ *
+ * Instead, we increase the counter
+ *      ep->efa/shm_rx_pkts_to_post
+ * by one.
+ *
+ * Later, the progress engine calls this function to
+ * bulk post internal receive buffers (according to
+ * the counter).
+ *
+ * This function also controls the number of internal
+ * buffers posted to the device in zero copy receive
+ * mode.
+ *
+ * @param[in]	ep	endpoint
+ */
+static inline
+void rxr_ep_progress_post_internal_rx_pkts(struct rxr_ep *ep)
+{
+	int err;
+
+	if (ep->use_zcpy_rx) {
+		/*
+		 * In zero copy receive mode:
+		 *
+		 * If the application has not posted any receive buffer,
+		 * we post one internal buffer so the endpoint can
+		 * receive RxR control packets such as handshake.
+		 *
+		 * If buffers have been posted to the device, we do NOT
+		 * repost internal buffers, to maximize the chance that
+		 * a user buffer is used to receive data.
+		 */
+		if (ep->efa_rx_pkts_posted == 0 && ep->efa_rx_pkts_to_post == 0) {
+			ep->efa_rx_pkts_to_post = 1;
+		} else if (ep->efa_rx_pkts_posted > 0 && ep->efa_rx_pkts_to_post > 0) {
+			ep->efa_rx_pkts_to_post = 0;
+		}
+	} else {
+		if (ep->efa_rx_pkts_posted == 0 && ep->efa_rx_pkts_to_post == 0) {
+			/* Both efa_rx_pkts_posted and efa_rx_pkts_to_post being 0 means
+			 * this is the first call of the progress engine on this endpoint.
+			 *
+			 * In this case, we explicitly allocate the 1st chunk of memory
+			 * for the unexp/ooo/readcopy RX packet pools.
+			 *
+			 * The reason to explicitly allocate the memory for the RX packet
+			 * pools is to improve efficiency.
+			 *
+			 * Without explicit memory allocation, a packet pool's memory
+			 * is allocated when the 1st packet is allocated from it.
+			 * During the computation, different processes get their 1st
+			 * unexp/ooo/read-copy packet at different times. Therefore,
+			 * if we do not explicitly allocate memory at the beginning,
+			 * memory will be allocated at different times.
+			 *
+			 * When one process is allocating memory, other processes
+			 * have to wait. When each process allocates memory at a
+			 * different time, the accumulated waiting time becomes
+			 * significant.
+			 *
+			 * By explicitly allocating memory at the 1st call to the
+			 * progress engine, the memory allocation is parallelized.
+			 * (This assumes the 1st call to the progress engine on
+			 * all processes happens at roughly the same time, which
+			 * is a valid assumption according to our knowledge of
+			 * the workflow of most applications.)
+			 *
+			 * The memory was not allocated during endpoint initialization
+			 * because some applications initialize endpoints they never
+			 * use, so allocating the memory at initialization would be
+			 * wasted.
+			 */
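+			/*
+			 * Hypothetical illustration (the numbers are not
+			 * measured): if one pool growth takes time t and the
+			 * N processes of a job hit their first allocation one
+			 * after another, the accumulated waiting time can
+			 * approach N*t, whereas growing the pools here costs
+			 * roughly t on every process in parallel.
+			 */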
+			err = rxr_ep_grow_rx_pkt_pools(ep);
+			if (err)
+				goto err_exit;
+
+			ep->efa_rx_pkts_to_post = rxr_get_rx_pool_chunk_cnt(ep);
+			ep->available_data_bufs = rxr_get_rx_pool_chunk_cnt(ep);
+
+			if (ep->use_shm) {
+				assert(ep->shm_rx_pkts_posted == 0 && ep->shm_rx_pkts_to_post == 0);
+				ep->shm_rx_pkts_to_post = shm_info->rx_attr->size;
+			}
+		}
+	}
+
+	err = rxr_ep_bulk_post_internal_rx_pkts(ep, ep->efa_rx_pkts_to_post, EFA_EP);
+	if (err)
+		goto err_exit;
+
+	ep->efa_rx_pkts_to_post = 0;
+
+	if (ep->use_shm) {
+		err = rxr_ep_bulk_post_internal_rx_pkts(ep, ep->shm_rx_pkts_to_post, SHM_EP);
+		if (err)
+			goto err_exit;
+
+		ep->shm_rx_pkts_to_post = 0;
+	}
+
+	return;
+
+err_exit:
+
+	efa_eq_write_error(&ep->util_ep, err, err);
+}
+
 static inline int rxr_ep_send_queued_pkts(struct rxr_ep *ep,
 					  struct dlist_entry *pkts)
 {
@@ -1405,10 +1694,22 @@ static inline int rxr_ep_send_queued_pkts(struct rxr_ep *ep,
 			dlist_remove(&pkt_entry->entry);
 			continue;
 		}
-		ret = rxr_pkt_entry_send(ep, pkt_entry, pkt_entry->addr);
-		if (ret)
-			return ret;
+
+		/* If the send succeeds, pkt_entry->entry will be added
+		 * to peer->outstanding_tx_pkts. Therefore, it must
+		 * be removed from the list before the send.
+		 */
 		dlist_remove(&pkt_entry->entry);
+
+		ret = rxr_pkt_entry_send(ep, pkt_entry, 0);
+		if (ret) {
+			if (ret == -FI_EAGAIN) {
+				/* add the pkt back to pkts, so it can be resent again */
+				dlist_insert_tail(&pkt_entry->entry, pkts);
+			}
+
+			return ret;
+		}
 	}
 	return 0;
 }
@@ -1429,28 +1730,124 @@ static inline void rxr_ep_check_available_data_bufs_timer(struct rxr_ep *ep)
 
 static inline void rxr_ep_check_peer_backoff_timer(struct rxr_ep *ep)
 {
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 	struct dlist_entry *tmp;
 
 	if (OFI_LIKELY(dlist_empty(&ep->peer_backoff_list)))
 		return;
 
-	dlist_foreach_container_safe(&ep->peer_backoff_list, struct rxr_peer,
-				     peer, rnr_entry, tmp) {
-		peer->flags &= ~RXR_PEER_BACKED_OFF;
-		if (!rxr_peer_timeout_expired(ep, peer, ofi_gettime_us()))
-			continue;
-		peer->flags &= ~RXR_PEER_IN_BACKOFF;
-		dlist_remove(&peer->rnr_entry);
+	dlist_foreach_container_safe(&ep->peer_backoff_list, struct rdm_peer,
+				     peer, rnr_backoff_entry, tmp) {
+		if (ofi_gettime_us() >= peer->rnr_backoff_begin_ts +
+					peer->rnr_backoff_wait_time) {
+			peer->flags &= ~RXR_PEER_IN_BACKOFF;
+			dlist_remove(&peer->rnr_backoff_entry);
+		}
 	}
 }
 
-static inline void rxr_ep_poll_cq(struct rxr_ep *ep,
-				  struct fid_cq *cq,
-				  size_t cqe_to_process,
-				  bool is_shm_cq)
+/**
+ * @brief poll the rdma-core cq and process the cq entries
+ *
+ * @param[in]	ep		endpoint
+ * @param[in]	cqe_to_process	max number of cq entries to poll and process
+ */
+static inline void rdm_ep_poll_ibv_cq(struct rxr_ep *ep,
+				      size_t cqe_to_process)
+{
+	struct ibv_wc ibv_wc;
+	struct efa_cq *efa_cq;
+	struct efa_av *efa_av;
+	struct efa_ep *efa_ep;
+	struct rxr_pkt_entry *pkt_entry;
+	ssize_t ret;
+	int i, err, prov_errno;
+
+	efa_ep = container_of(ep->rdm_ep, struct efa_ep, util_ep.ep_fid);
+	efa_av = efa_ep->av;
+	efa_cq = container_of(ep->rdm_cq, struct efa_cq, util_cq.cq_fid);
+	for (i = 0; i < cqe_to_process; i++) {
+		ret = ibv_poll_cq(efa_cq->ibv_cq, 1, &ibv_wc);
+
+		if (ret == 0)
+			return;
+
+		if (OFI_UNLIKELY(ret < 0 || ibv_wc.status)) {
+			if (ret < 0) {
+				efa_eq_write_error(&ep->util_ep, -ret, -ret);
+				return;
+			}
+
+			pkt_entry = (void *)(uintptr_t)ibv_wc.wr_id;
+			err = ibv_wc.status;
+			prov_errno = ibv_wc.status;
+			if (ibv_wc.opcode == IBV_WC_SEND) {
+#if ENABLE_DEBUG
+				ep->failed_send_comps++;
+#endif
+				rxr_pkt_handle_send_error(ep, pkt_entry, err, prov_errno);
+			} else {
+				assert(ibv_wc.opcode == IBV_WC_RECV);
+				rxr_pkt_handle_recv_error(ep, pkt_entry, err, prov_errno);
+			}
+
+			return;
+		}
+
+		pkt_entry = (void *)(uintptr_t)ibv_wc.wr_id;
+
+		switch (ibv_wc.opcode) {
+		case IBV_WC_SEND:
+#if ENABLE_DEBUG
+			ep->send_comps++;
+#endif
+			rxr_pkt_handle_send_completion(ep, pkt_entry);
+			break;
+		case IBV_WC_RECV:
+			pkt_entry->addr = efa_av_reverse_lookup_rdm(efa_av, ibv_wc.slid, ibv_wc.src_qp, pkt_entry);
+			pkt_entry->pkt_size = ibv_wc.byte_len;
+			assert(pkt_entry->pkt_size > 0);
+			rxr_pkt_handle_recv_completion(ep, pkt_entry);
+#if ENABLE_DEBUG
+			ep->recv_comps++;
+#endif
+			break;
+		default:
+			FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+				"Unhandled cq type\n");
+			assert(0 && "Unhandled cq type");
+		}
+	}
+}
+
+static inline
+void rdm_ep_poll_shm_err_cq(struct fid_cq *shm_cq, struct fi_cq_err_entry *cq_err_entry)
+{
+	int ret;
+
+	ret = fi_cq_readerr(shm_cq, cq_err_entry, 0);
+	if (ret == 1)
+		return;
+
+	if (ret < 0) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ, "encountered an error calling fi_cq_readerr: %s\n",
+			fi_strerror(-ret));
+		cq_err_entry->err = -ret;
+		cq_err_entry->prov_errno = -ret;
+		return;
+	}
+
+	FI_WARN(&rxr_prov, FI_LOG_CQ, "fi_cq_readerr got unexpected return: %d\n", ret);
+	cq_err_entry->err = FI_EIO;
+	cq_err_entry->prov_errno = FI_EIO;
+}
+
+static inline void rdm_ep_poll_shm_cq(struct rxr_ep *ep,
+				      size_t cqe_to_process)
 {
 	struct fi_cq_data_entry cq_entry;
+	struct fi_cq_err_entry cq_err_entry = { 0 };
+	struct rxr_pkt_entry *pkt_entry;
 	fi_addr_t src_addr;
 	ssize_t ret;
 	struct efa_ep *efa_ep;
@@ -1462,43 +1859,50 @@ static inline void rxr_ep_poll_cq(struct rxr_ep *ep,
 	efa_ep = container_of(ep->rdm_ep, struct efa_ep, util_ep.ep_fid);
 	efa_av = efa_ep->av;
 	for (i = 0; i < cqe_to_process; i++) {
-		ret = fi_cq_readfrom(cq, &cq_entry, 1, &src_addr);
+		ret = fi_cq_readfrom(ep->shm_cq, &cq_entry, 1, &src_addr);
 
 		if (ret == -FI_EAGAIN)
 			return;
 
 		if (OFI_UNLIKELY(ret < 0)) {
-			if (rxr_cq_handle_cq_error(ep, ret))
-				assert(0 &&
-				       "error writing error cq entry after reading from cq");
-			if (!ep->use_zcpy_rx)
-				rxr_ep_bulk_post_recv(ep);
+			if (ret != -FI_EAVAIL) {
+				efa_eq_write_error(&ep->util_ep, -ret, -ret);
+				return;
+			}
+
+			rdm_ep_poll_shm_err_cq(ep->shm_cq, &cq_err_entry);
+			if (cq_err_entry.flags & (FI_SEND | FI_READ | FI_WRITE)) {
+				assert(cq_entry.op_context);
+				rxr_pkt_handle_send_error(ep, cq_entry.op_context, cq_err_entry.err, cq_err_entry.prov_errno);
+			} else if (cq_err_entry.flags & FI_RECV) {
+				assert(cq_entry.op_context);
+				rxr_pkt_handle_recv_error(ep, cq_entry.op_context, cq_err_entry.err, cq_err_entry.prov_errno);
+			} else {
+				efa_eq_write_error(&ep->util_ep, cq_err_entry.err, cq_err_entry.prov_errno);
+			}
+
 			return;
 		}
 
 		if (OFI_UNLIKELY(ret == 0))
 			return;
 
-		if (is_shm_cq && src_addr != FI_ADDR_UNSPEC) {
+		pkt_entry = cq_entry.op_context;
+		if (src_addr != FI_ADDR_UNSPEC) {
 			/* convert SHM address to EFA address */
 			assert(src_addr < EFA_SHM_MAX_AV_COUNT);
 			src_addr = efa_av->shm_rdm_addr_map[src_addr];
 		}
 
-		if (is_shm_cq && (cq_entry.flags & (FI_ATOMIC | FI_REMOTE_CQ_DATA))) {
+		if (cq_entry.flags & (FI_ATOMIC | FI_REMOTE_CQ_DATA)) {
 			rxr_cq_handle_shm_completion(ep, &cq_entry, src_addr);
 		} else if (cq_entry.flags & (FI_SEND | FI_READ | FI_WRITE)) {
-#if ENABLE_DEBUG
-			if (!is_shm_cq)
-				ep->send_comps++;
-#endif
-			rxr_pkt_handle_send_completion(ep, &cq_entry);
+			rxr_pkt_handle_send_completion(ep, pkt_entry);
 		} else if (cq_entry.flags & (FI_RECV | FI_REMOTE_CQ_DATA)) {
-			rxr_pkt_handle_recv_completion(ep, &cq_entry, src_addr);
-#if ENABLE_DEBUG
-			if (!is_shm_cq)
-				ep->recv_comps++;
-#endif
+			pkt_entry->addr = src_addr;
+			pkt_entry->pkt_size = cq_entry.len;
+			assert(pkt_entry->pkt_size > 0);
+			rxr_pkt_handle_recv_completion(ep, pkt_entry);
 		} else {
 			FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
 				"Unhandled cq type\n");
@@ -1509,81 +1913,177 @@ static inline void rxr_ep_poll_cq(struct rxr_ep *ep,
 
 void rxr_ep_progress_internal(struct rxr_ep *ep)
 {
+	struct ibv_send_wr *bad_wr;
+	struct efa_ep *efa_ep;
 	struct rxr_rx_entry *rx_entry;
 	struct rxr_tx_entry *tx_entry;
 	struct rxr_read_entry *read_entry;
+	struct rdm_peer *peer;
 	struct dlist_entry *tmp;
 	ssize_t ret;
-
-	if (!ep->use_zcpy_rx)
-		rxr_ep_check_available_data_bufs_timer(ep);
+	uint64_t flags;
 
 	// Poll the EFA completion queue
-	rxr_ep_poll_cq(ep, ep->rdm_cq, rxr_env.efa_cq_read_size, 0);
+	rdm_ep_poll_ibv_cq(ep, rxr_env.efa_cq_read_size);
 
 	// Poll the SHM completion queue if enabled
 	if (ep->use_shm)
-		rxr_ep_poll_cq(ep, ep->shm_cq, rxr_env.shm_cq_read_size, 1);
+		rdm_ep_poll_shm_cq(ep, rxr_env.shm_cq_read_size);
 
-	if (!ep->use_zcpy_rx) {
-		ret = rxr_ep_bulk_post_recv(ep);
+	rxr_ep_progress_post_internal_rx_pkts(ep);
+
+	rxr_ep_check_peer_backoff_timer(ep);
+
+	if (!ep->use_zcpy_rx)
+		rxr_ep_check_available_data_bufs_timer(ep);
+	/*
+	 * Resend handshake packet for any peers where the first
+	 * handshake send failed.
+	 */
+	dlist_foreach_container_safe(&ep->handshake_queued_peer_list,
+				     struct rdm_peer, peer,
+				     handshake_queued_entry, tmp) {
+		if (peer->flags & RXR_PEER_IN_BACKOFF)
+			continue;
+
+		ret = rxr_pkt_post_handshake(ep, peer);
+		if (ret == -FI_EAGAIN)
+			break;
 
 		if (OFI_UNLIKELY(ret)) {
-			if (rxr_cq_handle_cq_error(ep, ret))
-				assert(0 &&
-				       "error writing error cq entry after failed post recv");
+			FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+				"Failed to post HANDSHAKE to peer %ld: %s\n",
+				peer->efa_fiaddr, fi_strerror(-ret));
+			efa_eq_write_error(&ep->util_ep, FI_EIO, -ret);
 			return;
 		}
-	}
 
-	rxr_ep_check_peer_backoff_timer(ep);
+		dlist_remove(&peer->handshake_queued_entry);
+		peer->flags &= ~RXR_PEER_HANDSHAKE_QUEUED;
+		peer->flags |= RXR_PEER_HANDSHAKE_SENT;
+	}
 
 	/*
 	 * Send any queued ctrl packets.
 	 */
-	dlist_foreach_container_safe(&ep->rx_entry_queued_list,
+	dlist_foreach_container_safe(&ep->rx_entry_queued_rnr_list,
 				     struct rxr_rx_entry,
-				     rx_entry, queued_entry, tmp) {
-		if (rx_entry->state == RXR_RX_QUEUED_CTRL)
-			ret = rxr_pkt_post_ctrl(ep, RXR_RX_ENTRY, rx_entry,
-						rx_entry->queued_ctrl.type,
-						rx_entry->queued_ctrl.inject);
-		else
-			ret = rxr_ep_send_queued_pkts(ep,
-						      &rx_entry->queued_pkts);
+				     rx_entry, queued_rnr_entry, tmp) {
+		peer = rxr_ep_get_peer(ep, rx_entry->addr);
+		assert(peer);
+
+		if (peer->flags & RXR_PEER_IN_BACKOFF)
+			continue;
+
+		assert(rx_entry->rxr_flags & RXR_RX_ENTRY_QUEUED_RNR);
+		assert(!dlist_empty(&rx_entry->queued_pkts));
+		ret = rxr_ep_send_queued_pkts(ep, &rx_entry->queued_pkts);
+
 		if (ret == -FI_EAGAIN)
 			break;
-		if (OFI_UNLIKELY(ret))
-			goto rx_err;
 
-		dlist_remove(&rx_entry->queued_entry);
+		if (OFI_UNLIKELY(ret)) {
+			rxr_cq_write_rx_error(ep, rx_entry, -ret, -ret);
+			return;
+		}
+
+		dlist_remove(&rx_entry->queued_rnr_entry);
+		rx_entry->rxr_flags &= ~RXR_RX_ENTRY_QUEUED_RNR;
+	}
+
+	dlist_foreach_container_safe(&ep->rx_entry_queued_ctrl_list,
+				     struct rxr_rx_entry,
+				     rx_entry, queued_ctrl_entry, tmp) {
+		peer = rxr_ep_get_peer(ep, rx_entry->addr);
+		assert(peer);
+
+		if (peer->flags & RXR_PEER_IN_BACKOFF)
+			continue;
+		/*
+		 * An rx_entry only sends one ctrl packet at a time. The
+		 * ctrl packet can be CTS, EOR, or RECEIPT.
+		 */
+		assert(rx_entry->state == RXR_RX_QUEUED_CTRL);
+		ret = rxr_pkt_post_ctrl(ep, RXR_RX_ENTRY, rx_entry,
+					rx_entry->queued_ctrl.type,
+					rx_entry->queued_ctrl.inject,
+					0);
+		if (ret == -FI_EAGAIN)
+			break;
+
+		if (OFI_UNLIKELY(ret)) {
+			rxr_cq_write_rx_error(ep, rx_entry, -ret, -ret);
+			return;
+		}
+
+		/* It can happen that rxr_pkt_post_ctrl() released rx_entry
+		 * (if the packet type is EOR and inject is used). In
+		 * that case rx_entry's state has been set to RXR_RX_FREE and
+		 * it has been removed from ep->rx_entry_queued_ctrl_list, so
+		 * nothing is left to do.
+		 */
+		if (rx_entry->state == RXR_RX_FREE)
+			continue;
+
+		dlist_remove(&rx_entry->queued_ctrl_entry);
+		/*
+		 * For a CTS packet, the state needs to be RXR_RX_RECV.
+		 * For EOR/RECEIPT, all data has been received, so any state
+		 * other than RXR_RX_QUEUED_CTRL would work.
+		 * In all cases, we set the state to RXR_RX_RECV.
+		 */
 		rx_entry->state = RXR_RX_RECV;
 	}
 
-	dlist_foreach_container_safe(&ep->tx_entry_queued_list,
+	dlist_foreach_container_safe(&ep->tx_entry_queued_rnr_list,
 				     struct rxr_tx_entry,
-				     tx_entry, queued_entry, tmp) {
-		if (tx_entry->state == RXR_TX_QUEUED_CTRL)
-			ret = rxr_pkt_post_ctrl(ep, RXR_TX_ENTRY, tx_entry,
-						tx_entry->queued_ctrl.type,
-						tx_entry->queued_ctrl.inject);
-		else
-			ret = rxr_ep_send_queued_pkts(ep, &tx_entry->queued_pkts);
+				     tx_entry, queued_rnr_entry, tmp) {
+		peer = rxr_ep_get_peer(ep, tx_entry->addr);
+		assert(peer);
 
+		if (peer->flags & RXR_PEER_IN_BACKOFF)
+			continue;
+
+		assert(tx_entry->rxr_flags & RXR_TX_ENTRY_QUEUED_RNR);
+		ret = rxr_ep_send_queued_pkts(ep, &tx_entry->queued_pkts);
 		if (ret == -FI_EAGAIN)
 			break;
-		if (OFI_UNLIKELY(ret))
-			goto tx_err;
 
-		dlist_remove(&tx_entry->queued_entry);
+		if (OFI_UNLIKELY(ret)) {
+			rxr_cq_write_tx_error(ep, tx_entry, -ret, -ret);
+			return;
+		}
+
+		dlist_remove(&tx_entry->queued_rnr_entry);
+		tx_entry->rxr_flags &= ~RXR_TX_ENTRY_QUEUED_RNR;
+	}
 
-		if (tx_entry->state == RXR_TX_QUEUED_REQ_RNR)
-			tx_entry->state = RXR_TX_REQ;
-		else if (tx_entry->state == RXR_TX_QUEUED_DATA_RNR) {
-			tx_entry->state = RXR_TX_SEND;
-			dlist_insert_tail(&tx_entry->entry,
-					  &ep->tx_pending_list);
+	dlist_foreach_container_safe(&ep->tx_entry_queued_ctrl_list,
+				     struct rxr_tx_entry,
+				     tx_entry, queued_ctrl_entry, tmp) {
+		peer = rxr_ep_get_peer(ep, tx_entry->addr);
+		assert(peer);
+
+		if (peer->flags & RXR_PEER_IN_BACKOFF)
+			continue;
+
+		assert(tx_entry->state == RXR_TX_QUEUED_CTRL);
+
+		ret = rxr_pkt_post_ctrl(ep, RXR_TX_ENTRY, tx_entry,
+					tx_entry->queued_ctrl.type,
+					tx_entry->queued_ctrl.inject,
+					0);
+		if (ret == -FI_EAGAIN)
+			break;
+
+		if (OFI_UNLIKELY(ret)) {
+			rxr_cq_write_tx_error(ep, tx_entry, -ret, -ret);
+			return;
 		}
+
+		dlist_remove(&tx_entry->queued_ctrl_entry);
+		if (tx_entry->state == RXR_TX_QUEUED_CTRL)
+			tx_entry->state = RXR_TX_REQ;
 	}
 
 	/*
@@ -1591,25 +2091,59 @@ void rxr_ep_progress_internal(struct rxr_ep *ep)
 	 */
 	dlist_foreach_container(&ep->tx_pending_list, struct rxr_tx_entry,
 				tx_entry, entry) {
-		if (tx_entry->window > 0)
-			tx_entry->send_flags |= FI_MORE;
-		else
+		peer = rxr_ep_get_peer(ep, tx_entry->addr);
+		assert(peer);
+
+		if (peer->flags & RXR_PEER_IN_BACKOFF)
+			continue;
+
+		/*
+		 * Do not send a DATA packet until we have received a HANDSHAKE packet from
+		 * the peer, because the endpoint does not know whether the peer needs a
+		 * connid in the header until it gets the HANDSHAKE packet.
+		 *
+		 * We only do this for DATA packets because other types of packets always
+		 * have a connid in their packet header. If the peer does not make use of
+		 * the connid, the connid can be safely ignored.
+		 *
+		 * DATA packets are different because for them the connid is an optional
+		 * header inserted between the mandatory header and the application data.
+		 * Therefore, if the peer does not use/understand connid, it will take the
+		 * connid as application data, causing data corruption.
+		 *
+		 * This will not cause a deadlock, because the peer sends a HANDSHAKE packet
+		 * back upon receiving the 1st packet from the endpoint, and in all 3 sub-protocols
+		 * (long-CTS message, emulated long-CTS write and emulated long-CTS read)
+		 * where DATA packets are used, the endpoint sends other types of packets to
+		 * the peer before sending DATA packets. The workflow of the 3 sub-protocols
+		 * can be found in the protocol v4 document, chapter 3.
+		 */
+		if (!(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED))
 			continue;
 
 		while (tx_entry->window > 0) {
-			if (ep->max_outstanding_tx - ep->tx_pending <= 1 ||
+			flags = FI_MORE;
+			if (ep->efa_max_outstanding_tx_ops - ep->efa_outstanding_tx_ops <= 1 ||
 			    tx_entry->window <= ep->max_data_payload_size)
-				tx_entry->send_flags &= ~FI_MORE;
+				flags = 0;
 			/*
 			 * The core's TX queue is full so we can't do any
 			 * additional work.
 			 */
-			if (ep->tx_pending == ep->max_outstanding_tx)
+			if (ep->efa_outstanding_tx_ops == ep->efa_max_outstanding_tx_ops)
 				goto out;
-			ret = rxr_pkt_post_data(ep, tx_entry);
+
+			if (peer->flags & RXR_PEER_IN_BACKOFF)
+				break;
+
+			ret = rxr_pkt_post_ctrl(ep, RXR_TX_ENTRY, tx_entry,
+						RXR_DATA_PKT, false, flags);
 			if (OFI_UNLIKELY(ret)) {
-				tx_entry->send_flags &= ~FI_MORE;
-				goto tx_err;
+				if (ret == -FI_EAGAIN)
+					goto out;
+
+				rxr_cq_write_tx_error(ep, tx_entry, -ret, -ret);
+				return;
 			}
 		}
 	}
@@ -1619,40 +2153,43 @@ void rxr_ep_progress_internal(struct rxr_ep *ep)
 	 */
 	dlist_foreach_container_safe(&ep->read_pending_list, struct rxr_read_entry,
 				     read_entry, pending_entry, tmp) {
+		peer = rxr_ep_get_peer(ep, read_entry->addr);
+		/*
+		 * Here peer can be NULL when the read request is a
+		 * local read request. Local read requests are used to copy
+		 * data from host memory to device memory within the same process.
+		 */
+		if (peer && (peer->flags & RXR_PEER_IN_BACKOFF))
+			continue;
+
 		/*
 		 * The core's TX queue is full so we can't do any
 		 * additional work.
 		 */
-		if (ep->tx_pending == ep->max_outstanding_tx)
+		if (ep->efa_outstanding_tx_ops == ep->efa_max_outstanding_tx_ops)
 			goto out;
 
 		ret = rxr_read_post(ep, read_entry);
 		if (ret == -FI_EAGAIN)
 			break;
 
-		if (OFI_UNLIKELY(ret))
-			goto read_err;
+		if (OFI_UNLIKELY(ret)) {
+			rxr_read_write_error(ep, read_entry, -ret, -ret);
+			return;
+		}
 
+		read_entry->state = RXR_RDMA_ENTRY_SUBMITTED;
 		dlist_remove(&read_entry->pending_entry);
 	}
 
 out:
-	return;
-rx_err:
-	if (rxr_cq_handle_rx_error(ep, rx_entry, ret))
-		assert(0 &&
-		       "error writing error cq entry when handling RX error");
-	return;
-tx_err:
-	if (rxr_cq_handle_tx_error(ep, tx_entry, ret))
-		assert(0 &&
-		       "error writing error cq entry when handling TX error");
-	return;
+	efa_ep = container_of(ep->rdm_ep, struct efa_ep, util_ep.ep_fid);
+	if (efa_ep->xmit_more_wr_tail != &efa_ep->xmit_more_wr_head) {
+		ret = efa_post_flush(efa_ep, &bad_wr);
+		if (OFI_UNLIKELY(ret))
+			efa_eq_write_error(&ep->util_ep, -ret, -ret);
+	}
 
-read_err:
-	if (rxr_read_handle_error(ep, read_entry, ret))
-		assert(0 &&
-		       "error writing err cq entry while handling RDMA error");
 	return;
 }
 
@@ -1686,6 +2223,23 @@ bool rxr_ep_use_shm(struct fi_info *info)
 	    && !(info->caps & FI_LOCAL_COMM))
 		return 0;
 
+	/*
+	 * Currently, the shm provider uses the SAR protocol for cuda
+	 * memory buffers, whose performance is worse than using the EFA device.
+	 *
+	 * To address this issue, shm usage is disabled if the application
+	 * requested the FI_HMEM capability.
+	 *
+	 * This is not ideal, because host memory communications then
+	 * also go through the device.
+	 *
+	 * The long-term fix is to make the shm provider support cuda
+	 * buffers through cuda IPC. Once that is implemented, the
+	 * following two lines need to be removed.
+	 */
+	if (info && (info->caps & FI_HMEM))
+		return 0;
+
 	return rxr_env.enable_shm_transfer;
 }
 
@@ -1744,7 +2298,8 @@ int rxr_endpoint(struct fid_domain *domain, struct fi_info *info,
 	rxr_ep->tx_size = info->tx_attr->size;
 	rxr_ep->rx_iov_limit = info->rx_attr->iov_limit;
 	rxr_ep->tx_iov_limit = info->tx_attr->iov_limit;
-	rxr_ep->max_outstanding_tx = rdm_info->tx_attr->size;
+	rxr_ep->inject_size = info->tx_attr->inject_size;
+	rxr_ep->efa_max_outstanding_tx_ops = rdm_info->tx_attr->size;
 	rxr_ep->core_rx_size = rdm_info->rx_attr->size;
 	rxr_ep->core_iov_limit = rdm_info->tx_attr->iov_limit;
 	rxr_ep->core_caps = rdm_info->caps;
@@ -1760,6 +2315,7 @@ int rxr_endpoint(struct fid_domain *domain, struct fi_info *info,
 	rxr_ep->core_msg_order = rdm_info->rx_attr->msg_order;
 	rxr_ep->core_inject_size = rdm_info->tx_attr->inject_size;
 	rxr_ep->max_msg_size = info->ep_attr->max_msg_size;
+	rxr_ep->msg_prefix_size = info->ep_attr->msg_prefix_size;
 	rxr_ep->max_proto_hdr_size = rxr_pkt_max_header_size();
 	rxr_ep->mtu_size = rdm_info->ep_attr->max_msg_size;
 	fi_freeinfo(rdm_info);
@@ -1770,29 +2326,36 @@ int rxr_endpoint(struct fid_domain *domain, struct fi_info *info,
 	if (rxr_ep->mtu_size > RXR_MTU_MAX_LIMIT)
 		rxr_ep->mtu_size = RXR_MTU_MAX_LIMIT;
 
-	rxr_ep->max_data_payload_size = rxr_ep->mtu_size - sizeof(struct rxr_data_hdr);
+	rxr_ep->max_data_payload_size = rxr_ep->mtu_size - sizeof(struct rxr_data_hdr) - sizeof(struct rxr_data_opt_connid_hdr);
 	rxr_ep->min_multi_recv_size = rxr_ep->mtu_size - rxr_ep->max_proto_hdr_size;
 
 	if (rxr_env.tx_queue_size > 0 &&
-	    rxr_env.tx_queue_size < rxr_ep->max_outstanding_tx)
-		rxr_ep->max_outstanding_tx = rxr_env.tx_queue_size;
+	    rxr_env.tx_queue_size < rxr_ep->efa_max_outstanding_tx_ops)
+		rxr_ep->efa_max_outstanding_tx_ops = rxr_env.tx_queue_size;
 
 
 	rxr_ep->use_zcpy_rx = rxr_ep_use_zcpy_rx(rxr_ep, info);
 	FI_INFO(&rxr_prov, FI_LOG_EP_CTRL, "rxr_ep->use_zcpy_rx = %d\n", rxr_ep->use_zcpy_rx);
 
+	rxr_ep->handle_resource_management = info->domain_attr->resource_mgmt;
+	FI_INFO(&rxr_prov, FI_LOG_EP_CTRL,
+		"rxr_ep->handle_resource_management = %d\n",
+		rxr_ep->handle_resource_management);
+
 #if ENABLE_DEBUG
-	rxr_ep->sends = 0;
+	rxr_ep->efa_total_posted_tx_ops = 0;
+	rxr_ep->shm_total_posted_tx_ops = 0;
 	rxr_ep->send_comps = 0;
 	rxr_ep->failed_send_comps = 0;
 	rxr_ep->recv_comps = 0;
 #endif
 
-	rxr_ep->posted_bufs_shm = 0;
-	rxr_ep->rx_bufs_shm_to_post = 0;
-	rxr_ep->posted_bufs_efa = 0;
-	rxr_ep->rx_bufs_efa_to_post = 0;
-	rxr_ep->tx_pending = 0;
+	rxr_ep->shm_rx_pkts_posted = 0;
+	rxr_ep->shm_rx_pkts_to_post = 0;
+	rxr_ep->efa_rx_pkts_posted = 0;
+	rxr_ep->efa_rx_pkts_to_post = 0;
+	rxr_ep->efa_outstanding_tx_ops = 0;
+	rxr_ep->shm_outstanding_tx_ops = 0;
 	rxr_ep->available_data_bufs_ts = 0;
 
 	ret = fi_cq_open(rxr_domain->rdm_domain, &cq_attr,
@@ -1868,3 +2431,120 @@ err_free_ep:
 	free(rxr_ep);
 	return ret;
 }
+
+/**
+ * @brief record the event that a TX op has been submitted
+ *
+ * This function is called after a TX operation has been posted
+ * successfully. It will:
+ *
+ *  1. increase the outstanding tx_op counter in the endpoint and
+ *     in the peer structure.
+ *
+ *  2. add the TX packet to the peer's outstanding TX packet list.
+ *
+ * Both send and read are considered TX operations.
+ *
+ * The tx_op counters are used to prevent over-posting to the device
+ * and are used in flow control. They are also useful for debugging.
+ *
+ * The peer's outstanding TX packet list is used when removing a peer
+ * to invalidate the addresses of these packets, so that the completions
+ * of these packets are ignored.
+ *
+ * @param[in,out]	ep		endpoint
+ * @param[in]		pkt_entry	TX pkt_entry, which contains
+ * 					the info of the TX op.
+ */
+void rxr_ep_record_tx_op_submitted(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
+{
+	struct rdm_peer *peer;
+
+	/*
+	 * peer can be NULL when the pkt_entry is a RMA_CONTEXT_PKT,
+	 * and the RMA is a local read toward the endpoint itself
+	 */
+	peer = rxr_ep_get_peer(ep, pkt_entry->addr);
+	if (peer)
+		dlist_insert_tail(&pkt_entry->entry, &peer->outstanding_tx_pkts);
+
+	if (pkt_entry->alloc_type == RXR_PKT_FROM_EFA_TX_POOL) {
+		ep->efa_outstanding_tx_ops++;
+		if (peer)
+			peer->efa_outstanding_tx_ops++;
+#if ENABLE_DEBUG
+		ep->efa_total_posted_tx_ops++;
+#endif
+	} else {
+		assert(pkt_entry->alloc_type == RXR_PKT_FROM_SHM_TX_POOL);
+		ep->shm_outstanding_tx_ops++;
+		if (peer)
+			peer->shm_outstanding_tx_ops++;
+#if ENABLE_DEBUG
+		ep->shm_total_posted_tx_ops++;
+#endif
+	}
+}
+
+/**
+ * @brief record the event that a TX op has completed
+ *
+ * This function is called when the completion of
+ * a TX operation is received. It will
+ *
+ * 1. decrease the outstanding tx_op counter in the endpoint
+ *    and in the peer.
+ *
+ * 2. remove the TX packet from the peer's outstanding
+ *    TX packet list.
+ *
+ * Both send and read are considered TX operations.
+ *
+ * One may ask why this function is not integrated
+ * into rxr_pkt_entry_release_tx().
+ *
+ * The reason is that decreasing the tx_op counter
+ * is not tied to releasing a TX pkt_entry.
+ *
+ * Sometimes we need to decrease the tx_op counter
+ * without releasing a TX pkt_entry. For example,
+ * when we handle a TX pkt_entry that encountered RNR,
+ * we need to decrease the tx_op counter and queue the packet.
+ *
+ * Sometimes we need to release a TX pkt_entry without
+ * decreasing the tx_op counter. For example, when
+ * rxr_pkt_post_ctrl() failed to post a pkt entry.
+ *
+ * @param[in,out]	ep		endpoint
+ * @param[in]		pkt_entry	TX pkt_entry, which contains
+ * 					the info of the TX op
+ */
+void rxr_ep_record_tx_op_completed(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
+{
+	struct rdm_peer *peer;
+
+	/*
+	 * peer can be NULL when:
+	 *
+	 * 1. the pkt_entry is a RMA_CONTEXT_PKT, and the RMA op is a local read
+	 *    toward the endpoint itself.
+	 * 2. the peer's address has been removed from the address vector, either
+	 *    because a new peer with the same GID+QPN was inserted into the
+	 *    address vector, or because the application removed the peer from
+	 *    the address vector.
+	 */
+	peer = rxr_ep_get_peer(ep, pkt_entry->addr);
+	if (peer)
+		dlist_remove(&pkt_entry->entry);
+
+	if (pkt_entry->alloc_type == RXR_PKT_FROM_EFA_TX_POOL) {
+		ep->efa_outstanding_tx_ops--;
+		if (peer)
+			peer->efa_outstanding_tx_ops--;
+	} else {
+		assert(pkt_entry->alloc_type == RXR_PKT_FROM_SHM_TX_POOL);
+		ep->shm_outstanding_tx_ops--;
+		if (peer)
+			peer->shm_outstanding_tx_ops--;
+	}
+}
+
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_fabric.c b/deps/libfabric/prov/efa/src/rxr/rxr_fabric.c
deleted file mode 100644
index 163c5258c252583d4dbc8c3c9cc8aa081cb5c78c..0000000000000000000000000000000000000000
--- a/deps/libfabric/prov/efa/src/rxr/rxr_fabric.c
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2019 Amazon.com, Inc. or its affiliates.
- * All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-#include <ofi_perf.h>
-
-#include "rxr.h"
-
-#ifdef RXR_PERF_ENABLED
-const char *rxr_perf_counters_str[] = {
-	RXR_PERF_FOREACH(OFI_STR)
-};
-#endif
-
-static struct fi_ops_fabric rxr_fabric_ops = {
-	.size = sizeof(struct fi_ops_fabric),
-	.domain = rxr_domain_open,
-	.passive_ep = fi_no_passive_ep,
-	.eq_open = ofi_eq_create,
-	.wait_open = ofi_wait_fd_open,
-	.trywait = ofi_trywait
-};
-
-static int rxr_fabric_close(fid_t fid)
-{
-	int ret;
-	struct rxr_fabric *rxr_fabric;
-
-	rxr_fabric = container_of(fid, struct rxr_fabric,
-				  util_fabric.fabric_fid.fid);
-	ret = fi_close(&rxr_fabric->lower_fabric->fid);
-	if (ret)
-		return ret;
-
-	if (rxr_env.enable_shm_transfer) {
-		ret = fi_close(&rxr_fabric->shm_fabric->fid);
-		if (ret)
-			return ret;
-	}
-
-	ret = ofi_fabric_close(&rxr_fabric->util_fabric);
-	if (ret)
-		return ret;
-
-#ifdef RXR_PERF_ENABLED
-	ofi_perfset_log(&rxr_fabric->perf_set, rxr_perf_counters_str);
-	ofi_perfset_close(&rxr_fabric->perf_set);
-#endif
-	free(rxr_fabric);
-	return 0;
-}
-
-static struct fi_ops rxr_fabric_fi_ops = {
-	.size = sizeof(struct fi_ops),
-	.close = rxr_fabric_close,
-	.bind = fi_no_bind,
-	.control = fi_no_control,
-	.ops_open = fi_no_ops_open,
-};
-
-int rxr_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric,
-	       void *context)
-{
-	struct rxr_fabric *rxr_fabric;
-	struct fi_info hints, *rdm_info;
-	int ret, retv;
-
-	rxr_fabric = calloc(1, sizeof(*rxr_fabric));
-	if (!rxr_fabric)
-		return -FI_ENOMEM;
-
-	ret = ofi_fabric_init(&rxr_prov, &rxr_fabric_attr, attr,
-			      &rxr_fabric->util_fabric, context);
-	if (ret)
-		goto err_free_fabric;
-
-	memset(&hints, 0, sizeof(hints));
-	hints.fabric_attr = calloc(1, sizeof(*hints.fabric_attr));
-	if (!hints.fabric_attr) {
-		ret = -FI_ENOMEM;
-		goto err_close_util_fabric;
-	}
-	hints.fabric_attr->name = attr->name;
-	hints.fabric_attr->api_version = attr->api_version;
-	hints.mode = ~0;
-
-	ret = lower_efa_prov->getinfo(attr->api_version, NULL, NULL, 0, &hints,
-				      &rdm_info);
-	if (ret) {
-		FI_WARN(&rxr_prov, FI_LOG_FABRIC,
-			"Unable to get core info!\n");
-		ret = -FI_EINVAL;
-		goto err_free_hints;
-	}
-
-	ret = lower_efa_prov->fabric(rdm_info->fabric_attr,
-				     &rxr_fabric->lower_fabric, context);
-	if (ret)
-		goto err_free_rdm_info;
-
-	/* Open shm provider's fabric domain */
-	if (rxr_env.enable_shm_transfer) {
-		assert(!strcmp(shm_info->fabric_attr->name, "shm"));
-		ret = fi_fabric(shm_info->fabric_attr,
-				       &rxr_fabric->shm_fabric, context);
-		if (ret)
-			goto err_close_rdm_fabric;
-	}
-
-
-#ifdef RXR_PERF_ENABLED
-	ret = ofi_perfset_create(&rxr_prov, &rxr_fabric->perf_set,
-				 rxr_perf_size, perf_domain, perf_cntr,
-				 perf_flags);
-
-	if (ret)
-		FI_WARN(&rxr_prov, FI_LOG_FABRIC,
-			"Error initializing RxR perfset: %s\n",
-			fi_strerror(-ret));
-#endif
-
-	*fabric = &rxr_fabric->util_fabric.fabric_fid;
-	(*fabric)->fid.ops = &rxr_fabric_fi_ops;
-	(*fabric)->ops = &rxr_fabric_ops;
-
-	free(hints.fabric_attr);
-	fi_freeinfo(rdm_info);
-	return 0;
-
-err_close_rdm_fabric:
-	retv = fi_close(&rxr_fabric->lower_fabric->fid);
-	if (retv)
-		FI_WARN(&rxr_prov, FI_LOG_FABRIC,
-			"Unable to close lower rdm fabric: %s\n",
-			fi_strerror(-retv));
-err_free_rdm_info:
-	fi_freeinfo(rdm_info);
-err_free_hints:
-	free(hints.fabric_attr);
-err_close_util_fabric:
-	retv = ofi_fabric_close(&rxr_fabric->util_fabric);
-	if (retv)
-		FI_WARN(&rxr_prov, FI_LOG_FABRIC,
-			"Unable to close fabric: %s\n",
-			fi_strerror(-retv));
-err_free_fabric:
-	free(rxr_fabric);
-	return ret;
-}
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_init.c b/deps/libfabric/prov/efa/src/rxr/rxr_init.c
index a2230545f0e0ea66d053b259466f56fd14020302..68866318bb94e0f9f1ef6790c4b48d7c7b8824f5 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_init.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_init.c
@@ -40,7 +40,6 @@
 
 struct fi_info *shm_info;
 
-struct fi_provider *lower_efa_prov;
 struct efa_ep_addr *local_efa_addr;
 
 
@@ -56,7 +55,10 @@ struct rxr_env rxr_env = {
 	.shm_av_size = 128,
 	.shm_max_medium_size = 4096,
 	.recvwin_size = RXR_RECVWIN_SIZE,
+	.ooo_pool_chunk_size = 64,
+	.unexp_pool_chunk_size = 1024,
 	.readcopy_pool_size = 256,
+	.atomrsp_pool_size = 1024,
 	.cq_size = RXR_DEF_CQ_SIZE,
 	.max_memcpy_size = 4096,
 	.mtu_size = 0,
@@ -66,18 +68,29 @@ struct rxr_env rxr_env = {
 	.rx_iov_limit = 0,
 	.rx_copy_unexp = 1,
 	.rx_copy_ooo = 1,
-	.max_timeout = RXR_DEF_RNR_MAX_TIMEOUT,
-	.timeout_interval = 0, /* 0 is random timeout */
+	.rnr_backoff_wait_time_cap = RXR_DEFAULT_RNR_BACKOFF_WAIT_TIME_CAP,
+	.rnr_backoff_initial_wait_time = 0, /* 0 is random wait time  */
 	.efa_cq_read_size = 50,
 	.shm_cq_read_size = 50,
 	.efa_max_medium_msg_size = 65536,
 	.efa_min_read_msg_size = 1048576,
 	.efa_min_read_write_size = 65536,
 	.efa_read_segment_size = 1073741824,
+	.rnr_retry = 3, /* Setting this value to EFA_RNR_INFINITE_RETRY makes the firmware retry indefinitely */
 };
 
+/* @brief Read and store the FI_EFA_* environment variables.
+ */
 static void rxr_init_env(void)
 {
+	int fork_safe = 0;
+
+	if (getenv("FI_EFA_SHM_MAX_MEDIUM_SIZE")) {
+		fprintf(stderr,
+			"The FI_EFA_SHM_MAX_MEDIUM_SIZE environment variable was detected. This variable has been deprecated, so execution cannot proceed.\n");
+		abort();
+	}
+
 	fi_param_get_int(&rxr_prov, "rx_window_size", &rxr_env.rx_window_size);
 	fi_param_get_int(&rxr_prov, "tx_max_credits", &rxr_env.tx_max_credits);
 	fi_param_get_int(&rxr_prov, "tx_min_credits", &rxr_env.tx_min_credits);
@@ -87,7 +100,6 @@ static void rxr_init_env(void)
 	fi_param_get_int(&rxr_prov, "use_zcpy_rx", &rxr_env.use_zcpy_rx);
 	fi_param_get_int(&rxr_prov, "zcpy_rx_seed", &rxr_env.zcpy_rx_seed);
 	fi_param_get_int(&rxr_prov, "shm_av_size", &rxr_env.shm_av_size);
-	fi_param_get_int(&rxr_prov, "shm_max_medium_size", &rxr_env.shm_max_medium_size);
 	fi_param_get_int(&rxr_prov, "recvwin_size", &rxr_env.recvwin_size);
 	fi_param_get_int(&rxr_prov, "readcopy_pool_size", &rxr_env.readcopy_pool_size);
 	fi_param_get_int(&rxr_prov, "cq_size", &rxr_env.cq_size);
@@ -109,9 +121,13 @@ static void rxr_init_env(void)
 			  &rxr_env.rx_copy_unexp);
 	fi_param_get_bool(&rxr_prov, "rx_copy_ooo",
 			  &rxr_env.rx_copy_ooo);
-	fi_param_get_int(&rxr_prov, "max_timeout", &rxr_env.max_timeout);
+
+	fi_param_get_int(&rxr_prov, "max_timeout", &rxr_env.rnr_backoff_wait_time_cap);
+	if (rxr_env.rnr_backoff_wait_time_cap > RXR_MAX_RNR_BACKOFF_WAIT_TIME_CAP)
+		rxr_env.rnr_backoff_wait_time_cap = RXR_MAX_RNR_BACKOFF_WAIT_TIME_CAP;
+
 	fi_param_get_int(&rxr_prov, "timeout_interval",
-			 &rxr_env.timeout_interval);
+			 &rxr_env.rnr_backoff_initial_wait_time);
 	fi_param_get_size_t(&rxr_prov, "efa_cq_read_size",
 			 &rxr_env.efa_cq_read_size);
 	fi_param_get_size_t(&rxr_prov, "shm_cq_read_size",
@@ -124,37 +140,81 @@ static void rxr_init_env(void)
 			    &rxr_env.efa_min_read_write_size);
 	fi_param_get_size_t(&rxr_prov, "inter_read_segment_size",
 			    &rxr_env.efa_read_segment_size);
+
+	/* Initialize EFA's fork support flag based on the environment and
+	 * system support. */
+	efa_fork_status = EFA_FORK_SUPPORT_OFF;
+
+#if HAVE_IBV_IS_FORK_INITIALIZED == 1
+	if (ibv_is_fork_initialized() == IBV_FORK_UNNEEDED)
+		efa_fork_status = EFA_FORK_SUPPORT_UNNEEDED;
+#endif
+
+	if (efa_fork_status != EFA_FORK_SUPPORT_UNNEEDED) {
+		fi_param_get_bool(&rxr_prov, "fork_safe", &fork_safe);
+
+		/*
+		 * Check if any environment variables which would trigger
+		 * libibverbs' fork support are set. These variables are
+		 * defined by ibv_fork_init(3).
+		 */
+		if (fork_safe || getenv("RDMAV_FORK_SAFE") || getenv("IBV_FORK_SAFE"))
+			efa_fork_status = EFA_FORK_SUPPORT_ON;
+	}
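+
+	/*
+	 * Illustrative example (not part of the original patch): a forking
+	 * application can opt in to the fork support detected above by
+	 * exporting the variable defined for the "fork_safe" parameter
+	 * before startup, e.g.
+	 *
+	 *	$ FI_EFA_FORK_SAFE=1 ./my_app
+	 *
+	 * where "my_app" is a placeholder; RDMAV_FORK_SAFE=1 or
+	 * IBV_FORK_SAFE=1 have the same effect.
+	 */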
 }
 
-/*
- * Stringify the void *addr to a string smr_name formatted as `gid_qpn`, which
- * will be used to insert into shm provider's AV. Then shm uses smr_name as
- * ep_name to create the shared memory region.
+/* @brief convert a raw address to a unique shm endpoint name (smr_name)
+ *
+ * Note that even though all shm endpoints are on the same instance,
+ * one instance can have multiple EFA devices, so it is still necessary
+ * to include the GID in the name.
+ *
+ * An smr name consists of the following 4 parts:
+ *
+ *    GID:   ipv6 address from inet_ntop
+ *    QPN:   %04x format
+ *    QKEY:  %08x format
+ *    UID:   %04x format
+ *
+ * The parts are joined by underscores.
+ *
+ * The following is an example:
  *
- * The IPv6 address length is 46, but the max supported name length for shm is 32.
- * The string `gid_qpn` could be truncated during snprintf.
- * The current way works because the IPv6 addresses starting with FE in hexadecimals represent
- * link local IPv6 addresses, which has reserved first 64 bits (FE80::/64).
- * e.g., fe80:0000:0000:0000:0436:29ff:fe8e:ceaa -> fe80::436:29ff:fe8e:ceaa
- * And the length of string `gid_qpn` (fe80::436:29ff:fe8e:ceaa_***) will not exceed 32.
- * If the address is NOT link local, we need to think another reasonable way to
- * generate the string.
+ *    fe80::4a5:28ff:fe98:e500_0001_12918366_03e8
+ *
+ * @param[in]		ptr		pointer to raw address (struct efa_ep_addr)
+ * @param[out]		smr_name	a unique name for the shm ep
+ * @param[in,out]	smr_name_len    As input, specifies the size of the "smr_name" buffer.
+ *					As output, specifies the number of bytes written to the buffer.
+ *
+ * @return	0 on success.
+ * 		negative error code on failure.
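+ *
+ * Illustrative usage (the variable names are hypothetical, not part of
+ * the original patch):
+ *
+ *	char smr_name[NAME_MAX];
+ *	size_t smr_name_len = sizeof(smr_name);
+ *	err = rxr_raw_addr_to_smr_name(raw_addr, smr_name, &smr_name_len);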
  */
-int rxr_ep_efa_addr_to_str(const void *addr, char *smr_name)
+int rxr_raw_addr_to_smr_name(void *ptr, char *smr_name, size_t *smr_name_len)
 {
-	char gid[INET6_ADDRSTRLEN] = { 0 };
-	uint16_t qpn;
+	struct efa_ep_addr *raw_addr;
+	char gidstr[INET6_ADDRSTRLEN] = { 0 };
 	int ret;
 
-	if (!inet_ntop(AF_INET6, ((struct efa_ep_addr *)addr)->raw, gid, INET6_ADDRSTRLEN)) {
-		printf("Failed to get current EFA's GID, errno: %d\n", errno);
-		return 0;
+	raw_addr = (struct efa_ep_addr *)ptr;
+	if (!inet_ntop(AF_INET6, raw_addr->raw, gidstr, INET6_ADDRSTRLEN)) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ, "Failed to convert GID to string errno: %d\n", errno);
+		return -errno;
 	}
-	qpn = ((struct efa_ep_addr *)addr)->qpn;
 
-	ret = snprintf(smr_name, NAME_MAX, "%ld_%s_%d", (size_t) getuid(), gid, qpn);
+	ret = snprintf(smr_name, *smr_name_len, "%s_%04x_%08x_%04x",
+		       gidstr, raw_addr->qpn, raw_addr->qkey, getuid());
+	if (ret < 0)
+		return ret;
+
+	if (ret == 0 || ret >= *smr_name_len)
+		return -FI_EINVAL;
 
-	return (ret <= 0) ? ret : FI_SUCCESS;
+	/* plus 1 here for the terminating '\0' character, which is not
+	 * included in snprintf()'s return value
+	 */
+	*smr_name_len = ret + 1;
+	return FI_SUCCESS;
 }
 
 void rxr_info_to_core_mr_modes(uint32_t version,
@@ -254,7 +314,7 @@ static int rxr_info_to_core(uint32_t version, const struct fi_info *rxr_info,
 }
 
 /* Explicitly set all necessary bits before calling shm provider's getinfo function */
-void rxr_set_shm_hints(struct fi_info *shm_hints)
+static void rxr_set_shm_hints(const struct fi_info *app_hints, struct fi_info *shm_hints)
 {
 	shm_hints->caps = FI_MSG | FI_TAGGED | FI_RECV | FI_SEND | FI_READ
 			   | FI_WRITE | FI_REMOTE_READ | FI_REMOTE_WRITE
@@ -267,6 +327,18 @@ void rxr_set_shm_hints(struct fi_info *shm_hints)
 	shm_hints->fabric_attr->name = strdup("shm");
 	shm_hints->fabric_attr->prov_name = strdup("shm");
 	shm_hints->ep_attr->type = FI_EP_RDM;
+
+	/*
+	 * We validate whether FI_HMEM is supported before this function is
+	 * called, so it's safe to check for this via the app hints directly.
+	 * We should combine this and the earlier FI_HMEM validation when we
+	 * clean up the getinfo path. That's not possible at the moment as we
+	 * only have one SHM info for the entire provider which isn't right.
+	 */
+	if (app_hints && (app_hints->caps & FI_HMEM)) {
+		shm_hints->caps |= FI_HMEM;
+		shm_hints->domain_attr->mr_mode |= FI_MR_HMEM;
+	}
 }
 
 /* Pass tx/rx attr that user specifies down to core provider */
@@ -325,6 +397,11 @@ static int rxr_info_to_rxr(uint32_t version, const struct fi_info *core_info,
 {
 	uint64_t atomic_ordering;
 	uint64_t max_atomic_size;
+	uint64_t min_pkt_size;
+
+	if (!core_info)
+		return -FI_EINVAL;
 
 	info->caps = rxr_info.caps;
 	info->mode = rxr_info.mode;
@@ -334,8 +411,24 @@ static int rxr_info_to_rxr(uint32_t version, const struct fi_info *core_info,
 	*info->ep_attr = *rxr_info.ep_attr;
 	*info->domain_attr = *rxr_info.domain_attr;
 
-	/* TODO: update inject_size when we implement inject */
-	info->tx_attr->inject_size = 0;
+	/*
+	 * The requirement for inject is: upon return, the user buffer can be reused immediately.
+	 *
+	 * For EFA, inject is implemented as: construct a packet entry, copy the user data to the
+	 * packet entry, then send the packet entry. Therefore the maximum inject size is
+	 *    pkt_entry_size - maximum_header_size.
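+	 *
+	 * As a hypothetical worked example (the header size is illustrative,
+	 * not measured): with the default shm_max_medium_size of 4096 and a
+	 * 64-byte maximum header, the inject size would be 4096 - 64 = 4032.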
+	 */
+	if (rxr_env.enable_shm_transfer)
+		min_pkt_size = MIN(core_info->ep_attr->max_msg_size, rxr_env.shm_max_medium_size);
+	else
+		min_pkt_size = core_info->ep_attr->max_msg_size;
+
+	if (min_pkt_size < rxr_pkt_max_header_size()) {
+		info->tx_attr->inject_size = 0;
+	} else {
+		info->tx_attr->inject_size = min_pkt_size - rxr_pkt_max_header_size();
+	}
+
 	rxr_info.tx_attr->inject_size = info->tx_attr->inject_size;
 
 	info->addr_format = core_info->addr_format;
@@ -349,7 +442,7 @@ static int rxr_info_to_rxr(uint32_t version, const struct fi_info *core_info,
 	 * cap). The logic for device-specific checks pertaining to HMEM comes
 	 * further along this path.
 	 */
-	if ((core_info && !(core_info->caps & FI_HMEM)) || !hints) {
+	if (!(core_info->caps & FI_HMEM) || !hints) {
 		info->caps &= ~FI_HMEM;
 	}
 
@@ -391,10 +484,6 @@ static int rxr_info_to_rxr(uint32_t version, const struct fi_info *core_info,
 			info->domain_attr->data_progress = FI_PROGRESS_MANUAL;
 		}
 
-		/* Use a table for AV if the app has no strong requirement */
-		if (!hints->domain_attr || hints->domain_attr->av_type == FI_AV_UNSPEC)
-			info->domain_attr->av_type = FI_AV_TABLE;
-
 #if HAVE_LIBCUDA
 		/* If the application requires HMEM support, we will add FI_MR_HMEM
 		 * to mr_mode, because we need application to provide descriptor
@@ -408,23 +497,27 @@ static int rxr_info_to_rxr(uint32_t version, const struct fi_info *core_info,
 		 * which means FI_MR_HMEM implies FI_MR_LOCAL for cuda buffer
 		 */
 		if (hints->caps & FI_HMEM) {
-
+			if (ofi_hmem_p2p_disabled()) {
+				FI_WARN(&rxr_prov, FI_LOG_CORE,
+					"FI_HMEM capability currently requires peer to peer support, which is disabled.\n");
+				return -FI_ENODATA;
+			}
 			if (!efa_device_support_rdma_read()) {
-				FI_INFO(&rxr_prov, FI_LOG_CORE,
+				FI_WARN(&rxr_prov, FI_LOG_CORE,
 				        "FI_HMEM capability requires RDMA, which this device does not support.\n");
 				return -FI_ENODATA;
 
 			}
 
 			if (!rxr_env.use_device_rdma) {
-				FI_INFO(&rxr_prov, FI_LOG_CORE,
+				FI_WARN(&rxr_prov, FI_LOG_CORE,
 				        "FI_HMEM capability requires RDMA, which is turned off. You can turn it on by set environment variable FI_EFA_USE_DEVICE_RDMA to 1.\n");
 				return -FI_ENODATA;
 			}
 
 			if (hints->domain_attr &&
 			    !(hints->domain_attr->mr_mode & FI_MR_HMEM)) {
-				FI_INFO(&rxr_prov, FI_LOG_CORE,
+				FI_WARN(&rxr_prov, FI_LOG_CORE,
 				        "FI_HMEM capability requires device registrations (FI_MR_HMEM)\n");
 				return -FI_ENODATA;
 			}
@@ -463,19 +556,23 @@ static int rxr_info_to_rxr(uint32_t version, const struct fi_info *core_info,
 			info->mode |= FI_MSG_PREFIX;
 			info->tx_attr->mode |= FI_MSG_PREFIX;
 			info->rx_attr->mode |= FI_MSG_PREFIX;
-
-			/*
-			 * The prefix needs to be a multiple of 8. The pkt_entry
-			 * is already at 64 bytes (128 with debug).
-			 */
-			info->ep_attr->msg_prefix_size =  sizeof(struct rxr_pkt_entry)
-							  + sizeof(struct rxr_eager_msgrtm_hdr);
-			assert(!(info->ep_attr->msg_prefix_size % 8));
+			info->ep_attr->msg_prefix_size = RXR_MSG_PREFIX_SIZE;
 			FI_INFO(&rxr_prov, FI_LOG_CORE,
 				"FI_MSG_PREFIX size = %ld\n", info->ep_attr->msg_prefix_size);
 		}
 	}
 
+	/* Use a table for AV if the app has no strong requirement */
+	if (!hints || !hints->domain_attr ||
+	    hints->domain_attr->av_type == FI_AV_UNSPEC)
+		info->domain_attr->av_type = FI_AV_TABLE;
+
+	if (!hints || !hints->domain_attr ||
+	    hints->domain_attr->resource_mgmt == FI_RM_UNSPEC)
+		info->domain_attr->resource_mgmt = FI_RM_ENABLED;
+	else
+		info->domain_attr->resource_mgmt = hints->domain_attr->resource_mgmt;
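+
+	/*
+	 * Illustrative example (not part of the original patch): an
+	 * application with a strong preference can override these defaults
+	 * through the hints passed to fi_getinfo(), e.g.
+	 *
+	 *	hints->domain_attr->av_type = FI_AV_MAP;
+	 *	hints->domain_attr->resource_mgmt = FI_RM_DISABLED;
+	 */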
+
 	rxr_set_rx_tx_size(info, core_info);
 	return 0;
 }
@@ -501,17 +598,16 @@ int rxr_get_lower_rdm_info(uint32_t version, const char *node,
 	if (ret)
 		return ret;
 
-	ret = lower_efa_prov->getinfo(version, node, service, flags,
-				      core_hints, core_info);
+	ret = efa_getinfo(version, node, service, flags, core_hints, core_info);
 	fi_freeinfo(core_hints);
 	return ret;
 }
 
 /*
- * Call getinfo on lower efa provider to get all locally qualified fi_info
+ * Call efa_getinfo() to get all locally qualified fi_info
  * structure, then store the corresponding efa nic GIDs
  */
-int rxr_get_local_gids(struct fi_provider *lower_efa_prov)
+int rxr_get_local_gids(void)
 {
 	struct fi_info *core_info, *cur;
 	struct efa_ep_addr *cur_efa_addr;
@@ -520,7 +616,7 @@ int rxr_get_local_gids(struct fi_provider *lower_efa_prov)
 	cur_efa_addr = local_efa_addr = NULL;
 	core_info = cur = NULL;
 
-	ret = lower_efa_prov->getinfo(rxr_prov.fi_version, NULL, NULL, 0, NULL, &core_info);
+	ret = efa_getinfo(rxr_prov.fi_version, NULL, NULL, 0, NULL, &core_info);
 	if (ret)
 		return ret;
 
@@ -560,8 +656,7 @@ static int rxr_dgram_getinfo(uint32_t version, const char *node,
 
 	core_info = NULL;
 
-	ret = lower_efa_prov->getinfo(version, node, service,
-				      flags, hints, &core_info);
+	ret = efa_getinfo(version, node, service, flags, hints, &core_info);
 
 	if (ret)
 		return ret;
@@ -663,7 +758,7 @@ dgram_info:
 	if (!ret && rxr_env.enable_shm_transfer && !shm_info) {
 		shm_info = NULL;
 		shm_hints = fi_allocinfo();
-		rxr_set_shm_hints(shm_hints);
+		rxr_set_shm_hints(hints, shm_hints);
 		ret = fi_getinfo(FI_VERSION(1, 8), NULL, NULL,
 		                 OFI_GETINFO_HIDDEN, shm_hints, &shm_info);
 		fi_freeinfo(shm_hints);
@@ -691,8 +786,7 @@ static void rxr_fini(void)
 {
 	struct efa_ep_addr *cur;
 
-	if (lower_efa_prov)
-		lower_efa_prov->cleanup();
+	efa_finalize_prov();
 
 	if (rxr_env.enable_shm_transfer) {
 		/* Cleanup all local efa nic GIDs */
@@ -717,7 +811,7 @@ struct fi_provider rxr_prov = {
 	.version = OFI_VERSION_DEF_PROV,
 	.fi_version = OFI_VERSION_LATEST,
 	.getinfo = rxr_getinfo,
-	.fabric = rxr_fabric,
+	.fabric = efa_fabric,
 	.cleanup = rxr_fini
 };
 
@@ -741,8 +835,6 @@ EFA_INI
 			"Defines the number of bounce-buffers the provider will prepost during EP initialization.  (Default: 0)");
 	fi_param_define(&rxr_prov, "shm_av_size", FI_PARAM_INT,
 			"Defines the maximum number of entries in SHM provider's address vector (Default 128).");
-	fi_param_define(&rxr_prov, "shm_max_medium_size", FI_PARAM_INT,
-			"Defines the switch point between small/medium message and large message. The message larger than this switch point will be transferred with large message protocol (Default 4096).");
 	fi_param_define(&rxr_prov, "recvwin_size", FI_PARAM_INT,
 			"Defines the size of sliding receive window. (Default: 16384)");
 	fi_param_define(&rxr_prov, "readcopy_pool_size", FI_PARAM_INT,
@@ -788,6 +880,9 @@ EFA_INI
 			"The mimimum message size for inter EFA write to use read write protocol. If firmware support RDMA read, and FI_EFA_USE_DEVICE_RDMA is 1, write requests whose size is larger than this value will use the read write protocol (Default 65536).");
 	fi_param_define(&rxr_prov, "inter_read_segment_size", FI_PARAM_INT,
 			"Calls to RDMA read is segmented using this value.");
+	fi_param_define(&rxr_prov, "fork_safe", FI_PARAM_BOOL,
+			"Enables fork support and disables internal usage of huge pages. Has no effect on kernels which set copy-on-fork for registered pages, generally 5.13 and later. (Default: false)");
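+	/*
+	 * Editorial note (assumption): like the other parameters defined
+	 * here, this knob surfaces to applications as an environment
+	 * variable, e.g.
+	 *
+	 *     FI_EFA_FORK_SAFE=1 ./app
+	 */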
+
 	rxr_init_env();
 
 #if HAVE_EFA_DL
@@ -796,11 +891,10 @@ EFA_INI
 	ofi_monitors_init();
 #endif
 
-	lower_efa_prov = init_lower_efa_prov();
-	if (!lower_efa_prov)
+	if (efa_init_prov())
 		return NULL;
 
-	if (rxr_env.enable_shm_transfer && rxr_get_local_gids(lower_efa_prov))
+	if (rxr_env.enable_shm_transfer && rxr_get_local_gids())
 		return NULL;
 
 	return &rxr_prov;
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_msg.c b/deps/libfabric/prov/efa/src/rxr/rxr_msg.c
index adcc044063b51e9207bca733dec403f7f8bd74ef..b31fe9e03fb34ed16bb6ba95dc353a248d4d6e7e 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_msg.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_msg.c
@@ -62,17 +62,23 @@ static inline
 ssize_t rxr_msg_post_cuda_rtm(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry)
 {
 	int err, tagged;
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
+	int pkt_type;
+	bool delivery_complete_requested;
 
 	assert(RXR_EAGER_MSGRTM_PKT + 1 == RXR_EAGER_TAGRTM_PKT);
-	assert(RXR_READ_MSGRTM_PKT + 1 == RXR_READ_TAGRTM_PKT);
+	assert(RXR_LONGREAD_MSGRTM_PKT + 1 == RXR_LONGREAD_TAGRTM_PKT);
+	assert(RXR_DC_EAGER_MSGRTM_PKT + 1 == RXR_DC_EAGER_TAGRTM_PKT);
 
 	tagged = (tx_entry->op == ofi_op_tagged);
 	assert(tagged == 0 || tagged == 1);
 
-	if (tx_entry->total_len == 0)
-		return rxr_pkt_post_ctrl_or_queue(rxr_ep, RXR_TX_ENTRY, tx_entry,
-							  RXR_EAGER_MSGRTM_PKT + tagged, 0);
+	delivery_complete_requested = tx_entry->fi_flags & FI_DELIVERY_COMPLETE;
+	if (tx_entry->total_len == 0) {
+		pkt_type = delivery_complete_requested ? RXR_DC_EAGER_MSGRTM_PKT : RXR_EAGER_MSGRTM_PKT;
+		return rxr_pkt_post_ctrl(rxr_ep, RXR_TX_ENTRY, tx_entry,
+					 pkt_type + tagged, 0, 0);
+	}
 
 	/* Currently cuda data must be sent using read message protocol.
 	 * However, because read message protocol is an extra feature, we cannot
@@ -82,6 +88,7 @@ ssize_t rxr_msg_post_cuda_rtm(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_ent
 	 */
 	peer = rxr_ep_get_peer(rxr_ep, tx_entry->addr);
 	assert(peer);
+
 	err = rxr_pkt_wait_handshake(rxr_ep, tx_entry->addr, peer);
 	if (OFI_UNLIKELY(err)) {
 		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "waiting for handshake packet failed!\n");
@@ -94,8 +101,8 @@ ssize_t rxr_msg_post_cuda_rtm(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_ent
 		return -FI_EOPNOTSUPP;
 	}
 
-	return rxr_pkt_post_ctrl_or_queue(rxr_ep, RXR_TX_ENTRY, tx_entry,
-					  RXR_READ_MSGRTM_PKT + tagged, 0);
+	return rxr_pkt_post_ctrl(rxr_ep, RXR_TX_ENTRY, tx_entry,
+				 RXR_LONGREAD_MSGRTM_PKT + tagged, 0, 0);
 }
 
 ssize_t rxr_msg_post_rtm(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry)
@@ -105,52 +112,97 @@ ssize_t rxr_msg_post_rtm(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry)
 	 * always the correspondent message rtm packet type id + 1, thus the assertion here.
 	 */
 	assert(RXR_EAGER_MSGRTM_PKT + 1 == RXR_EAGER_TAGRTM_PKT);
-	assert(RXR_READ_MSGRTM_PKT + 1 == RXR_READ_TAGRTM_PKT);
-	assert(RXR_LONG_MSGRTM_PKT + 1 == RXR_LONG_TAGRTM_PKT);
+	assert(RXR_LONGREAD_MSGRTM_PKT + 1 == RXR_LONGREAD_TAGRTM_PKT);
+	assert(RXR_LONGCTS_MSGRTM_PKT + 1 == RXR_LONGCTS_TAGRTM_PKT);
 	assert(RXR_MEDIUM_MSGRTM_PKT + 1 == RXR_MEDIUM_TAGRTM_PKT);
 
+	assert(RXR_DC_EAGER_MSGRTM_PKT + 1 == RXR_DC_EAGER_TAGRTM_PKT);
+	assert(RXR_DC_MEDIUM_MSGRTM_PKT + 1 == RXR_DC_MEDIUM_TAGRTM_PKT);
+	assert(RXR_DC_LONGCTS_MSGRTM_PKT + 1 == RXR_DC_LONGCTS_TAGRTM_PKT);
+
 	int tagged;
 	size_t max_rtm_data_size;
 	ssize_t err;
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
+	bool delivery_complete_requested;
+	int ctrl_type;
 	struct efa_domain *efa_domain;
 	struct rxr_domain *rxr_domain = rxr_ep_domain(rxr_ep);
 
 	efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain,
 				  util_domain.domain_fid);
 
-
 	assert(tx_entry->op == ofi_op_msg || tx_entry->op == ofi_op_tagged);
 	tagged = (tx_entry->op == ofi_op_tagged);
 	assert(tagged == 0 || tagged == 1);
 
-	max_rtm_data_size = rxr_pkt_req_max_data_size(rxr_ep,
-						      tx_entry->addr,
-						      RXR_EAGER_MSGRTM_PKT + tagged);
-
+	if (tx_entry->fi_flags & FI_INJECT)
+		delivery_complete_requested = false;
+	else
+		delivery_complete_requested = tx_entry->fi_flags & FI_DELIVERY_COMPLETE;
 	peer = rxr_ep_get_peer(rxr_ep, tx_entry->addr);
+	assert(peer);
 
-	if (peer->is_local) {
-		assert(rxr_ep->use_shm);
-		/* intra instance message */
-		int rtm_type = (tx_entry->total_len <= max_rtm_data_size) ? RXR_EAGER_MSGRTM_PKT
-									  : RXR_READ_MSGRTM_PKT;
-
-		return rxr_pkt_post_ctrl_or_queue(rxr_ep, RXR_TX_ENTRY, tx_entry, rtm_type + tagged, 0);
-	}
-
-	if (rxr_ep->use_zcpy_rx) {
+	if (delivery_complete_requested && !(peer->is_local)) {
+		tx_entry->rxr_flags |= RXR_DELIVERY_COMPLETE_REQUESTED;
 		/*
-		 * The application can not deal with varying packet header sizes
-		 * before and after receiving a handshake. Forcing a handshake
-		 * here so we can always use the smallest eager msg packet
-		 * header size to determine the msg_prefix_size.
+		 * Because delivery complete is defined as an extra
+		 * feature, the receiver might not support it.
+		 *
+		 * The sender cannot send with FI_DELIVERY_COMPLETE
+		 * if the peer is not able to handle it.
+		 *
+		 * If the sender does not know whether the peer can
+		 * handle it, it needs to trigger a handshake packet
+		 * from the peer.  The handshake packet carries the
+		 * information on whether the peer supports it or not.
 		 */
-		err = rxr_pkt_wait_handshake(rxr_ep, tx_entry->addr, peer);
+		err = rxr_pkt_trigger_handshake(rxr_ep, tx_entry->addr, peer);
 		if (OFI_UNLIKELY(err))
 			return err;
 
-		assert(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED);
+		if (!(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED))
+			return -FI_EAGAIN;
+
+		else if (!rxr_peer_support_delivery_complete(peer))
+			return -FI_EOPNOTSUPP;
+
+		max_rtm_data_size = rxr_pkt_req_max_data_size(rxr_ep,
+							      tx_entry->addr,
+							      RXR_DC_EAGER_MSGRTM_PKT + tagged,
+							      tx_entry->fi_flags, 0);
+	} else {
+		max_rtm_data_size = rxr_pkt_req_max_data_size(rxr_ep,
+							      tx_entry->addr,
+							      RXR_EAGER_MSGRTM_PKT + tagged,
+							      tx_entry->fi_flags, 0);
+	}
+
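+	/*
+	 * Editorial sketch (assumption, not provider code): seen from the
+	 * application, the delivery-complete gating above behaves like
+	 *
+	 *     ret = fi_sendmsg(ep, &msg, FI_DELIVERY_COMPLETE);
+	 *     if (ret == -FI_EAGAIN)      retry later; handshake in flight
+	 *     if (ret == -FI_EOPNOTSUPP)  peer cannot honor delivery complete
+	 */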
+	if (peer->is_local) {
+		assert(rxr_ep->use_shm);
+		/* intra instance message
+		 *
+		 * Currently the shm provider does not support mixed memory type
+		 * iov (it will crash), which happens if the eager message
+		 * protocol is used for a cuda buffer. A GitHub issue has been
+		 * opened regarding this:
+		 *     https://github.com/ofiwg/libfabric/issues/6639
+		 * Until it is addressed, we use the read message protocol for
+		 * all cuda messages.
+		 */
+		if (tx_entry->total_len > max_rtm_data_size || efa_ep_is_cuda_mr(tx_entry->desc[0]))
+			/*
+			 * The read message protocol supports
+			 * FI_DELIVERY_COMPLETE implicitly.
+			 */
+			ctrl_type = RXR_LONGREAD_MSGRTM_PKT;
+		else
+			ctrl_type = delivery_complete_requested ? RXR_DC_EAGER_MSGRTM_PKT : RXR_EAGER_MSGRTM_PKT;
+
+		return rxr_pkt_post_ctrl(rxr_ep, RXR_TX_ENTRY, tx_entry, ctrl_type + tagged, 0, 0);
 	}
 
 	if (efa_ep_is_cuda_mr(tx_entry->desc[0])) {
@@ -158,26 +210,37 @@ ssize_t rxr_msg_post_rtm(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry)
 	}
 
 	/* inter instance message */
-	if (tx_entry->total_len <= max_rtm_data_size)
-		return rxr_pkt_post_ctrl_or_queue(rxr_ep, RXR_TX_ENTRY, tx_entry,
-						  RXR_EAGER_MSGRTM_PKT + tagged, 0);
+	if (tx_entry->total_len <= max_rtm_data_size) {
+		ctrl_type = (delivery_complete_requested) ?
+			RXR_DC_EAGER_MSGRTM_PKT : RXR_EAGER_MSGRTM_PKT;
+		return rxr_pkt_post_ctrl(rxr_ep, RXR_TX_ENTRY, tx_entry,
+					 ctrl_type + tagged, 0, 0);
+	}
 
 	if (tx_entry->total_len <= rxr_env.efa_max_medium_msg_size) {
 		/* we do not check the return value of rxr_ep_init_mr_desc()
 		 * because medium message works even if MR registration failed
 		 */
-		if (efa_is_cache_available(efa_domain))
+		if (tx_entry->desc[0] || efa_is_cache_available(efa_domain))
 			rxr_ep_tx_init_mr_desc(rxr_domain, tx_entry, 0, FI_SEND);
+
+		/*
+		 * We have to queue the message RTM because data is sent as
+		 * multiple medium RTM packets. It could happen that the first
+		 * several packets were sent successfully, but a following
+		 * packet encountered -FI_EAGAIN.
+		 */
+		ctrl_type = delivery_complete_requested ?
+			RXR_DC_MEDIUM_MSGRTM_PKT : RXR_MEDIUM_MSGRTM_PKT;
 		return rxr_pkt_post_ctrl_or_queue(rxr_ep, RXR_TX_ENTRY, tx_entry,
-						  RXR_MEDIUM_MSGRTM_PKT + tagged, 0);
+						  ctrl_type + tagged, 0);
 	}
 
 	if (tx_entry->total_len >= rxr_env.efa_min_read_msg_size &&
 	    efa_both_support_rdma_read(rxr_ep, peer) &&
 	    (tx_entry->desc[0] || efa_is_cache_available(efa_domain))) {
-		/* use read message protocol */
-		err = rxr_pkt_post_ctrl_or_queue(rxr_ep, RXR_TX_ENTRY, tx_entry,
-						 RXR_READ_MSGRTM_PKT + tagged, 0);
+		/* The read message protocol supports FI_DELIVERY_COMPLETE implicitly. */
+		err = rxr_pkt_post_ctrl(rxr_ep, RXR_TX_ENTRY, tx_entry,
+					RXR_LONGREAD_MSGRTM_PKT + tagged, 0, 0);
 
 		if (err != -FI_ENOMEM)
 			return err;
@@ -192,8 +255,10 @@ ssize_t rxr_msg_post_rtm(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry)
 	if (OFI_UNLIKELY(err))
 		return err;
 
-	return rxr_pkt_post_ctrl_or_queue(rxr_ep, RXR_TX_ENTRY, tx_entry,
-					  RXR_LONG_MSGRTM_PKT + tagged, 0);
+	ctrl_type = delivery_complete_requested ? RXR_DC_LONGCTS_MSGRTM_PKT : RXR_LONGCTS_MSGRTM_PKT;
+	tx_entry->rxr_flags |= RXR_LONGCTS_PROTOCOL;
+	return rxr_pkt_post_ctrl(rxr_ep, RXR_TX_ENTRY, tx_entry,
+				 ctrl_type + tagged, 0, 0);
 }
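+
+/*
+ * Editorial summary of the protocol selection in rxr_msg_post_rtm() above,
+ * for an inter-instance message of total length S:
+ *
+ *     S <= eager RTM max payload                     -> (DC_)EAGER_MSGRTM
+ *     S <= rxr_env.efa_max_medium_msg_size           -> (DC_)MEDIUM_MSGRTM
+ *     S >= rxr_env.efa_min_read_msg_size and both
+ *       sides support RDMA read (MR available)       -> LONGREAD_MSGRTM
+ *     otherwise                                      -> (DC_)LONGCTS_MSGRTM
+ */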
 
 ssize_t rxr_msg_generic_send(struct fid_ep *ep, const struct fi_msg *msg,
@@ -202,7 +267,7 @@ ssize_t rxr_msg_generic_send(struct fid_ep *ep, const struct fi_msg *msg,
 	struct rxr_ep *rxr_ep;
 	ssize_t err;
 	struct rxr_tx_entry *tx_entry;
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 
 	FI_DBG(&rxr_prov, FI_LOG_EP_DATA,
 	       "iov_len: %lu tag: %lx op: %x flags: %lx\n",
@@ -212,7 +277,7 @@ ssize_t rxr_msg_generic_send(struct fid_ep *ep, const struct fi_msg *msg,
 	rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid);
 	assert(msg->iov_count <= rxr_ep->tx_iov_limit);
 
-	rxr_perfset_start(rxr_ep, perf_rxr_tx);
+	efa_perfset_start(rxr_ep, perf_efa_tx);
 	fastlock_acquire(&rxr_ep->util_ep.lock);
 
 	if (OFI_UNLIKELY(is_tx_res_full(rxr_ep))) {
@@ -220,6 +285,14 @@ ssize_t rxr_msg_generic_send(struct fid_ep *ep, const struct fi_msg *msg,
 		goto out;
 	}
 
+	peer = rxr_ep_get_peer(rxr_ep, msg->addr);
+	assert(peer);
+
+	if (peer->flags & RXR_PEER_IN_BACKOFF) {
+		err = -FI_EAGAIN;
+		goto out;
+	}
+
 	tx_entry = rxr_ep_alloc_tx_entry(rxr_ep, msg, op, tag, flags);
 
 	if (OFI_UNLIKELY(!tx_entry)) {
@@ -230,8 +303,6 @@ ssize_t rxr_msg_generic_send(struct fid_ep *ep, const struct fi_msg *msg,
 
 	assert(tx_entry->op == ofi_op_msg || tx_entry->op == ofi_op_tagged);
 
-	peer = rxr_ep_get_peer(rxr_ep, tx_entry->addr);
-	assert(peer);
 	tx_entry->msg_id = peer->next_msg_id++;
 	err = rxr_msg_post_rtm(rxr_ep, tx_entry);
 	if (OFI_UNLIKELY(err)) {
@@ -241,7 +312,7 @@ ssize_t rxr_msg_generic_send(struct fid_ep *ep, const struct fi_msg *msg,
 
 out:
 	fastlock_release(&rxr_ep->util_ep.lock);
-	rxr_perfset_end(rxr_ep, perf_rxr_tx);
+	efa_perfset_end(rxr_ep, perf_efa_tx);
 	return err;
 }
 
@@ -310,7 +381,10 @@ ssize_t rxr_msg_inject(struct fid_ep *ep, const void *buf, size_t len,
 
 	rxr_setup_msg(&msg, &iov, NULL, 1, dest_addr, NULL, 0);
 	rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid);
-	assert(len <= rxr_ep->core_inject_size - sizeof(struct rxr_eager_msgrtm_hdr));
+	if (len > rxr_ep->inject_size) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ, "invalid message size %ld for inject.\n", len);
+		return -FI_EINVAL;
+	}
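+	/*
+	 * Editorial note: replacing the old assert with this runtime check
+	 * follows the fi_inject(3) contract -- len must not exceed the
+	 * endpoint's advertised inject size -- and keeps release builds
+	 * from silently accepting oversized buffers.
+	 */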
 
 	return rxr_msg_generic_send(ep, &msg, 0, ofi_op_msg,
 				    rxr_tx_flags(rxr_ep) | RXR_NO_COMPLETION | FI_INJECT);
@@ -330,12 +404,11 @@ ssize_t rxr_msg_injectdata(struct fid_ep *ep, const void *buf,
 
 	rxr_setup_msg(&msg, &iov, NULL, 1, dest_addr, NULL, data);
 	rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid);
-	/*
-	 * We advertise the largest possible inject size with no cq data or
-	 * source address. This means that we may end up not using the core
-	 * providers inject for this send.
-	 */
-	assert(len <= rxr_ep->core_inject_size - sizeof(struct rxr_eager_msgrtm_hdr));
+	if (len > rxr_ep->inject_size) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ, "invalid message size %ld for inject.\n", len);
+		return -FI_EINVAL;
+	}
+
 	return rxr_msg_generic_send(ep, &msg, 0, ofi_op_msg,
 				    rxr_tx_flags(rxr_ep) | RXR_NO_COMPLETION |
 				    FI_REMOTE_CQ_DATA | FI_INJECT);
@@ -417,7 +490,10 @@ ssize_t rxr_msg_tinject(struct fid_ep *ep_fid, const void *buf, size_t len,
 
 	rxr_setup_msg(&msg, &iov, NULL, 1, dest_addr, NULL, 0);
 	rxr_ep = container_of(ep_fid, struct rxr_ep, util_ep.ep_fid.fid);
-	assert(len <= rxr_ep->core_inject_size - sizeof(struct rxr_eager_tagrtm_hdr));
+	if (len > rxr_ep->inject_size) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ, "invalid message size %ld for inject.\n", len);
+		return -FI_EINVAL;
+	}
 
 	return rxr_msg_generic_send(ep_fid, &msg, tag, ofi_op_tagged,
 				    rxr_tx_flags(rxr_ep) | RXR_NO_COMPLETION | FI_INJECT);
@@ -436,12 +512,10 @@ ssize_t rxr_msg_tinjectdata(struct fid_ep *ep_fid, const void *buf, size_t len,
 
 	rxr_setup_msg(&msg, &iov, NULL, 1, dest_addr, NULL, data);
 	rxr_ep = container_of(ep_fid, struct rxr_ep, util_ep.ep_fid.fid);
-	/*
-	 * We advertise the largest possible inject size with no cq data or
-	 * source address. This means that we may end up not using the core
-	 * providers inject for this send.
-	 */
-	assert(len <= rxr_ep->core_inject_size - sizeof(struct rxr_eager_tagrtm_hdr));
+	if (len > rxr_ep->inject_size) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ, "invalid message size %ld for inject.\n", len);
+		return -FI_EINVAL;
+	}
 
 	return rxr_msg_generic_send(ep_fid, &msg, tag, ofi_op_tagged,
 				    rxr_tx_flags(rxr_ep) | RXR_NO_COMPLETION |
@@ -456,30 +530,19 @@ ssize_t rxr_msg_tinjectdata(struct fid_ep *ep_fid, const void *buf, size_t len,
  *   Utility functions and data structures
  */
 struct rxr_match_info {
-	fi_addr_t addr;
 	uint64_t tag;
 	uint64_t ignore;
 };
 
+/**
+ * @brief match function for rx_entry in ep->rx_unexp_tagged_list
+ *
+ * @param[in]	item	pointer to rx_entry->entry.
+ * @param[in]	arg	pointer to rxr_match_info
+ * @return   0 or 1 indicating whether this entry is a match
+ */
 static
-int rxr_msg_match_unexp_anyaddr(struct dlist_entry *item, const void *arg)
-{
-	return 1;
-}
-
-static
-int rxr_msg_match_unexp(struct dlist_entry *item, const void *arg)
-{
-	const struct rxr_match_info *match_info = arg;
-	struct rxr_rx_entry *rx_entry;
-
-	rx_entry = container_of(item, struct rxr_rx_entry, entry);
-
-	return rxr_match_addr(match_info->addr, rx_entry->addr);
-}
-
-static
-int rxr_msg_match_unexp_tagged_anyaddr(struct dlist_entry *item, const void *arg)
+int rxr_msg_match_ep_unexp_by_tag(struct dlist_entry *item, const void *arg)
 {
 	const struct rxr_match_info *match_info = arg;
 	struct rxr_rx_entry *rx_entry;
@@ -490,16 +553,22 @@ int rxr_msg_match_unexp_tagged_anyaddr(struct dlist_entry *item, const void *arg
 			     match_info->tag);
 }
 
+/**
+ * @brief match function for rx_entry in peer->rx_unexp_tagged_list
+ *
+ * @param[in]	item	pointer to rx_entry->peer_unexp_entry.
+ * @param[in]	arg	pointer to rxr_match_info
+ * @return   0 or 1 indicating whether this entry is a match
+ */
 static
-int rxr_msg_match_unexp_tagged(struct dlist_entry *item, const void *arg)
+int rxr_msg_match_peer_unexp_by_tag(struct dlist_entry *item, const void *arg)
 {
 	const struct rxr_match_info *match_info = arg;
 	struct rxr_rx_entry *rx_entry;
 
-	rx_entry = container_of(item, struct rxr_rx_entry, entry);
+	rx_entry = container_of(item, struct rxr_rx_entry, peer_unexp_entry);
 
-	return rxr_match_addr(match_info->addr, rx_entry->addr) &&
-	       rxr_match_tag(rx_entry->tag, match_info->ignore,
+	return rxr_match_tag(rx_entry->tag, match_info->ignore,
 			     match_info->tag);
 }
 
@@ -550,6 +619,239 @@ int rxr_msg_handle_unexp_match(struct rxr_ep *ep,
 	return rxr_pkt_proc_matched_rtm(ep, rx_entry, pkt_entry);
 }
 
+/**
+ * @brief allocate an rx entry for a fi_msg.
+ *        This function is used by two-sided operations only.
+ *
+ * @param ep[in]	end point
+ * @param msg[in]	fi_msg containing the iov, iov_count, and context for this operation
+ * @param op[in]	operation type (ofi_op_msg or ofi_op_tagged)
+ * @param flags[in]	flags the application used to call fi_recv/fi_trecv functions
+ * @param tag[in]	tag (used only if op is ofi_op_tagged)
+ * @param ignore[in]	ignore mask (used only if op is ofi_op_tagged)
+ * @return		if allocation succeeded, return a pointer to the rx_entry;
+ * 			if allocation failed, return NULL
+ */
+struct rxr_rx_entry *rxr_msg_alloc_rx_entry(struct rxr_ep *ep,
+					    const struct fi_msg *msg,
+					    uint32_t op, uint64_t flags,
+					    uint64_t tag, uint64_t ignore)
+{
+	struct rxr_rx_entry *rx_entry;
+	fi_addr_t addr;
+
+	if (ep->util_ep.caps & FI_DIRECTED_RECV)
+		addr = msg->addr;
+	else
+		addr = FI_ADDR_UNSPEC;
+
+	rx_entry = rxr_ep_alloc_rx_entry(ep, addr, op);
+	if (!rx_entry)
+		return NULL;
+
+	rx_entry->fi_flags = flags;
+	if (op == ofi_op_tagged) {
+		rx_entry->tag = tag;
+		rx_entry->cq_entry.tag = tag;
+		rx_entry->ignore = ignore;
+	}
+
+	/* Handle case where we're allocating an unexpected rx_entry */
+	rx_entry->iov_count = msg->iov_count;
+	if (rx_entry->iov_count) {
+		assert(msg->msg_iov);
+		memcpy(rx_entry->iov, msg->msg_iov, sizeof(*rx_entry->iov) * msg->iov_count);
+		rx_entry->cq_entry.len = ofi_total_iov_len(msg->msg_iov, msg->iov_count);
+		rx_entry->cq_entry.buf = msg->msg_iov[0].iov_base;
+	}
+
+	if (msg->desc)
+		memcpy(&rx_entry->desc[0], msg->desc, sizeof(*msg->desc) * msg->iov_count);
+	else
+		memset(&rx_entry->desc[0], 0, sizeof(rx_entry->desc));
+
+	rx_entry->cq_entry.op_context = msg->context;
+	return rx_entry;
+}
+
+struct rxr_rx_entry *rxr_msg_alloc_unexp_rx_entry_for_msgrtm(struct rxr_ep *ep,
+							     struct rxr_pkt_entry **pkt_entry_ptr)
+{
+	struct rdm_peer *peer;
+	struct rxr_rx_entry *rx_entry;
+	struct rxr_pkt_entry *unexp_pkt_entry;
+
+	unexp_pkt_entry = rxr_pkt_get_unexp(ep, pkt_entry_ptr);
+	if (OFI_UNLIKELY(!unexp_pkt_entry)) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ, "packet entries exhausted.\n");
+		return NULL;
+	}
+
+	rx_entry = rxr_ep_alloc_rx_entry(ep, unexp_pkt_entry->addr, ofi_op_msg);
+	if (OFI_UNLIKELY(!rx_entry))
+		return NULL;
+
+	rx_entry->rxr_flags = 0;
+	rx_entry->state = RXR_RX_UNEXP;
+	rx_entry->unexp_pkt = unexp_pkt_entry;
+	rxr_pkt_rtm_update_rx_entry(unexp_pkt_entry, rx_entry);
+	dlist_insert_tail(&rx_entry->entry, &ep->rx_unexp_list);
+	peer = rxr_ep_get_peer(ep, unexp_pkt_entry->addr);
+	dlist_insert_tail(&rx_entry->peer_unexp_entry, &peer->rx_unexp_list);
+	return rx_entry;
+}
+
+struct rxr_rx_entry *rxr_msg_alloc_unexp_rx_entry_for_tagrtm(struct rxr_ep *ep,
+							     struct rxr_pkt_entry **pkt_entry_ptr)
+{
+	struct rdm_peer *peer;
+	struct rxr_rx_entry *rx_entry;
+	struct rxr_pkt_entry *unexp_pkt_entry;
+
+	unexp_pkt_entry = rxr_pkt_get_unexp(ep, pkt_entry_ptr);
+	if (OFI_UNLIKELY(!unexp_pkt_entry)) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ, "packet entries exhausted.\n");
+		return NULL;
+	}
+
+	rx_entry = rxr_ep_alloc_rx_entry(ep, unexp_pkt_entry->addr, ofi_op_tagged);
+	if (OFI_UNLIKELY(!rx_entry))
+		return NULL;
+
+	rx_entry->tag = rxr_pkt_rtm_tag(unexp_pkt_entry);
+	rx_entry->rxr_flags = 0;
+	rx_entry->state = RXR_RX_UNEXP;
+	rx_entry->unexp_pkt = unexp_pkt_entry;
+	rxr_pkt_rtm_update_rx_entry(unexp_pkt_entry, rx_entry);
+	dlist_insert_tail(&rx_entry->entry, &ep->rx_unexp_tagged_list);
+	peer = rxr_ep_get_peer(ep, unexp_pkt_entry->addr);
+	dlist_insert_tail(&rx_entry->peer_unexp_entry, &peer->rx_unexp_tagged_list);
+	return rx_entry;
+}
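+
+/*
+ * Editorial note: both allocators above link the new rx_entry into two
+ * lists at once -- the endpoint-wide unexpected list (via rx_entry->entry)
+ * and the per-peer unexpected list (via rx_entry->peer_unexp_entry).
+ * rxr_msg_find_unexp_rx_entry() below walks the per-peer list when
+ * FI_DIRECTED_RECV narrows the source address, and the endpoint-wide
+ * list otherwise.
+ */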
+
+struct rxr_rx_entry *rxr_msg_split_rx_entry(struct rxr_ep *ep,
+					    struct rxr_rx_entry *posted_entry,
+					    struct rxr_rx_entry *consumer_entry,
+					    struct rxr_pkt_entry *pkt_entry)
+{
+	struct rxr_rx_entry *rx_entry;
+	size_t buf_len, consumed_len, data_len;
+	uint64_t tag, ignore;
+	struct fi_msg msg = {0};
+
+	assert(rxr_get_base_hdr(pkt_entry->pkt)->type >= RXR_REQ_PKT_BEGIN);
+
+	if (!consumer_entry) {
+		tag = 0;
+		ignore = ~0;
+		msg.msg_iov = posted_entry->iov;
+		msg.iov_count = posted_entry->iov_count;
+		msg.addr = pkt_entry->addr;
+		rx_entry = rxr_msg_alloc_rx_entry(ep, &msg,
+						  ofi_op_msg,
+						  posted_entry->fi_flags,
+						  tag, ignore);
+		if (OFI_UNLIKELY(!rx_entry))
+			return NULL;
+
+		FI_DBG(&rxr_prov, FI_LOG_EP_CTRL,
+		       "Splitting into new multi_recv consumer rx_entry %d from rx_entry %d\n",
+		       rx_entry->rx_id,
+		       posted_entry->rx_id);
+	} else {
+		rx_entry = consumer_entry;
+		memcpy(rx_entry->iov, posted_entry->iov,
+		       sizeof(*posted_entry->iov) * posted_entry->iov_count);
+		rx_entry->iov_count = posted_entry->iov_count;
+	}
+
+	rxr_pkt_rtm_update_rx_entry(pkt_entry, rx_entry);
+	data_len = rx_entry->total_len;
+	buf_len = ofi_total_iov_len(rx_entry->iov,
+				    rx_entry->iov_count);
+	consumed_len = MIN(buf_len, data_len);
+
+	rx_entry->rxr_flags |= RXR_MULTI_RECV_CONSUMER;
+	rx_entry->total_len = data_len;
+	rx_entry->fi_flags |= FI_MULTI_RECV;
+	rx_entry->master_entry = posted_entry;
+	rx_entry->cq_entry.len = consumed_len;
+	rx_entry->cq_entry.buf = rx_entry->iov[0].iov_base;
+	rx_entry->cq_entry.op_context = posted_entry->cq_entry.op_context;
+	rx_entry->cq_entry.flags = (FI_RECV | FI_MSG);
+
+	ofi_consume_iov(posted_entry->iov, &posted_entry->iov_count,
+			consumed_len);
+
+	dlist_init(&rx_entry->multi_recv_entry);
+	dlist_insert_tail(&rx_entry->multi_recv_entry,
+			  &posted_entry->multi_recv_consumers);
+	return rx_entry;
+}
+
+/**
+ * @brief find an unexpected rx entry for a receive operation.
+ *
+ * @param[in]	ep	endpoint
+ * @param[in]	addr	fi_addr of the peer to receive from, can be FI_ADDR_UNSPEC
+ * @param[in]	tag	tag of the unexpected message, used only if op is ofi_op_tagged.
+ * @param[in]	ignore	mask of the tag, used only if op is ofi_op_tagged.
+ * @param[in]	op	either ofi_op_tagged or ofi_op_msg.
+ * @param[in]	claim   whether to claim the rx_entry, i.e. remove it from the unexpected queue.
+ * @return	If an unexpected rx_entry was found, return the pointer.
+ * 		Otherwise, return NULL.
+ */
+static inline
+struct rxr_rx_entry *rxr_msg_find_unexp_rx_entry(struct rxr_ep *ep, fi_addr_t addr,
+						 int64_t tag, uint64_t ignore, uint32_t op,
+						 bool claim)
+{
+	struct rxr_match_info match_info;
+	struct rxr_rx_entry *rx_entry;
+	struct dlist_entry *match;
+	struct rdm_peer *peer;
+
+	peer = (ep->util_ep.caps & FI_DIRECTED_RECV) ? rxr_ep_get_peer(ep, addr) : NULL;
+
+	switch(op) {
+	case ofi_op_msg:
+		if (peer) {
+			match = dlist_empty(&peer->rx_unexp_list) ? NULL : peer->rx_unexp_list.next;
+			rx_entry = match ? container_of(match, struct rxr_rx_entry, peer_unexp_entry) : NULL;
+		} else {
+			match = dlist_empty(&ep->rx_unexp_list) ? NULL : ep->rx_unexp_list.next;
+			rx_entry = match ? container_of(match, struct rxr_rx_entry, entry) : NULL;
+		}
+		break;
+	case ofi_op_tagged:
+		match_info.tag = tag;
+		match_info.ignore = ignore;
+
+		if (peer) {
+			match = dlist_find_first_match(&peer->rx_unexp_tagged_list,
+			                               rxr_msg_match_peer_unexp_by_tag,
+						       (void *)&match_info);
+			rx_entry = match ? container_of(match, struct rxr_rx_entry, peer_unexp_entry) : NULL;
+		} else {
+			match = dlist_find_first_match(&ep->rx_unexp_tagged_list,
+						       rxr_msg_match_ep_unexp_by_tag,
+						       (void *)&match_info);
+			rx_entry = match ? container_of(match, struct rxr_rx_entry, entry) : NULL;
+		}
+		break;
+	default:
+		FI_WARN(&rxr_prov, FI_LOG_CQ, "Error: wrong op in rxr_msg_find_unexp_rx_entry()\n");
+		abort();
+	}
+
+	if (rx_entry && claim) {
+		dlist_remove(&rx_entry->entry);
+		dlist_remove(&rx_entry->peer_unexp_entry);
+	}
+
+	return rx_entry;
+}
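+
+/*
+ * Editorial usage sketch (assumption, not provider code): the claim flag
+ * mirrors fi_trecvmsg() peek semantics, e.g.
+ *
+ *     fi_trecvmsg(ep, &msg, FI_PEEK | FI_CLAIM);  claim == true
+ *     fi_trecvmsg(ep, &msg, FI_PEEK);             claim == false
+ *
+ * A claimed entry is removed from both the endpoint-wide and per-peer
+ * unexpected lists, so it cannot be matched twice.
+ */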
+
 /*
  *    Search unexpected list for matching message and process it if found.
  *    Returns 0 if the message is processed, -FI_ENOMSG if no match is found.
@@ -559,50 +861,24 @@ int rxr_msg_proc_unexp_msg_list(struct rxr_ep *ep, const struct fi_msg *msg,
 				uint64_t tag, uint64_t ignore, uint32_t op, uint64_t flags,
 				struct rxr_rx_entry *posted_entry)
 {
-	struct rxr_match_info match_info;
-	struct dlist_entry *match;
 	struct rxr_rx_entry *rx_entry;
-	dlist_func_t *match_func;
 	int ret;
+	bool claim;
 
-	if (op == ofi_op_tagged) {
-		if (ep->util_ep.caps & FI_DIRECTED_RECV)
-			match_func = &rxr_msg_match_unexp_tagged;
-		else
-			match_func = &rxr_msg_match_unexp_tagged_anyaddr;
-
-		match_info.addr = msg->addr;
-		match_info.tag = tag;
-		match_info.ignore = ignore;
-		match = dlist_remove_first_match(&ep->rx_unexp_tagged_list,
-		                                 match_func,
-						 (void *)&match_info);
-	} else {
-		if (ep->util_ep.caps & FI_DIRECTED_RECV)
-			match_func = &rxr_msg_match_unexp;
-		else
-			match_func = &rxr_msg_match_unexp_anyaddr;
-
-		match_info.addr = msg->addr;
-		match = dlist_remove_first_match(&ep->rx_unexp_list,
-		                                 match_func,
-						 (void *)&match_info);
-	}
-
-	if (!match)
+	claim = true;
+	rx_entry = rxr_msg_find_unexp_rx_entry(ep, msg->addr, tag, ignore, op, claim);
+	if (!rx_entry)
 		return -FI_ENOMSG;
 
-	rx_entry = container_of(match, struct rxr_rx_entry, entry);
-
 	/*
 	 * Initialize the matched entry as a multi-recv consumer if the posted
 	 * buffer is a multi-recv buffer.
 	 */
 	if (posted_entry) {
 		/*
-		 * rxr_ep_split_rx_entry will setup rx_entry iov and count
+		 * rxr_msg_split_rx_entry will setup rx_entry iov and count
 		 */
-		rx_entry = rxr_ep_split_rx_entry(ep, posted_entry, rx_entry,
+		rx_entry = rxr_msg_split_rx_entry(ep, posted_entry, rx_entry,
 						 rx_entry->unexp_pkt);
 		if (OFI_UNLIKELY(!rx_entry)) {
 			FI_WARN(&rxr_prov, FI_LOG_CQ,
@@ -675,7 +951,7 @@ ssize_t rxr_msg_multi_recv(struct rxr_ep *rxr_ep, const struct fi_msg *msg,
 	 * messages but will be used for tracking the application's buffer and
 	 * when to write the completion to release the buffer.
 	 */
-	rx_entry = rxr_ep_get_rx_entry(rxr_ep, msg, tag, ignore, op, flags);
+	rx_entry = rxr_msg_alloc_rx_entry(rxr_ep, msg, op, flags, tag, ignore);
 	if (OFI_UNLIKELY(!rx_entry)) {
 		rxr_ep_progress_internal(rxr_ep);
 		return -FI_EAGAIN;
@@ -767,7 +1043,7 @@ ssize_t rxr_msg_generic_recv(struct fid_ep *ep, const struct fi_msg *msg,
 
 	assert(msg->iov_count <= rxr_ep->rx_iov_limit);
 
-	rxr_perfset_start(rxr_ep, perf_rxr_recv);
+	efa_perfset_start(rxr_ep, perf_efa_recv);
 
 	assert(rxr_ep->util_ep.rx_msg_flags == 0 || rxr_ep->util_ep.rx_msg_flags == FI_COMPLETION);
 	rx_op_flags = rxr_ep->util_ep.rx_op_flags;
@@ -789,12 +1065,7 @@ ssize_t rxr_msg_generic_recv(struct fid_ep *ep, const struct fi_msg *msg,
 	unexp_list = (op == ofi_op_tagged) ? &rxr_ep->rx_unexp_tagged_list :
 		     &rxr_ep->rx_unexp_list;
 
-	/*
-	 * Attempt to match against stashed unexpected messages. This is not
-	 * applicable to the zero-copy path where unexpected messages are not
-	 * applicable, since there's no tag or address to match against.
-	 */
-	if (!dlist_empty(unexp_list) && !rxr_ep->use_zcpy_rx) {
+	if (!dlist_empty(unexp_list)) {
 		ret = rxr_msg_proc_unexp_msg_list(rxr_ep, msg, tag,
 						  ignore, op, flags, NULL);
 
@@ -803,8 +1074,7 @@ ssize_t rxr_msg_generic_recv(struct fid_ep *ep, const struct fi_msg *msg,
 		ret = 0;
 	}
 
-	rx_entry = rxr_ep_get_rx_entry(rxr_ep, msg, tag,
-				       ignore, op, flags);
+	rx_entry = rxr_msg_alloc_rx_entry(rxr_ep, msg, op, flags, tag, ignore);
 
 	if (OFI_UNLIKELY(!rx_entry)) {
 		ret = -FI_EAGAIN;
@@ -812,18 +1082,20 @@ ssize_t rxr_msg_generic_recv(struct fid_ep *ep, const struct fi_msg *msg,
 		goto out;
 	}
 
-	if (op == ofi_op_tagged)
+	if (rxr_ep->use_zcpy_rx) {
+		ret = rxr_ep_post_user_recv_buf(rxr_ep, rx_entry, flags);
+		if (ret == -FI_EAGAIN)
+			rxr_ep_progress_internal(rxr_ep);
+	} else if (op == ofi_op_tagged) {
 		dlist_insert_tail(&rx_entry->entry, &rxr_ep->rx_tagged_list);
-	else
+	} else {
 		dlist_insert_tail(&rx_entry->entry, &rxr_ep->rx_list);
-
-	if (rxr_ep->use_zcpy_rx)
-		rxr_ep_post_buf(rxr_ep, msg, flags, EFA_EP);
+	}
 
 out:
 	fastlock_release(&rxr_ep->util_ep.lock);
 
-	rxr_perfset_end(rxr_ep, perf_rxr_recv);
+	efa_perfset_end(rxr_ep, perf_efa_recv);
 	return ret;
 }
 
@@ -894,33 +1166,23 @@ ssize_t rxr_msg_peek_trecv(struct fid_ep *ep_fid,
 {
 	ssize_t ret = 0;
 	struct rxr_ep *ep;
-	struct dlist_entry *match;
-	dlist_func_t *match_func;
-	struct rxr_match_info match_info;
 	struct rxr_rx_entry *rx_entry;
 	struct fi_context *context;
 	struct rxr_pkt_entry *pkt_entry;
 	size_t data_len;
 	int64_t tag;
+	bool claim;
 
 	ep = container_of(ep_fid, struct rxr_ep, util_ep.ep_fid.fid);
 
 	fastlock_acquire(&ep->util_ep.lock);
 
 	rxr_ep_progress_internal(ep);
-	match_info.addr = msg->addr;
-	match_info.tag = msg->tag;
-	match_info.ignore = msg->ignore;
 
-	if (ep->util_ep.caps & FI_DIRECTED_RECV)
-		match_func = &rxr_msg_match_unexp_tagged;
-	else
-		match_func = &rxr_msg_match_unexp_tagged_anyaddr;
-
-	match = dlist_find_first_match(&ep->rx_unexp_tagged_list,
-	                               match_func,
-				       (void *)&match_info);
-	if (!match) {
+	claim = (flags & (FI_CLAIM | FI_DISCARD));
+	rx_entry = rxr_msg_find_unexp_rx_entry(ep, msg->addr, msg->tag, msg->ignore, ofi_op_tagged,
+					       claim);
+	if (!rx_entry) {
 		FI_DBG(&rxr_prov, FI_LOG_EP_CTRL,
 		       "Message not found addr: %" PRIu64
 		       " tag: %lx ignore %lx\n", msg->addr, msg->tag,
@@ -930,14 +1192,10 @@ ssize_t rxr_msg_peek_trecv(struct fid_ep *ep_fid,
 		goto out;
 	}
 
-	rx_entry = container_of(match, struct rxr_rx_entry, entry);
 	context = (struct fi_context *)msg->context;
 	if (flags & FI_CLAIM) {
 		context->internal[0] = rx_entry;
-		dlist_remove(match);
 	} else if (flags & FI_DISCARD) {
-		dlist_remove(match);
-
 		ret = rxr_msg_discard_trecv(ep, rx_entry, msg, flags);
 		if (ret)
 			goto out;
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_msg.h b/deps/libfabric/prov/efa/src/rxr/rxr_msg.h
index 58349d147f52dc2e88f74ac6c5fa2521983e8503..130b5c0e8de044dc843c21f78f8273e00fa02a0d 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_msg.h
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_msg.h
@@ -43,6 +43,24 @@ void rxr_msg_multi_recv_handle_completion(struct rxr_ep *ep,
 void rxr_msg_multi_recv_free_posted_entry(struct rxr_ep *ep,
 					  struct rxr_rx_entry *rx_entry);
 
+/**
+ * functions to allocate rx_entry for two-sided operations
+ */
+struct rxr_rx_entry *rxr_msg_alloc_rx_entry(struct rxr_ep *ep,
+					    const struct fi_msg *msg,
+					    uint32_t op, uint64_t flags,
+					    uint64_t tag, uint64_t ignore);
+
+struct rxr_rx_entry *rxr_msg_alloc_unexp_rx_entry_for_msgrtm(struct rxr_ep *ep,
+							     struct rxr_pkt_entry **pkt_entry);
+
+struct rxr_rx_entry *rxr_msg_alloc_unexp_rx_entry_for_tagrtm(struct rxr_ep *ep,
+							     struct rxr_pkt_entry **pkt_entry);
+
+struct rxr_rx_entry *rxr_msg_split_rx_entry(struct rxr_ep *ep,
+					    struct rxr_rx_entry *posted_entry,
+					    struct rxr_rx_entry *consumer_entry,
+					    struct rxr_pkt_entry *pkt_entry);
 /*
  * The following 2 OP structures are defined in rxr_msg.c and is
  * used by rxr_endpoint()
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_cmd.c b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_cmd.c
index a6398574461739104c715228f7ef7e1e74116423..e6f8b26bb1af46d1a4549174bd8a7ce4d1ada163 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_cmd.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_cmd.c
@@ -37,6 +37,7 @@
 #include "rxr_cntr.h"
 #include "rxr_read.h"
 #include "rxr_pkt_cmd.h"
+#include "rxr_pkt_type_base.h"
 
 /* Handshake wait timeout in microseconds */
 #define RXR_HANDSHAKE_WAIT_TIMEOUT 1000000
@@ -48,67 +49,6 @@
  *          dump (for debug only)
  */
 
-/*
- *  Functions used to post a packet
- */
-ssize_t rxr_pkt_post_data(struct rxr_ep *rxr_ep,
-			  struct rxr_tx_entry *tx_entry)
-{
-	struct rxr_pkt_entry *pkt_entry;
-	struct rxr_data_pkt *data_pkt;
-	ssize_t ret;
-	struct efa_domain *efa_domain;
-	struct rxr_domain *rxr_domain = rxr_ep_domain(rxr_ep);
-
-	efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain,
-				  util_domain.domain_fid);
-
-
-	pkt_entry = rxr_pkt_entry_alloc(rxr_ep, rxr_ep->tx_pkt_efa_pool);
-	if (OFI_UNLIKELY(!pkt_entry))
-		return -FI_ENOMEM;
-
-	pkt_entry->x_entry = (void *)tx_entry;
-	pkt_entry->addr = tx_entry->addr;
-
-	data_pkt = (struct rxr_data_pkt *)pkt_entry->pkt;
-
-	data_pkt->hdr.type = RXR_DATA_PKT;
-	data_pkt->hdr.version = RXR_BASE_PROTOCOL_VERSION;
-	data_pkt->hdr.flags = 0;
-
-	data_pkt->hdr.rx_id = tx_entry->rx_id;
-
-	/*
-	 * Data packets are sent in order so using bytes_sent is okay here.
-	 */
-	data_pkt->hdr.seg_offset = tx_entry->bytes_sent;
-
-	/*
-	 * TODO: Check to see if underlying device can support CUDA
-	 * registrations and fallback to rxr_ep_send_data_pkt_entry() if it does
-	 * not. This should be done at init time with a CUDA reg-and-fail flag.
-	 * For now, always send CUDA buffers through
-	 * rxr_pkt_send_data_desc().
-	 */
-	if (efa_is_cache_available(efa_domain) || efa_ep_is_cuda_mr(tx_entry->desc[0]))
-		ret = rxr_pkt_send_data_desc(rxr_ep, tx_entry, pkt_entry);
-	else
-		ret = rxr_pkt_send_data(rxr_ep, tx_entry, pkt_entry);
-
-	if (OFI_UNLIKELY(ret)) {
-		rxr_pkt_entry_release_tx(rxr_ep, pkt_entry);
-		return ret;
-	}
-
-	data_pkt = rxr_get_data_pkt(pkt_entry->pkt);
-	tx_entry->bytes_sent += data_pkt->hdr.seg_size;
-	tx_entry->window -= data_pkt->hdr.seg_size;
-	assert(data_pkt->hdr.seg_size > 0);
-	assert(tx_entry->window >= 0);
-	return ret;
-}
-
 /*
  *   rxr_pkt_init_ctrl() uses init functions declared in rxr_pkt_type.h
  */
@@ -131,6 +71,9 @@ int rxr_pkt_init_ctrl(struct rxr_ep *rxr_ep, int entry_type, void *x_entry,
 	case RXR_ATOMRSP_PKT:
 		ret = rxr_pkt_init_atomrsp(rxr_ep, (struct rxr_rx_entry *)x_entry, pkt_entry);
 		break;
+	case RXR_RECEIPT_PKT:
+		ret = rxr_pkt_init_receipt(rxr_ep, (struct rxr_rx_entry *)x_entry, pkt_entry);
+		break;
 	case RXR_EAGER_MSGRTM_PKT:
 		ret = rxr_pkt_init_eager_msgrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
 		break;
@@ -143,32 +86,32 @@ int rxr_pkt_init_ctrl(struct rxr_ep *rxr_ep, int entry_type, void *x_entry,
 	case RXR_MEDIUM_TAGRTM_PKT:
 		ret = rxr_pkt_init_medium_tagrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
 		break;
-	case RXR_LONG_MSGRTM_PKT:
-		ret = rxr_pkt_init_long_msgrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
+	case RXR_LONGCTS_MSGRTM_PKT:
+		ret = rxr_pkt_init_longcts_msgrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
 		break;
-	case RXR_LONG_TAGRTM_PKT:
-		ret = rxr_pkt_init_long_tagrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
+	case RXR_LONGCTS_TAGRTM_PKT:
+		ret = rxr_pkt_init_longcts_tagrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
 		break;
-	case RXR_READ_MSGRTM_PKT:
-		ret = rxr_pkt_init_read_msgrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
+	case RXR_LONGREAD_MSGRTM_PKT:
+		ret = rxr_pkt_init_longread_msgrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
 		break;
-	case RXR_READ_TAGRTM_PKT:
-		ret = rxr_pkt_init_read_tagrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
+	case RXR_LONGREAD_TAGRTM_PKT:
+		ret = rxr_pkt_init_longread_tagrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
 		break;
 	case RXR_EAGER_RTW_PKT:
 		ret = rxr_pkt_init_eager_rtw(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
 		break;
-	case RXR_LONG_RTW_PKT:
-		ret = rxr_pkt_init_long_rtw(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
+	case RXR_LONGCTS_RTW_PKT:
+		ret = rxr_pkt_init_longcts_rtw(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
 		break;
-	case RXR_READ_RTW_PKT:
-		ret = rxr_pkt_init_read_rtw(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
+	case RXR_LONGREAD_RTW_PKT:
+		ret = rxr_pkt_init_longread_rtw(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
 		break;
 	case RXR_SHORT_RTR_PKT:
 		ret = rxr_pkt_init_short_rtr(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
 		break;
-	case RXR_LONG_RTR_PKT:
-		ret = rxr_pkt_init_long_rtr(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
+	case RXR_LONGCTS_RTR_PKT:
+		ret = rxr_pkt_init_longcts_rtr(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
 		break;
 	case RXR_WRITE_RTA_PKT:
 		ret = rxr_pkt_init_write_rta(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
@@ -179,9 +122,39 @@ int rxr_pkt_init_ctrl(struct rxr_ep *rxr_ep, int entry_type, void *x_entry,
 	case RXR_COMPARE_RTA_PKT:
 		ret = rxr_pkt_init_compare_rta(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
 		break;
+	case RXR_DC_EAGER_MSGRTM_PKT:
+		ret = rxr_pkt_init_dc_eager_msgrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
+		break;
+	case RXR_DC_EAGER_TAGRTM_PKT:
+		ret = rxr_pkt_init_dc_eager_tagrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
+		break;
+	case RXR_DC_MEDIUM_MSGRTM_PKT:
+		ret = rxr_pkt_init_dc_medium_msgrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
+		break;
+	case RXR_DC_MEDIUM_TAGRTM_PKT:
+		ret = rxr_pkt_init_dc_medium_tagrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
+		break;
+	case RXR_DC_LONGCTS_MSGRTM_PKT:
+		ret = rxr_pkt_init_dc_longcts_msgrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
+		break;
+	case RXR_DC_LONGCTS_TAGRTM_PKT:
+		ret = rxr_pkt_init_dc_longcts_tagrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
+		break;
+	case RXR_DC_EAGER_RTW_PKT:
+		ret = rxr_pkt_init_dc_eager_rtw(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
+		break;
+	case RXR_DC_LONGCTS_RTW_PKT:
+		ret = rxr_pkt_init_dc_longcts_rtw(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
+		break;
+	case RXR_DC_WRITE_RTA_PKT:
+		ret = rxr_pkt_init_dc_write_rta(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
+		break;
+	case RXR_DATA_PKT:
+		ret = rxr_pkt_init_data(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry);
+		break;
 	default:
-		ret = -FI_EINVAL;
 		assert(0 && "unknown pkt type to init");
+		ret = -FI_EINVAL;
 		break;
 	}
 
@@ -209,54 +182,82 @@ void rxr_pkt_handle_ctrl_sent(struct rxr_ep *rxr_ep, struct rxr_pkt_entry *pkt_e
 	case RXR_ATOMRSP_PKT:
 		rxr_pkt_handle_atomrsp_sent(rxr_ep, pkt_entry);
 		break;
+	case RXR_RECEIPT_PKT:
+		rxr_pkt_handle_receipt_sent(rxr_ep, pkt_entry);
+		break;
 	case RXR_EAGER_MSGRTM_PKT:
 	case RXR_EAGER_TAGRTM_PKT:
 		rxr_pkt_handle_eager_rtm_sent(rxr_ep, pkt_entry);
 		break;
 	case RXR_MEDIUM_MSGRTM_PKT:
 	case RXR_MEDIUM_TAGRTM_PKT:
+	case RXR_DC_MEDIUM_MSGRTM_PKT:
+	case RXR_DC_MEDIUM_TAGRTM_PKT:
 		rxr_pkt_handle_medium_rtm_sent(rxr_ep, pkt_entry);
 		break;
-	case RXR_LONG_MSGRTM_PKT:
-	case RXR_LONG_TAGRTM_PKT:
-		rxr_pkt_handle_long_rtm_sent(rxr_ep, pkt_entry);
+	case RXR_LONGCTS_MSGRTM_PKT:
+	case RXR_DC_LONGCTS_MSGRTM_PKT:
+	case RXR_LONGCTS_TAGRTM_PKT:
+	case RXR_DC_LONGCTS_TAGRTM_PKT:
+		rxr_pkt_handle_longcts_rtm_sent(rxr_ep, pkt_entry);
 		break;
-	case RXR_READ_MSGRTM_PKT:
-	case RXR_READ_TAGRTM_PKT:
-		rxr_pkt_handle_read_rtm_sent(rxr_ep, pkt_entry);
+	case RXR_LONGREAD_MSGRTM_PKT:
+	case RXR_LONGREAD_TAGRTM_PKT:
+		rxr_pkt_handle_longread_rtm_sent(rxr_ep, pkt_entry);
 		break;
 	case RXR_EAGER_RTW_PKT:
 		rxr_pkt_handle_eager_rtw_sent(rxr_ep, pkt_entry);
 		break;
-	case RXR_LONG_RTW_PKT:
-		rxr_pkt_handle_long_rtw_sent(rxr_ep, pkt_entry);
+	case RXR_LONGCTS_RTW_PKT:
+	case RXR_DC_LONGCTS_RTW_PKT:
+		rxr_pkt_handle_longcts_rtw_sent(rxr_ep, pkt_entry);
 		break;
-	case RXR_READ_RTW_PKT:
-		rxr_pkt_handle_read_rtw_sent(rxr_ep, pkt_entry);
+	case RXR_LONGREAD_RTW_PKT:
+		rxr_pkt_handle_longread_rtw_sent(rxr_ep, pkt_entry);
 		break;
 	case RXR_SHORT_RTR_PKT:
-	case RXR_LONG_RTR_PKT:
-		rxr_pkt_handle_rtr_sent(rxr_ep, pkt_entry);
+	case RXR_LONGCTS_RTR_PKT:
+		/* nothing can be done when RTR packets are sent */
 		break;
 	case RXR_WRITE_RTA_PKT:
+	case RXR_DC_WRITE_RTA_PKT:
 	case RXR_FETCH_RTA_PKT:
 	case RXR_COMPARE_RTA_PKT:
 		rxr_pkt_handle_rta_sent(rxr_ep, pkt_entry);
 		break;
+	case RXR_DC_EAGER_MSGRTM_PKT:
+	case RXR_DC_EAGER_TAGRTM_PKT:
+	case RXR_DC_EAGER_RTW_PKT:
+		break;
+	case RXR_DATA_PKT:
+		rxr_pkt_handle_data_sent(rxr_ep, pkt_entry);
+		break;
 	default:
 		assert(0 && "Unknown packet type to handle sent");
 		break;
 	}
 }
 
+/**
+ * @brief post a single control packet.
+ *
+ * @param[in]   rxr_ep          endpoint
+ * @param[in]   entry_type      type of x_entry, allowed values: RXR_TX_ENTRY, RXR_RX_ENTRY
+ * @param[in]   x_entry         x_entry pointer
+ * @param[in]   ctrl_type       type of control packet
+ * @param[in]   inject          whether to send the control packet via inject.
+ * @param[in]   flags           additional flags to apply for fi_sendmsg.
+ *                              Currently the only accepted flag is FI_MORE.
+ * @return      On success return 0, otherwise return a negative error code
+ */
 ssize_t rxr_pkt_post_ctrl_once(struct rxr_ep *rxr_ep, int entry_type, void *x_entry,
-			       int ctrl_type, bool inject)
+			       int ctrl_type, bool inject, uint64_t flags)
 {
-	struct rxr_pkt_sendv send;
 	struct rxr_pkt_entry *pkt_entry;
 	struct rxr_tx_entry *tx_entry;
 	struct rxr_rx_entry *rx_entry;
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 	ssize_t err;
 	fi_addr_t addr;
 
@@ -269,19 +270,17 @@ ssize_t rxr_pkt_post_ctrl_once(struct rxr_ep *rxr_ep, int entry_type, void *x_en
 	}
 
 	peer = rxr_ep_get_peer(rxr_ep, addr);
+	assert(peer);
 	if (peer->is_local) {
 		assert(rxr_ep->use_shm);
-		pkt_entry = rxr_pkt_entry_alloc(rxr_ep, rxr_ep->tx_pkt_shm_pool);
+		pkt_entry = rxr_pkt_entry_alloc(rxr_ep, rxr_ep->shm_tx_pkt_pool, RXR_PKT_FROM_SHM_TX_POOL);
 	} else {
-		pkt_entry = rxr_pkt_entry_alloc(rxr_ep, rxr_ep->tx_pkt_efa_pool);
+		pkt_entry = rxr_pkt_entry_alloc(rxr_ep, rxr_ep->efa_tx_pkt_pool, RXR_PKT_FROM_EFA_TX_POOL);
 	}
 
 	if (!pkt_entry)
 		return -FI_EAGAIN;
 
-	send.iov_count = 0;
-	pkt_entry->send = &send;
-
 	/*
 	 * rxr_pkt_init_ctrl will set pkt_entry->send if it want to use multi iov
 	 */
@@ -291,20 +290,22 @@ ssize_t rxr_pkt_post_ctrl_once(struct rxr_ep *rxr_ep, int entry_type, void *x_en
 		return err;
 	}
 
-	/* if send, tx_pkt_entry will be released while handle completion
-	 * if inject, there will not be completion, therefore tx_pkt_entry has to be
-	 * released here
+	/* If the send (or inject) succeeded, the function rxr_pkt_entry_send
+	 * (or rxr_pkt_entry_inject) will increase the counter in rxr_ep that
+	 * tracks the number of outstanding TX ops.
 	 */
-	if (inject)
+	if (inject) {
+		/*
+		 * Currently, the only accepted flag is FI_MORE, which is not
+		 * compatible with inject. Add an additional check here to make
+		 * sure flags is set by the caller correctly.
+		 */
+		assert(!flags);
 		err = rxr_pkt_entry_inject(rxr_ep, pkt_entry, addr);
-	else if (pkt_entry->send->iov_count > 0)
-		err = rxr_pkt_entry_sendv(rxr_ep, pkt_entry, addr,
-					  pkt_entry->send->iov, pkt_entry->send->desc,
-					  pkt_entry->send->iov_count, 0);
+	}
 	else
-		err = rxr_pkt_entry_send(rxr_ep, pkt_entry, addr);
+		err = rxr_pkt_entry_send(rxr_ep, pkt_entry, flags);
 
-	pkt_entry->send = NULL;
 	if (OFI_UNLIKELY(err)) {
 		rxr_pkt_entry_release_tx(rxr_ep, pkt_entry);
 		return err;
@@ -312,25 +313,47 @@ ssize_t rxr_pkt_post_ctrl_once(struct rxr_ep *rxr_ep, int entry_type, void *x_en
 
 	peer->flags |= RXR_PEER_REQ_SENT;
 	rxr_pkt_handle_ctrl_sent(rxr_ep, pkt_entry);
+
+	/* If injection succeeded, the packet should be considered send-completed;
+	 * therefore call rxr_pkt_handle_send_completion().
+	 * rxr_pkt_handle_send_completion() will release pkt_entry and decrease
+	 * the counter in rxr_ep that tracks number of outstanding TX ops.
+	 */
 	if (inject)
-		rxr_pkt_entry_release_tx(rxr_ep, pkt_entry);
+		rxr_pkt_handle_send_completion(rxr_ep, pkt_entry);
 
 	return 0;
 }
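+
+/*
+ * Editorial note (assumption): the flags argument lets a caller batch
+ * several control packets with FI_MORE and flush on the last one, e.g.
+ *
+ *     rxr_pkt_post_ctrl(ep, RXR_RX_ENTRY, rx1, RXR_CTS_PKT, 0, FI_MORE);
+ *     rxr_pkt_post_ctrl(ep, RXR_RX_ENTRY, rx2, RXR_CTS_PKT, 0, 0);
+ *
+ * where the final call without FI_MORE signals that no further sends are
+ * immediately pending.
+ */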
 
+/**
+ * @brief post control packets.
+ *
+ * @param[in]   rxr_ep          endpoint
+ * @param[in]   entry_type      type of x_entry, allowed values: RXR_TX_ENTRY, RXR_RX_ENTRY
+ * @param[in]   x_entry         x_entry pointer
+ * @param[in]   ctrl_type       type of control packet
+ * @param[in]   inject          whether to send the control packet via inject.
+ * @param[in]   flags           additional flags to apply for fi_sendmsg.
+ *                              Currently the only accepted flag is FI_MORE.
+ * @return      On success return 0, otherwise return a negative error code
+ */
 ssize_t rxr_pkt_post_ctrl(struct rxr_ep *ep, int entry_type, void *x_entry,
-			  int ctrl_type, bool inject)
+			  int ctrl_type, bool inject, uint64_t flags)
 {
 	ssize_t err;
 	struct rxr_tx_entry *tx_entry;
 
-	if (ctrl_type == RXR_MEDIUM_TAGRTM_PKT || ctrl_type == RXR_MEDIUM_MSGRTM_PKT) {
+	if (ctrl_type == RXR_MEDIUM_TAGRTM_PKT ||
+	    ctrl_type == RXR_MEDIUM_MSGRTM_PKT ||
+	    ctrl_type == RXR_DC_MEDIUM_MSGRTM_PKT ||
+	    ctrl_type == RXR_DC_MEDIUM_TAGRTM_PKT) {
 		assert(entry_type == RXR_TX_ENTRY);
 		assert(!inject);
 
 		tx_entry = (struct rxr_tx_entry *)x_entry;
 		while (tx_entry->bytes_sent < tx_entry->total_len) {
-			err = rxr_pkt_post_ctrl_once(ep, RXR_TX_ENTRY, x_entry, ctrl_type, 0);
+			err = rxr_pkt_post_ctrl_once(ep, RXR_TX_ENTRY, x_entry, ctrl_type, 0, flags);
 			if (OFI_UNLIKELY(err))
 				return err;
 		}
@@ -338,7 +361,7 @@ ssize_t rxr_pkt_post_ctrl(struct rxr_ep *ep, int entry_type, void *x_entry,
 		return 0;
 	}
 
-	return rxr_pkt_post_ctrl_once(ep, entry_type, x_entry, ctrl_type, inject);
+	return rxr_pkt_post_ctrl_once(ep, entry_type, x_entry, ctrl_type, inject, flags);
 }
 
 ssize_t rxr_pkt_post_ctrl_or_queue(struct rxr_ep *ep, int entry_type, void *x_entry, int ctrl_type, bool inject)
@@ -347,23 +370,25 @@ ssize_t rxr_pkt_post_ctrl_or_queue(struct rxr_ep *ep, int entry_type, void *x_en
 	struct rxr_tx_entry *tx_entry;
 	struct rxr_rx_entry *rx_entry;
 
-	err = rxr_pkt_post_ctrl(ep, entry_type, x_entry, ctrl_type, inject);
+	err = rxr_pkt_post_ctrl(ep, entry_type, x_entry, ctrl_type, inject, 0);
 	if (err == -FI_EAGAIN) {
 		if (entry_type == RXR_TX_ENTRY) {
 			tx_entry = (struct rxr_tx_entry *)x_entry;
+			assert(!(tx_entry->rxr_flags & RXR_TX_ENTRY_QUEUED_RNR));
 			tx_entry->state = RXR_TX_QUEUED_CTRL;
 			tx_entry->queued_ctrl.type = ctrl_type;
 			tx_entry->queued_ctrl.inject = inject;
-			dlist_insert_tail(&tx_entry->queued_entry,
-					  &ep->tx_entry_queued_list);
+			dlist_insert_tail(&tx_entry->queued_ctrl_entry,
+					  &ep->tx_entry_queued_ctrl_list);
 		} else {
 			assert(entry_type == RXR_RX_ENTRY);
 			rx_entry = (struct rxr_rx_entry *)x_entry;
+			assert(rx_entry->state != RXR_RX_QUEUED_CTRL);
 			rx_entry->state = RXR_RX_QUEUED_CTRL;
 			rx_entry->queued_ctrl.type = ctrl_type;
 			rx_entry->queued_ctrl.inject = inject;
-			dlist_insert_tail(&rx_entry->queued_entry,
-					  &ep->rx_entry_queued_list);
+			dlist_insert_tail(&rx_entry->queued_ctrl_entry,
+					  &ep->rx_entry_queued_ctrl_list);
 		}
 
 		err = 0;
@@ -393,14 +418,59 @@ ssize_t rxr_pkt_post_ctrl_or_queue(struct rxr_ep *ep, int entry_type, void *x_en
  * handshake packet within a certain period of time.
  */
 
-ssize_t rxr_pkt_wait_handshake(struct rxr_ep *ep, fi_addr_t addr, struct rxr_peer *peer)
+ssize_t rxr_pkt_wait_handshake(struct rxr_ep *ep, fi_addr_t addr, struct rdm_peer *peer)
 {
-	struct rxr_tx_entry *tx_entry;
-	ssize_t err;
+	ssize_t ret;
 
 	uint64_t current, endwait;
 
-	if (peer->flags & RXR_PEER_HANDSHAKE_RECEIVED)
+	ret = rxr_pkt_trigger_handshake(ep, addr, peer);
+	if (OFI_UNLIKELY(ret))
+		return ret;
+
+	current = ofi_gettime_us();
+	endwait = current + RXR_HANDSHAKE_WAIT_TIMEOUT;
+
+	while (current < endwait &&
+	       !(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED)) {
+		rxr_ep_progress_internal(ep);
+		current = ofi_gettime_us();
+	}
+
+	if (!(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED)) {
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+			"did not get handshake back in %f second(s). returning -FI_EAGAIN!\n",
+			RXR_HANDSHAKE_WAIT_TIMEOUT * 1e-6);
+		return -FI_EAGAIN;
+	}
+
+	return 0;
+}
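+
+/*
+ * Editorial note: the wait above is a bounded busy poll -- it drives the
+ * progress engine for at most RXR_HANDSHAKE_WAIT_TIMEOUT (one second) and
+ * returns -FI_EAGAIN if no handshake arrived, so callers must be prepared
+ * to retry.
+ */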
+
+/*
+ * This function is used for any extra feature that does not have an
+ * alternative.
+ *
+ * This function will send an eager rtw packet to trigger a handshake.
+ *
+ * We do not send eager rtm packets here because the receiver might require
+ * ordering, and an extra eager rtm would interrupt the reordering
+ * process.
+ *
+ * ep: The endpoint on which the packet for triggering handshake will be sent.
+ * peer: The peer from which the sender receives handshake.
+ * addr: The address of the peer.
+ *
+ * This function will return 0 if the eager rtw packet is successfully sent.
+ */
+ssize_t rxr_pkt_trigger_handshake(struct rxr_ep *ep,
+				  fi_addr_t addr, struct rdm_peer *peer)
+{
+	struct rxr_tx_entry *tx_entry;
+	ssize_t err;
+
+	if ((peer->flags & RXR_PEER_HANDSHAKE_RECEIVED) ||
+	    (peer->flags & RXR_PEER_REQ_SENT))
 		return 0;
 
 	tx_entry = ofi_buf_alloc(ep->tx_entry_pool);
@@ -411,6 +481,9 @@ ssize_t rxr_pkt_wait_handshake(struct rxr_ep *ep, fi_addr_t addr, struct rxr_pee
 
 	tx_entry->total_len = 0;
 	tx_entry->addr = addr;
+	tx_entry->peer = rxr_ep_get_peer(ep, tx_entry->addr);
+	assert(tx_entry->peer);
+	dlist_insert_tail(&tx_entry->peer_entry, &tx_entry->peer->tx_entry_list);
 	tx_entry->msg_id = -1;
 	tx_entry->cq_entry.flags = FI_RMA | FI_WRITE;
 	tx_entry->cq_entry.buf = NULL;
@@ -420,7 +493,6 @@ ssize_t rxr_pkt_wait_handshake(struct rxr_ep *ep, fi_addr_t addr, struct rxr_pee
 	tx_entry->op = ofi_op_write;
 	tx_entry->state = RXR_TX_REQ;
 
-	tx_entry->send_flags = 0;
 	tx_entry->bytes_acked = 0;
 	tx_entry->bytes_sent = 0;
 	tx_entry->window = 0;
@@ -430,141 +502,242 @@ ssize_t rxr_pkt_wait_handshake(struct rxr_ep *ep, fi_addr_t addr, struct rxr_pee
 	tx_entry->iov_mr_start = 0;
 	tx_entry->iov_offset = 0;
 	tx_entry->fi_flags = RXR_NO_COMPLETION | RXR_NO_COUNTER;
+	tx_entry->rxr_flags = 0;
 
-#if ENABLE_DEBUG
-	dlist_insert_tail(&tx_entry->tx_entry_entry, &ep->tx_entry_list);
-#endif
+	dlist_insert_tail(&tx_entry->ep_entry, &ep->tx_entry_list);
 
-	err = rxr_pkt_post_ctrl(ep, RXR_TX_ENTRY, tx_entry, RXR_EAGER_RTW_PKT, 0);
+	err = rxr_pkt_post_ctrl(ep, RXR_TX_ENTRY, tx_entry, RXR_EAGER_RTW_PKT, 0, 0);
 
 	if (OFI_UNLIKELY(err))
 		return err;
 
-	current = ofi_gettime_us();
-	endwait = current + RXR_HANDSHAKE_WAIT_TIMEOUT;
-	while (current < endwait && !(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED)) {
-		rxr_ep_progress_internal(ep);
-		current = ofi_gettime_us();
-	}
-
-	if (!(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED)) {
-		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
-			"did not get handshake back in %f second(s). returning -FI_EAGAIN!\n",
-			RXR_HANDSHAKE_WAIT_TIMEOUT*1e-6);
-		return -FI_EAGAIN;
-	}
-
 	return 0;
 }
 
-/* return the data size in a packet entry */
-size_t rxr_pkt_data_size(struct rxr_pkt_entry *pkt_entry)
+void rxr_pkt_handle_data_copied(struct rxr_ep *ep,
+				struct rxr_pkt_entry *pkt_entry,
+				size_t data_size)
 {
-	int pkt_type;
-
-	assert(pkt_entry);
-	pkt_type = rxr_get_base_hdr(pkt_entry->pkt)->type;
-
-	if (pkt_type == RXR_DATA_PKT)
-		return pkt_entry->pkt_size - sizeof(struct rxr_data_hdr);
+	struct rxr_rx_entry *rx_entry;
+	ssize_t ret;
 
-	if (pkt_type == RXR_READRSP_PKT)
-		return pkt_entry->pkt_size - sizeof(struct rxr_readrsp_hdr);
+	rx_entry = pkt_entry->x_entry;
+	assert(rx_entry);
+	rx_entry->bytes_copied += data_size;
 
-	if (pkt_type >= RXR_REQ_PKT_BEGIN) {
-		assert(pkt_type == RXR_EAGER_MSGRTM_PKT || pkt_type == RXR_EAGER_TAGRTM_PKT ||
-		       pkt_type == RXR_MEDIUM_MSGRTM_PKT || pkt_type == RXR_MEDIUM_TAGRTM_PKT ||
-		       pkt_type == RXR_LONG_MSGRTM_PKT || pkt_type == RXR_LONG_TAGRTM_PKT ||
-		       pkt_type == RXR_EAGER_RTW_PKT || pkt_type == RXR_LONG_RTW_PKT);
+	rxr_pkt_entry_release_rx(ep, pkt_entry);
 
-		return pkt_entry->pkt_size - rxr_pkt_req_hdr_size(pkt_entry);
+	if (rx_entry->total_len == rx_entry->bytes_copied) {
+		if (rx_entry->rxr_flags & RXR_DELIVERY_COMPLETE_REQUESTED) {
+			ret = rxr_pkt_post_ctrl_or_queue(ep,
+							 RXR_RX_ENTRY,
+							 rx_entry,
+							 RXR_RECEIPT_PKT, 0);
+			if (OFI_UNLIKELY(ret)) {
+				FI_WARN(&rxr_prov,
+					FI_LOG_CQ,
+					"Posting of receipt packet failed! err=%s\n",
+					fi_strerror(ret));
+				efa_eq_write_error(&ep->util_ep,
+						   FI_EIO,
+						   ret);
+				rxr_release_rx_entry(ep,
+						     rx_entry);
+				return;
+			}
+			rxr_cq_handle_rx_completion(ep, rx_entry);
+			rxr_msg_multi_recv_free_posted_entry(ep, rx_entry);
+			/* rx_entry will be released
+			 * when sender receives the
+			 * receipt packet.
+			 */
+			return;
+		}
+		rxr_cq_handle_rx_completion(ep, rx_entry);
+		rxr_msg_multi_recv_free_posted_entry(ep, rx_entry);
+		rxr_release_rx_entry(ep, rx_entry);
 	}
-
-	/* other packet type does not contain data, thus return 0
-	 */
-	return 0;
 }
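+
+/*
+ * Editorial summary of the error handling described below:
+ *
+ *     packet class         RNR error                    other error
+ *     ------------         ---------                    -----------
+ *     TX (REQ, DATA)       queued and resent; error     error CQ entry
+ *                          CQ entry if FI_RM_DISABLED
+ *     RX (EOR, CTS)        queued and resent            error CQ entry
+ *     none (HANDSHAKE)     queued and resent            error EQ entry
+ */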
 
-/*
- * rxr_pkt_copy_to_rx() copy data to receiving buffer then
- * update counter in rx_entry.
+/**
+ * @brief handle the a packet that encountered error completion while sending
+ *
+ * Depend on the packet type and error type, the error are handled differently.
+ *
+ * If the packet is associated with a user-initiated TX operation
+ * (TX means send, read, or write; such packets include all REQ packets and DATA):
+ *
+ *    If the error is Receiver Not Ready (RNR), there are two cases:
+ *
+ *         If the user wants to manage RNR itself (FI_RM_DISABLED),
+ *         an error CQ entry will be written.
+ *
+ *         Otherwise, the packet will be queued and resent by the progress engine.
+ *
+ *    For other types of error, an error CQ entry is written.
  *
- * If receiving buffer is on GPU memory, it will post a
- * read request, otherwise it will copy data.
+ * If the packet is associated with a user-initiated recv operation
+ * (such packets include EOR, CTS):
  *
- * If all data has been copied to receiving buffer,
- * it will write rx completion and release rx_entry.
+ *      If the error is RNR, the packet is queued and resent by the
+ *      progress engine. No CQ entry is written.
  *
- * Return value and states:
+ *      For other types of error, an error CQ entry is written.
  *
- *    On success, return 0 and release pkt_entry
- *    On failure, return error code
+ * If the packet is not associated with a user operation (such packets include
+ * HANDSHAKE):
+ *
+ *      If the error is RNR, the packet is queued and resent by the progress engine.
+ *
+ *      For other types of error, an error EQ entry is written.
+ *
+ * @param[in]	ep		endpoint
+ * @param[in]	pkt_entry	pkt entry
+ * @param[in]	err		libfabric error code
+ * @param[in]	prov_errno	provider specific error code
  */
-ssize_t rxr_pkt_copy_to_rx(struct rxr_ep *ep,
-			   struct rxr_rx_entry *rx_entry,
-			   size_t data_offset,
-			   struct rxr_pkt_entry *pkt_entry,
-			   char *data, size_t data_size)
+void rxr_pkt_handle_send_error(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry, int err, int prov_errno)
 {
-	ssize_t err, bytes_copied;
+	struct rdm_peer *peer;
+	struct rxr_tx_entry *tx_entry;
+	struct rxr_rx_entry *rx_entry;
 
-	pkt_entry->x_entry = rx_entry;
+	assert(pkt_entry->alloc_type == RXR_PKT_FROM_EFA_TX_POOL ||
+	       pkt_entry->alloc_type == RXR_PKT_FROM_SHM_TX_POOL);
 
-	if (data_size > 0 && efa_ep_is_cuda_mr(rx_entry->desc[0])) {
-		err = rxr_read_post_local_read_or_queue(ep, rx_entry, data_offset,
-							pkt_entry, data, data_size);
-		if (err)
-			FI_WARN(&rxr_prov, FI_LOG_CQ, "cannot post read to copy data\n");
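+	/* Even a failed send completes the TX operation from the device's
+	 * perspective, so update the outstanding-TX accounting before
+	 * handling the error itself.
+	 */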
+	rxr_ep_record_tx_op_completed(ep, pkt_entry);
 
-		return err;
+	peer = rxr_ep_get_peer(ep, pkt_entry->addr);
+	if (!peer) {
+		/*
+		 * If peer is NULL, it means the peer has been removed from AV.
+		 * In this case, ignore this error completion.
+		 */
+		FI_WARN(&rxr_prov, FI_LOG_CQ, "ignoring send error completion of a packet to a removed peer.\n");
+		rxr_pkt_entry_release_tx(ep, pkt_entry);
+		return;
 	}
 
-	if (OFI_LIKELY(!(rx_entry->rxr_flags & RXR_RECV_CANCEL)) &&
-	    rx_entry->cq_entry.len > data_offset && data_size > 0) {
-		bytes_copied = ofi_copy_to_iov(rx_entry->iov,
-					       rx_entry->iov_count,
-					       data_offset,
-					       data,
-					       data_size);
-		if (bytes_copied != MIN(data_size, rx_entry->cq_entry.len - data_offset)) {
-			FI_WARN(&rxr_prov, FI_LOG_CQ, "wrong size! bytes_copied: %ld\n",
-				bytes_copied);
-			return -FI_EINVAL;
+	if (!pkt_entry->x_entry) {
+		/* only the HANDSHAKE packet is not associated with any TX/RX operation */
+		assert(rxr_get_base_hdr(pkt_entry->pkt)->type == RXR_HANDSHAKE_PKT);
+		rxr_pkt_entry_release_tx(ep, pkt_entry);
+		if (prov_errno == IBV_WC_RNR_RETRY_EXC_ERR) {
+			/*
+			 * handshake should always be queued for RNR
+			 */
+			assert(!(peer->flags & RXR_PEER_HANDSHAKE_QUEUED));
+			peer->flags |= RXR_PEER_HANDSHAKE_QUEUED;
+			dlist_insert_tail(&peer->handshake_queued_entry,
+					  &ep->handshake_queued_peer_list);
+		} else if (prov_errno != IBV_WC_REM_INV_RD_REQ_ERR) {
+			/* If prov_errno is IBV_WC_REM_INV_RD_REQ_ERR, the peer has been destroyed,
+			 * which is normal: a peer does not always need a handshake packet to perform
+			 * its duty. (For example, a peer that only wants to send one message to this
+			 * ep does not need a handshake.)
+			 * In this case, it is safe to ignore this error completion.
+			 * In all other cases, we write an EQ entry because there is no application
+			 * operation associated with a handshake.
+			 */
+			efa_eq_write_error(&ep->util_ep, err, prov_errno);
 		}
+		return;
 	}
 
-	rxr_pkt_handle_data_copied(ep, pkt_entry, data_size);
-	return 0;
-}
+	if (RXR_GET_X_ENTRY_TYPE(pkt_entry) == RXR_TX_ENTRY) {
+		tx_entry = pkt_entry->x_entry;
+		if (prov_errno == IBV_WC_RNR_RETRY_EXC_ERR) {
+			if (ep->handle_resource_management == FI_RM_DISABLED) {
+				/*
+				 * Write an error to the application for RNR when
+				 * resource management is disabled.
+				 */
+				rxr_cq_write_tx_error(ep, pkt_entry->x_entry, FI_ENORX, 0);
+				rxr_pkt_entry_release_tx(ep, pkt_entry);
+			} else {
+				/*
+				 * This packet is associated with a send operation,
+				 * (such packets include all REQ, DATA)
+				 * thus should be queued for RNR only if the
+				 * application wants EFA to manage resources.
+				 */
+				rxr_cq_queue_rnr_pkt(ep, &tx_entry->queued_pkts, pkt_entry);
+				if (!(tx_entry->rxr_flags & RXR_TX_ENTRY_QUEUED_RNR)) {
+					tx_entry->rxr_flags |= RXR_TX_ENTRY_QUEUED_RNR;
+					dlist_insert_tail(&tx_entry->queued_rnr_entry,
+							  &ep->tx_entry_queued_rnr_list);
+				}
+			}
+		} else {
+			rxr_cq_write_tx_error(ep, pkt_entry->x_entry, err, prov_errno);
+			rxr_pkt_entry_release_tx(ep, pkt_entry);
+		}
 
-void rxr_pkt_handle_data_copied(struct rxr_ep *ep,
-				struct rxr_pkt_entry *pkt_entry,
-				size_t data_size)
-{
-	struct rxr_rx_entry *rx_entry;
+		return;
+	}
 
-	rx_entry = pkt_entry->x_entry;
-	assert(rx_entry);
-	rx_entry->bytes_copied += data_size;
+	if (RXR_GET_X_ENTRY_TYPE(pkt_entry) == RXR_RX_ENTRY) {
+		rx_entry = pkt_entry->x_entry;
+		if (prov_errno == IBV_WC_RNR_RETRY_EXC_ERR) {
+			/*
+			 * This packet is associated with a recv operation,
+			 * (such packets include CTS and EOR)
+			 * thus should always be queued for RNR.
+			 * This is regardless of the value of ep->handle_resource_management,
+			 * because resource management only applies to send operations.
+			 */
+			rxr_cq_queue_rnr_pkt(ep, &rx_entry->queued_pkts, pkt_entry);
+			/*
+			 * An rx_entry sends one ctrl packet at a time, so if we
+			 * received RNR for this packet, the rx_entry must not
+			 * be in the ep's rx_entry_queued_rnr_list and thus cannot
+			 * have the QUEUED_RNR flag.
+			 */
+			assert(!(rx_entry->rxr_flags & RXR_RX_ENTRY_QUEUED_RNR));
+			rx_entry->rxr_flags |= RXR_RX_ENTRY_QUEUED_RNR;
+			dlist_insert_tail(&rx_entry->queued_rnr_entry,
+					  &ep->rx_entry_queued_rnr_list);
 
-	if (rx_entry->total_len == rx_entry->bytes_copied) {
-		rxr_cq_handle_rx_completion(ep, pkt_entry, rx_entry);
-		rxr_msg_multi_recv_free_posted_entry(ep, rx_entry);
-		rxr_release_rx_entry(ep, rx_entry);
-	} else {
-		rxr_pkt_entry_release_rx(ep, pkt_entry);
+		} else {
+			rxr_cq_write_rx_error(ep, pkt_entry->x_entry, err, prov_errno);
+			rxr_pkt_entry_release_tx(ep, pkt_entry);
+		}
+
+		return;
+	}
+
+	if (RXR_GET_X_ENTRY_TYPE(pkt_entry) == RXR_READ_ENTRY) {
+		/* read will not encounter RNR */
+		assert(prov_errno != IBV_WC_RNR_RETRY_EXC_ERR);
+		rxr_read_write_error(ep, pkt_entry->x_entry, err, prov_errno);
+		rxr_pkt_entry_release_tx(ep, pkt_entry);
+		return;
 	}
+
+	FI_WARN(&rxr_prov, FI_LOG_CQ,
+		"%s unknown x_entry type %d\n",
+		__func__, RXR_GET_X_ENTRY_TYPE(pkt_entry));
+	assert(0 && "unknown x_entry state");
+	efa_eq_write_error(&ep->util_ep, err, prov_errno);
+	rxr_pkt_entry_release_tx(ep, pkt_entry);
 }
 
-/*
- *   Functions used to handle packet send completion
- */
-void rxr_pkt_handle_send_completion(struct rxr_ep *ep, struct fi_cq_data_entry *comp)
+void rxr_pkt_handle_send_completion(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
 {
-	struct rxr_pkt_entry *pkt_entry;
-	struct rxr_peer *peer;
-
-	pkt_entry = (struct rxr_pkt_entry *)comp->op_context;
+	/*
+	 * For a send completion, pkt_entry->addr can be FI_ADDR_NOTAVAIL in 3 situations:
+	 * 1. the pkt_entry is used for a local read operation
+	 * 2. a new peer with the same gid+qpn was inserted into the AV, thus the old peer was removed from the AV.
+	 * 3. the application removed the peer's address from the AV.
+	 * In case 1, we should proceed. In cases 2 and 3, the send completion should be ignored.
+	 */
+	if (pkt_entry->addr == FI_ADDR_NOTAVAIL &&
+	    !(pkt_entry->flags & RXR_PKT_ENTRY_LOCAL_READ)) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ, "ignoring send completion of a packet to a removed peer.\n");
+		rxr_ep_record_tx_op_completed(ep, pkt_entry);
+		rxr_pkt_entry_release_tx(ep, pkt_entry);
+		return;
+	}
 
 	switch (rxr_get_base_hdr(pkt_entry->pkt)->type) {
 	case RXR_HANDSHAKE_PKT:
@@ -586,6 +759,9 @@ void rxr_pkt_handle_send_completion(struct rxr_ep *ep, struct fi_cq_data_entry *
 	case RXR_ATOMRSP_PKT:
 		rxr_pkt_handle_atomrsp_send_completion(ep, pkt_entry);
 		break;
+	case RXR_RECEIPT_PKT:
+		rxr_pkt_handle_receipt_send_completion(ep, pkt_entry);
+		break;
 	case RXR_EAGER_MSGRTM_PKT:
 	case RXR_EAGER_TAGRTM_PKT:
 		rxr_pkt_handle_eager_rtm_send_completion(ep, pkt_entry);
@@ -594,25 +770,25 @@ void rxr_pkt_handle_send_completion(struct rxr_ep *ep, struct fi_cq_data_entry *
 	case RXR_MEDIUM_TAGRTM_PKT:
 		rxr_pkt_handle_medium_rtm_send_completion(ep, pkt_entry);
 		break;
-	case RXR_LONG_MSGRTM_PKT:
-	case RXR_LONG_TAGRTM_PKT:
-		rxr_pkt_handle_long_rtm_send_completion(ep, pkt_entry);
+	case RXR_LONGCTS_MSGRTM_PKT:
+	case RXR_LONGCTS_TAGRTM_PKT:
+		rxr_pkt_handle_longcts_rtm_send_completion(ep, pkt_entry);
 		break;
-	case RXR_READ_MSGRTM_PKT:
-	case RXR_READ_TAGRTM_PKT:
-		rxr_pkt_handle_read_rtm_send_completion(ep, pkt_entry);
+	case RXR_LONGREAD_MSGRTM_PKT:
+	case RXR_LONGREAD_TAGRTM_PKT:
+		rxr_pkt_handle_longread_rtm_send_completion(ep, pkt_entry);
 		break;
 	case RXR_EAGER_RTW_PKT:
 		rxr_pkt_handle_eager_rtw_send_completion(ep, pkt_entry);
 		break;
-	case RXR_LONG_RTW_PKT:
-		rxr_pkt_handle_long_rtw_send_completion(ep, pkt_entry);
+	case RXR_LONGCTS_RTW_PKT:
+		rxr_pkt_handle_longcts_rtw_send_completion(ep, pkt_entry);
 		break;
-	case RXR_READ_RTW_PKT:
-		rxr_pkt_handle_read_rtw_send_completion(ep, pkt_entry);
+	case RXR_LONGREAD_RTW_PKT:
+		rxr_pkt_handle_longread_rtw_send_completion(ep, pkt_entry);
 		break;
 	case RXR_SHORT_RTR_PKT:
-	case RXR_LONG_RTR_PKT:
+	case RXR_LONGCTS_RTR_PKT:
 		rxr_pkt_handle_rtr_send_completion(ep, pkt_entry);
 		break;
 	case RXR_WRITE_RTA_PKT:
@@ -624,24 +800,76 @@ void rxr_pkt_handle_send_completion(struct rxr_ep *ep, struct fi_cq_data_entry *
 	case RXR_COMPARE_RTA_PKT:
 		/* no action to be taken here */
 		break;
+	case RXR_DC_EAGER_MSGRTM_PKT:
+	case RXR_DC_EAGER_TAGRTM_PKT:
+	case RXR_DC_MEDIUM_MSGRTM_PKT:
+	case RXR_DC_MEDIUM_TAGRTM_PKT:
+	case RXR_DC_EAGER_RTW_PKT:
+	case RXR_DC_WRITE_RTA_PKT:
+		/* no action to be taken here */
+		/* For the non-DC version of these packet types,
+		 * this is the place to write the TX completion.
+		 * However, for DC the TX completion will always be
+		 * written upon receiving the receipt packet,
+		 * if not using long message protocols.
+		 * Moreover, because the receipt can arrive
+		 * before the send completion, we cannot take
+		 * any action on the tx_entry here.
+		 */
+		break;
+	case RXR_DC_LONGCTS_MSGRTM_PKT:
+	case RXR_DC_LONGCTS_TAGRTM_PKT:
+		rxr_pkt_handle_dc_longcts_rtm_send_completion(ep, pkt_entry);
+		break;
+	case RXR_DC_LONGCTS_RTW_PKT:
+		rxr_pkt_handle_dc_longcts_rtw_send_completion(ep, pkt_entry);
+		break;
 	default:
 		FI_WARN(&rxr_prov, FI_LOG_CQ,
 			"invalid control pkt type %d\n",
 			rxr_get_base_hdr(pkt_entry->pkt)->type);
 		assert(0 && "invalid control pkt type");
-		rxr_cq_handle_cq_error(ep, -FI_EIO);
+		efa_eq_write_error(&ep->util_ep, FI_EIO, FI_EIO);
 		return;
 	}
 
-	peer = rxr_ep_get_peer(ep, pkt_entry->addr);
-	if (!peer->is_local)
-		rxr_ep_dec_tx_pending(ep, peer, 0);
+	rxr_ep_record_tx_op_completed(ep, pkt_entry);
 	rxr_pkt_entry_release_tx(ep, pkt_entry);
 }
 
-/*
- *  Functions used to handle packet receive completion
+/**
+ * @brief handle a packet that encountered an error completion while receiving
+ *
+ * This function writes an error CQ or EQ entry, then releases the packet entry.
+ *
+ * @param[in]	ep		endpoint
+ * @param[in]	pkt_entry	pkt entry
+ * @param[in]	err		libfabric error code
+ * @param[in]	prov_errno	provider specific error code
  */
+void rxr_pkt_handle_recv_error(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry, int err, int prov_errno)
+{
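+	/*
+	 * If the packet is not associated with any TX/RX operation, there
+	 * is no application operation to report the error against, so an
+	 * EQ entry is written instead of a CQ entry.
+	 */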
+	if (!pkt_entry->x_entry) {
+		efa_eq_write_error(&ep->util_ep, err, prov_errno);
+		rxr_pkt_entry_release_tx(ep, pkt_entry);
+		return;
+	}
+
+	if (RXR_GET_X_ENTRY_TYPE(pkt_entry) == RXR_TX_ENTRY) {
+		rxr_cq_write_tx_error(ep, pkt_entry->x_entry, err, prov_errno);
+	} else if (RXR_GET_X_ENTRY_TYPE(pkt_entry) == RXR_RX_ENTRY) {
+		rxr_cq_write_rx_error(ep, pkt_entry->x_entry, err, prov_errno);
+	} else {
+		FI_WARN(&rxr_prov, FI_LOG_CQ,
+			"%s unknown x_entry type %d\n",
+			__func__, RXR_GET_X_ENTRY_TYPE(pkt_entry));
+		assert(0 && "unknown x_entry state");
+		efa_eq_write_error(&ep->util_ep, err, prov_errno);
+	}
+
+	rxr_pkt_entry_release_rx(ep, pkt_entry);
+}
+
 static
 fi_addr_t rxr_pkt_insert_addr(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry, void *raw_addr)
 {
@@ -651,7 +879,7 @@ fi_addr_t rxr_pkt_insert_addr(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry
 	struct rxr_base_hdr *base_hdr;
 
 	base_hdr = rxr_get_base_hdr(pkt_entry->pkt);
-	if (base_hdr->version < RXR_BASE_PROTOCOL_VERSION) {
+	if (base_hdr->version < RXR_PROTOCOL_VERSION) {
 		char host_gid[ep->core_addrlen * 3];
 		int length = 0;
 
@@ -661,19 +889,19 @@ fi_addr_t rxr_pkt_insert_addr(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry
 		FI_WARN(&rxr_prov, FI_LOG_CQ,
 			"Host %s received a packet with invalid protocol version %d.\n"
 			"This host can only support protocol version %d and above.\n",
-			host_gid, base_hdr->version, RXR_BASE_PROTOCOL_VERSION);
+			host_gid, base_hdr->version, RXR_PROTOCOL_VERSION);
 		efa_eq_write_error(&ep->util_ep, FI_EIO, -FI_EINVAL);
 		fprintf(stderr, "Host %s received a packet with invalid protocol version %d.\n"
 			"This host can only support protocol version %d and above. %s:%d\n",
-			host_gid, base_hdr->version, RXR_BASE_PROTOCOL_VERSION, __FILE__, __LINE__);
+			host_gid, base_hdr->version, RXR_PROTOCOL_VERSION, __FILE__, __LINE__);
 		abort();
 	}
 
 	assert(base_hdr->type >= RXR_REQ_PKT_BEGIN);
 
 	efa_ep = container_of(ep->rdm_ep, struct efa_ep, util_ep.ep_fid);
-	ret = efa_av_insert_addr(efa_ep->av, (struct efa_ep_addr *)raw_addr,
-				 &rdm_addr, 0, NULL);
+	ret = efa_av_insert_one(efa_ep->av, (struct efa_ep_addr *)raw_addr,
+	                        &rdm_addr, 0, NULL);
 	if (OFI_UNLIKELY(ret != 0)) {
 		efa_eq_write_error(&ep->util_ep, FI_EINVAL, ret);
 		return -1;
@@ -682,79 +910,31 @@ fi_addr_t rxr_pkt_insert_addr(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry
 	return rdm_addr;
 }
 
-void rxr_pkt_handle_recv_completion(struct rxr_ep *ep,
-				    struct fi_cq_data_entry *cq_entry,
-				    fi_addr_t src_addr)
+/**
+ * @brief process a received packet
+ *
+ * @param[in]	ep		endpoint
+ * @param[in]	pkt_entry	received packet entry
+ */
+void rxr_pkt_proc_received(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
 {
-	struct rxr_peer *peer;
 	struct rxr_base_hdr *base_hdr;
-	struct rxr_pkt_entry *pkt_entry;
-
-	pkt_entry = (struct rxr_pkt_entry *)cq_entry->op_context;
-	pkt_entry->pkt_size = cq_entry->len;
-	assert(pkt_entry->pkt_size > 0);
 
 	base_hdr = rxr_get_base_hdr(pkt_entry->pkt);
-	if (base_hdr->type >= RXR_EXTRA_REQ_PKT_END) {
-		FI_WARN(&rxr_prov, FI_LOG_CQ,
-			"Peer %d is requesting feature %d, which this EP does not support.\n",
-			(int)src_addr, base_hdr->type);
-
-		assert(0 && "invalid REQ packe type");
-		rxr_cq_handle_cq_error(ep, -FI_EIO);
-		return;
-	}
-
-	if (base_hdr->type >= RXR_REQ_PKT_BEGIN) {
-		/*
-		 * as long as the REQ packet contain raw address
-		 * we will need to call insert because it might be a new
-		 * EP with new Q-Key.
-		 */
-		void *raw_addr;
-
-		raw_addr = rxr_pkt_req_raw_addr(pkt_entry);
-		if (OFI_UNLIKELY(raw_addr != NULL))
-			pkt_entry->addr = rxr_pkt_insert_addr(ep, pkt_entry, raw_addr);
-		else
-			pkt_entry->addr = src_addr;
-	} else {
-		assert(src_addr != FI_ADDR_NOTAVAIL);
-		pkt_entry->addr = src_addr;
-	}
-
-#if ENABLE_DEBUG
-	if (!ep->use_zcpy_rx) {
-		dlist_remove(&pkt_entry->dbg_entry);
-		dlist_insert_tail(&pkt_entry->dbg_entry, &ep->rx_pkt_list);
-	}
-#ifdef ENABLE_RXR_PKT_DUMP
-	rxr_pkt_print("Received", ep, (struct rxr_base_hdr *)pkt_entry->pkt);
-#endif
-#endif
-	peer = rxr_ep_get_peer(ep, pkt_entry->addr);
-	if (!(peer->flags & RXR_PEER_HANDSHAKE_SENT))
-		rxr_pkt_post_handshake(ep, peer, pkt_entry->addr);
-
-	if (peer->is_local) {
-		assert(ep->use_shm);
-		ep->posted_bufs_shm--;
-	} else {
-		ep->posted_bufs_efa--;
-	}
-
 	switch (base_hdr->type) {
 	case RXR_RETIRED_RTS_PKT:
 		FI_WARN(&rxr_prov, FI_LOG_CQ,
 			"Received a RTS packet, which has been retired since protocol version 4\n");
 		assert(0 && "deprecated RTS pakcet received");
-		rxr_cq_handle_cq_error(ep, -FI_EIO);
+		efa_eq_write_error(&ep->util_ep, FI_EIO, FI_EIO);
+		rxr_pkt_entry_release_rx(ep, pkt_entry);
 		return;
 	case RXR_RETIRED_CONNACK_PKT:
 		FI_WARN(&rxr_prov, FI_LOG_CQ,
 			"Received a CONNACK packet, which has been retired since protocol version 4\n");
 		assert(0 && "deprecated CONNACK pakcet received");
-		rxr_cq_handle_cq_error(ep, -FI_EIO);
+		efa_eq_write_error(&ep->util_ep, FI_EIO, FI_EIO);
+		rxr_pkt_entry_release_rx(ep, pkt_entry);
 		return;
 	case RXR_EOR_PKT:
 		rxr_pkt_handle_eor_recv(ep, pkt_entry);
@@ -774,20 +954,25 @@ void rxr_pkt_handle_recv_completion(struct rxr_ep *ep,
 	case RXR_ATOMRSP_PKT:
 		rxr_pkt_handle_atomrsp_recv(ep, pkt_entry);
 		return;
-	case RXR_EAGER_MSGRTM_PKT:
-		if (ep->use_zcpy_rx && pkt_entry->type == RXR_PKT_ENTRY_USER)
-			rxr_pkt_handle_zcpy_recv(ep, pkt_entry);
-		else
-			rxr_pkt_handle_rtm_rta_recv(ep, pkt_entry);
+	case RXR_RECEIPT_PKT:
+		rxr_pkt_handle_receipt_recv(ep, pkt_entry);
 		return;
+	case RXR_EAGER_MSGRTM_PKT:
 	case RXR_EAGER_TAGRTM_PKT:
+	case RXR_DC_EAGER_MSGRTM_PKT:
+	case RXR_DC_EAGER_TAGRTM_PKT:
 	case RXR_MEDIUM_MSGRTM_PKT:
 	case RXR_MEDIUM_TAGRTM_PKT:
-	case RXR_LONG_MSGRTM_PKT:
-	case RXR_LONG_TAGRTM_PKT:
-	case RXR_READ_MSGRTM_PKT:
-	case RXR_READ_TAGRTM_PKT:
+	case RXR_DC_MEDIUM_MSGRTM_PKT:
+	case RXR_DC_MEDIUM_TAGRTM_PKT:
+	case RXR_LONGCTS_MSGRTM_PKT:
+	case RXR_LONGCTS_TAGRTM_PKT:
+	case RXR_DC_LONGCTS_MSGRTM_PKT:
+	case RXR_DC_LONGCTS_TAGRTM_PKT:
+	case RXR_LONGREAD_MSGRTM_PKT:
+	case RXR_LONGREAD_TAGRTM_PKT:
 	case RXR_WRITE_RTA_PKT:
+	case RXR_DC_WRITE_RTA_PKT:
 	case RXR_FETCH_RTA_PKT:
 	case RXR_COMPARE_RTA_PKT:
 		rxr_pkt_handle_rtm_rta_recv(ep, pkt_entry);
@@ -795,24 +980,110 @@ void rxr_pkt_handle_recv_completion(struct rxr_ep *ep,
 	case RXR_EAGER_RTW_PKT:
 		rxr_pkt_handle_eager_rtw_recv(ep, pkt_entry);
 		return;
-	case RXR_LONG_RTW_PKT:
-		rxr_pkt_handle_long_rtw_recv(ep, pkt_entry);
+	case RXR_LONGCTS_RTW_PKT:
+	case RXR_DC_LONGCTS_RTW_PKT:
+		rxr_pkt_handle_longcts_rtw_recv(ep, pkt_entry);
 		return;
-	case RXR_READ_RTW_PKT:
-		rxr_pkt_handle_read_rtw_recv(ep, pkt_entry);
+	case RXR_LONGREAD_RTW_PKT:
+		rxr_pkt_handle_longread_rtw_recv(ep, pkt_entry);
 		return;
 	case RXR_SHORT_RTR_PKT:
-	case RXR_LONG_RTR_PKT:
+	case RXR_LONGCTS_RTR_PKT:
 		rxr_pkt_handle_rtr_recv(ep, pkt_entry);
 		return;
+	case RXR_DC_EAGER_RTW_PKT:
+		rxr_pkt_handle_dc_eager_rtw_recv(ep, pkt_entry);
+		return;
 	default:
 		FI_WARN(&rxr_prov, FI_LOG_CQ,
 			"invalid control pkt type %d\n",
 			rxr_get_base_hdr(pkt_entry->pkt)->type);
 		assert(0 && "invalid control pkt type");
-		rxr_cq_handle_cq_error(ep, -FI_EIO);
+		efa_eq_write_error(&ep->util_ep, FI_EIO, FI_EIO);
+		rxr_pkt_entry_release_rx(ep, pkt_entry);
+		return;
+	}
+}
+
+void rxr_pkt_handle_recv_completion(struct rxr_ep *ep,
+				    struct rxr_pkt_entry *pkt_entry)
+{
+	int pkt_type;
+	struct rdm_peer *peer;
+	struct rxr_base_hdr *base_hdr;
+	struct rxr_rx_entry *zcpy_rx_entry = NULL;
+
+	base_hdr = rxr_get_base_hdr(pkt_entry->pkt);
+	pkt_type = base_hdr->type;
+	if (pkt_type >= RXR_EXTRA_REQ_PKT_END) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ,
+			"Peer %d is requesting feature %d, which this EP does not support.\n",
+			(int)pkt_entry->addr, base_hdr->type);
+
+		assert(0 && "invalid REQ packet type");
+		efa_eq_write_error(&ep->util_ep, FI_EIO, FI_EIO);
+		rxr_pkt_entry_release_rx(ep, pkt_entry);
 		return;
 	}
+
+	if (pkt_entry->addr == FI_ADDR_NOTAVAIL) {
+		if (pkt_type >= RXR_REQ_PKT_BEGIN && rxr_pkt_req_raw_addr(pkt_entry)) {
+			/*
+			 * We have not communicated with this peer before.
+			 * rxr_pkt_insert_addr() will insert the address into the address
+			 * vector, and pkt_entry->addr will be updated accordingly.
+			 */
+			void *raw_addr;
+
+			raw_addr = rxr_pkt_req_raw_addr(pkt_entry);
+			assert(raw_addr);
+			pkt_entry->addr = rxr_pkt_insert_addr(ep, pkt_entry, raw_addr);
+		} else {
+			/*
+			 * We had prior communication with the peer.
+			 * The application called fi_av_remove() to remove the address
+			 * from the address vector. In this case, this packet should be ignored.
+			 */
+			FI_WARN(&rxr_prov, FI_LOG_CQ, "Warning: ignoring a received packet from a removed address\n");
+			rxr_pkt_entry_release_rx(ep, pkt_entry);
+			return;
+		}
+	}
+
+	assert(pkt_entry->addr != FI_ADDR_NOTAVAIL);
+
+#if ENABLE_DEBUG
+	if (!ep->use_zcpy_rx) {
+		dlist_remove(&pkt_entry->dbg_entry);
+		dlist_insert_tail(&pkt_entry->dbg_entry, &ep->rx_pkt_list);
+	}
+#ifdef ENABLE_RXR_PKT_DUMP
+	rxr_pkt_print("Received", ep, (struct rxr_base_hdr *)pkt_entry->pkt);
+#endif
+#endif
+	peer = rxr_ep_get_peer(ep, pkt_entry->addr);
+	assert(peer);
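+	/* Ensure a HANDSHAKE packet has been sent (or is queued to be sent)
+	 * to this peer, so the peer learns which protocol features this
+	 * endpoint supports.
+	 */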
+	rxr_pkt_post_handshake_or_queue(ep, peer);
+
+	if (peer->is_local) {
+		assert(ep->use_shm);
+		ep->shm_rx_pkts_posted--;
+	} else {
+		ep->efa_rx_pkts_posted--;
+	}
+
+	if (pkt_entry->alloc_type == RXR_PKT_FROM_USER_BUFFER) {
+		assert(pkt_entry->x_entry);
+		zcpy_rx_entry = pkt_entry->x_entry;
+	}
+
+	rxr_pkt_proc_received(ep, pkt_entry);
+
+	if (zcpy_rx_entry && pkt_type != RXR_EAGER_MSGRTM_PKT) {
+		/* the user buffer was not matched with an eager message,
+		 * therefore repost the buffer */
+		rxr_ep_post_user_recv_buf(ep, zcpy_rx_entry, 0);
+	}
 }
 
 #if ENABLE_DEBUG
@@ -833,8 +1104,8 @@ void rxr_pkt_print_handshake(char *prefix,
 	       handshake_hdr->flags);
 
 	FI_DBG(&rxr_prov, FI_LOG_EP_DATA,
-	       "%s RxR HANDSHAKE packet, maxproto: %d\n",
-	       prefix, handshake_hdr->maxproto);
+	       "%s RxR HANDSHAKE packet, nextra_p3: %d\n",
+	       prefix, handshake_hdr->nextra_p3);
 }
 
 static
@@ -846,46 +1117,64 @@ void rxr_pkt_print_cts(char *prefix, struct rxr_cts_hdr *cts_hdr)
 	       " rx_id: %"	   PRIu32
 	       " window: %"	   PRIu64
 	       "\n", prefix, cts_hdr->version, cts_hdr->flags,
-	       cts_hdr->tx_id, cts_hdr->rx_id, cts_hdr->window);
+	       cts_hdr->send_id, cts_hdr->recv_id, cts_hdr->recv_length);
 }
 
 static
-void rxr_pkt_print_data(char *prefix, struct rxr_data_pkt *data_pkt)
+void rxr_pkt_print_data(char *prefix, struct rxr_pkt_entry *pkt_entry)
 {
+	struct rxr_data_hdr *data_hdr;
 	char str[RXR_PKT_DUMP_DATA_LEN * 4];
-	size_t str_len = RXR_PKT_DUMP_DATA_LEN * 4, l;
+	size_t str_len = RXR_PKT_DUMP_DATA_LEN * 4, l, hdr_size;
+	uint8_t *data;
 	int i;
 
 	str[str_len - 1] = '\0';
 
+	data_hdr = rxr_get_data_hdr(pkt_entry->pkt);
+
 	FI_DBG(&rxr_prov, FI_LOG_EP_DATA,
 	       "%s RxR DATA packet -  version: %" PRIu8
 	       " flags: %x rx_id: %" PRIu32
 	       " seg_size: %"	     PRIu64
 	       " seg_offset: %"	     PRIu64
-	       "\n", prefix, data_pkt->hdr.version, data_pkt->hdr.flags,
-	       data_pkt->hdr.rx_id, data_pkt->hdr.seg_size,
-	       data_pkt->hdr.seg_offset);
+	       "\n", prefix, data_hdr->version, data_hdr->flags,
+	       data_hdr->recv_id, data_hdr->seg_length,
+	       data_hdr->seg_offset);
+
+	hdr_size = sizeof(struct rxr_data_hdr);
+	if (data_hdr->flags & RXR_PKT_CONNID_HDR) {
+		hdr_size += sizeof(struct rxr_data_opt_connid_hdr);
+		FI_DBG(&rxr_prov, FI_LOG_EP_DATA,
+		       "sender_connid: %d\n",
+		       data_hdr->connid_hdr->connid);
+	}
+
+	data = (uint8_t *)pkt_entry->pkt + hdr_size;
 
 	l = snprintf(str, str_len, ("\tdata:    "));
-	for (i = 0; i < MIN(data_pkt->hdr.seg_size, RXR_PKT_DUMP_DATA_LEN);
+	for (i = 0; i < MIN(data_hdr->seg_length, RXR_PKT_DUMP_DATA_LEN);
 	     i++)
 		l += snprintf(str + l, str_len - l, "%02x ",
-			      ((uint8_t *)data_pkt->data)[i]);
+			      data[i]);
 	FI_DBG(&rxr_prov, FI_LOG_EP_DATA, "%s\n", str);
 }
 
-void rxr_pkt_print(char *prefix, struct rxr_ep *ep, struct rxr_base_hdr *hdr)
+void rxr_pkt_print(char *prefix, struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
 {
+	struct rxr_base_hdr *hdr;
+
+	hdr = rxr_get_base_hdr(pkt_entry->pkt);
+
 	switch (hdr->type) {
 	case RXR_HANDSHAKE_PKT:
-		rxr_pkt_print_handshake(prefix, (struct rxr_handshake_hdr *)hdr);
+		rxr_pkt_print_handshake(prefix, rxr_get_handshake_hdr(pkt_entry->pkt));
 		break;
 	case RXR_CTS_PKT:
-		rxr_pkt_print_cts(prefix, (struct rxr_cts_hdr *)hdr);
+		rxr_pkt_print_cts(prefix, rxr_get_cts_hdr(pkt_entry->pkt));
 		break;
 	case RXR_DATA_PKT:
-		rxr_pkt_print_data(prefix, (struct rxr_data_pkt *)hdr);
+		rxr_pkt_print_data(prefix, pkt_entry);
 		break;
 	default:
 		FI_WARN(&rxr_prov, FI_LOG_CQ, "invalid ctl pkt type %d\n",
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_cmd.h b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_cmd.h
index eb5d05d0e2d5a0debc3029f571f9224abfaa95f5..bc43ba5d3cd8d0e107e3938f3297d64a91c0c4c1 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_cmd.h
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_cmd.h
@@ -36,39 +36,39 @@
 
 #include "rxr.h"
 
-ssize_t rxr_pkt_post_data(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry);
-
 ssize_t rxr_pkt_post_ctrl(struct rxr_ep *ep, int entry_type, void *x_entry,
-			  int ctrl_type, bool inject);
+			  int ctrl_type, bool inject, uint64_t flags);
 
 ssize_t rxr_pkt_post_ctrl_or_queue(struct rxr_ep *ep, int entry_type, void *x_entry,
 				   int ctrl_type, bool inject);
 
-size_t rxr_pkt_data_size(struct rxr_pkt_entry *pkt_entry);
-
-ssize_t rxr_pkt_copy_to_rx(struct rxr_ep *ep,
-			   struct rxr_rx_entry *rx_entry,
-			   size_t data_offset,
-			   struct rxr_pkt_entry *pkt_entry,
-			   char *data, size_t data_size);
-
 void rxr_pkt_handle_data_copied(struct rxr_ep *ep,
 				struct rxr_pkt_entry *pkt_entry,
 				size_t data_size);
 
+void rxr_pkt_handle_send_error(struct rxr_ep *ep,
+			       struct rxr_pkt_entry *pkt_entry,
+			       int err, int prov_errno);
+
 void rxr_pkt_handle_send_completion(struct rxr_ep *ep,
-				    struct fi_cq_data_entry *cq_entry);
+				    struct rxr_pkt_entry *pkt_entry);
+
+void rxr_pkt_handle_recv_error(struct rxr_ep *ep,
+			       struct rxr_pkt_entry *pkt_entry,
+			       int err, int prov_errno);
 
 void rxr_pkt_handle_recv_completion(struct rxr_ep *ep,
-				    struct fi_cq_data_entry *cq_entry,
-				    fi_addr_t src_addr);
+				    struct rxr_pkt_entry *pkt_entry);
+
+ssize_t rxr_pkt_wait_handshake(struct rxr_ep *ep, fi_addr_t addr, struct rdm_peer *peer);
 
-ssize_t rxr_pkt_wait_handshake(struct rxr_ep *ep, fi_addr_t addr, struct rxr_peer *peer);
+ssize_t rxr_pkt_trigger_handshake(struct rxr_ep *ep,
+				  fi_addr_t addr, struct rdm_peer *peer);
 
 #if ENABLE_DEBUG
 void rxr_pkt_print(char *prefix,
 		   struct rxr_ep *ep,
-		   struct rxr_base_hdr *hdr);
+		   struct rxr_pkt_entry *pkt_entry);
 #endif
 
 #endif
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_entry.c b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_entry.c
index 5ed475bd15cc1db6ec9a29316b278ba1fe2d8806..b8598c988040f37d79da63d525474c96f0275b54 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_entry.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_entry.c
@@ -47,43 +47,9 @@
 /*
  *   General purpose utility functions
  */
-
-struct rxr_pkt_entry *rxr_pkt_entry_init_prefix(struct rxr_ep *ep,
-						const struct fi_msg *posted_buf,
-						struct ofi_bufpool *pkt_pool)
-{
-	struct rxr_pkt_entry *pkt_entry;
-	struct efa_mr *mr;
-
-	/*
-	 * Given the pkt_entry->pkt immediately follows the pkt_entry
-	 * fields, we can directly map the user-provided fi_msg address
-	 * as the pkt_entry, which will hold the metadata in the prefix.
-	 */
-	assert(posted_buf->msg_iov->iov_len >= sizeof(struct rxr_pkt_entry) + sizeof(struct rxr_eager_msgrtm_hdr));
-	pkt_entry = (struct rxr_pkt_entry *) posted_buf->msg_iov->iov_base;
-	if (!pkt_entry)
-		return NULL;
-
-	/*
-	 * The ownership of the prefix buffer lies with the application, do not
-	 * put it on the dbg list for cleanup during shutdown or poison it. The
-	 * provider loses jurisdiction over it soon after writing the rx
-	 * completion.
-	 */
-	dlist_init(&pkt_entry->entry);
-	mr = (struct efa_mr *) posted_buf->desc[0];
-	pkt_entry->mr = &mr->mr_fid;
-
-	pkt_entry->type = RXR_PKT_ENTRY_USER;
-	pkt_entry->state = RXR_PKT_ENTRY_IN_USE;
-	pkt_entry->next = NULL;
-
-	return pkt_entry;
-}
-
 struct rxr_pkt_entry *rxr_pkt_entry_alloc(struct rxr_ep *ep,
-					  struct ofi_bufpool *pkt_pool)
+					  struct ofi_bufpool *pkt_pool,
+					  enum rxr_pkt_entry_alloc_type alloc_type)
 {
 	struct rxr_pkt_entry *pkt_entry;
 	void *mr = NULL;
@@ -103,18 +69,23 @@ struct rxr_pkt_entry *rxr_pkt_entry_alloc(struct rxr_ep *ep,
 #ifdef ENABLE_EFA_POISONING
 	memset(pkt_entry->pkt, 0, ep->mtu_size);
 #endif
-	pkt_entry->type = RXR_PKT_ENTRY_POSTED;
-	pkt_entry->state = RXR_PKT_ENTRY_IN_USE;
+	pkt_entry->alloc_type = alloc_type;
+	pkt_entry->flags = RXR_PKT_ENTRY_IN_USE;
 	pkt_entry->next = NULL;
-
+	pkt_entry->x_entry = NULL;
 	return pkt_entry;
 }
 
-static
-void rxr_pkt_entry_release_single_tx(struct rxr_ep *ep,
-				     struct rxr_pkt_entry *pkt)
+/**
+ * @brief release a TX packet entry
+ *
+ * @param[in]     ep  the endpoint
+ * @param[in,out] pkt the pkt_entry to be released
+ */
+void rxr_pkt_entry_release_tx(struct rxr_ep *ep,
+			      struct rxr_pkt_entry *pkt)
 {
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 
 #if ENABLE_DEBUG
 	dlist_remove(&pkt->dbg_entry);
@@ -123,37 +94,30 @@ void rxr_pkt_entry_release_single_tx(struct rxr_ep *ep,
 	 * Decrement rnr_queued_pkts counter and reset backoff for this peer if
 	 * we get a send completion for a retransmitted packet.
 	 */
-	if (OFI_UNLIKELY(pkt->state == RXR_PKT_ENTRY_RNR_RETRANSMIT)) {
+	if (OFI_UNLIKELY(pkt->flags & RXR_PKT_ENTRY_RNR_RETRANSMIT)) {
 		peer = rxr_ep_get_peer(ep, pkt->addr);
+		assert(peer);
 		peer->rnr_queued_pkt_cnt--;
-		peer->timeout_interval = 0;
-		peer->rnr_timeout_exp = 0;
-		if (peer->flags & RXR_PEER_IN_BACKOFF)
-			dlist_remove(&peer->rnr_entry);
-		peer->flags &= ~RXR_PEER_IN_BACKOFF;
+		peer->rnr_backoff_wait_time = 0;
+		if (peer->flags & RXR_PEER_IN_BACKOFF) {
+			dlist_remove(&peer->rnr_backoff_entry);
+			peer->flags &= ~RXR_PEER_IN_BACKOFF;
+		}
 		FI_DBG(&rxr_prov, FI_LOG_EP_DATA,
 		       "reset backoff timer for peer: %" PRIu64 "\n",
 		       pkt->addr);
 	}
+	if (pkt->send) {
+		ofi_buf_free(pkt->send);
+		pkt->send = NULL;
+	}
 #ifdef ENABLE_EFA_POISONING
 	rxr_poison_mem_region((uint32_t *)pkt, ep->tx_pkt_pool_entry_sz);
 #endif
-	pkt->state = RXR_PKT_ENTRY_FREE;
+	pkt->flags = 0;
 	ofi_buf_free(pkt);
 }
 
-void rxr_pkt_entry_release_tx(struct rxr_ep *ep,
-			      struct rxr_pkt_entry *pkt_entry)
-{
-	struct rxr_pkt_entry *next;
-
-	while (pkt_entry) {
-		next = pkt_entry->next;
-		rxr_pkt_entry_release_single_tx(ep, pkt_entry);
-		pkt_entry = next;
-	}
-}
-
 /*
  * rxr_pkt_entry_release_rx() release a rx packet entry.
  * It requires input pkt_entry to be unlinked.
@@ -169,21 +133,14 @@ void rxr_pkt_entry_release_rx(struct rxr_ep *ep,
 {
 	assert(pkt_entry->next == NULL);
 
-	if (ep->use_zcpy_rx && pkt_entry->type == RXR_PKT_ENTRY_USER)
+	if (ep->use_zcpy_rx && pkt_entry->alloc_type == RXR_PKT_FROM_USER_BUFFER)
 		return;
 
-	if (pkt_entry->type == RXR_PKT_ENTRY_POSTED) {
-		struct rxr_peer *peer;
-
-		peer = rxr_ep_get_peer(ep, pkt_entry->addr);
-		assert(peer);
-		if (peer->is_local)
-			ep->rx_bufs_shm_to_post++;
-		else
-			ep->rx_bufs_efa_to_post++;
-	}
-
-	if (pkt_entry->type == RXR_PKT_ENTRY_READ_COPY) {
+	if (pkt_entry->alloc_type == RXR_PKT_FROM_EFA_RX_POOL) {
+		ep->efa_rx_pkts_to_post++;
+	} else if (pkt_entry->alloc_type == RXR_PKT_FROM_SHM_RX_POOL) {
+		ep->shm_rx_pkts_to_post++;
+	} else if (pkt_entry->alloc_type == RXR_PKT_FROM_READ_COPY_POOL) {
 		assert(ep->rx_readcopy_pkt_pool_used > 0);
 		ep->rx_readcopy_pkt_pool_used--;
 	}
@@ -195,18 +152,17 @@ void rxr_pkt_entry_release_rx(struct rxr_ep *ep,
 	/* the same pool size is used for all types of rx pkt_entries */
 	rxr_poison_mem_region((uint32_t *)pkt_entry, ep->rx_pkt_pool_entry_sz);
 #endif
-	pkt_entry->state = RXR_PKT_ENTRY_FREE;
+	pkt_entry->flags = 0;
 	ofi_buf_free(pkt_entry);
 }
 
 void rxr_pkt_entry_copy(struct rxr_ep *ep,
 			struct rxr_pkt_entry *dest,
-			struct rxr_pkt_entry *src,
-			int new_entry_type)
+			struct rxr_pkt_entry *src)
 {
 	FI_DBG(&rxr_prov, FI_LOG_EP_CTRL,
-	       "Copying packet out of posted buffer! src_entry_type: %d new_entry_type: %d\n",
-		src->type, new_entry_type);
+	       "Copying packet out of posted buffer! src_entry_alloc_type: %d dest_entry_alloc_type: %d\n",
+	       src->alloc_type, dest->alloc_type);
 	dlist_init(&dest->entry);
 #if ENABLE_DEBUG
 	dlist_init(&dest->dbg_entry);
@@ -218,23 +174,40 @@ void rxr_pkt_entry_copy(struct rxr_ep *ep,
 	dest->x_entry = src->x_entry;
 	dest->pkt_size = src->pkt_size;
 	dest->addr = src->addr;
-	dest->type = new_entry_type;
-	dest->state = RXR_PKT_ENTRY_IN_USE;
+	dest->flags = RXR_PKT_ENTRY_IN_USE;
 	dest->next = NULL;
-	memcpy(dest->pkt, src->pkt, ep->mtu_size);
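+	/* Copy only the valid bytes of the packet, not the full MTU-sized
+	 * buffer.
+	 */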
+	assert(src->pkt_size > 0);
+	memcpy(dest->pkt, src->pkt, src->pkt_size);
 }
 
 /*
- * Create a new rx_entry for an unexpected message. Store the packet for later
- * processing and put the rx_entry on the appropriate unexpected list.
+ * Handle copying or updating the metadata for an unexpected packet.
+ *
+ * Packets from the EFA RX pool will be copied into a separate buffer not
+ * registered with the device (if this option is enabled) so that we can repost
+ * the registered buffer again to keep the EFA RX queue full. Packets from the
+ * SHM RX pool will also be copied to reuse the unexpected message pool.
+ *
+ * @param[in]     ep  the endpoint
+ * @param[in,out] pkt_entry_ptr unexpected packet; if this packet is copied to
+ *                a new memory region, this pointer will be updated.
+ *
+ * @return	  struct rxr_pkt_entry of the updated or copied packet, NULL on
+ * 		  allocation failure.
  */
 struct rxr_pkt_entry *rxr_pkt_get_unexp(struct rxr_ep *ep,
 					struct rxr_pkt_entry **pkt_entry_ptr)
 {
 	struct rxr_pkt_entry *unexp_pkt_entry;
+	enum rxr_pkt_entry_alloc_type type;
+
+	type = (*pkt_entry_ptr)->alloc_type;
 
-	if (rxr_env.rx_copy_unexp && (*pkt_entry_ptr)->type == RXR_PKT_ENTRY_POSTED) {
-		unexp_pkt_entry = rxr_pkt_entry_clone(ep, ep->rx_unexp_pkt_pool, *pkt_entry_ptr, RXR_PKT_ENTRY_UNEXP);
+	if (rxr_env.rx_copy_unexp && (type == RXR_PKT_FROM_EFA_RX_POOL ||
+				      type == RXR_PKT_FROM_SHM_RX_POOL)) {
+		unexp_pkt_entry = rxr_pkt_entry_clone(ep, ep->rx_unexp_pkt_pool,
+						      RXR_PKT_FROM_UNEXP_POOL,
+						      *pkt_entry_ptr);
 		if (OFI_UNLIKELY(!unexp_pkt_entry)) {
 			FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
 				"Unable to allocate rx_pkt_entry for unexp msg\n");
@@ -254,12 +227,12 @@ void rxr_pkt_entry_release_cloned(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_e
 	struct rxr_pkt_entry *next;
 
 	while (pkt_entry) {
-		assert(pkt_entry->type == RXR_PKT_ENTRY_OOO  ||
-		       pkt_entry->type == RXR_PKT_ENTRY_UNEXP);
+		assert(pkt_entry->alloc_type == RXR_PKT_FROM_OOO_POOL ||
+		       pkt_entry->alloc_type == RXR_PKT_FROM_UNEXP_POOL);
 #ifdef ENABLE_EFA_POISONING
 		rxr_poison_mem_region((uint32_t *)pkt_entry, ep->tx_pkt_pool_entry_sz);
 #endif
-		pkt_entry->state = RXR_PKT_ENTRY_FREE;
+		pkt_entry->flags = 0;
 		ofi_buf_free(pkt_entry);
 		next = pkt_entry->next;
 		pkt_entry = next;
@@ -268,38 +241,38 @@ void rxr_pkt_entry_release_cloned(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_e
 
 struct rxr_pkt_entry *rxr_pkt_entry_clone(struct rxr_ep *ep,
 					  struct ofi_bufpool *pkt_pool,
-					  struct rxr_pkt_entry *src,
-					  int new_entry_type)
+					  enum rxr_pkt_entry_alloc_type alloc_type,
+					  struct rxr_pkt_entry *src)
 {
 	struct rxr_pkt_entry *root = NULL;
 	struct rxr_pkt_entry *dst;
 
 	assert(src);
-	assert(new_entry_type == RXR_PKT_ENTRY_OOO ||
-	       new_entry_type == RXR_PKT_ENTRY_UNEXP ||
-	       new_entry_type == RXR_PKT_ENTRY_READ_COPY);
+	assert(alloc_type == RXR_PKT_FROM_OOO_POOL ||
+	       alloc_type == RXR_PKT_FROM_UNEXP_POOL ||
+	       alloc_type == RXR_PKT_FROM_READ_COPY_POOL);
 
-	dst = rxr_pkt_entry_alloc(ep, pkt_pool);
+	dst = rxr_pkt_entry_alloc(ep, pkt_pool, alloc_type);
 	if (!dst)
 		return NULL;
 
-	if (new_entry_type == RXR_PKT_ENTRY_READ_COPY) {
+	if (alloc_type == RXR_PKT_FROM_READ_COPY_POOL) {
 		assert(pkt_pool == ep->rx_readcopy_pkt_pool);
 		ep->rx_readcopy_pkt_pool_used++;
 		ep->rx_readcopy_pkt_pool_max_used = MAX(ep->rx_readcopy_pkt_pool_used,
 							ep->rx_readcopy_pkt_pool_max_used);
 	}
 
-	rxr_pkt_entry_copy(ep, dst, src, new_entry_type);
+	rxr_pkt_entry_copy(ep, dst, src);
 	root = dst;
 	while (src->next) {
-		dst->next = rxr_pkt_entry_alloc(ep, pkt_pool);
+		dst->next = rxr_pkt_entry_alloc(ep, pkt_pool, alloc_type);
 		if (!dst->next) {
 			rxr_pkt_entry_release_cloned(ep, root);
 			return NULL;
 		}
 
-		rxr_pkt_entry_copy(ep, dst->next, src->next, new_entry_type);
+		rxr_pkt_entry_copy(ep, dst->next, src->next);
 		src = src->next;
 		dst = dst->next;
 	}
@@ -319,19 +292,31 @@ void rxr_pkt_entry_append(struct rxr_pkt_entry *dst,
 	dst->next = src;
 }
 
+/**
+ * @brief send a packet using the lower provider
+ *
+ * @param[in] ep        rxr endpoint
+ * @param[in] pkt_entry packet entry to be sent
+ * @param[in] msg       information regarding the send operation, such as
+ *                      memory buffer, remote EP address and local descriptor.
+ *                      If the shm provider is to be used, the remote EP address
+ *                      and local descriptor must be prepared for shm usage.
+ * @param[in] flags     flags to be passed on to the lower provider's send.
+ */
 static inline
 ssize_t rxr_pkt_entry_sendmsg(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry,
 			      const struct fi_msg *msg, uint64_t flags)
 {
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 	size_t ret;
 
-	peer = rxr_ep_get_peer(ep, pkt_entry->addr);
-	assert(ep->tx_pending <= ep->max_outstanding_tx);
-
-	if (ep->tx_pending == ep->max_outstanding_tx)
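+	/* If the EFA device's TX queue is already full, return -FI_EAGAIN
+	 * so that the caller retries later (typically from the progress
+	 * engine).
+	 */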
+	if (pkt_entry->alloc_type == RXR_PKT_FROM_EFA_TX_POOL &&
+	    ep->efa_outstanding_tx_ops == ep->efa_max_outstanding_tx_ops)
 		return -FI_EAGAIN;
 
+	peer = rxr_ep_get_peer(ep, pkt_entry->addr);
+	assert(peer);
+
 	if (peer->flags & RXR_PEER_IN_BACKOFF)
 		return -FI_EAGAIN;
 
@@ -346,72 +331,82 @@ ssize_t rxr_pkt_entry_sendmsg(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry
 		ret = fi_sendmsg(ep->shm_ep, msg, flags);
 	} else {
 		ret = fi_sendmsg(ep->rdm_ep, msg, flags);
-		if (OFI_LIKELY(!ret))
-			rxr_ep_inc_tx_pending(ep, peer);
 	}
 
-	return ret;
-}
-
-ssize_t rxr_pkt_entry_sendv(struct rxr_ep *ep,
-			    struct rxr_pkt_entry *pkt_entry,
-			    fi_addr_t addr, const struct iovec *iov,
-			    void **desc, size_t count, uint64_t flags)
-{
-	struct fi_msg msg;
-	struct rxr_peer *peer;
-
-	msg.msg_iov = iov;
-	msg.desc = desc;
-	msg.iov_count = count;
-	peer = rxr_ep_get_peer(ep, addr);
-	msg.addr = (peer->is_local) ? peer->shm_fiaddr : addr;
-	msg.context = pkt_entry;
-	msg.data = 0;
+	if (OFI_UNLIKELY(ret))
+		return ret;
 
-	return rxr_pkt_entry_sendmsg(ep, pkt_entry, &msg, flags);
+	rxr_ep_record_tx_op_submitted(ep, pkt_entry);
+	return 0;
 }
 
-/* rxr_pkt_start currently expects data pkt right after pkt hdr */
-ssize_t rxr_pkt_entry_send_with_flags(struct rxr_ep *ep,
-				      struct rxr_pkt_entry *pkt_entry,
-				      fi_addr_t addr, uint64_t flags)
+/**
+ * @brief Construct a fi_msg object with the information stored in pkt_entry,
+ * and send it out
+ *
+ * @param[in] ep	rxr endpoint
+ * @param[in] pkt_entry	packet entry used to construct the fi_msg object
+ * @param[in] flags	flags to be applied to the lower provider's send operation
+ * @return		0 on success
+ * 			On error, a negative value corresponding to fabric errno
+ *
+ */
+ssize_t rxr_pkt_entry_send(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry,
+			   uint64_t flags)
 {
 	struct iovec iov;
 	void *desc;
+	struct fi_msg msg;
+	struct rdm_peer *peer;
 
-	iov.iov_base = rxr_pkt_start(pkt_entry);
-	iov.iov_len = pkt_entry->pkt_size;
+	peer = rxr_ep_get_peer(ep, pkt_entry->addr);
+	assert(peer);
 
-	if (rxr_ep_get_peer(ep, addr)->is_local) {
-		assert(ep->use_shm);
-		desc = NULL;
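+	/* If a scatter-gather list was prepared for this packet (e.g. to
+	 * send user data directly from its original buffer), use it;
+	 * otherwise send the packet buffer itself as a single iov.
+	 */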
+	if (pkt_entry->send && pkt_entry->send->iov_count > 0) {
+		msg.msg_iov = pkt_entry->send->iov;
+		msg.iov_count = pkt_entry->send->iov_count;
+		msg.desc = pkt_entry->send->desc;
 	} else {
-		desc = rxr_ep_mr_local(ep) ? fi_mr_desc(pkt_entry->mr) : NULL;
+		iov.iov_base = rxr_pkt_start(pkt_entry);
+		iov.iov_len = pkt_entry->pkt_size;
+		desc = peer->is_local ? NULL : fi_mr_desc(pkt_entry->mr);
+		msg.msg_iov = &iov;
+		msg.iov_count = 1;
+		msg.desc = &desc;
 	}
 
-	return rxr_pkt_entry_sendv(ep, pkt_entry, addr, &iov, &desc, 1, flags);
-}
+	msg.addr = pkt_entry->addr;
+	msg.context = pkt_entry;
+	msg.data = 0;
 
-ssize_t rxr_pkt_entry_send(struct rxr_ep *ep,
-			   struct rxr_pkt_entry *pkt_entry,
-			   fi_addr_t addr)
-{
-	return rxr_pkt_entry_send_with_flags(ep, pkt_entry, addr, 0);
+	if (peer->is_local) {
+		msg.addr = peer->shm_fiaddr;
+		rxr_convert_desc_for_shm(msg.iov_count, msg.desc);
+	}
+
+	return rxr_pkt_entry_sendmsg(ep, pkt_entry, &msg, flags);
 }
 
 ssize_t rxr_pkt_entry_inject(struct rxr_ep *ep,
 			     struct rxr_pkt_entry *pkt_entry,
 			     fi_addr_t addr)
 {
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
+	ssize_t ret;
 
 	/* currently only EOR packet is injected using shm ep */
 	peer = rxr_ep_get_peer(ep, addr);
 	assert(peer);
+
 	assert(ep->use_shm && peer->is_local);
-	return fi_inject(ep->shm_ep, rxr_pkt_start(pkt_entry), pkt_entry->pkt_size,
+	ret = fi_inject(ep->shm_ep, rxr_pkt_start(pkt_entry), pkt_entry->pkt_size,
 			 peer->shm_fiaddr);
+
+	if (OFI_UNLIKELY(ret))
+		return ret;
+
+	rxr_ep_record_tx_op_submitted(ep, pkt_entry);
+	return 0;
 }
 
 /*
@@ -423,6 +418,7 @@ struct rxr_rx_entry *rxr_pkt_rx_map_lookup(struct rxr_ep *ep,
 	struct rxr_pkt_rx_map *entry = NULL;
 	struct rxr_pkt_rx_key key;
 
+	memset(&key, 0, sizeof(key));
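+	/* The key must be fully zeroed (including padding bytes) because
+	 * the hash table compares keys as raw memory.
+	 */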
 	key.msg_id = rxr_pkt_msg_id(pkt_entry);
 	key.addr = pkt_entry->addr;
 	HASH_FIND(hh, ep->pkt_rx_map, &key, sizeof(struct rxr_pkt_rx_key), entry);
@@ -443,6 +439,7 @@ void rxr_pkt_rx_map_insert(struct rxr_ep *ep,
 		return;
 	}
 
+	memset(&entry->key, 0, sizeof(entry->key));
 	entry->key.msg_id = rxr_pkt_msg_id(pkt_entry);
 	entry->key.addr = pkt_entry->addr;
 
@@ -466,6 +463,7 @@ void rxr_pkt_rx_map_remove(struct rxr_ep *ep,
 	struct rxr_pkt_rx_map *entry;
 	struct rxr_pkt_rx_key key;
 
+	memset(&key, 0, sizeof(key));
 	key.msg_id = rxr_pkt_msg_id(pkt_entry);
 	key.addr = pkt_entry->addr;
 
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_entry.h b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_entry.h
index 85173ffdf0608fd107cc6f394c3c481cdf545dc9..dc390ca28b5ce126b609335dd2874eb47a307f73 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_entry.h
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_entry.h
@@ -36,20 +36,20 @@
 
 #include <ofi_list.h>
 
-/* pkt_entry state for retransmit tracking */
-enum rxr_pkt_entry_state {
-	RXR_PKT_ENTRY_FREE = 0,
-	RXR_PKT_ENTRY_IN_USE,
-	RXR_PKT_ENTRY_RNR_RETRANSMIT,
-};
-
-/* pkt_entry types for rx pkts */
-enum rxr_pkt_entry_type {
-	RXR_PKT_ENTRY_POSTED = 1,   /* entries that are posted to the device from the RX bufpool */
-	RXR_PKT_ENTRY_UNEXP,        /* entries used to stage unexpected msgs */
-	RXR_PKT_ENTRY_OOO,	    /* entries used to stage out-of-order RTM or RTA */
-	RXR_PKT_ENTRY_USER,	    /* entries backed by user-provided msg prefix (FI_MSG_PREFIX)*/
-	RXR_PKT_ENTRY_READ_COPY,    /* entries used to stage copy by read */
+#define RXR_PKT_ENTRY_IN_USE		BIT_ULL(0)
+#define RXR_PKT_ENTRY_RNR_RETRANSMIT	BIT_ULL(1)
+#define RXR_PKT_ENTRY_LOCAL_READ	BIT_ULL(2) /* this packet entry is used as context of a local read operation */
+
+/* pkt_entry_alloc_type indicates where the packet entry was allocated from */
+enum rxr_pkt_entry_alloc_type {
+	RXR_PKT_FROM_EFA_TX_POOL = 1, /* packet is allocated from ep->efa_tx_pkt_pool */
+	RXR_PKT_FROM_EFA_RX_POOL,     /* packet is allocated from ep->efa_rx_pkt_pool */
+	RXR_PKT_FROM_SHM_TX_POOL,     /* packet is allocated from ep->shm_tx_pkt_pool */
+	RXR_PKT_FROM_SHM_RX_POOL,     /* packet is allocated from ep->shm_rx_pkt_pool */
+	RXR_PKT_FROM_UNEXP_POOL,      /* packet is allocated from ep->rx_unexp_pkt_pool */
+	RXR_PKT_FROM_OOO_POOL,	      /* packet is allocated from ep->rx_ooo_pkt_pool */
+	RXR_PKT_FROM_USER_BUFFER,     /* packet is from user provided buffer */
+	RXR_PKT_FROM_READ_COPY_POOL,  /* packet is allocated from ep->rx_readcopy_pkt_pool */
 };
 
 struct rxr_pkt_sendv {
@@ -64,8 +64,13 @@ struct rxr_pkt_sendv {
 	void *desc[2];
 };
 
+/* rxr_pkt_entry is used both for sending data to a peer and for receiving data from a peer.
+ */
 struct rxr_pkt_entry {
-	/* for rx/tx_entry queued_pkts list */
+	/* entry is used for sending only.
+	 * It is either linked to peer->outstanding_tx_pkts (after a packet has been successfully sent, but has not yet gotten a completion),
+	 * or linked to a tx_entry/rx_entry's queued_pkts (after it encountered an RNR error completion).
+	 */
 	struct dlist_entry entry;
 #if ENABLE_DEBUG
 	/* for tx/rx debug list or posted buf list */
@@ -75,9 +80,27 @@ struct rxr_pkt_entry {
 	size_t pkt_size;
 
 	struct fid_mr *mr;
+	/* `addr` is used for both sending data and receiving data.
+	 *
+	 * When sending a packet, `addr` will be provided by the application and it cannot be FI_ADDR_NOTAVAIL.
+	 * However, after a packet is sent, the application can remove a peer by calling fi_av_remove().
+	 * When the peer is removed, `addr` will be set to FI_ADDR_NOTAVAIL. Later, when the device reports
+	 * a completion for such a TX packet, the TX completion will be ignored.
+	 *
+	 * When receiving a packet, the lower device will set `addr`. If the sender's address is not in
+	 * the address vector (AV), the lower device will set `addr` to FI_ADDR_NOTAVAIL. This can happen
+	 * in two scenarios:
+	 *
+	 * 1. there has been no prior communication with the peer. In this case, the packet should have
+	 *    the peer's raw address in the header, and the progress engine will insert the raw address
+	 *    into the address vector and update `addr`.
+	 *
+	 * 2. this packet is from a peer whose address has been removed from the AV. In this case, the
+	 *    received packet will be ignored because all resources associated with the peer have been released.
+	 */
 	fi_addr_t addr;
-	enum rxr_pkt_entry_type type;
-	enum rxr_pkt_entry_state state;
+	enum rxr_pkt_entry_alloc_type alloc_type; /* where the memory of this packet entry resides */
+	uint32_t flags;
 
 	/*
 	 * next is used on receiving end.
@@ -109,7 +132,7 @@ static_assert(sizeof(struct rxr_pkt_entry) == 64, "rxr_pkt_entry check");
 #endif
 
 OFI_DECL_RECVWIN_BUF(struct rxr_pkt_entry*, rxr_robuf, uint32_t);
-DECLARE_FREESTACK(struct rxr_robuf, rxr_robuf_fs);
+OFI_DECLARE_FREESTACK(struct rxr_robuf, rxr_robuf_fs);
 
 struct rxr_ep;
 
@@ -120,7 +143,8 @@ struct rxr_pkt_entry *rxr_pkt_entry_init_prefix(struct rxr_ep *ep,
 						struct ofi_bufpool *pkt_pool);
 
 struct rxr_pkt_entry *rxr_pkt_entry_alloc(struct rxr_ep *ep,
-					  struct ofi_bufpool *pkt_pool);
+					  struct ofi_bufpool *pkt_pool,
+					  enum rxr_pkt_entry_alloc_type alloc_type);
 
 void rxr_pkt_entry_release_tx(struct rxr_ep *ep,
 			      struct rxr_pkt_entry *pkt_entry);
@@ -133,24 +157,14 @@ void rxr_pkt_entry_append(struct rxr_pkt_entry *dst,
 
 struct rxr_pkt_entry *rxr_pkt_entry_clone(struct rxr_ep *ep,
 					  struct ofi_bufpool *pkt_pool,
-					  struct rxr_pkt_entry *src,
-					  int new_entry_type);
+					  enum rxr_pkt_entry_alloc_type alloc_type,
+					  struct rxr_pkt_entry *src);
 
 struct rxr_pkt_entry *rxr_pkt_get_unexp(struct rxr_ep *ep,
 					struct rxr_pkt_entry **pkt_entry_ptr);
 
-ssize_t rxr_pkt_entry_send_with_flags(struct rxr_ep *ep,
-				      struct rxr_pkt_entry *pkt_entry,
-				      fi_addr_t addr, uint64_t flags);
-
-ssize_t rxr_pkt_entry_sendv(struct rxr_ep *ep,
-			    struct rxr_pkt_entry *pkt_entry,
-			    fi_addr_t addr, const struct iovec *iov,
-			    void **desc, size_t count, uint64_t flags);
-
 ssize_t rxr_pkt_entry_send(struct rxr_ep *ep,
-			   struct rxr_pkt_entry *pkt_entry,
-			   fi_addr_t addr);
+			   struct rxr_pkt_entry *pkt_entry, uint64_t flags);
 
 ssize_t rxr_pkt_entry_inject(struct rxr_ep *ep,
 			     struct rxr_pkt_entry *pkt_entry,
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type.h b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type.h
index 18c930237fbd87841b975f51cd219e6444db8824..3e2c318bad218d673f91cdc316c096ddc9b06a1e 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type.h
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type.h
@@ -34,77 +34,7 @@
 #ifndef _RXR_PKT_TYPE_H
 #define _RXR_PKT_TYPE_H
 
-/* This header file contain the ID of all RxR packet types, and
- * the necessary data structures and functions for each packet type
- *
- * RxR packet types can be classified into 3 categories:
- *     data packet, control packet and context packet
- *
- * For each packet type, the following items are needed:
- *
- *   First, each packet type need to define a struct for its header,
- *       and the header must be start with ```struct rxr_base_hdr```.
- *
- *   Second, each control packet type need to define an init()
- *       function and a handle_sent() function. These functions
- *       are called by rxr_pkt_post_ctrl_or_queue().
- *
- *   Finally, each packet type (except context packet) need to
- *     define a handle_recv() functions which is called by
- *     rxr_pkt_handle_recv_completion().
- */
-
-/* ID of each packet type. Changing ID would break inter
- * operability thus is strictly prohibited.
- */
-
-#define RXR_RETIRED_RTS_PKT	1
-#define RXR_RETIRED_CONNACK_PKT	2
-#define RXR_CTS_PKT		3
-#define RXR_DATA_PKT		4
-#define RXR_READRSP_PKT		5
-#define RXR_RMA_CONTEXT_PKT	6
-#define RXR_EOR_PKT		7
-#define RXR_ATOMRSP_PKT         8
-#define RXR_HANDSHAKE_PKT	9
-
-#define RXR_REQ_PKT_BEGIN		64
-#define RXR_BASELINE_REQ_PKT_BEGIN	64
-#define RXR_EAGER_MSGRTM_PKT		64
-#define RXR_EAGER_TAGRTM_PKT		65
-#define RXR_MEDIUM_MSGRTM_PKT		66
-#define RXR_MEDIUM_TAGRTM_PKT		67
-#define RXR_LONG_MSGRTM_PKT		68
-#define RXR_LONG_TAGRTM_PKT		69
-#define RXR_EAGER_RTW_PKT		70
-#define RXR_LONG_RTW_PKT		71
-#define RXR_SHORT_RTR_PKT		72
-#define RXR_LONG_RTR_PKT		73
-#define RXR_WRITE_RTA_PKT		74
-#define RXR_FETCH_RTA_PKT		75
-#define RXR_COMPARE_RTA_PKT		76
-#define RXR_BASELINE_REQ_PKT_END	77
-
-#define RXR_EXTRA_REQ_PKT_BEGIN		128
-#define RXR_READ_MSGRTM_PKT		128
-#define RXR_READ_TAGRTM_PKT		129
-#define RXR_READ_RTW_PKT		130
-#define RXR_READ_RTR_PKT		131
-#define RXR_EXTRA_REQ_PKT_END		132
-
-/*
- *  Packet fields common to all rxr packets. The other packet headers below must
- *  be changed if this is updated.
- */
-struct rxr_base_hdr {
-	uint8_t type;
-	uint8_t version;
-	uint16_t flags;
-};
-
-#if defined(static_assert) && defined(__x86_64__)
-static_assert(sizeof(struct rxr_base_hdr) == 4, "rxr_base_hdr check");
-#endif
+#include "rdm_proto_v4.h"
 
 static inline struct rxr_base_hdr *rxr_get_base_hdr(void *pkt)
 {
@@ -112,75 +42,52 @@ static inline struct rxr_base_hdr *rxr_get_base_hdr(void *pkt)
 }
 
 struct rxr_ep;
-struct rxr_peer;
+struct rdm_peer;
 struct rxr_tx_entry;
 struct rxr_rx_entry;
 struct rxr_read_entry;
 
-/*
- *  HANDSHAKE packet header and functions
- *  implementation of the functions are in rxr_pkt_type_misc.c
- */
-struct rxr_handshake_hdr {
-	uint8_t type;
-	uint8_t version;
-	uint16_t flags;
-	/* end of rxr_base_hdr */
-	uint32_t maxproto;
-	uint64_t features[0];
-};
-
-#if defined(static_assert) && defined(__x86_64__)
-static_assert(sizeof(struct rxr_handshake_hdr) == 8, "rxr_handshake_hdr check");
-#endif
-
+/* HANDSHAKE packet related functions */
 static inline
 struct rxr_handshake_hdr *rxr_get_handshake_hdr(void *pkt)
 {
 	return (struct rxr_handshake_hdr *)pkt;
 }
 
+static inline
+struct rxr_handshake_opt_connid_hdr *rxr_get_handshake_opt_connid_hdr(void *pkt)
+{
+	struct rxr_handshake_hdr *handshake_hdr;
+	size_t base_hdr_size;
+
+	handshake_hdr = (struct rxr_handshake_hdr *)pkt;
+	assert(handshake_hdr->type == RXR_HANDSHAKE_PKT);
+	assert(handshake_hdr->flags & RXR_PKT_CONNID_HDR);
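+	/* nextra_p3 is the number of extra info fields in the handshake
+	 * header plus 3, hence the subtraction below.
+	 */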
+	base_hdr_size = sizeof(struct rxr_handshake_hdr) +
+			(handshake_hdr->nextra_p3 - 3) * sizeof(uint64_t);
+	return (struct rxr_handshake_opt_connid_hdr *)((char *)pkt + base_hdr_size);
+}
+
 ssize_t rxr_pkt_init_handshake(struct rxr_ep *ep,
 			       struct rxr_pkt_entry *pkt_entry,
 			       fi_addr_t addr);
 
-void rxr_pkt_post_handshake(struct rxr_ep *ep,
-			    struct rxr_peer *peer,
-			    fi_addr_t addr);
+ssize_t rxr_pkt_post_handshake(struct rxr_ep *ep, struct rdm_peer *peer);
+
+void rxr_pkt_post_handshake_or_queue(struct rxr_ep *ep,
+				     struct rdm_peer *peer);
 
 void rxr_pkt_handle_handshake_recv(struct rxr_ep *ep,
 				   struct rxr_pkt_entry *pkt_entry);
-/*
- *  CTS packet data structures and functions.
- *  Definition of the functions is in rxr_pkt_type_misc.c
- */
-struct rxr_cts_hdr {
-	uint8_t type;
-	uint8_t version;
-	uint16_t flags;
-	/* end of rxr_base_hdr */
-	uint8_t pad[4];
-	/* TODO: need to add msg_id -> tx_id/rx_id mapping */
-	uint32_t tx_id;
-	uint32_t rx_id;
-	uint64_t window;
-};
-
-#if defined(static_assert) && defined(__x86_64__)
-static_assert(sizeof(struct rxr_cts_hdr) == 24, "rxr_cts_hdr check");
-#endif
-
-/* this flag is to indicated the CTS is the response of a RTR packet */
-#define RXR_CTS_READ_REQ		BIT_ULL(7)
-#define RXR_CTS_HDR_SIZE		(sizeof(struct rxr_cts_hdr))
 
+/* CTS packet related functions */
 static inline
 struct rxr_cts_hdr *rxr_get_cts_hdr(void *pkt)
 {
 	return (struct rxr_cts_hdr *)pkt;
 }
 
-void rxr_pkt_calc_cts_window_credits(struct rxr_ep *ep, struct rxr_peer *peer,
+void rxr_pkt_calc_cts_window_credits(struct rxr_ep *ep, struct rdm_peer *peer,
 				     uint64_t size, int request,
 				     int *window, int *credits);
 
@@ -194,45 +101,18 @@ void rxr_pkt_handle_cts_sent(struct rxr_ep *ep,
 void rxr_pkt_handle_cts_recv(struct rxr_ep *ep,
 			     struct rxr_pkt_entry *pkt_entry);
 
-/*
- *  DATA packet data structures and functions
- *  Definition of the functions is in rxr_pkt_data.c
- */
-struct rxr_data_hdr {
-	uint8_t type;
-	uint8_t version;
-	uint16_t flags;
-	/* end of rxr_base_hdr */
-	/* TODO: need to add msg_id -> tx_id/rx_id mapping */
-	uint32_t rx_id;
-	uint64_t seg_size;
-	uint64_t seg_offset;
-};
-
-#if defined(static_assert) && defined(__x86_64__)
-static_assert(sizeof(struct rxr_data_hdr) == 24, "rxr_data_hdr check");
-#endif
-
-#define RXR_DATA_HDR_SIZE		(sizeof(struct rxr_data_hdr))
-
-struct rxr_data_pkt {
-	struct rxr_data_hdr hdr;
-	char data[];
-};
-
 static inline
-struct rxr_data_pkt *rxr_get_data_pkt(void *pkt)
+struct rxr_data_hdr *rxr_get_data_hdr(void *pkt)
 {
-	return (struct rxr_data_pkt *)pkt;
+	return (struct rxr_data_hdr *)pkt;
 }
 
-ssize_t rxr_pkt_send_data(struct rxr_ep *ep,
-			  struct rxr_tx_entry *tx_entry,
-			  struct rxr_pkt_entry *pkt_entry);
+int rxr_pkt_init_data(struct rxr_ep *ep,
+		      struct rxr_tx_entry *tx_entry,
+		      struct rxr_pkt_entry *pkt_entry);
 
-ssize_t rxr_pkt_send_data_desc(struct rxr_ep *ep,
-			       struct rxr_tx_entry *tx_entry,
-			       struct rxr_pkt_entry *pkt_entry);
+void rxr_pkt_handle_data_sent(struct rxr_ep *ep,
+			      struct rxr_pkt_entry *pkt_entry);
 
 void rxr_pkt_proc_data(struct rxr_ep *ep,
 		       struct rxr_rx_entry *rx_entry,
@@ -243,41 +123,15 @@ void rxr_pkt_proc_data(struct rxr_ep *ep,
 void rxr_pkt_handle_data_send_completion(struct rxr_ep *ep,
 					 struct rxr_pkt_entry *pkt_entry);
 
-
 void rxr_pkt_handle_data_recv(struct rxr_ep *ep,
 			      struct rxr_pkt_entry *pkt_entry);
 
-/*
- *  READRSP packet data structures and functions
- *  The definition of functions are in rxr_pkt_type_misc.c
- */
-struct rxr_readrsp_hdr {
-	uint8_t type;
-	uint8_t version;
-	uint16_t flags;
-	/* end of rxr_base_hdr */
-	uint8_t pad[4];
-	uint32_t rx_id;
-	uint32_t tx_id;
-	uint64_t seg_size;
-};
-
+/* READRSP packet related functions */
 static inline struct rxr_readrsp_hdr *rxr_get_readrsp_hdr(void *pkt)
 {
 	return (struct rxr_readrsp_hdr *)pkt;
 }
 
-#define RXR_READRSP_HDR_SIZE	(sizeof(struct rxr_readrsp_hdr))
-
-#if defined(static_assert) && defined(__x86_64__)
-static_assert(sizeof(struct rxr_readrsp_hdr) == sizeof(struct rxr_data_hdr), "rxr_readrsp_hdr check");
-#endif
-
-struct rxr_readrsp_pkt {
-	struct rxr_readrsp_hdr hdr;
-	char data[];
-};
-
 int rxr_pkt_init_readrsp(struct rxr_ep *ep,
 			 struct rxr_tx_entry *tx_entry,
 			 struct rxr_pkt_entry *pkt_entry);
@@ -323,24 +177,7 @@ void rxr_pkt_init_read_context(struct rxr_ep *rxr_ep,
 void rxr_pkt_handle_rma_completion(struct rxr_ep *ep,
 				   struct rxr_pkt_entry *pkt_entry);
 
-/*
- *  EOR packet, used to acknowledge the sender that large message
- *  copy has been finished.
- *  Implementaion of the functions are in rxr_pkt_misc.c
- */
-struct rxr_eor_hdr {
-	uint8_t type;
-	uint8_t version;
-	uint16_t flags;
-	/* end of rxr_base_hdr */
-	uint32_t tx_id;
-	uint32_t rx_id;
-};
-
-#if defined(static_assert) && defined(__x86_64__)
-static_assert(sizeof(struct rxr_eor_hdr) == 12, "rxr_eor_hdr check");
-#endif
-
+/* EOR packet related functions */
 static inline
 struct rxr_eor_hdr *rxr_get_eor_hdr(void *pkt)
 {
@@ -360,35 +197,12 @@ void rxr_pkt_handle_eor_send_completion(struct rxr_ep *ep,
 void rxr_pkt_handle_eor_recv(struct rxr_ep *ep,
 			     struct rxr_pkt_entry *pkt_entry);
 
-/* atomrsp types */
-struct rxr_atomrsp_hdr {
-	uint8_t type;
-	uint8_t version;
-	uint16_t flags;
-	/* end of rxr_base_hdr */
-	uint8_t pad[4];
-	uint32_t rx_id;
-	uint32_t tx_id;
-	uint64_t seg_size;
-};
-
-#if defined(static_assert) && defined(__x86_64__)
-static_assert(sizeof(struct rxr_atomrsp_hdr) == 24, "rxr_atomrsp_hdr check");
-#endif
-
-#define RXR_ATOMRSP_HDR_SIZE	(sizeof(struct rxr_atomrsp_hdr))
-
-struct rxr_atomrsp_pkt {
-	struct rxr_atomrsp_hdr hdr;
-	char data[];
-};
-
+/* ATOMRSP packet related functions */
 static inline struct rxr_atomrsp_hdr *rxr_get_atomrsp_hdr(void *pkt)
 {
 	return (struct rxr_atomrsp_hdr *)pkt;
 }
 
-/* atomrsp functions: init, handle_sent, handle_send_completion, recv */
 int rxr_pkt_init_atomrsp(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry,
 			 struct rxr_pkt_entry *pkt_entry);
 
@@ -398,6 +212,48 @@ void rxr_pkt_handle_atomrsp_send_completion(struct rxr_ep *ep, struct rxr_pkt_en
 
 void rxr_pkt_handle_atomrsp_recv(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry);
 
+/* RECEIPT packet related functions */
+static inline
+struct rxr_receipt_hdr *rxr_get_receipt_hdr(void *pkt)
+{
+	return (struct rxr_receipt_hdr *)pkt;
+}
+
+int rxr_pkt_init_receipt(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry,
+			 struct rxr_pkt_entry *pkt_entry);
+
+void rxr_pkt_handle_receipt_sent(struct rxr_ep *ep,
+				 struct rxr_pkt_entry *pkt_entry);
+
+void rxr_pkt_handle_receipt_send_completion(struct rxr_ep *ep,
+					    struct rxr_pkt_entry *pkt_entry);
+
+void rxr_pkt_handle_receipt_recv(struct rxr_ep *ep,
+				 struct rxr_pkt_entry *pkt_entry);
+
+/* General packet type helper functions */
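+/* RTW, RTR and RTA packet types carry an rma_iov array in their headers */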
+static inline
+int rxr_pkt_type_contains_rma_iov(int pkt_type)
+{
+	switch (pkt_type) {
+	case RXR_EAGER_RTW_PKT:
+	case RXR_DC_EAGER_RTW_PKT:
+	case RXR_LONGCTS_RTW_PKT:
+	case RXR_DC_LONGCTS_RTW_PKT:
+	case RXR_LONGREAD_RTW_PKT:
+	case RXR_SHORT_RTR_PKT:
+	case RXR_LONGCTS_RTR_PKT:
+	case RXR_WRITE_RTA_PKT:
+	case RXR_DC_WRITE_RTA_PKT:
+	case RXR_FETCH_RTA_PKT:
+	case RXR_COMPARE_RTA_PKT:
+		return 1;
+	default:
+		return 0;
+	}
+}
 #endif
 
 #include "rxr_pkt_type_req.h"
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_base.c b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_base.c
new file mode 100644
index 0000000000000000000000000000000000000000..2840199de393f1f0f17d4f491c17f4fbd7cc356b
--- /dev/null
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_base.c
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2021 Amazon.com, Inc. or its affiliates.
+ * All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "efa.h"
+#include "rxr_read.h"
+#include "rxr_pkt_cmd.h"
+
+/**
+ * @brief return the optional connid header pointer in a packet
+ *
+ * @param[in]	pkt_entry	a packet entry
+ * @return	If the packet has the optional connid header, return a pointer to it;
+ * 		otherwise, return NULL.
+ */
+uint32_t *rxr_pkt_connid_ptr(struct rxr_pkt_entry *pkt_entry)
+{
+	struct rxr_base_hdr *base_hdr;
+
+	base_hdr = rxr_get_base_hdr(pkt_entry->pkt);
+
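+	/*
+	 * REQ packets use a different optional header layout, so locating
+	 * their connid is delegated to rxr_pkt_req_connid_ptr().
+	 */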
+	if (base_hdr->type >= RXR_REQ_PKT_BEGIN)
+		return rxr_pkt_req_connid_ptr(pkt_entry);
+
+	if (!(base_hdr->flags & RXR_PKT_CONNID_HDR))
+		return NULL;
+
+	switch (base_hdr->type) {
+	case RXR_CTS_PKT:
+		return &(rxr_get_cts_hdr(pkt_entry->pkt)->connid);
+
+	case RXR_RECEIPT_PKT:
+		return &(rxr_get_receipt_hdr(pkt_entry->pkt)->connid);
+
+	case RXR_DATA_PKT:
+		return &(rxr_get_data_hdr(pkt_entry->pkt)->connid_hdr->connid);
+
+	case RXR_READRSP_PKT:
+		return &(rxr_get_readrsp_hdr(pkt_entry->pkt)->connid);
+
+	case RXR_ATOMRSP_PKT:
+		return &(rxr_get_atomrsp_hdr(pkt_entry->pkt)->connid);
+
+	case RXR_EOR_PKT:
+		return &rxr_get_eor_hdr(pkt_entry->pkt)->connid;
+
+	case RXR_HANDSHAKE_PKT:
+		return &(rxr_get_handshake_opt_connid_hdr(pkt_entry->pkt)->connid);
+
+	default:
+		FI_WARN(&rxr_prov, FI_LOG_CQ, "unknown packet type: %d\n", base_hdr->type);
+		assert(0 && "Unknown packet type");
+	}
+
+	return NULL;
+}
+
+/**
+ * @brief set up data in a packet entry using tx_entry information, such that the packet is ready to be sent.
+ *        Depending on the tx_entry, this function either copies data into the packet entry, or points
+ *        pkt_entry->iov to tx_entry->iov.
+ *        It requires the packet header to be set.
+ *
+ * @param[in]		ep		endpoint.
+ * @param[in,out]	pkt_entry	packet entry. The header must have been set when the function is called.
+ * @param[in]		hdr_size	packet header size.
+ * @param[in]		tx_entry	This function uses iov, iov_count and desc of tx_entry.
+ * @param[in]		data_offset	offset of the data to be set up, relative to tx_entry->total_len.
+ * @param[in]		data_size	length of the data to be set up, relative to tx_entry->total_len.
+ * @return		0 on success, a negative libfabric error code on error
+ */
+int rxr_pkt_init_data_from_tx_entry(struct rxr_ep *ep,
+				    struct rxr_pkt_entry *pkt_entry,
+				    size_t hdr_size,
+				    struct rxr_tx_entry *tx_entry,
+				    size_t data_offset,
+				    size_t data_size)
+{
+	int tx_iov_index;
+	char *data;
+	size_t tx_iov_offset, copied;
+	struct efa_mr *desc;
+
+	assert(hdr_size > 0);
+
+	pkt_entry->x_entry = tx_entry;
+	/* pkt_sendv_pool's size equals efa_tx_pkt_pool size plus
+	 * shm_tx_pkt_pool size, so as long as we have a pkt_entry,
+	 * allocating pkt_entry->send should succeed.
+	 */
+	pkt_entry->send = ofi_buf_alloc(ep->pkt_sendv_pool);
+	if (!pkt_entry->send) {
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "failed to allocate pkt_entry->send\n");
+		assert(pkt_entry->send);
+		return -FI_ENOMEM;
+	}
+
+	if (data_size == 0) {
+		pkt_entry->send->iov_count = 0;
+		pkt_entry->pkt_size = hdr_size;
+		return 0;
+	}
+
+	rxr_locate_iov_pos(tx_entry->iov, tx_entry->iov_count, data_offset,
+			   &tx_iov_index, &tx_iov_offset);
+	desc = tx_entry->desc[0];
+	assert(tx_iov_index < tx_entry->iov_count);
+	assert(tx_iov_offset < tx_entry->iov[tx_iov_index].iov_len);
+
+	/*
+	 * The copy can be avoided if the following two conditions are true:
+	 * 1. the user provided a memory descriptor, or the message is sent via the
+	 *    shm provider (which does not require a memory descriptor)
+	 * 2. the data to be sent fits in one iov, because the device only supports
+	 *    two iovs and we use the first one for the header.
+	 */
+	if ((!pkt_entry->mr || tx_entry->desc[tx_iov_index]) &&
+	    (tx_iov_offset + data_size <= tx_entry->iov[tx_iov_index].iov_len)) {
+
+		assert(ep->core_iov_limit >= 2);
+		pkt_entry->send->iov[0].iov_base = pkt_entry->pkt;
+		pkt_entry->send->iov[0].iov_len = hdr_size;
+		pkt_entry->send->desc[0] = pkt_entry->mr ? fi_mr_desc(pkt_entry->mr) : NULL;
+
+		pkt_entry->send->iov[1].iov_base = (char *)tx_entry->iov[tx_iov_index].iov_base + tx_iov_offset;
+		pkt_entry->send->iov[1].iov_len = data_size;
+		pkt_entry->send->desc[1] = tx_entry->desc[tx_iov_index];
+		pkt_entry->send->iov_count = 2;
+		pkt_entry->pkt_size = hdr_size + data_size;
+		return 0;
+	}
+
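+	/*
+	 * Otherwise, gather the data into the packet buffer right after
+	 * the header, so the send needs only a single contiguous buffer.
+	 */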
+	data = pkt_entry->pkt + hdr_size;
+	copied = ofi_copy_from_hmem_iov(data,
+					data_size,
+					desc ? desc->peer.iface : FI_HMEM_SYSTEM,
+					desc ? desc->peer.device.reserved : 0,
+					tx_entry->iov,
+					tx_entry->iov_count,
+					data_offset);
+	assert(copied == data_size);
+	pkt_entry->send->iov_count = 0;
+	pkt_entry->pkt_size = hdr_size + copied;
+	return 0;
+}
+
+/**
+ * @brief return the data size in a packet entry
+ *
+ * @param[in]	pkt_entry	packet entry
+ * @return	the data size in the packet entry;
+ * 		0 if the packet entry does not contain data.
+ */
+size_t rxr_pkt_data_size(struct rxr_pkt_entry *pkt_entry)
+{
+	int pkt_type;
+
+	assert(pkt_entry);
+	pkt_type = rxr_get_base_hdr(pkt_entry->pkt)->type;
+
+	if (pkt_type == RXR_DATA_PKT)
+		return rxr_get_data_hdr(pkt_entry->pkt)->seg_length;
+
+	if (pkt_type == RXR_READRSP_PKT)
+		return rxr_get_readrsp_hdr(pkt_entry->pkt)->seg_length;
+
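+	/*
+	 * For REQ packets that carry data, the payload is whatever follows
+	 * the (variable-length) REQ header.
+	 */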
+	if (pkt_type >= RXR_REQ_PKT_BEGIN) {
+		assert(pkt_type == RXR_EAGER_MSGRTM_PKT || pkt_type == RXR_EAGER_TAGRTM_PKT ||
+		       pkt_type == RXR_MEDIUM_MSGRTM_PKT || pkt_type == RXR_MEDIUM_TAGRTM_PKT ||
+		       pkt_type == RXR_LONGCTS_MSGRTM_PKT || pkt_type == RXR_LONGCTS_TAGRTM_PKT ||
+		       pkt_type == RXR_EAGER_RTW_PKT ||
+		       pkt_type == RXR_LONGCTS_RTW_PKT ||
+		       pkt_type == RXR_DC_EAGER_MSGRTM_PKT ||
+		       pkt_type == RXR_DC_EAGER_TAGRTM_PKT ||
+		       pkt_type == RXR_DC_MEDIUM_MSGRTM_PKT ||
+		       pkt_type == RXR_DC_MEDIUM_TAGRTM_PKT ||
+		       pkt_type == RXR_DC_LONGCTS_MSGRTM_PKT ||
+		       pkt_type == RXR_DC_LONGCTS_TAGRTM_PKT ||
+		       pkt_type == RXR_DC_EAGER_RTW_PKT ||
+		       pkt_type == RXR_DC_LONGCTS_RTW_PKT);
+
+		return pkt_entry->pkt_size - rxr_pkt_req_hdr_size(pkt_entry);
+	}
+
+	/* other packet types do not contain data, so return 0 */
+	return 0;
+}
+
+/**
+ * @brief copy data to the receive buffer and update the counter in rx_entry.
+ *
+ * If the receive buffer is in GPU memory, this posts a local
+ * read request. Otherwise it copies the data directly and calls
+ * rxr_pkt_handle_data_copied().
+ *
+ * @param[in]		ep		endpoint
+ * @param[in,out]	rx_entry	rx_entry contains information of the receive
+ *                      	        op. This function uses the receive buffer in it.
+ * @param[in]		data_offset	the offset of the data with respect
+ *					to the receive buffer.
+ * @param[in]		pkt_entry	the packet entry that contains the data
+ * @param[in]		data		pointer to the beginning of the data
+ * @param[in]		data_size	the length of the data
+ * @return		On success, return 0.
+ * 			On failure, return a negative libfabric error code.
+ */
+ssize_t rxr_pkt_copy_data_to_rx_entry(struct rxr_ep *ep,
+				      struct rxr_rx_entry *rx_entry,
+				      size_t data_offset,
+				      struct rxr_pkt_entry *pkt_entry,
+				      char *data, size_t data_size)
+{
+	ssize_t err, bytes_copied;
+
+	pkt_entry->x_entry = rx_entry;
+
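+	/*
+	 * A CUDA receive buffer is filled by posting a local read, which
+	 * completes asynchronously; host buffers are copied inline below.
+	 */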
+	if (data_size > 0 && efa_ep_is_cuda_mr(rx_entry->desc[0])) {
+		err = rxr_read_post_local_read_or_queue(ep, rx_entry, data_offset,
+							pkt_entry, data, data_size);
+		if (err)
+			FI_WARN(&rxr_prov, FI_LOG_CQ, "cannot post read to copy data\n");
+
+		return err;
+	}
+
+	if (OFI_LIKELY(!(rx_entry->rxr_flags & RXR_RECV_CANCEL)) &&
+	    rx_entry->cq_entry.len > data_offset && data_size > 0) {
+		bytes_copied = ofi_copy_to_iov(rx_entry->iov,
+					       rx_entry->iov_count,
+					       data_offset + ep->msg_prefix_size,
+					       data,
+					       data_size);
+		if (bytes_copied != MIN(data_size, rx_entry->cq_entry.len - data_offset)) {
+			FI_WARN(&rxr_prov, FI_LOG_CQ, "wrong size! bytes_copied: %ld\n",
+				bytes_copied);
+			return -FI_EIO;
+		}
+	}
+
+	rxr_pkt_handle_data_copied(ep, pkt_entry, data_size);
+	return 0;
+}
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_base.h b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..18f3fe44c1f296a2bd66b0fc7dd7468f446e3c88
--- /dev/null
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_base.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Amazon.com, Inc. or its affiliates.
+ * All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _RXR_PKT_TYPE_BASE_H
+#define _RXR_PKT_TYPE_BASE_H
+
+#include "rxr.h"
+
+uint32_t *rxr_pkt_connid_ptr(struct rxr_pkt_entry *pkt_entry);
+
+int rxr_pkt_init_data_from_tx_entry(struct rxr_ep *ep,
+				    struct rxr_pkt_entry *pkt_entry,
+				    size_t hdr_size,
+				    struct rxr_tx_entry *tx_entry,
+				    size_t data_offset, size_t data_size);
+
+ssize_t rxr_pkt_copy_data_to_rx_entry(struct rxr_ep *ep,
+				      struct rxr_rx_entry *rx_entry,
+				      size_t data_offset,
+				      struct rxr_pkt_entry *pkt_entry,
+				      char *data, size_t data_size);
+
+size_t rxr_pkt_data_size(struct rxr_pkt_entry *pkt_entry);
+
+#endif
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_data.c b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_data.c
index 477b530209ba82cdb4432c41d68a68ad3dc85ca7..ceb941a0325dd25ab8cb42f57848cec0c8d0f50e 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_data.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_data.c
@@ -35,195 +35,63 @@
 #include "rxr.h"
 #include "rxr_msg.h"
 #include "rxr_pkt_cmd.h"
+#include "rxr_pkt_type_base.h"
 
-/*
- * This function contains data packet related functions
- * Data packet is used by long message protocol.
- */
-
-/*
- * Functions to send data packet, including
- */
-
-ssize_t rxr_pkt_send_data(struct rxr_ep *ep,
-			  struct rxr_tx_entry *tx_entry,
-			  struct rxr_pkt_entry *pkt_entry)
+int rxr_pkt_init_data(struct rxr_ep *ep,
+		      struct rxr_tx_entry *tx_entry,
+		      struct rxr_pkt_entry *pkt_entry)
 {
-	uint64_t payload_size, copied_size;
-	struct rxr_data_pkt *data_pkt;
-	struct efa_mr *desc;
-
-	pkt_entry->x_entry = (void *)tx_entry;
-	pkt_entry->addr = tx_entry->addr;
-	desc = tx_entry->desc[0];
-
-	payload_size = MIN(tx_entry->total_len - tx_entry->bytes_sent,
-			   ep->max_data_payload_size);
-	payload_size = MIN(payload_size, tx_entry->window);
-
-	data_pkt = (struct rxr_data_pkt *)pkt_entry->pkt;
-	data_pkt->hdr.seg_size = payload_size;
+	struct rxr_data_hdr *data_hdr;
+	struct rdm_peer *peer;
+	size_t hdr_size;
+	int ret;
+
+	data_hdr = rxr_get_data_hdr(pkt_entry->pkt);
+	data_hdr->type = RXR_DATA_PKT;
+	data_hdr->version = RXR_PROTOCOL_VERSION;
+	data_hdr->flags = 0;
+	data_hdr->recv_id = tx_entry->rx_id;
+
+	hdr_size = sizeof(struct rxr_data_hdr);
+	peer = rxr_ep_get_peer(ep, tx_entry->addr);
+	assert(peer);
+	if (rxr_peer_need_connid(peer)) {
+		data_hdr->flags |= RXR_PKT_CONNID_HDR;
+		data_hdr->connid_hdr->connid = rxr_ep_raw_addr(ep)->qkey;
+		hdr_size += sizeof(struct rxr_data_opt_connid_hdr);
+	}
 
-	copied_size = ofi_copy_from_hmem_iov(data_pkt->data,
-					     payload_size,
-					     desc ? desc->peer.iface : FI_HMEM_SYSTEM,
-					     desc ? desc->peer.device.reserved : 0,
-					     tx_entry->iov,
-					     tx_entry->iov_count,
-					     tx_entry->bytes_sent);
-	assert(copied_size == payload_size);
+	/*
+	 * Data packets are sent in order, so using bytes_sent is okay here.
+	 */
+	data_hdr->seg_offset = tx_entry->bytes_sent;
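+	/*
+	 * seg_length is limited by both the maximum payload of a single
+	 * packet and the receiver-granted window from the latest CTS.
+	 */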
+	data_hdr->seg_length = MIN(tx_entry->total_len - tx_entry->bytes_sent,
+				   ep->max_data_payload_size);
+	data_hdr->seg_length = MIN(data_hdr->seg_length, tx_entry->window);
+	ret = rxr_pkt_init_data_from_tx_entry(ep, pkt_entry, hdr_size,
+					      tx_entry, tx_entry->bytes_sent,
+					      data_hdr->seg_length);
+	if (ret)
+		return ret;
 
-	pkt_entry->pkt_size = copied_size + sizeof(struct rxr_data_hdr);
+	pkt_entry->x_entry = (void *)tx_entry;
 	pkt_entry->addr = tx_entry->addr;
 
-	return rxr_pkt_entry_send_with_flags(ep, pkt_entry, pkt_entry->addr,
-					     tx_entry->send_flags);
-}
-
-/*
- * Copies all consecutive small iov's into one buffer. If the function reaches
- * an iov greater than the max memcpy size, it will end, only copying up to
- * that iov.
- */
-static size_t rxr_copy_from_iov(void *buf, uint64_t remaining_len,
-				struct rxr_tx_entry *tx_entry)
-{
-	struct iovec *tx_iov = tx_entry->iov;
-	uint64_t done = 0, len;
-
-	while (tx_entry->iov_index < tx_entry->iov_count &&
-	       done < remaining_len) {
-		len = tx_iov[tx_entry->iov_index].iov_len;
-		if (tx_entry->mr[tx_entry->iov_index])
-			break;
-
-		len -= tx_entry->iov_offset;
-
-		/*
-		 * If the amount to be written surpasses the remaining length,
-		 * copy up to the remaining length and return, else copy the
-		 * entire iov and continue.
-		 */
-		if (done + len > remaining_len) {
-			len = remaining_len - done;
-			memcpy((char *)buf + done,
-			       (char *)tx_iov[tx_entry->iov_index].iov_base +
-			       tx_entry->iov_offset, len);
-			tx_entry->iov_offset += len;
-			done += len;
-			break;
-		}
-		memcpy((char *)buf + done,
-		       (char *)tx_iov[tx_entry->iov_index].iov_base +
-		       tx_entry->iov_offset, len);
-		tx_entry->iov_index++;
-		tx_entry->iov_offset = 0;
-		done += len;
-	}
-	return done;
+	return 0;
 }
 
-ssize_t rxr_pkt_send_data_desc(struct rxr_ep *ep,
-			       struct rxr_tx_entry *tx_entry,
-			       struct rxr_pkt_entry *pkt_entry)
+void rxr_pkt_handle_data_sent(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
 {
-	struct rxr_data_pkt *data_pkt;
-	/* The user's iov */
-	struct iovec *tx_iov = tx_entry->iov;
-	/* The constructed iov to be passed to sendv
-	 * and corresponding fid_mrs
-	 */
-	struct iovec iov[ep->core_iov_limit];
-	void *desc[ep->core_iov_limit];
-	/* Constructed iov's total size */
-	uint64_t payload_size = 0;
-	/* pkt_entry offset to write data into */
-	uint64_t pkt_used = 0;
-	uint64_t orig_iov_index;
-	uint64_t orig_iov_offset;
-	/* Remaining size that can fit in the constructed iov */
-	uint64_t remaining_len = MIN(tx_entry->window,
-				     ep->max_data_payload_size);
-	/* The constructed iov's index */
-	size_t i = 0;
-	size_t len = 0;
-
-	ssize_t ret;
-
-	orig_iov_index = tx_entry->iov_index;
-	orig_iov_offset = tx_entry->iov_offset;
-
-	data_pkt = (struct rxr_data_pkt *)pkt_entry->pkt;
-	/* Assign packet header in constructed iov */
-	iov[i].iov_base = rxr_pkt_start(pkt_entry);
-	iov[i].iov_len = sizeof(struct rxr_data_hdr);
-	desc[i] = rxr_ep_mr_local(ep) ? fi_mr_desc(pkt_entry->mr) : NULL;
-	i++;
-
-	/*
-	 * Loops until payload size is at max, all user iovs are sent, the
-	 * constructed iov count is greater than the core iov limit, or the tx
-	 * entry window is exhausted.  Each iteration fills one entry of the
-	 * iov to be sent.
-	 */
-	while (tx_entry->iov_index < tx_entry->iov_count &&
-	       remaining_len > 0 && i < ep->core_iov_limit) {
-		if (!rxr_ep_mr_local(ep) || tx_entry->desc[tx_entry->iov_index]) {
-			iov[i].iov_base =
-				(char *)tx_iov[tx_entry->iov_index].iov_base +
-				tx_entry->iov_offset;
-			if (rxr_ep_mr_local(ep))
-				desc[i] = tx_entry->desc[tx_entry->iov_index];
-
-			len = tx_iov[tx_entry->iov_index].iov_len
-			      - tx_entry->iov_offset;
-			if (len > remaining_len) {
-				len = remaining_len;
-				tx_entry->iov_offset += len;
-			} else {
-				tx_entry->iov_index++;
-				tx_entry->iov_offset = 0;
-			}
-			iov[i].iov_len = len;
-		} else {
-			/* It should be noted for cuda buffer, caller will always
-			 * provide desc, and will not enter this branch.
-			 *
-			 * Copies any consecutive small iov's, returning size
-			 * written while updating iov index and offset
-			 */
-
-			len = rxr_copy_from_iov((char *)data_pkt->data +
-						 pkt_used,
-						 remaining_len,
-						 tx_entry);
+	struct rxr_tx_entry *tx_entry;
+	struct rxr_data_hdr *data_hdr;
 
-			iov[i].iov_base = (char *)data_pkt->data + pkt_used;
-			iov[i].iov_len = len;
-			desc[i] = fi_mr_desc(pkt_entry->mr);
-			pkt_used += len;
-		}
-		payload_size += len;
-		remaining_len -= len;
-		i++;
-	}
-	data_pkt->hdr.seg_size = (uint16_t)payload_size;
-	pkt_entry->pkt_size = payload_size + RXR_DATA_HDR_SIZE;
-	pkt_entry->x_entry = tx_entry;
-	pkt_entry->addr = tx_entry->addr;
+	data_hdr = rxr_get_data_hdr(pkt_entry->pkt);
+	assert(data_hdr->seg_length > 0);
 
-	FI_DBG(&rxr_prov, FI_LOG_EP_DATA,
-	       "Sending an iov count, %zu with payload size: %lu.\n",
-	       i, payload_size);
-	ret = rxr_pkt_entry_sendv(ep, pkt_entry, tx_entry->addr,
-				  (const struct iovec *)iov,
-				  desc, i, tx_entry->send_flags);
-	if (OFI_UNLIKELY(ret)) {
-		/* Reset tx_entry iov pointer on send failure. */
-		tx_entry->iov_index = orig_iov_index;
-		tx_entry->iov_offset = orig_iov_offset;
-	}
-	return ret;
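+	/*
+	 * Advance the send progress: bytes_sent counts what has been
+	 * posted; bytes_acked is advanced separately, on send completion.
+	 */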
+	tx_entry = pkt_entry->x_entry;
+	tx_entry->bytes_sent += data_hdr->seg_length;
+	tx_entry->window -= data_hdr->seg_length;
+	assert(tx_entry->window >= 0);
 }
 
 void rxr_pkt_handle_data_send_completion(struct rxr_ep *ep,
@@ -233,10 +101,28 @@ void rxr_pkt_handle_data_send_completion(struct rxr_ep *ep,
 
 	tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry;
 	tx_entry->bytes_acked +=
-		rxr_get_data_pkt(pkt_entry->pkt)->hdr.seg_size;
-
-	if (tx_entry->total_len == tx_entry->bytes_acked)
-		rxr_cq_handle_tx_completion(ep, tx_entry);
+		rxr_get_data_hdr(pkt_entry->pkt)->seg_length;
+
+	if (tx_entry->total_len == tx_entry->bytes_acked) {
+		if (!(tx_entry->rxr_flags & RXR_DELIVERY_COMPLETE_REQUESTED))
+			rxr_cq_handle_tx_completion(ep, tx_entry);
+		else
+			if (tx_entry->rxr_flags & RXR_RECEIPT_RECEIVED)
+				/*
+				 * For the long message protocol, when
+				 * FI_DELIVERY_COMPLETE is requested, the tx
+				 * completion must be written in either
+				 * rxr_pkt_handle_data_send_completion() or
+				 * rxr_pkt_handle_receipt_recv(), whichever
+				 * is called later, to avoid accessing a
+				 * released tx_entry.
+				 */
+				rxr_cq_handle_tx_completion(ep, tx_entry);
+	}
 }
 
 /*
@@ -253,7 +139,7 @@ void rxr_pkt_proc_data(struct rxr_ep *ep,
 		       char *data, size_t seg_offset,
 		       size_t seg_size)
 {
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 	bool all_received = 0;
 	ssize_t err;
 
@@ -267,6 +153,7 @@ void rxr_pkt_proc_data(struct rxr_ep *ep,
 	all_received = (rx_entry->bytes_received == rx_entry->total_len);
 
 	peer = rxr_ep_get_peer(ep, rx_entry->addr);
+	assert(peer);
 	peer->rx_credits += ofi_div_ceil(seg_size, ep->max_data_payload_size);
 
 	rx_entry->window -= seg_size;
@@ -274,20 +161,20 @@ void rxr_pkt_proc_data(struct rxr_ep *ep,
 		ep->available_data_bufs++;
 
 #if ENABLE_DEBUG
-	/* rx_entry can be released by rxr_pkt_copy_to_rx
+	/* rx_entry can be released by rxr_pkt_copy_data_to_rx_entry
 	 * so the call to dlist_remove must happen before
-	 * call to rxr_copy_to_rx
+	 * call to rxr_copy_data_to_rx_entry
 	 */
 	if (all_received) {
 		dlist_remove(&rx_entry->rx_pending_entry);
 		ep->rx_pending--;
 	}
 #endif
-	err = rxr_pkt_copy_to_rx(ep, rx_entry, seg_offset,
-				 pkt_entry, data, seg_size);
+	err = rxr_pkt_copy_data_to_rx_entry(ep, rx_entry, seg_offset,
+					    pkt_entry, data, seg_size);
 	if (err) {
 		rxr_pkt_entry_release_rx(ep, pkt_entry);
-		rxr_cq_handle_rx_error(ep, rx_entry, err);
+		rxr_cq_write_rx_error(ep, rx_entry, -err, -err);
 	}
 
 	if (all_received)
@@ -298,7 +185,7 @@ void rxr_pkt_proc_data(struct rxr_ep *ep,
 		err = rxr_pkt_post_ctrl_or_queue(ep, RXR_RX_ENTRY, rx_entry, RXR_CTS_PKT, 0);
 		if (err) {
 			FI_WARN(&rxr_prov, FI_LOG_CQ, "post CTS packet failed!\n");
-			rxr_cq_handle_rx_error(ep, rx_entry, err);
+			rxr_cq_write_rx_error(ep, rx_entry, -err, -err);
 		}
 	}
 }
@@ -306,18 +193,23 @@ void rxr_pkt_proc_data(struct rxr_ep *ep,
 void rxr_pkt_handle_data_recv(struct rxr_ep *ep,
 			      struct rxr_pkt_entry *pkt_entry)
 {
-	struct rxr_data_pkt *data_pkt;
+	struct rxr_data_hdr *data_hdr;
 	struct rxr_rx_entry *rx_entry;
+	size_t hdr_size;
 
-	data_pkt = (struct rxr_data_pkt *)pkt_entry->pkt;
+	data_hdr = rxr_get_data_hdr(pkt_entry->pkt);
 
 	rx_entry = ofi_bufpool_get_ibuf(ep->rx_entry_pool,
-					data_pkt->hdr.rx_id);
+					data_hdr->recv_id);
+
+	hdr_size = sizeof(struct rxr_data_hdr);
+	if (data_hdr->flags & RXR_PKT_CONNID_HDR)
+		hdr_size += sizeof(struct rxr_data_opt_connid_hdr);
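+	/* the payload starts right after the (possibly extended) header */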
 
 	rxr_pkt_proc_data(ep, rx_entry,
 			  pkt_entry,
-			  data_pkt->data,
-			  data_pkt->hdr.seg_offset,
-			  data_pkt->hdr.seg_size);
+			  pkt_entry->pkt + hdr_size,
+			  data_hdr->seg_offset,
+			  data_hdr->seg_length);
 }
 
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_misc.c b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_misc.c
index 80566dbf31d34e073297e0e48cc7aa0fd21adc20..ae7dd6c8d473fb0a7037aeebda4739baf5c0ce31 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_misc.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_misc.c
@@ -36,6 +36,7 @@
 #include "rxr_msg.h"
 #include "rxr_cntr.h"
 #include "rxr_pkt_cmd.h"
+#include "rxr_pkt_type_base.h"
 #include "rxr_read.h"
 
 /* This file define functons for the following packet type:
@@ -51,53 +52,111 @@ ssize_t rxr_pkt_init_handshake(struct rxr_ep *ep,
 			       struct rxr_pkt_entry *pkt_entry,
 			       fi_addr_t addr)
 {
+	int nex;
 	struct rxr_handshake_hdr *handshake_hdr;
+	struct rxr_handshake_opt_connid_hdr *connid_hdr;
 
 	handshake_hdr = (struct rxr_handshake_hdr *)pkt_entry->pkt;
 	handshake_hdr->type = RXR_HANDSHAKE_PKT;
-	handshake_hdr->version = RXR_BASE_PROTOCOL_VERSION;
+	handshake_hdr->version = RXR_PROTOCOL_VERSION;
 	handshake_hdr->flags = 0;
-	handshake_hdr->maxproto = RXR_CUR_PROTOCOL_VERSION;
-	memcpy(handshake_hdr->features, ep->features,
-	       RXR_NUM_PROTOCOL_VERSION * sizeof(uint64_t));
 
-	pkt_entry->pkt_size = sizeof(struct rxr_handshake_hdr)
-			      + RXR_NUM_PROTOCOL_VERSION * sizeof(uint64_t);
+	nex = (RXR_NUM_EXTRA_FEATURE_OR_REQUEST-1)/64 + 1;
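+	/* nex: number of 64-bit words needed to hold all extra feature/request bits */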
+	/*
+	 * Adding 3 is for backward compatibility.
+	 * See section 2.1 of the protocol v4 document for details.
+	 */
+	handshake_hdr->nextra_p3 = nex + 3;
+	memcpy(handshake_hdr->extra_info, ep->extra_info, nex * sizeof(uint64_t));
+	pkt_entry->pkt_size = sizeof(struct rxr_handshake_hdr) + nex * sizeof(uint64_t);
+
+	/*
+	 * Always include connid at the end of a handshake packet.
+	 * If the peer cannot make use of the connid, it will be ignored.
+	 */
+	connid_hdr = (struct rxr_handshake_opt_connid_hdr *)(pkt_entry->pkt + pkt_entry->pkt_size);
+	connid_hdr->connid = rxr_ep_raw_addr(ep)->qkey;
+	handshake_hdr->flags |= RXR_PKT_CONNID_HDR;
+	pkt_entry->pkt_size += sizeof(struct rxr_handshake_opt_connid_hdr);
+
 	pkt_entry->addr = addr;
 	return 0;
 }
 
-void rxr_pkt_post_handshake(struct rxr_ep *ep,
-			    struct rxr_peer *peer,
-			    fi_addr_t addr)
+/** @brief Post a handshake packet to a peer.
+ *
+ * @param ep The endpoint on which the handshake packet is sent out.
+ * @param peer The peer to which the handshake packet is posted.
+ * @return 0 on success, a negative fi_errno on error.
+ */
+ssize_t rxr_pkt_post_handshake(struct rxr_ep *ep, struct rdm_peer *peer)
 {
 	struct rxr_pkt_entry *pkt_entry;
+	fi_addr_t addr;
 	ssize_t ret;
 
-	assert(!(peer->flags & RXR_PEER_HANDSHAKE_SENT));
-
-	pkt_entry = rxr_pkt_entry_alloc(ep, ep->tx_pkt_efa_pool);
+	addr = peer->efa_fiaddr;
+	if (peer->is_local)
+		pkt_entry = rxr_pkt_entry_alloc(ep, ep->shm_tx_pkt_pool, RXR_PKT_FROM_SHM_TX_POOL);
+	else
+		pkt_entry = rxr_pkt_entry_alloc(ep, ep->efa_tx_pkt_pool, RXR_PKT_FROM_EFA_TX_POOL);
 	if (OFI_UNLIKELY(!pkt_entry))
-		return;
+		return -FI_EAGAIN;
 
 	rxr_pkt_init_handshake(ep, pkt_entry, addr);
 
-	/*
-	 * TODO: Once we start using a core's selective completion capability,
-	 * post the HANDSHAKE packets without FI_COMPLETION.
-	 */
-	ret = rxr_pkt_entry_send(ep, pkt_entry, addr);
-
-	/*
-	 * Skip sending this handshake on error and try again when processing the
-	 * next REQ from this peer containing the source information
-	 */
+	ret = rxr_pkt_entry_send(ep, pkt_entry, 0);
 	if (OFI_UNLIKELY(ret)) {
 		rxr_pkt_entry_release_tx(ep, pkt_entry);
-		if (ret == -FI_EAGAIN)
-			return;
-		FI_WARN(&rxr_prov, FI_LOG_CQ,
-			"Failed to send a HANDSHAKE packet: ret %zd\n", ret);
+	}
+	return ret;
+}
+
+/** @brief Post a handshake packet to a peer, or queue it for retry.
+ *
+ * This function ensures that an endpoint posts one and only one handshake
+ * to a peer.
+ *
+ * For a peer to which the endpoint has not yet attempted to send a
+ * handshake, it sends a handshake packet.
+ *
+ * If the send succeeds, the RXR_PEER_HANDSHAKE_SENT flag is set in peer->flags.
+ *
+ * If the send fails with FI_EAGAIN, the peer is added to
+ * rxr_ep->handshake_queued_peer_list and the handshake is resent later
+ * by the progress engine.
+ *
+ * If the send encounters any other failure, an EQ entry is written.
+ *
+ * To ensure only one handshake is sent to a peer, the function does not send
+ * a packet to a peer whose peer->flags has either RXR_PEER_HANDSHAKE_SENT or
+ * RXR_PEER_HANDSHAKE_QUEUED set.
+ *
+ * @param[in]	ep	The endpoint on which the handshake packet is sent out.
+ * @param[in]	peer	The peer to which the handshake packet is posted.
+ * @return 	void.
+ */
+void rxr_pkt_post_handshake_or_queue(struct rxr_ep *ep, struct rdm_peer *peer)
+{
+	ssize_t err;
+
+	if (peer->flags & (RXR_PEER_HANDSHAKE_SENT | RXR_PEER_HANDSHAKE_QUEUED))
+		return;
+
+	err = rxr_pkt_post_handshake(ep, peer);
+	if (OFI_UNLIKELY(err == -FI_EAGAIN)) {
+		/* add peer to handshake_queued_peer_list for retry later */
+		peer->flags |= RXR_PEER_HANDSHAKE_QUEUED;
+		dlist_insert_tail(&peer->handshake_queued_entry,
+				  &ep->handshake_queued_peer_list);
+		return;
+	}
+
+	if (OFI_UNLIKELY(err)) {
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+			"Failed to post HANDSHAKE to peer %ld: %s\n",
+			peer->efa_fiaddr, fi_strerror(-err));
+		efa_eq_write_error(&ep->util_ep, FI_EIO, -err);
 		return;
 	}
 
@@ -107,19 +166,23 @@ void rxr_pkt_post_handshake(struct rxr_ep *ep,
 void rxr_pkt_handle_handshake_recv(struct rxr_ep *ep,
 				   struct rxr_pkt_entry *pkt_entry)
 {
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 	struct rxr_handshake_hdr *handshake_pkt;
 
 	assert(pkt_entry->addr != FI_ADDR_NOTAVAIL);
 
 	peer = rxr_ep_get_peer(ep, pkt_entry->addr);
+	assert(peer);
 	assert(!(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED));
 
 	handshake_pkt = (struct rxr_handshake_hdr *)pkt_entry->pkt;
 
-	peer->maxproto = handshake_pkt->maxproto;
-	memcpy(peer->features, handshake_pkt->features,
-	       (handshake_pkt->maxproto - RXR_BASE_PROTOCOL_VERSION + 1) * sizeof(uint64_t));
+	/* nextra_p3 is the number of members in extra_info plus 3.
+	 * See section 2.1 of the protocol v4 document for details.
+	 */
+	peer->nextra_p3 = handshake_pkt->nextra_p3;
+	memcpy(peer->extra_info, handshake_pkt->extra_info,
+	       (handshake_pkt->nextra_p3 - 3) * sizeof(uint64_t));
 	peer->flags |= RXR_PEER_HANDSHAKE_RECEIVED;
 	FI_DBG(&rxr_prov, FI_LOG_CQ,
 	       "HANDSHAKE received from %" PRIu64 "\n", pkt_entry->addr);
@@ -128,7 +191,7 @@ void rxr_pkt_handle_handshake_recv(struct rxr_ep *ep,
 }
 
 /*  CTS packet related functions */
-void rxr_pkt_calc_cts_window_credits(struct rxr_ep *ep, struct rxr_peer *peer,
+void rxr_pkt_calc_cts_window_credits(struct rxr_ep *ep, struct rdm_peer *peer,
 				     uint64_t size, int request,
 				     int *window, int *credits)
 {
@@ -151,7 +214,7 @@ void rxr_pkt_calc_cts_window_credits(struct rxr_ep *ep, struct rxr_peer *peer,
 	 * number of credits are allocated to the transfer so the sender can
 	 * make progress.
 	 */
-	*credits = MIN(MIN(ep->available_data_bufs, ep->posted_bufs_efa),
+	*credits = MIN(MIN(ep->available_data_bufs, ep->efa_rx_pkts_posted),
 		       peer->rx_credits);
 	*credits = MIN(request, *credits);
 	*credits = MAX(*credits, rxr_env.tx_min_credits);
@@ -166,27 +229,36 @@ ssize_t rxr_pkt_init_cts(struct rxr_ep *ep,
 {
 	int window = 0;
 	struct rxr_cts_hdr *cts_hdr;
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 	size_t bytes_left;
 
 	cts_hdr = (struct rxr_cts_hdr *)pkt_entry->pkt;
 	cts_hdr->type = RXR_CTS_PKT;
-	cts_hdr->version = RXR_BASE_PROTOCOL_VERSION;
+	cts_hdr->version = RXR_PROTOCOL_VERSION;
 	cts_hdr->flags = 0;
 
 	if (rx_entry->cq_entry.flags & FI_READ)
 		cts_hdr->flags |= RXR_CTS_READ_REQ;
 
-	cts_hdr->tx_id = rx_entry->tx_id;
-	cts_hdr->rx_id = rx_entry->rx_id;
+	cts_hdr->send_id = rx_entry->tx_id;
+	cts_hdr->recv_id = rx_entry->rx_id;
 
 	bytes_left = rx_entry->total_len - rx_entry->bytes_received;
 	peer = rxr_ep_get_peer(ep, rx_entry->addr);
+	assert(peer);
 	rxr_pkt_calc_cts_window_credits(ep, peer, bytes_left,
 					rx_entry->credit_request,
 					&window, &rx_entry->credit_cts);
-	cts_hdr->window = window;
+	cts_hdr->recv_length = window;
 	pkt_entry->pkt_size = sizeof(struct rxr_cts_hdr);
+
+	/*
+	 * Always set the connid header. If the peer does not need it,
+	 * it will be ignored.
+	 */
+	cts_hdr->flags |= RXR_PKT_CONNID_HDR;
+	cts_hdr->connid = rxr_ep_raw_addr(ep)->qkey;
+
 	pkt_entry->addr = rx_entry->addr;
 	pkt_entry->x_entry = (void *)rx_entry;
 	return 0;
@@ -198,7 +270,7 @@ void rxr_pkt_handle_cts_sent(struct rxr_ep *ep,
 	struct rxr_rx_entry *rx_entry;
 
 	rx_entry = (struct rxr_rx_entry *)pkt_entry->x_entry;
-	rx_entry->window = rxr_get_cts_hdr(pkt_entry->pkt)->window;
+	rx_entry->window = rxr_get_cts_hdr(pkt_entry->pkt)->recv_length;
 	ep->available_data_bufs -= rx_entry->credit_cts;
 
 	/*
@@ -213,24 +285,26 @@ void rxr_pkt_handle_cts_sent(struct rxr_ep *ep,
 void rxr_pkt_handle_cts_recv(struct rxr_ep *ep,
 			     struct rxr_pkt_entry *pkt_entry)
 {
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 	struct rxr_cts_hdr *cts_pkt;
 	struct rxr_tx_entry *tx_entry;
 
 	cts_pkt = (struct rxr_cts_hdr *)pkt_entry->pkt;
 	if (cts_pkt->flags & RXR_CTS_READ_REQ)
-		tx_entry = ofi_bufpool_get_ibuf(ep->readrsp_tx_entry_pool, cts_pkt->tx_id);
+		tx_entry = ofi_bufpool_get_ibuf(ep->readrsp_tx_entry_pool, cts_pkt->send_id);
 	else
-		tx_entry = ofi_bufpool_get_ibuf(ep->tx_entry_pool, cts_pkt->tx_id);
+		tx_entry = ofi_bufpool_get_ibuf(ep->tx_entry_pool, cts_pkt->send_id);
 
-	tx_entry->rx_id = cts_pkt->rx_id;
-	tx_entry->window = cts_pkt->window;
+	tx_entry->rx_id = cts_pkt->recv_id;
+	tx_entry->window = cts_pkt->recv_length;
 
 	/* Return any excess tx_credits that were borrowed for the request */
 	peer = rxr_ep_get_peer(ep, tx_entry->addr);
-	tx_entry->credit_allocated = ofi_div_ceil(cts_pkt->window, ep->max_data_payload_size);
-	if (tx_entry->credit_allocated < tx_entry->credit_request)
+	tx_entry->credit_allocated = ofi_div_ceil(cts_pkt->recv_length, ep->max_data_payload_size);
+	if (tx_entry->credit_allocated < tx_entry->credit_request) {
+		assert(peer);
 		peer->tx_credits += tx_entry->credit_request - tx_entry->credit_allocated;
+	}
 
 	rxr_pkt_entry_release_rx(ep, pkt_entry);
 
@@ -245,25 +319,24 @@ int rxr_pkt_init_readrsp(struct rxr_ep *ep,
 			 struct rxr_tx_entry *tx_entry,
 			 struct rxr_pkt_entry *pkt_entry)
 {
-	struct rxr_readrsp_pkt *readrsp_pkt;
 	struct rxr_readrsp_hdr *readrsp_hdr;
-	size_t mtu = ep->mtu_size;
+	int ret;
 
-	readrsp_pkt = (struct rxr_readrsp_pkt *)pkt_entry->pkt;
-	readrsp_hdr = &readrsp_pkt->hdr;
+	readrsp_hdr = rxr_get_readrsp_hdr(pkt_entry->pkt);
 	readrsp_hdr->type = RXR_READRSP_PKT;
-	readrsp_hdr->version = RXR_BASE_PROTOCOL_VERSION;
+	readrsp_hdr->version = RXR_PROTOCOL_VERSION;
 	readrsp_hdr->flags = 0;
-	readrsp_hdr->tx_id = tx_entry->tx_id;
-	readrsp_hdr->rx_id = tx_entry->rx_id;
-	readrsp_hdr->seg_size = ofi_copy_from_iov(readrsp_pkt->data,
-						  mtu - RXR_READRSP_HDR_SIZE,
-						  tx_entry->iov,
-						  tx_entry->iov_count, 0);
-	pkt_entry->pkt_size = RXR_READRSP_HDR_SIZE + readrsp_hdr->seg_size;
+	readrsp_hdr->send_id = tx_entry->tx_id;
+	readrsp_hdr->recv_id = tx_entry->rx_id;
+	readrsp_hdr->flags |= RXR_PKT_CONNID_HDR;
+	readrsp_hdr->connid = rxr_ep_raw_addr(ep)->qkey;
+	readrsp_hdr->seg_length = MIN(ep->mtu_size - sizeof(struct rxr_readrsp_hdr),
+				      tx_entry->total_len);
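+	/*
+	 * A READRSP carries at most one MTU worth of data; any remainder
+	 * is sent as DATA packets after this packet goes out (see
+	 * rxr_pkt_handle_readrsp_sent()).
+	 */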
+
 	pkt_entry->addr = tx_entry->addr;
-	pkt_entry->x_entry = tx_entry;
-	return 0;
+	ret = rxr_pkt_init_data_from_tx_entry(ep, pkt_entry, sizeof(struct rxr_readrsp_hdr),
+					      tx_entry, 0, readrsp_hdr->seg_length);
+	return ret;
 }
 
 void rxr_pkt_handle_readrsp_sent(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
@@ -277,14 +350,13 @@ void rxr_pkt_handle_readrsp_sent(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_en
 				  util_domain.domain_fid);
 
 	tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry;
-	data_len = rxr_get_readrsp_hdr(pkt_entry->pkt)->seg_size;
-	tx_entry->state = RXR_TX_SENT_READRSP;
+	data_len = rxr_get_readrsp_hdr(pkt_entry->pkt)->seg_length;
 	tx_entry->bytes_sent += data_len;
 	tx_entry->window -= data_len;
 	assert(tx_entry->window >= 0);
 	if (tx_entry->bytes_sent < tx_entry->total_len) {
 		assert(!efa_ep_is_cuda_mr(tx_entry->desc[0]));
-		if (efa_is_cache_available(efa_domain) && rxr_ep_mr_local(ep))
+		if (tx_entry->desc[0] || efa_is_cache_available(efa_domain))
 			rxr_prepare_desc_send(rxr_ep_domain(ep), tx_entry);
 
 		tx_entry->state = RXR_TX_SEND;
@@ -304,7 +376,7 @@ void rxr_pkt_handle_readrsp_send_completion(struct rxr_ep *ep,
 	tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry;
 	assert(tx_entry->cq_entry.flags & FI_READ);
 
-	tx_entry->bytes_acked += readrsp_hdr->seg_size;
+	tx_entry->bytes_acked += readrsp_hdr->seg_length;
 	if (tx_entry->total_len == tx_entry->bytes_acked)
 		rxr_cq_handle_tx_completion(ep, tx_entry);
 }
@@ -318,12 +390,12 @@ void rxr_pkt_handle_readrsp_recv(struct rxr_ep *ep,
 
 	readrsp_pkt = (struct rxr_readrsp_pkt *)pkt_entry->pkt;
 	readrsp_hdr = &readrsp_pkt->hdr;
-	rx_entry = ofi_bufpool_get_ibuf(ep->rx_entry_pool, readrsp_hdr->rx_id);
+	rx_entry = ofi_bufpool_get_ibuf(ep->rx_entry_pool, readrsp_hdr->recv_id);
 	assert(rx_entry->cq_entry.flags & FI_READ);
-	rx_entry->tx_id = readrsp_hdr->tx_id;
+	rx_entry->tx_id = readrsp_hdr->send_id;
 	rxr_pkt_proc_data(ep, rx_entry, pkt_entry,
 			  readrsp_pkt->data,
-			  0, readrsp_hdr->seg_size);
+			  0, readrsp_hdr->seg_length);
 }
 
 /*  RMA_CONTEXT packet functions
@@ -340,7 +412,7 @@ void rxr_pkt_init_write_context(struct rxr_tx_entry *tx_entry,
 	pkt_entry->x_entry = (void *)tx_entry;
 	rma_context_pkt = (struct rxr_rma_context_pkt *)pkt_entry->pkt;
 	rma_context_pkt->type = RXR_RMA_CONTEXT_PKT;
-	rma_context_pkt->version = RXR_BASE_PROTOCOL_VERSION;
+	rma_context_pkt->version = RXR_PROTOCOL_VERSION;
 	rma_context_pkt->context_type = RXR_WRITE_CONTEXT;
 	rma_context_pkt->tx_id = tx_entry->tx_id;
 }
@@ -359,7 +431,7 @@ void rxr_pkt_init_read_context(struct rxr_ep *rxr_ep,
 	ctx_pkt = (struct rxr_rma_context_pkt *)pkt_entry->pkt;
 	ctx_pkt->type = RXR_RMA_CONTEXT_PKT;
 	ctx_pkt->flags = 0;
-	ctx_pkt->version = RXR_BASE_PROTOCOL_VERSION;
+	ctx_pkt->version = RXR_PROTOCOL_VERSION;
 	ctx_pkt->context_type = RXR_READ_CONTEXT;
 	ctx_pkt->read_id = read_entry->read_id;
 	ctx_pkt->seg_size = seg_size;
@@ -374,7 +446,6 @@ void rxr_pkt_handle_rma_read_completion(struct rxr_ep *ep,
 	struct rxr_pkt_entry *pkt_entry;
 	struct rxr_read_entry *read_entry;
 	struct rxr_rma_context_pkt *rma_context_pkt;
-	struct rxr_peer *peer;
 	int inject;
 	size_t data_size;
 	ssize_t ret;
@@ -392,6 +463,7 @@ void rxr_pkt_handle_rma_read_completion(struct rxr_ep *ep,
 			tx_entry = read_entry->context;
 			assert(tx_entry && tx_entry->cq_entry.flags & FI_READ);
 			rxr_cq_write_tx_completion(ep, tx_entry);
+			rxr_release_tx_entry(ep, tx_entry);
 		} else if (read_entry->context_type == RXR_READ_CONTEXT_RX_ENTRY) {
 			rx_entry = read_entry->context;
 			if (rx_entry->op == ofi_op_msg || rx_entry->op == ofi_op_tagged) {
@@ -405,8 +477,7 @@ void rxr_pkt_handle_rma_read_completion(struct rxr_ep *ep,
 			inject = (read_entry->lower_ep_type == SHM_EP);
 			ret = rxr_pkt_post_ctrl_or_queue(ep, RXR_RX_ENTRY, rx_entry, RXR_EOR_PKT, inject);
 			if (OFI_UNLIKELY(ret)) {
-				if (rxr_cq_handle_rx_error(ep, rx_entry, ret))
-					assert(0 && "failed to write err cq entry");
+				rxr_cq_write_rx_error(ep, rx_entry, -ret, -ret);
 				rxr_release_rx_entry(ep, rx_entry);
 			}
 		} else {
@@ -420,14 +491,7 @@ void rxr_pkt_handle_rma_read_completion(struct rxr_ep *ep,
 		rxr_read_release_entry(ep, read_entry);
 	}
 
-	if (read_entry->context_type == RXR_READ_CONTEXT_PKT_ENTRY) {
-		assert(context_pkt_entry->addr == FI_ADDR_NOTAVAIL);
-		ep->tx_pending--;
-	} else {
-		peer = rxr_ep_get_peer(ep, context_pkt_entry->addr);
-		if (!peer->is_local)
-			rxr_ep_dec_tx_pending(ep, peer, 0);
-	}
+	rxr_ep_record_tx_op_completed(ep, context_pkt_entry);
 }
 
 void rxr_pkt_handle_rma_completion(struct rxr_ep *ep,
@@ -436,19 +500,19 @@ void rxr_pkt_handle_rma_completion(struct rxr_ep *ep,
 	struct rxr_tx_entry *tx_entry = NULL;
 	struct rxr_rma_context_pkt *rma_context_pkt;
 
-	assert(rxr_get_base_hdr(context_pkt_entry->pkt)->version == RXR_BASE_PROTOCOL_VERSION);
+	assert(rxr_get_base_hdr(context_pkt_entry->pkt)->version == RXR_PROTOCOL_VERSION);
 
 	rma_context_pkt = (struct rxr_rma_context_pkt *)context_pkt_entry->pkt;
 
 	switch (rma_context_pkt->context_type) {
 	case RXR_WRITE_CONTEXT:
 		tx_entry = (struct rxr_tx_entry *)context_pkt_entry->x_entry;
-		if (tx_entry->fi_flags & FI_COMPLETION) {
+		if (tx_entry->fi_flags & FI_COMPLETION)
 			rxr_cq_write_tx_completion(ep, tx_entry);
-		} else {
+		else
 			efa_cntr_report_tx_completion(&ep->util_ep, tx_entry->cq_entry.flags);
-			rxr_release_tx_entry(ep, tx_entry);
-		}
+
+		rxr_release_tx_entry(ep, tx_entry);
 		break;
 	case RXR_READ_CONTEXT:
 		rxr_pkt_handle_rma_read_completion(ep, context_pkt_entry);
@@ -469,10 +533,12 @@ int rxr_pkt_init_eor(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry, struct rx
 
 	eor_hdr = (struct rxr_eor_hdr *)pkt_entry->pkt;
 	eor_hdr->type = RXR_EOR_PKT;
-	eor_hdr->version = RXR_BASE_PROTOCOL_VERSION;
+	eor_hdr->version = RXR_PROTOCOL_VERSION;
 	eor_hdr->flags = 0;
-	eor_hdr->tx_id = rx_entry->tx_id;
-	eor_hdr->rx_id = rx_entry->rx_id;
+	eor_hdr->send_id = rx_entry->tx_id;
+	eor_hdr->recv_id = rx_entry->rx_id;
+	eor_hdr->flags |= RXR_PKT_CONNID_HDR;
+	eor_hdr->connid = rxr_ep_raw_addr(ep)->qkey;
 	pkt_entry->pkt_size = sizeof(struct rxr_eor_hdr);
 	pkt_entry->addr = rx_entry->addr;
 	pkt_entry->x_entry = rx_entry;
@@ -481,16 +547,16 @@ int rxr_pkt_init_eor(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry, struct rx
 
 void rxr_pkt_handle_eor_sent(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
 {
-	struct rxr_rx_entry *rx_entry;
-
-	rx_entry = pkt_entry->x_entry;
-	assert(rx_entry && rx_entry->rx_id == rxr_get_eor_hdr(pkt_entry->pkt)->rx_id);
-	rxr_release_rx_entry(ep, rx_entry);
 }
 
 void rxr_pkt_handle_eor_send_completion(struct rxr_ep *ep,
 					struct rxr_pkt_entry *pkt_entry)
 {
+	struct rxr_rx_entry *rx_entry;
+
+	rx_entry = pkt_entry->x_entry;
+	assert(rx_entry && rx_entry->rx_id == rxr_get_eor_hdr(pkt_entry->pkt)->recv_id);
+	rxr_release_rx_entry(ep, rx_entry);
 }
 
 /*
@@ -502,26 +568,51 @@ void rxr_pkt_handle_eor_recv(struct rxr_ep *ep,
 {
 	struct rxr_eor_hdr *eor_hdr;
 	struct rxr_tx_entry *tx_entry;
-	ssize_t err;
 
 	eor_hdr = (struct rxr_eor_hdr *)pkt_entry->pkt;
 
 	/* pre-post buf used here, so can NOT track back to tx_entry with x_entry */
-	tx_entry = ofi_bufpool_get_ibuf(ep->tx_entry_pool, eor_hdr->tx_id);
-
-	err = rxr_tx_entry_mr_dereg(tx_entry);
-	if (OFI_UNLIKELY(err)) {
-		if (rxr_cq_handle_tx_error(ep, tx_entry, err))
-			assert(0 && "failed to write err cq entry");
-		rxr_release_tx_entry(ep, tx_entry);
-		rxr_pkt_entry_release_rx(ep, pkt_entry);
-		return;
-	}
-
+	tx_entry = ofi_bufpool_get_ibuf(ep->tx_entry_pool, eor_hdr->send_id);
 	rxr_cq_write_tx_completion(ep, tx_entry);
+	rxr_release_tx_entry(ep, tx_entry);
 	rxr_pkt_entry_release_rx(ep, pkt_entry);
 }
 
+/* RECEIPT packet related functions */
+int rxr_pkt_init_receipt(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry,
+			 struct rxr_pkt_entry *pkt_entry)
+{
+	struct rxr_receipt_hdr *receipt_hdr;
+
+	receipt_hdr = rxr_get_receipt_hdr(pkt_entry->pkt);
+	receipt_hdr->type = RXR_RECEIPT_PKT;
+	receipt_hdr->version = RXR_PROTOCOL_VERSION;
+	receipt_hdr->flags = 0;
+	receipt_hdr->tx_id = rx_entry->tx_id;
+	receipt_hdr->msg_id = rx_entry->msg_id;
+	receipt_hdr->flags |= RXR_PKT_CONNID_HDR;
+	receipt_hdr->connid = rxr_ep_raw_addr(ep)->qkey;
+
+	pkt_entry->pkt_size = sizeof(struct rxr_receipt_hdr);
+	pkt_entry->addr = rx_entry->addr;
+	pkt_entry->x_entry = rx_entry;
+
+	return 0;
+}
+
+void rxr_pkt_handle_receipt_sent(struct rxr_ep *ep,
+				 struct rxr_pkt_entry *pkt_entry)
+{
+}
+
+void rxr_pkt_handle_receipt_send_completion(struct rxr_ep *ep,
+					    struct rxr_pkt_entry *pkt_entry)
+{
+	struct rxr_rx_entry *rx_entry;
+
+	rx_entry = (struct rxr_rx_entry *)pkt_entry->x_entry;
+	rxr_release_rx_entry(ep, rx_entry);
+}
 
 /* atomrsp packet related functions: init, handle_sent, handle_send_completion and recv
  *
@@ -532,28 +623,26 @@ void rxr_pkt_handle_eor_recv(struct rxr_ep *ep,
 int rxr_pkt_init_atomrsp(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry,
 			 struct rxr_pkt_entry *pkt_entry)
 {
-	size_t pkt_size;
+	struct rxr_atomrsp_pkt *atomrsp_pkt;
 	struct rxr_atomrsp_hdr *atomrsp_hdr;
 
-	assert(rx_entry->atomrsp_pkt);
-	pkt_size = rx_entry->atomrsp_pkt->pkt_size;
-	pkt_entry->pkt_size = pkt_size;
+	assert(rx_entry->atomrsp_data);
 	pkt_entry->addr = rx_entry->addr;
 	pkt_entry->x_entry = rx_entry;
 
-	atomrsp_hdr = (struct rxr_atomrsp_hdr *)pkt_entry->pkt;
+	atomrsp_pkt = (struct rxr_atomrsp_pkt *)pkt_entry->pkt;
+	atomrsp_hdr = &atomrsp_pkt->hdr;
 	atomrsp_hdr->type = RXR_ATOMRSP_PKT;
-	atomrsp_hdr->version = RXR_BASE_PROTOCOL_VERSION;
+	atomrsp_hdr->version = RXR_PROTOCOL_VERSION;
 	atomrsp_hdr->flags = 0;
-	atomrsp_hdr->tx_id = rx_entry->tx_id;
-	atomrsp_hdr->rx_id = rx_entry->rx_id;
-	atomrsp_hdr->seg_size = ofi_total_iov_len(rx_entry->iov, rx_entry->iov_count);
-
-	assert(RXR_ATOMRSP_HDR_SIZE + atomrsp_hdr->seg_size < ep->mtu_size);
-
-	/* rx_entry->atomrsp_buf was filled in rxr_pkt_handle_req_recv() */
-	memcpy((char*)pkt_entry->pkt + RXR_ATOMRSP_HDR_SIZE, rx_entry->atomrsp_buf, atomrsp_hdr->seg_size);
-	pkt_entry->pkt_size = RXR_ATOMRSP_HDR_SIZE + atomrsp_hdr->seg_size;
+	atomrsp_hdr->recv_id = rx_entry->tx_id;
+	atomrsp_hdr->seg_length = ofi_total_iov_len(rx_entry->iov, rx_entry->iov_count);
+	atomrsp_hdr->flags |= RXR_PKT_CONNID_HDR;
+	atomrsp_hdr->connid = rxr_ep_raw_addr(ep)->qkey;
+	assert(sizeof(struct rxr_atomrsp_hdr) + atomrsp_hdr->seg_length < ep->mtu_size);
+	/* rx_entry->atomrsp_data was filled in rxr_pkt_handle_req_recv() */
+	memcpy(atomrsp_pkt->data, rx_entry->atomrsp_data, atomrsp_hdr->seg_length);
+	pkt_entry->pkt_size = sizeof(struct rxr_atomrsp_hdr) + atomrsp_hdr->seg_length;
 	return 0;
 }
 
@@ -566,7 +655,8 @@ void rxr_pkt_handle_atomrsp_send_completion(struct rxr_ep *ep, struct rxr_pkt_en
 	struct rxr_rx_entry *rx_entry;
 	
 	rx_entry = (struct rxr_rx_entry *)pkt_entry->x_entry;
-	rxr_pkt_entry_release_tx(ep, rx_entry->atomrsp_pkt);
+	ofi_buf_free(rx_entry->atomrsp_data);
+	rx_entry->atomrsp_data = NULL;
 	rxr_release_rx_entry(ep, rx_entry);
 }
 
@@ -579,19 +669,52 @@ void rxr_pkt_handle_atomrsp_recv(struct rxr_ep *ep,
 
 	atomrsp_pkt = (struct rxr_atomrsp_pkt *)pkt_entry->pkt;
 	atomrsp_hdr = &atomrsp_pkt->hdr;
-	tx_entry = ofi_bufpool_get_ibuf(ep->tx_entry_pool, atomrsp_hdr->tx_id);
+	tx_entry = ofi_bufpool_get_ibuf(ep->tx_entry_pool, atomrsp_hdr->recv_id);
 
 	ofi_copy_to_iov(tx_entry->atomic_ex.resp_iov,
 			tx_entry->atomic_ex.resp_iov_count,
 			0, atomrsp_pkt->data,
-			atomrsp_hdr->seg_size);
+			atomrsp_hdr->seg_length);
 
-	if (tx_entry->fi_flags & FI_COMPLETION) {
-		/* Note write_tx_completion() will release tx_entry */
+	if (tx_entry->fi_flags & FI_COMPLETION)
 		rxr_cq_write_tx_completion(ep, tx_entry);
-	} else {
+	else
 		efa_cntr_report_tx_completion(&ep->util_ep, tx_entry->cq_entry.flags);
-		rxr_release_tx_entry(ep, tx_entry);
+
+	rxr_release_tx_entry(ep, tx_entry);
+	rxr_pkt_entry_release_rx(ep, pkt_entry);
+}
+
+void rxr_pkt_handle_receipt_recv(struct rxr_ep *ep,
+				 struct rxr_pkt_entry *pkt_entry)
+{
+	struct rxr_tx_entry *tx_entry = NULL;
+	struct rxr_receipt_hdr *receipt_hdr;
+
+	receipt_hdr = rxr_get_receipt_hdr(pkt_entry->pkt);
+	/* Retrieve the tx_entry that will be written into TX CQ*/
+	tx_entry = ofi_bufpool_get_ibuf(ep->tx_entry_pool,
+					receipt_hdr->tx_id);
+	if (!tx_entry) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ,
+			"Failed to retrieve the tx_entry when handling a receipt packet.\n");
+		return;
+	}
+
+	tx_entry->rxr_flags |= RXR_RECEIPT_RECEIVED;
+	if (tx_entry->rxr_flags & RXR_LONGCTS_PROTOCOL) {
+		/*
+		 * For the long message protocol, when FI_DELIVERY_COMPLETE
+		 * is requested, the tx completion must be written
+		 * in either rxr_pkt_handle_data_send_completion()
+		 * or rxr_pkt_handle_receipt_recv(), whichever of them
+		 * is called later, to avoid accessing a released
+		 * tx_entry.
+		 */
+		if (tx_entry->total_len == tx_entry->bytes_acked)
+			rxr_cq_handle_tx_completion(ep, tx_entry);
+	} else {
+		rxr_cq_handle_tx_completion(ep, tx_entry);
 	}
 
 	rxr_pkt_entry_release_rx(ep, pkt_entry);
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_req.c b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_req.c
index 8e3738bc4a15d7dab1d5de38b89deac1a7b1823e..2d995f59f2e6a30d9cacd4869614033ff1533922 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_req.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_req.c
@@ -37,6 +37,7 @@
 #include "rxr_rma.h"
 #include "rxr_msg.h"
 #include "rxr_pkt_cmd.h"
+#include "rxr_pkt_type_base.h"
 #include "rxr_read.h"
 
 /*
@@ -66,20 +67,29 @@ struct rxr_req_inf REQ_INF_LIST[] = {
 	[RXR_EAGER_TAGRTM_PKT] = {4, sizeof(struct rxr_eager_tagrtm_hdr), 0},
 	[RXR_MEDIUM_MSGRTM_PKT] = {4, sizeof(struct rxr_medium_msgrtm_hdr), 0},
 	[RXR_MEDIUM_TAGRTM_PKT] = {4, sizeof(struct rxr_medium_tagrtm_hdr), 0},
-	[RXR_LONG_MSGRTM_PKT] = {4, sizeof(struct rxr_long_msgrtm_hdr), 0},
-	[RXR_LONG_TAGRTM_PKT] = {4, sizeof(struct rxr_long_tagrtm_hdr), 0},
-	[RXR_READ_MSGRTM_PKT] = {4, sizeof(struct rxr_read_msgrtm_hdr), RXR_REQ_FEATURE_RDMA_READ},
-	[RXR_READ_TAGRTM_PKT] = {4, sizeof(struct rxr_read_tagrtm_hdr), RXR_REQ_FEATURE_RDMA_READ},
+	[RXR_LONGCTS_MSGRTM_PKT] = {4, sizeof(struct rxr_longcts_msgrtm_hdr), 0},
+	[RXR_LONGCTS_TAGRTM_PKT] = {4, sizeof(struct rxr_longcts_tagrtm_hdr), 0},
+	[RXR_LONGREAD_MSGRTM_PKT] = {4, sizeof(struct rxr_longread_msgrtm_hdr), RXR_EXTRA_FEATURE_RDMA_READ},
+	[RXR_LONGREAD_TAGRTM_PKT] = {4, sizeof(struct rxr_longread_tagrtm_hdr), RXR_EXTRA_FEATURE_RDMA_READ},
+	[RXR_DC_EAGER_MSGRTM_PKT] = {4, sizeof(struct rxr_dc_eager_msgrtm_hdr), RXR_EXTRA_FEATURE_DELIVERY_COMPLETE},
+	[RXR_DC_EAGER_TAGRTM_PKT] = {4, sizeof(struct rxr_dc_eager_tagrtm_hdr), RXR_EXTRA_FEATURE_DELIVERY_COMPLETE},
+	[RXR_DC_MEDIUM_MSGRTM_PKT] = {4, sizeof(struct rxr_dc_medium_msgrtm_hdr), RXR_EXTRA_FEATURE_DELIVERY_COMPLETE},
+	[RXR_DC_MEDIUM_TAGRTM_PKT] = {4, sizeof(struct rxr_dc_medium_tagrtm_hdr), RXR_EXTRA_FEATURE_DELIVERY_COMPLETE},
+	[RXR_DC_LONGCTS_MSGRTM_PKT] = {4, sizeof(struct rxr_longcts_msgrtm_hdr), RXR_EXTRA_FEATURE_DELIVERY_COMPLETE},
+	[RXR_DC_LONGCTS_TAGRTM_PKT] = {4, sizeof(struct rxr_longcts_tagrtm_hdr), RXR_EXTRA_FEATURE_DELIVERY_COMPLETE},
 	/* rtw header */
 	[RXR_EAGER_RTW_PKT] = {4, sizeof(struct rxr_eager_rtw_hdr), 0},
-	[RXR_LONG_RTW_PKT] = {4, sizeof(struct rxr_long_rtw_hdr), 0},
-	[RXR_READ_RTW_PKT] = {4, sizeof(struct rxr_read_rtw_hdr), RXR_REQ_FEATURE_RDMA_READ},
+	[RXR_DC_EAGER_RTW_PKT] = {4, sizeof(struct rxr_dc_eager_rtw_hdr), RXR_EXTRA_FEATURE_DELIVERY_COMPLETE},
+	[RXR_LONGCTS_RTW_PKT] = {4, sizeof(struct rxr_longcts_rtw_hdr), 0},
+	[RXR_DC_LONGCTS_RTW_PKT] = {4, sizeof(struct rxr_longcts_rtw_hdr), RXR_EXTRA_FEATURE_DELIVERY_COMPLETE},
+	[RXR_LONGREAD_RTW_PKT] = {4, sizeof(struct rxr_longread_rtw_hdr), RXR_EXTRA_FEATURE_RDMA_READ},
 	/* rtr header */
 	[RXR_SHORT_RTR_PKT] = {4, sizeof(struct rxr_rtr_hdr), 0},
-	[RXR_LONG_RTR_PKT] = {4, sizeof(struct rxr_rtr_hdr), 0},
-	[RXR_READ_RTR_PKT] = {4, sizeof(struct rxr_base_hdr), RXR_REQ_FEATURE_RDMA_READ},
+	[RXR_LONGCTS_RTR_PKT] = {4, sizeof(struct rxr_rtr_hdr), 0},
+	[RXR_READ_RTR_PKT] = {4, sizeof(struct rxr_base_hdr), RXR_EXTRA_FEATURE_RDMA_READ},
 	/* rta header */
 	[RXR_WRITE_RTA_PKT] = {4, sizeof(struct rxr_rta_hdr), 0},
+	[RXR_DC_WRITE_RTA_PKT] = {4, sizeof(struct rxr_rta_hdr), RXR_EXTRA_FEATURE_DELIVERY_COMPLETE},
 	[RXR_FETCH_RTA_PKT] = {4, sizeof(struct rxr_rta_hdr), 0},
 	[RXR_COMPARE_RTA_PKT] = {4, sizeof(struct rxr_rta_hdr), 0},
 };
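+/*
+ * Each entry above is {protocol version, base header size, extra feature};
+ * a non-zero third member names the extra feature the packet type depends on.
+ */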
@@ -99,7 +109,7 @@ void rxr_pkt_init_req_hdr(struct rxr_ep *ep,
 			  struct rxr_pkt_entry *pkt_entry)
 {
 	char *opt_hdr;
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 	struct rxr_base_hdr *base_hdr;
 
 	/* init the base header */
@@ -110,13 +120,26 @@ void rxr_pkt_init_req_hdr(struct rxr_ep *ep,
 
 	peer = rxr_ep_get_peer(ep, tx_entry->addr);
 	assert(peer);
-	if (OFI_UNLIKELY(!(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED))) {
+
+	if (rxr_peer_need_raw_addr_hdr(peer)) {
 		/*
 		 * This is the first communication with this peer on this
 		 * endpoint, so send the core's address for this EP in the REQ
 		 * so the remote side can insert it into its address vector.
 		 */
 		base_hdr->flags |= RXR_REQ_OPT_RAW_ADDR_HDR;
+	} else if (rxr_peer_need_connid(peer)) {
+		/*
+		 * After receiving a handshake packet, we know the peer's capabilities.
+		 *
+		 * If the peer needs a connid, we include the optional connid
+		 * header in the REQ packet header. The peer will use it
+		 * to verify the sender's identity.
+		 *
+		 * This logic means that a req packet cannot have both
+		 * the optional raw address header and the optional connid header.
+		 */
+		base_hdr->flags |= RXR_PKT_CONNID_HDR;
 	}
 
 	if (tx_entry->fi_flags & FI_REMOTE_CQ_DATA) {
@@ -129,9 +152,10 @@ void rxr_pkt_init_req_hdr(struct rxr_ep *ep,
 		struct rxr_req_opt_raw_addr_hdr *raw_addr_hdr;
 
 		raw_addr_hdr = (struct rxr_req_opt_raw_addr_hdr *)opt_hdr;
-		raw_addr_hdr->addr_len = ep->core_addrlen;
-		memcpy(raw_addr_hdr->raw_addr, ep->core_addr, raw_addr_hdr->addr_len);
-		opt_hdr += sizeof(*raw_addr_hdr) + raw_addr_hdr->addr_len;
+		raw_addr_hdr->addr_len = RXR_REQ_OPT_RAW_ADDR_HDR_SIZE - sizeof(struct rxr_req_opt_raw_addr_hdr);
+		assert(raw_addr_hdr->addr_len >= ep->core_addrlen);
+		memcpy(raw_addr_hdr->raw_addr, ep->core_addr, ep->core_addrlen);
+		opt_hdr += RXR_REQ_OPT_RAW_ADDR_HDR_SIZE;
 	}
 
 	if (base_hdr->flags & RXR_REQ_OPT_CQ_DATA_HDR) {
@@ -142,7 +166,16 @@ void rxr_pkt_init_req_hdr(struct rxr_ep *ep,
 		opt_hdr += sizeof(*cq_data_hdr);
 	}
 
+	if (base_hdr->flags & RXR_PKT_CONNID_HDR) {
+		struct rxr_req_opt_connid_hdr *connid_hdr;
+
+		connid_hdr = (struct rxr_req_opt_connid_hdr *)opt_hdr;
+		connid_hdr->connid = rxr_ep_raw_addr(ep)->qkey;
+		opt_hdr += sizeof(*connid_hdr);
+	}
+
 	pkt_entry->addr = tx_entry->addr;
+	assert(opt_hdr - pkt_entry->pkt == rxr_pkt_req_hdr_size(pkt_entry));
 }
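+
+/* For reference, a sketch of the REQ header layout produced above (derived
+ * from the code in this file; field widths not to scale):
+ *
+ *   | base hdr | opt raw addr hdr | opt CQ data hdr | opt connid hdr |
+ *
+ * Each optional header is present only when its flag is set in
+ * base_hdr->flags, and the raw address and connid headers are mutually
+ * exclusive.
+ */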
 
 size_t rxr_pkt_req_base_hdr_size(struct rxr_pkt_entry *pkt_entry)
@@ -155,13 +188,16 @@ size_t rxr_pkt_req_base_hdr_size(struct rxr_pkt_entry *pkt_entry)
 
 	hdr_size = REQ_INF_LIST[base_hdr->type].base_hdr_size;
 	if (base_hdr->type == RXR_EAGER_RTW_PKT ||
-	    base_hdr->type == RXR_LONG_RTW_PKT ||
-	    base_hdr->type == RXR_READ_RTW_PKT)
+	    base_hdr->type == RXR_DC_EAGER_RTW_PKT ||
+	    base_hdr->type == RXR_LONGCTS_RTW_PKT ||
+	    base_hdr->type == RXR_DC_LONGCTS_RTW_PKT ||
+	    base_hdr->type == RXR_LONGREAD_RTW_PKT)
 		hdr_size += rxr_get_rtw_base_hdr(pkt_entry->pkt)->rma_iov_count * sizeof(struct fi_rma_iov);
 	else if (base_hdr->type == RXR_SHORT_RTR_PKT ||
-		 base_hdr->type == RXR_LONG_RTR_PKT)
+		 base_hdr->type == RXR_LONGCTS_RTR_PKT)
 		hdr_size += rxr_get_rtr_hdr(pkt_entry->pkt)->rma_iov_count * sizeof(struct fi_rma_iov);
 	else if (base_hdr->type == RXR_WRITE_RTA_PKT ||
+		 base_hdr->type == RXR_DC_WRITE_RTA_PKT ||
 		 base_hdr->type == RXR_FETCH_RTA_PKT ||
 		 base_hdr->type == RXR_COMPARE_RTA_PKT)
 		hdr_size += rxr_get_rta_hdr(pkt_entry->pkt)->rma_iov_count * sizeof(struct fi_rma_iov);
@@ -169,6 +205,13 @@ size_t rxr_pkt_req_base_hdr_size(struct rxr_pkt_entry *pkt_entry)
 	return hdr_size;
 }
 
+/**
+ * @brief return the optional raw addr header pointer in a req packet
+ *
+ * @param[in]	pkt_entry	a REQ packet entry
+ * @return	If the input has the optional raw address header, return a pointer to it.
+ *		Otherwise, return NULL.
+ */
 void *rxr_pkt_req_raw_addr(struct rxr_pkt_entry *pkt_entry)
 {
 	char *opt_hdr;
@@ -178,6 +221,10 @@ void *rxr_pkt_req_raw_addr(struct rxr_pkt_entry *pkt_entry)
 	base_hdr = rxr_get_base_hdr(pkt_entry->pkt);
 	opt_hdr = (char *)pkt_entry->pkt + rxr_pkt_req_base_hdr_size(pkt_entry);
 	if (base_hdr->flags & RXR_REQ_OPT_RAW_ADDR_HDR) {
+		/* For req packet, the optional connid header and the optional
+		 * raw address header are mutually exclusive.
+		 */
+		assert(!(base_hdr->flags & RXR_PKT_CONNID_HDR));
 		raw_addr_hdr = (struct rxr_req_opt_raw_addr_hdr *)opt_hdr;
 		return raw_addr_hdr->raw_addr;
 	}
@@ -185,6 +232,42 @@ void *rxr_pkt_req_raw_addr(struct rxr_pkt_entry *pkt_entry)
 	return NULL;
 }
 
+/**
+ * @brief return the pointer to connid in a req packet
+ *
+ * @param[in]	pkt_entry	a REQ packet entry
+ * @return	If the input has the optional connid header, return a pointer to the connid.
+ * 		Otherwise, return NULL.
+ */
+uint32_t *rxr_pkt_req_connid_ptr(struct rxr_pkt_entry *pkt_entry)
+{
+	char *opt_hdr;
+	struct rxr_base_hdr *base_hdr;
+	struct rxr_req_opt_connid_hdr *connid_hdr;
+
+	base_hdr = rxr_get_base_hdr(pkt_entry->pkt);
+	opt_hdr = (char *)pkt_entry->pkt + rxr_pkt_req_base_hdr_size(pkt_entry);
+
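+	/* If the raw address header is present, there is no separate connid
+	 * header: the qkey embedded in the raw EFA address serves as the
+	 * connid, so return a pointer into the raw address instead.
+	 */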
+	if (base_hdr->flags & RXR_REQ_OPT_RAW_ADDR_HDR) {
+		struct rxr_req_opt_raw_addr_hdr *raw_addr_hdr;
+		struct efa_ep_addr *raw_addr;
+
+		raw_addr_hdr = (struct rxr_req_opt_raw_addr_hdr *)opt_hdr;
+		raw_addr = (struct efa_ep_addr *)raw_addr_hdr->raw_addr;
+		return &raw_addr->qkey;
+	}
+
+	if (base_hdr->flags & RXR_REQ_OPT_CQ_DATA_HDR)
+		opt_hdr += sizeof(struct rxr_req_opt_cq_data_hdr);
+
+	if (base_hdr->flags & RXR_PKT_CONNID_HDR) {
+		connid_hdr = (struct rxr_req_opt_connid_hdr *)opt_hdr;
+		return &connid_hdr->connid;
+	}
+
+	return NULL;
+}
+
 size_t rxr_pkt_req_hdr_size(struct rxr_pkt_entry *pkt_entry)
 {
 	char *opt_hdr;
@@ -193,7 +276,13 @@ size_t rxr_pkt_req_hdr_size(struct rxr_pkt_entry *pkt_entry)
 
 	base_hdr = rxr_get_base_hdr(pkt_entry->pkt);
 	opt_hdr = (char *)pkt_entry->pkt + rxr_pkt_req_base_hdr_size(pkt_entry);
+
+	/*
+	 * It is not possible to have both the optional raw addr header and the
+	 * optional connid header in a packet header.
+	 */
 	if (base_hdr->flags & RXR_REQ_OPT_RAW_ADDR_HDR) {
+		assert(!(base_hdr->flags & RXR_PKT_CONNID_HDR));
 		raw_addr_hdr = (struct rxr_req_opt_raw_addr_hdr *)opt_hdr;
 		opt_hdr += sizeof(struct rxr_req_opt_raw_addr_hdr) + raw_addr_hdr->addr_len;
 	}
@@ -201,6 +290,11 @@ size_t rxr_pkt_req_hdr_size(struct rxr_pkt_entry *pkt_entry)
 	if (base_hdr->flags & RXR_REQ_OPT_CQ_DATA_HDR)
 		opt_hdr += sizeof(struct rxr_req_opt_cq_data_hdr);
 
+	if (base_hdr->flags & RXR_PKT_CONNID_HDR) {
+		assert(!(base_hdr->flags & RXR_REQ_OPT_RAW_ADDR_HDR));
+		opt_hdr += sizeof(struct rxr_req_opt_connid_hdr);
+	}
+
 	return opt_hdr - (char *)pkt_entry->pkt;
 }
 
@@ -223,16 +317,57 @@ int64_t rxr_pkt_req_cq_data(struct rxr_pkt_entry *pkt_entry)
 	return cq_data_hdr->cq_data;
 }
 
-size_t rxr_pkt_req_max_header_size(int pkt_type)
+/**
+ * @brief calculates the exact header size given a REQ packet type, flags, and IOV count.
+ *
+ * @param[in]	pkt_type	packet type
+ * @param[in]	flags	flags from packet
+ * @param[in]	rma_iov_count	number of RMA IOV structures present
+ * @return	The exact size of the packet header
+ */
+inline
+size_t rxr_pkt_req_header_size(int pkt_type, uint16_t flags, size_t rma_iov_count)
 {
-	int max_hdr_size = REQ_INF_LIST[pkt_type].base_hdr_size
-		+ sizeof(struct rxr_req_opt_raw_addr_hdr) + RXR_MAX_NAME_LENGTH
-		+ sizeof(struct rxr_req_opt_cq_data_hdr);
+	int hdr_size = REQ_INF_LIST[pkt_type].base_hdr_size;
 
-	if (pkt_type == RXR_EAGER_RTW_PKT || pkt_type == RXR_LONG_RTW_PKT)
-		max_hdr_size += RXR_IOV_LIMIT * sizeof(struct fi_rma_iov);
+	if (flags & RXR_REQ_OPT_RAW_ADDR_HDR) {
+		/* A packet header cannot carry both the optional connid header
+		 * and the optional raw address header, and the raw address
+		 * header is at least as large as the connid header (confirmed
+		 * by the assertion below).
+		 */
+		assert(RXR_REQ_OPT_RAW_ADDR_HDR_SIZE >= sizeof(struct rxr_req_opt_connid_hdr));
+		hdr_size += RXR_REQ_OPT_RAW_ADDR_HDR_SIZE;
+	} else if (flags & RXR_PKT_CONNID_HDR) {
+		hdr_size += sizeof(struct rxr_req_opt_connid_hdr);
+	}
 
-	return max_hdr_size;
+	if (flags & RXR_REQ_OPT_CQ_DATA_HDR) {
+		hdr_size += sizeof(struct rxr_req_opt_cq_data_hdr);
+	}
+
+	if (rxr_pkt_type_contains_rma_iov(pkt_type)) {
+		hdr_size += rma_iov_count * sizeof(struct fi_rma_iov);
+	}
+
+	return hdr_size;
+}
+
+/**
+ * @brief calculates the max header size given a REQ packet type
+ *
+ * @param[in]	pkt_type	packet type
+ * @return	The max possible size of the packet header
+ */
+inline size_t rxr_pkt_req_max_header_size(int pkt_type)
+{
+	/* To calculate the max REQ header size, we should include all possible REQ opt header flags.
+	 * However, because the optional connid header and the optional raw address header cannot
+	 * exist at the same time, and the raw address header is longer than the connid header,
+	 * we do not include the connid header flag.
+	 */
+	uint16_t header_flags = RXR_REQ_OPT_RAW_ADDR_HDR | RXR_REQ_OPT_CQ_DATA_HDR;
+
+	return rxr_pkt_req_header_size(pkt_type, header_flags, RXR_IOV_LIMIT);
 }
 
 size_t rxr_pkt_max_header_size(void)
@@ -250,12 +385,13 @@ size_t rxr_pkt_max_header_size(void)
 	}
 
 	return max_hdr_size;
-
 }
 
-size_t rxr_pkt_req_max_data_size(struct rxr_ep *ep, fi_addr_t addr, int pkt_type)
+size_t rxr_pkt_req_max_data_size(struct rxr_ep *ep, fi_addr_t addr, int pkt_type,
+				 uint64_t fi_flags, size_t rma_iov_count)
 {
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
+	uint16_t header_flags = 0;
 
 	peer = rxr_ep_get_peer(ep, addr);
 	assert(peer);
@@ -265,7 +401,17 @@ size_t rxr_pkt_req_max_data_size(struct rxr_ep *ep, fi_addr_t addr, int pkt_type
 		return rxr_env.shm_max_medium_size;
 	}
 
-	return ep->mtu_size - rxr_pkt_req_max_header_size(pkt_type);
+	if (rxr_peer_need_raw_addr_hdr(peer))
+		header_flags |= RXR_REQ_OPT_RAW_ADDR_HDR;
+	else if (rxr_peer_need_connid(peer))
+		header_flags |= RXR_PKT_CONNID_HDR;
+
+	if (fi_flags & FI_REMOTE_CQ_DATA)
+		header_flags |= RXR_REQ_OPT_CQ_DATA_HDR;
+
+	return ep->mtu_size - rxr_pkt_req_header_size(pkt_type,
+						      header_flags,
+						      rma_iov_count);
 }
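+
+/* Hypothetical example (numbers invented for illustration): with
+ * mtu_size = 9000 and an exact header size of 36 bytes (base header plus
+ * connid and CQ data headers), a single REQ packet could carry
+ * 9000 - 36 = 8964 bytes of payload. Computing the exact, rather than the
+ * maximal, header size lets each packet carry as much data as the peer's
+ * handshake state actually allows.
+ */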
 
 /*
@@ -273,82 +419,16 @@ size_t rxr_pkt_req_max_data_size(struct rxr_ep *ep, fi_addr_t addr, int pkt_type
  *
  *     init() functions
  */
-
-/*
- * this function is called after you have set header in pkt_entry->pkt
- */
-void rxr_pkt_data_from_tx(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry,
-			  struct rxr_tx_entry *tx_entry, size_t data_offset,
-			  size_t data_size)
-{
-	int tx_iov_index;
-	size_t tx_iov_offset;
-	char *data;
-	size_t hdr_size;
-	struct efa_mr *desc;
-
-	assert(pkt_entry->send);
-	hdr_size = rxr_pkt_req_hdr_size(pkt_entry);
-	assert(hdr_size > 0);
-	if (data_size == 0) {
-		pkt_entry->send->iov_count = 0;
-		pkt_entry->pkt_size = hdr_size;
-		return;
-	}
-
-	rxr_locate_iov_pos(tx_entry->iov, tx_entry->iov_count, data_offset,
-			   &tx_iov_index, &tx_iov_offset);
-	desc = tx_entry->desc[0];
-	assert(tx_iov_index < tx_entry->iov_count);
-	assert(tx_iov_offset < tx_entry->iov[tx_iov_index].iov_len);
-
-	/*
-	 * We want to go through the bounce-buffers here only when
-	 * one of the following conditions are true:
-	 * 1. The application can not register buffers (no FI_MR_LOCAL)
-	 * 2. desc.peer.iface is anything but FI_HMEM_SYSTEM
-	 * 3. prov/shm is not used for this transfer, and #1 or #2 hold true.
-	 *
-	 * In the first case, we use the pre-registered pkt_entry's MR. In the
-	 * second case, this is for the eager and medium-message protocols which
-	 * can not rendezvous and pull the data from a peer. In the third case,
-	 * the bufpool would not have been created with a registration handler,
-	 * so pkt_entry->mr will be NULL.
-	 *
-	 */
-	if (!tx_entry->desc[tx_iov_index] && pkt_entry->mr) {
-		data = (char *)pkt_entry->pkt + hdr_size;
-		data_size = ofi_copy_from_hmem_iov(data,
-					data_size,
-					desc ? desc->peer.iface : FI_HMEM_SYSTEM,
-					desc ? desc->peer.device.reserved : 0,
-					tx_entry->iov,
-					tx_entry->iov_count,
-					data_offset);
-		pkt_entry->send->iov_count = 0;
-		pkt_entry->pkt_size = hdr_size + data_size;
-		return;
-	}
-
-	assert(ep->core_iov_limit >= 2);
-	pkt_entry->send->iov[0].iov_base = pkt_entry->pkt;
-	pkt_entry->send->iov[0].iov_len = hdr_size;
-	pkt_entry->send->desc[0] = pkt_entry->mr ? fi_mr_desc(pkt_entry->mr) : NULL;
-
-	pkt_entry->send->iov[1].iov_base = (char *)tx_entry->iov[tx_iov_index].iov_base + tx_iov_offset;
-	pkt_entry->send->iov[1].iov_len = MIN(data_size, tx_entry->iov[tx_iov_index].iov_len - tx_iov_offset);
-	pkt_entry->send->desc[1] = tx_entry->desc[tx_iov_index];
-	pkt_entry->send->iov_count = 2;
-	pkt_entry->pkt_size = hdr_size + pkt_entry->send->iov[1].iov_len;
-}
-
-void rxr_pkt_init_rtm(struct rxr_ep *ep,
-		      struct rxr_tx_entry *tx_entry,
-		      int pkt_type, uint64_t data_offset,
-		      struct rxr_pkt_entry *pkt_entry)
+static inline
+int rxr_pkt_init_rtm(struct rxr_ep *ep,
+		     struct rxr_tx_entry *tx_entry,
+		     int pkt_type, uint64_t data_offset,
+		     struct rxr_pkt_entry *pkt_entry)
 {
 	size_t data_size;
 	struct rxr_rtm_base_hdr *rtm_hdr;
+	int ret;
+
 	rxr_pkt_init_req_hdr(ep, tx_entry, pkt_type, pkt_entry);
 
 	rtm_hdr = (struct rxr_rtm_base_hdr *)pkt_entry->pkt;
@@ -357,26 +437,50 @@ void rxr_pkt_init_rtm(struct rxr_ep *ep,
 
 	data_size = MIN(tx_entry->total_len - data_offset,
 			ep->mtu_size - rxr_pkt_req_hdr_size(pkt_entry));
-	rxr_pkt_data_from_tx(ep, pkt_entry, tx_entry, data_offset, data_size);
-	pkt_entry->x_entry = tx_entry;
+	ret = rxr_pkt_init_data_from_tx_entry(ep, pkt_entry, rxr_pkt_req_hdr_size(pkt_entry),
+					      tx_entry, data_offset, data_size);
+	return ret;
 }
 
 ssize_t rxr_pkt_init_eager_msgrtm(struct rxr_ep *ep,
 				  struct rxr_tx_entry *tx_entry,
 				  struct rxr_pkt_entry *pkt_entry)
 {
-	rxr_pkt_init_rtm(ep, tx_entry, RXR_EAGER_MSGRTM_PKT, 0, pkt_entry);
+	int ret;
+
+	ret = rxr_pkt_init_rtm(ep, tx_entry, RXR_EAGER_MSGRTM_PKT, 0, pkt_entry);
+	if (ret)
+		return ret;
+
 	assert(tx_entry->total_len == rxr_pkt_req_data_size(pkt_entry));
 	return 0;
 }
 
+ssize_t rxr_pkt_init_dc_eager_msgrtm(struct rxr_ep *ep,
+				     struct rxr_tx_entry *tx_entry,
+				     struct rxr_pkt_entry *pkt_entry)
+{
+	struct rxr_dc_eager_msgrtm_hdr *dc_eager_msgrtm_hdr;
+	int ret;
+
+	ret = rxr_pkt_init_rtm(ep, tx_entry, RXR_DC_EAGER_MSGRTM_PKT, 0, pkt_entry);
+	if (ret)
+		return ret;
+	dc_eager_msgrtm_hdr = rxr_get_dc_eager_msgrtm_hdr(pkt_entry->pkt);
+	dc_eager_msgrtm_hdr->hdr.send_id = tx_entry->tx_id;
+	return 0;
+}
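+
+/* A note on the DC (delivery complete) variants above and below: send_id
+ * carries the sender's tx_id so that the receiver can (presumably via a
+ * RECEIPT control packet) refer back to the matching tx_entry; completion
+ * handling later in this file waits for RXR_RECEIPT_RECEIVED before
+ * writing the tx completion.
+ */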
+
 ssize_t rxr_pkt_init_eager_tagrtm(struct rxr_ep *ep,
 				  struct rxr_tx_entry *tx_entry,
 				  struct rxr_pkt_entry *pkt_entry)
 {
 	struct rxr_base_hdr *base_hdr;
+	int ret;
 
-	rxr_pkt_init_rtm(ep, tx_entry, RXR_EAGER_TAGRTM_PKT, 0, pkt_entry);
+	ret = rxr_pkt_init_rtm(ep, tx_entry, RXR_EAGER_TAGRTM_PKT, 0, pkt_entry);
+	if (ret)
+		return ret;
 	assert(tx_entry->total_len == rxr_pkt_req_data_size(pkt_entry));
 	base_hdr = rxr_get_base_hdr(pkt_entry->pkt);
 	base_hdr->flags |= RXR_REQ_TAGGED;
@@ -384,17 +488,60 @@ ssize_t rxr_pkt_init_eager_tagrtm(struct rxr_ep *ep,
 	return 0;
 }
 
+ssize_t rxr_pkt_init_dc_eager_tagrtm(struct rxr_ep *ep,
+				     struct rxr_tx_entry *tx_entry,
+				     struct rxr_pkt_entry *pkt_entry)
+{
+	struct rxr_base_hdr *base_hdr;
+	struct rxr_dc_eager_tagrtm_hdr *dc_eager_tagrtm_hdr;
+	int ret;
+
+	ret = rxr_pkt_init_rtm(ep, tx_entry, RXR_DC_EAGER_TAGRTM_PKT, 0, pkt_entry);
+	if (ret)
+		return ret;
+	base_hdr = rxr_get_base_hdr(pkt_entry->pkt);
+	base_hdr->flags |= RXR_REQ_TAGGED;
+	rxr_pkt_rtm_settag(pkt_entry, tx_entry->tag);
+
+	dc_eager_tagrtm_hdr = rxr_get_dc_eager_tagrtm_hdr(pkt_entry->pkt);
+	dc_eager_tagrtm_hdr->hdr.send_id = tx_entry->tx_id;
+	return 0;
+}
+
 ssize_t rxr_pkt_init_medium_msgrtm(struct rxr_ep *ep,
 				   struct rxr_tx_entry *tx_entry,
 				   struct rxr_pkt_entry *pkt_entry)
 {
 	struct rxr_medium_rtm_base_hdr *rtm_hdr;
+	int ret;
+
+	ret = rxr_pkt_init_rtm(ep, tx_entry, RXR_MEDIUM_MSGRTM_PKT,
+			       tx_entry->bytes_sent, pkt_entry);
+	if (ret)
+		return ret;
 
-	rxr_pkt_init_rtm(ep, tx_entry, RXR_MEDIUM_MSGRTM_PKT,
-			 tx_entry->bytes_sent, pkt_entry);
 	rtm_hdr = rxr_get_medium_rtm_base_hdr(pkt_entry->pkt);
-	rtm_hdr->data_len = tx_entry->total_len;
-	rtm_hdr->offset = tx_entry->bytes_sent;
+	rtm_hdr->msg_length = tx_entry->total_len;
+	rtm_hdr->seg_offset = tx_entry->bytes_sent;
+	return 0;
+}
+
+ssize_t rxr_pkt_init_dc_medium_msgrtm(struct rxr_ep *ep,
+				      struct rxr_tx_entry *tx_entry,
+				      struct rxr_pkt_entry *pkt_entry)
+{
+	struct rxr_dc_medium_msgrtm_hdr *dc_medium_msgrtm_hdr;
+	int ret;
+
+	ret = rxr_pkt_init_rtm(ep, tx_entry, RXR_DC_MEDIUM_MSGRTM_PKT,
+			       tx_entry->bytes_sent, pkt_entry);
+	if (ret)
+		return ret;
+
+	dc_medium_msgrtm_hdr = rxr_get_dc_medium_msgrtm_hdr(pkt_entry->pkt);
+	dc_medium_msgrtm_hdr->hdr.msg_length = tx_entry->total_len;
+	dc_medium_msgrtm_hdr->hdr.seg_offset = tx_entry->bytes_sent;
+	dc_medium_msgrtm_hdr->hdr.send_id = tx_entry->tx_id;
 	return 0;
 }
 
@@ -403,69 +550,125 @@ ssize_t rxr_pkt_init_medium_tagrtm(struct rxr_ep *ep,
 				   struct rxr_pkt_entry *pkt_entry)
 {
 	struct rxr_medium_rtm_base_hdr *rtm_hdr;
+	int ret;
+
+	ret = rxr_pkt_init_rtm(ep, tx_entry, RXR_MEDIUM_TAGRTM_PKT,
+			       tx_entry->bytes_sent, pkt_entry);
+	if (ret)
+		return ret;
 
-	rxr_pkt_init_rtm(ep, tx_entry, RXR_MEDIUM_TAGRTM_PKT,
-			 tx_entry->bytes_sent, pkt_entry);
 	rtm_hdr = rxr_get_medium_rtm_base_hdr(pkt_entry->pkt);
-	rtm_hdr->data_len = tx_entry->total_len;
-	rtm_hdr->offset = tx_entry->bytes_sent;
+	rtm_hdr->msg_length = tx_entry->total_len;
+	rtm_hdr->seg_offset = tx_entry->bytes_sent;
 	rtm_hdr->hdr.flags |= RXR_REQ_TAGGED;
 	rxr_pkt_rtm_settag(pkt_entry, tx_entry->tag);
 	return 0;
 }
 
-void rxr_pkt_init_long_rtm(struct rxr_ep *ep,
-			   struct rxr_tx_entry *tx_entry,
-			   int pkt_type,
-			   struct rxr_pkt_entry *pkt_entry)
+ssize_t rxr_pkt_init_dc_medium_tagrtm(struct rxr_ep *ep,
+				      struct rxr_tx_entry *tx_entry,
+				      struct rxr_pkt_entry *pkt_entry)
 {
-	struct rxr_long_rtm_base_hdr *rtm_hdr;
+	struct rxr_dc_medium_tagrtm_hdr *dc_medium_tagrtm_hdr;
+	int ret;
+
+	ret = rxr_pkt_init_rtm(ep, tx_entry, RXR_DC_MEDIUM_TAGRTM_PKT,
+			       tx_entry->bytes_sent, pkt_entry);
+	if (ret)
+		return ret;
 
-	rxr_pkt_init_rtm(ep, tx_entry, pkt_type, 0, pkt_entry);
-	rtm_hdr = rxr_get_long_rtm_base_hdr(pkt_entry->pkt);
-	rtm_hdr->data_len = tx_entry->total_len;
-	rtm_hdr->tx_id = tx_entry->tx_id;
+	dc_medium_tagrtm_hdr = rxr_get_dc_medium_tagrtm_hdr(pkt_entry->pkt);
+	dc_medium_tagrtm_hdr->hdr.msg_length = tx_entry->total_len;
+	dc_medium_tagrtm_hdr->hdr.seg_offset = tx_entry->bytes_sent;
+	dc_medium_tagrtm_hdr->hdr.hdr.flags |= RXR_REQ_TAGGED;
+	dc_medium_tagrtm_hdr->hdr.send_id = tx_entry->tx_id;
+	rxr_pkt_rtm_settag(pkt_entry, tx_entry->tag);
+	return 0;
+}
+
+int rxr_pkt_init_longcts_rtm(struct rxr_ep *ep,
+			     struct rxr_tx_entry *tx_entry,
+			     int pkt_type,
+			     struct rxr_pkt_entry *pkt_entry)
+{
+	struct rxr_longcts_rtm_base_hdr *rtm_hdr;
+	int ret;
+
+	ret = rxr_pkt_init_rtm(ep, tx_entry, pkt_type, 0, pkt_entry);
+	if (ret)
+		return ret;
+
+	rtm_hdr = rxr_get_longcts_rtm_base_hdr(pkt_entry->pkt);
+	rtm_hdr->msg_length = tx_entry->total_len;
+	rtm_hdr->send_id = tx_entry->tx_id;
 	rtm_hdr->credit_request = tx_entry->credit_request;
+	return 0;
 }
 
-ssize_t rxr_pkt_init_long_msgrtm(struct rxr_ep *ep,
+ssize_t rxr_pkt_init_longcts_msgrtm(struct rxr_ep *ep,
 				 struct rxr_tx_entry *tx_entry,
 				 struct rxr_pkt_entry *pkt_entry)
 {
-	rxr_pkt_init_long_rtm(ep, tx_entry, RXR_LONG_MSGRTM_PKT, pkt_entry);
-	return 0;
+	return rxr_pkt_init_longcts_rtm(ep, tx_entry, RXR_LONGCTS_MSGRTM_PKT, pkt_entry);
+}
+
+ssize_t rxr_pkt_init_dc_longcts_msgrtm(struct rxr_ep *ep,
+				    struct rxr_tx_entry *tx_entry,
+				    struct rxr_pkt_entry *pkt_entry)
+{
+	return rxr_pkt_init_longcts_rtm(ep, tx_entry, RXR_DC_LONGCTS_MSGRTM_PKT, pkt_entry);
 }
 
-ssize_t rxr_pkt_init_long_tagrtm(struct rxr_ep *ep,
+ssize_t rxr_pkt_init_longcts_tagrtm(struct rxr_ep *ep,
 				 struct rxr_tx_entry *tx_entry,
 				 struct rxr_pkt_entry *pkt_entry)
 {
 	struct rxr_base_hdr *base_hdr;
+	int ret;
+
+	ret = rxr_pkt_init_longcts_rtm(ep, tx_entry, RXR_LONGCTS_TAGRTM_PKT, pkt_entry);
+	if (ret)
+		return ret;
 
-	rxr_pkt_init_long_rtm(ep, tx_entry, RXR_LONG_TAGRTM_PKT, pkt_entry);
 	base_hdr = rxr_get_base_hdr(pkt_entry->pkt);
 	base_hdr->flags |= RXR_REQ_TAGGED;
 	rxr_pkt_rtm_settag(pkt_entry, tx_entry->tag);
 	return 0;
 }
 
-ssize_t rxr_pkt_init_read_rtm(struct rxr_ep *ep,
+ssize_t rxr_pkt_init_dc_longcts_tagrtm(struct rxr_ep *ep,
+				    struct rxr_tx_entry *tx_entry,
+				    struct rxr_pkt_entry *pkt_entry)
+{
+	struct rxr_base_hdr *base_hdr;
+	int ret;
+
+	ret = rxr_pkt_init_longcts_rtm(ep, tx_entry, RXR_DC_LONGCTS_TAGRTM_PKT, pkt_entry);
+	if (ret)
+		return ret;
+	base_hdr = rxr_get_base_hdr(pkt_entry->pkt);
+	base_hdr->flags |= RXR_REQ_TAGGED;
+	rxr_pkt_rtm_settag(pkt_entry, tx_entry->tag);
+	return 0;
+}
+
+ssize_t rxr_pkt_init_longread_rtm(struct rxr_ep *ep,
 			      struct rxr_tx_entry *tx_entry,
 			      int pkt_type,
 			      struct rxr_pkt_entry *pkt_entry)
 {
-	struct rxr_read_rtm_base_hdr *rtm_hdr;
+	struct rxr_longread_rtm_base_hdr *rtm_hdr;
 	struct fi_rma_iov *read_iov;
 	size_t hdr_size;
 	int err;
 
 	rxr_pkt_init_req_hdr(ep, tx_entry, pkt_type, pkt_entry);
 
-	rtm_hdr = rxr_get_read_rtm_base_hdr(pkt_entry->pkt);
+	rtm_hdr = rxr_get_longread_rtm_base_hdr(pkt_entry->pkt);
 	rtm_hdr->hdr.flags |= RXR_REQ_MSG;
 	rtm_hdr->hdr.msg_id = tx_entry->msg_id;
-	rtm_hdr->data_len = tx_entry->total_len;
-	rtm_hdr->tx_id = tx_entry->tx_id;
+	rtm_hdr->msg_length = tx_entry->total_len;
+	rtm_hdr->send_id = tx_entry->tx_id;
 	rtm_hdr->read_iov_count = tx_entry->iov_count;
 
 	hdr_size = rxr_pkt_req_hdr_size(pkt_entry);
@@ -475,24 +678,25 @@ ssize_t rxr_pkt_init_read_rtm(struct rxr_ep *ep,
 		return err;
 
 	pkt_entry->pkt_size = hdr_size + tx_entry->iov_count * sizeof(struct fi_rma_iov);
+	pkt_entry->x_entry = tx_entry;
 	return 0;
 }
 
-ssize_t rxr_pkt_init_read_msgrtm(struct rxr_ep *ep,
+ssize_t rxr_pkt_init_longread_msgrtm(struct rxr_ep *ep,
 				 struct rxr_tx_entry *tx_entry,
 				 struct rxr_pkt_entry *pkt_entry)
 {
-	return rxr_pkt_init_read_rtm(ep, tx_entry, RXR_READ_MSGRTM_PKT, pkt_entry);
+	return rxr_pkt_init_longread_rtm(ep, tx_entry, RXR_LONGREAD_MSGRTM_PKT, pkt_entry);
 }
 
-ssize_t rxr_pkt_init_read_tagrtm(struct rxr_ep *ep,
+ssize_t rxr_pkt_init_longread_tagrtm(struct rxr_ep *ep,
 				 struct rxr_tx_entry *tx_entry,
 				 struct rxr_pkt_entry *pkt_entry)
 {
 	ssize_t err;
 	struct rxr_base_hdr *base_hdr;
 
-	err = rxr_pkt_init_read_rtm(ep, tx_entry, RXR_READ_TAGRTM_PKT, pkt_entry);
+	err = rxr_pkt_init_longread_rtm(ep, tx_entry, RXR_LONGREAD_TAGRTM_PKT, pkt_entry);
 	if (err)
 		return err;
 
@@ -518,7 +722,7 @@ void rxr_pkt_handle_medium_rtm_sent(struct rxr_ep *ep,
 	tx_entry->bytes_sent += rxr_pkt_req_data_size(pkt_entry);
 }
 
-void rxr_pkt_handle_long_rtm_sent(struct rxr_ep *ep,
+void rxr_pkt_handle_longcts_rtm_sent(struct rxr_ep *ep,
 				  struct rxr_pkt_entry *pkt_entry)
 {
 	struct rxr_tx_entry *tx_entry;
@@ -532,7 +736,7 @@ void rxr_pkt_handle_long_rtm_sent(struct rxr_ep *ep,
 	tx_entry->bytes_sent += rxr_pkt_req_data_size(pkt_entry);
 	assert(tx_entry->bytes_sent < tx_entry->total_len);
 
-	if (efa_is_cache_available(efa_domain) || efa_ep_is_cuda_mr(tx_entry->desc[0]))
+	if (tx_entry->desc[0] || efa_is_cache_available(efa_domain))
 		rxr_prepare_desc_send(rxr_ep_domain(ep), tx_entry);
 }
 
@@ -560,7 +764,7 @@ void rxr_pkt_handle_medium_rtm_send_completion(struct rxr_ep *ep,
 		rxr_cq_handle_tx_completion(ep, tx_entry);
 }
 
-void rxr_pkt_handle_long_rtm_send_completion(struct rxr_ep *ep,
+void rxr_pkt_handle_longcts_rtm_send_completion(struct rxr_ep *ep,
 					     struct rxr_pkt_entry *pkt_entry)
 {
 	struct rxr_tx_entry *tx_entry;
@@ -571,6 +775,18 @@ void rxr_pkt_handle_long_rtm_send_completion(struct rxr_ep *ep,
 		rxr_cq_handle_tx_completion(ep, tx_entry);
 }
 
+void rxr_pkt_handle_dc_longcts_rtm_send_completion(struct rxr_ep *ep,
+						struct rxr_pkt_entry *pkt_entry)
+{
+	struct rxr_tx_entry *tx_entry;
+
+	tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry;
+	tx_entry->bytes_acked += rxr_pkt_req_data_size(pkt_entry);
+	if (tx_entry->total_len == tx_entry->bytes_acked &&
+	    tx_entry->rxr_flags & RXR_RECEIPT_RECEIVED)
+		rxr_cq_handle_tx_completion(ep, tx_entry);
+}
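+
+/* Contrast with the non-DC variant above: without delivery complete, the
+ * tx completion is written as soon as all bytes are acked; with DC it
+ * additionally requires that the receiver's receipt has been observed
+ * (RXR_RECEIPT_RECEIVED).
+ */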
+
 /*
  *     proc() functions
  */
@@ -582,16 +798,23 @@ size_t rxr_pkt_rtm_total_len(struct rxr_pkt_entry *pkt_entry)
 	switch (base_hdr->type) {
 	case RXR_EAGER_MSGRTM_PKT:
 	case RXR_EAGER_TAGRTM_PKT:
+	case RXR_DC_EAGER_MSGRTM_PKT:
+	case RXR_DC_EAGER_TAGRTM_PKT:
 		return rxr_pkt_req_data_size(pkt_entry);
 	case RXR_MEDIUM_MSGRTM_PKT:
 	case RXR_MEDIUM_TAGRTM_PKT:
-		return rxr_get_medium_rtm_base_hdr(pkt_entry->pkt)->data_len;
-	case RXR_LONG_MSGRTM_PKT:
-	case RXR_LONG_TAGRTM_PKT:
-		return rxr_get_long_rtm_base_hdr(pkt_entry->pkt)->data_len;
-	case RXR_READ_MSGRTM_PKT:
-	case RXR_READ_TAGRTM_PKT:
-		return rxr_get_read_rtm_base_hdr(pkt_entry->pkt)->data_len;
+		return rxr_get_medium_rtm_base_hdr(pkt_entry->pkt)->msg_length;
+	case RXR_DC_MEDIUM_MSGRTM_PKT:
+	case RXR_DC_MEDIUM_TAGRTM_PKT:
+		return rxr_get_dc_medium_rtm_base_hdr(pkt_entry->pkt)->msg_length;
+	case RXR_LONGCTS_MSGRTM_PKT:
+	case RXR_LONGCTS_TAGRTM_PKT:
+	case RXR_DC_LONGCTS_MSGRTM_PKT:
+	case RXR_DC_LONGCTS_TAGRTM_PKT:
+		return rxr_get_longcts_rtm_base_hdr(pkt_entry->pkt)->msg_length;
+	case RXR_LONGREAD_MSGRTM_PKT:
+	case RXR_LONGREAD_TAGRTM_PKT:
+		return rxr_get_longread_rtm_base_hdr(pkt_entry->pkt)->msg_length;
 	default:
 		assert(0 && "Unknown REQ packet type\n");
 	}
@@ -599,8 +822,25 @@ size_t rxr_pkt_rtm_total_len(struct rxr_pkt_entry *pkt_entry)
 	return 0;
 }
 
-void rxr_pkt_rtm_init_rx_entry(struct rxr_pkt_entry *pkt_entry,
-			       struct rxr_rx_entry *rx_entry)
+/*
+ * @brief Update rx_entry with the following information from an RTM packet entry.
+ *            address:       this is necessary because the original address in
+ *                           rx_entry can be FI_ADDR_UNSPEC
+ *            cq_entry.data: for FI_REMOTE_CQ_DATA
+ *            msg_id:        message id
+ *            total_len:     the application might provide a buffer that is
+ *                           larger than the incoming message size.
+ *            tag:           sender's tag can be different from receiver's tag
+ *                           because matching only requires
+ *                           (sender_tag | ignore) == (receiver_tag | ignore)
+ *        This function is applied to both unexpected rx_entries (when they are
+ *        allocated) and expected rx_entries (when they are matched to an RTM)
+ *
+ * @param pkt_entry(input)  RTM packet entry
+ * @param rx_entry(input)   rx entry to be updated
+ */
+void rxr_pkt_rtm_update_rx_entry(struct rxr_pkt_entry *pkt_entry,
+				 struct rxr_rx_entry *rx_entry)
 {
 	struct rxr_base_hdr *base_hdr;
 
@@ -627,8 +867,7 @@ struct rxr_rx_entry *rxr_pkt_get_rtm_matched_rx_entry(struct rxr_ep *ep,
 	assert(match);
 	rx_entry = container_of(match, struct rxr_rx_entry, entry);
 	if (rx_entry->rxr_flags & RXR_MULTI_RECV_POSTED) {
-		rx_entry = rxr_ep_split_rx_entry(ep, rx_entry,
-						 NULL, pkt_entry);
+		rx_entry = rxr_msg_split_rx_entry(ep, rx_entry, NULL, pkt_entry);
 		if (OFI_UNLIKELY(!rx_entry)) {
 			FI_WARN(&rxr_prov, FI_LOG_CQ,
 				"RX entries exhausted.\n");
@@ -636,7 +875,7 @@ struct rxr_rx_entry *rxr_pkt_get_rtm_matched_rx_entry(struct rxr_ep *ep,
 			return NULL;
 		}
 	} else {
-		rxr_pkt_rtm_init_rx_entry(pkt_entry, rx_entry);
+		rxr_pkt_rtm_update_rx_entry(pkt_entry, rx_entry);
 	}
 
 	rx_entry->state = RXR_RX_MATCHED;
@@ -702,6 +941,20 @@ struct rxr_rx_entry *rxr_pkt_get_msgrtm_rx_entry(struct rxr_ep *ep,
 	dlist_func_t *match_func;
 	int pkt_type;
 
+	if ((*pkt_entry_ptr)->alloc_type == RXR_PKT_FROM_USER_BUFFER) {
+		/* If a pkt_entry is constructed from a user-supplied buffer,
+		 * the endpoint must be in zero-copy receive mode.
+		 */
+		assert(ep->use_zcpy_rx);
+		/* In this mode, an rx_entry is always created together
+		 * with this pkt_entry, and pkt_entry->x_entry points
+		 * to it. Thus we can skip the matching process and return
+		 * pkt_entry->x_entry right away.
+		 */
+		assert((*pkt_entry_ptr)->x_entry);
+		return (*pkt_entry_ptr)->x_entry;
+	}
+
 	if (ep->util_ep.caps & FI_DIRECTED_RECV)
 		match_func = &rxr_pkt_rtm_match_recv;
 	else
@@ -711,10 +964,10 @@ struct rxr_rx_entry *rxr_pkt_get_msgrtm_rx_entry(struct rxr_ep *ep,
 	                               *pkt_entry_ptr);
 	if (OFI_UNLIKELY(!match)) {
 		/*
-		 * rxr_ep_alloc_unexp_rx_entry_for_msgrtm() might release pkt_entry,
+		 * rxr_msg_alloc_unexp_rx_entry_for_msgrtm() might release pkt_entry,
 		 * thus we have to use pkt_entry_ptr here
 		 */
-		rx_entry = rxr_ep_alloc_unexp_rx_entry_for_msgrtm(ep, pkt_entry_ptr);
+		rx_entry = rxr_msg_alloc_unexp_rx_entry_for_msgrtm(ep, pkt_entry_ptr);
 		if (OFI_UNLIKELY(!rx_entry)) {
 			FI_WARN(&rxr_prov, FI_LOG_CQ,
 				"RX entries exhausted.\n");
@@ -727,7 +980,8 @@ struct rxr_rx_entry *rxr_pkt_get_msgrtm_rx_entry(struct rxr_ep *ep,
 	}
 
 	pkt_type = rxr_get_base_hdr((*pkt_entry_ptr)->pkt)->type;
-	if (pkt_type == RXR_MEDIUM_MSGRTM_PKT)
+	if (pkt_type == RXR_MEDIUM_MSGRTM_PKT ||
+	    pkt_type == RXR_DC_MEDIUM_MSGRTM_PKT)
 		rxr_pkt_rx_map_insert(ep, *pkt_entry_ptr, rx_entry);
 
 	return rx_entry;
@@ -751,10 +1005,10 @@ struct rxr_rx_entry *rxr_pkt_get_tagrtm_rx_entry(struct rxr_ep *ep,
 	                               *pkt_entry_ptr);
 	if (OFI_UNLIKELY(!match)) {
 		/*
-		 * rxr_ep_alloc_unexp_rx_entry_for_tagrtm() might release pkt_entry,
+		 * rxr_msg_alloc_unexp_rx_entry_for_tagrtm() might release pkt_entry,
 		 * thus we have to use pkt_entry_ptr here
 		 */
-		rx_entry = rxr_ep_alloc_unexp_rx_entry_for_tagrtm(ep, pkt_entry_ptr);
+		rx_entry = rxr_msg_alloc_unexp_rx_entry_for_tagrtm(ep, pkt_entry_ptr);
 		if (OFI_UNLIKELY(!rx_entry)) {
 			efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS);
 			return NULL;
@@ -764,23 +1018,24 @@ struct rxr_rx_entry *rxr_pkt_get_tagrtm_rx_entry(struct rxr_ep *ep,
 	}
 
 	pkt_type = rxr_get_base_hdr((*pkt_entry_ptr)->pkt)->type;
-	if (pkt_type == RXR_MEDIUM_TAGRTM_PKT)
+	if (pkt_type == RXR_MEDIUM_TAGRTM_PKT ||
+	    pkt_type == RXR_DC_MEDIUM_TAGRTM_PKT)
 		rxr_pkt_rx_map_insert(ep, *pkt_entry_ptr, rx_entry);
 
 	return rx_entry;
 }
 
-ssize_t rxr_pkt_proc_matched_read_rtm(struct rxr_ep *ep,
+ssize_t rxr_pkt_proc_matched_longread_rtm(struct rxr_ep *ep,
 				      struct rxr_rx_entry *rx_entry,
 				      struct rxr_pkt_entry *pkt_entry)
 {
-	struct rxr_read_rtm_base_hdr *rtm_hdr;
+	struct rxr_longread_rtm_base_hdr *rtm_hdr;
 	struct fi_rma_iov *read_iov;
 
-	rtm_hdr = rxr_get_read_rtm_base_hdr(pkt_entry->pkt);
+	rtm_hdr = rxr_get_longread_rtm_base_hdr(pkt_entry->pkt);
 	read_iov = (struct fi_rma_iov *)((char *)pkt_entry->pkt + rxr_pkt_req_hdr_size(pkt_entry));
 
-	rx_entry->tx_id = rtm_hdr->tx_id;
+	rx_entry->tx_id = rtm_hdr->send_id;
 	rx_entry->rma_iov_count = rtm_hdr->read_iov_count;
 	memcpy(rx_entry->rma_iov, read_iov,
 	       rx_entry->rma_iov_count * sizeof(struct fi_rma_iov));
@@ -790,7 +1045,7 @@ ssize_t rxr_pkt_proc_matched_read_rtm(struct rxr_ep *ep,
 	/* truncate rx_entry->iov to save memory registration pages because we
 	 * need to do memory registration for the receiving buffer.
 	 */
-	ofi_truncate_iov(rx_entry->iov, &rx_entry->iov_count, rx_entry->total_len);
+	ofi_truncate_iov(rx_entry->iov, &rx_entry->iov_count, rx_entry->total_len + ep->msg_prefix_size);
 	return rxr_read_post_remote_read_or_queue(ep, RXR_RX_ENTRY, rx_entry);
 }
 
@@ -808,23 +1063,26 @@ ssize_t rxr_pkt_proc_matched_medium_rtm(struct rxr_ep *ep,
 	while (cur) {
 		hdr_size = rxr_pkt_req_hdr_size(cur);
 		data = (char *)cur->pkt + hdr_size;
-		offset = rxr_get_medium_rtm_base_hdr(cur->pkt)->offset;
+		if (rx_entry->rxr_flags & RXR_DELIVERY_COMPLETE_REQUESTED)
+			offset = rxr_get_dc_medium_rtm_base_hdr(cur->pkt)->seg_offset;
+		else
+			offset = rxr_get_medium_rtm_base_hdr(cur->pkt)->seg_offset;
 		data_size = cur->pkt_size - hdr_size;
 
-		/* rxr_pkt_copy_to_rx() can release rx_entry, so
+		/* rxr_pkt_copy_data_to_rx_entry() can release rx_entry, so
 		 * bytes_received must be calculated before it.
 		 */
 		rx_entry->bytes_received += data_size;
 		if (rx_entry->total_len == rx_entry->bytes_received)
 			rxr_pkt_rx_map_remove(ep, cur, rx_entry);
 
-		/* rxr_pkt_copy_to_rx() will release cur, so
+		/* rxr_pkt_copy_data_to_rx_entry() will release cur, so
 		 * cur->next must be copied out before it.
 		 */
 		nxt = cur->next;
 		cur->next = NULL;
 
-		err = rxr_pkt_copy_to_rx(ep, rx_entry, offset, cur, data, data_size);
+		err = rxr_pkt_copy_data_to_rx_entry(ep, rx_entry, offset, cur, data, data_size);
 		if (err) {
 			rxr_pkt_entry_release_rx(ep, cur);
 			ret = err;
@@ -836,6 +1094,71 @@ ssize_t rxr_pkt_proc_matched_medium_rtm(struct rxr_ep *ep,
 	return ret;
 }
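+
+/* A medium RTM message arrives as a chain of packet entries, each carrying
+ * its segment offset (seg_offset) and the total message length; the loop
+ * above copies every segment to its offset in the receive buffer and
+ * removes the rx map entry once bytes_received reaches total_len.
+ */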
 
+/**
+ * @brief process a matched eager rtm packet entry
+ *
+ * For an eager message, this function will write the rx completion and
+ * release the packet entry and rx_entry.
+ *
+ * @param[in]	ep		endpoint
+ * @param[in]	rx_entry	RX entry
+ * @param[in]	pkt_entry	packet entry
+ * @return	On success, return 0
+ * 		On failure, return libfabric error code
+ */
+ssize_t rxr_pkt_proc_matched_eager_rtm(struct rxr_ep *ep,
+				       struct rxr_rx_entry *rx_entry,
+				       struct rxr_pkt_entry *pkt_entry)
+{
+	int err;
+	char *data;
+	size_t hdr_size, data_size;
+
+	hdr_size = rxr_pkt_req_hdr_size(pkt_entry);
+
+	if (pkt_entry->alloc_type != RXR_PKT_FROM_USER_BUFFER) {
+		data = (char *)pkt_entry->pkt + hdr_size;
+		data_size = pkt_entry->pkt_size - hdr_size;
+
+		/*
+		 * On success, rxr_pkt_copy_data_to_rx_entry will write the rx
+		 * completion and release pkt_entry and rx_entry.
+		 */
+		err = rxr_pkt_copy_data_to_rx_entry(ep, rx_entry, 0, pkt_entry, data, data_size);
+		if (err)
+			rxr_pkt_entry_release_rx(ep, pkt_entry);
+
+		return err;
+	}
+
+	/* In this case, the data is already in the user-provided buffer, so
+	 * there is no need to copy. However, we do need to make sure the
+	 * packet header length is correct; otherwise, the user will get
+	 * wrong data.
+	 *
+	 * The expected header size is
+	 * 	ep->msg_prefix_size - sizeof(struct rxr_pkt_entry)
+	 * because the first sizeof(struct rxr_pkt_entry) bytes of the buffer
+	 * were used to construct the pkt_entry.
+	 */
+	if (hdr_size != ep->msg_prefix_size - sizeof(struct rxr_pkt_entry)) {
+		/* if header size is wrong, the data in user buffer is not useful.
+		/* If the header size is wrong, the data in the user buffer is
+		 * not usable. Setting rx_entry->cq_entry.len to 0 here will
+		 * cause an error CQ entry to be written to the application.
+		 */
+	} else {
+		assert(rx_entry->cq_entry.buf == pkt_entry->pkt - sizeof(struct rxr_pkt_entry));
+		rx_entry->cq_entry.len = pkt_entry->pkt_size + sizeof(struct rxr_pkt_entry);
+	}
+
+	rxr_cq_write_rx_completion(ep, rx_entry);
+	rxr_release_rx_entry(ep, rx_entry);
+
+	/* no need to release packet entry because it is
+	 * constructed using user supplied buffer */
+	return 0;
+}
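+
+/* A sketch of the zero-copy receive buffer layout assumed above: the
+ * application posts a buffer whose first ep->msg_prefix_size bytes are
+ * reserved,
+ *
+ *   | struct rxr_pkt_entry | packet header | application data ... |
+ *
+ * which is why cq_entry.buf points sizeof(struct rxr_pkt_entry) bytes
+ * before pkt_entry->pkt, and cq_entry.len includes that prefix.
+ */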
+
 ssize_t rxr_pkt_proc_matched_rtm(struct rxr_ep *ep,
 				 struct rxr_rx_entry *rx_entry,
 				 struct rxr_pkt_entry *pkt_entry)
@@ -847,6 +1170,13 @@ ssize_t rxr_pkt_proc_matched_rtm(struct rxr_ep *ep,
 
 	assert(rx_entry->state == RXR_RX_MATCHED);
 
+	if (!rx_entry->peer) {
+		rx_entry->addr = pkt_entry->addr;
+		rx_entry->peer = rxr_ep_get_peer(ep, rx_entry->addr);
+		assert(rx_entry->peer);
+		dlist_insert_tail(&rx_entry->peer_entry, &rx_entry->peer->rx_entry_list);
+	}
+
 	/* Adjust rx_entry->cq_entry.len as needed.
 	 * Initially rx_entry->cq_entry.len is total recv buffer size.
 	 * rx_entry->total_len is from REQ packet and is total send buffer size.
@@ -858,39 +1188,59 @@ ssize_t rxr_pkt_proc_matched_rtm(struct rxr_ep *ep,
 		rx_entry->cq_entry.len = rx_entry->total_len;
 
 	pkt_type = rxr_get_base_hdr(pkt_entry->pkt)->type;
-	if (pkt_type == RXR_READ_MSGRTM_PKT || pkt_type == RXR_READ_TAGRTM_PKT)
-		return rxr_pkt_proc_matched_read_rtm(ep, rx_entry, pkt_entry);
 
-	if (pkt_type == RXR_MEDIUM_MSGRTM_PKT || pkt_type == RXR_MEDIUM_TAGRTM_PKT)
+	if (pkt_type > RXR_DC_REQ_PKT_BEGIN &&
+	    pkt_type < RXR_DC_REQ_PKT_END)
+		rx_entry->rxr_flags |= RXR_DELIVERY_COMPLETE_REQUESTED;
+
+	if (pkt_type == RXR_LONGCTS_MSGRTM_PKT ||
+	    pkt_type == RXR_LONGCTS_TAGRTM_PKT)
+		rx_entry->tx_id = rxr_get_longcts_rtm_base_hdr(pkt_entry->pkt)->send_id;
+	else if (pkt_type == RXR_DC_EAGER_MSGRTM_PKT ||
+		 pkt_type == RXR_DC_EAGER_TAGRTM_PKT)
+		rx_entry->tx_id = rxr_get_dc_eager_rtm_base_hdr(pkt_entry->pkt)->send_id;
+	else if (pkt_type == RXR_DC_MEDIUM_MSGRTM_PKT ||
+		 pkt_type == RXR_DC_MEDIUM_TAGRTM_PKT)
+		rx_entry->tx_id = rxr_get_dc_medium_rtm_base_hdr(pkt_entry->pkt)->send_id;
+	else if (pkt_type == RXR_DC_LONGCTS_MSGRTM_PKT ||
+		 pkt_type == RXR_DC_LONGCTS_TAGRTM_PKT)
+		rx_entry->tx_id = rxr_get_longcts_rtm_base_hdr(pkt_entry->pkt)->send_id;
+
+	rx_entry->msg_id = rxr_get_rtm_base_hdr(pkt_entry->pkt)->msg_id;
+
+	if (pkt_type == RXR_LONGREAD_MSGRTM_PKT || pkt_type == RXR_LONGREAD_TAGRTM_PKT)
+		return rxr_pkt_proc_matched_longread_rtm(ep, rx_entry, pkt_entry);
+
+	if (pkt_type == RXR_MEDIUM_MSGRTM_PKT ||
+	    pkt_type == RXR_MEDIUM_TAGRTM_PKT ||
+	    pkt_type == RXR_DC_MEDIUM_MSGRTM_PKT ||
+	    pkt_type == RXR_DC_MEDIUM_TAGRTM_PKT)
 		return rxr_pkt_proc_matched_medium_rtm(ep, rx_entry, pkt_entry);
 
+	if (pkt_type == RXR_EAGER_MSGRTM_PKT ||
+	    pkt_type == RXR_EAGER_TAGRTM_PKT ||
+	    pkt_type == RXR_DC_EAGER_MSGRTM_PKT ||
+	    pkt_type == RXR_DC_EAGER_TAGRTM_PKT) {
+		return rxr_pkt_proc_matched_eager_rtm(ep, rx_entry, pkt_entry);
+	}
+
 	hdr_size = rxr_pkt_req_hdr_size(pkt_entry);
 	data = (char *)pkt_entry->pkt + hdr_size;
 	data_size = pkt_entry->pkt_size - hdr_size;
 
 	rx_entry->bytes_received += data_size;
-	ret = rxr_pkt_copy_to_rx(ep, rx_entry, 0, pkt_entry, data, data_size);
+	ret = rxr_pkt_copy_data_to_rx_entry(ep, rx_entry, 0, pkt_entry, data, data_size);
 	if (ret) {
-		rxr_pkt_entry_release_rx(ep, pkt_entry);
 		return ret;
 	}
-
-	if (pkt_type == RXR_EAGER_MSGRTM_PKT || pkt_type == RXR_EAGER_TAGRTM_PKT) {
-		ret = 0;
-	} else {
-		/*
-		 * long message protocol
-		 */
 #if ENABLE_DEBUG
-		dlist_insert_tail(&rx_entry->rx_pending_entry, &ep->rx_pending_list);
-		ep->rx_pending++;
+	dlist_insert_tail(&rx_entry->rx_pending_entry, &ep->rx_pending_list);
+	ep->rx_pending++;
 #endif
-		rx_entry->state = RXR_RX_RECV;
-		rx_entry->tx_id = rxr_get_long_rtm_base_hdr(pkt_entry->pkt)->tx_id;
-		/* we have noticed using the default value achieve better bandwidth */
-		rx_entry->credit_request = rxr_env.tx_min_credits;
-		ret = rxr_pkt_post_ctrl_or_queue(ep, RXR_RX_ENTRY, rx_entry, RXR_CTS_PKT, 0);
-	}
+	rx_entry->state = RXR_RX_RECV;
+	/* we have noticed that using the default value achieves better bandwidth */
+	rx_entry->credit_request = rxr_env.tx_min_credits;
+	ret = rxr_pkt_post_ctrl_or_queue(ep, RXR_RX_ENTRY, rx_entry, RXR_CTS_PKT, 0);
 
 	return ret;
 }
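+
+/* For the long-CTS protocols that reach this point, the receiver answers
+ * the RTM with a CTS packet granting the sender credits (credit_request
+ * defaults to rxr_env.tx_min_credits); the sender then sends the remaining
+ * data in DATA packets, which are handled elsewhere in the provider.
+ */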
@@ -911,10 +1261,7 @@ ssize_t rxr_pkt_proc_msgrtm(struct rxr_ep *ep,
 	if (rx_entry->state == RXR_RX_MATCHED) {
 		err = rxr_pkt_proc_matched_rtm(ep, rx_entry, pkt_entry);
 		if (OFI_UNLIKELY(err)) {
-			if (rxr_cq_handle_rx_error(ep, rx_entry, err)) {
-				assert(0 && "cannot write cq error entry");
-				efa_eq_write_error(&ep->util_ep, -err, err);
-			}
+			rxr_cq_write_rx_error(ep, rx_entry, -err, -err);
 			rxr_pkt_entry_release_rx(ep, pkt_entry);
 			rxr_release_rx_entry(ep, rx_entry);
 			return err;
@@ -940,10 +1287,7 @@ ssize_t rxr_pkt_proc_tagrtm(struct rxr_ep *ep,
 	if (rx_entry->state == RXR_RX_MATCHED) {
 		err = rxr_pkt_proc_matched_rtm(ep, rx_entry, pkt_entry);
 		if (OFI_UNLIKELY(err)) {
-			if (rxr_cq_handle_rx_error(ep, rx_entry, err)) {
-				assert(0 && "cannot write error cq entry");
-				efa_eq_write_error(&ep->util_ep, -err, err);
-			}
+			rxr_cq_write_rx_error(ep, rx_entry, -err, -err);
 			rxr_pkt_entry_release_rx(ep, pkt_entry);
 			rxr_release_rx_entry(ep, rx_entry);
 			return err;
@@ -967,16 +1311,24 @@ ssize_t rxr_pkt_proc_rtm_rta(struct rxr_ep *ep,
 	switch (base_hdr->type) {
 	case RXR_EAGER_MSGRTM_PKT:
 	case RXR_MEDIUM_MSGRTM_PKT:
-	case RXR_LONG_MSGRTM_PKT:
-	case RXR_READ_MSGRTM_PKT:
+	case RXR_LONGCTS_MSGRTM_PKT:
+	case RXR_LONGREAD_MSGRTM_PKT:
+	case RXR_DC_EAGER_MSGRTM_PKT:
+	case RXR_DC_MEDIUM_MSGRTM_PKT:
+	case RXR_DC_LONGCTS_MSGRTM_PKT:
 		return rxr_pkt_proc_msgrtm(ep, pkt_entry);
 	case RXR_EAGER_TAGRTM_PKT:
 	case RXR_MEDIUM_TAGRTM_PKT:
-	case RXR_LONG_TAGRTM_PKT:
-	case RXR_READ_TAGRTM_PKT:
+	case RXR_LONGCTS_TAGRTM_PKT:
+	case RXR_LONGREAD_TAGRTM_PKT:
+	case RXR_DC_EAGER_TAGRTM_PKT:
+	case RXR_DC_MEDIUM_TAGRTM_PKT:
+	case RXR_DC_LONGCTS_TAGRTM_PKT:
 		return rxr_pkt_proc_tagrtm(ep, pkt_entry);
 	case RXR_WRITE_RTA_PKT:
 		return rxr_pkt_proc_write_rta(ep, pkt_entry);
+	case RXR_DC_WRITE_RTA_PKT:
+		return rxr_pkt_proc_dc_write_rta(ep, pkt_entry);
 	case RXR_FETCH_RTA_PKT:
 		return rxr_pkt_proc_fetch_rta(ep, pkt_entry);
 	case RXR_COMPARE_RTA_PKT:
@@ -985,70 +1337,28 @@ ssize_t rxr_pkt_proc_rtm_rta(struct rxr_ep *ep,
 		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
 			"Unknown packet type ID: %d\n",
 		       base_hdr->type);
-		if (rxr_cq_handle_cq_error(ep, -FI_EINVAL))
-			assert(0 && "failed to write err cq entry");
-	}
-
-	return -FI_EINVAL;
-}
-
-void rxr_pkt_handle_zcpy_recv(struct rxr_ep *ep,
-			      struct rxr_pkt_entry *pkt_entry)
-{
-	struct rxr_rx_entry *rx_entry;
-
-	struct rxr_base_hdr *base_hdr __attribute__((unused));
-	base_hdr = rxr_get_base_hdr(pkt_entry->pkt);
-	assert(base_hdr->type >= RXR_BASELINE_REQ_PKT_BEGIN);
-	assert(base_hdr->type != RXR_MEDIUM_MSGRTM_PKT);
-	assert(base_hdr->type != RXR_MEDIUM_TAGRTM_PKT);
-	assert(pkt_entry->type == RXR_PKT_ENTRY_USER);
-
-	rx_entry = rxr_pkt_get_msgrtm_rx_entry(ep, &pkt_entry);
-	if (OFI_UNLIKELY(!rx_entry)) {
-		efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS);
+		efa_eq_write_error(&ep->util_ep, FI_EINVAL, FI_EINVAL);
 		rxr_pkt_entry_release_rx(ep, pkt_entry);
-		return;
 	}
-	pkt_entry->x_entry = rx_entry;
-	if (rx_entry->state != RXR_RX_MATCHED)
-		return;
-
-	/*
-	 * The incoming receive will always get matched to the first posted
-	 * rx_entry available, so this is a constant cost. No real tag or
-	 * address matching happens.
-	 */
-	assert(rx_entry->state == RXR_RX_MATCHED);
-
-	/*
-	 * Adjust rx_entry->cq_entry.len as needed.
-	 * Initialy rx_entry->cq_entry.len is total recv buffer size.
-	 * rx_entry->total_len is from REQ packet and is total send buffer size.
-	 * if send buffer size < recv buffer size, we adjust value of rx_entry->cq_entry.len
-	 * if send buffer size > recv buffer size, we have a truncated message and will
-	 * write error CQ entry.
-	 */
-	if (rx_entry->cq_entry.len > rx_entry->total_len)
-		rx_entry->cq_entry.len = rx_entry->total_len;
 
-	rxr_cq_write_rx_completion(ep, rx_entry);
-	rxr_pkt_entry_release_rx(ep, pkt_entry);
-	rxr_release_rx_entry(ep, rx_entry);
+	return -FI_EINVAL;
 }
 
 void rxr_pkt_handle_rtm_rta_recv(struct rxr_ep *ep,
 				 struct rxr_pkt_entry *pkt_entry)
 {
 	struct rxr_base_hdr *base_hdr;
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 	bool need_ordering;
 	int ret, msg_id;
 
 	base_hdr = rxr_get_base_hdr(pkt_entry->pkt);
 	assert(base_hdr->type >= RXR_BASELINE_REQ_PKT_BEGIN);
 
-	if (base_hdr->type == RXR_MEDIUM_MSGRTM_PKT || base_hdr->type == RXR_MEDIUM_TAGRTM_PKT) {
+	if (base_hdr->type == RXR_MEDIUM_MSGRTM_PKT ||
+	    base_hdr->type == RXR_MEDIUM_TAGRTM_PKT ||
+	    base_hdr->type == RXR_DC_MEDIUM_MSGRTM_PKT ||
+	    base_hdr->type == RXR_DC_MEDIUM_TAGRTM_PKT) {
 		struct rxr_rx_entry *rx_entry;
 		struct rxr_pkt_entry *unexp_pkt_entry;
 
@@ -1069,6 +1379,7 @@ void rxr_pkt_handle_rtm_rta_recv(struct rxr_ep *ep,
 	need_ordering = false;
 	peer = rxr_ep_get_peer(ep, pkt_entry->addr);
 	assert(peer);
+
 	if (!peer->is_local) {
 		/*
  		 * only need to reorder msg for efa_ep
@@ -1098,7 +1409,7 @@ void rxr_pkt_handle_rtm_rta_recv(struct rxr_ep *ep,
 		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
 			"Invalid msg_id: %" PRIu32
 			" robuf->exp_msg_id: %" PRIu32 "\n",
-		       msg_id, peer->robuf->exp_msg_id);
+		       msg_id, peer->robuf.exp_msg_id);
 		efa_eq_write_error(&ep->util_ep, FI_EIO, ret);
 		rxr_pkt_entry_release_rx(ep, pkt_entry);
 		return;
@@ -1127,20 +1438,19 @@ void rxr_pkt_handle_rtm_rta_recv(struct rxr_ep *ep,
 	if (OFI_UNLIKELY(ret))
 		return;
 
-	ofi_recvwin_slide(peer->robuf);
+	ofi_recvwin_slide((&peer->robuf));
 	/* process pending items in reorder buff */
 	rxr_cq_proc_pending_items_in_recvwin(ep, peer);
 }
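+
+/* RTM/RTA packets from remote (non-local) peers are reordered per peer by
+ * msg_id against a receive window (peer->robuf): packets ahead of
+ * exp_msg_id are queued, and once the expected message is processed the
+ * window slides and pending in-window packets are drained.
+ */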
 
 /*
- * RTW pakcet type functions
+ * RTW packet type functions
  */
-void rxr_pkt_init_rtw_data(struct rxr_ep *ep,
-			   struct rxr_tx_entry *tx_entry,
-			   struct rxr_pkt_entry *pkt_entry,
-			   struct fi_rma_iov *rma_iov)
+int rxr_pkt_init_rtw_data(struct rxr_ep *ep,
+			  struct rxr_tx_entry *tx_entry,
+			  struct rxr_pkt_entry *pkt_entry,
+			  struct efa_rma_iov *rma_iov)
 {
-	char *data;
 	size_t hdr_size;
 	size_t data_size;
 	int i;
@@ -1152,12 +1462,8 @@ void rxr_pkt_init_rtw_data(struct rxr_ep *ep,
 	}
 
 	hdr_size = rxr_pkt_req_hdr_size(pkt_entry);
-	data = (char *)pkt_entry->pkt + hdr_size;
-	data_size = ofi_copy_from_iov(data, ep->mtu_size - hdr_size,
-				      tx_entry->iov, tx_entry->iov_count, 0);
-
-	pkt_entry->pkt_size = hdr_size + data_size;
-	pkt_entry->x_entry = tx_entry;
+	data_size = MIN(ep->mtu_size - hdr_size, tx_entry->total_len);
+	return rxr_pkt_init_data_from_tx_entry(ep, pkt_entry, hdr_size, tx_entry, 0, data_size);
 }
 
 ssize_t rxr_pkt_init_eager_rtw(struct rxr_ep *ep,
@@ -1171,45 +1477,86 @@ ssize_t rxr_pkt_init_eager_rtw(struct rxr_ep *ep,
 	rtw_hdr = (struct rxr_eager_rtw_hdr *)pkt_entry->pkt;
 	rtw_hdr->rma_iov_count = tx_entry->rma_iov_count;
 	rxr_pkt_init_req_hdr(ep, tx_entry, RXR_EAGER_RTW_PKT, pkt_entry);
-	rxr_pkt_init_rtw_data(ep, tx_entry, pkt_entry, rtw_hdr->rma_iov);
-	return 0;
+	return rxr_pkt_init_rtw_data(ep, tx_entry, pkt_entry, rtw_hdr->rma_iov);
 }
 
-ssize_t rxr_pkt_init_long_rtw(struct rxr_ep *ep,
-			      struct rxr_tx_entry *tx_entry,
-			      struct rxr_pkt_entry *pkt_entry)
+ssize_t rxr_pkt_init_dc_eager_rtw(struct rxr_ep *ep,
+				  struct rxr_tx_entry *tx_entry,
+				  struct rxr_pkt_entry *pkt_entry)
 {
-	struct rxr_long_rtw_hdr *rtw_hdr;
+	struct rxr_dc_eager_rtw_hdr *dc_eager_rtw_hdr;
+	int ret;
 
 	assert(tx_entry->op == ofi_op_write);
 
-	rtw_hdr = (struct rxr_long_rtw_hdr *)pkt_entry->pkt;
+	dc_eager_rtw_hdr = (struct rxr_dc_eager_rtw_hdr *)pkt_entry->pkt;
+	dc_eager_rtw_hdr->rma_iov_count = tx_entry->rma_iov_count;
+	rxr_pkt_init_req_hdr(ep, tx_entry, RXR_DC_EAGER_RTW_PKT, pkt_entry);
+	ret = rxr_pkt_init_rtw_data(ep, tx_entry, pkt_entry,
+				    dc_eager_rtw_hdr->rma_iov);
+	dc_eager_rtw_hdr->send_id = tx_entry->tx_id;
+	return ret;
+}
+
+static inline void rxr_pkt_init_longcts_rtw_hdr(struct rxr_ep *ep,
+					     struct rxr_tx_entry *tx_entry,
+					     struct rxr_pkt_entry *pkt_entry,
+					     int pkt_type)
+{
+	struct rxr_longcts_rtw_hdr *rtw_hdr;
+
+	rtw_hdr = (struct rxr_longcts_rtw_hdr *)pkt_entry->pkt;
 	rtw_hdr->rma_iov_count = tx_entry->rma_iov_count;
-	rtw_hdr->data_len = tx_entry->total_len;
-	rtw_hdr->tx_id = tx_entry->tx_id;
+	rtw_hdr->msg_length = tx_entry->total_len;
+	rtw_hdr->send_id = tx_entry->tx_id;
 	rtw_hdr->credit_request = tx_entry->credit_request;
-	rxr_pkt_init_req_hdr(ep, tx_entry, RXR_LONG_RTW_PKT, pkt_entry);
-	rxr_pkt_init_rtw_data(ep, tx_entry, pkt_entry, rtw_hdr->rma_iov);
-	return 0;
+	rxr_pkt_init_req_hdr(ep, tx_entry, pkt_type, pkt_entry);
 }
 
-ssize_t rxr_pkt_init_read_rtw(struct rxr_ep *ep,
+ssize_t rxr_pkt_init_longcts_rtw(struct rxr_ep *ep,
 			      struct rxr_tx_entry *tx_entry,
 			      struct rxr_pkt_entry *pkt_entry)
 {
-	struct rxr_read_rtw_hdr *rtw_hdr;
-	struct fi_rma_iov *rma_iov, *read_iov;
+	struct rxr_longcts_rtw_hdr *rtw_hdr;
+
+	assert(tx_entry->op == ofi_op_write);
+
+	rtw_hdr = (struct rxr_longcts_rtw_hdr *)pkt_entry->pkt;
+	rxr_pkt_init_longcts_rtw_hdr(ep, tx_entry, pkt_entry, RXR_LONGCTS_RTW_PKT);
+	return rxr_pkt_init_rtw_data(ep, tx_entry, pkt_entry, rtw_hdr->rma_iov);
+}
+
+ssize_t rxr_pkt_init_dc_longcts_rtw(struct rxr_ep *ep,
+				 struct rxr_tx_entry *tx_entry,
+				 struct rxr_pkt_entry *pkt_entry)
+{
+	struct rxr_longcts_rtw_hdr *rtw_hdr;
+
+	assert(tx_entry->op == ofi_op_write);
+
+	rtw_hdr = (struct rxr_longcts_rtw_hdr *)pkt_entry->pkt;
+	rxr_pkt_init_longcts_rtw_hdr(ep, tx_entry, pkt_entry, RXR_DC_LONGCTS_RTW_PKT);
+	return rxr_pkt_init_rtw_data(ep, tx_entry, pkt_entry, rtw_hdr->rma_iov);
+}
+
+ssize_t rxr_pkt_init_longread_rtw(struct rxr_ep *ep,
+			      struct rxr_tx_entry *tx_entry,
+			      struct rxr_pkt_entry *pkt_entry)
+{
+	struct rxr_longread_rtw_hdr *rtw_hdr;
+	struct efa_rma_iov *rma_iov;
+	struct fi_rma_iov *read_iov;
 	size_t hdr_size;
 	int i, err;
 
 	assert(tx_entry->op == ofi_op_write);
 
-	rtw_hdr = (struct rxr_read_rtw_hdr *)pkt_entry->pkt;
+	rtw_hdr = (struct rxr_longread_rtw_hdr *)pkt_entry->pkt;
 	rtw_hdr->rma_iov_count = tx_entry->rma_iov_count;
-	rtw_hdr->data_len = tx_entry->total_len;
-	rtw_hdr->tx_id = tx_entry->tx_id;
+	rtw_hdr->msg_length = tx_entry->total_len;
+	rtw_hdr->send_id = tx_entry->tx_id;
 	rtw_hdr->read_iov_count = tx_entry->iov_count;
-	rxr_pkt_init_req_hdr(ep, tx_entry, RXR_READ_RTW_PKT, pkt_entry);
+	rxr_pkt_init_req_hdr(ep, tx_entry, RXR_LONGREAD_RTW_PKT, pkt_entry);
 
 	rma_iov = rtw_hdr->rma_iov;
 	for (i = 0; i < tx_entry->rma_iov_count; ++i) {
@@ -1224,16 +1571,17 @@ ssize_t rxr_pkt_init_read_rtw(struct rxr_ep *ep,
 	if (OFI_UNLIKELY(err))
 		return err;
 
-	pkt_entry->pkt_size = hdr_size + tx_entry->iov_count * sizeof(struct fi_rma_iov);
+	pkt_entry->pkt_size = hdr_size + tx_entry->iov_count * sizeof(struct efa_rma_iov);
+	pkt_entry->x_entry = tx_entry;
 	return 0;
 }
 
 /*
  *     handle_sent() functions for RTW packet types
  *
- *         rxr_pkt_handle_long_rtw_sent() is empty and is defined in rxr_pkt_type_req.h
+ *         rxr_pkt_handle_longcts_rtw_sent() is empty and is defined in rxr_pkt_type_req.h
  */
-void rxr_pkt_handle_long_rtw_sent(struct rxr_ep *ep,
+void rxr_pkt_handle_longcts_rtw_sent(struct rxr_ep *ep,
 				  struct rxr_pkt_entry *pkt_entry)
 {
 	struct rxr_tx_entry *tx_entry;
@@ -1246,7 +1594,7 @@ void rxr_pkt_handle_long_rtw_sent(struct rxr_ep *ep,
 	tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry;
 	tx_entry->bytes_sent += rxr_pkt_req_data_size(pkt_entry);
 	assert(tx_entry->bytes_sent < tx_entry->total_len);
-	if (efa_is_cache_available(efa_domain) || efa_ep_is_cuda_mr(tx_entry->desc[0]))
+	if (tx_entry->desc[0] || efa_is_cache_available(efa_domain))
 		rxr_prepare_desc_send(rxr_ep_domain(ep), tx_entry);
 }
 
@@ -1263,7 +1611,7 @@ void rxr_pkt_handle_eager_rtw_send_completion(struct rxr_ep *ep,
 	rxr_cq_handle_tx_completion(ep, tx_entry);
 }
 
-void rxr_pkt_handle_long_rtw_send_completion(struct rxr_ep *ep,
+void rxr_pkt_handle_longcts_rtw_send_completion(struct rxr_ep *ep,
 					     struct rxr_pkt_entry *pkt_entry)
 {
 	struct rxr_tx_entry *tx_entry;
@@ -1274,6 +1622,18 @@ void rxr_pkt_handle_long_rtw_send_completion(struct rxr_ep *ep,
 		rxr_cq_handle_tx_completion(ep, tx_entry);
 }
 
+void rxr_pkt_handle_dc_longcts_rtw_send_completion(struct rxr_ep *ep,
+						struct rxr_pkt_entry *pkt_entry)
+{
+	struct rxr_tx_entry *tx_entry;
+
+	tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry;
+	tx_entry->bytes_acked += rxr_pkt_req_data_size(pkt_entry);
+	if (tx_entry->total_len == tx_entry->bytes_acked &&
+	    tx_entry->rxr_flags & RXR_RECEIPT_RECEIVED)
+		rxr_cq_handle_tx_completion(ep, tx_entry);
+}
+
 /*
  *     handle_recv() functions
  */
@@ -1284,10 +1644,8 @@ struct rxr_rx_entry *rxr_pkt_alloc_rtw_rx_entry(struct rxr_ep *ep,
 {
 	struct rxr_rx_entry *rx_entry;
 	struct rxr_base_hdr *base_hdr;
-	struct fi_msg msg = {0};
 
-	msg.addr = pkt_entry->addr;
-	rx_entry = rxr_ep_get_rx_entry(ep, &msg, 0, ~0, ofi_op_write, 0);
+	rx_entry = rxr_ep_alloc_rx_entry(ep, pkt_entry->addr, ofi_op_write);
 	if (OFI_UNLIKELY(!rx_entry))
 		return NULL;
 
@@ -1304,28 +1662,19 @@ struct rxr_rx_entry *rxr_pkt_alloc_rtw_rx_entry(struct rxr_ep *ep,
 	return rx_entry;
 }
 
-void rxr_pkt_handle_eager_rtw_recv(struct rxr_ep *ep,
-				   struct rxr_pkt_entry *pkt_entry)
+void rxr_pkt_proc_eager_rtw(struct rxr_ep *ep,
+			    struct efa_rma_iov *rma_iov,
+			    size_t rma_iov_count,
+			    struct rxr_rx_entry *rx_entry,
+			    struct rxr_pkt_entry *pkt_entry)
 {
-	struct rxr_rx_entry *rx_entry;
-	struct rxr_eager_rtw_hdr *rtw_hdr;
+	ssize_t err;
 	char *data;
 	size_t data_size, hdr_size;
-	ssize_t err;
 
-	rx_entry = rxr_pkt_alloc_rtw_rx_entry(ep, pkt_entry);
-	if (!rx_entry) {
-		FI_WARN(&rxr_prov, FI_LOG_CQ,
-			"RX entries exhausted.\n");
-		efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS);
-		rxr_pkt_entry_release_rx(ep, pkt_entry);
-		return;
-	}
+	err = rxr_rma_verified_copy_iov(ep, rma_iov, rma_iov_count,
+					FI_REMOTE_WRITE, rx_entry->iov, rx_entry->desc);
 
-	rtw_hdr = (struct rxr_eager_rtw_hdr *)pkt_entry->pkt;
-	rx_entry->iov_count = rtw_hdr->rma_iov_count;
-	err = rxr_rma_verified_copy_iov(ep, rtw_hdr->rma_iov, rtw_hdr->rma_iov_count,
-					FI_REMOTE_WRITE, rx_entry->iov);
 	if (OFI_UNLIKELY(err)) {
 		FI_WARN(&rxr_prov, FI_LOG_CQ, "RMA address verify failed!\n");
 		efa_eq_write_error(&ep->util_ep, FI_EIO, err);
@@ -1351,7 +1700,7 @@ void rxr_pkt_handle_eager_rtw_recv(struct rxr_ep *ep,
 			rx_entry->iov[0].iov_len);
 		err = FI_EINVAL;
 	} else {
-		err = rxr_pkt_copy_to_rx(ep, rx_entry, 0, pkt_entry, data, data_size);
+		err = rxr_pkt_copy_data_to_rx_entry(ep, rx_entry, 0, pkt_entry, data, data_size);
 	}
 
 	if (err) {
@@ -1361,14 +1710,64 @@ void rxr_pkt_handle_eager_rtw_recv(struct rxr_ep *ep,
 	}
 }
 
-void rxr_pkt_handle_long_rtw_recv(struct rxr_ep *ep,
+void rxr_pkt_handle_eager_rtw_recv(struct rxr_ep *ep,
+				   struct rxr_pkt_entry *pkt_entry)
+{
+	struct rxr_rx_entry *rx_entry;
+	struct rxr_eager_rtw_hdr *rtw_hdr;
+
+	rx_entry = rxr_pkt_alloc_rtw_rx_entry(ep, pkt_entry);
+
+	if (!rx_entry) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ,
+			"RX entries exhausted.\n");
+		efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS);
+		rxr_pkt_entry_release_rx(ep, pkt_entry);
+		return;
+	}
+
+	rtw_hdr = (struct rxr_eager_rtw_hdr *)pkt_entry->pkt;
+	rx_entry->iov_count = rtw_hdr->rma_iov_count;
+	rxr_pkt_proc_eager_rtw(ep,
+			       rtw_hdr->rma_iov,
+			       rtw_hdr->rma_iov_count,
+			       rx_entry, pkt_entry);
+}
+
+void rxr_pkt_handle_dc_eager_rtw_recv(struct rxr_ep *ep,
+				      struct rxr_pkt_entry *pkt_entry)
+{
+	struct rxr_rx_entry *rx_entry;
+	struct rxr_dc_eager_rtw_hdr *rtw_hdr;
+
+	rx_entry = rxr_pkt_alloc_rtw_rx_entry(ep, pkt_entry);
+	if (!rx_entry) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ,
+			"RX entries exhausted.\n");
+		efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS);
+		rxr_pkt_entry_release_rx(ep, pkt_entry);
+		return;
+	}
+
+	rx_entry->rxr_flags |= RXR_DELIVERY_COMPLETE_REQUESTED;
+	rtw_hdr = (struct rxr_dc_eager_rtw_hdr *)pkt_entry->pkt;
+	rx_entry->tx_id = rtw_hdr->send_id;
+	rx_entry->iov_count = rtw_hdr->rma_iov_count;
+	rxr_pkt_proc_eager_rtw(ep,
+			       rtw_hdr->rma_iov,
+			       rtw_hdr->rma_iov_count,
+			       rx_entry, pkt_entry);
+}
+
+void rxr_pkt_handle_longcts_rtw_recv(struct rxr_ep *ep,
 				  struct rxr_pkt_entry *pkt_entry)
 {
 	struct rxr_rx_entry *rx_entry;
-	struct rxr_long_rtw_hdr *rtw_hdr;
+	struct rxr_longcts_rtw_hdr *rtw_hdr;
 	char *data;
 	size_t hdr_size, data_size;
 	ssize_t err;
+	uint32_t tx_id;
 
 	rx_entry = rxr_pkt_alloc_rtw_rx_entry(ep, pkt_entry);
 	if (!rx_entry) {
@@ -1379,10 +1778,14 @@ void rxr_pkt_handle_long_rtw_recv(struct rxr_ep *ep,
 		return;
 	}
 
-	rtw_hdr = (struct rxr_long_rtw_hdr *)pkt_entry->pkt;
+	rtw_hdr = (struct rxr_longcts_rtw_hdr *)pkt_entry->pkt;
+	tx_id = rtw_hdr->send_id;
+	if (rtw_hdr->type == RXR_DC_LONGCTS_RTW_PKT)
+		rx_entry->rxr_flags |= RXR_DELIVERY_COMPLETE_REQUESTED;
+
 	rx_entry->iov_count = rtw_hdr->rma_iov_count;
 	err = rxr_rma_verified_copy_iov(ep, rtw_hdr->rma_iov, rtw_hdr->rma_iov_count,
-					FI_REMOTE_WRITE, rx_entry->iov);
+					FI_REMOTE_WRITE, rx_entry->iov, rx_entry->desc);
 	if (OFI_UNLIKELY(err)) {
 		FI_WARN(&rxr_prov, FI_LOG_CQ, "RMA address verify failed!\n");
 		efa_eq_write_error(&ep->util_ep, FI_EIO, err);
@@ -1408,7 +1811,7 @@ void rxr_pkt_handle_long_rtw_recv(struct rxr_ep *ep,
 			rx_entry->iov[0].iov_len);
 		err = FI_EINVAL;
 	} else {
-		err = rxr_pkt_copy_to_rx(ep, rx_entry, 0, pkt_entry, data, data_size);
+		err = rxr_pkt_copy_data_to_rx_entry(ep, rx_entry, 0, pkt_entry, data, data_size);
 	}
 
 	if (err) {
@@ -1423,21 +1826,21 @@ void rxr_pkt_handle_long_rtw_recv(struct rxr_ep *ep,
 	ep->rx_pending++;
 #endif
 	rx_entry->state = RXR_RX_RECV;
-	rx_entry->tx_id = rtw_hdr->tx_id;
+	rx_entry->tx_id = tx_id;
 	rx_entry->credit_request = rxr_env.tx_min_credits;
 	err = rxr_pkt_post_ctrl_or_queue(ep, RXR_RX_ENTRY, rx_entry, RXR_CTS_PKT, 0);
 	if (OFI_UNLIKELY(err)) {
 		FI_WARN(&rxr_prov, FI_LOG_CQ, "Cannot post CTS packet\n");
-		rxr_cq_handle_rx_error(ep, rx_entry, err);
+		rxr_cq_write_rx_error(ep, rx_entry, -err, -err);
 		rxr_release_rx_entry(ep, rx_entry);
 	}
 }
 
-void rxr_pkt_handle_read_rtw_recv(struct rxr_ep *ep,
+void rxr_pkt_handle_longread_rtw_recv(struct rxr_ep *ep,
 				  struct rxr_pkt_entry *pkt_entry)
 {
 	struct rxr_rx_entry *rx_entry;
-	struct rxr_read_rtw_hdr *rtw_hdr;
+	struct rxr_longread_rtw_hdr *rtw_hdr;
 	struct fi_rma_iov *read_iov;
 	size_t hdr_size;
 	ssize_t err;
@@ -1451,10 +1854,10 @@ void rxr_pkt_handle_read_rtw_recv(struct rxr_ep *ep,
 		return;
 	}
 
-	rtw_hdr = (struct rxr_read_rtw_hdr *)pkt_entry->pkt;
+	rtw_hdr = (struct rxr_longread_rtw_hdr *)pkt_entry->pkt;
 	rx_entry->iov_count = rtw_hdr->rma_iov_count;
 	err = rxr_rma_verified_copy_iov(ep, rtw_hdr->rma_iov, rtw_hdr->rma_iov_count,
-					FI_REMOTE_WRITE, rx_entry->iov);
+					FI_REMOTE_WRITE, rx_entry->iov, rx_entry->desc);
 	if (OFI_UNLIKELY(err)) {
 		FI_WARN(&rxr_prov, FI_LOG_CQ, "RMA address verify failed!\n");
 		efa_eq_write_error(&ep->util_ep, FI_EINVAL, -FI_EINVAL);
@@ -1471,7 +1874,7 @@ void rxr_pkt_handle_read_rtw_recv(struct rxr_ep *ep,
 	hdr_size = rxr_pkt_req_hdr_size(pkt_entry);
 	read_iov = (struct fi_rma_iov *)((char *)pkt_entry->pkt + hdr_size);
 	rx_entry->addr = pkt_entry->addr;
-	rx_entry->tx_id = rtw_hdr->tx_id;
+	rx_entry->tx_id = rtw_hdr->send_id;
 	rx_entry->rma_iov_count = rtw_hdr->read_iov_count;
 	memcpy(rx_entry->rma_iov, read_iov,
 	       rx_entry->rma_iov_count * sizeof(struct fi_rma_iov));
@@ -1503,9 +1906,9 @@ void rxr_pkt_init_rtr(struct rxr_ep *ep,
 	rtr_hdr = (struct rxr_rtr_hdr *)pkt_entry->pkt;
 	rtr_hdr->rma_iov_count = tx_entry->rma_iov_count;
 	rxr_pkt_init_req_hdr(ep, tx_entry, pkt_type, pkt_entry);
-	rtr_hdr->data_len = tx_entry->total_len;
-	rtr_hdr->read_req_rx_id = tx_entry->rma_loc_rx_id;
-	rtr_hdr->read_req_window = window;
+	rtr_hdr->msg_length = tx_entry->total_len;
+	rtr_hdr->recv_id = tx_entry->rma_loc_rx_id;
+	rtr_hdr->recv_length = window;
 	for (i = 0; i < tx_entry->rma_iov_count; ++i) {
 		rtr_hdr->rma_iov[i].addr = tx_entry->rma_iov[i].addr;
 		rtr_hdr->rma_iov[i].len = tx_entry->rma_iov[i].len;
@@ -1524,27 +1927,14 @@ ssize_t rxr_pkt_init_short_rtr(struct rxr_ep *ep,
 	return 0;
 }
 
-ssize_t rxr_pkt_init_long_rtr(struct rxr_ep *ep,
+ssize_t rxr_pkt_init_longcts_rtr(struct rxr_ep *ep,
 			      struct rxr_tx_entry *tx_entry,
 			      struct rxr_pkt_entry *pkt_entry)
 {
-	rxr_pkt_init_rtr(ep, tx_entry, RXR_LONG_RTR_PKT, tx_entry->rma_window, pkt_entry);
+	rxr_pkt_init_rtr(ep, tx_entry, RXR_LONGCTS_RTR_PKT, tx_entry->rma_window, pkt_entry);
 	return 0;
 }
 
-/*
- *     handle_sent() functions for RTR packet types
- */
-void rxr_pkt_handle_rtr_sent(struct rxr_ep *ep,
-			     struct rxr_pkt_entry *pkt_entry)
-{
-	struct rxr_tx_entry *tx_entry;
-
-	tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry;
-	tx_entry->bytes_sent = 0;
-	tx_entry->state = RXR_TX_WAIT_READ_FINISH;
-}
-
 /*
 *     handle_send_completion() function for RTR packet
  */
@@ -1568,10 +1958,8 @@ void rxr_pkt_handle_rtr_recv(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
 	struct rxr_rx_entry *rx_entry;
 	struct rxr_tx_entry *tx_entry;
 	ssize_t err;
-	struct fi_msg msg = {0};
 
-	msg.addr = pkt_entry->addr;
-	rx_entry = rxr_ep_get_rx_entry(ep, &msg, 0, ~0, ofi_op_read_rsp, 0);
+	rx_entry = rxr_ep_alloc_rx_entry(ep, pkt_entry->addr, ofi_op_read_rsp);
 	if (OFI_UNLIKELY(!rx_entry)) {
 		FI_WARN(&rxr_prov, FI_LOG_CQ,
 			"RX entries exhausted.\n");
@@ -1583,17 +1971,13 @@ void rxr_pkt_handle_rtr_recv(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
 	rx_entry->addr = pkt_entry->addr;
 	rx_entry->bytes_received = 0;
 	rx_entry->bytes_copied = 0;
-	rx_entry->cq_entry.flags |= (FI_RMA | FI_READ);
-	rx_entry->cq_entry.len = ofi_total_iov_len(rx_entry->iov, rx_entry->iov_count);
-	rx_entry->cq_entry.buf = rx_entry->iov[0].iov_base;
-	rx_entry->total_len = rx_entry->cq_entry.len;
 
 	rtr_hdr = (struct rxr_rtr_hdr *)pkt_entry->pkt;
-	rx_entry->rma_initiator_rx_id = rtr_hdr->read_req_rx_id;
-	rx_entry->window = rtr_hdr->read_req_window;
+	rx_entry->rma_initiator_rx_id = rtr_hdr->recv_id;
+	rx_entry->window = rtr_hdr->recv_length;
 	rx_entry->iov_count = rtr_hdr->rma_iov_count;
 	err = rxr_rma_verified_copy_iov(ep, rtr_hdr->rma_iov, rtr_hdr->rma_iov_count,
-					FI_REMOTE_READ, rx_entry->iov);
+					FI_REMOTE_READ, rx_entry->iov, rx_entry->desc);
 	if (OFI_UNLIKELY(err)) {
 		FI_WARN(&rxr_prov, FI_LOG_CQ, "RMA address verification failed!\n");
 		efa_eq_write_error(&ep->util_ep, FI_EINVAL, -FI_EINVAL);
@@ -1602,6 +1986,11 @@ void rxr_pkt_handle_rtr_recv(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
 		return;
 	}
 
+	rx_entry->cq_entry.flags |= (FI_RMA | FI_READ);
+	rx_entry->cq_entry.len = ofi_total_iov_len(rx_entry->iov, rx_entry->iov_count);
+	rx_entry->cq_entry.buf = rx_entry->iov[0].iov_base;
+	rx_entry->total_len = rx_entry->cq_entry.len;
+
 	tx_entry = rxr_rma_alloc_readrsp_tx_entry(ep, rx_entry);
 	if (OFI_UNLIKELY(!tx_entry)) {
 		FI_WARN(&rxr_prov, FI_LOG_CQ, "Readrsp tx entry exhausted!\n");
@@ -1631,7 +2020,7 @@ void rxr_pkt_handle_rtr_recv(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
 ssize_t rxr_pkt_init_rta(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry,
 			 int pkt_type, struct rxr_pkt_entry *pkt_entry)
 {
-	struct fi_rma_iov *rma_iov;
+	struct efa_rma_iov *rma_iov;
 	struct rxr_rta_hdr *rta_hdr;
 	char *data;
 	size_t hdr_size, data_size;
@@ -1642,7 +2031,6 @@ ssize_t rxr_pkt_init_rta(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry,
 	rta_hdr->rma_iov_count = tx_entry->rma_iov_count;
 	rta_hdr->atomic_datatype = tx_entry->atomic_hdr.datatype;
 	rta_hdr->atomic_op = tx_entry->atomic_hdr.atomic_op;
-	rta_hdr->tx_id = tx_entry->tx_id;
 	rxr_pkt_init_req_hdr(ep, tx_entry, pkt_type, pkt_entry);
 	rta_hdr->flags |= RXR_REQ_ATOMIC;
 	rma_iov = rta_hdr->rma_iov;
@@ -1669,10 +2057,26 @@ ssize_t rxr_pkt_init_write_rta(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry,
 	return 0;
 }
 
+ssize_t rxr_pkt_init_dc_write_rta(struct rxr_ep *ep,
+				  struct rxr_tx_entry *tx_entry,
+				  struct rxr_pkt_entry *pkt_entry)
+{
+	struct rxr_rta_hdr *rta_hdr;
+
+	rxr_pkt_init_rta(ep, tx_entry, RXR_DC_WRITE_RTA_PKT, pkt_entry);
+	rta_hdr = rxr_get_rta_hdr(pkt_entry->pkt);
+	rta_hdr->send_id = tx_entry->tx_id;
+	return 0;
+}
+
 ssize_t rxr_pkt_init_fetch_rta(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry,
 			      struct rxr_pkt_entry *pkt_entry)
 {
+	struct rxr_rta_hdr *rta_hdr;
+
 	rxr_pkt_init_rta(ep, tx_entry, RXR_FETCH_RTA_PKT, pkt_entry);
+	rta_hdr = rxr_get_rta_hdr(pkt_entry->pkt);
+	rta_hdr->recv_id = tx_entry->tx_id;
 	return 0;
 }
 
@@ -1681,9 +2085,11 @@ ssize_t rxr_pkt_init_compare_rta(struct rxr_ep *ep, struct rxr_tx_entry *tx_entr
 {
 	char *data;
 	size_t data_size;
+	struct rxr_rta_hdr *rta_hdr;
 
 	rxr_pkt_init_rta(ep, tx_entry, RXR_COMPARE_RTA_PKT, pkt_entry);
-
+	rta_hdr = rxr_get_rta_hdr(pkt_entry->pkt);
+	rta_hdr->recv_id = tx_entry->tx_id;
 	/* rxr_pkt_init_rta() will copy data from tx_entry->iov to pkt entry
 	 * the following appends the data to be compared
 	 */
@@ -1708,6 +2114,7 @@ int rxr_pkt_proc_write_rta(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
 {
 	struct iovec iov[RXR_IOV_LIMIT];
 	struct rxr_rta_hdr *rta_hdr;
+	void *desc[RXR_IOV_LIMIT];
 	char *data;
 	int iov_count, op, dt, i;
 	size_t dtsize, offset, hdr_size;
@@ -1716,11 +2123,14 @@ int rxr_pkt_proc_write_rta(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
 	op = rta_hdr->atomic_op;
 	dt = rta_hdr->atomic_datatype;
 	dtsize = ofi_datatype_size(dt);
-	
+	if (OFI_UNLIKELY(!dtsize)) {
+		return -errno;
+	}
+
 	hdr_size = rxr_pkt_req_hdr_size(pkt_entry);
 	data = (char *)pkt_entry->pkt + hdr_size;
 	iov_count = rta_hdr->rma_iov_count;
-	rxr_rma_verified_copy_iov(ep, rta_hdr->rma_iov, iov_count, FI_REMOTE_WRITE, iov);
+	rxr_rma_verified_copy_iov(ep, rta_hdr->rma_iov, iov_count, FI_REMOTE_WRITE, iov, desc);
 
 	offset = 0;
 	for (i = 0; i < iov_count; ++i) {
@@ -1738,26 +2148,29 @@ struct rxr_rx_entry *rxr_pkt_alloc_rta_rx_entry(struct rxr_ep *ep, struct rxr_pk
 {
 	struct rxr_rx_entry *rx_entry;
 	struct rxr_rta_hdr *rta_hdr;
-	struct fi_msg msg = {0};
 
-	msg.addr = pkt_entry->addr;
-	rx_entry = rxr_ep_get_rx_entry(ep, &msg, 0, ~0, op, 0);
+	rx_entry = rxr_ep_alloc_rx_entry(ep, pkt_entry->addr, op);
 	if (OFI_UNLIKELY(!rx_entry)) {
 		FI_WARN(&rxr_prov, FI_LOG_CQ,
 			"RX entries exhausted.\n");
 		return NULL;
 	}
 
+	if (op == ofi_op_atomic) {
+		rx_entry->addr = pkt_entry->addr;
+		return rx_entry;
+	}
+
 	rta_hdr = (struct rxr_rta_hdr *)pkt_entry->pkt;
 	rx_entry->atomic_hdr.atomic_op = rta_hdr->atomic_op;
 	rx_entry->atomic_hdr.datatype = rta_hdr->atomic_datatype;
 
 	rx_entry->iov_count = rta_hdr->rma_iov_count;
-	rxr_rma_verified_copy_iov(ep, rta_hdr->rma_iov, rx_entry->iov_count, FI_REMOTE_READ, rx_entry->iov);
-	rx_entry->tx_id = rta_hdr->tx_id;
+	rxr_rma_verified_copy_iov(ep, rta_hdr->rma_iov, rx_entry->iov_count,
+				  FI_REMOTE_READ, rx_entry->iov, rx_entry->desc);
 	rx_entry->total_len = ofi_total_iov_len(rx_entry->iov, rx_entry->iov_count);
 	/*
-	 * prepare a pkt entry to temporarily hold response data.
+	 * prepare a buffer to hold response data.
 	 * Atomic_op operates on 3 data buffers:
 	 *          local_data (input/output),
 	 *          request_data (input),
@@ -1765,13 +2178,13 @@ struct rxr_rx_entry *rxr_pkt_alloc_rta_rx_entry(struct rxr_ep *ep, struct rxr_pk
 	 * The fact local data will be changed by atomic_op means
 	 * response_data is not reproducible.
 	 * Because sending response packet can fail due to
-	 * -FI_EAGAIN, we need a temporary buffer to hold response_data.
-	 * This packet entry will be release in rxr_handle_atomrsp_send_completion()
+	 * -FI_EAGAIN, we need a buffer to hold response_data.
+	 * The buffer will be released in rxr_handle_atomrsp_send_completion()
 	 */
-	rx_entry->atomrsp_pkt = rxr_pkt_entry_alloc(ep, ep->tx_pkt_efa_pool);
-	if (!rx_entry->atomrsp_pkt) {
+	rx_entry->atomrsp_data = ofi_buf_alloc(ep->rx_atomrsp_pool);
+	if (!rx_entry->atomrsp_data) {
 		FI_WARN(&rxr_prov, FI_LOG_CQ,
-			"pkt entries exhausted.\n");
+			"atomic repsonse buffer pool exhausted.\n");
 		rxr_release_rx_entry(ep, rx_entry);
 		return NULL;
 	}
@@ -1779,6 +2192,48 @@ struct rxr_rx_entry *rxr_pkt_alloc_rta_rx_entry(struct rxr_ep *ep, struct rxr_pk
 	return rx_entry;
 }
 
+int rxr_pkt_proc_dc_write_rta(struct rxr_ep *ep,
+			      struct rxr_pkt_entry *pkt_entry)
+{
+	struct rxr_rx_entry *rx_entry;
+	struct rxr_rta_hdr *rta_hdr;
+	ssize_t err;
+	int ret;
+
+	rx_entry = rxr_pkt_alloc_rta_rx_entry(ep, pkt_entry, ofi_op_atomic);
+	if (OFI_UNLIKELY(!rx_entry)) {
+		efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS);
+		rxr_pkt_entry_release_rx(ep, pkt_entry);
+		return -FI_ENOBUFS;
+	}
+
+	rta_hdr = (struct rxr_rta_hdr *)pkt_entry->pkt;
+	rx_entry->tx_id = rta_hdr->send_id;
+	rx_entry->rxr_flags |= RXR_DELIVERY_COMPLETE_REQUESTED;
+
+	ret = rxr_pkt_proc_write_rta(ep, pkt_entry);
+	if (OFI_UNLIKELY(ret)) {
+		FI_WARN(&rxr_prov,
+			FI_LOG_CQ,
+			"Error while processing the write rta packet\n");
+		return ret;
+	}
+
+	err = rxr_pkt_post_ctrl_or_queue(ep,
+					 RXR_RX_ENTRY,
+					 rx_entry,
+					 RXR_RECEIPT_PKT, 0);
+	if (OFI_UNLIKELY(err)) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ,
+			"Posting of receipt packet failed! err=%s\n",
+			fi_strerror(err));
+		rxr_cq_write_rx_error(ep, rx_entry, -err, -err);
+		return err;
+	}
+
+	return ret;
+}
+
 int rxr_pkt_proc_fetch_rta(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
 {
 	struct rxr_rx_entry *rx_entry;
@@ -1793,27 +2248,28 @@ int rxr_pkt_proc_fetch_rta(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
 		return -FI_ENOBUFS;
 	}
 
+	rx_entry->tx_id = rxr_get_rta_hdr(pkt_entry->pkt)->recv_id;
 	op = rx_entry->atomic_hdr.atomic_op;
  	dt = rx_entry->atomic_hdr.datatype;	
 	dtsize = ofi_datatype_size(rx_entry->atomic_hdr.datatype);
+	if (OFI_UNLIKELY(!dtsize)) {
+		return -errno;
+	}
 
 	data = (char *)pkt_entry->pkt + rxr_pkt_req_hdr_size(pkt_entry);
-	rx_entry->atomrsp_buf = (char *)rx_entry->atomrsp_pkt->pkt + sizeof(struct rxr_atomrsp_hdr);
 
 	offset = 0;
 	for (i = 0; i < rx_entry->iov_count; ++i) {
 		ofi_atomic_readwrite_handlers[op][dt](rx_entry->iov[i].iov_base,
 						      data + offset,
-						      rx_entry->atomrsp_buf + offset,
+						      rx_entry->atomrsp_data + offset,
 						      rx_entry->iov[i].iov_len / dtsize);
 		offset += rx_entry->iov[i].iov_len;
 	}
 
 	err = rxr_pkt_post_ctrl_or_queue(ep, RXR_RX_ENTRY, rx_entry, RXR_ATOMRSP_PKT, 0);
-	if (OFI_UNLIKELY(err)) {
-		if (rxr_cq_handle_rx_error(ep, rx_entry, err))
-			assert(0 && "Cannot handle rx error");
-	}
+	if (OFI_UNLIKELY(err))
+		rxr_cq_write_rx_error(ep, rx_entry, -err, -err);
 
 	rxr_pkt_entry_release_rx(ep, pkt_entry);
 	return 0;
@@ -1834,28 +2290,72 @@ int rxr_pkt_proc_compare_rta(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
 		return -FI_ENOBUFS;
 	}
 
+	rx_entry->tx_id = rxr_get_rta_hdr(pkt_entry->pkt)->recv_id;
 	op = rx_entry->atomic_hdr.atomic_op;
 	dt = rx_entry->atomic_hdr.datatype;
-       	dtsize = ofi_datatype_size(rx_entry->atomic_hdr.datatype);
+	dtsize = ofi_datatype_size(rx_entry->atomic_hdr.datatype);
+	if (OFI_UNLIKELY(!dtsize)) {
+		efa_eq_write_error(&ep->util_ep, FI_EINVAL, -errno);
+		rxr_release_rx_entry(ep, rx_entry);
+		rxr_pkt_entry_release_rx(ep, pkt_entry);
+		return -errno;
+	}
 
 	src_data = (char *)pkt_entry->pkt + rxr_pkt_req_hdr_size(pkt_entry);
 	cmp_data = src_data + rx_entry->total_len;
-	rx_entry->atomrsp_buf = (char *)rx_entry->atomrsp_pkt->pkt + sizeof(struct rxr_atomrsp_hdr);
-
 	offset = 0;
-	for (i = 0; i < rx_entry->iov_count; ++i) {
-		ofi_atomic_swap_handlers[op - FI_CSWAP][dt](rx_entry->iov[i].iov_base,
-							    src_data + offset,
-							    cmp_data + offset,
-							    rx_entry->atomrsp_buf + offset,
-							    rx_entry->iov[i].iov_len / dtsize);
-		offset += rx_entry->iov[i].iov_len;
+
+#ifdef HAVE___INT128
+	/*
+	 * Perform a check here on the datatype and then a copy if this is a
+	 * 128-bit integer (otherwise, take the normal code path). We have to
+	 * do this because of the way our buffers are laid out in memory.
+	 * Unfortunately they are not aligned at 16 bytes, which is required
+	 * when using optimized instructions.
+	 */
+	if (dt == FI_INT128) {
+		for (i = 0; i < rx_entry->iov_count; ++i) {
+			ofi_int128_t src, cmp;
+			memcpy(&src, src_data + offset, sizeof(ofi_int128_t));
+			memcpy(&cmp, cmp_data + offset, sizeof(ofi_int128_t));
+
+			ofi_atomic_swap_handler(op, dt, rx_entry->iov[i].iov_base,
+									&src,
+									&cmp,
+									rx_entry->atomrsp_data + offset,
+									rx_entry->iov[i].iov_len / dtsize);
+			offset += rx_entry->iov[i].iov_len;
+		}
+	} else if (dt == FI_UINT128) {
+		for (i = 0; i < rx_entry->iov_count; ++i) {
+			ofi_uint128_t src, cmp;
+			memcpy(&src, src_data + offset, sizeof(ofi_uint128_t));
+			memcpy(&cmp, cmp_data + offset, sizeof(ofi_uint128_t));
+			ofi_atomic_swap_handler(op, dt, rx_entry->iov[i].iov_base,
+									&src,
+									&cmp,
+									rx_entry->atomrsp_data + offset,
+									rx_entry->iov[i].iov_len / dtsize);
+			offset += rx_entry->iov[i].iov_len;
+		}
+	} else {
+#endif
+		for (i = 0; i < rx_entry->iov_count; ++i) {
+			ofi_atomic_swap_handler(op, dt, rx_entry->iov[i].iov_base,
+									src_data + offset,
+									cmp_data + offset,
+									rx_entry->atomrsp_data + offset,
+									rx_entry->iov[i].iov_len / dtsize);
+			offset += rx_entry->iov[i].iov_len;
+		}
+#ifdef HAVE___INT128
 	}
+#endif
 
 	err = rxr_pkt_post_ctrl_or_queue(ep, RXR_RX_ENTRY, rx_entry, RXR_ATOMRSP_PKT, 0);
 	if (OFI_UNLIKELY(err)) {
 		efa_eq_write_error(&ep->util_ep, FI_EIO, err);
-		rxr_pkt_entry_release_tx(ep, rx_entry->atomrsp_pkt);
+		ofi_buf_free(rx_entry->atomrsp_data);
 		rxr_release_rx_entry(ep, rx_entry);
 		rxr_pkt_entry_release_rx(ep, pkt_entry);
 		return err;
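
The alignment caveat motivating the `memcpy` branches in the compare-swap hunk above is worth spelling out: casting a packet-buffer pointer to `ofi_int128_t *` and dereferencing it lets the compiler assume 16-byte alignment and emit aligned vector loads, which can fault (or mis-load) on the provider's unaligned receive buffers. A minimal standalone sketch of the safe-load pattern, assuming `ofi_int128_t` is a typedef for `__int128` as in the ofi headers:

```c
#include <string.h>

typedef __int128 ofi_int128_t; /* assumption: mirrors the ofi typedef */

/* Load a 128-bit value from a possibly unaligned buffer. memcpy has
 * defined behavior for any alignment, so the compiler emits
 * unaligned-safe loads instead of assuming 16-byte alignment. */
static inline ofi_int128_t load_int128_unaligned(const void *buf)
{
	ofi_int128_t val;

	memcpy(&val, buf, sizeof(val));
	return val;
}
```
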
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_req.h b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_req.h
index 401674aaa3d5ed73d1a4913279c5ae9c4b243b22..e99c68096b5a95edc9892cf22e45097d2eef6ed0 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_req.h
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_req.h
@@ -34,93 +34,30 @@
 #ifndef _RXR_PKT_TYPE_REQ_H
 #define _RXR_PKT_TYPE_REQ_H
 
-/*
- * This file contain REQ packet type related struct and functions
- * REQ packets can be classifed into 4 categories:
- *    RTM (Request To Message) is used by message
- *    RTW (Request To Write) is used by RMA write
- *    RTR (Request To Read) is used by RMA read
- *    RTA (Request To Atomic) is used by Atomic
- *
- * For each REQ packet type need to have the following:
- *
- *     1. a header struct
- *     2. an init() function called by rxr_pkt_init_ctrl()
- *     3. a handle_sent() function called by rxr_pkt_post_ctrl()
- *     4. a handle_send_completion() function called by
- *               rxr_pkt_handle_send_completion()
- *     5. a proc() function called by
- *               rxr_pkt_proc_req()
- *
- * Some req packet types are so similar that they can share
- * some functions.
- */
-
-/*
- * Utilities shared by all REQ packets
- *
- *     Packet Header Flags
- */
-#define RXR_REQ_OPT_RAW_ADDR_HDR	BIT_ULL(0)
-#define RXR_REQ_OPT_CQ_DATA_HDR		BIT_ULL(1)
-#define RXR_REQ_MSG			BIT_ULL(2)
-#define RXR_REQ_TAGGED			BIT_ULL(3)
-#define RXR_REQ_RMA			BIT_ULL(4)
-#define RXR_REQ_ATOMIC			BIT_ULL(5)
-
-/*
- *     Extra Feature Flags
- */
-#define RXR_REQ_FEATURE_RDMA_READ	BIT_ULL(0)
-
-/*
- *     Utility struct and functions for
- *             REQ packet types
- */
-struct rxr_req_opt_raw_addr_hdr {
-	uint32_t addr_len;
-	char raw_addr[0];
-};
+#define RXR_MSG_PREFIX_SIZE (sizeof(struct rxr_pkt_entry) + sizeof(struct rxr_eager_msgrtm_hdr) + RXR_REQ_OPT_RAW_ADDR_HDR_SIZE)
 
-struct rxr_req_opt_cq_data_hdr {
-	int64_t cq_data;
-};
+#if defined(static_assert) && defined(__x86_64__)
+static_assert(RXR_MSG_PREFIX_SIZE % 8 == 0, "message prefix size alignment check");
+#endif
 
 void *rxr_pkt_req_raw_addr(struct rxr_pkt_entry *pkt_entry);
 
 int64_t rxr_pkt_req_cq_data(struct rxr_pkt_entry *pkt_entry);
 
+uint32_t *rxr_pkt_req_connid_ptr(struct rxr_pkt_entry *pkt_entry);
+
 size_t rxr_pkt_req_hdr_size(struct rxr_pkt_entry *pkt_entry);
 
 size_t rxr_pkt_req_base_hdr_size(struct rxr_pkt_entry *pkt_entry);
 
+size_t rxr_pkt_req_header_size(int pkt_type, uint16_t flags, size_t rma_iov_count);
+
 size_t rxr_pkt_req_max_header_size(int pkt_type);
 
 size_t rxr_pkt_max_header_size(void);
 
-size_t rxr_pkt_req_max_data_size(struct rxr_ep *ep, fi_addr_t addr, int pkt_type);
-
-/*
- * Structs and funcitons for RTM (Message) packet types
- * There are 4 message protocols
- *         Eager message protocol,
- *         Medium message protocol,
- *         Long message protocol,
- *         Read message protocol (message by read)
- * Each protocol employes two packet types: non-tagged and tagged.
- * Thus altogether there are 8 RTM packet types.
- */
-
-/*
- *   Utility structs and functions shared by all
- *   RTM packet types
- */
-struct rxr_rtm_base_hdr {
-	uint8_t type;
-	uint8_t version;
-	uint16_t flags;
-	uint32_t msg_id;
-};
+size_t rxr_pkt_req_max_data_size(struct rxr_ep *ep, fi_addr_t addr, int pkt_type,
+				 uint64_t fi_flags, size_t rma_iov_count);
 
 static inline
 struct rxr_rtm_base_hdr *rxr_get_rtm_base_hdr(void *pkt)
@@ -169,98 +106,76 @@ void rxr_pkt_rtm_settag(struct rxr_pkt_entry *pkt_entry, uint64_t tag)
 	*tagptr = tag;
 }
 
-/*
- *   Header structs for each REQ packe type
- */
-struct rxr_eager_msgrtm_hdr {
-	struct rxr_rtm_base_hdr hdr;
-};
-
-struct rxr_eager_tagrtm_hdr {
-	struct rxr_rtm_base_hdr hdr;
-	uint64_t tag;
-};
-
-struct rxr_medium_rtm_base_hdr {
-	struct rxr_rtm_base_hdr hdr;
-	uint64_t data_len;
-	uint64_t offset;
-};
-
-struct rxr_medium_msgrtm_hdr {
-	struct rxr_medium_rtm_base_hdr hdr;
-};
-
-struct rxr_medium_tagrtm_hdr {
-	struct rxr_medium_rtm_base_hdr hdr;
-	uint64_t tag;
-};
-
 static inline
-struct rxr_medium_rtm_base_hdr *rxr_get_medium_rtm_base_hdr(void *pkt)
+struct rxr_dc_eager_rtm_base_hdr *rxr_get_dc_eager_rtm_base_hdr(void *pkt)
 {
-	return (struct rxr_medium_rtm_base_hdr *)pkt;
+	return (struct rxr_dc_eager_rtm_base_hdr *)pkt;
 }
 
-struct rxr_long_rtm_base_hdr {
-	struct rxr_rtm_base_hdr hdr;
-	uint64_t data_len;
-	uint32_t tx_id;
-	uint32_t credit_request;
-};
+static inline
+struct rxr_dc_eager_msgrtm_hdr *rxr_get_dc_eager_msgrtm_hdr(void *pkt)
+{
+	return (struct rxr_dc_eager_msgrtm_hdr *)pkt;
+}
 
 static inline
-struct rxr_long_rtm_base_hdr *rxr_get_long_rtm_base_hdr(void *pkt)
+struct rxr_dc_eager_tagrtm_hdr *rxr_get_dc_eager_tagrtm_hdr(void *pkt)
 {
-	return (struct rxr_long_rtm_base_hdr *)pkt;
+	return (struct rxr_dc_eager_tagrtm_hdr *)pkt;
 }
 
-struct rxr_long_msgrtm_hdr {
-	struct rxr_long_rtm_base_hdr hdr;
-};
+static inline
+struct rxr_medium_rtm_base_hdr *rxr_get_medium_rtm_base_hdr(void *pkt)
+{
+	return (struct rxr_medium_rtm_base_hdr *)pkt;
+}
 
-struct rxr_long_tagrtm_hdr {
-	struct rxr_long_rtm_base_hdr hdr;
-	uint64_t tag;
-};
+static inline
+struct rxr_dc_medium_rtm_base_hdr *rxr_get_dc_medium_rtm_base_hdr(void *pkt)
+{
+	return (struct rxr_dc_medium_rtm_base_hdr *)pkt;
+}
 
-struct rxr_read_rtm_base_hdr {
-	struct rxr_rtm_base_hdr hdr;
-	uint64_t data_len;
-	uint32_t tx_id;
-	uint32_t read_iov_count;
-};
+static inline
+struct rxr_dc_medium_msgrtm_hdr *rxr_get_dc_medium_msgrtm_hdr(void *pkt)
+{
+	return (struct rxr_dc_medium_msgrtm_hdr *)pkt;
+}
 
 static inline
-struct rxr_read_rtm_base_hdr *rxr_get_read_rtm_base_hdr(void *pkt)
+struct rxr_dc_medium_tagrtm_hdr *rxr_get_dc_medium_tagrtm_hdr(void *pkt)
 {
-	return (struct rxr_read_rtm_base_hdr *)pkt;
+	return (struct rxr_dc_medium_tagrtm_hdr *)pkt;
 }
 
-struct rxr_read_msgrtm_hdr {
-	struct rxr_read_rtm_base_hdr hdr;
-};
+static inline
+struct rxr_longcts_rtm_base_hdr *rxr_get_longcts_rtm_base_hdr(void *pkt)
+{
+	return (struct rxr_longcts_rtm_base_hdr *)pkt;
+}
 
-struct rxr_read_tagrtm_hdr {
-	struct rxr_read_rtm_base_hdr hdr;
-	uint64_t tag;
-};
+static inline
+struct rxr_longread_rtm_base_hdr *rxr_get_longread_rtm_base_hdr(void *pkt)
+{
+	return (struct rxr_longread_rtm_base_hdr *)pkt;
+}
 
 static inline
-int rxr_read_rtm_pkt_type(int op)
+int rxr_longread_rtm_pkt_type(int op)
 {
 	assert(op == ofi_op_tagged || op == ofi_op_msg);
-	return (op == ofi_op_tagged) ? RXR_READ_TAGRTM_PKT
-				     : RXR_READ_MSGRTM_PKT;
+	return (op == ofi_op_tagged) ? RXR_LONGREAD_TAGRTM_PKT
+				     : RXR_LONGREAD_MSGRTM_PKT;
 }
 
-/*
- *  init() functions for RTM packets
- */
 ssize_t rxr_pkt_init_eager_msgrtm(struct rxr_ep *ep,
 				  struct rxr_tx_entry *tx_entry,
 				  struct rxr_pkt_entry *pkt_entry);
 
+ssize_t rxr_pkt_init_dc_eager_msgrtm(struct rxr_ep *ep,
+				     struct rxr_tx_entry *tx_entry,
+				     struct rxr_pkt_entry *pkt_entry);
+
 ssize_t rxr_pkt_init_eager_tagrtm(struct rxr_ep *ep,
 				  struct rxr_tx_entry *tx_entry,
 				  struct rxr_pkt_entry *pkt_entry);
@@ -269,28 +184,46 @@ ssize_t rxr_pkt_init_medium_msgrtm(struct rxr_ep *ep,
 				   struct rxr_tx_entry *tx_entry,
 				   struct rxr_pkt_entry *pkt_entry);
 
+ssize_t rxr_pkt_init_dc_eager_tagrtm(struct rxr_ep *ep,
+				     struct rxr_tx_entry *tx_entry,
+				     struct rxr_pkt_entry *pkt_entry);
+
+ssize_t rxr_pkt_init_dc_medium_msgrtm(struct rxr_ep *ep,
+				      struct rxr_tx_entry *tx_entry,
+				      struct rxr_pkt_entry *pkt_entry);
+
 ssize_t rxr_pkt_init_medium_tagrtm(struct rxr_ep *ep,
 				   struct rxr_tx_entry *tx_entry,
 				   struct rxr_pkt_entry *pkt_entry);
 
-ssize_t rxr_pkt_init_long_msgrtm(struct rxr_ep *ep,
+ssize_t rxr_pkt_init_dc_medium_tagrtm(struct rxr_ep *ep,
+				      struct rxr_tx_entry *tx_entry,
+				      struct rxr_pkt_entry *pkt_entry);
+
+ssize_t rxr_pkt_init_longcts_msgrtm(struct rxr_ep *ep,
 				 struct rxr_tx_entry *tx_entry,
 				 struct rxr_pkt_entry *pkt_entry);
 
-ssize_t rxr_pkt_init_long_tagrtm(struct rxr_ep *ep,
+ssize_t rxr_pkt_init_dc_longcts_msgrtm(struct rxr_ep *ep,
+				    struct rxr_tx_entry *tx_entry,
+				    struct rxr_pkt_entry *pkt_entry);
+
+ssize_t rxr_pkt_init_longcts_tagrtm(struct rxr_ep *ep,
 				 struct rxr_tx_entry *tx_entry,
 				 struct rxr_pkt_entry *pkt_entry);
 
-ssize_t rxr_pkt_init_read_msgrtm(struct rxr_ep *ep,
+ssize_t rxr_pkt_init_dc_longcts_tagrtm(struct rxr_ep *ep,
+				    struct rxr_tx_entry *tx_entry,
+				    struct rxr_pkt_entry *pkt_entry);
+
+ssize_t rxr_pkt_init_longread_msgrtm(struct rxr_ep *ep,
 				 struct rxr_tx_entry *tx_entry,
 				 struct rxr_pkt_entry *pkt_entry);
 
-ssize_t rxr_pkt_init_read_tagrtm(struct rxr_ep *ep,
+ssize_t rxr_pkt_init_longread_tagrtm(struct rxr_ep *ep,
 				 struct rxr_tx_entry *tx_entry,
 				 struct rxr_pkt_entry *pkt_entry);
-/*
- *   handle_sent() functions for RTM packets
- */
+
 static inline
 void rxr_pkt_handle_eager_rtm_sent(struct rxr_ep *ep,
 				   struct rxr_pkt_entry *pkt_entry)
@@ -302,38 +235,35 @@ void rxr_pkt_handle_eager_rtm_sent(struct rxr_ep *ep,
 void rxr_pkt_handle_medium_rtm_sent(struct rxr_ep *ep,
 				    struct rxr_pkt_entry *pkt_entry);
 
-void rxr_pkt_handle_long_rtm_sent(struct rxr_ep *ep,
+void rxr_pkt_handle_longcts_rtm_sent(struct rxr_ep *ep,
 				  struct rxr_pkt_entry *pkt_entry);
 
 static inline
-void rxr_pkt_handle_read_rtm_sent(struct rxr_ep *ep,
+void rxr_pkt_handle_longread_rtm_sent(struct rxr_ep *ep,
 				  struct rxr_pkt_entry *pkt_entry)
 {
 }
 
-/*
- *   handle_send_completion() functions for RTM packet types
- */
 void rxr_pkt_handle_eager_rtm_send_completion(struct rxr_ep *ep,
 					      struct rxr_pkt_entry *pkt_entry);
 
 void rxr_pkt_handle_medium_rtm_send_completion(struct rxr_ep *ep,
 					       struct rxr_pkt_entry *pkt_entry);
 
-void rxr_pkt_handle_long_rtm_send_completion(struct rxr_ep *ep,
+void rxr_pkt_handle_longcts_rtm_send_completion(struct rxr_ep *ep,
 					     struct rxr_pkt_entry *pkt_entry);
 
+void rxr_pkt_handle_dc_longcts_rtm_send_completion(struct rxr_ep *ep,
+						struct rxr_pkt_entry *pkt_entry);
+
 static inline
-void rxr_pkt_handle_read_rtm_send_completion(struct rxr_ep *ep,
+void rxr_pkt_handle_longread_rtm_send_completion(struct rxr_ep *ep,
 					     struct rxr_pkt_entry *pkt_entry)
 {
 }
 
-/*
- *   proc() functions for RTM packet types
- */
-void rxr_pkt_rtm_init_rx_entry(struct rxr_pkt_entry *pkt_entry,
-			       struct rxr_rx_entry *rx_entry);
+void rxr_pkt_rtm_update_rx_entry(struct rxr_pkt_entry *pkt_entry,
+				 struct rxr_rx_entry *rx_entry);
 
 /*         This function is called by both
  *            rxr_pkt_handle_rtm_recv() and
@@ -357,81 +287,38 @@ void rxr_pkt_handle_zcpy_recv(struct rxr_ep *ep,
 void rxr_pkt_handle_rtm_rta_recv(struct rxr_ep *ep,
 				 struct rxr_pkt_entry *pkt_entry);
 
-/* Structs and functions for RTW packet types
- * There are 3 write protocols
- *         Eager write protocol,
- *         Long write protocol and
- *         Read write protocol (write by read)
- * Each protocol correspond to a packet type
- */
-
-/*
- *     Header structs
- */
-struct rxr_rtw_base_hdr {
-	uint8_t type;
-	uint8_t version;
-	uint16_t flags;
-	/* end of rxr_base_hdr */
-	uint32_t rma_iov_count;
-};
-
 static inline
 struct rxr_rtw_base_hdr *rxr_get_rtw_base_hdr(void *pkt)
 {
 	return (struct rxr_rtw_base_hdr *)pkt;
 }
 
-struct rxr_eager_rtw_hdr {
-	uint8_t type;
-	uint8_t version;
-	uint16_t flags;
-	/* end of rxr_base_hdr */
-	uint32_t rma_iov_count;
-	struct fi_rma_iov rma_iov[0];
-};
-
-struct rxr_long_rtw_hdr {
-	uint8_t type;
-	uint8_t version;
-	uint16_t flags;
-	/* end of rxr_base_hdr */
-	uint32_t rma_iov_count;
-	uint64_t data_len;
-	uint32_t tx_id;
-	uint32_t credit_request;
-	struct fi_rma_iov rma_iov[0];
-};
-
-struct rxr_read_rtw_hdr {
-	uint8_t type;
-	uint8_t version;
-	uint16_t flags;
-	/* end of rxr_base_hdr */
-	uint32_t rma_iov_count;
-	uint64_t data_len;
-	uint32_t tx_id;
-	uint32_t read_iov_count;
-	struct fi_rma_iov rma_iov[0];
-};
+static inline
+struct rxr_dc_eager_rtw_hdr *rxr_get_dc_eager_rtw_hdr(void *pkt)
+{
+	return (struct rxr_dc_eager_rtw_hdr *)pkt;
+}
 
-/*
- *     init() functions for each RTW packet types
- */
 ssize_t rxr_pkt_init_eager_rtw(struct rxr_ep *ep,
 			       struct rxr_tx_entry *tx_entry,
 			       struct rxr_pkt_entry *pkt_entry);
 
-ssize_t rxr_pkt_init_long_rtw(struct rxr_ep *ep,
+ssize_t rxr_pkt_init_longcts_rtw(struct rxr_ep *ep,
 			      struct rxr_tx_entry *tx_entry,
 			      struct rxr_pkt_entry *pkt_entry);
 
-ssize_t rxr_pkt_init_read_rtw(struct rxr_ep *ep,
+ssize_t rxr_pkt_init_longread_rtw(struct rxr_ep *ep,
 			      struct rxr_tx_entry *tx_entry,
 			      struct rxr_pkt_entry *pkt_entry);
-/*
- *     handle_sent() functions
- */
+
+ssize_t rxr_pkt_init_dc_eager_rtw(struct rxr_ep *ep,
+				  struct rxr_tx_entry *tx_entry,
+				  struct rxr_pkt_entry *pkt_entry);
+
+ssize_t rxr_pkt_init_dc_longcts_rtw(struct rxr_ep *ep,
+				 struct rxr_tx_entry *tx_entry,
+				 struct rxr_pkt_entry *pkt_entry);
+
 static inline
 void rxr_pkt_handle_eager_rtw_sent(struct rxr_ep *ep,
 				   struct rxr_pkt_entry *pkt_entry)
@@ -440,118 +327,63 @@ void rxr_pkt_handle_eager_rtw_sent(struct rxr_ep *ep,
 	return;
 }
 
-void rxr_pkt_handle_long_rtw_sent(struct rxr_ep *ep,
+void rxr_pkt_handle_longcts_rtw_sent(struct rxr_ep *ep,
 				  struct rxr_pkt_entry *pkt_entry);
 
 static inline
-void rxr_pkt_handle_read_rtw_sent(struct rxr_ep *ep,
+void rxr_pkt_handle_longread_rtw_sent(struct rxr_ep *ep,
 				  struct rxr_pkt_entry *pkt_entry)
 {
 }
 
-/*
- *     handle_send_completion() functions
- */
 void rxr_pkt_handle_eager_rtw_send_completion(struct rxr_ep *ep,
 					      struct rxr_pkt_entry *pkt_entry);
 
-void rxr_pkt_handle_long_rtw_send_completion(struct rxr_ep *ep,
+void rxr_pkt_handle_longcts_rtw_send_completion(struct rxr_ep *ep,
 					     struct rxr_pkt_entry *pkt_entry);
 
+void rxr_pkt_handle_dc_longcts_rtw_send_completion(struct rxr_ep *ep,
+						struct rxr_pkt_entry *pkt_entry);
+
 static inline
-void rxr_pkt_handle_read_rtw_send_completion(struct rxr_ep *ep,
+void rxr_pkt_handle_longread_rtw_send_completion(struct rxr_ep *ep,
 					     struct rxr_pkt_entry *pkt_entry)
 {
 }
 
-/*
- *     handle_recv() functions
- */
 void rxr_pkt_handle_eager_rtw_recv(struct rxr_ep *ep,
 				   struct rxr_pkt_entry *pkt_entry);
 
-void rxr_pkt_handle_long_rtw_recv(struct rxr_ep *ep,
-				  struct rxr_pkt_entry *pkt_entry);
+void rxr_pkt_handle_dc_eager_rtw_recv(struct rxr_ep *ep,
+				      struct rxr_pkt_entry *pkt_entry);
 
-void rxr_pkt_handle_read_rtw_recv(struct rxr_ep *ep,
+void rxr_pkt_handle_longcts_rtw_recv(struct rxr_ep *ep,
 				  struct rxr_pkt_entry *pkt_entry);
 
-/* Structs and functions for RTR packet types
- * There are 3 read protocols
- *         Short protocol,
- *         Long read protocol and
- *         RDMA read protocol
- * Each protocol correspond to a packet type
- */
-
-/*
- *     Header structs
- */
-struct rxr_rtr_hdr {
-	uint8_t type;
-	uint8_t version;
-	uint16_t flags;
-	/* end of rxr_base_hdr */
-	uint32_t rma_iov_count;
-	uint64_t data_len;
-	uint32_t read_req_rx_id;
-	uint32_t read_req_window;
-	struct fi_rma_iov rma_iov[0];
-};
-
+void rxr_pkt_handle_longread_rtw_recv(struct rxr_ep *ep,
+				  struct rxr_pkt_entry *pkt_entry);
 static inline
 struct rxr_rtr_hdr *rxr_get_rtr_hdr(void *pkt)
 {
 	return (struct rxr_rtr_hdr *)pkt;
 }
 
-/*
- *     init() functions for each RTW packet types
- */
 ssize_t rxr_pkt_init_short_rtr(struct rxr_ep *ep,
 			       struct rxr_tx_entry *tx_entry,
 			       struct rxr_pkt_entry *pkt_entry);
 
-ssize_t rxr_pkt_init_long_rtr(struct rxr_ep *ep,
+ssize_t rxr_pkt_init_longcts_rtr(struct rxr_ep *ep,
 			      struct rxr_tx_entry *tx_entry,
 			      struct rxr_pkt_entry *pkt_entry);
 
-/*
- *     handle_sent() functions
- */
 void rxr_pkt_handle_rtr_sent(struct rxr_ep *ep,
 			     struct rxr_pkt_entry *pkt_entry);
 
-/*
- *     handle_send_completion() functions
- */
 void rxr_pkt_handle_rtr_send_completion(struct rxr_ep *ep,
 					struct rxr_pkt_entry *pkt_entry);
-/*
- *     handle_recv() functions
- */
 void rxr_pkt_handle_rtr_recv(struct rxr_ep *ep,
 			     struct rxr_pkt_entry *pkt_entry);
 
-/* Structs and functions for RTW packet types
- * There are 2 atomic protocols
- *         write atomic protocol and, 
- *         read/compare atomic protocol and
- * Each protocol correspond to a packet type
- */
-struct rxr_rta_hdr {
-	uint8_t type;
-	uint8_t version;
-	uint16_t flags;
-	uint32_t msg_id;
-	/* end of rtm_base_hdr, atomic packet need msg_id for reordering */
-	uint32_t rma_iov_count;
-	uint32_t atomic_datatype;
-	uint32_t atomic_op;
-	uint32_t tx_id;
-	struct fi_rma_iov rma_iov[0];
-};
-
 static inline
 struct rxr_rta_hdr *rxr_get_rta_hdr(void *pkt)
 {
@@ -560,6 +392,10 @@ struct rxr_rta_hdr *rxr_get_rta_hdr(void *pkt)
 
 ssize_t rxr_pkt_init_write_rta(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry, struct rxr_pkt_entry *pkt_entry);
 
+ssize_t rxr_pkt_init_dc_write_rta(struct rxr_ep *ep,
+				  struct rxr_tx_entry *tx_entry,
+				  struct rxr_pkt_entry *pkt_entry);
+
 ssize_t rxr_pkt_init_fetch_rta(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry, struct rxr_pkt_entry *pkt_entry);
 
 ssize_t rxr_pkt_init_compare_rta(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry, struct rxr_pkt_entry *pkt_entry);
@@ -580,6 +416,9 @@ void rxr_pkt_handle_write_rta_send_completion(struct rxr_ep *ep,
 int rxr_pkt_proc_write_rta(struct rxr_ep *ep,
 			   struct rxr_pkt_entry *pkt_entry);
 
+int rxr_pkt_proc_dc_write_rta(struct rxr_ep *ep,
+			      struct rxr_pkt_entry *pkt_entry);
+
 int rxr_pkt_proc_fetch_rta(struct rxr_ep *ep,
 			   struct rxr_pkt_entry *pkt_entry);
 
@@ -587,4 +426,5 @@ int rxr_pkt_proc_compare_rta(struct rxr_ep *ep,
 			     struct rxr_pkt_entry *pkt_entry);
 
 void rxr_pkt_handle_rta_recv(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry);
+
 #endif
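
The dc_* (delivery-complete) declarations above all share one wire pattern, visible in the handlers elsewhere in this patch: the requester's REQ header carries a `send_id`, and the responder posts a RXR_RECEIPT_PKT once the payload has actually been delivered. A rough sketch, inferred from those handlers rather than from any protocol documentation:

```c
/*
 * requester                                responder
 *    |--- RXR_DC_*_PKT (hdr->send_id) ----->|  deliver payload;
 *    |                                      |  rx_entry->tx_id = hdr->send_id
 *    |<-- RXR_RECEIPT_PKT ------------------|
 *    |
 *  tx completion written only now, giving
 *  FI_DELIVERY_COMPLETE semantics
 */
```
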
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_read.c b/deps/libfabric/prov/efa/src/rxr/rxr_read.c
index 5ec4db0980259985fc080d6f74059b1003faf571..5066648ae0742d3715c27ef5f45fc256c9279896 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_read.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_read.c
@@ -110,14 +110,15 @@ ssize_t rxr_read_prepare_pkt_entry_mr(struct rxr_ep *ep, struct rxr_read_entry *
 	}
 
 	/* only ooo and unexp packet entry's memory is not registered with device */
-	assert(pkt_entry->type == RXR_PKT_ENTRY_OOO ||
-	       pkt_entry->type == RXR_PKT_ENTRY_UNEXP);
+	assert(pkt_entry->alloc_type == RXR_PKT_FROM_OOO_POOL ||
+	       pkt_entry->alloc_type == RXR_PKT_FROM_UNEXP_POOL);
 
 	pkt_offset = (char *)read_entry->rma_iov[0].addr - (char *)pkt_entry->pkt;
 	assert(pkt_offset > sizeof(struct rxr_base_hdr));
 
 	pkt_entry_copy = rxr_pkt_entry_clone(ep, ep->rx_readcopy_pkt_pool,
-					     pkt_entry, RXR_PKT_ENTRY_READ_COPY);
+					     RXR_PKT_FROM_READ_COPY_POOL,
+					     pkt_entry);
 	if (!pkt_entry_copy) {
 		FI_WARN(&rxr_prov, FI_LOG_CQ,
 			"readcopy pkt pool exhausted! Set FI_EFA_READCOPY_POOL_SIZE to a higher value!");
@@ -179,6 +180,35 @@ ssize_t rxr_read_mr_reg(struct rxr_ep *ep, struct rxr_read_entry *read_entry)
 	return 0;
 }
 
+/**
+ * @brief convert descriptor from application for lower provider to use
+ *
+ * Each provider defines its own descriptor format. The descriptors
+ * provided by the application are in EFA provider format.
+ * This function converts them to descriptors for the lower provider
+ * according to the lower provider type. It also handles the case where
+ * the application does not provide descriptors.
+ *
+ * @param[in]  lower_ep_type lower endpoint type, can be EFA_EP or SHM_EP.
+ * @param[in]  numdesc       number of descriptors in the array
+ * @param[in]  desc_in       descriptors provided by the application
+ * @param[out] desc_out      descriptors for the lower provider.
+ */
+static inline
+void rxr_read_copy_desc(enum rxr_lower_ep_type lower_ep_type,
+			int numdesc, void **desc_in, void **desc_out)
+{
+	if (!desc_in) {
+		memset(desc_out, 0, numdesc * sizeof(void *));
+		return;
+	}
+
+	memcpy(desc_out, desc_in, numdesc * sizeof(void *));
+	if (lower_ep_type == SHM_EP) {
+		rxr_convert_desc_for_shm(numdesc, desc_out);
+	}
+}
+
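
As a usage sketch of the helper just added (the identifiers are the ones already visible in this file; `peer` is assumed to be an `rdm_peer` obtained via `rxr_ep_get_peer()`):

```c
void *mr_desc[RXR_IOV_LIMIT];

/* desc_in == NULL: the application registered no memory, so the output
 * array is zero-filled. Otherwise the descriptors are copied and, for a
 * local (SHM) peer, converted in place to the shm provider's format. */
rxr_read_copy_desc(peer->is_local ? SHM_EP : EFA_EP,
		   tx_entry->iov_count, tx_entry->desc, mr_desc);
```
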
 /* rxr_read_alloc_entry allocates a read entry.
  * It is called by rxr_read_post_or_queue().
  * Input:
@@ -205,6 +235,7 @@ struct rxr_read_entry *rxr_read_alloc_entry(struct rxr_ep *ep, int entry_type, v
 		return NULL;
 	}
 
+	read_entry->type = RXR_READ_ENTRY;
 	read_entry->read_id = ofi_buf_index(read_entry);
 	read_entry->state = RXR_RDMA_ENTRY_CREATED;
 
@@ -228,10 +259,7 @@ struct rxr_read_entry *rxr_read_alloc_entry(struct rxr_ep *ep, int entry_type, v
 		total_rma_iov_len = ofi_total_rma_iov_len(tx_entry->rma_iov, tx_entry->rma_iov_count);
 		read_entry->total_len = MIN(total_iov_len, total_rma_iov_len);
 
-		if (tx_entry->desc) {
-			memcpy(read_entry->mr_desc, tx_entry->desc,
-			       read_entry->iov_count * sizeof(void *));
-		}
+		rxr_read_copy_desc(lower_ep_type, read_entry->iov_count, tx_entry->desc, read_entry->mr_desc);
 
 	} else {
 		rx_entry = (struct rxr_rx_entry *)x_entry;
@@ -254,10 +282,7 @@ struct rxr_read_entry *rxr_read_alloc_entry(struct rxr_ep *ep, int entry_type, v
 		total_rma_iov_len = ofi_total_rma_iov_len(rx_entry->rma_iov, rx_entry->rma_iov_count);
 		read_entry->total_len = MIN(total_iov_len, total_rma_iov_len);
 
-		if (rx_entry->desc) {
-			memcpy(read_entry->mr_desc, rx_entry->desc,
-			       read_entry->iov_count * sizeof(void *));
-		}
+		rxr_read_copy_desc(lower_ep_type, read_entry->iov_count, rx_entry->desc, read_entry->mr_desc);
 	}
 
 	memset(read_entry->mr, 0, read_entry->iov_count * sizeof(struct fid_mr *));
@@ -286,7 +311,7 @@ void rxr_read_release_entry(struct rxr_ep *ep, struct rxr_read_entry *read_entry
 			err = fi_close((struct fid *)read_entry->mr[i]);
 			if (err) {
 				FI_WARN(&rxr_prov, FI_LOG_MR, "Unable to close mr\n");
-				rxr_read_handle_error(ep, read_entry, err);
+				rxr_read_write_error(ep, read_entry, -err, -err);
 			}
 		}
 	}
@@ -319,7 +344,7 @@ int rxr_read_post_or_queue(struct rxr_ep *ep, struct rxr_read_entry *read_entry)
 
 int rxr_read_post_remote_read_or_queue(struct rxr_ep *ep, int entry_type, void *x_entry)
 {
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 	struct rxr_read_entry *read_entry;
 	int lower_ep_type;
 
@@ -329,8 +354,8 @@ int rxr_read_post_remote_read_or_queue(struct rxr_ep *ep, int entry_type, void *
 		assert(entry_type == RXR_RX_ENTRY);
 		peer = rxr_ep_get_peer(ep, ((struct rxr_rx_entry *)x_entry)->addr);
 	}
-
 	assert(peer);
+
 	lower_ep_type = (peer->is_local) ? SHM_EP : EFA_EP;
 	read_entry = rxr_read_alloc_entry(ep, entry_type, x_entry, lower_ep_type);
 	if (!read_entry) {
@@ -377,8 +402,9 @@ int rxr_read_post_local_read_or_queue(struct rxr_ep *ep,
 	assert(pkt_entry->x_entry == rx_entry);
 	assert(rx_entry->desc && efa_ep_is_cuda_mr(rx_entry->desc[0]));
 	read_entry->iov_count = rx_entry->iov_count;
+	memset(read_entry->mr, 0, sizeof(*read_entry->mr) * read_entry->iov_count);
 	memcpy(read_entry->iov, rx_entry->iov, rx_entry->iov_count * sizeof(struct iovec));
-	memcpy(read_entry->mr_desc, rx_entry->desc, rx_entry->iov_count * sizeof(void *));
+	rxr_read_copy_desc(EFA_EP, rx_entry->iov_count, rx_entry->desc, read_entry->mr_desc);
 	ofi_consume_iov_desc(read_entry->iov, read_entry->mr_desc, &read_entry->iov_count, data_offset);
 	if (read_entry->iov_count == 0) {
 		FI_WARN(&rxr_prov, FI_LOG_CQ,
@@ -389,7 +415,7 @@ int rxr_read_post_local_read_or_queue(struct rxr_ep *ep,
 	}
 
 	assert(efa_ep_is_cuda_mr(read_entry->mr_desc[0]));
-	err = ofi_truncate_iov(read_entry->iov, &read_entry->iov_count, data_size);
+	err = ofi_truncate_iov(read_entry->iov, &read_entry->iov_count, data_size + ep->msg_prefix_size);
 	if (err) {
 		FI_WARN(&rxr_prov, FI_LOG_CQ,
 			"data_offset %ld data_size %ld out of range\n",
@@ -407,10 +433,9 @@ int rxr_read_init_iov(struct rxr_ep *ep,
 {
 	int i, err;
 	struct fid_mr *mr;
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 
 	peer = rxr_ep_get_peer(ep, tx_entry->addr);
-	assert(peer);
 
 	for (i = 0; i < tx_entry->iov_count; ++i) {
 		read_iov[i].addr = (uint64_t)tx_entry->iov[i].iov_base;
@@ -427,6 +452,7 @@ int rxr_read_init_iov(struct rxr_ep *ep,
 		if (!tx_entry->mr[0]) {
 			for (i = 0; i < tx_entry->iov_count; ++i) {
 				assert(!tx_entry->mr[i]);
+				assert(peer);
 
 				if (peer->is_local)
 					err = efa_mr_reg_shm(rxr_ep_domain(ep)->rdm_domain,
@@ -467,8 +493,8 @@ int rxr_read_post(struct rxr_ep *ep, struct rxr_read_entry *read_entry)
 	struct fi_rma_iov rma_iov;
 	struct fi_msg_rma msg;
 	struct efa_ep *efa_ep;
-	struct rxr_peer *peer;
-	fi_addr_t shm_fiaddr;
+	struct rdm_peer *peer;
+	fi_addr_t shm_fiaddr = FI_ADDR_NOTAVAIL;
 
 	assert(read_entry->iov_count > 0);
 	assert(read_entry->rma_iov_count > 0);
@@ -487,32 +513,38 @@ int rxr_read_post(struct rxr_ep *ep, struct rxr_read_entry *read_entry)
 			return ret;
 	}
 
-	peer = rxr_ep_get_peer(ep, read_entry->addr);
-	assert(peer);
-	if (read_entry->lower_ep_type == SHM_EP)
+	if (read_entry->lower_ep_type == SHM_EP) {
+		peer = rxr_ep_get_peer(ep, read_entry->addr);
+		assert(peer);
 		shm_fiaddr = peer->shm_fiaddr;
+	}
 
 	max_read_size = (read_entry->lower_ep_type == EFA_EP) ?
 				efa_max_rdma_size(ep->rdm_ep) : SIZE_MAX;
 	assert(max_read_size > 0);
 
 	ret = rxr_locate_iov_pos(read_entry->iov, read_entry->iov_count,
-				 read_entry->bytes_submitted,
+				 read_entry->bytes_submitted + ep->msg_prefix_size,
 				 &iov_idx, &iov_offset);
 	assert(ret == 0);
+	if (ret) {
+		return ret;
+	}
 
 	ret = rxr_locate_rma_iov_pos(read_entry->rma_iov, read_entry->rma_iov_count,
 				     read_entry->bytes_submitted,
 				     &rma_iov_idx, &rma_iov_offset);
 	assert(ret == 0);
-
+	if (ret) {
+		return ret;
+	}
 	total_iov_len = ofi_total_iov_len(read_entry->iov, read_entry->iov_count);
 	total_rma_iov_len = ofi_total_rma_iov_len(read_entry->rma_iov, read_entry->rma_iov_count);
 	assert(read_entry->total_len == MIN(total_iov_len, total_rma_iov_len));
 
 	while (read_entry->bytes_submitted < read_entry->total_len) {
 
-		if (ep->tx_pending == ep->max_outstanding_tx)
+		if (read_entry->lower_ep_type == EFA_EP && ep->efa_outstanding_tx_ops == ep->efa_max_outstanding_tx_ops)
 			return -FI_EAGAIN;
 
 		assert(iov_idx < read_entry->iov_count);
@@ -537,9 +569,9 @@ int rxr_read_post(struct rxr_ep *ep, struct rxr_read_entry *read_entry)
 		 * we had to use a pkt_entry as context too
 		 */
 		if (read_entry->lower_ep_type == SHM_EP)
-			pkt_entry = rxr_pkt_entry_alloc(ep, ep->tx_pkt_shm_pool);
+			pkt_entry = rxr_pkt_entry_alloc(ep, ep->shm_tx_pkt_pool, RXR_PKT_FROM_SHM_TX_POOL);
 		else
-			pkt_entry = rxr_pkt_entry_alloc(ep, ep->tx_pkt_efa_pool);
+			pkt_entry = rxr_pkt_entry_alloc(ep, ep->efa_tx_pkt_pool, RXR_PKT_FROM_EFA_TX_POOL);
 
 		if (OFI_UNLIKELY(!pkt_entry))
 			return -FI_EAGAIN;
@@ -561,6 +593,8 @@ int rxr_read_post(struct rxr_ep *ep, struct rxr_read_entry *read_entry)
 			efa_ep = container_of(ep->rdm_ep, struct efa_ep, util_ep.ep_fid);
 			msg.addr = read_entry->addr;
 			self_comm = (read_entry->context_type == RXR_READ_CONTEXT_PKT_ENTRY);
+			if (self_comm)
+				pkt_entry->flags |= RXR_PKT_ENTRY_LOCAL_READ;
 			ret = efa_rma_post_read(efa_ep, &msg, 0, self_comm);
 		}
 
@@ -569,13 +603,7 @@ int rxr_read_post(struct rxr_ep *ep, struct rxr_read_entry *read_entry)
 			return ret;
 		}
 
-		if (read_entry->context_type == RXR_READ_CONTEXT_PKT_ENTRY) {
-			assert(read_entry->lower_ep_type == EFA_EP);
-			/* read from self, no peer */
-			ep->tx_pending++;
-		} else if (read_entry->lower_ep_type == EFA_EP) {
-			rxr_ep_inc_tx_pending(ep, peer);
-		}
+		rxr_ep_record_tx_op_submitted(ep, pkt_entry);
 
 		read_entry->bytes_submitted += iov.iov_len;
 
@@ -607,21 +635,22 @@ int rxr_read_post(struct rxr_ep *ep, struct rxr_read_entry *read_entry)
 	return 0;
 }
 
-int rxr_read_handle_error(struct rxr_ep *ep, struct rxr_read_entry *read_entry, int ret)
+void rxr_read_write_error(struct rxr_ep *ep, struct rxr_read_entry *read_entry,
+			  int err, int prov_errno)
 {
 	struct rxr_tx_entry *tx_entry;
 	struct rxr_rx_entry *rx_entry;
 
 	if (read_entry->context_type == RXR_READ_CONTEXT_TX_ENTRY) {
 		tx_entry = read_entry->context;
-		ret = rxr_cq_handle_tx_error(ep, tx_entry, ret);
+		rxr_cq_write_tx_error(ep, tx_entry, err, prov_errno);
 	} else {
 		assert(read_entry->context_type == RXR_READ_CONTEXT_RX_ENTRY);
 		rx_entry = read_entry->context;
-		ret = rxr_cq_handle_rx_error(ep, rx_entry, ret);
+		rxr_cq_write_rx_error(ep, rx_entry, err, prov_errno);
 	}
 
-	dlist_remove(&read_entry->pending_entry);
-	return ret;
+	if (read_entry->state == RXR_RDMA_ENTRY_PENDING)
+		dlist_remove(&read_entry->pending_entry);
 }
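
One cross-cutting convention behind the handle_error to write_error renames in this file: the old helpers took a single negative return code and themselves returned a status, while the new writers take two positive codes, `err` and `prov_errno`, presumably feeding the `err` and `prov_errno` fields of `struct fi_cq_err_entry`, and return void. That is why every call site negates its negative return value, as in this restatement of the pattern above (the writer's internals are an assumption):

```c
int err;

err = fi_close((struct fid *)read_entry->mr[i]); /* 0 or a negative fi errno */
if (err) {
	/* negate: the writer expects positive codes for the CQ error entry */
	rxr_read_write_error(ep, read_entry, -err, -err);
}
```
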
 
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_read.h b/deps/libfabric/prov/efa/src/rxr/rxr_read.h
index 33b55c0eac9238664f9bdfb99ad1e8f684df3ba7..934eabace41ecfd1c9d86d169fc37de7687362b7 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_read.h
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_read.h
@@ -68,13 +68,15 @@ enum rxr_read_context_type {
 enum rxr_read_entry_state {
 	RXR_RDMA_ENTRY_FREE = 0,
 	RXR_RDMA_ENTRY_CREATED,
-	RXR_RDMA_ENTRY_PENDING
+	RXR_RDMA_ENTRY_PENDING,
+	RXR_RDMA_ENTRY_SUBMITTED,
 };
 
 /*
  * rxr_read_entry contains the information of a read request
  */
 struct rxr_read_entry {
+	enum rxr_x_entry_type type;
 	int read_id;
 	enum rxr_lower_ep_type lower_ep_type;
 
@@ -125,7 +127,7 @@ int rxr_read_post_local_read_or_queue(struct rxr_ep *ep,
 
 void rxr_read_handle_read_completion(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry);
 
-int rxr_read_handle_error(struct rxr_ep *ep, struct rxr_read_entry *read_entry, int ret);
+void rxr_read_write_error(struct rxr_ep *ep, struct rxr_read_entry *read_entry, int err, int prov_errno);
 
 #endif
 
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_rma.c b/deps/libfabric/prov/efa/src/rxr/rxr_rma.c
index f25569b50d57fa144143a6c3c2316f165a981a65..70fa090fda6bc1655d0f71b509cd248bcbdefb57 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_rma.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_rma.c
@@ -37,25 +37,32 @@
 #include <ofi_iov.h>
 #include "efa.h"
 #include "rxr.h"
+#include "rxr_msg.h"
 #include "rxr_rma.h"
 #include "rxr_pkt_cmd.h"
 #include "rxr_cntr.h"
 #include "rxr_read.h"
 
-int rxr_rma_verified_copy_iov(struct rxr_ep *ep, struct fi_rma_iov *rma,
-			      size_t count, uint32_t flags, struct iovec *iov)
+int rxr_rma_verified_copy_iov(struct rxr_ep *ep, struct efa_rma_iov *rma,
+			      size_t count, uint32_t flags,
+			      struct iovec *iov, void **desc)
 {
+	void *context;
+	struct efa_mr *efa_mr;
 	struct efa_ep *efa_ep;
 	int i, ret;
 
 	efa_ep = container_of(ep->rdm_ep, struct efa_ep, util_ep.ep_fid);
 
 	for (i = 0; i < count; i++) {
-		ret = ofi_mr_verify(&efa_ep->domain->util_domain.mr_map,
-				    rma[i].len,
-				    (uintptr_t *)(&rma[i].addr),
-				    rma[i].key,
-				    flags);
+		fastlock_acquire(&efa_ep->domain->util_domain.lock);
+		ret = ofi_mr_map_verify(&efa_ep->domain->util_domain.mr_map,
+					(uintptr_t *)(&rma[i].addr),
+					rma[i].len, rma[i].key, flags,
+					&context);
+		efa_mr = context;
+		desc[i] = fi_mr_desc(&efa_mr->mr_fid);
+		fastlock_release(&efa_ep->domain->util_domain.lock);
 		if (ret) {
 			FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
 				"MR verification failed (%s), addr: %lx key: %ld\n",
@@ -85,14 +92,12 @@ rxr_rma_alloc_readrsp_tx_entry(struct rxr_ep *rxr_ep,
 	}
 
 	assert(tx_entry);
-#if ENABLE_DEBUG
-	dlist_insert_tail(&tx_entry->tx_entry_entry, &rxr_ep->tx_entry_list);
-#endif
+	dlist_insert_tail(&tx_entry->ep_entry, &rxr_ep->tx_entry_list);
 
 	msg.msg_iov = rx_entry->iov;
 	msg.iov_count = rx_entry->iov_count;
 	msg.addr = rx_entry->addr;
-	msg.desc = NULL;
+	msg.desc = rx_entry->desc;
 	msg.context = NULL;
 	msg.data = 0;
 
@@ -148,9 +153,7 @@ rxr_rma_alloc_tx_entry(struct rxr_ep *rxr_ep,
 	memcpy(tx_entry->rma_iov, msg_rma->rma_iov,
 	       sizeof(struct fi_rma_iov) * msg_rma->rma_iov_count);
 
-#if ENABLE_DEBUG
-	dlist_insert_tail(&tx_entry->tx_entry_entry, &rxr_ep->tx_entry_list);
-#endif
+	dlist_insert_tail(&tx_entry->ep_entry, &rxr_ep->tx_entry_list);
 	return tx_entry;
 }
 
@@ -158,12 +161,13 @@ size_t rxr_rma_post_shm_write(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_ent
 {
 	struct rxr_pkt_entry *pkt_entry;
 	struct fi_msg_rma msg;
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 	int i, err;
 
 	assert(tx_entry->op == ofi_op_write);
 	peer = rxr_ep_get_peer(rxr_ep, tx_entry->addr);
-	pkt_entry = rxr_pkt_entry_alloc(rxr_ep, rxr_ep->tx_pkt_shm_pool);
+	assert(peer);
+	pkt_entry = rxr_pkt_entry_alloc(rxr_ep, rxr_ep->shm_tx_pkt_pool, RXR_PKT_FROM_SHM_TX_POOL);
 	if (OFI_UNLIKELY(!pkt_entry))
 		return -FI_EAGAIN;
 
@@ -182,6 +186,8 @@ size_t rxr_rma_post_shm_write(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_ent
 	msg.rma_iov_count = tx_entry->rma_iov_count;
 	msg.context = pkt_entry;
 	msg.data = tx_entry->cq_entry.data;
+	msg.desc = tx_entry->desc;
+	rxr_convert_desc_for_shm(msg.iov_count, tx_entry->desc);
 
 	err = fi_writemsg(rxr_ep->shm_ep, &msg, tx_entry->fi_flags);
 	if (err)
@@ -194,20 +200,15 @@ size_t rxr_rma_post_shm_write(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_ent
 ssize_t rxr_rma_post_efa_emulated_read(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry)
 {
 	int err, window, credits;
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 	struct rxr_rx_entry *rx_entry;
-	struct fi_msg msg = {0};
 
 	/* create a rx_entry to receve data
 	 * use ofi_op_msg for its op.
 	 * it does not write a rx completion.
 	 */
-	msg.msg_iov = tx_entry->iov;
-	msg.iov_count = tx_entry->iov_count;
-	msg.addr = tx_entry->addr;
-	rx_entry = rxr_ep_get_rx_entry(ep, &msg, 0, ~0, ofi_op_msg, 0);
+	rx_entry = rxr_ep_alloc_rx_entry(ep, tx_entry->addr, ofi_op_msg);
 	if (!rx_entry) {
-		rxr_release_tx_entry(ep, tx_entry);
 		FI_WARN(&rxr_prov, FI_LOG_CQ,
 			"RX entries exhausted for read.\n");
 		rxr_ep_progress_internal(ep);
@@ -223,8 +224,10 @@ ssize_t rxr_rma_post_efa_emulated_read(struct rxr_ep *ep, struct rxr_tx_entry *t
 	assert(rx_entry);
 	rx_entry->tx_id = -1;
 	rx_entry->cq_entry.flags |= FI_READ;
-	rx_entry->total_len = rx_entry->cq_entry.len;
-
+	rx_entry->cq_entry.len = tx_entry->total_len;
+	rx_entry->total_len = tx_entry->total_len;
+	rx_entry->iov_count = tx_entry->iov_count;
+	memcpy(rx_entry->iov, tx_entry->iov, sizeof(*rx_entry->iov) * tx_entry->iov_count);
 	/*
 	 * there will not be a CTS for fi_read, we calculate CTS
 	 * window here, and send it via REQ.
@@ -236,7 +239,6 @@ ssize_t rxr_rma_post_efa_emulated_read(struct rxr_ep *ep, struct rxr_tx_entry *t
 	 * call rxr_ep_progress_internal() might release some buffer
 	 */
 	if (ep->available_data_bufs == 0) {
-		rxr_release_tx_entry(ep, tx_entry);
 		rxr_release_rx_entry(ep, rx_entry);
 		rxr_ep_progress_internal(ep);
 		return -FI_EAGAIN;
@@ -261,10 +263,11 @@ ssize_t rxr_rma_post_efa_emulated_read(struct rxr_ep *ep, struct rxr_tx_entry *t
 	tx_entry->rma_loc_rx_id = rx_entry->rx_id;
 
 	if (tx_entry->total_len < ep->mtu_size - sizeof(struct rxr_readrsp_hdr)) {
-		err = rxr_pkt_post_ctrl_or_queue(ep, RXR_TX_ENTRY, tx_entry, RXR_SHORT_RTR_PKT, 0);
+		err = rxr_pkt_post_ctrl(ep, RXR_TX_ENTRY, tx_entry, RXR_SHORT_RTR_PKT, 0, 0);
 	} else {
 		peer = rxr_ep_get_peer(ep, tx_entry->addr);
 		assert(peer);
+
 		rxr_pkt_calc_cts_window_credits(ep, peer,
 						tx_entry->total_len,
 						tx_entry->credit_request,
@@ -274,7 +277,15 @@ ssize_t rxr_rma_post_efa_emulated_read(struct rxr_ep *ep, struct rxr_tx_entry *t
 		rx_entry->window = window;
 		rx_entry->credit_cts = credits;
 		tx_entry->rma_window = rx_entry->window;
-		err = rxr_pkt_post_ctrl_or_queue(ep, RXR_TX_ENTRY, tx_entry, RXR_LONG_RTR_PKT, 0);
+		err = rxr_pkt_post_ctrl(ep, RXR_TX_ENTRY, tx_entry, RXR_LONGCTS_RTR_PKT, 0, 0);
+	}
+
+	if (OFI_UNLIKELY(err)) {
+#if ENABLE_DEBUG
+		dlist_remove(&rx_entry->rx_pending_entry);
+		ep->rx_pending--;
+#endif
+		rxr_release_rx_entry(ep, rx_entry);
 	}
 
 	return err;
@@ -284,8 +295,8 @@ ssize_t rxr_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_
 {
 	ssize_t err;
 	struct rxr_ep *rxr_ep;
-	struct rxr_peer *peer;
-	struct rxr_tx_entry *tx_entry;
+	struct rdm_peer *peer;
+	struct rxr_tx_entry *tx_entry = NULL;
 	bool use_lower_ep_read;
 
 	FI_DBG(&rxr_prov, FI_LOG_EP_DATA,
@@ -296,7 +307,7 @@ ssize_t rxr_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_
 	rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid);
 	assert(msg->iov_count <= rxr_ep->tx_iov_limit);
 
-	rxr_perfset_start(rxr_ep, perf_rxr_tx);
+	efa_perfset_start(rxr_ep, perf_efa_tx);
 	fastlock_acquire(&rxr_ep->util_ep.lock);
 
 	if (OFI_UNLIKELY(is_tx_res_full(rxr_ep))) {
@@ -304,6 +315,14 @@ ssize_t rxr_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_
 		goto out;
 	}
 
+	peer = rxr_ep_get_peer(rxr_ep, msg->addr);
+	assert(peer);
+
+	if (peer->flags & RXR_PEER_IN_BACKOFF) {
+		err = -FI_EAGAIN;
+		goto out;
+	}
+
 	tx_entry = rxr_rma_alloc_tx_entry(rxr_ep, msg, ofi_op_read_req, flags);
 	if (OFI_UNLIKELY(!tx_entry)) {
 		rxr_ep_progress_internal(rxr_ep);
@@ -311,9 +330,6 @@ ssize_t rxr_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_
 		goto out;
 	}
 
-	peer = rxr_ep_get_peer(rxr_ep, msg->addr);
-	assert(peer);
-
 	use_lower_ep_read = false;
 	if (peer->is_local) {
 		assert(rxr_ep->use_shm);
@@ -328,24 +344,24 @@ ssize_t rxr_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_
 	if (use_lower_ep_read) {
 		err = rxr_read_post_remote_read_or_queue(rxr_ep, RXR_TX_ENTRY, tx_entry);
 		if (OFI_UNLIKELY(err == -FI_ENOBUFS)) {
-			rxr_release_tx_entry(rxr_ep, tx_entry);
 			err = -FI_EAGAIN;
 			rxr_ep_progress_internal(rxr_ep);
 			goto out;
 		}
 	} else {
 		err = rxr_ep_set_tx_credit_request(rxr_ep, tx_entry);
-		if (OFI_UNLIKELY(err)) {
-			rxr_release_tx_entry(rxr_ep, tx_entry);
+		if (OFI_UNLIKELY(err))
 			goto out;
-		}
 
 		err = rxr_rma_post_efa_emulated_read(rxr_ep, tx_entry);
 	}
 
 out:
+	if (OFI_UNLIKELY(err && tx_entry))
+		rxr_release_tx_entry(rxr_ep, tx_entry);
+
 	fastlock_release(&rxr_ep->util_ep.lock);
-	rxr_perfset_end(rxr_ep, perf_rxr_tx);
+	efa_perfset_end(rxr_ep, perf_efa_tx);
 	return err;
 }
 
@@ -387,8 +403,11 @@ ssize_t rxr_rma_read(struct fid_ep *ep, void *buf, size_t len, void *desc,
 ssize_t rxr_rma_post_write(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry)
 {
 	ssize_t err;
-	struct rxr_peer *peer;
+	struct rdm_peer *peer;
 	struct efa_domain *efa_domain;
+	bool delivery_complete_requested;
+	int ctrl_type;
+	size_t max_eager_rtw_data_size;
 	struct rxr_domain *rxr_domain = rxr_ep_domain(ep);
 
 	efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain,
@@ -396,17 +415,61 @@ ssize_t rxr_rma_post_write(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry)
 
 	peer = rxr_ep_get_peer(ep, tx_entry->addr);
 	assert(peer);
+
 	if (peer->is_local)
 		return rxr_rma_post_shm_write(ep, tx_entry);
 
+	delivery_complete_requested = tx_entry->fi_flags & FI_DELIVERY_COMPLETE;
+	if (delivery_complete_requested) {
+		tx_entry->rxr_flags |= RXR_DELIVERY_COMPLETE_REQUESTED;
+		/*
+		 * Because delivery complete is defined as an extra
+		 * feature, the receiver might not support it.
+		 *
+		 * The sender cannot send with FI_DELIVERY_COMPLETE
+		 * if the peer is not able to handle it.
+		 *
+		 * If the sender does not know whether the peer
+		 * can handle it, it needs to trigger
+		 * a handshake packet from the peer.
+		 *
+		 * The handshake packet contains
+		 * the information whether the peer
+		 * support it or not.
+		 */
+		err = rxr_pkt_trigger_handshake(ep, tx_entry->addr, peer);
+		if (OFI_UNLIKELY(err))
+			return err;
+
+		if (!(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED))
+			return -FI_EAGAIN;
+		else if (!rxr_peer_support_delivery_complete(peer))
+			return -FI_EOPNOTSUPP;
+
+		max_eager_rtw_data_size = rxr_pkt_req_max_data_size(ep,
+								    tx_entry->addr,
+								    RXR_DC_EAGER_RTW_PKT,
+								    tx_entry->fi_flags,
+								    tx_entry->rma_iov_count);
+	} else {
+		max_eager_rtw_data_size = rxr_pkt_req_max_data_size(ep,
+								    tx_entry->addr,
+								    RXR_EAGER_RTW_PKT,
+								    tx_entry->fi_flags,
+								    tx_entry->rma_iov_count);
+	}
+
 	/* Inter instance */
-	if (tx_entry->total_len < rxr_pkt_req_max_data_size(ep, tx_entry->addr, RXR_EAGER_RTW_PKT))
-		return rxr_pkt_post_ctrl_or_queue(ep, RXR_TX_ENTRY, tx_entry, RXR_EAGER_RTW_PKT, 0);
+	if (tx_entry->total_len <= max_eager_rtw_data_size) {
+		ctrl_type = delivery_complete_requested ?
+			RXR_DC_EAGER_RTW_PKT : RXR_EAGER_RTW_PKT;
+		return rxr_pkt_post_ctrl(ep, RXR_TX_ENTRY, tx_entry, ctrl_type, 0, 0);
+	}
 
 	if (tx_entry->total_len >= rxr_env.efa_min_read_write_size &&
 	    efa_both_support_rdma_read(ep, peer) &&
 	    (tx_entry->desc[0] || efa_is_cache_available(efa_domain))) {
-		err = rxr_pkt_post_ctrl_or_queue(ep, RXR_TX_ENTRY, tx_entry, RXR_READ_RTW_PKT, 0);
+		err = rxr_pkt_post_ctrl(ep, RXR_TX_ENTRY, tx_entry, RXR_LONGREAD_RTW_PKT, 0, 0);
 		if (err != -FI_ENOMEM)
 			return err;
 		/*
@@ -419,7 +482,10 @@ ssize_t rxr_rma_post_write(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry)
 	if (OFI_UNLIKELY(err))
 		return err;
 
-	return rxr_pkt_post_ctrl_or_queue(ep, RXR_TX_ENTRY, tx_entry, RXR_LONG_RTW_PKT, 0);
+	ctrl_type = delivery_complete_requested ?
+		RXR_DC_LONGCTS_RTW_PKT : RXR_LONGCTS_RTW_PKT;
+	tx_entry->rxr_flags |= RXR_LONGCTS_PROTOCOL;
+	return rxr_pkt_post_ctrl(ep, RXR_TX_ENTRY, tx_entry, ctrl_type, 0, 0);
 }
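
Condensing the control flow above, the write path now picks among four wire protocols. A paraphrased dispatch, for orientation only (the packet-type names and thresholds are the ones in the hunk; the real code additionally falls back from the long-read path to long-CTS when posting the read fails with -FI_ENOMEM):

```c
#include <stdbool.h>
#include <stddef.h>

/* Illustrative paraphrase of rxr_rma_post_write()'s dispatch; the
 * RXR_*_PKT constants are assumed to come from rxr.h. `dc` means
 * FI_DELIVERY_COMPLETE was requested and the peer's handshake packet
 * showed it is supported. */
static int write_pkt_type_sketch(bool dc, bool use_rdma_read,
				 size_t total_len, size_t max_eager_rtw,
				 size_t min_read_write)
{
	if (total_len <= max_eager_rtw)
		return dc ? RXR_DC_EAGER_RTW_PKT : RXR_EAGER_RTW_PKT;
	if (total_len >= min_read_write && use_rdma_read)
		return RXR_LONGREAD_RTW_PKT;
	return dc ? RXR_DC_LONGCTS_RTW_PKT : RXR_LONGCTS_RTW_PKT;
}
```
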
 
 ssize_t rxr_rma_writemsg(struct fid_ep *ep,
@@ -427,6 +493,7 @@ ssize_t rxr_rma_writemsg(struct fid_ep *ep,
 			 uint64_t flags)
 {
 	ssize_t err;
+	struct rdm_peer *peer;
 	struct rxr_ep *rxr_ep;
 	struct rxr_tx_entry *tx_entry;
 
@@ -438,9 +505,17 @@ ssize_t rxr_rma_writemsg(struct fid_ep *ep,
 	rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid);
 	assert(msg->iov_count <= rxr_ep->tx_iov_limit);
 
-	rxr_perfset_start(rxr_ep, perf_rxr_tx);
+	efa_perfset_start(rxr_ep, perf_efa_tx);
 	fastlock_acquire(&rxr_ep->util_ep.lock);
 
+	peer = rxr_ep_get_peer(rxr_ep, msg->addr);
+	assert(peer);
+
+	if (peer->flags & RXR_PEER_IN_BACKOFF) {
+		err = -FI_EAGAIN;
+		goto out;
+	}
+
 	tx_entry = rxr_rma_alloc_tx_entry(rxr_ep, msg, ofi_op_write, flags);
 	if (OFI_UNLIKELY(!tx_entry)) {
 		rxr_ep_progress_internal(rxr_ep);
@@ -449,11 +524,12 @@ ssize_t rxr_rma_writemsg(struct fid_ep *ep,
 	}
 
 	err = rxr_rma_post_write(rxr_ep, tx_entry);
-	if (OFI_UNLIKELY(err))
+	if (OFI_UNLIKELY(err)) {
 		rxr_release_tx_entry(rxr_ep, tx_entry);
+	}
 out:
 	fastlock_release(&rxr_ep->util_ep.lock);
-	rxr_perfset_end(rxr_ep, perf_rxr_tx);
+	efa_perfset_end(rxr_ep, perf_efa_tx);
 	return err;
 }
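
For context on the hunks above: delivery-complete is requested per operation via the FI_DELIVERY_COMPLETE flag, and the new code surfaces -FI_EAGAIN while the capability handshake with the peer is pending, then -FI_EOPNOTSUPP if the peer cannot honor the flag. A minimal, hypothetical caller-side sketch under those assumptions (write_with_dc and its retry policy are ours, not part of the patch):

```c
#include <rdma/fabric.h>
#include <rdma/fi_eq.h>
#include <rdma/fi_rma.h>

/* Hypothetical caller: request delivery-complete semantics on an RMA
 * write and retry while the provider's peer handshake is pending. */
static ssize_t write_with_dc(struct fid_ep *ep, const struct fi_msg_rma *msg,
			     struct fid_cq *cq)
{
	struct fi_cq_entry comp;
	ssize_t ret;

	do {
		ret = fi_writemsg(ep, msg, FI_COMPLETION | FI_DELIVERY_COMPLETE);
		if (ret == -FI_EAGAIN)
			(void) fi_cq_read(cq, &comp, 1); /* drive progress */
	} while (ret == -FI_EAGAIN);

	/* -FI_EOPNOTSUPP here means the handshaked peer lacks support. */
	return ret;
}
```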
 
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_rma.h b/deps/libfabric/prov/efa/src/rxr/rxr_rma.h
index e8253ff6c830016d139c67f30792faf2a3bca693..4bca1e47ca4885e24a53ceb4b77f9b30db2e7cb6 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_rma.h
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_rma.h
@@ -39,8 +39,9 @@
 
 #include <rdma/fi_rma.h>
 
-int rxr_rma_verified_copy_iov(struct rxr_ep *ep, struct fi_rma_iov *rma,
-			      size_t count, uint32_t flags, struct iovec *iov);
+int rxr_rma_verified_copy_iov(struct rxr_ep *ep, struct efa_rma_iov *rma,
+			      size_t count, uint32_t flags,
+			      struct iovec *iov, void **desc);
 
 /* read response related functions */
 struct rxr_tx_entry *
diff --git a/deps/libfabric/prov/gni/configure.m4 b/deps/libfabric/prov/gni/configure.m4
index a368b32cf801c74226fc023a3ced061765fd3180..19b3462bd654f0493f6df4f02fb001070def3c63 100644
--- a/deps/libfabric/prov/gni/configure.m4
+++ b/deps/libfabric/prov/gni/configure.m4
@@ -2,6 +2,8 @@ dnl
 dnl Copyright (c) 2015-2019 Cray Inc. All rights reserved.
 dnl Copyright (c) 2015-2018 Los Alamos National Security, LLC.
 dnl                         All rights reserved.
+dnl Copyright (c) 2021      Triad National Security, LLC. All rights
+dnl                         reserved.
 dnl
 dnl This software is available to you under a choice of one of two
 dnl licenses.  You may choose to be licensed under the terms of the GNU
@@ -87,7 +89,7 @@ AC_DEFUN([FI_GNI_CONFIGURE],[
                                  ],
                                  [ugni_lib_happy=0])
 
-               AS_IF([test x"$enable_ugni_static" == x"yes" && test $ugni_lib_happy -eq 1],
+               AS_IF([test x"$enable_ugni_static" = x"yes" && test $ugni_lib_happy -eq 1],
                      [gni_LDFLAGS=$(echo $gni_LDFLAGS | sed -e 's/lugni/l:libugni.a/')],[])
 
                FI_PKG_CHECK_MODULES_STATIC([CRAY_ALPS_LLI], [cray-alpslli],
@@ -119,14 +121,15 @@ AC_DEFUN([FI_GNI_CONFIGURE],[
                       [AC_DEFINE_UNQUOTED([HAVE_XPMEM], [0], [Define to 1 if xpmem available])
                       ])
 
-               gni_path_to_gni_pub=${CRAY_GNI_HEADERS_INCLUDE_OPTS:2}/gni_pub.h
-               dnl Trim the leading -I in order to provide a path
 
+               CPPFLAGS_SAVE=$CPPFLAGS
+               CPPFLAGS="$gni_CPPFLAGS $CPPFLAGS"
                AC_CHECK_TYPES([gni_ct_cqw_post_descriptor_t], [],
                               [AC_MSG_WARN([GNI provider requires CLE 5.2.UP04 or higher. Disabling gni provider.])
                                gni_header_happy=0
                               ],
-                              [[#include "$gni_path_to_gni_pub"]])
+                              [[#include "gni_pub.h"]])
+               CPPFLAGS=$CPPFLAGS_SAVE
 
                AS_IF([test -d $srcdir/prov/gni/test],
                      [AC_ARG_WITH([criterion], [AS_HELP_STRING([--with-criterion],
diff --git a/deps/libfabric/prov/gni/include/gnix_cm.h b/deps/libfabric/prov/gni/include/gnix_cm.h
index 5344caa071889514fdedb18e77cc16540fc24523..81fdea134afcf39124f1c8cd557c4feddad08bb3 100644
--- a/deps/libfabric/prov/gni/include/gnix_cm.h
+++ b/deps/libfabric/prov/gni/include/gnix_cm.h
@@ -1,7 +1,8 @@
 /*
  * Copyright (c) 2016 Cray Inc. All rights reserved.
  * Copyright (c) 2017 Los Alamos National Security, LLC. All rights reserved.
- * Copyright (c) 2019 Triad National Security, LLC. All rights reserved.
+ * Copyright (c) 2019-2020 Triad National Security, LLC.
+ *                         All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -117,7 +118,7 @@ _gnix_resolve_gni_ep_name(const char *ep_name, int idx,
 	int ret = FI_SUCCESS;
 	static size_t addr_size = sizeof(struct gnix_ep_name);
 
-	GNIX_TRACE(FI_LOG_TRACE, "\n");
+	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");
 
 	/*TODO (optimization): Just return offset into ep_name */
 	memcpy(addr, &ep_name[addr_size * idx], addr_size);
@@ -138,7 +139,7 @@ _gnix_resolve_str_ep_name(const char *ep_name, int idx,
 	int ret = FI_SUCCESS;
 	static size_t addr_size = GNIX_FI_ADDR_STR_LEN;
 
-	GNIX_TRACE(FI_LOG_TRACE, "\n");
+	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");
 
 	ret = _gnix_ep_name_from_str(&ep_name[addr_size * idx], addr);
 	return ret;
diff --git a/deps/libfabric/prov/gni/include/gnix_freelist.h b/deps/libfabric/prov/gni/include/gnix_freelist.h
index 7a2fbb8aaa149ac8aca38f6b113090df8f0e138c..dea00fd253de93562a4980e955bf54d121e8580c 100644
--- a/deps/libfabric/prov/gni/include/gnix_freelist.h
+++ b/deps/libfabric/prov/gni/include/gnix_freelist.h
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2015-2016 Cray Inc.  All rights reserved.
  * Copyright (c) 2015 Los Alamos National Security, LLC. All rights reserved.
+ * Copyright (c) 2020 Triad National Security, LLC. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -128,7 +129,7 @@ static inline int _gnix_fl_alloc(struct dlist_entry **e, struct gnix_freelist *f
         if (fl->refill_size == 0) {
                 ret = -FI_ECANCELED;
 
-                GNIX_DEBUG(FI_LOG_DEBUG, "Freelist not growable (refill "
+                GNIX_DEBUG(FI_LOG_EP_CTRL, "Freelist not growable (refill "
                                    "size is 0\n");
 
                 goto err;
diff --git a/deps/libfabric/prov/gni/src/gnix_av.c b/deps/libfabric/prov/gni/src/gnix_av.c
index 749c7537b965a4335dc06d2184b434d6cd0a8e6a..3aeca6624c25585c853fe2ac6c6262270b163eb3 100644
--- a/deps/libfabric/prov/gni/src/gnix_av.c
+++ b/deps/libfabric/prov/gni/src/gnix_av.c
@@ -2,7 +2,7 @@
  * Copyright (c) 2015-2017 Cray Inc. All rights reserved.
  * Copyright (c) 2015-2017 Los Alamos National Security, LLC.
  *                         All rights reserved.
- * Copyright (c) 2019      Triad National Security, LLC.
+ * Copyright (c) 2019-2020 Triad National Security, LLC.
  *                         All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -361,7 +361,7 @@ static int map_insert(struct gnix_fid_av *av_priv, const void *addr,
 					ret_cnt = -FI_EINVAL;
 					continue;
 				}
-				GNIX_DEBUG(FI_LOG_DEBUG, "ep_name doesn't fit "
+				GNIX_DEBUG(FI_LOG_AV, "ep_name doesn't fit "
 					"into the av context bits\n");
 				return -FI_EINVAL; /* TODO: should try to do
 						      cleanup */
@@ -745,7 +745,7 @@ DIRECT_FN const char *gnix_av_straddr(struct fid_av *av,
 	struct gnix_fid_av *av_priv;
 
 	if (!av || !addr || !buf || !len) {
-		GNIX_DEBUG(FI_LOG_DEBUG, "NULL parameter in gnix_av_straddr\n");
+		GNIX_DEBUG(FI_LOG_AV, "NULL parameter in gnix_av_straddr\n");
 		return NULL;
 	}
 
diff --git a/deps/libfabric/prov/gni/src/gnix_cm.c b/deps/libfabric/prov/gni/src/gnix_cm.c
index f30aa0cd4c7dcf25a51cf4671903b7eeaee75b19..f3c301e77ce388b10fec49f700207d12497dc808 100644
--- a/deps/libfabric/prov/gni/src/gnix_cm.c
+++ b/deps/libfabric/prov/gni/src/gnix_cm.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2015-2017 Cray Inc.  All rights reserved.
  * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights reserved.
+ * Copyright (c) 2020 Triad National Security, LLC. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -55,7 +56,7 @@ int _gnix_ep_name_to_str(struct gnix_ep_name *ep_name, char **out_buf)
 	char *str;
 	size_t len = GNIX_FI_ADDR_STR_LEN;
 
-	GNIX_TRACE(FI_LOG_TRACE, "\n");
+	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");
 
 	if (*out_buf == NULL) {
 		str = calloc(len, sizeof(char));
@@ -90,10 +91,10 @@ int _gnix_ep_name_from_str(const char *addr,
 	long tok_val;
 	char *dup_addr;
 
-	GNIX_TRACE(FI_LOG_TRACE, "\n");
+	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");
 
 	if (!addr || !resolved_addr) {
-		GNIX_WARN(FI_LOG_WARN, "NULL parameter in "
+		GNIX_WARN(FI_LOG_EP_CTRL, "NULL parameter in "
 			"__gnix_resolved_name_from_str");
 		return -FI_EINVAL;
 	}
@@ -105,34 +106,35 @@
 
 	tok = strtok(dup_addr, ";");
 	if (!tok) {
-		GNIX_WARN(FI_LOG_WARN, "Invalid address.\n");
+		GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n");
 		return -FI_EINVAL;
 	}
 
 	ret = memcmp(tok, "gni", 3);
 	if (ret) {
-		GNIX_WARN(FI_LOG_WARN, "Invalid address.\n");
+		GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n");
 		free(dup_addr);
 		return -FI_EINVAL;
 	}
 
 	tok = strtok(NULL, ";");/*node*/
 	if (!tok) {
-		GNIX_WARN(FI_LOG_WARN, "Invalid address.\n");
+		GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n");
 		free(dup_addr);
 		return -FI_EINVAL;
 	}
 
 	tok = strtok(NULL, ";");/*service*/
 	if (!tok) {
-		GNIX_WARN(FI_LOG_WARN, "Invalid address.\n");
+		GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n");
 		free(dup_addr);
 		return -FI_EINVAL;
 	}
 
 	tok = strtok(NULL, ";");/*GNIX_AV_STR_ADDR_VERSION*/
 	if (!tok) {
-		GNIX_WARN(FI_LOG_WARN, "Invalid address.\n");
+		GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n");
 		free(dup_addr);
 		return -FI_EINVAL;
 	}
@@ -140,13 +141,13 @@ int _gnix_ep_name_from_str(const char *addr,
 	/*device_addr*/
 	tok = strtok(NULL, ";");
 	if (!tok) {
-		GNIX_WARN(FI_LOG_WARN, "Invalid address.\n");
+		GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n");
 		free(dup_addr);
 		return -FI_EINVAL;
 	}
 	tok_val = strtol(tok, &endptr, 16);
 	if (*endptr) {
-		GNIX_WARN(FI_LOG_WARN, "Invalid device_addr.\n");
+		GNIX_WARN(FI_LOG_EP_CTRL, "Invalid device_addr.\n");
 		free(dup_addr);
 		return -FI_EINVAL;
 	}
@@ -155,13 +156,13 @@ int _gnix_ep_name_from_str(const char *addr,
 	/*cdm_id*/
 	tok = strtok(NULL, ";");
 	if (!tok) {
-		GNIX_WARN(FI_LOG_WARN, "Invalid address.\n");
+		GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n");
 		free(dup_addr);
 		return -FI_EINVAL;
 	}
 	tok_val = strtol(tok, &endptr, 16);
 	if (*endptr) {
-		GNIX_WARN(FI_LOG_WARN, "Invalid cdm_id.\n");
+		GNIX_WARN(FI_LOG_EP_CTRL, "Invalid cdm_id.\n");
 		free(dup_addr);
 		return -FI_EINVAL;
 	}
@@ -170,13 +171,13 @@ int _gnix_ep_name_from_str(const char *addr,
 	/*name_type*/
 	tok = strtok(NULL, ";");
 	if (!tok) {
-		GNIX_WARN(FI_LOG_WARN, "Invalid address.\n");
+		GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n");
 		free(dup_addr);
 		return -FI_EINVAL;
 	}
 	tok_val = strtol(tok, &endptr, 10);
 	if (*endptr) {
-		GNIX_WARN(FI_LOG_WARN, "Invalid name_type.\n");
+		GNIX_WARN(FI_LOG_EP_CTRL, "Invalid name_type.\n");
 		free(dup_addr);
 		return -FI_EINVAL;
 	}
@@ -185,13 +186,13 @@ int _gnix_ep_name_from_str(const char *addr,
 	/*cm_nic_cdm_id*/
 	tok = strtok(NULL, ";");
 	if (!tok) {
-		GNIX_WARN(FI_LOG_WARN, "Invalid address.\n");
+		GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n");
 		free(dup_addr);
 		return -FI_EINVAL;
 	}
 	tok_val = strtol(tok, &endptr, 16);
 	if (*endptr) {
-		GNIX_WARN(FI_LOG_WARN, "Invalid cm_nic_cdm_id.\n");
+		GNIX_WARN(FI_LOG_EP_CTRL, "Invalid cm_nic_cdm_id.\n");
 		free(dup_addr);
 		return -FI_EINVAL;
 	}
@@ -200,13 +201,13 @@ int _gnix_ep_name_from_str(const char *addr,
 	/*cookie*/
 	tok = strtok(NULL, ";");
 	if (!tok) {
-		GNIX_WARN(FI_LOG_WARN, "Invalid address.\n");
+		GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n");
 		free(dup_addr);
 		return -FI_EINVAL;
 	}
 	tok_val = strtol(tok, &endptr, 16);
 	if (*endptr) {
-		GNIX_WARN(FI_LOG_WARN, "Invalid cookie.\n");
+		GNIX_WARN(FI_LOG_EP_CTRL, "Invalid cookie.\n");
 		free(dup_addr);
 		return -FI_EINVAL;
 	}
@@ -215,13 +216,13 @@ int _gnix_ep_name_from_str(const char *addr,
 	/*rx_ctx_cnt*/
 	tok = strtok(NULL, ";");
 	if (!tok) {
-		GNIX_WARN(FI_LOG_WARN, "Invalid address.\n");
+		GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n");
 		free(dup_addr);
 		return -FI_EINVAL;
 	}
 	tok_val = strtol(tok, &endptr, 10);
 	if (*endptr) {
-		GNIX_WARN(FI_LOG_WARN, "Invalid rx_ctx_cnt.\n");
+		GNIX_WARN(FI_LOG_EP_CTRL, "Invalid rx_ctx_cnt.\n");
 		free(dup_addr);
 		return -FI_EINVAL;
 	}
diff --git a/deps/libfabric/prov/gni/src/gnix_ep.c b/deps/libfabric/prov/gni/src/gnix_ep.c
index cd09fc95ee0f6092282013a18b7ce14cb6320d45..48e5437075b99ae2deb9a208a3cef13580a7918a 100644
--- a/deps/libfabric/prov/gni/src/gnix_ep.c
+++ b/deps/libfabric/prov/gni/src/gnix_ep.c
@@ -2,7 +2,8 @@
  * Copyright (c) 2015-2019 Cray Inc. All rights reserved.
  * Copyright (c) 2015-2018 Los Alamos National Security, LLC.
  *                         All rights reserved.
- * Copyright (c) 2019 Triad National Security, LLC. All rights reserved.
+ * Copyright (c) 2019-2020 Triad National Security, LLC.
+ *                         All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -799,7 +800,7 @@ gnix_ep_readv(struct fid_ep *ep, const struct iovec *iov, void **desc,
 	struct gnix_fid_ep *gnix_ep;
 	uint64_t flags;
 
-	if (!ep || !iov || !desc || count > GNIX_MAX_RMA_IOV_LIMIT) {
+	if (!ep || !iov || count > GNIX_MAX_RMA_IOV_LIMIT) {
 		return -FI_EINVAL;
 	}
 
@@ -809,7 +810,7 @@ gnix_ep_readv(struct fid_ep *ep, const struct iovec *iov, void **desc,
 	flags = gnix_ep->op_flags | GNIX_RMA_READ_FLAGS_DEF;
 
 	return _gnix_rma(gnix_ep, GNIX_FAB_RQ_RDMA_READ,
-			 (uint64_t)iov[0].iov_base, iov[0].iov_len, desc[0],
+			 (uint64_t)iov[0].iov_base, iov[0].iov_len, desc ? desc[0] : NULL,
 			 src_addr, addr, key,
 			 context, flags, 0);
 }
@@ -819,7 +820,7 @@ gnix_ep_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags)
 {
 	struct gnix_fid_ep *gnix_ep;
 
-	if (!ep || !msg || !msg->msg_iov || !msg->rma_iov || !msg->desc ||
+	if (!ep || !msg || !msg->msg_iov || !msg->rma_iov ||
 	    msg->iov_count != 1 || msg->rma_iov_count != 1 ||
 	    msg->rma_iov[0].len > msg->msg_iov[0].iov_len) {
 		return -FI_EINVAL;
@@ -832,7 +833,7 @@ gnix_ep_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags)
 
 	return _gnix_rma(gnix_ep, GNIX_FAB_RQ_RDMA_READ,
 			 (uint64_t)msg->msg_iov[0].iov_base,
-			 msg->msg_iov[0].iov_len, msg->desc[0],
+			 msg->msg_iov[0].iov_len, msg->desc ? msg->desc[0] : NULL,
 			 msg->addr, msg->rma_iov[0].addr, msg->rma_iov[0].key,
 			 msg->context, flags, msg->data);
 }
@@ -866,7 +867,7 @@ gnix_ep_writev(struct fid_ep *ep, const struct iovec *iov, void **desc,
 	struct gnix_fid_ep *gnix_ep;
 	uint64_t flags;
 
-	if (!ep || !iov || !desc || count > GNIX_MAX_RMA_IOV_LIMIT) {
+	if (!ep || !iov || count > GNIX_MAX_RMA_IOV_LIMIT) {
 		return -FI_EINVAL;
 	}
 
@@ -876,7 +877,7 @@ gnix_ep_writev(struct fid_ep *ep, const struct iovec *iov, void **desc,
 	flags = gnix_ep->op_flags | GNIX_RMA_WRITE_FLAGS_DEF;
 
 	return _gnix_rma(gnix_ep, GNIX_FAB_RQ_RDMA_WRITE,
-			 (uint64_t)iov[0].iov_base, iov[0].iov_len, desc[0],
+			 (uint64_t)iov[0].iov_base, iov[0].iov_len, desc ? desc[0] : NULL,
 			 dest_addr, addr, key, context, flags, 0);
 }
 
@@ -2350,7 +2351,7 @@ DIRECT_FN int gnix_ep_open(struct fid_domain *domain, struct fi_info *info,
 	ep_priv->info = fi_dupinfo(info);
 	ep_priv->info->addr_format = info->addr_format;
 
-	GNIX_DEBUG(FI_LOG_DEBUG, "ep(%p) is using addr_format(%s)\n", ep_priv,
+	GNIX_DEBUG(FI_LOG_EP_CTRL, "ep(%p) is using addr_format(%s)\n", ep_priv,
 		  ep_priv->info->addr_format == FI_ADDR_STR ? "FI_ADDR_STR" :
 		  "FI_ADDR_GNI");
 
diff --git a/deps/libfabric/prov/gni/src/gnix_mbox_allocator.c b/deps/libfabric/prov/gni/src/gnix_mbox_allocator.c
index d398e0055f91e23c20c9b22306b9912f8de207da..fb8bd9435e0c3cb55723e1749578d9f65a8f7bb9 100644
--- a/deps/libfabric/prov/gni/src/gnix_mbox_allocator.c
+++ b/deps/libfabric/prov/gni/src/gnix_mbox_allocator.c
@@ -125,6 +125,7 @@ static int __generate_file_name(size_t page_size, char **filename)
 	int my_file_id;
 	int size;
 	int ret;
+	int file_name_size;
 
 	if (!filename) {
 		GNIX_WARN(FI_LOG_EP_CTRL, "filename pointer is NULL.\n");
@@ -151,7 +152,8 @@ static int __generate_file_name(size_t page_size, char **filename)
 		goto err_snprintf;
 	}
 
-	full_filename = malloc(size + 1);
+	file_name_size = size + 1;
+	full_filename = malloc(file_name_size);
 	if (!full_filename) {
 		error = strerror_r(errno, error_buf, sizeof(error_buf));
 		GNIX_WARN(FI_LOG_EP_CTRL,
@@ -161,8 +163,8 @@ static int __generate_file_name(size_t page_size, char **filename)
 		goto err_snprintf;
 	}
 
-	sprintf(full_filename, "%s/%s.%d.%d", huge_page, basename, getpid(),
-		my_file_id);
+	snprintf(full_filename, file_name_size, "%s/%s.%d.%d", huge_page, basename,
+			getpid(), my_file_id);
 
 	GNIX_DEBUG(FI_LOG_EP_CTRL, "Generated filename: %s\n", full_filename);
 
diff --git a/deps/libfabric/prov/gni/src/gnix_msg.c b/deps/libfabric/prov/gni/src/gnix_msg.c
index 6cfcd5aeb8e653286e900dcf1eeeed6d6a2fe439..4b24d8702fbe931009c67fad9363faad2694ba89 100644
--- a/deps/libfabric/prov/gni/src/gnix_msg.c
+++ b/deps/libfabric/prov/gni/src/gnix_msg.c
@@ -2,7 +2,8 @@
  * Copyright (c) 2015-2019 Cray Inc. All rights reserved.
  * Copyright (c) 2015-2018 Los Alamos National Security, LLC.
  *                         All rights reserved.
- * Copyright (c) 2019 Triad National Security, LLC. All rights reserved.
+ * Copyright (c) 2019-2020 Triad National Security, LLC.
+ *                         All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -407,7 +408,7 @@ static int __recv_completion_src(
 	char *buffer;
 	size_t buf_len;
 
-	GNIX_DBG_TRACE(FI_LOG_TRACE, "\n");
+	GNIX_DBG_TRACE(FI_LOG_EP_DATA, "\n");
 
 	if ((req->msg.recv_flags & FI_COMPLETION) && ep->recv_cq) {
 		if ((src_addr == FI_ADDR_NOTAVAIL) &&
diff --git a/deps/libfabric/prov/gni/test/api.c b/deps/libfabric/prov/gni/test/api.c
index c61304df8c6daf96f2fd6be265d2ff29cbc435a2..5808e0693839ee98949d4bf53a3adb9c0b7befe5 100644
--- a/deps/libfabric/prov/gni/test/api.c
+++ b/deps/libfabric/prov/gni/test/api.c
@@ -2,6 +2,7 @@
  * Copyright (c) 2015-2017 Los Alamos National Security, LLC.
  *                         All rights reserved.
  * Copyright (c) 2015-2017 Cray Inc. All rights reserved.
+ * Copyright (c) 2020 Triad National Security, LLC. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -70,24 +71,23 @@
 static uint64_t mode_bits = ~FI_NOTIFY_FLAGS_ONLY;
 static struct fid_fabric *fab;
 static struct fid_domain *dom[NUMEPS];
-struct fi_gni_ops_domain *gni_domain_ops[NUMEPS];
+static struct fi_gni_ops_domain *gni_domain_ops[NUMEPS];
 static struct fid_ep *ep[NUMEPS];
 static struct fid_av *av[NUMEPS];
-void *ep_name[NUMEPS];
-fi_addr_t gni_addr[NUMEPS];
+static void *ep_name[NUMEPS];
+static fi_addr_t gni_addr[NUMEPS];
 static struct fid_cq *msg_cq[NUMEPS];
 static struct fi_info *fi[NUMEPS];
 static struct fi_cq_attr cq_attr;
-const char *api_cdm_id[NUMEPS] = { "5000", "5001" };
-struct fi_info *hints[NUMEPS];
+static struct fi_info *hints[NUMEPS];
 
 #define BUF_SZ (1<<20)
-char *target, *target_base;
-char *source, *source_base;
-char *uc_target;
-char *uc_source;
-struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS];
-uint64_t mr_key[NUMEPS];
+static char *target, *target_base;
+static char *source, *source_base;
+static char *uc_target;
+static char *uc_source;
+static struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS];
+static uint64_t mr_key[NUMEPS];
 
 static struct fid_cntr *send_cntr[NUMEPS], *recv_cntr[NUMEPS];
 static struct fi_cntr_attr cntr_attr = {
diff --git a/deps/libfabric/prov/gni/test/api_cntr.c b/deps/libfabric/prov/gni/test/api_cntr.c
index 3ed370a56c9e3b5fe6d10acec1c118ca741a5ca1..cf3a878cc49ca8330a674d3bbb579f3e36d3c98a 100644
--- a/deps/libfabric/prov/gni/test/api_cntr.c
+++ b/deps/libfabric/prov/gni/test/api_cntr.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2016-2017 Cray Inc. All rights reserved.
+ * Copyright (c) 2020 Triad National Security, LLC. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -67,22 +68,22 @@
 static uint64_t mode_bits = ~FI_NOTIFY_FLAGS_ONLY;
 static struct fid_fabric *fab;
 static struct fid_domain *dom[NUMEPS];
-struct fi_gni_ops_domain *gni_domain_ops[NUMEPS];
+static struct fi_gni_ops_domain *gni_domain_ops[NUMEPS];
 static struct fid_ep *ep[NUMEPS];
 static struct fid_av *av[NUMEPS];
-void *ep_name[NUMEPS];
-fi_addr_t gni_addr[NUMEPS];
+static void *ep_name[NUMEPS];
+static fi_addr_t gni_addr[NUMEPS];
 static struct fi_info *fi[NUMEPS];
-struct fi_info *hints[NUMEPS];
+static struct fi_info *hints[NUMEPS];
 
 #define BUF_SZ (1<<20)
-char *target, *target_base;
-char *source, *source_base;
-char *uc_target;
-char *uc_source;
-struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS];
-uint64_t mr_key[NUMEPS];
-uint64_t cntr_bind_flags;
+static char *target, *target_base;
+static char *source, *source_base;
+static char *uc_target;
+static char *uc_source;
+static struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS];
+static uint64_t mr_key[NUMEPS];
+static uint64_t cntr_bind_flags;
 
 static struct fid_cntr *send_cntr[NUMEPS], *recv_cntr[NUMEPS];
 static struct fid_cntr *write_cntr[NUMEPS], *read_cntr[NUMEPS];
diff --git a/deps/libfabric/prov/gni/test/api_cq.c b/deps/libfabric/prov/gni/test/api_cq.c
index 6559c043f1b7f8e731fa7d06a31a5b0ea10b8547..26296f6ad729538a553aeaf7e033f9322b27695c 100644
--- a/deps/libfabric/prov/gni/test/api_cq.c
+++ b/deps/libfabric/prov/gni/test/api_cq.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights reserved.
  * Copyright (c) 2015-2017 Cray Inc. All rights reserved.
+ * Copyright (c) 2020 Triad National Security, LLC. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -68,24 +69,24 @@
 static uint64_t mode_bits = ~FI_NOTIFY_FLAGS_ONLY;
 static struct fid_fabric *fab;
 static struct fid_domain *dom[NUMEPS];
-struct fi_gni_ops_domain *gni_domain_ops[NUMEPS];
+static struct fi_gni_ops_domain *gni_domain_ops[NUMEPS];
 static struct fid_ep *ep[NUMEPS];
 static struct fid_av *av[NUMEPS];
-void *ep_name[NUMEPS];
-fi_addr_t gni_addr[NUMEPS];
+static void *ep_name[NUMEPS];
+static fi_addr_t gni_addr[NUMEPS];
 static struct fid_cq *msg_cq[NUMEPS];
 static struct fi_info *fi[NUMEPS];
 static struct fi_cq_attr cq_attr;
-struct fi_info *hints[NUMEPS];
+static struct fi_info *hints[NUMEPS];
 
 #define BUF_SZ (1<<20)
-char *target, *target_base;
-char *source, *source_base;
-char *uc_target;
-char *uc_source;
-struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS];
-uint64_t mr_key[NUMEPS];
-uint64_t cq_bind_flags;
+static char *target, *target_base;
+static char *source, *source_base;
+static char *uc_target;
+static char *uc_source;
+static struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS];
+static uint64_t mr_key[NUMEPS];
+static uint64_t cq_bind_flags;
 
 void api_cq_bind(uint64_t flags)
 {
diff --git a/deps/libfabric/prov/gni/test/av.c b/deps/libfabric/prov/gni/test/av.c
index 824f33f76ff9817d690102a38e6b37304c379f50..f0f4b20ccf1cbde01ffe859a5e140944170da813 100644
--- a/deps/libfabric/prov/gni/test/av.c
+++ b/deps/libfabric/prov/gni/test/av.c
@@ -3,7 +3,7 @@
  *                         All rights reserved.
  * Copyright (c) 2015-2017 Cray Inc.  All rights reserved.
  * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2019      Triad National Security, LLC.
+ * Copyright (c) 2019-2020 Triad National Security, LLC.
  *                         All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -54,7 +54,7 @@ static struct fid_fabric *fab;
 static struct fid_domain *dom;
 static struct fi_info *hints;
 static struct fi_info *fi;
-struct gnix_ep_name *fake_names;
+static struct gnix_ep_name *fake_names;
 static struct fid_av *av;
 static struct gnix_fid_av *gnix_av;
 
diff --git a/deps/libfabric/prov/gni/test/cancel.c b/deps/libfabric/prov/gni/test/cancel.c
index a410d3fc350feee4a9b196534c7eb58ffe9b800e..eb24433c015546e5d816262c69078ba28789fb7a 100644
--- a/deps/libfabric/prov/gni/test/cancel.c
+++ b/deps/libfabric/prov/gni/test/cancel.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights reserved.
  * Copyright (c) 2015-2017 Cray Inc.  All rights reserved.
+ * Copyright (c) 2020 Triad National Security, LLC. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -63,16 +64,16 @@ static struct fid_ep *ep[2];
 static struct fid_av *av;
 static struct fi_info *hints;
 static struct fi_info *fi;
-void *ep_name[2];
-size_t gni_addr[2];
+static void *ep_name[2];
+static size_t gni_addr[2];
 static struct fid_cq *msg_cq[2];
 static struct fi_cq_attr cq_attr;
 
 #define BUF_SZ (8*1024)
-char *target, *target_base;
-char *source, *source_base;
-struct fid_mr *rem_mr, *loc_mr;
-uint64_t mr_key;
+static char *target, *target_base;
+static char *source, *source_base;
+static struct fid_mr *rem_mr, *loc_mr;
+static uint64_t mr_key;
 
 void cancel_setup(void)
 {
diff --git a/deps/libfabric/prov/gni/test/cm.c b/deps/libfabric/prov/gni/test/cm.c
index 1572405e3996da70431dc8caec25b8f92d5ffa0e..0ce599c4d71ba732f7bff2449ceb2d2550ba9c24 100644
--- a/deps/libfabric/prov/gni/test/cm.c
+++ b/deps/libfabric/prov/gni/test/cm.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2016-2017 Cray Inc. All rights reserved.
- * Copyright (c) 2019 Triad National Security, LLC. All rights reserved.
+ * Copyright (c) 2019-2020 Triad National Security, LLC.
+ *                         All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -83,7 +84,7 @@ static struct fi_info *cli_hints;
 static struct fi_info *cli_fi;
 static struct fid_eq *cli_eq;
 static struct fid_cq *cli_cq;
-char *cli_cm_in_data = "Hola.  Soy cliente.";
+static char *cli_cm_in_data = "Hola.  Soy cliente.";
 
 static struct fid_fabric *srv_fab;
 static struct fid_domain *srv_dom;
@@ -93,7 +94,7 @@ static struct fi_info *srv_hints;
 static struct fi_info *srv_fi;
 static struct fid_eq *srv_eq;
 static struct fid_cq *srv_cq;
-char *srv_cm_in_data = "Este es servidor.";
+static char *srv_cm_in_data = "Este es servidor.";
 
 struct fi_eq_attr eq_attr = {
 	.wait_obj = FI_WAIT_UNSPEC
diff --git a/deps/libfabric/prov/gni/test/cntr.c b/deps/libfabric/prov/gni/test/cntr.c
index bd6eb5b873e3be8f13311d5299b5dd475dc00010..51bdf9ce08ea86613eea5170dabea679aa262786 100644
--- a/deps/libfabric/prov/gni/test/cntr.c
+++ b/deps/libfabric/prov/gni/test/cntr.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights reserved.
  * Copyright (c) 2015-2017 Cray Inc.  All rights reserved.
+ * Copyright (c) 2020 Triad National Security, LLC. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -90,10 +91,10 @@ static struct fi_cntr_attr cntr_attr = {.events = FI_CNTR_EVENTS_COMP,
 					.flags = 0};
 
 #define BUF_SZ (64*1024)
-char *target, *target_base;
-char *source, *source_base;
-struct fid_mr *rem_mr[NUM_EPS], *loc_mr[NUM_EPS];
-uint64_t mr_key[NUM_EPS];
+static char *target, *target_base;
+static char *source, *source_base;
+static struct fid_mr *rem_mr[NUM_EPS], *loc_mr[NUM_EPS];
+static uint64_t mr_key[NUM_EPS];
 
 static inline void cntr_setup_eps(const uint64_t caps,
 	uint32_t version,
diff --git a/deps/libfabric/prov/gni/test/datagram.c b/deps/libfabric/prov/gni/test/datagram.c
index 5f0f9c28850baa3ab5dfab4add8c9890b36be911..dc5bf641168e757113016d1ec9d3dea4d2e5059a 100644
--- a/deps/libfabric/prov/gni/test/datagram.c
+++ b/deps/libfabric/prov/gni/test/datagram.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights reserved.
  * Copyright (c) 2015-2017 Cray Inc. All rights reserved.
+ * Copyright (c) 2020 Triad National Security, LLC. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -64,7 +65,7 @@ static struct fid_ep *ep;
 static struct fi_info *hints;
 static struct fi_info *fi;
 static struct gnix_fid_ep *ep_priv;
-const char  my_cdm_id[] = "3000";
+static const char  my_cdm_id[] = "3000";
 
 void dg_setup(void)
 {
diff --git a/deps/libfabric/prov/gni/test/rdm_addr_str_sr.c b/deps/libfabric/prov/gni/test/rdm_addr_str_sr.c
index 312d3bf0899820cfdac78c166a19df7c226707f2..9f600618b032f30b752a946e8a96b0ac05b9922d 100644
--- a/deps/libfabric/prov/gni/test/rdm_addr_str_sr.c
+++ b/deps/libfabric/prov/gni/test/rdm_addr_str_sr.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 2020 Triad National Security, LLC. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -69,7 +70,7 @@ static fi_addr_t gni_addr[NUMEPS];
 static struct fid_cq *msg_cq[NUMEPS];
 static struct fi_info *fi[NUMEPS];
 static struct fi_cq_attr cq_attr;
-struct fi_info *hints;
+static struct fi_info *hints;
 static size_t addrlen = 0;
 
 #define BUF_SZ (1<<20)
diff --git a/deps/libfabric/prov/gni/test/rdm_atomic.c b/deps/libfabric/prov/gni/test/rdm_atomic.c
index 98560dc60cafebfcf523bf1626cae46a0cff5440..99d237ee36155bcb9e8efd72b9d7e4cf3913de95 100644
--- a/deps/libfabric/prov/gni/test/rdm_atomic.c
+++ b/deps/libfabric/prov/gni/test/rdm_atomic.c
@@ -1,7 +1,8 @@
 /*
  * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights reserved.
  * Copyright (c) 2015-2017 Cray Inc. All rights reserved.
- * Copyright (c) 2019 Triad National Security, LLC. All rights reserved.
+ * Copyright (c) 2019-2020 Triad National Security, LLC.
+ *                         All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -72,23 +73,23 @@
 static uint64_t mode_bits = ~FI_NOTIFY_FLAGS_ONLY;
 static struct fid_fabric *fab;
 static struct fid_domain *dom[NUMEPS];
-struct fi_gni_ops_domain *gni_domain_ops[NUMEPS];
+static struct fi_gni_ops_domain *gni_domain_ops[NUMEPS];
 static struct fid_ep *ep[NUMEPS];
 static struct fid_av *av[NUMEPS];
 static struct fi_info *hints;
 static struct fi_info *fi;
-void *ep_name[NUMEPS];
-size_t gni_addr[NUMEPS];
+static void *ep_name[NUMEPS];
+static size_t gni_addr[NUMEPS];
 static struct fid_cq *send_cq[NUMEPS];
 static struct fid_cq *recv_cq[NUMEPS];
 static struct fi_cq_attr cq_attr;
 
 #define BUF_SZ (64*1024)
-char *target, *target_base;
-char *source, *source_base;
-char *uc_source, *uc_source_base;
-struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS];
-uint64_t mr_key[NUMEPS];
+static char *target, *target_base;
+static char *source, *source_base;
+static char *uc_source, *uc_source_base;
+static struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS];
+static uint64_t mr_key[NUMEPS];
 
 static struct fid_cntr *write_cntr[NUMEPS], *read_cntr[NUMEPS];
 static struct fid_cntr *rwrite_cntr;
diff --git a/deps/libfabric/prov/gni/test/rdm_dgram_rma.c b/deps/libfabric/prov/gni/test/rdm_dgram_rma.c
index 96b54aaf41e1a5ce41dac4e98f9aeca6d4c5c3eb..f07cc8497d4e6ae354da722e3ee60201636fb9cb 100644
--- a/deps/libfabric/prov/gni/test/rdm_dgram_rma.c
+++ b/deps/libfabric/prov/gni/test/rdm_dgram_rma.c
@@ -2,7 +2,8 @@
  * Copyright (c) 2015-2017 Los Alamos National Security, LLC.
  *                         All rights reserved.
  * Copyright (c) 2015-2018 Cray Inc. All rights reserved.
- * Copyright (c) 2019 Triad National Security, LLC. All rights reserved.
+ * Copyright (c) 2019-2020 Triad National Security, LLC.
+ *                         All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -70,25 +71,25 @@
 static uint64_t mode_bits = ~FI_NOTIFY_FLAGS_ONLY;
 static struct fid_fabric *fab;
 static struct fid_domain *dom[2];
-struct fi_gni_ops_domain *gni_domain_ops[2];
+static struct fi_gni_ops_domain *gni_domain_ops[2];
 static struct fid_ep *ep[2];
 static struct fid_av *av[2];
 static struct fi_info *hints;
 static struct fi_info *fi;
-void *ep_name[2];
-size_t gni_addr[2];
+static void *ep_name[2];
+static size_t gni_addr[2];
 static struct fid_cq *send_cq[2];
 static struct fid_cq *recv_cq[2];
 static struct fi_cq_attr cq_attr[2];
 
 #define BUF_SZ (64*1024)
-char *target, *target_base;
-char *target2, *target2_base;
-char *source, *source_base;
-char *source2, *source2_base;
-char *uc_source;
-struct fid_mr *rem_mr[2], *loc_mr[2], *rem_mr2[2], *loc_mr2[2];
-uint64_t mr_key[2], mr_key2[2];
+static char *target, *target_base;
+static char *target2, *target2_base;
+static char *source, *source_base;
+static char *source2, *source2_base;
+static char *uc_source;
+static struct fid_mr *rem_mr[2], *loc_mr[2], *rem_mr2[2], *loc_mr2[2];
+static uint64_t mr_key[2], mr_key2[2];
 
 static struct fid_cntr *write_cntr[2], *read_cntr[2];
 static struct fid_cntr *rwrite_cntr;
diff --git a/deps/libfabric/prov/gni/test/rdm_dgram_stx.c b/deps/libfabric/prov/gni/test/rdm_dgram_stx.c
index c57a7f707cde632da482f449382e2b070ebab009..c1bceade365857230b906d3999ba620f8531e4ff 100644
--- a/deps/libfabric/prov/gni/test/rdm_dgram_stx.c
+++ b/deps/libfabric/prov/gni/test/rdm_dgram_stx.c
@@ -2,7 +2,8 @@
  * Copyright (c) 2015-2017 Los Alamos National Security, LLC.
  *                         All rights reserved.
  * Copyright (c) 2015-2018 Cray Inc. All rights reserved.
- * Copyright (c) 2019 Triad National Security, LLC. All rights reserved.
+ * Copyright (c) 2019-2020 Triad National Security, LLC.
+ *                         All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -71,13 +72,13 @@
 static uint64_t mode_bits = ~FI_NOTIFY_FLAGS_ONLY;
 static struct fid_fabric *fab;
 static struct fid_domain *dom[2];
-struct fi_gni_ops_domain *gni_domain_ops[2];
+static struct fi_gni_ops_domain *gni_domain_ops[2];
 static struct fid_ep *ep[2];
 static struct fid_av *av[2];
 static struct fi_info *hints;
 static struct fi_info *fi;
-void *ep_name[2];
-size_t gni_addr[2];
+static void *ep_name[2];
+static size_t gni_addr[2];
 static struct fid_cq *send_cq[2];
 static struct fid_cq *recv_cq[2];
 static struct fi_cq_attr cq_attr[2];
@@ -85,11 +86,11 @@ static struct fid_stx *stx_ctx[2];
 static struct fid_stx *stx_ctx_too_late;
 
 #define BUF_SZ (64*1024)
-char *target, *target_base;
-char *source, *source_base;
-char *uc_source;
-struct fid_mr *rem_mr[2], *loc_mr[2];
-uint64_t mr_key[2];
+static char *target, *target_base;
+static char *source, *source_base;
+static char *uc_source;
+static struct fid_mr *rem_mr[2], *loc_mr[2];
+static uint64_t mr_key[2];
 
 static struct fid_cntr *write_cntr[2], *read_cntr[2];
 static struct fid_cntr *rwrite_cntr;
diff --git a/deps/libfabric/prov/gni/test/rdm_multi_recv.c b/deps/libfabric/prov/gni/test/rdm_multi_recv.c
index a3ed435f406eff952061efc984b823cf0ac5565b..021e479d8b7419f7347734d67973fba6435c6b6c 100644
--- a/deps/libfabric/prov/gni/test/rdm_multi_recv.c
+++ b/deps/libfabric/prov/gni/test/rdm_multi_recv.c
@@ -2,7 +2,8 @@
  * Copyright (c) 2015-2017 Los Alamos National Security, LLC.
  *                         All rights reserved.
  * Copyright (c) 2015-2017 Cray Inc. All rights reserved.
- * Copyright (c) 2019 Triad National Security, LLC. All rights reserved.
+ * Copyright (c) 2019-2020 Triad National Security, LLC.
+ *                         All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -73,24 +74,24 @@
 static uint64_t mode_bits = ~FI_NOTIFY_FLAGS_ONLY;
 static struct fid_fabric *fab;
 static struct fid_domain *dom[NUMEPS];
-struct fi_gni_ops_domain *gni_domain_ops[NUMEPS];
+static struct fi_gni_ops_domain *gni_domain_ops[NUMEPS];
 static struct fid_ep *ep[NUMEPS];
 static struct fid_av *av[NUMEPS];
-void *ep_name[NUMEPS];
-fi_addr_t gni_addr[NUMEPS];
+static void *ep_name[NUMEPS];
+static fi_addr_t gni_addr[NUMEPS];
 static struct fid_cq *msg_cq[NUMEPS];
 static struct fi_info *fi[NUMEPS];
 static struct fi_cq_attr cq_attr;
-struct fi_info *hints;
+static struct fi_info *hints;
 
 #define BUF_SZ (1<<20)
 #define BUF_RNDZV (1<<14)
 #define IOV_CNT (1<<3)
 
-char *target, *target_base;
-char *target2, *target2_base;
-char *source, *source_base;
-char *source2, *source2_base;
+static char *target, *target_base;
+static char *target2, *target2_base;
+static char *source, *source_base;
+static char *source2, *source2_base;
 struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS];
 
 static struct fid_cntr *send_cntr[NUMEPS], *recv_cntr[NUMEPS];
diff --git a/deps/libfabric/prov/gni/test/rdm_rx_overrun.c b/deps/libfabric/prov/gni/test/rdm_rx_overrun.c
index 2f53e4bb2ec533a0d90099f0e5b973f2fd6cad36..eb4da67c8f2a14b9830c31266a754546deed15b7 100644
--- a/deps/libfabric/prov/gni/test/rdm_rx_overrun.c
+++ b/deps/libfabric/prov/gni/test/rdm_rx_overrun.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights reserved.
  * Copyright (c) 2015-2017 Cray Inc.  All rights reserved.
+ * Copyright (c) 2020 Triad National Security, LLC. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -80,7 +81,7 @@ static struct fi_cq_attr cq_attr;
 
 static int target[NUM_EPS];
 static int source[NUM_EPS];
-struct fid_mr *rem_mr[NUM_EPS], *loc_mr[NUM_EPS];
+static struct fid_mr *rem_mr[NUM_EPS], *loc_mr[NUM_EPS];
 static uint64_t mr_key[NUM_EPS];
 
 static int max_eps = NUM_EPS;
diff --git a/deps/libfabric/prov/gni/test/rdm_sr.c b/deps/libfabric/prov/gni/test/rdm_sr.c
index 3f0874809552efc807cc67155b565bda642d0b0a..a645dc8ed011ae5edd627f1e18a55dc2656a8b57 100644
--- a/deps/libfabric/prov/gni/test/rdm_sr.c
+++ b/deps/libfabric/prov/gni/test/rdm_sr.c
@@ -2,7 +2,8 @@
  * Copyright (c) 2015-2017 Los Alamos National Security, LLC.
  *                         All rights reserved.
  * Copyright (c) 2015-2018 Cray Inc. All rights reserved.
- * Copyright (c) 2019 Triad National Security, LLC. All rights reserved.
+ * Copyright (c) 2019-2020 Triad National Security, LLC.
+ *                         All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -76,12 +77,12 @@ static struct fid_domain *dom[NUMEPS];
 struct fi_gni_ops_domain *gni_domain_ops[NUMEPS];
 static struct fid_ep *ep[NUMEPS];
 static struct fid_av *av[NUMEPS];
-void *ep_name[NUMEPS];
-fi_addr_t gni_addr[NUMEPS];
+static void *ep_name[NUMEPS];
+static fi_addr_t gni_addr[NUMEPS];
 static struct fid_cq *msg_cq[NUMEPS];
 static struct fi_info *fi[NUMEPS];
 static struct fi_cq_attr cq_attr;
-const char *cdm_id[NUMEPS] = { "5000", "5001" };
+static const char *cdm_id[NUMEPS] = { "5000", "5001" };
 struct fi_info *hints;
 static int using_bnd_ep = 0;
 static int dgram_should_fail;
@@ -92,18 +93,18 @@ static int peer_src_known = 1;
 #define BUF_RNDZV (1<<14)
 #define IOV_CNT (1<<3)
 
-char *target, *target_base;
-char *target2, *target2_base;
-char *source, *source_base;
-char *source2, *source2_base;
-struct iovec *src_iov, *dest_iov, *s_iov, *d_iov;
-char *iov_src_buf, *iov_dest_buf, *iov_src_buf_base, *iov_dest_buf_base;
-char *uc_target;
-char *uc_source;
-struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS];
-struct fid_mr *iov_dest_buf_mr[NUMEPS], *iov_src_buf_mr[NUMEPS];
-uint64_t iov_dest_buf_mr_key[NUMEPS];
-uint64_t mr_key[NUMEPS];
+static char *target, *target_base;
+static char *target2, *target2_base;
+static char *source, *source_base;
+static char *source2, *source2_base;
+static struct iovec *src_iov, *dest_iov, *s_iov, *d_iov;
+static char *iov_src_buf, *iov_dest_buf, *iov_src_buf_base, *iov_dest_buf_base;
+static char *uc_target;
+static char *uc_source;
+static struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS];
+static struct fid_mr *iov_dest_buf_mr[NUMEPS], *iov_src_buf_mr[NUMEPS];
+static uint64_t iov_dest_buf_mr_key[NUMEPS];
+static uint64_t mr_key[NUMEPS];
 
 static struct fid_cntr *send_cntr[NUMEPS], *recv_cntr[NUMEPS];
 static struct fi_cntr_attr cntr_attr = {
diff --git a/deps/libfabric/prov/gni/test/rdm_tagged_sr.c b/deps/libfabric/prov/gni/test/rdm_tagged_sr.c
index c6a4805a4cd3da6fd4c65a279373f201f6a691b9..a36f97b756f5bc67d1018fd8544d69d7a8bcc6e1 100644
--- a/deps/libfabric/prov/gni/test/rdm_tagged_sr.c
+++ b/deps/libfabric/prov/gni/test/rdm_tagged_sr.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights reserved.
  * Copyright (c) 2015-2017 Cray Inc.  All rights reserved.
+ * Copyright (c) 2020 Triad National Security, LLC. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -72,20 +73,20 @@ static struct fid_ep *ep[2];
 static struct fid_av *av;
 static struct fi_info *hints;
 static struct fi_info *fi;
-void *ep_name[2];
-size_t gni_addr[2];
+static void *ep_name[2];
+static size_t gni_addr[2];
 static struct fid_cq *msg_cq[2];
 static struct fi_cq_attr cq_attr;
 
 #define BUF_SZ (1<<16)
 #define IOV_CNT (1<<3)
 
-char *target, *target_base;
-char *source, *source_base;
-struct iovec *src_iov, *dest_iov;
-char *iov_src_buf, *iov_dest_buf, *iov_src_buf_base, *iov_dest_buf_base;
-struct fid_mr *rem_mr, *loc_mr;
-uint64_t mr_key;
+static char *target, *target_base;
+static char *source, *source_base;
+static struct iovec *src_iov, *dest_iov;
+static char *iov_src_buf, *iov_dest_buf;
+static struct fid_mr *rem_mr, *loc_mr;
+static uint64_t mr_key;
 
 static void setup_dom(enum fi_progress pm, uint32_t version, int mr_mode)
 {
diff --git a/deps/libfabric/prov/gni/test/sep.c b/deps/libfabric/prov/gni/test/sep.c
index 288774698468d79b2238b13ecf0770d33eaa8abf..2fb6ab3b0a82af128a2dd761afa9395b49aa7e83 100644
--- a/deps/libfabric/prov/gni/test/sep.c
+++ b/deps/libfabric/prov/gni/test/sep.c
@@ -2,7 +2,8 @@
  * Copyright (c) 2015-2018 Los Alamos National Security, LLC.
  *                         All rights reserved.
  * Copyright (c) 2015-2017 Cray Inc. All rights reserved.
- * Copyright (c) 2019 Triad National Security, LLC. All rights reserved.
+ * Copyright (c) 2019-2020 Triad National Security, LLC.
+ *                         All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -67,19 +68,19 @@ static struct fid_domain *dom[NUMEPS];
 static struct fid_av *av[NUMEPS];
 static struct fid_av *t_av;
 static void *ep_name[TOTALEPS];
-fi_addr_t gni_addr[NUMEPS];
+static fi_addr_t gni_addr[NUMEPS];
 static struct fi_cq_attr cq_attr;
-struct fi_info *hints;
+static struct fi_info *hints;
 static struct fi_info *fi[NUMEPS];
 static struct fid_ep *sep[TOTALEPS];
 
-char *target, *target_base;
-char *source, *source_base;
-struct iovec *src_iov, *dest_iov;
-char *iov_src_buf, *iov_dest_buf, *iov_src_buf_base, *iov_dest_buf_base;
-struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS];
-struct fid_mr *iov_dest_buf_mr[NUMEPS], *iov_src_buf_mr[NUMEPS];
-uint64_t mr_key[NUMEPS];
+static char *target, *target_base;
+static char *source, *source_base;
+static struct iovec *src_iov, *dest_iov;
+static char *iov_src_buf, *iov_dest_buf, *iov_src_buf_base, *iov_dest_buf_base;
+static struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS];
+static struct fid_mr *iov_dest_buf_mr[NUMEPS], *iov_src_buf_mr[NUMEPS];
+static uint64_t mr_key[NUMEPS];
 
 static int ctx_cnt = NUMCONTEXTS;
 static int rx_ctx_bits;
@@ -92,8 +93,8 @@ static struct fi_cntr_attr cntr_attr = {
 	.events = FI_CNTR_EVENTS_COMP,
 	.flags = 0
 };
-struct fi_tx_attr tx_attr;
-struct fi_rx_attr rx_attr;
+static struct fi_tx_attr tx_attr;
+static struct fi_rx_attr rx_attr;
 
 static uint64_t sends[NUMEPS] = {0}, recvs[NUMEPS] = {0},
 	send_errs[NUMEPS] = {0}, recv_errs[NUMEPS] = {0};
diff --git a/deps/libfabric/prov/gni/test/vc.c b/deps/libfabric/prov/gni/test/vc.c
index 148a3a2fb7537e6d10264d7014db10c5376f61ee..fa61cbf7a055fce161624680b12a6a3ccb34cdf9 100644
--- a/deps/libfabric/prov/gni/test/vc.c
+++ b/deps/libfabric/prov/gni/test/vc.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights reserved
  * Copyright (c) 2015-2017 Cray Inc.  All rights reserved.
+ * Copyright (c) 2020 Triad National Security, LLC. All rights reserved.
  *
  *
  * This software is available to you under a choice of one of two
@@ -65,23 +66,22 @@ static struct fi_info *hints;
 static struct fi_info *fi;
 static struct fid_cq *cq[2];
 static struct fi_cq_attr cq_attr;
-void *ep_name[2];
-fi_addr_t gni_addr[2];
-struct gnix_av_addr_entry gnix_addr[2];
+static void *ep_name[2];
+static fi_addr_t gni_addr[2];
+static struct gnix_av_addr_entry gnix_addr[2];
 
 /* Third EP with unique domain is used to test inter-CM connect. */
 static struct fid_domain *dom3;
 static struct fid_ep *ep3;
 static struct fid_av *av3;
 static struct fid_cq *cq3;
-void *ep_name3;
-fi_addr_t gni_addr3;
+static void *ep_name3;
 
 /* Register a target buffer with both domains for pings. */
-void *target_buf, *target_buf_base;
-int target_len = 64;
-struct fid_mr *rem_mr, *rem_mr3;
-uint64_t mr_key, mr_key3;
+static void *target_buf, *target_buf_base;
+static int target_len = 64;
+static struct fid_mr *rem_mr, *rem_mr3;
+static uint64_t mr_key, mr_key3;
 
 static void vc_setup_common(uint32_t version, int mr_mode);
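
The long run of gni test-file hunks above all make one change: giving file-scope test variables internal linkage. Without static, each tentative definition such as char *target; has external linkage, and linking several test objects that declare the same name fails with "multiple definition" errors under -fno-common, the default since GCC 10. A two-line illustration:

```c
/* One private copy per translation unit; cannot collide at link time. */
static char *target;

/* External linkage; at most one definition allowed program-wide. */
char *other_target;
```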
 
diff --git a/deps/libfabric/prov/hook/hook_debug/configure.m4 b/deps/libfabric/prov/hook/hook_debug/configure.m4
index 58a6c633c579905856d6c3b7bcea46678af87abc..0c2b8cfb71c149f36f58461f2c7f56b5160c43a7 100644
--- a/deps/libfabric/prov/hook/hook_debug/configure.m4
+++ b/deps/libfabric/prov/hook/hook_debug/configure.m4
@@ -12,7 +12,7 @@ AC_DEFUN([FI_HOOK_DEBUG_CONFIGURE],[
     # Determine if we can support the debug hooking provider
     hook_debug_happy=0
     AS_IF([test x"$enable_hook_debug" != x"no"], [hook_debug_happy=1])
-    AS_IF([test x"$hook_debug_dl" == x"1"], [
+    AS_IF([test x"$hook_debug_dl" = x"1"], [
 	hook_debug_happy=0
 	AC_MSG_ERROR([debug hooking provider cannot be compiled as DL])
     ])
diff --git a/deps/libfabric/prov/hook/hook_debug/src/hook_debug.c b/deps/libfabric/prov/hook/hook_debug/src/hook_debug.c
index ba5cb77d4678cd740665a55febe0b58cb9f489af..3610e7f8466f29cb64fe06ff5b6036f5428c22fe 100644
--- a/deps/libfabric/prov/hook/hook_debug/src/hook_debug.c
+++ b/deps/libfabric/prov/hook/hook_debug/src/hook_debug.c
@@ -89,14 +89,14 @@ static void hook_debug_trace_exit(struct fid *fid, struct fid *hfid,
 
 	if (ret > 0) {
 		FI_TRACE(hook_to_hprov(fid), subsys, "%s (fid: %p) returned: "
-			 "%zd\n", fn, hfid, ret);
+			 "%zd\n", fn, (void *) hfid, ret);
 		goto out;
 	}
 
 	if (ret != -FI_EAGAIN || !eagain_count ||
 	    !((*eagain_count)++ % HOOK_DEBUG_EAGAIN_LOG))
 		FI_TRACE(hook_to_hprov(fid), subsys, "%s (fid: %p) returned: "
-			 "%zd (%s)\n", fn, hfid, ret, fi_strerror(-ret));
+			 "%zd (%s)\n", fn, (void *) hfid, ret, fi_strerror(-ret));
 out:
 	if (eagain_count && ret != -FI_EAGAIN)
 		*eagain_count = 0;
@@ -105,8 +105,8 @@ out:
 static void
 hook_debug_trace_exit_eq(struct hook_debug_eq *eq, const char *fn, ssize_t ret)
 {
-	return hook_debug_trace_exit(&eq->hook_eq.eq.fid, &eq->hook_eq.heq->fid,
-				     FI_LOG_EQ, fn, ret, &eq->eagain_count);
+	hook_debug_trace_exit(&eq->hook_eq.eq.fid, &eq->hook_eq.heq->fid,
+			      FI_LOG_EQ, fn, ret, &eq->eagain_count);
 }
 
 static void
@@ -143,7 +143,7 @@ static void hook_debug_rx_end(struct hook_debug_ep *ep, char *fn,
 			ep->rx_outs++;
 			FI_TRACE(hook_to_hprov(&ep->hook_ep.ep.fid),
 				 FI_LOG_EP_DATA, "ep: %p rx_outs: %zu\n",
-				 ep->hook_ep.hep, ep->rx_outs);
+				 (void *) ep->hook_ep.hep, ep->rx_outs);
 		} else {
 			rx_entry = mycontext;
 			ofi_buf_free(rx_entry);
@@ -238,7 +238,7 @@ static void hook_debug_tx_end(struct hook_debug_ep *ep, char *fn,
 			ep->tx_outs++;
 			FI_TRACE(hook_to_hprov(&ep->hook_ep.ep.fid),
 				 FI_LOG_EP_DATA, "ep: %p tx_outs: %zu\n",
-				 ep->hook_ep.hep, ep->tx_outs);
+				 (void *) ep->hook_ep.hep, ep->tx_outs);
 		} else {
 			tx_entry = mycontext;
 			ofi_buf_free(tx_entry);
@@ -522,7 +522,7 @@ static void hook_debug_cq_process_entry(struct hook_debug_cq *mycq,
 				rx_entry->ep->rx_outs--;
 				FI_TRACE(hook_to_hprov(&mycq->hook_cq.cq.fid),
 					 FI_LOG_CQ, "ep: %p rx_outs: %zu\n",
-					 rx_entry->ep->hook_ep.hep,
+					 (void *) rx_entry->ep->hook_ep.hep,
 					 rx_entry->ep->rx_outs);
 				ofi_buf_free(rx_entry);
 			}
@@ -535,7 +535,7 @@ static void hook_debug_cq_process_entry(struct hook_debug_cq *mycq,
 			tx_entry->ep->tx_outs--;
 			FI_TRACE(hook_to_hprov(&mycq->hook_cq.cq.fid),
 				 FI_LOG_CQ, "ep: %p tx_outs: %zu\n",
-				 tx_entry->ep->hook_ep.hep,
+				 (void *) tx_entry->ep->hook_ep.hep,
 				 tx_entry->ep->tx_outs);
 			ofi_buf_free(tx_entry);
 		}
@@ -565,7 +565,7 @@ static ssize_t hook_debug_cq_readfrom(struct fid_cq *cq, void *buf, size_t count
 	return ret;
 }
 
-int hook_debug_cq_close(struct fid *fid)
+static int hook_debug_cq_close(struct fid *fid)
 {
 	struct hook_debug_cq *mycq =
 		container_of(fid, struct hook_debug_cq, hook_cq.cq.fid);
@@ -601,11 +601,13 @@ static void hook_debug_cq_attr_log(struct hook_domain *dom,
 	HOOK_DEBUG_TRACE(dom->fabric, FI_LOG_CQ, "\tsignaling_vector: %d\n",
 			 attr->signaling_vector);
 	HOOK_DEBUG_TRACE(dom->fabric, FI_LOG_CQ, "\twait_cond: %s\n", "TBD");
-	HOOK_DEBUG_TRACE(dom->fabric, FI_LOG_CQ, "\twait_set: %p\n", attr->wait_set);
+	HOOK_DEBUG_TRACE(dom->fabric, FI_LOG_CQ, "\twait_set: %p\n",
+			 (void *) attr->wait_set);
 }
 
-int hook_debug_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr,
-		       struct fid_cq **cq, void *context)
+static int
+hook_debug_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr,
+		   struct fid_cq **cq, void *context)
 {
 	struct hook_domain *domain = container_of(domain_fid, struct hook_domain,
 						  domain);
@@ -633,7 +635,7 @@ int hook_debug_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr,
 		goto err;
 
 	FI_TRACE(hook_fabric_to_hprov(mycq->hook_cq.domain->fabric), FI_LOG_CQ,
-		 "cq opened, fid: %p\n", &mycq->hook_cq.hcq->fid);
+		 "cq opened, fid: %p\n", (void *) &mycq->hook_cq.hcq->fid);
 
 	mycq->hook_cq.cq.fid.ops = &hook_debug_cq_fid_ops;
 	mycq->hook_cq.cq.ops = &hook_debug_cq_ops;
@@ -667,7 +669,7 @@ static int hook_debug_ep_close(struct fid *fid)
 	return ret;
 }
 
-int hook_debug_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
+static int hook_debug_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
 {
 	struct fid *hfid, *hbfid;
 	struct hook_cntr *cntr;
@@ -682,13 +684,14 @@ int hook_debug_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
 	case FI_CLASS_CQ:
 		cq = container_of(bfid, struct hook_cq, cq.fid);
 		HOOK_DEBUG_TRACE(cq->domain->fabric, FI_LOG_EP_CTRL,
-				 "cq: %p bind flags: %s\n", cq->hcq,
+				 "cq: %p bind flags: %s\n", (void *) cq->hcq,
 				 fi_tostr(&flags, FI_TYPE_CAPS));
 		break;
 	case FI_CLASS_CNTR:
 		cntr = container_of(bfid, struct hook_cntr, cntr.fid);
 		HOOK_DEBUG_TRACE(cntr->domain->fabric, FI_LOG_EP_CTRL,
-				 "cntr: %p bind flags: %s\n", cntr->hcntr,
+				 "cntr: %p bind flags: %s\n",
+				 (void *) cntr->hcntr,
 				 fi_tostr(&flags, FI_TYPE_CAPS));
 		break;
 	}
@@ -729,8 +732,9 @@ struct fi_ops_tagged hook_debug_tagged_ops = {
 	.injectdata	= hook_debug_tinjectdata,
 };
 
-int hook_debug_endpoint(struct fid_domain *domain, struct fi_info *info,
-			struct fid_ep **ep, void *context)
+static int
+hook_debug_endpoint(struct fid_domain *domain, struct fi_info *info,
+		    struct fid_ep **ep, void *context)
 {
 	struct hook_debug_ep *myep;
 	struct ofi_bufpool_attr bufpool_attr = {
@@ -779,7 +783,7 @@ int hook_debug_endpoint(struct fid_domain *domain, struct fi_info *info,
 		goto err;
 
 	FI_TRACE(hook_to_hprov(&myep->hook_ep.ep.fid), FI_LOG_EP_CTRL,
-		 "endpoint opened, fid: %p\n", &myep->hook_ep.hep->fid);
+		 "endpoint opened, fid: %p\n", (void *) &myep->hook_ep.hep->fid);
 
 	myep->hook_ep.ep.fid.ops = &hook_debug_ep_fid_ops;
 	myep->hook_ep.ep.msg = &hook_debug_msg_ops;
@@ -852,8 +856,9 @@ static int hook_debug_eq_close(struct fid *fid)
 static struct fi_ops_eq hook_debug_eq_ops;
 static struct fi_ops hook_debug_eq_fid_ops;
 
-int hook_debug_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr,
-		 struct fid_eq **eq, void *context)
+static int
+hook_debug_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr,
+		   struct fid_eq **eq, void *context)
 {
 	struct hook_debug_eq *myeq;
 	int i, ret;
@@ -932,7 +937,7 @@ static int hook_debug_cntr_wait(struct fid_cntr *cntr, uint64_t threshold, int t
 
 	HOOK_DEBUG_TRACE(mycntr->domain->fabric, FI_LOG_CNTR,
 			 "cntr: %p, threshold: %" PRIu64 ", timeout: %d\n",
-			 mycntr->hcntr, threshold, timeout);
+			 (void *) mycntr->hcntr, threshold, timeout);
 
 	ret = fi_cntr_wait(mycntr->hcntr, threshold, timeout);
 
@@ -942,18 +947,18 @@ static int hook_debug_cntr_wait(struct fid_cntr *cntr, uint64_t threshold, int t
 
 static struct fi_ops_cntr hook_debug_cntr_ops;
 
-int hook_debug_cntr_init(struct fid *fid)
+static int hook_debug_cntr_init(struct fid *fid)
 {
 	struct hook_cntr *mycntr = container_of(fid, struct hook_cntr, cntr.fid);
 	HOOK_DEBUG_TRACE(mycntr->domain->fabric, FI_LOG_CNTR,
-			 "fi_cntr_open: %p\n", mycntr->hcntr);
+			 "fi_cntr_open: %p\n", (void *) mycntr->hcntr);
 	mycntr->cntr.ops = &hook_debug_cntr_ops;
 	return 0;
 }
 
 static struct fi_ops_domain hook_debug_domain_ops;
 
-int hook_debug_domain_init(struct fid *fid)
+static int hook_debug_domain_init(struct fid *fid)
 {
 	struct fid_domain *domain = container_of(fid, struct fid_domain, fid);
 	domain->ops = &hook_debug_domain_ops;
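
The recurring (void *) casts in this file address a portability rule: printf's %p conversion is defined only for void *, so passing any other pointer type is formally undefined behavior and warns under -Wformat on modern compilers. A minimal demonstration:

```c
#include <stdio.h>

int main(void)
{
	int x = 0;
	int *ip = &x;

	printf("%p\n", (void *) ip); /* correct: cast to void * for %p */
	return 0;
}
```
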
diff --git a/deps/libfabric/prov/hook/perf/configure.m4 b/deps/libfabric/prov/hook/perf/configure.m4
index 82befb0102ce956e651992fc1d03f5065f57381e..b871ed5040b0d01a222d943aace345e3db4b6013 100644
--- a/deps/libfabric/prov/hook/perf/configure.m4
+++ b/deps/libfabric/prov/hook/perf/configure.m4
@@ -12,7 +12,7 @@ AC_DEFUN([FI_PERF_CONFIGURE],[
     # Determine if we can support the perf hooking provider
     perf_happy=0
     AS_IF([test x"$enable_perf" != x"no"], [perf_happy=1])
-    AS_IF([test x"$perf_dl" == x"1"], [
+    AS_IF([test x"$perf_dl" = x"1"], [
 	perf_happy=0
 	AC_MSG_ERROR([perf provider cannot be compiled as DL])
     ])
diff --git a/deps/libfabric/prov/hook/src/hook_domain.c b/deps/libfabric/prov/hook/src/hook_domain.c
index 004e19ee6ec8fc72232fd8753ee511eea66b5237..985da736600a86146c24762074d5cf7a5ba62400 100644
--- a/deps/libfabric/prov/hook/src/hook_domain.c
+++ b/deps/libfabric/prov/hook/src/hook_domain.c
@@ -33,6 +33,7 @@
 #include <stdlib.h>
 #include <sys/uio.h>
 #include "ofi_hook.h"
+#include "ofi_util.h"
 
 static int hook_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr,
 			   uint64_t flags, struct fid_mr **mr)
@@ -101,6 +102,84 @@
 	.regattr = hook_mr_regattr,
 };
 
+static ssize_t hook_credit_handler(struct fid_ep *ep_fid, size_t credits)
+{
+	/*
+	 * called from the base provider, ep_fid is the base ep, and
+	 * its fid context is the hook ep.
+	 */
+	struct hook_ep *ep = (struct hook_ep *)ep_fid->fid.context;
+
+	return (*ep->domain->base_credit_handler)(&ep->ep, credits);
+}
+
+static void hook_set_threshold(struct fid_ep *ep_fid, size_t threshold)
+{
+	struct hook_ep *ep = container_of(ep_fid, struct hook_ep, ep);
+
+	return ep->domain->base_ops_flow_ctrl->set_threshold(ep->hep, threshold);
+}
+
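+/*
+ * Register our credit handler with the base provider so that credit
+ * callbacks, which arrive with the base ep, are translated back to the
+ * hook ep before reaching the handler installed by the upper layer.
+ */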
+static void hook_set_send_handler(struct fid_domain *domain_fid,
+		ssize_t (*credit_handler)(struct fid_ep *ep, size_t credits))
+{
+	struct hook_domain *domain = container_of(domain_fid,
+						  struct hook_domain, domain);
+
+	domain->base_credit_handler = credit_handler;
+	domain->base_ops_flow_ctrl->set_send_handler(domain->hdomain,
+						     hook_credit_handler);
+}
+
+static int hook_enable_ep_flow_ctrl(struct fid_ep *ep_fid)
+{
+	struct hook_ep *ep = container_of(ep_fid, struct hook_ep, ep);
+
+	return ep->domain->base_ops_flow_ctrl->enable(ep->hep);
+}
+
+static void hook_add_credits(struct fid_ep *ep_fid, size_t credits)
+{
+	struct hook_ep *ep = container_of(ep_fid, struct hook_ep, ep);
+
+	return ep->domain->base_ops_flow_ctrl->add_credits(ep->hep, credits);
+}
+
+static struct ofi_ops_flow_ctrl hook_ops_flow_ctrl = {
+	.size = sizeof(struct ofi_ops_flow_ctrl),
+	.set_threshold = hook_set_threshold,
+	.add_credits = hook_add_credits,
+	.enable = hook_enable_ep_flow_ctrl,
+	.set_send_handler = hook_set_send_handler,
+};
+
+static int hook_domain_ops_open(struct fid *fid, const char *name,
+				uint64_t flags, void **ops, void *context)
+{
+	int err;
+	struct hook_domain *domain = container_of(fid, struct hook_domain,
+						  domain);
+
+	err = fi_open_ops(hook_to_hfid(fid), name, flags, ops, context);
+	if (err)
+		return err;
+
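+	/*
+	 * Intercept the flow control interface: remember the base
+	 * provider's ops and hand the caller our wrappers instead, so
+	 * that every flow control call goes through the hook.
+	 */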
+	if (!strcasecmp(name, OFI_OPS_FLOW_CTRL)) {
+		domain->base_ops_flow_ctrl = *ops;
+		*ops = &hook_ops_flow_ctrl;
+	}
+
+	return 0;
+}
+
+struct fi_ops hook_domain_fid_ops = {
+	.size = sizeof(struct fi_ops),
+	.close = hook_close,
+	.bind = hook_bind,
+	.control = hook_control,
+	.ops_open = hook_domain_ops_open,
+};
+
 int hook_query_atomic(struct fid_domain *domain, enum fi_datatype datatype,
 		  enum fi_op op, struct fi_atomic_attr *attr, uint64_t flags)
 {
@@ -146,7 +226,7 @@ int hook_domain(struct fid_fabric *fabric, struct fi_info *info,
 	dom->fabric = fab;
 	dom->domain.fid.fclass = FI_CLASS_DOMAIN;
 	dom->domain.fid.context = context;
-	dom->domain.fid.ops = &hook_fid_ops;
+	dom->domain.fid.ops = &hook_domain_fid_ops;
 	dom->domain.ops = &hook_domain_ops;
 	dom->domain.mr = &hook_mr_ops;
 
diff --git a/deps/libfabric/prov/mrail/src/mrail.h b/deps/libfabric/prov/mrail/src/mrail.h
index 45593b485ddf04cac1e82744b713b7caaea6e85d..6af26046687a14edf0a7fb94897b50a268ef2cf9 100644
--- a/deps/libfabric/prov/mrail/src/mrail.h
+++ b/deps/libfabric/prov/mrail/src/mrail.h
@@ -215,7 +215,7 @@ struct mrail_recv {
 	uint64_t 		ignore;
 	struct mrail_rndv_recv	rndv;
 };
-DECLARE_FREESTACK(struct mrail_recv, mrail_recv_fs);
+OFI_DECLARE_FREESTACK(struct mrail_recv, mrail_recv_fs);
 
 int mrail_cq_process_buf_recv(struct fi_cq_tagged_entry *comp,
 			      struct mrail_recv *recv);
@@ -316,8 +316,8 @@ mrail_pop_recv(struct mrail_ep *mrail_ep)
 {
 	struct mrail_recv *recv;
 	ofi_ep_lock_acquire(&mrail_ep->util_ep);
-	recv = freestack_isempty(mrail_ep->recv_fs) ? NULL :
-		freestack_pop(mrail_ep->recv_fs);
+	recv = ofi_freestack_isempty(mrail_ep->recv_fs) ? NULL :
+		ofi_freestack_pop(mrail_ep->recv_fs);
 	ofi_ep_lock_release(&mrail_ep->util_ep);
 	return recv;
 }
@@ -326,7 +326,7 @@ static inline void
 mrail_push_recv(struct mrail_recv *recv)
 {
 	ofi_ep_lock_acquire(&recv->ep->util_ep);
-	freestack_push(recv->ep->recv_fs, recv);
+	ofi_freestack_push(recv->ep->recv_fs, recv);
 	ofi_ep_lock_release(&recv->ep->util_ep);
 }
 
diff --git a/deps/libfabric/prov/netdir/src/netdir_ep_msg.c b/deps/libfabric/prov/netdir/src/netdir_ep_msg.c
index 3427dbe1d191078a359b75caf03bc2d35d204cdb..0089b0f412f01e06d58f9eecd9ba69572ab8d2c3 100644
--- a/deps/libfabric/prov/netdir/src/netdir_ep_msg.c
+++ b/deps/libfabric/prov/netdir/src/netdir_ep_msg.c
@@ -399,7 +399,7 @@ ofi_nd_ep_sendmsg(struct fid_ep *pep, const struct fi_msg *msg, uint64_t flags)
 	entry->seq = InterlockedAdd64(&ep->domain->msg_cnt, 1);
 
 	/* since send operation can't be canceled, set NULL into
-	 * the 1st byte of internal data of context */
+	 * the 1st pointer of internal data of context */
 	if (msg->context)
 		ND_FI_CONTEXT(msg->context) = 0;
 
@@ -554,7 +554,7 @@ static ssize_t ofi_nd_ep_recvmsg(struct fid_ep *pep, const struct fi_msg *msg,
 	for (i = 0; i < msg->iov_count; i++)
 		entry->iov[i] = msg->msg_iov[i];
 
-	/* store allocated entry in 1st byte of internal data of context */
+	/* store allocated entry in 1st pointer of internal data of context */
 	if (msg->context)
 		ND_FI_CONTEXT(msg->context) = entry;
 
diff --git a/deps/libfabric/prov/netdir/src/netdir_ep_rma.c b/deps/libfabric/prov/netdir/src/netdir_ep_rma.c
index 4bc1058f8bcc842e868060de617e688f0101bb6e..3d28487bd347881410397826ef7aeb43083ee1a7 100644
--- a/deps/libfabric/prov/netdir/src/netdir_ep_rma.c
+++ b/deps/libfabric/prov/netdir/src/netdir_ep_rma.c
@@ -192,7 +192,7 @@ ofi_nd_ep_readmsg(struct fid_ep *pep, const struct fi_msg_rma *msg,
 	main_entry->seq = InterlockedAdd64(&ep->domain->msg_cnt, 1);
 
 	/* since write operation can't be canceled, set NULL into
-	 * the 1st byte of internal data of context */
+	 * the 1st pointer of internal data of context */
 	if (msg->context)
 		ND_FI_CONTEXT(msg->context) = 0;
 
@@ -340,7 +340,7 @@ ofi_nd_ep_writemsg(struct fid_ep *pep, const struct fi_msg_rma *msg,
 		return -FI_EINVAL;
 
 	for (i = 0; i < msg->rma_iov_count; i++) {
-		if (msg->rma_iov[i].len && !msg->rma_iov[i].addr) 
+		if (msg->rma_iov[i].len && !msg->rma_iov[i].addr)
 			return -FI_EINVAL;
 		rma_len += msg->rma_iov[i].len;
 	}
@@ -368,7 +368,7 @@ ofi_nd_ep_writemsg(struct fid_ep *pep, const struct fi_msg_rma *msg,
 	main_entry->seq = InterlockedAdd64(&ep->domain->msg_cnt, 1);
 
 	/* since write operation can't be canceled, set NULL into
-	* the 1st byte of internal data of context */
+	* the 1st pointer of internal data of context */
 	if (msg->context)
 		ND_FI_CONTEXT(msg->context) = 0;
 
diff --git a/deps/libfabric/prov/netdir/src/netdir_ep_srx.c b/deps/libfabric/prov/netdir/src/netdir_ep_srx.c
index 19d19a6d63d6594935ed1ea410da87aaca89c573..63336110d647c9879566b45302f65f97027c4b09 100644
--- a/deps/libfabric/prov/netdir/src/netdir_ep_srx.c
+++ b/deps/libfabric/prov/netdir/src/netdir_ep_srx.c
@@ -188,7 +188,7 @@ static ssize_t ofi_nd_srx_recvmsg(struct fid_ep *pep, const struct fi_msg *msg,
 		entry->iov[i] = msg->msg_iov[i];
 	}
 
-	/* store allocated entry in 1st byte of internal data of context */
+	/* store allocated entry in 1st pointer of internal data of context */
 	if (msg->context)
 		ND_FI_CONTEXT(msg->context) = entry;
 
diff --git a/deps/libfabric/prov/netdir/src/netdir_init.c b/deps/libfabric/prov/netdir/src/netdir_init.c
index 8f718fb930774090e4512bd6ea5ad454ec2de2d5..75b5466d5ef5db1fc1fedc3f70404b63cef9ccdc 100644
--- a/deps/libfabric/prov/netdir/src/netdir_init.c
+++ b/deps/libfabric/prov/netdir/src/netdir_init.c
@@ -101,6 +101,7 @@ static int ofi_nd_adapter_cb(const ND2_ADAPTER_INFO *adapter, const char *name)
 		return -FI_ENOMEM;
 
 	info->tx_attr->caps = FI_MSG | FI_SEND;
+	info->tx_attr->mode = FI_CONTEXT;
 	info->tx_attr->comp_order = FI_ORDER_STRICT;
 	info->tx_attr->inject_size = (size_t)gl_data.inline_thr;
 	info->tx_attr->size = (size_t)adapter->MaxTransferLength;
@@ -112,6 +113,7 @@ static int ofi_nd_adapter_cb(const ND2_ADAPTER_INFO *adapter, const char *name)
 	info->tx_attr->msg_order = OFI_ND_MSG_ORDER;
 
 	info->rx_attr->caps = FI_MSG | FI_RECV;
+	info->rx_attr->mode = FI_CONTEXT;
 	info->rx_attr->comp_order = FI_ORDER_STRICT;
 	info->rx_attr->total_buffered_recv = 0;
 	info->rx_attr->size = (size_t)adapter->MaxTransferLength;
@@ -141,6 +143,7 @@ static int ofi_nd_adapter_cb(const ND2_ADAPTER_INFO *adapter, const char *name)
 	info->fabric_attr->prov_version = OFI_VERSION_DEF_PROV;
 
 	info->caps = OFI_ND_EP_CAPS | OFI_ND_DOMAIN_CAPS;
+	info->mode = FI_CONTEXT;
 	info->addr_format = FI_SOCKADDR;
 
 	if (!ofi_nd_util_prov.info) {
diff --git a/deps/libfabric/prov/netdir/src/netdir_ndinit.c b/deps/libfabric/prov/netdir/src/netdir_ndinit.c
index a9b8dfdd6da8a623da75a1da7899e33e08a0da4e..1f813b604c1b084e252832d4e69f8508186ce4ad 100644
--- a/deps/libfabric/prov/netdir/src/netdir_ndinit.c
+++ b/deps/libfabric/prov/netdir/src/netdir_ndinit.c
@@ -36,7 +36,7 @@
 #include <guiddef.h>
 
 #include <ws2spi.h>
-#include <cassert>
+#include <assert.h>
 #include "ndspi.h"
 
 #include "netdir.h"
diff --git a/deps/libfabric/prov/psm/src/psmx.h b/deps/libfabric/prov/psm/src/psmx.h
index 50971dd3c69b50e76fcc6be0df11f7a6939b6385..560c114522664104394b9116bdfa15cfac2ab106 100644
--- a/deps/libfabric/prov/psm/src/psmx.h
+++ b/deps/libfabric/prov/psm/src/psmx.h
@@ -287,7 +287,7 @@ struct psmx_fid_domain {
 	 * purpose. The tag-matching functions automatically treat these bits
 	 * as 0. This field is a bit mask, with reserved bits valued as "1".
 	 */
-	uint64_t		reserved_tag_bits; 
+	uint64_t		reserved_tag_bits;
 
 	/* lock to prevent the sequence of psm_mq_ipeek and psm_mq_test be
 	 * interleaved in a multithreaded environment.
@@ -534,7 +534,7 @@ struct psmx_fid_mr {
 	uint64_t		flags;
 	uint64_t		offset;
 	size_t			iov_count;
-	struct iovec		iov[0];	/* must be the last field */
+	struct iovec		iov[];	/* must be the last field */
 };
 
 struct psmx_epaddr_context {
diff --git a/deps/libfabric/prov/psm2/Makefile.include b/deps/libfabric/prov/psm2/Makefile.include
index 7e4783ceb412049079581e3fe7baf0567df61cdc..993d3eea97770cd11be0af754ae2c7d1b92fac85 100644
--- a/deps/libfabric/prov/psm2/Makefile.include
+++ b/deps/libfabric/prov/psm2/Makefile.include
@@ -22,6 +22,9 @@ _psm2_files = \
 	prov/psm2/src/psmx2_wait.c \
 	prov/psm2/src/psmx2_util.c
 
+_psm2_cppflags = \
+	-I$(top_srcdir)/prov/psm2/include
+
 if HAVE_PSM2_SRC
 _psm2_files += \
 	prov/psm2/src/psm2_revision.c
@@ -97,7 +100,7 @@ _psm2_nodist_files += \
 	prov/psm2/src/psm2/opa/opa_dwordcpy-x86_64-fast.S
 endif
 
-_psm2_cppflags = \
+_psm2_cppflags += \
 	-I$(top_srcdir)/prov/psm2/src/psm2 \
 	-I$(top_srcdir)/prov/psm2/src/psm2/include \
 	-I$(top_srcdir)/prov/psm2/src/psm2/include/linux-i386 \
@@ -132,6 +135,7 @@ src_libfabric_la_LIBADD += libpsmx2.la
 src_libfabric_la_DEPENDENCIES += libpsmx2.la
 endif !HAVE_PSM2_DL
 
+rdmainclude_HEADERS += prov/psm2/include/fi_ext_psm2.h
 prov_install_man_pages += man/man7/fi_psm2.7
 
 endif HAVE_PSM2
diff --git a/deps/libfabric/prov/psm2/include/fi_ext_psm2.h b/deps/libfabric/prov/psm2/include/fi_ext_psm2.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a48d83e17f999064849df04be6681a7c4e21278
--- /dev/null
+++ b/deps/libfabric/prov/psm2/include/fi_ext_psm2.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef FI_EXT_PSM2_H
+#define FI_EXT_PSM2_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Provider specific name for fi_set_val() / fi_get_val() */
+#define	FI_PSM2_DISCONNECT	(1U | FI_PROV_SPECIFIC)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* FI_EXT_PSM2_H */
diff --git a/deps/libfabric/prov/psm2/src/psmx2.h b/deps/libfabric/prov/psm2/src/psmx2.h
index 48ca3aca9ca6d326d1da7475cc561602db720f20..da5b6cfe67bb4f89ad3f318e8ff876d9bddf5c7b 100644
--- a/deps/libfabric/prov/psm2/src/psmx2.h
+++ b/deps/libfabric/prov/psm2/src/psmx2.h
@@ -72,6 +72,7 @@ extern "C" {
 #include "ofi_mem.h"
 #include "rbtree.h"
 #include "version.h"
+#include "fi_ext_psm2.h"
 
 #ifdef FABRIC_DIRECT_ENABLED
 #define DIRECT_FN __attribute__((visibility ("default")))
@@ -475,7 +476,6 @@ struct psmx2_multi_recv {
 
 struct psmx2_fid_fabric {
 	struct util_fabric	util_fabric;
-	psm2_uuid_t		uuid;
 	struct util_ns		name_server;
 
 	/* list of all opened domains */
@@ -528,6 +528,8 @@ struct psmx2_trx_ctxt {
 	ofi_atomic32_t		poll_refcnt;
 	int			poll_active;
 
+	psm2_uuid_t		uuid;
+
 	struct dlist_entry	entry;
 };
 
@@ -540,6 +542,7 @@ struct psmx2_fid_domain {
 	struct psmx2_fid_fabric	*fabric;
 	uint64_t		mode;
 	uint64_t		caps;
+	psm2_uuid_t		uuid;
 
 	enum fi_mr_mode		mr_mode;
 	fastlock_t		mr_lock;
@@ -587,6 +590,11 @@ struct psmx2_fid_domain {
 	psmx2_unlock_fn_t	context_unlock_fn;
 	psmx2_trylock_fn_t	poll_trylock_fn;
 	psmx2_unlock_fn_t	poll_unlock_fn;
+
+	/* parameters that can be set via domain_ops */
+	struct {
+		int		disconnect;
+	} params;
 };
 
 #define PSMX2_EP_REGULAR	0
@@ -989,7 +997,8 @@ int	psmx2_domain_enable_ep(struct psmx2_fid_domain *domain, struct psmx2_fid_ep
 void	psmx2_trx_ctxt_free(struct psmx2_trx_ctxt *trx_ctxt, int usage_flags);
 struct	psmx2_trx_ctxt *psmx2_trx_ctxt_alloc(struct psmx2_fid_domain *domain,
 					     struct psmx2_ep_name *src_addr,
-					     int sep_ctxt_idx, int usage_flags);
+					     int sep_ctxt_idx, int usage_flags,
+					     uint8_t *uuid);
 
 static inline
 int	psmx2_ns_service_cmp(void *svc1, void *svc2)
diff --git a/deps/libfabric/prov/psm2/src/psmx2_attr.c b/deps/libfabric/prov/psm2/src/psmx2_attr.c
index eedda590fb2b124d8e16434b4098fffa60045520..c582868e7a44581dcb2f3942aff181fcb0d3c2d4 100644
--- a/deps/libfabric/prov/psm2/src/psmx2_attr.c
+++ b/deps/libfabric/prov/psm2/src/psmx2_attr.c
@@ -177,14 +177,14 @@ int psmx2_init_prov_info(const struct fi_info *hints, struct fi_info **info)
 	if (hints->fabric_attr && hints->fabric_attr->name &&
 	    strcasecmp(hints->fabric_attr->name, fabric_attr->name)) {
 		FI_INFO(&psmx2_prov, FI_LOG_CORE, "Unknown fabric name\n");
-		FI_INFO_NAME(&psmx2_prov, fabric_attr, hints->fabric_attr);
+		OFI_INFO_NAME(&psmx2_prov, fabric_attr, hints->fabric_attr);
 		return -FI_ENODATA;
 	}
 
 	if (hints->domain_attr && hints->domain_attr->name &&
 	    strncasecmp(hints->domain_attr->name, domain_attr->name, strlen(PSMX2_DOMAIN_NAME))) {
 		FI_INFO(&psmx2_prov, FI_LOG_CORE, "Unknown domain name\n");
-		FI_INFO_NAME(&psmx2_prov, domain_attr, hints->domain_attr);
+		OFI_INFO_NAME(&psmx2_prov, domain_attr, hints->domain_attr);
 		return -FI_ENODATA;
 	}
 
@@ -230,7 +230,7 @@ int psmx2_init_prov_info(const struct fi_info *hints, struct fi_info **info)
 
 	if ((hints->caps & PSMX2_CAPS) != hints->caps) {
 		FI_INFO(&psmx2_prov, FI_LOG_CORE, "caps not supported\n");
-		FI_INFO_CHECK(&psmx2_prov, prov_info, hints, caps, FI_TYPE_CAPS);
+		OFI_INFO_CHECK(&psmx2_prov, prov_info, hints, caps, FI_TYPE_CAPS);
 		return -FI_ENODATA;
 	}
 
diff --git a/deps/libfabric/prov/psm2/src/psmx2_av.c b/deps/libfabric/prov/psm2/src/psmx2_av.c
index d21040d7bb47f22c11c8864677bc64cbac7ba305..204fb95a0a7da03245ef53c8e0ea490a23d0ed00 100644
--- a/deps/libfabric/prov/psm2/src/psmx2_av.c
+++ b/deps/libfabric/prov/psm2/src/psmx2_av.c
@@ -735,6 +735,7 @@ STATIC int psmx2_av_map_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t cou
 	struct psmx2_fid_av *av_priv;
 	struct psmx2_trx_ctxt *trx_ctxt;
 	psm2_error_t *errors;
+	int i;
 
 	av_priv = container_of(av, struct psmx2_fid_av, av);
 
@@ -749,6 +750,17 @@ STATIC int psmx2_av_map_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t cou
 	if (!errors)
 		return -FI_ENOMEM;
 
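+	/* drop the peers from the bookkeeping list and clear their psm2
+	 * epaddr context before forcing the disconnect below */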
+	trx_ctxt->domain->peer_lock_fn(&trx_ctxt->peer_lock, 2);
+	for (i = 0; i < count; i++) {
+		dlist_remove_first_match(&trx_ctxt->peer_list,
+					 psmx2_peer_match,
+					 (psm2_epaddr_t)(fi_addr[i]));
+	}
+	trx_ctxt->domain->peer_unlock_fn(&trx_ctxt->peer_lock, 2);
+
+	for (i = 0; i < count; i++)
+		psm2_epaddr_setctxt((psm2_epaddr_t)(fi_addr[i]), NULL);
+
 	psm2_ep_disconnect2(trx_ctxt->psm2_ep, count, (psm2_epaddr_t *)fi_addr,
 			    NULL, errors, PSM2_EP_DISCONNECT_FORCE, 0);
 
diff --git a/deps/libfabric/prov/psm2/src/psmx2_domain.c b/deps/libfabric/prov/psm2/src/psmx2_domain.c
index 46d4f349fbef1194351990100a6e5df7e56880fb..d7c327945e83f91d73b7097d0619976d69b86feb 100644
--- a/deps/libfabric/prov/psm2/src/psmx2_domain.c
+++ b/deps/libfabric/prov/psm2/src/psmx2_domain.c
@@ -203,10 +203,51 @@ static int psmx2_domain_close(fid_t fid)
 	return 0;
 }
 
+static int psmx2_domain_get_val(struct fid *fid, int var, void *val)
+{
+	struct psmx2_fid_domain *domain;
+
+	if (!val)
+		return -FI_EINVAL;
+
+	domain = container_of(fid, struct psmx2_fid_domain,
+			      util_domain.domain_fid.fid);
+
+	switch (var) {
+	case FI_PSM2_DISCONNECT:
+		*(uint32_t *)val = domain->params.disconnect;
+		break;
+	default:
+		return -FI_EINVAL;
+	}
+	return 0;
+}
+
+static int psmx2_domain_set_val(struct fid *fid, int var, void *val)
+{
+	struct psmx2_fid_domain *domain;
+
+	if (!val)
+		return -FI_EINVAL;
+
+	domain = container_of(fid, struct psmx2_fid_domain,
+			      util_domain.domain_fid.fid);
+
+	switch (var) {
+	case FI_PSM2_DISCONNECT:
+		domain->params.disconnect = *(uint32_t *)val;
+		break;
+	default:
+		return -FI_EINVAL;
+	}
+	return 0;
+}
+
 DIRECT_FN
 STATIC int psmx2_domain_control(fid_t fid, int command, void *arg)
 {
 	struct fi_mr_map_raw *map;
+	struct fi_fid_var *var;
 
 	switch (command) {
 	case FI_MAP_RAW_MR:
@@ -220,6 +261,14 @@ STATIC int psmx2_domain_control(fid_t fid, int command, void *arg)
 		/* Nothing to do here */
 		break;
 
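+	/*
+	 * fi_get_val()/fi_set_val() arrive here as FI_GET_VAL/FI_SET_VAL
+	 * control commands carrying a struct fi_fid_var that describes
+	 * the variable name and the location of its value.
+	 */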
+	case FI_GET_VAL:
+		var = arg;
+		return psmx2_domain_get_val(fid, var->name, var->val);
+
+	case FI_SET_VAL:
+		var = arg;
+		return psmx2_domain_set_val(fid, var->name, var->val);
+
 	default:
 		return -FI_ENOSYS;
 	}
@@ -321,6 +370,21 @@ int psmx2_domain_open(struct fid_fabric *fabric, struct fi_info *info,
 		goto err_out;
 	}
 
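+	/* start from the default uuid (FI_PSM2_UUID or the Open MPI job
+	 * key, if set), then let a valid auth_key override it */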
+	psmx2_get_uuid(domain_priv->uuid);
+	if (info->ep_attr && info->ep_attr->auth_key) {
+		if (info->ep_attr->auth_key_size != sizeof(psm2_uuid_t)) {
+			FI_WARN(&psmx2_prov, FI_LOG_DOMAIN,
+				"Invalid auth_key_len %"PRIu64
+				", should be %"PRIu64".\n",
+				info->ep_attr->auth_key_size,
+				sizeof(psm2_uuid_t));
+			err = -FI_EINVAL;
+			goto err_out_free_domain;
+		}
+		memcpy(domain_priv->uuid, info->ep_attr->auth_key,
+		       sizeof(psm2_uuid_t));
+	}
+
 	err = ofi_domain_init(fabric, info, &domain_priv->util_domain, context);
 	if (err)
 		goto err_out_free_domain;
@@ -336,6 +400,7 @@ int psmx2_domain_open(struct fid_fabric *fabric, struct fi_info *info,
 	domain_priv->progress_thread_enabled =
 		(info->domain_attr->data_progress == FI_PROGRESS_AUTO);
 	domain_priv->addr_format = info->addr_format;
+	domain_priv->params.disconnect = psmx2_env.disconnect;
 
 	if (info->addr_format == FI_ADDR_STR)
 		src_addr = psmx2_string_to_ep_name(info->src_addr);
diff --git a/deps/libfabric/prov/psm2/src/psmx2_ep.c b/deps/libfabric/prov/psm2/src/psmx2_ep.c
index 4b32b7422a7b492c8f7a0ae64623c359b011886e..b924d25b88ec6322d11c7671f85d46136ab01382 100644
--- a/deps/libfabric/prov/psm2/src/psmx2_ep.c
+++ b/deps/libfabric/prov/psm2/src/psmx2_ep.c
@@ -526,24 +526,6 @@ int psmx2_ep_open_internal(struct psmx2_fid_domain *domain_priv,
 	else
 		ep_cap = FI_TAGGED;
 
-	if (info && info->ep_attr && info->ep_attr->auth_key) {
-		if (info->ep_attr->auth_key_size != sizeof(psm2_uuid_t)) {
-			FI_WARN(&psmx2_prov, FI_LOG_EP_CTRL,
-				"Invalid auth_key_len %"PRIu64
-				", should be %"PRIu64".\n",
-				info->ep_attr->auth_key_size,
-				sizeof(psm2_uuid_t));
-			goto errout;
-		}
-		if (memcmp(domain_priv->fabric->uuid, info->ep_attr->auth_key,
-			   sizeof(psm2_uuid_t))) {
-			FI_WARN(&psmx2_prov, FI_LOG_EP_CTRL,
-				"Invalid auth_key: %s\n",
-				psmx2_uuid_to_string((void *)info->ep_attr->auth_key));
-			goto errout;
-		}
-	}
-
 	ep_priv = (struct psmx2_fid_ep *) calloc(1, sizeof *ep_priv);
 	if (!ep_priv) {
 		err = -FI_ENOMEM;
@@ -625,6 +607,7 @@ int psmx2_ep_open(struct fid_domain *domain, struct fi_info *info,
 	struct psmx2_trx_ctxt *trx_ctxt = NULL;
 	int err = -FI_EINVAL;
 	int usage_flags = PSMX2_TX_RX;
+	uint8_t *uuid = NULL;
 
 	domain_priv = container_of(domain, struct psmx2_fid_domain,
 				   util_domain.domain_fid.fid);
@@ -655,9 +638,21 @@ int psmx2_ep_open(struct fid_domain *domain, struct fi_info *info,
 			src_addr = info->src_addr;
 	}
 
+	if (info && info->ep_attr && info->ep_attr->auth_key) {
+		if (info->ep_attr->auth_key_size != sizeof(psm2_uuid_t)) {
+			FI_WARN(&psmx2_prov, FI_LOG_EP_CTRL,
+				"Invalid auth_key_len %"PRIu64
+				", should be %"PRIu64".\n",
+				info->ep_attr->auth_key_size,
+				sizeof(psm2_uuid_t));
+			goto errout;
+		}
+		uuid = info->ep_attr->auth_key;
+	}
+
 	if (usage_flags) {
 		trx_ctxt = psmx2_trx_ctxt_alloc(domain_priv, src_addr, -1,
-						usage_flags);
+						usage_flags, uuid);
 		if (!trx_ctxt)
 			goto errout;
 	} else {
@@ -758,7 +753,9 @@ int psmx2_stx_ctx(struct fid_domain *domain, struct fi_tx_attr *attr,
 		goto errout;
 	}
 
-	trx_ctxt = psmx2_trx_ctxt_alloc(domain_priv, NULL, -1, PSMX2_TX);
+	/* no auth_key is available here, so pass NULL to select the default uuid */
+	trx_ctxt = psmx2_trx_ctxt_alloc(domain_priv, NULL, -1, PSMX2_TX,
+					NULL);
 	if (!trx_ctxt) {
 		err = -FI_ENOMEM;
 		goto errout_free_stx;
@@ -941,6 +938,7 @@ int psmx2_sep_open(struct fid_domain *domain, struct fi_info *info,
 	size_t ctxt_cnt = 1;
 	size_t ctxt_size;
 	int err = -FI_EINVAL;
+	uint8_t *uuid = NULL;
 	int i;
 
 	domain_priv = container_of(domain, struct psmx2_fid_domain,
@@ -949,6 +947,16 @@ int psmx2_sep_open(struct fid_domain *domain, struct fi_info *info,
 		goto errout;
 
 	if (info && info->ep_attr) {
+		if (info->ep_attr->auth_key) {
+			if (info->ep_attr->auth_key_size != sizeof(psm2_uuid_t)) {
+				FI_WARN(&psmx2_prov, FI_LOG_EP_CTRL,
+					"Invalid auth_key_len %"PRIu64
+					", should be %"PRIu64".\n",
+					info->ep_attr->auth_key_size,
+					sizeof(psm2_uuid_t));
+				goto errout;
+			}
+			uuid = info->ep_attr->auth_key;
+		}
+
 		if (info->ep_attr->tx_ctx_cnt > psmx2_hfi_info.max_trx_ctxt) {
 			FI_WARN(&psmx2_prov, FI_LOG_EP_CTRL,
 				"tx_ctx_cnt %"PRIu64" exceed limit %d.\n",
@@ -1000,7 +1008,7 @@ int psmx2_sep_open(struct fid_domain *domain, struct fi_info *info,
 	for (i = 0; i < ctxt_cnt; i++) {
 		trx_ctxt = psmx2_trx_ctxt_alloc(domain_priv, src_addr,
 						(ctxt_cnt > 1) ? i : -1,
-						PSMX2_TX_RX);
+						PSMX2_TX_RX, uuid);
 		if (!trx_ctxt) {
 			err = -FI_ENOMEM;
 			goto errout_free_ctxt;
diff --git a/deps/libfabric/prov/psm2/src/psmx2_fabric.c b/deps/libfabric/prov/psm2/src/psmx2_fabric.c
index e6395d0286d441c131ff58dbb850106ec3666080..fcd8398c0314eb622d953e6b868ad2f12e78d3e0 100644
--- a/deps/libfabric/prov/psm2/src/psmx2_fabric.c
+++ b/deps/libfabric/prov/psm2/src/psmx2_fabric.c
@@ -105,9 +105,11 @@ int psmx2_fabric(struct fi_fabric_attr *attr,
 	fastlock_init(&fabric_priv->domain_lock);
 	dlist_init(&fabric_priv->domain_list);
 
-	psmx2_get_uuid(fabric_priv->uuid);
 	if (psmx2_env.name_server) {
-		fabric_priv->name_server.port = psmx2_uuid_to_port(fabric_priv->uuid);
+		psm2_uuid_t uuid;
+
+		psmx2_get_uuid(uuid);
+		fabric_priv->name_server.port = psmx2_uuid_to_port(uuid);
 		fabric_priv->name_server.name_len = sizeof(struct psmx2_ep_name);
 		fabric_priv->name_server.service_len = sizeof(int);
 		fabric_priv->name_server.service_cmp = psmx2_ns_service_cmp;
diff --git a/deps/libfabric/prov/psm2/src/psmx2_init.c b/deps/libfabric/prov/psm2/src/psmx2_init.c
index 72982d73064ff17728d25a1afd88ecf8ec5840a0..09ebbea3f39a4144cf45b47204efeb3c9f031596 100644
--- a/deps/libfabric/prov/psm2/src/psmx2_init.c
+++ b/deps/libfabric/prov/psm2/src/psmx2_init.c
@@ -70,9 +70,29 @@ int	 psmx2_tag_layout_locked = 0;
 
 static void psmx2_init_env(void)
 {
+	char *ompi_job_key;
+	psm2_uuid_t uuid = {};
+	unsigned long long *u = (unsigned long long *)uuid;
+
 	if (getenv("OMPI_COMM_WORLD_RANK") || getenv("PMI_RANK") || getenv("PMIX_RANK"))
 		psmx2_env.name_server = 0;
 
+	/*
+	 * Check for Open MPI job key. If set, convert it to the default uuid
+	 * string. This will be overridden by the FI_PSM2_UUID variable, and
+	 * both will have lower priority than the auth_key passed via ep_attr.
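+	 * The key consists of two 16-digit hex words separated by a dash,
+	 * e.g. "0123456789abcdef-fedcba9876543210".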
+	 */
+	ompi_job_key = getenv("OMPI_MCA_orte_precondition_transports");
+	if (ompi_job_key) {
+		FI_INFO(&psmx2_prov, FI_LOG_CORE,
+			"Open MPI job key: %s.\n", ompi_job_key);
+		if (sscanf(ompi_job_key, "%016llx-%016llx", &u[0], &u[1]) == 2)
+			psmx2_env.uuid = strdup(psmx2_uuid_to_string(uuid));
+		else
+			FI_INFO(&psmx2_prov, FI_LOG_CORE,
+				"Invalid Open MPI job key format.\n");
+	}
+
 	fi_param_get_bool(&psmx2_prov, "name_server", &psmx2_env.name_server);
 	fi_param_get_bool(&psmx2_prov, "tagged_rma", &psmx2_env.tagged_rma);
 	fi_param_get_str(&psmx2_prov, "uuid", &psmx2_env.uuid);
@@ -286,6 +306,7 @@ static int psmx2_update_hfi_info(void)
 	char unit_name[8];
 	uint32_t cnt = 0;
 	int tmp_nctxts, tmp_nfreectxts;
+	int offset = 0;
 
 #if HAVE_PSM2_INFO_QUERY
 	int unit_active;
@@ -387,10 +408,13 @@ static int psmx2_update_hfi_info(void)
 		psmx2_hfi_info.unit_nfreectxts[i] = tmp_nfreectxts;
 		psmx2_hfi_info.active_units[psmx2_hfi_info.num_active_units++] = i;
 
-		sprintf(unit_name, "hfi1_%hu", i);
+		snprintf(unit_name, sizeof(unit_name), "hfi1_%hu", i);
 		if (psmx2_hfi_info.num_active_units > 1)
-			strcat(psmx2_hfi_info.default_domain_name, ";");
-		strcat(psmx2_hfi_info.default_domain_name, unit_name);
+			offset += snprintf(psmx2_hfi_info.default_domain_name + offset,
+				sizeof(psmx2_hfi_info.default_domain_name) - offset,
+				";");
+		offset += snprintf(psmx2_hfi_info.default_domain_name + offset,
+			sizeof(psmx2_hfi_info.default_domain_name) - offset,
+			"%s", unit_name);
 
 		if (multirail)
 			break;
diff --git a/deps/libfabric/prov/psm2/src/psmx2_trx_ctxt.c b/deps/libfabric/prov/psm2/src/psmx2_trx_ctxt.c
index f90f50f6e738173f9e37e47e40a86d90e6e1e6b7..31c5fcd741c582a740e27539d705b92569e62fa7 100644
--- a/deps/libfabric/prov/psm2/src/psmx2_trx_ctxt.c
+++ b/deps/libfabric/prov/psm2/src/psmx2_trx_ctxt.c
@@ -143,7 +143,7 @@ void psmx2_trx_ctxt_disconnect_peers(struct psmx2_trx_ctxt *trx_ctxt)
 
 	dlist_foreach_safe(&peer_list, item, tmp) {
 		peer = container_of(item, struct psmx2_epaddr_context, entry);
-		if (psmx2_env.disconnect) {
+		if (trx_ctxt->domain->params.disconnect) {
 			FI_INFO(&psmx2_prov, FI_LOG_CORE, "epaddr: %p\n", peer->epaddr);
 			err = psm2_am_request_short(peer->epaddr,
 						    PSMX2_AM_TRX_CTXT_HANDLER,
@@ -236,7 +236,8 @@ void psmx2_trx_ctxt_free(struct psmx2_trx_ctxt *trx_ctxt, int usage_flags)
 struct psmx2_trx_ctxt *psmx2_trx_ctxt_alloc(struct psmx2_fid_domain *domain,
 					    struct psmx2_ep_name *src_addr,
 					    int sep_ctxt_idx,
-					    int usage_flags)
+					    int usage_flags,
+					    uint8_t *uuid)
 {
 	struct psmx2_trx_ctxt *trx_ctxt;
 	struct psm2_ep_open_opts opts;
@@ -246,12 +247,16 @@ struct psmx2_trx_ctxt *psmx2_trx_ctxt_alloc(struct psmx2_fid_domain *domain,
 	int asked_flags = usage_flags & PSMX2_TX_RX;
 	int compatible_flags = ~asked_flags & PSMX2_TX_RX;
 
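+	/* a NULL uuid means no explicit auth_key was given: fall back to
+	 * the uuid stored in the domain at open time */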
+	if (!uuid)
+		uuid = domain->uuid;
+
 	/* Check existing allocations first if only Tx or Rx is needed */
 	if (compatible_flags) {
 		domain->trx_ctxt_lock_fn(&domain->trx_ctxt_lock, 1);
 		dlist_foreach(&domain->trx_ctxt_list, item) {
 			trx_ctxt = container_of(item, struct psmx2_trx_ctxt, entry);
-			if (compatible_flags == trx_ctxt->usage_flags) {
+			if (compatible_flags == trx_ctxt->usage_flags &&
+			    !memcmp(uuid, trx_ctxt->uuid, sizeof(psm2_uuid_t))) {
 				trx_ctxt->usage_flags |= asked_flags;
 				domain->trx_ctxt_unlock_fn(&domain->trx_ctxt_lock, 1);
 				FI_INFO(&psmx2_prov, FI_LOG_CORE,
@@ -288,8 +293,9 @@ struct psmx2_trx_ctxt *psmx2_trx_ctxt_alloc(struct psmx2_fid_domain *domain,
 	}
 
 	psm2_ep_open_opts_get_defaults(&opts);
+	memcpy(trx_ctxt->uuid, uuid, sizeof(psm2_uuid_t));
 	FI_INFO(&psmx2_prov, FI_LOG_CORE,
-		"uuid: %s\n", psmx2_uuid_to_string(domain->fabric->uuid));
+		"uuid: %s\n", psmx2_uuid_to_string(uuid));
 
 	opts.unit = src_addr ? src_addr->unit : PSMX2_DEFAULT_UNIT;
 	opts.port = src_addr ? src_addr->port : PSMX2_DEFAULT_PORT;
@@ -303,7 +309,7 @@ struct psmx2_trx_ctxt *psmx2_trx_ctxt_alloc(struct psmx2_fid_domain *domain,
 			"sep %d: ep_open_opts: unit=%d\n", sep_ctxt_idx, opts.unit);
 	}
 
-	err = psm2_ep_open(domain->fabric->uuid, &opts,
+	err = psm2_ep_open(uuid, &opts,
 			   &trx_ctxt->psm2_ep, &trx_ctxt->psm2_epid);
 	if (err != PSM2_OK) {
 		FI_WARN(&psmx2_prov, FI_LOG_CORE,
@@ -313,7 +319,7 @@ struct psmx2_trx_ctxt *psmx2_trx_ctxt_alloc(struct psmx2_fid_domain *domain,
 
 		/* When round-robin fails, retry w/o explicit assignment */
 		opts.unit = -1;
-		err = psm2_ep_open(domain->fabric->uuid, &opts,
+		err = psm2_ep_open(uuid, &opts,
 				   &trx_ctxt->psm2_ep, &trx_ctxt->psm2_epid);
 		if (err != PSM2_OK) {
 			FI_WARN(&psmx2_prov, FI_LOG_CORE,
diff --git a/deps/libfabric/prov/psm2/src/psmx2_util.c b/deps/libfabric/prov/psm2/src/psmx2_util.c
index 382ba1722699559eed9f4913315ea2a7da300950..71ce68cbf0a4d454195eb82e2baa9e3f3ab1c8f1 100644
--- a/deps/libfabric/prov/psm2/src/psmx2_util.c
+++ b/deps/libfabric/prov/psm2/src/psmx2_util.c
@@ -79,7 +79,7 @@ char *psmx2_uuid_to_string(psm2_uuid_t uuid)
 {
 	static char s[40];
 
-	sprintf(s,
+	snprintf(s, sizeof(s),
 		"%02hhX%02hhX%02hhX%02hhX-"
 		"%02hhX%02hhX-%02hhX%02hhX-%02hhX%02hhX-"
 		"%02hhX%02hhX%02hhX%02hhX%02hhX%02hhX",
diff --git a/deps/libfabric/prov/psm3/.gitignore b/deps/libfabric/prov/psm3/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..993024e61e679a8d7ca9ecc548f72bb9ed94d797
--- /dev/null
+++ b/deps/libfabric/prov/psm3/.gitignore
@@ -0,0 +1,5 @@
+libpsm3-fi.map
+libpsm3-fi.pc
+
+libpsm3-fi-*.tar.bz2
+libpsm3-fi-*.tar.gz
diff --git a/deps/libfabric/prov/psm3/Makefile.am b/deps/libfabric/prov/psm3/Makefile.am
new file mode 100644
index 0000000000000000000000000000000000000000..f0d07f15bce2164a81ca520d44ea8e121361f4b7
--- /dev/null
+++ b/deps/libfabric/prov/psm3/Makefile.am
@@ -0,0 +1,251 @@
+#
+# Copyright (c) 2016 Cisco Systems, Inc.  All rights reserved.
+# Copyright (c) 2017-2018 Intel Corporation, Inc. All right reserved.
+# Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All rights reserved.
+# (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+#
+# Makefile.am for libpsm3-fi
+EXTRA_DIST =
+
+AM_CPPFLAGS = \
+	-I$(srcdir)/inc \
+	-D_GNU_SOURCE -D__USE_XOPEN2K8 \
+	-DSYSCONFDIR=\"$(sysconfdir)\" \
+	-DRDMADIR=\"@rdmadir@\" \
+	-DPROVDLDIR=\"$(pkglibdir)\"
+if HAVE_PSM3_SRC
+AM_CPPFLAGS += -I$(srcdir)/psm3
+endif
+
+noinst_LTLIBRARIES =
+libfabric_pkglibdir = $(libdir)/libfabric
+libfabric_pkglib_LTLIBRARIES =
+
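+# When embedded into libfabric the provider is built as a convenience
+# library only; standalone builds install it under $(libdir)/libfabric.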
+if EMBEDDED
+noinst_LTLIBRARIES += src/libpsm3-fi.la
+else
+libfabric_pkglib_LTLIBRARIES += src/libpsm3-fi.la
+endif
+
+ACLOCAL_AMFLAGS = -I config
+AM_CFLAGS = -Wall
+
+if HAVE_LD_VERSION_SCRIPT
+    libpsm3_fi_version_script = -Wl,--version-script=$(builddir)/libpsm3-fi.map
+else !HAVE_LD_VERSION_SCRIPT
+    libpsm3_fi_version_script =
+endif !HAVE_LD_VERSION_SCRIPT
+
+# rdmaincludedir = $(includedir)/rdma
+
+# rdmainclude_HEADERS =
+
+# internal utility functions shared by in-tree providers:
+common_srcs = \
+	shared/hmem.c \
+	shared/hmem_rocr.c \
+	shared/hmem_cuda.c \
+	shared/hmem_cuda_gdrcopy.c \
+	shared/hmem_ze.c \
+	shared/common.c \
+	shared/enosys.c \
+	shared/rbtree.c \
+	shared/tree.c \
+	shared/fasthash.c \
+	shared/indexer.c \
+	shared/mem.c \
+	shared/iov.c \
+	shared/shared/ofi_str.c \
+	util/src/util_atomic.c \
+	util/src/util_attr.c \
+	util/src/util_av.c \
+	util/src/util_buf.c \
+	util/src/util_coll.c \
+	util/src/util_cq.c \
+	util/src/util_cntr.c \
+	util/src/util_domain.c \
+	util/src/util_ep.c \
+	util/src/util_eq.c \
+	util/src/util_fabric.c \
+	util/src/util_main.c \
+	util/src/util_mem_hooks.c \
+	util/src/util_mem_monitor.c \
+	util/src/util_mr_cache.c \
+	util/src/util_mr_map.c \
+	util/src/util_ns.c \
+	util/src/util_pep.c \
+	util/src/util_poll.c \
+	util/src/util_shm.c \
+	util/src/util_wait.c \
+	util/src/cuda_mem_monitor.c \
+	util/src/rocr_mem_monitor.c
+
+if MACOS
+common_srcs += shared/osx/osd.c
+common_srcs += shared/unix/osd.c
+common_srcs += inc/osx/osd.h
+common_srcs += inc/unix/osd.h
+endif
+
+if FREEBSD
+common_srcs += shared/unix/osd.c
+common_srcs += inc/freebsd/osd.h
+common_srcs += inc/unix/osd.h
+endif
+
+if LINUX
+common_srcs += shared/unix/osd.c
+common_srcs += shared/linux/osd.c
+if HAVE_LINUX_PERF_RDPMC
+if !HAVE_PSM3_SRC
+common_srcs += shared/linux/rdpmc.c  # appears to be a copy of psm3/psm_perf.c
+endif
+endif
+common_srcs += inc/linux/rdpmc.h
+common_srcs += inc/linux/osd.h
+common_srcs += inc/unix/osd.h
+endif
+
+# ensure dl-built providers link back to libfabric
+# linkback = src/libfabric.la
+
+bin_SCRIPTS =
+
+nodist_src_libpsm3_fi_la_SOURCES =
+src_libpsm3_fi_la_SOURCES = \
+	inc/ofi_hmem.h \
+	inc/ofi.h \
+	inc/ofi_abi.h \
+	inc/ofi_atom.h \
+	inc/ofi_enosys.h \
+	inc/ofi_file.h \
+	inc/ofi_hook.h \
+	inc/ofi_indexer.h \
+	inc/ofi_iov.h \
+	inc/ofi_list.h \
+	inc/ofi_bitmask.h \
+	inc/shared/ofi_str.h \
+	inc/ofi_lock.h \
+	inc/ofi_mem.h \
+	inc/ofi_osd.h \
+	inc/ofi_proto.h \
+	inc/ofi_recvwin.h \
+	inc/ofi_rbuf.h \
+	inc/ofi_shm.h \
+	inc/ofi_signal.h \
+	inc/ofi_epoll.h \
+	inc/ofi_tree.h \
+	inc/ofi_util.h \
+	inc/ofi_atomic.h \
+	inc/ofi_mr.h \
+	inc/ofi_net.h \
+	inc/ofi_perf.h \
+	inc/ofi_coll.h \
+	inc/fasthash.h \
+	inc/rbtree.h \
+	inc/uthash.h \
+	inc/ofi_prov.h \
+	inc/rdma/providers/fi_log.h \
+	inc/rdma/providers/fi_prov.h \
+	inc/rdma/fabric.h \
+	inc/rdma/fi_atomic.h \
+	inc/rdma/fi_cm.h \
+	inc/rdma/fi_collective.h \
+	inc/rdma/fi_domain.h \
+	inc/rdma/fi_eq.h \
+	inc/rdma/fi_rma.h \
+	inc/rdma/fi_endpoint.h \
+	inc/rdma/fi_errno.h \
+	inc/rdma/fi_tagged.h \
+	inc/rdma/fi_trigger.h \
+	src/psmx3.h \
+	src/psmx3_am.c \
+	src/psmx3_atomic.c \
+	src/psmx3_attr.c \
+	src/psmx3_av.c \
+	src/psmx3_cm.c \
+	src/psmx3_cntr.c \
+	src/psmx3_cq.c \
+	src/psmx3_domain.c \
+	src/psmx3_ep.c \
+	src/psmx3_fabric.c \
+	src/psmx3_init.c \
+	src/psmx3_mr.c \
+	src/psmx3_msg.c \
+	src/psmx3_rma.c \
+	src/psmx3_tagged.c \
+	src/psmx3_trigger.h \
+	src/psmx3_trx_ctxt.c \
+	src/psmx3_util.c \
+	src/psmx3_wait.c \
+	src/version.h \
+	$(common_srcs)
+
+src_libpsm3_fi_la_CPPFLAGS = $(AM_CPPFLAGS)
+src_libpsm3_fi_la_DEPENDENCIES = libpsm3-fi.map
+src_libpsm3_fi_la_LDFLAGS =
+src_libpsm3_fi_la_LIBADD =
+
+src_libpsm3_fi_la_LDFLAGS += \
+	-export-dynamic \
+	$(libpsm3_fi_version_script)
+
+chksum_srcs = $(src_libpsm3_fi_la_SOURCES)
+if HAVE_PSM3_SRC
+nodist_src_libpsm3_fi_la_SOURCES += src/psm3_revision.c
+
+include psm3/Makefile.include
+src_libpsm3_fi_la_LIBADD += libpsm2.la
+src_libpsm3_fi_la_DEPENDENCIES += libpsm2.la
+
+else !HAVE_PSM3_SRC
+src_libpsm3_fi_la_LDFLAGS += -lpsm2
+endif !HAVE_PSM3_SRC
+
+if !EMBEDDED
+src_libpsm3_fi_la_LDFLAGS += -version-info 16:1:15
+endif
+
+prov_install_man_pages = man/man7/fi_psm3.7
+
+prov_dist_man_pages = man/man7/fi_psm3.7
+
+man_MANS = $(prov_install_man_pages)
+
+EXTRA_DIST += \
+        libpsm3-fi.spec.in \
+        config/distscript.pl \
+        $(prov_dist_man_pages) \
+        VERSION
+
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = libpsm3-fi.pc
+
+chksum_srcs += $(EXTRA_DIST) $(pkgconfig_DATA)
+
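+# Patch the current source checksum and build timestamp into
+# src/psm3_revision.c so the binary records what it was built from.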
+all-local:
+	@echo "Building src checksum..."; \
+	chksum=`cat $(chksum_srcs) | sha1sum | cut -d' ' -f 1`; \
+	if ! grep -q $$chksum src/psm3_revision.c 2>/dev/null; then \
+		sed -i "/define PSMX3_SRC_CHECKSUM/s/\".*\"/\"$$chksum\"/" src/psm3_revision.c; \
+		echo "SRC checksum updated to $$chksum"; \
+	else \
+		echo "SRC checksum not changed: $$chksum"; \
+	fi; \
+	timestamp=`date`; \
+	sed -i "/define PSMX3_BUILD_TIMESTAMP/s/\".*\"/\"$$timestamp\"/" src/psm3_revision.c; \
+	echo "Updated build timestamp: $$timestamp"
+
+nroff:
+	@for file in $(prov_install_man_pages); do \
+	    source=`echo $$file | sed -e 's@/man[0-9]@@'`; \
+	    perl $(top_srcdir)/config/md2nroff.pl --source=$(top_srcdir)/$$source.md; \
+	done
+
+dist-hook: libpsm3-fi.spec
+	cp libpsm3-fi.spec $(distdir)
+	perl $(top_srcdir)/config/distscript.pl "$(distdir)" "$(PACKAGE_VERSION)"
+
+rpm: dist
+	LDFLAGS=-Wl,--build-id rpmbuild -ta libpsm3-fi-$(PACKAGE_VERSION).tar.bz2
+
diff --git a/deps/libfabric/prov/psm3/Makefile.include b/deps/libfabric/prov/psm3/Makefile.include
new file mode 100644
index 0000000000000000000000000000000000000000..bbe532dde4475c856723ef22cd573bf276a7851e
--- /dev/null
+++ b/deps/libfabric/prov/psm3/Makefile.include
@@ -0,0 +1,333 @@
+if HAVE_PSM3
+_psm3_files = \
+	prov/psm3/src/version.h \
+	prov/psm3/src/psmx3.h \
+	prov/psm3/src/psmx3_am.c \
+	prov/psm3/src/psmx3_atomic.c \
+	prov/psm3/src/psmx3_attr.c \
+	prov/psm3/src/psmx3_av.c \
+	prov/psm3/src/psmx3_cm.c \
+	prov/psm3/src/psmx3_cntr.c \
+	prov/psm3/src/psmx3_cq.c \
+	prov/psm3/src/psmx3_domain.c \
+	prov/psm3/src/psmx3_ep.c \
+	prov/psm3/src/psmx3_fabric.c \
+	prov/psm3/src/psmx3_init.c \
+	prov/psm3/src/psmx3_mr.c \
+	prov/psm3/src/psmx3_msg.c \
+	prov/psm3/src/psmx3_rma.c \
+	prov/psm3/src/psmx3_tagged.c \
+	prov/psm3/src/psmx3_trigger.h \
+	prov/psm3/src/psmx3_trx_ctxt.c \
+	prov/psm3/src/psmx3_util.c \
+	prov/psm3/src/psmx3_wait.c
+
+_psm3_cppflags = \
+	-I$(top_srcdir)/prov/psm3
+
+chksum_srcs = $(_psm3_files)
+
+if HAVE_PSM3_SRC
+_psm3_cflags = $(psm3_ARCH_CFLAGS)
+_nodist_psm3_files = \
+	prov/psm3/src/psm3_revision.c \
+	prov/psm3/src/psm3_src_chksum.h
+
+# builddir is for nodist config headers: See nodist_libpsm3i_la_SOURCES
+_psm3_cppflags += \
+	-I$(top_srcdir)/prov/psm3/src \
+	-I$(top_srcdir)/prov/psm3/psm3 \
+	-I$(top_srcdir)/prov/psm3/psm3/ptl_ips \
+	-I$(top_srcdir)/prov/psm3/psm3/include \
+	-I$(top_srcdir)/prov/psm3/psm3/include/linux-i386 \
+	-I$(top_srcdir)/prov/psm3/psm3/mpspawn \
+	-I$(top_srcdir)/prov/psm3/psm3/opa \
+	-I$(top_builddir)/prov/psm3/psm3 \
+	-D_GNU_SOURCE=1
+
+noinst_LTLIBRARIES += \
+	prov/psm3/psm3/libopa.la \
+	prov/psm3/psm3/libptl_am.la \
+	prov/psm3/psm3/libptl_ips.la \
+	prov/psm3/psm3/libptl_self.la \
+	prov/psm3/psm3/libpsm_hal_gen1.la \
+	prov/psm3/psm3/libpsm3i.la
+
+prov_psm3_psm3_libptl_am_la_SOURCES = \
+	prov/psm3/psm3/ptl_am/am_config.h \
+	prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.c \
+	prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.h \
+	prov/psm3/psm3/ptl_am/am_reqrep.c \
+	prov/psm3/psm3/ptl_am/am_reqrep_shmem.c \
+	prov/psm3/psm3/ptl_am/cmarw.h \
+	prov/psm3/psm3/ptl_am/cmarwu.c \
+	prov/psm3/psm3/ptl_am/psm_am_internal.h \
+	prov/psm3/psm3/ptl_am/ptl.c \
+	prov/psm3/psm3/ptl_am/ptl_fwd.h
+prov_psm3_psm3_libptl_am_la_CPPFLAGS = \
+	-I$(top_srcdir)/prov/psm3/psm3/ptl_am/ \
+	$(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags)
+prov_psm3_psm3_libptl_am_la_CFLAGS = \
+	$(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags)
+
+prov_psm3_psm3_libptl_ips_la_SOURCES = \
+	prov/psm3/psm3/ptl_ips/ips_config.h \
+	prov/psm3/psm3/ptl_ips/ips_crc32.c \
+	prov/psm3/psm3/ptl_ips/ips_epstate.c \
+	prov/psm3/psm3/ptl_ips/ips_epstate.h \
+	prov/psm3/psm3/ptl_ips/ips_expected_proto.h \
+	prov/psm3/psm3/ptl_ips/ips_opp_path_rec.c \
+	prov/psm3/psm3/ptl_ips/ips_path_rec.c \
+	prov/psm3/psm3/ptl_ips/ips_path_rec.h \
+	prov/psm3/psm3/ptl_ips/ips_proto.c \
+	prov/psm3/psm3/ptl_ips/ips_proto.h \
+	prov/psm3/psm3/ptl_ips/ips_proto_am.c \
+	prov/psm3/psm3/ptl_ips/ips_proto_am.h \
+	prov/psm3/psm3/ptl_ips/ips_proto_connect.c \
+	prov/psm3/psm3/ptl_ips/ips_proto_dump.c \
+	prov/psm3/psm3/ptl_ips/ips_proto_expected.c \
+	prov/psm3/psm3/ptl_ips/ips_proto_header.h \
+	prov/psm3/psm3/ptl_ips/ips_proto_help.h \
+	prov/psm3/psm3/ptl_ips/ips_proto_internal.h \
+	prov/psm3/psm3/ptl_ips/ips_proto_mq.c \
+	prov/psm3/psm3/ptl_ips/ips_proto_params.h \
+	prov/psm3/psm3/ptl_ips/ips_proto_recv.c \
+	prov/psm3/psm3/ptl_ips/ips_recvhdrq.c \
+	prov/psm3/psm3/ptl_ips/ips_recvhdrq.h \
+	prov/psm3/psm3/ptl_ips/ips_recvq.c \
+	prov/psm3/psm3/ptl_ips/ips_recvq.h \
+	prov/psm3/psm3/ptl_ips/ips_scb.c \
+	prov/psm3/psm3/ptl_ips/ips_scb.h \
+	prov/psm3/psm3/ptl_ips/ips_stats.h \
+	prov/psm3/psm3/ptl_ips/ips_subcontext.h \
+	prov/psm3/psm3/ptl_ips/ips_tid.c \
+	prov/psm3/psm3/ptl_ips/ips_tid.h \
+	prov/psm3/psm3/ptl_ips/ips_tidcache.c \
+	prov/psm3/psm3/ptl_ips/ips_tidcache.h \
+	prov/psm3/psm3/ptl_ips/ips_tidflow.c \
+	prov/psm3/psm3/ptl_ips/ips_tidflow.h \
+	prov/psm3/psm3/ptl_ips/ips_writehdrq.c \
+	prov/psm3/psm3/ptl_ips/ips_writehdrq.h \
+	prov/psm3/psm3/ptl_ips/ptl.c \
+	prov/psm3/psm3/ptl_ips/ptl_fwd.h \
+	prov/psm3/psm3/ptl_ips/ptl_ips.h \
+	prov/psm3/psm3/ptl_ips/ptl_rcvthread.c
+prov_psm3_psm3_libptl_ips_la_CPPFLAGS = \
+	-I$(top_srcdir)/prov/psm3/psm3/ptl_ips/ \
+	$(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags)
+prov_psm3_psm3_libptl_ips_la_CFLAGS = \
+	$(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags)
+prov_psm3_psm3_libptl_ips_la_DEPENDENCIES = \
+	prov/psm3/psm3/libopa.la
+
+prov_psm3_psm3_libptl_self_la_SOURCES = \
+	prov/psm3/psm3/ptl_self/ptl.c \
+	prov/psm3/psm3/ptl_self/ptl_fwd.h
+prov_psm3_psm3_libptl_self_la_CPPFLAGS = \
+	-I$(top_srcdir)/prov/psm3/psm3/ptl_self/ \
+	$(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags)
+prov_psm3_psm3_libptl_self_la_CFLAGS = \
+	$(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags)
+
+prov_psm3_psm3_libopa_la_SOURCES = \
+	prov/psm3/psm3/opa/opa_debug.c \
+	prov/psm3/psm3/opa/opa_dwordcpy-x86_64.c \
+	prov/psm3/psm3/opa/opa_service.c \
+	prov/psm3/psm3/opa/opa_sysfs.c \
+	prov/psm3/psm3/opa/opa_syslog.c \
+	prov/psm3/psm3/opa/opa_time.c \
+	prov/psm3/psm3/opa/opa_utils.c \
+	prov/psm3/psm3/include/opa_byteorder.h \
+	prov/psm3/psm3/include/opa_debug.h \
+	prov/psm3/psm3/include/opa_intf.h \
+	prov/psm3/psm3/include/opa_queue.h \
+	prov/psm3/psm3/include/opa_revision.h \
+	prov/psm3/psm3/include/opa_service.h \
+	prov/psm3/psm3/include/opa_udebug.h \
+	prov/psm3/psm3/include/opa_user.h \
+	prov/psm3/psm3/include/psm2_mock_testing.h \
+	prov/psm3/psm3/include/psm3_rbtree.h \
+	prov/psm3/psm3/include/linux-i386/bit_ops.h \
+	prov/psm3/psm3/include/linux-i386/sysdep.h \
+	prov/psm3/psm3/mpspawn/mpspawn_stats.h
+prov_psm3_psm3_libopa_la_CPPFLAGS = \
+	$(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags)
+prov_psm3_psm3_libopa_la_CFLAGS = \
+	$(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags)
+
+prov_psm3_psm3_libpsm_hal_gen1_la_SOURCES = \
+	prov/psm3/psm3/psm_hal_gen1/hfi1_deprecated_gen1.h \
+	prov/psm3/psm3/psm_hal_gen1/opa_common_gen1.h \
+	prov/psm3/psm3/psm_hal_gen1/opa_i2cflash_gen1.c \
+	prov/psm3/psm3/psm_hal_gen1/opa_proto_gen1.c \
+	prov/psm3/psm3/psm_hal_gen1/opa_service_gen1.c \
+	prov/psm3/psm3/psm_hal_gen1/opa_service_gen1.h \
+	prov/psm3/psm3/psm_hal_gen1/opa_user_gen1.h \
+	prov/psm3/psm3/psm_hal_gen1/opa_utils_gen1.c \
+	prov/psm3/psm3/psm_hal_gen1/psm_gdrcpy.c \
+	prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1.c \
+	prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1.h \
+	prov/psm3/psm3/psm_hal_gen1/psm_hal_inline_i.h \
+	prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1_spio.h
+prov_psm3_psm3_libpsm_hal_gen1_la_CPPFLAGS = \
+	-I$(top_srcdir)/prov/psm3/psm3/psm_hal_gen1/ \
+	$(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags)
+prov_psm3_psm3_libpsm_hal_gen1_la_CFLAGS = \
+	$(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags)
+
+prov_psm3_psm3_libpsm3i_la_SOURCES = \
+	prov/psm3/psm3/psm.c \
+	prov/psm3/psm3/psm_am.c \
+	prov/psm3/psm3/psm_am_internal.h \
+	prov/psm3/psm3/psm_config.h \
+	prov/psm3/psm3/psm_context.c \
+	prov/psm3/psm3/psm_context.h \
+	prov/psm3/psm3/psm_diags.c \
+	prov/psm3/psm3/psm_ep.c \
+	prov/psm3/psm3/psm_ep.h \
+	prov/psm3/psm3/psm_ep_connect.c \
+	prov/psm3/psm3/psm_error.c \
+	prov/psm3/psm3/psm_error.h \
+	prov/psm3/psm3/psm_gdrcpy.h \
+	prov/psm3/psm3/psm_help.h \
+	prov/psm3/psm3/psm_lock.h \
+	prov/psm3/psm3/psm_log.h \
+	prov/psm3/psm3/psm_memcpy.c \
+	prov/psm3/psm3/psm_mock.c \
+	prov/psm3/psm3/psm_mpool.c \
+	prov/psm3/psm3/psm_mpool.h \
+	prov/psm3/psm3/psm_mq.c \
+	prov/psm3/psm3/psm_mq_internal.h \
+	prov/psm3/psm3/psm_mq_recv.c \
+	prov/psm3/psm3/psm_mq_utils.c \
+	prov/psm3/psm3/psm_netutils.h \
+	prov/psm3/psm3/psm_perf.c \
+	prov/psm3/psm3/psm_perf.h \
+	prov/psm3/psm3/psm_rndv_mod.c \
+	prov/psm3/psm3/psm_rndv_mod.h \
+	prov/psm3/psm3/psm_stats.c \
+	prov/psm3/psm3/psm_stats.h \
+	prov/psm3/psm3/psm_sysbuf.c \
+	prov/psm3/psm3/psm_sysbuf.h \
+	prov/psm3/psm3/psm_timer.c \
+	prov/psm3/psm3/psm_timer.h \
+	prov/psm3/psm3/psm_user.h \
+	prov/psm3/psm3/psm_utils.c \
+	prov/psm3/psm3/psm_utils.h \
+	prov/psm3/psm3/psm_verbs_ep.c \
+	prov/psm3/psm3/psm_verbs_ep.h \
+	prov/psm3/psm3/psm_verbs_mr.c \
+	prov/psm3/psm3/psm_verbs_mr.h \
+	prov/psm3/psm3/psm_udp_ep.c \
+	prov/psm3/psm3/psm_udp_ep.h \
+	prov/psm3/psm3/psmi_wrappers.c \
+	prov/psm3/psm3/psmi_wrappers.h \
+	prov/psm3/psm3/psm2.h \
+	prov/psm3/psm3/psm2_am.h \
+	prov/psm3/psm3/psm2_hal.c \
+	prov/psm3/psm3/psm2_hal.h \
+	prov/psm3/psm3/psm2_hal_inline_t.h \
+	prov/psm3/psm3/psm2_mq.h \
+	prov/psm3/psm3/ptl.h
+prov_psm3_psm3_libpsm3i_la_CPPFLAGS = \
+	-I$(top_srcdir)/prov/psm3/psm3/include/ \
+	$(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags)
+prov_psm3_psm3_libpsm3i_la_CFLAGS = \
+	$(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags)
+
+nodist_prov_psm3_psm3_libpsm3i_la_SOURCES = \
+	prov/psm3/psm3/psm2_hal_inlines_i.h \
+	prov/psm3/psm3/psm2_hal_inlines_d.h
+
+prov_psm3_psm3_libpsm3i_la_LIBADD = \
+	prov/psm3/psm3/libopa.la \
+	prov/psm3/psm3/libptl_am.la \
+	prov/psm3/psm3/libptl_ips.la \
+	prov/psm3/psm3/libptl_self.la \
+	prov/psm3/psm3/libpsm_hal_gen1.la
+
+prov_psm3_psm3_libpsm3i_la_DEPENDENCIES = \
+	prov/psm3/psm3/libopa.la \
+	prov/psm3/psm3/libptl_am.la \
+	prov/psm3/psm3/libptl_ips.la \
+	prov/psm3/psm3/libptl_self.la \
+	prov/psm3/psm3/libpsm_hal_gen1.la
+
+_psm3_extra_dist = \
+	prov/psm3/psm3/include/psm3_rbtree.c \
+	prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1_spio.c \
+	prov/psm3/psm3/opa/opa_dwordcpy-x86_64-fast.S \
+	prov/psm3/VERSION
+EXTRA_DIST += $(_psm3_extra_dist)
+
+chksum_srcs += \
+	$(prov_psm3_psm3_libptl_am_la_SOURCES) $(prov_psm3_psm3_libptl_ips_la_SOURCES) \
+	$(prov_psm3_psm3_libptl_self_la_SOURCES) $(prov_psm3_psm3_libopa_la_SOURCES) \
+	$(prov_psm3_psm3_libpsm_hal_gen1_la_SOURCES) $(prov_psm3_psm3_libpsm3i_la_SOURCES) \
+	$(_psm3_extra_dist)
+
+_psm3_LIBS = prov/psm3/psm3/libpsm3i.la
+
+BUILT_SOURCES = prov/psm3/src/psm3_src_chksum.h
+CLEANFILES = prov/psm3/src/psm3_src_chksum.h
+DATE_FMT = +%Y-%m-%dT%H:%M:%S
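+# Regenerate the checksum header whenever the sha1 over the sources changes;
+# SOURCE_DATE_EPOCH, if set, keeps the build timestamp reproducible.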
+prov/psm3/src/psm3_src_chksum.h: Makefile $(chksum_srcs)
+	$(AM_V_GEN) chksum=`for file in $(chksum_srcs); do cat $(top_srcdir)/$$file; done | sha1sum | cut -d' ' -f 1`; \
+	if ! grep -q $$chksum prov/psm3/src/psm3_src_chksum.h 2>/dev/null; then \
+		echo "#define PSMX3_SRC_CHECKSUM \"$$chksum\"" > prov/psm3/src/psm3_src_chksum.h; \
+		echo "#define PSMX3_BUILD_TIMESTAMP \"`if test -z "$(SOURCE_DATE_EPOCH)" ; then date "$(DATE_FMT)" ; else date -u -d "@$(SOURCE_DATE_EPOCH)" "$(DATE_FMT)" 2>/dev/null || date -u -r "$(SOURCE_DATE_EPOCH)" "$(DATE_FMT)" 2>/dev/null || date -u "$(DATE_FMT)" ; fi`\"" >> prov/psm3/src/psm3_src_chksum.h; \
+	fi
+
+endif HAVE_PSM3_SRC
+
+if HAVE_PSM3_DL
+pkglib_LTLIBRARIES += libpsm3-fi.la
+libpsm3_fi_la_SOURCES = $(_psm3_files) $(common_srcs)
+nodist_libpsm3_fi_la_SOURCES = $(_nodist_psm3_files)
+libpsm3_fi_la_CFLAGS = $(AM_CFLAGS) $(psm3_CFLAGS)
+libpsm3_fi_la_CPPFLAGS = $(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags)
+libpsm3_fi_la_LDFLAGS = \
+	-module -avoid-version -shared -export-dynamic \
+	-export-symbols-regex ^fi_prov_ini $(psm3_LDFLAGS)
+libpsm3_fi_la_LIBADD = $(linkback) $(psm3_LIBS) $(_psm3_LIBS)
+libpsm3_fi_la_DEPENDENCIES = $(linkback) \
+	prov/psm3/src/psm3_src_chksum.h \
+	prov/psm3/psm3/libpsm3i.la
+else !HAVE_PSM3_DL
+noinst_LTLIBRARIES += libpsm3.la
+libpsm3_la_SOURCES = $(_psm3_files)
+libpsm3_la_DEPENDENCIES = \
+	prov/psm3/src/psm3_src_chksum.h \
+	prov/psm3/psm3/libpsm3i.la
+nodist_libpsm3_la_SOURCES = $(_nodist_psm3_files)
+libpsm3_la_CFLAGS = $(src_libfabric_la_CFLAGS) $(psm3_CFLAGS)
+libpsm3_la_CPPFLAGS = $(src_libfabric_la_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags)
+libpsm3_la_LDFLAGS = $(psm3_LDFLAGS)
+libpsm3_la_LIBADD = $(psm3_LIBS) $(_psm3_LIBS)
+src_libfabric_la_LIBADD += libpsm3.la
+src_libfabric_la_DEPENDENCIES += libpsm3.la
+
+if HAVE_PSM2_SRC
+prov/psm3/psm3/.libs/libpsm3_full.lo: $(libpsm3_la_OBJECTS) $(libpsm3_la_DEPENDENCIES) $(EXTRA_libpsm3_la_DEPENDENCIES)
+	@sed -i.bak "/dependency_libs/s/='.*'/=''/" prov/psm3/psm3/libpsm3i.la
+	$(AM_V_CCLD)$(libpsm3_la_LINK) -r $(am_libpsm3_la_rpath) $(libpsm3_la_OBJECTS) prov/psm3/psm3/libpsm3i.la
+
+prov/psm3/psm3/.libs/libpsm3_exp.o: prov/psm3/psm3/.libs/libpsm3_full.lo
+	@objcopy --keep-global-symbol=fi_psm3_ini prov/psm3/psm3/.libs/libpsm3_full.o prov/psm3/psm3/.libs/libpsm3_exp.o
+
+libpsm3.la: prov/psm3/psm3/.libs/libpsm3_exp.o
+	@mv prov/psm3/psm3/libpsm3i.la.bak prov/psm3/psm3/libpsm3i.la
+	$(AM_V_CCLD)$(libpsm3_la_LINK) $(am_libpsm3_la_rpath) $(libpsm3_la_OBJECTS) $(libpsm3_la_LIBADD) $(LIBS); \
+	rm -f .libs/libpsm3.a libpsm3.a; \
+	$(AR) cru .libs/libpsm3.a prov/psm3/psm3/.libs/libpsm3_exp.o; \
+	$(RANLIB) .libs/libpsm3.a
+endif HAVE_PSM2_SRC
+
+endif !HAVE_PSM3_DL
+
+prov_install_man_pages += man/man7/fi_psm3.7
+
+endif HAVE_PSM3
+
+prov_dist_man_pages += man/man7/fi_psm3.7
+
diff --git a/deps/libfabric/prov/psm3/README b/deps/libfabric/prov/psm3/README
new file mode 100644
index 0000000000000000000000000000000000000000..bd185a2237d4eb45ad50c12432ab0cb1435caece
--- /dev/null
+++ b/deps/libfabric/prov/psm3/README
@@ -0,0 +1,11 @@
+This provider is derived from the libfabric source tree.  See libfabric for the
+README, AUTHORS, COPYING, and other notices.
+
+
+To Build PSM3 OFI Provider:
+1. ./configure
+2. make -j
+
+To Build PSM3 OFI Provider RPM:
+1. ./configure
+2. make rpm
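+
+If ./configure does not exist yet (e.g. in a fresh git checkout), run
+./autogen.sh first to generate it.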
diff --git a/deps/libfabric/prov/psm3/VERSION b/deps/libfabric/prov/psm3/VERSION
new file mode 100644
index 0000000000000000000000000000000000000000..b435351498bcc71017c6e1fcdf3764496f88a45d
--- /dev/null
+++ b/deps/libfabric/prov/psm3/VERSION
@@ -0,0 +1 @@
+3_1_0_0
diff --git a/deps/libfabric/prov/psm3/autogen.sh b/deps/libfabric/prov/psm3/autogen.sh
new file mode 100755
index 0000000000000000000000000000000000000000..e4a8eed41f97fe9ab3c0ec59f44c76effbf1308a
--- /dev/null
+++ b/deps/libfabric/prov/psm3/autogen.sh
@@ -0,0 +1,10 @@
+#! /bin/sh
+
+if test ! -f src/psmx3.h; then
+	echo "You really need to run this script from the prov/psm3 directory of the git tree"
+	exit 1
+fi
+
+set -x
+autoreconf -ivf
+
diff --git a/deps/libfabric/prov/psm3/config b/deps/libfabric/prov/psm3/config
new file mode 120000
index 0000000000000000000000000000000000000000..899f6989820ff1667f8ddcf88e156e873fcec28f
--- /dev/null
+++ b/deps/libfabric/prov/psm3/config
@@ -0,0 +1 @@
+../../config
\ No newline at end of file
diff --git a/deps/libfabric/prov/psm3/configure.ac b/deps/libfabric/prov/psm3/configure.ac
new file mode 100644
index 0000000000000000000000000000000000000000..8ea1671a52e1191ed3d900de18fffa2cfde45c84
--- /dev/null
+++ b/deps/libfabric/prov/psm3/configure.ac
@@ -0,0 +1,713 @@
+dnl
+dnl Copyright (c) 2016 Cisco Systems, Inc.  All rights reserved.
+dnl Copyright (c) 2019 Intel, Inc.  All rights reserved.
+dnl Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
+dnl
+dnl Process this file with autoconf to produce a configure script.
+
+AC_PREREQ([2.60])
+AC_INIT([libpsm3-fi], m4_normalize(m4_esyscmd([sed 's/_/./g' VERSION])))
+AC_CONFIG_SRCDIR([src/psmx3.h])
+AC_CONFIG_AUX_DIR(config)
+AC_CONFIG_MACRO_DIR(config)
+AC_CONFIG_HEADERS(config.h)
+AM_INIT_AUTOMAKE([1.11 dist-bzip2 foreign -Wall -Werror subdir-objects parallel-tests tar-pax])
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([no])])
+dnl --- m4_include(config/fi_check_package.m4)
+m4_include(config/fi_strip_optflags.m4)
+AC_DEFINE([HAVE_PSM3], [1], [Build libfabric PSM3 provider])
+AC_DEFINE([HAVE_PSM3_DL], [1], [Build libfabric PSM3 provider])
+
+dnl Override Default flags
+CPPFLAGS="-D_DEFAULT_SOURCE -D_SVID_SOURCE -D_BSD_SOURCE"
+AS_IF([test ! -z "$CC" && test "x$CC" = "xicc"],
+      [ dnl ICC
+	CFLAGS="-Werror -xATOM_SSE4.2 -DPSM_AVX512 -fpic -fPIC -D_GNU_SOURCE -DPACK_STRUCT_STL=packed,"
+	LDFLAGS="-Wc,-static-intel"
+      ], [ dnl GCC/other
+	CFLAGS="-Werror -mavx2 -fpic -fPIC -funwind-tables -Wformat -Wformat-security"
+      ])
+
+AC_ARG_ENABLE([psm-src],
+	      [AS_HELP_STRING([--enable-psm-src],
+			      [Enable Monolithic provider @<:@default=yes@:>@])],
+			      [],
+			      [enable_psm_src=yes])
+AS_IF([test "x$enable_psm_src" != "xno"], [psm_src=1], [psm_src=0])
+AM_CONDITIONAL([HAVE_PSM3_SRC], [test "x$enable_psm_src" != "xno"], [build PSM3 src into provider])
+AC_DEFINE_UNQUOTED([HAVE_PSM3_SRC], $psm_src, [PSM3 source is built-in])
+
+PSM_HAL_CNT=1
+PSM_HAL_INST=gen1
+
+AC_ARG_ENABLE([psm-ud],
+	      [AS_HELP_STRING([--enable-psm-ud],
+			      [Enable Verbs UD support @<:@default=yes@:>@])],
+	      [],
+	      [enable_psm_ud=yes])
+AC_ARG_ENABLE([psm-rc],
+	      [AS_HELP_STRING([--enable-psm-rc],
+			      [Enable Verbs RC support (requires UD support) @<:@default=yes@:>@])],
+	      [],
+	      [enable_psm_rc=yes])
+AC_ARG_WITH([psm3-rv],
+            [AS_HELP_STRING([--with-psm3-rv],
+                            [Enable RV module use @<:@default=check@:>@])])
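+dnl With no explicit --with-psm3-rv the headers are merely probed for
+dnl (psm3_rv_check=1) and a miss is non-fatal; an explicit path turns a
+dnl missing rv_user_ioctls.h into a hard configure error.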
+AS_IF([test x$with_psm3_rv = xno],
+      [CPPFLAGS="$CPPFLAGS -URNDV_MOD"],
+      [
+	AS_IF([test "x$with_psm3_rv" = "x"],
+	      [
+		psm3_rv_check=1
+		with_psm3_rv=/usr/include
+	      ],[psm3_rv_check=0])
+	psm3_rv_old_header=0
+	save_CPPFLAGS=$CPPFLAGS
+	CPPFLAGS="$CPPFLAGS -I$with_psm3_rv"
+	dnl Check for /usr/include/rdma/rv_user_ioctls.h first
+	_FI_CHECK_PACKAGE_HEADER([psm3_rv],
+	                         [rdma/rv_user_ioctls.h],
+	                         [],
+	                         [psm3_rv_happy=1],
+	                         [psm3_rv_happy=0])
+
+	AS_IF([test $psm3_rv_happy -eq 0], [
+		AS_IF([test "$psm3_rv_check" -eq 1],
+		      [with_psm3_rv=/usr/include/uapi])
+		CPPFLAGS="$save_CPPFLAGS -I$with_psm3_rv"
+		_FI_CHECK_PACKAGE_HEADER([psm3_rv],
+		                         [rv/rv_user_ioctls.h],
+		                         [],
+		                         [psm3_rv_happy=1
+		                          psm3_rv_old_header=1],
+		                         [psm3_rv_happy=0])
+	      ])
+	CPPFLAGS=$save_CPPFLAGS
+	AS_IF([test "$psm3_rv_happy" -eq 0],
+	      [
+		AS_IF([test "$psm3_rv_check" -eq 0], [
+			psm3_happy=0
+			AC_MSG_ERROR([RV Module headers requested but rv_user_ioctls.h not found.])
+		])
+		CPPFLAGS="$CPPFLAGS -URNDV_MOD"
+	      ],[
+		CPPFLAGS="$CPPFLAGS -DRNDV_MOD -I$with_psm3_rv"
+		AS_IF([test "$psm3_rv_old_header" -eq 1],
+		      [CPPFLAGS="$CPPFLAGS -DHAVE_OLD_RV_HEADER"])
+	      ])
+	AS_IF([test "$psm3_rv_happy" -eq 1 ], [
+		AC_MSG_CHECKING([for RV support for ring.overflow_cnt])
+		AC_COMPILE_IFELSE(
+			[AC_LANG_PROGRAM(
+				[[#include <sys/types.h>
+				  #include <stdint.h>
+				  #include <rdma/rv_user_ioctls.h>
+				]],[[struct rv_ring_header ring; ring.overflow_cnt=0;]])
+			],[
+				AC_MSG_RESULT(yes)
+			],[
+				AC_MSG_RESULT(no)
+				CPPFLAGS="$CPPFLAGS -DHAVE_NO_PSM3_RV_OVERFLOW_CNT"
+			])
+	      ])
+      ])
+AC_ARG_WITH([psm-headers],
+	    [AC_HELP_STRING([--with-psm-headers=DIR],
+			    [Provide path to where the psm headers are installed for split build. @<:@default=no@:>@])],
+	    [], [with_psm_headers="no"])
+if test "$with_psm_headers" != "" && test "$with_psm_headers" != "no"; then
+	CPPFLAGS="$CPPFLAGS -I$with_psm_headers"
+	AC_CHECK_HEADER(psm2.h, [],
+		AC_MSG_ERROR([PSM Headers requested but <psm2.h> not found.]))
+fi
+AC_ARG_ENABLE([psm-rdma-read],
+	      [AS_HELP_STRING([--enable-psm-rdma-read],
+			      [Enable RDMA READ (requires UD and RC support) @<:@default=no@:>@])],
+	      [],
+	      [enable_psm_rdma_read=no])
+
+AS_IF([test "x$enable_psm_src" = "xyes" && test "x$enable_psm_ud" = "xyes"],
+      [
+	CPPFLAGS="$CPPFLAGS -DPSM_UD"
+	AS_IF([test "x$enable_psm_rc" = "xyes"],
+	      [
+	          CPPFLAGS="$CPPFLAGS -DUSE_RC"
+	          AS_IF([test "x$enable_psm_rdma_read" = "xyes"],[CPPFLAGS="$CPPFLAGS -DUSE_RDMA_READ"])
+	      ],
+	      [
+	          CPPFLAGS="$CPPFLAGS -UUSE_RC"
+	      ])
+      ])
+AS_IF([test "x$enable_psm_src" = "xyes"],
+      [
+	AC_SEARCH_LIBS([shm_open], [rt], [], [AC_MSG_ERROR([unable to find shm_open() in librt])])
+	AC_SEARCH_LIBS([dlopen], [dl], [], [AC_MSG_ERROR([unable to find dlopen() in libdl])])
+	AC_SEARCH_LIBS([numa_node_of_cpu], [numa], [], [AC_MSG_ERROR([unable to find numa_node_of_cpu() in libnuma])])
+	AC_SEARCH_LIBS([uuid_parse], [uuid], [], [AC_MSG_ERROR([unable to find uuid_parse() in libuuid])])
+	AS_IF([test "x$enable_psm_ud" = "xyes"],
+	      [AC_SEARCH_LIBS([ibv_get_device_list], [ibverbs], [],
+			      [AC_MSG_ERROR([unable to find ibv_get_device_list() in libibverbs])])
+	      ], [])
+
+	AS_IF([test ! -z "$PSM2_MOCK_TESTING"], [CPPFLAGS="$CPPFLAGS -DPSM2_MOCK_TESTING=1"], [])
+	AS_IF([test ! -z "$PSM_FI"], [CPPFLAGS="$CPPFLAGS -DPSM_FI"], [])
+	AS_IF([test ! -z "$PSM_DEBUG"],
+	      [
+		CFLAGS="-O0 -g3 $CFLAGS"
+		CPPFLAGS="$CPPFLAGS -DPSM_DEBUG -D_HFI_DEBUGGING -funit-at-a-time -Wp,-D_FORTIFY_SOURCE=2"
+	      ],
+	      [CFLAGS="-O3 -g3 $CFLAGS"])
+
+	AS_IF([test ! -z "$PSM_COVERAGE"],
+	      [
+		CFLAGS="$CFLAGS -O -fprofile-arcs -ftest-coverage"
+		LDFLAGS="$LDFLAGS -fprofile-arcs"
+	      ], [])
+
+	AS_IF([test ! -z "$PSM_LOG"],
+	      [
+		CPPFLAGS="$CPPFLAGS -DPSM_LOG"
+		AS_IF([test ! -z "$PSM_LOG_FAST_IO"],
+		      [CPPFLAGS="$CPPFLAGS -DPSM_LOG_FAST_IO"], [])
+	      ], [])
+	AS_IF([test ! -z "$PSM_PERF"], [CPPFLAGS="$CPPFLAGS -DRDPMC_PERF_FRAMEWORK"], [])
+	AS_IF([test ! -z "$PSM_HEAP_DEBUG"], [CPPFLAGS="$CPPFLAGS -DPSM_HEAP_DEBUG"], [])
+	AS_IF([test ! -z "$PSM_PROFILE"], [CPPFLAGS="$CPPFLAGS -DPSM_PROFILE"], [])
+      ])
+
+AM_CONDITIONAL([HAVE_PSM3_ADDITIONAL_GLOBALS], [test ! -z "$PSM2_ADDITIONAL_GLOBALS"])
+AM_COND_IF([HAVE_PSM3_ADDITIONAL_GLOBALS], [PSM3_ADDITIONAL_GLOBALS="$PSM2_ADDITIONAL_GLOBALS"],[])
+
+
+psm3_happy=1
+
+AC_CANONICAL_HOST
+
+macos=0
+linux=0
+freebsd=0
+
+case $host_os in
+*darwin*)
+	macos=1
+	;;
+*linux*)
+	linux=1
+	;;
+*freebsd*)
+	freebsd=1
+	;;
+*)
+	AC_MSG_ERROR([libfabric only builds on Linux, OS X, and FreeBSD])
+	;;
+esac
+
+AM_CONDITIONAL([MACOS], [test "x$macos" = "x1"])
+AM_CONDITIONAL([LINUX], [test "x$linux" = "x1"])
+AM_CONDITIONAL([FREEBSD], [test "x$freebsd" = "x1"])
+
+base_c_warn_flags="-Wall -Wundef -Wpointer-arith"
+debug_c_warn_flags="-Wextra -Wno-unused-parameter -Wno-sign-compare -Wno-missing-field-initializers"
+debug_c_other_flags="-fstack-protector-strong"
+picky_c_warn_flags="-Wno-long-long -Wmissing-prototypes -Wstrict-prototypes -Wcomment -pedantic"
+
+AC_ARG_WITH([build_id],
+	    [AC_HELP_STRING([--with-build_id],
+			    [Enable build_id annotation @<:@default=no@:>@])],
+	    [], [with_build_id=no])
+AS_IF([test x"$with_build_id" = x"no"], [with_build_id=""])
+AC_DEFINE_UNQUOTED([BUILD_ID],["$with_build_id"],
+                   [adds build_id to version if it was defined])
+
+# Override autoconf default CFLAG settings (e.g. "-g -O2") while still
+# allowing the user to explicitly set CFLAGS=""
+: ${CFLAGS="-fvisibility=hidden ${base_c_warn_flags} $CFLAGS"}
+: ${CPPFLAGS="$CPPFLAGS"}
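+# (the ": ${VAR=value}" idiom assigns only when VAR is unset, so a
+# user-supplied CFLAGS="" is left untouched)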
+
+# AM_PROG_AS would set CFLAGS="-g -O2" by default if not set already, so
+# it must not be called any earlier than this.
+AM_PROG_AS()
+
+# AM_PROG_AR did not exist before automake 1.11.x (where x is somewhere >0
+# and <3), but it is required as of automake 1.12.x.
+m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
+
+AC_ARG_WITH([valgrind],
+    AC_HELP_STRING([--with-valgrind],
+		   [Enable valgrind annotations @<:@default=no@:>@]))
+
+if test "$with_valgrind" != "" && test "$with_valgrind" != "no"; then
+	AC_DEFINE([INCLUDE_VALGRIND], 1,
+		  [Define to 1 to enable valgrind annotations])
+	if test -d $with_valgrind; then
+		CPPFLAGS="$CPPFLAGS -I$with_valgrind/include"
+	fi
+fi
+
+AC_ARG_ENABLE([atomics],
+	[AS_HELP_STRING([--enable-atomics],
+		[Enable atomics support @<:@default=yes@:>@])
+	],
+	[],
+	[enable_atomics=yes])
+
+dnl Checks for programs
+AC_PROG_CC_C99
+AS_IF([test "$ac_cv_prog_cc_c99" = "no"],
+      [AC_MSG_WARN([Libfabric requires a C99-compliant compiler])
+       AC_MSG_ERROR([Cannot continue])])
+AM_PROG_CC_C_O
+AC_PROG_CPP
+
+AC_ARG_ENABLE([debug],
+	      [AS_HELP_STRING([--enable-debug],
+			      [Enable debugging @<:@default=no@:>@])
+	      ],
+	      [],
+	      [enable_debug=no])
+
+AS_IF([test x"$enable_debug" != x"no"],
+      [dbg=1
+       # See if all the flags in $debug_c_other_flags work
+       good_flags=
+       CFLAGS_save="$CFLAGS"
+       for flag in $debug_c_other_flags; do
+           AC_MSG_CHECKING([to see if compiler supports $flag])
+           CFLAGS="$flag $CFLAGS_save"
+           AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[]], [[int i = 3;]])],
+	                     [AC_MSG_RESULT([yes])
+			      good_flags="$flag $good_flags"],
+			     [AC_MSG_RESULT([no])])
+       done
+       debug_c_other_flags=$good_flags
+       unset good_flags
+
+       CFLAGS="-g -O0 ${base_c_warn_flags} ${debug_c_warn_flags} ${debug_c_other_flags} ${CFLAGS_save}"
+       unset CFLAGS_save],
+      [dbg=0
+       CFLAGS="-DNDEBUG $CFLAGS"])
+
+AC_DEFINE_UNQUOTED([ENABLE_DEBUG],[$dbg],
+                   [defined to 1 if libfabric was configured with --enable-debug, 0 otherwise])
+
+dnl Checks for header files.
+AC_HEADER_STDC
+
+dnl Check for compiler features
+AC_C_TYPEOF
+
+LT_INIT
+LT_OUTPUT
+
+dnl dlopen support is optional
+AC_ARG_WITH([dlopen],
+	AC_HELP_STRING([--with-dlopen],
+		       [dl-loadable provider support @<:@default=yes@:>@]),
+	)
+
+if test "$freebsd" == "0"; then
+AS_IF([test x"$with_dlopen" != x"no"], [
+AC_CHECK_LIB(dl, dlopen, [],
+    AC_MSG_ERROR([dlopen not found.  libfabric requires libdl.]))
+])
+fi
+
+dnl handle picky option
+AC_ARG_ENABLE([picky],
+    [AC_HELP_STRING([--enable-picky],
+                    [Enable developer-level compiler pickiness when building @<:@default=no@:>@])])
+AS_IF([test x"$enable_picky" = x"yes" && test x"$GCC" = x"yes"],
+      [AS_IF([test x"$enable_debug" = x"no"],
+             [CFLAGS="${base_c_warn_flags} ${debug_c_warn_flags} ${debug_c_other_flags} ${picky_c_warn_flags} $CFLAGS"],
+             [CFLAGS="${picky_c_warn_flags} $CFLAGS"])
+      ])
+
+dnl Checks for libraries
+AC_CHECK_LIB(pthread, pthread_mutex_init, [],
+    AC_MSG_ERROR([pthread_mutex_init() not found.  libfabric requires libpthread.]))
+
+AC_CHECK_FUNC([pthread_spin_init],
+	[have_spinlock=1],
+	[have_spinlock=0])
+
+dnl shm_open is not used in the common code on OS X
+
+AC_DEFINE_UNQUOTED([PT_LOCK_SPIN], [$have_spinlock],
+	[Define to 1 if pthread_spin_init is available.])
+
+AC_ARG_ENABLE([epoll],
+    [AS_HELP_STRING([--disable-epoll],
+        [Disable epoll if available @<:@default=no@:>@])],
+    [],
+    [enable_epoll=auto]
+)
+
+AS_IF([test x"$enable_epoll" != x"no"],
+    [AC_CHECK_FUNCS([epoll_create])
+     if test "$ac_cv_func_epoll_create" = yes; then
+        AC_DEFINE([HAVE_EPOLL], [1], [Define if you have epoll support.])
+     fi]
+)
+
+AC_CHECK_HEADER([linux/perf_event.h],
+    [AC_CHECK_DECL([__builtin_ia32_rdpmc],
+        [
+            AC_TRY_LINK([#include <linux/perf_event.h>],
+                [__builtin_ia32_rdpmc(0);],
+                [linux_perf_rdpmc=1],
+                [linux_perf_rdpmc=0])
+	],
+        [linux_perf_rdpmc=0],
+        [#include <linux/perf_event.h>])],
+    [linux_perf_rdpmc=0])
+AC_DEFINE_UNQUOTED(HAVE_LINUX_PERF_RDPMC, [$linux_perf_rdpmc],
+    [Whether we have __builtin_ia32_rdpmc() and linux/perf_event.h file or not])
+AM_CONDITIONAL([HAVE_LINUX_PERF_RDPMC], [test "x$linux_perf_rdpmc" = "x1"])
+
+dnl Check for gcc atomic intrinsics
+AS_IF([test x"$enable_atomics" != x"no"],
+    AC_MSG_CHECKING(compiler support for c11 atomics)
+    AC_TRY_LINK([#include <stdatomic.h>],
+        [atomic_int a;
+         atomic_init(&a, 0);
+         #ifdef __STDC_NO_ATOMICS__
+           #error c11 atomics are not supported
+         #else
+           return 0;
+         #endif
+        ],
+        [
+        AC_MSG_RESULT(yes)
+            AC_DEFINE(HAVE_ATOMICS, 1, [Set to 1 to use c11 atomic functions])
+        ],
+        [AC_MSG_RESULT(no)])
+
+
+    AC_MSG_CHECKING(compiler support for c11 atomic `least` types)
+    AC_TRY_LINK([#include <stdatomic.h>],
+        [atomic_int_least32_t a;
+         atomic_int_least64_t b;
+        ],
+        [
+            AC_MSG_RESULT(yes)
+            AC_DEFINE(HAVE_ATOMICS_LEAST_TYPES, 1,
+                      [Set to 1 to use c11 atomic `least` types])
+        ],
+        [
+            AC_MSG_RESULT(no)
+        ]),
+[
+    AC_MSG_RESULT(configure: c11 atomics support is disabled)
+])
+
+dnl Check for gcc built-in atomics
+AS_IF([test x"$enable_atomics" != x"no"],
+    AC_MSG_CHECKING(compiler support for built-in atomics)
+    AC_TRY_LINK([#include <stdint.h>],
+        [int32_t a;
+         __sync_add_and_fetch(&a, 0);
+         __sync_sub_and_fetch(&a, 0);
+         #if defined(__PPC__) && !defined(__PPC64__)
+           #error compiler built-in atomics are not supported on PowerPC 32-bit
+         #else
+           return 0;
+         #endif
+        ],
+        [
+        AC_MSG_RESULT(yes)
+            AC_DEFINE(HAVE_BUILTIN_ATOMICS, 1, [Set to 1 to use built-in intrinsic atomics])
+        ],
+        [AC_MSG_RESULT(no)]),
+[
+    AC_MSG_RESULT(configure: built-in atomics support is disabled)
+])
+
+dnl Check for gcc memory model aware built-in atomics
+dnl If supported check to see if not internal to compiler
+LIBS_save=$LIBS
+AC_SEARCH_LIBS([__atomic_load_8], [atomic])
+AS_IF([test x"$enable_atomics" != x"no"],
+    AC_MSG_CHECKING(compiler support for built-in memory model aware atomics)
+    AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <stdint.h>]],
+        [[uint64_t d;
+         uint64_t s;
+         uint64_t c;
+         uint64_t r;
+          r = __atomic_fetch_add(&d, s, __ATOMIC_SEQ_CST);
+          r = __atomic_load_8(&d, __ATOMIC_SEQ_CST);
+          __atomic_exchange(&d, &s, &r, __ATOMIC_SEQ_CST);
+          __atomic_compare_exchange(&d,&c,&s,0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+         #if defined(__PPC__) && !defined(__PPC64__)
+           #error compiler built-in memory model aware atomics are not supported on PowerPC 32-bit
+         #else
+           return 0;
+         #endif
+        ]])],
+        [
+            AC_MSG_RESULT(yes)
+            AC_DEFINE(HAVE_BUILTIN_MM_ATOMICS, 1, [Set to 1 to use built-in intrinsics memory model aware atomics])
+        ],
+        [
+            AC_MSG_RESULT(no)
+            LIBS=$LIBS_save
+        ]),
+[
+    AC_MSG_RESULT(configure: use of -latomic is disabled)
+    LIBS=$LIBS_save
+])
+unset LIBS_save
+
+dnl Check for gcc cpuid intrinsics
+AC_MSG_CHECKING(compiler support for cpuid)
+AC_TRY_LINK([
+     #include <stddef.h>
+     #include <cpuid.h>],
+    [
+     int a, b, c, d;
+     __cpuid_count(0, 0, a, b, c, d);
+    ],
+    [
+	AC_MSG_RESULT(yes)
+        AC_DEFINE(HAVE_CPUID, 1, [Set to 1 to use cpuid])
+    ],
+    [AC_MSG_RESULT(no)])
+
+if test "$with_valgrind" != "" && test "$with_valgrind" != "no"; then
+AC_CHECK_HEADER(valgrind/memcheck.h, [],
+    AC_MSG_ERROR([valgrind requested but <valgrind/memcheck.h> not found.]))
+fi
+
+AC_CACHE_CHECK(whether ld accepts --version-script, ac_cv_version_script,
+    [if test -n "`$LD --help < /dev/null 2>/dev/null | grep version-script`"; then
+        ac_cv_version_script=yes
+    else
+        ac_cv_version_script=no
+    fi])
+
+AC_ARG_ENABLE([embedded],
+	      [AS_HELP_STRING([--enable-embedded],
+			      [Enable embedded support (turns off symbol versioning) @<:@default=no@:>@])
+	      ],
+	      [ac_asm_symver_support=0
+               icc_symver_hack=1],
+	      [enable_embedded=no])
+AM_CONDITIONAL([EMBEDDED], [test x"$enable_embedded" = x"yes"])
+
+AM_CONDITIONAL(HAVE_LD_VERSION_SCRIPT, test "$ac_cv_version_script" = "yes")
+
+dnl Disable symbol versioning when -ipo is in CFLAGS or ipo is disabled by icc.
+dnl The gcc equivalent ipo (-fwhole-program) seems to work fine.
+AS_CASE([$CFLAGS],
+	[*-ipo*],[
+		AC_MSG_NOTICE([disabling symbol versioning support with -ipo CFLAG])
+		icc_symver_hack=1
+		ac_asm_symver_support=0
+	],
+	[]
+)
+
+dnl Check for symbol versioning compiler + linker support.
+dnl If icc + ipo, then print disabled and skip check
+AC_MSG_CHECKING(for .symver assembler support)
+AS_IF([test "$icc_symver_hack"],
+	[AC_MSG_RESULT(disabled)],
+[
+
+AC_TRY_LINK([],
+	[__asm__(".symver main_, main@ABIVER_1.0");],
+	[
+		AC_MSG_RESULT(yes)
+		ac_asm_symver_support=1
+	],
+	[
+		AC_MSG_RESULT(no)
+		ac_asm_symver_support=0
+	])
+
+]) dnl AS_IF icc_symver_hack
+
+AC_DEFINE_UNQUOTED([HAVE_SYMVER_SUPPORT], [$ac_asm_symver_support],
+	  	   [Define to 1 if compiler/linker support symbol versioning.])
+
+AC_MSG_CHECKING(for __alias__ attribute support)
+AC_TRY_LINK(
+	[
+		int foo(int arg);
+		int foo(int arg) { return arg + 3; };
+		int foo2(int arg) __attribute__ (( __alias__("foo")));
+	],
+	[ /* empty main */ ],
+	[
+		AC_MSG_RESULT(yes)
+		ac_prog_cc_alias_symbols=1
+	],
+	[
+		AC_MSG_RESULT(no)
+		ac_prog_cc_alias_symbols=0
+	])
+
+AC_DEFINE_UNQUOTED([HAVE_ALIAS_ATTRIBUTE], [$ac_prog_cc_alias_symbols],
+	  	   [Define to 1 if the linker supports alias attribute.])
+AC_CHECK_FUNCS([getifaddrs])
+
+dnl Check for ethtool support
+AC_MSG_CHECKING(ethtool support)
+AC_TRY_LINK([
+    #include <net/if.h>
+    #include <sys/types.h>
+    #include <linux/ethtool.h>
+    #include <linux/sockios.h>
+    #include <sys/ioctl.h>],
+    [
+        unsigned long ioctl_req = SIOCETHTOOL;
+        struct ethtool_cmd cmd = {
+            .cmd = ETHTOOL_GSET,
+        };
+        long speed = cmd.speed;
+    ],
+    [
+	AC_MSG_RESULT(yes)
+        AC_DEFINE(HAVE_ETHTOOL, 1, [Set to 1 to use ethtool])
+
+    ],
+    [AC_MSG_RESULT(no)])
+
+dnl Check for the ethtool SPEED_UNKNOWN macro (supported in linux
+dnl kernel >= 3.2) and the ethtool_cmd_speed function declaration
+dnl (supported in linux kernel >= 2.6.26)
+AC_CHECK_DECLS([ethtool_cmd_speed, SPEED_UNKNOWN], [], [],
+               [#include <linux/ethtool.h>])
+
+dnl Check for userfault fd support
+have_uffd=0
+AC_CHECK_HEADERS([linux/userfaultfd.h],
+	[AC_CHECK_DECL([__NR_userfaultfd],
+		[have_uffd=1],
+		[],
+		[[#include <sys/syscall.h>]])],
+	[], [])
+
+AS_IF([test $have_uffd -eq 1],
+	[AC_MSG_CHECKING([for userfaultfd unmap support])
+	AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+			#include <sys/types.h>
+			#include <linux/userfaultfd.h>
+			#include <unistd.h>
+			#include <sys/syscall.h>
+			#include <fcntl.h>
+			#include <sys/ioctl.h>
+		]],
+		[[
+			int fd;
+			struct uffdio_api api_obj;
+			api_obj.api = UFFD_API;
+			api_obj.features = UFFD_FEATURE_EVENT_UNMAP |
+					UFFD_FEATURE_EVENT_REMOVE |
+					UFFD_FEATURE_EVENT_REMAP;
+			fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+			return ioctl(fd, UFFDIO_API, &api_obj);
+		]])
+	],
+	[AC_MSG_RESULT([yes])],
+	[AC_MSG_RESULT([no])
+		have_uffd=0])])
+
+AC_DEFINE_UNQUOTED([HAVE_UFFD_UNMAP], [$have_uffd],
+	[Define to 1 if platform supports userfault fd unmap])
+
+dnl Check support to intercept syscalls
+AC_CHECK_HEADERS_ONCE(elf.h sys/auxv.h)
+
+dnl Check support to clock_gettime
+have_clock_gettime=0
+
+AC_SEARCH_LIBS([clock_gettime],[rt],
+         [have_clock_gettime=1],
+         [])
+
+AC_DEFINE_UNQUOTED(HAVE_CLOCK_GETTIME, [$have_clock_gettime],
+       [Define to 1 if clock_gettime is available.])
+AM_CONDITIONAL(HAVE_CLOCK_GETTIME, [test $have_clock_gettime -eq 1])
+
+dnl Check for CUDA runtime libraries.
+AC_ARG_WITH([cuda],
+	    [AC_HELP_STRING([--with-cuda=DIR],
+			    [Provide path to where the CUDA development
+			    and runtime libraries are installed.])],
+	    [], [])
+AS_IF([test ! -z "$PSM_CUDA"], [with_cuda=/usr/local/cuda])
+have_libcuda=0
+AS_IF([test x"$with_cuda" != x"no"],
+      [FI_CHECK_PACKAGE([cuda],
+			[cuda_runtime.h],
+			[cudart],
+			[cudaMemcpy],
+			[-lcuda],
+			[$with_cuda],
+			[],
+			[have_libcuda=1],
+			[],
+			[])],
+      [])
+AS_IF([test "$with_cuda" = "yes" && test "$have_libcuda" = "0" ],
+      [AC_MSG_ERROR([CUDA support requested but CUDA runtime not available.])],
+      [])
+AC_DEFINE_UNQUOTED([HAVE_LIBCUDA], [$have_libcuda], [Whether we have CUDA runtime or not])
+if test $have_libcuda -eq 1; then
+	cuda_CPPFLAGS="$cuda_CPPFLAGS -DPSM_CUDA -DNVIDIA_GPU_DIRECT"
+fi
+AC_DEFINE_UNQUOTED([PSM3_CUDA], [$have_libcuda], [Whether we have CUDA runtime or not])
+
+CPPFLAGS="$CPPFLAGS $cuda_CPPFLAGS"
+LDFLAGS="$LDFLAGS $cuda_LDFLAGS"
+LIBS="$LIBS $cuda_LIBS"
+
+AS_IF([test ! -z "$PSM_CPPFLAGS"], [CPPFLAGS="$CPPFLAGS $PSM_CPPFLAGS"], [])
+AS_IF([test ! -z "$PSM_CFLAGS"], [CFLAGS="$CFLAGS $PSM_CFLAGS"], [])
+dnl Provider-specific checks
+dnl FI_PROVIDER_INIT
+dnl FI_PROVIDER_SETUP([psm3])
+dnl FI_PROVIDER_FINI
+dnl Configure the .pc file
+#FI_PROVIDER_SETUP_PC
+
+AC_SUBST(PSM_HAL_CNT)
+AC_SUBST(PSM_HAL_INST)
+
+AM_COND_IF([HAVE_PSM3_SRC],
+	   [
+		IFS_VERSION="${RELEASE_TAG:-$(git describe --dirty --always --abbrev=8 --broken --tags 2>/dev/null \
+			|| git describe --dirty --always --abbrev=8 --broken 2>/dev/null || echo 'unknown commit')}"
+		IFS_VERSION=${IFS_VERSION//./_}
+		GIT_HASH="$(git log --oneline --format='%H' -1)"
+		RPM_RELEASE=$(echo "${IFS_VERSION}" | cut -d'_' -f5)
+		RELEASE_VER=$(echo "${IFS_VERSION}" | cut -d'_' -f1-4 | sed 's/_/./g')
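+		# Illustrative (an assumed tag format): RELEASE_TAG=0.3.0.0_179
+		# gives IFS_VERSION=0_3_0_0_179, RELEASE_VER=0.3.0.0 and
+		# RPM_RELEASE=179.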
+		AS_IF([test x"${RELEASE_VER}" = x"${PACKAGE_VERSION}"], [], [
+			AC_MSG_NOTICE([Release Tag does not match VERSION file])
+			AC_MSG_NOTICE([${RELEASE_VER} != ${PACKAGE_VERSION}])
+			RPM_RELEASE=999
+		])
+		PSM3_PROV_VER_MAJOR=$(echo "${PACKAGE_VERSION}" | cut -d'.' -f1)
+		PSM3_PROV_VER_MINOR=$(echo "${PACKAGE_VERSION}" | cut -d'.' -f2)
+		PSM3_PROV_VER_MAINT=$(echo "${PACKAGE_VERSION}" | cut -d'.' -f3)
+		PSM3_PROV_VER_PATCH=$(echo "${PACKAGE_VERSION}" | cut -d'.' -f4)
+	   ])
+AS_IF([test $have_libcuda -eq 1], [RPM_RELEASE=${RPM_RELEASE}cuda])
+
+AC_SUBST(IFS_VERSION)
+AC_SUBST(GIT_HASH)
+AC_SUBST(RPM_RELEASE)
+AC_SUBST(PSM3_PROV_VER_MAJOR)
+AC_SUBST(PSM3_PROV_VER_MINOR)
+AC_SUBST(PSM3_PROV_VER_MAINT)
+AC_SUBST(PSM3_PROV_VER_PATCH)
+dnl Set during Make.
+dnl AC_SUBST(BUILD_TIMESTAMP)
+dnl AC_SUBST(SRC_CHECKSUM)
+
+AC_SUBST(PSM3_ADDITIONAL_GLOBALS)
+
+AC_CONFIG_FILES([Makefile libpsm3-fi.spec libpsm3-fi.map libpsm3-fi.pc])
+AM_COND_IF([HAVE_PSM3_SRC],
+	   [AC_CONFIG_FILES([psm3/psm2_hal_inlines_i.h psm3/psm2_hal_inlines_d.h src/psm3_revision.c])])
+AC_OUTPUT
diff --git a/deps/libfabric/prov/psm3/configure.m4 b/deps/libfabric/prov/psm3/configure.m4
new file mode 100644
index 0000000000000000000000000000000000000000..c78d035e8381523298503f920b709ec69e8d956a
--- /dev/null
+++ b/deps/libfabric/prov/psm3/configure.m4
@@ -0,0 +1,247 @@
+dnl Configury specific to the libfabric PSM3 provider
+
+dnl Called to configure this provider
+dnl
+dnl Arguments:
+dnl
+dnl $1: action if configured successfully
+dnl $2: action if not configured successfully
+dnl
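+dnl Illustrative invocation (an assumption, following the provider
+dnl convention referenced by the FI_PROVIDER_SETUP note in configure.ac):
+dnl
+dnl   FI_PSM3_CONFIGURE([psm3_enabled=1], [psm3_enabled=0])
+dnl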
+AC_DEFUN([FI_PSM3_CONFIGURE],[
+	 # Determine if we can support the psm3 provider
+	 psm3_ARCH=$host_cpu
+	 AM_CONDITIONAL([HAVE_PSM3_X86_64], [test x$psm3_ARCH = xx86_64])
+	 AC_SUBST([HAVE_PSM3_X86_64])
+	 AC_SUBST([psm3_ARCH])
+
+	 enable_psm3_src=yes
+	 dnl build PSM3 src into provider
+	 AM_CONDITIONAL([HAVE_PSM3_SRC], [test "x$enable_psm3_src" != "xno"])
+	 AC_DEFINE([HAVE_PSM3_SRC], [1], [PSM3 source is built-in])
+
+	 PSM_HAL_CNT=1
+	 PSM_HAL_INST=gen1
+
+	 psm3_happy=1
+	 AS_IF([test x"$enable_psm3" != x"no"],
+	       [
+		FI_CHECK_PACKAGE([psm3_rt],
+		                 [sys/mman.h],
+		                 [rt],
+		                 [shm_open],
+		                 [],
+		                 [$psm3_PREFIX],
+		                 [$psm3_LIBDIR],
+		                 [],
+		                 [psm3_happy=0])
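+dnl NOTE: a one-argument ifelse(...) expands to nothing, so the quoted
+dnl block below effectively comments out the dlopen check.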
+ifelse('
+		FI_CHECK_PACKAGE([psm3_dl],
+		                 [dlfcn.h],
+		                 [dl],
+		                 [dlopen],
+		                 [],
+		                 [$psm3_PREFIX],
+		                 [$psm3_LIBDIR],
+		                 [psm3_dl_happy=1],
+		                 [psm3_happy=0])
+')dnl
+		FI_CHECK_PACKAGE([psm3_numa],
+		                 [numa.h],
+		                 [numa],
+		                 [numa_node_of_cpu],
+		                 [],
+		                 [$psm3_PREFIX],
+		                 [$psm3_LIBDIR],
+		                 [psm3_numa_happy=1],
+		                 [psm3_happy=0])
+
+		FI_CHECK_PACKAGE([psm3_ibv],
+		                 [infiniband/verbs.h],
+		                 [ibverbs],
+		                 [ibv_get_device_list],
+		                 [],
+		                 [$psm3_PREFIX],
+		                 [$psm3_LIBDIR],
+		                 [psm3_ibv_happy=1],
+		                 [psm3_happy=0])
+
+		FI_CHECK_PACKAGE([psm3_uuid],
+		                 [uuid/uuid.h],
+		                 [uuid],
+		                 [uuid_parse],
+		                 [],
+		                 [$psm3_PREFIX],
+		                 [$psm3_LIBDIR],
+		                 [psm3_uuid_happy=1],
+		                 [psm3_happy=0])
+
+		AC_MSG_CHECKING([for -msse4.2 support])
+
+		dnl Strip other optflags to avoid conflicts when checking for instruction sets
+		FI_STRIP_OPTFLAGS($CFLAGS)
+		PSM3_STRIP_OPTFLAGS="$s_result"
+
+		save_CFLAGS=$CFLAGS
+		CFLAGS="$PSM3_STRIP_OPTFLAGS -msse4.2 -O0"
+		AC_LINK_IFELSE(
+			[AC_LANG_PROGRAM(
+				[[#include <nmmintrin.h>]],
+				[[unsigned int crc = 0;
+				  crc = _mm_crc32_u32(crc, 0);
+				  return crc == 0;]])
+			],[
+				AC_MSG_RESULT([yes])
+				PSM3_ARCH_CFLAGS="-msse4.2"
+			],[
+				psm3_happy=0
+				AC_MSG_RESULT([no])
+				AC_MSG_NOTICE([psm3 requires at least the sse4.2 instruction set to build])
+			])
+		CFLAGS=$save_CFLAGS
+
+		AC_MSG_CHECKING([for -mavx support])
+		save_CFLAGS=$CFLAGS
+		CFLAGS="$PSM3_STRIP_OPTFLAGS -mavx -O0"
+		AC_LINK_IFELSE(
+			[AC_LANG_PROGRAM(
+				[[#include <immintrin.h>]],
+				[[unsigned long long _a[4] = {1ULL,2ULL,3ULL,4ULL};
+				  __m256i vA = _mm256_loadu_si256((__m256i *)_a);
+				  __m256i vB;
+				  _mm256_store_si256(&vB, vA);
+				  return 0;]])
+			],[
+				AC_MSG_RESULT([yes])
+				PSM3_ARCH_CFLAGS="-mavx"
+			],[
+				psm3_happy=0
+				AC_MSG_RESULT([no])
+				AC_MSG_NOTICE([psm3 requires at least the avx instruction set to build])
+			])
+		CFLAGS=$save_CFLAGS
+
+		AC_MSG_CHECKING([for -mavx2 support])
+		save_CFLAGS=$CFLAGS
+		CFLAGS="$PSM3_STRIP_OPTFLAGS -mavx2 -O0"
+		AC_LINK_IFELSE(
+			[AC_LANG_PROGRAM(
+				[[#include <immintrin.h>]],
+				[[unsigned long long _a[4] = {1ULL,2ULL,3ULL,4ULL};
+				  __m256i vA = _mm256_loadu_si256((__m256i *)_a);
+				  __m256i vB = _mm256_add_epi64(vA, vA);
+				  (void)vB;
+				  return 0;]])
+			],[
+				AC_MSG_RESULT([yes])
+				PSM3_ARCH_CFLAGS="-mavx2"
+			],[
+				AC_MSG_RESULT([no])
+			])
+		CFLAGS=$save_CFLAGS
+
+		AS_IF([test $have_libcuda -eq 1],
+		      [psm3_CPPFLAGS="$psm3_CPPFLAGS -DPSM_CUDA -DNVIDIA_GPU_DIRECT"])
+		AC_DEFINE_UNQUOTED([PSM3_CUDA], [$have_libcuda], [Whether we have CUDA runtime or not])
+
+		AS_IF([test x$with_psm3_rv = xno],
+		      [psm3_CPPFLAGS="$psm3_CPPFLAGS -URNDV_MOD"],
+		      [
+			AS_IF([test "x$with_psm3_rv" = "x"],
+			      [
+				psm3_rv_check=1
+				with_psm3_rv=/usr/include
+			      ],[psm3_rv_check=0])
+			psm3_rv_old_header=0
+			save_CPPFLAGS=$CPPFLAGS
+			CPPFLAGS="$CPPFLAGS -I$with_psm3_rv"
+			dnl Check for /usr/include/rdma/rv_user_ioctls.h first
+			_FI_CHECK_PACKAGE_HEADER([psm3_rv],
+			                         [rdma/rv_user_ioctls.h],
+			                         [],
+			                         [psm3_rv_happy=1],
+			                         [psm3_rv_happy=0])
+
+			AS_IF([test $psm3_rv_happy -eq 0], [
+				AS_IF([test "$psm3_rv_check" -eq 1],
+				      [with_psm3_rv=/usr/include/uapi])
+				CPPFLAGS="$save_CPPFLAGS -I$with_psm3_rv"
+				_FI_CHECK_PACKAGE_HEADER([psm3_rv],
+				                         [rv/rv_user_ioctls.h],
+				                         [],
+				                         [psm3_rv_happy=1
+				                          psm3_rv_old_header=1],
+				                         [psm3_rv_happy=0])
+			      ])
+			CPPFLAGS=$save_CPPFLAGS
+			AS_IF([test "$psm3_rv_happy" -eq 0],
+			      [
+				AS_IF([test "$psm3_rv_check" -eq 0], [
+					psm3_happy=0
+					AC_MSG_ERROR([RV Module headers requested but rv_user_ioctls.h not found.])
+				])
+				psm3_CPPFLAGS="$psm3_CPPFLAGS -URNDV_MOD"
+			      ],[
+				psm3_CPPFLAGS="$psm3_CPPFLAGS -DRNDV_MOD -I$with_psm3_rv"
+				AS_IF([test "$psm3_rv_old_header" -eq 1],
+				      [psm3_CPPFLAGS="$psm3_CPPFLAGS -DHAVE_OLD_RV_HEADER"])
+			      ])
+			AS_IF([test "$psm3_rv_happy" -eq 1], [
+				AC_MSG_CHECKING([for RV support for ring.overflow_cnt])
+				AC_COMPILE_IFELSE(
+					[AC_LANG_PROGRAM(
+						[[#include <sys/types.h>
+						  #include <stdint.h>
+						  #include <rdma/rv_user_ioctls.h>]],
+						[[struct rv_ring_header ring;
+						  ring.overflow_cnt=0;
+						  (void)ring;
+						  return 0;]])
+					],[
+						AC_MSG_RESULT(yes)
+					],[
+						AC_MSG_RESULT(no)
+						psm3_CPPFLAGS="$psm3_CPPFLAGS -DHAVE_NO_PSM3_RV_OVERFLOW_CNT"
+					])
+			      ])
+		      ])
+		AS_IF([test $psm3_happy -eq 1], [
+			AC_CONFIG_FILES([prov/psm3/psm3/psm2_hal_inlines_i.h \
+		                 prov/psm3/psm3/psm2_hal_inlines_d.h \
+		                 prov/psm3/src/psm3_revision.c])
+		])
+	       ],[psm3_happy=0])
+
+	 AS_IF([test $psm3_happy -eq 1], [$1], [$2])
+
+	 psm3_CFLAGS=""
+	 psm3_ARCH_CFLAGS="$PSM3_ARCH_CFLAGS"
+	 psm3_CPPFLAGS="$psm3_CPPFLAGS $psm3_rt_CPPFLAGS $psm3_dl_CPPFLAGS $psm3_numa_CPPFLAGS $psm3_ibv_CPPFLAGS $psm3_uuid_CPPFLAGS"
+	 psm3_LDFLAGS="$psm3_LDFLAGS $psm3_rt_LDFLAGS $psm3_dl_LDFLAGS $psm3_numa_LDFLAGS $psm3_ibv_LDFLAGS $psm3_uuid_LDFLAGS"
+	 psm3_LIBS="$psm3_LIBS $psm3_rt_LIBS $psm3_dl_LIBS $psm3_numa_LIBS $psm3_ibv_LIBS $psm3_uuid_LIBS"
+	 AC_SUBST(psm3_CFLAGS)
+	 AC_SUBST(psm3_ARCH_CFLAGS)
+	 AC_SUBST(psm3_CPPFLAGS)
+	 AC_SUBST(psm3_LDFLAGS)
+	 AC_SUBST(psm3_LIBS)
+	 AC_SUBST(PSM_HAL_CNT)
+	 AC_SUBST(PSM_HAL_INST)
+
+	 PSM3_IFS_VERSION=m4_normalize(m4_esyscmd([cat prov/psm3/VERSION]))
+	 AC_SUBST(PSM3_IFS_VERSION)
+	 PSM3_PROV_VER_MAJOR=$(echo "${PSM3_IFS_VERSION}" | cut -d'_' -f1)
+	 PSM3_PROV_VER_MINOR=$(echo "${PSM3_IFS_VERSION}" | cut -d'_' -f2)
+	 PSM3_PROV_VER_MAINT=$(echo "${PSM3_IFS_VERSION}" | cut -d'_' -f3)
+	 PSM3_PROV_VER_PATCH=$(echo "${PSM3_IFS_VERSION}" | cut -d'_' -f4)
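+	 dnl Illustrative (an assumed VERSION file format): a prov/psm3/VERSION
+	 dnl of 11_1_0_0 yields PSM3_PROV_VER_MAJOR=11, MINOR=1, MAINT=0, PATCH=0.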
+	 AC_SUBST(PSM3_PROV_VER_MAJOR)
+	 AC_SUBST(PSM3_PROV_VER_MINOR)
+	 AC_SUBST(PSM3_PROV_VER_MAINT)
+	 AC_SUBST(PSM3_PROV_VER_PATCH)
+
+	 AC_SUBST(PSM3_BUILD_TIMESTAMP, ["<Unknown>"])
+	 AC_SUBST(PSM3_SRC_CHECKSUM, ["<Unknown>"])
+	 AC_SUBST(PSM3_GIT_HASH, ["<Unknown>"])
+
+])
+
+AC_ARG_WITH([psm3-rv],
+            [AS_HELP_STRING([--with-psm3-rv],
+                            [Enable RV module use @<:@default=check@:>@])])
diff --git a/deps/libfabric/prov/psm3/inc b/deps/libfabric/prov/psm3/inc
new file mode 120000
index 0000000000000000000000000000000000000000..fcffffbed8d4c29e745de5e9680f9df564a01f97
--- /dev/null
+++ b/deps/libfabric/prov/psm3/inc
@@ -0,0 +1 @@
+../../include
\ No newline at end of file
diff --git a/deps/libfabric/prov/psm3/libpsm3-fi.map.in b/deps/libfabric/prov/psm3/libpsm3-fi.map.in
new file mode 100644
index 0000000000000000000000000000000000000000..b6672359af9a952f71c978adea4281803a53b707
--- /dev/null
+++ b/deps/libfabric/prov/psm3/libpsm3-fi.map.in
@@ -0,0 +1,6 @@
+PSM3_FI_1.0 {
+	global:
+		fi_prov_ini;
+		@PSM3_ADDITIONAL_GLOBALS@
+	local: *;
+};
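+
+/* fi_prov_ini is the entry point libfabric resolves when it dlopen()s a
+   provider plugin; @PSM3_ADDITIONAL_GLOBALS@ is substituted at configure
+   time from the PSM2_ADDITIONAL_GLOBALS setting (see configure.ac). */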
diff --git a/deps/libfabric/prov/psm3/libpsm3-fi.pc.in b/deps/libfabric/prov/psm3/libpsm3-fi.pc.in
new file mode 100644
index 0000000000000000000000000000000000000000..c664ed200b88b2ec9de3f6fe16c36587efdaf5a9
--- /dev/null
+++ b/deps/libfabric/prov/psm3/libpsm3-fi.pc.in
@@ -0,0 +1,14 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: libpsm3-fi
+Description: OFI-WG libfabric PSM3 provider
+URL: https://github.com/ofiwg/libfabric.git
+Version: @VERSION@
+Requires: libfabric
+Cflags: -I${includedir}
+Libs: -L${libdir} -lpsm3-fi
+Libs.private:
+Requires.private:
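+
+# Illustrative usage (not part of the original file):
+#   pkg-config --cflags --libs libpsm3-fi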
diff --git a/deps/libfabric/prov/psm3/libpsm3-fi.spec.in b/deps/libfabric/prov/psm3/libpsm3-fi.spec.in
new file mode 100644
index 0000000000000000000000000000000000000000..eee516ccab8d1e0e6b142224ebe65217628a598e
--- /dev/null
+++ b/deps/libfabric/prov/psm3/libpsm3-fi.spec.in
@@ -0,0 +1,70 @@
+%{!?configopts: %global configopts LDFLAGS=-Wl,--build-id}
+%{!?provider: %define provider psm3}
+%{!?provider_formal: %define provider_formal PSM3}
+
+Name: lib%{provider}-fi
+Version: @VERSION@
+Release: 179_@RPM_RELEASE@
+Summary: Dynamic %{provider_formal} provider for Libfabric
+
+Group: System Environment/Libraries
+License: GPLv2 or BSD
+Url: http://www.github.com/ofiwg/libfabric
+Source: http://www.github.com/ofiwg/%{name}/releases/download/%{provider}-v%{version}/%{name}-%{version}.tar.bz2
+BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n)
+Requires: libfabric
+%if 0%{?suse_version} >= 1
+Provides: lib%{provider}-fi1 = %{version}-%{release}
+%endif
+
+BuildRequires: libuuid-devel
+BuildRequires: rdma-core-devel
+%if 0%{?suse_version} >= 1
+BuildRequires: glibc-devel
+BuildRequires: libnuma-devel
+%endif
+%if 0%{?rhel} >= 1
+BuildRequires: glibc-headers
+BuildRequires: numactl-devel
+%endif
+
+%description
+This RPM provides the %{provider_formal} provider as a "plugin" to an existing
+libfabric installation.  This plugin will override older %{provider_formal}
+provider functionality in the existing libfabric installation.
+
+%prep
+%setup -q -n %{name}-%{version}
+
+%build
+%configure %{configopts}
+%make_build
+
+%install
+rm -rf %{buildroot}
+
+%make_install installdirs
+# remove unpackaged files from the buildroot
+rm -f %{buildroot}%{_libdir}/*.la
+
+%clean
+rm -rf %{buildroot}
+
+%post -p /sbin/ldconfig
+%postun -p /sbin/ldconfig
+
+%files
+%defattr(-,root,root,-)
+%{_libdir}/libfabric/%{name}*
+%doc README
+%exclude %{_libdir}/libfabric/*.a
+%exclude %{_libdir}/libfabric/*.la
+%exclude %{_libdir}/pkgconfig
+%exclude %{_mandir}
+
+%changelog
+* Wed Mar 31 2021 Adam Goldman <adam.goldman@intel.com>
+- Include BuildRequires lines for RHEL and SLES
+
+* Wed May 24 2017 Open Fabrics Interfaces Working Group <ofiwg@lists.openfabrics.org>
+- First release of specfile for packaging a single dl provider.
diff --git a/deps/libfabric/prov/psm3/man b/deps/libfabric/prov/psm3/man
new file mode 120000
index 0000000000000000000000000000000000000000..ee201c19319848a53614c366728362ab32272041
--- /dev/null
+++ b/deps/libfabric/prov/psm3/man
@@ -0,0 +1 @@
+../../man
\ No newline at end of file
diff --git a/deps/libfabric/prov/psm3/psm3/.gitignore b/deps/libfabric/prov/psm3/psm3/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..d12089a26941b8ff42c5b6292fd4c59255ee14e2
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/.gitignore
@@ -0,0 +1,2 @@
+psm2_hal_inlines_d.h
+psm2_hal_inlines_i.h
diff --git a/deps/libfabric/prov/psm3/psm3/Makefile.include b/deps/libfabric/prov/psm3/psm3/Makefile.include
new file mode 100644
index 0000000000000000000000000000000000000000..51b79c19f7a7658f02cfb1b17cdb8ea501773e41
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/Makefile.include
@@ -0,0 +1,211 @@
+
+_CPPFLAGS = \
+	-I$(top_srcdir)/psm3/ \
+	-I$(top_builddir)/psm3/ \
+	-I$(top_srcdir)/psm3/ptl_ips/ \
+	-I$(top_srcdir)/psm3/include \
+	-I$(top_srcdir)/psm3/include/linux-i386 \
+	-I$(top_srcdir)/psm3/mpspawn \
+	-I$(top_srcdir)/psm3/opa \
+	-D_GNU_SOURCE=1 \
+	$(AM_CPPFLAGS)
+
+noinst_LTLIBRARIES += libopa.la \
+		     libptl_am.la libptl_ips.la libptl_self.la \
+		     libpsm_hal_gen1.la libpsm2.la
+
+libptl_am_la_SOURCES = \
+	psm3/ptl_am/am_config.h \
+	psm3/ptl_am/am_cuda_memhandle_cache.c \
+	psm3/ptl_am/am_cuda_memhandle_cache.h \
+	psm3/ptl_am/am_reqrep.c \
+	psm3/ptl_am/am_reqrep_shmem.c \
+	psm3/ptl_am/cmarw.h \
+	psm3/ptl_am/cmarwu.c \
+	psm3/ptl_am/psm_am_internal.h \
+	psm3/ptl_am/ptl.c \
+	psm3/ptl_am/ptl_fwd.h
+libptl_am_la_CPPFLAGS = \
+	-I$(top_srcdir)/psm3/ptl_am/ \
+	$(_CPPFLAGS)
+
+libptl_ips_la_SOURCES = \
+	psm3/ptl_ips/ips_config.h \
+	psm3/ptl_ips/ips_crc32.c \
+	psm3/ptl_ips/ips_epstate.c \
+	psm3/ptl_ips/ips_epstate.h \
+	psm3/ptl_ips/ips_expected_proto.h \
+	psm3/ptl_ips/ips_opp_path_rec.c \
+	psm3/ptl_ips/ips_path_rec.c \
+	psm3/ptl_ips/ips_path_rec.h \
+	psm3/ptl_ips/ips_proto.c \
+	psm3/ptl_ips/ips_proto.h \
+	psm3/ptl_ips/ips_proto_am.c \
+	psm3/ptl_ips/ips_proto_am.h \
+	psm3/ptl_ips/ips_proto_connect.c \
+	psm3/ptl_ips/ips_proto_dump.c \
+	psm3/ptl_ips/ips_proto_expected.c \
+	psm3/ptl_ips/ips_proto_header.h \
+	psm3/ptl_ips/ips_proto_help.h \
+	psm3/ptl_ips/ips_proto_internal.h \
+	psm3/ptl_ips/ips_proto_mq.c \
+	psm3/ptl_ips/ips_proto_params.h \
+	psm3/ptl_ips/ips_proto_recv.c \
+	psm3/ptl_ips/ips_recvhdrq.c \
+	psm3/ptl_ips/ips_recvhdrq.h \
+	psm3/ptl_ips/ips_recvq.c \
+	psm3/ptl_ips/ips_recvq.h \
+	psm3/ptl_ips/ips_scb.c \
+	psm3/ptl_ips/ips_scb.h \
+	psm3/ptl_ips/ips_stats.h \
+	psm3/ptl_ips/ips_subcontext.h \
+	psm3/ptl_ips/ips_tid.c \
+	psm3/ptl_ips/ips_tid.h \
+	psm3/ptl_ips/ips_tidcache.c \
+	psm3/ptl_ips/ips_tidcache.h \
+	psm3/ptl_ips/ips_tidflow.c \
+	psm3/ptl_ips/ips_tidflow.h \
+	psm3/ptl_ips/ips_writehdrq.c \
+	psm3/ptl_ips/ips_writehdrq.h \
+	psm3/ptl_ips/ptl.c \
+	psm3/ptl_ips/ptl_fwd.h \
+	psm3/ptl_ips/ptl_ips.h \
+	psm3/ptl_ips/ptl_rcvthread.c
+libptl_ips_la_CPPFLAGS = \
+	-I$(top_srcdir)/psm3/ptl_ips/ \
+	$(_CPPFLAGS)
+libptl_ips_la_DEPENDENCIES = \
+	libopa.la
+
+libptl_self_la_SOURCES = \
+	psm3/ptl_self/ptl.c \
+	psm3/ptl_self/ptl_fwd.h
+libptl_self_la_CPPFLAGS = \
+	-I$(top_srcdir)/psm3/ptl_self/ \
+	$(_CPPFLAGS)
+
+libopa_la_SOURCES = \
+	psm3/opa/opa_debug.c \
+	psm3/opa/opa_dwordcpy-x86_64.c \
+	psm3/opa/opa_service.c \
+	psm3/opa/opa_sysfs.c \
+	psm3/opa/opa_syslog.c \
+	psm3/opa/opa_time.c \
+	psm3/opa/opa_utils.c \
+	psm3/include/opa_byteorder.h \
+	psm3/include/opa_debug.h \
+	psm3/include/opa_intf.h \
+	psm3/include/opa_queue.h \
+	psm3/include/opa_revision.h \
+	psm3/include/opa_service.h \
+	psm3/include/opa_udebug.h \
+	psm3/include/opa_user.h \
+	psm3/include/psm2_mock_testing.h \
+	psm3/include/rbtree.h \
+	psm3/include/linux-i386/bit_ops.h \
+	psm3/include/linux-i386/sysdep.h \
+	psm3/mpspawn/mpspawn_stats.h
+libopa_la_CPPFLAGS = \
+	$(_CPPFLAGS)
+
+libpsm_hal_gen1_la_SOURCES = \
+	psm3/psm_hal_gen1/hfi1_deprecated_gen1.h \
+	psm3/psm_hal_gen1/opa_common_gen1.h \
+	psm3/psm_hal_gen1/opa_i2cflash_gen1.c \
+	psm3/psm_hal_gen1/opa_proto_gen1.c \
+	psm3/psm_hal_gen1/opa_service_gen1.c \
+	psm3/psm_hal_gen1/opa_service_gen1.h \
+	psm3/psm_hal_gen1/opa_user_gen1.h \
+	psm3/psm_hal_gen1/opa_utils_gen1.c \
+	psm3/psm_hal_gen1/psm_gdrcpy.c \
+	psm3/psm_hal_gen1/psm_hal_gen1.c \
+	psm3/psm_hal_gen1/psm_hal_gen1.h \
+	psm3/psm_hal_gen1/psm_hal_inline_i.h \
+	psm3/psm_hal_gen1/psm_hal_gen1_spio.h
+libpsm_hal_gen1_la_CPPFLAGS = \
+	-I$(top_srcdir)/psm3/psm_hal_gen1/ \
+	$(_CPPFLAGS)
+
+libpsm2_la_SOURCES = \
+	psm3/psm.c \
+	psm3/psm_am.c \
+	psm3/psm_am_internal.h \
+	psm3/psm_config.h \
+	psm3/psm_context.c \
+	psm3/psm_context.h \
+	psm3/psm_diags.c \
+	psm3/psm_ep.c \
+	psm3/psm_ep.h \
+	psm3/psm_ep_connect.c \
+	psm3/psm_error.c \
+	psm3/psm_error.h \
+	psm3/psm_gdrcpy.h \
+	psm3/psm_help.h \
+	psm3/psm_lock.h \
+	psm3/psm_log.h \
+	psm3/psm_memcpy.c \
+	psm3/psm_mock.c \
+	psm3/psm_mpool.c \
+	psm3/psm_mpool.h \
+	psm3/psm_mq.c \
+	psm3/psm_mq_internal.h \
+	psm3/psm_mq_recv.c \
+	psm3/psm_mq_utils.c \
+	psm3/psm_netutils.h \
+	psm3/psm_perf.c \
+	psm3/psm_perf.h \
+	psm3/psm_rndv_mod.c \
+	psm3/psm_rndv_mod.h \
+	psm3/psm_stats.c \
+	psm3/psm_stats.h \
+	psm3/psm_sysbuf.c \
+	psm3/psm_sysbuf.h \
+	psm3/psm_timer.c \
+	psm3/psm_timer.h \
+	psm3/psm_user.h \
+	psm3/psm_utils.c \
+	psm3/psm_utils.h \
+	psm3/psm_verbs_ep.c \
+	psm3/psm_verbs_ep.h \
+	psm3/psm_verbs_mr.c \
+	psm3/psm_verbs_mr.h \
+	psm3/psm_udp_ep.c \
+	psm3/psm_udp_ep.h \
+	psm3/psmi_wrappers.c \
+	psm3/psmi_wrappers.h \
+	psm3/psm2.h \
+	psm3/psm2_am.h \
+	psm3/psm2_hal.c \
+	psm3/psm2_hal.h \
+	psm3/psm2_hal_inline_t.h \
+	psm3/psm2_mq.h \
+	psm3/ptl.h
+libpsm2_la_CPPFLAGS = \
+	$(_CPPFLAGS)
+nodist_libpsm2_la_SOURCES = \
+	psm3/psm2_hal_inlines_i.h \
+	psm3/psm2_hal_inlines_d.h
+
+libpsm2_la_LIBADD = \
+	libopa.la \
+	libptl_am.la \
+	libptl_ips.la \
+	libptl_self.la \
+	libpsm_hal_gen1.la
+
+libpsm2_la_DEPENDENCIES = \
+	libopa.la \
+	libptl_am.la \
+	libptl_ips.la \
+	libptl_self.la \
+	libpsm_hal_gen1.la
+
+EXTRA_DIST += \
+	psm3/include/rbtree.c \
+	psm3/psm_hal_gen1/psm_hal_gen1_spio.c \
+	psm3/opa/opa_dwordcpy-x86_64-fast.S
+
+chksum_srcs += \
+	$(libptl_am_la_SOURCES) $(libptl_ips_la_SOURCES) $(libptl_self_la_SOURCES) \
+	$(libopa_la_SOURCES) $(libpsm_hal_gen1_la_SOURCES) \
+	$(libpsm2_la_SOURCES)
diff --git a/deps/libfabric/prov/psm3/psm3/include/linux-i386/bit_ops.h b/deps/libfabric/prov/psm3/psm3/include/linux-i386/bit_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..d272e755d9b05084f64ce2c707bc6b358206e5a0
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/include/linux-i386/bit_ops.h
@@ -0,0 +1,98 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _HFI_i386_BIT_OPS_H
+#define _HFI_i386_BIT_OPS_H
+
+static __inline__ void ips_clear_bit(int nr, volatile unsigned long *addr)
+{
+	asm volatile (LOCK_PREFIX "btrl %1,%0" : "=m"(*addr) : "dIr"(nr));
+}
+
+static __inline__ void ips_change_bit(int nr, volatile unsigned long *addr)
+{
+	asm volatile (LOCK_PREFIX "btcl %1,%0" : "=m"(*addr) : "dIr"(nr));
+}
+
+static __inline__ int ips_test_and_set_bit(int nr, volatile unsigned long *addr)
+{
+	int oldbit;
+
+	asm volatile (LOCK_PREFIX "btsl %2,%1\n\tsbbl %0,%0" : "=r"(oldbit),
+		      "=m"(*addr) : "dIr"(nr) : "memory");
+	return oldbit;
+}
+
+static __inline__ void ips___clear_bit(int nr, volatile unsigned long *addr)
+{
+	asm volatile ("btrl %1,%0" : "=m" (*addr) : "dIr"(nr));
+}
+
+static __inline__ void ips___change_bit(int nr, volatile unsigned long *addr)
+{
+	asm volatile ("btcl %1,%0" : "=m" (*addr) : "dIr"(nr));
+}
+
+static __inline__ int ips___test_and_set_bit(int nr,
+					     volatile unsigned long *addr)
+{
+	int oldbit;
+
+	asm volatile ("btsl %2,%1\n\tsbbl %0,%0" : "=r" (oldbit),
+		      "=m"(*addr) : "dIr"(nr) : "memory");
+	return oldbit;
+}
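+
+/*
+ * Illustrative usage (not part of the original header): the LOCK-prefixed
+ * ips_test_and_set_bit() claims a bit atomically across CPUs:
+ *
+ *	unsigned long flags = 0;
+ *	if (!ips_test_and_set_bit(3, &flags))
+ *		do_owner_work();	// hypothetical callee; bit 3 was clear
+ *
+ * The ips___*() variants omit the LOCK prefix and are only safe when the
+ * word is not modified concurrently by other CPUs.
+ */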
+
+#endif /* _HFI_i386_BIT_OPS_H */
diff --git a/deps/libfabric/prov/psm3/psm3/include/linux-i386/sysdep.h b/deps/libfabric/prov/psm3/psm3/include/linux-i386/sysdep.h
new file mode 100644
index 0000000000000000000000000000000000000000..bfd5746455a90bd8a23448e2f535f075610e0653
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/include/linux-i386/sysdep.h
@@ -0,0 +1,171 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _HFI_i386_SYSDEP_H
+#define _HFI_i386_SYSDEP_H
+
+typedef struct cpuid {
+        unsigned eax, ebx, ecx, edx;
+} cpuid_t;
+
+static __inline__ void
+get_cpuid(const unsigned func, const unsigned subfunc, cpuid_t *id)
+{
+	unsigned a, b, c, d;
+
+	asm (" \
+	mov %4, %%eax \n\
+	mov %5, %%ecx \n\
+	cpuid \n\
+	mov %%eax, %0 \n\
+	mov %%ebx, %1 \n\
+	mov %%ecx, %2 \n\
+	mov %%edx, %3 \n\
+	" : "=g" (a), "=g" (b), "=g" (c), "=g" (d)
+	: "g" (func), "g" (subfunc)
+	: "%eax", "%ebx", "%ecx", "%edx"
+	);
+
+	id->eax = a;
+	id->ebx = b;
+	id->ecx = c;
+	id->edx = d;
+}
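+
+/*
+ * Illustrative usage (not part of the original header): leaf 0 returns
+ * the highest supported cpuid leaf in eax and the vendor id string in
+ * ebx/edx/ecx:
+ *
+ *	cpuid_t id;
+ *	get_cpuid(0, 0, &id);	// id.ebx,id.edx,id.ecx spell e.g. "GenuineIntel"
+ */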
+
+static __inline__ uint64_t get_cycles(void)
+{
+	uint64_t v;
+	uint32_t a, d;
+
+	asm volatile ("rdtsc" : "=a" (a), "=d"(d));
+	v = ((uint64_t) a) | (((uint64_t) d) << 32);
+
+	return v;
+}
+
+#ifndef LOCK_PREFIX
+#define LOCK_PREFIX "lock "
+#endif
+
+static __inline__ void ips_barrier()
+{
+	asm volatile ("" :  :  : "memory");
+}
+
+static __inline__ void ips_mb()
+{
+	asm volatile ("mfence" :  :  : "memory");
+}
+
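+/* x86 does not reorder loads with respect to other loads, so a read
+ * barrier only needs to stop compiler reordering; no fence instruction
+ * is required here (ips_sync_reads() below uses lfence where a
+ * serializing load fence is wanted). */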
+/* gcc-3.4 has a bug with this function body at -O0 */
+static
+#if defined(__GNUC__) && __GNUC__ == 3 && __GNUC_MINOR__ == 4
+#else
+__inline__
+#endif
+void ips_rmb()
+{
+	asm volatile ("" :  :  : "memory");
+}
+
+static __inline__ void ips_wmb()
+{
+	asm volatile ("sfence" :  :  : "memory");
+}
+
+static __inline__ void ips_sync_writes()
+{
+	asm volatile ("sfence" :  :  : "memory");
+}
+
+static __inline__ void ips_sync_reads()
+{
+	asm volatile ("lfence" :  :  : "memory");
+}
+
+static __inline__ uint32_t ips_cmpxchg(volatile uint32_t *ptr,
+				       uint32_t old_val, uint32_t new_val)
+{
+	uint32_t prev;
+	struct xchg_dummy {
+		uint32_t a[100];
+	};
+
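+	/* the cast of ptr to a large dummy struct makes the "m" operand
+	 * cover the whole object behind the pointer, so the compiler will
+	 * not cache or reorder accesses to it around the asm */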
+	asm volatile (LOCK_PREFIX "cmpxchgl %1,%2" : "=a"(prev)
+		      : "q"(new_val), "m"(*(struct xchg_dummy *)ptr), "0"(old_val)
+		      : "memory");
+
+	return prev;
+}
+
+typedef struct {
+	volatile int32_t counter;
+} ips_atomic_t;
+
+#define ips_atomic_set(v, i)		  (((v)->counter) = (i))
+#define ips_atomic_cmpxchg(p, oval, nval)	  \
+	    ips_cmpxchg((volatile uint32_t *) &((p)->counter), oval, nval)
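+
+/*
+ * Illustrative sketch (not part of the original header): a typical CAS
+ * retry loop built on ips_atomic_cmpxchg, here an atomic increment:
+ *
+ *	static __inline__ int32_t ips_atomic_inc(ips_atomic_t *v)
+ *	{
+ *		int32_t old;
+ *		do {
+ *			old = v->counter;
+ *		} while (ips_atomic_cmpxchg(v, old, old + 1) != (uint32_t)old);
+ *		return old + 1;
+ *	}
+ */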
+
+#if 0
+static __inline__ int32_t
+ips_cmpxchg(volatile int32_t *p, int32_t old_value, int32_t new_value)
+{
+	asm volatile ("lock cmpxchg %2, %0" :
+		      "+m" (*p), "+a"(old_value) : "r"(new_value) : "memory");
+	return old_value;
+}
+#endif
+
+#endif /* _HFI_i386_SYSDEP_H */
diff --git a/deps/libfabric/prov/psm3/psm3/include/opa_byteorder.h b/deps/libfabric/prov/psm3/psm3/include/opa_byteorder.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc909c1895301cb32009de060821fcd802b9a6b6
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/include/opa_byteorder.h
@@ -0,0 +1,265 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef OPA_BYTEORDER_H
+#define OPA_BYTEORDER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <linux/types.h>
+#include <sys/param.h>
+#include <endian.h>
+
+#ifndef __BYTE_ORDER
+#	error "BYTE_ORDER undefined"
+#endif
+
+typedef __u16 __le16;
+typedef __u16 __be16;
+typedef __u32 __le32;
+typedef __u32 __be32;
+typedef __u64 __le64;
+typedef __u64 __be64;
+
+static __inline__ __u16 __hfi_fswab16(__u16)
+    __attribute__ ((always_inline));
+static __inline__ __u32 __hfi_fswab32(__u32)
+    __attribute__ ((always_inline));
+static __inline__ __u64 __hfi_fswab64(__u64)
+    __attribute__ ((always_inline));
+
+static __inline__ __u16 __hfi_fswab16(__u16 x) {
+	return ((x & (__u16) 0x00ffU) << 8)
+	    | ((x & (__u16) 0xff00U) >> 8);
+}
+
+static __inline__ __u32 __hfi_fswab32(__u32 x) {
+	return ((x & (__u32) 0x000000ffUL) << 24)
+	    | ((x & (__u32) 0x0000ff00UL) << 8)
+	    | ((x & (__u32) 0x00ff0000UL) >> 8)
+	    | ((x & (__u32) 0xff000000UL) >> 24);
+}
+
+static __inline__ __u64 __hfi_fswab64(__u64 x) {
+	return ((x & (__u64) 0x00000000000000ffULL) << 56)
+	    | ((x & (__u64) 0x000000000000ff00ULL) << 40)
+	    | ((x & (__u64) 0x0000000000ff0000ULL) << 24)
+	    | ((x & (__u64) 0x00000000ff000000ULL) << 8)
+	    | ((x & (__u64) 0x000000ff00000000ULL) >> 8)
+	    | ((x & (__u64) 0x0000ff0000000000ULL) >> 24)
+	    | ((x & (__u64) 0x00ff000000000000ULL) >> 40)
+	    | ((x & (__u64) 0xff00000000000000ULL) >> 56);
+}
+
+static __inline__ __u16 __cpu_to_le16(__le16)
+    __attribute__ ((always_inline));
+static __inline__ __u32 __cpu_to_le32(__le32)
+    __attribute__ ((always_inline));
+static __inline__ __u64 __cpu_to_le64(__le64)
+    __attribute__ ((always_inline));
+
+static __inline__ __u16 __le16_to_cpu(__le16)
+    __attribute__ ((always_inline));
+static __inline__ __u32 __le32_to_cpu(__le32)
+    __attribute__ ((always_inline));
+static __inline__ __u64 __le64_to_cpu(__le64)
+    __attribute__ ((always_inline));
+
+static __inline__ __u16 __cpu_to_be16(__be16)
+    __attribute__ ((always_inline));
+static __inline__ __u32 __cpu_to_be32(__be32)
+    __attribute__ ((always_inline));
+static __inline__ __u64 __cpu_to_be64(__be64)
+    __attribute__ ((always_inline));
+
+static __inline__ __u16 __be16_to_cpu(__be16)
+    __attribute__ ((always_inline));
+static __inline__ __u32 __be32_to_cpu(__be32)
+    __attribute__ ((always_inline));
+static __inline__ __u64 __be64_to_cpu(__be64)
+    __attribute__ ((always_inline));
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+
+/*
+ * __cpu_to_le* routines
+ */
+static __inline__ __le16 __cpu_to_le16(__u16 x) {
+	return x;
+}
+
+static __inline__ __le32 __cpu_to_le32(__u32 x) {
+	return x;
+}
+
+static __inline__ __le64 __cpu_to_le64(__u64 x) {
+	return x;
+}
+
+/*
+ * __le*_to_cpu routines
+ */
+static __inline__ __u16 __le16_to_cpu(__le16 x) {
+	return x;
+}
+
+static __inline__ __u32 __le32_to_cpu(__le32 x) {
+	return x;
+}
+
+static __inline__ __u64 __le64_to_cpu(__le64 x) {
+	return x;
+}
+
+/*
+ * __cpu_to_be* routines
+ */
+static __inline__ __be16 __cpu_to_be16(__u16 x) {
+	return __hfi_fswab16(x);
+}
+
+static __inline__ __be32 __cpu_to_be32(__u32 x) {
+	return __hfi_fswab32(x);
+}
+
+static __inline__ __be64 __cpu_to_be64(__u64 x) {
+	return __hfi_fswab64(x);
+}
+
+/*
+ * __be*_to_cpu routines
+ */
+static __inline__ __u16 __be16_to_cpu(__be16 x) {
+	return __hfi_fswab16(x);
+}
+
+static __inline__ __u32 __be32_to_cpu(__be32 x) {
+	return __hfi_fswab32(x);
+}
+
+static __inline__ __u64 __be64_to_cpu(__be64 x) {
+	return __hfi_fswab64(x);
+}
+
+#elif __BYTE_ORDER == __BIG_ENDIAN
+
+/*
+ * __cpu_to_le* routines
+ */
+static __inline__ __le16 __cpu_to_le16(__u16 x) {
+	return __hfi_fswab16(x);
+}
+
+static __inline__ __le32 __cpu_to_le32(__u32 x) {
+	return __hfi_fswab32(x);
+}
+
+static __inline__ __le64 __cpu_to_le64(__u64 x) {
+	return __hfi_fswab64(x);
+}
+
+/*
+ * __le*_to_cpu routines
+ */
+static __inline__ __u16 __le16_to_cpu(__le16 x) {
+	return __hfi_fswab16(x);
+}
+
+static __inline__ __u32 __le32_to_cpu(__le32 x) {
+	return __hfi_fswab32(x);
+}
+
+static __inline__ __u64 __le64_to_cpu(__le64 x) {
+	return __hfi_fswab64(x);
+}
+
+/*
+ * __cpu_to_be* routines
+ */
+static __inline__ __be16 __cpu_to_be16(__u16 x) {
+	return x;
+}
+
+static __inline__ __be32 __cpu_to_be32(__u32 x) {
+	return x;
+}
+
+static __inline__ __be64 __cpu_to_be64(__u64 x) {
+	return x;
+}
+
+/*
+ * __be*_to_cpu routines
+ */
+static __inline__ __u16 __be16_to_cpu(__be16 x) {
+	return x;
+}
+
+static __inline__ __u32 __be32_to_cpu(__be32 x) {
+	return x;
+}
+
+static __inline__ __u64 __be64_to_cpu(__be64 x) {
+	return x;
+}
+
+#else
+#	error "unsupported BYTE_ORDER: " #BYTE_ORDER
+#endif
+
+#ifdef __cplusplus
+}				/* extern "C" */
+#endif
+#endif /* OPA_BYTEORDER_H */
diff --git a/deps/libfabric/prov/psm3/psm3/include/opa_debug.h b/deps/libfabric/prov/psm3/psm3/include/opa_debug.h
new file mode 100644
index 0000000000000000000000000000000000000000..749b4ed26c15e6f6b2761833ea5ae02d90c03847
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/include/opa_debug.h
@@ -0,0 +1,115 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef OPA_DEBUG_H
+#define OPA_DEBUG_H
+
+// See opa_udebug.h for macros and comments about these settings
+
+#ifndef _HFI_DEBUGGING		/* debugging enabled or not */
+#define _HFI_DEBUGGING 1
+#endif
+
+#if _HFI_DEBUGGING
+
+/*
+ * Mask values for debugging.  The scheme allows us to compile out any
+ * of the debug tracing stuff, and if compiled in, to enable or disable
+ * dynamically.  This can be set at modprobe time also:
+ *      modprobe hfi.ko hfi_debug=7
+ */
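+/* illustrative: hfi_debug=0x3 enables __HFI_INFO and __HFI_DBG; the
+   default __HFI_DEBUG_DEFAULT is 0x1, i.e. __HFI_INFO only */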
+
+#define __HFI_INFO        0x1	/* generic low verbosity stuff */
+#define __HFI_DBG         0x2	/* generic debug */
+/* leave some low verbosity spots open */
+/* Debug messages related to the connection protocol. */
+#define __HFI_CONNDBG     0x20
+#define __HFI_VERBDBG     0x40	/* very verbose debug */
+#define __HFI_PKTDBG      0x80	/* print packet data */
+/* print process startup (init)/exit messages and important env vars */
+#define __HFI_PROCDBG     0x100
+/* print MR, mmap/nopage stuff, not using VDBG any more */
+#define __HFI_MMDBG       0x200
+/* low-level environment variables */
+#define __HFI_ENVDBG	    0x400
+
+#define __HFI_DEBUG_DEFAULT __HFI_INFO
+#define __HFI_DEBUG_DEFAULT_STR "0x0001"
+
+#else /* _HFI_DEBUGGING */
+
+/*
+ * define all of these even with debugging off, for the few places that do
+ * if(hfi_debug & _HFI_xyzzy), but in a way that will make the
+ * compiler eliminate the code
+ */
+
+#define __HFI_INFO      0x0	/* generic low verbosity stuff */
+#define __HFI_DBG       0x0	/* generic debug */
+#define __HFI_CONNDBG   0x0
+#define __HFI_VERBDBG   0x0	/* very verbose debug */
+#define __HFI_PKTDBG    0x0	/* print packet data */
+#define __HFI_PROCDBG   0x0	/* print process startup (init)/exit messages */
+/* print MR, mmap/nopage stuff, not using VDBG any more */
+#define __HFI_MMDBG     0x0
+
+#define __HFI_DEBUG_DEFAULT __HFI_INFO
+#define __HFI_DEBUG_DEFAULT_STR "0x0000"
+
+#endif /* _HFI_DEBUGGING */
+
+#define __HFI_VERBOSEDBG __HFI_VERBDBG
+
+#endif /* OPA_DEBUG_H */
diff --git a/deps/libfabric/prov/psm3/psm3/include/opa_intf.h b/deps/libfabric/prov/psm3/psm3/include/opa_intf.h
new file mode 100644
index 0000000000000000000000000000000000000000..725418765e49f03686bd874f8343d6f544873103
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/include/opa_intf.h
@@ -0,0 +1,98 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef OPA_INTF_H
+#define OPA_INTF_H
+
+#include <sys/uio.h>
+#include <sys/types.h>
+#include <stdint.h>
+
+#ifdef __inline__
+#undef __inline__
+#endif
+#define __inline__ inline __attribute__((always_inline, unused))
+
+#include "sysdep.h"
+#include "bit_ops.h"
+
+/* these aren't implemented for user mode, which is OK until we multi-thread */
+typedef struct _atomic {
+	uint32_t counter;
+} atomic_t;			/* no atomic_t type in user-land */
+#define atomic_set(a, v) ((a)->counter = (v))
+#define atomic_inc_return(a)  (++(a)->counter)
+
+#if defined(__GNUC__)
+#ifndef likely
+#define likely(x)    __builtin_expect(!!(x), 1L)
+#endif
+#ifndef unlikely
+#define unlikely(x)  __builtin_expect(!!(x), 0L)
+#endif
+#ifndef if_pt
+#define if_pt(cond) if (likely(cond))
+#endif
+#ifndef if_pf
+#define if_pf(cond) if (unlikely(cond))
+#endif
+#define _Pragma_unlikely
+#define _Pragma_likely
+#else
+#error "Unsupported compiler"
+#endif
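+
+/* Example: if_pf annotates a rarely-taken (error) path so the compiler can
+ * lay out the fast path first:
+ *	if_pf (ret < 0)
+ *		return ret;
+ */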
+
+#define yield() sched_yield()
+#endif /* OPA_INTF_H */
diff --git a/deps/libfabric/prov/psm3/psm3/include/opa_queue.h b/deps/libfabric/prov/psm3/psm3/include/opa_queue.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3d9595455ae54539f805ae20614dc9a884f3240
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/include/opa_queue.h
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)queue.h	8.5 (Berkeley) 8/20/94
+ * $FreeBSD: src/sys/sys/queue.h,v 1.32.2.7 2002/04/17 14:21:02 des Exp $
+ */
+
+#ifndef OPA_QUEUE_H_
+#define	OPA_QUEUE_H_
+
+/*
+ * This file defines five types of data structures: singly-linked lists,
+ * singly-linked tail queues, lists, tail queues, and circular queues.
+ *
+ * A singly-linked list is headed by a single forward pointer. The elements
+ * are singly linked for minimum space and pointer manipulation overhead at
+ * the expense of O(n) removal for arbitrary elements. New elements can be
+ * added to the list after an existing element or at the head of the list.
+ * Elements being removed from the head of the list should use the explicit
+ * macro for this purpose for optimum efficiency. A singly-linked list may
+ * only be traversed in the forward direction.  Singly-linked lists are ideal
+ * for applications with large datasets and few or no removals or for
+ * implementing a LIFO queue.
+ *
+ * A singly-linked tail queue is headed by a pair of pointers, one to the
+ * head of the list and the other to the tail of the list. The elements are
+ * singly linked for minimum space and pointer manipulation overhead at the
+ * expense of O(n) removal for arbitrary elements. New elements can be added
+ * to the list after an existing element, at the head of the list, or at the
+ * end of the list. Elements being removed from the head of the tail queue
+ * should use the explicit macro for this purpose for optimum efficiency.
+ * A singly-linked tail queue may only be traversed in the forward direction.
+ * Singly-linked tail queues are ideal for applications with large datasets
+ * and few or no removals or for implementing a FIFO queue.
+ *
+ * A list is headed by a single forward pointer (or an array of forward
+ * pointers for a hash table header). The elements are doubly linked
+ * so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before
+ * or after an existing element or at the head of the list. A list
+ * may only be traversed in the forward direction.
+ *
+ * A tail queue is headed by a pair of pointers, one to the head of the
+ * list and the other to the tail of the list. The elements are doubly
+ * linked so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before or
+ * after an existing element, at the head of the list, or at the end of
+ * the list. A tail queue may be traversed in either direction.
+ *
+ * A circle queue is headed by a pair of pointers, one to the head of the
+ * list and the other to the tail of the list. The elements are doubly
+ * linked so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before or after
+ * an existing element, at the head of the list, or at the end of the list.
+ * A circle queue may be traversed in either direction, but has a more
+ * complex end of list detection.
+ *
+ * For details on the use of these macros, see the queue(3) manual page.
+ *
+ *
+ *			SLIST	LIST	STAILQ	TAILQ	CIRCLEQ
+ * _HEAD		+	+	+	+	+
+ * _HEAD_INITIALIZER	+	+	+	+	+
+ * _ENTRY		+	+	+	+	+
+ * _INIT		+	+	+	+	+
+ * _EMPTY		+	+	+	+	+
+ * _FIRST		+	+	+	+	+
+ * _NEXT		+	+	+	+	+
+ * _PREV		-	-	-	+	+
+ * _LAST		-	-	+	+	+
+ * _FOREACH		+	+	+	+	+
+ * _FOREACH_REVERSE	-	-	-	+	+
+ * _INSERT_HEAD		+	+	+	+	+
+ * _INSERT_BEFORE	-	+	-	+	+
+ * _INSERT_AFTER	+	+	+	+	+
+ * _INSERT_TAIL		-	-	+	+	+
+ * _REMOVE_HEAD		+	-	+	-	-
+ * _REMOVE		+	+	+	+	+
+ *
+ */
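+
+/*
+ * A minimal tail queue usage sketch in the style of the queue(3) manual
+ * page (the struct, field, and function names are illustrative only):
+ *
+ *	struct entry {
+ *		int value;
+ *		TAILQ_ENTRY(entry) entries;
+ *	};
+ *	TAILQ_HEAD(entryhead, entry) head = TAILQ_HEAD_INITIALIZER(head);
+ *
+ *	struct entry *e = malloc(sizeof(*e));
+ *	e->value = 1;
+ *	TAILQ_INSERT_TAIL(&head, e, entries);
+ *	TAILQ_FOREACH(e, &head, entries)
+ *		consume(e->value);
+ */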
+
+/*
+ * Singly-linked List declarations.
+ */
+#define	SLIST_HEAD(name, type)						\
+struct name {								\
+	struct type *slh_first;	/* first element */			\
+}
+
+#define	SLIST_HEAD_INITIALIZER(head)					\
+	{ NULL }
+
+#define	SLIST_ENTRY(type)						\
+struct {								\
+	struct type *sle_next;	/* next element */			\
+}
+
+/*
+ * Singly-linked List functions.
+ */
+#define	SLIST_EMPTY(head)	((head)->slh_first == NULL)
+
+#define	SLIST_FIRST(head)	((head)->slh_first)
+
+#define	SLIST_FOREACH(var, head, field)					\
+	for ((var) = SLIST_FIRST((head));				\
+	    (var);							\
+	    (var) = SLIST_NEXT((var), field))
+
+#define	SLIST_INIT(head) do {						\
+	SLIST_FIRST((head)) = NULL;					\
+} while (0)
+
+#define	SLIST_INSERT_AFTER(slistelm, elm, field) do {			\
+	SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field);	\
+	SLIST_NEXT((slistelm), field) = (elm);				\
+} while (0)
+
+#define	SLIST_INSERT_HEAD(head, elm, field) do {			\
+	SLIST_NEXT((elm), field) = SLIST_FIRST((head));			\
+	SLIST_FIRST((head)) = (elm);					\
+} while (0)
+
+#define	SLIST_NEXT(elm, field)	((elm)->field.sle_next)
+
+#define	SLIST_REMOVE(head, elm, type, field) do {			\
+	if (SLIST_FIRST((head)) == (elm)) {				\
+		SLIST_REMOVE_HEAD((head), field);			\
+	}								\
+	else {								\
+		struct type *curelm = SLIST_FIRST((head));		\
+		while (SLIST_NEXT(curelm, field) != (elm))		\
+			curelm = SLIST_NEXT(curelm, field);		\
+		SLIST_NEXT(curelm, field) =				\
+		    SLIST_NEXT(SLIST_NEXT(curelm, field), field);	\
+	}								\
+} while (0)
+
+#define	SLIST_REMOVE_HEAD(head, field) do {				\
+	SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field);	\
+} while (0)
+
+/*
+ * Singly-linked Tail queue declarations.
+ */
+#define	STAILQ_HEAD(name, type)						\
+struct name {								\
+	struct type *stqh_first;/* first element */			\
+	struct type **stqh_last;/* addr of last next element */		\
+}
+
+#define	STAILQ_HEAD_INITIALIZER(head)					\
+	{ NULL, &(head).stqh_first }
+
+#define	STAILQ_ENTRY(type)						\
+struct {								\
+	struct type *stqe_next;	/* next element */			\
+}
+
+/*
+ * Singly-linked Tail queue functions.
+ */
+#define	STAILQ_EMPTY(head)	((head)->stqh_first == NULL)
+
+#define	STAILQ_FIRST(head)	((head)->stqh_first)
+
+#define	STAILQ_FOREACH(var, head, field)				\
+	for ((var) = STAILQ_FIRST((head));				\
+	   (var);							\
+	   (var) = STAILQ_NEXT((var), field))
+
+#define	STAILQ_INIT(head) do {						\
+	STAILQ_FIRST((head)) = NULL;					\
+	(head)->stqh_last = &STAILQ_FIRST((head));			\
+} while (0)
+
+#define	STAILQ_INSERT_AFTER(head, tqelm, elm, field) do {		\
+	if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\
+		(head)->stqh_last = &STAILQ_NEXT((elm), field);		\
+	STAILQ_NEXT((tqelm), field) = (elm);				\
+} while (0)
+
+#define	STAILQ_INSERT_HEAD(head, elm, field) do {			\
+	if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL)	\
+		(head)->stqh_last = &STAILQ_NEXT((elm), field);		\
+	STAILQ_FIRST((head)) = (elm);					\
+} while (0)
+
+#define	STAILQ_INSERT_TAIL(head, elm, field) do {			\
+	STAILQ_NEXT((elm), field) = NULL;				\
+	*(head)->stqh_last = (elm);					\
+	(head)->stqh_last = &STAILQ_NEXT((elm), field);			\
+} while (0)
+
+#define	STAILQ_LAST(head, type, field)					\
+	(STAILQ_EMPTY(head) ?						\
+		NULL :							\
+		((struct type *)					\
+		((char *)((head)->stqh_last) - offsetof(struct type, field))))
+
+#define	STAILQ_NEXT(elm, field)	((elm)->field.stqe_next)
+
+#define	STAILQ_REMOVE(head, elm, type, field) do {			\
+	if (STAILQ_FIRST((head)) == (elm)) {				\
+		STAILQ_REMOVE_HEAD(head, field);			\
+	}								\
+	else {								\
+		struct type *curelm = STAILQ_FIRST((head));		\
+		while (STAILQ_NEXT(curelm, field) != (elm))		\
+			curelm = STAILQ_NEXT(curelm, field);		\
+		if ((STAILQ_NEXT(curelm, field) =			\
+		     STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\
+			(head)->stqh_last = &STAILQ_NEXT((curelm), field);\
+	}								\
+} while (0)
+
+#define	STAILQ_REMOVE_HEAD(head, field) do {				\
+	if ((STAILQ_FIRST((head)) =					\
+	     STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL)		\
+		(head)->stqh_last = &STAILQ_FIRST((head));		\
+} while (0)
+
+#define	STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do {			\
+	if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL)	\
+		(head)->stqh_last = &STAILQ_FIRST((head));		\
+} while (0)
+
+/*
+ * List declarations.
+ */
+#define	LIST_HEAD(name, type)						\
+struct name {								\
+	struct type *lh_first;	/* first element */			\
+}
+
+#define	LIST_HEAD_INITIALIZER(head)					\
+	{ NULL }
+
+#define	LIST_ENTRY(type)						\
+struct {								\
+	struct type *le_next;	/* next element */			\
+	struct type **le_prev;	/* address of previous next element */	\
+}
+
+/*
+ * List functions.
+ */
+
+#define	LIST_EMPTY(head)	((head)->lh_first == NULL)
+
+#define	LIST_FIRST(head)	((head)->lh_first)
+
+#define	LIST_FOREACH(var, head, field)					\
+	for ((var) = LIST_FIRST((head));				\
+	    (var);							\
+	    (var) = LIST_NEXT((var), field))
+
+#define	LIST_INIT(head) do {						\
+	LIST_FIRST((head)) = NULL;					\
+} while (0)
+
+#define	LIST_INSERT_AFTER(listelm, elm, field) do {			\
+	if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\
+		LIST_NEXT((listelm), field)->field.le_prev =		\
+		    &LIST_NEXT((elm), field);				\
+	LIST_NEXT((listelm), field) = (elm);				\
+	(elm)->field.le_prev = &LIST_NEXT((listelm), field);		\
+} while (0)
+
+#define	LIST_INSERT_BEFORE(listelm, elm, field) do {			\
+	(elm)->field.le_prev = (listelm)->field.le_prev;		\
+	LIST_NEXT((elm), field) = (listelm);				\
+	*(listelm)->field.le_prev = (elm);				\
+	(listelm)->field.le_prev = &LIST_NEXT((elm), field);		\
+} while (0)
+
+#define	LIST_INSERT_HEAD(head, elm, field) do {				\
+	if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL)	\
+		LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\
+	LIST_FIRST((head)) = (elm);					\
+	(elm)->field.le_prev = &LIST_FIRST((head));			\
+} while (0)
+
+#define	LIST_NEXT(elm, field)	((elm)->field.le_next)
+
+#define	LIST_REMOVE(elm, field) do {					\
+	if (LIST_NEXT((elm), field) != NULL)				\
+		LIST_NEXT((elm), field)->field.le_prev =		\
+		    (elm)->field.le_prev;				\
+	*(elm)->field.le_prev = LIST_NEXT((elm), field);		\
+} while (0)
+
+/*
+ * Tail queue declarations.
+ */
+#define	TAILQ_HEAD(name, type)						\
+struct name {								\
+	struct type *tqh_first;	/* first element */			\
+	struct type **tqh_last;	/* addr of last next element */		\
+}
+
+#define	TAILQ_HEAD_INITIALIZER(head)					\
+	{ NULL, &(head).tqh_first }
+
+#define	TAILQ_ENTRY(type)						\
+struct {								\
+	struct type *tqe_next;	/* next element */			\
+	struct type **tqe_prev;	/* address of previous next element */	\
+}
+
+/*
+ * Tail queue functions.
+ */
+#define	TAILQ_EMPTY(head)	((head)->tqh_first == NULL)
+
+#define	TAILQ_FIRST(head)	((head)->tqh_first)
+
+#define	TAILQ_FOREACH(var, head, field)					\
+	for ((var) = TAILQ_FIRST((head));				\
+	    (var);							\
+	    (var) = TAILQ_NEXT((var), field))
+
+#define	TAILQ_FOREACH_REVERSE(var, head, headname, field)		\
+	for ((var) = TAILQ_LAST((head), headname);			\
+	    (var);							\
+	    (var) = TAILQ_PREV((var), headname, field))
+
+#define	TAILQ_INIT(head) do {						\
+	TAILQ_FIRST((head)) = NULL;					\
+	(head)->tqh_last = &TAILQ_FIRST((head));			\
+} while (0)
+
+#define	TAILQ_INSERT_AFTER(head, listelm, elm, field) do {		\
+	if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\
+		TAILQ_NEXT((elm), field)->field.tqe_prev =		\
+		    &TAILQ_NEXT((elm), field);				\
+	else								\
+		(head)->tqh_last = &TAILQ_NEXT((elm), field);		\
+	TAILQ_NEXT((listelm), field) = (elm);				\
+	(elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field);		\
+} while (0)
+
+#define	TAILQ_INSERT_BEFORE(listelm, elm, field) do {			\
+	(elm)->field.tqe_prev = (listelm)->field.tqe_prev;		\
+	TAILQ_NEXT((elm), field) = (listelm);				\
+	*(listelm)->field.tqe_prev = (elm);				\
+	(listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field);		\
+} while (0)
+
+#define	TAILQ_INSERT_HEAD(head, elm, field) do {			\
+	if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL)	\
+		TAILQ_FIRST((head))->field.tqe_prev =			\
+		    &TAILQ_NEXT((elm), field);				\
+	else								\
+		(head)->tqh_last = &TAILQ_NEXT((elm), field);		\
+	TAILQ_FIRST((head)) = (elm);					\
+	(elm)->field.tqe_prev = &TAILQ_FIRST((head));			\
+} while (0)
+
+#define	TAILQ_INSERT_TAIL(head, elm, field) do {			\
+	TAILQ_NEXT((elm), field) = NULL;				\
+	(elm)->field.tqe_prev = (head)->tqh_last;			\
+	*(head)->tqh_last = (elm);					\
+	(head)->tqh_last = &TAILQ_NEXT((elm), field);			\
+} while (0)
+
+#define	TAILQ_LAST(head, headname)					\
+	(*(((struct headname *)((head)->tqh_last))->tqh_last))
+
+#define	TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)
+
+#define	TAILQ_PREV(elm, headname, field)				\
+	(*(((struct headname *)((elm)->field.tqe_prev))->tqh_last))
+
+#define	TAILQ_REMOVE(head, elm, field) do {				\
+	if ((TAILQ_NEXT((elm), field)) != NULL)				\
+		TAILQ_NEXT((elm), field)->field.tqe_prev =		\
+		    (elm)->field.tqe_prev;				\
+	else								\
+		(head)->tqh_last = (elm)->field.tqe_prev;		\
+	*(elm)->field.tqe_prev = TAILQ_NEXT((elm), field);		\
+} while (0)
+
+/*
+ * Circular queue declarations.
+ */
+#define	CIRCLEQ_HEAD(name, type)					\
+struct name {								\
+	struct type *cqh_first;		/* first element */		\
+	struct type *cqh_last;		/* last element */		\
+}
+
+#define	CIRCLEQ_HEAD_INITIALIZER(head)					\
+	{ (void *)&(head), (void *)&(head) }
+
+#define	CIRCLEQ_ENTRY(type)						\
+struct {								\
+	struct type *cqe_next;		/* next element */		\
+	struct type *cqe_prev;		/* previous element */		\
+}
+
+/*
+ * Circular queue functions.
+ */
+#define	CIRCLEQ_EMPTY(head)	((head)->cqh_first == (void *)(head))
+
+#define	CIRCLEQ_FIRST(head)	((head)->cqh_first)
+
+#define	CIRCLEQ_FOREACH(var, head, field)				\
+	for ((var) = CIRCLEQ_FIRST((head));				\
+	    (var) != (void *)(head) || ((var) = NULL);			\
+	    (var) = CIRCLEQ_NEXT((var), field))
+
+#define	CIRCLEQ_FOREACH_REVERSE(var, head, field)			\
+	for ((var) = CIRCLEQ_LAST((head));				\
+	    (var) != (void *)(head) || ((var) = NULL);			\
+	    (var) = CIRCLEQ_PREV((var), field))
+
+#define	CIRCLEQ_INIT(head) do {						\
+	CIRCLEQ_FIRST((head)) = (void *)(head);				\
+	CIRCLEQ_LAST((head)) = (void *)(head);				\
+} while (0)
+
+#define	CIRCLEQ_INSERT_AFTER(head, listelm, elm, field) do {		\
+	CIRCLEQ_NEXT((elm), field) = CIRCLEQ_NEXT((listelm), field);	\
+	CIRCLEQ_PREV((elm), field) = (listelm);				\
+	if (CIRCLEQ_NEXT((listelm), field) == (void *)(head))		\
+		CIRCLEQ_LAST((head)) = (elm);				\
+	else								\
+		CIRCLEQ_PREV(CIRCLEQ_NEXT((listelm), field), field) = (elm);\
+	CIRCLEQ_NEXT((listelm), field) = (elm);				\
+} while (0)
+
+#define	CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) do {		\
+	CIRCLEQ_NEXT((elm), field) = (listelm);				\
+	CIRCLEQ_PREV((elm), field) = CIRCLEQ_PREV((listelm), field);	\
+	if (CIRCLEQ_PREV((listelm), field) == (void *)(head))		\
+		CIRCLEQ_FIRST((head)) = (elm);				\
+	else								\
+		CIRCLEQ_NEXT(CIRCLEQ_PREV((listelm), field), field) = (elm);\
+	CIRCLEQ_PREV((listelm), field) = (elm);				\
+} while (0)
+
+#define	CIRCLEQ_INSERT_HEAD(head, elm, field) do {			\
+	CIRCLEQ_NEXT((elm), field) = CIRCLEQ_FIRST((head));		\
+	CIRCLEQ_PREV((elm), field) = (void *)(head);			\
+	if (CIRCLEQ_LAST((head)) == (void *)(head))			\
+		CIRCLEQ_LAST((head)) = (elm);				\
+	else								\
+		CIRCLEQ_PREV(CIRCLEQ_FIRST((head)), field) = (elm);	\
+	CIRCLEQ_FIRST((head)) = (elm);					\
+} while (0)
+
+#define	CIRCLEQ_INSERT_TAIL(head, elm, field) do {			\
+	CIRCLEQ_NEXT((elm), field) = (void *)(head);			\
+	CIRCLEQ_PREV((elm), field) = CIRCLEQ_LAST((head));		\
+	if (CIRCLEQ_FIRST((head)) == (void *)(head))			\
+		CIRCLEQ_FIRST((head)) = (elm);				\
+	else								\
+		CIRCLEQ_NEXT(CIRCLEQ_LAST((head)), field) = (elm);	\
+	CIRCLEQ_LAST((head)) = (elm);					\
+} while (0)
+
+#define	CIRCLEQ_LAST(head)	((head)->cqh_last)
+
+#define	CIRCLEQ_NEXT(elm, field)	((elm)->field.cqe_next)
+
+#define	CIRCLEQ_PREV(elm, field)	((elm)->field.cqe_prev)
+
+#define	CIRCLEQ_REMOVE(head, elm, field) do {				\
+	if (CIRCLEQ_NEXT((elm), field) == (void *)(head))		\
+		CIRCLEQ_LAST((head)) = CIRCLEQ_PREV((elm), field);	\
+	else								\
+		CIRCLEQ_PREV(CIRCLEQ_NEXT((elm), field), field) =	\
+		    CIRCLEQ_PREV((elm), field);				\
+	if (CIRCLEQ_PREV((elm), field) == (void *)(head))		\
+		CIRCLEQ_FIRST((head)) = CIRCLEQ_NEXT((elm), field);	\
+	else								\
+		CIRCLEQ_NEXT(CIRCLEQ_PREV((elm), field), field) =	\
+		    CIRCLEQ_NEXT((elm), field);				\
+} while (0)
+
+#endif /* !OPA_QUEUE_H_ */
diff --git a/deps/libfabric/prov/psm3/psm3/include/opa_revision.h b/deps/libfabric/prov/psm3/psm3/include/opa_revision.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a288219d68b58a8751aecf3b8cccad376008318
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/include/opa_revision.h
@@ -0,0 +1,64 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef OPA_REVISION_H
+#define OPA_REVISION_H
+
+/* These variables are defined in the _revision.c file,
+   which is generated dynamically when the library is built */
+extern char psmi_hfi_IFS_version[];
+extern char psmi_hfi_build_timestamp[];
+extern char psmi_hfi_sources_checksum[];
+extern char psmi_hfi_git_checksum[];
+
+#endif /* OPA_REVISION_H */
diff --git a/deps/libfabric/prov/psm3/psm3/include/opa_service.h b/deps/libfabric/prov/psm3/psm3/include/opa_service.h
new file mode 100644
index 0000000000000000000000000000000000000000..20bdfc96966a5a1ef34499ee880a017b5726811f
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/include/opa_service.h
@@ -0,0 +1,97 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef OPA_SERVICE_H
+#define OPA_SERVICE_H
+
+/* This file contains all the lowest level routines calling into sysfs */
+/* and qib driver. All other calls are based on these routines. */
+
+#include <libgen.h>
+
+#include "opa_intf.h"
+#include "opa_udebug.h"
+#include "opa_byteorder.h"
+
+/* upper and lower bounds for HFI port numbers */
+#define HFI_MIN_PORT 1
+#define HFI_MAX_PORT 1
+
+/* any unit id to match. */
+#define PSM3_NIC_ANY ((long)-1)
+/* any port num to match. */
+#define PSM3_NIC_PORT_ANY ((long)0)
+
+
+/* sysfs helper routines (only those currently used are exported;
+ * try to avoid using others) */
+
+/* Initializes the following sysfs helper routines.
+   sysfs_init() returns 0 on success, non-zero on error: */
+int sysfs_init(const char *dflt_hfi_class_path);
+
+const char *sysfs_unit_path(int unit_id);
+const char *sysfs_unit_dev_name(int unit_id);
+int sysfs_find_unit(const char *name);
+/* Complementary teardown for sysfs_init() */
+void sysfs_fini(void);
+
+/* read a string value into buff, no more than size bytes.
+   returns the number of bytes read */
+size_t hfi_sysfs_unit_port_read(uint32_t unit, uint32_t port, const char *attr,
+			char *buff, size_t size);
+
+
+int64_t hfi_sysfs_unit_read_node_s64(uint32_t unit);
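+
+/* A typical call sequence (sketch only; dflt_path and the "lid" attribute
+   name are illustrative, not prescribed by this header):
+
+	char buf[64];
+	if (sysfs_init(dflt_path) == 0) {
+		size_t n = hfi_sysfs_unit_port_read(0, HFI_MIN_PORT, "lid",
+						    buf, sizeof(buf));
+		...
+		sysfs_fini();
+	}
+*/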
+
+#endif /* OPA_SERVICE_H */
diff --git a/deps/libfabric/prov/psm3/psm3/include/opa_udebug.h b/deps/libfabric/prov/psm3/psm3/include/opa_udebug.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a2abfa1060d5a2d07684fd12383551f9d3fc5a8
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/include/opa_udebug.h
@@ -0,0 +1,232 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef OPA_UDEBUG_H
+#define OPA_UDEBUG_H
+
+#include <stdio.h>
+#include "opa_debug.h"
+
+// To have a message be unconditionally output for all builds, regardless of
+// env variables, use _HFI_ERROR or _HFI_UNIT_ERROR.
+// All other logging macros are under the control of the user via env variables,
+// and build options can disable them.
+//
+// Other logging calls are only enabled if _HFI_DEBUGGING is defined,
+// in which case _HFI_INFO is also enabled by default (but env can disable it).
+// All others controlled by env variable.
+//
+// Currently opa_debug.h always defines _HFI_DEBUGGING and it is included by
+// opa_udebug.h, so logging is presently enabled in all builds.  At some point
+// we may want to explore a performance optimization and disable logging macros
+// for lower level debug messages in non-debug builds.
+//
+// See psmi_handle_error in psm_error.h.  Use of its PSMI_EP_NO_RETURN option
+// can unconditionally output a message and abort.
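+//
+// Example usage (illustrative values): an unconditional message vs. one
+// gated on the __HFI_INFO mask bit:
+//	_HFI_ERROR("context open failed: %s\n", strerror(err));
+//	_HFI_INFO("using unit %d\n", unit);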
+
+extern unsigned hfi_debug;
+const char *hfi_get_unit_name(int unit);
+extern char *__progname;
+
+static const char hfi_ident_tag[] = "PSM3_IDENTIFY";
+char *hfi_get_mylabel();
+int hfi_get_myrank();	// -1 if unknown
+int hfi_get_myrank_count();	// -1 if unknown
+int hfi_get_mylocalrank();	// -1 if unknown
+int hfi_get_mylocalrank_count();	// -1 if unknown
+
+#if _HFI_DEBUGGING
+
+extern char __hfi_mylabel[];
+void hfi_set_mylabel(char *);
+extern FILE *__hfi_dbgout;
+extern void hfi_dump_buf(uint8_t *buf, uint32_t len);
+#ifdef PSM_CUDA
+extern void hfi_dump_gpu_buf(uint8_t *buf, uint32_t len);
+#endif
+
+#define _HFI_UNIT_ERROR(unit, fmt, ...) \
+	do { \
+		_Pragma_unlikely \
+		printf("%s.%s: " fmt, __hfi_mylabel, __progname, \
+		       ##__VA_ARGS__); \
+	} while (0)
+
+#define _HFI_ERROR(fmt, ...) \
+	do { \
+		_Pragma_unlikely \
+		printf("%s.%s: " fmt, __hfi_mylabel, __progname, \
+		       ##__VA_ARGS__); \
+	} while (0)
+
+#define _HFI_INFO(fmt, ...) \
+	do { \
+		_Pragma_unlikely \
+		if (unlikely(hfi_debug&__HFI_INFO))  \
+			printf("%s.%s: " fmt, __hfi_mylabel, __func__, \
+			       ##__VA_ARGS__); \
+	} while (0)
+
+#define __HFI_PKTDBG_ON unlikely(hfi_debug & __HFI_PKTDBG)
+
+#define __HFI_DBG_WHICH(which, fmt, ...) \
+	do { \
+		_Pragma_unlikely \
+		if (unlikely(hfi_debug&(which))) \
+			fprintf(__hfi_dbgout, "%s.%s: " fmt, __hfi_mylabel, __func__, \
+			       ##__VA_ARGS__); \
+	} while (0)
+
+#define __HFI_DBG_WHICH_NOFUNC(which, fmt, ...) \
+	do { \
+		_Pragma_unlikely \
+		if (unlikely(hfi_debug&(which))) \
+			fprintf(__hfi_dbgout, "%s: " fmt, __hfi_mylabel, \
+			       ##__VA_ARGS__); \
+	} while (0)
+
+#define _HFI_DBG(fmt, ...) __HFI_DBG_WHICH(__HFI_DBG, fmt, ##__VA_ARGS__)
+#define _HFI_CONNDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_CONNDBG, fmt, ##__VA_ARGS__)
+#define _HFI_VDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_VERBDBG, fmt, ##__VA_ARGS__)
+#define _HFI_PDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_PKTDBG, fmt, ##__VA_ARGS__)
+#define _HFI_PRDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_PROCDBG, fmt, ##__VA_ARGS__)
+#define _HFI_ENVDBG(lev, fmt, ...) \
+	__HFI_DBG_WHICH_NOFUNC(					    \
+		(lev == 0) ? __HFI_INFO : __HFI_ENVDBG,\
+		"env " fmt, ##__VA_ARGS__)
+#define _HFI_MMDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_MMDBG, fmt, ##__VA_ARGS__)
+
+/*
+ * Use these macros (_HFI_DBG_ON and _HFI_DBG_ALWAYS) together
+ * for a scope of code preparing debug info for printing; e.g.
+ * if (_HFI_DBG_ON) {
+ *     // put your code here
+ *     _HFI_DBG_ALWAYS(print your results here);
+ * }
+ */
+#define _HFI_DBG_ON unlikely(hfi_debug & __HFI_DBG)
+#define _HFI_DBG_ALWAYS(fmt, ...) \
+	do { \
+		_Pragma_unlikely \
+		fprintf(__hfi_dbgout, "%s: " fmt, __hfi_mylabel, \
+			##__VA_ARGS__); \
+	} while (0)
+
+#define _HFI_CONNDBG_ON unlikely(hfi_debug & __HFI_CONNDBG)
+#define _HFI_CONNDBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__)
+
+#define _HFI_VDBG_ON unlikely(hfi_debug & __HFI_VERBDBG)
+#define _HFI_VDBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__)
+
+#define _HFI_PDBG_ON unlikely(hfi_debug & __HFI_PKTDBG)
+#define _HFI_PDBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__)
+#define _HFI_PDBG_DUMP(buf, len) hfi_dump_buf(buf, len)
+#ifdef PSM_CUDA
+#define _HFI_PDBG_DUMP_GPU(buf, len) hfi_dump_gpu_buf(buf, len)
+#endif
+
+#define _HFI_PRDBG_ON unlikely(hfi_debug & __HFI_PROCDBG)
+#define _HFI_PRDBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__)
+
+#define _HFI_CCADBG_ON unlikely(hfi_debug & __HFI_CCADBG)
+#define _HFI_CCADBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__)
+
+#define _HFI_INFO_ON unlikely(hfi_debug & __HFI_INFO)
+#define _HFI_INFO_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__)
+
+#else /* ! _HFI_DEBUGGING */
+
+#define _HFI_UNIT_ERROR(unit, fmt, ...) \
+	do { \
+		printf("%s: " fmt, "", ##__VA_ARGS__); \
+	} while (0)
+
+#define _HFI_ERROR(fmt, ...) \
+	do { \
+		printf("%s: " fmt, "", ##__VA_ARGS__); \
+	} while (0)
+
+#define _HFI_INFO(fmt, ...)
+
+#define __HFI_PKTDBG_ON 0
+
+#define _HFI_DBG(fmt, ...)
+#define _HFI_PDBG(fmt, ...)
+#define _HFI_PRDBG(fmt, ...)
+#define _HFI_ENVDBG(lev, fmt, ...)
+#define _HFI_CONNDBG(fmt, ...)
+#define _HFI_VDBG(fmt, ...)
+#define _HFI_MMDBG(fmt, ...)
+
+#define _HFI_DBG_ON 0
+#define _HFI_DBG_ALWAYS(fmt, ...)
+#define _HFI_CONNDBG_ON 0
+#define _HFI_CONNDBG_ALWAYS(fmt, ...)
+#define _HFI_VDBG_ON 0
+#define _HFI_VDBG_ALWAYS(fmt, ...)
+#define _HFI_PRDBG_ON 0
+#define _HFI_PRDBG_ALWAYS(fmt, ...)
+#define _HFI_PDBG_DUMP(buf, len)
+#ifdef PSM_CUDA
+#define _HFI_PDBG_DUMP_GPU(buf, len)
+#endif
+#define _HFI_CCADBG_ON 0
+#define _HFI_CCADBG_ALWAYS(fmt, ...)
+#define _HFI_INFO_ON 0
+#define _HFI_INFO_ALWAYS(fmt, ...)
+
+#endif /* _HFI_DEBUGGING */
+
+#endif /* OPA_UDEBUG_H */
diff --git a/deps/libfabric/prov/psm3/psm3/include/opa_user.h b/deps/libfabric/prov/psm3/psm3/include/opa_user.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c15567b9f5a20b5808d9a90f27d0b4544b41a7d
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/include/opa_user.h
@@ -0,0 +1,268 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef OPA_USER_H
+#define OPA_USER_H
+
+/* This file contains all of the data structures and routines that are
+   publicly visible and usable (to low level infrastructure code; it is
+   not expected that any application, or even normal application-level library,
+   will ever need to use any of this).
+
+   Additional entry points and data structures that are used by these routines
+   may be referenced in this file, but they should not be generally available;
+   they are visible here only to allow use in inlined functions.  Any variable,
+   data structure, or function that starts with a leading "_" is in this
+   category.
+*/
+
+/* Include header files we need that are unlikely to otherwise be needed by */
+/* programs. */
+#include <stddef.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/user.h>
+#include <syslog.h>
+#include "opa_intf.h"
+#include "opa_byteorder.h"
+#include "opa_udebug.h"
+#include "opa_service.h"
+
+#ifndef PACK_SUFFIX
+/* XXX gcc only */
+#define PACK_SUFFIX __attribute__((packed))
+#endif
+
+#define HFI_TF_NFLOWS                       32
+
+// The sender uses an RDMA Write with Immediate.  The immediate data
+// carries the receiver's desc genc and idx from which the receiver can
+// locate the ips_tid_recv_desc
+// we have 16 bits of genc and 5 bits of desc_idx (max of HFI_TF_NFLOWS).
+// leaving up to 11 bits for dest_rv_idx for RNDV_MOD (we use 9)
+// so desc_idx could grow to 7 bits if needed
+#define RV_INDEX_BITS 9
+#define RDMA_PACK_IMMED(desc_genc, desc_idx, dest_rv_idx) \
+		((((uint32_t)(desc_genc))&0xffff) \
+		| ((((uint32_t)(desc_idx))&0x7f) << 16) \
+		| ((dest_rv_idx) << (32-RV_INDEX_BITS)))
+#define RDMA_UNPACK_IMMED_GENC(immed) ((immed) & 0xffff)
+#define RDMA_UNPACK_IMMED_IDX(immed) (((immed) >> 16) & 0x7f)
+#define RDMA_UNPACK_IMMED_RV_IDX(immed) ((immed) >> (32-RV_INDEX_BITS))
+#define RDMA_IMMED_DESC_MASK 0x7fffff // mask for desc genc and desc idx
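+// Worked example: RDMA_PACK_IMMED(0x1234, 3, 5) yields
+// 0x1234 | (3 << 16) | (5 << 23) = 0x02831234, and the three
+// RDMA_UNPACK_IMMED_* macros recover 0x1234, 3, and 5 from that value.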
+
+// source of the immediate callback
+#define RDMA_IMMED_USER_RC 0	// from a user space RC QP
+#define RDMA_IMMED_RV 1			// from RV module kernel QP
+
+/* IB - LRH header consts */
+#define HFI_LRH_BTH 0x0002	/* 1. word of IB LRH - next header: BTH */
+#define HFI_LRH_SC_SHIFT 12
+#define HFI_LRH_SC_MASK 0xf
+#define HFI_LRH_SL_SHIFT 4
+#define HFI_LRH_SL_MASK 0xf
+#define HFI_LRH_PKTLEN_MASK 0xfff
+
+/* IB - BTH header consts */
+// bth[0]
+#define HFI_BTH_OPCODE_SHIFT 24
+#define HFI_BTH_OPCODE_MASK 0xff
+// bth[1]
+#define HFI_BTH_FLOWID_SHIFT 11
+#define HFI_BTH_FLOWID_MASK 0x1f
+// bth[2]
+#define HFI_BTH_SEQ_SHIFT 0
+#define HFI_BTH_SEQ_MASK 0x7ff
+#define HFI_BTH_GEN_SHIFT 11
+#define HFI_BTH_GEN_MASK 0xfffff
+#define HFI_BTH_ACK_SHIFT 31
+
+/* KDETH header consts */
+#define HFI_KHDR_OFFSET_MASK 0x7fff
+#define HFI_KHDR_OM_SHIFT 15
+#define HFI_KHDR_TID_SHIFT 16
+#define HFI_KHDR_TID_MASK 0x3ff
+#define HFI_KHDR_TIDCTRL_SHIFT 26
+#define HFI_KHDR_TIDCTRL_MASK 0x3
+#define HFI_KHDR_INTR_SHIFT 28
+#define HFI_KHDR_SH_SHIFT 29
+#define HFI_KHDR_KVER_SHIFT 30
+#define HFI_KHDR_KVER_MASK 0x3
+
+#define HFI_KHDR_MSGSEQ_MASK 0xffff
+#define HFI_KHDR_TINYLEN_MASK 0xf
+#define HFI_KHDR_TINYLEN_SHIFT 16
+
+#define GET_HFI_KHDR_TIDCTRL(val) \
+	(((val) >> HFI_KHDR_TIDCTRL_SHIFT) & \
+	HFI_KHDR_TIDCTRL_MASK)
+
+#ifdef PSM_CUDA
+extern int is_driver_gpudirect_enabled;
+
+#define PSMI_IS_DRIVER_GPUDIRECT_ENABLED  likely(is_driver_gpudirect_enabled)
+#define PSMI_IS_DRIVER_GPUDIRECT_DISABLED unlikely(!is_driver_gpudirect_enabled)
+#endif
+
+/* hfi kdeth header format */
+struct hfi_kdeth {
+	__u32 kdeth0;
+
+	union {
+		struct {
+			__u16 job_key;	// unused for UD/UDP
+			__u16 hcrc;	// unused for UD/UDP
+		};
+		__u32 kdeth1;
+	};
+} PACK_SUFFIX;
+
+/* misc. */
+#define HFI_CRC_SIZE_IN_BYTES 4
+
+//#define HFI_DEFAULT_SERVICE_ID 0 /* let rv module decide */
+#define HFI_DEFAULT_SERVICE_ID 0x1000125500000001ULL
+#define HFI_DEFAULT_P_KEY 0      /* use slot 0 as default */
+
+#if 0
+#define HFI_PERMISSIVE_LID 0xFFFF
+#define HFI_AETH_CREDIT_SHIFT 24
+#define HFI_AETH_CREDIT_MASK 0x1F
+#define HFI_AETH_CREDIT_INVAL 0x1F
+#define HFI_PSN_MASK 0xFFFFFF
+#define HFI_MSN_MASK 0xFFFFFF
+#define HFI_QPN_MASK 0xFFFFFF
+#define HFI_MULTICAST_LID_BASE 0xC000
+#define HFI_MULTICAST_QPN 0xFFFFFF
+#endif
+
+/* Receive Header Queue: receive type (from hfi) */
+#define RCVHQ_RCV_TYPE_EXPECTED  0
+#define RCVHQ_RCV_TYPE_EAGER     1
+#define RCVHQ_RCV_TYPE_NON_KD    2
+#define RCVHQ_RCV_TYPE_ERROR     3
+
+/* OPA PSM assumes that the message header is always 56 bytes. */
+#define HFI_MESSAGE_HDR_SIZE	56
+
+/* interval timing routines */
+/* Convert a count of cycles to elapsed nanoseconds */
+/* this is only accurate for reasonably large numbers of cycles (at least tens)
+*/
+static __inline__ uint64_t cycles_to_nanosecs(uint64_t)
+					  __attribute__ ((always_inline));
+/* convert elapsed nanoseconds to elapsed cycles */
+/* this is only accurate for reasonably large numbers of nsecs (at least tens)
+*/
+static __inline__ uint64_t nanosecs_to_cycles(uint64_t)
+					  __attribute__ ((always_inline));
+
+/* Statistics maintained by the driver */
+const char *hfi_get_next_name(char **names);
+int hfi_get_stats_names_count(void);
+/* Counters maintained in the chip, globally, and per-prot */
+int hfi_get_ctrs_unit_names_count(int unitno);
+int hfi_get_ctrs_port_names_count(int unitno);
+/* Convert Timeout value from usec to
+ * timeout_mult where usec = 4.096usec * 2^timeout_mult
+ */
+uint8_t timeout_usec_to_mult(uint64_t timeout_us);
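+/* e.g. ~1000us corresponds to timeout_mult 8 (4.096us * 2^8 = 1048.576us),
+ * assuming the conversion rounds up so the requested timeout is not cut short */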
+
+uint64_t hfi_get_single_unitctr(int unit, const char *attr, uint64_t *s);
+int hfi_get_single_portctr(int unit, int port, const char *attr, uint64_t *c);
+void hfi_release_names(char *namep);
+
+/* Syslog wrapper
+
+   level is one of LOG_EMERG, LOG_ALERT, LOG_CRIT, LOG_ERR, LOG_WARNING,
+   LOG_NOTICE, LOG_INFO, LOG_DEBUG.
+
+   prefix should be a short string to describe which part of the software stack
+   is using syslog, i.e. "PSM", "mpi", "mpirun".
+*/
+void hfi_syslog(const char *prefix, int to_console, int level,
+		const char *format, ...)
+		__attribute__((format(printf, 4, 5)));
+
+void hfi_vsyslog(const char *prefix, int to_console, int level,
+		 const char *format, va_list ap);
+
+/*
+ * Copy routine that may copy a byte multiple times but is optimized for throughput.
+ * This is not safe to use for PIO routines where we want a guarantee that a
+ * byte is only copied/moved across the bus once.
+ */
+void hfi_dwordcpy(volatile uint32_t *dest, const uint32_t *src,
+		  uint32_t ndwords);
+void hfi_qwordcpy(volatile uint64_t *dest, const uint64_t *src,
+		  uint32_t nqwords);
+
+extern uint32_t __hfi_pico_per_cycle;	/* only for use in these functions */
+
+/* this is only accurate for reasonably large numbers of cycles (at least tens) */
+static __inline__ uint64_t cycles_to_nanosecs(uint64_t cycs)
+{
+	return (__hfi_pico_per_cycle * cycs) / 1000ULL;
+}
+
+/* this is only accurate for reasonably large numbers of nsecs (at least tens) */
+static __inline__ uint64_t nanosecs_to_cycles(uint64_t ns)
+{
+	return (ns * 1000ULL) / __hfi_pico_per_cycle;
+}
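+
+/* Worked example: on a hypothetical 2.5 GHz part __hfi_pico_per_cycle is 400,
+ * so cycles_to_nanosecs(1000) = (400 * 1000) / 1000 = 400ns and
+ * nanosecs_to_cycles(400) = (400 * 1000) / 400 = 1000 cycles. */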
+
+#endif /* OPA_USER_H */
diff --git a/deps/libfabric/prov/psm3/psm3/include/psm2_mock_testing.h b/deps/libfabric/prov/psm3/psm3/include/psm2_mock_testing.h
new file mode 100644
index 0000000000000000000000000000000000000000..d1e9bff44c8501b4628101e1c740d4d551b808e4
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/include/psm2_mock_testing.h
@@ -0,0 +1,176 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef PSM2_MOCK_TESTING_H
+#define PSM2_MOCK_TESTING_H
+
+/* PSM2_MOCK_TESTING being defined flips a couple of switches so that a
+ * testable version of libpsm2.so is built. It'll make properly annotated
+ * static functions be non-static, visible to the outside. Also, all mockable
+ * functions will be replaced with function pointers which will originally
+ * point to the actual implementation. However, those function pointers might
+ * be reset by the test code, thus allowing for mocking selected PSM2 functions
+ * for the purpose of the test.
+ *
+ * So far the following utilities have been introduced for enabling a
+ * conditional compilation of the testable vs. production version of the library:
+ *  - ustatic: toggles function visibility
+ *  - MOCKABLE(): decorates function name so that it is visible after being mocked
+ *  - MOCK_DCL_EPILOGUE(): declares a function pointer which will be the seam
+ *        for mocking a function
+ *  - MOCK_DEF_EPILOGUE(): defines a function pointer which will be the seam
+ *        for mocking a function
+ *
+ * If the declaration and definition of a static function @c foo reside in
+ * different files, this would be the common use case:
+ *
+ * @code
+ * // somefile.c:
+ * int MOCKABLE(foo)();
+ * MOCK_DCL_EPILOGUE(foo);
+ *
+ * // otherfile.c:
+ * int MOCKABLE(foo)() {
+ * 	printf("I am the original foo!\n");
+ * }
+ * MOCK_DEF_EPILOGUE(foo);
+ * @endcode
+ *
+ * If the production version of the library is being built, the following code
+ * would result:
+ * @code
+ * // somefile.c:
+ * int foo();
+ *
+ * // otherfile.c:
+ * int foo() {
+ * 	printf("I am the original foo!\n");
+ * }
+ * @endcode
+ *
+ * On the other hand, if a testable version of the library is being built, it
+ * would produce the following code:
+ * @code
+ * // somefile.c:
+ * int foo_original_();
+ * extern typeof(& foo_original_) foo;
+ *
+ * // otherfile.c:
+ * int foo_original_() {
+ * 	printf("I am the original foo!\n");
+ * }
+ * typeof(& foo_original_) foo = foo_original_;
+ * @endcode
+ *
+ * If the function to be mocked is a static function residing in the header,
+ * the following syntax would be used:
+ * @code
+ * // somefile.c:
+ * ustatic int MOCKABLE(foo)() {
+ * 	printf("I am the original foo!\n");
+ * }
+ * MOCK_DCL_EPILOGUE(foo);
+ * MOCK_DEF_EPILOGUE(foo);
+ * @endcode
+ *
+ * If the production version of the library is being built, the following code
+ * would result:
+ * @code
+ * // somefile.c:
+ * static int foo() {
+ * 	printf("I am the original foo!\n");
+ * }
+ * @endcode
+ *
+ * Similarly, if a testable version of the library is being built, it would
+ * produce the following code:
+ * @code
+ * // somefile.c:
+ * int foo_original_();
+ * extern typeof(& foo_original_) foo;
+ * typeof(& foo_original_) foo = foo_original_;
+ * @endcode
+ */
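+/*
+ * In a test built with PSM2_MOCK_TESTING, the seam can then be swapped at
+ * run time (sketch; my_fake_foo is illustrative):
+ * @code
+ * int my_fake_foo() {
+ * 	printf("I am the mocked foo!\n");
+ * }
+ *
+ * void test_setup() {
+ * 	foo = my_fake_foo;
+ * }
+ *
+ * void test_teardown() {
+ * 	foo = foo_original_;
+ * }
+ * @endcode
+ */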
+#ifndef PSM2_MOCK_TESTING
+
+/* If no testing is being done, ustatic resolves to regular "static" */
+#define ustatic static
+/* If no testing is being done, no indirection is introduced */
+#define MOCKABLE(fname) fname
+/* If no testing is being done, no declaration epilogue is needed */
+#define MOCK_DCL_EPILOGUE(fname)
+/* If no testing is being done, no definition epilogue is needed */
+#define MOCK_DEF_EPILOGUE(fname)
+
+#else /* ndef PSM2_MOCK_TESTING */
+
+/* For the testable version, all _ustatic_ function will NOT be static */
+#define ustatic
+/* TODO override inline directives in the same fashion as static */
+/* For the testable version, the actual implementation function is renamed */
+#define MOCKABLE(x) x ## _original_
+/* For the testable version, we declare the function pointer which will be the
+ * point of indirection for calls to that function. It must be declared after
+ * the declaration of the actual function.
+ */
+#define MOCK_DCL_EPILOGUE(x) extern typeof(& x ## _original_) x;
+/* For the testable version, we define the function pointer which will be the
+ * point of indirection for calls to that function. It must be declared after
+ * the definition of the actual function.
+ */
+#define MOCK_DEF_EPILOGUE(x) typeof(& x ## _original_) x = x ## _original_;
+
+#endif /* ndef PSM2_MOCK_TESTING */
+
+#endif /* PSM2_MOCK_TESTING_H */
+
diff --git a/deps/libfabric/prov/psm3/psm3/include/psm3_rbtree.c b/deps/libfabric/prov/psm3/psm3/include/psm3_rbtree.c
new file mode 100644
index 0000000000000000000000000000000000000000..b79f135296f1a813a0127ea21ec1d00ffc69c5e5
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/include/psm3_rbtree.c
@@ -0,0 +1,743 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+/*
+ * Abstract:
+ *	Implementation of quick map, a binary tree where the caller always provides
+ *	all necessary storage.
+ *
+ * Environment:
+ *	All
+ *
+ * $Revision$
+ */
+
+
+/*****************************************************************************
+*
+* Map
+*
+* Map is an associative array.  By providing a key, the caller can retrieve
+* an object from the map.  All objects in the map have an associated key,
+* as specified by the caller when the object was inserted into the map.
+* In addition to random access, the caller can traverse the map much like
+* a linked list, either forwards from the first object or backwards from
+* the last object.  The objects in the map are always traversed in
+* order since the nodes are stored sorted.
+*
+* This implementation of Map uses a red black tree verified against
+* Cormen-Leiserson-Rivest text, McGraw-Hill Edition, fourteenth
+* printing, 1994.
+*
+*****************************************************************************/
+
+#include <string.h> /* for memset declaration */
+
+// RBTREE_CMP should be a comparator, i.e. RBTREE_CMP(a, b) should evaluate to
+// -1, 0, or 1 depending on whether a < b, a == b, or a > b, respectively.
+#ifdef RBTREE_CMP
+
+#if defined(RBTREE_GET_LEFTMOST) || defined(RBTREE_GET_RIGHTMOST)
+#error Cannot define both RBTREE_CMP and RBTREE_GET_(LEFT|RIGHT)MOST
+#endif
+
+#elif !defined ( RBTREE_GET_LEFTMOST )       || \
+	! defined ( RBTREE_GET_RIGHTMOST ) || \
+	! defined ( RBTREE_MAP_COUNT )     || \
+	! defined ( RBTREE_ASSERT )
+#error "You must define RBTREE_GET_LEFTMOST and RBTREE_GET_RIGHTMOST and \
+        RBTREE_MAP_COUNT and RBTREE_ASSERT before including rbtree.c"
+
+#endif /* RBTREE_CMP */
+
+#define IN /* nothing */
+
+/******************************************************************************
+*******************************************************************************
+**************                                                     ************
+**************			 IMPLEMENTATION OF QUICK MAP       ************
+**************                                                     ************
+*******************************************************************************
+******************************************************************************/
+
+/* Forward declarations: */
+static void ips_cl_qmap_init(
+				IN	cl_qmap_t		*p_map,
+				IN	cl_map_item_t* const	root,
+				IN	cl_map_item_t* const	nil);
+static void ips_cl_qmap_insert_item(
+				IN	cl_qmap_t* const	p_map,
+				IN	cl_map_item_t* const	p_item);
+static void ips_cl_qmap_remove_item(
+				IN	cl_qmap_t* const	p_map,
+				IN	cl_map_item_t* const	p_item);
+static cl_map_item_t* ips_cl_qmap_successor(
+				IN	cl_qmap_t* const	p_map,
+				IN	const cl_map_item_t*	p_item);
+
+
+#ifndef RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR
+static cl_map_item_t* ips_cl_qmap_predecessor(
+				IN	cl_qmap_t* const	p_map,
+				IN	const cl_map_item_t*	p_item);
+#endif
+
+#if defined(RBTREE_GET_LEFTMOST)
+static cl_map_item_t* ips_cl_qmap_search(
+				IN	cl_qmap_t* const	p_map,
+				IN	unsigned long		start,
+				IN	unsigned long		end);
+#else
+static cl_map_item_t* ips_cl_qmap_searchv(
+				cl_qmap_t* const	p_map,
+				const RBTREE_MI_PL *key);
+#endif
+
+/*
+ * Get the root.
+ */
+static inline cl_map_item_t*
+__cl_map_root(
+	IN	const cl_qmap_t* const	p_map )
+{
+	RBTREE_ASSERT( p_map );
+	return( p_map->root->p_left );
+}
+
+
+/*
+ * Returns whether a given item is on the left of its parent.
+ */
+static int
+__cl_map_is_left_child(
+	IN	const cl_map_item_t* const	p_item )
+{
+	RBTREE_ASSERT( p_item );
+	RBTREE_ASSERT( p_item->p_up );
+	RBTREE_ASSERT( p_item->p_up != p_item );
+
+	return( p_item->p_up->p_left == p_item );
+}
+
+
+/*
+ * Retrieve the pointer to the parent's pointer to an item.
+ */
+static cl_map_item_t**
+__cl_map_get_parent_ptr_to_item(
+	IN	cl_map_item_t* const	p_item )
+{
+	RBTREE_ASSERT( p_item );
+	RBTREE_ASSERT( p_item->p_up );
+	RBTREE_ASSERT( p_item->p_up != p_item );
+
+	if( __cl_map_is_left_child( p_item ) )
+		return( &p_item->p_up->p_left );
+
+	RBTREE_ASSERT( p_item->p_up->p_right == p_item );
+	return( &p_item->p_up->p_right );
+}
+
+
+/*
+ * Rotate a node to the left.  This rotation affects the least number of links
+ * between nodes and brings the level of C up by one while increasing the depth
+ * of A by one.  Note that the links to/from W, X, Y, and Z are not affected.
+ *
+ *        R                 R
+ *        |                 |
+ *        A                 C
+ *      /   \             /   \
+ *    W       C         A       Z
+ *           / \       / \
+ *          B   Z     W   B
+ *         / \           / \
+ *        X   Y         X   Y
+ */
+static void
+__cl_map_rot_left(
+	IN	cl_qmap_t* const		p_map,
+	IN	cl_map_item_t* const	p_item )
+{
+	cl_map_item_t	**pp_root;
+
+	RBTREE_ASSERT( p_map );
+	RBTREE_ASSERT( p_item );
+	RBTREE_ASSERT( p_item->p_right != p_map->nil_item );
+
+	pp_root = __cl_map_get_parent_ptr_to_item( p_item );
+
+	/* Point R to C instead of A. */
+	*pp_root = p_item->p_right;
+	/* Set C's parent to R. */
+	(*pp_root)->p_up = p_item->p_up;
+
+	/* Set A's right to B */
+	p_item->p_right = (*pp_root)->p_left;
+	/*
+	 * Set B's parent to A.  We trap for B being NIL since the
+	 * caller may depend on NIL not changing.
+	 */
+	if( (*pp_root)->p_left != p_map->nil_item )
+		(*pp_root)->p_left->p_up = p_item;
+
+	/* Set C's left to A. */
+	(*pp_root)->p_left = p_item;
+	/* Set A's parent to C. */
+	p_item->p_up = *pp_root;
+}
+
+
+/*
+ * Rotate a node to the right.  This rotation affects the least number of links
+ * between nodes and brings the level of A up by one while increasing the depth
+ * of C by one.  Note that the links to/from W, X, Y, and Z are not affected.
+ *
+ *            R               R
+ *            |               |
+ *            C               A
+ *          /   \           /   \
+ *        A       Z       W       C
+ *       / \                     / \
+ *      W   B                   B   Z
+ *         / \                 / \
+ *        X   Y               X   Y
+ */
+static void
+__cl_map_rot_right(
+	IN	cl_qmap_t* const		p_map,
+	IN	cl_map_item_t* const	p_item )
+{
+	cl_map_item_t	**pp_root;
+
+	RBTREE_ASSERT( p_map );
+	RBTREE_ASSERT( p_item );
+	RBTREE_ASSERT( p_item->p_left != p_map->nil_item );
+
+	/* Point R to A instead of C. */
+	pp_root = __cl_map_get_parent_ptr_to_item( p_item );
+	(*pp_root) = p_item->p_left;
+	/* Set A's parent to R. */
+	(*pp_root)->p_up = p_item->p_up;
+
+	/* Set C's left to B */
+	p_item->p_left = (*pp_root)->p_right;
+	/*
+	 * Set B's parent to C.  We trap for B being NIL since the
+	 * caller may depend on NIL not changing.
+	 */
+	if( (*pp_root)->p_right != p_map->nil_item )
+		(*pp_root)->p_right->p_up = p_item;
+
+	/* Set A's right to C. */
+	(*pp_root)->p_right = p_item;
+	/* Set C's parent to A. */
+	p_item->p_up = *pp_root;
+}
+
+/*
+ * Balance a tree starting at a given item back to the root.
+ */
+static void
+__cl_map_ins_bal(
+	IN	cl_qmap_t* const	p_map,
+	IN	cl_map_item_t*		p_item )
+{
+	cl_map_item_t*		p_grand_uncle;
+
+	RBTREE_ASSERT( p_map );
+	RBTREE_ASSERT( p_item );
+	RBTREE_ASSERT( p_item != p_map->root );
+
+	while( p_item->p_up->color == CL_MAP_RED )
+	{
+		if( __cl_map_is_left_child( p_item->p_up ) )
+		{
+			p_grand_uncle = p_item->p_up->p_up->p_right;
+			RBTREE_ASSERT( p_grand_uncle );
+			if( p_grand_uncle->color == CL_MAP_RED )
+			{
+				p_grand_uncle->color = CL_MAP_BLACK;
+				p_item->p_up->color = CL_MAP_BLACK;
+				p_item->p_up->p_up->color = CL_MAP_RED;
+				p_item = p_item->p_up->p_up;
+				continue;
+			}
+
+			if( !__cl_map_is_left_child( p_item ) )
+			{
+				p_item = p_item->p_up;
+				__cl_map_rot_left( p_map, p_item );
+			}
+			p_item->p_up->color = CL_MAP_BLACK;
+			p_item->p_up->p_up->color = CL_MAP_RED;
+			__cl_map_rot_right( p_map, p_item->p_up->p_up );
+		}
+		else
+		{
+			p_grand_uncle = p_item->p_up->p_up->p_left;
+			RBTREE_ASSERT( p_grand_uncle );
+			if( p_grand_uncle->color == CL_MAP_RED )
+			{
+				p_grand_uncle->color = CL_MAP_BLACK;
+				p_item->p_up->color = CL_MAP_BLACK;
+				p_item->p_up->p_up->color = CL_MAP_RED;
+				p_item = p_item->p_up->p_up;
+				continue;
+			}
+
+			if( __cl_map_is_left_child( p_item ) )
+			{
+				p_item = p_item->p_up;
+				__cl_map_rot_right( p_map, p_item );
+			}
+			p_item->p_up->color = CL_MAP_BLACK;
+			p_item->p_up->p_up->color = CL_MAP_RED;
+			__cl_map_rot_left( p_map, p_item->p_up->p_up );
+		}
+	}
+}
+
+static void ips_cl_qmap_init(
+				IN	cl_qmap_t		*p_map,
+				IN	cl_map_item_t* const	root,
+				IN	cl_map_item_t* const	nil_item)
+{
+	RBTREE_ASSERT( p_map );
+	RBTREE_ASSERT( root );
+	RBTREE_ASSERT( nil_item );
+
+	memset(p_map,0,sizeof(cl_qmap_t));
+
+	p_map->root = root;
+
+	/* setup the RB tree map */
+	p_map->nil_item = nil_item;
+
+	p_map->root->p_up = p_map->root;
+	p_map->root->p_left = p_map->nil_item;
+	p_map->root->p_right = p_map->nil_item;
+	p_map->root->color = CL_MAP_BLACK;
+
+	p_map->nil_item->p_up = p_map->nil_item;
+	p_map->nil_item->p_left = p_map->nil_item;
+	p_map->nil_item->p_right = p_map->nil_item;
+	p_map->nil_item->color = CL_MAP_BLACK;
+}
+
+static void
+ips_cl_qmap_insert_item(
+	IN	cl_qmap_t* const		p_map,
+	IN	cl_map_item_t* const	p_item )
+{
+	cl_map_item_t	*p_insert_at, *p_comp_item;
+	int compare_res = 0;
+
+	RBTREE_ASSERT( p_map );
+	RBTREE_ASSERT( p_item );
+	RBTREE_ASSERT( p_map->root->p_up == p_map->root );
+	RBTREE_ASSERT( p_map->root->color != CL_MAP_RED );
+	RBTREE_ASSERT( p_map->nil_item->color != CL_MAP_RED );
+
+	/* Find the insertion location. */
+	p_insert_at = p_map->root;
+	p_comp_item = __cl_map_root( p_map );
+
+	while( p_comp_item != p_map->nil_item )
+	{
+		p_insert_at = p_comp_item;
+
+		/* Traverse the tree until the correct insertion point is found. */
+#ifdef RBTREE_GET_LEFTMOST
+		if( RBTREE_GET_LEFTMOST(&p_item->payload) < RBTREE_GET_LEFTMOST(&p_insert_at->payload) )
+#else
+		if(RBTREE_CMP(&p_item->payload, &p_insert_at->payload) < 0)
+#endif
+		{
+			p_comp_item = p_insert_at->p_left;
+			compare_res = 1;
+		} else {
+			p_comp_item = p_insert_at->p_right;
+			compare_res = -1;
+		}
+	}
+
+	RBTREE_ASSERT( p_insert_at != p_map->nil_item );
+	RBTREE_ASSERT( p_comp_item == p_map->nil_item );
+
+	/* Insert the item. */
+	p_item->p_left = p_map->nil_item;
+	p_item->p_right = p_map->nil_item;
+	p_item->color = CL_MAP_RED;
+	if( p_insert_at == p_map->root )
+	{
+		p_insert_at->p_left = p_item;
+	}
+	else if( compare_res > 0 ) /* key < p_insert_at->key */
+	{
+		p_insert_at->p_left = p_item;
+	}
+	else
+	{
+		p_insert_at->p_right = p_item;
+	}
+	/* Increase the count. */
+	RBTREE_MAP_COUNT(&p_map->payload)++;
+
+	p_item->p_up = p_insert_at;
+
+	/*
+	 * We have added depth to this section of the tree.
+	 * Rebalance as necessary as we retrace our path through the tree
+	 * and update colors.
+	 */
+	__cl_map_ins_bal( p_map, p_item );
+
+	__cl_map_root( p_map )->color = CL_MAP_BLACK;
+
+	/*
+	 * Note that it is not necessary to re-color the nil node black because all
+	 * red color assignments are made via the p_up pointer, and nil is never
+	 * set as the value of a p_up pointer.
+	 */
+}
+
+static void
+__cl_map_del_bal(
+	IN	cl_qmap_t* const	p_map,
+	IN	cl_map_item_t*		p_item )
+{
+	cl_map_item_t		*p_uncle;
+
+	while( (p_item->color != CL_MAP_RED) && (p_item->p_up != p_map->root) )
+	{
+		if( __cl_map_is_left_child( p_item ) )
+		{
+			p_uncle = p_item->p_up->p_right;
+
+			if( p_uncle->color == CL_MAP_RED )
+			{
+				p_uncle->color = CL_MAP_BLACK;
+				p_item->p_up->color = CL_MAP_RED;
+				__cl_map_rot_left( p_map, p_item->p_up );
+				p_uncle = p_item->p_up->p_right;
+			}
+
+			if( p_uncle->p_right->color != CL_MAP_RED )
+			{
+				if( p_uncle->p_left->color != CL_MAP_RED )
+				{
+					p_uncle->color = CL_MAP_RED;
+					p_item = p_item->p_up;
+					continue;
+				}
+
+				p_uncle->p_left->color = CL_MAP_BLACK;
+				p_uncle->color = CL_MAP_RED;
+				__cl_map_rot_right( p_map, p_uncle );
+				p_uncle = p_item->p_up->p_right;
+			}
+			p_uncle->color = p_item->p_up->color;
+			p_item->p_up->color = CL_MAP_BLACK;
+			p_uncle->p_right->color = CL_MAP_BLACK;
+			__cl_map_rot_left( p_map, p_item->p_up );
+			break;
+		}
+		else
+		{
+			p_uncle = p_item->p_up->p_left;
+
+			if( p_uncle->color == CL_MAP_RED )
+			{
+				p_uncle->color = CL_MAP_BLACK;
+				p_item->p_up->color = CL_MAP_RED;
+				__cl_map_rot_right( p_map, p_item->p_up );
+				p_uncle = p_item->p_up->p_left;
+			}
+
+			if( p_uncle->p_left->color != CL_MAP_RED )
+			{
+				if( p_uncle->p_right->color != CL_MAP_RED )
+				{
+					p_uncle->color = CL_MAP_RED;
+					p_item = p_item->p_up;
+					continue;
+				}
+
+				p_uncle->p_right->color = CL_MAP_BLACK;
+				p_uncle->color = CL_MAP_RED;
+				__cl_map_rot_left( p_map, p_uncle );
+				p_uncle = p_item->p_up->p_left;
+			}
+			p_uncle->color = p_item->p_up->color;
+			p_item->p_up->color = CL_MAP_BLACK;
+			p_uncle->p_left->color = CL_MAP_BLACK;
+			__cl_map_rot_right( p_map, p_item->p_up );
+			break;
+		}
+	}
+	p_item->color = CL_MAP_BLACK;
+}
+
+static void
+ips_cl_qmap_remove_item(
+	IN	cl_qmap_t* const		p_map,
+	IN	cl_map_item_t* const	p_item )
+{
+	cl_map_item_t	*p_child, *p_del_item;
+
+	RBTREE_ASSERT( p_map );
+	RBTREE_ASSERT( p_item );
+
+	if( p_item == p_map->nil_item )
+		return;
+
+	if( (p_item->p_right == p_map->nil_item) || (p_item->p_left == p_map->nil_item ) )
+	{
+		/* The item being removed has children on at most one side. */
+		p_del_item = p_item;
+	}
+	else
+	{
+		/*
+		 * The item being removed has children on both sides.
+		 * We select the item that will replace it.  After removing
+		 * the substitute item and rebalancing, the tree will have the
+		 * correct topology.  Exchanging the substitute for the item
+		 * will finalize the removal.
+		 */
+		p_del_item = ips_cl_qmap_successor(p_map, p_item);
+		RBTREE_ASSERT( p_del_item != p_map->nil_item );
+	}
+
+	RBTREE_MAP_COUNT(&p_map->payload)--;
+
+	/* Get the pointer to the new root's child, if any. */
+	if( p_del_item->p_left != p_map->nil_item )
+		p_child = p_del_item->p_left;
+	else
+		p_child = p_del_item->p_right;
+
+	/*
+	 * This assignment may modify the parent pointer of the nil node.
+	 * This is inconsequential.
+	 */
+	p_child->p_up = p_del_item->p_up;
+	(*__cl_map_get_parent_ptr_to_item( p_del_item )) = p_child;
+
+	if( p_del_item->color != CL_MAP_RED )
+		__cl_map_del_bal( p_map, p_child );
+
+	/*
+	 * Note that the splicing done below does not need to occur before
+	 * the tree is balanced, since the actual topology changes are made by the
+	 * preceding code.  The topology is preserved by the color assignment made
+	 * below (reader should be reminded that p_del_item == p_item in some cases).
+	 */
+	if( p_del_item != p_item )
+	{
+		/*
+		 * Finalize the removal of the specified item by exchanging it with
+		 * the substitute which we removed above.
+		 */
+		p_del_item->p_up = p_item->p_up;
+		p_del_item->p_left = p_item->p_left;
+		p_del_item->p_right = p_item->p_right;
+		(*__cl_map_get_parent_ptr_to_item( p_item )) = p_del_item;
+		p_item->p_right->p_up = p_del_item;
+		p_item->p_left->p_up = p_del_item;
+		p_del_item->color = p_item->color;
+	}
+
+	RBTREE_ASSERT( p_map->nil_item->color != CL_MAP_RED );
+}
+
+static cl_map_item_t *
+ips_cl_qmap_successor(
+	IN	cl_qmap_t* const		p_map,
+	IN	const cl_map_item_t*		p_item )
+{
+	cl_map_item_t	*p_tmp;
+
+	p_tmp = p_item->p_right;
+	if (p_tmp != p_map->nil_item) {
+		while (p_tmp->p_left != p_map->nil_item)
+			p_tmp = p_tmp->p_left;
+		return p_tmp;
+	} else {
+		p_tmp = p_item->p_up;
+		while (p_tmp->p_right == p_item) {
+			p_item = p_tmp;
+			p_tmp = p_tmp->p_up;
+		}
+		if (p_tmp == p_map->root)
+			return p_map->nil_item;
+		return p_tmp;
+	}
+}
+
+// When the includer defines RBTREE_CMP, ips_cl_qmap_search() is not emitted.
+// In that case ips_cl_qmap_predecessor() may never be called, and combined
+// with -Werror -Wunused-function, libpsm2 fails to build.  So we provide a
+// macro to control emitting this function.
+#ifndef RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR
+static cl_map_item_t *
+ips_cl_qmap_predecessor(
+	IN	cl_qmap_t* const		p_map,
+	IN	const cl_map_item_t*		p_item )
+{
+	cl_map_item_t	*p_tmp;
+
+	p_tmp = p_item->p_left;
+	if (p_tmp != p_map->nil_item) {
+		while (p_tmp->p_right != p_map->nil_item)
+			p_tmp = p_tmp->p_right;
+		return p_tmp;
+	} else {
+		p_tmp = p_item->p_up;
+		while (p_tmp->p_left == p_item) {
+			p_item = p_tmp;
+			p_tmp = p_tmp->p_up;
+		}
+		if (p_tmp == p_map->root)
+			return p_map->nil_item;
+		return p_tmp;
+	}
+}
+#endif /* RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR */
+
+#if defined(RBTREE_GET_LEFTMOST)
+/*
+ * Return the first node whose buffer overlaps [start, end), or the nil
+ * item if no node overlaps.
+ */
+static cl_map_item_t *
+ips_cl_qmap_search(cl_qmap_t * const p_map,
+		unsigned long start, unsigned long end)
+{
+	cl_map_item_t *p_item, *p_tmp;
+
+	RBTREE_ASSERT( p_map );
+	p_item = __cl_map_root(p_map);
+
+	while (p_item != p_map->nil_item) {
+		if (start > RBTREE_GET_LEFTMOST(&p_item->payload)) {
+			p_tmp = p_item->p_right;
+			if (p_tmp != p_map->nil_item) {
+				p_item = p_tmp;
+				continue;
+			}
+
+			/*
+			 * p_item is on immediate left side of 'start'.
+			 */
+			if (start >= RBTREE_GET_RIGHTMOST(&p_item->payload)) {
+				/*
+				 * p_item is on immediate right
+				 * side of 'start'.
+				 */
+				p_item = ips_cl_qmap_successor(p_map, p_item);
+				if (p_item != p_map->nil_item &&
+				    end <= RBTREE_GET_LEFTMOST(&p_item->payload))
+					p_item = p_map->nil_item;
+			}
+		} else if (start < RBTREE_GET_LEFTMOST(&p_item->payload)) {
+			p_tmp = p_item->p_left;
+			if (p_tmp != p_map->nil_item) {
+				p_item = p_tmp;
+				continue;
+			}
+
+			/*
+			 * p_tmp is on immediate left side of 'start'.
+			 */
+			p_tmp = ips_cl_qmap_predecessor(p_map, p_item);
+			if (p_tmp == p_map->nil_item ||
+			    (start >= RBTREE_GET_RIGHTMOST(&p_tmp->payload))) {
+				/*
+				 * p_item is on immediate right
+				 * side of 'start'.
+				 */
+				if (end <= RBTREE_GET_LEFTMOST(&p_item->payload))
+					p_item = p_map->nil_item;
+			} else
+				p_item = p_tmp;
+		}
+
+		break;
+	}
+
+
+	return p_item;
+}
+#else /* defined(...LEFTMOST) || defined(...RIGHTMOST) */
+static cl_map_item_t *
+ips_cl_qmap_searchv(cl_qmap_t * const p_map, const RBTREE_MI_PL *key)
+{
+	RBTREE_ASSERT( p_map );
+	cl_map_item_t *p_item = __cl_map_root(p_map);
+
+	while (p_item != p_map->nil_item) {
+		if (RBTREE_CMP(key, &p_item->payload) > 0) {
+			p_item = p_item->p_right;
+		} else if (RBTREE_CMP(key, &p_item->payload) < 0) {
+			p_item = p_item->p_left;
+		} else {
+			break;
+		}
+	}
+
+	return p_item;
+}
+#endif /* defined(...LEFTMOST) || defined(...RIGHTMOST) */
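
Because psm3_rbtree.c is a textual template rather than an ordinary translation unit, a sketch of how an includer instantiates it may help; the payload types, the comparator, and the `demo` function below are illustrative, not taken from PSM3.

```
#include <assert.h>

/* Illustrative payload types: one per node, one per map. */
typedef struct { unsigned long key; } demo_item_pl;
typedef struct { unsigned count; }    demo_map_pl;

#define RBTREE_MI_PL         demo_item_pl
#define RBTREE_MAP_PL        demo_map_pl
#define RBTREE_CMP(a, b)     (((a)->key < (b)->key) ? -1 : \
                              (((a)->key > (b)->key) ? 1 : 0))
#define RBTREE_ASSERT        assert
#define RBTREE_MAP_COUNT(pl) ((pl)->count)
/* ips_cl_qmap_predecessor() is unused in the RBTREE_CMP variant; suppress
 * it so -Werror -Wunused-function stays happy. */
#define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR

#include "psm3_rbtree.h"
#include "psm3_rbtree.c"

static void demo(void)
{
	cl_qmap_t map;
	cl_map_item_t root, nil, node;	/* caller provides all storage */

	ips_cl_qmap_init(&map, &root, &nil);

	node.payload.key = 42;
	ips_cl_qmap_insert_item(&map, &node);

	demo_item_pl key = { 42 };
	assert(ips_cl_qmap_searchv(&map, &key) == &node);
	ips_cl_qmap_remove_item(&map, &node);
}
```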
diff --git a/deps/libfabric/prov/psm3/psm3/include/psm3_rbtree.h b/deps/libfabric/prov/psm3/psm3/include/psm3_rbtree.h
new file mode 100644
index 0000000000000000000000000000000000000000..13245b0d456601d50e460c693a06605f6216b875
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/include/psm3_rbtree.h
@@ -0,0 +1,90 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __RBTREE_H__
+
+#define __RBTREE_H__
+
+#include <stdint.h>
+
+#ifndef RBTREE_MAP_PL
+#error "You must define RBTREE_MAP_PL before including rbtree.h"
+#endif
+
+#ifndef RBTREE_MI_PL
+#error "You must define RBTREE_MI_PL before including rbtree.h"
+#endif
+
+/*
+ * Red-Black tid cache definition.
+ */
+typedef struct _cl_map_item {
+	struct _cl_map_item	*p_left;	/* left pointer */
+	struct _cl_map_item	*p_right;	/* right pointer */
+	struct _cl_map_item	*p_up;		/* up pointer */
+	uint16_t		color;		/* red-black color */
+
+	RBTREE_MI_PL            payload;
+} cl_map_item_t;
+
+typedef struct _cl_qmap {
+	cl_map_item_t		*root;		/* root node pointer */
+	cl_map_item_t		*nil_item;	/* terminator node pointer */
+
+	RBTREE_MAP_PL            payload;
+} cl_qmap_t;
+
+#define CL_MAP_RED   0
+#define CL_MAP_BLACK 1
+
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/libuuid/pack.c b/deps/libfabric/prov/psm3/psm3/libuuid/pack.c
new file mode 100644
index 0000000000000000000000000000000000000000..801b89177c9bc46a9ed8af29128e4d0e25789691
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/libuuid/pack.c
@@ -0,0 +1,69 @@
+/*
+ * Internal routine for packing UUID's
+ *
+ * Copyright (C) 1996, 1997 Theodore Ts'o.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ * WHICH ARE HEREBY DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include <string.h>
+#include <stdint.h>
+#include "psm_user.h"
+#include "psm_uuid.h"
+
+void uuid_pack(const struct uuid *uu, uuid_t ptr)
+{
+	uint32_t	tmp;
+	unsigned char	*out = ptr;
+
+	tmp = uu->time_low;
+	out[3] = (unsigned char) tmp;
+	tmp >>= 8;
+	out[2] = (unsigned char) tmp;
+	tmp >>= 8;
+	out[1] = (unsigned char) tmp;
+	tmp >>= 8;
+	out[0] = (unsigned char) tmp;
+
+	tmp = uu->time_mid;
+	out[5] = (unsigned char) tmp;
+	tmp >>= 8;
+	out[4] = (unsigned char) tmp;
+
+	tmp = uu->time_hi_and_version;
+	out[7] = (unsigned char) tmp;
+	tmp >>= 8;
+	out[6] = (unsigned char) tmp;
+
+	tmp = uu->clock_seq;
+	out[9] = (unsigned char) tmp;
+	tmp >>= 8;
+	out[8] = (unsigned char) tmp;
+
+	memcpy(out+10, uu->node, 6);
+}
+
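For orientation, a small sketch (function name hypothetical) of the byte layout uuid_pack() produces: every multi-byte field is written most-significant byte first, i.e. the RFC 4122 wire order, independent of host endianness.

```
#include <assert.h>
#include "psm_uuid.h"	/* struct uuid, uuid_t, uuid_pack() */

static void pack_layout_demo(void)
{
	struct uuid u = {
		.time_low = 0x12345678,
		.time_mid = 0x9abc,
		.time_hi_and_version = 0xdef0,
		.clock_seq = 0x1122,
		.node = { 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff },
	};
	uuid_t bytes;

	uuid_pack(&u, bytes);

	/* time_low occupies bytes 0..3, big-endian. */
	assert(bytes[0] == 0x12 && bytes[1] == 0x34 &&
	       bytes[2] == 0x56 && bytes[3] == 0x78);
	/* time_mid bytes 4..5, time_hi_and_version bytes 6..7. */
	assert(bytes[4] == 0x9a && bytes[5] == 0xbc);
	assert(bytes[6] == 0xde && bytes[7] == 0xf0);
	/* clock_seq bytes 8..9; node copied verbatim into bytes 10..15. */
	assert(bytes[8] == 0x11 && bytes[9] == 0x22);
	assert(bytes[10] == 0xaa && bytes[15] == 0xff);
}
```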
diff --git a/deps/libfabric/prov/psm3/psm3/libuuid/parse.c b/deps/libfabric/prov/psm3/psm3/libuuid/parse.c
new file mode 100644
index 0000000000000000000000000000000000000000..dd8c2587ba9112f444a1a29c55172fb1dfc91cf1
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/libuuid/parse.c
@@ -0,0 +1,78 @@
+/*
+ * parse.c --- UUID parsing
+ *
+ * Copyright (C) 1996, 1997 Theodore Ts'o.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ * WHICH ARE HEREBY DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+
+#include "psm_user.h"
+#include "psm_uuid.h"
+
+int uuid_parse(const char *in, uuid_t uu)
+{
+	struct uuid	uuid;
+	int 		i;
+	const char	*cp;
+	char		buf[3];
+
+	if (strlen(in) != 36)
+		return -1;
+	for (i=0, cp = in; i <= 36; i++,cp++) {
+		if ((i == 8) || (i == 13) || (i == 18) ||
+		    (i == 23)) {
+			if (*cp == '-')
+				continue;
+			else
+				return -1;
+		}
+		if (i == 36)
+			if (*cp == 0)
+				continue;
+		if (!isxdigit(*cp))
+			return -1;
+	}
+	uuid.time_low = strtoul(in, NULL, 16);
+	uuid.time_mid = strtoul(in+9, NULL, 16);
+	uuid.time_hi_and_version = strtoul(in+14, NULL, 16);
+	uuid.clock_seq = strtoul(in+19, NULL, 16);
+	cp = in+24;
+	buf[2] = 0;
+	for (i=0; i < 6; i++) {
+		buf[0] = *cp++;
+		buf[1] = *cp++;
+		uuid.node[i] = strtoul(buf, NULL, 16);
+	}
+
+	uuid_pack(&uuid, uu);
+	return 0;
+}
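
A brief sketch of uuid_parse()'s contract as implemented above: only the canonical 36-character, four-hyphen form is accepted, and anything else returns -1. The demo function name is hypothetical.

```
#include <assert.h>
#include "psm_uuid.h"	/* uuid_t, uuid_parse() */

static void parse_demo(void)
{
	uuid_t uu;

	/* Canonical 8-4-4-4-12 form parses successfully. */
	assert(uuid_parse("12345678-9abc-def0-1122-aabbccddeeff", uu) == 0);

	/* Wrong length is rejected outright. */
	assert(uuid_parse("12345678-9abc-def0-1122", uu) == -1);

	/* A misplaced hyphen or non-hex digit is rejected too. */
	assert(uuid_parse("12345678x9abc-def0-1122-aabbccddeeff", uu) == -1);
}
```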
diff --git a/deps/libfabric/prov/psm3/psm3/libuuid/psm_uuid.c b/deps/libfabric/prov/psm3/psm3/libuuid/psm_uuid.c
new file mode 100644
index 0000000000000000000000000000000000000000..4db29a69bffd50e353307ed7575f291b1eefe8c7
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/libuuid/psm_uuid.c
@@ -0,0 +1,114 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include <sys/stat.h>
+#include <limits.h>
+#include <fcntl.h>
+#include "psm_user.h"
+#include "psm_uuid.h"
+
+static void psmi_make_drand_uuid(psm2_uuid_t uuid_out)
+{
+	struct drand48_data drand48_data;
+	int i;
+	long int rnum;
+	srand48_r((get_cycles() + getpid()) % LONG_MAX, &drand48_data);
+	for(i=0; i < 16; i++) {
+		lrand48_r(&drand48_data, &rnum);
+		uuid_out[i] = rnum % UCHAR_MAX;
+	}
+}
+
+/* Since libuuid can call srand, we will generate our own uuids */
+void
+__psm2_uuid_generate(psm2_uuid_t uuid_out)
+{
+	PSM2_LOG_MSG("entering");
+	/* Prefer using urandom, fallback to drand48_r */
+	struct stat urandom_stat;
+	size_t nbytes;
+	int fd;
+	if(stat("/dev/urandom", &urandom_stat) != 0) {
+		psmi_make_drand_uuid(uuid_out);
+		return;
+	}
+
+	fd = open("/dev/urandom", O_RDONLY);
+	if(fd == -1) {
+		psmi_make_drand_uuid(uuid_out);
+	} else {
+		nbytes = read(fd, (char *) uuid_out, 16);
+		if(nbytes != 16) {
+			psmi_make_drand_uuid(uuid_out);
+		}
+		close(fd);
+	}
+	PSM2_LOG_MSG("leaving");
+	return;
+}
+PSMI_API_DECL(psm2_uuid_generate)
+
+void
+psmi_uuid_unparse(const uuid_t uu, char *out)
+{
+	uuid_unparse_lower(uu, out);
+}
+
+int
+psmi_uuid_parse(const char *in, uuid_t uu)
+{
+	return uuid_parse(in, uu);
+}
+
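A short usage sketch for the generation path above (the demo function is hypothetical): note that psmi_uuid_unparse() writes the 36-character canonical form plus a terminating NUL, so the output buffer must hold at least 37 bytes.

```
#include <stdio.h>
#include "psm_user.h"
#include "psm_uuid.h"

static void uuid_generate_demo(void)
{
	psm2_uuid_t uu;
	char text[37];	/* 36 characters plus terminating NUL */

	/* Reads /dev/urandom when available, else falls back to drand48. */
	__psm2_uuid_generate(uu);
	psmi_uuid_unparse(uu, text);
	printf("generated uuid: %s\n", text);
}
```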
diff --git a/deps/libfabric/prov/psm3/psm3/libuuid/psm_uuid.h b/deps/libfabric/prov/psm3/psm3/libuuid/psm_uuid.h
new file mode 100644
index 0000000000000000000000000000000000000000..09df044d9cabaa83b590f589e8af8567316ef37f
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/libuuid/psm_uuid.h
@@ -0,0 +1,78 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PSM_UUID_H
+#define _PSM_UUID_H
+struct uuid {
+	uint32_t	time_low;
+	uint16_t	time_mid;
+	uint16_t	time_hi_and_version;
+	uint16_t	clock_seq;
+	uint8_t	node[6];
+};
+
+typedef unsigned char uuid_t[16];
+
+int	    psmi_uuid_parse(const char *in, psm2_uuid_t uu);
+void	    psmi_uuid_unparse(const psm2_uuid_t uuid, char *out);
+int	    psmi_uuid_compare(const psm2_uuid_t uuA, const psm2_uuid_t uuB);
+int uuid_compare(const uuid_t uu1, const uuid_t uu2);
+void uuid_pack(const struct uuid *uu, uuid_t ptr);
+void uuid_unparse(const uuid_t uu, char *out);
+void uuid_unparse_upper(const uuid_t uu, char *out);
+void uuid_unparse_lower(const uuid_t uu, char *out);
+void uuid_unpack(const uuid_t in, struct uuid *uu);
+int uuid_parse(const char *in, uuid_t uu);
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/libuuid/unpack.c b/deps/libfabric/prov/psm3/psm3/libuuid/unpack.c
new file mode 100644
index 0000000000000000000000000000000000000000..26e4394c80cbe5f0ae08d5f146cd6a6f11901f26
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/libuuid/unpack.c
@@ -0,0 +1,63 @@
+/*
+ * Internal routine for unpacking UUID
+ *
+ * Copyright (C) 1996, 1997 Theodore Ts'o.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ * WHICH ARE HEREBY DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include <string.h>
+#include <stdint.h>
+#include "psm_user.h"
+#include "psm_uuid.h"
+
+void uuid_unpack(const uuid_t in, struct uuid *uu)
+{
+	const uint8_t	*ptr = in;
+	uint32_t		tmp;
+
+	tmp = *ptr++;
+	tmp = (tmp << 8) | *ptr++;
+	tmp = (tmp << 8) | *ptr++;
+	tmp = (tmp << 8) | *ptr++;
+	uu->time_low = tmp;
+
+	tmp = *ptr++;
+	tmp = (tmp << 8) | *ptr++;
+	uu->time_mid = tmp;
+
+	tmp = *ptr++;
+	tmp = (tmp << 8) | *ptr++;
+	uu->time_hi_and_version = tmp;
+
+	tmp = *ptr++;
+	tmp = (tmp << 8) | *ptr++;
+	uu->clock_seq = tmp;
+
+	memcpy(uu->node, ptr, 6);
+}
+
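Since uuid_unpack() reassembles each field in the same big-endian order that uuid_pack() emits, the two functions are exact inverses. A round-trip sketch (demo name hypothetical):

```
#include <assert.h>
#include <string.h>
#include "psm_uuid.h"	/* struct uuid, uuid_t, uuid_pack(), uuid_unpack() */

static void pack_unpack_roundtrip_demo(void)
{
	struct uuid in = {
		.time_low = 0xdeadbeef,
		.time_mid = 0x0123,
		.time_hi_and_version = 0x4567,
		.clock_seq = 0x89ab,
		.node = { 1, 2, 3, 4, 5, 6 },
	}, out;
	uuid_t wire;

	uuid_pack(&in, wire);
	uuid_unpack(wire, &out);

	assert(out.time_low == in.time_low);
	assert(out.time_mid == in.time_mid);
	assert(out.time_hi_and_version == in.time_hi_and_version);
	assert(out.clock_seq == in.clock_seq);
	assert(memcmp(out.node, in.node, sizeof(in.node)) == 0);
}
```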
diff --git a/deps/libfabric/prov/psm3/psm3/libuuid/unparse.c b/deps/libfabric/prov/psm3/psm3/libuuid/unparse.c
new file mode 100644
index 0000000000000000000000000000000000000000..d8593797ad5220a89897c87c958f2e2220c03aa0
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/libuuid/unparse.c
@@ -0,0 +1,75 @@
+/*
+ * unparse.c -- convert a UUID to string
+ *
+ * Copyright (C) 1996, 1997 Theodore Ts'o.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ * WHICH ARE HEREBY DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include <stdio.h>
+
+#include "psm_user.h"
+#include "psm_uuid.h"
+
+static const char *fmt_lower =
+	"%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x";
+
+static const char *fmt_upper =
+	"%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X";
+
+#ifdef UUID_UNPARSE_DEFAULT_UPPER
+#define FMT_DEFAULT fmt_upper
+#else
+#define FMT_DEFAULT fmt_lower
+#endif
+
+static void uuid_unparse_x(const uuid_t uu, char *out, const char *fmt)
+{
+	struct uuid uuid;
+
+	uuid_unpack(uu, &uuid);
+	sprintf(out, fmt,
+		uuid.time_low, uuid.time_mid, uuid.time_hi_and_version,
+		uuid.clock_seq >> 8, uuid.clock_seq & 0xFF,
+		uuid.node[0], uuid.node[1], uuid.node[2],
+		uuid.node[3], uuid.node[4], uuid.node[5]);
+}
+
+void uuid_unparse_lower(const uuid_t uu, char *out)
+{
+	uuid_unparse_x(uu, out,	fmt_lower);
+}
+
+void uuid_unparse_upper(const uuid_t uu, char *out)
+{
+	uuid_unparse_x(uu, out,	fmt_upper);
+}
+
+void uuid_unparse(const uuid_t uu, char *out)
+{
+	uuid_unparse_x(uu, out, FMT_DEFAULT);
+}
diff --git a/deps/libfabric/prov/psm3/psm3/mpspawn/mpspawn_stats.h b/deps/libfabric/prov/psm3/psm3/mpspawn/mpspawn_stats.h
new file mode 100644
index 0000000000000000000000000000000000000000..36be6f20f5a86f0445e2ce30d6b5699c80dd7718
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/mpspawn/mpspawn_stats.h
@@ -0,0 +1,135 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _MPSPAWN_STATS_H
+#define _MPSPAWN_STATS_H
+
+#include <math.h>
+
+#define MPSPAWN_STATS_VERSION	1
+
+typedef enum {
+	MPSPAWN_STATS_TYPE_DOUBLE = 0x1,
+#define MPSPAWN_STATS_TYPE_DOUBLE      0x1
+	MPSPAWN_STATS_TYPE_HEADER = 0x2,
+#define MPSPAWN_STATS_TYPE_HEADER      0x2
+	MPSPAWN_STATS_REDUCTION_MAX = 0x1000,
+#define MPSPAWN_STATS_REDUCTION_MAX    0x1000
+	MPSPAWN_STATS_REDUCTION_MIN = 0x2000,
+#define MPSPAWN_STATS_REDUCTION_MIN    0x2000
+	MPSPAWN_STATS_REDUCTION_MEDIAN = 0x4000,
+#define MPSPAWN_STATS_REDUCTION_MEDIAN 0x4000
+	MPSPAWN_STATS_SKIP_IF_ZERO = 0x8000
+#define MPSPAWN_STATS_SKIP_IF_ZERO     0x8000
+} mpspawn_stats_flags;
+
+#define MPSPAWN_STATS_REDUCTION_ALL (MPSPAWN_STATS_REDUCTION_MAX | \
+	    MPSPAWN_STATS_REDUCTION_MIN | MPSPAWN_STATS_REDUCTION_MEDIAN)
+
+#define MPSPAWN_STATS_DOUBLE_TO_U64(arg) (*((uint64_t *) &(arg)))
+#define MPSPAWN_NAN_U64 ((uint64_t) ~0ULL)
+#define MPSPAWN_ISNAN_U64(x)    (((uint64_t)(x)) == MPSPAWN_NAN_U64)
+
+#define MPSPAWN_NAN	    ((uint64_t) ~0ULL)	/* NAN */
+#define MPSPAWN_ISNAN(x)    (isnan(x))
+
+#if 0   // unused code, specific to QLogic MPI
+
+struct mpspawn_stats_add_args;	/* client->mpspawn stats registration */
+struct mpspawn_stats_req_args;	/* mpspawn->client fn callback stats request */
+struct mpspawn_stats_init_args;	/* mpspawn->client "downcall" to register */
+
+/* Clients implement this function to fill in mpspawn request for stats */
+typedef void (*mpspawn_stats_req_fn) (struct mpspawn_stats_req_args *);
+/* mpspawn implements this function to allow clients to register new stats */
+typedef void (*mpspawn_stats_add_fn) (struct mpspawn_stats_add_args *);
+/* mpspawn implements this function to map rank indexes into epaddr structs */
+struct psm2_epaddr;
+typedef struct psm2_epaddr *(*mpspawn_map_epaddr_fn) (int rank);
+
+typedef struct mpspawn_stats_req_args {
+	int version;
+	int num;
+	uint64_t *stats;
+	uint16_t *flags;
+	void *context;
+} mpspawn_stats_req_args_t;
+
+typedef
+struct mpspawn_stats_add_args {
+	int version;
+	int num;
+	char *header;
+	char **desc;
+	uint16_t *flags;
+	mpspawn_stats_req_fn req_fn;
+	void *context;
+} mpspawn_stats_add_args_t;
+
+typedef
+struct mpspawn_stats_init_args {
+	int version;
+	psm2_mq_t mq;		/* initialized mq endpoint */
+	int num_epaddr;		/* number of endpoints in job */
+	mpspawn_stats_add_fn add_fn;	/* function for client to add stats */
+	mpspawn_map_epaddr_fn epaddr_map_fn;
+	const char *stats_types;	/* stats type string mpirun -M */
+} mpspawn_stats_init_args_t;
+
+/* Function in psm exposed to register stats */
+void *psmi_stats_register(struct mpspawn_stats_init_args *args);
+
+#endif
+#endif
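
Each enum member above is shadowed by an identical #define, presumably so the flag values are visible both to the compiler and debugger (as a typed enum) and to the preprocessor. A sketch of a hypothetical consumer exploiting the preprocessor side:

```
#include <stdint.h>
#include "mpspawn_stats.h"	/* the flag values defined above */

/* This #ifdef only works because the header shadows the enum with #defines;
 * a bare enum member would be invisible to the preprocessor. */
#ifdef MPSPAWN_STATS_REDUCTION_MEDIAN
#define DEMO_REDUCTIONS MPSPAWN_STATS_REDUCTION_ALL
#else
#define DEMO_REDUCTIONS (MPSPAWN_STATS_REDUCTION_MAX | \
                         MPSPAWN_STATS_REDUCTION_MIN)
#endif

static uint16_t demo_flags(void)
{
	/* Combine a value type with the reduction set chosen above. */
	return (uint16_t)(MPSPAWN_STATS_TYPE_DOUBLE | DEMO_REDUCTIONS);
}
```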
diff --git a/deps/libfabric/prov/psm3/psm3/opa/opa_debug.c b/deps/libfabric/prov/psm3/psm3/opa/opa_debug.c
new file mode 100644
index 0000000000000000000000000000000000000000..2b9fbda51f68accb61b2f5d2610cbebf0ab4bbaa
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/opa/opa_debug.c
@@ -0,0 +1,479 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+#include <signal.h>
+#include <execinfo.h>
+#include <fcntl.h>
+#include <ucontext.h>
+#include "opa_user.h"
+#include "psm_user.h"
+#include "../psm_log.h"
+
+unsigned hfi_debug = __HFI_DEBUG_DEFAULT;
+char __hfi_mylabel[1024];
+int __hfi_myrank = -1;
+int __hfi_myrank_count = -1;
+int __hfi_mylocalrank = -1;
+int __hfi_mylocalrank_count = -1;
+FILE *__hfi_dbgout;
+static void init_hfi_mylabel(void) __attribute__ ((constructor));
+static void init_hfi_backtrace(void) __attribute__ ((constructor));
+static void init_hfi_dbgfile(void) __attribute__ ((constructor));
+static void fini_hfi_backtrace(void) __attribute__ ((destructor));
+static struct sigaction SIGSEGV_old_act;
+static struct sigaction SIGBUS_old_act;
+static struct sigaction SIGILL_old_act;
+static struct sigaction SIGABRT_old_act;
+static struct sigaction SIGINT_old_act;
+static struct sigaction SIGTERM_old_act;
+#ifdef PSM3_BRAKE_DEBUG
+static void hfi_brake_debug(void) __attribute__ ((constructor));
+
+/*
+  How to use the hfi_brake_debug code:
+
+  1. Build psm with PSM3_BRAKE_DEBUG set in the environment.
+  2. Create a script for your test case (e.g. mpistress?).  In the script
+     make sure to choose a PSM3 brake file that corresponds to a network
+     file system that is common to all hosts where you will run your code.
+     Also, in the script, make sure to propagate the "PSM3_BRAKE_FILE_NAME"
+     env var to all hosts.
+  3. Bring up 3 putty sessions to one of the hosts that your script uses.
+  4. In putty session number 1, touch the PSM3_BRAKE_FILE and sync.
+  5. In putty session number 1, start the script.   You should see messages
+     of the form:
+-bash-4.2$ ./mpistress.0304.sc
+<hostname>:5716 remove the file: "/nfs/user/PSM3_BRAKE"  to continue
+<hostname>:5717 remove the file: "/nfs/user/PSM3_BRAKE"  to continue
+<hostname>:3456 remove the file: "/nfs/user/PSM3_BRAKE"  to continue
+<hostname>:3457 remove the file: "/nfs/user/PSM3_BRAKE"  to continue
+
+     Note that the hostname and process id are shown for all of the processes that are started
+     by your script.
+  6. In putty session 2, bring up gdb, and debug the program that is referenced in your script.
+     For example: /usr/mpi/gcc/openmpi-1.10.2-ofi/tests/intel/mpi_stress
+  7. In putty session 2 / gdb, attach to one of the processes that is shown in putty session 1.
+  8. Note, at this point, you have only one gdb session.  I leave it as an exercise to the reader to
+     determine how to bring up multiple gdb sessions.
+  9. In putty session 3, rm the PSM3_BRAKE_FILE.
+ 10. You are now debugging a live session of psm.
+ */
+
+static void hfi_brake_debug(void)
+{
+	struct stat buff;
+	char hostname[80];
+	const char *hfi_brake_file_name = getenv("PSM3_BRAKE_FILE_NAME");
+	gethostname(hostname, 80);
+	hostname[sizeof(hostname) - 1] = '\0';
+
+	if (!hfi_brake_file_name)
+		hfi_brake_file_name = "/tmp/PSM3_BRAKE_FILE";
+	printf("%s:%d remove the file: \"%s\"  to continue\n",hostname,getpid(),hfi_brake_file_name);
+	while (0 == stat(hfi_brake_file_name, &buff))
+	{
+		printf("%s:pid%d remove the file: \"%s\"  to continue\n",hostname,getpid(),hfi_brake_file_name);
+		sleep(10);
+	}
+	printf("%s:pid%d continuing.\n",hostname,getpid());
+}
+#endif
+
+static void init_hfi_mylabel(void)
+{
+	char hostname[80];
+	char *e;
+	/* By default, try to come up with a decent default label; it will be
+	 * overridden later.  Try getting the rank, and if that's not
+	 * available, revert to the pid. */
+	gethostname(hostname, 80);
+	__hfi_mylabel[0] = '\0';
+	hostname[sizeof(hostname) - 1] = '\0';
+
+#if 0
+	/* DEBUG: Used to selectively test possible NIC selection,
+	 * shared context and shm-only settings */
+	unsetenv("PSC_MPI_NODE_RANK");
+	unsetenv("PSC_MPI_PPN");
+	unsetenv("MPI_LOCALRANKID");
+	unsetenv("MPI_LOCALRANKS");
+#endif
+
+	if ((((e = getenv("PMI_SIZE")) && *e))	// MPICH & IMPI
+	    || (((e = getenv("OMPI_COMM_WORLD_SIZE")) && *e)) // OMPI
+	    || (((e = getenv("MPI_NRANKS")) && *e)) // Platform MPI
+	    || (((e = getenv("MPIRUN_NPROCS")) && *e)) // older MPICH
+	    // N/A || (((e = getenv("PSC_MPI_TBD")) && *e)) // pathscale MPI
+	    || (((e = getenv("SLURM_NTASKS")) && *e)) // SLURM
+	    || (((e = getenv("SLURM_NPROCS")) && *e)) // older SLURM
+	) {
+		char *ep;
+		unsigned long val;
+		val = strtoul(e, &ep, 10);
+		if (ep != e) /* valid conversion */
+			__hfi_myrank_count = val;
+	}
+
+	if ((((e = getenv("MPI_LOCALRANKID")) && *e))	// MPICH and IMPI
+	    || (((e = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) && *e)) // OMPI
+	    || (((e = getenv("MPI_LOCALRANKID")) && *e)) // Platform MPI
+	    // N/A | (((e = getenv("MPIRUN_TBD")) && *e)) // older MPICH
+	    || (((e = getenv("PSC_MPI_NODE_RANK")) && *e)) // pathscale MPI
+	    || (((e = getenv("SLURM_LOCALID")) && *e)) // SLURM
+	) {
+		char *ep;
+		unsigned long val;
+		val = strtoul(e, &ep, 10);
+		if (ep != e) /* valid conversion */
+			__hfi_mylocalrank = val;
+	}
+
+	if ((((e = getenv("MPI_LOCALNRANKS")) && *e))	// MPICH and IMPI
+	    || (((e = getenv("OMPI_COMM_WORLD_LOCAL_SIZE")) && *e)) // OMPI
+	    || (((e = getenv("MPI_LOCALNRANKS")) && *e)) // Platform MPI
+	    // N/A || (((e = getenv("MPIRUN_TBD")) && *e)) // older MPICH
+	    || (((e = getenv("PSC_MPI_PPN")) && *e)) // pathscale MPI
+	    || (((e = getenv("SLURM_NTASKS_PER_NODE")) && *e)) // SLURM
+	) {
+		char *ep;
+		unsigned long val;
+		val = strtoul(e, &ep, 10);
+		if (ep != e) /* valid conversion */
+			__hfi_mylocalrank_count = val;
+	}
+
+	if ((((e = getenv("PMI_RANK")) && *e))	// MPICH and *_SIZE
+	    || (((e = getenv("OMPI_COMM_WORLD_RANK")) && *e)) // OMPI and *_SIZE
+	    || (((e = getenv("MPI_RANKID")) && *e)) // Platform MPI and *_NRANKS
+	    || (((e = getenv("MPIRUN_RANK")) && *e)) // older MPICH and *_NPROCS
+	    || (((e = getenv("PSC_MPI_RANK")) && *e)) // pathscale MPI
+	    || (((e = getenv("SLURM_TASKID")) && *e)) // SLURM
+	    || (((e = getenv("SLURM_PROCID")) && *e)) // SLURM
+	) {
+		char *ep;
+		unsigned long val;
+		val = strtoul(e, &ep, 10);
+		if (ep != e) {	/* valid conversion */
+			snprintf(__hfi_mylabel, sizeof(__hfi_mylabel),
+				"%s:rank%lu", hostname, val);
+			__hfi_myrank = val;
+		}
+	}
+	if (__hfi_mylabel[0] == '\0')
+		snprintf(__hfi_mylabel, sizeof(__hfi_mylabel),
+			"%s:pid%u", hostname, getpid());
+}
+
+/* FIXME: This signal handler does not conform to the posix standards described
+   in 'man 7 signal' due to it calling unsafe functions.
+
+   See 'CALLS UNSAFE FUNCTION' notes below for examples.
+ */
+static void hfi_sighdlr(int sig, siginfo_t *p1, void *ucv)
+{
+	/* we make these static to try and avoid issues caused
+	   by stack overflow that might have gotten us here. */
+	static void *backaddr[128];	/* avoid stack usage */
+	static char buf[150], hname[64], fname[128];
+	static int i, j, fd, id;
+	extern char *__progname;
+	PSM2_LOG_DECLARE_BT_BUFFER();
+
+	/* CALLS UNSAFE FUNCTION when PSM_LOG is defined. */
+	PSM2_LOG_BT(100,__FUNCTION__);
+	/* If this is a SIGINT do not display backtrace. Just invoke exit
+	   handlers */
+	if ((sig == SIGINT) || (sig == SIGTERM))
+		/* CALLS UNSAFE FUNCTION (exit) */
+		exit(1);
+
+	/* CALLS UNSAFE FUNCTION (snprintf) */
+	id = snprintf(buf, sizeof(buf),
+		      "\n%.60s:pid%u terminated with signal %d", __progname,
+		      getpid(), sig);
+	if (ucv) {
+		static ucontext_t *uc;
+		uc = (ucontext_t *) ucv;
+		id += snprintf(buf + id, sizeof(buf) - id, " at PC=%lx SP=%lx",
+#if defined(__x86_64__)
+			       (unsigned long)uc->uc_mcontext.gregs[REG_RIP],
+			       (unsigned long)uc->uc_mcontext.gregs[REG_RSP]);
+#elif defined(__i386__)
+			       (unsigned long)uc->uc_mcontext.gregs[REG_EIP],
+			       (unsigned long)uc->uc_mcontext.gregs[REG_ESP]);
+#else
+			       0ul, 0ul);
+#warning No stack pointer or instruction pointer for this arch
+#endif
+	}
+	id += snprintf(buf + id, sizeof(buf) - id, ". Backtrace:\n");
+	/* CALLS UNSAFE FUNCTION (fprintf) */
+	fprintf(stderr, "%.*s", id, buf);
+
+	i = backtrace(backaddr, sizeof(backaddr) / sizeof(backaddr[0]));
+	if (i > 2)		/* skip ourselves and backtrace */
+		j = 2, i -= j;
+	else
+		j = 0;
+
+	backtrace_symbols_fd(backaddr + j, i, 2);
+	(void)fsync(2);
+
+	/* Try to write it to a file as well, in case the rest doesn't make it
+	   out. Do it second, in case we get a second failure (more likely).
+	   We might eventually want to print some more of the registers to the
+	   btr file, to aid debugging, but not for now.  Truncate the program
+	   name if overly long, so we always get pid and (at least part of)
+	   hostname. */
+	/* CALLS UNSAFE FUNCTION (gethostname) */
+	(void)gethostname(hname, sizeof(hname));
+	hname[sizeof(hname) - 1] = '\0';
+	snprintf(fname, sizeof(fname), "%s.80s-%u,%.32s.btr", __progname,
+		 getpid(), hname);
+	if ((fd = open(fname, O_CREAT | O_WRONLY, 0644)) >= 0) {
+		/* CALLS UNSAFE FUNCTION (fdopen) */
+		FILE *fp = fdopen(fd, "w");
+		if (fp)
+			fprintf(fp, "%.*s", id, buf);
+		backtrace_symbols_fd(backaddr + j, i, fd);
+		if (fp)
+			/* CALLS UNSAFE FUNCTION (fclose) */
+			fclose(fp);
+	}
+	switch (sig){
+        case SIGSEGV:
+                (*SIGSEGV_old_act.sa_sigaction)(sig,p1,ucv);
+                break;
+        case SIGBUS:
+                (*SIGBUS_old_act.sa_sigaction)(sig,p1,ucv);
+                break;
+        case SIGILL:
+                (*SIGILL_old_act.sa_sigaction)(sig,p1,ucv);
+                break;
+        case SIGABRT:
+                (*SIGABRT_old_act.sa_sigaction)(sig,p1,ucv);
+                break;
+        default:
+                break;
+        }
+	exit(1);		/* not _exit(), want atexit handlers to get run */
+}
+
+/* We do this as a constructor so any user program that sets signal handlers
+   for these will override our settings, but we still get backtraces if they
+   don't.
+*/
+static void init_hfi_backtrace(void)
+{
+	/* we need to track memory corruption */
+	static struct sigaction act;	/* easier than memset */
+	act.sa_sigaction = hfi_sighdlr;
+	act.sa_flags = SA_SIGINFO;
+
+	if (getenv("PSM3_BACKTRACE")) {
+		/* opt-in and probably undocumented: setting PSM3_BACKTRACE
+		   enables backtraces by installing these handlers. */
+		(void)sigaction(SIGSEGV, &act, &SIGSEGV_old_act);
+		(void)sigaction(SIGBUS, &act, &SIGBUS_old_act);
+		(void)sigaction(SIGILL, &act, &SIGILL_old_act);
+		(void)sigaction(SIGABRT, &act, &SIGABRT_old_act);
+		(void)sigaction(SIGINT, &act, &SIGINT_old_act);
+		(void)sigaction(SIGTERM, &act, &SIGTERM_old_act);
+	}
+}
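+
+/* Usage note (informal): backtraces are opt-in; setting PSM3_BACKTRACE
+   (any value, e.g. PSM3_BACKTRACE=1) installs the handlers above, and
+   fini_hfi_backtrace() below restores the previous dispositions. */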
+
+/* if PSM3_DEBUG_FILENAME is set in the environment, then all the
+   debug prints (not info and error) will go to that file.
+   %h is expanded to the hostname, and %p to the pid, if present. */
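+/* Informal example: PSM3_DEBUG_FILENAME=/tmp/psm3.%h.%p.log on host
+   "node01" with pid 12345 appends debug output to /tmp/psm3.node01.12345.log. */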
+static void init_hfi_dbgfile(void)
+{
+	char *fname = getenv("PSM3_DEBUG_FILENAME");
+	char *exph, *expp, tbuf[1024];
+	FILE *newf;
+
+	if (!fname) {
+		__hfi_dbgout = stdout;
+		return;
+	}
+	exph = strstr(fname, "%h");	/* hostname */
+	expp = strstr(fname, "%p");	/* pid */
+	if (exph || expp) {
+		int baselen;
+		char hname[256], pid[12];
+		if (exph) {
+			*hname = hname[sizeof(hname) - 1] = 0;
+			gethostname(hname, sizeof(hname) - 1);
+			if (!*hname)
+				strcpy(hname, "[unknown]");
+		}
+		if (expp)
+			snprintf(pid, sizeof(pid), "%d", getpid());
+		if (exph && expp) {
+			if (exph < expp) {
+				baselen = exph - fname;
+				snprintf(tbuf, sizeof(tbuf), "%.*s%s%.*s%s%s",
+					 baselen, fname, hname,
+					 (int)(expp - (exph + 2)), exph + 2,
+					 pid, expp + 2);
+			} else {
+				baselen = expp - fname;
+				snprintf(tbuf, sizeof(tbuf), "%.*s%s%.*s%s%s",
+					 baselen, fname, pid,
+					 (int)(exph - (expp + 2)), expp + 2,
+					 hname, exph + 2);
+			}
+		} else if (exph) {
+			baselen = exph - fname;
+			snprintf(tbuf, sizeof(tbuf), "%.*s%s%s",
+				 baselen, fname, hname, exph + 2);
+		} else {
+			baselen = expp - fname;
+			snprintf(tbuf, sizeof(tbuf), "%.*s%s%s",
+				 baselen, fname, pid, expp + 2);
+		}
+		fname = tbuf;
+	}
+	newf = fopen(fname, "a");
+	if (!newf) {
+		_HFI_ERROR
+		    ("Unable to open \"%s\" for debug output, using stdout: %s\n",
+		     fname, strerror(errno));
+		__hfi_dbgout = stdout;
+	} else {
+		__hfi_dbgout = newf;
+		setlinebuf(__hfi_dbgout);
+	}
+}
+
+void hfi_set_mylabel(char *label)
+{
+	strncpy(__hfi_mylabel, label, sizeof(__hfi_mylabel));
+	__hfi_mylabel[sizeof(__hfi_mylabel)-1] = '\0';
+}
+
+char *hfi_get_mylabel()
+{
+	return __hfi_mylabel;
+}
+
+int hfi_get_myrank()
+{
+	return __hfi_myrank;
+}
+
+int hfi_get_myrank_count()
+{
+	return __hfi_myrank_count;
+}
+
+int hfi_get_mylocalrank()
+{
+	return __hfi_mylocalrank;
+}
+
+int hfi_get_mylocalrank_count()
+{
+	return __hfi_mylocalrank_count;
+}
+
+static void fini_hfi_backtrace(void)
+{
+  if (getenv("PSM3_BACKTRACE")) {
+    (void)sigaction(SIGSEGV, &SIGSEGV_old_act, NULL);
+    (void)sigaction(SIGBUS,  &SIGBUS_old_act, NULL);
+    (void)sigaction(SIGILL,  &SIGILL_old_act, NULL);
+    (void)sigaction(SIGABRT, &SIGABRT_old_act, NULL);
+    (void)sigaction(SIGINT,  &SIGINT_old_act, NULL);
+    (void)sigaction(SIGTERM, &SIGTERM_old_act, NULL);
+  }
+}
+
+void hfi_dump_buf(uint8_t *buf, uint32_t len)
+{
+	int i, j;
+	for (i=0; i<len; i += 16 ) {
+		fprintf(__hfi_dbgout, "%s: 0x%04x:", __hfi_mylabel, i);
+		for (j=0; j<16 && i+j < len; j++)
+			fprintf(__hfi_dbgout, " %02x", (unsigned)buf[i+j]);
+		fprintf(__hfi_dbgout, "\n");
+	}
+}
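+
+/* Illustrative hfi_dump_buf output for a 20-byte buffer, assuming the
+ * label "node01:rank0":
+ *   node01:rank0: 0x0000: 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+ *   node01:rank0: 0x0010: 10 11 12 13
+ */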
+
+#ifdef PSM_CUDA
+void hfi_dump_gpu_buf(uint8_t *buf, uint32_t len)
+{
+	int i, j;
+	uint8_t hbuf[1024];
+
+	for (i=0; i<len; i += 16 ) {
+		fprintf(__hfi_dbgout, "%s: 0x%04x:", __hfi_mylabel, i);
+		if (0 == i % 1024)	/* refill hbuf with the 1 KiB chunk at offset i */
+			PSMI_CUDA_CALL(cuMemcpyDtoH, hbuf, (CUdeviceptr)buf + i,
+						min(len-i, 1024));
+		for (j=0; j<16 && i+j < len; j++)
+			fprintf(__hfi_dbgout, " %02x", (unsigned)hbuf[i%1024+j]);
+		fprintf(__hfi_dbgout, "\n");
+	}
+}
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/opa/opa_dwordcpy-x86_64-fast.S b/deps/libfabric/prov/psm3/psm3/opa/opa_dwordcpy-x86_64-fast.S
new file mode 100644
index 0000000000000000000000000000000000000000..12fe9a3e2008b69e97fa1a4fe77285d0c6c28c05
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/opa/opa_dwordcpy-x86_64-fast.S
@@ -0,0 +1,84 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifdef __CET__
+#include <cet.h>
+#endif
+
+	.globl hfi_dwordcpy
+	.file	"opa_dwordcpy-x86_64-fast.S"
+	.text
+	.p2align 4,,15
+	// standard C calling convention: rdi is dest, rsi is source, rdx is count
+	// does not return any value
+hfi_dwordcpy:
+	.type	hfi_dwordcpy, @function
+#ifdef _CET_ENDBR
+	_CET_ENDBR
+#endif
+	movl %edx,%ecx		// ecx = dword count
+	shrl $1,%ecx		// ecx = number of 8-byte (qword) copies
+	andl $1,%edx		// edx = 1 if a trailing dword remains
+	cld			// copy upwards
+	rep
+	movsq			// copy ecx qwords from (rsi) to (rdi)
+	movl %edx,%ecx
+	rep
+	movsd			// copy the trailing dword, if any
+	ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/opa/opa_dwordcpy-x86_64.c b/deps/libfabric/prov/psm3/psm3/opa/opa_dwordcpy-x86_64.c
new file mode 100644
index 0000000000000000000000000000000000000000..342b05f7e6de4e962809a9f7e01025ae68770764
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/opa/opa_dwordcpy-x86_64.c
@@ -0,0 +1,315 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <stdint.h>
+#include <immintrin.h>
+#include "opa_intf.h"
+#include "psm_user.h"
+
+#if defined(__x86_64__) && defined(HAVE_PSM3_DWORD_FAST)
+#define hfi_dwordcpy hfi_dwordcpy_safe
+#define hfi_qwordcpy hfi_qwordcpy_safe
+#endif
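+
+/* Rationale (informal): when the x86-64 assembly fast path is compiled in
+   (HAVE_PSM3_DWORD_FAST), the portable C implementations below are renamed
+   to *_safe so the assembly versions can provide the primary symbols. */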
+
+void hfi_dwordcpy(volatile uint32_t *dest, const uint32_t *src, uint32_t ndwords)
+{
+	uint_fast32_t ndw = ndwords;
+	const uint64_t *src64[4];
+	volatile uint64_t *dst64[4];
+	src64[0] = (const uint64_t *) src;
+	dst64[0] = (volatile uint64_t *) dest;
+
+	while (ndw >= 8) {
+		*dst64[0] = *src64[0];
+		src64[1] = src64[0] + 1;
+		src64[2] = src64[0] + 2;
+		src64[3] = src64[0] + 3;
+		ndw -= 8;
+		dst64[1] = dst64[0] + 1;
+		dst64[2] = dst64[0] + 2;
+		dst64[3] = dst64[0] + 3;
+		*dst64[1] = *src64[1];
+		*dst64[2] = *src64[2];
+		*dst64[3] = *src64[3];
+		src64[0] += 4;
+		dst64[0] += 4;
+	}
+	if (ndw) {
+		src = (const uint32_t *) src64[0];
+		dest = (volatile uint32_t *) dst64[0];
+
+		switch (ndw) {
+		case 7: *dest++ = *src++;
+		/* fall through */
+		case 6: *dest++ = *src++;
+		/* fall through */
+		case 5: *dest++ = *src++;
+		/* fall through */
+		case 4: *dest++ = *src++;
+		/* fall through */
+		case 3: *dest++ = *src++;
+		/* fall through */
+		case 2:	*dest++ = *src++;
+		/* fall through */
+		case 1: *dest++ = *src++;
+		}
+
+	}
+}
+
+void hfi_qwordcpy(volatile uint64_t *dest, const uint64_t *src, uint32_t nqwords)
+{
+	uint_fast32_t nqw = nqwords;
+	const uint64_t *src64[4];
+	volatile uint64_t *dst64[4];
+	src64[0] = src;
+	dst64[0] = dest;
+
+	while (nqw >= 8) {
+		*dst64[0] = *src64[0];
+		src64[1] = src64[0] + 1;
+		src64[2] = src64[0] + 2;
+		src64[3] = src64[0] + 3;
+		dst64[1] = dst64[0] + 1;
+		dst64[2] = dst64[0] + 2;
+		dst64[3] = dst64[0] + 3;
+		*dst64[1] = *src64[1];
+		*dst64[2] = *src64[2];
+		*dst64[3] = *src64[3];
+		src64[0] += 4;
+		dst64[0] += 4;
+
+		*dst64[0] = *src64[0];
+		src64[1] = src64[0] + 1;
+		src64[2] = src64[0] + 2;
+		src64[3] = src64[0] + 3;
+		dst64[1] = dst64[0] + 1;
+		dst64[2] = dst64[0] + 2;
+		dst64[3] = dst64[0] + 3;
+		*dst64[1] = *src64[1];
+		*dst64[2] = *src64[2];
+		*dst64[3] = *src64[3];
+		src64[0] += 4;
+		dst64[0] += 4;
+
+		nqw -= 8;
+	}
+	if (nqw) {
+		switch (nqw) {
+		case 7: *(dst64[0])++ = *(src64[0])++;
+		/* fall through */
+		case 6: *(dst64[0])++ = *(src64[0])++;
+		/* fall through */
+		case 5: *(dst64[0])++ = *(src64[0])++;
+		/* fall through */
+		case 4: *(dst64[0])++ = *(src64[0])++;
+		/* fall through */
+		case 3: *(dst64[0])++ = *(src64[0])++;
+		/* fall through */
+		case 2: *(dst64[0])++ = *(src64[0])++;
+		/* fall through */
+		case 1: *(dst64[0])++ = *(src64[0])++;
+		}
+	}
+}
+
+#ifdef PSM_AVX512
+void hfi_pio_blockcpy_512(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+	volatile __m512i *dp = (volatile __m512i *) dest;
+	const __m512i *sp = (const __m512i *) src;
+
+	psmi_assert((dp != NULL) && (sp != NULL));
+	psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0);
+
+	if ((((uintptr_t) sp) & 0x3f) == 0x0) {
+		/* source and destination are both 64 byte aligned */
+		do {
+			__m512i tmp0 = _mm512_load_si512(sp);
+			_mm512_store_si512((__m512i *)dp, tmp0);
+		} while ((--nblock) && (++dp) && (++sp));
+	} else {
+		/* only destination is 64 byte aligned - use unaligned loads */
+		do {
+			__m512i tmp0 = _mm512_loadu_si512(sp);
+			_mm512_store_si512((__m512i *)dp, tmp0);
+		} while ((--nblock) && (++dp) && (++sp));
+	}
+}
+#endif
+
+void hfi_pio_blockcpy_256(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+	volatile __m256i *dp = (volatile __m256i *) dest;
+	const __m256i *sp = (const __m256i *) src;
+
+	psmi_assert((dp != NULL) && (sp != NULL));
+	psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0);
+
+	if ((((uintptr_t) sp) & 0x1f) == 0x0) {
+		/* source and destination are both 32 byte aligned */
+		do {
+			__m256i tmp0 = _mm256_load_si256(sp);
+			__m256i tmp1 = _mm256_load_si256(sp + 1);
+			_mm256_store_si256((__m256i *)dp, tmp0);
+			_mm256_store_si256((__m256i *)(dp + 1), tmp1);
+		} while ((--nblock) && (dp = dp+2) && (sp = sp+2));
+	} else {
+		/* only destination is 32 byte aligned - use unaligned loads */
+		do {
+			__m256i tmp0 = _mm256_loadu_si256(sp);
+			__m256i tmp1 = _mm256_loadu_si256(sp + 1);
+			_mm256_store_si256((__m256i *)dp, tmp0);
+			_mm256_store_si256((__m256i *)(dp + 1), tmp1);
+		} while ((--nblock) && (dp = dp+2) && (sp = sp+2));
+	}
+}
+
+void hfi_pio_blockcpy_128(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+	volatile __m128i *dp = (volatile __m128i *) dest;
+	const __m128i *sp = (const __m128i *) src;
+
+	psmi_assert((dp != NULL) && (sp != NULL));
+	psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0);
+
+	if ((((uintptr_t) sp) & 0xf) == 0x0) {
+		/* source and destination are both 16 byte aligned */
+		do {
+			__m128i tmp0 = _mm_load_si128(sp);
+			__m128i tmp1 = _mm_load_si128(sp + 1);
+			__m128i tmp2 = _mm_load_si128(sp + 2);
+			__m128i tmp3 = _mm_load_si128(sp + 3);
+			_mm_store_si128((__m128i *)dp, tmp0);
+			_mm_store_si128((__m128i *)(dp + 1), tmp1);
+			_mm_store_si128((__m128i *)(dp + 2), tmp2);
+			_mm_store_si128((__m128i *)(dp + 3), tmp3);
+		} while ((--nblock) && (dp = dp+4) && (sp = sp+4));
+	} else {
+		/* only destination is 16 byte aligned - use unaligned loads */
+		do {
+			__m128i tmp0 = _mm_loadu_si128(sp);
+			__m128i tmp1 = _mm_loadu_si128(sp + 1);
+			__m128i tmp2 = _mm_loadu_si128(sp + 2);
+			__m128i tmp3 = _mm_loadu_si128(sp + 3);
+			_mm_store_si128((__m128i *)dp, tmp0);
+			_mm_store_si128((__m128i *)(dp + 1), tmp1);
+			_mm_store_si128((__m128i *)(dp + 2), tmp2);
+			_mm_store_si128((__m128i *)(dp + 3), tmp3);
+		} while ((--nblock) && (dp = dp+4) && (sp = sp+4));
+	}
+}
+
+void hfi_pio_blockcpy_64(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock)
+{
+	const uint64_t *src64[4];
+	volatile uint64_t *dst64[4];
+	src64[0] = src;
+	dst64[0] = dest;
+
+	psmi_assert((dst64[0] != NULL) && (src64[0] != NULL));
+	psmi_assert((((uintptr_t) dest) & 0x3f) == 0x0);
+
+	do {
+		*dst64[0] = *src64[0];
+		src64[1] = src64[0] + 1;
+		src64[2] = src64[0] + 2;
+		src64[3] = src64[0] + 3;
+		dst64[1] = dst64[0] + 1;
+		dst64[2] = dst64[0] + 2;
+		dst64[3] = dst64[0] + 3;
+		*dst64[1] = *src64[1];
+		*dst64[2] = *src64[2];
+		*dst64[3] = *src64[3];
+		src64[0] += 4;
+		dst64[0] += 4;
+
+		*dst64[0] = *src64[0];
+		src64[1] = src64[0] + 1;
+		src64[2] = src64[0] + 2;
+		src64[3] = src64[0] + 3;
+		dst64[1] = dst64[0] + 1;
+		dst64[2] = dst64[0] + 2;
+		dst64[3] = dst64[0] + 3;
+		*dst64[1] = *src64[1];
+		*dst64[2] = *src64[2];
+		*dst64[3] = *src64[3];
+		src64[0] += 4;
+		dst64[0] += 4;
+	} while (--nblock);
+}
+
+void MOCKABLE(psmi_mq_mtucpy)(void *vdest, const void *vsrc, uint32_t nchars)
+{
+
+#ifdef PSM_CUDA
+	if (nchars && PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(vdest) || PSMI_IS_CUDA_MEM((void *) vsrc))) {
+		PSMI_CUDA_CALL(cuMemcpy,
+			       (CUdeviceptr)vdest, (CUdeviceptr)vsrc, nchars);
+		return;
+	}
+#endif
+	memcpy(vdest, vsrc, nchars);
+	return;
+
+
+}
+MOCK_DEF_EPILOGUE(psmi_mq_mtucpy);
+
+void psmi_mq_mtucpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars)
+{
+	memcpy(vdest, vsrc, nchars);
+	return;
+}
diff --git a/deps/libfabric/prov/psm3/psm3/opa/opa_service.c b/deps/libfabric/prov/psm3/psm3/opa/opa_service.c
new file mode 100644
index 0000000000000000000000000000000000000000..eeda438e9cd050a6a969a0e76803eec0bea2cf86
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/opa/opa_service.c
@@ -0,0 +1,59 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* This file contains hfi service routine interface used by the low
+   level hfi protocol code. */
+
+#include "opa_service.h"
+#include "psmi_wrappers.h"
+
diff --git a/deps/libfabric/prov/psm3/psm3/opa/opa_sysfs.c b/deps/libfabric/prov/psm3/psm3/opa/opa_sysfs.c
new file mode 100644
index 0000000000000000000000000000000000000000..59fcaaa24232d3d97f1346760fdb02015e380376
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/opa/opa_sysfs.c
@@ -0,0 +1,504 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/* This file contains a simple sysfs interface used by the low level
+   hfi protocol code.  It also implements the interface to hfifs. */
+
+#include <sys/stat.h>
+#include <ctype.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <dirent.h>
+#include "psm_config.h"
+
+#include "opa_service.h"
+
+static char sysfs_paths[PSMI_MAX_RAILS][PATH_MAX];
+static int  sysfs_path_count = -1;
+static long sysfs_page_size;
+#define SYSFS_DIR "/sys/class/infiniband/"
+
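+/* scandir() filter: keep only real device entries, skipping ".", ".."
+   and other dot-prefixed names */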
+int filter_dir(const struct dirent *item) {
+	if (item->d_name[0] == '.') return 0;
+	return 1;
+}
+
+int sysfs_init(const char *dflt_hfi_class_path)
+{
+	char *hfi_env;
+
+	if (NULL != (hfi_env = getenv("PSM3_SYSFS_PATH")))
+	{
+		snprintf(sysfs_paths[0], PATH_MAX, "%s", hfi_env);
+		sysfs_path_count = 1;
+	}
+	if (sysfs_path_count < 1) {
+		struct dirent **d = NULL;
+		int i, n = scandir(SYSFS_DIR, &d, filter_dir, alphasort);
+		sysfs_path_count = 0;
+		for (i = 0; i < n; i++) {
+			if (d[i] != NULL) {
+				if (sysfs_path_count < PSMI_MAX_RAILS) {
+					struct stat s;
+					snprintf(sysfs_paths[sysfs_path_count], PATH_MAX, SYSFS_DIR "%s", d[i]->d_name);
+					if (stat(sysfs_paths[sysfs_path_count], &s) || !S_ISDIR(s.st_mode)) {
+						memset(sysfs_paths[sysfs_path_count], 0, PATH_MAX);
+					} else {
+						sysfs_path_count++;
+					}
+				} else {
+					_HFI_INFO("Max " SYSFS_DIR " device count (%d) reached: Skipping %s\n", PSMI_MAX_RAILS, d[i]->d_name);
+				}
+				free(d[i]);
+			}
+		}
+		if (d) free(d);
+	}
+
+
+	if (!sysfs_page_size)
+		sysfs_page_size = sysconf(_SC_PAGESIZE);
+
+	if (_HFI_DBG_ON) {
+		int i;
+		_HFI_DBG("Found %u devices:\n", sysfs_path_count);
+		for (i = 0; i < sysfs_path_count; i++) {
+			_HFI_DBG(" Device[%u]: %s\n", i, sysfs_paths[i]);
+		}
+	}
+
+
+	return sysfs_path_count >= 1 ? 0 : -1;
+}
+
+void sysfs_fini(void)
+{
+	memset(sysfs_paths, 0, sizeof(sysfs_paths));
+	sysfs_path_count = -1;
+}
+
+const char *sysfs_unit_path(int unit_id)
+{
+	if (sysfs_path_count > 0 && unit_id < sysfs_path_count) {
+		return sysfs_paths[unit_id];
+	}
+	return NULL;
+}
+
+const char *sysfs_unit_dev_name(int unit_id)
+{
+	if (unit_id >= 0 && unit_id < sysfs_path_count) {
+		char *dev_name = strrchr(sysfs_paths[unit_id], '/');
+		if (dev_name && *dev_name)
+			return dev_name+1;
+	}
+	return "";	// make it easier to use in output messages
+}
+
+// accepts a unit number (>=0) or a case-insensitive unit name
+// there must be no trailing whitespace
+// will accept unit number in decimal or hex (0x prefix required)
+int sysfs_find_unit(const char *name)
+{
+	int i;
+	long unit;
+	char *end;
+
+	if (! name || ! *name)
+		return -1;
+
+	// unit specified by name
+	for (i=0; i< sysfs_path_count; i++) {
+		const char *dev_name = sysfs_unit_dev_name(i);
+		if (dev_name && *dev_name && 0 == strcasecmp(dev_name, name))
+			return i;
+	}
+
+	// unit specified by number
+	unit = strtol(name, &end, 10);
+	if (end == NULL || *end != 0) {
+		unit = strtol(name, &end, 16);
+		if (end == NULL || *end != 0)
+			return -1;
+	}
+	if (unit >= 0 && unit < sysfs_path_count)
+		return unit;
+
+	// invalid
+	return -1;
+}
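+
+/* Informal example (device names hypothetical): if SYSFS_DIR contains
+   irdma0 and mlx5_0, sysfs_find_unit("MLX5_0") returns 1 via the
+   case-insensitive name match, while sysfs_find_unit("0") returns 0 as a
+   decimal unit number. */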
+
+
+int hfi_sysfs_unit_open(uint32_t unit, const char *attr, int flags)
+{
+	int saved_errno;
+	char buf[1024];
+	int fd;
+	const char *unitpath = sysfs_unit_path(unit);
+
+	if (unitpath == NULL) {
+		_HFI_DBG("Failed to open attribute '%s' of unit %d: %s\n", attr,
+			 unit, "unit id not valid");
+		return -1;
+	}
+
+	snprintf(buf, sizeof(buf), "%s/%s", unitpath, attr);
+	fd = open(buf, flags);
+	saved_errno = errno;
+
+	if (fd == -1) {
+		_HFI_DBG("Failed to open attribute '%s' of unit %d: %s\n", attr,
+			 unit, strerror(errno));
+		_HFI_DBG("Offending file name: %s\n", buf);
+	}
+
+	errno = saved_errno;
+	return fd;
+}
+
+static int hfi_sysfs_unit_open_for_node(uint32_t unit, int flags)
+{
+	int saved_errno;
+	char buf[1024];
+	int fd;
+	const char *unitpath = sysfs_unit_path(unit);
+
+	if (unitpath == NULL) {
+		_HFI_DBG("Failed to open attribute numa_node of unit %d: %s\n",
+			 unit, "unit id not valid");
+		return -1;
+	}
+
+	snprintf(buf, sizeof(buf), "%s/device/numa_node", unitpath);
+	fd = open(buf, flags);
+	saved_errno = errno;
+
+	if (fd == -1) {
+		_HFI_DBG("Failed to open attribute numa_node of unit %d: %s\n",
+			 unit, strerror(errno));
+		_HFI_DBG("Offending file name: %s\n", buf);
+	}
+
+	errno = saved_errno;
+	return fd;
+}
+
+int hfi_sysfs_port_open(uint32_t unit, uint32_t port, const char *attr,
+			int flags)
+{
+	int saved_errno;
+	char buf[1024];
+	int fd;
+	const char *unitpath = sysfs_unit_path(unit);
+
+	if (unitpath == NULL) {
+		_HFI_DBG("Failed to open attribute '%s' of unit %d: %s\n", attr,
+			 unit, "unit id not valid");
+		return -1;
+	}
+	snprintf(buf, sizeof(buf), "%s/ports/%u/%s", unitpath, port, attr);
+	fd = open(buf, flags);
+	saved_errno = errno;
+
+	if (fd == -1) {
+		_HFI_DBG("Failed to open attribute '%s' of unit %d:%d: %s\n",
+			 attr, unit, port, strerror(errno));
+		_HFI_DBG("Offending file name: %s\n", buf);
+	}
+
+	errno = saved_errno;
+	return fd;
+}
+
+
+static int read_page(int fd, char **datap)
+{
+	char *data = NULL;
+	int saved_errno;
+	int ret = -1;
+
+	data = malloc(sysfs_page_size);
+	saved_errno = errno;
+
+	if (!data) {
+		_HFI_DBG("Could not allocate memory: %s\n", strerror(errno));
+		goto bail;
+	}
+
+	ret = read(fd, data, sysfs_page_size);
+	saved_errno = errno;
+
+	if (ret == -1) {
+		_HFI_DBG("Read of attribute failed: %s\n", strerror(errno));
+		goto bail;
+	}
+
+bail:
+	if (ret == -1) {
+		free(data);
+	} else {
+		if (ret < sysfs_page_size)
+			data[ret] = 0;
+		else
+			data[sysfs_page_size-1] = 0;
+		*datap = data;
+	}
+
+	errno = saved_errno;
+	return ret;
+}
+
+/*
+ * On return, caller must free *datap.
+ */
+int hfi_sysfs_unit_read(uint32_t unit, const char *attr, char **datap)
+{
+	int fd = -1, ret = -1;
+	int saved_errno;
+
+	fd = hfi_sysfs_unit_open(unit, attr, O_RDONLY);
+	saved_errno = errno;
+
+	if (fd == -1)
+		goto bail;
+
+	ret = read_page(fd, datap);
+	saved_errno = errno;
+
+bail:
+	if (ret == -1)
+		*datap = NULL;
+
+	if (fd != -1) {
+		close(fd);
+	}
+
+	errno = saved_errno;
+	return ret;
+}
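+
+/* Usage sketch (attribute name hypothetical):
+ *   char *val;
+ *   if (hfi_sysfs_unit_read(0, "node_type", &val) != -1) {
+ *       ...parse the NUL-terminated string in val...
+ *       free(val);
+ *   }
+ */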
+
+/* read a string value into buff, no more than size bytes.
+   returns the number of bytes read, or (size_t)-1 on failure */
+size_t hfi_sysfs_unit_port_read(uint32_t unit, uint32_t port, const char *attr,
+			char *buff, size_t size)
+{
+	int fd = -1;
+	size_t rv = (size_t)-1;
+
+	fd = hfi_sysfs_port_open(unit, port, attr, O_RDONLY);
+
+	if (fd == -1)
+		return rv;
+
+	rv = read(fd, buff, size);
+
+	close(fd);
+
+	if (rv < size)
+		buff[rv] = 0;
+	else
+		buff[size-1] = 0;
+
+	return rv;
+}
+
+/*
+ * On return, caller must free *datap.
+ */
+int hfi_sysfs_port_read(uint32_t unit, uint32_t port, const char *attr,
+			char **datap)
+{
+	int fd = -1, ret = -1;
+	int saved_errno;
+
+	fd = hfi_sysfs_port_open(unit, port, attr, O_RDONLY);
+	saved_errno = errno;
+
+	if (fd == -1)
+		goto bail;
+
+	ret = read_page(fd, datap);
+	saved_errno = errno;
+
+bail:
+	if (ret == -1)
+		*datap = NULL;
+
+	if (fd != -1) {
+		close(fd);
+	}
+
+	errno = saved_errno;
+	return ret;
+}
+
+
+int hfi_sysfs_unit_read_s64(uint32_t unit, const char *attr,
+			    int64_t *valp, int base)
+{
+	char *data=NULL, *end;
+	int saved_errno;
+	long long val;
+	int ret;
+
+	ret = hfi_sysfs_unit_read(unit, attr, &data);
+	saved_errno = errno;
+
+	if (ret == -1) {
+		goto bail;
+	}
+
+	val = strtoll(data, &end, base);
+	saved_errno = errno;
+
+	if (!*data || !(*end == '\0' || isspace(*end))) {
+		ret = -1;
+		goto bail;
+	}
+
+	*valp = val;
+	ret = 0;
+
+bail:
+	if (data)
+		free(data);
+	errno = saved_errno;
+	return ret;
+}
+
+static int hfi_sysfs_unit_read_node(uint32_t unit, char **datap)
+{
+	int fd = -1, ret = -1;
+	int saved_errno;
+
+	fd = hfi_sysfs_unit_open_for_node(unit, O_RDONLY);
+	saved_errno = errno;
+
+	if (fd == -1)
+		goto bail;
+
+	ret = read_page(fd, datap);
+	if (ret == -1)
+		*datap = NULL;
+
+	saved_errno = errno;
+	close(fd);
+bail:
+	errno = saved_errno;
+	return ret;
+}
+
+int64_t hfi_sysfs_unit_read_node_s64(uint32_t unit)
+{
+	char *data=NULL, *end;
+	int saved_errno;
+	long long val;
+	int64_t ret = -1;
+
+	saved_errno = errno;
+	if (hfi_sysfs_unit_read_node(unit, &data) == -1) {
+		goto bail;
+	}
+
+	val = strtoll(data, &end, 0);
+	saved_errno = errno;
+
+	if (!*data || !(*end == '\0' || isspace(*end))) {
+		ret = -1;
+		goto bail;
+	}
+
+	ret = (int64_t) val;
+bail:
+	free(data);
+	errno = saved_errno;
+	return ret;
+}
+
+int hfi_sysfs_port_read_s64(uint32_t unit, uint32_t port, const char *attr,
+			    int64_t *valp, int base)
+{
+	char *data, *end;
+	int saved_errno;
+	long long val;
+	int ret;
+
+	ret = hfi_sysfs_port_read(unit, port, attr, &data);
+	saved_errno = errno;
+
+	if (ret == -1) {
+		goto bail;
+	}
+
+	val = strtoll(data, &end, base);
+	saved_errno = errno;
+
+	if (!*data || !(*end == '\0' || isspace(*end))) {
+		ret = -1;
+		goto bail;
+	}
+
+	*valp = val;
+	ret = 0;
+
+bail:
+	free(data);
+	errno = saved_errno;
+	return ret;
+}
diff --git a/deps/libfabric/prov/psm3/psm3/opa/opa_syslog.c b/deps/libfabric/prov/psm3/psm3/opa/opa_syslog.c
new file mode 100644
index 0000000000000000000000000000000000000000..88de4cbb345ac016135eeab60e83987946263db2
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/opa/opa_syslog.c
@@ -0,0 +1,113 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#define __USE_GNU
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <syslog.h>
+#include <stdio.h>
+
+#include "opa_user.h"
+
+#define SYSLOG_MAXLEN	512
+
+extern char __hfi_mylabel[];
+
+void
+hfi_vsyslog(const char *prefix, int to_console, int level,
+	    const char *format, va_list ap)
+{
+	char logprefix[SYSLOG_MAXLEN];
+	size_t len;
+
+	if (to_console) {
+		char hostname[80];
+		va_list ap_cons;
+		va_copy(ap_cons, ap);
+		len = strlen(format);
+		gethostname(hostname, sizeof(hostname));
+		hostname[sizeof(hostname) - 1] = '\0';
+
+		if (__hfi_mylabel[0])
+			fprintf(stderr, "%s: ", __hfi_mylabel);
+		else
+			fprintf(stderr, "%s: ", hostname);
+
+		vfprintf(stderr, format, ap_cons);
+		if (len == 0 || format[len - 1] != '\n')
+			fprintf(stderr, "\n");
+		fflush(stderr);
+		va_end(ap_cons);
+	}
+
+	len = snprintf(logprefix, sizeof(logprefix),
+		       "(nic/%s)[%d]: %s", prefix ? prefix : "nic",
+		       (int)getpid(), format);
+
+	vsyslog(level | LOG_USER, logprefix, ap);
+
+	return;
+}
+
+void
+hfi_syslog(const char *prefix, int to_console, int level,
+	   const char *format, ...)
+{
+	va_list ap;
+	va_start(ap, format);
+	hfi_vsyslog(prefix, to_console, level, format, ap);
+	va_end(ap);
+}
diff --git a/deps/libfabric/prov/psm3/psm3/opa/opa_time.c b/deps/libfabric/prov/psm3/psm3/opa/opa_time.c
new file mode 100644
index 0000000000000000000000000000000000000000..33de9959b22680b93237ba729c835d8923ab6318
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/opa/opa_time.c
@@ -0,0 +1,299 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#define __USE_GNU
+#include <sys/types.h>
+#include <sys/time.h>
+#include <time.h>
+#include <sched.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include "opa_user.h"
+
+#ifdef min
+#undef min
+#endif
+#define min(a, b) ((a) < (b) ? (a) : (b))
+
+#ifdef max
+#undef max
+#endif
+#define max(a, b) ((a) > (b) ? (a) : (b))
+
+/* init the cycle counter to picosecs/cycle conversion automatically */
+/* at program startup, if it's using timing functions. */
+static void init_picos_per_cycle(void) __attribute__ ((constructor));
+static int hfi_timebase_isvalid(uint32_t pico_per_cycle);
+static uint32_t hfi_timebase_from_cpuinfo(uint32_t old_pico_per_cycle);
+
+/* in case two of our mechanisms fail */
+#define SAFEDEFAULT_PICOS_PER_CYCLE 500
+
+uint32_t __hfi_pico_per_cycle = SAFEDEFAULT_PICOS_PER_CYCLE;
+
+/* This isn't perfect, but it's close enough for rough timing. We want this
+   to work on systems where the cycle counter isn't the same as the clock
+   frequency.
+   __hfi_pico_per_cycle isn't going to lead to completely accurate
+   conversions from timestamps to nanoseconds, but it's close enough for
+   our purposes, which is mainly to allow people to show events with nsecs
+   or usecs if desired, rather than cycles.   We use it in some performance
+   analysis, but it has to be done with care, since cpuspeed can change,
+   different cpu's can have different speeds, etc.
+
+   Some architectures don't have their TSC-equivalent running at anything
+   related to the processor speed (e.g. G5 Power systems use a fixed
+   33 MHz frequency).
+*/
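+
+/* Informal example: a 2 GHz cycle counter ticks every 0.5 ns, i.e.
+   __hfi_pico_per_cycle == 500, so 1,000,000 cycles correspond to about
+   1e6 * 500 ps = 0.5 ms. */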
+
+#define MIN_TEST_TIME_IN_PICOS (100000000000LL)	/* 100 milliseconds */
+
+static int timebase_debug;	/* off by default */
+
+#define timebase_warn_always(fmt, ...)				    \
+	    hfi_syslog("timebase", 1, LOG_ERR, fmt, ##__VA_ARGS__)
+#define timebase_warn(fmt, ...)	if (timebase_debug)		    \
+	    timebase_warn_always(fmt, ##__VA_ARGS__)
+
+static int hfi_timebase_isvalid(uint32_t pico_per_cycle)
+{
+#if defined(__x86_64__) || defined(__i386__)
+	/* If pico-per-cycle is less than 200, the clock speed would be greater
+	 * than 5 GHz.  Similarly, we minimally support a 1GHz clock.
+	 * Allow some slop, because newer kernels with HPET can be a few
+	 * units off, and we don't want to spend the startup time needlessly */
+	if (pico_per_cycle >= 198 && pico_per_cycle <= 1005)
+		return 1;
+	else
+#endif
+		return 0;
+}
+
+/*
+ * Method #1:
+ *
+ * Derive the pico-per-cycle by trying to correlate the difference between two
+ * reads of the tsc counter to gettimeofday.
+ */
+static void init_picos_per_cycle()
+{
+	struct timeval tvs, tve;
+	int64_t usec = 0;
+	uint64_t ts, te;
+	int64_t delta;
+	uint32_t picos = 0;
+	int trials = 0;
+	int retry = 0;
+	cpu_set_t cpuset, cpuset_saved;
+	int have_cpuset = 1;
+
+	/*
+	 * Make sure we try to calculate the cycle time without being migrated.
+	 */
+	CPU_ZERO(&cpuset_saved);
+	if (sched_getaffinity(0, sizeof(cpuset), &cpuset_saved))
+		have_cpuset = 0;
+	CPU_ZERO(&cpuset);
+	CPU_SET(0, &cpuset);
+	if (have_cpuset && sched_setaffinity(0, sizeof(cpuset), &cpuset))
+		have_cpuset = 0;
+
+	/*
+	 * If we set affinity correctly, give the scheduler another chance to put
+	 * us on processor 0
+	 */
+	if (have_cpuset)
+		sched_yield();
+
+retry_pico_test:
+	if (++retry == 10) {
+		__hfi_pico_per_cycle = hfi_timebase_from_cpuinfo(picos);
+		goto reset_cpu_mask;	/* Reset CPU mask before exiting */
+	}
+
+	usec = 0;
+	gettimeofday(&tvs, NULL);
+	ts = get_cycles();
+	while (usec < MIN_TEST_TIME_IN_PICOS) {	/* wait for at least 100 millisecs */
+		trials++;
+		usleep(125);
+		gettimeofday(&tve, NULL);
+		usec = 1000000LL * (tve.tv_usec - tvs.tv_usec) +
+		    1000000000000LL * (tve.tv_sec - tvs.tv_sec);
+		if (usec < 0) {
+			timebase_warn
+			    ("RTC timebase, gettimeofday is negative (!) %lld\n",
+			     (long long)usec);
+			goto retry_pico_test;
+		}
+	}
+	te = get_cycles();
+	delta = te - ts;
+	picos = (uint32_t) (usec / delta);
+
+	if (!hfi_timebase_isvalid(picos)) {
+		cpu_set_t cpuget;
+		int affinity_valid =
+		    !sched_getaffinity(0, sizeof(cpuget), &cpuget);
+		if (affinity_valid && !CPU_ISSET(0, &cpuget))
+			affinity_valid = 0;
+		timebase_warn
+		    ("Failed to get valid RTC timebase, gettimeofday delta=%lld, "
+		     "rtc delta=%lld, picos_per_cycle=%d affinity_valid=%s (trial %d/10)\n",
+		     (long long)usec, (long long)delta, picos,
+		     affinity_valid ? "YES" : "NO", retry);
+		goto retry_pico_test;
+	}
+
+	/* If we've had to retry even once, let that be known */
+	if (retry > 1)
+		timebase_warn("Clock is %d picos/cycle found in %d trials and "
+			      "%.3f seconds (retry=%d)\n", picos, trials,
+			      (double)usec / 1.0e12, retry);
+
+	__hfi_pico_per_cycle = picos;
+
+reset_cpu_mask:
+	/* Restore affinity */
+	if (have_cpuset) {
+		sched_setaffinity(0, sizeof(cpuset), &cpuset_saved);
+		/*
+		 * Give a chance to other processes that also set affinity to 0 for
+		 * doing this test.
+		 */
+		sched_yield();
+	}
+}
+
+/*
+ * Method #2:
+ *
+ * Derive the pico-per-cycle from /proc instead of using sleep trick
+ * that relies on scheduler.
+ */
+static uint32_t hfi_timebase_from_cpuinfo(uint32_t old_pico_per_cycle)
+{
+	/* we only validate once */
+	uint32_t new_pico_per_cycle = old_pico_per_cycle;
+	uint32_t max_bet_new_old_pico, min_bet_new_old_pico;
+
+	char hostname[80];
+	gethostname(hostname, 80);
+	hostname[sizeof(hostname) - 1] = '\0';
+
+	if (getenv("PSM3_DEBUG_TIMEBASE"))
+		timebase_debug = 1;
+
+	/* If the old one is valid, don't bother with this mechanism */
+	if (hfi_timebase_isvalid(old_pico_per_cycle))
+		return old_pico_per_cycle;
+
+#if defined(__x86_64__) || defined(__i386__)
+	{
+		FILE *fp = fopen("/proc/cpuinfo", "r");
+		char input[255];
+		char *p = NULL;
+
+		if (!fp)
+			goto fail;
+
+		while (!feof(fp) && fgets(input, 255, fp)) {
+			if (strstr(input, "cpu MHz")) {
+				p = strchr(input, ':');
+				if (p)
+				{
+					double MHz = atof(p + 1);
+					if (MHz != 0.0)
+						new_pico_per_cycle =
+							(uint32_t) (1000000. / MHz);
+				}
+				break;
+			}
+		}
+		fclose(fp);
+		if (!p)
+			goto fail;
+	}
+#endif
+
+	max_bet_new_old_pico = max(new_pico_per_cycle, old_pico_per_cycle);
+	min_bet_new_old_pico = min(new_pico_per_cycle, old_pico_per_cycle);
+	/* If there's no change (within a small range), just return the old one */
+	if ((max_bet_new_old_pico - min_bet_new_old_pico) < 5)
+		return old_pico_per_cycle;
+
+	if (hfi_timebase_isvalid(new_pico_per_cycle)) {
+		timebase_warn_always
+		    ("RTC timebase, using %d picos/cycle from /proc "
+		     "instead of the detected %d picos/cycle\n",
+		     new_pico_per_cycle, old_pico_per_cycle);
+		return new_pico_per_cycle;
+	}
+
+fail:
+	new_pico_per_cycle = SAFEDEFAULT_PICOS_PER_CYCLE;
+	timebase_warn_always
+	    ("Problem obtaining CPU time base, detected to be %d "
+	     "pico/cycle, adjusted to safe default %d picos/cycle",
+	     old_pico_per_cycle, new_pico_per_cycle);
+	return new_pico_per_cycle;
+}
diff --git a/deps/libfabric/prov/psm3/psm3/opa/opa_utils.c b/deps/libfabric/prov/psm3/psm3/opa/opa_utils.c
new file mode 100644
index 0000000000000000000000000000000000000000..1abe60ecbd633979ce593eec160de95f2cc46aa2
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/opa/opa_utils.c
@@ -0,0 +1,196 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* This file contains hfi service routine interface used by the low */
+/* level hfi protocol code. */
+
+#include <sys/poll.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <malloc.h>
+#include <time.h>
+
+#include "opa_user.h"
+
+/* keep track whether we disabled mmap in malloc */
+int __hfi_malloc_no_mmap = 0;
+
+const char *hfi_get_next_name(char **names)
+{
+	char *p, *start;
+
+	p = start = *names;
+	while (*p != '\0' && *p != '\n') {
+		p++;
+	}
+	if (*p == '\n') {
+		*p = '\0';
+		p++;
+		*names = p;
+		return start;
+	} else
+		return NULL;
+}
+
+void hfi_release_names(char *namep)
+{
+	/* names were initialised in the data section before. Now
+	 * they are allocated when hfi_hfifs_read() is called. Allocation
+	 * for names is done only once at init time. Should we eventually
+	 * have an "stats_type_unregister" type of routine to explicitly
+	 * deallocate memory and free resources ?
+	 */
+#if 0
+	if (namep != NULL)
+		free(namep);
+#endif
+}
+
+
+/*
+ * Add a constructor function to disable mmap if asked to do so by the user
+ */
+static void init_mallopt_disable_mmap(void) __attribute__ ((constructor));
+
+static void init_mallopt_disable_mmap(void)
+{
+	char *env = getenv("PSM3_DISABLE_MMAP_MALLOC");
+
+	if (env && *env) {
+		if (mallopt(M_MMAP_MAX, 0) && mallopt(M_TRIM_THRESHOLD, -1)) {
+			__hfi_malloc_no_mmap = 1;
+		}
+	}
+
+	return;
+}
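+
+/* Usage note (informal): set PSM3_DISABLE_MMAP_MALLOC to any non-empty
+   value (e.g. PSM3_DISABLE_MMAP_MALLOC=1); __hfi_malloc_no_mmap is set
+   only if both mallopt() calls succeed. */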
+
+/* Convert Timeout value from usec to
+ * timeout_mult where usec = 4.096usec * 2^timeout_mult
+ */
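+/* Worked example (informal): timeout_usec_to_mult(1000) returns 8, since
+   4.096 us * 2^8 = 1048.576 us is the smallest representable value
+   covering 1000 us. */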
+uint8_t timeout_usec_to_mult(uint64_t timeout_us)
+{
+	/* all values are rounded up, comments reflect exact value */
+	if (timeout_us <= 4)
+		return 0;	/* 4.096 us */
+	else if (timeout_us <= 8)
+		return 1;	/* 8.192 us */
+	else if (timeout_us <= 16)
+		return 2;	/* 16.384 us */
+	else if (timeout_us <= 32)
+		return 3;	/* 32.768 us */
+	else if (timeout_us <= 65)
+		return 4;	/* 65.536 us */
+	else if (timeout_us <= 131)
+		return 5;	/* 131.072 us */
+	else if (timeout_us <= 262)
+		return 6;	/* 262.144 us */
+	else if (timeout_us <= 524)
+		return 7;	/* 524.288 us */
+	else if (timeout_us <= 1048)
+		return 8;	/* 1048.576 us */
+	else if (timeout_us <= 2097)
+		return 9;	/* 2.097 ms */
+	else if (timeout_us <= 4194)
+		return 10;	/* 4.194 ms */
+	else if (timeout_us <= 8388)
+		return 11;	/* 8.388 ms */
+	else if (timeout_us <= 16777)
+		return 12;	/* 16.777 ms */
+	else if (timeout_us <= 33554)
+		return 13;	/* 33.554 ms */
+	else if (timeout_us <= 67108)
+		return 14;	/* 67.1 ms */
+	else if (timeout_us <= 134217)
+		return 15;	/* 134.2 ms */
+	else if (timeout_us <= 268435)
+		return 16;	/* 268.4 ms */
+	else if (timeout_us <= 536870)
+		return 17;	/* 536.8 ms */
+	else if (timeout_us <= 1073741)
+		return 18;	/* 1.073 s */
+	else if (timeout_us <= 2147483)
+		return 19;	/* 2.148 s */
+	else if (timeout_us <= 4294967)
+		return 20;	/* 4.294 s */
+	else if (timeout_us <= 8589934)
+		return 21;	/* 8.589 s */
+	else if (timeout_us <= 17179869)
+		return 22;	/* 17.179 s */
+	else if (timeout_us <= 34359738)
+		return 23;	/* 34.359 s */
+	else if (timeout_us <= 68719476)
+		return 24;	/* 68.719 s */
+	else if (timeout_us <= 137438953ll)
+		return 25;	/* 2.2 minutes */
+	else if (timeout_us <= 274877906ll)
+		return 26;	/* 4.5 minutes */
+	else if (timeout_us <= 549755813ll)
+		return 27;	/* 9 minutes */
+	else if (timeout_us <= 1099511628ll)
+		return 28;	/* 18 minutes */
+	else if (timeout_us <= 2199023256ll)
+		return 29;	/* 0.6 hr */
+	else if (timeout_us <= 4398046511ll)
+		return 30;	/* 1.2 hr	 */
+	else
+		return 31;	/* 2.4 hr */
+}
diff --git a/deps/libfabric/prov/psm3/psm3/psm.c b/deps/libfabric/prov/psm3/psm3/psm.c
new file mode 100644
index 0000000000000000000000000000000000000000..7159722a6ec1bb9166220a3503313f3e05afbab9
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm.c
@@ -0,0 +1,1256 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include <dlfcn.h>
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "opa_revision.h"
+#include "psm_mq_internal.h"
+
+static int psmi_verno_major = PSM2_VERNO_MAJOR;
+static int psmi_verno_minor = PSM2_VERNO_MINOR;
+static int psmi_verno = PSMI_VERNO_MAKE(PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR);
+static int psmi_verno_client_val;
+int psmi_epid_ver;
+int psmi_allow_routers;
+
+// Special psmi_refcount values
+#define PSMI_NOT_INITIALIZED    0
+#define PSMI_FINALIZED         -1
+
+// PSM2 doesn't support transitioning out of the PSMI_FINALIZED state;
+// once psmi_refcount is set to PSMI_FINALIZED, any further attempt to change
+// psmi_refcount should be treated as an error
+static int psmi_refcount = PSMI_NOT_INITIALIZED;
+
+/* Global lock used for endpoint creation and destroy
+ * (in functions psm2_ep_open and psm2_ep_close) and also
+ * for synchronization with recv_thread (so that recv_thread
+ * will not work on an endpoint which is in a middle of closing). */
+psmi_lock_t psmi_creation_lock;
+
+int psmi_affinity_semaphore_open = 0;
+char *sem_affinity_shm_rw_name;
+sem_t *sem_affinity_shm_rw = NULL;
+
+int psmi_affinity_shared_file_opened = 0;
+char *affinity_shm_name;
+uint64_t *shared_affinity_ptr;
+
+uint32_t psmi_cpu_model;
+
+#ifdef PSM_CUDA
+int is_cuda_enabled;
+int is_gdr_copy_enabled;
+int is_gpudirect_enabled = 0;
+int _device_support_unified_addr = -1; // -1 indicates "unchecked". See verify_device_support_unified_addr().
+int _device_support_gpudirect = -1; // -1 indicates "unset". See device_support_gpudirect().
+int _gpu_p2p_supported = -1; // -1 indicates "unset". see gpu_p2p_supported().
+int my_gpu_device = 0;
+int cuda_lib_version;
+int is_driver_gpudirect_enabled;
+uint32_t cuda_thresh_rndv;
+uint32_t gdr_copy_limit_send;
+uint32_t gdr_copy_limit_recv;
+uint64_t gpu_cache_evict;	// in bytes
+
+void *psmi_cuda_lib;
+CUresult (*psmi_cuInit)(unsigned int  Flags );
+CUresult (*psmi_cuCtxDetach)(CUcontext c);
+CUresult (*psmi_cuCtxGetCurrent)(CUcontext *c);
+CUresult (*psmi_cuCtxSetCurrent)(CUcontext c);
+CUresult (*psmi_cuPointerGetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p);
+CUresult (*psmi_cuPointerSetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p);
+CUresult (*psmi_cuDeviceCanAccessPeer)(int *canAccessPeer, CUdevice dev, CUdevice peerDev);
+CUresult (*psmi_cuDeviceGet)(CUdevice* device, int  ordinal);
+CUresult (*psmi_cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev);
+CUresult (*psmi_cuDriverGetVersion)(int* driverVersion);
+CUresult (*psmi_cuDeviceGetCount)(int* count);
+CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags);
+CUresult (*psmi_cuStreamDestroy)(CUstream phStream);
+CUresult (*psmi_cuStreamSynchronize)(CUstream phStream);
+CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags);
+CUresult (*psmi_cuEventDestroy)(CUevent hEvent);
+CUresult (*psmi_cuEventQuery)(CUevent hEvent);
+CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream);
+CUresult (*psmi_cuEventSynchronize)(CUevent hEvent);
+CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags);
+CUresult (*psmi_cuMemFreeHost)(void* p);
+CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
+CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
+CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount);
+CUresult (*psmi_cuMemcpyHtoD)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount);
+CUresult (*psmi_cuMemcpyDtoHAsync)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+CUresult (*psmi_cuMemcpyHtoDAsync)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream);
+CUresult (*psmi_cuIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr);
+CUresult (*psmi_cuIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags);
+CUresult (*psmi_cuIpcCloseMemHandle)(CUdeviceptr dptr);
+CUresult (*psmi_cuMemGetAddressRange)(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr);
+CUresult (*psmi_cuDevicePrimaryCtxGetState)(CUdevice dev, unsigned int* flags, int* active);
+CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev);
+CUresult (*psmi_cuCtxGetDevice)(CUdevice* device);
+CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device);
+
+uint64_t psmi_count_cuInit;
+uint64_t psmi_count_cuCtxDetach;
+uint64_t psmi_count_cuCtxGetCurrent;
+uint64_t psmi_count_cuCtxSetCurrent;
+uint64_t psmi_count_cuPointerGetAttribute;
+uint64_t psmi_count_cuPointerSetAttribute;
+uint64_t psmi_count_cuDeviceCanAccessPeer;
+uint64_t psmi_count_cuDeviceGet;
+uint64_t psmi_count_cuDeviceGetAttribute;
+uint64_t psmi_count_cuDriverGetVersion;
+uint64_t psmi_count_cuDeviceGetCount;
+uint64_t psmi_count_cuStreamCreate;
+uint64_t psmi_count_cuStreamDestroy;
+uint64_t psmi_count_cuStreamSynchronize;
+uint64_t psmi_count_cuEventCreate;
+uint64_t psmi_count_cuEventDestroy;
+uint64_t psmi_count_cuEventQuery;
+uint64_t psmi_count_cuEventRecord;
+uint64_t psmi_count_cuEventSynchronize;
+uint64_t psmi_count_cuMemHostAlloc;
+uint64_t psmi_count_cuMemFreeHost;
+uint64_t psmi_count_cuMemcpy;
+uint64_t psmi_count_cuMemcpyDtoD;
+uint64_t psmi_count_cuMemcpyDtoH;
+uint64_t psmi_count_cuMemcpyHtoD;
+uint64_t psmi_count_cuMemcpyDtoHAsync;
+uint64_t psmi_count_cuMemcpyHtoDAsync;
+uint64_t psmi_count_cuIpcGetMemHandle;
+uint64_t psmi_count_cuIpcOpenMemHandle;
+uint64_t psmi_count_cuIpcCloseMemHandle;
+uint64_t psmi_count_cuMemGetAddressRange;
+uint64_t psmi_count_cuDevicePrimaryCtxGetState;
+uint64_t psmi_count_cuDevicePrimaryCtxRetain;
+uint64_t psmi_count_cuCtxGetDevice;
+uint64_t psmi_count_cuDevicePrimaryCtxRelease;
+#endif
+
+/*
+ * Bit field that contains the capability set.
+ * Each bit represents a different capability.
+ * It is populated with the logical OR of the
+ * capabilities enabled at compile time, and is
+ * meant to grow with future features/capabilities.
+ */
+uint64_t psm2_capabilities_bitset = PSM2_MULTI_EP_CAP | PSM2_LIB_REFCOUNT_CAP;
+
+int psmi_verno_client()
+{
+	return psmi_verno_client_val;
+}
+
+/* This function is used to determine whether the current library build can
+ * successfully communicate with another library that claims to be version
+ * 'verno'.
+ *
+ * PSM 2.x is always ABI compatible, but this checks whether two different
+ * versions of the library can interoperate.
+ */
+int psmi_verno_isinteroperable(uint16_t verno)
+{
+	if (PSMI_VERNO_GET_MAJOR(verno) != PSM2_VERNO_MAJOR)
+		return 0;
+
+	return 1;
+}
+
+int MOCKABLE(psmi_isinitialized)()
+{
+	return (psmi_refcount > 0);
+}
+MOCK_DEF_EPILOGUE(psmi_isinitialized);
+
+#ifdef PSM_CUDA
+int psmi_cuda_lib_load()
+{
+	psm2_error_t err = PSM2_OK;
+	char *dlerr;
+
+	PSM2_LOG_MSG("entering");
+	_HFI_VDBG("Loading CUDA library.\n");
+
+	psmi_cuda_lib = dlopen("libcuda.so.1", RTLD_LAZY);
+	if (!psmi_cuda_lib) {
+		dlerr = dlerror();
+		_HFI_ERROR("Unable to open libcuda.so.1.  Error %s\n",
+				dlerr ? dlerr : "no dlerror()");
+		goto fail;
+	}
+
+	psmi_count_cuDriverGetVersion++;
+	psmi_cuDriverGetVersion = dlsym(psmi_cuda_lib, "cuDriverGetVersion");
+
+	if (!psmi_cuDriverGetVersion) {
+		_HFI_ERROR
+			("Unable to resolve symbols in CUDA libraries.\n");
+		goto fail;
+	}
+
+	PSMI_CUDA_CALL(cuDriverGetVersion, &cuda_lib_version);
+	if (cuda_lib_version < 7000) {
+		_HFI_ERROR("Please update CUDA driver, required minimum version is 7.0\n");
+		goto fail;
+	}
+
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuInit);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxGetCurrent);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxDetach);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxSetCurrent);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuPointerGetAttribute);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuPointerSetAttribute);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceCanAccessPeer);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetAttribute);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGet);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetCount);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamCreate);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamDestroy);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamSynchronize);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventCreate);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventDestroy);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventQuery);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventRecord);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventSynchronize);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemHostAlloc);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemFreeHost);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpy);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoD);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoH);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyHtoD);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoHAsync);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyHtoDAsync);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuIpcGetMemHandle);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuIpcOpenMemHandle);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuIpcCloseMemHandle);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemGetAddressRange);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDevicePrimaryCtxGetState);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDevicePrimaryCtxRetain);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDevicePrimaryCtxRelease);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxGetDevice);
+
+	PSM2_LOG_MSG("leaving");
+	return err;
+fail:
+	if (psmi_cuda_lib)
+		dlclose(psmi_cuda_lib);
+	err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to load CUDA library.\n");
+	return err;
+}
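+
+/* Editorial sketch, for illustration only: the real PSMI_CUDA_DLSYM lives in
+ * PSM3's internal headers and may differ, but it is assumed to pair each
+ * symbol resolved above with its psmi_ function pointer and abort the load
+ * when a symbol is missing, roughly: */
+#if 0 /* hedged example only -- not compiled */
+#define PSMI_CUDA_DLSYM(lib, func)					\
+	do {								\
+		psmi_##func = dlsym(lib, #func);			\
+		if (!psmi_##func) {					\
+			_HFI_ERROR("Unable to resolve %s in CUDA library.\n", \
+				   #func);				\
+			goto fail;					\
+		}							\
+	} while (0)
+#endif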
+
+static void psmi_cuda_stats_register()
+{
+#define PSMI_CUDA_COUNT_DECLU64(func) \
+	PSMI_STATS_DECLU64(#func, &psmi_count_##func)
+
+	struct psmi_stats_entry entries[] = {
+		PSMI_CUDA_COUNT_DECLU64(cuInit),
+		PSMI_CUDA_COUNT_DECLU64(cuCtxDetach),
+		PSMI_CUDA_COUNT_DECLU64(cuCtxGetCurrent),
+		PSMI_CUDA_COUNT_DECLU64(cuCtxSetCurrent),
+		PSMI_CUDA_COUNT_DECLU64(cuPointerGetAttribute),
+		PSMI_CUDA_COUNT_DECLU64(cuPointerSetAttribute),
+		PSMI_CUDA_COUNT_DECLU64(cuDeviceCanAccessPeer),
+		PSMI_CUDA_COUNT_DECLU64(cuDeviceGet),
+		PSMI_CUDA_COUNT_DECLU64(cuDeviceGetAttribute),
+		PSMI_CUDA_COUNT_DECLU64(cuDriverGetVersion),
+		PSMI_CUDA_COUNT_DECLU64(cuDeviceGetCount),
+		PSMI_CUDA_COUNT_DECLU64(cuStreamCreate),
+		PSMI_CUDA_COUNT_DECLU64(cuStreamDestroy),
+		PSMI_CUDA_COUNT_DECLU64(cuStreamSynchronize),
+		PSMI_CUDA_COUNT_DECLU64(cuEventCreate),
+		PSMI_CUDA_COUNT_DECLU64(cuEventDestroy),
+		PSMI_CUDA_COUNT_DECLU64(cuEventQuery),
+		PSMI_CUDA_COUNT_DECLU64(cuEventRecord),
+		PSMI_CUDA_COUNT_DECLU64(cuEventSynchronize),
+		PSMI_CUDA_COUNT_DECLU64(cuMemHostAlloc),
+		PSMI_CUDA_COUNT_DECLU64(cuMemFreeHost),
+		PSMI_CUDA_COUNT_DECLU64(cuMemcpy),
+		PSMI_CUDA_COUNT_DECLU64(cuMemcpyDtoD),
+		PSMI_CUDA_COUNT_DECLU64(cuMemcpyDtoH),
+		PSMI_CUDA_COUNT_DECLU64(cuMemcpyHtoD),
+		PSMI_CUDA_COUNT_DECLU64(cuMemcpyDtoHAsync),
+		PSMI_CUDA_COUNT_DECLU64(cuMemcpyHtoDAsync),
+		PSMI_CUDA_COUNT_DECLU64(cuIpcGetMemHandle),
+		PSMI_CUDA_COUNT_DECLU64(cuIpcOpenMemHandle),
+		PSMI_CUDA_COUNT_DECLU64(cuIpcCloseMemHandle),
+		PSMI_CUDA_COUNT_DECLU64(cuMemGetAddressRange),
+		PSMI_CUDA_COUNT_DECLU64(cuDevicePrimaryCtxGetState),
+		PSMI_CUDA_COUNT_DECLU64(cuDevicePrimaryCtxRetain),
+		PSMI_CUDA_COUNT_DECLU64(cuCtxGetDevice),
+		PSMI_CUDA_COUNT_DECLU64(cuDevicePrimaryCtxRelease),
+	};
+#undef PSMI_CUDA_COUNT_DECLU64
+
+	psmi_stats_register_type("PSM_Cuda_call_statistics",
+			PSMI_STATSTYPE_CUDA,
+			entries, PSMI_STATS_HOWMANY(entries), 0,
+			&is_cuda_enabled, NULL); /* context must != NULL */
+}
+
+int psmi_cuda_initialize()
+{
+	psm2_error_t err = PSM2_OK;
+
+	PSM2_LOG_MSG("entering");
+	_HFI_VDBG("Enabling CUDA support.\n");
+
+	psmi_cuda_stats_register();
+
+	err = psmi_cuda_lib_load();
+	if (err != PSM2_OK)
+		goto fail;
+
+	PSMI_CUDA_CALL(cuInit, 0);
+
+#ifdef RNDV_MOD
+	psm2_get_gpu_bars();
+#endif
+	union psmi_envvar_val env_enable_gdr_copy;
+	psmi_getenv("PSM3_GDRCOPY",
+				"Enable (set envvar to 1) for gdr copy support in PSM (Enabled by default)",
+				PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+				(union psmi_envvar_val)1, &env_enable_gdr_copy);
+	is_gdr_copy_enabled = env_enable_gdr_copy.e_int;
+
+	union psmi_envvar_val env_cuda_thresh_rndv;
+	psmi_getenv("PSM3_CUDA_THRESH_RNDV",
+				"RNDV protocol is used for GPU send message sizes greater than the threshold",
+				PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+				(union psmi_envvar_val)CUDA_THRESH_RNDV, &env_cuda_thresh_rndv);
+	cuda_thresh_rndv = env_cuda_thresh_rndv.e_int;
+
+	if (env_cuda_thresh_rndv.e_int < 0)	/* cuda_thresh_rndv is unsigned */
+		cuda_thresh_rndv = CUDA_THRESH_RNDV;
+
+	union psmi_envvar_val env_gdr_copy_limit_send;
+	psmi_getenv("PSM3_GDRCOPY_LIMIT_SEND",
+				"GDR Copy is turned off on the send side"
+				" for message sizes greater than the limit"
+#ifndef OPA
+				" or larger than 1 MTU\n",
+#else
+				"\n",
+#endif
+				PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+				(union psmi_envvar_val)GDR_COPY_LIMIT_SEND, &env_gdr_copy_limit_send);
+	gdr_copy_limit_send = env_gdr_copy_limit_send.e_int;
+
+	if (gdr_copy_limit_send < 8 || gdr_copy_limit_send > cuda_thresh_rndv)
+		gdr_copy_limit_send = max(GDR_COPY_LIMIT_SEND, cuda_thresh_rndv);
+
+	union psmi_envvar_val env_gdr_copy_limit_recv;
+	psmi_getenv("PSM3_GDRCOPY_LIMIT_RECV",
+				"GDR Copy is turned off on the recv side"
+				" for message sizes greater than the limit\n",
+				PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+				(union psmi_envvar_val)GDR_COPY_LIMIT_RECV, &env_gdr_copy_limit_recv);
+	gdr_copy_limit_recv = env_gdr_copy_limit_recv.e_int;
+
+	if (gdr_copy_limit_recv < 8)
+		gdr_copy_limit_recv = GDR_COPY_LIMIT_RECV;
+
+	if (!is_gdr_copy_enabled)
+		gdr_copy_limit_send = gdr_copy_limit_recv = 0;
+
+	PSM2_LOG_MSG("leaving");
+	return err;
+fail:
+	err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to initialize PSM3 CUDA support.\n");
+	return err;
+}
+#endif
+
+psm2_error_t __psm2_init(int *major, int *minor)
+{
+	psm2_error_t err = PSM2_OK;
+	union psmi_envvar_val env_tmask;
+
+	psmi_stats_initialize();
+
+	psmi_mem_stats_register();
+
+	psmi_log_initialize();
+
+	PSM2_LOG_MSG("entering");
+
+	/* When PSM_PERF is enabled, the following code causes the
+	   PMU to be programmed to measure instruction cycles of the
+	   TX/RX speedpaths of PSM. */
+	GENERIC_PERF_INIT();
+	GENERIC_PERF_SET_SLOT_NAME(PSM_TX_SPEEDPATH_CTR, "TX");
+	GENERIC_PERF_SET_SLOT_NAME(PSM_RX_SPEEDPATH_CTR, "RX");
+
+	if (psmi_refcount > 0) {
+		psmi_refcount++;
+		goto update;
+	}
+
+	if (psmi_refcount == PSMI_FINALIZED) {
+		err = PSM2_IS_FINALIZED;
+		goto fail;
+	}
+
+	if (major == NULL || minor == NULL) {
+		err = PSM2_PARAM_ERR;
+		goto fail;
+	}
+
+	psmi_init_lock(&psmi_creation_lock);
+
+#ifdef PSM_DEBUG
+	if (!getenv("PSM3_NO_WARN"))
+		fprintf(stderr,
+			"!!! WARNING !!! YOU ARE RUNNING AN INTERNAL-ONLY PSM *DEBUG* BUILD.\n");
+#endif
+
+#ifdef PSM_PROFILE
+	if (!getenv("PSM3_NO_WARN"))
+		fprintf(stderr,
+			"!!! WARNING !!! YOU ARE RUNNING AN INTERNAL-ONLY PSM *PROFILE* BUILD.\n");
+#endif
+
+#ifdef PSM_FI
+	/* Make sure we complain if fault injection is enabled */
+	if (getenv("PSM3_FI") && !getenv("PSM3_NO_WARN"))
+		fprintf(stderr,
+			"!!! WARNING !!! YOU ARE RUNNING WITH FAULT INJECTION ENABLED!\n");
+#endif /* #ifdef PSM_FI */
+
+	/* Make sure, as an internal check, that this version knows how to detect
+	 * compatibility with other library versions it may communicate with */
+	if (psmi_verno_isinteroperable(psmi_verno) != 1) {
+		err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+					"psmi_verno_isinteroperable() not updated for current version!");
+		goto fail;
+	}
+
+	/* The only way to not support a client is if the major number doesn't
+	 * match */
+	if (*major != PSM2_VERNO_MAJOR && *major != PSM2_VERNO_COMPAT_MAJOR) {
+		err = psmi_handle_error(NULL, PSM2_INIT_BAD_API_VERSION,
+					"This library does not implement version %d.%d",
+					*major, *minor);
+		goto fail;
+	}
+
+	/* Make sure we don't keep track of a client that claims a higher version
+	 * number than we are */
+	psmi_verno_client_val =
+	    min(PSMI_VERNO_MAKE(*major, *minor), psmi_verno);
+
+	/* Check to see if we need to set Architecture flags to something
+	 * besides big core Xeons */
+	cpuid_t id;
+	psmi_cpu_model = CPUID_MODEL_UNDEFINED;
+
+	/* First check to ensure Genuine Intel */
+	get_cpuid(0x0, 0, &id);
+	if(id.ebx == CPUID_GENUINE_INTEL_EBX
+		&& id.ecx == CPUID_GENUINE_INTEL_ECX
+		&& id.edx == CPUID_GENUINE_INTEL_EDX)
+	{
+		/* Use cpuid with EAX=1 to get processor info */
+		get_cpuid(0x1, 0, &id);
+		psmi_cpu_model = CPUID_GENUINE_INTEL;
+	}
+
+	if( (psmi_cpu_model == CPUID_GENUINE_INTEL) &&
+		(id.eax & CPUID_FAMILY_MASK) == CPUID_FAMILY_XEON)
+	{
+		psmi_cpu_model = ((id.eax & CPUID_MODEL_MASK) >> 4) |
+				((id.eax & CPUID_EXMODEL_MASK) >> 12);
+	}
+
+	psmi_refcount++;
+	/* hfi_debug lives in libhfi.so */
+	psmi_getenv("PSM3_TRACEMASK",
+		    "Mask flags for tracing",
+		    PSMI_ENVVAR_LEVEL_USER,
+		    PSMI_ENVVAR_TYPE_STR,
+		    (union psmi_envvar_val)__HFI_DEBUG_DEFAULT_STR, &env_tmask);
+	hfi_debug = psmi_parse_val_pattern(env_tmask.e_str, __HFI_DEBUG_DEFAULT,
+			__HFI_DEBUG_DEFAULT);
+
+	/* The "real thing" is done in hfi_proto.c as a constructor function, but
+	 * we getenv it here to report what we're doing with the setting */
+	{
+		extern int __hfi_malloc_no_mmap;
+		union psmi_envvar_val env_mmap;
+		char *env = getenv("PSM3_DISABLE_MMAP_MALLOC");
+		int broken = (env && *env && !__hfi_malloc_no_mmap);
+		psmi_getenv("PSM3_DISABLE_MMAP_MALLOC",
+			    broken ? "Skipping mmap disable for malloc()" :
+			    "Disable mmap for malloc()",
+			    PSMI_ENVVAR_LEVEL_USER,
+			    PSMI_ENVVAR_TYPE_YESNO,
+			    (union psmi_envvar_val)0, &env_mmap);
+		if (broken)
+			_HFI_ERROR
+			    ("Couldn't successfully disable mmap in mallocs "
+			     "with mallopt()\n");
+	}
+
+	{
+		union psmi_envvar_val env_epid_ver;
+		psmi_getenv("PSM3_ADDR_FMT",
+					"Used to force PSM3 to use a particular version of EPID",
+					PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+					(union psmi_envvar_val)PSMI_EPID_VERNO_DEFAULT, &env_epid_ver);
+		psmi_epid_ver = env_epid_ver.e_int;
+		if (psmi_epid_ver > PSMI_MAX_EPID_VERNO_SUPPORTED) {
+			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+					  " The max epid version supported in this version of PSM3 is %d \n"
+					  "Please upgrade PSM3 \n",
+					  PSMI_MAX_EPID_VERNO_SUPPORTED);
+			goto fail;
+		} else if (psmi_epid_ver < PSMI_MIN_EPID_VERNO_SUPPORTED) {
+			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+					  " Invalid value provided through PSM3_ADDR_FMT \n");
+			goto fail;
+		}
+	}
+	{
+		union psmi_envvar_val env_allow_routers;
+		psmi_getenv("PSM3_ALLOW_ROUTERS",
+					"Disable check for Ethernet subnet equality between nodes\n"
+					" allows routers between nodes and assumes single network plane for multi-rail\n",
+					PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+					(union psmi_envvar_val)0, &env_allow_routers);
+		psmi_allow_routers = env_allow_routers.e_int;
+	}
+
+	if (getenv("PSM3_DIAGS")) {
+		_HFI_INFO("Running diags...\n");
+		psmi_diags();
+	}
+
+	psmi_multi_ep_init();
+
+#ifdef PSM_FI
+	psmi_faultinj_init();
+#endif /* #ifdef PSM_FI */
+
+	psmi_epid_init();
+
+	int rc = psmi_hal_initialize();
+
+	if (rc)
+	{
+		err = PSM2_INTERNAL_ERR;
+		goto fail;
+	}
+
+#ifdef PSM_CUDA
+	union psmi_envvar_val env_enable_cuda;
+	psmi_getenv("PSM3_CUDA",
+			"Enable (set envvar to 1) for cuda support in PSM (Disabled by default)",
+			PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+			(union psmi_envvar_val)0, &env_enable_cuda);
+	// order important, always parse gpudirect
+	is_cuda_enabled = psmi_parse_gpudirect() || env_enable_cuda.e_int;
+
+	if (PSMI_IS_CUDA_ENABLED) {
+		err = psmi_cuda_initialize();
+		if (err != PSM2_OK)
+			goto fail;
+	}
+#endif
+
+update:
+	if (psmi_parse_identify()) {
+                Dl_info info_psm;
+		char ofed_delta[100] = "";
+		strcat(strcat(ofed_delta," built for IEFS "),psmi_hfi_IFS_version);
+                printf("%s %s PSM3 v%d.%d%s%s\n"
+		       "%s %s location %s\n"
+		       "%s %s build date %s\n"
+		       "%s %s src checksum %s\n"
+                       "%s %s git checksum %s\n"
+#ifdef RNDV_MOD
+#ifdef NVIDIA_GPU_DIRECT
+                       "%s %s built against rv interface v%d.%d gpu v%d.%d cuda\n"
+#else
+                       "%s %s built against rv interface v%d.%d\n"
+#endif
+#endif
+                       "%s %s Global Rank %d (%d total) Local Rank %d (%d total)\n"
+                       "%s %s CPU Core %d NUMA %d\n",
+		       hfi_get_mylabel(), hfi_ident_tag,
+				PSM2_VERNO_MAJOR,PSM2_VERNO_MINOR,
+#ifdef PSM_CUDA
+				"-cuda",
+#else
+				"",
+#endif
+				(strcmp(psmi_hfi_IFS_version,"") != 0) ? ofed_delta : "",
+		       hfi_get_mylabel(), hfi_ident_tag,
+				dladdr(psm2_init, &info_psm) ?
+					info_psm.dli_fname : "PSM3 path not available",
+		       hfi_get_mylabel(), hfi_ident_tag, psmi_hfi_build_timestamp,
+		       hfi_get_mylabel(), hfi_ident_tag, psmi_hfi_sources_checksum,
+		       hfi_get_mylabel(), hfi_ident_tag,
+				(strcmp(psmi_hfi_git_checksum,"") != 0) ?
+					psmi_hfi_git_checksum : "<not available>",
+#ifdef RNDV_MOD
+#ifdef NVIDIA_GPU_DIRECT
+		       hfi_get_mylabel(), hfi_ident_tag,
+				psm2_rv_get_user_major_bldtime_version(),
+				psm2_rv_get_user_minor_bldtime_version(),
+				psm2_rv_get_gpu_user_major_bldtime_version(),
+				psm2_rv_get_gpu_user_minor_bldtime_version(),
+#else
+		       hfi_get_mylabel(), hfi_ident_tag,
+				psm2_rv_get_user_major_bldtime_version(),
+				psm2_rv_get_user_minor_bldtime_version(),
+#endif
+#endif
+		       hfi_get_mylabel(), hfi_ident_tag,
+				hfi_get_myrank(), hfi_get_myrank_count(),
+				hfi_get_mylocalrank(),
+				hfi_get_mylocalrank_count(),
+		       hfi_get_mylabel(), hfi_ident_tag,
+				sched_getcpu(), psmi_get_current_proc_location()
+		       );
+	}
+
+	*major = (int)psmi_verno_major;
+	*minor = (int)psmi_verno_minor;
+fail:
+	_HFI_DBG("psmi_refcount=%d,err=%u\n", psmi_refcount, err);
+
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_init)
+
+static
+psm2_error_t psmi_get_psm2_config(psm2_mq_t     mq,
+				  psm2_epaddr_t epaddr,
+				  uint32_t *out)
+{
+	psm2_error_t rv = PSM2_INTERNAL_ERR;
+
+	*out = 0;
+	if (&mq->ep->ptl_ips == epaddr->ptlctl)
+	{
+		rv = PSM2_OK;
+		*out |= PSM2_INFO_QUERY_CONFIG_IPS;
+#ifdef PSM_CUDA
+		if (PSMI_IS_CUDA_ENABLED)
+		{
+			*out |= PSM2_INFO_QUERY_CONFIG_CUDA;
+			if (PSMI_IS_GDR_COPY_ENABLED)
+				*out |= PSM2_INFO_QUERY_CONFIG_GDR_COPY;
+		}
+#endif
+		*out |= PSM2_INFO_QUERY_CONFIG_PIO;
+	}
+	else if (&mq->ep->ptl_amsh == epaddr->ptlctl)
+	{
+		*out |= PSM2_INFO_QUERY_CONFIG_AMSH;
+		rv = PSM2_OK;
+	}
+	else if (&mq->ep->ptl_self == epaddr->ptlctl)
+	{
+		*out |= PSM2_INFO_QUERY_CONFIG_SELF;
+		rv = PSM2_OK;
+	}
+	return rv;
+}
+
+psm2_error_t __psm2_info_query(psm2_info_query_t q, void *out,
+			       size_t nargs, psm2_info_query_arg_t args[])
+{
+	static const size_t expected_arg_cnt[PSM2_INFO_QUERY_LAST] =
+	{
+		0, /* PSM2_INFO_QUERY_NUM_UNITS         */
+		0, /* PSM2_INFO_QUERY_NUM_PORTS         */
+		1, /* PSM2_INFO_QUERY_UNIT_STATUS       */
+		2, /* PSM2_INFO_QUERY_UNIT_PORT_STATUS  */
+		1, /* PSM2_INFO_QUERY_NUM_FREE_CONTEXTS */
+		1, /* PSM2_INFO_QUERY_NUM_CONTEXTS      */
+		2, /* PSM2_INFO_QUERY_CONFIG            */
+		3, /* PSM2_INFO_QUERY_THRESH            */
+		3, /* PSM2_INFO_QUERY_DEVICE_NAME       */
+		2, /* PSM2_INFO_QUERY_MTU               */
+		2, /* PSM2_INFO_QUERY_LINK_SPEED        */
+		1, /* PSM2_INFO_QUERY_NETWORK_TYPE      */
+		0, /* PSM2_INFO_QUERY_FEATURE_MASK      */
+		2, /* PSM2_INFO_QUERY_UNIT_NAME         */
+		2, /* PSM2_INFO_QUERY_UNIT_SYS_PATH     */
+	};
+	psm2_error_t rv = PSM2_INTERNAL_ERR;
+
+	if ((q < 0) ||
+	    (q >= PSM2_INFO_QUERY_LAST))
+		return PSM2_IQ_INVALID_QUERY;
+
+	if (nargs != expected_arg_cnt[q])
+		return PSM2_PARAM_ERR;
+
+	switch (q)
+	{
+	case PSM2_INFO_QUERY_NUM_UNITS:
+		*((uint32_t*)out) = psmi_hal_get_num_units_();
+		rv = PSM2_OK;
+		break;
+	case PSM2_INFO_QUERY_NUM_PORTS:
+		*((uint32_t*)out) = psmi_hal_get_num_ports_();
+		rv = PSM2_OK;
+		break;
+	case PSM2_INFO_QUERY_UNIT_STATUS:
+		*((uint32_t*)out) = psmi_hal_get_unit_active(args[0].unit);
+		rv = PSM2_OK;
+		break;
+	case PSM2_INFO_QUERY_UNIT_PORT_STATUS:
+		*((uint32_t*)out) = psmi_hal_get_port_active(args[0].unit,
+								args[1].port);
+		rv = PSM2_OK;
+		break;
+	case PSM2_INFO_QUERY_NUM_FREE_CONTEXTS:
+		*((uint32_t*)out) = psmi_hal_get_num_free_contexts(args[0].unit);
+		rv = PSM2_OK;
+		break;
+	case PSM2_INFO_QUERY_NUM_CONTEXTS:
+		*((uint32_t*)out) = psmi_hal_get_num_contexts(args[0].unit);
+		rv = PSM2_OK;
+		break;
+	case PSM2_INFO_QUERY_CONFIG:
+		{
+			psm2_mq_t     mq     = args[0].mq;
+			psm2_epaddr_t epaddr = args[1].epaddr;
+			rv = psmi_get_psm2_config(mq, epaddr, (uint32_t*)out);
+		}
+		break;
+	case PSM2_INFO_QUERY_THRESH:
+		{
+			psm2_mq_t                      mq     = args[0].mq;
+			psm2_epaddr_t                  epaddr = args[1].epaddr;
+			enum psm2_info_query_thresh_et iqt    = args[2].mstq;
+
+			uint32_t                       config;
+			rv = psmi_get_psm2_config(mq, epaddr, &config);
+			if (rv == PSM2_OK)
+			{
+				*((uint32_t*)out) = 0;
+				/* Delegate the call to the ptl member function: */
+				rv = epaddr->ptlctl->msg_size_thresh_query(iqt, (uint32_t*)out, mq, epaddr);
+			}
+		}
+		break;
+	case PSM2_INFO_QUERY_DEVICE_NAME:
+		{
+			char         *hfiName       = (char*)out;
+			psm2_mq_t     mq            = args[0].mq;
+			psm2_epaddr_t epaddr        = args[1].epaddr;
+			size_t        hfiNameLength = args[2].length;
+			uint32_t      config;
+
+			rv = psmi_get_psm2_config(mq, epaddr, &config);
+			if (rv == PSM2_OK)
+			{
+				if (snprintf(hfiName, hfiNameLength, "%s_%d",
+					     psmi_hal_get_hfi_name(),
+					     mq->ep->unit_id)
+				    < hfiNameLength)
+					rv = PSM2_OK;
+			}
+		}
+		break;
+	case PSM2_INFO_QUERY_MTU:
+		{
+			psm2_mq_t     mq     = args[0].mq;
+			psm2_epaddr_t epaddr = args[1].epaddr;
+			uint32_t      config;
+
+			rv = psmi_get_psm2_config(mq, epaddr, &config);
+			if (rv == PSM2_OK)
+			{
+				// TBD - should get ipsaddr to find pr_mtu negotiated
+				*((uint32_t*)out) = mq->ep->mtu;
+			}
+		}
+		break;
+	case PSM2_INFO_QUERY_LINK_SPEED:
+		{
+			psm2_mq_t     mq     = args[0].mq;
+			psm2_epaddr_t epaddr = args[1].epaddr;
+			uint32_t      config;
+
+			rv = psmi_get_psm2_config(mq, epaddr, &config);
+			if (rv == PSM2_OK)
+			{
+				*((uint32_t*)out) = psmi_hal_get_port_rate(mq->ep->unit_id,
+								       mq->ep->portnum);
+			}
+		}
+		break;
+	case PSM2_INFO_QUERY_NETWORK_TYPE:
+		{
+			char              *networkType      = (char*)out;
+			size_t            networkTypeLength = args[0].length;
+			const char *const intelopa          = "Intel(R) OPA";
+			if (networkTypeLength >= strlen(intelopa)+1)
+			{
+				strcpy(networkType,intelopa);
+				rv = PSM2_OK;
+			}
+		}
+		break;
+	case PSM2_INFO_QUERY_FEATURE_MASK:
+		{
+#ifdef PSM_CUDA
+		*((uint32_t*)out) = PSM2_INFO_QUERY_FEATURE_CUDA;
+#else
+		*((uint32_t*)out) = 0;
+#endif /* #ifdef PSM_CUDA */
+		}
+		rv = PSM2_OK;
+		break;
+	case PSM2_INFO_QUERY_UNIT_NAME:
+		{
+			char         *hfiName       = (char*)out;
+			uint32_t      unit          = args[0].unit;
+			size_t        hfiNameLength = args[1].length;
+			const char   *pathName      = sysfs_unit_path(unit);
+			char         *unitName      = NULL;
+
+			if (!pathName) break;
+
+			unitName = strrchr(sysfs_unit_path(unit),'/');
+			if (!unitName) break;
+
+			strncpy(hfiName, ++unitName, hfiNameLength);
+			hfiName[hfiNameLength-1] = '\0';
+			rv = PSM2_OK;
+		}
+		break;
+	case PSM2_INFO_QUERY_UNIT_SYS_PATH:
+		{
+			char         *hfiName       = (char*)out;
+			uint32_t      unit          = args[0].unit;
+			size_t        hfiNameLength = args[1].length;
+			const char   *pathName      = sysfs_unit_path(unit);
+
+			if (!pathName) break;
+
+			strncpy(hfiName, pathName, hfiNameLength);
+			hfiName[hfiNameLength-1] = '\0';
+			rv = PSM2_OK;
+		}
+		break;
+	default:
+		break;
+	}
+
+	return rv;
+}
+PSMI_API_DECL(psm2_info_query)
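+
+/* Hedged usage sketch (editorial addition): argument counts follow the
+ * expected_arg_cnt table above, e.g. PSM2_INFO_QUERY_NUM_UNITS takes no
+ * arguments while PSM2_INFO_QUERY_UNIT_STATUS takes a unit argument: */
+#if 0 /* hedged example only -- not compiled */
+static void example_query_units(void)
+{
+	uint32_t nunits = 0;
+	if (psm2_info_query(PSM2_INFO_QUERY_NUM_UNITS, &nunits, 0, NULL)
+	    == PSM2_OK && nunits > 0) {
+		psm2_info_query_arg_t arg;
+		uint32_t active = 0;
+		arg.unit = 0;	/* ask about the first unit */
+		(void)psm2_info_query(PSM2_INFO_QUERY_UNIT_STATUS, &active,
+				      1, &arg);
+	}
+}
+#endif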
+
+uint64_t __psm2_get_capability_mask(uint64_t req_cap_mask)
+{
+	return (psm2_capabilities_bitset & req_cap_mask);
+}
+PSMI_API_DECL(psm2_get_capability_mask)
+
+psm2_error_t __psm2_finalize(void)
+{
+	struct psmi_eptab_iterator itor;
+	char *hostname;
+	psm2_ep_t ep;
+
+	PSM2_LOG_MSG("entering");
+
+	_HFI_DBG("psmi_refcount=%d\n", psmi_refcount);
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+	psmi_assert(psmi_refcount > 0);
+	psmi_refcount--;
+
+	if (psmi_refcount > 0) {
+		return PSM2_OK;
+	}
+
+	/* When PSM_PERF is enabled, the following line causes the
+	   instruction cycles gathered in the current run to be dumped
+	   to stderr. */
+	GENERIC_PERF_DUMP(stderr);
+	ep = psmi_opened_endpoint;
+	while (ep != NULL) {
+		psm2_ep_t saved_ep = ep->user_ep_next;
+		psm2_ep_close(ep, PSM2_EP_CLOSE_GRACEFUL,
+			     2 * PSMI_MIN_EP_CLOSE_TIMEOUT);
+		psmi_opened_endpoint = ep = saved_ep;
+	}
+
+#ifdef PSM_FI
+	psmi_faultinj_fini();
+#endif /* #ifdef PSM_FI */
+
+	/* De-allocate memory for any allocated space to store hostnames */
+	psmi_epid_itor_init(&itor, PSMI_EP_HOSTNAME);
+	while ((hostname = psmi_epid_itor_next(&itor)))
+		psmi_free(hostname);
+	psmi_epid_itor_fini(&itor);
+
+	psmi_epid_fini();
+
+	/* unmap shared mem object for affinity */
+	if (psmi_affinity_shared_file_opened) {
+		/*
+		 * Start critical section to decrement ref count and unlink
+		 * affinity shm file.
+		 */
+		psmi_sem_timedwait(sem_affinity_shm_rw, sem_affinity_shm_rw_name);
+
+		shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] -= 1;
+		if (shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] <= 0) {
+			_HFI_VDBG("Unlink shm file for NIC affinity as there are no more users\n");
+			shm_unlink(affinity_shm_name);
+		} else {
+			_HFI_VDBG("Number of affinity shared memory users left=%ld\n",
+				  shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION]);
+		}
+
+		msync(shared_affinity_ptr, AFFINITY_SHMEMSIZE, MS_SYNC);
+
+		/* End critical section */
+		psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name);
+
+		munmap(shared_affinity_ptr, AFFINITY_SHMEMSIZE);
+		shared_affinity_ptr = NULL;
+		psmi_free(affinity_shm_name);
+		affinity_shm_name = NULL;
+		psmi_affinity_shared_file_opened = 0;
+	}
+
+	if (psmi_affinity_semaphore_open) {
+		_HFI_VDBG("Closing and Unlinking Semaphore: %s.\n", sem_affinity_shm_rw_name);
+		sem_close(sem_affinity_shm_rw);
+		sem_affinity_shm_rw = NULL;
+		sem_unlink(sem_affinity_shm_rw_name);
+		psmi_free(sem_affinity_shm_rw_name);
+		sem_affinity_shm_rw_name = NULL;
+		psmi_affinity_semaphore_open = 0;
+	}
+
+	psmi_hal_finalize();
+#ifdef PSM_CUDA
+	if (PSMI_IS_CUDA_ENABLED)
+		psmi_stats_deregister_type(PSMI_STATSTYPE_CUDA, &is_cuda_enabled);
+#endif
+
+	psmi_refcount = PSMI_FINALIZED;
+	PSM2_LOG_MSG("leaving");
+	psmi_log_fini();
+
+	psmi_stats_finalize();
+
+	psmi_heapdebug_finalize();
+
+	return PSM2_OK;
+}
+PSMI_API_DECL(psm2_finalize)
+
+/*
+ * Function exposed in >= 1.05
+ */
+psm2_error_t
+__psm2_map_nid_hostname(int num, const uint64_t *nids, const char **hostnames)
+{
+	int i;
+	psm2_error_t err = PSM2_OK;
+
+	PSM2_LOG_MSG("entering");
+
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	if (nids == NULL || hostnames == NULL) {
+		err = PSM2_PARAM_ERR;
+		goto fail;
+	}
+
+	for (i = 0; i < num; i++) {
+		if ((err = psmi_epid_set_hostname(nids[i], hostnames[i], 1)))
+			break;
+	}
+
+fail:
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_map_nid_hostname)
+
+void __psm2_epaddr_setlabel(psm2_epaddr_t epaddr, char const *epaddr_label)
+{
+	PSM2_LOG_MSG("entering");
+	PSM2_LOG_MSG("leaving");
+	return;			/* ignore this function */
+}
+PSMI_API_DECL(psm2_epaddr_setlabel)
+
+void __psm2_epaddr_setctxt(psm2_epaddr_t epaddr, void *ctxt)
+{
+
+	/* Eventually deprecate this API to use set/get opt as this is unsafe. */
+	PSM2_LOG_MSG("entering");
+	psm2_setopt(PSM2_COMPONENT_CORE, (const void *)epaddr,
+		   PSM2_CORE_OPT_EP_CTXT, (const void *)ctxt, sizeof(void *));
+	PSM2_LOG_MSG("leaving");
+}
+PSMI_API_DECL(psm2_epaddr_setctxt)
+
+void *__psm2_epaddr_getctxt(psm2_epaddr_t epaddr)
+{
+	psm2_error_t err;
+	uint64_t optlen = sizeof(void *);
+	void *result = NULL;
+
+	PSM2_LOG_MSG("entering");
+	/* Eventually deprecate this API to use set/get opt as this is unsafe. */
+	err = psm2_getopt(PSM2_COMPONENT_CORE, (const void *)epaddr,
+			 PSM2_CORE_OPT_EP_CTXT, (void *)&result, &optlen);
+
+	PSM2_LOG_MSG("leaving");
+
+	if (err == PSM2_OK)
+		return result;
+	else
+		return NULL;
+}
+PSMI_API_DECL(psm2_epaddr_getctxt)
+
+psm2_error_t
+__psm2_setopt(psm2_component_t component, const void *component_obj,
+	     int optname, const void *optval, uint64_t optlen)
+{
+	psm2_error_t rv;
+	PSM2_LOG_MSG("entering");
+	switch (component) {
+	case PSM2_COMPONENT_CORE:
+		rv = psmi_core_setopt(component_obj, optname, optval, optlen);
+		PSM2_LOG_MSG("leaving");
+		return rv;
+		break;
+	case PSM2_COMPONENT_MQ:
+		/* Use the deprecated MQ set/get opt for now which does not use optlen */
+		rv = psm2_mq_setopt((psm2_mq_t) component_obj, optname, optval);
+		PSM2_LOG_MSG("leaving");
+		return rv;
+		break;
+	case PSM2_COMPONENT_AM:
+		/* Hand off to active messages */
+		rv = psmi_am_setopt(component_obj, optname, optval, optlen);
+		PSM2_LOG_MSG("leaving");
+		return rv;
+		break;
+	case PSM2_COMPONENT_IB:
+		/* Hand off to IPS ptl to set option */
+		rv = psmi_ptl_ips.setopt(component_obj, optname, optval,
+					   optlen);
+		PSM2_LOG_MSG("leaving");
+		return rv;
+		break;
+	}
+
+	/* Unrecognized/unknown component */
+	rv = psmi_handle_error(NULL, PSM2_PARAM_ERR, "Unknown component %u",
+				 component);
+	PSM2_LOG_MSG("leaving");
+	return rv;
+}
+PSMI_API_DECL(psm2_setopt);
+
+psm2_error_t
+__psm2_getopt(psm2_component_t component, const void *component_obj,
+	     int optname, void *optval, uint64_t *optlen)
+{
+	psm2_error_t rv;
+
+	PSM2_LOG_MSG("entering");
+	switch (component) {
+	case PSM2_COMPONENT_CORE:
+		rv = psmi_core_getopt(component_obj, optname, optval, optlen);
+		PSM2_LOG_MSG("leaving");
+		return rv;
+		break;
+	case PSM2_COMPONENT_MQ:
+		/* Use the deprecated MQ set/get opt for now which does not use optlen */
+		rv = psm2_mq_getopt((psm2_mq_t) component_obj, optname, optval);
+		PSM2_LOG_MSG("leaving");
+		return rv;
+		break;
+	case PSM2_COMPONENT_AM:
+		/* Hand off to active messages */
+		rv = psmi_am_getopt(component_obj, optname, optval, optlen);
+		PSM2_LOG_MSG("leaving");
+		return rv;
+		break;
+	case PSM2_COMPONENT_IB:
+		/* Hand off to IPS ptl to set option */
+		rv = psmi_ptl_ips.getopt(component_obj, optname, optval,
+					   optlen);
+		PSM2_LOG_MSG("leaving");
+		return rv;
+		break;
+	}
+
+	/* Unrecognized/unknown component */
+	rv = psmi_handle_error(NULL, PSM2_PARAM_ERR, "Unknown component %u",
+				 component);
+	PSM2_LOG_MSG("leaving");
+	return rv;
+}
+PSMI_API_DECL(psm2_getopt);
+
+psm2_error_t __psmi_poll_noop(ptl_t *ptl, int replyonly)
+{
+	PSM2_LOG_MSG("entering");
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK_NO_PROGRESS;
+}
+PSMI_API_DECL(psmi_poll_noop)
+
+psm2_error_t __psm2_poll(psm2_ep_t ep)
+{
+	psm2_error_t err1 = PSM2_OK, err2 = PSM2_OK;
+	psm2_ep_t tmp;
+
+	PSM2_LOG_MSG("entering");
+
+	PSMI_ASSERT_INITIALIZED();
+
+	PSMI_LOCK(ep->mq->progress_lock);
+
+	tmp = ep;
+	do {
+		err1 = ep->ptl_amsh.ep_poll(ep->ptl_amsh.ptl, 0);	/* poll reqs & reps */
+		if (err1 > PSM2_OK_NO_PROGRESS) {	/* some error unrelated to polling */
+			PSMI_UNLOCK(ep->mq->progress_lock);
+			PSM2_LOG_MSG("leaving");
+			return err1;
+		}
+
+		err2 = ep->ptl_ips.ep_poll(ep->ptl_ips.ptl, 0);	/* get into ips_do_work */
+		if (err2 > PSM2_OK_NO_PROGRESS) {	/* some error unrelated to polling */
+			PSMI_UNLOCK(ep->mq->progress_lock);
+			PSM2_LOG_MSG("leaving");
+			return err2;
+		}
+		ep = ep->mctxt_next;
+	} while (ep != tmp);
+
+	/* This is valid because:
+	 * PSM2_OK & PSM2_OK_NO_PROGRESS => PSM2_OK
+	 * PSM2_OK & PSM2_OK => PSM2_OK
+	 * PSM2_OK_NO_PROGRESS & PSM2_OK => PSM2_OK
+	 * PSM2_OK_NO_PROGRESS & PSM2_OK_NO_PROGRESS => PSM2_OK_NO_PROGRESS */
+	PSMI_UNLOCK(ep->mq->progress_lock);
+	PSM2_LOG_MSG("leaving");
+	return (err1 & err2);
+}
+PSMI_API_DECL(psm2_poll)
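+
+/* Hedged usage sketch (editorial addition): progress is not autonomous, so
+ * callers typically drive psm2_poll in a loop; PSM2_OK_NO_PROGRESS is a
+ * normal outcome, not an error. */
+#if 0 /* hedged example only -- not compiled */
+static void example_progress_loop(psm2_ep_t ep, const volatile int *done)
+{
+	while (!*done)
+		(void)psm2_poll(ep);
+}
+#endif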
+
+psm2_error_t __psmi_poll_internal(psm2_ep_t ep, int poll_amsh)
+{
+	psm2_error_t err1 = PSM2_OK_NO_PROGRESS;
+	psm2_error_t err2;
+	psm2_ep_t tmp;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_LOCK_ASSERT(ep->mq->progress_lock);
+
+	tmp = ep;
+	do {
+		if (poll_amsh) {
+			err1 = ep->ptl_amsh.ep_poll(ep->ptl_amsh.ptl, 0);	/* poll reqs & reps */
+			if (err1 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */
+				PSM2_LOG_MSG("leaving");
+				return err1;
+			}
+		}
+
+		err2 = ep->ptl_ips.ep_poll(ep->ptl_ips.ptl, 0);	/* get into ips_do_work */
+		if (err2 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */
+			PSM2_LOG_MSG("leaving");
+			return err2;
+		}
+
+		ep = ep->mctxt_next;
+	} while (ep != tmp);
+	PSM2_LOG_MSG("leaving");
+	return (err1 & err2);
+}
+PSMI_API_DECL(psmi_poll_internal)
+#ifdef PSM_PROFILE
+/* These functions each have weak symbols */
+void psmi_profile_block()
+{
+	;			/* empty for profiler */
+}
+
+void psmi_profile_unblock()
+{
+	;			/* empty for profiler */
+}
+
+void psmi_profile_reblock(int did_no_progress)
+{
+	;			/* empty for profiler */
+}
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/psm2.h b/deps/libfabric/prov/psm3/psm3/psm2.h
new file mode 100644
index 0000000000000000000000000000000000000000..c37ecb1265ea7b21892f47f0d9fafb12e2a7d9a6
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm2.h
@@ -0,0 +1,1778 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2017 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2017 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef PSM2_H
+#define PSM2_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include <uuid/uuid.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * @file psm2.h
+ * @page psm2_main PSM2 API
+ *
+ * @brief PSM2 OPA Messaging Library
+ *
+ * The PSM2 OPA Messaging API, or PSM2 API, is Intel's low-level
+ * user-level communications interface for the OPA family of products.
+ * PSM2 provides users with the mechanisms necessary to implement higher-level
+ * communications interfaces in parallel environments.
+ *
+ * Since PSM2 targets clusters of multicore processors, it internally implements
+ * two levels of communication: intra-node shared memory communication and
+ * inter-node OPA communication.  Both of these levels are encapsulated
+ * below the interface and the user is free to assume that intra-node and
+ * inter-node communication is transparently handled within PSM.
+ *
+ * @section compat Compatibility
+ *
+ * PSM2 can coexist with other QLogic/Pathscale software distributions, such as
+ * OpenIB/OpenFabrics, which allows applications to simultaneously target
+ * PSM-based and non PSM-based applications on a single node without changing
+ * any system-level configuration.  However, PSM2 does not support running
+ * PSM-based and non PSM-based communication within the same user process.
+ *
+ * Except where noted, PSM2 does not assume an SPMD (single program, multiple
+ * data) parallel model and extends to MPMD (multiple program, multiple data)
+ * environments in specific areas. However, PSM2 assumes the runtime environment
+ * to be homogeneous on all nodes in bit width (32-bit or 64-bit) and endianness
+ * (little or big) and will fail at startup if any of these assumptions do not
+ * hold.  For homogeneous systems PSM2 can run either in 32-bit or 64-bit
+ * environments.  Even though both environments should expect similar
+ * performance from the API, PSM2 has chosen to favor 64-bit environments in
+ * some minor areas.
+ *
+ * @section ep_model Endpoint Communication Model
+ *
+ * PSM2 follows an endpoint communication model where an endpoint is defined as
+ * an object (or handle) instantiated to support sending and receiving messages
+ * to other endpoints.  In order to prevent PSM2 from being tied to a particular
+ * parallel model (such as SPMD), control over the parallel layout of endpoints
+ * is retained by the user.  Opening endpoints (@ref psm2_ep_open) and
+ * connecting endpoints to enable communication (@ref psm2_ep_connect) are two
+ * decoupled mechanisms.  Users that do not dynamically change the number of
+ * endpoints beyond parallel startup will probably lump both mechanisms
+ * together at startup.  Users that wish to manipulate the location and number
+ * of endpoints at runtime can do so by explicitly connecting sets or subsets
+ * of endpoints.
+ *
+ * As a side effect, this greater flexibility forces the user to cope with a
+ * two-stage initialization process.  In the first stage of opening an endpoint
+ * (@ref psm2_ep_open), a user obtains an opaque handle to the endpoint and a
+ * globally distributable endpoint identifier (@ref psm2_epid_t).  Prior to the
+ * second stage of connecting endpoints (@ref psm2_ep_connect), a user must
+ * distribute all relevant endpoint identifiers through an out-of-band
+ * mechanism.  Once the endpoint identifiers are successfully distributed to
+ * all processes that wish to communicate, the user
+ * connects all endpoint identifiers to the locally opened endpoint
+ * (@ref psm2_ep_connect).  In connecting the endpoints, the user obtains an
+ * opaque endpoint address (@ref psm2_epaddr_t), which is required for all PSM
+ * communication primitives.
+ *
+ *
+ * @section components PSM2 Components
+ *
+ * PSM2 exposes a single endpoint initialization model, but enables various
+ * levels of communication functionality and semantics through @e components.
+ * The first major component available in PSM2 is PSM2 Matched Queues
+ * (@ref psm2_mq), and the second is PSM2 Active Message (@ref psm2_am).
+ *
+ * Matched Queues (MQ) present a queue-based communication model with the
+ * distinction that queue consumers use a 3-tuple of metadata to match incoming
+ * messages against a list of preposted receive buffers.  The MQ semantics are
+ * sufficiently akin to MPI to cover the entire MPI-1.2 standard.
+ *
+ * The Active Message (AM) component presents a request/reply model where
+ * the arrival of a message triggers the execution of consumer-provided
+ * handler code. This can be used to implement many one-sided and two-sided
+ * communications paradigms.
+ *
+ * With future releases of the PSM2 interface, more components will
+ * be exposed to accommodate users that implement parallel communication
+ * models that deviate from the Matched Queue semantics.  For example, PSM
+ * plans to expose a connection management component to make it easier to
+ * handle endpoint management for clients without their own connection
+ * managers.
+ *
+ *
+ * @section progress PSM2 Communication Progress Guarantees
+ *
+ * PSM2 internally ensures progress of both intra-node and inter-node messages,
+ * but not autonomously.  This means that while performance does not depend
+ * greatly on how the user decides to schedule communication progress,
+ * explicit progress calls are required for correctness.  The @ref psm2_poll
+ * function is available to make progress over all PSM2 components in a generic
+ * manner.  For more information on making progress over many communication
+ * operations in the MQ component, see the @ref mq_progress documentation.
+ *
+ *
+ * @section completion PSM2 Completion semantics
+ *
+ * PSM2 implements the MQ component, which documents its own
+ * message completion semantics (@ref mq_completion).
+ *
+ *
+ * @section error_handling PSM2 Error handling
+ *
+ * PSM2 exposes a list of user and runtime errors enumerated in @ref psm2_error.
+ * While most errors are fatal in that the user is not expected to be able to
+ * recover from them, PSM2 still allows some level of control.  By
+ * default, PSM2 returns all errors to the user but as a convenience, allows
+ * users to either defer errors internally to PSM2 or to have PSM2 return all
+ * errors to the user (callers to PSM2 functions).  PSM2 attempts to deallocate
+ * its resources as a best effort, but exits are always non-collective with
+ * respect to endpoints opened in other processes.  The user is expected to be
+ * able to handle non-collective exits from any endpoint and in turn cleanly
+ * and independently terminate the parallel environment.
+ *
+ * Errors and error handling can be individually registered either globally or
+ * per-endpoint:
+ * @li @b Per-endpoint error handling captures errors for functions where the
+ * error scoping is determined to be over an endpoint.  This includes all
+ * communication functions that include an EP or MQ handle as the first
+ * parameter.
+ *
+ * @li @b Global error handling captures errors for functions where a
+ * particular endpoint cannot be identified or for @ref psm2_ep_open, where
+ * errors (if any) occur before the endpoint is opened.
+ *
+ * Error handling is controlled by registering error handlers (@ref
+ * psm2_error_register_handler).  The global error handler can
+ * be set at any time (even before @ref psm2_init), whereas a per-endpoint error
+ * handler can be set as soon as a new endpoint is successfully created.  If a
+ * per-endpoint handle is not registered, the per-endpoint handler inherits
+ * from the global error handler at time of open.
+ *
+ * PSM2 predefines two different mechanisms for handling errors:
+ *
+ * @li PSM-internal error handler (@ref PSM2_ERRHANDLER_PSM_HANDLER)
+ * @li No-op PSM2 error handler where errors are returned
+ *     (@ref PSM2_ERRHANDLER_NO_HANDLER)
+ *
+ * The default PSM-internal error handler effectively frees the user from
+ * explicitly handling the return values of every PSM2 function but may not
+ * return to the user in a function determined to have caused a fatal error.
+ *
+ * The No-op PSM2 error handler bypasses all error handling functionality and
+ * always returns the error to the user.  The user can then use @ref
+ * psm2_error_get_string to obtain a generic string from an error code (compared
+ * to a more detailed error message available through registering of error
+ * handlers).
+ *
+ * For even more control, users can register their own error handlers to have
+ * access to more precise error strings and to selectively control when and
+ * when not to return to callers of PSM2 functions.  All error handlers shown
+ * defer error handling to PSM2, via @ref psm2_error_defer, for errors they
+ * do not recognize.  Deferring an error from a custom error handler is
+ * equivalent to relying on the default error handler.
+ *
+ * @section env_var Environment variables
+ *
+ * Some PSM2 behaviour can be controlled via environment variables.
+ *
+ * @li @b PSM3_DEVICES. PSM2 implements three devices for communication which
+ * are, in order,  @c self, @c shm and @c hfi.  For PSM2 jobs that do not
+ * require shared-memory communications, @b PSM3_DEVICES can be specified as @c
+ * self, @c hfi.  Similarly, for shared-memory only jobs, the @c hfi device
+ * can be disabled.  It is up to the user to ensure that the endpoint ids
+ * passed in @ref psm2_ep_connect do not require a device that has been
+ * explicitly disabled by the user.  In some instances, enabling only the
+ * devices that are required may improve performance.
+ *
+ * @li @b PSM3_TRACEMASK. Depending on the value of the tracemask, various parts
+ * of PSM2 will output debugging information.  With a default value of @c 0x1,
+ * informative messages will be printed (this value should be considered a
+ * minimum).  At @c 0x101, startup and finalization messages are added to the
+ * output.  At @c 0x1c3, every communication event is logged and should hence
+ * be used for extreme debugging only.
+ *
+ * @li @b PSM3_MULTI_EP. By default, only one PSM2 endpoint may be opened in
+ * a process. With the correct setting of this environment variable, a process
+ * may open more than one PSM2 endpoint. In order to enable support for
+ * multiple endpoints per process, the value of this environment variable
+ * should be set to "1" or "yes".
+ *
+ * @section thr_sfty Thread safety and reentrancy
+ * Unless specifically noted otherwise, PSM2 functions should be considered
+ * neither thread safe nor reentrant.
+ */
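+
+/* Hedged illustration (editorial addition) of the environment variables
+ * described above; the exact spellings accepted may vary by release:
+ *
+ *   PSM3_DEVICES="self,shm"   # shared-memory-only job, 'hfi' disabled
+ *   PSM3_DEVICES="self,hfi"   # no intra-node shared-memory communication
+ *   PSM3_MULTI_EP=1           # allow more than one endpoint per process
+ */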
+
+/** @brief Local endpoint handle (opaque)
+ *  @ingroup ep
+ *
+ * Handle returned to the user when a new local endpoint is created.  The
+ * handle is a local handle to be used in all communication functions and is
+ * not intended to globally identify the opened endpoint in any way.
+ *
+ * All open endpoint handles can be globally identified using the endpoint id
+ * integral type (@ref psm2_epid_t) and all communication must use an endpoint
+ * address (@ref psm2_epaddr_t) that can be obtained by connecting a local
+ * endpoint to one or more endpoint identifiers.
+ *
+ * @remark The local endpoint handle is opaque to the user.  */
+typedef struct psm2_ep *psm2_ep_t;
+
+/** @brief MQ handle (opaque)
+ * @ingroup mq
+ *
+ * Handle returned to the user when a new Matched queue is created (@ref
+ * psm2_mq_init).  */
+typedef struct psm2_mq *psm2_mq_t;
+
+/*! @defgroup init PSM2 Initialization and Maintenance
+ * @{
+ */
+#define PSM2_VERNO       0x0300	/*!< Header-defined Version number */
+#define PSM2_VERNO_MAJOR 0x03	/*!< Header-defined Major Version Number */
+#define PSM2_VERNO_MINOR 0x00	/*!< Header-defined Minor Version Number */
+#define PSM2_VERNO_COMPAT_MAJOR 0x02    /*!<Minimum Major Version Number for Compatibility */
+
+/*! @brief PSM2 Error type
+ */
+enum psm2_error {
+	/*! Interface-wide "ok", guaranteed to be 0. */
+	PSM2_OK = 0,
+	/*! No events progressed on @ref psm2_poll (not fatal) */
+	PSM2_OK_NO_PROGRESS = 1,
+	/*! Error in a function parameter */
+	PSM2_PARAM_ERR = 3,
+	/*! PSM2 ran out of memory */
+	PSM2_NO_MEMORY = 4,
+	/*! PSM2 has not been initialized by @ref psm2_init */
+	PSM2_INIT_NOT_INIT = 5,
+	/*! API version passed in @ref psm2_init is incompatible */
+	PSM2_INIT_BAD_API_VERSION = 6,
+	/*! PSM2 Could not set affinity */
+	PSM2_NO_AFFINITY = 7,
+	/*! PSM2 Unresolved internal error */
+	PSM2_INTERNAL_ERR = 8,
+	/*! PSM2 could not set up shared memory segment */
+	PSM2_SHMEM_SEGMENT_ERR = 9,
+	/*! PSM2 option is a read-only option */
+	PSM2_OPT_READONLY = 10,
+	/*! PSM2 operation timed out */
+	PSM2_TIMEOUT = 11,
+	/*! Too many endpoints */
+	PSM2_TOO_MANY_ENDPOINTS = 12,
+
+	/*! PSM2 is finalized */
+	PSM2_IS_FINALIZED = 13,
+
+	/*! Endpoint was closed */
+	PSM2_EP_WAS_CLOSED = 20,
+	/*! PSM2 Could not find an OPA Unit */
+	PSM2_EP_NO_DEVICE = 21,
+	/*! User passed a bad unit or port number */
+	PSM2_EP_UNIT_NOT_FOUND = 22,
+	/*! Failure in initializing endpoint */
+	PSM2_EP_DEVICE_FAILURE = 23,
+	/*! Error closing the endpoint */
+	PSM2_EP_CLOSE_TIMEOUT = 24,
+	/*! No free ports could be obtained */
+	PSM2_EP_NO_PORTS_AVAIL = 25,
+	/*! Could not detect network connectivity */
+	PSM2_EP_NO_NETWORK = 26,
+	/*! Invalid Unique job-wide UUID Key */
+	PSM2_EP_INVALID_UUID_KEY = 27,
+	/*! Internal out of resources */
+	PSM2_EP_NO_RESOURCES = 28,
+
+	/*! Endpoint connect status unknown (because of other failures or if
+	 * connect attempt timed out) */
+	PSM2_EPID_UNKNOWN = 40,
+	/*! Endpoint could not be reached by any PSM2 component */
+	PSM2_EPID_UNREACHABLE = 41,
+	/*! At least one of the connecting nodes was incompatible in endianness */
+	PSM2_EPID_INVALID_NODE = 43,
+	/*! At least one of the connecting nodes provided an invalid MTU */
+	PSM2_EPID_INVALID_MTU = 44,
+	/*! At least one of the connecting nodes provided a bad key */
+	PSM2_EPID_INVALID_UUID_KEY = 45,
+	/*! At least one of the connecting nodes is running an incompatible
+	 * PSM2 protocol version */
+	PSM2_EPID_INVALID_VERSION = 46,
+	/*! At least one node provided garbled information */
+	PSM2_EPID_INVALID_CONNECT = 47,
+	/*! EPID was already connected */
+	PSM2_EPID_ALREADY_CONNECTED = 48,
+	/*! EPID is duplicated, network connectivity problem */
+	PSM2_EPID_NETWORK_ERROR = 49,
+	/*! EPID incompatible partition keys */
+	PSM2_EPID_INVALID_PKEY = 50,
+	/*! Unable to resolve path for endpoint */
+	PSM2_EPID_PATH_RESOLUTION = 51,
+	/*! Unable to connect rv QP */
+	PSM2_EPID_RV_CONNECT_ERROR = 52,
+	/*! Recovering rv QP connection */
+	PSM2_EPID_RV_CONNECT_RECOVERING = 53,
+
+	/*! MQ Non-blocking request is incomplete */
+	PSM2_MQ_NO_COMPLETIONS = 60,
+	/*! MQ Message has been truncated at the receiver */
+	PSM2_MQ_TRUNCATION = 61,
+
+	/*! AM reply error */
+	PSM2_AM_INVALID_REPLY = 70,
+
+	/*! Info query invalid query error */
+	PSM2_IQ_INVALID_QUERY = 71,
+
+	/*! Reserved value to indicate highest enum value */
+	PSM2_ERROR_LAST = 80
+};
+
+/*! Backwards header compatibility for a confusing error return name */
+#define PSM2_MQ_INCOMPLETE PSM2_MQ_NO_COMPLETIONS
+
+/*! @see psm2_error */
+typedef enum psm2_error psm2_error_t;
+
+/*! @brief PSM2 Component type
+ */
+enum psm2_component {
+	/*! PSM2 core library */
+	PSM2_COMPONENT_CORE = 0,
+	/*! MQ component */
+	PSM2_COMPONENT_MQ = 1,
+	/*! AM component */
+	PSM2_COMPONENT_AM = 2,
+	/*! IB component */
+	PSM2_COMPONENT_IB = 3
+};
+
+/*! @see psm2_component */
+typedef enum psm2_component psm2_component_t;
+
+/*! @brief PSM2 Path resolution mechanism
+ */
+enum psm2_path_res {
+	/*! PSM2 no path resolution */
+	PSM2_PATH_RES_NONE = 0,
+	/*! Use OFED Plus for path resolution */
+	PSM2_PATH_RES_OPP = 1,
+	/*! Use OFED UMAD for path resolution */
+	PSM2_PATH_RES_UMAD = 2
+};
+
+/*! @see psm2_path_res */
+typedef enum psm2_path_res psm2_path_res_t;
+
+/** @brief Initialize PSM2 interface
+ *
+ * Call to initialize the PSM2 library for a desired API revision number.
+ *
+ * @param[in,out] api_verno_major As input a pointer to an integer that holds
+ *                                @ref PSM2_VERNO_MAJOR. As output, the pointer
+ *                                is updated with the major revision number of
+ *                                the loaded library.
+ * @param[in,out] api_verno_minor As input, a pointer to an integer that holds
+ *                                @ref PSM2_VERNO_MINOR.  As output, the pointer
+ *                                is updated with the minor revision number of
+ *                                the loaded library.
+ *
+ * @pre The user has not called any other PSM2 library call except @ref
+ *      psm2_error_register_handler to register a global error handler.
+ *
+ * @post Depending on the environment variable @ref PSM3_MULTI_EP being set and
+ * 	 its contents, support for opening multiple endpoints is either enabled
+ * 	 or disabled.
+ *
+ * @warning PSM2 initialization is a precondition for all functions used in the
+ *          PSM2 library.
+ *
+ * @returns PSM2_OK The PSM2 interface could be opened and the desired API
+ *                 revision can be provided.
+ * @returns PSM2_INIT_BAD_API_VERSION The PSM2 library cannot provide
+ *                                   compatibility with the desired API version.
+ *
+ * @code{.c}
+   	// In this example, we want to handle our own errors before doing init,
+   	// since we don't want a fatal error if OPA is not found.
+   	// Note that @ref psm2_error_register_handler
+   	// (and @ref psm2_get_capability_mask)
+   	// are the only functions that can be called before @ref psm2_init
+
+   	int try_to_initialize_psm() {
+   	    int verno_major = PSM2_VERNO_MAJOR;
+   	    int verno_minor = PSM2_VERNO_MINOR;
+
+   	    int err = psm2_error_register_handler(NULL,  // Global handler
+   	                                 PSM2_ERRHANDLER_NO_HANDLER); // return errors
+   	    if (err) {
+   	       fprintf(stderr, "Couldn't register global handler: %s\n",
+   	   	          psm2_error_get_string(err));
+   	       return -1;
+   	    }
+
+   	    err = psm2_init(&verno_major, &verno_minor);
+   	    if (err || verno_major > PSM2_VERNO_MAJOR) {
+   	       if (err)
+   	         fprintf(stderr, "PSM3 initialization failure: %s\n",
+   	                 psm2_error_get_string(err));
+   	       else
+   	         fprintf(stderr, "PSM3 loaded an unexpected/unsupported "
+   	                         "version (%d.%d)\n", verno_major, verno_minor);
+   	       return -1;
+   	    }
+
+   	    // We were able to initialize PSM2 but will defer all further error
+   	    // handling since most of the errors beyond this point will be fatal.
+   	    err = psm2_error_register_handler(NULL,  // Global handler
+   	                                          PSM2_ERRHANDLER_PSM_HANDLER);
+   	    if (err) {
+   	       fprintf(stderr, "Couldn't register global errhandler: %s\n",
+   	   	          psm2_error_get_string(err));
+   	       return -1;
+   	    }
+   	    return 1;
+   	}
+   @endcode
+ */
+psm2_error_t psm2_init(int *api_verno_major, int *api_verno_minor);
+
+/*! @brief PSM2 capabilities definitions
+ *
+ * Each capability is defined as a separate bit,
+ * i.e. next capabilities must be defined as
+ * consecutive bits : 0x2, 0x4 ... and so on.
+ */
+#define PSM2_MULTI_EP_CAP 0x1	/* Multiple Endpoints capability */
+#define PSM2_LIB_REFCOUNT_CAP 0x2	/* Library finalization is managed with reference count */
+
+/** @brief PSM2 capabilities provider
+ *
+ * @param[in] req_cap_mask Requested capabilities are given as bit field.
+ *
+ * @returns internal capabilities bit field ANDed with a requested bit mask */
+uint64_t psm2_get_capability_mask(uint64_t req_cap_mask);
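+
+/* A minimal sketch of a capability check; like @ref psm2_error_register_handler,
+ * this call may be made before @ref psm2_init:
+ *
+ * @code{.c}
+   	if (psm2_get_capability_mask(PSM2_MULTI_EP_CAP) & PSM2_MULTI_EP_CAP) {
+   	    // This library build supports opening multiple endpoints.
+   	}
+   @endcode
+ */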
+
+/** @brief Finalize PSM2 interface
+ *
+ * Single call to finalize PSM2 and close all unclosed endpoints
+ *
+ * @post The user guarantees not to make any further PSM2 calls, including @ref
+ * psm2_init.
+ *
+ * @returns PSM2_OK Always returns @c PSM2_OK */
+psm2_error_t psm2_finalize(void);
+
+/** @brief Error handling opaque token
+ *
+ * A token is required for users that register their own handlers and wish to
+ * defer further error handling to PSM. */
+typedef struct psm2_error_token *psm2_error_token_t;
+
+/** @brief Error handling function
+ *
+ * Users can handle errors explicitly instead of relying on PSM's own error
+ * handler.  There is one global error handler and error handlers that can be
+ * individually set for each opened endpoint.  By default, endpoints will
+ * inherit the global handler registered at the time of open.
+ *
+ * @param[in] ep Handle associated to the endpoint over which the error occurred
+ *               or @c NULL if the error is being handled by the global error
+ *               handler.
+ * @param[in] error PSM2 error identifier
+ * @param[in] error_string A descriptive error string of maximum length @ref
+ *                         PSM2_ERRSTRING_MAXLEN.
+ * @param[in] token Opaque PSM2 token associated with the particular event that
+ *		    generated the error.  The token can be used to extract the
+ *		    error string and can be passed to @ref psm2_error_defer to
+ *		    defer any remaining or unhandled error handling to PSM.
+ *
+ * @post If the error handler returns, the error returned is propagated to the
+ *       caller.  */
+typedef psm2_error_t(*psm2_ep_errhandler_t) (psm2_ep_t ep,
+					   const psm2_error_t error,
+					   const char *error_string,
+					   psm2_error_token_t token);
+
+#define PSM2_ERRHANDLER_DEFAULT	((psm2_ep_errhandler_t)-1)
+/**< Obsolete names, only here for backwards compatibility */
+#define PSM2_ERRHANDLER_NOP	((psm2_ep_errhandler_t)-2)
+/**< Obsolete names, only here for backwards compatibility */
+
+#define PSM2_ERRHANDLER_PSM_HANDLER  ((psm2_ep_errhandler_t)-1)
+/**< PSM2 error handler as explained in @ref error_handling */
+
+#define PSM2_ERRHANDLER_NO_HANDLER   ((psm2_ep_errhandler_t)-2)
+/**< Bypasses the default PSM2 error handler and returns all errors to the user
+ * (this is the default) */
+
+#define PSM2_ERRSTRING_MAXLEN	512 /**< Maximum error string length. */
+
+/** @brief PSM2 error handler registration
+ *
+ * Function to register error handlers on a global basis and on a per-endpoint
+ * basis.  PSM2_ERRHANDLER_PSM_HANDLER and PSM2_ERRHANDLER_NO_HANDLER are special
+ * pre-defined handlers to respectively enable use of the default PSM-internal
+ * handler or the no-handler that disables registered error handling and
+ * returns all errors to the caller (both are documented in @ref
+ * error_handling).
+ *
+ * @param[in] ep Handle of the endpoint over which the error handler should be
+ *               registered.  With ep set to @c NULL, the behavior of the
+ *               global error handler can be controlled.
+ * @param[in] errhandler Handler to register.  Can be a user-specific error
+ *                       handling function or PSM2_ERRHANDLER_PSM_HANDLER or
+ *                       PSM2_ERRHANDLER_NO_HANDLER.
+ *
+ * @remark When ep is set to @c NULL, this is the only function that can be
+ * called before @ref psm2_init
+ */
+psm2_error_t
+psm2_error_register_handler(psm2_ep_t ep, const psm2_ep_errhandler_t errhandler);
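+
+/* A minimal sketch of a user-defined handler matching @ref psm2_ep_errhandler_t;
+ * the handler name and the logging it performs are illustrative only:
+ *
+ * @code{.c}
+   	static psm2_error_t my_errhandler(psm2_ep_t ep, const psm2_error_t error,
+   	                                  const char *error_string,
+   	                                  psm2_error_token_t token)
+   	{
+   	    (void)ep; // NULL here when invoked as the global handler
+   	    fprintf(stderr, "PSM2 error %d: %s\n", (int)error, error_string);
+   	    // Defer anything we do not resolve ourselves back to PSM2.
+   	    return psm2_error_defer(token);
+   	}
+
+   	// Register it as the global handler (ep == NULL).
+   	psm2_error_register_handler(NULL, my_errhandler);
+   @endcode
+ */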
+
+/** @brief PSM2 deferred error handler
+ *
+ * Function to handle fatal PSM2 errors if no error handler is installed or if
+ * the user wishes to defer further error handling to PSM.  Depending on the
+ * type of error, PSM2 may or may not return from the function call.
+ *
+ * @param[in] err_token Error token initially passed to error handler
+ *
+ * @pre The user is calling into the function because it has decided that PSM
+ *      should handle an error case.
+ *
+ * @post The function may or may not return depending on the error
+ */
+psm2_error_t psm2_error_defer(psm2_error_token_t err_token);
+
+/** @brief Get generic error string from error
+ *
+ * Function to return the default error string associated to a PSM2 error.
+ *
+ * While a more detailed and precise error string is usually available within
+ * error handlers, this function is available to obtain an error string out of
+ * an error handler context or when a no-op error handler is registered.
+ *
+ * @param[in] error PSM2 error
+ */
+const char *psm2_error_get_string(psm2_error_t error);
+
+/** @brief Option key/pair structure
+ *
+ * Currently only used in MQ.
+ */
+struct psm2_optkey {
+	uint32_t key;	/**< Option key */
+	void *value;	/**< Option value */
+};
+
+/*! @} */
+
+/*! @defgroup ep PSM2 Device Endpoint Management
+ * @{
+ */
+
+/** @brief Endpoint ID
+ *
+ * Integral type of size 8 bytes that can be used by the user to globally
+ * identify a successfully opened endpoint.  Although the contents of the
+ * endpoint id integral type remains opaque to the user, the unique network id
+ * and OPA context number can be extracted using @ref psm2_epid_nid and @ref
+ * psm2_epid_context.
+ */
+typedef uint64_t psm2_epid_t;
+
+/** @brief Endpoint Address (opaque)
+ *
+ * Remote endpoint addresses are created when the user binds an endpoint ID
+ * to a particular endpoint handle using @ref psm2_ep_connect.  A given endpoint
+ * address is only guaranteed to be valid over a single endpoint.
+ */
+typedef struct psm2_epaddr *psm2_epaddr_t;
+
+/** @brief PSM2 Unique UID
+ *
+ * PSM2 type equivalent to the DCE-1 uuid_t, used to uniquely identify an
+ * endpoint within a particular job.  Since PSM2 does not participate in job
+ * allocation and management, users are expected to generate a unique ID to
+ * associate endpoints to a particular parallel or collective job.
+ */
+typedef uuid_t psm2_uuid_t;
+
+/** @brief Get Endpoint identifier's Unique Network ID */
+uint64_t psm2_epid_nid(psm2_epid_t epid);
+
+/** @brief Get Endpoint identifier's OPA context number */
+uint64_t psm2_epid_context(psm2_epid_t epid);
+
+/** @brief Get Endpoint identifier's OPA port (deprecated, use
+ * @ref psm2_epid_context instead) */
+uint64_t psm2_epid_port(psm2_epid_t epid);
+
+/** @brief List the number of available OPA units
+ *
+ * Function used to determine the number of locally available OPA units.
+ * For @c N units, valid unit numbers in @ref psm2_ep_open are @c 0 to @c N-1.
+ *
+ * @returns PSM2_OK unless the user has not called @ref psm2_init
+ */
+psm2_error_t psm2_ep_num_devunits(uint32_t *num_units);
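+
+/* A minimal sketch, assuming @ref psm2_init has already succeeded:
+ *
+ * @code{.c}
+   	uint32_t num_units = 0;
+   	if (psm2_ep_num_devunits(&num_units) == PSM2_OK)
+   	    printf("%u locally available OPA unit(s)\n", num_units);
+   @endcode
+ */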
+
+/* Affinity modes for the affinity member of struct psm2_ep_open_opts */
+#define PSM2_EP_OPEN_AFFINITY_SKIP     0	/**< Disable setting affinity */
+#define PSM2_EP_OPEN_AFFINITY_SET      1	/**< Enable setting affinity unless
+					  already set */
+#define PSM2_EP_OPEN_AFFINITY_FORCE    2	/**< Enable setting affinity regardless
+					  of current affinity setting */
+
+/* Default values for some constants */
+#define PSM2_EP_OPEN_PKEY_DEFAULT    0xffffffffffffffffULL
+				    /**< Default protection key */
+
+/** @brief Endpoint Open Options
+ *
+ * These options are available for opening a PSM2 endpoint.  Each is
+ * individually documented and setting each option to -1 or passing NULL as the
+ * options parameter in @ref psm2_ep_open instructs PSM2 to use
+ * implementation-defined defaults.
+ *
+ * Each option is documented in @ref psm2_ep_open
+ */
+struct psm2_ep_open_opts {
+	int64_t timeout;	/**< timeout in nanoseconds to open device */
+	int unit;		/**< OPA Unit ID to open on */
+	int affinity;		/**< How PSM2 should set affinity */
+	int shm_mbytes;	/**< Megabytes used for intra-node, deprecated */
+	int sendbufs_num;	/**< Preallocated send buffers */
+	uint64_t network_pkey;	/**< Network Protection Key (v1.01) */
+	int port;		/**< IB port to use (1 to N) */
+	int outsl;		/**< IB SL to use when sending pkts */
+	uint64_t service_id;	/* IB Service ID to use for endpoint */
+	psm2_path_res_t path_res_type;	/* Path resolution type */
+	int senddesc_num;	/* Preallocated send descriptors */
+	int imm_size;		/* Immediate data size for endpoint */
+};
+
+/** @brief OPA endpoint creation
+ *
+ * Function used to create a new local communication endpoint on an OPA
+ * adapter.  The returned endpoint handle is required in all PSM2 communication
+ * operations, as PSM2 can manage communication over multiple endpoints.  An
+ * opened endpoint has no global context until the user connects the endpoint
+ * to other global endpoints by way of @ref psm2_ep_connect.  All local endpoint
+ * handles are globally identified by endpoint IDs (@ref psm2_epid_t) which are
+ * also returned when an endpoint is opened.  It is assumed that the user can
+ * provide an out-of-band mechanism to distribute the endpoint IDs in order to
+ * establish connections between endpoints (@ref psm2_ep_connect for more
+ * information).
+ *
+ * @param[in] unique_job_key Endpoint key, to uniquely identify the endpoint in
+ *                           a parallel job.  It is up to the user to ensure
+ *                           that the key is globally unique over a period long
+ *                           enough to prevent duplicate keys over the same set
+ *                           of endpoints (see comments below).
+ *
+ * @param[in] opts Open options of type @ref psm2_ep_open_opts
+ *                 (see @ref psm2_ep_open_opts_get_defaults).
+ *
+ * @param[out] ep User-supplied storage to return a pointer to the newly
+ *                created endpoint.  The returned pointer of type @ref psm2_ep_t
+ *                is a local handle and cannot be used to globally identify the
+ *                endpoint.
+ * @param[out] epid User-supplied storage to return the endpoint ID associated
+ *                  to the newly created local endpoint returned in the @c ep
+ *                  handle.  The endpoint ID is an integral type suitable for
+ *                  uniquely identifying the local endpoint.
+ *
+ * PSM2 does not internally verify the consistency of the uuid, it is up to the
+ * user to ensure that the uid is unique enough not to collide with other
+ * currently-running jobs.  Users can employ two mechanisms to obtain a uuid.
+ *
+ * 1. Use an OS or library-specific uuid generation utility, that complies with
+ *    OSF DCE 1.1, such as @c uuid_generate on Linux or @c uuid_create on
+ *    FreeBSD.
+ *    (see http://www.opengroup.org/onlinepubs/009629399/uuid_create.htm)
+ *
+ * 2. Manually pack a 16-byte string using a utility such as /dev/random or
+ *    other source with enough entropy and proper seeding to prevent two nodes
+ *    from generating the same uuid_t.
+ *
+ * The following options are relevant when opening an endpoint:
+ *   @li @c timeout establishes the number of nanoseconds to wait before
+ *                  failing to open a port (with -1, defaults to 15 secs).
+ *   @li @c unit sets the OPA unit number to use to open a port (with
+ *               -1, PSM2 determines the best unit to open the port).  If @c
+ *               PSM3_NIC is set in the environment, this setting is ignored.
+ *   @li @c affinity enables or disables PSM2 setting processor affinity.  The
+ *                   option can be controlled to either disable (@ref
+ *                   PSM2_EP_OPEN_AFFINITY_SKIP) or enable the affinity setting
+ *                   only if it is already unset (@ref
+ *                   PSM2_EP_OPEN_AFFINITY_SET) or regardless of affinity being
+ *                   set or not (@ref PSM2_EP_OPEN_AFFINITY_FORCE).
+ *                   If @c PSM3_NO_CPUAFFINITY is set in the environment, this
+ *                   setting is ignored.
+ *   @li @c shm_mbytes sets a maximum number of megabytes that can be allocated
+ *		       to each local endpoint ID connected through this
+ *		       endpoint (with -1, defaults to 10 MB).
+ *   @li @c sendbufs_num sets the number of send buffers that can be
+ *                       pre-allocated for communication (with -1, defaults to
+ *                       512 buffers of MTU size).
+ *   @li @c network_pkey sets the protection key to employ for point-to-point
+ *                       PSM2 communication.  Unless a specific value is used,
+ *                       this parameter should be set to
+ *                       PSM2_EP_OPEN_PKEY_DEFAULT.
+ *
+ * @warning By default, PSM2 limits the user to calling @ref psm2_ep_open only
+ * once per process and subsequent calls will fail. In order to enable creation
+ * of multiple endpoints per process, one must properly set the environment variable
+ * @ref PSM3_MULTI_EP before calling @ref psm2_init.
+ *
+ * @code{.c}
+    	// In order to open an endpoint and participate in a job, each endpoint has
+    	// to be distributed a unique 16-byte UUID key from an out-of-band source.
+    	// Presumably this can come from the parallel spawning utility either
+    	// indirectly through an implementor's own spawning interface or as in this
+    	// example, the UUID is set as a string in an environment variable
+    	// propagated to all endpoints in the job.
+
+    	int try_to_open_psm2_endpoint(psm2_ep_t *ep, // output endpoint handle
+    	                             psm2_epid_t *epid, // output endpoint identifier
+    	                             int unit,  // unit of our choice
+    	                             int port)  // port of our choice
+    	{
+    	   struct psm2_ep_open_opts epopts;
+    	   psm2_uuid_t job_uuid;
+    	   char *c;
+
+    	   // Let PSM2 assign its default values to the endpoint options.
+    	   psm2_ep_open_opts_get_defaults(&epopts);
+
+    	   // We want a stricter timeout and a specific unit
+    	   epopts.timeout = 15*1e9;  // 15 second timeout
+    	   epopts.unit = unit;	// We want a specific unit, -1 would let PSM
+    	                             // choose the unit for us.
+    	   epopts.port = port;	// We want a specific port, <= 0 would let PSM
+    	                             // choose the port for us.
+    	   // We've already set affinity, don't let PSM2 do so if it wants to.
+    	   if (epopts.affinity == PSM2_EP_OPEN_AFFINITY_SET)
+    	      epopts.affinity = PSM2_EP_OPEN_AFFINITY_SKIP;
+
+    	   // ENDPOINT_UUID is set to the same value in the environment of all the
+    	   // processes that wish to communicate over PSM2 and was generated by
+    	   // the process spawning utility
+    	   c = getenv("ENDPOINT_UUID");
+    	   if (c && *c)
+    	      implementor_string_to_16byte_packing(c, job_uuid);
+    	   else {
+    	      fprintf(stderr, "Can't find UUID for endpoint\n);
+    	      return -1;
+    	   }
+
+    	   // Assume we don't want to handle errors here.
+    	   psm2_ep_open(job_uuid, &epopts, ep, epid);
+    	   return 1;
+    	}
+   @endcode
+ */
+psm2_error_t
+psm2_ep_open(const psm2_uuid_t unique_job_key,
+	    const struct psm2_ep_open_opts *opts, psm2_ep_t *ep,
+	    psm2_epid_t *epid);
+
+/** @brief Endpoint open default options.
+ *
+ * Function used to initialize the set of endpoint options to their default
+ * values for use in @ref psm2_ep_open.
+ *
+ * @param[out] opts Endpoint Open options.
+ *
+ * @warning For portable operation, users should always call this function
+ * prior to calling @ref psm2_ep_open.
+ *
+ * @return PSM2_OK If result could be updated
+ * @return PSM2_INIT_NOT_INIT If psm has not been initialized.
+ */
+psm2_error_t
+psm2_ep_open_opts_get_defaults(struct psm2_ep_open_opts *opts);
+
+/** @brief Endpoint shared memory query
+ *
+ * Function used to determine if a remote endpoint shares memory with a
+ * currently opened local endpoint.
+ *
+ * @param[in] ep Endpoint handle
+ * @param[in] epid Endpoint ID
+ *
+ * @param[out] result Result is non-zero if the remote endpoint shares memory with the local
+ * endpoint @c ep, or zero otherwise.
+ *
+ * @return PSM2_OK If result could be updated
+ * @return PSM2_EPID_UNKNOWN If the epid is not recognized
+ */
+psm2_error_t
+psm2_ep_epid_share_memory(psm2_ep_t ep, psm2_epid_t epid, int *result);
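+
+/* A minimal sketch, assuming `ep' is an opened endpoint and `peer_epid'
+ * was obtained out-of-band:
+ *
+ * @code{.c}
+   	int same_node = 0;
+   	if (psm2_ep_epid_share_memory(ep, peer_epid, &same_node) == PSM2_OK
+   	    && same_node) {
+   	    // The peer shares memory with `ep', i.e. it is reachable intra-node.
+   	}
+   @endcode
+ */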
+
+/** @brief Close endpoint
+ * @param[in] ep PSM2 endpoint handle
+ * @param[in] mode One of @ref PSM2_EP_CLOSE_GRACEFUL or @ref PSM2_EP_CLOSE_FORCE
+ * @param[in] timeout How long to wait in nanoseconds if mode is
+ *			PSM2_EP_CLOSE_GRACEFUL, 0 waits forever.  If @c mode is
+ *			@ref PSM2_EP_CLOSE_FORCE, this parameter is ignored.
+ *
+ * The following errors are returned, others are handled by the per-endpoint
+ * error handler:
+ *
+ * @return PSM2_OK  Endpoint was successfully closed without force or
+ *                 successfully closed with force within the supplied timeout.
+ * @return PSM2_EP_CLOSE_TIMEOUT Endpoint could not be successfully closed
+ *                              within timeout.
+ */
+psm2_error_t psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout);
+
+#define PSM2_EP_CLOSE_GRACEFUL	0	/**< Graceful mode in @ref psm2_ep_close */
+#define PSM2_EP_CLOSE_FORCE	1	/**< Forceful mode in @ref psm2_ep_close */
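+
+/* A minimal sketch: try a graceful close with a 5 second grace period
+ * (timeout is in nanoseconds), then force the close if that times out:
+ *
+ * @code{.c}
+   	if (psm2_ep_close(ep, PSM2_EP_CLOSE_GRACEFUL, 5LL * 1000000000LL)
+   	        == PSM2_EP_CLOSE_TIMEOUT)
+   	    psm2_ep_close(ep, PSM2_EP_CLOSE_FORCE, 0); // timeout ignored here
+   @endcode
+ */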
+
+/** @brief Provide mappings for network id to hostname
+ *
+ * Since PSM2 does not assume or rely on the availability of an external
+ * networkid-to-hostname mapping service, users can provide one or more of
+ * these mappings.  The @ref psm2_map_nid_hostname function allows a list of
+ * network ids to be associated to hostnames.
+ *
+ * This function is not mandatory for correct operation but may allow PSM2 to
+ * provide better diagnostics when remote endpoints are unavailable and can
+ * otherwise only be identified by their network id.
+ *
+ * @param[in] num Number of elements in the @c nids and @c hostnames arrays
+ * @param[in] nids User-provided array of network ids (i.e. OPA LIDs),
+ *                 should be obtained by calling @ref psm2_epid_nid on each
+ *                 epid.
+ * @param[in] hostnames User-provided array of hostnames (array of
+ *                      NUL-terminated strings) where each hostname index
+ *                      maps to the provided nid hostname.
+ *
+ * @warning Duplicate nids may be provided in the input @c nids array, only
+ *          the first corresponding hostname will be remembered.
+ *
+ * @pre The user may or may not have already provided hostname mappings.
+ * @post The user may free any dynamically allocated memory passed to the
+ *       function.
+ *
+ */
+psm2_error_t
+psm2_map_nid_hostname(int num, const uint64_t *nids, const char **hostnames);
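+
+/* A minimal sketch, assuming `epid_a'/`epid_b' were obtained out-of-band and
+ * the hostnames are known to the application:
+ *
+ * @code{.c}
+   	uint64_t nids[2] = { psm2_epid_nid(epid_a), psm2_epid_nid(epid_b) };
+   	const char *hosts[2] = { "node-a", "node-b" };
+   	psm2_map_nid_hostname(2, nids, hosts);
+   	// Per the @post above, the arrays may be freed once the call returns.
+   @endcode
+ */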
+
+/** @brief Connect one or more remote endpoints to a local endpoint
+ *
+ * Function to non-collectively establish a connection to a set of endpoint IDs
+ * and translate endpoint IDs into endpoint addresses.  Establishing a remote
+ * connection with a set of remote endpoint IDs does not imply a collective
+ * operation and the user is free to connect unequal sets on each process.
+ * Similarly, a given endpoint address does not imply that a pairwise
+ * communication context exists between the local endpoint and remote endpoint.
+ *
+ * @param[in] ep PSM2 endpoint handle
+ *
+ * @param[in] num_of_epid The number of endpoints to connect to, which
+ *                        also establishes the number of elements contained in
+ *                        all of the function's array-based parameters.
+ *
+ * @param[in] array_of_epid User-allocated array that contains @c num_of_epid
+ *                          valid endpoint identifiers.  Each endpoint id (or
+ *                          epid) has been obtained through an out-of-band
+ *                          mechanism and each endpoint must have been opened
+ *                          with the same uuid key.
+ *
+ * @param[in] array_of_epid_mask User-allocated array that contains
+ *                          @c num_of_epid integers.  This array of masks
+ *                          allows users to select which of the epids in @c
+ *                          array_of_epid should be connected.  If the integer
+ *                          at index i is zero, psm does not attempt to connect
+ *                          to the epid at index i in @c array_of_epid.  If
+ *                          this parameter is NULL, psm will try to connect to
+ *                          each epid.
+ *
+ * @param[out] array_of_errors User-allocated array of at least @c num_of_epid
+ *                             elements. If the function does not return
+ *                             PSM2_OK, this array can be consulted for each
+ *                             endpoint not masked off by @c array_of_epid_mask
+ *                             to know why the endpoint could not be connected.
+ *                             Endpoints that could not be connected because of
+ *                             an unrelated failure will be marked as @ref
+ *                             PSM2_EPID_UNKNOWN.  If the function returns
+ *                             PSM2_OK, the errors for all endpoints will also
+ *                             contain PSM2_OK.
+ *
+ * @param[out] array_of_epaddr User-allocated array of at least @c num_of_epid
+ *                             elements of type psm2_epaddr_t.  Each
+ *                             successfully connected endpoint is updated with
+ *                             an endpoint address handle that corresponds to
+ *                             the endpoint id at the same index in @c
+ *                             array_of_epid.  Handles are only updated if the
+ *                             endpoint could be connected and if its error in
+ *                             array_of_errors is PSM2_OK.
+ *
+ * @param[in] timeout Timeout in nanoseconds after which connection attempts
+ *                    will be abandoned.  Setting this value to 0 disables
+ *                    timeout and waits until all endpoints have been
+ *                    successfully connected or until an error is detected.
+ *
+ * @pre The user has opened a local endpoint and obtained a list of endpoint
+ *      IDs to connect to a given endpoint handle using an out-of-band
+ *      mechanism not provided by PSM.
+ *
+ * @post If the connect is successful, @c array_of_epaddr is updated with valid
+ *       endpoint addresses.
+ *
+ * @post If unsuccessful, the user can query the return status of each
+ *       individual remote endpoint in @c array_of_errors.
+ *
+ * @post The user can call into @ref psm2_ep_connect many times with the same
+ *       endpoint ID and the function is guaranteed to return the same output
+ *       parameters.
+ *
+ * @post PSM2 does not keep any reference to the arrays passed into the
+ *       function and the caller is free to deallocate them.
+ *
+ * The error value with the highest importance is returned by
+ * the function if some portion of the communication failed.  Users should
+ * always refer to individual errors in @c array_of_errors whenever the
+ * function cannot return PSM2_OK.
+ *
+ * @returns PSM2_OK  The entire set of endpoint IDs were successfully connected
+ *                  and endpoint addresses are available for all endpoint IDs.
+ *
+ * @code{.c}
+   	int connect_endpoints(psm2_ep_t ep, int numep,
+   	                      const psm2_epid_t *array_of_epid,
+   	                      psm2_epaddr_t **array_of_epaddr_out)
+   	{
+   	    psm2_error_t *errors = (psm2_error_t *) calloc(numep, sizeof(psm2_error_t));
+   	    if (errors == NULL)
+   	        return -1;
+
+   	    psm2_epaddr_t *all_epaddrs =
+   	             (psm2_epaddr_t *) calloc(numep, sizeof(psm2_epaddr_t));
+
+   	    if (all_epaddrs == NULL) {
+   	        free(errors);
+   	        return -1;
+   	    }
+
+   	    psm2_ep_connect(ep, numep, array_of_epid,
+   	                   NULL, // We want to connect all epids, no mask needed
+   	                   errors,
+   	                   all_epaddrs,
+   	                   30e9); // 30 second timeout, 0 waits forever
+   	    *array_of_epaddr_out = all_epaddrs;
+   	    free(errors);
+   	    return 1;
+   	}
+   @endcode
+ */
+psm2_error_t
+psm2_ep_connect(psm2_ep_t ep, int num_of_epid, const psm2_epid_t *array_of_epid,
+		   const int *array_of_epid_mask, psm2_error_t *array_of_errors,
+		   psm2_epaddr_t *array_of_epaddr, int64_t timeout);
+
+/** @brief Disconnect one or more remote endpoints from a local endpoint.
+*
+* Function to non-collectively disconnect a connection to a set of endpoint
+* addresses and free the endpoint addresses. After disconnecting, the
+* application cannot send messages to the remote processes and PSM2 is
+* restored back to the state before calling psm2_ep_connect. The application
+* must call psm2_ep_connect to establish the connections again.
+*
+* This function is equivalent to calling psm2_ep_disconnect2() with mode ==
+* PSM2_EP_DISCONNECT_GRACEFUL.
+*
+* @param[in] ep PSM2 endpoint handle
+*
+* @param[in] num_of_epaddr The number of endpoint addresses to disconnect from,
+*                          which also indicates the number of elements contained
+*                          in all of the function's array-based parameters.
+*
+* @param[in] array_of_epaddr User-allocated array that contains num_of_epaddr
+*                            valid endpoint addresses. Each endpoint address (or
+*                            epaddr) has been obtained through a previous
+*                            psm2_ep_connect call.
+*
+* @param[in] array_of_epaddr_mask User-allocated array that contains
+*                                 num_of_epaddr integers. This array of masks
+*                                 allows users to select which of the
+*                                 epaddresses in array_of_epaddr should be
+*                                 disconnected. If the integer at index i is
+*                                 zero, PSM2 does not attempt to disconnect from
+*                                 the epaddr at index i in array_of_epaddr. If
+*                                 this parameter is NULL, PSM2 tries to
+*                                 disconnect all epaddr in array_of_epaddr.
+*
+* @param[out] array_of_errors User-allocated array of at least num_of_epaddr
+*                             elements. If the function does not return PSM2_OK,
+*                             this array can be consulted for each endpoint
+*                             address not masked off by array_of_epaddr_mask to
+*                             know why the endpoint could not be disconnected.
+*                             Any endpoint address that could not be
+*                             disconnected because of an unrelated failure is
+*                             marked as PSM2_EPID_UNKNOWN. If the function
+*                             returns PSM2_OK, the errors for all endpoint
+*                             addresses also contain PSM2_OK.
+*
+* @param[in] timeout Timeout in nanoseconds after which disconnection attempts
+*                    are abandoned. Setting this value to 0 disables timeout and
+*                    waits until all endpoints have been successfully
+*                    disconnected or until an error is detected.
+*
+* @pre You have established the connections with previous psm2_ep_connect calls.
+*
+* @post If the disconnect is successful, the corresponding epaddr in
+*       array_of_epaddr is reset to a NULL pointer.
+*
+* @post If unsuccessful, you can query the return status of each individual
+*       remote endpoint in array_of_errors.
+*
+* @post PSM2 does not keep any reference to the arrays passed into the function
+*       and the caller is free to deallocate them.
+*
+* @post The error value with the highest importance is returned by the function
+*       if some portion of the communication failed. Refer to individual errors
+*       in array_of_errors whenever the function cannot return PSM2_OK.
+*
+* @returns PSM2_OK The entire set of endpoint IDs were successfully disconnected
+*          and endpoint addresses are freed by PSM2.
+*
+* @code{.c}
+int disconnect_endpoints(psm2_ep_t ep, int num_epaddr,
+             psm2_epaddr_t *array_of_epaddr)
+{
+    psm2_error_t *errors =
+        (psm2_error_t *)calloc(num_epaddr, sizeof(psm2_error_t));
+    if (errors == NULL)
+        return -1;
+    psm2_ep_disconnect(
+        ep, num_epaddr, array_of_epaddr,
+        NULL, // We want to disconnect all epaddrs, no mask needed,
+        errors,
+        30e9); // 30 second timeout, 0 waits forever
+    free(errors);
+    return 1;
+}
+@endcode
+*/
+psm2_error_t psm2_ep_disconnect(psm2_ep_t ep, int num_of_epaddr,
+				psm2_epaddr_t *array_of_epaddr,
+				const int *array_of_epaddr_mask,
+				psm2_error_t *array_of_errors, int64_t timeout);
+
+/** @brief Disconnect one or more remote endpoints from a local endpoint.
+*
+* Function to non-collectively disconnect a connection to a set of endpoint
+* addresses and free the endpoint addresses. After disconnecting, the
+* application cannot send messages to the remote processes and PSM2 is
+* restored back to the state before calling psm2_ep_connect. The application
+* must call psm2_ep_connect to establish the connections again.
+*
+* @param[in] ep PSM2 endpoint handle
+*
+* @param[in] num_of_epaddr The number of endpoint addresses to disconnect from,
+*                          which also indicates the number of elements contained
+*                          in all of the function's array-based parameters.
+*
+* @param[in] array_of_epaddr User-allocated array that contains num_of_epaddr
+*                            valid endpoint addresses. Each endpoint address (or
+*                            epaddr) has been obtained through a previous
+*                            psm2_ep_connect call.
+*
+* @param[in] array_of_epaddr_mask User-allocated array that contains
+*                                 num_of_epaddr integers. This array of masks
+*                                 allows users to select which of the
+*                                 epaddresses in array_of_epaddr should be
+*                                 disconnected. If the integer at index i is
+*                                 zero, PSM2 does not attempt to disconnect from
+*                                 the epaddr at index i in array_of_epaddr. If
+*                                 this parameter is NULL, PSM2 tries to
+*                                 disconnect all epaddr in array_of_epaddr.
+*
+* @param[out] array_of_errors User-allocated array of at least num_of_epaddr
+*                             elements. If the function does not return PSM2_OK,
+*                             this array can be consulted for each endpoint
+*                             address not masked off by array_of_epaddr_mask to
+*                             know why the endpoint could not be disconnected.
+*                             Any endpoint address that could not be
+*                             disconnected because of an unrelated failure is
+*                             marked as PSM2_EPID_UNKNOWN. If the function
+*                             returns PSM2_OK, the errors for all endpoint
+*                             addresses also contain PSM2_OK.
+*
+* @param[in] mode One of @ref PSM2_EP_DISCONNECT_GRACEFUL or @ref PSM2_EP_DISCONNECT_FORCE
+*
+* @param[in] timeout Timeout in nanoseconds after which disconnection attempts
+*                    are abandoned. Setting this value to 0 disables timeout and
+*                    waits until all endpoints have been successfully
+*                    disconnected or until an error is detected. Supplying a
+*                    negative value here sets the disconnection mode to "force".
+*
+* @pre You have established the connections with previous psm2_ep_connect calls.
+*
+* @post If the disconnect is successful, the corresponding epaddr in
+*       array_of_epaddr is reset to a NULL pointer.
+*
+* @post If unsuccessful, you can query the return status of each individual
+*       remote endpoint in array_of_errors.
+*
+* @post PSM2 does not keep any reference to the arrays passed into the function
+*       and the caller is free to deallocate them.
+*
+* @post The error value with the highest importance is returned by the function
+*       if some portion of the communication failed. Refer to individual errors
+*       in array_of_errors whenever the function cannot return PSM2_OK.
+*
+* @returns PSM2_OK The entire set of endpoint IDs were successfully disconnected
+*          and endpoint addresses are freed by PSM2.
+*
+* @code{.c}
+int disconnect_endpoints(psm2_ep_t ep, int num_epaddr,
+             psm2_epaddr_t *array_of_epaddr)
+{
+    psm2_error_t *errors =
+        (psm2_error_t *)calloc(num_epaddr, sizeof(psm2_error_t));
+    if (errors == NULL)
+        return -1;
+    psm2_ep_disconnect2(
+        ep, num_epaddr, array_of_epaddr,
+        NULL, // We want to disconnect all epaddrs, no mask needed,
+        errors,
+        PSM2_EP_DISCONNECT_GRACEFUL,
+        30e9); // 30 second timeout, 0 waits forever
+    free(errors);
+    return 1;
+}
+@endcode
+*/
+psm2_error_t psm2_ep_disconnect2(psm2_ep_t ep, int num_of_epaddr,
+				psm2_epaddr_t *array_of_epaddr,
+				const int *array_of_epaddr_mask,
+				psm2_error_t *array_of_errors,
+				int mode, int64_t timeout);
+
+#define PSM2_EP_DISCONNECT_GRACEFUL	PSM2_EP_CLOSE_GRACEFUL   /**< Graceful mode in @ref psm2_ep_disconnect2 */
+#define PSM2_EP_DISCONNECT_FORCE	PSM2_EP_CLOSE_FORCE   /**< Forceful mode in @ref psm2_ep_disconnect2 */
+
+/** @brief Ensure endpoint communication progress
+ *
+ * Function to ensure progress for all PSM2 components instantiated on an
+ * endpoint (currently, this only includes the MQ component).  The function
+ * never blocks and is typically required in two cases:
+ *
+ * @li Allowing all PSM2 components instantiated over a given endpoint to make
+ *     communication progress. Refer to @ref mq_progress for a detailed
+ *     discussion on MQ-level progress issues.
+ *
+ * @li Cases where users write their own synchronization primitives that
+ *     depend on remote communication (such as spinning on a memory location
+ *     whose new value depends on ongoing communication).
+ *
+ * The poll function doesn't block, but the user can rely on the @ref
+ * PSM2_OK_NO_PROGRESS return value to control polling behaviour in terms of
+ * frequency (poll until an event happens) or execution environment (poll for a
+ * while but yield to other threads if CPUs are oversubscribed).
+ *
+ * @returns PSM2_OK             Some communication events were progressed
+ * @returns PSM2_OK_NO_PROGRESS Polling did not yield any communication progress
+ *
+ */
+psm2_error_t psm2_poll(psm2_ep_t ep);
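+
+/* A minimal polling-loop sketch built on the PSM2_OK_NO_PROGRESS return
+ * value; `done' stands for an application-side completion flag:
+ *
+ * @code{.c}
+   	while (!done) {
+   	    if (psm2_poll(ep) == PSM2_OK_NO_PROGRESS)
+   	        sched_yield(); // nothing progressed; let other threads run
+   	}
+   @endcode
+ */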
+
+/** @brief Set a user-determined ep address label.
+ *
+ * @param[in] epaddr Endpoint address, obtained from @ref psm2_ep_connect
+ * @param[in] epaddr_label_string User-allocated string to print when
+ *                   identifying endpoint in error handling or other verbose
+ *                   printing.  The NULL-terminated string must be allocated by
+ *                   the user since PSM2 only keeps a pointer to the label.  If
+ *                   users do not explicitly set a label for each endpoint,
+ *                   endpoints will identify themselves as hostname:port.
+ */
+void psm2_epaddr_setlabel(psm2_epaddr_t epaddr,
+			 const char *epaddr_label_string);
+
+/** @brief Set a user-determined ep address context.
+ *
+ * @param[in] epaddr Endpoint address, obtained from @ref psm2_ep_connect
+ * @param[in] ctxt   Opaque user defined state to associate with an endpoint
+ *                   address. This state can be retrieved via
+ *                   @ref psm2_epaddr_getctxt.
+ */
+void
+psm2_epaddr_setctxt(psm2_epaddr_t epaddr, void *ctxt);
+
+/** @brief Get the user-determined ep address context. Users can associate an
+ *  opaque context with each endpoint via @ref psm2_epaddr_setctxt.
+ *
+ * @param[in] epaddr Endpoint address, obtained from @ref psm2_ep_connect.
+ */
+void *psm2_epaddr_getctxt(psm2_epaddr_t epaddr);
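+
+/* A minimal sketch pairing a label with an opaque per-peer context;
+ * `struct peer_state' is an application-defined type:
+ *
+ * @code{.c}
+   	struct peer_state *ps = calloc(1, sizeof(*ps));
+   	psm2_epaddr_setlabel(epaddr, "rank3:node-a"); // PSM2 keeps only the pointer
+   	psm2_epaddr_setctxt(epaddr, ps);
+   	// ... later, e.g. when handling a completion for this peer:
+   	ps = (struct peer_state *)psm2_epaddr_getctxt(epaddr);
+   @endcode
+ */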
+
+/* Below are all component-specific options. The component object for each of
+ * the options is also specified.
+ */
+
+/* PSM2_COMPONENT_CORE options */
+/* PSM2 debug level */
+#define PSM2_CORE_OPT_DEBUG     0x101
+  /**< [@b uint32_t ] Set/Get the PSM2 debug level. This option can be set
+   * before initializing the PSM2 library.
+   *
+   * component object: (null)
+   * option value: PSM2 Debug mask to set or currently active debug level.
+   */
+
+/* PSM2 endpoint address context */
+#define PSM2_CORE_OPT_EP_CTXT   0x102
+  /**< [@b uint32_t ] Set/Get the context associated with a PSM2 endpoint
+   * address (psm2_epaddr_t).
+   *
+   * component object: PSM2 endpoint (@ref psm2_epaddr_t) address.
+   * option value: Context associated with PSM2 endpoint address.
+   */
+
+/* PSM2_COMPONENT_IB options */
+/* Default service level to use to communicate with remote endpoints */
+#define PSM2_IB_OPT_DF_SL 0x201
+  /**< [@b uint32_t ] Default OPA SL to use for all remote communication.
+   * If unset defaults to Service Level 0.
+   *
+   * component object: Opened PSM2 endpoint id (@ref psm2_ep_t).
+   * option value: Default IB SL to use for endpoint. (0 <= SL < 15)
+   */
+
+/* Set IB service level to use for communication to an endpoint */
+#define PSM2_IB_OPT_EP_SL 0x202
+  /**< [@b uint32_t ] OPA SL to use for communication to specified
+   * remote endpoint.
+   *
+   * component object: PSM2 endpoint (@ref psm2_epaddr_t) address.
+   * option value: SL used to communicate with remote endpoint. (0 <= SL < 15)
+   */
+
+/* PSM2_COMPONENT_MQ options (deprecates psm2_mq_set|getopt) */
+/* MQ options that can be set in psm2_mq_init and psm2_{set,get}_opt */
+#define PSM2_MQ_OPT_RNDV_IB_SZ       0x301
+  /**< [@b uint32_t ] Size at which to start enabling rendezvous
+   * messaging for OPA messages (if unset, defaults to values
+   * between 56000 and 72000 depending on the system configuration)
+   *
+   * component object: PSM2 Matched Queue (@ref psm2_mq_t).
+   * option value: Size at which to switch to rendezvous protocol.
+   */
+#define PSM2_MQ_RNDV_HFI_SZ          PSM2_MQ_OPT_RNDV_IB_SZ
+#define PSM2_MQ_RNDV_IPATH_SZ        PSM2_MQ_OPT_RNDV_IB_SZ
+
+#define PSM2_MQ_OPT_RNDV_SHM_SZ      0x302
+#define PSM2_MQ_RNDV_SHM_SZ          PSM2_MQ_OPT_RNDV_SHM_SZ
+  /**< [@b uint32_t ] Size at which to start enabling
+   * rendezvous messaging for shared memory (intra-node) messages (If
+   * unset, defaults to 64000 bytes).
+   *
+   * component object: PSM2 Matched Queue (@ref psm2_mq_t).
+   * option value: Size at which to switch to rendezvous protocol.
+   */
+
+#define PSM2_MQ_OPT_SYSBUF_MYBYTES   0x303
+#define PSM2_MQ_MAX_SYSBUF_MBYTES    PSM2_MQ_OPT_SYSBUF_MYBYTES
+  /**< [@b uint32_t ] Maximum number of bytes to allocate for unexpected
+   * messages.
+   *
+   * component object: PSM2 Matched Queue (@ref psm2_mq_t).
+   * option value: Deprecated; this option has no effect.
+   */
+
+/* PSM2_COMPONENT_AM options */
+#define PSM2_AM_OPT_FRAG_SZ          0x401
+#define PSM2_AM_MAX_FRAG_SZ          PSM2_AM_OPT_FRAG_SZ
+/*!< [@b uint32_t ] Maximum active message fragment size that can be sent
+ * for a given endpoint or across all endpoints. This value can only be
+ * queried.
+ *
+ * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. If NULL then
+ *                   option value is the smallest fragment size across all
+ *                   active endpoints.
+ * option value: Maximum active message fragment size in bytes.
+ */
+
+#define PSM2_AM_OPT_NARGS 0x402
+#define PSM2_AM_MAX_NARGS PSM2_AM_OPT_NARGS
+
+/*!< [@b uint32_t ] Maximum number of message arguments that can be sent
+ * for a given endpoint or across all endpoints. This value can only be
+ * queried.
+ *
+ * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. If NULL then
+ *                   option value is the smallest number of arguments across all
+ *                   active endpoints.
+ * option value: Maximum number of active message arguments.
+ */
+
+#define PSM2_AM_OPT_HANDLERS 0x403
+#define PSM2_AM_MAX_HANDLERS PSM2_AM_OPT_HANDLERS
+/*!< [@b uint32_t ] Maximum number of message handlers that can be registered
+ * for a given endpoint or across all endpoints. This value can only be
+ * queried.
+ *
+ * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. If NULL then
+ *                   option value is the smallest number of handlers across all
+ *                   active endpoints.
+ * option value: Maximum number of active message handlers.
+ */
+
+/** @brief Set an option for a PSM2 component
+ *
+ * Function to set the value of a PSM2 component option
+ *
+ * @param[in] component Type of PSM2 component for which to set the option
+ * @param[in] component_obj Opaque component-specific object to apply the set
+ *                          operation on. These are passed uninterpreted to the
+ *                          appropriate component for interpretation.
+ * @param[in] optname Name of component option to set. These are component
+ *                    specific and passed uninterpreted to the appropriate
+ *                    component for interpretation.
+ * @param[in] optval Pointer to storage that contains the value to be updated
+ *                   for the supplied option.  It is up to the user to
+ *                   ensure that the pointer points to a memory location with a
+ *                   correct size and format.
+ * @param[in] optlen Size of the memory region pointed to by optval.
+ *
+ * @returns PSM2_OK if option could be set.
+ * @returns PSM2_PARAM_ERR if the component or optname are not valid.
+ * @returns PSM2_OPT_READONLY if the option to be set is a read-only option.
+ *
+ */
+psm2_error_t
+psm2_setopt(psm2_component_t component, const void *component_obj,
+	   int optname, const void *optval, uint64_t optlen);
+
+/** @brief Get an option for a PSM2 component
+ *
+ * Function to get the value of a PSM2 component option
+ *
+ * @param[in] component Type of PSM2 component for which to get the option
+ * @param[in] component_obj Opaque component-specific object to apply the get
+ *                          operation on. These are passed uninterpreted to the
+ *                          appropriate component for interpretation.
+ * @param[in] optname Name of component option to get. These are component
+ *                    specific and passed uninterpreted to the appropriate
+ *                    component for interpretation.
+ * @param[out] optval Pointer to storage that contains the value to be updated
+ *                    for the supplied option.  It is up to the user to
+ *                    ensure that the pointer points to a valid memory region.
+ * @param[in,out] optlen This is a value result parameter initially containing
+ *                      the size of the memory region pointed to by optval and
+ *                      modified to return the actual size of optval.
+ *
+ * @returns PSM2_OK if option value could be retrieved successfully.
+ * @returns PSM2_PARAM_ERR if the component or optname are not valid.
+ * @returns PSM2_NO_MEMORY if the memory region optval is of insufficient size.
+ *                         optlen contains the required memory region size for
+ *                         optname value.
+ *
+ */
+psm2_error_t
+psm2_getopt(psm2_component_t component, const void *component_obj,
+	   int optname, void *optval, uint64_t *optlen);
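+
+/* A minimal sketch using two of the options above, assuming an opened
+ * endpoint `ep':
+ *
+ * @code{.c}
+   	// Set the default service level for all remote communication on `ep'.
+   	uint32_t sl = 2;
+   	psm2_setopt(PSM2_COMPONENT_IB, ep, PSM2_IB_OPT_DF_SL, &sl, sizeof(sl));
+
+   	// Query the read-only maximum AM fragment size across all endpoints.
+   	uint32_t frag = 0;
+   	uint64_t len = sizeof(frag);
+   	psm2_getopt(PSM2_COMPONENT_AM, NULL, PSM2_AM_OPT_FRAG_SZ, &frag, &len);
+   @endcode
+ */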
+
+/** @brief Datatype for end-point information */
+typedef struct psm2_epinfo {
+	psm2_ep_t ep;		/**< The ep for this end-point*/
+	psm2_epid_t epid;	/**< The epid for this end-point */
+	psm2_uuid_t uuid;	/**< The UUID for this end-point */
+	uint16_t jkey;		/**< The job key for this end-point */
+	char uuid_str[64];	/**< String representation of the UUID for this end-point */
+} psm2_epinfo_t;
+
+/** @brief Datatype for end-point connection */
+typedef struct psm2_epconn {
+	psm2_epaddr_t addr;	/**< The epaddr for this connection */
+	psm2_ep_t ep;		/**< The ep for this connection */
+	psm2_mq_t mq;		/**< The mq for this connection */
+} psm2_epconn_t;
+
+/** @brief Query PSM2 for end-point information.
+ *
+ * Function to query PSM2 for end-point information. This allows retrieval of
+ * end-point information in cases where the caller does not have access to the
+ * results of psm2_ep_open().  In the default single-rail mode PSM2 will use
+ * a single endpoint. If either multi-rail mode or multi-endpoint mode is
+ * enabled, PSM2 will use multiple endpoints.
+ *
+ * @param[in,out] num_of_epinfo On input, sizes the available number of entries
+ *                              in array_of_epinfo.  On output, specifies the
+ *                              returned number of entries in array_of_epinfo.
+ * @param[out] array_of_epinfo Returns end-point information structures.
+ *
+ * @pre PSM2 is initialized and the end-point has been opened.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_PARAM_ERR if input num_of_epinfo is less than or equal to zero.
+ * @returns PSM2_EP_WAS_CLOSED if PSM2 end-point is closed or does not exist.
+ */
+psm2_error_t psm2_ep_query(int *num_of_epinfo, psm2_epinfo_t *array_of_epinfo);
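+
+/* A minimal sketch; in the default single-rail mode a single entry is
+ * expected:
+ *
+ * @code{.c}
+   	psm2_epinfo_t info[4];
+   	int n = 4; // in: capacity of info[]; out: number of entries returned
+   	if (psm2_ep_query(&n, info) == PSM2_OK && n > 0)
+   	    printf("first endpoint uuid: %s\n", info[0].uuid_str);
+   @endcode
+ */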
+
+/** @brief Query PSM2 for end-point connections.
+ *
+ * Function to query PSM2 for end-point connections. This allows retrieval of
+ * end-point connections in cases where the caller does not have access to the
+ * results of psm2_ep_connect().  The epid values can be found using
+ * psm2_ep_query() so that each PSM2 process can determine its own epid. These
+ * values can then be distributed across the PSM2 processes so that each PSM2
+ * process knows the epid for all other PSM2 processes.
+ *
+ * @param[in] epid The epid of a PSM2 process.
+ * @param[out] epconn The connection information for that PSM2 process.
+ *
+ * @pre PSM2 is initialized and the end-point has been connected to this epid.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_EP_WAS_CLOSED if PSM2 end-point is closed or does not exist.
+ * @returns PSM2_EPID_UNKNOWN if the epid value is not known to PSM.
+ */
+psm2_error_t psm2_ep_epid_lookup(psm2_epid_t epid, psm2_epconn_t *epconn);
+
+/** @brief Query given PSM2 end-point for its connections.
+ *
+ * The need for this function comes with the 'multi-ep' feature.
+ * The function is similar to @ref psm2_ep_epid_lookup.
+ * It differs in that an extra parameter identifying the end-point [ep]
+ * must be provided, which limits the lookup to that single ep.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_EP_WAS_CLOSED if PSM2 end-point [ep] is closed or does not exist.
+ * @returns PSM2_EPID_UNKNOWN if the [epid] value is not known to PSM.
+ * @returns PSM2_PARAM_ERR if output [epconn] is NULL.
+ */
+psm2_error_t psm2_ep_epid_lookup2(psm2_ep_t ep, psm2_epid_t epid, psm2_epconn_t *epconn);
+
+/** @brief Get PSM2 epid for given epaddr.
+ *
+ * @param[in] epaddr The endpoint address.
+ * @param[out] epid The epid of a PSM2 process.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_PARAM_ERR if input [epaddr] or output [epid] is NULL.
+ */
+psm2_error_t psm2_epaddr_to_epid(psm2_epaddr_t epaddr, psm2_epid_t *epid);
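+
+/* A minimal sketch combining the two lookups above, assuming a connected
+ * `epaddr':
+ *
+ * @code{.c}
+   	psm2_epid_t epid;
+   	psm2_epconn_t conn;
+   	if (psm2_epaddr_to_epid(epaddr, &epid) == PSM2_OK &&
+   	    psm2_ep_epid_lookup(epid, &conn) == PSM2_OK) {
+   	    // conn.mq is the matched queue serving this connection.
+   	}
+   @endcode
+ */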
+
+/*! @} */
+
+/*! @addtogroup init PSM2 Information Query
+ * @{
+ */
+
+/** @brief Enumeration for info query APIs
+ *
+ * Note that calling the function:
+ *
+ @code{.c}
+ psm2_error_t psm2_info_query(psm2_info_query_t, void *out,
+                              size_t nargs, psm2_info_query_arg_t []);
+ @endcode
+ *
+ * Takes a variable number of input arguments, per the initial psm2_info_query_t
+ *
+ * Below, there is an explanation of the number, type and order of the
+ * required input arguments, as well as a definition of the type of the output.
+ */
+typedef enum psm2_info_query_et
+{
+/*! Required input arguments: 0
+   Output parameter: uint32_t*, description: the number of units */
+	PSM2_INFO_QUERY_NUM_UNITS,
+
+/*! Required input arguments: 0
+   Output parameter: uint32_t*, description: the number of ports */
+	PSM2_INFO_QUERY_NUM_PORTS,
+
+/*! Required input arguments: 1
+   1.  type: uint32_t, description: the unit for which status is
+       desired (use: psm2_info_query_arg_t.unit).
+   Output parameter: uint32_t, description: zero, when the unit
+                     is not active, non-zero when the unit is
+                     active.  */
+	PSM2_INFO_QUERY_UNIT_STATUS,
+
+/*! Required input arguments: 2
+   1.  type: uint32_t, description: the unit for which status is
+       desired (use: psm2_info_query_arg_t.unit).
+   2.  type: uint32_t, description: the port for which status is
+       desired (use: psm2_info_query_arg_t.port).
+   Output parameter: uint32_t, description: zero, when the unit
+                     is not active, non-zero when the unit is
+                     active.  */
+	PSM2_INFO_QUERY_UNIT_PORT_STATUS,
+
+/*! Required input arguments: 1
+   1.  type: uint32_t, description: the unit for which the number of
+       free contexts is desired (use: psm2_info_query_arg_t.unit).
+   Output parameter: uint32_t, description: the number of free
+                     contexts.  */
+	PSM2_INFO_QUERY_NUM_FREE_CONTEXTS,
+
+/*! Required input arguments: 1
+   1.  type: uint32_t, description: the unit for which the number of
+       contexts is desired (use: psm2_info_query_arg_t.unit).
+   Output parameter: uint32_t, description: the number of
+                     contexts.  */
+	PSM2_INFO_QUERY_NUM_CONTEXTS,
+
+/*! Required input arguments: 2
+   1.  type: psm2_mq_t, description: the mq that is associated with the
+       connection for which configuration information is wanted.
+       (use: psm2_info_query_arg_t.mq).
+   2.  type: psm2_epaddr_t, description: the ep address that is
+       associated with the connection for which configuration
+       information is wanted (use: psm2_info_query_arg_t.epaddr).
+   Output parameter: uint32_t, description: a bit mask containing bits defining the configuration.
+   see psm2_info_query_config for a description of the bits. */
+	PSM2_INFO_QUERY_CONFIG,
+
+/*! Required input arguments: 3
+   1.  type: psm2_mq_t, description: the mq that is associated with the
+       connection for which the msg size query information is wanted.
+       (use: psm2_info_query_arg_t.mq).
+   2.  type: psm2_epaddr_t, description: the ep address that is
+       associated with the connection for which the msg size query
+       information is wanted (use: psm2_info_query_arg_t.epaddr).
+   3.  type: enum psm2_info_query_thresh_et, the specific msg size query.
+       (use: psm2_info_query_arg_t.mstq).
+
+       Output parameter: uint32_t, description: the message size threshold. */
+	PSM2_INFO_QUERY_THRESH,
+
+/*! Required input arguments: 3
+   1.  type: psm2_mq_t, description: the mq that is associated with the
+       connection for which the device name is wanted.
+       (use: psm2_info_query_arg_t.mq).
+   2.  type: psm2_epaddr_t, description: the ep address that is
+       associated with the connection for which device name is wanted.
+       (use: psm2_info_query_arg_t.epaddr).
+   3.  type: size_t, the length of the output buffer that will receive
+       the device name (use: psm2_info_query_arg_t.length).
+       Output parameter: char *, description: the device name. */
+	PSM2_INFO_QUERY_DEVICE_NAME,
+
+/*! Required input arguments: 2
+   1.  type: psm2_mq_t, description: the mq that is associated with the
+       connection for which the mtu is wanted (use: psm2_info_query_arg_t.mq).
+   2.  type: psm2_epaddr_t, description: the ep address that is
+       associated with the connection for which mtu is wanted.
+       (use: psm2_info_query_arg_t.epaddr).
+       Output parameter: uint32_t, description: the mtu. */
+
+	PSM2_INFO_QUERY_MTU,
+
+/*! Required input arguments: 2
+   1.  type: psm2_mq_t, description: the mq that is associated with the
+       connection for which the link speed is wanted (use:
+       psm2_info_query_arg_t.mq).
+   2.  type: psm2_epaddr_t, description: the ep address that is
+       associated with the connection for which link speed is wanted.
+       (use: psm2_info_query_arg_t.epaddr).
+       Output parameter: uint32_t, description: the link speed. */
+	PSM2_INFO_QUERY_LINK_SPEED,
+
+/*! Required input arguments: 1
+   1.  type: size_t, description: the length of the output buffer to receive
+       the network type (use: psm2_info_query_arg_t.length).
+       Output parameter: char*, description: the network type. */
+	PSM2_INFO_QUERY_NETWORK_TYPE,
+
+/*! Required input arguments: 0
+    Output parameter: uint32_t*, description: a bit mask of the features in libpsm2.
+    See psm2_info_query_feature_mask below for bit mask definition. */
+	PSM2_INFO_QUERY_FEATURE_MASK,
+
+/*! Required input arguments: 2
+   1.  type: uint32_t, description: the unit # of the device you want to
+       identify.
+   2.  type: size_t, description: the length of the output buffer that will
+       receive the device name.
+       Output parameter: char*, description: name of the device. */
+	PSM2_INFO_QUERY_UNIT_NAME,
+
+/*! Required input arguments: 2
+   1.  type: uint32_t, description: unit number for which the sysfs
+       path is wanted.
+	   (use: psm2_info_query_arg_t.unit).
+   2.  type: size_t, description: the length of the output buffer
+       that will receive the sysfs path.
+	   (use: psm2_info_query_arg_t.length).
+       Output parameter: char *, description: the sysfs path. */
+	PSM2_INFO_QUERY_UNIT_SYS_PATH,
+
+	PSM2_INFO_QUERY_LAST, /* must appear last, and the info query
+				 constants are used as an index. */
+} psm2_info_query_t;
+
+/** @brief Enumeration for info query config
+ */
+enum psm2_info_query_config
+{
+	/*! The following three are 'main configs': */
+	PSM2_INFO_QUERY_CONFIG_IPS      = (1 << 0),
+	PSM2_INFO_QUERY_CONFIG_AMSH     = (1 << 1),
+	PSM2_INFO_QUERY_CONFIG_SELF     = (1 << 2),
+
+	/*! The following three are sub-configs of
+           the IPS main config: */
+
+	PSM2_INFO_QUERY_CONFIG_CUDA     = (1 << 3),
+	PSM2_INFO_QUERY_CONFIG_PIO      = (1 << 4),
+	PSM2_INFO_QUERY_CONFIG_DMA      = (1 << 5),
+
+	/*! The following is a sub-config of IPS & CUDA
+           main config: */
+
+	PSM2_INFO_QUERY_CONFIG_GDR_COPY = (1 << 6),
+};
+
+/** @brief Enumeration info query thresholds
+ */
+enum psm2_info_query_thresh_et
+{
+/*! This is the start of the thresh queries for IPS config: */
+	PSM2_INFO_QUERY_THRESH_IPS_START,
+
+/*! Not shown here are the specific queries supported by the CUDA
+   and GDR_COPY sub-configs.
+
+   Those configs will need to include threshold queries if the
+   configuration includes them.
+
+   Note that for gdr_copy the thresholds vary depending on whether the
+   memory is GPU memory or not. */
+
+/*! The following threshold queries are supported for the IPS config
+   only. */
+
+/*! The PSM2_INFO_QUERY_THRESH_IPS_PIO_DMA threshold query indicates at
+   what message size the send transport transitions from PIO to DMA.
+
+   Note that this threshold query may be meaningless if PIO or DMA is
+   disabled. */
+	PSM2_INFO_QUERY_THRESH_IPS_PIO_DMA = PSM2_INFO_QUERY_THRESH_IPS_START,
+/*! Messages with message sizes less than or equal to the tiny threshold
+   will be sent by tiny message. */
+	PSM2_INFO_QUERY_THRESH_IPS_TINY,
+/*! Messages with message sizes greater than tiny, but less than or equal
+   to frag size will be sent by short message. */
+	PSM2_INFO_QUERY_THRESH_IPS_PIO_FRAG_SIZE,
+	PSM2_INFO_QUERY_THRESH_IPS_DMA_FRAG_SIZE,
+/*! Messages that are greater than the frag_size, but less than RNDV will
+   be sent by eager message.
+   Messages with message sizes greater than or equal to RNDV will be
+   sent by the rendezvous protocol message. */
+	PSM2_INFO_QUERY_THRESH_IPS_RNDV,
+	PSM2_INFO_QUERY_THRESH_IPS_END = PSM2_INFO_QUERY_THRESH_IPS_RNDV,
+
+/*! Not shown here are the specific thresh queries supported by AMSH and
+   SELF configs: */
+	PSM2_INFO_QUERY_THRESH_AMSH_START,
+	PSM2_INFO_QUERY_THRESH_AMSH_END = PSM2_INFO_QUERY_THRESH_AMSH_START,
+
+	PSM2_INFO_QUERY_THRESH_SELF_START,
+	PSM2_INFO_QUERY_THRESH_SELF_END = PSM2_INFO_QUERY_THRESH_SELF_START,
+};
+
+enum psm2_info_query_feature_mask
+{
+	/*! The following bit means that the libpsm2 _can_ support cuda.
+	    If the PSM2_INFO_QUERY_FEATURE_MASK request is made and
+	    the PSM2_INFO_QUERY_FEATURE_CUDA bit is not present, then cuda
+            is not supported. */
+	PSM2_INFO_QUERY_FEATURE_CUDA      = (1 << 0),
+};
+
+/** @brief Union for info query arg type
+ */
+typedef union psm2_info_query_arg
+{
+	uint32_t                       unit;
+	uint32_t                       port;
+	size_t                         length;
+	psm2_mq_t                      mq;
+	psm2_epaddr_t                  epaddr;
+	enum psm2_info_query_thresh_et mstq;
+} psm2_info_query_arg_t;
+
+/** @brief PSM2 info query
+ *
+ * Function that allows a client to interrogate PSM2 for various information.
+ *
+ * @param[in] psm2_info_query_t  What information is requested.
+ * @param[out] void * out, where the information will be delivered on a
+ *                         PSM2_OK return.
+ * @param[in] size_t nargs, the number of following arguments.
+ * @param[in] psm2_info_query_arg_t [], The arguments that are required for
+ *                                      certain queries.  See documentation
+ *                                      at @ref psm2_info_query_t for what
+ *                                      arguments are required for what
+ *                                      queries as well as what the type
+ *                                      the output is expected to be.
+ *
+ * @retval PSM2_OK The out buffer has successfully been written with the
+ * result of the query.
+ */
+psm2_error_t psm2_info_query(psm2_info_query_t, void *out,
+			     size_t nargs, psm2_info_query_arg_t []);
+
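A hedged sketch of how the declaration above might be exercised for two of the queries documented in this header: the 0-argument `PSM2_INFO_QUERY_FEATURE_MASK` and the 1-argument `PSM2_INFO_QUERY_NETWORK_TYPE`. The buffer size, `query_examples` name, and error handling are illustrative, not mandated by the API:

```c
#include <stdio.h>
#include <psm2.h>

static void query_examples(void)
{
	psm2_info_query_arg_t args[1];

	/* 0-argument query: feature bit mask (see psm2_info_query_feature_mask). */
	uint32_t features = 0;
	if (psm2_info_query(PSM2_INFO_QUERY_FEATURE_MASK, &features, 0, args)
	    == PSM2_OK && (features & PSM2_INFO_QUERY_FEATURE_CUDA))
		printf("libpsm2 can support cuda\n");

	/* 1-argument query: network type string; the one argument is the
	   length of the output buffer (psm2_info_query_arg_t.length). */
	char net_type[64];
	args[0].length = sizeof(net_type);
	if (psm2_info_query(PSM2_INFO_QUERY_NETWORK_TYPE, net_type, 1, args)
	    == PSM2_OK)
		printf("network type: %s\n", net_type);
}
```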
+/*! @} */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/psm2_am.h b/deps/libfabric/prov/psm3/psm3/psm2_am.h
new file mode 100644
index 0000000000000000000000000000000000000000..a53777bdba1bda3cc16effa963d17fb3534c764f
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm2_am.h
@@ -0,0 +1,481 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef PSM2_AM_H
+#define PSM2_AM_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <psm2.h>
+
+#ifndef PACK_SUFFIX
+/* XXX gcc only */
+#define PACK_SUFFIX __attribute__((packed))
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * @file psm2_am.h
+ * @brief PSM2 Active Message.
+ *
+ * @page psm2_am Active Message Interface
+ *
+ * PSM2 implements an Active Message (AM) component that lives alongside the
+ * Matched Queues (MQ) component. The active message interface essentially
+ * provides a remote procedure call mechanism. A PSM2 process can generate a
+ * request to run an active message handler on a remote PSM2 process
+ * identified by its end-point address (epaddr). End-point address values
+ * are returned by PSM2 when connecting end-points using the psm2_ep_connect()
+ * function.
+ *
+ * An AM handler may make local state updates, and may generate at most
+ * one reply to be returned to the original requestor. This reply will cause
+ * a handler to be run on that requestor. The requestor handler may make
+ * local state updates but is not allowed to reply nor request in that handler
+ * context. A request or reply can convey a small number of in-line arguments
+ * plus a short amount of data. A tight bound is placed on the number of
+ * in-line arguments to allow them to be packed into a header. A bound is
+ * placed on the size of the data payload so that the request or reply can
+ * be sent as a single packet within the MTU of the underlying communication
+ * transport. Longer payloads must be synthesized on top of the provided
+ * short request/reply mechanism by fragmentation and reassembly, or
+ * transported by some other means.
+ *
+ * Handlers are run in the process context of the targeted PSM2 process,
+ * either in its main thread of execution or in a progress thread. A handler
+ * may therefore be executed concurrently with the main thread of execution
+ * of the PSM2 process. PSM2 ensures that its own state is protected against this
+ * concurrent execution. However, a handler must make its own arrangements to
+ * protect its own state. Alternatively, the PSM2 progress thread can be
+ * disabled using the PSM3_RCVTHREAD environment variable if this is too
+ * onerous for the handler.
+ *
+ * PSM2 has an active progress model and requires that the PSM2 library is
+ * called in order to make progress. This can be achieved using the psm2_poll()
+ * function. A PSM2 implementation may provide passive progress through some
+ * other mechanism (e.g. a receive thread), but a PSM2 consumer must not assume
+ * this and must arrange to make active progress through calls into the PSM
+ * library. Note that the PSM2 AM interface is not MT-safe, the same as the
+ * other PSM interfaces, and that MT-safety must be provided by the consumer
+ * if required.
+ *
+ * The order in which AM requests are issued by an initiator to a particular
+ * target defines the order in which those AM requests will be executed on
+ * that target. Therefore the AM implementation will maintain the order
+ * of handler executions on a flow, and this also applies when progress
+ * threads are used. For multiple initiators issuing requests to a particular
+ * target, the handler executions will be interleaved in some sequentially
+ * consistent ordering.
+ */
+
+/*! @defgroup am PSM2 Active Message
+ *
+ * @{
+ */
+
+/** @brief Datatype for an index representing an active message handler */
+typedef uint32_t psm2_handler_t;
+
+/** @brief Datatype for a token for an active message handler.*/
+typedef void *psm2_am_token_t;
+
+/* PSM2 AM flags
+ * These flags may be combined using bitwise-or.
+ */
+#define PSM2_AM_FLAG_NONE    0 /**< No other PSM2 AM flags are needed. */
+#define PSM2_AM_FLAG_ASYNC   1 /**< No need to copy source data. */
+#define PSM2_AM_FLAG_NOREPLY 2 /**< The handler for this AM request is
+				   guaranteed not to generate a reply. */
+
+/** @brief The psm2_amarg type represents the type of an AM argument. This is
+ *  a 64-bit type and is broken down into four 16-bit fields, two 32-bit
+ *  fields or one 64-bit field for the convenience of code using the PSM2 AM
+ *  interface.
+ */
+typedef
+struct psm2_amarg {
+	union {
+		struct {
+			uint16_t u16w3;
+			uint16_t u16w2;
+			uint16_t u16w1;
+			uint16_t u16w0;
+		} PACK_SUFFIX;
+		struct {
+			uint32_t u32w1;
+			uint32_t u32w0;
+		} PACK_SUFFIX;
+		uint64_t u64w0;
+		uint64_t u64;
+	};
+} PACK_SUFFIX psm2_amarg_t;
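The overlapping views described above can be illustrated with a short sketch; the values written are arbitrary and purely illustrative:

```c
psm2_amarg_t arg;
arg.u64   = 0;          /* clear all 64 bits                       */
arg.u32w0 = 0xdeadbeef; /* address the same storage as 32-bit words */
arg.u16w3 = 7;          /* or as one of the four 16-bit words       */
```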
+
+/** @brief The AM handler function type
+ *
+ * psm2_am_handler_fn_t is the datatype for an AM handler. PSM2 AM will call-back
+ * into an AM handler using this function prototype. The parameters and result
+ * of these handler functions are described here.
+ *
+ * @param[in] token This is an opaque token value passed into a handler.
+ *                  A request handler may send at most one reply back to the
+ *                  original requestor, and must pass this value as the token
+ *                  parameter to the psm2_am_reply_short() function. A reply
+ *                  handler is also passed a token value, but must not attempt
+ *                  to reply.
+ * @param[in] args A pointer to the arguments provided to this handler.
+ * @param[in] nargs The number of arguments.
+ * @param[in] src A pointer to the data payload provided to this handler.
+ * @param[in] len The length of the data payload in bytes.
+ *
+ * @returns 0 The handler should always return a result of 0.
+ */
+typedef
+int (*psm2_am_handler_fn_t) (psm2_am_token_t token,
+			    psm2_amarg_t *args, int nargs,
+			    void *src, uint32_t len);
+
+/** @brief The AM handler function type with caller context
+ *
+ * psm2_am_handler_2_fn_t is the datatype for an AM handler that
+ * includes a user context. PSM2 AM will call-back into an AM handler using
+ * this function prototype. The parameters and result
+ * of these handler functions are described here.
+ *
+ * @param[in] token This is an opaque token value passed into a handler.
+ *                  A request handler may send at most one reply back to the
+ *                  original requestor, and must pass this value as the token
+ *                  parameter to the psm2_am_reply_short() function. A reply
+ *                  handler is also passed a token value, but must not attempt
+ *                  to reply.
+ * @param[in] args A pointer to the arguments provided to this handler.
+ * @param[in] nargs The number of arguments.
+ * @param[in] src A pointer to the data payload provided to this handler.
+ * @param[in] len The length of the data payload in bytes.
+ * @param[in] hctx The user context pointer provided at handler registration.
+ *
+ * @returns 0 The handler should always return a result of 0.
+ */
+typedef
+int (*psm2_am_handler_2_fn_t) (psm2_am_token_t token,
+			    psm2_amarg_t *args, int nargs,
+			    void *src, uint32_t len, void *hctx);
+
+/** @brief Type for a completion call-back handler.
+ *
+ * A completion handler can be specified to give a call-back on the initiation
+ * side that an AM request or reply has completed on the target side. The
+ * call-back has a context pointer which is provided along with the call-back
+ * function pointer when the initiator generates the request or reply. This
+ * approach will typically give higher performance than using an AM request or
+ * reply to achieve the same effect, though note that no additional information
+ * can be passed from the target side back to the initiator side with the
+ * completion handler approach.
+ *
+ * @param[in] context A context pointer.
+ * @returns void This handler has no return result.
+ */
+typedef
+void (*psm2_am_completion_fn_t) (void *context);
+
+/** @brief Register AM call-back handlers at the specified end-point.
+ *
+ * This function is used to register an array of handlers, and may be called
+ * multiple times to register additional handlers. The maximum number of
+ * handlers that can be registered is limited to the max_handlers value
+ * returned by psm2_am_get_parameters(). Handlers are associated with a PSM
+ * end-point. The handlers are allocated index numbers in the handler table
+ * for that end-point.  The allocated index for the handler function in
+ * handlers[i] is returned in handlers_idx[i] for i in [0, num_handlers). These
+ * handler index values are used in the psm2_am_request_short() and
+ * psm2_am_reply_short() functions.
+ *
+ * @param[in] ep End-point value
+ * @param[in] handlers Array of handler functions
+ * @param[in] num_handlers Number of handlers (sizes the handlers and
+ *                         handlers_idx arrays)
+ * @param[out] handlers_idx Used to return handler index mapping table
+ *
+ * @returns PSM2_OK Indicates success
+ * @returns PSM2_EP_NO_RESOURCES Insufficient slots in the AM handler table
+ */
+psm2_error_t psm2_am_register_handlers(psm2_ep_t ep,
+				     const psm2_am_handler_fn_t *
+				     handlers, int num_handlers,
+				     int *handlers_idx);
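A minimal sketch of registering a single AM handler on an opened end-point `ep`, using the function type and registration call declared above. The names `my_am_handler` and `register_my_handler`, and the handler body, are illustrative:

```c
#include <psm2.h>
#include <psm2_am.h>

static int my_am_handler(psm2_am_token_t token, psm2_amarg_t *args,
			 int nargs, void *src, uint32_t len)
{
	/* Local state updates only; a request handler may reply at most
	   once via psm2_am_reply_short() using `token`. */
	return 0;
}

static int register_my_handler(psm2_ep_t ep, int *idx_out)
{
	const psm2_am_handler_fn_t handlers[] = { my_am_handler };

	/* idx_out[0] receives the allocated handler-table index. */
	return (psm2_am_register_handlers(ep, handlers, 1, idx_out)
		== PSM2_OK) ? 0 : -1;
}
```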
+
+/** @brief Register AM call-back handlers at the specified end-point.
+ *
+ * This function is used to register an array of handlers, and may be called
+ * multiple times to register additional handlers. The maximum number of
+ * handlers that can be registered is limited to the max_handlers value
+ * returned by psm2_am_get_parameters(). Handlers are associated with a PSM
+ * end-point. The handlers are allocated index numbers in the handler table
+ * for that end-point.  The allocated index for the handler function in
+ * handlers[i] is returned in handlers_idx[i] for i in [0, num_handlers). These
+ * handler index values are used in the psm2_am_request_short() and
+ * psm2_am_reply_short() functions.
+ *
+ * @param[in] ep End-point value
+ * @param[in] handlers Array of handler functions
+ * @param[in] num_handlers Number of handlers (sizes the handlers and
+ *                         handlers_idx arrays)
+ * @param[in] hctx Array of void* pointers to user contexts for identifying the
+ *                         target ep that registered these handlers.
+ * @param[out] handlers_idx Used to return handler index mapping table
+ *
+ * @returns PSM2_OK Indicates success
+ * @returns PSM2_EP_NO_RESOURCES Insufficient slots in the AM handler table
+ */
+psm2_error_t psm2_am_register_handlers_2(psm2_ep_t ep,
+				     const psm2_am_handler_2_fn_t *
+				     handlers, int num_handlers,
+				     void **hctx,
+				     int *handlers_idx);
+
+/** @brief Unregister all AM call-back handlers for the specific end-point.
+ *
+ * This function is used to unregister all AM handlers registered to the
+ * specified end-point.
+ *
+ * @param[in] ep End-point value
+ *
+ */
+void psm2_am_unregister_handlers(psm2_ep_t ep);
+
+/** @brief Generate an AM request.
+ *
+ * This function generates an AM request causing an AM handler function to be
+ * called in the PSM2 process associated with the specified end-point address.
+ * The number of arguments is limited to max_nargs and the payload length in
+ * bytes to max_request_short returned by the psm2_am_get_parameters() function.
+ * If arguments are not required, set the number of arguments to 0 and the
+ * argument pointer will not be dereferenced. If payload is not required, set
+ * the payload size to 0 and the payload pointer will not be dereferenced.
+ *
+ * Optionally a completion function and completion context pointer can be
+ * provided, and a local call-back will be made to that function passing in
+ * that context pointer once remote execution of the handler has completed. If
+ * the completion call-back is not required, the handler should be specified as
+ * NULL and the pointer value will not be used.
+ *
+ * The allowed flags are any combination of the following combined with
+ * bitwise-or:
+ *   PSM2_AM_FLAG_NONE    - No flags
+ *   PSM2_AM_FLAG_ASYNC   - Indicates no need to copy source data
+ *   PSM2_AM_FLAG_NOREPLY - The handler for this AM request is guaranteed not to
+ *                         generate a reply
+ *
+ * The PSM2 AM implementation will not dereference the args pointer after return
+ * from this function. If PSM2_AM_FLAG_ASYNC is not provided, the PSM2 AM
+ * implementation will not dereference the src pointer after return from this
+ * function. This may require the implementation to take a copy of the payload
+ * if the request cannot be issued immediately.  However, if PSM2_AM_FLAG_ASYNC
+ * is provided then a copy will not be taken and the PSM2 AM implementation
+ * retains ownership of the payload src memory until the request is locally
+ * complete. Local completion can be determined using the completion handler
+ * call-back, or through an AM handler associated with an AM reply.
+ *
+ * The PSM2_AM_FLAG_NOREPLY flag indicates ahead of time to the AM handler that
+ * a reply will not be generated. Use of this flag is optional, but it may
+ * enable a performance optimization in this case by indicating that reply
+ * state is not required.
+ *
+ * @param[in] epaddr End-point address to run handler on
+ * @param[in] handler Index of handler to run
+ * @param[in] args Array of arguments to be provided to the handler
+ * @param[in] nargs Number of arguments to be provided to the handler
+ * @param[in] src Pointer to the payload to be delivered to the handler
+ * @param[in] len Length of the payload in bytes
+ * @param[in] flags These are PSM2 AM flags and may be combined together with
+ *                  bitwise-or
+ * @param[in] completion_fn The completion function to be called locally when
+ *                          the remote handler is complete
+ * @param[in] completion_ctxt User-provided context pointer to be passed to the
+ *                            completion handler
+ *
+ * @returns PSM2_OK indicates success.
+ */
+psm2_error_t
+psm2_am_request_short(psm2_epaddr_t epaddr, psm2_handler_t handler,
+		     psm2_amarg_t *args, int nargs, void *src,
+		     size_t len, int flags,
+		     psm2_am_completion_fn_t completion_fn,
+		     void *completion_ctxt);
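A hedged sketch of issuing a request per the declaration above: one inline argument, a small payload, and no completion call-back. `epaddr` and `handler_idx` are assumed to come from psm2_ep_connect() and psm2_am_register_handlers() respectively, and `len` must not exceed the max_request_short value returned by psm2_am_get_parameters():

```c
static psm2_error_t send_request(psm2_epaddr_t epaddr, int handler_idx,
				 void *buf, size_t len)
{
	psm2_amarg_t arg;
	arg.u64 = 42; /* application-defined inline argument */

	return psm2_am_request_short(epaddr, (psm2_handler_t)handler_idx,
				     &arg, 1, buf, len,
				     PSM2_AM_FLAG_NONE,
				     NULL /* no completion fn   */,
				     NULL /* no completion ctxt */);
}
```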
+
+/** @brief Generate an AM reply.
+ *
+ * This function may only be called from an AM handler called due to an AM
+ * request.  If the AM request uses the PSM2_AM_FLAG_NOREPLY flag, the AM
+ * handler must not call this function. Otherwise, the AM request handler may
+ * call psm2_am_reply_short() at most once, and must pass in the token value
+ * that it received in its own handler call-back.
+ *
+ * This function generates an AM reply causing an AM handler function to be
+ * called in the PSM2 process associated with the specified end-point address.
+ * The number of arguments is limited to max_nargs and the payload length in
+ * bytes to max_reply_short returned by the psm2_am_get_parameters() function.
+ * If arguments are not required, set the number of arguments to 0 and the
+ * argument pointer will not be dereferenced. If payload is not required, set
+ * the payload size to 0 and the payload pointer will not be dereferenced.
+ *
+ * Optionally a completion function and completion context pointer can be
+ * provided, and a local call-back will be made to that function passing in
+ * that context pointer once remote execution of the handler has completed. If
+ * the completion call-back is not required, the handler should be specified as
+ * NULL and the pointer value will not be used.
+ *
+ * The allowed flags are any combination of the following combined with
+ * bitwise-or:
+ *   PSM2_AM_FLAG_NONE    - No flags
+ *   PSM2_AM_FLAG_ASYNC   - Indicates no need to copy source data
+ *
+ * The PSM2 AM implementation will not dereference the args pointer after return
+ * from this function. If PSM2_AM_FLAG_ASYNC is not provided, the PSM2 AM
+ * implementation will not dereference the src pointer after return from this
+ * function. This may require the implementation to take a copy of the payload
+ * if the reply cannot be issued immediately.  However, if PSM2_AM_FLAG_ASYNC is
+ * provided then a copy will not be taken and the PSM2 AM implementation retains
+ * ownership of the payload src memory until the reply is locally complete.
+ * Local completion can be determined using the completion handler call-back.
+ *
+ * @param[in] token Token value provided to the AM handler that is generating
+ *                  the reply.
+ * @param[in] handler Index of handler to run
+ * @param[in] args Array of arguments to be provided to the handler
+ * @param[in] nargs Number of arguments to be provided to the handler
+ * @param[in] src Pointer to the payload to be delivered to the handler
+ * @param[in] len Length of the payload in bytes
+ * @param[in] flags These are PSM2 AM flags and may be combined together with
+ *                  bitwise-or
+ * @param[in] completion_fn The completion function to be called locally when
+ *                          the remote handler is complete
+ * @param[in] completion_ctxt User-provided context pointer to be passed to the
+ *                            completion handler
+ *
+ * @returns PSM2_OK indicates success.
+ */
+psm2_error_t
+psm2_am_reply_short(psm2_am_token_t token, psm2_handler_t handler,
+		   psm2_amarg_t *args, int nargs, void *src,
+		   size_t len, int flags,
+		   psm2_am_completion_fn_t completion_fn,
+		   void *completion_ctxt);
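A sketch of a request handler that sends the one reply it is allowed, using the token it was given. `reply_handler_idx` is a hypothetical file-scope variable assumed to hold a handler index registered on the requestor side:

```c
static psm2_handler_t reply_handler_idx; /* assumed set at registration time */

static int replying_handler(psm2_am_token_t token, psm2_amarg_t *args,
			    int nargs, void *src, uint32_t len)
{
	psm2_amarg_t ack;
	ack.u64 = 1; /* application-defined acknowledgement value */

	/* At most one reply per request, and only from the request handler. */
	(void)psm2_am_reply_short(token, reply_handler_idx, &ack, 1,
				  NULL, 0, PSM2_AM_FLAG_NONE, NULL, NULL);
	return 0;
}
```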
+
+/** @brief Return the source end-point address for a token.
+ *
+ * This function is used to obtain the epaddr object representing the message
+ * initiator from a token passed by PSM2 to a message handler.
+ *
+ * @param[in] token Token value provided to the AM handler that is generating
+ *                  the reply.
+ * @param[out] epaddr_out Pointer to where the epaddr should be returned.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_PARAM_ERR token is invalid or epaddr_out is NULL.
+ */
+psm2_error_t psm2_am_get_source(psm2_am_token_t token,
+			      psm2_epaddr_t *epaddr_out);
+
+/** @brief AM parameters
+ *
+ * This structure is used to return PSM2 AM implementation-specific parameter
+ * values back to the caller of the psm2_am_get_parameters() function. This
+ * API also specifies the minimum values for these parameters that an
+ * implementation must at least provide:
+ *   max_handlers >= 64,
+ *   max_nargs >= 2,
+ *   max_request_short >= 256 and
+ *   max_reply_short >= 256.
+ */
+struct psm2_am_parameters {
+	/** Maximum number of handlers that can be registered. */
+	uint32_t max_handlers;
+	/** Maximum number of arguments to an AM handler. */
+	uint32_t max_nargs;
+	/** Maximum number of bytes in a request payload. */
+	uint32_t max_request_short;
+	/** Maximum number of bytes in a reply payload. */
+	uint32_t max_reply_short;
+};
+
+/** @brief Get the AM parameter values
+ *
+ * This function retrieves the implementation-specific AM parameter values for
+ * the specified end-point.
+ *
+ * @param[in] ep The end-point value returned by psm2_ep_open().
+ * @param[out] parameters Pointer to the struct where the parameters will be
+ *                        returned.
+ * @param[in] sizeof_parameters_in The size in bytes of the struct provided by
+ *                                 the caller.
+ * @param[out] sizeof_parameters_out The size in bytes of the struct returned
+ *                                   by PSM.
+ *
+ * @returns PSM2_OK indicates success.
+ */
+psm2_error_t
+psm2_am_get_parameters(psm2_ep_t ep,
+		      struct psm2_am_parameters *parameters,
+		      size_t sizeof_parameters_in,
+		      size_t *sizeof_parameters_out);
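A straightforward sketch of querying the AM limits for an end-point before sizing requests, using the struct and declaration above; the `query_max_request` name and the zero fallback are illustrative:

```c
static uint32_t query_max_request(psm2_ep_t ep)
{
	struct psm2_am_parameters params;
	size_t out_size = 0;

	if (psm2_am_get_parameters(ep, &params, sizeof(params), &out_size)
	    != PSM2_OK)
		return 0;

	return params.max_request_short; /* spec above guarantees >= 256 */
}
```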
+
+/*! @} */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/psm2_hal.c b/deps/libfabric/prov/psm3/psm3/psm2_hal.c
new file mode 100644
index 0000000000000000000000000000000000000000..546c6000dcef7bcc21ba1900f996c2245add1f98
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm2_hal.c
@@ -0,0 +1,423 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2017 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2017 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+
+#include "ptl_ips/ips_scb.h"
+
+static SLIST_HEAD(, _psmi_hal_instance) head_hi;
+
+/* define the current hal instance pointer */
+psmi_hal_instance_t *psmi_hal_current_hal_instance = NULL;
+
+/* psmi_hal_register_instance */
+void psmi_hal_register_instance(psmi_hal_instance_t *psm_hi)
+{
+#define REJECT_IMPROPER_HI(MEMBER) if (!psm_hi->MEMBER) return
+
+	/* If an attempt to register a hal instance contains a NULL func ptr, reject it. */
+	/* To allow fast lookups, please keep this code segment alphabetized by hfp_*
+	   func ptr member name: */
+#if PSMI_HAL_INST_CNT > 1
+
+	REJECT_IMPROPER_HI(hfp_close_context);
+	REJECT_IMPROPER_HI(hfp_context_open);
+	REJECT_IMPROPER_HI(hfp_context_initstats);
+
+
+	REJECT_IMPROPER_HI(hfp_finalize_);
+
+
+	REJECT_IMPROPER_HI(hfp_get_jkey);
+
+
+	REJECT_IMPROPER_HI(hfp_get_node_id);
+	REJECT_IMPROPER_HI(hfp_get_num_contexts);
+	REJECT_IMPROPER_HI(hfp_get_num_free_contexts);
+
+
+	REJECT_IMPROPER_HI(hfp_get_port_active);
+	REJECT_IMPROPER_HI(hfp_get_port_subnet);
+
+
+	REJECT_IMPROPER_HI(hfp_get_port_lid);
+
+
+	REJECT_IMPROPER_HI(hfp_get_port_rate);
+
+
+	REJECT_IMPROPER_HI(hfp_get_unit_active);
+
+
+	REJECT_IMPROPER_HI(hfp_spio_process_events);
+	REJECT_IMPROPER_HI(hfp_spio_transfer_frame);
+
+#endif // PSMI_HAL_INST_CNT > 1
+	REJECT_IMPROPER_HI(hfp_get_default_pkey);
+	REJECT_IMPROPER_HI(hfp_get_num_ports);
+	REJECT_IMPROPER_HI(hfp_get_num_units);
+	REJECT_IMPROPER_HI(hfp_initialize);
+
+#ifndef PSM2_MOCK_TESTING
+	if (!sysfs_init(psm_hi->hfi_sys_class_path))
+#endif
+		SLIST_INSERT_HEAD(&head_hi, psm_hi, next_hi);
+}
+
+static struct _psmi_hal_instance *psmi_hal_get_pi_inst(void);
+
+int psmi_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...)
+{
+	va_list ap;
+	va_start(ap, k);
+
+	int rv = 0;
+	struct _psmi_hal_instance *p = psmi_hal_get_pi_inst();
+
+	if (!p)
+		rv = -1;
+	else
+	{
+		switch(k)
+		{
+		case psmi_hal_pre_init_cache_func_get_num_units:
+			rv = p->params.num_units;
+			break;
+		case psmi_hal_pre_init_cache_func_get_num_ports:
+			rv = p->params.num_ports;
+			break;
+		case psmi_hal_pre_init_cache_func_get_unit_active:
+			{
+				int unit = va_arg(ap,int);
+
+				if ((unit >= 0) && (unit < p->params.num_units))
+				{
+					if (!p->params.unit_active_valid[unit]) {
+						p->params.unit_active_valid[unit] = 1;
+						p->params.unit_active[unit] = p->hfp_get_unit_active(unit);
+					}
+					rv = p->params.unit_active[unit];
+				}
+				else
+					rv = -1;
+			}
+			break;
+		case psmi_hal_pre_init_cache_func_get_port_active:
+			{
+				int unit = va_arg(ap,int);
+
+				if ((unit >= 0) && (unit < p->params.num_units))
+				{
+					int port = va_arg(ap,int);
+					if ((port >= 1) && (port <= p->params.num_ports))
+					{
+						if (!p->params.port_active_valid[unit*port]) {
+							p->params.port_active_valid[unit*port] = 1;
+							p->params.port_active[unit*port] = p->hfp_get_port_active(unit,port);
+						}
+						rv = p->params.port_active[unit*port];
+					}
+					else
+						rv = -1;
+				}
+				else
+					rv = -1;
+			}
+			break;
+		case psmi_hal_pre_init_cache_func_get_num_contexts:
+			{
+				int unit = va_arg(ap,int);
+				if ((unit >= 0) && (unit < p->params.num_units))
+				{
+					if (!p->params.num_contexts_valid[unit]) {
+						p->params.num_contexts_valid[unit] = 1;
+						p->params.num_contexts[unit] = p->hfp_get_num_contexts(unit);
+					}
+					rv = p->params.num_contexts[unit];
+				}
+				else
+					rv = -1;
+			}
+			break;
+		case psmi_hal_pre_init_cache_func_get_num_free_contexts:
+			{
+				int unit = va_arg(ap,int);
+
+				if ((unit >= 0) && (unit < p->params.num_units))
+				{
+					if (!p->params.num_free_contexts_valid[unit]) {
+						p->params.num_free_contexts_valid[unit] = 1;
+						p->params.num_free_contexts[unit] = p->hfp_get_num_free_contexts(unit);
+					}
+					rv = p->params.num_free_contexts[unit];
+				}
+				else
+					rv = -1;
+			}
+			break;
+		case psmi_hal_pre_init_cache_func_get_default_pkey:
+			rv = p->params.default_pkey;
+			break;
+		case psmi_hal_pre_init_cache_func_get_port_subnet:
+			{
+				int unit = va_arg(ap,int);
+
+				if ((unit >= 0) && (unit < p->params.num_units))
+				{
+					int port = va_arg(ap,int);
+					if ((port >= 1) && (port <= p->params.num_ports))
+					{
+						if (!p->params.port_subnet_valid[unit*port]) {
+							rv = p->hfp_get_port_subnet(unit, port,
+									&p->params.port_subnet[unit*port],
+									&p->params.port_subnet_addr[unit*port],
+									&p->params.port_ip_addr[unit*port],
+									&p->params.port_netmask[unit*port],
+									&p->params.port_subnet_idx[unit*port],
+									&p->params.port_subnet_gid_hi[unit*port],
+									&p->params.port_subnet_gid_lo[unit*port]);
+							if (rv == 0)
+								p->params.port_subnet_valid[unit*port] = 1;
+							else
+								p->params.port_subnet_valid[unit*port] = -1;
+						}
+						uint64_t* subnet = va_arg(ap,uint64_t*);
+						uint64_t* addr = va_arg(ap,uint64_t*);
+						uint32_t* ip_addr = va_arg(ap,uint32_t*);
+						uint32_t* netmask = va_arg(ap,uint32_t*);
+						int* idx = va_arg(ap,int*);
+						uint64_t* hi = va_arg(ap,uint64_t*);
+						uint64_t* lo = va_arg(ap,uint64_t*);
+						rv = (p->params.port_subnet_valid[unit*port] ==1)? 0: -1;
+						if (subnet) *subnet = p->params.port_subnet[unit*port];
+						if (addr) *addr = p->params.port_subnet_addr[unit*port];
+						if (ip_addr) *ip_addr = p->params.port_ip_addr[unit*port];
+						if (netmask) *netmask = p->params.port_netmask[unit*port];
+						if (idx) *idx = p->params.port_subnet_idx[unit*port];
+						if (hi) *hi = p->params.port_subnet_gid_hi[unit*port];
+						if (lo) *lo = p->params.port_subnet_gid_lo[unit*port];
+					}
+					else
+						rv = -1;
+				}
+				else
+					rv = -1;
+			}
+			break;
+		default:
+			rv = -1;
+			break;
+		}
+	}
+
+	va_end(ap);
+	return rv;
+}
+
+static void psmi_hal_free_cache(struct _psmi_hal_instance *p)
+{
+#define FREE_HAL_CACHE(field) \
+	do { \
+		if (p->params.field) \
+			psmi_free(p->params.field); \
+		p->params.field = NULL; \
+	} while (0)
+
+	FREE_HAL_CACHE(unit_active);
+	FREE_HAL_CACHE(unit_active_valid);
+	FREE_HAL_CACHE(port_active);
+	FREE_HAL_CACHE(port_active_valid);
+	FREE_HAL_CACHE(num_contexts);
+	FREE_HAL_CACHE(num_contexts_valid);
+	FREE_HAL_CACHE(num_free_contexts);
+	FREE_HAL_CACHE(num_free_contexts_valid);
+	FREE_HAL_CACHE(port_subnet_valid);
+	FREE_HAL_CACHE(port_subnet);
+	FREE_HAL_CACHE(port_subnet_addr);
+	FREE_HAL_CACHE(port_ip_addr);
+	FREE_HAL_CACHE(port_netmask);
+	FREE_HAL_CACHE(port_subnet_idx);
+	FREE_HAL_CACHE(port_subnet_gid_hi);
+	FREE_HAL_CACHE(port_subnet_gid_lo);
+#undef FREE_HAL_CACHE
+	p->params.sw_status = 0;
+}
+
+static struct _psmi_hal_instance *psmi_hal_get_pi_inst(void)
+{
+
+	if (psmi_hal_current_hal_instance)
+		return psmi_hal_current_hal_instance;
+
+	if (SLIST_EMPTY(&head_hi))
+		return NULL;
+
+	/* At this point, assuming multiple HAL INSTANCES are registered, and
+	   two or more of them are capable of initialization on this host, the
+	   environment variable PSM3_HAL_PREF allows the user to identify the
+	   one HAL INSTANCE that should be used. The default policy, when
+	   PSM3_HAL_PREF is not set, is to use the first hal instance that
+	   successfully initializes. */
+
+	union psmi_envvar_val env_hi_pref; /* HAL instance preference */
+	psmi_getenv("PSM3_HAL_PREF",
+		    "Indicate preference for HAL instance (Default is use first HAL"
+		    " instance to successfully initialize))",
+		    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+		    (union psmi_envvar_val)PSM_HAL_INSTANCE_ANY_GEN, &env_hi_pref);
+
+	/* The hfp_get_num_units() call below will not wait for the HFI driver
+	   to come up and create device nodes in /dev/. */
+	struct _psmi_hal_instance *p;
+	SLIST_FOREACH(p, &head_hi, next_hi)
+	{
+		if ((env_hi_pref.e_int == PSM_HAL_INSTANCE_ANY_GEN) ||
+		    (p->type == env_hi_pref.e_int))
+		{
+			const int valid_flags = PSM_HAL_PARAMS_VALID_DEFAULT_PKEY |
+				PSM_HAL_PARAMS_VALID_NUM_UNITS |
+				PSM_HAL_PARAMS_VALID_NUM_PORTS | PSM_HAL_PARAMS_VALID_CACHE;
+
+			if ((p->params.sw_status & valid_flags) == valid_flags)
+				return p;
+
+			int nunits = p->hfp_get_num_units();
+			int nports = p->hfp_get_num_ports();
+			int dflt_pkey = p->hfp_get_default_pkey();
+			if (nunits > 0 && nports > 0
+#ifndef PSM2_MOCK_TESTING
+			    && (0 == sysfs_init(p->hfi_sys_class_path))
+#endif
+				)
+			{
+				p->params.num_units = nunits;
+				p->params.num_ports = nports;
+				p->params.default_pkey = dflt_pkey;
+				// unit is 0 to nunits-1
+				// port is 1 to nports
+				// size extra entry for ports below, entry 0 unused
+#define ALLOC_HAL_CACHE(field, type, cnt) \
+	do { \
+		p->params.field = (type *)psmi_calloc(PSMI_EP_NONE, UNDEFINED, cnt, sizeof(type)); \
+		if (! p->params.field) goto fail_cache_alloc; \
+	} while (0)
+
+				ALLOC_HAL_CACHE(unit_active, int8_t, nunits);
+				ALLOC_HAL_CACHE(unit_active_valid, int8_t, nunits);
+				ALLOC_HAL_CACHE(port_active, int8_t, nunits*(nports+1));
+				ALLOC_HAL_CACHE(port_active_valid, int8_t, nunits*(nports+1));
+				ALLOC_HAL_CACHE(num_contexts, uint16_t, nunits);
+				ALLOC_HAL_CACHE(num_contexts_valid, uint16_t, nunits);
+				ALLOC_HAL_CACHE(num_free_contexts, uint16_t, nunits);
+				ALLOC_HAL_CACHE(num_free_contexts_valid, uint16_t, nunits);
+				ALLOC_HAL_CACHE(port_subnet_valid, int8_t, nunits*(nports+1));
+				ALLOC_HAL_CACHE(port_subnet, uint64_t, nunits*(nports+1));
+				ALLOC_HAL_CACHE(port_subnet_addr, uint64_t, nunits*(nports+1));
+				ALLOC_HAL_CACHE(port_ip_addr, uint32_t, nunits*(nports+1));
+				ALLOC_HAL_CACHE(port_netmask, uint32_t, nunits*(nports+1));
+				ALLOC_HAL_CACHE(port_subnet_idx, int, nunits*(nports+1));
+				ALLOC_HAL_CACHE(port_subnet_gid_hi, uint64_t, nunits*(nports+1));
+				ALLOC_HAL_CACHE(port_subnet_gid_lo, uint64_t, nunits*(nports+1));
+				p->params.sw_status |= valid_flags;
+#undef ALLOC_HAL_CACHE
+				return p;
+			}
+		}
+	}
+	return NULL;
+
+fail_cache_alloc:
+	psmi_hal_free_cache(p);
+	return NULL;
+}
+
+/* psmi_hal_initialize */
+int psmi_hal_initialize(void)
+{
+	struct _psmi_hal_instance *p = psmi_hal_get_pi_inst();
+
+	if (!p)
+		return -PSM_HAL_ERROR_INIT_FAILED;
+
+	int rv = p->hfp_initialize(p);
+
+	if (!rv)
+	{
+		psmi_hal_current_hal_instance = p;
+
+
+		return rv;
+	}
+	return -PSM_HAL_ERROR_INIT_FAILED;
+}
+
+int psmi_hal_finalize(void)
+{
+	struct _psmi_hal_instance *p = psmi_hal_current_hal_instance;
+
+	int rv = psmi_hal_finalize_();
+	psmi_hal_free_cache(p);
+	psmi_hal_current_hal_instance = NULL;
+	sysfs_fini();
+	return rv;
+}
+
+
+#ifdef PSM2_MOCK_TESTING
+
+#include "psm_hal_gen1/opa_user_gen1.h"
+
+
+
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/psm2_hal.h b/deps/libfabric/prov/psm3/psm3/psm2_hal.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd2f9cc715c16ed06fa696de7764dea7b087d639
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm2_hal.h
@@ -0,0 +1,373 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2017 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2017 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __PSM2_HAL_H__
+
+#define __PSM2_HAL_H__
+
+#include "psm_user.h"
+
+/* Forward declaration of PSM structs: */
+struct ips_recvhdrq_event;
+struct ips_flow;
+struct ips_scb;
+struct ips_epinfo;
+struct ips_message_header;
+
+/* Declare types: */
+typedef enum
+{
+	PSM_HAL_INSTANCE_ANY_GEN =  0,
+	PSM_HAL_INSTANCE_GEN1    =  1,
+	PSM_HAL_INSTANCE_GEN2    =  2,
+	PSM_HAL_INSTANCE_GEN3    =  3,
+
+#ifdef PSM2_MOCK_TESTING
+	PSM_HAL_INSTANCE_MOCK    = 99,
+#endif
+} psmi_hal_instance_type;
+
+typedef enum
+{
+	/* Operation was successful.  No error occurred. */
+	PSM_HAL_ERROR_OK			= 0,
+	/* The operation can not be done unless HAL is initialized first. */
+	PSM_HAL_ERROR_NOT_INITIALIZED		= 1,
+	/* No HAL INSTANCE has been registered.  Initialization is impossible. */
+	PSM_HAL_ERROR_NO_HI_REGISTERED		= 2,
+	/* Initialization failure. */
+	PSM_HAL_ERROR_INIT_FAILED		= 3,
+	/* Can't open device file. */
+	PSM_HAL_ERROR_CANNOT_OPEN_DEVICE	= 4,
+	/* Can't open context. */
+	PSM_HAL_ERROR_CANNOT_OPEN_CONTEXT	= 5,
+	/* Context is not open. */
+	PSM_HAL_ERROR_CONTEXT_IS_NOT_OPEN	= 6,
+	/* General error. */
+	PSM_HAL_ERROR_GENERAL_ERROR		= 7,
+	/* Not implemented. */
+	PSM_HAL_ERROR_NOT_IMPLEMENTED		= 8,
+	/* Internal error. */
+	PSM_HAL_ERROR_INTERNAL_ERROR		= 9,
+
+	/* HAL instances should not return errors less than the value
+	   PSM_HAL_ERROR_RESERVED_BY_HAL_API.  These errors are reserved by
+	   the HAL API layer. */
+	PSM_HAL_ERROR_RESERVED_BY_HAL_API	= 1000,
+} psmi_hal_errors;
+
+
+
+/* The following enum constants correspond to the bits in the
+   cap_mask member of the psmi_hal_params_t. */
+typedef enum
+{
+	PSM_HAL_CAP_GPUDIRECT_OT		= (1UL << 16),
+	PSM_HAL_CAP_USER_MR			= (1UL << 17),
+	PSM_HAL_CAP_EVICT			= (1UL << 18),
+} psmi_hal_capability_bits;
+
+/* The following enum constants correspond to the bits in the
+   sw_status member of the psmi_hal_params_t. */
+typedef enum
+{
+	/* Request to start rx thread. */
+	PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD	= (1UL <<  0),
+	/* Rx thread is started. */
+	PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED	= (1UL <<  1),
+	PSM_HAL_PSMI_RUNTIME_INTR_ENABLED       = (1UL <<  2),
+	PSM_HAL_PARAMS_VALID_NUM_UNITS          = (1UL <<  4),
+	PSM_HAL_PARAMS_VALID_NUM_PORTS          = (1UL <<  5),
+	PSM_HAL_PARAMS_VALID_DEFAULT_PKEY       = (1UL <<  6),
+	PSM_HAL_PARAMS_VALID_CACHE              = (1UL <<  7),
+
+} psmi_hal_sw_status;
+
+/* The _psmi_hal_params structure stores values that remain constant for the
+   entire life of the process; it resides in the hal instance structure (below).
+   The values are settled after the context is opened. */
+typedef struct _psmi_hal_params
+{
+	uint32_t   cap_mask;
+	uint32_t   sw_status;
+	/* start cached members */
+	uint16_t   num_units;
+	uint16_t   num_ports;
+	uint16_t   default_pkey;
+	int8_t     *unit_active,*unit_active_valid;
+	int8_t     *port_active,*port_active_valid;
+	uint16_t   *num_contexts,*num_contexts_valid;
+	uint16_t   *num_free_contexts,*num_free_contexts_valid;
+		// information from port_get_subnet
+	int8_t     *port_subnet_valid;
+	uint64_t   *port_subnet;
+	uint64_t   *port_subnet_addr;
+	uint32_t   *port_ip_addr;
+	uint32_t   *port_netmask;
+	int        *port_subnet_idx;
+	uint64_t   *port_subnet_gid_hi;
+	uint64_t   *port_subnet_gid_lo;
+} psmi_hal_params_t;
+
+
+#define PSM_HAL_ALG_ACROSS     0
+#define PSM_HAL_ALG_WITHIN     1
+#define PSM_HAL_ALG_ACROSS_ALL 2
+
+
+typedef enum {
+	PSMI_HAL_POLL_TYPE_URGENT = 1
+} psmi_hal_poll_type;
+
+/* Forward declaration of incomplete struct type _psmi_hal_instance and
+ * psmi_hal_instance_t typedef: */
+
+struct _psmi_hal_instance;
+typedef struct _psmi_hal_instance psmi_hal_instance_t;
+
+struct _psmi_hal_instance
+{
+	SLIST_ENTRY(_psmi_hal_instance) next_hi;
+	psmi_hal_instance_type		type;
+	const char			*description;
+	const char			*hfi_name;
+	const char			*hfi_sys_class_path;
+	/* The params member should be read-only for HIC, and
+	   written only by the HAL instance. */
+	psmi_hal_params_t		params;
+	/* Initialize the HAL INSTANCE. */
+	int (*hfp_initialize)(psmi_hal_instance_t *);
+	/* Finalize the HAL INSTANCE. */
+	int (*hfp_finalize_)(void);
+
+	/* Returns the number of hfi units installed on this host:
+	   NOTE: hfp_get_num_units is a function that must
+	   be callable before the hal instance is initialized. */
+	int (*hfp_get_num_units)(void);
+
+	/* Returns the number of ports on each hfi unit installed
+	   on this host.
+	   NOTE: hfp_get_num_ports is a function that must
+	   be callable before the hal instance is initialized. */
+	int (*hfp_get_num_ports)(void);
+
+	/* Returns the default pkey:
+	   NOTE: hfp_get_default_pkey is a function that must
+	   be callable before the hal instance is initialized. */
+	int (*hfp_get_default_pkey)(void);
+
+	/* Given a unit number, returns 1 if any port on the unit is active.
+	   returns 0 if no port on the unit is active.
+	   returns -1 when an error occurred.
+	   NOTE: hfp_get_unit_active is a function that must
+	   be callable before the hal instance is initialized. */
+	int (*hfp_get_unit_active)(int unit);
+
+	int (*hfp_get_port_active)(int unit,int port);
+	/* NOTE: hfp_get_num_contexts is a function that must
+	   be callable before the hal instance is initialized. */
+	int (*hfp_get_num_contexts)(int unit);
+	/* NOTE: hfp_get_num_free_contexts is a function that must
+	   be callable before the hal instance is initialized. */
+	int (*hfp_get_num_free_contexts)(int unit);
+
+	/* Context open includes opening the device file, and get hw params. */
+	int (*hfp_context_open)(int unit,
+				int port,
+				uint64_t open_timeout,
+				psm2_ep_t ep,
+				psm2_uuid_t const job_key,
+				psmi_context_t *psm_ctxt,
+				uint32_t cap_mask,
+				unsigned retryCnt);
+
+	/* Initialize PSM3_PRINT_STATS stats for given ep */
+	void (*hfp_context_initstats)(psm2_ep_t ep);
+
+	/* Close the context, including the device file. */
+	int (*hfp_close_context)(psmi_hal_hw_context *);
+
+
+	int (*hfp_get_port_rate)(int unit, int port);
+
+
+	int (*hfp_get_port_lid)(int unit, int port);
+	int (*hfp_get_port_subnet)(int unit, int port,
+				uint64_t *subnet, uint64_t *addr,
+				uint32_t *ip_addr, uint32_t *netmask,
+				int *idx, uint64_t *hi, uint64_t *lo);
+
+
+	/* End of receive functions. */
+
+
+	int (*hfp_spio_transfer_frame)(struct ips_proto *proto,
+				       struct ips_flow *flow, struct ips_scb *scb,
+				       uint32_t *payload, uint32_t length,
+				       uint32_t isCtrlMsg, uint32_t cksum_valid,
+				       uint32_t cksum, psmi_hal_hw_context
+#ifdef PSM_CUDA
+				, uint32_t is_cuda_payload
+#endif
+		);
+	int (*hfp_spio_process_events)(const struct ptl *ptl);
+	int (*hfp_get_node_id)(int unit, int *nodep);
+
+
+	int      (*hfp_get_jkey)(psmi_hal_hw_context);
+
+};
+
+/* This is the current psmi_hal_instance, or NULL if not initialized.
+   The HIC should not modify the contents of the HAL instance directly. */
+extern psmi_hal_instance_t *psmi_hal_current_hal_instance;
+
+/* Declare functions called by the HAL INSTANCES. */
+void psmi_hal_register_instance(psmi_hal_instance_t *);
+
+/* Declare functions that are called by the HIC: */
+/* All of these functions return a negative int value to
+   indicate failure, or >= 0 for success. */
+
+/* Chooses one of the psmi_hal_instances that have been
+    registered and then initializes it.
+    Returns: -PSM_HAL_ERROR_NO_HI_REGISTERED if no HAL
+    INSTANCES are registered, or -PSM_HAL_ERROR_INIT_FAILED when
+    another failure has occurred during initialization. */
+int psmi_hal_initialize(void);
+
+int psmi_hal_finalize(void);
+
+#include "psm2_hal_inlines_d.h"
+
+enum psmi_hal_pre_init_cache_func_krnls
+{
+	psmi_hal_pre_init_cache_func_get_num_units,
+	psmi_hal_pre_init_cache_func_get_num_ports,
+	psmi_hal_pre_init_cache_func_get_unit_active,
+	psmi_hal_pre_init_cache_func_get_port_active,
+	psmi_hal_pre_init_cache_func_get_num_contexts,
+	psmi_hal_pre_init_cache_func_get_num_free_contexts,
+	psmi_hal_pre_init_cache_func_get_default_pkey,
+	psmi_hal_pre_init_cache_func_get_port_subnet,
+};
+
+int psmi_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...);
+
+#define PSMI_HAL_DISPATCH_PI(KERNEL,...) ( psmi_hal_pre_init_cache_func(psmi_hal_pre_init_cache_func_ ## KERNEL , ##__VA_ARGS__ ) )
+
+#if PSMI_HAL_INST_CNT == 1
+
+#define PSMI_HAL_DISPATCH(KERNEL,...)    ( PSMI_HAL_CAT_INL_SYM(KERNEL) ( __VA_ARGS__ ) )
+
+#else
+
+#define PSMI_HAL_DISPATCH(KERNEL,...)    ( psmi_hal_current_hal_instance->hfp_ ## KERNEL ( __VA_ARGS__ ))
+
+#endif
+
+#define psmi_hal_get_num_units_(...)                           PSMI_HAL_DISPATCH_PI(get_num_units,##__VA_ARGS__)
+#define psmi_hal_get_num_ports_(...)                           PSMI_HAL_DISPATCH_PI(get_num_ports,##__VA_ARGS__)
+#define psmi_hal_get_unit_active(...)                          PSMI_HAL_DISPATCH_PI(get_unit_active,__VA_ARGS__)
+#define psmi_hal_get_port_active(...)                          PSMI_HAL_DISPATCH_PI(get_port_active,__VA_ARGS__)
+#define psmi_hal_get_num_contexts(...)                         PSMI_HAL_DISPATCH_PI(get_num_contexts,__VA_ARGS__)
+#define psmi_hal_get_num_free_contexts(...)                    PSMI_HAL_DISPATCH_PI(get_num_free_contexts,__VA_ARGS__)
+#define psmi_hal_get_default_pkey(...)			       PSMI_HAL_DISPATCH_PI(get_default_pkey,##__VA_ARGS__)
+#define psmi_hal_get_port_subnet(...)				PSMI_HAL_DISPATCH_PI(get_port_subnet,__VA_ARGS__)
+#define psmi_hal_context_open(...)				PSMI_HAL_DISPATCH(context_open,__VA_ARGS__)
+#define psmi_hal_context_initstats(...)				PSMI_HAL_DISPATCH(context_initstats,__VA_ARGS__)
+#define psmi_hal_close_context(...)				PSMI_HAL_DISPATCH(close_context,__VA_ARGS__)
+
+
+#define psmi_hal_get_port_rate(...)				PSMI_HAL_DISPATCH(get_port_rate,__VA_ARGS__)
+
+
+#define psmi_hal_get_port_lid(...)				PSMI_HAL_DISPATCH(get_port_lid,__VA_ARGS__)
+
+
+#define psmi_hal_finalize_(...)                                 PSMI_HAL_DISPATCH(finalize_,__VA_ARGS__)
+
+
+#define psmi_hal_get_user_major_bldtime_version(...)		PSMI_HAL_DISPATCH(get_user_major_bldtime_version,__VA_ARGS__)
+#define psmi_hal_get_user_minor_bldtime_version(...)		PSMI_HAL_DISPATCH(get_user_minor_bldtime_version,__VA_ARGS__)
+
+
+#define psmi_hal_spio_transfer_frame(...)			PSMI_HAL_DISPATCH(spio_transfer_frame,__VA_ARGS__)
+#define psmi_hal_spio_process_events(...)			PSMI_HAL_DISPATCH(spio_process_events,__VA_ARGS__)
+#define psmi_hal_get_node_id(...)				PSMI_HAL_DISPATCH(get_node_id,__VA_ARGS__)
+
+
+#define psmi_hal_get_jkey(...)					PSMI_HAL_DISPATCH(get_jkey,__VA_ARGS__)
+
+
+#define psmi_hal_get_hal_instance_type()			psmi_hal_current_hal_instance->type
+#define psmi_hal_get_hal_instance_description()			psmi_hal_current_hal_instance->description
+#define psmi_hal_get_hfi_name()					psmi_hal_current_hal_instance->hfi_name
+#define psmi_hal_get_num_units()				psmi_hal_current_hal_instance->params.num_units
+#define psmi_hal_get_num_ports()				psmi_hal_current_hal_instance->params.num_ports
+#define psmi_hal_get_cap_mask()					psmi_hal_current_hal_instance->params.cap_mask
+#define psmi_hal_set_cap_mask(NEW_MASK)				(psmi_hal_current_hal_instance->params.cap_mask = (NEW_MASK))
+#define psmi_hal_add_cap(CAP)					(psmi_hal_current_hal_instance->params.cap_mask |= (CAP))
+#define psmi_hal_sub_cap(CAP)					(psmi_hal_current_hal_instance->params.cap_mask &= (~(CAP)))
+#define psmi_hal_has_cap(CAP)                                   ((psmi_hal_get_cap_mask() & (CAP)) == (CAP))
+
+#define psmi_hal_get_sw_status()				psmi_hal_current_hal_instance->params.sw_status
+#define psmi_hal_set_sw_status(NEW_STATUS)			(psmi_hal_current_hal_instance->params.sw_status = (NEW_STATUS))
+#define psmi_hal_add_sw_status(STATUS)				(psmi_hal_current_hal_instance->params.sw_status |= (STATUS))
+#define psmi_hal_sub_sw_status(STATUS)				(psmi_hal_current_hal_instance->params.sw_status &= (~(STATUS)))
+#define psmi_hal_has_sw_status(STATUS)				((psmi_hal_get_sw_status() & (STATUS)) == (STATUS))
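Illustrative use of the dispatch and accessor macros above from inside the PSM3 sources (psm2_hal.h included), once a HAL instance has been selected by psmi_hal_initialize(); the surrounding function is a hypothetical sketch:

```c
static void example_hal_checks(void)
{
	/* Pre-init cached query, safe before full initialization. */
	if (psmi_hal_get_unit_active(0) <= 0)
		return;

	/* Capability check before taking a user-MR code path. */
	if (psmi_hal_has_cap(PSM_HAL_CAP_USER_MR)) {
		/* user-mode memory registration path */
	}

	/* Software-status bookkeeping: request the rx thread if not started. */
	if (!psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED))
		psmi_hal_add_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD);
}
```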
+
+
+#include "psm2_hal_inlines_i.h"
+
+#endif /* #ifndef __PSM2_HAL_H__ */
diff --git a/deps/libfabric/prov/psm3/psm3/psm2_hal_inline_t.h b/deps/libfabric/prov/psm3/psm3/psm2_hal_inline_t.h
new file mode 100644
index 0000000000000000000000000000000000000000..916c9998b4b5cb0b3a3e6c4df6d4a7a22dc38ed1
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm2_hal_inline_t.h
@@ -0,0 +1,122 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2017 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2017 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* The psm2_hal_inline_t.h file serves as a template to allow all HAL
+   instances to easily and conveniently declare their HAL methods. */
+
+static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(initialize)
+				(psmi_hal_instance_t *);
+static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(finalize_)
+				(void);
+static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_units)
+				(void);
+static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_ports)
+				(void);
+static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_unit_active)
+				(int unit);
+static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_node_id)
+				(int unit, int *nodep);
+static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_active)
+				(int unit, int port);
+static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_contexts)
+				(int unit);
+static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_free_contexts)
+				(int unit);
+static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(close_context)
+				(psmi_hal_hw_context *);
+static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(context_open)
+				(int unit,
+				 int port,
+				 uint64_t open_timeout,
+				 psm2_ep_t ep,
+				 psm2_uuid_t const job_key,
+				 psmi_context_t *psm_ctxt,
+				 uint32_t cap_mask,
+				 unsigned);
+static PSMI_HAL_INLINE void PSMI_HAL_CAT_INL_SYM(context_initstats)
+				 (psm2_ep_t ep);
+
+
+static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_rate)
+				(int unit, int port);
+
+
+static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_lid)
+				(int unit, int port);
+static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_subnet)
+				(int unit, int port, uint64_t *subnet, uint64_t *addr,
+				uint32_t *ip_addr, uint32_t *netmask,
+				int *idx, uint64_t *hi, uint64_t *lo);
+
+
+static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_default_pkey)
+				(void);
+
+
+static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(spio_transfer_frame)
+				(struct ips_proto *proto,
+				 struct ips_flow *flow, struct ips_scb *scb,
+				 uint32_t *payload, uint32_t length,
+				 uint32_t isCtrlMsg, uint32_t cksum_valid,
+				 uint32_t cksum, psmi_hal_hw_context
+#ifdef PSM_CUDA
+				 , uint32_t is_cuda_payload
+#endif
+					);
+static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(spio_process_events)
+				(const struct ptl *ptl);
+
+
+static PSMI_HAL_INLINE int      PSMI_HAL_CAT_INL_SYM(get_jkey)
+				(psmi_hal_hw_context ctxt);
+
diff --git a/deps/libfabric/prov/psm3/psm3/psm2_hal_inlines_d.h.in b/deps/libfabric/prov/psm3/psm3/psm2_hal_inlines_d.h.in
new file mode 100644
index 0000000000000000000000000000000000000000..99d4e4a6216006152d6e53cbf4b7c670547ef262
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm2_hal_inlines_d.h.in
@@ -0,0 +1,66 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2020 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2020 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#define PSMI_HAL_INST_CNT @PSM_HAL_CNT@
+
+#if PSMI_HAL_INST_CNT == 1
+
+#define PSMI_HAL_INLINE inline
+#define PSMI_HAL_CAT_INL_SYM(KERNEL) hfp_@PSM_HAL_INST@_##KERNEL
+#include "psm2_hal_inline_t.h"
+
+#else
+#define PSMI_HAL_INLINE /* nothing */
+
+#endif
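+
+/* A sketch of the intended expansion (assuming a single configured HAL
+ * instance named "gen1", i.e. PSM_HAL_INST=gen1 and PSM_HAL_CNT=1):
+ * a declaration in psm2_hal_inline_t.h such as
+ *
+ *   static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_units)(void);
+ *
+ * expands to
+ *
+ *   static inline int hfp_gen1_get_num_units(void);
+ *
+ * allowing the single instance's methods to be inlined rather than
+ * dispatched through the HAL function table. */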
+
diff --git a/deps/libfabric/prov/psm3/psm3/psm2_hal_inlines_i.h.in b/deps/libfabric/prov/psm3/psm3/psm2_hal_inlines_i.h.in
new file mode 100644
index 0000000000000000000000000000000000000000..af20bdb40f6fd294d98f33a58f851578761e61fe
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm2_hal_inlines_i.h.in
@@ -0,0 +1,58 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2020 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2020 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#if PSMI_HAL_INST_CNT == 1
+
+#include "psm_hal_@PSM_HAL_INST@/psm_hal_inline_i.h"
+
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/psm2_mq.h b/deps/libfabric/prov/psm3/psm3/psm2_mq.h
new file mode 100644
index 0000000000000000000000000000000000000000..7267b095a70f25d254015d34a8fdbdf9fc5c7d79
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm2_mq.h
@@ -0,0 +1,1650 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2017 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2017 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef PSM2_MQ_H
+#define PSM2_MQ_H
+
+#include <psm2.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * @file psm2_mq.h
+ * @brief PSM2 Matched Queues
+ *
+ * @page psm2_mq Matched Queues interface
+ *
+ * The Matched Queues (MQ) interface implements a queue-based communication
+ * model with the distinction that queue message consumers use a 3-tuple of
+ * metadata to match incoming messages against a list of preposted receive
+ * buffers.  These semantics are consistent with those presented by MPI-1.2
+ * and all the features and side-effects of Message-Passing find their way into
+ * Matched Queues.  There is currently a single MQ context per endpoint;
+ * if need be, MQs may expose a function to allocate more than
+ * one MQ context in the future.  Since an MQ is implicitly bound to a locally
+ * opened endpoint, all MQ functions use an MQ handle instead of an EP
+ * handle as a communication context.
+ *
+ * @section tagmatch MQ Tag Matching
+ *
+ * A successful MQ tag match requires an endpoint address (@ref psm2_epaddr_t)
+ * and a 3-tuple of tag objects.  Two of the tag objects are provided by the
+ * receiver when posting a receive buffer (@ref psm2_mq_irecv) and the last is
+ * provided by the sender as part of every message sent (@ref psm2_mq_send and
+ * @ref psm2_mq_isend).  Since MQ is a receiver-directed communication model,
+ * the tag matching done at the receiver involves matching the sent message's
+ * origin and send tag (@c stag) with the source endpoint address, tag (@c
+ * rtag), and tag selector (@c rtagsel) attached to every preposted receive
+ * buffer.  The incoming @c stag is compared to the posted @c rtag but only for
+ * significant bits set to @c 1 in the @c rtagsel.  The @c rtagsel can be used
+ * to mask off parts (or even all) of the bitwise comparison between sender and
+ * receiver tags.  A successful match causes the message to be received into
+ * the buffer with which the tag is matched.  If the incoming message is too
+ * large, it is truncated to the size of the posted receive buffer.  The
+ * bitwise operation corresponding to a successful match and receipt of an
+ * expected message amounts to the following expression evaluating as true:
+ *
+ *      @verbatim ((stag ^ rtag) & rtagsel) == 0 @endverbatim
+ *
+ * It is up to the user to encode (pack) tags into the 64-bit unsigned
+ * integers, including employing the @c rtagsel tag selector as a method to
+ * wildcard part or all of the bits significant in the tag matching operation.
+ * For example, MPI uses a triple based on context (MPI communicator), source
+ * rank, and send tag. The following code example shows how the triple can be
+ * packed into 64 bits:
+ *
+ * @code{.c}
+ 	//
+ 	// 64-bit send tag formed by packing the triple:
+ 	//
+ 	// ( context_id_16bits | source_rank_16bits | send_tag_32bits )
+ 	//
+ 	stag = ( (((context_id)&0xffffULL)<<48)|    \
+ 	         (((source_rank)&0xffffULL)<<32)|   \
+ 	         (((send_tag)&0xffffffffULL)) );
+   @endcode
+ *
+ * Similarly, the receiver applies the @c rtag matching bits and @c rtagsel
+ * masking bits against a list of send tags and returns the first successful
+ * match.  Zero bits in the @c tagsel can be used to indicate wildcarded bits
+ * in the 64-bit tag which can be useful for implementing MPI's
+ * @c MPI_ANY_SOURCE and @c MPI_ANY_TAG.  Following the example bit splicing in
+ * the above @c stag example:
+ *
+ * @code{.c}
+   	// Example MPI implementation where MPI_COMM_WORLD implemented as 0x3333
+  
+   	// MPI_Irecv source_rank=MPI_ANY_SOURCE, tag=7, comm=MPI_COMM_WORLD
+   	rtag    = 0x3333000000000007;
+   	rtagsel = 0xffff0000ffffffff;
+  
+   	// MPI_Irecv source_rank=3, tag=MPI_ANY_TAG, comm=MPI_COMM_WORLD
+   	rtag    = 0x3333000300000000;
+   	rtagsel = 0xffffffff80000000; // can't ignore sign bit in tag
+  
+   	// MPI_Irecv source_rank=MPI_ANY_SOURCE, tag=MPI_ANY_TAG, comm=MPI_COMM_WORLD
+   	rtag    = 0x3333000300000000;
+   	rtagsel = 0xffff000080000000; // can't ignore sign bit in tag
+   @endcode
+ *
+ *
+ * Applications that do not follow tag matching semantics can simply always
+ * pass a value of @c 0 for @c rtagsel, which will always yield a successful
+ * match to the first preposted buffer.  If a message cannot be matched to any
+ * of the preposted buffers, the message is delivered as an unexpected
+ * message.
+ *
+ * @section mq_receive MQ Message Reception
+ *
+ * MQ messages are either received as @e expected or @e unexpected: @li The
+ * received message is @e expected if the incoming message tag matches the
+ * combination of tag and tag selector of at least one of the user-provided
+ * receive buffers preposted with @ref psm2_mq_irecv.
+ *
+ * @li The received message is @e unexpected if the incoming message tag @b
+ * doesn't match any combination of tag and tag selector from all the
+ * user-provided receive buffers preposted with @ref psm2_mq_irecv.
+ *
+ * Unexpected messages are messages that the MQ library buffers until the
+ * user provides a receive buffer that can match the unexpected message.
+ * With Matched Queues and MPI alike, unexpected messages can occur as a
+ * side-effect of the programming model, whereby the arrival of messages can be
+ * slightly out of step with the ordering in which the user
+ * provides receive buffers.  Unexpected messages can also be triggered by the
+ * difference between the rate at which a sender produces messages and the rate
+ * at which a paired receiver can post buffers and hence consume the messages.
+ *
+ * In all cases, too many @e unexpected messages will negatively affect
+ * performance.  Users can employ some of the following mechanisms to reduce
+ * the effect of added memory allocations and copies that result from
+ * unexpected messages:
+ *   @li If and when possible, receive buffers should be posted as early as
+ *       possible and ideally before calling into the progress engine.
+ *   @li Use of rendezvous messaging that can be controlled with
+ *       @ref PSM2_MQ_RNDV_HFI_SZ and @ref PSM2_MQ_RNDV_SHM_SZ options.  These
+ *       options default to values determined to make effective use of
+ *       bandwidth and are hence not advisable for all communication message
+ *       sizes, but rendezvous messages inherently prevent unexpected
+ *       messages by synchronizing the sender with the receiver beforehand.
+ *   @li The amount of memory that is allocated to handle unexpected messages
+ *       can be bounded by adjusting the Global @ref PSM2_MQ_MAX_SYSBUF_MBYTES
+ *       option.
+ *   @li MQ statistics, such as the number of received unexpected messages and
+ *       the aggregate amount of unexpected bytes are available in the @ref
+ *       psm2_mq_stats structure.
+ *
+ * Whenever a match occurs, whether the message is expected or unexpected, it
+ * is generally up to the user to ensure that the message is not truncated.
+ * Message truncation occurs when the size of the preposted buffer is less than
+ * the size of the incoming matched message.  MQ will correctly handle
+ * message truncation by always copying the appropriate number of bytes so as
+ * not to overwrite any data.  While it is valid to send less data than the amount
+ * of data that has been preposted, messages that are truncated will be marked
+ * @ref PSM2_MQ_TRUNCATION as part of the error code in the message status
+ * structure (@ref psm2_mq_status_t or @ref psm2_mq_status2_t).
+ *
+ * @section mq_completion MQ Completion Semantics
+ *
+ * Message completion in Matched Queues follows local completion semantics.
+ * When sending an MQ message, it is deemed complete when MQ guarantees that
+ * the source data has been sent and that the entire input source data memory
+ * location can be safely overwritten.  As with standard Message-Passing,
+ * MQ does not make any remote completion guarantees for sends.  MQ does
+ * however, allow a sender to synchronize with a receiver to send a synchronous
+ * message which sends a message only after a matching receive buffer has been
+ * posted by the receiver (@ref PSM2_MQ_FLAG_SENDSYNC).
+ *
+ * A receive is deemed complete after it has matched its associated receive
+ * buffer with an incoming send and that the data from the send has been
+ * completely delivered to the receive buffer.
+ *
+ * @section mq_progress MQ Progress Requirements
+ *
+ * Progress on MQs must be @e explicitly ensured by the user for correctness.
+ * The progress requirement holds even if certain areas of the MQ
+ * implementation require less network attention than others, or if progress
+ * may internally be guaranteed through interrupts.  The main polling function,
+ * @ref psm2_poll, is the most general form of ensuring progress on a given
+ * endpoint.  Calling @ref psm2_poll ensures that progress is made over all the
+ * MQs and other components instantiated over the endpoint passed to @ref
+ * psm2_poll.
+ *
+ * While @ref psm2_poll is the only way to directly ensure progress, other MQ
+ * functions will conditionally ensure progress depending on how they are used:
+ *
+ * @li @ref psm2_mq_wait employs polling and waits until the request is
+ * completed.  For blocking communication operations where the caller is
+ * waiting on a single send or receive to complete, psm2_mq_wait usually
+ * provides the best responsiveness in terms of latency.
+ *
+ * @li @ref psm2_mq_test can test a particular request for completion, but @b
+ * never directly or indirectly ensures progress as it only tests the
+ * completion status of a request, nothing more.  See functional documentation
+ * in @ref psm2_mq_test for a detailed discussion.
+ *
+ * @li @ref psm2_mq_ipeek ensures progress if and only if the MQ's completion
+ * queue is empty and will not ensure progress as long as the completion queue
+ * is non-empty.  Users that always aggressively process all elements of the MQ
+ * completion queue as part of their own progress engine will indirectly always
+ * ensure MQ progress. The ipeek mechanism is the preferred way for
+ * ensuring progress when many non-blocking requests are in flight since ipeek
+ * returns requests in the order in which they complete.  Depending on how the
+ * user initiates and completes communication, this may be preferable to
+ * calling other progress functions on individual requests.
+ */
+
+/*! @defgroup mq PSM Matched Queues
+ *
+ * @{
+ */
+
+/** @brief Initialize the MQ component for MQ communication
+ *
+ * This function provides the Matched Queue handle necessary to perform all
+ * Matched Queue communication operations.
+ *
+ * @param[in] ep Endpoint over which to initialize Matched Queue
+ * @param[in] ignored
+ * @param[in] opts Set of options for Matched Queue
+ * @param[in] numopts Number of options passed
+ * @param[out] mq User-supplied storage to return the Matched Queue handle
+ *                associated to the newly created Matched Queue.
+ *
+ * @remark This function can be called many times to retrieve the MQ handle
+ *         associated to an endpoint, but options are only considered the first
+ *         time the function is called.
+ *
+ * @post The user obtains a handle to an instantiated Matched Queue.
+ *
+ * The following error code is returned.  Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK A new Matched Queue has been instantiated across all the
+ *         members of the group.
+ *
+ * @code{.c}
+   	int try_open_endpoint_and_initialize_mq(
+   	       psm2_ep_t *ep,	// endpoint handle
+   	       psm2_epid_t *epid, // unique endpoint ID
+   	       psm2_uuid_t job_uuid, // unique job uuid, for ep_open
+   	       psm2_mq_t *mq, // MQ handle initialized on endpoint 'ep'
+   	       uint64_t communicator_bits) // Where we store our communicator or
+   	                                   // context bits in the 64-bit tag.
+   	{
+   	    // Simplified open, see psm2_ep_open documentation for more info
+   	    psm2_ep_open(job_uuid,
+   	                NULL, // no options
+   	                ep, epid);
+
+   	    // We initialize a matched queue by telling PSM the bits that are
+   	    // order-significant in the tag.  Point-to-point ordering will not be
+   	    // maintained between senders where the communicator bits are not the
+   	    // same.
+   	    psm2_mq_init(ep,
+   	                communicator_bits,
+   	                NULL, // no other MQ options
+   	                0,    // 0 options passed
+   	                mq);  // newly initialized matched Queue
+
+   	    return 1;
+   	}
+   @endcode
+ */
+psm2_error_t
+psm2_mq_init(psm2_ep_t ep, uint64_t ignored,
+	    const struct psm2_optkey *opts, int numopts, psm2_mq_t *mq);
+
+#define PSM2_MQ_ORDERMASK_NONE	0ULL
+	/**< This macro is reserved for future tag order masking support. */
+
+#define PSM2_MQ_ORDERMASK_ALL	0xffffffffffffffffULL
+	/**< This macro is reserved for future tag order masking support. */
+
+/** @brief Finalize (close) an MQ handle
+ *
+ * The following error code is returned.  Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK A given Matched Queue has been freed and any future
+ * use of the handle produces undefined results.
+ */
+psm2_error_t
+psm2_mq_finalize(psm2_mq_t mq);
+
+#define PSM2_MQ_TAG_ELEMENTS 4
+	/**< Represents the number of 32-bit elements in the psm2_mq_tag_t
+	 *   type: three tag values plus one extra element to keep alignment
+	 *   and padding at 16 bytes.  */
+
+/** @struct psm2_mq_tag
+ ** @brief MQ Message tag
+ *
+ * Extended message tag type introduced in PSM 2.0.  The previous 64 bit tag
+ * values are replaced by a struct containing three 32 bit tag values for a
+ * total of 96 bits.  Matching semantics are unchanged from the previous 64-bit
+ * matching scheme; the only difference is that 96 bits are matched instead of
+ * 64.  For interoperability with existing PSM routines, 64 bit tags are
+ * extended to a 96 bit tag by setting the upper 32 bits (tag[2] or tag2) to
+ * zero.  Other than this caveat, all of the existing routines using 64-bit
+ * tags are interchangeable with PSM2 routines using this psm2_mq_tag_t type.
+ * For example, a message sent using @ref psm2_mq_send can be received using
+ * @ref psm2_mq_irecv2, provided the tags match as described above.
+ */
+typedef
+//struct psm2_mq_tag {
+union psm2_mq_tag {
+	uint32_t tag[PSM2_MQ_TAG_ELEMENTS]; /* No longer specifying
+					     * alignment as it makes
+					     * code break with newer
+					     * compilers. */
+	/**< 3 x 32-bit tag values plus one padding element (see PSM2_MQ_TAG_ELEMENTS) */
+	struct {
+		uint32_t tag0; /**< 1 of 3 uint32_t tag values */
+		uint32_t tag1; /**< 2 of 3 uint32_t tag values */
+		uint32_t tag2; /**< 3 of 3 uint32_t tag values */
+	};
+	struct {
+		uint64_t tag64; /**< uint64_t tag values */
+		uint32_t res; /**< uint32_t reserved */
+	};
+} psm2_mq_tag_t;
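+
+/* A minimal usage sketch of the union's three equivalent views (the
+ * byte-level comments assume a little-endian host):
+ *
+ *   psm2_mq_tag_t t;
+ *   t.tag64 = 0x1234567890abcdefULL; // legacy 64-bit tag value
+ *   t.tag2  = 0;                     // upper 32 bits zeroed, per 64-bit interop
+ *   // now t.tag0 == 0x90abcdef and t.tag1 == 0x12345678
+ */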
+
+/** @brief MQ Non-blocking operation status
+ *
+ * Message completion status for asynchronous communication operations.
+ * For wait and test functions, MQ fills in the structure upon completion.
+ * Upon completion, receive requests fill in every field of the status
+ * structure while send requests only return a valid error_code and context
+ * pointer.
+ */
+typedef
+struct psm2_mq_status {
+	/** Sender's original message tag (receive reqs only) */
+	uint64_t msg_tag;
+	/** Sender's original message length (receive reqs only) */
+	uint32_t msg_length;
+	/** Actual number of bytes transferred (receive reqs only) */
+	uint32_t nbytes;
+	/** MQ error code for communication operation */
+	psm2_error_t error_code;
+	/** User-associated context for send or receive */
+	void *context;
+} psm2_mq_status_t;
+
+/** @brief MQ Non-blocking operation status
+ *
+ * Message completion status for asynchronous communication operations.  For
+ * wait and test functions, MQ fills in the structure upon completion.  Upon
+ * completion, requests fill in every field of the status structure with the
+ * exception of the nbytes field, which is only valid for receives.  Version 2
+ * of the status type contains an @ref psm2_mq_tag_t type to represent the tag
+ * instead of a 64-bit integer value and is for use with PSM v2 routines.
+ */
+
+typedef
+struct psm2_mq_status2 {
+	/** Remote peer's epaddr */
+	psm2_epaddr_t msg_peer;
+	/** Sender's original message tag */
+	psm2_mq_tag_t msg_tag __attribute__ ((aligned(16)));/* Alignment added
+							     * to preserve the
+							     * layout as is
+							     * expected by
+							     * existent code */
+	/** Sender's original message length */
+	uint32_t msg_length;
+	/** Actual number of bytes transferred (receiver only) */
+	uint32_t nbytes;
+	/** MQ error code for communication operation */
+	psm2_error_t error_code;
+	/** User-associated context for send or receive */
+	void *context;
+} psm2_mq_status2_t;
+
+/** @brief PSM2 Communication handle (opaque) */
+typedef struct psm2_mq_req *psm2_mq_req_t;
+
+
+/** @brief MQ Request Struct
+ *
+ * Message completion request for asynchronous communication operations.
+ * Upon completion, requests are filled with the valid data for the
+ * corresponding send/recv operation that was completed. This datatype
+ * contains the status data and is converted into the
+ * mq_status structures in wait/test functions.
+ */
+struct psm2_mq_req_user {
+	/* Tag matching vars */
+	psm2_epaddr_t peer;
+	psm2_mq_tag_t tag __attribute__ ((aligned(16)));/* Alignment added
+							 * to preserve the
+							 * layout as is
+							 * expected by
+							 * existent code */
+	psm2_mq_tag_t tagsel;	/* used for receives */
+
+	/* Buffer attached to request.  May be a system buffer for unexpected
+	 * messages or a user buffer when an expected message */
+	uint8_t *buf;
+	uint32_t buf_len;
+	uint32_t error_code;
+
+	uint32_t recv_msglen;	/* Message length we are ready to receive */
+	uint32_t send_msglen;	/* Message length from sender */
+
+	/* Used for request to send messages */
+	void *context;		/* user context associated to sends or receives */
+
+	uint64_t user_reserved[4];
+};
+
+/*! @} */
+/*! @ingroup mq
+ * @defgroup mq_options PSM Matched Queue Options
+ * @{
+ *
+ * MQ options can be modified at any point at runtime, unless otherwise noted.
+ * The following example shows how to retrieve the current message size at
+ * which messages are sent synchronously.
+ *
+ * @code{.c}
+   	uint32_t get_hfirv_size(psm2_mq_t mq)
+   	{
+   	    uint32_t rvsize;
+   	    psm2_getopt(mq, PSM2_MQ_RNDV_HFI_SZ, &rvsize);
+   	    return rvsize;
+   	}
+   @endcode
+ */
+
+/** @brief Get an MQ option (Deprecated. Use psm2_getopt with PSM2_COMPONENT_MQ)
+ *
+ * Function to retrieve the value of an MQ option.
+ *
+ * @param[in] mq Matched Queue handle
+ * @param[in] option Index of option to retrieve.  Possible values are:
+ *            @li @ref PSM2_MQ_RNDV_HFI_SZ
+ *            @li @ref PSM2_MQ_RNDV_SHM_SZ
+ *            @li @ref PSM2_MQ_MAX_SYSBUF_MBYTES
+ *
+ * @param[in] value Pointer to storage that can be used to store the value of
+ *            the option to be set.  It is up to the user to ensure that the
+ *            pointer points to a memory location large enough to accommodate
+ *            the value associated to the type.  Each option documents the size
+ *            associated to its value.
+ *
+ * @returns PSM2_OK if option could be retrieved.
+ * @returns PSM2_PARAM_ERR if the option is not a valid option number
+ */
+psm2_error_t psm2_mq_getopt(psm2_mq_t mq, int option, void *value);
+
+/** @brief Set an MQ option (Deprecated. Use psm2_setopt with PSM2_COMPONENT_MQ)
+ *
+ * Function to set the value of an MQ option.
+ *
+ * @param[in] mq Matched Queue handle
+ * @param[in] option Index of option to set.  Possible values are:
+ *            @li @ref PSM2_MQ_RNDV_HFI_SZ
+ *            @li @ref PSM2_MQ_RNDV_SHM_SZ
+ *            @li @ref PSM2_MQ_MAX_SYSBUF_MBYTES
+ *
+ * @param[in] value Pointer to storage that contains the value to be updated
+ *                  for the supplied option number.  It is up to the user to
+ *                  ensure that the pointer points to a memory location with a
+ *                  correct size.
+ *
+ * @returns PSM2_OK if option could be set.
+ * @returns PSM2_PARAM_ERR if the option is not a valid option number
+ * @returns PSM2_OPT_READONLY if the option to be set is a read-only option
+ *                           (currently no MQ options are read-only).
+ */
+psm2_error_t psm2_mq_setopt(psm2_mq_t mq, int option, const void *value);
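+
+/* A sketch complementing the get_hfirv_size() example above: bounding the
+ * memory used for unexpected messages via psm2_mq_setopt (this assumes the
+ * PSM2_MQ_MAX_SYSBUF_MBYTES value is passed as a 64-bit megabyte count):
+ *
+ *   void bound_sysbuf(psm2_mq_t mq, uint64_t max_mbytes)
+ *   {
+ *       psm2_mq_setopt(mq, PSM2_MQ_MAX_SYSBUF_MBYTES, &max_mbytes);
+ *   }
+ */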
+
+/*! @}  */
+/*! @ingroup mq
+ * @{
+ */
+
+#define PSM2_MQ_FLAG_SENDSYNC	0x01
+				/**< MQ Send Force synchronous send */
+
+#define PSM2_MQ_REQINVALID	((psm2_mq_req_t)(NULL))
+				/**< MQ request completion value */
+
+#define PSM2_MQ_ANY_ADDR		((psm2_epaddr_t)NULL)
+				/**< MQ receive from any source epaddr */
+
+
+/** @brief MQ fast-path operation enumeration
+ *
+ * To provide for quick enqueuing of sends/receives from within an AM handler,
+ * PSM2 provides fast-path send/recv options that will enqueue those ops
+ * into the MQ. The supported fast-path operations are enumerated
+ * in the @ref psm2_mq_fp_op enum.
+ */
+enum psm2_mq_fp_op {
+	PSM2_MQ_ISEND_FP = 1,
+	PSM2_MQ_IRECV_FP,
+};
+
+/** @brief Post a fast-path isend/irecv into the MQ
+ *
+ * Function to only enqueue fast-path non-blocking sends or non-blocking recvs
+ * into a particular MQ. These calls only work if the process already holds
+ * the MQ progress lock; traditionally this only applies to calls from
+ * a registered AM handler.
+ *
+ * This function helps to enable one-sided communication models from middleware
+ * such as OFI to provide fast >2KB message transfers for RMA operations.
+ *
+ * When posting irecvs, for every MQ message received on a particular MQ
+ * the @c tag and @c tagsel parameters are matched against the incoming
+ * message's send tag as described in @ref tagmatch.
+ *
+ * When posting isends, the user guarantees that the source data will remain
+ * unmodified until the send is locally completed through a call such as
+ * @ref psm2_mq_wait or @ref psm2_mq_test.
+ *
+ * Progress on the operations enqueued into the MQ may not occur until
+ * the next PSM2 progress API is invoked.
+ *
+ * @param[in] ep PSM2 endpoint
+ * @param[in] mq Matched Queue Handle
+ * @param[in] addr Destination EP address (used only on isends)
+ * @param[in] tag Send/Receive tag
+ * @param[in] tagsel Receive tag selector (used only on irecvs)
+ * @param[in] flags Send/Receive Flags
+ * @param[in] buf Send/Receive buffer
+ * @param[in] len Send/Receive buffer length
+ * @param[in] context User context pointer, available in @ref psm2_mq_status_t
+ *                    upon completion
+ * @param[in] fp_type Fast-path op requested
+ * @param[out] req PSM MQ Request handle created by the preposted receive, to
+ *                 be used for explicitly controlling message receive
+ *                 completion.
+ *
+ * @post The supplied buffer is given to MQ to match against incoming
+ *       messages unless it is cancelled via @ref psm2_mq_cancel @e before any
+ *       match occurs.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * 	   as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error code is returned.  Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The receive buffer has successfully been posted to the MQ.
+ */
+psm2_error_t
+psm2_mq_fp_msg(psm2_ep_t ep, psm2_mq_t mq, psm2_epaddr_t addr, psm2_mq_tag_t *tag,
+		psm2_mq_tag_t *tagsel, uint32_t flags, void *buf, uint32_t len,
+		void *context, enum psm2_mq_fp_op fp_type, psm2_mq_req_t *req);
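+
+/* A hedged sketch of enqueuing a fast-path receive, assuming it runs from a
+ * registered AM handler so that the MQ progress lock is already held, as
+ * required above:
+ *
+ *   psm2_error_t post_fp_recv(psm2_ep_t ep, psm2_mq_t mq,
+ *                             psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel,
+ *                             void *buf, uint32_t len, void *ctx,
+ *                             psm2_mq_req_t *req)
+ *   {
+ *       // addr is only used on isends, so any-source is passed here
+ *       return psm2_mq_fp_msg(ep, mq, PSM2_MQ_ANY_ADDR, tag, tagsel,
+ *                             0, buf, len, ctx,        // 0: no flags
+ *                             PSM2_MQ_IRECV_FP, req);
+ *   }
+ */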
+
+/** @brief Post a receive to a Matched Queue with tag selection criteria
+ *
+ * Function to receive a non-blocking MQ message by providing a preposted
+ * buffer. For every MQ message received on a particular MQ, the @c tag and @c
+ * tagsel parameters are used against the incoming message's send tag as
+ * described in @ref tagmatch.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] rtag Receive tag
+ * @param[in] rtagsel Receive tag selector
+ * @param[in] flags Receive flags (None currently supported)
+ * @param[in] buf Receive buffer
+ * @param[in] len Receive buffer length
+ * @param[in] context User context pointer, available in @ref psm2_mq_status_t
+ *                    upon completion
+ * @param[out] req PSM MQ Request handle created by the preposted receive, to
+ *                 be used for explicitly controlling message receive
+ *                 completion.
+ *
+ * @post The supplied receive buffer is given to MQ to match against incoming
+ *       messages unless it is cancelled via @ref psm2_mq_cancel @e before any
+ *       match occurs.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * 	   as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error code is returned.  Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The receive buffer has successfully been posted to the MQ.
+ */
+psm2_error_t
+psm2_mq_irecv(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel, uint32_t flags,
+	     void *buf, uint32_t len, void *context, psm2_mq_req_t *req);
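+
+/* A minimal sketch of a blocking-style receive built from psm2_mq_irecv plus
+ * psm2_mq_wait (whose usual request-pointer/status signature is assumed):
+ *
+ *   psm2_error_t recv_blocking(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel,
+ *                              void *buf, uint32_t len, psm2_mq_status_t *st)
+ *   {
+ *       psm2_mq_req_t req;
+ *       psm2_error_t err =
+ *           psm2_mq_irecv(mq, rtag, rtagsel, 0, buf, len, NULL, &req);
+ *       if (err != PSM2_OK)
+ *           return err;                 // posting the receive failed
+ *       return psm2_mq_wait(&req, st);  // polls until the receive completes
+ *   }
+ */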
+
+/** @brief Post a receive to a Matched Queue with source and tag selection
+ *  criteria
+ *
+ * Function to receive a non-blocking MQ message by providing a preposted
+ * buffer. For every MQ message received on a particular MQ, the @c src, @c tag
+ * and @c tagsel parameters are used against the incoming message's send tag as
+ * described in @ref tagmatch.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] src Source (sender's) epaddr (may be PSM2_MQ_ANY_ADDR)
+ * @param[in] rtag Receive tag
+ * @param[in] rtagsel Receive tag selector
+ * @param[in] flags Receive flags (None currently supported)
+ * @param[in] buf Receive buffer
+ * @param[in] len Receive buffer length
+ * @param[in] context User context pointer, available in @ref psm2_mq_status2_t
+ *                    upon completion
+ * @param[out] req PSM MQ Request handle created by the preposted receive, to
+ *                 be used for explicitly controlling message receive
+ *                 completion.
+ *
+ * @post The supplied receive buffer is given to MQ to match against incoming
+ *       messages unless it is cancelled via @ref psm2_mq_cancel @e before any
+ *       match occurs.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * 	   as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error code is returned.  Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The receive buffer has successfully been posted to the MQ.
+ */
+psm2_error_t
+psm2_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *rtag,
+	      psm2_mq_tag_t *rtagsel, uint32_t flags, void *buf, uint32_t len,
+	      void *context, psm2_mq_req_t *req);
+
+/** @brief Post a receive to a Matched Queue with matched request
+ *
+ * Function to receive a non-blocking MQ message by providing a preposted
+ * buffer. The provided request should already be matched using the @ref
+ * psm2_mq_improbe or @ref psm2_mq_improbe2 routines.  It is an error to pass a
+ * request that has not already been matched by one of those routines.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] flags Receive flags (None currently supported)
+ * @param[in] buf Receive buffer
+ * @param[in] len Receive buffer length
+ * @param[in] context User context pointer, available in @ref psm2_mq_status_t
+ *                    upon completion
+ * @param[inout] reqo PSM MQ Request handle matched previously by a matched
+ *		     probe routine (@ref psm2_mq_improbe or @ref
+ *		     psm2_mq_improbe2), also to be used for explicitly
+ *		     controlling message receive completion.
+ *
+ * @post The supplied receive buffer is given to MQ to deliver the matched
+ *       message.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * 	   as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error code is returned.  Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The receive buffer has successfully been posted to the MQ.
+ */
+psm2_error_t
+psm2_mq_imrecv(psm2_mq_t mq, uint32_t flags, void *buf, uint32_t len,
+	      void *context, psm2_mq_req_t *reqo);
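+
+/* A sketch of the matched-probe protocol described above: probe for a
+ * message, size a buffer from status.msg_length, then receive it with
+ * psm2_mq_imrecv (psm2_mq_wait's usual signature and <stdlib.h> malloc are
+ * assumed):
+ *
+ *   void *recv_unknown_size(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel)
+ *   {
+ *       psm2_mq_req_t req;
+ *       psm2_mq_status_t status;
+ *       if (psm2_mq_improbe(mq, rtag, rtagsel, &req, &status) != PSM2_OK)
+ *           return NULL;                    // nothing matched yet
+ *       void *buf = malloc(status.msg_length);
+ *       psm2_mq_imrecv(mq, 0, buf, status.msg_length, NULL, &req);
+ *       psm2_mq_wait(&req, &status);        // complete the delivery
+ *       return buf;
+ *   }
+ */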
+
+/** @brief Send a blocking MQ message
+ *
+ * Function to send a blocking MQ message, whereby the message is locally
+ * complete and the source data can be modified upon return.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] dest Destination EP address
+ * @param[in] flags Message flags, currently:
+ *            @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message
+ *            synchronously, meaning that the message will not be sent until
+ *            the receiver acknowledges that it has matched the send with a
+ *            receive buffer.
+ * @param[in] stag Message Send Tag
+ * @param[in] buf Source buffer pointer
+ * @param[in] len Length of message starting at @c buf.
+ *
+ * @post The source buffer is reusable and the send is locally complete.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * 	   as long as different MQ arguments are used in each of the calls.
+ *
+ * @note This send function has been implemented to best suit MPI_Send.
+ *
+ * The following error code is returned.  Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The message has been successfully sent.
+ */
+psm2_error_t
+psm2_mq_send(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag,
+	    const void *buf, uint32_t len);
+
+/** @brief Send a blocking MQ message
+ *
+ * Function to send a blocking MQ message, whereby the message is locally
+ * complete and the source data can be modified upon return.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] dest Destination EP address
+ * @param[in] flags Message flags, currently:
+ *            @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message
+ *            synchronously, meaning that the message will not be sent until
+ *            the receiver acknowledges that it has matched the send with a
+ *            receive buffer.
+ * @param[in] stag Message Send Tag
+ * @param[in] buf Source buffer pointer
+ * @param[in] len Length of message starting at @c buf.
+ *
+ * @post The source buffer is reusable and the send is locally complete.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * 	   as long as different MQ arguments are used in each of the calls.
+ *
+ * @note This send function has been implemented to best suit MPI_Send.
+ *
+ * The following error code is returned.  Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The message has been successfully sent.
+ */
+psm2_error_t
+psm2_mq_send2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags,
+	     psm2_mq_tag_t *stag, const void *buf, uint32_t len);
+
+/** @brief Send a non-blocking MQ message
+ *
+ * Function to initiate the send of a non-blocking MQ message, whereby the
+ * user guarantees that the source data will remain unmodified until the send
+ * is locally completed through a call such as @ref psm2_mq_wait or @ref
+ * psm2_mq_test.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] dest Destination EP address
+ * @param[in] flags Message flags, currently:
+ *            @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message
+ *            synchronously, meaning that the message will not be sent until
+ *            the receiver acknowledges that it has matched the send with a
+ *            receive buffer.
+ * @param[in] stag Message Send Tag
+ * @param[in] buf Source buffer pointer
+ * @param[in] len Length of message starting at @c buf.
+ * @param[in] context Optional user-provided pointer available in @ref
+ *                    psm2_mq_status_t when the send is locally completed.
+ * @param[out] req PSM MQ Request handle created by the non-blocking send, to
+ *                 be used for explicitly controlling message completion.
+ *
+ * @post The source buffer is not reusable and the send is not locally complete
+ *       until its request is completed by either @ref psm2_mq_test or @ref
+ *       psm2_mq_wait.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * 	   as long as different MQ arguments are used in each of the calls.
+ *
+ * @note This send function has been implemented to suit MPI_Isend.
+ *
+ * The following error code is returned.  Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The message has been successfully initiated.
+ *
+ * @code{.c}
+   	psm2_mq_req_t
+   	non_blocking_send(const psm2_mq_t mq, psm2_epaddr_t dest_ep,
+   	                      const void *buf, uint32_t len,
+   	     		 int context_id, int send_tag, const my_request_t *req)
+   	{
+   	    psm2_mq_req_t req_mq;
+   	    // Set up our send tag, assume that "my_rank" is global and represents
+   	    // the rank of this process in the job
+   	    uint64_t tag = ( ((context_id & 0xffff) << 48) |
+   	                     ((my_rank & 0xffff) << 32)    |
+   	                     ((send_tag & 0xffffffff)) );
+  
+   	    psm2_mq_isend(mq, dest_ep,
+   	                 0, // no flags
+   	                 tag,
+   	                 buf,
+   	                 len,
+   	                 req, // this req is available in psm2_mq_status_t when one
+   	                      // of the synchronization functions is called.
+   	                 &req_mq);
+   	    return req_mq;
+   	}
+   @endcode
+ */
+psm2_error_t
+psm2_mq_isend(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag,
+	     const void *buf, uint32_t len, void *context, psm2_mq_req_t *req);
+
+/** @brief Send a non-blocking MQ message
+ *
+ * Function to initiate the send of a non-blocking MQ message, whereby the
+ * user guarantees that the source data will remain unmodified until the send
+ * is locally completed through a call such as @ref psm2_mq_wait or @ref
+ * psm2_mq_test.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] dest Destination EP address
+ * @param[in] flags Message flags, currently:
+ *            @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message
+ *            synchronously, meaning that the message will not be sent until
+ *            the receiver acknowledges that it has matched the send with a
+ *            receive buffer.
+ * @param[in] stag Message Send Tag, array of three 32-bit values.
+ * @param[in] buf Source buffer pointer
+ * @param[in] len Length of message starting at @c buf.
+ * @param[in] context Optional user-provided pointer available in @ref
+ *                    psm2_mq_status2_t when the send is locally completed.
+ * @param[out] req PSM MQ Request handle created by the non-blocking send, to
+ *                 be used for explicitly controlling message completion.
+ *
+ * @post The source buffer is not reusable and the send is not locally complete
+ *       until its request is completed by either @ref psm2_mq_test or @ref
+ *       psm2_mq_wait.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * 	   as long as different MQ arguments are used in each of the calls.
+ *
+ * @note This send function has been implemented to suit MPI_Isend.
+ *
+ * The following error code is returned.  Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The message has been successfully initiated.
+ *
+ * @code{.c}
+   	psm2_mq_req_t
+   	non_blocking_send(const psm2_mq_t mq, psm2_epaddr_t dest_ep,
+   	                      const void *buf, uint32_t len,
+   	     		 int context_id, int send_tag, const my_request_t *req)
+   	{
+   	    psm2_mq_req_t req_mq;
+   	    // Set up our send tag, assume that "my_rank" is global and represents
+   	    // the rank of this process in the job
+   	    psm2_mq_tag_t tag;
+   	    tag.tag[0] = send_tag;
+   	    tag.tag[1] = my_rank;
+   	    tag.tag[2] = context_id;
+  
+   	    psm2_mq_isend2(mq, dest_ep,
+   	                 0, // no flags
+   	                 &tag,
+   	                 buf,
+   	                 len,
+   	                 req, // this req is available in psm2_mq_status2_t when one
+   	                      // of the synchronization functions is called.
+   	                 &req_mq);
+   	    return req_mq;
+   	}
+   @endcode
+ */
+psm2_error_t
+psm2_mq_isend2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags,
+	      psm2_mq_tag_t *stag, const void *buf, uint32_t len, void *context,
+	      psm2_mq_req_t *req);
+
+/** @brief Try to Probe if a message is received matching tag selection
+ * criteria
+ *
+ * Function to verify if a message matching the supplied tag and tag selectors
+ * has been received.  The message is not fully matched until the user
+ * provides a buffer with the successfully matching tag selection criteria
+ * through @ref psm2_mq_irecv.
+ * Probing for messages may be useful if the size of the
+ * message to be received is unknown, in which case its size will be
+ * available in the @c msg_length member of the returned @c status.
+ *
+ * The function ensures progress if a matching request wasn't found
+ * after the first attempt.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] rtag Message receive tag
+ * @param[in] rtagsel Message receive tag selector
+ * @param[out] status Upon return, @c status is filled with information
+ *                    regarding the matching send.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * 	   as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned.  Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL.
+ * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is
+ *                               unchanged.
+ */
+psm2_error_t
+psm2_mq_iprobe(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel,
+	      psm2_mq_status_t *status);
+
+/** @brief Try to Probe if a message is received matching source and tag
+ * selection criteria
+ *
+ * Function to verify if a message matching the supplied source, tag, and tag
+ * selectors has been received.  The message is not fully matched until the
+ * user provides a buffer with the successfully matching tag selection criteria
+ * through @ref psm2_mq_irecv.  Probing for messages may be useful if the size
+ * of the message to be received is unknown, in which case its size will be
+ * available in the @c msg_length member of the returned @c status.
+ *
+ * The function ensures progress if a matching request wasn't found
+ * after the first attempt.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] src Source (sender's) epaddr (may be PSM2_MQ_ANY_ADDR)
+ * @param[in] rtag Message receive tag
+ * @param[in] rtagsel Message receive tag selector
+ * @param[out] status Upon return, @c status is filled with information
+ *                    regarding the matching send.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * 	   as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned.  Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL.
+ * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is
+ *                               unchanged.
+ */
+psm2_error_t
+psm2_mq_iprobe2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *rtag,
+	       psm2_mq_tag_t *rtagsel, psm2_mq_status2_t *status);
+
+/** @brief Try to Probe if a message is received matching tag selection
+ * criteria
+ *
+ * Function to verify if a message matching the supplied tag and tag
+ * selectors has been received.  If a match is successful, the message is
+ * removed from the matching queue and returned as a request object.  The
+ * message can be received using @ref psm2_mq_imrecv.  It is erroneous to use
+ * the request object returned by @ref psm2_mq_improbe for any purpose other
+ * than passing to @ref psm2_mq_imrecv.  Probing for messages may be useful if
+ * the size of the message to be received is unknown, in which case its size
+ * will be available in the @c msg_length member of the returned @c status.
+ *
+ * The function ensures progress if a matching request wasn't found
+ * after the first attempt.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] rtag Message receive tag
+ * @param[in] rtagsel Message receive tag selector
+ * @param[out] req PSM MQ Request handle, to be used for receiving the matched
+ *                 message.
+ * @param[out] status Upon return, @c status is filled with information
+ *                    regarding the matching send.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * 	   as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned.  Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL.
+ * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is unchanged.
+ */
+psm2_error_t
+psm2_mq_improbe(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel, psm2_mq_req_t *req,
+	       psm2_mq_status_t *status);
+
+/** @brief Try to Probe if a message is received matching source and tag
+ * selection criteria
+ *
+ * Function to verify if a message matching the supplied source, tag, and tag
+ * selectors has been received.  If a match is successful, the message is removed from
+ * the matching queue and returned as a request object.  The message can be
+ * received using @ref psm2_mq_imrecv.  It is erroneous to use the request
+ * object returned by @ref psm2_mq_improbe2 for any purpose other than passing to
+ * @ref psm2_mq_imrecv.  Probing for messages may be useful if the size of the
+ * message to be received is unknown, in which case its size will be available
+ * in the @c msg_length member of the returned @c status.
+ *
+ * The function ensures progress if a matching request wasn't found
+ * after the first attempt.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] src Source (sender's) epaddr (may be PSM2_MQ_ANY_ADDR)
+ * @param[in] rtag Message receive tag
+ * @param[in] rtagsel Message receive tag selector
+ * @param[out] reqo PSM MQ Request handle, to be used for receiving the matched
+ *                  message.
+ * @param[out] status Upon return, @c status is filled with information
+ *                    regarding the matching send.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * 	   as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned.  Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL.
+ * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is unchanged.
+ */
+psm2_error_t
+psm2_mq_improbe2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *rtag,
+		psm2_mq_tag_t *rtagsel, psm2_mq_req_t *reqo,
+		psm2_mq_status2_t *status);
+
+/** @brief Query for non-blocking requests ready for completion.
+ *
+ * Function to query a particular MQ for non-blocking requests that are ready
+ * for completion.  Requests "ready for completion" are not actually considered
+ * complete by MQ until they are returned to the MQ library through @ref
+ * psm2_mq_wait or @ref psm2_mq_test.
+ *
+ * If the user can deal with consuming request completions in the order in
+ * which they complete, this function can be used both for completions and for
+ * ensuring progress.  The latter requirement is satisfied when the user
+ * peeks an empty completion queue as a side effect of always aggressively
+ * peeking and completing all of an MQ's requests ready for completion.
+ *
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in,out] req MQ non-blocking request
+ * @param[out] status Optional MQ status, can be NULL.
+ *
+ * @post The user has ensured progress if the function returns @ref
+ *       PSM2_MQ_NO_COMPLETIONS
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * 	   as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned.  Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The peek is successful and @c req is updated with a request
+ *                ready for completion.  If @c status is non-NULL, it is also
+ *                updated.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The peek is not successful, meaning that there
+ *                               are no further requests ready for completion.
+ *                               The contents of @c req and @c status remain
+ *                               unchanged.
+ * @code{.c}
+   	// Example that uses psm2_mq_ipeek to make progress instead of psm2_poll
+   	// We return the amount of non-blocking requests that we've completed
+   	int main_progress_loop(psm2_mq_t mq)
+   	{
+   	    int num_completed = 0;
+   	    psm2_mq_req_t req;
+   	    psm2_mq_status_t status;
+   	    psm2_error_t err;
+   	    my_request_t *myreq;
+  
+   	    do {
+   	        err = psm2_mq_ipeek(mq, &req,
+   	                           NULL); // No need for status in ipeek here
+   	        if (err == PSM2_MQ_NO_COMPLETIONS)
+   	            return num_completed;
+   	        else if (err != PSM2_OK)
+   	            goto errh;
+   	        num_completed++;
+  
+   	        // We obtained 'req' at the head of the completion queue.  We can
+   	        // now free the request with PSM and obtain our original request
+   	        // from the status' context
+   	        err = psm2_mq_test(&req, // will be marked as invalid
+   	                          &status); // we need the status
+   	        myreq = (my_request_t *) status.context;
+  
+   	        // handle the completion for myreq whether myreq is a posted receive
+   	        // or a non-blocking send.
+   	   }
+   	   while (1);
+   	}
+   @endcode
+ */
+psm2_error_t
+psm2_mq_ipeek(psm2_mq_t mq, psm2_mq_req_t *req, psm2_mq_status_t *status);
+
+/** @brief Query for non-blocking requests ready for completion.
+ *
+ * Function to query a particular MQ for non-blocking requests that are ready
+ * for completion.  Requests "ready for completion" are not actually considered
+ * complete by MQ until they are returned to the MQ library through @ref
+ * psm2_mq_wait or @ref psm2_mq_test.
+ *
+ * If the user can consume request completions in the order in which they
+ * complete, this function can be used both to reap completions and to ensure
+ * progress.  The latter requirement is satisfied once the user peeks an empty
+ * completion queue, which happens as a side effect of always aggressively
+ * peeking and completing all of an MQ's requests that are ready for
+ * completion.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in,out] req MQ non-blocking request
+ * @param[in] status Optional MQ status, can be NULL.
+ *
+ * @post The user has ensured progress if the function returns @ref
+ *       PSM2_MQ_NO_COMPLETIONS
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * 	   as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error codes are returned.  Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The peek is successful and @c req is updated with a request
+ *                ready for completion.  If @c status is non-NULL, it is also
+ *                updated.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The peek is not successful, meaning that there
+ *                            are no further requests ready for completion.
+ *                            The contents of @c req and @c status remain
+ *                            unchanged.
+ * @code{.c}
+   	// Example that uses psm2_mq_ipeek2 to make progress instead of psm2_poll
+   	// We return the number of non-blocking requests that we've completed
+   	int main_progress_loop(psm2_mq_t mq)
+   	{
+   	    int num_completed = 0;
+   	    psm2_mq_req_t req;
+   	    psm2_mq_status2_t status;
+   	    psm2_error_t err;
+   	    my_request_t *myreq;
+  
+   	    do {
+   	        err = psm2_mq_ipeek2(mq, &req,
+   	                           NULL); // No need for status in ipeek here
+   	        if (err == PSM2_MQ_NO_COMPLETIONS)
+   	            return num_completed;
+   	        else if (err != PSM2_OK)
+   	            goto errh;
+   	        num_completed++;
+  
+   	        // We obtained 'req' at the head of the completion queue.  We can
+   	        // now free the request with PSM and obtain our original request
+   	        // from the status' context
+   	        err = psm2_mq_test2(&req, // will be marked as invalid
+   	                          &status); // we need the status
+   	        myreq = (my_request_t *) status.context;
+  
+   	        // handle the completion for myreq whether myreq is a posted receive
+   	        // or a non-blocking send.
+   	   }
+   	   while (1);
+  
+   	errh:
+   	    return -1; // report the error to the caller
+   	}
+   @endcode
+ */
+psm2_error_t
+psm2_mq_ipeek2(psm2_mq_t mq, psm2_mq_req_t *req, psm2_mq_status2_t *status);
+
+/** @brief User defined Callback function handling copy of MQ request into user datatype
+ *
+ * Callback function used to convert an MQ request into a user's desired
+ * status structure. The user's callback function converts the MQ request into
+ * the provided status_array at the specified index.
+ *
+ * @param[in] req MQ External non-blocking Request structure
+ * @param[in] status_array Array of User defined status datatypes
+ * @param[in] entry_index Index in array where the converted request will be
+ *                  stored if successful
+ *
+ * The following error codes are returned.
+ *
+ * @retval < 0 The MQ request conversion failed with a user-defined error.
+ *
+ * @retval 0 The MQ request was successfully processed but was not saved
+ *                 in the provided @c status_array.
+ *
+ * @retval 1 The MQ request was successfully processed and was saved in the
+ *                 @c status_array at the specified index.
+ *
+ * @retval >1 The MQ request was successfully processed and was saved in the
+ *                 @c status_array at the specified index.  It must be the
+ *                 last request converted in this batch, even if there is
+ *                 still space in @c status_array.
+ */
+typedef int (*psmi_mq_status_copy_user_t) (struct psm2_mq_req_user *req,
+        void *status_array, int entry_index);
+
+/** @brief Check and dequeue MQ requests into a user's status array using a callback.
+ *
+ * Function to atomically check and dequeue MQ entries from the completed
+ * queue and copy the MQ requests into a user's status datatype through a
+ * status_copy callback function.
+ *
+ * Once an MQ request has been successfully converted by the callback, the
+ * request is freed and the next entry is processed, so the request pointer
+ * supplied to the callback becomes invalid once the callback returns.
+ *
+ * The @c count output is only incremented for requests that were successfully
+ * stored into the user's array; otherwise it is left unchanged.
+ *
+ * NOTE: a count of 0 passed into psm2_mq_ipeek_dequeue_multi will result in
+ * no MQ elements being processed.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] status_array Array of User defined status datatypes
+ * @param[in] status_copy Callback function pointer to convert
+ *                  MQ to caller datatype
+ * @param[in,out] count [in] Size of @c status_array; [out] number of elements
+ *                  populated into @c status_array, or the user's error return
+ *                  code
+ *
+ * The following error codes are returned.
+ *
+ * @retval PSM2_OK The dequeue operation was successful and populated the
+ *                  full @c status_array up to @c count entries. The parameter
+ *                  @c count is equal to the count passed in by the user.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The dequeue operation was not able to read
+ *                  @c count entries into the @c status_array. The number
+ *                  of entries that were successfully written to the
+ *                  @c status_array is set in the @c count for the user.
+ *
+ * @retval PSM2_INTERNAL_ERR The @c status_copy failed to successfully
+ *                  copy the status entry into the user's datatype.
+ *                  @c count is set to the return code from the
+ *                  @c status_copy.
+ */
+psm2_error_t
+psm2_mq_ipeek_dequeue_multi(psm2_mq_t mq, void *status_array,
+        psmi_mq_status_copy_user_t status_copy, int *count);
+
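+/*
+ * A minimal usage sketch for @ref psm2_mq_ipeek_dequeue_multi with a
+ * @ref psmi_mq_status_copy_user_t callback.  The int-per-entry status array
+ * is an arbitrary choice for illustration; a real callback would copy
+ * whatever it needs out of @c req before returning, since the request is
+ * freed afterwards:
+ *
+ * @code{.c}
+   	int my_status_copy(struct psm2_mq_req_user *req,
+   	                   void *status_array, int entry_index)
+   	{
+   	    // Copy anything needed out of *req here; this sketch merely
+   	    // flags the slot as completed.
+   	    ((int *) status_array)[entry_index] = 1;
+   	    return 1; // stored at entry_index; continue with the batch
+   	}
+  
+   	// Drain up to 16 completions; returns how many were dequeued.
+   	int drain_completions(psm2_mq_t mq)
+   	{
+   	    int done[16] = { 0 };
+   	    int count = 16; // in: capacity of done[]
+  
+   	    // PSM2_OK: all 16 entries were filled.  PSM2_MQ_NO_COMPLETIONS:
+   	    // count now holds the number of entries actually written.
+   	    psm2_mq_ipeek_dequeue_multi(mq, done, my_status_copy, &count);
+   	    return count;
+   	}
+   @endcode
+ */
+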
+/** @brief Check and dequeue the first request entry from the completed queue.
+ *
+ * Function to atomically check and dequeue the first entry from the completed
+ * queue. It must be paired with @ref psm2_mq_req_free, which returns the
+ * request to the PSM2 library.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[out] req PSM MQ Request handle, to be used for receiving the matched
+ *                  message.
+ *
+ * The following error codes are returned.
+ *
+ * @retval PSM2_OK The dequeue operation was successful and @c req is updated
+ *                 with a request ready for completion.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The dequeue operation was not successful,
+ *                            meaning that there are no further requests ready
+ *                            for completion. The contents of @c req remain
+ *                            unchanged.
+ */
+psm2_error_t
+psm2_mq_ipeek_dequeue(psm2_mq_t mq, psm2_mq_req_t *req);
+
+/** @brief Return the request to PSM2 library.
+ *
+ * Function returns the request previously obtained via psm2_mq_ipeek_dequeue
+ * to the PSM2 library.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] req PSM MQ Request handle to be returned to the PSM2 library.
+ *            If @p req is NULL, no operation is performed.
+ *
+ * The following error codes are returned.
+ *
+ * @retval PSM2_OK Return of an object to the PSM2 library pool was successful.
+ */
+psm2_error_t
+psm2_mq_req_free(psm2_mq_t mq, psm2_mq_req_t req);
+
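+/*
+ * A minimal usage sketch pairing @ref psm2_mq_ipeek_dequeue with
+ * @ref psm2_mq_req_free, as an alternative to the ipeek/test loop shown
+ * earlier:
+ *
+ * @code{.c}
+   	// Consume one completed request; returns 1 if one was consumed.
+   	int drain_one(psm2_mq_t mq)
+   	{
+   	    psm2_mq_req_t req;
+  
+   	    if (psm2_mq_ipeek_dequeue(mq, &req) != PSM2_OK)
+   	        return 0; // PSM2_MQ_NO_COMPLETIONS
+  
+   	    // ... inspect the completed request here ...
+  
+   	    psm2_mq_req_free(mq, req); // return the request to PSM2
+   	    return 1;
+   	}
+   @endcode
+ */
+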
+/** @brief Wait until a non-blocking request completes
+ *
+ * Function to wait on requests created from either preposted receive buffers
+ * or non-blocking sends.  This is the only blocking function in the MQ
+ * interface and will poll until the request is complete as per the progress
+ * semantics explained in @ref mq_progress.
+ *
+ * @param[in,out] request MQ non-blocking request
+ * @param[out] status Updated if non-NULL when request successfully completes
+ *
+ * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend
+ *      or @ref psm2_mq_irecv and passes a pointer to enough storage to write
+ *      the output of a @ref psm2_mq_status_t or NULL if status is to be
+ *      ignored.
+ *
+ * @pre Since MQ will internally ensure progress while the user is
+ *      suspended, the user need not ensure that progress is made prior to
+ *      calling this function.
+ *
+ * @post The request is assigned the value @ref PSM2_MQ_REQINVALID and all
+ *       associated MQ request storage is released back to the MQ library.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * 	   as long as the requests that are used in each of the calls are
+ * 	   associated with different MQs.
+ *
+ * @remarks
+ *  @li This function ensures progress on the endpoint as long as the request
+ *      is incomplete.
+ *  @li @c status can be NULL, in which case no status is written upon
+ *      completion.
+ *  @li If @c request is @ref PSM2_MQ_REQINVALID, the function returns
+ *      immediately.
+ *
+ * The following error code is returned.  Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The request is complete, or the value of @c request was
+ *                @ref PSM2_MQ_REQINVALID.
+ *
+ */
+psm2_error_t
+psm2_mq_wait(psm2_mq_req_t *request, psm2_mq_status_t *status);
+
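+/*
+ * A minimal usage sketch for @ref psm2_mq_wait: block on a request obtained
+ * from @ref psm2_mq_isend or @ref psm2_mq_irecv and return its user context:
+ *
+ * @code{.c}
+   	void *wait_for_context(psm2_mq_req_t req)
+   	{
+   	    psm2_mq_status_t status;
+  
+   	    psm2_mq_wait(&req, &status); // req is PSM2_MQ_REQINVALID on return
+   	    return status.context;
+   	}
+   @endcode
+ */
+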
+/** @brief Wait until a non-blocking request completes
+ *
+ * Function to wait on requests created from either preposted receive buffers
+ * or non-blocking sends.  This is the only blocking function in the MQ
+ * interface and will poll until the request is complete as per the progress
+ * semantics explained in @ref mq_progress.
+ *
+ * @param[in,out] request MQ non-blocking request
+ * @param[out] status Updated if non-NULL when request successfully completes
+ *
+ * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend
+ *      or @ref psm2_mq_irecv and passes a pointer to enough storage to write
+ *      the output of a @ref psm2_mq_status2_t or NULL if status is to be
+ *      ignored.
+ *
+ * @pre Since MQ will internally ensure progress while the user is
+ *      suspended, the user need not ensure that progress is made prior to
+ *      calling this function.
+ *
+ * @post The request is assigned the value @ref PSM2_MQ_REQINVALID and all
+ *       associated MQ request storage is released back to the MQ library.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * 	   as long as the requests that are used in each of the calls are
+ * 	   associated with different MQs.
+ *
+ * @remarks
+ *  @li This function ensures progress on the endpoint as long as the request
+ *      is incomplete.
+ *  @li @c status can be NULL, in which case no status is written upon
+ *      completion.
+ *  @li If @c request is @ref PSM2_MQ_REQINVALID, the function returns
+ *      immediately.
+ *
+ * The following error code is returned.  Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The request is complete, or the value of @c request was
+ *                @ref PSM2_MQ_REQINVALID.
+ *
+ */
+psm2_error_t
+psm2_mq_wait2(psm2_mq_req_t *request, psm2_mq_status2_t *status);
+
+/** @brief Test if a non-blocking request is complete
+ *
+ * Function to test requests created from either preposted receive buffers or
+ * non-blocking sends for completion.  Unlike @ref psm2_mq_wait, this function
+ * tests @c request for completion and @e never ensures progress directly or
+ * indirectly.  It is up to the user to employ some of the progress functions
+ * described in @ref mq_progress to ensure progress if the user chooses to
+ * exclusively test requests for completion.
+ *
+ * Testing a request for completion @e never ensures progress internally,
+ * which makes the function suitable for constructing higher-level completion
+ * tests over arrays that test some, all or any of the requests that have
+ * completed.  For testing arrays of requests, it is preferable for
+ * performance reasons to ensure progress only once before testing a set of
+ * requests for completion.
+ *
+ * @param[in,out] request MQ non-blocking request
+ * @param[out] status Updated if non-NULL and the request successfully
+ * completes
+ *
+ * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend
+ *      or @ref psm2_mq_irecv and passes a pointer to enough storage to write
+ *      the output of a @ref psm2_mq_status_t or NULL if status is to be
+ *      ignored.
+ *
+ * @pre The user has ensured progress on the Matched Queue if @ref
+ *      psm2_mq_test is exclusively used for guaranteeing request completions.
+ *
+ * @post If the request is complete, the request is assigned the value @ref
+ *       PSM2_MQ_REQINVALID and all associated MQ request storage is released
+ *       back to the MQ library. If the request is incomplete, the contents of
+ *       @c request is unchanged.
+ *
+ * @post The user will ensure progress on the Matched Queue if @ref
+ *       psm2_mq_test is exclusively used for guaranteeing request completions.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * 	   as long as the requests that are used in each of the calls are
+ * 	   associated with different MQs.
+ *
+ * The following two errors are always returned.  Other errors are handled by
+ * the PSM error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The request is complete and @c request is set to @ref
+ *                PSM2_MQ_REQINVALID, or the value of @c request was
+ *                @ref PSM2_MQ_REQINVALID.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The request is not complete and @c request is
+ *                           unchanged.
+ *
+ * @code{.c}
+  	// Function that returns the first completed request in an array
+  	// of requests.
+  	void *
+  	user_testany(psm2_ep_t ep, psm2_mq_req_t *allreqs, int nreqs)
+  	{
+  	  int i;
+  	  void *context = NULL;
+  	
+  	  // Ensure progress only once
+  	  psm2_poll(ep);
+  	
+  	  // Test for at least one completion and return its context
+  	  psm2_mq_status_t stat;
+  	  for (i = 0; i < nreqs; i++) {
+  	    if (psm2_mq_test(&allreqs[i], &stat) == PSM2_OK) {
+  	      context = stat.context;
+  	      break;
+  	    }
+  	  }
+  	  return context;
+  	}
+  @endcode
+ */
+psm2_error_t
+psm2_mq_test(psm2_mq_req_t *request, psm2_mq_status_t *status);
+
+/** @brief Test if a non-blocking request is complete
+ *
+ * Function to test requests created from either preposted receive buffers or
+ * non-blocking sends for completion.  Unlike @ref psm2_mq_wait, this function
+ * tests @c request for completion and @e never ensures progress directly or
+ * indirectly.  It is up to the user to employ some of the progress functions
+ * described in @ref mq_progress to ensure progress if the user chooses to
+ * exclusively test requests for completion.
+ *
+ * Testing a request for completion @e never ensures progress internally,
+ * which makes the function suitable for constructing higher-level completion
+ * tests over arrays that test some, all or any of the requests that have
+ * completed.  For testing arrays of requests, it is preferable for
+ * performance reasons to ensure progress only once before testing a set of
+ * requests for completion.
+ *
+ * @param[in,out] request MQ non-blocking request
+ * @param[out] status Updated if non-NULL and the request successfully
+ * completes
+ *
+ * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend
+ *      or @ref psm2_mq_irecv and passes a pointer to enough storage to write
+ *      the output of a @ref psm2_mq_status2_t or NULL if status is to be
+ *      ignored.
+ *
+ * @pre The user has ensured progress on the Matched Queue if @ref
+ *      psm2_mq_test is exclusively used for guaranteeing request completions.
+ *
+ * @post If the request is complete, the request is assigned the value @ref
+ *       PSM2_MQ_REQINVALID and all associated MQ request storage is released
+ *       back to the MQ library. If the request is incomplete, the contents of
+ *       @c request is unchanged.
+ *
+ * @post The user will ensure progress on the Matched Queue if @ref
+ *       psm2_mq_test is exclusively used for guaranteeing request completions.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * 	   as long as the requests that are used in each of the calls are
+ * 	   associated with different MQs.
+ *
+ * The following two errors are always returned.  Other errors are handled by
+ * the PSM error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The request is complete and @c request is set to @ref
+ *                PSM2_MQ_REQINVALID, or the value of @c request was
+ *                @ref PSM2_MQ_REQINVALID.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The request is not complete and @c request is
+ *                           unchanged.
+ *
+ * @code{.c}
+  	// Function that returns the first completed request in an array
+  	// of requests.
+  	void *
+  	user_testany(psm2_ep_t ep, psm2_mq_req_t *allreqs, int nreqs)
+  	{
+  	  int i;
+  	  void *context = NULL;
+  	
+  	  // Ensure progress only once
+  	  psm2_poll(ep);
+  	
+  	  // Test for at least one completion and return its context
+  	  psm2_mq_status2_t stat;
+  	  for (i = 0; i < nreqs; i++) {
+  	    if (psm2_mq_test2(&allreqs[i], &stat) == PSM2_OK) {
+  	      context = stat.context;
+  	      break;
+  	    }
+  	  }
+  	  return context;
+  	}
+   @endcode
+ */
+psm2_error_t
+psm2_mq_test2(psm2_mq_req_t *request, psm2_mq_status2_t *status);
+
+/** @brief Cancel a preposted request
+ *
+ * Function to cancel a preposted receive request returned by @ref
+ * psm2_mq_irecv.  It is currently illegal to cancel a send request initiated
+ * with @ref psm2_mq_isend.
+ *
+ * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_irecv.
+ *
+ * @post Whether the cancel is successful or not, the user returns the
+ *       request to the library by way of @ref psm2_mq_test or @ref
+ *       psm2_mq_wait.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ * 	   as long as the requests that are used in each of the calls are
+ * 	   associated with different MQs.
+ *
+ * Only the two following errors can be returned directly, without being
+ * handled by the error handler (@ref psm2_error_register_handler):
+ *
+ * @retval PSM2_OK The request could be successfully cancelled such that the
+ *                preposted receive buffer could be removed from the preposted
+ *                receive queue before a match occurred. The associated @c
+ *                request remains unchanged and the user must still return
+ *                the storage to the MQ library.
+ *
+ * @retval PSM2_MQ_NO_COMPLETIONS The request could not be successfully cancelled
+ *                           since the preposted receive buffer has already
+ *                           matched an incoming message.  The @c request
+ *                           remains unchanged.
+ *
+ */
+psm2_error_t psm2_mq_cancel(psm2_mq_req_t *req);
+
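+/*
+ * A minimal usage sketch for @ref psm2_mq_cancel: whatever the outcome of
+ * the cancel, the request must still be returned to the library:
+ *
+ * @code{.c}
+   	void cancel_preposted(psm2_mq_req_t *req)
+   	{
+   	    if (psm2_mq_cancel(req) == PSM2_OK) {
+   	        // Cancelled before a match; reclaim the request storage.
+   	        psm2_mq_test(req, NULL);
+   	    } else {
+   	        // PSM2_MQ_NO_COMPLETIONS: the buffer already matched an
+   	        // incoming message, so complete the request normally.
+   	        psm2_mq_wait(req, NULL);
+   	    }
+   	}
+   @endcode
+ */
+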
+/*! @brief MQ statistics structure */
+struct psm2_mq_stats {
+	/** Bytes received into a matched user buffer */
+	uint64_t rx_user_bytes;
+	/** Messages received into a matched user buffer */
+	uint64_t rx_user_num;
+	/** Bytes received into an unmatched (or out of order) system buffer */
+	uint64_t rx_sys_bytes;
+	/** Messages received into an unmatched (or out of order) system buffer */
+	/** this count includes unexpected zero length eager recv */
+	uint64_t rx_sys_num;
+
+	/** Total Messages transmitted (shm and hfi) */
+	uint64_t tx_num;
+	/** Messages transmitted eagerly */
+	uint64_t tx_eager_num;
+	/** Bytes transmitted eagerly */
+	uint64_t tx_eager_bytes;
+	/** Messages transmitted using any rendezvous mechanism */
+	uint64_t tx_rndv_num;
+	/** Bytes transmitted using any rendezvous mechanism */
+	uint64_t tx_rndv_bytes;
+	/** Messages transmitted (shm only) */
+	uint64_t tx_shm_num;
+	/** Bytes transmitted (shm only) */
+	uint64_t tx_shm_bytes;
+	/** Messages received through shm */
+	uint64_t rx_shm_num;
+	/** Bytes received through shm */
+	uint64_t rx_shm_bytes;
+
+	/** sysbufs are used for unexpected eager receive (and RTS payload) */
+	/** Number of messages using system buffers (not used for 0 byte msg) */
+	uint64_t rx_sysbuf_num;
+	/** Bytes using system buffers */
+	uint64_t rx_sysbuf_bytes;
+
+	/** rank in MPI_COMM_WORLD, while unchanging, easiest to put here */
+	uint64_t comm_world_rank;
+
+#ifdef PSM_CUDA
+	/** Messages transmitted eagerly from CPU buffer */
+	uint64_t tx_eager_cpu_num;
+	/** Bytes transmitted eagerly from CPU buffer */
+	uint64_t tx_eager_cpu_bytes;
+	/** Messages transmitted eagerly from GPU buffer */
+	uint64_t tx_eager_gpu_num;
+	/** Bytes transmitted eagerly from GPU buffer */
+	uint64_t tx_eager_gpu_bytes;
+
+	/** Bytes copied from a system buffer into a matched CPU user buffer */
+	/** this count also includes unexpected zero length eager recv */
+	uint64_t rx_sysbuf_cpu_bytes;
+	/** Messages copied from a system buffer into a matched CPU user buffer */
+	uint64_t rx_sysbuf_cpu_num;
+	/** Bytes gdrCopied from a system buffer into a matched user GPU buffer */
+	uint64_t rx_sysbuf_gdrcopy_bytes;
+	/** Messages gdrCopied from a system buffer into a matched user GPU buffer */
+	uint64_t rx_sysbuf_gdrcopy_num;
+	/** Bytes cuCopied from a system buffer into a matched user GPU buffer */
+	uint64_t rx_sysbuf_cuCopy_bytes;
+	/** Messages cuCopied from a system buffer into a matched user GPU buffer */
+	uint64_t rx_sysbuf_cuCopy_num;
+
+	/** Internally reserved for future use */
+	uint64_t _reserved[3];
+#else
+	uint64_t _reserved[15];
+#endif
+};
+
+#define PSM2_MQ_NUM_STATS    13	/**< How many stats are currently used in @ref psm2_mq_stats */
+
+/*! @see psm2_mq_stats */
+typedef struct psm2_mq_stats psm2_mq_stats_t;
+
+/** @brief Retrieve statistics from an instantiated MQ */
+void
+psm2_mq_get_stats(psm2_mq_t mq, psm2_mq_stats_t *stats);
+
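+/*
+ * A minimal usage sketch for @ref psm2_mq_get_stats (printf and PRIu64
+ * assume <stdio.h> and <inttypes.h>):
+ *
+ * @code{.c}
+   	void dump_mq_stats(psm2_mq_t mq)
+   	{
+   	    psm2_mq_stats_t stats;
+  
+   	    psm2_mq_get_stats(mq, &stats);
+   	    printf("tx msgs: %" PRIu64 ", rx user bytes: %" PRIu64 "\n",
+   	           stats.tx_num, stats.rx_user_bytes);
+   	}
+   @endcode
+ */
+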
+/*! @} */
+#ifdef __cplusplus
+}				/* extern "C" */
+#endif
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/psm_am.c b/deps/libfabric/prov/psm3/psm3/psm_am.c
new file mode 100644
index 0000000000000000000000000000000000000000..f1f3a450df8f537ffb836ad3c6b725a08e9f90a6
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_am.c
@@ -0,0 +1,346 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_am.h"
+#include "psm_am_internal.h"
+#include "psm_mq_internal.h"
+
+int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid);
+
+/* AM capabilities parameters are initialized once in psmi_am_init_internal
+   and copied out in __psm2_am_get_parameters.  When debugging is enabled,
+   various assertions reference these parameters for sanity checking. */
+struct psm2_am_parameters psmi_am_parameters = { 0 };
+
+static int _ignore_handler(PSMI_AM_ARGS_DEFAULT)
+{
+	return 0;
+}
+
+int psmi_abort_handler(PSMI_AM_ARGS_DEFAULT)
+{
+	abort();
+	return 0;
+}
+
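+/* Element-wise minimum of two AM parameter sets; used by
+ * psmi_am_init_internal below so that the limits advertised to the
+ * application hold across every enabled PTL (self, ips, amsh). */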
+static void psmi_am_min_parameters(struct psm2_am_parameters *dest,
+				   struct psm2_am_parameters *src)
+{
+	dest->max_handlers = min(dest->max_handlers, src->max_handlers);
+	dest->max_nargs = min(dest->max_nargs, src->max_nargs);
+	dest->max_request_short =
+	    min(dest->max_request_short, src->max_request_short);
+	dest->max_reply_short =
+	    min(dest->max_reply_short, src->max_reply_short);
+}
+
+psm2_error_t psmi_am_init_internal(psm2_ep_t ep)
+{
+	int i;
+	struct psm2_ep_am_handle_entry *am_htable;
+	struct psm2_am_parameters params;
+
+	psmi_am_parameters.max_handlers = INT_MAX;
+	psmi_am_parameters.max_nargs = INT_MAX;
+	psmi_am_parameters.max_request_short = INT_MAX;
+	psmi_am_parameters.max_reply_short = INT_MAX;
+
+	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) {
+		ep->ptl_self.am_get_parameters(ep, &params);
+		psmi_am_min_parameters(&psmi_am_parameters, &params);
+	}
+
+	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+		ep->ptl_ips.am_get_parameters(ep, &params);
+		psmi_am_min_parameters(&psmi_am_parameters, &params);
+	}
+
+	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+		ep->ptl_amsh.am_get_parameters(ep, &params);
+		psmi_am_min_parameters(&psmi_am_parameters, &params);
+	}
+
+	ep->am_htable =
+	    psmi_malloc(ep, UNDEFINED,
+			sizeof(struct psm2_ep_am_handle_entry) * PSMI_AM_NUM_HANDLERS);
+	if (ep->am_htable == NULL)
+		return PSM2_NO_MEMORY;
+
+	am_htable = (struct psm2_ep_am_handle_entry *) ep->am_htable;
+	for (i = 0; i < PSMI_AM_NUM_HANDLERS; i++) {
+		am_htable[i].hfn = _ignore_handler;
+		am_htable[i].hctx = NULL;
+		am_htable[i].version = PSM2_AM_HANDLER_V2;
+	}
+
+	return PSM2_OK;
+}
+
+void psmi_am_fini_internal(psm2_ep_t ep)
+{
+	if(ep->am_htable != NULL) {
+		psmi_free(ep->am_htable);
+	}
+}
+
+psm2_error_t
+__psm2_am_register_handlers(psm2_ep_t ep,
+			   const psm2_am_handler_fn_t *handlers,
+			   int num_handlers, int *handlers_idx)
+{
+	int i, j;
+
+	psmi_assert_always(ep->am_htable != NULL);
+
+	PSM2_LOG_MSG("entering");
+	/* For now just assign any free one */
+	for (i = 0, j = 0; (i < PSMI_AM_NUM_HANDLERS) && (j < num_handlers); i++) {
+		if (ep->am_htable[i].hfn == _ignore_handler) {
+			ep->am_htable[i].hfn = handlers[j];
+			ep->am_htable[i].hctx = NULL;
+			ep->am_htable[i].version = PSM2_AM_HANDLER_V1;
+			handlers_idx[j] = i;
+			if (++j == num_handlers)	/* all registered */
+				break;
+		}
+	}
+
+	if (j < num_handlers) {
+		/* Not enough free handlers, restore unused handlers */
+		for (i = 0; i < j; i++) {
+			ep->am_htable[handlers_idx[i]].hfn = _ignore_handler;
+			ep->am_htable[handlers_idx[i]].hctx = NULL;
+			ep->am_htable[handlers_idx[i]].version = PSM2_AM_HANDLER_V2;
+		}
+		PSM2_LOG_MSG("leaving");
+		return psmi_handle_error(ep, PSM2_EP_NO_RESOURCES,
+					 "Insufficient "
+					 "available AM handlers: registered %d of %d requested handlers",
+					 j, num_handlers);
+	}
+	else {
+		PSM2_LOG_MSG("leaving");
+		return PSM2_OK;
+	}
+}
+PSMI_API_DECL(psm2_am_register_handlers)
+
+psm2_error_t
+__psm2_am_register_handlers_2(psm2_ep_t ep,
+			   const psm2_am_handler_2_fn_t *handlers,
+			   int num_handlers, void **hctx, int *handlers_idx)
+{
+	int i, j;
+
+	psmi_assert_always(ep->am_htable != NULL);
+
+	PSM2_LOG_MSG("entering");
+	/* For now just assign any free one */
+	for (i = 0, j = 0; (i < PSMI_AM_NUM_HANDLERS) && (j < num_handlers); i++) {
+		if (ep->am_htable[i].hfn == _ignore_handler) {
+			ep->am_htable[i].hfn = handlers[j];
+			ep->am_htable[i].hctx = hctx[j];
+			ep->am_htable[i].version = PSM2_AM_HANDLER_V2;
+			handlers_idx[j] = i;
+			if (++j == num_handlers)	/* all registered */
+				break;
+		}
+	}
+
+	if (j < num_handlers) {
+		/* Not enough free handlers, restore unused handlers */
+		for (i = 0; i < j; i++) {
+			ep->am_htable[handlers_idx[i]].hfn = _ignore_handler;
+			ep->am_htable[handlers_idx[i]].hctx = NULL;
+			ep->am_htable[handlers_idx[i]].version = PSM2_AM_HANDLER_V2;
+		}
+		PSM2_LOG_MSG("leaving");
+		return psmi_handle_error(ep, PSM2_EP_NO_RESOURCES,
+					 "Insufficient "
+					 "available AM handlers: registered %d of %d requested handlers",
+					 j, num_handlers);
+	}
+	else {
+		PSM2_LOG_MSG("leaving");
+		return PSM2_OK;
+	}
+}
+PSMI_API_DECL(psm2_am_register_handlers_2)
+
+void
+__psm2_am_unregister_handlers(psm2_ep_t ep)
+{
+	int i;
+
+	PSM2_LOG_MSG("entering");
+	for (i = 0; i < PSMI_AM_NUM_HANDLERS; i++) {
+		if (ep->am_htable[i].hfn != _ignore_handler) {
+			ep->am_htable[i].hfn = _ignore_handler;
+			ep->am_htable[i].hctx = NULL;
+			ep->am_htable[i].version = PSM2_AM_HANDLER_V2;
+		}
+	}
+	PSM2_LOG_MSG("leaving");
+}
+PSMI_API_DECL(psm2_am_unregister_handlers)
+
+psm2_error_t
+__psm2_am_request_short(psm2_epaddr_t epaddr, psm2_handler_t handler,
+		       psm2_amarg_t *args, int nargs, void *src, size_t len,
+		       int flags, psm2_am_completion_fn_t completion_fn,
+		       void *completion_ctxt)
+{
+	psm2_error_t err;
+	ptl_ctl_t *ptlc = epaddr->ptlctl;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ASSERT_INITIALIZED();
+	psmi_assert(epaddr != NULL);
+	psmi_assert(handler >= 0 && handler < psmi_am_parameters.max_handlers);
+	psmi_assert(nargs >= 0 && nargs <= psmi_am_parameters.max_nargs);
+	psmi_assert(nargs > 0 ? args != NULL : 1);
+	psmi_assert(len >= 0 && len <= psmi_am_parameters.max_request_short);
+	psmi_assert(len > 0 ? src != NULL : 1);
+
+	PSMI_LOCK(ptlc->ep->mq->progress_lock);
+
+	err = ptlc->am_short_request(epaddr, handler, args,
+				     nargs, src, len, flags, completion_fn,
+				     completion_ctxt);
+	PSMI_UNLOCK(ptlc->ep->mq->progress_lock);
+	PSM2_LOG_MSG("leaving");
+
+	return err;
+}
+PSMI_API_DECL(psm2_am_request_short)
+
+psm2_error_t
+__psm2_am_reply_short(psm2_am_token_t token, psm2_handler_t handler,
+		     psm2_amarg_t *args, int nargs, void *src, size_t len,
+		     int flags, psm2_am_completion_fn_t completion_fn,
+		     void *completion_ctxt)
+{
+	psm2_error_t err;
+	struct psmi_am_token *tok;
+	psm2_epaddr_t epaddr;
+	ptl_ctl_t *ptlc;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ASSERT_INITIALIZED();
+	psmi_assert_always(token != NULL);
+	psmi_assert(handler >= 0 && handler < psmi_am_parameters.max_handlers);
+	psmi_assert(nargs >= 0 && nargs <= psmi_am_parameters.max_nargs);
+	psmi_assert(nargs > 0 ? args != NULL : 1);
+	psmi_assert(len >= 0 && len <= psmi_am_parameters.max_reply_short);
+	psmi_assert(len > 0 ? src != NULL : 1);
+
+	tok = (struct psmi_am_token *)token;
+	epaddr = tok->epaddr_incoming;
+	ptlc = epaddr->ptlctl;
+
+	/* No locking here since we are already within handler context and already
+	 * locked */
+
+	err = ptlc->am_short_reply(token, handler, args,
+				   nargs, src, len, flags, completion_fn,
+				   completion_ctxt);
+	PSM2_LOG_MSG("leaving");
+
+	return err;
+}
+PSMI_API_DECL(psm2_am_reply_short)
+
+psm2_error_t __psm2_am_get_source(psm2_am_token_t token, psm2_epaddr_t *epaddr_out)
+{
+	struct psmi_am_token *tok;
+
+	PSM2_LOG_MSG("entering");
+	if (token == NULL || epaddr_out == NULL) {
+		PSM2_LOG_MSG("leaving");
+		return psmi_handle_error(NULL, PSM2_PARAM_ERR,
+					 "Invalid %s parameters", __FUNCTION__);
+	}
+
+	tok = (struct psmi_am_token *)token;
+	*epaddr_out = tok->epaddr_incoming;
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK;
+}
+PSMI_API_DECL(psm2_am_get_source)
+
+psm2_error_t
+__psm2_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters,
+			size_t sizeof_parameters_in,
+			size_t *sizeof_parameters_out)
+{
+	size_t s;
+
+	PSM2_LOG_MSG("entering");
+	if (parameters == NULL) {
+		PSM2_LOG_MSG("leaving");
+		return psmi_handle_error(NULL, PSM2_PARAM_ERR,
+					 "Invalid %s parameters", __FUNCTION__);
+	}
+
+	memset(parameters, 0, sizeof_parameters_in);
+	s = min(sizeof(psmi_am_parameters), sizeof_parameters_in);
+	memcpy(parameters, &psmi_am_parameters, s);
+	*sizeof_parameters_out = s;
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK;
+}
+PSMI_API_DECL(psm2_am_get_parameters)
diff --git a/deps/libfabric/prov/psm3/psm3/psm_am_internal.h b/deps/libfabric/prov/psm3/psm3/psm_am_internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..af151dc18c197e5e25cda38648b8a105152cf86a
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_am_internal.h
@@ -0,0 +1,108 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PSM2_AM_INTERNAL_H
+#define _PSM2_AM_INTERNAL_H
+
+#define PSMI_AM_MAX_ARGS     10
+#define PSMI_AM_NUM_HANDLERS 256	/* must be power of 2 */
+
+#define PSMI_AM_ARGS_DEFAULT psm2_am_token_t token,			\
+			     psm2_amarg_t *args, int nargs,		\
+			     void *src, uint32_t len,			\
+			     void *hctx
+
+enum psm2_am_handler_version
+{
+	PSM2_AM_HANDLER_V1 = 0,
+	PSM2_AM_HANDLER_V2,
+};
+
+struct psm2_ep_am_handle_entry
+{
+	void *hfn;
+	void *hctx;
+	enum psm2_am_handler_version version;
+};
+
+struct psmi_am_token {
+	psm2_epaddr_t epaddr_incoming;
+	uint32_t flags;
+	/* Can handler reply? i.e. Not OPCODE_AM_REQUEST_NOREPLY request */
+	uint32_t can_reply;
+
+	/* PTLs may add other stuff here */
+};
+
+/* AM capabilities parameters are initialized once in psmi_am_init_internal
+   and copied out in __psm2_am_get_parameters.  When debugging is enabled,
+   various assertions reference these parameters for sanity checking. */
+extern struct psm2_am_parameters psmi_am_parameters;
+
+PSMI_ALWAYS_INLINE(struct psm2_ep_am_handle_entry *
+		   psm_am_get_handler_function(psm2_ep_t ep,
+					       psm2_handler_t handler_idx))
+{
+	int hidx = handler_idx & (PSMI_AM_NUM_HANDLERS - 1);
+	struct psm2_ep_am_handle_entry *hentry = &ep->am_htable[hidx];
+	psmi_assert_always(hentry != NULL);
+	return hentry;
+}
+
+/* PSM internal initialization */
+psm2_error_t psmi_am_init_internal(psm2_ep_t ep);
+void psmi_am_fini_internal(psm2_ep_t ep);
+
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/psm_config.h b/deps/libfabric/prov/psm3/psm3/psm_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..f63bc00fbd3c210c75ca69b45e989566d87d61ab
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_config.h
@@ -0,0 +1,211 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2018 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2018 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef PSM_CONFIG_H
+#define PSM_CONFIG_H
+
+/*
+ * The following flags can be defined here instead of being passed as `make`
+ * switches, in order to change the behavior obtained when running `make`
+ * without parameters.
+ */
+
+#ifndef RDPMC_PERF_FRAMEWORK
+/* #define RDPMC_PERF_FRAMEWORK */
+#endif
+
+#ifndef PSM2_MOCK_TESTING
+/* #define PSM2_MOCK_TESTING */
+#endif
+
+#ifndef PSM_CUDA
+/* #define PSM_CUDA */
+/* #define NVIDIA_GPU_DIRECT */
+#endif
+
+#ifndef PSM3_BRAKE_DEBUG
+/* #define PSM3_BRAKE_DEBUG */
+#endif
+
+#ifndef PSM_DEBUG
+/* #define PSM_DEBUG */
+/* #define _HFI_DEBUGGING 1 */
+/* #define _FORTIFY_SOURCE 2 */
+#endif
+
+#ifndef PSM_HEAP_DEBUG
+/* #define PSM_HEAP_DEBUG */
+#endif
+
+#ifndef PSM_PROFILE
+/* #define PSM_PROFILE */
+#endif
+
+#define PSMI_MIN_EP_CONNECT_TIMEOUT	(2 * SEC_ULL)
+#define PSMI_MIN_EP_CLOSE_TIMEOUT	(1 * SEC_ULL)
+#define PSMI_MAX_EP_CLOSE_TIMEOUT	(2 * SEC_ULL)
+
+#define PSMI_MIN_EP_CLOSE_GRACE_INTERVAL (1 * SEC_ULL)
+#define PSMI_MAX_EP_CLOSE_GRACE_INTERVAL (2 * SEC_ULL)
+
+
+#define PSMI_MAX_RAILS		32 /* Max number of unique devices */
+					/* also sets PSMX3_MAX_UNITS in psmx3.h */
+#define PSMI_MAX_QPS		32 /* Max number of total QPs (QPs/NIC * RAILs) */
+					/* must be >= PSMI_MAX_RAILS */
+
+#define AFFINITY_SHM_BASENAME			"/psm3_nic_affinity_shm"
+#define AFFINITY_SHMEMSIZE			sysconf(_SC_PAGE_SIZE)
+#define AFFINITY_SHM_REF_COUNT_LOCATION		0
+#define AFFINITY_SHM_HFI_INDEX_LOCATION		1
+#define SEM_AFFINITY_SHM_RW_BASENAME		"/psm3_nic_affinity_shm_rw_mutex"
+
+#define PSMI_RCVTHREAD_FLAGS	0x1
+/**<
+ * Default setting for Receive thread
+ *
+ * 0x0 disables rcvthread by default
+ * 0x1 enables ips receive thread by default
+ */
+
+/*
+ * Define one of these below.
+ *
+ * Spinlock gives the best performance and makes sense with the progress thread
+ * only because the progress thread does a "trylock" and then goes back to
+ * sleep in a poll.
+ *
+ * Mutexlock should be used for experimentation while the more useful
+ * mutexlock-debug should be enabled during development to catch potential
+ * errors.
+ */
+#ifdef PSM_DEBUG
+#define PSMI_LOCK_IS_MUTEXLOCK_DEBUG
+#else
+#define PSMI_LOCK_IS_SPINLOCK
+/* #define PSMI_LOCK_IS_MUTEXLOCK */
+/* #define PSMI_LOCK_IS_MUTEXLOCK_DEBUG */
+/* #define PSMI_PLOCK_IS_NOLOCK */
+#endif
+
+#ifdef PSM_CUDA
+/* XXX TODO: Getting the gpu page size from driver at init time */
+#define PSMI_GPU_PAGESIZE 65536
+
+#define CUDA_SMALLHOSTBUF_SZ	(256*1024)
+#define CUDA_WINDOW_PREFETCH_DEFAULT	2
+#define GPUDIRECT_THRESH_RV 3
+
+#define GDR_COPY_LIMIT_SEND 128
+#define GDR_COPY_LIMIT_RECV 64000
+/* All GPU transfers beyond this threshold use
+ * RNDV protocol. It is mostly a send side knob.
+ */
+#define CUDA_THRESH_RNDV 8000
+#endif
+
+#define MQ_HFI_THRESH_TINY		8
+
+#define MQ_HFI_THRESH_EGR_SDMA		8192    /* Eager blocking */
+#define MQ_HFI_THRESH_EGR_SDMA_SQ	8192    /* Eager non-blocking */
+#define MQ_HFI_THRESH_GPU_EGR_SDMA	128    /* Eager blocking */
+#define MQ_HFI_THRESH_GPU_EGR_SDMA_SQ	128    /* Eager non-blocking */
+#define MQ_HFI_THRESH_RNDV_PHI2		200000
+#define MQ_HFI_THRESH_RNDV_XEON 	64000
+
+#define MQ_HFI_WINDOW_RNDV_PHI2		4194304
+#define MQ_HFI_WINDOW_RNDV_XEON		131072
+
+#ifdef PSM_CUDA
+#define MQ_HFI_WINDOW_RNDV_CUDA 2097152
+#endif
+
+#define MQ_SHM_THRESH_RNDV 16000
+
+#define NUM_HASH_BUCKETS 64
+#define HASH_THRESHOLD 65
+#define NUM_HASH_CONFIGS 3
+#define NUM_MQ_SUBLISTS (NUM_HASH_CONFIGS + 1)
+
+#define REMOVE_ENTRY 1
+
+
+/* Keep timer stats */
+#define PSMI_TIMER_STATS	0
+
+
+/* Psm context */
+#define HAL_CONTEXT_OPEN_RETRY_MAX 3
+
+
+/*
+ * By default, PSMI_DEVICES_DEFAULT establishes the order in which each
+ * component is tested for reachability to each peer: first self, then shm,
+ * and finally the nic.  The order should really only affect endpoints that
+ * happen to be on the same node.  PSM will correctly detect that two
+ * endpoints are on the same node even though they may be using different
+ * host interfaces.
+ */
+#define PSMI_DEVICES_DEFAULT	"self,shm,nic"
+
+/* Lock */
+#define PSMI_USE_PTHREAD_SPINLOCKS	0
+
+/* Utils */
+#define PSMI_EPID_TABSIZE_CHUNK		128
+#define PSMI_EPID_TABLOAD_FACTOR	((float)0.7)
+
+#define	PSMI_EP_HOSTNAME_LEN	64	/* hostname only */
+#define	PSMI_EP_NAME_LEN	96	/* hostname:LID:context:subcontext */
+
+#define PSMI_FAULTINJ_SPEC_NAMELEN	32
+
+#endif /* PSM_CONFIG_H */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_context.c b/deps/libfabric/prov/psm3/psm3/psm_context.c
new file mode 100644
index 0000000000000000000000000000000000000000..b95b03dab4dca507007dd5ae8de147da37981d8e
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_context.c
@@ -0,0 +1,679 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "psm_user.h"
+#include "psm2_hal.h"
+
+static int psmi_get_hfi_selection_algorithm(void);
+
+psm2_error_t psmi_context_interrupt_set(psmi_context_t *context, int enable)
+{
+	int poll_type;
+	int ret;
+
+	if (!enable == !psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED))
+		return PSM2_OK;
+
+	if (enable)
+		poll_type = PSMI_HAL_POLL_TYPE_URGENT;
+	else
+		poll_type = 0;
+
+	// We need the ep->verbs_ep, and there is no way to get from psm_hw_ctxt
+	// to the ep, so we need a new function instead of just changing a HAL
+	// func.  If verbs_ep were the psm_hw_ctxt for the UD HAL, this would
+	// not be necessary.
+	ret = __psm2_ep_poll_type(poll_type, context->ep);
+
+	if (ret != 0)
+		return PSM2_EP_NO_RESOURCES;
+	else {
+		if (enable)
+			psmi_hal_add_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED);
+		else
+			psmi_hal_sub_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED);
+		return PSM2_OK;
+	}
+}
+
+int psmi_context_interrupt_isenabled(psmi_context_t *context)
+{
+	return psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED);
+}
+
+
+/* returns the 8-bit hash value of a UUID. */
+static inline
+uint8_t
+psmi_get_uuid_hash(psm2_uuid_t const uuid)
+{
+	int i;
+	uint8_t hashed_uuid = 0;
+
+	for (i=0; i < sizeof(psm2_uuid_t); ++i)
+		hashed_uuid ^= *((uint8_t const *)uuid + i);
+
+	return hashed_uuid;
+}
+
+int psmi_get_current_proc_location()
+{
+	int core_id, node_id;
+
+	core_id = sched_getcpu();
+	if (core_id < 0)
+		return -EINVAL;
+
+	node_id = numa_node_of_cpu(core_id);
+	if (node_id < 0)
+		return -EINVAL;
+
+	return node_id;
+}
+
+static void
+psmi_spread_hfi_selection(psm2_uuid_t const job_key, long *unit_start,
+			     long *unit_end, int nunits)
+{
+	/* Look at (a hash of the job key plus the local rank id) mod nunits,
+	 * wrapping the search range around the unit array. */
+	*unit_start = ((hfi_get_mylocalrank()+1) +
+		psmi_get_uuid_hash(job_key)) % nunits;
+	if (*unit_start > 0)
+		*unit_end = *unit_start - 1;
+	else
+		*unit_end = nunits-1;
+}
+
+static int
+psmi_create_and_open_affinity_shm(psm2_uuid_t const job_key)
+{
+	int shm_fd, ret;
+	int first_to_create = 0;
+	size_t shm_name_len = 256;
+
+	psmi_assert_always(psmi_affinity_semaphore_open);
+	if (psmi_affinity_shared_file_opened) {
+		/* opened and have our reference counted in shm */
+		psmi_assert_always(affinity_shm_name != NULL);
+		psmi_assert_always(shared_affinity_ptr != NULL);
+		return 0;
+	}
+
+	shared_affinity_ptr = NULL;
+	affinity_shm_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, shm_name_len);
+
+	psmi_assert_always(affinity_shm_name != NULL);
+	snprintf(affinity_shm_name, shm_name_len,
+		 AFFINITY_SHM_BASENAME".%d",
+		 psmi_get_uuid_hash(job_key));
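+	/* Create the shm object exclusively; if another process won the race
+	 * (EEXIST), open the existing object instead.  first_to_create gates
+	 * the one-time initialization and semaphore post below. */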
+	shm_fd = shm_open(affinity_shm_name, O_RDWR | O_CREAT | O_EXCL,
+			  S_IRUSR | S_IWUSR);
+	if ((shm_fd < 0) && (errno == EEXIST)) {
+		shm_fd = shm_open(affinity_shm_name, O_RDWR, S_IRUSR | S_IWUSR);
+		if (shm_fd < 0) {
+			_HFI_VDBG("Cannot open affinity shared mem fd:%s, errno=%d\n",
+				  affinity_shm_name, errno);
+			goto free_name;
+		}
+	} else if (shm_fd >= 0) {
+		first_to_create = 1;
+	} else {
+		_HFI_VDBG("Cannot create affinity shared mem fd:%s, errno=%d\n",
+			  affinity_shm_name, errno);
+		goto free_name;
+	}
+
+	ret = ftruncate(shm_fd, AFFINITY_SHMEMSIZE);
+	if ( ret < 0 ) {
+		_HFI_VDBG("Cannot truncate affinity shared mem fd:%s, errno=%d\n",
+			affinity_shm_name, errno);
+		goto close_shm;
+	}
+
+	shared_affinity_ptr = (uint64_t *) mmap(NULL, AFFINITY_SHMEMSIZE, PROT_READ | PROT_WRITE,
+					MAP_SHARED, shm_fd, 0);
+	if (shared_affinity_ptr == MAP_FAILED) {
+		_HFI_VDBG("Cannot mmap affinity shared memory: %s, errno=%d\n",
+			  affinity_shm_name, errno);
+		goto close_shm;
+	}
+	close(shm_fd);
+	shm_fd = -1;
+
+	if (first_to_create) {
+		_HFI_VDBG("Initializing shm to store NIC affinity per socket: %s\n", affinity_shm_name);
+
+		memset(shared_affinity_ptr, 0, AFFINITY_SHMEMSIZE);
+
+		/*
+		 * Once shm object is initialized, unlock others to be able to
+		 * use it.
+		 */
+		psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name);
+	} else {
+		_HFI_VDBG("Opened shm object to read/write NIC affinity per socket: %s\n", affinity_shm_name);
+	}
+
+	/*
+	 * Start critical section to increment reference count when creating
+	 * or opening shm object. Decrement of ref count will be done before
+	 * closing the shm.
+	 */
+	if (psmi_sem_timedwait(sem_affinity_shm_rw, sem_affinity_shm_rw_name)) {
+		_HFI_VDBG("Could not enter critical section to update shm refcount\n");
+		goto unmap_shm;
+	}
+
+	shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] += 1;
+	_HFI_VDBG("shm refcount = %"PRId64"\n",  shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION]);
+
+	/* End critical section */
+	psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name);
+
+	psmi_affinity_shared_file_opened = 1;
+
+	return 0;
+
+unmap_shm:
+	munmap(shared_affinity_ptr, AFFINITY_SHMEMSIZE);
+	shared_affinity_ptr = NULL;
+close_shm:
+	if (shm_fd >= 0) close(shm_fd);
+free_name:
+	psmi_free(affinity_shm_name);
+	affinity_shm_name = NULL;
+	return -1;
+}
+
+/*
+ * Spread HFI selection between units if we find more than one within a socket.
+ */
+static void
+psmi_spread_hfi_within_socket(long *unit_start, long *unit_end, int node_id,
+			      int *saved_hfis, int found, psm2_uuid_t const job_key)
+{
+	int ret, shm_location;
+
+	/*
+	 * Take affinity lock and open shared memory region to be able to
+	 * accurately determine which HFI to pick for this process. If any
+	 * issues, bail by picking first known HFI.
+	 */
+	if (!psmi_affinity_semaphore_open)
+		goto spread_hfi_fallback;
+
+	ret = psmi_create_and_open_affinity_shm(job_key);
+	if (ret < 0)
+		goto spread_hfi_fallback;
+
+	shm_location = AFFINITY_SHM_HFI_INDEX_LOCATION + node_id;
+	if (shm_location > AFFINITY_SHMEMSIZE)
+		goto spread_hfi_fallback;
+
+	/* Start critical section to read/write shm object */
+	if (psmi_sem_timedwait(sem_affinity_shm_rw, sem_affinity_shm_rw_name)) {
+		_HFI_VDBG("Could not enter critical section to update NIC index\n");
+		goto spread_hfi_fallback;
+	}
+
+	*unit_start = *unit_end = saved_hfis[shared_affinity_ptr[shm_location]];
+	shared_affinity_ptr[shm_location] =
+		(shared_affinity_ptr[shm_location] + 1) % found;
+	_HFI_VDBG("Selected NIC index= %ld, Next NIC=%ld, node = %d, local rank=%d, found=%d.\n",
+		  *unit_start, shared_affinity_ptr[shm_location], node_id,
+		  hfi_get_mylocalrank(), found);
+
+	/* End Critical Section */
+	psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name);
+
+	return;
+
+spread_hfi_fallback:
+	*unit_start = *unit_end = saved_hfis[0];
+}
+
+static void
+psmi_create_affinity_semaphores(psm2_uuid_t const job_key)
+{
+	int ret;
+	size_t sem_len = 256;
+
+	/*
+	 * If already opened, no need to do anything else.
+	 * This could be true for Multi-EP cases where a different thread has
+	 * already created the semaphores. We don't need separate locks here as
+	 * we are protected by the overall "psmi_creation_lock" which each
+	 * thread will take in psm2_ep_open()
+	 */
+	if (psmi_affinity_semaphore_open)
+		return;
+
+	sem_affinity_shm_rw_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, sem_len);
+	psmi_assert_always(sem_affinity_shm_rw_name != NULL);
+	snprintf(sem_affinity_shm_rw_name, sem_len,
+		 SEM_AFFINITY_SHM_RW_BASENAME".%d",
+		 psmi_get_uuid_hash(job_key));
+
+	ret = psmi_init_semaphore(&sem_affinity_shm_rw, sem_affinity_shm_rw_name,
+				  S_IRUSR | S_IWUSR, 0);
+	if (ret) {
+		_HFI_VDBG("Cannot initialize semaphore: %s for read-write access to shm object.\n",
+			  sem_affinity_shm_rw_name);
+		if (sem_affinity_shm_rw)
+			sem_close(sem_affinity_shm_rw);
+		psmi_free(sem_affinity_shm_rw_name);
+		sem_affinity_shm_rw_name = NULL;
+		return;
+	}
+
+	_HFI_VDBG("Semaphore: %s created for read-write access to shm object.\n",
+		  sem_affinity_shm_rw_name);
+
+	psmi_affinity_semaphore_open = 1;
+
+	return;
+}
+
+// return set of units to consider and which to start at.
+// caller will use 1st active unit which can be opened.
+// caller will wrap around so it's valid for start > end
+static
+psm2_error_t
+psmi_compute_start_and_end_unit(long unit_param,int nunitsactive,int nunits,
+				psm2_uuid_t const job_key,
+				long *unit_start,long *unit_end)
+{
+	unsigned short hfi_sel_alg = PSMI_UNIT_SEL_ALG_ACROSS;
+	int node_id, unit_id, found = 0;
+	int saved_hfis[nunits];
+
+	/* if the user did not set PSM3_NIC then ... */
+	if (unit_param == PSM3_NIC_ANY)
+	{
+		if (nunitsactive > 1) {
+			// if NICs are on different subnets, and ! allow_routers
+			// we need to have all ranks default to the same subnet
+			// so force 1st active NIC in that case
+			uint64_t subnet;
+			int have_subnet = 0;
+			int have_eth = 0;
+			for (unit_id = 0; unit_id < nunits; unit_id++) {
+				uint64_t gid_hi, hi;
+				int is_eth = 0;
+				if (psmi_hal_get_unit_active(unit_id) <= 0)
+					continue;
+				if (0 != psmi_hal_get_port_subnet(unit_id, 1 /* VERBS_PORT*/,
+								&gid_hi, NULL, NULL, NULL, NULL, &hi, NULL))
+					continue; // can't access NIC
+				is_eth = (gid_hi != hi);
+				if (! have_subnet) {
+					subnet = gid_hi;
+					have_subnet = 1;
+					have_eth = is_eth;
+				} else if (have_eth != is_eth
+					   || (subnet != gid_hi
+						 && (! is_eth || ! psmi_allow_routers))) {
+					// active units have different subnets
+					// caller will pick 1st active unit
+					*unit_start = 0;
+					*unit_end = nunits - 1;
+					return PSM2_OK;
+				}
+			}
+		}
+
+		/* Get the actual selection algorithm from the environment: */
+		hfi_sel_alg = psmi_get_hfi_selection_algorithm();
+		/* If round-robin is selection algorithm and ... */
+		if ((hfi_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS) &&
+		    /* there are more than 1 active units then ... */
+		    (nunitsactive > 1))
+		{
+			/*
+			 * Pick first HFI we find on same root complex
+			 * as current task. If none found, fall back to
+			 * load-balancing algorithm.
+			 */
+			node_id = psmi_get_current_proc_location();
+			if (node_id >= 0) {
+				for (unit_id = 0; unit_id < nunits; unit_id++) {
+					if (psmi_hal_get_unit_active(unit_id) <= 0)
+						continue;
+
+					int node_id_i;
+
+					if (!psmi_hal_get_node_id(unit_id, &node_id_i)) {
+						if (node_id_i == node_id) {
+							saved_hfis[found] = unit_id;
+							found++;
+						}
+					}
+				}
+
+				if (found > 1) {
+					psmi_create_affinity_semaphores(job_key);
+					psmi_spread_hfi_within_socket(unit_start, unit_end,
+								      node_id, saved_hfis,
+								      found, job_key);
+				} else if (found == 1) {
+					*unit_start = *unit_end = saved_hfis[0];
+				}
+			}
+
+			if (node_id < 0 || !found) {
+				psmi_spread_hfi_selection(job_key, unit_start,
+							  unit_end, nunits);
+			}
+		} else if ((hfi_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS_ALL) &&
+			 (nunitsactive > 1)) {
+				psmi_spread_hfi_selection(job_key, unit_start,
+							  unit_end, nunits);
+		}
+		else { // PSMI_UNIT_SEL_ALG_WITHIN or only 1 active unit
+			// caller will pick 1st active unit
+			*unit_start = 0;
+			*unit_end = nunits - 1;
+		}
+	} else if (unit_param >= 0) {
+		/* the user specified PSM3_NIC; use it. */
+		*unit_start = *unit_end = unit_param;
+	} else {
+		psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+				 "PSM3 can't open unit: %ld for reading and writing",
+				 unit_param);
+		return PSM2_EP_DEVICE_FAILURE;
+	}
+
+	return PSM2_OK;
+}
+
+psm2_error_t
+psmi_context_open(const psm2_ep_t ep, long unit_param, long port,
+		  psm2_uuid_t const job_key, int64_t timeout_ns,
+		  psmi_context_t *context)
+{
+	long open_timeout = 0, unit_start, unit_end, unit_id, unit_id_prev;
+	psm2_error_t err = PSM2_OK;
+	int nunits = psmi_hal_get_num_units(), nunitsactive=0;
+
+	/*
+	 * If shared contexts are enabled, try our best to schedule processes
+	 * across one or many devices
+	 */
+
+	/* if no units, then no joy. */
+	if (nunits <= 0)
+	{
+		err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+					"PSM3 no nic units are available");
+		goto ret;
+	}
+
+	/* Calculate the number of active units: */
+	for (unit_id=0;unit_id < nunits;unit_id++)
+	{
+		if (psmi_hal_get_unit_active(unit_id) > 0)
+			nunitsactive++;
+	}
+	/* if no active units, then no joy. */
+	if (nunitsactive == 0)
+	{
+		err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+					"PSM3 no nic units are active");
+		goto ret;
+	}
+	if (timeout_ns > 0)
+		open_timeout = (long)(timeout_ns / MSEC_ULL);
+
+
+	unit_start = 0; unit_end = nunits - 1;
+	err = psmi_compute_start_and_end_unit(unit_param, nunitsactive,
+					      nunits, job_key,
+					      &unit_start, &unit_end);
+	if (err != PSM2_OK)
+		return err;
+
+	/* Probe units from unit_start to unit_end; the loop control variable
+	   advances modulo nunits, so the range may wrap around (start > end
+	   is valid). */
+	int success = 0;
+	unit_id_prev = unit_id = unit_start;
+	do
+	{
+		/* close previous opened unit fd before attempting open of current unit. */
+		if (context->psm_hw_ctxt) {
+			psmi_hal_close_context(&context->psm_hw_ctxt);
+			context->psm_hw_ctxt = 0;
+		}
+
+		/* if the unit_id is not active, go to next one. */
+		if (psmi_hal_get_unit_active(unit_id) <= 0) {
+			unit_id_prev = unit_id;
+			unit_id = (unit_id + 1) % nunits;
+			continue;
+		}
+
+		/* open this unit. */
+		int rv = psmi_hal_context_open(unit_id, port, open_timeout,
+					       ep, job_key, context,
+					       psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED),
+					       HAL_CONTEXT_OPEN_RETRY_MAX);
+
+		/* go to next unit if failed to open. */
+		if (rv || context->psm_hw_ctxt == NULL) {
+			unit_id_prev = unit_id;
+			unit_id = (unit_id + 1) % nunits;
+			continue;
+		}
+
+		success = 1;
+		break;
+
+	} while (unit_id_prev != unit_end);
+
+	if (!success)
+	{
+		err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+					"PSM3 can't open nic unit: %ld",unit_param);
+		goto bail;
+	}
+
+	context->ep = (psm2_ep_t) ep;
+
+	/* Check backward compatibility bits here and save the info */
+#ifdef PSM_CUDA
+#ifndef OPA
+	gdr_copy_limit_send = min(gdr_copy_limit_send, ep->mtu);
+
+	if (PSMI_IS_CUDA_DISABLED || ! psmi_parse_gpudirect()) {
+		// when CUDA and/or PSM3_GPUDIRECT* is disabled,
+		// PSM_HALCAP_GPUDIRECT is not fetched because it doesn't matter
+		// Just be silent about this situation.
+	} else // CUDA and GPUDIRECT are enabled, check CAP_GPUDIRECT in rv
+#endif
+#endif
+	if (psmi_hal_has_cap(PSM_HAL_CAP_GPUDIRECT_OT))
+	{
+#ifdef PSM_CUDA
+		is_driver_gpudirect_enabled = 1;
+#else
+		// we can allow this combination
+		//psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "FATAL ERROR: "
+		//		  "CUDA version of rendezvous driver is loaded with non-CUDA version of "
+		//		  "psm3 provider.\n");
+#endif
+	}
+#ifdef PSM_CUDA
+	else // we warn here, later tests in ips_proto_init() will be fatal
+		_HFI_INFO("WARNING: running CUDA version of psm3 provider with non-CUDA version of rendezvous driver.\n");
+#endif
+	_HFI_VDBG("hal_context_open() passed.\n");
+
+	/* Construct epid for this Endpoint */
+	psmi_assert_always(PSMI_EPID_VERSION == PSMI_EPID_V3
+						|| PSMI_EPID_VERSION == PSMI_EPID_V4);
+	psmi_assert_always (ep->verbs_ep.context);
+	// TBD - if we put the verbs_ep in hw_ctxt we could push this to HAL
+	// verbs_ep_open has initialized: ep->unit_id, ep->portnum,ep->dev_name,
+	//	ep->gid_hi, ep->gid_lo
+	if (ep->verbs_ep.link_layer == IBV_LINK_LAYER_ETHERNET) {
+		char buf[INET_ADDRSTRLEN];
+		int netmask_bits = psmi_count_high_bits(ep->verbs_ep.ip_netmask);
+		if (netmask_bits < 0) {
+			err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+					"PSM3 invalid netmask on %s: %s",
+					ep->dev_name, psmi_ipv4_ntop(ep->verbs_ep.ip_netmask, buf, sizeof(buf)));
+			goto bail;
+		}
+		psmi_epid_ver = PSMI_EPID_V4;	// override default based on device
+		context->epid = PSMI_EPID_PACK_V4(ep->verbs_ep.ip_addr,
+							ep->verbs_ep.qp->qp_num, netmask_bits);
+		_HFI_VDBG("construct epid v4: 0x%"PRIx64" ip %s subnet_bits %u qp %d mtu %d\n",
+						context->epid,
+						psmi_ipv4_ntop(ep->verbs_ep.ip_addr, buf, sizeof(buf)),
+						netmask_bits, ep->verbs_ep.qp->qp_num, ep->mtu);
+	} else {
+		unsigned subnet = ep->gid_hi & 0xffff;
+		psmi_epid_ver = PSMI_EPID_V3;	// override default based on device
+		context->epid = PSMI_EPID_PACK_V3(ep->verbs_ep.port_attr.lid,
+							ep->verbs_ep.qp->qp_num,
+							subnet /*ep->gid_hi*/);
+		_HFI_VDBG("construct epid v3: 0x%"PRIx64" lid %d qp %d subnet 0x%x mtu %d\n",
+						context->epid, ep->verbs_ep.port_attr.lid,
+						ep->verbs_ep.qp->qp_num, subnet, ep->mtu);
+	}
+
+	goto ret;
+
+bail:
+	_HFI_PRDBG("open failed: unit_id: %ld, err: %d (%s)\n", unit_id, err, strerror(errno));
+	if (context->psm_hw_ctxt) {
+		psmi_hal_close_context(&context->psm_hw_ctxt);
+		context->psm_hw_ctxt = 0;
+	}
+ret:
+
+	_HFI_VDBG("psmi_context_open() return %d\n", err);
+	return err;
+}
+
+psm2_error_t psmi_context_close(psmi_context_t *context)
+{
+	if (context->psm_hw_ctxt) {
+		psmi_hal_close_context(&context->psm_hw_ctxt);
+		context->psm_hw_ctxt = 0;
+	}
+
+	return PSM2_OK;
+}
+
+/*
+ * This function works whether a context is initialized or not in a psm2_ep.
+ *
+ * Returns one of
+ *
+ * PSM2_OK: Port status is ok (or context not initialized yet but still "ok")
+ * PSM2_OK_NO_PROGRESS: Cable pulled
+ * PSM2_EP_NO_NETWORK: No network, no lid, ...
+ * PSM2_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc.
+ * The message follows the per-port status.
+ * As of the 7322-ready driver we would need to check the port-specific
+ * qword for IB as well as the older unit-only one; the port interface is
+ * not defined yet, so this is currently a stub that always reports PSM2_OK.
+ */
+psm2_error_t psmi_context_check_status(const psmi_context_t *context)
+{
+	psm2_error_t err = PSM2_OK;
+	return err;
+}
+
+static
+int psmi_get_hfi_selection_algorithm(void)
+{
+	union psmi_envvar_val env_hfi1_alg;
+	int hfi1_alg = PSMI_UNIT_SEL_ALG_ACROSS;
+
+	/* Read the NIC selection algorithm from the environment. */
+	psmi_getenv("PSM3_NIC_SELECTION_ALG",
+		    "NIC Device Selection Algorithm to use. Round Robin [RoundRobin or rr] (default), "
+		    "Packed [p] or Round Robin All [RoundRobinAll or rra].",
+		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+		    (union psmi_envvar_val)"rr", &env_hfi1_alg);
+
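+	/* e.g. PSM3_NIC_SELECTION_ALG=rra maps to PSMI_UNIT_SEL_ALG_ACROSS_ALL */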
+	if (!strcasecmp(env_hfi1_alg.e_str, "Round Robin")
+		|| !strcasecmp(env_hfi1_alg.e_str, "RoundRobin")
+		|| !strcasecmp(env_hfi1_alg.e_str, "rr"))
+		hfi1_alg = PSMI_UNIT_SEL_ALG_ACROSS;
+	else if (!strcasecmp(env_hfi1_alg.e_str, "Packed")
+			 || !strcasecmp(env_hfi1_alg.e_str, "p"))
+		hfi1_alg = PSMI_UNIT_SEL_ALG_WITHIN;
+	else if (!strcasecmp(env_hfi1_alg.e_str, "Round Robin All")
+			 || !strcasecmp(env_hfi1_alg.e_str, "RoundRobinAll")
+			 || !strcasecmp(env_hfi1_alg.e_str, "rra"))
+		hfi1_alg = PSMI_UNIT_SEL_ALG_ACROSS_ALL;
+	else {
+		_HFI_ERROR
+		    ("Unknown NIC selection algorithm %s. Defaulting to Round Robin "
+		     "allocation of NICs.\n", env_hfi1_alg.e_str);
+		hfi1_alg = PSMI_UNIT_SEL_ALG_ACROSS;
+	}
+
+	return hfi1_alg;
+}
diff --git a/deps/libfabric/prov/psm3/psm3/psm_context.h b/deps/libfabric/prov/psm3/psm3/psm_context.h
new file mode 100644
index 0000000000000000000000000000000000000000..c9387d1ac25b3f56f6eed85404fa6dee2acc75e7
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_context.h
@@ -0,0 +1,119 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_IN_USER_H
+#error psm_context.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSM_CONTEXT_H
+#define _PSM_CONTEXT_H
+
+typedef
+struct psmi_context {
+
+	/* The following three member variables are used for sharing contexts among
+	   subcontexts and they have the following common properties:
+
+	   a. They are all initialized below HAL layer when the context is opened.
+	   b. If they are NULL, no context is being shared among subcontexts;
+	   non-NULL means a context is being shared among some number of subcontexts.
+	   c. The initialization code is currently found in the gen1 hal instance.
+	*/
+
+	void *spio_ctrl;
+	void *tid_ctrl;
+	void *tf_ctrl;
+
+	/* end of shared context member variables. */
+
+	psmi_hal_hw_context psm_hw_ctxt;
+
+	psm2_ep_t ep;		/* psm ep handle */
+	psm2_epid_t epid;	/* psm integral ep id */
+	psm2_error_t status_lasterr;
+	time_t networkLostTime;
+} psmi_context_t;
+
+psm2_error_t
+psmi_context_open(const psm2_ep_t ep, long unit_id, long port,
+		  psm2_uuid_t const job_key,
+		  int64_t timeout_ns, psmi_context_t *context);
+
+psm2_error_t psmi_context_close(psmi_context_t *context);
+
+/* Check status of context */
+psm2_error_t psmi_context_check_status(const psmi_context_t *context);
+
+psm2_error_t psmi_context_interrupt_set(psmi_context_t *context, int enable);
+int psmi_context_interrupt_isenabled(psmi_context_t *context);
+
+/*
+ * round robin contexts across HFIs, then
+ * ports; this is the default.
+ * This option spreads the HFI selection within the local socket.
+ * If it is preferred to spread a job over the entire set of
+ * HFIs within the system, see ALG_ACROSS_ALL below.
+ */
+#define PSMI_UNIT_SEL_ALG_ACROSS     PSM_HAL_ALG_ACROSS
+
+#define PSMI_UNIT_SEL_ALG_ACROSS_ALL PSM_HAL_ALG_ACROSS_ALL
+
+/*
+ * use all contexts on an HFI (round robin
+ * active ports within), then next HFI
+ */
+#define PSMI_UNIT_SEL_ALG_WITHIN     PSM_HAL_ALG_WITHIN
+
+#endif /* PSM_CONTEXT_H */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_diags.c b/deps/libfabric/prov/psm3/psm3/psm_diags.c
new file mode 100644
index 0000000000000000000000000000000000000000..8b4ba8a1821641aae5f5fec80dc526999f4317b7
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_diags.c
@@ -0,0 +1,368 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
+typedef void (*memcpy_fn_t) (void *dst, const void *src, size_t n);
+static int psmi_test_memcpy(memcpy_fn_t, const char *name);
+static int psmi_test_epid_table(int numelems);
+
+int psmi_diags(void);
+
+#define diags_assert(x)	do {					\
+	    if (!(x))  {					\
+		_HFI_ERROR("Diags assertion failure: %s\n",	\
+		    #x);					\
+		goto fail;					\
+	    }							\
+	} while (0)
+
+#define DIAGS_RETURN_PASS(str)						\
+	do { _HFI_INFO("%s: PASSED %s\n", __func__, str); return 0; }	\
+	    while (0)
+#define DIAGS_RETURN_FAIL(str)						\
+	do { _HFI_INFO("%s: FAILED %s\n", __func__, str); return 1; }	\
+	    while (0)
+
+int psmi_diags(void)
+{
+	int ret = 0;
+	ret |= psmi_test_epid_table(2048);
+	ret |= psmi_test_memcpy((memcpy_fn_t) psmi_memcpyo, "psmi_memcpyo");
+	/* ret |= psmi_test_memcpy((memcpy_fn_t) psmi_mq_mtucpy, "psmi_mq_mtucpy"); */
+
+	if (ret)
+		DIAGS_RETURN_FAIL("");
+	else
+		DIAGS_RETURN_PASS("");
+}
+
+/*
+ * Hash table test
+ */
+#define NALLOC	1024
+static int psmi_test_epid_table(int numelems)
+{
+	ptl_ctl_t ctl;
+	psm2_epaddr_t *ep_array, epaddr, ep_alloc;
+	psm2_epid_t *epid_array, epid_tmp;
+	psm2_ep_t ep = (psm2_ep_t) (uintptr_t) 0xabcdef00;
+	struct psmi_epid_table *tab;
+	int i, j;
+	struct drand48_data drand48_data;
+
+	ep_alloc =
+	    (psm2_epaddr_t) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems,
+				       sizeof(struct psm2_epaddr));
+	ep_array =
+	    (psm2_epaddr_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems,
+					 sizeof(struct psm2_epaddr *));
+	epid_array =
+	    (psm2_epid_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems,
+				       sizeof(psm2_epid_t));
+	diags_assert(ep_alloc != NULL);
+	diags_assert(ep_array != NULL);
+	diags_assert(epid_array != NULL);
+
+	srand48_r(12345678, &drand48_data);
+
+	psmi_epid_init();
+	tab = &psmi_epid_table;
+	ctl.ep = ep;
+
+	for (i = 0; i < numelems; i++) {
+		epid_array[i] = i;
+		ep_alloc[i].ptlctl = &ctl;
+		ep_alloc[i].epid = epid_array[i];
+		ep_array[i] = &ep_alloc[i];
+	}
+	for (i = 0; i < numelems; i++) {
+		psmi_epid_add(ep, epid_array[i], ep_array[i]);
+	}
+
+	/* Randomize epid_array */
+	for (i = 0; i < numelems; i++) {
+		long int rand_result;
+		lrand48_r(&drand48_data, &rand_result);
+		j = (int)(rand_result % numelems);
+		epid_tmp = epid_array[i];
+		epid_array[i] = epid_array[j];
+		epid_array[j] = epid_tmp;
+	}
+	/* Lookup. */
+	for (i = 0; i < numelems; i++) {
+		epaddr = psmi_epid_lookup(ep, epid_array[i]);
+		diags_assert(epaddr != NULL);
+		diags_assert(epaddr->epid == epid_array[i]);
+		diags_assert(epaddr->ptlctl->ep == ep);
+	}
+
+	/* Randomize epid_array again */
+	for (i = 0; i < numelems; i++) {
+		long int rand_result;
+		lrand48_r(&drand48_data, &rand_result);
+		j = (int)(rand_result % numelems);
+		epid_tmp = epid_array[i];
+		epid_array[i] = epid_array[j];
+		epid_array[j] = epid_tmp;
+	}
+	/* Delete half */
+	for (i = 0; i < numelems / 2; i++) {
+		epaddr = psmi_epid_remove(ep, epid_array[i]);
+		diags_assert(epaddr != NULL);
+		diags_assert(epaddr->epid == epid_array[i]);
+		diags_assert(epaddr->ptlctl->ep == ep);
+	}
+	/* Lookup other half -- expect non-NULL, then delete */
+	for (i = numelems / 2; i < numelems; i++) {
+		epaddr = psmi_epid_lookup(ep, epid_array[i]);
+		diags_assert(epaddr != NULL);
+		diags_assert(epaddr->epid == epid_array[i]);
+		diags_assert(epaddr->ptlctl->ep == ep);
+		epaddr = psmi_epid_remove(ep, epid_array[i]);
+		epaddr = psmi_epid_lookup(ep, epid_array[i]);
+		diags_assert(epaddr == NULL);
+	}
+	/* Lookup whole thing, expect done */
+	for (i = 0; i < numelems; i++) {
+		epaddr = psmi_epid_lookup(ep, epid_array[i]);
+		diags_assert(epaddr == NULL);
+	}
+	for (i = 0; i < tab->tabsize; i++) {
+		diags_assert(tab->table[i].entry == NULL ||
+			     tab->table[i].entry == EPADDR_DELETED);
+	}
+
+	/* Make sure we're not leaking memory somewhere... */
+	diags_assert(tab->tabsize > tab->tabsize_used &&
+		     tab->tabsize * PSMI_EPID_TABLOAD_FACTOR >
+		     tab->tabsize_used);
+
+	/* Only free on success */
+	psmi_epid_fini();
+	psmi_free(epid_array);
+	psmi_free(ep_array);
+	psmi_free(ep_alloc);
+	DIAGS_RETURN_PASS("");
+
+fail:
+	/* free everything on failure too, else Klocwork reports a leak */
+	psmi_epid_fini();
+	if (epid_array)
+		psmi_free(epid_array);
+	if (ep_array)
+		psmi_free(ep_array);
+	if (ep_alloc)
+		psmi_free(ep_alloc);
+	DIAGS_RETURN_FAIL("");
+}
+
+/*
+ * Memcpy correctness test
+ */
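+/*
+ * Scheme: seed a PRNG from (dst ^ src ^ n), fill src with pseudo-random
+ * bytes, copy with the function under test, zero src, then re-seed and
+ * verify dst byte-by-byte against the regenerated stream, for every
+ * combination of source and destination alignment in 0..15.
+ */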
+static int memcpy_check_size(memcpy_fn_t fn, int *p, int *f, size_t n);
+static void *memcpy_check_one(memcpy_fn_t fn, void *dst, void *src, size_t n);
+
+static int psmi_test_memcpy(memcpy_fn_t fn, const char *memcpy_name)
+{
+	const int CORNERS = 0;
+	const long long lo = 1;
+	const long long hi = 16 * 1024 * 1024;
+	const long long below = 32;
+	const long long above = 32;
+	long long n, m;
+	char buf[128];
+	int ret = 0;
+	int memcpy_passed;
+	int memcpy_failed;
+
+	memcpy_passed = 0;
+	memcpy_failed = 0;
+
+	ret = memcpy_check_size(fn, &memcpy_passed, &memcpy_failed, 0);
+	if (ret < 0)
+		DIAGS_RETURN_FAIL("no heap space");
+
+	for (n = lo; n <= hi; n <<= 1) {
+		_HFI_INFO("%s %d align=0..16\n", memcpy_name, (int)n);
+		for (m = n - below; m <= n + above; m++) {
+			if (m == n) {
+				ret =
+				    memcpy_check_size(fn, &memcpy_passed,
+						      &memcpy_failed, n);
+				if (ret < 0)
+					DIAGS_RETURN_FAIL("no heap space");
+			} else if (CORNERS && m >= lo && m <= hi && m > (n >> 1)
+				   && m < max(n, ((n << 1) - below))) {
+				ret =
+				    memcpy_check_size(fn, &memcpy_passed,
+						      &memcpy_failed,
+						      (size_t) m);
+				if (ret < 0)
+					DIAGS_RETURN_FAIL("no heap space");
+			}
+		}
+	}
+
+	int total = memcpy_passed + memcpy_failed;
+	if (total > 0) {
+		_HFI_INFO("%d memcpy tests with %d passed (%.2f%%) "
+			  "and %d failed (%.2f%%)\n",
+			  total, memcpy_passed, (100.0 * memcpy_passed) / total,
+			  memcpy_failed, (100.0 * memcpy_failed) / total);
+	}
+	if (memcpy_failed) {
+		snprintf(buf, sizeof(buf), "%s: %.2f%% of tests failed",
+			 memcpy_name, (100.0 * memcpy_failed) / total);
+		DIAGS_RETURN_FAIL(buf);
+	} else {
+		DIAGS_RETURN_PASS(memcpy_name);
+	}
+}
+
+void *memcpy_check_one(memcpy_fn_t fn, void *dst, void *src, size_t n)
+{
+	int ok = 1;
+	unsigned int seed = (unsigned int)
+	    ((uintptr_t) dst ^ (uintptr_t) src ^ (uintptr_t) n);
+	size_t i;
+	struct drand48_data drand48_data;
+
+	if (!n)
+		return dst;
+
+	memset(src, 0x55, n);
+	memset(dst, 0xaa, n);
+	srand48_r(seed, &drand48_data);
+	for (i = 0; i < n; i++) {
+		long int rand_result;
+		lrand48_r(&drand48_data, &rand_result);
+		((uint8_t *) src)[i] = (((int)(rand_result & INT_MAX)) >> 16) & 0xff;
+	}
+
+	fn(dst, src, n);
+	memset(src, 0, n);
+	srand48_r(seed, &drand48_data);
+	for (i = 0; i < n; i++) {
+		long int rand_result;
+		lrand48_r(&drand48_data, &rand_result);
+		/* must regenerate exactly the byte pattern written above */
+		int value = (int)(uint8_t) (((int)(rand_result & INT_MAX)) >> 16);
+		int v = (int)((uint8_t *) dst)[i];
+		if (v != value) {
+			_HFI_ERROR
+			    ("Error on index %llu : got %d instead of %d\n",
+			     (unsigned long long)i, v, value);
+			ok = 0;
+		}
+	}
+	return ok ? dst : NULL;
+}
+
+int memcpy_check_size(memcpy_fn_t fn, int *p, int *f, size_t n)
+{
+#define num_aligns 16
+#define USE_MALLOC 0
+#define DEBUG 0
+	uint8_t *src;
+	uint8_t *dst;
+	size_t size = n * 2 + num_aligns;
+	if (USE_MALLOC) {
+		src = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size);
+		dst = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size);
+		if (src == NULL || dst == NULL) {
+			if (src) psmi_free(src);
+			if (dst) psmi_free(dst);
+			return -1;
+		}
+	} else {
+		void *src_p = NULL, *dst_p = NULL;
+		if (posix_memalign(&src_p, 64, size) != 0 ||
+		    posix_memalign(&dst_p, 64, size) != 0) {
+			if (src_p) free(src_p);
+			if (dst_p) free(dst_p);
+			return -1;
+		}
+		src = (uint8_t *) src_p;
+		dst = (uint8_t *) dst_p;
+	}
+
+	int src_align, dst_align;
+	for (src_align = 0; src_align < num_aligns; src_align++) {
+		for (dst_align = 0; dst_align < num_aligns; dst_align++) {
+			uint8_t *d = ((uint8_t *) dst) + dst_align;
+			uint8_t *s = ((uint8_t *) src) + src_align;
+			int ok = (memcpy_check_one(fn, d, s, n) != NULL);
+			if (DEBUG || !ok) {
+				_HFI_INFO("memcpy(%p, %p, %llu) : %s\n", d, s,
+					  (unsigned long long)n,
+					  ok ? "passed" : "failed");
+			}
+			if (ok) {
+				(*p)++;
+			} else {
+				(*f)++;
+			}
+		}
+	}
+	if (USE_MALLOC) {
+		psmi_free(src);
+		psmi_free(dst);
+	} else {
+		free(src);
+		free(dst);
+	}
+	return 0;
+}
diff --git a/deps/libfabric/prov/psm3/psm3/psm_ep.c b/deps/libfabric/prov/psm3/psm3/psm_ep.c
new file mode 100644
index 0000000000000000000000000000000000000000..fd0bda96cf72120a4eb5aa259a48f6e928e2b50c
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_ep.c
@@ -0,0 +1,1860 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sched.h>		/* cpu_set */
+#include <ctype.h>		/* isalpha */
+#include <stdbool.h>
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+#include "ips_proto_params.h"
+
+#ifdef PSM_CUDA
+#include "psm_gdrcpy.h"
+#endif
+/*
+ * Endpoint management
+ */
+psm2_ep_t psmi_opened_endpoint = NULL;
+int psmi_opened_endpoint_count = 0;
+static uint32_t *hfi_lids;
+static uint32_t nlids;
+
+static psm2_error_t psmi_ep_open_device(const psm2_ep_t ep,
+				       const struct psm2_ep_open_opts *opts,
+				       const psm2_uuid_t unique_job_key,
+				       struct psmi_context *context,
+				       psm2_epid_t *epid);
+
+/*
+ * Device management
+ *
+ * PSM uses "devices" as components to manage communication to self, to peers
+ * reachable via shared memory and finally to peers reachable only through
+ * hfi.
+ */
+
+static psm2_error_t psmi_parse_devices(int devices[PTL_MAX_INIT],
+				      const char *devstr);
+static int psmi_device_is_enabled(const int devices[PTL_MAX_INIT], int devid);
+int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid);
+
+psm2_error_t __psm2_ep_num_devunits(uint32_t *num_units_o)
+{
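+	/* resolved once and cached for the life of the process */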
+	static int num_units = -1;
+
+	PSM2_LOG_MSG("entering");
+
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	if (num_units == -1) {
+		num_units = psmi_hal_get_num_units();
+		if (num_units == -1)
+			num_units = 0;
+	}
+
+	*num_units_o = (uint32_t) num_units;
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK;
+}
+PSMI_API_DECL(psm2_ep_num_devunits)
+
+static int cmpfunc(const void *p1, const void *p2)
+{
+	uint64_t a = ((uint64_t *) p1)[0];
+	uint64_t b = ((uint64_t *) p2)[0];
+	if (a < b)
+		return -1;
+	if (a == b)
+		return 0;
+	return 1;
+}
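+// note: the qsort elements below are uint64_t[3] triplets and cmpfunc
+// orders them by the first word (gid_hi) only, so entries on the same
+// subnet keep no defined relative order.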
+
+// process PSM3_MULTIRAIL and PSM3_MULTIRAIL_MAP and return the
+// list of unit/port pairs in unit[0..*num_rails-1] and port[0..*num_rails-1].
+// When *num_rails is returned as 0, multirail is not enabled and
+// other mechanisms (PSM3_NIC, PSM3_NIC_SELECTION_ALG) must be
+// used by the caller to select a single NIC for the process
+static psm2_error_t
+psmi_ep_multirail(int *num_rails, uint32_t *unit, uint16_t *port)
+{
+	uint32_t num_units;
+	uint64_t gid_hi;
+	unsigned i, j, count = 0;
+	int ret;
+	psm2_error_t err = PSM2_OK;
+	uint64_t gidh[PSMI_MAX_RAILS][3];
+	union psmi_envvar_val env_multirail;
+	union psmi_envvar_val env_multirail_map;
+	int multirail_within_socket_used = 0;
+	int node_id = -1, found = 0;
+
+	psmi_getenv("PSM3_MULTIRAIL",
+			"Use all available NICs in the system for communication.\n"
+			 "0: Disabled (default),\n"
+			 "1: Enable multirail across all available NICs,\n"
+			 "2: Enable multirail within socket.\n"
+			 "\t For multirail within a socket, we try to find at\n"
+			 "\t least one NIC on the same socket as current task.\n"
+			 "\t If none found, we continue to use other NICs within\n"
+			 "\t the system.",
+			PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+			(union psmi_envvar_val)0,
+			&env_multirail);
+	if (!env_multirail.e_int) {
+		*num_rails = 0;
+		return PSM2_OK;
+	}
+
+	if (env_multirail.e_int == 2)
+		multirail_within_socket_used = 1;
+
+/*
+ * map is in format: unit:port,unit:port,...
+ * where :port is optional (default of 1) and unit can be name or number
+ */
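+/*
+ * illustrative example (hypothetical device names):
+ *   PSM3_MULTIRAIL_MAP="mlx5_0:1,mlx5_1" selects unit mlx5_0 port 1 as
+ *   rail 0 and unit mlx5_1 port 1 (defaulted) as rail 1
+ */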
+#define MAX_MAP_LEN (PSMI_MAX_RAILS*128)
+	if (!psmi_getenv("PSM3_MULTIRAIL_MAP",
+		"NIC selections for each rail in format:\n"
+		"     rail,rail,...\n"
+		"Where rail can be: unit:port or unit\n"
+		"When port is omitted, it defaults to 1\n"
+		"unit can be device name or unit number\n"
+		"default autoselects",
+			PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+			(union psmi_envvar_val)"", &env_multirail_map)) {
+
+		char temp[MAX_MAP_LEN+1];
+		char *s;
+		char *delim;
+
+		strncpy(temp, env_multirail_map.e_str, MAX_MAP_LEN);
+		if (temp[MAX_MAP_LEN-1] != 0)
+			return psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+					"PSM3_MULTIRAIL_MAP too long: '%s'",
+					env_multirail_map.e_str);
+		s = temp;
+		psmi_assert(*s);
+		do {
+			int u, p;
+			int skip_port = 0;
+
+			if (! *s)	// trailing ',' on 2nd or later loop
+				break;
+			if (count >= PSMI_MAX_RAILS)
+				return psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+						"PSM3_MULTIRAIL_MAP exceeds %u rails: '%s'",
+						PSMI_MAX_RAILS, env_multirail_map.e_str);
+
+			// parse unit; a ':' only belongs to this rail if it
+			// precedes the next ','
+			delim = strchr(s, ':');
+			if (! delim || (strchr(s, ',') && strchr(s, ',') < delim)) {
+				delim = strchr(s, ',');
+				skip_port = 1;
+				p = 1;
+			}
+			if (! delim && !skip_port)
+				return psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+						"PSM3_MULTIRAIL_MAP invalid format: '%s'",
+						env_multirail_map.e_str);
+			if (delim)
+				*delim = '\0';
+			u = sysfs_find_unit(s);
+			if (u < 0)
+				return psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+						"PSM3_MULTIRAIL_MAP invalid unit: '%s'", s);
+			if (delim)
+				s = delim+1;
+
+			// optionally parse port
+			if (! skip_port) {
+				delim = strchr(s, ',');
+				if (delim)
+					*delim = '\0';
+				p = psmi_parse_str_long(s);
+				if (p < 0)
+					return psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+						"PSM3_MULTIRAIL_MAP invalid port: '%s'", s);
+				if (delim)
+					s = delim+1;
+			}
+
+			unit[count] = u;
+			port[count] = p;
+			count++;
+		} while (delim);
+		*num_rails = count;
+
+/*
+ * Check if any of the port is not usable.
+ */
+		for (i = 0; i < count; i++) {
+			_HFI_VDBG("rail %d:  %u(%s) %u\n", i,
+				unit[i], sysfs_unit_dev_name(unit[i]), port[i]);
+			ret = psmi_hal_get_port_active(unit[i], port[i]);
+			if (ret <= 0)
+				return psmi_handle_error(NULL,
+						PSM2_EP_DEVICE_FAILURE,
+						"PSM3_MULTIRAIL_MAP: Unit/port: %d(%s):%d is not active.",
+						unit[i], sysfs_unit_dev_name(unit[i]),
+						port[i]);
+			ret = psmi_hal_get_port_lid(unit[i], port[i]);
+			if (ret <= 0 || ret == 0xFFFF)
+				return psmi_handle_error(NULL,
+						PSM2_EP_DEVICE_FAILURE,
+						"PSM3_MULTIRAIL_MAP: Couldn't get lid for unit %d(%s):%d",
+						unit[i], sysfs_unit_dev_name(unit[i]),
+						port[i]);
+			ret = psmi_hal_get_port_subnet(unit[i], port[i], NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+			if (ret == -1)
+				return psmi_handle_error(NULL,
+						PSM2_EP_DEVICE_FAILURE,
+						"PSM3_MULTIRAIL_MAP: Couldn't get subnet for unit %d(%s):%d",
+						unit[i], sysfs_unit_dev_name(unit[i]),
+						port[i]);
+		}
+		return PSM2_OK;
+	}
+
+	if ((err = psm2_ep_num_devunits(&num_units))) {
+		return err;
+	}
+	if (num_units > PSMI_MAX_RAILS) {
+		_HFI_INFO
+		    ("Found %d units, max %d units are supported, using %d\n",
+		     num_units, PSMI_MAX_RAILS, PSMI_MAX_RAILS);
+		num_units = PSMI_MAX_RAILS;
+	}
+
+	/*
+	 * PSM3_MULTIRAIL=2 functionality-
+	 *   - Try to find at least one HFI in the same root
+	 *     complex. If none found, continue to run and
+	 *     use remaining HFIs in the system.
+	 *   - If we do find at least one HFI in same root complex, we
+	 *     go ahead and add to list.
+	 */
+	if (multirail_within_socket_used) {
+		node_id = psmi_get_current_proc_location();
+		for (i = 0; i < num_units; i++) {
+			if (psmi_hal_get_unit_active(i) <= 0)
+				continue;
+			int node_id_i;
+
+			if (!psmi_hal_get_node_id(i, &node_id_i)) {
+				if (node_id_i == node_id) {
+					found = 1;
+					break;
+				}
+			}
+		}
+	}
+/*
+ * Get all the ports with a valid lid and gid, one per unit.
+ */
+	for (i = 0; i < num_units; i++) {
+		int node_id_i;
+
+		if (!psmi_hal_get_node_id(i, &node_id_i))
+		{
+			if (multirail_within_socket_used &&
+			    found && (node_id_i != node_id))
+				continue;
+		}
+
+		for (j = HFI_MIN_PORT; j <= HFI_MAX_PORT; j++) {
+			ret = psmi_hal_get_port_lid(i, j);
+			if (ret <= 0 || ret == 0xFFFF)
+				continue;
+			ret = psmi_hal_get_port_subnet(i, j, &gid_hi, NULL, NULL, NULL, NULL, NULL, NULL);
+			if (ret == -1)
+				continue;
+
+			gidh[count][0] = gid_hi;
+			gidh[count][1] = i;
+			gidh[count][2] = j;
+			count++;
+			break;
+		}
+	}
+
+/*
+ * Sort all the ports with gidh from small to big.
+ * This is for multiple fabrics, and we use fabric with the
+ * smallest gid to make the master connection.
+ */
+	qsort(gidh, count, sizeof(uint64_t) * 3, cmpfunc);
+
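+	/* e.g. if the rails span two fabrics, the rail with the numerically
+	 * smallest subnet prefix (gidh[i][0]) becomes rail 0 and carries the
+	 * master connection */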
+	for (i = 0; i < count; i++) {
+		unit[i] = (uint32_t) gidh[i][1];
+		port[i] = (uint16_t) (uint32_t) gidh[i][2];
+	}
+	*num_rails = count;
+	return PSM2_OK;
+}
+
+// this is used to find devices with the same address as another process,
+// implying intra-node comms.
+#define MAX_GID_IDX 31
+static psm2_error_t
+psmi_ep_devlids(uint32_t **lids, uint32_t *num_lids_o,
+		uint64_t my_gid_hi, uint64_t my_gid_lo, psm2_epid_t my_epid)
+{
+	uint32_t num_units;
+	int i;
+	psm2_error_t err = PSM2_OK;
+
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	if (hfi_lids == NULL) {
+		if ((err = psm2_ep_num_devunits(&num_units)))
+			goto fail;
+		hfi_lids = (uint32_t *)
+		    psmi_calloc(PSMI_EP_NONE, UNDEFINED,
+				num_units * psmi_hal_get_num_ports(), sizeof(*hfi_lids));
+		if (hfi_lids == NULL) {
+			err = psmi_handle_error(NULL, PSM2_NO_MEMORY,
+						"Couldn't allocate memory for dev_lids structure");
+			goto fail;
+		}
+
+		for (i = 0; i < num_units; i++) {
+			int j;
+			for (j = HFI_MIN_PORT; j <= HFI_MAX_PORT; j++) {
+				int lid = psmi_hal_get_port_lid(i, j);
+				int ret, idx = 0;
+				uint64_t gid_hi = 0, gid_lo = 0;
+				uint64_t actual_gid_hi = 0;
+				uint32_t ipaddr = 0;
+
+				// if looking for IB/OPA lid, skip ports we can't get lid for
+				if ((lid <= 0 || lid == 0xFFFF) && psmi_epid_version(my_epid) == PSMI_EPID_V3)
+					continue;
+				// we just need subnet and addr within subnet and idx
+				ret = psmi_hal_get_port_subnet(i, j, &gid_hi, &gid_lo, &ipaddr, NULL, &idx, &actual_gid_hi, NULL);
+				if (ret == -1)
+					continue;
+				if (my_gid_hi != gid_hi) {
+					_HFI_VDBG("LID %d, unit %d, port %d, mismatched "
+							  "GID[%d] %llx:%llx and %llx:%llx\n",
+						lid, i, j, idx,
+						(unsigned long long)gid_hi,
+						(unsigned long long)gid_lo,
+						(unsigned long long)my_gid_hi,
+						(unsigned long long)my_gid_lo);
+					continue;
+				}
+				if (actual_gid_hi != gid_hi) {
+					if (_HFI_VDBG_ON) {
+						char buf[INET_ADDRSTRLEN];
+						_HFI_VDBG("LID %d=>IPaddr %s, unit %d, port %d, matched "
+								  "GID[%d] %llx:%llx and %llx:%llx\n",
+							lid, psmi_ipv4_ntop(ipaddr, buf, sizeof(buf)), i, j, idx,
+							(unsigned long long)gid_hi,
+							(unsigned long long)gid_lo,
+							(unsigned long long)my_gid_hi,
+							(unsigned long long)my_gid_lo);
+					}
+
+					hfi_lids[nlids++] = (uint32_t) ipaddr;
+				} else {
+					_HFI_VDBG("LID %d, unit %d, port %d, matched "
+							  "GID[%d] %llx:%llx and %llx:%llx\n",
+						lid, i, j, idx,
+						(unsigned long long)gid_hi,
+						(unsigned long long)gid_lo,
+						(unsigned long long)my_gid_hi,
+						(unsigned long long)my_gid_lo);
+
+					hfi_lids[nlids++] = (uint16_t) lid;
+				}
+			}
+		}
+		if (nlids == 0) {
+			err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+						"Couldn't get lid&gid from any unit/port");
+			goto fail;
+		}
+	}
+	*lids = hfi_lids;
+	*num_lids_o = nlids;
+
+fail:
+	return err;
+}
+
+static psm2_error_t
+psmi_ep_verify_pkey(psm2_ep_t ep, uint16_t pkey, uint16_t *opkey, uint16_t* oindex)
+{
+	int i, ret;
+	psm2_error_t err;
+
+	for (i = 0; i < 16; i++) {
+// TBD - if we adjust HAL to take a hw_context for this function and
+// put the verbs_ep inside the HAL hw context, we can eliminate this ifdef
+// and simply call into HAL
+		_HFI_PRDBG("looking for pkey 0x%x\n", pkey);
+		ret = verbs_get_port_index2pkey(ep, ep->portnum, i);
+		if (ret < 0) {
+			err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+						"Can't get a valid pkey value from pkey table on %s port %u\n", ep->dev_name, ep->portnum);
+			return err;
+		}
+		// pkey == 0 means get slot 0
+		if (! pkey && ! i)
+			break;
+		if ((pkey & 0x7fff) == (uint16_t)(ret & 0x7fff)) {
+			break;
+		}
+	}
+
+	/* if pkey does not match */
+	if (i == 16) {
+		err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+					"Wrong pkey 0x%x on %s port %u, please use PSM3_PKEY to specify a valid pkey\n",
+					pkey, ep->dev_name, ep->portnum);
+		return err;
+	}
+
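+	/* bit 15 of a pkey is the full-membership bit; a limited member
+	 * (bit clear) cannot talk to other limited members, so reject
+	 * such a pkey here */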
+	if (((uint16_t)ret & 0x8000) == 0) {
+		err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+					"Limited Member pkey 0x%x on %s port %u, please use PSM3_PKEY to specify a valid pkey\n",
+					(uint16_t)ret, ep->dev_name, ep->portnum);
+		return err;
+	}
+
+	/* return the final pkey */
+	*opkey = (uint16_t)ret;
+	*oindex = (uint16_t)i;
+
+	return PSM2_OK;
+}
+
+uint64_t __psm2_epid_nid(psm2_epid_t epid)
+{
+	uint64_t rv;
+
+	PSM2_LOG_MSG("entering");
+	rv = (uint64_t) PSMI_EPID_GET_LID(epid);
+	PSM2_LOG_MSG("leaving");
+	return rv;
+}
+PSMI_API_DECL(psm2_epid_nid)
+
+
+/* Currently not exposed to users: we don't acknowledge the existence of
+ * service-level encoding within epids. This may require
+ * changing if SLs are ever exposed.
+ */
+uint64_t psmi_epid_version(psm2_epid_t epid)
+{
+	return (uint64_t) PSMI_EPID_GET_EPID_VERSION(epid);
+}
+
+uint64_t __psm2_epid_context(psm2_epid_t epid)
+{
+	uint64_t rv;
+
+	PSM2_LOG_MSG("entering");
+	rv = (uint64_t) PSMI_EPID_GET_CONTEXT(epid);
+	PSM2_LOG_MSG("leaving");
+	return rv;
+}
+PSMI_API_DECL(psm2_epid_context)
+
+uint64_t __psm2_epid_port(psm2_epid_t epid)
+{
+	uint64_t rv;
+	PSM2_LOG_MSG("entering");
+	rv = __psm2_epid_context(epid);
+	PSM2_LOG_MSG("leaving");
+	return rv;
+}
+PSMI_API_DECL(psm2_epid_port)
+
+psm2_error_t __psm2_ep_query(int *num_of_epinfo, psm2_epinfo_t *array_of_epinfo)
+{
+	psm2_error_t err = PSM2_OK;
+	int i;
+	psm2_ep_t ep;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	if (*num_of_epinfo <= 0) {
+		err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+					"Invalid psm2_ep_query parameters");
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	if (psmi_opened_endpoint == NULL) {
+		err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED,
+					"PSM Endpoint is closed or does not exist");
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	ep = psmi_opened_endpoint;
+	for (i = 0; i < *num_of_epinfo; i++) {
+		if (ep == NULL)
+			break;
+		array_of_epinfo[i].ep = ep;
+		array_of_epinfo[i].epid = ep->epid;
+		array_of_epinfo[i].jkey = ep->jkey;
+		memcpy(array_of_epinfo[i].uuid,
+		       (void *)ep->uuid, sizeof(psm2_uuid_t));
+		uuid_unparse_lower(ep->uuid, array_of_epinfo[i].uuid_str);
+		ep = ep->user_ep_next;
+	}
+	*num_of_epinfo = i;
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_ep_query)
+
+psm2_error_t __psm2_ep_epid_lookup(psm2_epid_t epid, psm2_epconn_t *epconn)
+{
+	psm2_error_t err = PSM2_OK;
+	psm2_epaddr_t epaddr;
+	psm2_ep_t ep;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	/* Need to have an opened endpoint before we can resolve epids */
+	if (psmi_opened_endpoint == NULL) {
+		err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED,
+					"PSM Endpoint is closed or does not exist");
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	ep = psmi_opened_endpoint;
+	while (ep) {
+		epaddr = psmi_epid_lookup(ep, epid);
+		if (!epaddr) {
+			ep = ep->user_ep_next;
+			continue;
+		}
+
+		/* Found connection for epid. Return info about endpoint to caller. */
+		psmi_assert_always(epaddr->ptlctl->ep == ep);
+		epconn->addr = epaddr;
+		epconn->ep = ep;
+		epconn->mq = ep->mq;
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	err = psmi_handle_error(NULL, PSM2_EPID_UNKNOWN,
+				"Endpoint connection status unknown");
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_ep_epid_lookup);
+
+psm2_error_t __psm2_ep_epid_lookup2(psm2_ep_t ep, psm2_epid_t epid, psm2_epconn_t *epconn)
+{
+	psm2_error_t err = PSM2_OK;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	/* Need to have an opened endpoint before we can resolve epids */
+	if (ep == NULL) {
+		err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED,
+					"PSM Endpoint is closed or does not exist");
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	if (epconn == NULL) {
+		err = psmi_handle_error(ep, PSM2_PARAM_ERR,
+					"Invalid output parameter");
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	psm2_epaddr_t epaddr = psmi_epid_lookup(ep, epid);
+	if (epaddr) {
+		/* Found connection for epid. Return info about endpoint to caller. */
+		psmi_assert_always(epaddr->ptlctl->ep == ep);
+		epconn->addr = epaddr;
+		epconn->ep = ep;
+		epconn->mq = ep->mq;
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	err = psmi_handle_error(ep, PSM2_EPID_UNKNOWN,
+				"Endpoint connection status unknown");
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_ep_epid_lookup2);
+
+psm2_error_t __psm2_epaddr_to_epid(psm2_epaddr_t epaddr, psm2_epid_t *epid)
+{
+	psm2_error_t err = PSM2_OK;
+	PSM2_LOG_MSG("entering");
+	if (epaddr && epid) {
+		*epid = epaddr->epid;
+	}
+	else {
+		err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+					"Invalid input epaddr or output epid parameter");
+	}
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_epaddr_to_epid);
+
+psm2_error_t
+__psm2_ep_epid_share_memory(psm2_ep_t ep, psm2_epid_t epid, int *result_o)
+{
+	int result = 0;
+	uint32_t num_lids = 0;
+	uint32_t epid_lid;
+	uint32_t *lids = NULL;
+	int i;
+	psm2_error_t err;
+
+	PSM2_LOG_MSG("entering");
+	psmi_assert_always(ep != NULL);
+	PSMI_ERR_UNLESS_INITIALIZED(ep);
+
+	if ((!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) ||
+		(psmi_epid_version(epid) == PSMI_EPID_VERSION_SHM)) {
+		/* If we are in no-hfi mode, or the other process is,
+		 * the epid doesn't help us - so assume we're both on the same
+		 * machine and try to connect.
+		 */
+		result = 1;
+	} else {
+		epid_lid = (uint32_t) psm2_epid_nid(epid);
+		err = psmi_ep_devlids(&lids, &num_lids, ep->gid_hi, ep->gid_lo, ep->epid);
+		if (err) {
+			PSM2_LOG_MSG("leaving");
+			return err;
+		}
+		for (i = 0; i < num_lids; i++) {
+			if (epid_lid == lids[i]) {
+				/* we share memory if the lid is the same. */
+				result = 1;
+				break;
+			}
+		}
+	}
+	*result_o = result;
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK;
+}
+PSMI_API_DECL(psm2_ep_epid_share_memory)
+
+psm2_error_t __psm2_ep_open_opts_get_defaults(struct psm2_ep_open_opts *opts)
+{
+	PSM2_LOG_MSG("entering");
+
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	if (!opts)
+		return PSM2_PARAM_ERR;
+
+	/* Set in order in the structure. */
+	opts->timeout = 30000000000LL;	/* 30 sec */
+	opts->unit = PSM3_NIC_ANY;
+	opts->affinity = PSM2_EP_OPEN_AFFINITY_SET;
+	opts->shm_mbytes = 0;	/* deprecated in psm2.h */
+	opts->sendbufs_num = 1024;
+	opts->network_pkey = psmi_hal_get_default_pkey();
+	opts->port = PSM3_NIC_PORT_ANY;
+	opts->outsl = PSMI_SL_DEFAULT;
+	opts->service_id = HFI_DEFAULT_SERVICE_ID;
+	opts->path_res_type = PSM2_PATH_RES_NONE;
+	opts->senddesc_num = 4096;
+	opts->imm_size = VERBS_SEND_MAX_INLINE; // PSM header size is 56
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK;
+}
+PSMI_API_DECL(psm2_ep_open_opts_get_defaults)
+
+psm2_error_t psmi_poll_noop(ptl_t *ptl, int replyonly);
+
+psm2_error_t
+__psm2_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled,
+		       struct psm2_ep_open_opts const *opts_i, psm2_mq_t mq,
+		       psm2_ep_t *epo, psm2_epid_t *epido)
+{
+	psm2_ep_t ep = NULL;
+	uint32_t num_units;
+	size_t len;
+	psm2_error_t err;
+	psm2_epaddr_t epaddr = NULL;
+	char buf[128], *p;
+	union psmi_envvar_val envvar_val;
+	size_t ptl_sizes;
+	struct psm2_ep_open_opts opts;
+	ptl_t *amsh_ptl, *ips_ptl, *self_ptl;
+	int i;
+
+	/* First get the set of default options, we overwrite with the user's
+	 * desired values afterwards */
+	if ((err = psm2_ep_open_opts_get_defaults(&opts)))
+		goto fail;
+
+	if (opts_i != NULL) {
+		if (opts_i->timeout != -1)
+			opts.timeout = opts_i->timeout;
+		if (opts_i->unit != -1)
+			opts.unit = opts_i->unit;
+		if (opts_i->affinity != -1)
+			opts.affinity = opts_i->affinity;
+
+		if (opts_i->sendbufs_num != -1)
+			opts.sendbufs_num = opts_i->sendbufs_num;
+
+		if (opts_i->network_pkey != psmi_hal_get_default_pkey())
+			opts.network_pkey = opts_i->network_pkey;
+
+		if (opts_i->port != 0)
+			opts.port = opts_i->port;
+
+		if (opts_i->outsl != -1)
+			opts.outsl = opts_i->outsl;
+
+		if (opts_i->service_id)
+			opts.service_id = (uint64_t) opts_i->service_id;
+		if (opts_i->path_res_type != PSM2_PATH_RES_NONE)
+			opts.path_res_type = opts_i->path_res_type;
+
+		if (opts_i->senddesc_num)
+			opts.senddesc_num = opts_i->senddesc_num;
+
+		if (opts_i->imm_size)
+			opts.imm_size = opts_i->imm_size;
+	}
+
+	/* Get Service ID from environment */
+	if (!psmi_getenv("PSM3_IB_SERVICE_ID",
+			 "Service ID for RV module RC QP connection establishment",
+			 PSMI_ENVVAR_LEVEL_USER,
+			 PSMI_ENVVAR_TYPE_ULONG_FLAGS, // FLAGS only affects output: hex
+			 (union psmi_envvar_val)HFI_DEFAULT_SERVICE_ID,
+			 &envvar_val)) {
+		opts.service_id = (uint64_t) envvar_val.e_ulonglong;
+	}
+
+	opts.path_res_type = PSM2_PATH_RES_NONE;
+
+	/* If a specific unit is set in the environment, use that one. */
+	// PSM3_NIC may be a unit name, number, "any" or -1
+	if (!psmi_getenv("PSM3_NIC", "Device Unit number or name (-1 or 'any' autodetects)",
+			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+			 (union psmi_envvar_val)"any", &envvar_val)) {
+		if (0 == strcasecmp(envvar_val.e_str, "any")) {
+			opts.unit = PSM3_NIC_ANY;
+		} else {
+			// convert name to a unit number since rest of APIs use number
+			opts.unit = sysfs_find_unit(envvar_val.e_str);
+			if (opts.unit < 0) {
+				err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+					"Unit unknown %s", envvar_val.e_str);
+				goto fail;
+			}
+		}
+	}
+
+	/* Get user specified port number to use. */
+	if (!psmi_getenv("PSM3_NIC_PORT", "NIC Port number (0 autodetects)",
+			 PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_LONG,
+			 (union psmi_envvar_val)PSM3_NIC_PORT_ANY,
+			 &envvar_val)) {
+		opts.port = envvar_val.e_long;
+	}
+
+	/* Get service level from environment, path-query overrides it */
+	if (!psmi_getenv
+	    ("PSM3_NIC_SL", "NIC outgoing ServiceLevel number (default 0)",
+	     PSMI_ENVVAR_LEVEL_HIDDEN,
+	     PSMI_ENVVAR_TYPE_LONG,
+	     (union psmi_envvar_val)PSMI_SL_DEFAULT, &envvar_val)) {
+		opts.outsl = envvar_val.e_long;
+	}
+
+	/* Get network key from environment. MVAPICH and other vendor MPIs do not
+	 * specify it on ep open and we may require it for vFabrics.
+	 * path-query will override it.
+	 */
+	if (!psmi_getenv("PSM3_PKEY",
+			 "PKey to use for endpoint (0=use slot 0)",
+			 PSMI_ENVVAR_LEVEL_HIDDEN,
+			 PSMI_ENVVAR_TYPE_ULONG_FLAGS,	// show in hex
+			 (union psmi_envvar_val)((unsigned int)(psmi_hal_get_default_pkey())),
+			 &envvar_val)) {
+		opts.network_pkey = (uint64_t) envvar_val.e_ulong;
+	}
+
+	/* BACKWARDS COMPATIBILITY:  Open MPI likes to choose its own PKEY of
+	   0x7FFF.  That's no longer a valid default, so override it if the
+	   client was compiled against PSM v1 */
+	if (PSMI_VERNO_GET_MAJOR(psmi_verno_client()) < 2 &&
+			opts.network_pkey == 0x7FFF) {
+		opts.network_pkey = psmi_hal_get_default_pkey();
+	}
+
+	/* Get number of default send buffers from environment */
+	if (!psmi_getenv("PSM3_NUM_SEND_BUFFERS",
+			 "Number of send buffers to allocate [1024]",
+			 PSMI_ENVVAR_LEVEL_HIDDEN,
+			 PSMI_ENVVAR_TYPE_UINT,
+			 (union psmi_envvar_val)1024, &envvar_val)) {
+		opts.sendbufs_num = envvar_val.e_uint;
+	}
+
+	/* Get immediate data size - transfers less than immediate data size do
+	 * not consume a send buffer and require just a send descriptor.
+	 */
+	if (!psmi_getenv("PSM3_SEND_IMMEDIATE_SIZE",
+			 "Immediate data send size not requiring a buffer [128]",
+			 PSMI_ENVVAR_LEVEL_HIDDEN,
+			 PSMI_ENVVAR_TYPE_UINT,
+			 (union psmi_envvar_val)128, &envvar_val)) {
+		opts.imm_size = envvar_val.e_uint;
+	}
+
+	/* Get number of send descriptors - by default this is 4 times the number
+	 * of send buffers - mainly used for short/inlined messages.
+	 */
+	if (!psmi_getenv("PSM3_NUM_SEND_DESCRIPTORS",
+			 "Number of send descriptors to allocate [4096]",
+			 PSMI_ENVVAR_LEVEL_HIDDEN,
+			 PSMI_ENVVAR_TYPE_UINT,
+			 (union psmi_envvar_val)4096, &envvar_val)) {
+		opts.senddesc_num = envvar_val.e_uint;
+	}
+	if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) {
+		if ((err = psm2_ep_num_devunits(&num_units)) != PSM2_OK)
+			goto fail;
+	} else
+		num_units = 0;
+
+	/* do some error checking */
+	if (opts.timeout < -1) {
+		err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+					"Invalid timeout value %lld",
+					(long long)opts.timeout);
+		goto fail;
+	} else if (num_units && (opts.unit < -1 || opts.unit >= (int)num_units)) {
+		err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+					"Invalid Device Unit ID %d (%d units found)",
+					opts.unit, num_units);
+		goto fail;
+	} else if ((opts.port < HFI_MIN_PORT || opts.port > HFI_MAX_PORT) &&
+				opts.port != PSM3_NIC_PORT_ANY) {
+		err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+					"Invalid Device port number %d",
+					opts.port);
+		goto fail;
+	} else if (opts.affinity < 0
+		   || opts.affinity > PSM2_EP_OPEN_AFFINITY_FORCE) {
+		err =
+		    psmi_handle_error(NULL, PSM2_PARAM_ERR,
+				      "Invalid Affinity option: %d",
+				      opts.affinity);
+		goto fail;
+	} else if (opts.outsl < PSMI_SL_MIN || opts.outsl > PSMI_SL_MAX) {
+		err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+					"Invalid SL number: %lld",
+					(unsigned long long)opts.outsl);
+		goto fail;
+	}
+
+	/* Allocate end point structure storage */
+	ptl_sizes =
+	    (psmi_device_is_enabled(devid_enabled, PTL_DEVID_SELF) ?
+	     psmi_ptl_self.sizeof_ptl() : 0) +
+	    (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS) ?
+	     psmi_ptl_ips.sizeof_ptl() : 0) +
+	    (psmi_device_is_enabled(devid_enabled, PTL_DEVID_AMSH) ?
+	     psmi_ptl_amsh.sizeof_ptl() : 0);
+	if (ptl_sizes == 0)
+		return PSM2_EP_NO_DEVICE;
+
+	ep = (psm2_ep_t) psmi_memalign(PSMI_EP_NONE, UNDEFINED, 64,
+				      sizeof(struct psm2_ep) + ptl_sizes);
+	epaddr = (psm2_epaddr_t) psmi_calloc(PSMI_EP_NONE, PER_PEER_ENDPOINT,
+					    1, sizeof(struct psm2_epaddr));
+	if (ep == NULL || epaddr == NULL) {
+		err = psmi_handle_error(NULL, PSM2_NO_MEMORY,
+					"Couldn't allocate memory for %s structure",
+					ep == NULL ? "psm2_ep" : "psm2_epaddr");
+		goto fail;
+	}
+	memset(ep, 0, sizeof(struct psm2_ep) + ptl_sizes);
+
+	/* Copy PTL enabled status */
+	for (i = 0; i < PTL_MAX_INIT; i++)
+		ep->devid_enabled[i] = devid_enabled[i];
+
+	/* Matched Queue initialization.  We do this early because we have to
+	 * make sure ep->mq exists and is valid before calling ips_do_work.
+	 */
+	ep->mq = mq;
+
+	/* Get ready for PTL initialization */
+	memcpy(&ep->uuid, (void *)unique_job_key, sizeof(psm2_uuid_t));
+	ep->epaddr = epaddr;
+	ep->memmode = mq->memmode;
+	ep->hfi_num_sendbufs = opts.sendbufs_num;
+	ep->service_id = opts.service_id;
+	ep->path_res_type = opts.path_res_type;
+	ep->hfi_num_descriptors = opts.senddesc_num;
+	ep->hfi_imm_size = opts.imm_size;
+	ep->errh = psmi_errhandler_global;	/* by default use the global one */
+	ep->ptl_amsh.ep_poll = psmi_poll_noop;
+	ep->ptl_ips.ep_poll = psmi_poll_noop;
+	ep->connections = 0;
+	ep->rdmamode = psmi_parse_rdmamode();	// PSM3_RDMA
+	/* MR cache mode */
+	// we need this early when creating the verbs_ep since it may affect
+	// if we open rv module.
+	// The value returned is a MR_CACHE_MODE_* selection
+	{
+		union psmi_envvar_val env_mr_cache_mode;
+		if (! (ep->rdmamode & IPS_PROTOEXP_FLAG_ENABLED)
+#ifdef PSM_CUDA
+			&& (PSMI_IS_CUDA_DISABLED || ! psmi_parse_gpudirect())
+#endif
+			&& ! psmi_parse_senddma()) {
+			env_mr_cache_mode.e_uint = MR_CACHE_MODE_NONE;
+		} else if (IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode)) {
+			// RDMA enabled in kernel mode.  Must use rv MR cache
+			env_mr_cache_mode.e_uint = MR_CACHE_MODE_RV;
+#ifdef PSM_CUDA
+#ifdef RNDV_MOD
+		} else if (PSMI_IS_CUDA_ENABLED && psmi_parse_gpudirect()) {
+			// GPU Direct (RDMA, send DMA and/or gdrcopy) must
+			// use kernel MR cache in RV
+			env_mr_cache_mode.e_uint = MR_CACHE_MODE_KERNEL;
+#endif
+#endif
+		} else {
+			/* Behavior of user space MR Cache
+			 * when 0, we merely share MRs for concurrently used buffers
+			 */
+			// mode 2 (user space MR w/cache) is purposely not documented
+			psmi_getenv("PSM3_MR_CACHE_MODE",
+					"Enable MR caching 0=user space MR no cache"
+#ifdef RNDV_MOD
+					", 1=kernel MR w/cache [1]",
+#else
+					"[0]",
+#endif
+					PSMI_ENVVAR_LEVEL_USER,
+					PSMI_ENVVAR_TYPE_UINT,
+#ifdef RNDV_MOD
+					(union psmi_envvar_val)MR_CACHE_MODE_KERNEL,
+#else
+					(union psmi_envvar_val)MR_CACHE_MODE_NONE,
+#endif
+					 &env_mr_cache_mode);
+			if (! MR_CACHE_MODE_VALID(env_mr_cache_mode.e_uint)
+				|| env_mr_cache_mode.e_uint == MR_CACHE_MODE_RV)
+				env_mr_cache_mode.e_uint = MR_CACHE_MODE_NONE;
+		}
+#ifndef RNDV_MOD
+		if (env_mr_cache_mode.e_uint == MR_CACHE_MODE_KERNEL) {
+			static int logged = 0;
+			if (! logged) {
+				_HFI_INFO("WARNING: PSM built without rv module enabled, kernel MR caching unavailable\n");
+				logged = 1;
+			}
+			env_mr_cache_mode.e_uint = MR_CACHE_MODE_NONE;
+		}
+#endif
+		ep->mr_cache_mode = env_mr_cache_mode.e_uint;
+	}
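+	/* Summary of the selection above (a sketch of this file's decision
+	 * logic, not an authoritative spec): no RDMA, no GPU Direct and no
+	 * send DMA selects MR_CACHE_MODE_NONE; kernel RC QPs force
+	 * MR_CACHE_MODE_RV; CUDA GPU Direct with RNDV_MOD selects
+	 * MR_CACHE_MODE_KERNEL; otherwise PSM3_MR_CACHE_MODE decides, with
+	 * invalid or RV values coerced to MR_CACHE_MODE_NONE (and KERNEL
+	 * coerced to NONE when the rv module is not built in). */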
+
+	/* See how many iterations we want to spin before yielding */
+	psmi_getenv("PSM3_YIELD_SPIN_COUNT",
+		    "Spin poll iterations before yield",
+		    PSMI_ENVVAR_LEVEL_HIDDEN,
+		    PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD,
+		    &envvar_val);
+	ep->yield_spin_cnt = envvar_val.e_uint;
+
+	/* Set skip_affinity flag if PSM is not allowed to set affinity */
+	if (opts.affinity == PSM2_EP_OPEN_AFFINITY_SKIP)
+		ep->skip_affinity = true;
+
+	ptl_sizes = 0;
+	amsh_ptl = ips_ptl = self_ptl = NULL;
+	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+		amsh_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes);
+		ptl_sizes += psmi_ptl_amsh.sizeof_ptl();
+	}
+	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+		ips_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes);
+		ptl_sizes += psmi_ptl_ips.sizeof_ptl();
+	}
+	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) {
+		self_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes);
+		ptl_sizes += psmi_ptl_self.sizeof_ptl();
+	}
+
+	/* Get number of send WQEs
+	 */
+	psmi_getenv("PSM3_NUM_SEND_WQES",
+			"Number of send WQEs to allocate [4080]",
+			PSMI_ENVVAR_LEVEL_USER,
+			PSMI_ENVVAR_TYPE_UINT,
+			(union psmi_envvar_val)VERBS_SEND_QP_ENTRIES, &envvar_val);
+	ep->hfi_num_send_wqes = envvar_val.e_uint;
+
+	psmi_getenv("PSM3_SEND_REAP_THRESH",
+			"Number of outstanding send WQEs before reap CQEs [256]",
+			PSMI_ENVVAR_LEVEL_USER,
+			PSMI_ENVVAR_TYPE_UINT,
+			(union psmi_envvar_val)VERBS_SEND_CQ_REAP, &envvar_val);
+	ep->hfi_send_reap_thresh = envvar_val.e_uint;
+
+	psmi_getenv("PSM3_NUM_SEND_RDMA",
+			"Number of user space send RDMA to allow [128]",
+			PSMI_ENVVAR_LEVEL_USER,
+			PSMI_ENVVAR_TYPE_UINT,
+			(union psmi_envvar_val)VERBS_NUM_SEND_RDMA, &envvar_val);
+	ep->hfi_num_send_rdma = envvar_val.e_uint;
+
+	/* Get number of recv WQEs
+	 */
+	psmi_getenv("PSM3_NUM_RECV_WQES",
+			"Number of recv WQEs to allocate [4095]",
+			PSMI_ENVVAR_LEVEL_USER,
+			PSMI_ENVVAR_TYPE_UINT,
+			(union psmi_envvar_val)VERBS_RECV_QP_ENTRIES, &envvar_val);
+	ep->hfi_num_recv_wqes = envvar_val.e_uint;
+
+	/* Get number of recv CQEs
+	 */
+	psmi_getenv("PSM3_NUM_RECV_CQES",
+			"Number of recv CQEs to allocate\n"
+			"(0 will calculate as PSM3_NUM_RECV_WQES+1032 for PSM3_RDMA=0-2\n"
+			"and 4000 more than that for PSM3_RDMA=3]) [0]",
+			PSMI_ENVVAR_LEVEL_USER,
+			PSMI_ENVVAR_TYPE_UINT,
+			(union psmi_envvar_val)0, &envvar_val);
+	ep->hfi_num_recv_cqes = envvar_val.e_uint;
+
+	/* Get RC QP timeout and retry
+	 */
+	psmi_getenv("PSM3_QP_TIMEOUT",
+			"Number of microseconds for RC QP timeouts [536870]",
+			PSMI_ENVVAR_LEVEL_USER,
+			PSMI_ENVVAR_TYPE_ULONG,
+			(union psmi_envvar_val)VERBS_QP_TIMEOUT, &envvar_val);
+	ep->hfi_qp_timeout = timeout_usec_to_mult(envvar_val.e_ulong);
+
+	psmi_getenv("PSM3_QP_RETRY",
+			"Limit on retries after RC QP timeout or RNR [7]",
+			PSMI_ENVVAR_LEVEL_USER,
+			PSMI_ENVVAR_TYPE_UINT,
+			(union psmi_envvar_val)VERBS_QP_RETRY, &envvar_val);
+	ep->hfi_qp_retry = (envvar_val.e_uint <= VERBS_QP_MAX_RETRY)?
+								envvar_val.e_uint:VERBS_QP_MAX_RETRY;
+	/* Size of RV Cache - only used for MR_CACHE_MODE_RV or KERNEL,
+	 * otherwise ignored
+	 */
+	// RV defaults are sufficient for default PSM parameters
+	// but if user adjusts ep->hfi_num_send_rdma or mq->hfi_base_window_rv
+	// they also need to increase the cache size.  psm2_verbs_alloc_mr_cache
+	// will verify cache size is sufficient.
+	// min size is (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) *
+	// chunk size (mq->hfi_base_window_rv after psmi_mq_initialize_defaults)
+	// for OPA native, actual window_rv may be smaller, but for UD it
+	// is not reduced
+	psmi_getenv("PSM3_RV_MR_CACHE_SIZE",
+			"kernel space MR cache size"
+			" (MBs, 0 lets rv module decide) [0]",
+			PSMI_ENVVAR_LEVEL_USER,
+			PSMI_ENVVAR_TYPE_UINT,
+			(union psmi_envvar_val)0, &envvar_val);
+	ep->rv_mr_cache_size = envvar_val.e_uint;
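+	/* A hedged sketch of the sizing rule described above, using names from
+	 * this file (the MB conversion is an assumption, not upstream API):
+	 *
+	 *   uint64_t min_bytes = (uint64_t)(HFI_TF_NFLOWS + ep->hfi_num_send_rdma)
+	 *                        * mq->hfi_base_window_rv;
+	 *   // PSM3_RV_MR_CACHE_SIZE (MBs) should be >= min_bytes >> 20
+	 *
+	 * psm2_verbs_alloc_mr_cache is expected to verify the actual bound. */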
+
+#ifdef PSM_CUDA
+	/* Size of RV GPU Cache - only used for PSM3_CUDA=1 MR_CACHE_MODE_KERNEL,
+	 * otherwise ignored
+	 */
+	// RV defaults are sufficient for default PSM parameters
+	// but if user adjusts ep->hfi_num_send_rdma or mq->hfi_base_window_rv
+	// they also need to increase the cache size.  psm2_verbs_alloc_mr_cache
+	// will verify cache size is sufficient.
+	// min size is (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) *
+	// chunk size (mq->hfi_base_window_rv after psmi_mq_initialize_defaults)
+	// for OPA native, actual window_rv may be smaller, but for UD it
+	// is not reduced
+	if (PSMI_IS_CUDA_ENABLED) {
+		psmi_getenv("PSM3_RV_GPU_CACHE_SIZE",
+				"kernel space GPU cache size"
+				" (MBs, 0 lets rv module decide) [0]",
+				PSMI_ENVVAR_LEVEL_USER,
+				PSMI_ENVVAR_TYPE_UINT,
+				(union psmi_envvar_val)0, &envvar_val);
+		ep->rv_gpu_cache_size = envvar_val.e_uint;
+	} else {
+		ep->rv_gpu_cache_size = 0;
+	}
+#endif
+
+	psmi_getenv("PSM3_RV_QP_PER_CONN",
+			"Number of sets of RC QPs per RV connection (0 lets rv module decide) [0]",
+			PSMI_ENVVAR_LEVEL_USER,
+			PSMI_ENVVAR_TYPE_UINT,
+			(union psmi_envvar_val)0, &envvar_val);
+	ep->rv_num_conn = envvar_val.e_uint;
+
+	psmi_getenv("PSM3_RV_Q_DEPTH",
+			"Size of QPs and CQs per RV QP (0 lets rv module decide) [0]",
+			PSMI_ENVVAR_LEVEL_USER,
+			PSMI_ENVVAR_TYPE_UINT,
+			(union psmi_envvar_val)0, &envvar_val);
+	ep->rv_q_depth = envvar_val.e_uint;
+
+	psmi_getenv("PSM3_RV_RECONNECT_TIMEOUT",
+			"RV End-point minimum re-connection timeout in seconds. 0 for no connection recovery [30]",
+			PSMI_ENVVAR_LEVEL_USER,
+			PSMI_ENVVAR_TYPE_UINT,
+			(union psmi_envvar_val)30, &envvar_val);
+	ep->rv_reconnect_timeout = envvar_val.e_uint;
+
+	psmi_getenv("PSM3_RV_HEARTBEAT_INTERVAL",
+			"RV End-point heartbeat interval in milliseconds. 0 for no heartbeat [1000]",
+			PSMI_ENVVAR_LEVEL_USER,
+			PSMI_ENVVAR_TYPE_UINT,
+			(union psmi_envvar_val)1000, &envvar_val);
+	ep->rv_hb_interval = envvar_val.e_uint;
+
+	// HFI Interface.
+	if ((err = psmi_ep_open_device(ep, &opts, unique_job_key,
+				       &(ep->context), &ep->epid)))
+		goto fail;
+
+	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+		_HFI_PRDBG("my QPN=%u (0x%x)  EPID=0x%"PRIx64" %s\n",
+			ep->verbs_ep.qp->qp_num, ep->verbs_ep.qp->qp_num, (uint64_t)ep->epid,
+			psmi_epaddr_fmt_addr(ep->epid));
+	}
+	psmi_assert_always(ep->epid != 0);
+	ep->epaddr->epid = ep->epid;
+
+	_HFI_VDBG("psmi_ep_open_device() passed\n");
+
+	/* Set our new label as soon as we know what it is */
+	strncpy(buf, psmi_gethostname(), sizeof(buf) - 1);
+	buf[sizeof(buf) - 1] = '\0';
+
+	p = buf + strlen(buf);
+
+	/* If our rank is set, use it (same as mylabel). If not, use context.
+	 * Result: hostname.rank# or hostname.# (context), or hostname.pid#
+	 */
+	if (hfi_get_myrank() >= 0)
+		len = snprintf(p, sizeof(buf) - strlen(buf), ":rank%d.", hfi_get_myrank());
+	else
+		len = snprintf(p, sizeof(buf) - strlen(buf), ":"PSMI_EPID_CONTEXT_FMT".",
+				PSMI_EPID_GET_CONTEXT_VAL(ep->epid));
+	*(p + len) = '\0';
+	ep->context_mylabel = psmi_strdup(ep, buf);
+	if (ep->context_mylabel == NULL) {
+		err = PSM2_NO_MEMORY;
+		goto fail;
+	}
+	/* hfi_set_mylabel(ep->context_mylabel); */
+
+	if ((err = psmi_epid_set_hostname(psm2_epid_nid(ep->epid), buf, 0)))
+		goto fail;
+
+	if (! mq->ep)	// only call on 1st EP within MQ
+		psmi_mq_initstats(mq, ep->epid);
+
+#ifdef PSM_CUDA
+	if (PSMI_IS_CUDA_ENABLED)
+		verify_device_support_unified_addr();
+#endif
+
+	_HFI_VDBG("start ptl device init...\n");
+	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) {
+		if ((err = psmi_ptl_self.init(ep, self_ptl, &ep->ptl_self)))
+			goto fail;
+	}
+	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+		if ((err = psmi_ptl_ips.init(ep, ips_ptl, &ep->ptl_ips)))
+			goto fail;
+	}
+	/* If we're shm-only, this device is enabled above */
+	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+		if ((err = psmi_ptl_amsh.init(ep, amsh_ptl, &ep->ptl_amsh)))
+			goto fail;
+	} else {
+		/* We may have pre-attached as part of getting our rank for enabling
+		 * shared contexts.  */
+	}
+
+	_HFI_VDBG("finish ptl device init...\n");
+
+	/*
+	 * Keep only IPS, since only IPS supports multi-rail; other devices
+	 * are set up only once. The IPS device can enter this function again.
+	 */
+	for (i = 0; i < PTL_MAX_INIT; i++) {
+		if (devid_enabled[i] != PTL_DEVID_IPS) {
+			devid_enabled[i] = -1;
+		}
+	}
+
+	*epido = ep->epid;
+	*epo = ep;
+
+	return PSM2_OK;
+
+fail:
+	if (ep != NULL) {
+		psmi_hal_close_context(&ep->context.psm_hw_ctxt);
+		psmi_free(ep);
+	}
+	if (epaddr != NULL)
+		psmi_free(epaddr);
+	return err;
+}
+
+psm2_error_t
+__psm2_ep_open(psm2_uuid_t const unique_job_key,
+	      struct psm2_ep_open_opts const *opts_i, psm2_ep_t *epo,
+	      psm2_epid_t *epido)
+{
+	psm2_error_t err;
+	psm2_mq_t mq;
+	psm2_epid_t epid;
+	psm2_ep_t ep, tmp;
+	uint32_t units[PSMI_MAX_QPS];
+	uint16_t ports[PSMI_MAX_QPS];
+	int i, num_rails = 0;
+	char *uname = "PSM3_NIC";
+	char *pname = "PSM3_NIC_PORT";
+	char uvalue[6], pvalue[6];
+	int devid_enabled[PTL_MAX_INIT];
+	union psmi_envvar_val devs;
+	int show_nics = 0;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	if (!epo || !epido)
+		return PSM2_PARAM_ERR;
+
+	/* Allowing only one EP (unless explicitly enabled). */
+	if (psmi_opened_endpoint_count > 0 && !psmi_multi_ep_enabled) {
+		PSM2_LOG_MSG("leaving");
+		return PSM2_TOO_MANY_ENDPOINTS;
+	}
+
+	/* Matched Queue initialization.  We do this early because we have to
+	 * make sure ep->mq exists and is valid before calling ips_do_work.
+	 */
+	err = psmi_mq_malloc(&mq);
+	PSMI_LOCK(psmi_creation_lock);
+	if (err != PSM2_OK)
+		goto fail;
+
+	/* Set some of the MQ thresholds from the environment.
+	   Do this before ptl initialization - the ptl may have other
+	   constraints that will limit the MQ's settings. */
+	err = psmi_mq_initialize_defaults(mq);
+	if (err != PSM2_OK)
+		goto fail;
+
+	psmi_init_lock(&(mq->progress_lock));
+
+	/* See which ptl devices we want to use for this ep to be opened */
+	psmi_getenv("PSM3_DEVICES",
+		    "Ordered list of PSM-level devices",
+		    PSMI_ENVVAR_LEVEL_USER,
+		    PSMI_ENVVAR_TYPE_STR,
+		    (union psmi_envvar_val)PSMI_DEVICES_DEFAULT, &devs);
+
+	if ((err = psmi_parse_devices(devid_enabled, devs.e_str)))
+		goto fail;
+
+	if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) {
+		show_nics = psmi_parse_identify();
+		err = psmi_ep_multirail(&num_rails, units, ports);
+		if (err != PSM2_OK)
+			goto fail;
+
+		/* If multi-rail is used, set the first ep unit/port */
+		if (num_rails > 0) {
+			snprintf(uvalue, 6, "%1d", units[0]);
+			snprintf(pvalue, 6, "%1d", ports[0]);
+			setenv(uname, uvalue, 1);
+			setenv(pname, pvalue, 1);
+		}
+	}
+#ifdef PSM_CUDA
+	else {
+		// only IPS opens RV, needed for gdrcopy
+		is_gdr_copy_enabled = gdr_copy_limit_send =
+			gdr_copy_limit_recv = 0;
+	}
+	if (PSMI_IS_GDR_COPY_ENABLED)
+		hfi_gdr_open();
+#endif
+
+	err = __psm2_ep_open_internal(unique_job_key,
+				     devid_enabled, opts_i, mq, &ep, &epid);
+	if (err != PSM2_OK)
+		goto fail;
+
+	if (psmi_opened_endpoint == NULL) {
+		psmi_opened_endpoint = ep;
+	} else {
+		tmp = psmi_opened_endpoint;
+		while (tmp->user_ep_next)
+			tmp = tmp->user_ep_next;
+		tmp->user_ep_next = ep;
+	}
+	psmi_opened_endpoint_count++;
+	ep->mctxt_prev = ep->mctxt_next = ep;
+	ep->mctxt_master = ep;
+	mq->ep = ep;
+
+	if (show_nics) {
+		int node_id;
+		psmi_hal_get_node_id(ep->unit_id, &node_id);
+		printf("%s %s NIC %u (%s) Port %u NUMA %d\n",
+			hfi_get_mylabel(), hfi_ident_tag,
+			ep->unit_id,  ep->dev_name,
+			ep->portnum, node_id);
+	}
+
+	/* Active Message initialization */
+	err = psmi_am_init_internal(ep);
+	if (err != PSM2_OK)
+		goto fail;
+
+	*epo = ep;
+	*epido = epid;
+	psmi_hal_context_initstats(ep);
+
+	if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) {
+		int j;
+		union psmi_envvar_val envvar_val;
+
+		if (num_rails <= 0) {
+			// the NIC has now been selected for our process
+			// use the same NIC for any additional QPs below
+			num_rails = 1;
+			units[0] = ep->unit_id;
+			ports[0] = ep->portnum;
+		}
+		// When QP_PER_NIC > 1, we create more than one QP on each NIC and
+		// use the multi-rail algorithms to spread traffic across the QPs.
+		// This helps get better BW when there are relatively few processes
+		// per node. Care must be taken when combining this with user space
+		// RC QPs, as scalability (memory footprint) issues can be
+		// multiplied. This approach duplicates some per-NIC resources
+		// (CQs, etc.), but it is simple.
+		psmi_getenv("PSM3_QP_PER_NIC",
+			"Number of sets of QPs to open per NIC [1]",
+			PSMI_ENVVAR_LEVEL_USER,
+			PSMI_ENVVAR_TYPE_UINT,
+			(union psmi_envvar_val)1, &envvar_val);
+
+		if ((num_rails * envvar_val.e_uint) > PSMI_MAX_QPS) {
+			err = psmi_handle_error(NULL, PSM2_TOO_MANY_ENDPOINTS,
+				"PSM3_QP_PER_NIC (%u) * num_rails (%d) > Max Support QPs (%u)",
+				envvar_val.e_uint, num_rails, PSMI_MAX_QPS);
+			goto fail;
+		}
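+		/* Worked example (hypothetical values): with num_rails = 2 and
+		 * PSM3_QP_PER_NIC = 2, 2 * 2 = 4 QP sets are opened in total;
+		 * the check above rejects the request only if that product
+		 * exceeds PSMI_MAX_QPS. */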
+
+		for (j= 0; j< envvar_val.e_uint; j++) {
+			for (i = 0; i < num_rails; i++) {
+				_HFI_VDBG("rail %d unit %u port %u\n", i, units[i], ports[i]);
+				// did 0, 0 already above
+				if (i == 0 && j== 0)
+					continue;
+				snprintf(uvalue, 6, "%1d", units[i]);
+				snprintf(pvalue, 6, "%1d", ports[i]);
+				setenv(uname, uvalue, 1);
+				setenv(pname, pvalue, 1);
+
+				/* Create slave EP */
+				err = __psm2_ep_open_internal(unique_job_key,
+						     devid_enabled, opts_i, mq,
+						     &tmp, &epid);
+				if (err)
+					goto fail;
+
+				/* Point back to shared resources on the master EP */
+				tmp->am_htable = ep->am_htable;
+
+				/* Link slave EP after master EP. */
+				PSM_MCTXT_APPEND(ep, tmp);
+				if (j == 0 && show_nics) {
+					int node_id;
+					psmi_hal_get_node_id(tmp->unit_id, &node_id);
+					printf("%s %s NIC %u (%s) Port %u NUMA %d\n",
+						hfi_get_mylabel(), hfi_ident_tag,
+						tmp->unit_id,  tmp->dev_name,
+						tmp->portnum, node_id);
+				}
+				psmi_hal_context_initstats(tmp);
+			}
+		}
+	}
+
+	_HFI_VDBG("psm2_ep_open() OK....\n");
+
+fail:
+	fflush(stdout);
+	PSMI_UNLOCK(psmi_creation_lock);
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_ep_open)
+
+psm2_error_t __psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in)
+{
+	psm2_error_t err = PSM2_OK;
+
+	psmi_stats_ep_close();	// allow output of stats on 1st ep close if desired
+
+#if _HFI_DEBUGGING
+	uint64_t t_start = 0;
+	if (_HFI_PRDBG_ON) {
+		t_start = get_cycles();
+	}
+#endif
+
+#ifdef PSM_CUDA
+	/*
+	 * The close on the gdr fd needs to be called before the
+	 * close on the hfi fd, as the gdr device holds a
+	 * reference count on the hfi device, which would make the close
+	 * on the hfi fd return without actually closing the fd.
+	 */
+	if (PSMI_IS_GDR_COPY_ENABLED)
+		hfi_gdr_close();
+#endif
+	union psmi_envvar_val timeout_intval;
+	psm2_ep_t tmp;
+	psm2_mq_t mmq;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ERR_UNLESS_INITIALIZED(ep);
+	psmi_assert_always(ep->mctxt_master == ep);
+
+	PSMI_LOCK(psmi_creation_lock);
+
+	psmi_am_fini_internal(ep);
+
+	if (psmi_opened_endpoint == NULL) {
+		err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED,
+					"PSM Endpoint is closed or does not exist");
+		PSM2_LOG_MSG("leaving");
+		PSMI_UNLOCK(psmi_creation_lock);
+		return err;
+	}
+
+	tmp = psmi_opened_endpoint;
+	while (tmp && tmp != ep) {
+		tmp = tmp->user_ep_next;
+	}
+	if (!tmp) {
+		err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED,
+					"PSM Endpoint is closed or does not exist");
+		PSM2_LOG_MSG("leaving");
+		PSMI_UNLOCK(psmi_creation_lock);
+		return err;
+	}
+
+	psmi_getenv("PSM3_CLOSE_TIMEOUT",
+		    "End-point close timeout over-ride.",
+		    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)0, &timeout_intval);
+
+	if (getenv("PSM3_CLOSE_TIMEOUT")) {
+		timeout_in = timeout_intval.e_uint * SEC_ULL;
+	} else if (timeout_in > 0) {
+		/* The timeout parameter provides the minimum timeout. A heuristic
+		 * is used to scale up the timeout linearly with the number of
+		 * endpoints, and we allow one second per 100 endpoints. */
+		timeout_in = max(timeout_in, (ep->connections * SEC_ULL) / 100);
+	}
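+	/* Worked example (hypothetical count): with 450 connections the
+	 * heuristic floor above is 450 * SEC_ULL / 100 = 4.5 s, so a smaller
+	 * timeout_in would be raised to that value. */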
+
+	if (timeout_in > 0 && timeout_in < PSMI_MIN_EP_CLOSE_TIMEOUT)
+		timeout_in = PSMI_MIN_EP_CLOSE_TIMEOUT;
+
+	/* Infinite and excessive close time-outs are limited here to a maximum.
+	 * The "rationale" is that there is no point waiting around forever for
+	 * graceful termination. Normal (or forced) process termination should clean
+	 * up the context state correctly even if termination is not graceful. */
+	if (timeout_in <= 0 || timeout_in > PSMI_MAX_EP_CLOSE_TIMEOUT)
+		timeout_in = PSMI_MAX_EP_CLOSE_TIMEOUT;
+	_HFI_PRDBG("Closing endpoint %p with force=%s and to=%.2f seconds and "
+		   "%d connections\n",
+		   ep, mode == PSM2_EP_CLOSE_FORCE ? "YES" : "NO",
+		   (double)timeout_in / 1e9, (int)ep->connections);
+
+	/* XXX We currently cheat in the sense that we give each PTL the full
+	 * allowed timeout.  There's no good way to do better until we change
+	 * the PTL interface to allow asynchronous finalization
+	 */
+
+
+	/* Check whether ownership of the receive thread must be transferred
+	 * before closing the ep. With PSM3_MULTI_EP support, the receive
+	 * thread is created and assigned to the first opened endpoint, and it
+	 * is killed when that endpoint is closed.
+	 */
+	if (ep->user_ep_next != NULL) {
+		/* The receive thread is transferred to ep->user_ep_next only if
+		 * the currently running receive thread (which will be killed) is
+		 * assigned to ep and none is assigned to ep->user_ep_next.
+		 */
+		if ((psmi_ptl_ips_rcvthread.is_enabled(ep->ptl_ips.ptl)) &&
+		    (!psmi_ptl_ips_rcvthread.is_enabled(ep->user_ep_next->ptl_ips.ptl)))
+			psmi_ptl_ips_rcvthread.transfer_ownership(ep->ptl_ips.ptl, ep->user_ep_next->ptl_ips.ptl);
+	}
+
+	/*
+	 * Before freeing the master ep itself,
+	 * remove it from the global linked list.
+	 * We do it here to let the atexit handler in the ptl_am directory
+	 * search the global linked list and free the shared memory file.
+	 */
+	if (psmi_opened_endpoint == ep) {
+		/* Removing ep from global endpoint list. */
+		psmi_opened_endpoint = ep->user_ep_next;
+	} else {
+		tmp = psmi_opened_endpoint;
+		while (tmp->user_ep_next != ep) {
+			tmp = tmp->user_ep_next;
+		}
+		/* Removing ep from global endpoint list. */
+		tmp->user_ep_next = ep->user_ep_next;
+	}
+	psmi_opened_endpoint_count--;
+
+	/*
+	 * This do/while loop closes endpoints and frees their memory.
+	 *
+	 * If the MULTIRAIL feature is disabled, this loop executes only once
+	 * and only the endpoint passed to psm2_ep_close is closed/removed.
+	 *
+	 * If the MULTIRAIL feature is enabled, this loop executes multiple
+	 * times (depending on the number of rails). The order in which
+	 * endpoints are closed is shown below:
+	 *
+	 *                      |--this is master endpoint in case of multirail
+	 *	                |  this endpoint is passed to psm2_ep_close and
+	 *			V  this is only endpoint known to user.
+	 *   +<-Ep0<-Ep1<-Ep2<-Ep3
+	 *   |__________________|	Ep3->mctxt_prev points to Ep2
+	 *	(3)  (2)  (1)  (4)	Ep2->mctxt_prev points to Ep1
+	 *	 ^			Ep1->mctxt_prev points to Ep0
+	 *	 |			Ep0->mctxt_prev points to Ep3 (master ep)
+	 *	 |
+	 *       |---- order in which endpoints will be closed.
+	 *
+	 * Closing MULTIRAIL endpoints starts with the slaves (Ep2, Ep1, Ep0).
+	 * If MULTIRAIL is enabled, Ep3->mctxt_prev points to Ep2; if the
+	 * feature is disabled, Ep3->mctxt_prev points to Ep3 and the
+	 * do/while loop makes a single pass.
+	 *
+	 * With MULTIRAIL enabled, Ep3, the master endpoint, is closed last.
+	 */
+	mmq = ep->mq;
+	if (mmq) {
+		// in case mq_finalize not called, need to get stats out
+		// it will be a noop if called twice
+		psm2_mq_finalize(mmq);
+	}
+	tmp = ep->mctxt_prev;
+	do {
+		ep = tmp;
+		tmp = ep->mctxt_prev;
+
+		PSMI_LOCK(ep->mq->progress_lock);
+
+		if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH))
+			err =
+			    psmi_ptl_amsh.fini(ep->ptl_amsh.ptl, mode,
+					       timeout_in);
+
+		if ((err == PSM2_OK || err == PSM2_TIMEOUT) &&
+		    psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS))
+			err =
+			    psmi_ptl_ips.fini(ep->ptl_ips.ptl, mode,
+					      timeout_in);
+		PSM_MCTXT_REMOVE(ep);
+		/* Even if there are timeouts in the disconnect requests,
+		 * make sure that we still get to close the
+		 * endpoint and mark it closed */
+		if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS))
+			psmi_context_close(&ep->context);
+
+		psmi_epid_remove_all(ep);
+		psmi_free(ep->epaddr);
+		psmi_free(ep->context_mylabel);
+
+		PSMI_UNLOCK(ep->mq->progress_lock);
+
+		ep->mq = NULL;
+		__psm2_ep_free_verbs(ep);
+
+		psmi_free(ep);
+
+	} while ((err == PSM2_OK || err == PSM2_TIMEOUT) && tmp != ep);
+
+	if (mmq) {
+		psmi_destroy_lock(&(mmq->progress_lock));
+		err = psmi_mq_free(mmq);
+	}
+
+	if (hfi_lids)
+	{
+		psmi_free(hfi_lids);
+		hfi_lids = NULL;
+		nlids = 0;
+	}
+
+	PSMI_UNLOCK(psmi_creation_lock);
+
+	if (_HFI_PRDBG_ON) {
+		_HFI_PRDBG_ALWAYS("Closed endpoint in %.3f secs\n",
+				 (double)cycles_to_nanosecs(get_cycles() -
+				 t_start) / SEC_ULL);
+	}
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_ep_close)
+
+static
+psm2_error_t
+psmi_ep_open_device(const psm2_ep_t ep,
+		    const struct psm2_ep_open_opts *opts,
+		    const psm2_uuid_t unique_job_key,
+		    struct psmi_context *context, psm2_epid_t *epid)
+{
+	psm2_error_t err = PSM2_OK;
+
+	/* Skip affinity.  No affinity if:
+	 * 1. User explicitly sets no-affinity=YES in environment.
+	 * 2. User doesn't set affinity in environment and PSM is opened with
+	 *    option affinity skip.
+	 */
+	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+		union psmi_envvar_val env_rcvthread;
+		static int norcvthread;	/* only for first rail */
+
+		ep->out_sl = opts->outsl;
+
+		if ((err =
+		     psmi_context_open(ep, opts->unit, opts->port,
+				       unique_job_key, opts->timeout,
+				       context)) != PSM2_OK)
+			goto fail;
+
+		_HFI_DBG("[%d]use unit %d port %d\n", getpid(),
+			 ep->unit_id, 1);
+
+		/* At this point, we have the unit id and port number, so
+		 * check that the pkey is not 0x0/0x7fff/0xffff and matches one
+		 * of the pkeys in the table.
+		 */
+		if ((err =
+		     psmi_ep_verify_pkey(ep, (uint16_t) opts->network_pkey,
+					 &ep->network_pkey, &ep->network_pkey_index)) != PSM2_OK)
+			goto fail;
+
+		/* See if we want to activate support for receive thread */
+		psmi_getenv("PSM3_RCVTHREAD",
+			    "Enable Recv thread (0 disables thread)",
+			    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+				// default to 0 for all but 1st rail
+			    (union psmi_envvar_val)(norcvthread++ ? 0 :
+						    PSMI_RCVTHREAD_FLAGS),
+			    &env_rcvthread);
+
+		/* If enabled, use the polling capability to implement a receive
+		 * interrupt thread that can handle urg packets */
+		if (env_rcvthread.e_uint) {
+			psmi_hal_add_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD);
+#ifdef PSMI_PLOCK_IS_NOLOCK
+			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+					  "#define PSMI_PLOCK_IS_NOLOCK not functional yet "
+					  "with RCVTHREAD on");
+#endif
+		}
+
+		*epid = context->epid;
+	} else if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+		*epid = PSMI_EPID_PACK_SHM(getpid(),
+								PSMI_EPID_SHM_ONLY); /* is a shm-only epid */
+	} else {
+		/* Self-only, meaning only 1 proc max */
+		*epid = PSMI_EPID_PACK_SHM(0,
+								PSMI_EPID_SHM_ONLY); /* is a shm-only epid */
+	}
+
+fail:
+	return err;
+}
+
+/* Get a list of PTLs we want to use.  The order is important; it affects
+ * whether node-local processes use shm or ips */
+static
+psm2_error_t
+psmi_parse_devices(int devices[PTL_MAX_INIT], const char *devstring)
+{
+	char *devstr = NULL;
+	char *b_new, *e, *ee, *b;
+	psm2_error_t err = PSM2_OK;
+	int len;
+	int i = 0;
+
+	psmi_assert_always(devstring != NULL);
+	len = strlen(devstring) + 1;
+
+	for (i = 0; i < PTL_MAX_INIT; i++)
+		devices[i] = -1;
+
+	devstr = (char *)psmi_calloc(PSMI_EP_NONE, UNDEFINED, 2, len);
+	if (devstr == NULL)
+		goto fail;
+
+	b_new = (char *)devstr;
+	e = b_new + len;
+	strncpy(e, devstring, len);
+	ee = e + len;
+	i = 0;
+	while (e < ee && *e && i < PTL_MAX_INIT) {
+		while (*e && !isalpha(*e))
+			e++;
+		b = e;
+		while (*e && isalpha(*e))
+			e++;
+		*e = '\0';
+		if (*b) {
+			if (!strcasecmp(b, "self")) {
+				devices[i++] = PTL_DEVID_SELF;
+				b_new = strcpy(b_new, "self,");
+				b_new += 5;
+			} else if (!strcasecmp(b, "shm") ||
+					!strcasecmp(b, "shmem") ||
+					!strcasecmp(b, "amsh")) {
+				devices[i++] = PTL_DEVID_AMSH;
+				strcpy(b_new, "amsh,");
+				b_new += 5;
+			} else if (!strcasecmp(b, "hfi") ||
+					!strcasecmp(b, "nic") ||
+					!strcasecmp(b, "ipath") ||
+					!strcasecmp(b, "ips")) {
+				devices[i++] = PTL_DEVID_IPS;
+				strcpy(b_new, "ips,");
+				b_new += 4;
+			} else {
+				err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+							"%s set in environment variable PSM_PTL_DEVICES=\"%s\" "
+							"is not one of the recognized PTL devices (%s)",
+							b, devstring,
+							PSMI_DEVICES_DEFAULT);
+				goto fail;
+			}
+			e++;
+		}
+	}
+	if (b_new != devstr)	/* we parsed something, remove trailing comma */
+		*(b_new - 1) = '\0';
+
+	_HFI_PRDBG("PSM Device allocation order: %s\n", devstr);
+fail:
+	if (devstr != NULL)
+		psmi_free(devstr);
+	return err;
+
+}
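+/* Illustrative behavior (a sketch, not an exhaustive spec): parsing
+ * PSM3_DEVICES="self,shm,nic" yields
+ * devices[] = { PTL_DEVID_SELF, PTL_DEVID_AMSH, PTL_DEVID_IPS } and the
+ * normalized debug string "self,amsh,ips". */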
+
+static
+int psmi_device_is_enabled(const int devid_enabled[PTL_MAX_INIT], int devid)
+{
+	int i;
+	for (i = 0; i < PTL_MAX_INIT; i++)
+		if (devid_enabled[i] == devid)
+			return 1;
+	return 0;
+}
+
+int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid)
+{
+	return psmi_device_is_enabled(ep->devid_enabled, devid);
+}
diff --git a/deps/libfabric/prov/psm3/psm3/psm_ep.h b/deps/libfabric/prov/psm3/psm3/psm_ep.h
new file mode 100644
index 0000000000000000000000000000000000000000..0defc1143285393833b3d23ea05b72c061b2ecaa
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_ep.h
@@ -0,0 +1,264 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_IN_USER_H
+#error psm_ep.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_EP_H
+#define _PSMI_EP_H
+
+
+#include "psm_verbs_ep.h"
+
+/*
+ * EPIDs encode the basic information needed to establish
+ * datagram traffic so that PSM connection establishment can
+ * negotiate and exchange the rest.
+ *
+ * EPID includes: EPID format version, network address, queue ID within NIC
+ */
+
+
+#define PSMI_SL_DEFAULT 0
+#define PSMI_SL_MIN	0
+#define PSMI_SL_MAX	31
+// IB/OPA:
+// 0-2: ver = 3
+// 3-7: spare
+// 8-31: QPN
+// 32-47: lid [note, IB & OPA100 only support 16 bit LIDs]
+// 48-63: subnet prefix low 16 bits
+#define PSMI_EPID_PACK_V3(lid, qpn, subnet_id) \
+	(((((uint64_t)PSMI_EPID_V3)&0x7)<<0)	|								\
+	 ((((uint64_t)qpn)&0xffffff)<<8)	|			  					\
+	 ((((uint64_t)lid)&0xffff)<<32)			|								\
+	 ((((uint64_t)subnet_id)&0xffff)<<48))
+// Eth:
+// 0-2: ver = 4
+// 3-7: subnet (number of high bits in IP addr representing IP subnet)
+// 8-31: UD QPN or UDP socket
+// 32-63: IPv4 address
+#define PSMI_EPID_PACK_V4(ip, qpn, subnet_bits) \
+	(((((uint64_t)PSMI_EPID_V4)&0x7)<<0)	|							\
+	 ((((uint64_t)subnet_bits)&0x1f)<<3)	|			  			\
+	 ((((uint64_t)qpn)&0xffffff)<<8)	|			  					\
+	 ((((uint64_t)ip)&0xffffffff)<<32))
+
+// shm and self:
+// 0-2: ver = 0
+// 3: shm-only flag (1)
+// 4-31: spare
+// 32-63: pid
+#define PSMI_EPID_PACK_SHM(process_id, shmbool) \
+	(((((uint64_t)process_id)&0xffffffff)<<32)			|				\
+	 ((((uint64_t)shmbool)&0x1)<<3)		|			  					\
+	 ((((uint64_t)PSMI_EPID_VERSION_SHM)&0x7)<<0))
+
+#define PSMI_EPID_GET_EPID_VERSION(epid)	(((epid)>>0)&0x7)
+#define PSMI_EPID_GET_LID_V3(epid)          (((epid)>>32)&0xffff) // lid
+#define PSMI_EPID_GET_LID_V4(epid)          (((epid)>>32)&0xffffffff) // ip
+#define PSMI_EPID_GET_CONTEXT(epid)         (((epid)>>8)&0xffffff) // qpn/sock
+#define PSMI_EPID_GET_SUBNET_ID_V3(epid)	(((epid)>>48)&0xffff)
+#define PSMI_EPID_GET_SUBNET_ID_V4(epid)	(psmi_bit_count_to_mask(((epid)>>3)&0x1f) &  PSMI_EPID_GET_LID_V4(epid)) // subnetwork
+#define PSMI_EPID_GET_SUBNET_ID(epid) ((PSMI_EPID_GET_EPID_VERSION(epid) == PSMI_EPID_V3) ? \
+										(uint32_t)PSMI_EPID_GET_SUBNET_ID_V3(epid) \
+										: (uint32_t)PSMI_EPID_GET_SUBNET_ID_V4(epid))
+#define PSMI_EPID_CONTEXT_FMT				"%d"
+#define PSMI_EPID_GET_CONTEXT_VAL(epid)		(int)PSMI_EPID_GET_CONTEXT(epid)
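+/* Illustrative round trip through the V4 packing above (a sketch; ip, qpn
+ * and bits stand for hypothetical in-range values):
+ *   uint64_t e = PSMI_EPID_PACK_V4(ip, qpn, bits);
+ *   // PSMI_EPID_GET_EPID_VERSION(e) == PSMI_EPID_V4
+ *   // PSMI_EPID_GET_CONTEXT(e)      == (qpn & 0xffffff)
+ *   // PSMI_EPID_GET_LID_V4(e)       == (ip & 0xffffffff)
+ */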
+
+#define PSM_MCTXT_APPEND(head, node)	\
+	node->mctxt_prev = head->mctxt_prev; \
+	node->mctxt_next = head; \
+	head->mctxt_prev->mctxt_next = node; \
+	head->mctxt_prev = node; \
+	node->mctxt_master = head
+#define PSM_MCTXT_REMOVE(node)	\
+	node->mctxt_prev->mctxt_next = node->mctxt_next; \
+	node->mctxt_next->mctxt_prev = node->mctxt_prev; \
+	node->mctxt_next = node->mctxt_prev = node; \
+	node->mctxt_master = NULL
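+/* These macros maintain a circular, doubly-linked ring of rails anchored at
+ * the master EP. A sketch of the resulting links (master starts out with
+ * mctxt_prev == mctxt_next == master):
+ *   PSM_MCTXT_APPEND(master, ep1);
+ *   PSM_MCTXT_APPEND(master, ep2);
+ *   // ring: master -> ep1 -> ep2 -> master
+ *   // master->mctxt_prev == ep2, ep2->mctxt_prev == ep1,
+ *   // ep1->mctxt_prev == master; each appended node's mctxt_master == master
+ */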
+
+struct psm2_ep {
+	psm2_epid_t epid;	    /**> This endpoint's Endpoint ID */
+	psm2_epaddr_t epaddr;	    /**> This ep's ep address */
+	psm2_mq_t mq;		    /**> only 1 MQ */
+	struct psm2_verbs_ep verbs_ep;
+
+	int unit_id;
+	uint16_t portnum;
+	uint16_t out_sl;
+	// mtu is PSM payload allowed by local HW,
+	// mtu may be further reduced via PSM3_MTU by ips_proto_init
+	// for UD/UDP, mtu is reduced by PSM hdr size
+	uint16_t mtu;		/* out_sl-->vl-->mtu in sysfs */
+	uint16_t network_pkey;	      /**> Pkey */
+	uint16_t network_pkey_index;  /**> Pkey index */
+	int did_syslog;
+	const char *dev_name;	/* just for logging */
+	psm2_uuid_t uuid;
+	uint16_t jkey;
+	uint64_t service_id;	/* OPA service ID */
+	psm2_path_res_t path_res_type;	/* Path resolution for endpoint */
+	psm2_ep_errhandler_t errh;
+	int devid_enabled[PTL_MAX_INIT];
+	int memmode;		    /**> min, normal, large memory mode */
+
+	uint32_t hfi_num_sendbufs;/**> Number of allocated send buffers */
+	uint32_t hfi_num_descriptors;/** Number of allocated scb descriptors */
+	uint32_t hfi_num_send_wqes;/** Number of allocated SQ WQEs for send */
+	uint32_t hfi_num_send_rdma;/** Number of concurrent RDMA */
+	uint32_t hfi_send_reap_thresh;/** when to reap SQ completions */
+	uint32_t hfi_num_recv_wqes;/** Number of allocated RQ WQEs */
+	uint32_t hfi_num_recv_cqes;/** Number of allocated RQ CQEs */
+	uint8_t hfi_qp_timeout;/** RC QP timeout, IB enum */
+	uint8_t hfi_qp_retry;/** RC QP retry limit */
+	uint8_t rdmamode; /** PSM3_RDMA */
+	uint8_t mr_cache_mode; /** PSM3_MR_CACHE_MODE */
+	uint8_t rv_num_conn; /** PSM3_RV_QP_PER_CONN */
+	uint32_t rv_mr_cache_size; /** PSM3_RV_MR_CACHE_SIZE */
+#ifdef PSM_CUDA
+	uint32_t rv_gpu_cache_size; /** PSM3_RV_GPU_CACHE_SIZE */
+#endif
+	uint32_t rv_q_depth; /** PSM3_RV_Q_DEPTH */
+	uint32_t rv_reconnect_timeout; /* PSM3_RV_RECONNECT_TIMEOUT */
+	uint32_t rv_hb_interval; /* PSM3_RV_HEARTBEAT_INTERVAL */
+	uint32_t hfi_imm_size;	  /** Immediate data size */
+	uint32_t connections;	    /**> Number of connections */
+
+	psmi_context_t context;
+	char *context_mylabel;
+	uint32_t yield_spin_cnt;
+
+	/* EP link-lists */
+	struct psm2_ep *user_ep_next;
+
+	/* EP link-lists for multi-context. */
+	struct psm2_ep *mctxt_prev;
+	struct psm2_ep *mctxt_next;
+	struct psm2_ep *mctxt_master;
+
+	/* Active Message handler table */
+	struct psm2_ep_am_handle_entry *am_htable;
+
+	uint64_t gid_hi;
+	uint64_t gid_lo;
+
+	ptl_ctl_t ptl_amsh;
+	ptl_ctl_t ptl_ips;
+	ptl_ctl_t ptl_self;
+
+	/* All ptl data is allocated inline below */
+	uint8_t ptl_base_data[0] __attribute__ ((aligned(64)));
+	bool skip_affinity;
+};
+
+struct mqq {
+	psm2_mq_req_t first;
+	psm2_mq_req_t last;
+};
+
+typedef
+union psmi_seqnum {
+	struct {
+		uint32_t psn_seq:11;
+		uint32_t psn_gen:20;
+	};
+	struct {
+		uint32_t psn_num:31;
+	};
+	uint32_t psn_val;
+} psmi_seqnum_t;
+
+/*
+ * PSM end point address. One per connection and per rail.
+ */
+struct psm2_epaddr {
+	psm2_epid_t epid;	/* peer's epid */
+	ptl_ctl_t *ptlctl;	/* The control structure for the ptl */
+	struct ips_proto *proto;	/* only for ips protocol */
+	void *usr_ep_ctxt;	/* User context associated with endpoint */
+};
+
+#ifndef PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD
+#  define PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD  250
+#endif
+
+/*
+ * Users of BLOCKUNTIL should check the value of err upon return
+ */
+#define PSMI_BLOCKUNTIL(ep, err, cond)	do {				\
+	int spin_cnt = 0;						\
+	PSMI_PROFILE_BLOCK();						\
+	while (!(cond)) {						\
+		err = psmi_poll_internal(ep, 1);			\
+		if (err == PSM2_OK_NO_PROGRESS) {			\
+			PSMI_PROFILE_REBLOCK(1);			\
+			if (++spin_cnt == (ep)->yield_spin_cnt) {	\
+				spin_cnt = 0;				\
+				PSMI_YIELD((ep)->mq->progress_lock);	\
+			}						\
+		}							\
+		else if (err == PSM2_OK) {				\
+			PSMI_PROFILE_REBLOCK(0);			\
+			spin_cnt = 0;					\
+		}							\
+		else							\
+		break;							\
+	}								\
+	PSMI_PROFILE_UNBLOCK();						\
+} while (0)
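+/* Usage sketch (req and its completion flag are hypothetical; per the note
+ * above, callers must check err):
+ *
+ *   psm2_error_t err;
+ *   PSMI_BLOCKUNTIL(ep, err, req->state == REQ_COMPLETE);
+ *   if (err != PSM2_OK && err != PSM2_OK_NO_PROGRESS)
+ *           return err;
+ */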
+
+#endif /* _PSMI_EP_H */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_ep_connect.c b/deps/libfabric/prov/psm3/psm3/psm_ep_connect.c
new file mode 100644
index 0000000000000000000000000000000000000000..7907952cf9102846955ff7e40054ebbbf4512307
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_ep_connect.c
@@ -0,0 +1,621 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
+int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid);
+
+#if _HFI_DEBUGGING
+PSMI_ALWAYS_INLINE(
+char *psmi_getdevice(int type))
+{
+	switch (type) {
+	case PTL_DEVID_IPS:
+		return "ips";
+	case PTL_DEVID_AMSH:
+		return "amsh";
+	case PTL_DEVID_SELF:
+		return "self";
+	default:
+		return "ips";
+	}
+}
+#endif
+
+psm2_error_t
+__psm2_ep_connect(psm2_ep_t ep, int num_of_epid, psm2_epid_t const *array_of_epid,
+		 int const *array_of_epid_mask,	/* can be NULL */
+		 psm2_error_t *array_of_errors, psm2_epaddr_t *array_of_epaddr,
+		 int64_t timeout)
+{
+	psm2_error_t err = PSM2_OK;
+	ptl_ctl_t *ptlctl;
+	ptl_t *ptl;
+	int i, j, dup_idx;
+	int num_toconnect = 0;
+	int *epid_mask = NULL;
+	int *epid_mask_isdupof = NULL;
+	uint64_t t_start = get_cycles();
+	uint64_t t_left;
+	union psmi_envvar_val timeout_intval;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ERR_UNLESS_INITIALIZED(ep);
+
+	/*
+	 * Normally we would lock here, but instead each implemented ptl component
+	 * does its own locking.  This is mostly because the ptl components are
+	 * ahead of the PSM2 interface in that they can disconnect their peers.
+	 */
+	if (ep == NULL || array_of_epaddr == NULL || array_of_epid == NULL ||
+	    num_of_epid < 1) {
+		err = psmi_handle_error(ep, PSM2_PARAM_ERR,
+					"Invalid psm2_ep_connect parameters");
+		goto fail_nolock;
+	}
+
+	PSMI_LOCK(ep->mq->progress_lock);
+
+	/* We need two of these masks to detect duplicates */
+	err = PSM2_NO_MEMORY;
+	epid_mask =
+	    (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid);
+	if (epid_mask == NULL)
+		goto fail;
+	epid_mask_isdupof =
+	    (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid);
+	if (epid_mask_isdupof == NULL)
+		goto fail;
+	err = PSM2_OK;
+
+	/* Eventually handle timeouts across all connects. */
+	for (j = 0; j < num_of_epid; j++) {
+		if (array_of_epid_mask != NULL && !array_of_epid_mask[j])
+			epid_mask[j] = 0;
+		else {
+			epid_mask[j] = 1;
+			array_of_errors[j] = PSM2_EPID_UNKNOWN;
+			array_of_epaddr[j] = NULL;
+			if (psmi_epid_version(array_of_epid[j]) !=
+						 PSMI_EPID_VERSION
+				&& psmi_epid_version(array_of_epid[j]) !=
+					psmi_epid_version(ep->epid)) {
+					psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+					  " Mismatched version of EPID - %"PRIu64"\n"
+					  "Confirm all nodes are running the same interconnect HW and PSM version\n",
+					  psmi_epid_version(array_of_epid[j]));
+			}
+			num_toconnect++;
+		}
+		epid_mask_isdupof[j] = -1;
+	}
+
+	psmi_getenv("PSM3_CONNECT_TIMEOUT",
+		    "End-point minimum connection timeout. 0 for no time-out.",
+		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)(timeout/SEC_ULL), &timeout_intval);
+
+	if (getenv("PSM3_CONNECT_TIMEOUT")) {
+		timeout = timeout_intval.e_uint * SEC_ULL;
+	} else if (timeout > 0) {
+		/* The timeout parameter provides the minimum timeout. A heuristic
+		 * is used to scale up the timeout linearly with the number of
+		 * endpoints, and we allow one second per 100 endpoints. */
+		timeout = max(timeout, (num_toconnect * SEC_ULL) / 100);
+	}
+
+	if (timeout > 0 && timeout < PSMI_MIN_EP_CONNECT_TIMEOUT)
+		timeout = PSMI_MIN_EP_CONNECT_TIMEOUT;
+	_HFI_PRDBG("Connect to %d endpoints with time-out of %.2f secs\n",
+		   num_toconnect, (double)timeout / 1e9);
+
+	/* Look for duplicates in input array */
+	for (i = 0; i < num_of_epid; i++) {
+		for (j = i + 1; j < num_of_epid; j++) {
+			if (array_of_epid[i] == array_of_epid[j] &&
+			    epid_mask[i] && epid_mask[j]) {
+				epid_mask[j] = 0;	/* don't connect more than once */
+				epid_mask_isdupof[j] = i;
+			}
+		}
+	}
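+	/* Illustration (hypothetical input): for array_of_epid = { A, B, A },
+	 * the loop above leaves epid_mask = { 1, 1, 0 } and
+	 * epid_mask_isdupof = { -1, -1, 0 }; entry 2 later inherits entry 0's
+	 * epaddr and error code. */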
+
+	for (i = 0; i < PTL_MAX_INIT; i++) {
+		if (ep->devid_enabled[i] == -1)
+			continue;
+		/* Set up the right connect ptrs */
+		switch (ep->devid_enabled[i]) {
+		case PTL_DEVID_IPS:
+			ptlctl = &ep->ptl_ips;
+			ptl = ep->ptl_ips.ptl;
+			break;
+		case PTL_DEVID_AMSH:
+			ptlctl = &ep->ptl_amsh;
+			ptl = ep->ptl_amsh.ptl;
+			break;
+		case PTL_DEVID_SELF:
+			ptlctl = &ep->ptl_self;
+			ptl = ep->ptl_self.ptl;
+			break;
+		default:
+			ptlctl = &ep->ptl_ips;	/*no-unused */
+			ptl = ep->ptl_ips.ptl;	/*no-unused */
+			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+					  "Unknown/unhandled PTL id %d\n",
+					  ep->devid_enabled[i]);
+			break;
+		}
+		t_left = psmi_cycles_left(t_start, timeout);
+
+		if (_HFI_VDBG_ON) {
+			_HFI_VDBG_ALWAYS
+				("Trying to connect with device %s\n",
+				psmi_getdevice(ep->devid_enabled[i]));
+		}
+		if ((err = ptlctl->ep_connect(ptl, num_of_epid, array_of_epid,
+					      epid_mask, array_of_errors,
+					      array_of_epaddr,
+					      cycles_to_nanosecs(t_left)))) {
+			if (_HFI_PRDBG_ON) {
+				_HFI_PRDBG_ALWAYS
+					("Connect failure in device %s err=%d\n",
+					psmi_getdevice(ep->devid_enabled[i]), err);
+			}
+			goto connect_fail;
+		}
+
+		/* Now process what's been connected */
+		for (j = 0; j < num_of_epid; j++) {
+			dup_idx = epid_mask_isdupof[j];
+			if (!epid_mask[j] && dup_idx == -1)
+				continue;
+
+			if (dup_idx != -1) {	/* dup */
+				array_of_epaddr[j] = array_of_epaddr[dup_idx];
+				array_of_errors[j] = array_of_errors[dup_idx];
+				epid_mask_isdupof[j] = -1;
+			}
+
+			if (array_of_errors[j] == PSM2_OK) {
+				epid_mask[j] = 0;	/* don't try on next ptl */
+				ep->connections++;
+			}
+		}
+	}
+
+	for (i = 0; i < num_of_epid; i++) {
+		ptl_ctl_t *c = NULL;
+		if (array_of_epid_mask != NULL && !array_of_epid_mask[i])
+			continue;
+		/* If we see unreachable here, that means some PTLs were not enabled */
+		if (array_of_errors[i] == PSM2_EPID_UNREACHABLE) {
+			err = PSM2_EPID_UNREACHABLE;
+			break;
+		}
+
+		psmi_assert_always(array_of_epaddr[i] != NULL);
+		c = array_of_epaddr[i]->ptlctl;
+		psmi_assert_always(c != NULL);
+		_HFI_VDBG("%-20s DEVICE %s (%p)\n",
+			  psmi_epaddr_get_name(array_of_epid[i]),
+			  c == &ep->ptl_ips ? "nic" :
+			  (c == &ep->ptl_amsh ? "amsh" : "self"),
+			  (void *)array_of_epaddr[i]->ptlctl->ptl);
+	}
+
+	if (err == PSM2_OK)
+		for (i = 0; i < num_of_epid; i++)
+			array_of_errors[i] = PSM2_OK;
+
+connect_fail:
+	if (err != PSM2_OK) {
+		char errbuf[PSM2_ERRSTRING_MAXLEN];
+		size_t len;
+		int j = 0;
+
+		if (err == PSM2_EPID_UNREACHABLE) {
+			char *deverr = "of an incorrect setting";
+			char *eperr = "";
+			char *devname = NULL;
+			if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+				deverr =
+				    "there is no shared memory PSM3 device (shm)";
+				eperr = " shared memory";
+			} else
+			    if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+				deverr =
+				    "there is no OPA PSM3 device (nic)";
+				eperr = " OPA";
+			}
+
+			len = snprintf(errbuf, sizeof(errbuf) - 1,
+				       "Some%s endpoints could not be connected because %s "
+				       "in the currently enabled PSM3_DEVICES (",
+				       eperr, deverr);
+			for (i = 0; i < PTL_MAX_INIT && len < sizeof(errbuf) - 1;
+			     i++) {
+				switch (ep->devid_enabled[i]) {
+				case PTL_DEVID_IPS:
+					devname = "nic";
+					break;
+				case PTL_DEVID_AMSH:
+					devname = "shm";
+					break;
+				case PTL_DEVID_SELF:
+				default:
+					devname = "self";
+					break;
+				}
+				len +=
+				    snprintf(errbuf + len,
+					     sizeof(errbuf) - len - 1, "%s,",
+					     devname);
+			}
+			if (len < sizeof(errbuf) - 1 && devname != NULL)
+				/* parsed something, remove trailing comma */
+				errbuf[len - 1] = ')';
+		} else
+			len = snprintf(errbuf, sizeof(errbuf) - 1,
+				       "%s", err == PSM2_TIMEOUT ?
+				       "Detected connection timeout" :
+				       psm2_error_get_string(err));
+
+		/* first pass, look for all nodes with the error */
+		for (i = 0; i < num_of_epid && len < sizeof(errbuf) - 1; i++) {
+			if (array_of_epid_mask != NULL
+			    && !array_of_epid_mask[i])
+				continue;
+			if (array_of_errors[i] == PSM2_OK)
+				continue;
+			if (array_of_errors[i] == PSM2_EPID_UNREACHABLE &&
+			    err != PSM2_EPID_UNREACHABLE)
+				continue;
+			if (array_of_errors[i])
+				array_of_epaddr[i] = NULL;
+			if (err == array_of_errors[i]) {
+				len +=
+				    snprintf(errbuf + len,
+					     sizeof(errbuf) - len - 1, "%c %s",
+					     j == 0 ? ':' : ',',
+					     psmi_epaddr_get_hostname
+					     (array_of_epid[i]));
+				j++;
+			}
+		}
+		errbuf[sizeof(errbuf) - 1] = '\0';
+		err = psmi_handle_error(ep, err, "%s", errbuf);
+	}
+
+fail:
+	PSMI_UNLOCK(ep->mq->progress_lock);
+
+fail_nolock:
+	if (epid_mask != NULL)
+		psmi_free(epid_mask);
+	if (epid_mask_isdupof != NULL)
+		psmi_free(epid_mask_isdupof);
+
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_ep_connect)
+
+psm2_error_t __psm2_ep_disconnect(psm2_ep_t ep, int num_of_epaddr,
+				  psm2_epaddr_t *array_of_epaddr,
+				  const int *array_of_epaddr_mask,
+				  psm2_error_t *array_of_errors,
+				  int64_t timeout)
+{
+	return psm2_ep_disconnect2(ep, num_of_epaddr, array_of_epaddr,
+				   array_of_epaddr_mask, array_of_errors,
+				   PSM2_EP_DISCONNECT_GRACEFUL, timeout);
+}
+PSMI_API_DECL(psm2_ep_disconnect)
+
+psm2_error_t __psm2_ep_disconnect2(psm2_ep_t ep, int num_of_epaddr,
+				  psm2_epaddr_t *array_of_epaddr,
+				  const int *array_of_epaddr_mask,
+				  psm2_error_t *array_of_errors,
+				  int mode, int64_t timeout)
+{
+	psm2_error_t err = PSM2_OK;
+	ptl_ctl_t *ptlctl;
+	ptl_t *ptl;
+	int i, j, dup_idx;
+	int num_todisconnect = 0;
+	int *epaddr_mask = NULL;
+	int *epaddr_mask_isdupof = NULL;
+	uint64_t t_start = get_cycles();
+	uint64_t t_left;
+	union psmi_envvar_val timeout_intval;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ERR_UNLESS_INITIALIZED(ep);
+
+
+	/*
+	 * Normally we would lock here, but instead each implemented ptl component
+	 * does its own locking.  This is mostly because the ptl components are
+	 * ahead of the PSM2 interface in that they can disconnect their peers.
+	 */
+	if (ep == NULL || array_of_epaddr == NULL ||
+	    num_of_epaddr < 1) {
+		err = psmi_handle_error(ep, PSM2_PARAM_ERR,
+					"Invalid psm2_ep_disconnect parameters");
+		goto fail_nolock;
+	}
+
+	PSMI_LOCK(ep->mq->progress_lock);
+
+	/* We need two of these masks to detect duplicates */
+	err = PSM2_NO_MEMORY;
+	epaddr_mask =
+	    (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epaddr);
+	if (epaddr_mask == NULL)
+		goto fail;
+	epaddr_mask_isdupof =
+	    (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epaddr);
+	if (epaddr_mask_isdupof == NULL)
+		goto fail;
+	err = PSM2_OK;
+
+	/* Eventually handle timeouts across all connects. */
+	for (j = 0; j < num_of_epaddr; j++) {
+		if (array_of_epaddr_mask != NULL && !array_of_epaddr_mask[j])
+			epaddr_mask[j] = 0;
+		else {
+			epaddr_mask[j] = 1;
+			array_of_errors[j] = PSM2_EPID_UNKNOWN;
+			num_todisconnect++;
+		}
+		epaddr_mask_isdupof[j] = -1;
+	}
+
+	psmi_getenv("PSM3_DISCONNECT_TIMEOUT",
+		    "End-point disconnection timeout over-ride. 0 for no time-out.",
+		    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)0, &timeout_intval);
+
+	if (getenv("PSM3_DISCONNECT_TIMEOUT")) {
+		timeout = timeout_intval.e_uint * SEC_ULL;
+	} else if (timeout > 0) {
+		/* The timeout parameter provides the minimum timeout. A heuristic
+		 * is used to scale up the timeout linearly with the number of
+		 * endpoints, and we allow one second per 100 endpoints. */
+		timeout = max(timeout, (num_todisconnect * SEC_ULL) / 100);
+	}
+
+	if (timeout > 0 && timeout < PSMI_MIN_EP_CONNECT_TIMEOUT)
+		timeout = PSMI_MIN_EP_CONNECT_TIMEOUT;
+	_HFI_PRDBG("Disconnect %d endpoints with time-out of %.2f secs\n",
+		   num_todisconnect, (double)timeout / 1e9);
+
+	/* Look for duplicates in input array */
+	for (i = 0; i < num_of_epaddr; i++) {
+		for (j = i + 1; j < num_of_epaddr; j++) {
+			if (array_of_epaddr[i] == array_of_epaddr[j] &&
+			    epaddr_mask[i] && epaddr_mask[j]) {
+				epaddr_mask[j] = 0;	/* don't disconnect more than once */
+				epaddr_mask_isdupof[j] = i;
+			}
+		}
+	}
+
+	for (i = 0; i < PTL_MAX_INIT; i++) {
+		if (ep->devid_enabled[i] == -1)
+			continue;
+		/* Set up the right connect ptrs */
+		switch (ep->devid_enabled[i]) {
+		case PTL_DEVID_IPS:
+			ptlctl = &ep->ptl_ips;
+			ptl = ep->ptl_ips.ptl;
+			break;
+		case PTL_DEVID_AMSH:
+			ptlctl = &ep->ptl_amsh;
+			ptl = ep->ptl_amsh.ptl;
+			break;
+		case PTL_DEVID_SELF:
+			ptlctl = &ep->ptl_self;
+			ptl = ep->ptl_self.ptl;
+			break;
+		default:
+			ptlctl = &ep->ptl_ips;	/*no-unused */
+			ptl = ep->ptl_ips.ptl;	/*no-unused */
+			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+					  "Unknown/unhandled PTL id %d\n",
+					  ep->devid_enabled[i]);
+			break;
+		}
+		t_left = psmi_cycles_left(t_start, timeout);
+
+		if (_HFI_CONNDBG_ON) {
+			_HFI_CONNDBG_ALWAYS
+				("Trying to disconnect with device %s\n",
+				psmi_getdevice(ep->devid_enabled[i]));
+		}
+		if ((err = ptlctl->ep_disconnect(ptl, (mode == PSM2_EP_DISCONNECT_FORCE),
+					      num_of_epaddr, array_of_epaddr,
+					      epaddr_mask, array_of_errors,
+					      cycles_to_nanosecs(t_left)))) {
+			if (_HFI_PRDBG_ON) {
+				_HFI_PRDBG_ALWAYS
+					("Disconnect failure in device %s err=%d\n",
+					psmi_getdevice(ep->devid_enabled[i]), err);
+			}
+			goto disconnect_fail;
+		}
+
+		/* Now process what's been disconnected */
+		for (j = 0; j < num_of_epaddr; j++) {
+			dup_idx = epaddr_mask_isdupof[j];
+			if (!epaddr_mask[j] && dup_idx == -1)
+				continue;
+
+			if (dup_idx != -1) {	/* dup */
+				array_of_errors[j] = array_of_errors[dup_idx];
+				epaddr_mask_isdupof[j] = -1;
+			}
+
+			if (array_of_errors[j] == PSM2_OK) {
+				epaddr_mask[j] = 0;	/* don't try on next ptl */
+				array_of_epaddr[j] = NULL;
+				ep->connections--;
+			}
+		}
+	}
+
+	for (i = 0; i < num_of_epaddr; i++) {
+		if (array_of_epaddr_mask != NULL && !array_of_epaddr_mask[i])
+			continue;
+		/* If we see unreachable here, that means some PTLs were not enabled */
+		if (array_of_errors[i] == PSM2_EPID_UNREACHABLE) {
+			err = PSM2_EPID_UNREACHABLE;
+			break;
+		}
+	}
+
+disconnect_fail:
+	/* If the error is a timeout (at worst) and the client is OPA MPI,
+	 * just return timeout to let OPA MPI handle the hostnames that
+	 * timed out */
+	if (err != PSM2_OK) {
+		char errbuf[PSM2_ERRSTRING_MAXLEN];
+		size_t len;
+		int j = 0;
+
+		if (err == PSM2_EPID_UNREACHABLE) {
+			char *deverr = "of an incorrect setting";
+			char *eperr = "";
+			char *devname = NULL;
+			if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+				deverr =
+				    "there is no shared memory PSM3 device (shm)";
+				eperr = " shared memory";
+			} else
+			    if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+				deverr =
+				    "there is no OPA PSM3 device (nic)";
+				eperr = " OPA";
+			}
+
+			len = snprintf(errbuf, sizeof(errbuf) - 1,
+				       "Some%s endpoints could not be disconnected because %s "
+				       "in the currently enabled PSM3_DEVICES (",
+				       eperr, deverr);
+			for (i = 0; i < PTL_MAX_INIT && len < sizeof(errbuf) - 1; i++) {
+				switch (ep->devid_enabled[i]) {
+				case PTL_DEVID_IPS:
+					devname = "nic";
+					break;
+				case PTL_DEVID_AMSH:
+					devname = "shm";
+					break;
+				case PTL_DEVID_SELF:
+				default:
+					devname = "self";
+					break;
+				}
+				len +=
+				    snprintf(errbuf + len,
+					     sizeof(errbuf) - len - 1, "%s,",
+					     devname);
+			}
+			if (len < sizeof(errbuf) - 1 && devname != NULL)
+				/* parsed something, remove trailing comma */
+				errbuf[len - 1] = ')';
+		} else
+			len = snprintf(errbuf, sizeof(errbuf) - 1,
+				       "%s", err == PSM2_TIMEOUT ?
+				       "Detected disconnect timeout" :
+				       psm2_error_get_string(err));
+
+		/* first pass, look for all nodes with the error */
+		for (i = 0; i < num_of_epaddr && len < sizeof(errbuf) - 1; i++) {
+			if (array_of_epaddr_mask != NULL
+			    && !array_of_epaddr_mask[i])
+				continue;
+			if (array_of_errors[i] == PSM2_OK)
+				continue;
+			if (array_of_errors[i] == PSM2_EPID_UNREACHABLE &&
+			    err != PSM2_EPID_UNREACHABLE)
+				continue;
+			if (err == array_of_errors[i]) {
+				len +=
+				    snprintf(errbuf + len,
+					     sizeof(errbuf) - len - 1, "%c %s",
+					     j == 0 ? ':' : ',',
+					     array_of_epaddr[i]?psmi_epaddr_get_hostname
+					     (array_of_epaddr[i]->epid):"Unknown");
+				j++;
+			}
+		}
+		errbuf[sizeof(errbuf) - 1] = '\0';
+		err = psmi_handle_error(ep, err, "%s", errbuf);
+	}
+
+fail:
+	PSMI_UNLOCK(ep->mq->progress_lock);
+
+fail_nolock:
+	if (epaddr_mask != NULL)
+		psmi_free(epaddr_mask);
+	if (epaddr_mask_isdupof != NULL)
+		psmi_free(epaddr_mask_isdupof);
+
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_ep_disconnect2)
diff --git a/deps/libfabric/prov/psm3/psm3/psm_error.c b/deps/libfabric/prov/psm3/psm3/psm_error.c
new file mode 100644
index 0000000000000000000000000000000000000000..27da64115bb374ea5a6068985c81e8543ddc40b0
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_error.c
@@ -0,0 +1,351 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+
+#define PSMI_NOLOG  -1
+
+struct psm2_error_token {
+	psm2_ep_t ep;
+	psm2_error_t error;
+	char err_string[PSM2_ERRSTRING_MAXLEN];
+};
+
+static
+psm2_error_t
+psmi_errhandler_noop(psm2_ep_t ep, const psm2_error_t err,
+		     const char *error_string, psm2_error_token_t token)
+{
+	return err;
+}
+
+static
+psm2_error_t
+psmi_errhandler_psm(psm2_ep_t ep,
+		    const psm2_error_t err,
+		    const char *error_string, psm2_error_token_t token)
+{
+	/* we want the error to be seen through ssh, etc., so we flush and then
+	 * sleep a bit.   Not perfect, but not doing so means it almost never
+	 * gets seen. */
+	fprintf(stderr, "%s: %s\n", hfi_get_mylabel(), token->err_string);
+	fflush(stdout);
+	fflush(stderr);
+
+	/* XXX Eventually, this will hook up to a connection manager, and we'll
+	 * issue an upcall into the connection manager at shutdown time */
+	sleep(3);
+
+	/* We use this "special" ep internally to handle internal errors that are
+	 * triggered from within code that is not expected to return to the user.
+	 * Errors of this sort are not expected to be handled by users and always
+	 * mean we have an internal PSM bug. */
+	if (err == PSM2_INTERNAL_ERR)
+		abort();
+	else
+		exit(-1);
+}
+
+psm2_ep_errhandler_t psmi_errhandler_global = psmi_errhandler_noop;
+
+psm2_error_t __psm2_error_defer(psm2_error_token_t token)
+{
+	psm2_error_t rv;
+	PSM2_LOG_MSG("entering");
+	rv = psmi_errhandler_psm(token->ep, token->error, token->err_string,
+				   token);
+	PSM2_LOG_MSG("leaving");
+	return rv;
+}
+PSMI_API_DECL(psm2_error_defer)
+
+psm2_error_t
+__psm2_error_register_handler(psm2_ep_t ep, const psm2_ep_errhandler_t errhandler)
+{
+	psm2_ep_errhandler_t *errh;
+
+	PSM2_LOG_MSG("entering");
+
+	if (ep == NULL)
+		errh = &psmi_errhandler_global;
+	else
+		errh = &ep->errh;
+
+	if (errhandler == PSM2_ERRHANDLER_PSM_HANDLER)
+		*errh = psmi_errhandler_psm;
+	else if (errhandler == PSM2_ERRHANDLER_NO_HANDLER)
+		*errh = psmi_errhandler_noop;
+	else
+		*errh = errhandler;
+
+	PSM2_LOG_MSG("leaving");
+
+	return PSM2_OK;
+}
+PSMI_API_DECL(psm2_error_register_handler)
+
+psm2_error_t
+MOCKABLE (psmi_handle_error)(psm2_ep_t ep, psm2_error_t error, const char *buf, ...)
+{
+	va_list argptr;
+	int syslog_level;
+	int console_print = 0;
+	psm2_error_t newerr;
+	struct psm2_error_token token;
+	char *c, fullmsg[PSM2_ERRSTRING_MAXLEN];
+	token.error = error;
+	snprintf(fullmsg, PSM2_ERRSTRING_MAXLEN - 1, "%s", buf);
+	fullmsg[PSM2_ERRSTRING_MAXLEN - 1] = '\0';
+	va_start(argptr, buf);
+	vsnprintf(token.err_string, PSM2_ERRSTRING_MAXLEN - 1, fullmsg, argptr);
+	va_end(argptr);
+	token.err_string[PSM2_ERRSTRING_MAXLEN - 1] = '\0';
+
+	/* Unless the user has set PSM3_NO_VERBOSE_ERRORS, always print errors to
+	 * console */
+	c = getenv("PSM3_NO_VERBOSE_ERRORS");
+	console_print = 0;
+	if (ep == PSMI_EP_LOGEVENT)
+		console_print = 1;
+	else if (!c || *c == '\0') {	/* no desire to prevent verbose errors */
+		/* Remove the console print if we're internally handling the error */
+		if (ep == PSMI_EP_NORETURN)
+			console_print = 0;
+		else if (ep == NULL
+			 && psmi_errhandler_global != psmi_errhandler_psm)
+			console_print = 1;
+		else if (ep != NULL && ep->errh != psmi_errhandler_psm)
+			console_print = 1;
+	}
+
+	/* Before we let the user even handle the error, send to syslog */
+	syslog_level = psmi_error_syslog_level(error);
+	if (syslog_level != PSMI_NOLOG || ep == PSMI_EP_LOGEVENT)
+		psmi_syslog(ep, console_print,
+			    ep == PSMI_EP_LOGEVENT ? LOG_NOTICE : syslog_level,
+			    "%s (err=%d)", token.err_string, error);
+
+	if (ep == PSMI_EP_LOGEVENT)	/* we're just logging */
+		newerr = PSM2_OK;
+	else if (ep == PSMI_EP_NORETURN)
+		newerr =
+		    psmi_errhandler_psm(NULL, error, token.err_string, &token);
+	else if (ep == NULL)
+		newerr =
+		    psmi_errhandler_global(NULL, error, token.err_string,
+					   &token);
+	else
+		newerr = ep->errh(ep, error, token.err_string, &token);
+
+	return newerr;
+}
+MOCK_DEF_EPILOGUE(psmi_handle_error);
+
+/* Returns the "worst" error out of errA and errB */
+psm2_error_t psmi_error_cmp(psm2_error_t errA, psm2_error_t errB)
+{
+#define _PSMI_ERR_IS(err) if (errA == (err) || errB == (err)) return (err)
+
+	/* Bad runtime or before initialization */
+	_PSMI_ERR_IS(PSM2_NO_MEMORY);
+	_PSMI_ERR_IS(PSM2_INTERNAL_ERR);
+	_PSMI_ERR_IS(PSM2_INIT_NOT_INIT);
+	_PSMI_ERR_IS(PSM2_INIT_BAD_API_VERSION);
+
+	/* Before we get an endpoint */
+	_PSMI_ERR_IS(PSM2_EP_NO_DEVICE);
+	_PSMI_ERR_IS(PSM2_EP_UNIT_NOT_FOUND);
+	_PSMI_ERR_IS(PSM2_EP_DEVICE_FAILURE);
+	_PSMI_ERR_IS(PSM2_EP_NO_PORTS_AVAIL);
+	_PSMI_ERR_IS(PSM2_TOO_MANY_ENDPOINTS);
+
+	/* As we open/close the endpoint */
+	_PSMI_ERR_IS(PSM2_EP_NO_NETWORK);
+	_PSMI_ERR_IS(PSM2_SHMEM_SEGMENT_ERR);
+	_PSMI_ERR_IS(PSM2_EP_CLOSE_TIMEOUT);
+	_PSMI_ERR_IS(PSM2_EP_INVALID_UUID_KEY);
+	_PSMI_ERR_IS(PSM2_EP_NO_RESOURCES);
+
+	/* In connect phase */
+	_PSMI_ERR_IS(PSM2_EPID_NETWORK_ERROR);
+	_PSMI_ERR_IS(PSM2_EPID_INVALID_NODE);
+	_PSMI_ERR_IS(PSM2_EPID_INVALID_CONNECT);
+	_PSMI_ERR_IS(PSM2_EPID_INVALID_PKEY);
+	_PSMI_ERR_IS(PSM2_EPID_INVALID_VERSION);
+	_PSMI_ERR_IS(PSM2_EPID_INVALID_UUID_KEY);
+	_PSMI_ERR_IS(PSM2_EPID_INVALID_MTU);
+	_PSMI_ERR_IS(PSM2_EPID_RV_CONNECT_ERROR);
+
+	/* Timeout if nothing else */
+	_PSMI_ERR_IS(PSM2_TIMEOUT);
+
+	_PSMI_ERR_IS(PSM2_EPID_RV_CONNECT_RECOVERING);
+
+	/* Last resort */
+	return max(errA, errB);
+}
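+/* For example, psmi_error_cmp(PSM2_TIMEOUT, PSM2_NO_MEMORY) returns
+ * PSM2_NO_MEMORY, because out-of-memory appears earlier in the precedence
+ * list above than the timeout fallback. */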
+
+struct psmi_error_item {
+	int syslog_level;
+	const char *error_string;
+};
+
+static
+struct psmi_error_item psmi_error_items[] = {
+	{PSMI_NOLOG, "Success"},	/*  PSM2_OK = 0, */
+	{PSMI_NOLOG, "No events were progressed in psm_poll"},	/* PSM2_OK_NO_PROGRESS = 1 */
+	{PSMI_NOLOG, "unknown 2"},
+	{PSMI_NOLOG, "Error in a function parameter"},	/* PSM2_PARAM_ERR = 3 */
+	{LOG_CRIT, "Ran out of memory"},	/* PSM2_NO_MEMORY = 4 */
+	{PSMI_NOLOG, "PSM has not been initialized"},	/* PSM2_INIT_NOT_INIT = 5 */
+	{LOG_INFO, "API version passed is incompatible"},	/* PSM2_INIT_BAD_API_VERSION = 6 */
+	{PSMI_NOLOG, "PSM Could not set affinity"},	/* PSM2_NO_AFFINITY = 7 */
+	{LOG_ALERT, "PSM Unresolved internal error"},	/* PSM2_INTERNAL_ERR = 8 */
+	{LOG_CRIT, "PSM could not set up shared memory segment"},	/* PSM2_SHMEM_SEGMENT_ERR = 9 */
+	{PSMI_NOLOG, "PSM option is a read-only option"},	/* PSM2_OPT_READONLY = 10 */
+	{PSMI_NOLOG, "Operation timed out"},	/* PSM2_TIMEOUT = 11 */
+	{LOG_INFO, "Exceeded supported amount of endpoints"},
+	/* PSM2_TOO_MANY_ENDPOINTS = 12 */
+	{PSMI_NOLOG, "PSM is in the finalized state"},	/* PSM2_IS_FINALIZED = 13 */
+	{PSMI_NOLOG, "unknown 14"},
+	{PSMI_NOLOG, "unknown 15"},
+	{PSMI_NOLOG, "unknown 16"},
+	{PSMI_NOLOG, "unknown 17"},
+	{PSMI_NOLOG, "unknown 18"},
+	{PSMI_NOLOG, "unknown 19"},
+	{PSMI_NOLOG, "Endpoint was closed"},	/* PSM2_EP_WAS_CLOSED = 20 */
+	{LOG_ALERT, "PSM Could not find an OPA Unit"},	/* PSM2_EP_NO_DEVICE = 21 */
+	{PSMI_NOLOG, "User passed a bad unit number"},	/* PSM2_EP_UNIT_NOT_FOUND = 22 */
+	{LOG_ALERT, "Failure in initializing endpoint"},	/* PSM2_EP_DEVICE_FAILURE = 23 */
+	{PSMI_NOLOG, "Error closing the endpoint"},	/* PSM2_EP_CLOSE_TIMEOUT = 24 */
+	{PSMI_NOLOG, "No free contexts could be obtained"},	/* PSM2_EP_NO_PORTS_AVAIL = 25 */
+	{LOG_ALERT, "Could not detect network connectivity"},	/* PSM2_EP_NO_NETWORK = 26 */
+	{LOG_INFO, "Invalid Unique job-wide UUID Key"},	/* PSM2_EP_INVALID_UUID_KEY = 27 */
+	{LOG_INFO, "Out of endpoint resources"},	/* PSM2_EP_NO_RESOURCES = 28 */
+	{PSMI_NOLOG, "unknown 29"},
+	{PSMI_NOLOG, "unknown 30"},
+	{PSMI_NOLOG, "unknown 31"},
+	{PSMI_NOLOG, "unknown 32"},
+	{PSMI_NOLOG, "unknown 33"},
+	{PSMI_NOLOG, "unknown 34"},
+	{PSMI_NOLOG, "unknown 35"},
+	{PSMI_NOLOG, "unknown 36"},
+	{PSMI_NOLOG, "unknown 37"},
+	{PSMI_NOLOG, "unknown 38"},
+	{PSMI_NOLOG, "unknown 39"},
+	{PSMI_NOLOG, "Unknown/unresolved connection status (other errors occurred)"},	/* PSM2_EPID_UNKNOWN = 40 */
+	{PSMI_NOLOG, "Endpoint could not be reached"},	/* PSM2_EPID_UNREACHABLE = 41 */
+	{PSMI_NOLOG, "unknown 42"},
+	{LOG_CRIT, "Invalid node (mismatch in bit width 32/64 or byte order)"},	/* PSM2_EPID_INVALID_NODE = 43 */
+	{LOG_CRIT, "Invalid MTU"},	/* PSM2_EPID_INVALID_MTU =  44 */
+	{PSMI_NOLOG, "UUID key mismatch"},	/* PSM2_EPID_INVALID_UUID_KEY = 45 */
+	{LOG_ERR, "Incompatible PSM version"},	/* PSM2_EPID_INVALID_VERSION = 46 */
+	{LOG_CRIT, "Connect received garbled connection information"},	/* PSM2_EPID_INVALID_CONNECT = 47 */
+	{PSMI_NOLOG, "Endpoint was already connected"},	/* PSM2_EPID_ALREADY_CONNECTED = 48 */
+	{LOG_CRIT, "Two or more endpoints have the same network id (LID)"},	/* PSM2_EPID_NETWORK_ERROR = 49 */
+	{LOG_CRIT, "Endpoint provided incompatible Partition Key"},
+	{LOG_CRIT, "Unable to resolve network path. Check connectivity and routing between nodes"},
+	{LOG_CRIT, "Unable to establish RV RC QP connection"}, /* PSM2_EPID_RV_CONNECT_ERROR */
+	{LOG_INFO, "Recovering RV RC QP connection"}, /* PSM2_EPID_RV_CONNECT_RECOVERING */
+	{PSMI_NOLOG, "unknown 54"},
+	{PSMI_NOLOG, "unknown 55"},
+	{PSMI_NOLOG, "unknown 56"},
+	{PSMI_NOLOG, "unknown 57"},
+	{PSMI_NOLOG, "unknown 58"},
+	{PSMI_NOLOG, "unknown 59"},
+	{PSMI_NOLOG, "MQ Non-blocking request is incomplete"},	/* PSM2_MQ_NO_COMPLETIONS = 60 */
+	{PSMI_NOLOG, "MQ Message has been truncated at the receiver"},	/* PSM2_MQ_TRUNCATION = 61 */
+	{PSMI_NOLOG, "unknown 62"},
+	{PSMI_NOLOG, "unknown 63"},
+	{PSMI_NOLOG, "unknown 64"},
+	{PSMI_NOLOG, "unknown 65"},
+	{PSMI_NOLOG, "unknown 66"},
+	{PSMI_NOLOG, "unknown 67"},
+	{PSMI_NOLOG, "unknown 68"},
+	{PSMI_NOLOG, "unknown 69"},
+	{PSMI_NOLOG, "Invalid AM reply"},
+	{PSMI_NOLOG, "unknown 71"},
+	{PSMI_NOLOG, "unknown 72"},
+	{PSMI_NOLOG, "unknown 73"},
+	{PSMI_NOLOG, "unknown 74"},
+	{PSMI_NOLOG, "unknown 75"},
+	{PSMI_NOLOG, "unknown 76"},
+	{PSMI_NOLOG, "unknown 77"},
+	{PSMI_NOLOG, "unknown 78"},
+	{PSMI_NOLOG, "unknown 79"},
+	{PSMI_NOLOG, "unknown 80"},
+};
+
+const char *__psm2_error_get_string(psm2_error_t error)
+{
+	PSM2_LOG_MSG("entering");
+	if (error >= PSM2_ERROR_LAST) {
+		PSM2_LOG_MSG("leaving");
+		return "unknown";
+	}
+	else {
+		PSM2_LOG_MSG("leaving");
+		return psmi_error_items[error].error_string;
+	}
+}
+PSMI_API_DECL(psm2_error_get_string)
+
+int psmi_error_syslog_level(psm2_error_t error)
+{
+	if (error >= PSM2_ERROR_LAST)
+		return PSMI_NOLOG;
+	else
+		return psmi_error_items[error].syslog_level;
+}
diff --git a/deps/libfabric/prov/psm3/psm3/psm_error.h b/deps/libfabric/prov/psm3/psm3/psm_error.h
new file mode 100644
index 0000000000000000000000000000000000000000..c986ea0bd721e98718922ca66d89756ea3442e0d
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_error.h
@@ -0,0 +1,105 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+#include "psm2_mock_testing.h"
+
+#ifndef _PSMI_IN_USER_H
+#error psm_error.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_ERROR_H
+#define _PSMI_ERROR_H
+
+#define PSMI_EP_NONE		    (NULL)
+#define PSMI_EP_NORETURN	    ((psm2_ep_t) -2)
+#define PSMI_EP_LOGEVENT	    ((psm2_ep_t) -3)
+
+extern psm2_ep_errhandler_t psmi_errhandler_global;
+
+//
+// psmi_handle_error has a 1st argument which controls how it behaves.
+// PSMI_EP_NORETURN – unconditionally outputs the message and exits or aborts
+//		the process.
+// other values – behavior is controlled by how the psm2 error handler has
+//		been set via the PSM API.  The OFI/psm3 provider disables the
+//		error handler, so these will be silent.
+//
+// To have PSM stop immediately with a message, use
+//		psmi_handle_error(PSMI_EP_NORETURN, ...).
+// All other uses of psmi_handle_error are under the control of the middleware
+// or OFI provider.  The OFI provider turns all of them off.
+//
+// To have a message be unconditionally output for all builds, regardless of
+// env variables, use _HFI_ERROR or _HFI_UNIT_ERROR.
+// All other logging macros are under the control of the user via env
+// variables, and build options can disable them.
+//
+// Other logging calls are only enabled if _HFI_DEBUGGING is defined,
+// in which case _HFI_INFO is also enabled by default (but env can disable
+// it).  All others are controlled by env variable.
+//
+// Currently opa_debug.h always defines _HFI_DEBUGGING and is included by
+// opa_udebug.h, so logging is presently enabled in all builds.  At some point
+// we may want to explore a performance optimization and disable logging
+// macros for lower-level debug messages in non-debug builds.
+psm2_error_t MOCKABLE(psmi_handle_error)(psm2_ep_t ep, psm2_error_t error,
+			      const char *buf, ...)
+			      __attribute__((format(printf, 3, 4)));
+MOCK_DCL_EPILOGUE(psmi_handle_error);
+
+psm2_error_t psmi_error_cmp(psm2_error_t errA, psm2_error_t errB);
+int psmi_error_syslog_level(psm2_error_t error);
+
+#endif /* _PSMI_ERROR_H */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_gdrcpy.h b/deps/libfabric/prov/psm3/psm3/psm_gdrcpy.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb245be41902abfcb0df85e67d940256a32ec1f2
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_gdrcpy.h
@@ -0,0 +1,77 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2018 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2018 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2018 Intel Corporation. All rights reserved. */
+#ifndef GDR_CPY_H
+#define GDR_CPY_H
+#ifdef PSM_CUDA
+
+#include "ptl_ips/ips_proto.h"
+
+#define GDR_FD get_gdr_fd()
+
+int get_gdr_fd();
+
+void hfi_gdr_open();
+
+void hfi_gdr_close();
+
+// flags=0 for send, 1 for recv
+void *
+gdr_convert_gpu_to_host_addr(int gdr_fd, unsigned long buf,
+				size_t size, int flags,
+				psm2_ep_t ep);
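+
+// A hypothetical call (illustration only; gpu_buf, len and ep are assumed
+// caller-side names), mapping a GPU send buffer so host-side protocol code
+// can copy from it:
+//	void *host = gdr_convert_gpu_to_host_addr(GDR_FD, (unsigned long)gpu_buf,
+//						  len, 0 /* send */, ep);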
+
+
+#endif
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/hfi1_deprecated_gen1.h b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/hfi1_deprecated_gen1.h
new file mode 100644
index 0000000000000000000000000000000000000000..2e64b4708858a3a80917ff2d824a0cb70d364ebe
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/hfi1_deprecated_gen1.h
@@ -0,0 +1,69 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/*
+
+  hfi1_deprecated_gen1.h
+
+  Contains certain features of the hfi1 module that have been deprecated.
+
+  These features may still need to be supported by the psm library for
+  reasons of backwards compatibility.
+ */
+
+#ifndef __HFI1_DEPRECATED_GEN1_H__
+
+#define __HFI1_DEPRECATED_GEN1_H__
+
+
+#endif /* #ifndef __HFI1_DEPRECATED_GEN1_H__ */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_common_gen1.h b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_common_gen1.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbe8e3e6c3514b399447278166955c959b63b5d3
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_common_gen1.h
@@ -0,0 +1,61 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef OPA_COMMON_GEN1_H
+#define OPA_COMMON_GEN1_H
+
+#include "hfi1_deprecated_gen1.h"
+
+#endif /* OPA_COMMON_GEN1_H */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_i2cflash_gen1.c b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_i2cflash_gen1.c
new file mode 100644
index 0000000000000000000000000000000000000000..b7628ca20c9e9efc4e9d2c04e2c61e35b3a1dc79
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_i2cflash_gen1.c
@@ -0,0 +1,64 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+
diff --git a/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_proto_gen1.c b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_proto_gen1.c
new file mode 100644
index 0000000000000000000000000000000000000000..6cb1e8c15cbff5de4d73917e191c257f103f06a9
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_proto_gen1.c
@@ -0,0 +1,77 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* This file contains the initialization functions used by the low
+   level hfi protocol code. */
+
+#include <sys/poll.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <malloc.h>
+
+#include "opa_user_gen1.h"
+#include "opa_udebug.h"
+
+#include <sched.h>
+
+size_t arrsz[MAPSIZE_MAX] = { 0 };
+
+
diff --git a/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_service_gen1.c b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_service_gen1.c
new file mode 100644
index 0000000000000000000000000000000000000000..f1572ad7601d942b3c29e114270903cb7a504713
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_service_gen1.c
@@ -0,0 +1,471 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2018 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2018 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* This file contains hfi service routine interface used by the low
+   level hfi protocol code. */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <ctype.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <poll.h>
+#include "opa_service_gen1.h"
+#include "psmi_wrappers.h"
+#include "psm_netutils.h"
+
+#define HFI_UD_NUM_CONTEXTS 	1024
+#define HFI_UD_NUM_FREE_CTXTS 	1024
+
+
+
+
+
+
+
+
+#ifdef PSM2_SUPPORT_IW_CMD_API
+ustatic
+int _hfi_cmd_ioctl(int fd, struct hfi1_cmd *cmd, size_t count)
+{
+	uint64_t addrOrLiteral[2] = { (uint64_t)cmd->addr, (uint64_t)&cmd->addr };
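+	/* addrOrLiteral[0] passes the value of cmd->addr (typically a pointer
+	 * to a user buffer) straight through as the ioctl argument, while
+	 * addrOrLiteral[1] passes a pointer to the cmd->addr field itself;
+	 * the table below selects which form each command uses */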
+	static const struct
+	{
+		unsigned int ioctlCmd;
+		unsigned int addrOrLiteralIdx;
+	} cmdTypeToIoctlNum[PSMI_HFI_CMD_LAST] = {
+        [PSMI_HFI_CMD_ASSIGN_CTXT]      = {HFI1_IOCTL_ASSIGN_CTXT   , 0},
+        [PSMI_HFI_CMD_CTXT_INFO]        = {HFI1_IOCTL_CTXT_INFO     , 0},
+        [PSMI_HFI_CMD_USER_INFO]        = {HFI1_IOCTL_USER_INFO     , 0},
+        [PSMI_HFI_CMD_TID_UPDATE]       = {HFI1_IOCTL_TID_UPDATE    , 0},
+        [PSMI_HFI_CMD_TID_FREE]         = {HFI1_IOCTL_TID_FREE      , 0},
+        [PSMI_HFI_CMD_CREDIT_UPD]       = {HFI1_IOCTL_CREDIT_UPD    , 1},
+        [PSMI_HFI_CMD_RECV_CTRL]        = {HFI1_IOCTL_RECV_CTRL     , 1},
+        [PSMI_HFI_CMD_POLL_TYPE]        = {HFI1_IOCTL_POLL_TYPE     , 1},
+        [PSMI_HFI_CMD_ACK_EVENT]        = {HFI1_IOCTL_ACK_EVENT     , 1},
+        [PSMI_HFI_CMD_SET_PKEY]         = {HFI1_IOCTL_SET_PKEY      , 1},
+        [PSMI_HFI_CMD_CTXT_RESET]       = {HFI1_IOCTL_CTXT_RESET    , 1},
+        [PSMI_HFI_CMD_TID_INVAL_READ]   = {HFI1_IOCTL_TID_INVAL_READ, 0},
+        [PSMI_HFI_CMD_GET_VERS]         = {HFI1_IOCTL_GET_VERS      , 1},
+#ifdef PSM_CUDA
+	[PSMI_HFI_CMD_TID_UPDATE_V2]	= {HFI1_IOCTL_TID_UPDATE_V2 , 0},
+#endif
+    };
+
+	if (cmd->type < PSMI_HFI_CMD_LAST)
+		return psmi_ioctl(fd,
+			     cmdTypeToIoctlNum[cmd->type].ioctlCmd,
+			     addrOrLiteral[cmdTypeToIoctlNum[cmd->type].addrOrLiteralIdx]);
+	else
+	{
+		errno = EINVAL;
+		return -1;
+	}
+}
+#endif /* #ifdef PSM2_SUPPORT_IW_CMD_API */
+
+/* We use mmap64() because we compile in both 32 and 64 bit mode,
+   and we have to map physical addresses that are > 32 bits long.
+   While Linux implements mmap64, it doesn't have a man page,
+   and isn't declared in any header file, so we declare it here ourselves.
+
+   We'd like to just use -D_LARGEFILE64_SOURCE to make off_t 64 bits and
+   redirect mmap to mmap64 for us, but at least through suse10 and fc4,
+   it doesn't work when the address being mapped is > 32 bits.  It chips
+   off bits 32 and above.  So we stay with mmap64. */
+void *hfi_mmap64(void *addr, size_t length, int prot, int flags, int fd,
+		 __off64_t offset)
+{
+	return mmap64(addr, length, prot, flags, fd, offset);
+}
+
+/* get the number of units supported by the driver.  Does not guarantee */
+/* that a working chip has been found for each possible unit #. */
+/* number of units >=0 (0 means none found). */
+/* formerly used sysfs file "num_units" */
+int hfi_get_num_units(void)
+{
+	int ret = 0;
+
+	while (1) {
+		char pathname[PATH_MAX];
+		struct stat st;
+		int r;
+
+		snprintf(pathname, sizeof(pathname), "/dev/infiniband/uverbs%d", ret);
+		r = stat(pathname, &st);
+		if (r) break;
+
+		ret++;
+	}
+	return ret;
+}
+
+/* Given a unit number, returns 1 if any port on the unit is active.
+   returns 0 if no port on the unit is active.
+   returns -1 when an error occurred. */
+int hfi_get_unit_active(int unit)
+{
+	int p, lid;
+
+	for (p = HFI_MIN_PORT; p <= HFI_MAX_PORT; p++) {
+		lid = hfi_get_port_lid(unit, p);
+		if (lid > 0 && lid != 0xFFFF)
+			break;
+	}
+
+	if (p <= HFI_MAX_PORT)
+	{
+		return 1;
+	}
+
+	return lid;
+}
+
+/* get the number of contexts from the unit id. */
+/* Returns 0 if no unit or no match. */
+int hfi_get_num_contexts(int unit_id)
+{
+	return HFI_UD_NUM_CONTEXTS;
+}
+
+/* Given a unit number and port number, returns 1 if the unit and port are active.
+   returns 0 if the unit and port are not active.
+   returns -1 when an error occurred. */
+int hfi_get_port_active(int unit, int port)
+{
+	int ret;
+	char *state;
+	ret = hfi_sysfs_port_read(unit, port, "phys_state", &state);
+	if (ret == -1) {
+		if (errno == ENODEV)
+			/* this is "normal" for port != 1, on single port chips */
+			_HFI_VDBG
+			    ("Failed to get phys_state for unit %u:%u: %s\n",
+			     unit, port, strerror(errno));
+		else
+			_HFI_DBG
+			    ("Failed to get phys_state for unit %u:%u: %s\n",
+			     unit, port, strerror(errno));
+		return -1;
+	} else {
+		if (strncmp(state, "5: LinkUp", 9)) {
+			_HFI_DBG("Link is not Up for unit %u:%u\n", unit, port);
+			free(state);
+			return 0;
+		}
+		free(state);
+		return 1;
+	}
+}
+
+/* Given the unit number, return an error, or the corresponding LID.
+   For now, it's used only so the MPI code can determine its own
+   LID, and which other LIDs (if any) are also assigned to this node.
+   Returns an int, so -1 indicates an error.  0 may indicate that
+   the unit is valid, but no LID has been assigned.
+   No error print because we call this for both potential
+   ports without knowing if both ports exist (or are connected) */
+/* This routine is used in many places, such as get_unit_active, to
+ * confirm the port is usable.  As such it includes additional checks that
+ * the port is active and for link_layer ethernet that it includes a RoCE
+ * IPv4 GID whose subnet can be identified
+ */
+int hfi_get_port_lid(int unit, int port)
+{
+	int ret = 0;
+	int64_t val = 0;
+
+	if (hfi_get_port_active(unit,port) != 1)
+		return -2;
+	ret = hfi_sysfs_port_read_s64(unit, port, "lid", &val, 0);
+	_HFI_VDBG("ret %d, unit %d port %d lid %ld\n", ret, unit,
+		  port, (long int)val);
+
+	if (ret == -1) {
+		if (errno == ENODEV)
+			/* this is "normal" for port != 1, on single port chips */
+			_HFI_VDBG("Failed to get LID for unit %u:%u: %s\n",
+				  unit, port, strerror(errno));
+		else
+			_HFI_DBG("Failed to get LID for unit %u:%u: %s\n",
+				 unit, port, strerror(errno));
+	} else {
+		char *link_lyr;
+		ret = hfi_sysfs_port_read(unit, port, "link_layer", &link_lyr);
+		if (ret == -1) {
+			if (errno == ENODEV)
+				/* this is "normal" for port != 1, on single port chips */
+				_HFI_VDBG("Failed to get link_layer for unit %u:%u: %s\n",
+					  unit, port, strerror(errno));
+			else
+				_HFI_DBG("Failed to get link_layer for unit %u:%u: %s\n",
+					 unit, port, strerror(errno));
+		} else {
+			_HFI_VDBG("ret %d, unit %d port %d link_layer %s\n",
+					  ret, unit, port, link_lyr);
+
+			/* If this port is an Ethernet Port lid does not matter, return 1 */
+			if (strncmp(link_lyr, "Ethernet", strlen("Ethernet")) == 0) {
+				uint64_t subnet, hi;
+				if (0 != hfi_get_port_subnet(unit, port,
+								&subnet, NULL, NULL, NULL, NULL, &hi, NULL)) {
+					_HFI_DBG("Failed to get subnet for unit %u:%u: %s\n",
+						unit, port, strerror(errno));
+					ret = -1;
+				} else if (subnet == hi) {
+					_HFI_DBG("Skipping unit %u:%u: no RoCE IPv4 GID\n",
+						 unit, port);
+					ret = -1;
+				} else
+					ret = 1;	// for RoCE LID does not matter, return 1
+			} else
+				ret = val;	// OPA/IB LID we got
+			free(link_lyr);
+		}
+	}
+
+	return ret;
+}
+
+/* Given the unit number, return an error, or the corresponding GID
+   For now, it's used only so the MPI code can determine its fabric ID.
+   Returns an int, so -1 indicates an error.
+   No error print because we call this for both potential
+   ports without knowing if both ports exist (or are connected) */
+static int hfi_get_port_gid(int unit, int port, int idx, uint64_t *hi, uint64_t *lo)
+{
+	int ret;
+	char *gid_str = NULL;
+	char attr_str[64];
+
+	snprintf(attr_str, 64, "gids/%d", idx < 0 ? 0 : idx);
+	ret = hfi_sysfs_port_read(unit, port, attr_str, &gid_str);
+	if (ret == -1) {
+		if (errno == ENODEV)
+			/* this is "normal" for port != 1, on single
+			 * port chips */
+			_HFI_VDBG("Failed to get GID for unit %u:%u: %s\n",
+				  unit, port, strerror(errno));
+		else
+			_HFI_DBG("Failed to get GID for unit %u:%u: %s\n",
+				 unit, port, strerror(errno));
+	} else {
+		uint32_t gid[8] = {0};
+		if (sscanf(gid_str, "%4x:%4x:%4x:%4x:%4x:%4x:%4x:%4x",
+			   &gid[0], &gid[1], &gid[2], &gid[3],
+			   &gid[4], &gid[5], &gid[6], &gid[7]) != 8) {
+			_HFI_DBG("Failed to parse GID for unit %u:%u: %s\n",
+				 unit, port, gid_str);
+			ret = -1;
+		} else {
+			*hi = (((uint64_t) gid[0]) << 48)
+				| (((uint64_t) gid[1]) << 32)
+				| (((uint64_t) gid[2]) << 16)
+				| (((uint64_t) gid[3]) << 0);
+			*lo = (((uint64_t) gid[4]) << 48)
+				| (((uint64_t) gid[5]) << 32)
+				| (((uint64_t) gid[6]) << 16)
+				| (((uint64_t) gid[7]) << 0);
+		}
+		free(gid_str);
+	}
+
+	return ret;
+}
+int hfi_get_unit_cpumask(int unit, cpu_set_t *cpuset)
+{
+	int ret = -1;
+	char *cpulist;
+
+	CPU_ZERO(cpuset);
+
+	ret = hfi_sysfs_unit_read(unit, "device/local_cpulist", &cpulist);
+	if (ret == -1) {
+		_HFI_VDBG("Failed to get cpu list for unit %u: %s\n",
+				  unit, strerror(errno));
+	} else {
+		int i = 0;
+		char *next_comma = NULL;
+		char *temp = cpulist;
+		char *dash;
+		int first = -1, last = -1;
+
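+		/* device/local_cpulist is a comma-separated list of CPU ids
+		 * and ranges, e.g. "0-7,16-23"; expand every entry into the
+		 * caller's cpuset */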
+		do {
+			next_comma = strchr(temp, ',');
+			dash = strchr(temp, '-');
+
+			first = atoi(temp);
+
+			if (dash == NULL || (dash > next_comma && next_comma != NULL)) {
+				last = first;
+			} else {
+				last = atoi(dash + 1);
+			}
+
+			for (i = first; i <= last; i++) {
+				CPU_SET(i, cpuset);
+				ret++;
+			}
+
+			temp = next_comma + 1;
+		} while (next_comma != NULL);
+
+		free(cpulist);
+	}
+
+	return (ret >= 0 ? 0 : -1);
+}
+
+/* Given the unit number, return an error, or the corresponding subnet
+   For IB/OPA the subnet is the hi 64b of the 1st GID
+		addr is the low 64b of the gid, ip_addr and netmask are N/A (0)
+   For Ethernet it's the IPv4 subnet derived from the 1st RoCE IPv4 GID
+		subnet is the upper portion of the ip_addr (& netmask)
+		addr is the lower portion of the ip_addr (& ~netmask)
+		and ip_addr and netmask are returned
+   In all cases, idx, hi and lo are the actual gid
+   All values are in host byte order
+   Returns an int, so -1 indicates an error.
+   No error print because we call this for both potential
+   ports without knowing if both ports exist (or are connected) */
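+/* Worked example (hypothetical values): a RoCEv2 GID ::ffff:10.1.2.3 on a
+   255.255.255.0 netmask yields ip_addr 10.1.2.3, netmask 255.255.255.0,
+   subnet 10.1.2.0 and addr 0.0.0.3; for an IB/OPA GID such as fe80::2,
+   subnet is the high 64 bits (0xfe80000000000000) and addr the low 64
+   bits (0x2). */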
+int hfi_get_port_subnet(int unit, int port, uint64_t *subnet, uint64_t *addr,
+					uint32_t *ip_addr, uint32_t *netmask,
+					int *idx, uint64_t *hi, uint64_t *lo)
+{
+	int i;
+	int have_subnet = 0;
+	uint64_t gid_hi, gid_lo;
+
+	for (i = 0; ; i++) {
+		if (-1 == hfi_get_port_gid(unit, port, i, &gid_hi, &gid_lo))
+			break; // stop at 1st non-existent gid (or non-existent port)
+		if (gid_lo == 0) // Skip over empty gid table entries.
+			continue;
+		if (! have_subnet) {
+			// save 1st valid gid, this is answer unless we find eth
+			if (idx) *idx = i;
+			if (subnet) *subnet = gid_hi;
+			if (addr) *addr = gid_lo;
+			if (ip_addr) *ip_addr = 0;
+			if (netmask) *netmask = 0;
+			if (hi) *hi = gid_hi;
+			if (lo) *lo = gid_lo;
+			have_subnet = 1;
+		}
+		// RoCEv2 Gid => ::ffff:<ipv4>
+		if (gid_hi == 0x0 && (gid_lo >> 32) == 0x0000ffff) {
+			uint32_t ipaddr = (uint32_t)(gid_lo & 0xffffffff);
+			__be32 mask = 0;
+			if (!psmi_get_eth_netmask(__cpu_to_be32(ipaddr), &mask)) {
+				// stop at 1st valid ethernet gid
+				uint32_t nm = __be32_to_cpu(mask);
+				if (idx) *idx = i;
+				if (subnet) *subnet = ipaddr & nm;
+				if (addr) *addr = ipaddr & ~nm;
+				if (ip_addr) *ip_addr = ipaddr;
+				if (netmask) *netmask = nm;
+				if (hi) *hi = gid_hi;
+				if (lo) *lo = gid_lo;
+				break;
+			} else {
+				return -1;	// we're stuck, can't figure out netmask
+			}
+		}
+	}
+	return (have_subnet?0:-1);
+}
+
+
+/* Given the unit number, return an error, or the corresponding link rate
+   for the port */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_rate(int unit, int port)
+{
+	int ret;
+	double rate;
+	char *data_rate = NULL, *newptr;
+
+	ret = hfi_sysfs_port_read(unit, port, "rate", &data_rate);
+	if (ret == -1)
+		goto get_port_rate_error;
+	else {
+		rate = strtod(data_rate, &newptr);
+		if ((rate == 0) && (data_rate == newptr))
+			goto get_port_rate_error;
+	}
+
+	free(data_rate);
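+	/* sysfs "rate" looks like e.g. "100 Gb/sec (4X EDR)"; strtod parsed
+	 * the leading number above, and the return truncates any fractional
+	 * part of that rate */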
+	return ((int)(rate * 2) >> 1);
+
+get_port_rate_error:
+	_HFI_INFO("Failed to get link rate for unit %u:%u: %s\n",
+		  unit, port, strerror(errno));
+
+	return ret;
+}
+
+
+
+
+
+
+
diff --git a/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_service_gen1.h b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_service_gen1.h
new file mode 100644
index 0000000000000000000000000000000000000000..c9fbb4e6fb9dc85f85da753f46a59d24bc57d156
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_service_gen1.h
@@ -0,0 +1,181 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef OPA_SERVICE_GEN1_H
+#define OPA_SERVICE_GEN1_H
+
+/* This file contains all the lowest level routines calling into sysfs */
+/* and qib driver. All other calls are based on these routines. */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE             /* See feature_test_macros(7) */
+#endif
+#include <sched.h>              /* cpu_set_t and CPU_* MACROs */
+#include <libgen.h>
+
+#include "opa_intf.h"
+#include "opa_common_gen1.h"
+#include "opa_udebug.h"
+#include "opa_byteorder.h"
+
+/* upper and lower bounds for HFI port numbers */
+#define HFI_MIN_PORT 1
+#define HFI_MAX_PORT 1
+#ifndef HFI_NUM_PORTS_GEN1
+#define HFI_NUM_PORTS_GEN1 (HFI_MAX_PORT - HFI_MIN_PORT + 1)
+#endif
+/* any unit id to match. */
+#define PSM3_NIC_ANY ((long)-1)
+/* any port num to match. */
+#define PSM3_NIC_PORT_ANY ((long)0)
+
+
+/* Given a unit number and port number, returns 1 if the unit and port are active.
+   returns 0 if the unit and port are not active. returns -1 when an error occurred. */
+int hfi_get_port_active(int, int);
+
+/* Given the unit number and port, return an error, or the corresponding LID */
+/* Returns an int, so -1 indicates a general error.  -2 indicates that the unit/port
+   are not active.  0 indicates that the unit is valid, but no LID has been assigned. */
+int hfi_get_port_lid(int, int);
+
+/* Given a unit number, return an error, or the corresponding cpuset. */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_unit_cpumask(int unit, cpu_set_t *cpuset);
+
+/* Given the unit number and port, return an error, or the corresponding */
+/* subnet, addr and gid.  For ethernet uses 1st IPv4 RoCE gid. */
+/* For IB/OPA uses 1st valid gid */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_subnet(int unit, int port, uint64_t *subnet, uint64_t *addr,
+		uint32_t *ip_addr, uint32_t *netmask,
+		int *idx, uint64_t *hi, uint64_t *lo);
+
+
+/* Given the unit number, return an error, or the corresponding link rate
+   for the port */
+/* Returns an int, so -1 indicates an error. */
+int hfi_get_port_rate(int unit, int port);
+
+
+/* Get the number of units supported by the driver.  Does not guarantee
+   that a working chip has been found for each possible unit #.
+   Returns the number of units >=0 (0 means none found). */
+int hfi_get_num_units(void);
+
+/* Given a unit number, returns 1 if any port on the unit is active.
+   returns 0 if no port on the unit is active.
+   returns -1 when an error occurred. */
+int hfi_get_unit_active(int unit);
+
+/* get the number of contexts from the unit id. */
+int hfi_get_num_contexts(int unit);
+
+
+/* We use mmap64() because we compile in both 32 and 64 bit mode,
+   and we have to map physical addresses that are > 32 bits long.
+   While Linux implements mmap64, it doesn't have a man page,
+   and isn't declared in any header file, so we declare it here ourselves. */
+
+/* We'd like to just use -D_LARGEFILE64_SOURCE to make off_t 64 bits and
+   redirect mmap to mmap64 for us, but at least through suse10 and fc4,
+   it doesn't work when the address being mapped is > 32 bits.  It chips
+   off bits 32 and above.  So we stay with mmap64. */
+extern void *mmap64(void *, size_t, int, int, int, __off64_t);
+void *hfi_mmap64(void *, size_t, int, int, int, __off64_t);
+
+/* Statistics maintained by the driver */
+int hfi_get_stats(uint64_t *, int);
+int hfi_get_stats_names(char **namep);
+/* Counters maintained in the chip, globally, and per-prot */
+int hfi_get_ctrs_unit(int unitno, uint64_t *, int);
+int hfi_get_ctrs_unit_names(int unitno, char **namep);
+int hfi_get_ctrs_port(int unitno, int port, uint64_t *, int);
+int hfi_get_ctrs_port_names(int unitno, char **namep);
+
+/* sysfs helper routines (only those currently used are exported;
+ * try to avoid using others) */
+
+const char *sysfs_unit_path(int unit_id);
+
+/* read a string value */
+int hfi_sysfs_port_read(uint32_t unit, uint32_t port, const char *attr,
+			char **datap);
+
+/* read a string value into buff, no more than size bytes.
+   returns the number of bytes read */
+size_t hfi_sysfs_unit_port_read(uint32_t unit, uint32_t port, const char *attr,
+			char *buff, size_t size);
+
+/* open attribute in unit's sysfs directory via open(2) */
+int hfi_sysfs_unit_open(uint32_t unit, const char *attr, int flags);
+int hfi_sysfs_port_open(uint32_t unit, uint32_t port, const char *attr,
+			int flags);
+
+int hfi_sysfs_unit_read(uint32_t unit, const char *attr, char **datap);
+
+/* print to attribute in {unit,port} sysfs directory */
+int hfi_sysfs_port_printf(uint32_t unit, uint32_t port, const char *attr,
+			  const char *fmt, ...)
+			  __attribute__((format(printf, 4, 5)));
+int hfi_sysfs_unit_printf(uint32_t unit, const char *attr, const char *fmt, ...)
+			  __attribute__((format(printf, 3, 4)));
+
+/* read a signed 64-bit quantity, in some arbitrary base */
+int hfi_sysfs_unit_read_s64(uint32_t unit, const char *attr,
+			    int64_t *valp, int base);
+int hfi_sysfs_port_read_s64(uint32_t unit, uint32_t port, const char *attr,
+			    int64_t *valp, int base);
+int64_t hfi_sysfs_unit_read_node_s64(uint32_t unit);
+
+#endif /* OPA_SERVICE_GEN1_H */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_user_gen1.h b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_user_gen1.h
new file mode 100644
index 0000000000000000000000000000000000000000..49e786f73845545e83a3845e091aea65fc9aae25
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_user_gen1.h
@@ -0,0 +1,294 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef OPA_USER_GEN1_H
+#define OPA_USER_GEN1_H
+
+/* This file contains all of the data structures and routines that are
+   publicly visible and usable (to low level infrastructure code; it is
+   not expected that any application, or even normal application-level library,
+   will ever need to use any of this).
+
+   Additional entry points and data structures that are used by these routines
+   may be referenced in this file, but they should not be generally available;
+   they are visible here only to allow use in inlined functions.  Any variable,
+   data structure, or function that starts with a leading "_" is in this
+   category.
+*/
+
+/* Include header files we need that are unlikely to otherwise be needed by */
+/* programs. */
+#include <stddef.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/user.h>
+#include <syslog.h>
+#include <stdbool.h>
+#include "opa_intf.h"
+#include "opa_common_gen1.h"
+#include "opa_byteorder.h"
+#include "opa_udebug.h"
+#include "opa_service_gen1.h"
+#include "opa_user.h"
+
+#define HFI_RHF_USE_EGRBFR_MASK 0x1
+#define HFI_RHF_USE_EGRBFR_SHIFT 15
+#define HFI_RHF_EGRBFR_INDEX_MASK 0x7FF
+#define HFI_RHF_EGRBFR_INDEX_SHIFT 16
+
+#define HFI_RHF_SEQ_MASK 0xF
+#define HFI_RHF_SEQ_SHIFT 28
+#define HFI_RHF_EGRBFR_OFFSET_MASK 0xFFF
+#define HFI_RHF_EGRBFR_OFFSET_SHIFT 0
+#define HFI_RHF_HDRQ_OFFSET_MASK 0x1FF
+#define HFI_RHF_HDRQ_OFFSET_SHIFT 12
+#define HFI_RHF_TIDERR     0x08000000
+
+/* TidFlow related bits */
+#define HFI_TF_SEQNUM_SHIFT                 0
+#define HFI_TF_SEQNUM_MASK                  0x7ff
+
+#define HFI_TF_GENVAL_SHIFT                 11
+#define HFI_TF_GENVAL_MASK                  0xfffff
+
+#define HFI_TF_FLOWVALID_SHIFT              32
+#define HFI_TF_FLOWVALID_MASK               0x1
+
+#define HFI_TF_KEEP_AFTER_SEQERR_SHIFT      34
+#define HFI_TF_KEEP_AFTER_SEQERR_MASK       0x1
+#define HFI_TF_KEEP_ON_GENERR_SHIFT         35
+#define HFI_TF_KEEP_ON_GENERR_MASK          0x1
+#define HFI_TF_KEEP_PAYLOAD_ON_GENERR_SHIFT 36
+#define HFI_TF_KEEP_PAYLOAD_ON_GENERR_MASK  0x1
+#define HFI_TF_STATUS_SEQMISMATCH_SHIFT     37
+#define HFI_TF_STATUS_SEQMISMATCH_MASK      0x1
+#define HFI_TF_STATUS_GENMISMATCH_SHIFT     38
+#define HFI_TF_STATUS_GENMISMATCH_MASK      0x1
+
+/* PBC bits */
+#define HFI_PBC_STATICRCC_SHIFT         0
+#define HFI_PBC_STATICRCC_MASK          0xffff
+
+#define HFI_PBC_SC4_SHIFT               4
+#define HFI_PBC_SC4_MASK                0x1
+
+#define HFI_PBC_INTR_SHIFT              31
+#define HFI_PBC_DCINFO_SHIFT            30
+#define HFI_PBC_TESTEBP_SHIFT           29
+#define HFI_PBC_PACKETBYPASS_SHIFT      28
+#define HFI_PBC_INSERTHCRC_SHIFT        26
+#define HFI_PBC_INSERTHCRC_MASK         0x3
+#define HFI_PBC_CREDITRETURN_SHIFT      25
+#define HFI_PBC_INSERTBYPASSICRC_SHIFT  24
+#define HFI_PBC_TESTBADICRC_SHIFT       23
+#define HFI_PBC_FECN_SHIFT              22
+#define HFI_PBC_VL_SHIFT                12
+#define HFI_PBC_VL_MASK                 0xf
+#define HFI_PBC_LENGTHDWS_SHIFT         0
+#define HFI_PBC_LENGTHDWS_MASK          0xfff
+
+/* this portion only defines what we currently use */
+struct hfi_pbc {
+	__u32 pbc0;
+	__u16 PbcStaticRateControlCnt;
+	__u16 fill1;
+};
+
+typedef enum mapsize
+{	SC_CREDITS,
+	PIO_BUFBASE_SOP,
+	PIO_BUFBASE,
+	RCVHDR_BUFBASE,
+	RCVEGR_BUFBASE,
+	SDMA_COMP_BUFBASE,
+	USER_REGBASE,
+	RCVHDRTAIL_BASE,
+	EVENTS_BUFBASE,
+	STATUS_BUFBASE,
+	SUBCTXT_UREGBASE,
+	SUBCTXT_RCVHDRBUF,
+	SUBCTXT_RCVEGRBUF,
+	MAPSIZE_MAX
+} mapsize_t;
+
+/* TODO: consider casting in the ALIGN() macro */
+#define ALIGN(x, a)				(((x)+(a)-1)&~((a)-1))
+#define ALIGNDOWN_PTR(x, a)			((void*)(((uintptr_t)(x))&~((uintptr_t)((a)-1))))
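+/* illustrative values (not part of the original source): when a is a power
+ * of two, ALIGN() rounds up and ALIGNDOWN_PTR() rounds down, e.g.
+ *   ALIGN(5, 4) == 8, ALIGN(8, 4) == 8,
+ *   ALIGNDOWN_PTR((void *)0x1003, 0x1000) == (void *)0x1000
+ */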
+
+/* using the same flags for all the mappings */
+#define HFI_MMAP_FLAGS				(MAP_SHARED|MAP_LOCKED)
+#define HFI_MMAP_PGSIZE				sysconf(_SC_PAGESIZE)
+/* cast to uintptr_t as opposed to intptr_t, which evaluates to a signed type
+ * on which one should not perform bitwise operations (undefined behavior)
+ */
+#define HFI_MMAP_PGMASK				(~(uintptr_t)(HFI_MMAP_PGSIZE-1))
+
+/* this is only an auxiliary macro for HFI_MMAP_ERRCHECK()
+ * @off expected to be unsigned in order to AND with the page mask and avoid undefined behavior
+ */
+#define U64_TO_OFF64_PGMASK(off)		((__off64_t)((off) & HFI_MMAP_PGMASK))
+
+#define HFI_MMAP_ALIGNOFF(fd, off, size, prot)	hfi_mmap64(0,(size),(prot),HFI_MMAP_FLAGS,(fd),U64_TO_OFF64_PGMASK((off)))
+/* complementary */
+#define HFI_MUNMAP(addr, size)			munmap((addr), (size))
+
+/* make sure uintmax_t can hold the result of unsigned int multiplication */
+#if UINT_MAX > (UINTMAX_MAX / UINT_MAX)
+#error We cannot safely multiply unsigned integers on this platform
+#endif
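+/* e.g. on an LP64 platform UINT_MAX is 2^32-1 and UINTMAX_MAX is 2^64-1, so
+ * UINTMAX_MAX / UINT_MAX == 2^32+1 > UINT_MAX, the #error is not emitted, and
+ * any product of two unsigned ints fits in a uintmax_t */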
+
+/* @member assumed to be of type u64 and validated to be so */
+#define HFI_MMAP_ERRCHECK(fd, binfo, member, size, prot) ({						\
+		typeof((binfo)->member) *__tptr = (__u64 *)NULL;					\
+		(void)__tptr;										\
+		void *__maddr = HFI_MMAP_ALIGNOFF((fd), (binfo)->member, (size), (prot));		\
+		do {											\
+			if (unlikely(__maddr == MAP_FAILED)) {						\
+				uintmax_t outval = (uintmax_t)((binfo)->member);			\
+				_HFI_INFO("mmap of " #member " (0x%jx) size %zu failed: %s\n",		\
+					outval, size, strerror(errno));					\
+				goto err_mmap_##member;							\
+			}										\
+			(binfo)->member = (__u64)__maddr;						\
+			_HFI_VDBG(#member " mmap %jx successful\n", (uintmax_t)((binfo)->member));	\
+		} while(0);										\
+		__maddr;										\
+})
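+/* hypothetical usage sketch (the member name "sc_credits_addr" is purely
+ * illustrative, not taken from this code base): on success the member is
+ * overwritten with the mmap'ed address; on failure control jumps to the
+ * caller-provided err_mmap_<member> label, e.g.
+ *
+ *	void *addr = HFI_MMAP_ERRCHECK(fd, &binfo, sc_credits_addr,
+ *				       HFI_MMAP_PGSIZE, PROT_READ);
+ *	...
+ * err_mmap_sc_credits_addr:
+ *	return -1;
+ */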
+
+/* assigns 0 to the member after unmapping */
+#define HFI_MUNMAP_ERRCHECK(binfo, member, size)						\
+		do {	typeof((binfo)->member) *__tptr = (__u64 *)NULL;			\
+			(void)__tptr;								\
+			void *__addr = ALIGNDOWN_PTR((binfo)->member, HFI_MMAP_PGSIZE);		\
+			if (unlikely( __addr == NULL || (munmap(__addr, (size)) == -1))) {	\
+				_HFI_INFO("unmap of " #member " (%p) failed: %s\n",		\
+					__addr, strerror(errno));				\
+			}									\
+			else {									\
+				_HFI_VDBG("unmap of " #member "(%p) succeeded\n", __addr);	\
+				(binfo)->member = 0;						\
+			}									\
+		} while(0)
+
+#define HFI_PCB_SIZE_IN_BYTES 8
+
+/* Usable bytes in header (hdrsize - lrh - bth) */
+#define HFI_MESSAGE_HDR_SIZE_HFI       (HFI_MESSAGE_HDR_SIZE-20)
+
+/*
+ * SDMA includes 8B sdma hdr, 8B PBC, and message header.
+ * If we are using GPU workloads, we need to set a new
+ * "flags" member which takes another 2 bytes in the
+ * sdma hdr. We let the driver know of this 2 extra bytes
+ * at runtime when we set the length for the iovecs.
+ */
+#define HFI_SDMA_HDR_SIZE      (8+8+56)
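+/* i.e. 8B sdma hdr + 8B PBC + 56B message header = 72 bytes; the optional
+ * GPU "flags" member described above adds 2 more bytes at runtime */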
+
+static inline __u32 hfi_hdrget_seq(const __le32 *rbuf)
+{
+	return (__le32_to_cpu(rbuf[0]) >> HFI_RHF_SEQ_SHIFT)
+	    & HFI_RHF_SEQ_MASK;
+}
+
+static inline __u32 hfi_hdrget_hdrq_offset(const __le32 *rbuf)
+{
+	return (__le32_to_cpu(rbuf[1]) >> HFI_RHF_HDRQ_OFFSET_SHIFT)
+	    & HFI_RHF_HDRQ_OFFSET_MASK;
+}
+
+
+
+/* don't inline these; it's all init code, and not inlining makes the */
+/* overall code shorter and easier to debug */
+void hfi_touch_mmap(void *, size_t) __attribute__ ((noinline));
+
+
+/*
+* Safe version of hfi_[d/q]wordcpy that is guaranteed to only copy each byte once.
+*/
+#if defined(__x86_64__) && defined(HAVE_PSM3_DWORD_FAST)
+void hfi_dwordcpy_safe(volatile uint32_t *dest, const uint32_t *src,
+		       uint32_t ndwords);
+void hfi_qwordcpy_safe(volatile uint64_t *dest, const uint64_t *src,
+		       uint32_t nqwords);
+#else
+#define hfi_dwordcpy_safe hfi_dwordcpy
+#define hfi_qwordcpy_safe hfi_qwordcpy
+#endif
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#endif /* OPA_USER_GEN1_H */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_utils_gen1.c b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_utils_gen1.c
new file mode 100644
index 0000000000000000000000000000000000000000..7ed8e123e4fdbcb04c498abf1e36e7cc7a318757
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/opa_utils_gen1.c
@@ -0,0 +1,97 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* This file contains hfi service routine interface used by the low */
+/* level hfi protocol code. */
+
+#include <sys/poll.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <malloc.h>
+#include <time.h>
+
+#include "opa_user_gen1.h"
+
+/* touch the pages, with a 32 bit read */
+void hfi_touch_mmap(void *m, size_t bytes)
+{
+	volatile uint32_t *b = (volatile uint32_t *)m, c;
+	size_t i;		/* m is always page aligned, so pgcnt exact */
+	int __hfi_pg_sz;
+
+	/* First get the page size */
+	__hfi_pg_sz = sysconf(_SC_PAGESIZE);
+
+	_HFI_VDBG("Touch %lu mmap'ed pages starting at %p\n",
+		  (unsigned long)bytes / __hfi_pg_sz, m);
+	bytes /= sizeof(c);
+	for (i = 0; i < bytes; i += __hfi_pg_sz / sizeof(c))
+		c = b[i];
+}
+
+
+// never called for UD/UDP, we use __psm2_ep_poll_type instead
+
+
+
+
+
+
diff --git a/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/psm_gdrcpy.c b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/psm_gdrcpy.c
new file mode 100644
index 0000000000000000000000000000000000000000..398646d1777b70843fd78273df7bcaa548cdfec5
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/psm_gdrcpy.c
@@ -0,0 +1,119 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2018 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2018 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#ifdef PSM_CUDA
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "psm_gdrcpy.h"
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include "ptl_ips/ips_tid.h"
+#include "ptl_ips/ips_expected_proto.h"
+#include "opa_user_gen1.h"
+
+static int gdr_fd;
+
+int get_gdr_fd(void)
+{
+	return gdr_fd;
+}
+
+#define GPU_PAGE_OFFSET_MASK (PSMI_GPU_PAGESIZE - 1)
+#define GPU_PAGE_MASK (~GPU_PAGE_OFFSET_MASK)
+
+
+
+
+
+// flags=0 for send, 1 for recv
+void *
+gdr_convert_gpu_to_host_addr(int gdr_fd, unsigned long buf,
+							 size_t size, int flags,
+							 psm2_ep_t ep)
+{
+	void *host_addr_buf;
+
+	uintptr_t pageaddr = buf & GPU_PAGE_MASK;
+	uint64_t pagelen = (uint64_t) (PSMI_GPU_PAGESIZE +
+					   ((buf + size - 1) & GPU_PAGE_MASK) -
+					   pageaddr);
+
+	_HFI_VDBG("buf=%p size=%zu pageaddr=%p pagelen=%"PRIu64" flags=0x%x ep=%p\n",
+		(void *)buf, size, (void *)pageaddr, pagelen, flags, ep);
+#ifdef RNDV_MOD
+	ep = ep->mctxt_master;
+	host_addr_buf = __psm2_rv_pin_and_mmap(ep->verbs_ep.rv, pageaddr, pagelen, IBV_ACCESS_IS_GPU_ADDR);
+	if_pf (! host_addr_buf) {
+		if (errno == ENOMEM) {
+			if (psm2_verbs_evict_some(ep, pagelen, IBV_ACCESS_IS_GPU_ADDR) > 0)
+				host_addr_buf = __psm2_rv_pin_and_mmap(ep->verbs_ep.rv, pageaddr, pagelen, IBV_ACCESS_IS_GPU_ADDR);
+		}
+		if_pf (! host_addr_buf)
+			return NULL;
+	}
+//_HFI_ERROR("pinned buf=%p size=%zu pageaddr=%p pagelen=%u flags=0x%x ep=%p, @ %p\n", (void *)buf, size, (void *)pageaddr, pagelen, flags, ep, host_addr_buf);
+#else
+	psmi_assert_always(0);	// unimplemented, should not get here
+	host_addr_buf = NULL;
+#endif /* RNDV_MOD */
+	return host_addr_buf + (buf & GPU_PAGE_OFFSET_MASK);
+}
+
+void hfi_gdr_open(void)
+{
+}
+
+void hfi_gdr_close()
+{
+}
+
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1.c b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1.c
new file mode 100644
index 0000000000000000000000000000000000000000..ec5d48a44e8f16a8698b671f77cf9818b6b66d6c
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1.c
@@ -0,0 +1,122 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2017 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2017 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+
+#if PSMI_HAL_INST_CNT > 1
+#define PSMI_HAL_CAT_INL_SYM(KERNEL) hfp_gen1_ ## KERNEL
+#include "psm2_hal_inline_t.h"
+#include "psm_hal_inline_i.h"
+#endif
+
+/* define the singleton that implements hal for gen1 */
+static hfp_gen1_t psm_gen1_hi = {
+	/* start of public psmi_hal_instance_t data */
+	.phi = {
+		.type					  = PSM_HAL_INSTANCE_GEN1,
+		.description				  = "PSM3 HAL instance for GEN1"
+#ifdef PSM_CUDA
+								" (cuda)"
+#endif
+									,
+		.hfi_name				  = "hfi1",
+		.hfi_sys_class_path			  = "/sys/class/infiniband/hfi1",
+		.params					  = {0},
+
+		/* The following methods are alphabetized */
+#if PSMI_HAL_INST_CNT > 1
+
+		.hfp_close_context			  = hfp_gen1_close_context,
+		.hfp_context_open			  = hfp_gen1_context_open,
+		.hfp_context_initstats			  = hfp_gen1_context_initstats,
+
+
+		.hfp_finalize_				  = hfp_gen1_finalize_,
+
+
+		.hfp_get_jkey				  = hfp_gen1_get_jkey,
+
+
+		.hfp_get_node_id			  = hfp_gen1_get_node_id,
+
+
+
+		.hfp_get_port_lid			  = hfp_gen1_get_port_lid,
+
+
+		.hfp_get_port_rate			  = hfp_gen1_get_port_rate,
+
+
+		.hfp_spio_process_events		  = hfp_gen1_spio_process_events,
+		.hfp_spio_transfer_frame		  = hfp_gen1_spio_transfer_frame,
+
+
+#endif // PSMI_HAL_INST_CNT > 1
+		.hfp_get_port_subnet		  = hfp_gen1_get_port_subnet,
+		.hfp_get_default_pkey			  = hfp_gen1_get_default_pkey,
+		.hfp_get_num_contexts			  = hfp_gen1_get_num_contexts,
+		.hfp_get_num_free_contexts		  = hfp_gen1_get_num_free_contexts,
+		.hfp_get_num_units			  = hfp_gen1_get_num_units,
+		.hfp_get_num_ports			  = hfp_gen1_get_num_ports,
+		.hfp_get_port_active			  = hfp_gen1_get_port_active,
+		.hfp_get_unit_active			  = hfp_gen1_get_unit_active,
+		.hfp_initialize				  = hfp_gen1_initialize,
+	},
+};
+
+/* __psmi_hal_gen1_constructor */
+static void __attribute__ ((constructor)) __psmi_hal_gen1_constructor(void)
+{
+	psmi_hal_register_instance((psmi_hal_instance_t*)&psm_gen1_hi);
+}
diff --git a/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1.h b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1.h
new file mode 100644
index 0000000000000000000000000000000000000000..3509bcddf57d7292d6a3fd9fc11b37dfa2a9d3a8
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1.h
@@ -0,0 +1,74 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2017 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2017 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "psm_user.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "psm_hal_gen1_spio.h"
+#include "psm_mq_internal.h"
+#include "opa_user_gen1.h"
+
+
+/* Private struct on a per-context basis. */
+typedef struct _hfp_gen1_pc_private
+{
+} hfp_gen1_pc_private;
+
+
+/* declare hfp_gen1_t struct, (combines public psmi_hal_instance_t
+   together with a private struct) */
+typedef struct _hfp_gen1
+{
+	psmi_hal_instance_t phi;
+} hfp_gen1_t;
+
diff --git a/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1_spio.c b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1_spio.c
new file mode 100644
index 0000000000000000000000000000000000000000..d4a832fd1e6c0ffa108eebeed21854cd372c29e4
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1_spio.c
@@ -0,0 +1,306 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2017 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2017 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2017 Intel Corporation. All rights reserved. */
+
+/* included header files  */
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <sched.h>
+
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "psm_hal_gen1_spio.h"
+#include "ips_proto_params.h"
+
+/* Report PIO stalls every 20 seconds at the least */
+#define SPIO_STALL_WARNING_INTERVAL	  (nanosecs_to_cycles(20e9))
+#define SPIO_MAX_CONSECUTIVE_SEND_FAIL	  (1<<20)	/* 1M */
+/* MAX_CONSECUTIVE_SEND_FAIL has to be a multiple of RESYNC_CONSECUTIVE_SEND_FAIL */
+#define SPIO_RESYNC_CONSECUTIVE_SEND_FAIL (1<<4)	/* 16 */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+/*
+ * Check and process events
+ * return value:
+ *  PSM2_OK: normal events processing;
+ *  PSM2_OK_NO_PROGRESS: no event is processed;
+ */
+static PSMI_HAL_INLINE psm2_error_t
+ips_spio_process_events(const struct ptl *ptl_gen)
+{
+	// TODO - TBD - check link status events for UD/UDP
+	return PSM2_OK;
+}
+
+
+// TBD we could also get scb->cksum out of scb
+// when called:
+//		scb->ips_lrh has the fixed-size PSM header including the OPA LRH
+//		payload, length is the data after the header
+//		we don't do checksums; we let verbs handle that for us
+// we need to manage our own registered send buffers because
+// in the control paths (connect, disconnect) the scb may be on the stack
+// and we must be done with it when this returns.
+// in the normal path the scb could be longer lived if we wanted it to be.
+// OPA SDMA had a synchronous routine on the control path (ips_dma_transfer_frame)
+// which started the DMA and waited for it to complete;
+// in the normal path, scb_send_dma was used.  This sends all the scb's on a
+// pending queue.  It only reaps DMA in that path if it is out of DMA resources;
+// a few receive paths for ack and nak also reap send DMAs.
+// In general scb's just describe an IO; they do not have persistent buffers,
+// so send bounce buffers avoid MR handling overheads.
+// So for simplicity here we take a lazy send CQ reaping strategy:
+// we reap if we need more buffers and do a quick reap after we post a new
+// send.  This should keep CQ reaping out of the latency path for microbenchmarks.
+// It does not seem that DMA does any reaping in other progress calls;
+// however, the reaping in acks may help it.
+// It is important to note that a UD send completion just means the packet exited
+// the local HFI; it does not imply end-to-end delivery.  PIO has
+// similar semantics, and we know the UDP sendto simply puts a packet on
+// a UDP queue for future transmission, much like a UD QP post_send works.
+psm2_error_t
+ips_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow,
+			struct ips_scb *scb, uint32_t *payload,
+			uint32_t length, uint32_t isCtrlMsg,
+			uint32_t cksum_valid, uint32_t cksum
+#ifdef PSM_CUDA
+			, uint32_t is_cuda_payload
+#endif
+			)
+{
+	psm2_error_t ret = PSM2_OK;
+	psm2_error_t err;
+	psm2_ep_t ep = proto->ep;
+	struct ibv_send_wr wr;
+	struct ibv_send_wr *bad_wr;
+	struct ibv_sge list[2];
+	sbuf_t sbuf;
+	struct ips_message_header *ips_lrh = &scb->ips_lrh;
+	int send_dma = ips_scb_flags(scb) & IPS_SEND_FLAG_SEND_MR;
+
+	// these defines are bit ugly, but make code below simpler with less ifdefs
+	// once we decide if USE_RC is valuable we can cleanup
+	// for RC we continue to use UD QP for control messages
+	// (connect/disconnect/ack/nak/becn), this avoids issues especially during
+	// QP teardown in disconnect.  We also use UD for ACK/NAK, this allows
+	// flow credits to be managed over UD
+#define USE_ALLOCATOR (isCtrlMsg?&ep->verbs_ep.send_allocator:flow->ipsaddr->use_allocator)
+#define USE_QP (isCtrlMsg?ep->verbs_ep.qp:flow->ipsaddr->use_qp)
+#define USE_MAX_INLINE (isCtrlMsg?ep->verbs_ep.qp_cap.max_inline_data:flow->ipsaddr->use_max_inline_data)
+
+#ifdef PSM_FI
+	if_pf(PSMI_FAULTINJ_ENABLED_EP(ep)) {
+		PSMI_FAULTINJ_STATIC_DECL(fi_sendlost, "sendlost",
+				"drop "
+				"RC eager or any "
+				"UD packet before sending",
+				1, IPS_FAULTINJ_SENDLOST);
+		if_pf(PSMI_FAULTINJ_IS_FAULT(fi_sendlost, ""))
+			return PSM2_OK;
+	}
+#endif // PSM_FI
+	PSMI_LOCK_ASSERT(proto->mq->progress_lock);
+	psmi_assert_always(! cksum_valid);	// no software checksum yet
+	// allocate a send buffer
+	// if we have no buffers, we can return PSM2_EP_NO_RESOURCES and caller
+	// will try again later
+	sbuf = __psm2_ep_verbs_alloc_sbuf(USE_ALLOCATOR);
+	if_pf (! sbuf) {
+		// reap some SQ completions
+		ret = psm2_verbs_completion_update(proto->ep);
+		if_pf (ret != PSM2_OK)
+			return ret;
+		sbuf = __psm2_ep_verbs_alloc_sbuf(USE_ALLOCATOR);
+	}
+	if_pf (! sbuf) {
+		_HFI_VDBG("out of send buffers\n");
+		return PSM2_EP_NO_RESOURCES;
+	}
+	_HFI_VDBG("got sbuf %p index %lu\n", sbuf_to_buffer(sbuf), send_buffer_index(sbuf_pool(ep, sbuf), sbuf_to_buffer(sbuf)));
+	// TBD - we should be able to skip sending some headers such as OPA lrh and
+	// perhaps bth (does PSM use bth to hold PSNs?)
+	// copy scb->ips_lrh to send buffer
+	_HFI_VDBG("copy lrh %p\n", ips_lrh);
+	memcpy(sbuf_to_buffer(sbuf), ips_lrh, sizeof(*ips_lrh));
+	if (!send_dma) {
+		// copy payload to send buffer, length could be zero, be safe
+		_HFI_VDBG("copy payload %p %u\n",  payload, length);
+#ifdef PSM_CUDA
+		if (is_cuda_payload) {
+			//_HFI_ERROR("cuMemcpyDtoH %p %u\n", payload, length);
+			PSMI_CUDA_CALL(cuMemcpyDtoH, sbuf_to_buffer(sbuf)+sizeof(*ips_lrh),
+				(CUdeviceptr)payload, length);
+		} else
+#endif
+		{
+			memcpy(sbuf_to_buffer(sbuf)+sizeof(*ips_lrh), payload, length);
+		}
+	}
+	_HFI_VDBG("%s send - opcode %x dma %d MR %p\n", qp_type_str(USE_QP),
+            _get_proto_hfi_opcode((struct  ips_message_header*)sbuf_to_buffer(sbuf)), !!send_dma, scb->mr);
+	// we don't support software checksum
+	psmi_assert_always(! (proto->flags & IPS_PROTO_FLAG_CKSUM));
+	psmi_assert_always(USE_QP);	// make sure we aren't called too soon
+	list[0].addr = (uintptr_t)sbuf_to_buffer(sbuf);
+	list[0].lkey = sbuf_lkey(ep, sbuf);
+	if (send_dma) {
+		list[0].length = sizeof(*ips_lrh);	// note no UD_ADDITION
+		list[1].addr = scb->mr->iova
+			+ ((uintptr_t)ips_scb_buffer(scb) - (uintptr_t)scb->mr->addr);
+		psmi_assert(ips_scb_buffer(scb) == payload);
+#ifdef RNDV_MOD
+		psmi_assert(psm2_verbs_user_space_mr(scb->mr));
+#endif
+		list[1].length = length;
+		list[1].lkey = scb->mr->lkey;
+	} else {
+		list[0].length = sizeof(*ips_lrh) + length;	// note no UD_ADDITION
+		list[1].length = 0;
+	}
+#ifdef PSM_FI
+	if_pf(PSMI_FAULTINJ_ENABLED_EP(ep)) {
+		PSMI_FAULTINJ_STATIC_DECL(fi_sq_lkey, "sq_lkey",
+				"send "
+				"RC eager or any "
+				"UD packet with bad lkey",
+				0, IPS_FAULTINJ_SQ_LKEY);
+		if_pf(PSMI_FAULTINJ_IS_FAULT(fi_sq_lkey, " QP %u", USE_QP->qp_num ))
+			list[0].lkey = 0x55;
+	}
+#endif // PSM_FI
+	wr.next = NULL;	// just post 1
+	psmi_assert(!((uintptr_t)sbuf & VERBS_SQ_WR_ID_MASK));
+	wr.wr_id = (uintptr_t)sbuf | VERBS_SQ_WR_ID_SEND;	// we'll get this back in completion
+		// we don't use the scb as wr_id since for PIO they may be freed
+		// immediately after a successful call to transfer
+	wr.sg_list = list;
+	if (send_dma)
+		wr.num_sge = 2;	// size of sg_list
+	else
+		wr.num_sge = 1;	// size of sg_list
+	wr.opcode = IBV_WR_SEND;
+	// we want to only get occasional send completions
+	// and use them to release a whole set of buffers for reuse
+	// For USE_RC this is imperfect, we track when to ask for a CQE
+	// per RC QP.  However when traffic is using varied RC QPs, we may be
+	// left with some RC QPs with up to VERBS_SEND_CQ_COALLESCE-1 unsignalled
+	// WQEs and no traffic for a while, hence consuming a few send buffers per
+	// QP.  By tracking it per RC QP we at least avoid the case of a rotating
+	// traffic pattern never asking for a CQE for a given QP
+	if_pf ( ! --(USE_ALLOCATOR->send_num_til_coallesce)) {
+		wr.send_flags = IBV_SEND_SIGNALED;	// get a completion
+		USE_ALLOCATOR->send_num_til_coallesce = VERBS_SEND_CQ_COALLESCE;
+	} else {
+		wr.send_flags = 0;
+	}
+	if_pf (ips_lrh->khdr.kdeth0 & __cpu_to_le32(IPS_SEND_FLAG_INTR)) {
+		_HFI_VDBG("send solicited event\n");
+		wr.send_flags |= IBV_SEND_SOLICITED;
+	}
+
+		// for small messages, we may use IBV_SEND_INLINE for performance
+	if (! send_dma && list[0].length <= USE_MAX_INLINE)
+		wr.send_flags |= IBV_SEND_INLINE;
+	//wr.imm_data = 0;	// only if we use IBV_WR_SEND_WITH_IMM;
+	// ud fields are ignored for RC send (overlay fields for RDMA)
+	// so reduce branches by just always filling in these few fields
+	//if (USE_QP->qp_type == IBV_QPT_UD)
+	psmi_assert_always(flow->path->ah);
+	wr.wr.ud.ah = flow->path->ah;
+	wr.wr.ud.remote_qpn = flow->ipsaddr->remote_qpn;
+	wr.wr.ud.remote_qkey = ep->verbs_ep.qkey;
+
+	if (_HFI_PDBG_ON) {
+		_HFI_PDBG("ud_transfer_frame: len %u, remote qpn %u payload %u\n",
+			list[0].length+list[1].length,
+				(USE_QP->qp_type != IBV_QPT_UD)? flow->ipsaddr->remote_qpn :
+				 wr.wr.ud.remote_qpn,
+			length);
+		_HFI_PDBG_DUMP((uint8_t*)list[0].addr, list[1].length);
+		_HFI_PDBG("post send: QP %p (%u)\n", USE_QP, USE_QP->qp_num);
+	}
+	if_pf (ibv_post_send(USE_QP, &wr, &bad_wr)) {
+		if (errno != EBUSY && errno != EAGAIN && errno != ENOMEM)
+			_HFI_ERROR("failed to post SQ on %s: %s\n", ep->dev_name, strerror(errno));
+		proto->stats.post_send_fail++;
+		ret = PSM2_EP_NO_RESOURCES;
+	}
+	_HFI_VDBG("done ud_transfer_frame: len %u, remote qpn %u\n",
+		list[0].length +list[1].length,
+		(USE_QP->qp_type != IBV_QPT_UD)? flow->ipsaddr->remote_qpn :
+ 		wr.wr.ud.remote_qpn);
+	// reap any completions
+	err = psm2_verbs_completion_update(proto->ep);
+	if_pf (err != PSM2_OK)
+		return err;
+	return ret;
+#undef USE_ALLOCATOR
+#undef USE_QP
+#undef USE_MAX_INLINE
+}
+
diff --git a/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1_spio.h b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1_spio.h
new file mode 100644
index 0000000000000000000000000000000000000000..5fb3ac219fd65a8bf3145bd91537a372a135653c
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1_spio.h
@@ -0,0 +1,177 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2017 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2017 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2017 Intel Corporation. All rights reserved. */
+
+#ifndef IPS_SPIO_H
+#define IPS_SPIO_H
+
+#include "psm_user.h"
+
+#define IPS_CTXT_RESET_MAX	1000	/* max send context reset */
+struct ips_spio;
+struct ptl;
+struct ips_proto;
+struct ips_flow;
+
+/* 64B move instruction support */
+#define AVX512F_BIT		16	/* level 07h, ebx */
+/* 32B move instruction support */
+#define AVX2_BIT		 5	/* level 07h, ebx */
+/* 16B move instruction support */
+#define SSE2_BIT		26	/* level 01h, edx */
+
+typedef
+void (*ips_spio_blockcpy_fn_t)(volatile uint64_t *dest,
+				const uint64_t *src, uint32_t nblock);
+#ifdef PSM_AVX512
+void hfi_pio_blockcpy_512(volatile uint64_t *dest,
+				const uint64_t *src, uint32_t nblock);
+#endif
+void hfi_pio_blockcpy_256(volatile uint64_t *dest,
+				const uint64_t *src, uint32_t nblock);
+void hfi_pio_blockcpy_128(volatile uint64_t *dest,
+				const uint64_t *src, uint32_t nblock);
+void hfi_pio_blockcpy_64(volatile uint64_t *dest,
+				const uint64_t *src, uint32_t nblock);
+
+
+
+static inline psm2_error_t ips_spio_transfer_frame(struct ips_proto *proto,
+				struct ips_flow *flow, struct ips_scb *scb,
+				uint32_t *payload, uint32_t length,
+				uint32_t isCtrlMsg, uint32_t cksum_valid,
+				uint32_t cksum
+#ifdef PSM_CUDA
+				, uint32_t is_cuda_payload
+#endif
+);
+
+static psm2_error_t ips_spio_process_events(const struct ptl *ptl);
+
+#define SPIO_CREDITS_Counter(value)       (((value) >> 0) & 0x7FF)
+#define SPIO_CREDITS_Status(value)        (((value) >> 11) & 0x1)
+#define SPIO_CREDITS_DueToPbc(value)      (((value) >> 12) & 0x1)
+#define SPIO_CREDITS_DueToTheshold(value) (((value) >> 13) & 0x1)
+#define SPIO_CREDITS_DueToErr(value)      (((value) >> 14) & 0x1)
+#define SPIO_CREDITS_DueToForce(value)    (((value) >> 15) & 0x1)
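+/* worked example (value chosen purely for illustration): for value == 0x0805,
+ * SPIO_CREDITS_Counter(0x0805) == 0x005, SPIO_CREDITS_Status(0x0805) == 1,
+ * and all of the DueTo* bits are 0 */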
+struct ips_spio_credits {
+/* don't use bit operation for performance reason,
+ * using above macro instead.
+	uint16_t	Counter:11;
+	uint16_t	Status:1;
+	uint16_t	CreditReturnDueToPbc:1;
+	uint16_t	CreditReturnDueToThreshold:1;
+	uint16_t	CreditReturnDueToErr:1;
+	uint16_t	CreditReturnDueToForce:1;
+*/
+	union {
+		struct {
+			uint16_t value;
+			uint16_t pad0;
+			uint32_t pad1;
+		};
+		uint64_t credit_return;
+	};
+};
+
+struct ips_spio_ctrl {
+	/* credit return lock for context sharing */
+	pthread_spinlock_t spio_ctrl_lock;
+
+	/* PIO write in progress for context sharing */
+	volatile uint16_t spio_write_in_progress;
+	/* send context reset count */
+	volatile uint16_t spio_reset_count;
+	/* HFI frozen count, shared copy */
+	volatile uint16_t spio_frozen_count;
+
+	volatile uint16_t spio_available_blocks;
+	volatile uint16_t spio_block_index;
+	volatile uint16_t spio_fill_counter;
+	volatile struct ips_spio_credits spio_credits;
+} __attribute__ ((aligned(64)));
+
+struct ips_spio {
+	const psmi_context_t *context;
+	struct ptl *ptl;
+	uint16_t unit_id;
+	uint16_t portnum;
+
+	pthread_spinlock_t spio_lock;	/* thread lock */
+	volatile __le64 *spio_credits_addr __attribute__ ((aligned(64)));
+	volatile uint64_t *spio_bufbase_sop;
+	volatile uint64_t *spio_bufbase;
+	volatile struct ips_spio_ctrl *spio_ctrl;
+
+	uint16_t spio_frozen_count;	/* local copy */
+	uint16_t spio_total_blocks;
+	uint16_t spio_block_index;
+
+	uint32_t spio_consecutive_failures;
+	uint64_t spio_num_stall;
+	uint64_t spio_num_stall_total;
+	uint64_t spio_next_stall_warning;
+	uint64_t spio_last_stall_cyc;
+	uint64_t spio_init_cyc;
+
+	psm2_error_t (*spio_reset_hfi)(struct ips_spio *ctrl);
+	psm2_error_t (*spio_credit_return_update)(struct ips_spio *ctrl);
+
+	/* copying routines based on block size */
+	ips_spio_blockcpy_fn_t spio_blockcpy_med;
+	ips_spio_blockcpy_fn_t spio_blockcpy_large;
+				_HFI_VDBG("unmap of " #member " (%p) succeeded\n", __addr);	\
+};
+
+#endif /* IPS_SPIO_H */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/psm_hal_inline_i.h b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/psm_hal_inline_i.h
new file mode 100644
index 0000000000000000000000000000000000000000..0119dafc5254ab729c7ea021b07712c767f1853f
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_hal_gen1/psm_hal_inline_i.h
@@ -0,0 +1,443 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2017 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2017 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "psm_hal_gen1.h"
+
+extern size_t arrsz[MAPSIZE_MAX];
+
+static inline struct _hfp_gen1 *get_psm_gen1_hi(void)
+{
+	return (struct _hfp_gen1*) psmi_hal_current_hal_instance;
+}
+
+/* hfp_gen1_initialize */
+static PSMI_HAL_INLINE int hfp_gen1_initialize(psmi_hal_instance_t *phi)
+{
+	return 0;
+}
+
+/* hfp_gen1_finalize_ */
+static PSMI_HAL_INLINE int hfp_gen1_finalize_(void)
+{
+	return 0;
+}
+
+/* hfp_gen1_get_num_units */
+static PSMI_HAL_INLINE int hfp_gen1_get_num_units(void)
+{
+	return hfi_get_num_units();
+}
+
+/* hfp_gen1_get_num_ports */
+static PSMI_HAL_INLINE int hfp_gen1_get_num_ports(void)
+{
+	return HFI_NUM_PORTS_GEN1;
+}
+
+/* hfp_gen1_get_unit_active */
+static PSMI_HAL_INLINE int hfp_gen1_get_unit_active(int unit)
+{
+	return hfi_get_unit_active(unit);
+}
+
+/* hfp_gen1_get_port_active */
+static PSMI_HAL_INLINE int hfp_gen1_get_port_active(int unit, int port)
+{
+	return hfi_get_port_active(unit, port);
+}
+
+// Most of these defines are in opa_service_gen1.c, but there are no
+// include files common to that file and this one
+#define HFI_UD_NUM_CTXTS   1024
+
+/* hfp_gen1_get_num_contexts */
+static PSMI_HAL_INLINE int hfp_gen1_get_num_contexts(int unit)
+{
+	return HFI_UD_NUM_CTXTS;
+}
+
+// Most of these defines are in opa_service_gen1.c, but there are no
+// include files common to that file and this one
+#define HFI_UD_NUM_FREE_CTXTS   1024
+
+/* hfp_gen1_get_num_free_contexts */
+static PSMI_HAL_INLINE int hfp_gen1_get_num_free_contexts(int unit)
+{
+	return HFI_UD_NUM_FREE_CTXTS;
+}
+
+
+
+/* hfp_gen1_close_context */
+static PSMI_HAL_INLINE int hfp_gen1_close_context(psmi_hal_hw_context *ctxtp)
+{
+	hfp_gen1_pc_private *psm_hw_ctxt;
+
+	if (!ctxtp || !*ctxtp)
+		return PSM_HAL_ERROR_OK;
+
+	psm_hw_ctxt = (hfp_gen1_pc_private *)(*ctxtp);
+	psmi_free(psm_hw_ctxt);
+
+	return PSM_HAL_ERROR_OK;
+}
+
+/* Moved from psm_context.c */
+
+
+
+
+
+
+static inline char * _dump_cpu_affinity(char *buf, size_t buf_size, cpu_set_t * cpuset) {
+	int i;
+	int isfirst = 1;
+	char tmp[25]; /* %d is at most 10 digits: 10 + '-' + 10 + ',' + '\0' = 23 */
+	int first = -1, last = -1;
+
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		if (CPU_ISSET(i, cpuset)) {
+			if (first == -1) {
+				first = last = i;
+			} else if ((last+1) == i) {
+				last = i;
+			}
+		} else if (first != -1) {
+			if (first == last) {
+				snprintf(tmp, sizeof(tmp), "%d,", first);
+			} else {
+				snprintf(tmp, sizeof(tmp), "%d-%d,", first, last);
+			}
+			first = last = -1;
+
+			if (isfirst) {
+				strncpy(buf, tmp, buf_size-1);
+				isfirst=0;
+			} else {
+				strncat(buf, tmp, buf_size-1);
+			}
+			buf[buf_size-1] = '\0';
+		}
+	}
+
+	if (first != -1) {
+		if (first == last) {
+			snprintf(tmp, sizeof(tmp), "%d,", first);
+		} else {
+			snprintf(tmp, sizeof(tmp), "%d-%d,", first, last);
+		}
+		if (isfirst) {
+			strncpy(buf, tmp, buf_size-1);
+		} else {
+			strncat(buf, tmp, buf_size-1);
+		}
+		buf[buf_size-1] = '\0';
+	}
+	char *comma = strrchr(buf, ',');
+	if (comma) comma[0] = '\0';
+
+	return buf;
+} //	pthread_getaffinity_np
+
+/* hfp_gen1_context_open */
+static PSMI_HAL_INLINE int hfp_gen1_context_open(int unit,
+				 int port,
+				 uint64_t open_timeout,
+				 psm2_ep_t ep,
+				 psm2_uuid_t const job_key,
+				 psmi_context_t *psm_ctxt,
+				 uint32_t cap_mask,
+				 unsigned retryCnt)
+{
+	psm2_error_t err = PSM2_OK;
+	hfp_gen1_pc_private *pc_private = psmi_malloc(ep, UNDEFINED, sizeof(hfp_gen1_pc_private));
+
+	if_pf (!pc_private) {
+		//err = -PSM_HAL_ERROR_CANNOT_OPEN_CONTEXT;
+		goto bail;
+	}
+
+	memset(pc_private, 0, sizeof(hfp_gen1_pc_private));
+
+
+	// open verbs 1st so psmi_context_open can get pkey, lid, etc
+	if ((err = __psm2_ep_open_verbs(ep, unit, port, job_key)) != PSM2_OK) {
+		_HFI_ERROR( "Unable to initialize verbs\n");
+		err = -PSM_HAL_ERROR_CANNOT_OPEN_CONTEXT;
+		goto bail;
+	}
+
+	pthread_t mythread = pthread_self();
+	cpu_set_t cpuset;
+	CPU_ZERO(&cpuset);
+
+	int s = pthread_getaffinity_np(mythread, sizeof(cpu_set_t), &cpuset);
+	if (s != 0) {
+		psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+			"Can't get CPU affinity: %s\n", strerror(s));
+		goto bail;
+	}
+
+	if (_HFI_DBG_ON) {
+		char cpu_buf[128] = {0};
+		_HFI_DBG( "CPU affinity Before set: %s\n", _dump_cpu_affinity(cpu_buf, 128, &cpuset));
+	}
+
+	if (getenv("PSM3_FORCE_CPUAFFINITY") ||
+		!(getenv("PSM3_NO_CPUAFFINITY") || ep->skip_affinity))
+	{
+		cpu_set_t mycpuset, andcpuset;
+
+		if (hfi_get_unit_cpumask(unit, &mycpuset)) {
+			_HFI_ERROR( "Failed to get %s (unit %d) cpu set\n", ep->dev_name, unit);
+			//err = -PSM_HAL_ERROR_GENERAL_ERROR;
+			goto bail;
+		}
+
+		int cpu_count = CPU_COUNT(&cpuset);
+		int my_count = CPU_COUNT(&mycpuset);
+		if (cpu_count > my_count) {
+			andcpuset = cpuset;
+		} else {
+			CPU_AND(&andcpuset, &cpuset, &mycpuset);
+		}
+		int cpu_and_count = CPU_COUNT(&andcpuset);
+
+		if (cpu_and_count > 0 && pthread_setaffinity_np(mythread, sizeof(andcpuset), &andcpuset)) {
+			_HFI_ERROR( "Failed to set %s (unit %d) cpu set: %s\n", ep->dev_name,  unit, strerror(errno));
+			//err = -PSM_HAL_ERROR_GENERAL_ERROR;
+			goto bail;
+		} else if (cpu_and_count == 0 && _HFI_DBG_ON) {
+			char buf1[128] = {0};
+			char buf2[128] = {0};
+			_HFI_DBG( "CPU affinity not set, NIC selected is not on the same socket as thread (\"%s\" & \"%s\" == 0).\n",
+				_dump_cpu_affinity(buf1, 128, &mycpuset), _dump_cpu_affinity(buf2, 128, &cpuset));
+		}
+	}
+	if (_HFI_DBG_ON) {
+		CPU_ZERO(&cpuset);
+		int s = pthread_getaffinity_np(mythread, sizeof(cpu_set_t), &cpuset);
+		if (s != 0) {
+			psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+				"Can't get CPU affinity: %s\n", strerror(s));
+			goto bail;
+		}
+		char cpu_buf[128] = {0};
+		_HFI_DBG( "CPU affinity After set: %s\n", _dump_cpu_affinity(cpu_buf, 128, &cpuset));
+	}
+
+// TBD - inside hfi_userinit_internal we would find the CPU
+// which the HFI is closest to and set affinity.  We need a way to do that for UD.
+// We would also wash the jkey through the driver and stash it in _hfi_ctrl,
+// but because we disable this we won't have an _hfi_ctrl structure.
+
+	psm_ctxt->psm_hw_ctxt = pc_private;
+	return PSM_HAL_ERROR_OK;
+
+bail:
+	if (pc_private) {
+		psmi_free(pc_private);
+	}
+
+	return -PSM_HAL_ERROR_GENERAL_ERROR;
+}
+
+/* hfp_gen1_context_initstats */
+static PSMI_HAL_INLINE void hfp_gen1_context_initstats(psm2_ep_t ep)
+{
+	__psm2_ep_initstats_verbs(ep);
+}
+
+
+
+
+
+static PSMI_HAL_INLINE int hfp_gen1_get_port_rate(int unit, int port)
+{
+	return hfi_get_port_rate(unit, port);
+}
+
+
+
+
+
+static PSMI_HAL_INLINE int hfp_gen1_get_port_lid(int unit, int port)
+{
+	return hfi_get_port_lid(unit, port);
+}
+
+static PSMI_HAL_INLINE int hfp_gen1_get_port_subnet(int unit, int port,
+	uint64_t *subnet, uint64_t *addr, uint32_t *ip_addr, uint32_t *netmask,
+	int *idx, uint64_t *hi, uint64_t *lo)
+{
+	return hfi_get_port_subnet(unit, port, subnet, addr, ip_addr, netmask,
+								idx, hi, lo);
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+static PSMI_HAL_INLINE int hfp_gen1_get_default_pkey(void)
+{
+	return HFI_DEFAULT_P_KEY;
+}
+
+#include "psm_hal_gen1_spio.c"
+
+
+
+static PSMI_HAL_INLINE int hfp_gen1_spio_transfer_frame(struct ips_proto *proto,
+					struct ips_flow *flow, struct ips_scb *scb,
+					uint32_t *payload, uint32_t length,
+					uint32_t isCtrlMsg, uint32_t cksum_valid,
+					uint32_t cksum, psmi_hal_hw_context ctxt
+#ifdef PSM_CUDA
+				, uint32_t is_cuda_payload
+#endif
+	)
+{
+	return ips_spio_transfer_frame(proto, flow, scb,
+					 payload, length, isCtrlMsg,
+					 cksum_valid, cksum
+#ifdef PSM_CUDA
+				, is_cuda_payload
+#endif
+	);
+}
+
+static PSMI_HAL_INLINE int hfp_gen1_spio_process_events(const struct ptl *ptl)
+{
+	return ips_spio_process_events(ptl);
+}
+
+static PSMI_HAL_INLINE int hfp_gen1_get_node_id(int unit, int *nodep)
+{
+	int64_t node_id = hfi_sysfs_unit_read_node_s64(unit);
+	*nodep = (int)node_id;
+	if (node_id != -1)
+		return PSM_HAL_ERROR_OK;
+	else
+		return -PSM_HAL_ERROR_GENERAL_ERROR;
+}
+
+
+
+
+static PSMI_HAL_INLINE int      hfp_gen1_get_jkey(psmi_hal_hw_context ctxt)
+{
+	return 0;	// TBD - washed through driver - see HED-542
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/deps/libfabric/prov/psm3/psm3/psm_help.h b/deps/libfabric/prov/psm3/psm3/psm_help.h
new file mode 100644
index 0000000000000000000000000000000000000000..7fc880b6467eb42c228cb563cf172a7604dbe83b
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_help.h
@@ -0,0 +1,195 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_HELP_H
+#define _PSMI_HELP_H
+#include "psm_log.h"
+
+/* XXX gcc only */
+#define PSMI_INLINE(FN) \
+	static inline FN
+
+#ifndef PACK_SUFFIX
+/* XXX gcc only */
+#define PACK_SUFFIX __attribute__((packed))
+#endif
+
+#define PSMI_ALWAYS_INLINE(FN) \
+	static __inline__ FN __attribute__((always_inline));  \
+	static __inline__ FN
+
+#define PSMI_NEVER_INLINE(FN)             \
+	static FN __attribute__((noinline));  \
+	static FN
+
+#define _PPragma(x) _Pragma(x)
+
+#define STRINGIFY(s)	_STRINGIFY(s)
+#define _STRINGIFY(s)	#s
+#define PSMI_CURLOC	__FILE__ ":" STRINGIFY(__LINE__)
+#define psmi_assert_always_loc(x, curloc)				\
+	do {								\
+	if_pf(!(x)) {							\
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,	\
+				"Assertion failure at %s: %s", curloc,	\
+				STRINGIFY(x));				\
+	} } while (0)
+
+#define psmi_assert_always(x)  psmi_assert_always_loc(x, PSMI_CURLOC)
+
+#ifdef PSM_DEBUG
+#  define psmi_assert(x)	psmi_assert_always(x)
+#  define PSMI_ASSERT_INITIALIZED() psmi_assert_always(psmi_isinitialized())
+#else
+#  define psmi_assert(x)
+#  define PSMI_ASSERT_INITIALIZED()
+#endif
+
+#define _PSMI_API_NAME(FN)  __ ## FN
+#define _PSMI_API_STR(FN)   _STRINGIFY(__ ## FN)
+#define PSMI_API_DECL(FN)							\
+	typeof(_PSMI_API_NAME(FN)) FN __attribute__((weak, alias(_PSMI_API_STR(FN))));
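+/* A minimal illustration of the weak-alias pattern above (not from the PSM3
+ * sources; psm2_init's signature is assumed from the public PSM2 API). Given
+ * a real entry point defined under the internal name:
+ *
+ *     psm2_error_t __psm2_init(int *major, int *minor) { ... }
+ *     PSMI_API_DECL(psm2_init)
+ *
+ * the macro expands to a weak alias, so the public symbol psm2_init resolves
+ * to __psm2_init unless something else overrides it:
+ *
+ *     typeof(__psm2_init) psm2_init
+ *         __attribute__((weak, alias("__psm2_init")));
+ */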
+
+#define PSMI_ERR_UNLESS_INITIALIZED(ep)					\
+	do {								\
+		if (!psmi_isinitialized()) {				\
+			PSM2_LOG_MSG("leaving");				\
+			return psmi_handle_error(ep, PSM2_INIT_NOT_INIT,	\
+				"PSM3 has not been initialized");	\
+	  }								\
+	} while (0)
+
+#define PSMI_CHECKMEM(err, mem)			\
+	do {					\
+		if ((mem) == NULL) {		\
+			(err) = PSM2_NO_MEMORY;	\
+			goto fail;		\
+		}				\
+	} while (0)
+
+#define PSMI_CACHEALIGN	__attribute__((aligned(64)))
+
+/* Easy way to ignore the OK_NO_PROGRESS case */
+PSMI_ALWAYS_INLINE(psm2_error_t psmi_err_only(psm2_error_t err))
+{
+	if (err > PSM2_OK_NO_PROGRESS)
+		return err;
+	else
+		return PSM2_OK;
+}
+
+#ifdef min
+#undef min
+#endif
+#define min(a, b) ((a) < (b) ? (a) : (b))
+
+#ifdef max
+#undef max
+#endif
+#define max(a, b) ((a) > (b) ? (a) : (b))
+
+#define SEC_ULL	 1000000000ULL
+#define MSEC_ULL 1000000ULL
+#define USEC_ULL 1000ULL
+#define NSEC_ULL 1ULL
+
+#define PSMI_TRUE   1
+#define PSMI_FALSE  0
+
+#define PSMI_CYCLES_TO_SECSF(cycles)			\
+		((double) cycles_to_nanosecs(cycles) / 1.0e9)
+
+#define PSMI_PAGESIZE       psmi_getpagesize()
+#define PSMI_POWEROFTWO(P)  (((P)&((P)-1)) == 0)
+#define PSMI_ALIGNDOWN(p, P) (((uintptr_t)(p))&~((uintptr_t)((P)-1)))
+#define PSMI_ALIGNUP(p, P)   (PSMI_ALIGNDOWN((uintptr_t)(p)+((uintptr_t)((P)-1)), (P)))
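+/* Worked example for the alignment macros above (P must be a power of two):
+ *   PSMI_POWEROFTWO(64)      -> 1 (true)
+ *   PSMI_ALIGNDOWN(100, 64)  -> 64
+ *   PSMI_ALIGNUP(100, 64)    -> 128
+ *   PSMI_ALIGNUP(128, 64)    -> 128 (already-aligned values are unchanged)
+ */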
+
+#define PSMI_MAKE_DRIVER_VERSION(major, minor) ((major)<<16 | ((minor) & 0xffff))
+
+#ifdef PSM_DEBUG
+
+/* The intent of the following two macros is to emit an internal error if the
+   size of a 'member' is not as expected, violating an assumption in the code.
+   There are some problems with this implementation:
+
+   The first macro creates a static const variable with ABSOLUTELY NO references
+   to it.  For example, there are ABSOLUTELY NO uses of the second macro in the
+   PSM code. This is not completely clean: GCC version 5, for example, emits a
+   warning for defining a static const that is never referenced.
+
+   A better implementation of this intent would use static_assert(), so that
+   violations are caught and corrected at compile time - not at run time.  */
+
+#define PSMI_STRICT_SIZE_DECL(member, sz) static const size_t __psm2_ss_ ## member = sz
+#define PSMI_STRICT_SIZE_VERIFY(member, sz)				\
+	do {								\
+		if (__psm2_ss_ ## member != (sz)) {			\
+			char errmsg[64];				\
+			snprintf(errmsg, sizeof(errmsg), "Internal error: %s "	\
+					"size doesn't match expected %d bytes",	\
+					STRINGIFY(member), (int) __psm2_ss_ ## member);	\
+			fprintf(stderr, "%s\n", errmsg);		\
+			exit(-1);					\
+		}							\
+	} while (0)
+
+#else
+
+#define PSMI_STRICT_SIZE_DECL(member, sz)   /* nothing */
+#define PSMI_STRICT_SIZE_VERIFY(member, sz) /* nothing */
+
+#endif /*  PSM_DEBUG */
+
+#endif /* _PSMI_HELP_H */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_lock.h b/deps/libfabric/prov/psm3/psm3/psm_lock.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5aad6ff899624dfb17d03a9c2a3bae63ac1f906
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_lock.h
@@ -0,0 +1,239 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_IN_USER_H
+#error psm_lock.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_LOCK_H
+#define _PSMI_LOCK_H
+
+#ifndef PSMI_USE_PTHREAD_SPINLOCKS
+#define PSMI_USE_PTHREAD_SPINLOCKS 0
+#endif
+
+#if PSMI_USE_PTHREAD_SPINLOCKS
+typedef pthread_spinlock_t psmi_spinlock_t;
+
+#define psmi_spin_init(lock)	  pthread_spin_init(lock, \
+					PTHREAD_PROCESS_PRIVATE)
+#define psmi_spin_destroy(lock)	pthread_spin_destroy(lock)
+#define psmi_spin_lock(lock)	  pthread_spin_lock(lock)
+#define psmi_spin_trylock(lock) pthread_spin_trylock(lock)
+#define psmi_spin_unlock(lock)  pthread_spin_unlock(lock)
+#else
+typedef ips_atomic_t psmi_spinlock_t;
+#define PSMI_SPIN_INVALID   2
+#define PSMI_SPIN_LOCKED    1
+#define PSMI_SPIN_UNLOCKED  0
+#endif
+
+/* psmi_lock_t structure */
+typedef struct {
+
+#ifdef PSMI_LOCK_IS_SPINLOCK
+	psmi_spinlock_t lock;
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG)
+	pthread_mutex_t lock;
+	pthread_t lock_owner;
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK)
+	pthread_mutex_t lock;
+#endif
+} psmi_lock_t;
+
+
+#if PSMI_USE_PTHREAD_SPINLOCKS
+#else
+PSMI_ALWAYS_INLINE(int psmi_spin_init(psmi_spinlock_t *lock))
+{
+	ips_atomic_set(lock, PSMI_SPIN_UNLOCKED);
+	return 0;
+}
+
+PSMI_ALWAYS_INLINE(int psmi_spin_trylock(psmi_spinlock_t *lock))
+{
+	if (ips_atomic_cmpxchg(lock, PSMI_SPIN_UNLOCKED, PSMI_SPIN_LOCKED)
+			== PSMI_SPIN_UNLOCKED) {
+		return 0;
+	}
+
+	return EBUSY;
+}
+
+PSMI_ALWAYS_INLINE(int psmi_spin_destroy(psmi_spinlock_t *lock))
+{
+	if (lock == NULL) {
+		return EINVAL;
+	}
+
+	/* We could just do psmi_spin_trylock() here and dispense with the invalid state */
+	if (ips_atomic_cmpxchg(lock, PSMI_SPIN_UNLOCKED, PSMI_SPIN_INVALID)
+			== PSMI_SPIN_UNLOCKED) {
+		return 0;
+	}
+
+	return EBUSY;
+}
+
+PSMI_ALWAYS_INLINE(int psmi_spin_lock(psmi_spinlock_t *lock))
+{
+	while (psmi_spin_trylock(lock) == EBUSY) {
+	}
+	return 0;
+}
+
+PSMI_ALWAYS_INLINE(int psmi_spin_unlock(psmi_spinlock_t *lock))
+{
+	ips_atomic_set(lock, PSMI_SPIN_UNLOCKED);
+	return 0;
+}
+#endif /* PSMI_USE_PTHREAD_SPINLOCKS */
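+/* Usage sketch (illustrative only; assumes the atomic backend above):
+ *
+ *     psmi_spinlock_t sl;
+ *     psmi_spin_init(&sl);          // sl := PSMI_SPIN_UNLOCKED
+ *     psmi_spin_lock(&sl);          // spins on cmpxchg until acquired
+ *     ... critical section ...
+ *     psmi_spin_unlock(&sl);        // sl := PSMI_SPIN_UNLOCKED
+ *     psmi_spin_destroy(&sl);       // returns EBUSY if still locked
+ */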
+
+PSMI_ALWAYS_INLINE(void psmi_init_lock(psmi_lock_t *lock))
+{
+#ifdef PSMI_LOCK_IS_SPINLOCK
+	psmi_spin_init(&(lock->lock));
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK)
+	pthread_mutex_init(&(lock->lock), NULL);
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG)
+	pthread_mutexattr_t attr;
+	pthread_mutexattr_init(&attr);
+	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK_NP);
+	pthread_mutex_init(&(lock->lock), &attr);
+	pthread_mutexattr_destroy(&attr);
+	lock->lock_owner = PSMI_LOCK_NO_OWNER;
+#endif
+}
+
+PSMI_ALWAYS_INLINE(void psmi_destroy_lock(psmi_lock_t *lock))
+{
+#ifdef PSMI_LOCK_IS_SPINLOCK
+	int err;
+	/* This will map to either pthread_spin_destroy() or our custom psmi_spin_destroy().
+	 * Both their return values can be interpreted by strerror().
+	 */
+	if ((err = psmi_spin_destroy(&(lock->lock))) != 0) {
+		_HFI_VDBG("Destroying spinlock failed: %s\n", strerror(err));
+	}
+	/* The same path for both the regular mutex and the debugging mutex */
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK) || defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG)
+	int err;
+	if ((err = pthread_mutex_destroy(&(lock->lock))) != 0) {
+		/* strerror_r() might be a better choice here, but it is tricky
+		 * to reliably detect the XSI vs GNU version, and a hardcoded
+		 * choice may be inadvertently broken by later changes to
+		 * headers/makefiles.
+		 *
+		 * That would result in incorrect operation: a segfault from
+		 * dereferencing the return value, or failure to retrieve the
+		 * error string.
+		 *
+		 * C11's strerror_s may be an option here too.
+		 */
+		_HFI_VDBG("Destroying mutex failed: %s\n", strerror(err));
+	}
+#endif
+}
+
+PSMI_ALWAYS_INLINE(int psmi_sem_post(sem_t *sem, const char *name))
+{
+	if (sem_post(sem) == -1) {
+		_HFI_VDBG("Semaphore %s: post failed\n", name ? name : "NULL" );
+		return -1;
+	}
+
+	_HFI_VDBG("Semaphore %s: post succeeded\n", name ? name : "NULL");
+
+	return 0;
+}
+
+PSMI_ALWAYS_INLINE(int psmi_sem_timedwait(sem_t *sem, const char *name))
+{
+	/* Wait 5 seconds for shm read-write lock to open */
+	struct timespec ts;
+	clock_gettime(CLOCK_REALTIME, &ts);
+	ts.tv_sec += 5;
+
+	if (sem_timedwait(sem, &ts) == -1) {
+		_HFI_VDBG("Semaphore %s: Timedwait failed: %s (%d)\n",
+				name ? name : "NULL", strerror(errno), errno );
+		return -1;
+	}
+
+	_HFI_VDBG("Semaphore %s: Timedwait succeeded\n", name ? name : "NULL");
+
+	return 0;
+}
+
+PSMI_ALWAYS_INLINE(int psmi_init_semaphore(sem_t **sem, const char *name,
+					   mode_t mode, int value))
+{
+	*sem = sem_open(name, O_CREAT | O_EXCL, mode, value);
+	if ((*sem == SEM_FAILED) && (errno == EEXIST)) {
+		*sem = sem_open(name, O_CREAT, mode, value);
+		if (*sem == SEM_FAILED) {
+			_HFI_VDBG("Cannot open semaphore %s, errno=%d\n",
+				  name, errno);
+			return -1;
+		}
+	} else if (*sem == SEM_FAILED) {
+		_HFI_VDBG("Cannot create semaphore %s, errno=%d\n", name, errno);
+		return -1;
+	}
+
+	return 0;
+}
+
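+/* Usage sketch (illustrative only; the semaphore name is hypothetical):
+ * create-or-attach to a named semaphore, then use the post/timed-wait
+ * helpers above.
+ *
+ *     sem_t *sem;
+ *     if (psmi_init_semaphore(&sem, "/psm3_example_sem", 0600, 0) == 0) {
+ *         psmi_sem_post(sem, "/psm3_example_sem");
+ *         psmi_sem_timedwait(sem, "/psm3_example_sem");  // waits up to 5s
+ *         sem_close(sem);
+ *         sem_unlink("/psm3_example_sem");
+ *     }
+ */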
+#endif /* _PSMI_LOCK_H */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_log.h b/deps/libfabric/prov/psm3/psm3/psm_log.h
new file mode 100644
index 0000000000000000000000000000000000000000..2e4ab814ec7a796fba216299bed58b389f0b5b16
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_log.h
@@ -0,0 +1,282 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef _PSMI_LOG_H
+#define _PSMI_LOG_H
+
+/*
+
+  A note about PSM_LOG and PSM_LOG_FAST_IO:
+
+  By default, the PSM_LOG facility is safe and slow. Log messages
+  are written to a file under /tmp as they're generated. So, if the test case
+  has an abnormal termination such as a segmentation fault or an abort(),
+  the log messages will still be available.
+
+  However, when debugging timing-sensitive problems, the default PSM_LOG
+  facility is inadequate: the timing overhead it introduces dominates, and
+  the symptoms of the problem being investigated may change.
+
+  When performance is important, you can use BOTH PSM_LOG and PSM_LOG_FAST_IO.
+  With PSM_LOG_FAST_IO, log messages are written to a memory buffer, and when
+  the program terminates, the log messages are written to a file under /tmp.
+
+  * How to use basic functionality of PSM LOG:
+
+    - To use default PSM_LOG, build PSM2 with macro
+          PSM_LOG=1
+
+    - To use PSM_LOG when performance is critical, build PSM2 with macros
+          PSM_LOG=1 PSM_LOG_FAST_IO=1
+
+    - Insert log message in code with a <LOG TAG>. Log message follow the same
+      format as printf(). For example:
+          PSM2_LOG_MSG("<LOG TAG> %u", 1);
+
+    - To filter out <LOG TAG> log messages, set environment variable
+      PSM3_LOG_SRCH_FORMAT_STRING to <LOG TAG> and the wildcard character (*).
+      For example,
+          PSM3_LOG_SRCH_FORMAT_STRING=<LOG TAG>*
+
+    - A more detailed explanation to use PSM LOG can be found below.
+
+  * How to get log messages with abnormal termination while using
+    PSM LOG with PSM_LOG_FAST_IO:
+
+    - Log messages are saved from a memory buffer to a file under /tmp when
+      psmi_log_fini() is called. psmi_log_fini() is exposed to the outside
+      world via the linker script file, so client test code can call
+      psmi_log_fini() on a fatal error.
+
+  --------------------------------------------------------------------------------
+
+  This file (psm_log.h) defines macros for logging messages to assist
+  investigations into the psm library.
+
+  By default, these macros are not defined when building psm.  When not defined,
+  the macros become no-ops in the PSM code.
+
+  When enabled (by defining the PSM_LOG symbol), the macros present information
+  to the psmi_log_message() facility for processing.  See below for more
+  information on the psmi_log_message() facility.
+
+  The macros are described in the following:
+
+  PSM2_LOG_MSG(FORMAT,...)        Spills a printf-style message to the log.
+  PSM2_LOG_DECLARE_BT_BUFFER()    Declares a local back trace buffer for use
+                                  with the PSM2_LOG_BT() macro.
+  PSM2_LOG_BT(NFRAMES,FORMAT,...) Spills the current backtrace, if it differs
+                                  from the previous backtrace spilled to the
+                                  log.
+
+  The psmi_log_message() facility is the backend for these messages when
+  PSM_LOG is enabled.  The psmi_log_message() facility spills messages to
+  unique log files based on the process id and the thread id.  So every unique
+  process id, and thread id will spill to unique log files.  The
+  psmi_log_message prefixes each message in the log files with a high
+  resolution timer message so that messages from multiple threads and log files
+  can be reconciled to one timeline.  It is left as an exercise to the reader
+  to reconcile log messages from different hosts to one timeline.
+
+  The backtrace capability in the PSM_LOG functionality needs some explanation:
+  often a bug happens only when the code is tickled from a specific call-chain.
+  The PSM2_LOG_BT() macro supports identifying the unique call-chain when a
+  problem occurs.  The model is as follows:
+
+  A unique declaration is made for a backtrace to spill the backtrace
+  information to.  This declaration should be made in the same basic block as
+  the use of the PSM2_LOG_BT() macro.  To make the declaration, use
+  PSM2_LOG_DECLARE_BT_BUFFER().
+
+  When the PSM_LOG is enabled, at the statement for the macro:
+  PSM2_LOG_BT(NFRAMES,FORMAT,...), the psmi_log_message() facility generates
+  the current backtrace, and compares the first NFRAMES of the current backtrace
+  against the previous backtrace stored in the backtrace buffer declared with
+  the declaration.  If the two backtraces differ, the psmi_log_message() code
+  saves the current backtrace into the declared buffer, and then spills the
+  backtrace to the log file.
+
+  At runtime, setting environment variables can squelch the log file from
+  getting too big:
+
+  PSM3_LOG_INC_FUNCTION_NAMES is a list of function name lists (abbreviated
+  FNL, see below) that will INClude the FNL's in the collection of functions
+  to spill log data for.
+
+  PSM3_LOG_EXC_FUNCTION_NAMES is a list of FNL's (see below) that will EXClude
+  the FNL's from the collection of functions to spill log data for.
+
+  An FNL is a 'Function Name List' that is defined by the following grammar:
+
+  # A LINE1 is either a single line number or a range of line numbers:
+  LINE1 :: lineNumber |
+           lineNumber1 '-' lineNumber2
+
+  # LINES is a list of LINE1's separated by commas:
+  LINES :: LINE1 |
+           LINE1 ',' LINES
+
+  # An FN is either a function name, or a function name with a list of lines:
+  FN :: functionName |
+        functionName ';' LINES
+
+  # A FNL is a list of FN's separated by colons:
+  FNL ::  FN |
+          FN ':' FNL
+
+  # Examples:
+  foo:bar    the two functions foo and bar
+  foo;1-10   lines 1 to 10 of function foo.
+  bar;1,3,5  lines 1, 3 and 5 of function bar
+
+  PSM3_LOG_SRCH_FORMAT_STRING If set, overrides the PSM3_LOG_INC_FUNCTION_NAMES
+  and PSM3_LOG_EXC_FUNCTION_NAMES settings.  Causes the psmi_log_message()
+  facility to only emit the log messages that match (using fnmatch()) the
+  message in FORMAT.
+
+ */
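+/* A minimal usage sketch for the backtrace facility described above
+ * (illustrative only; requires building with PSM_LOG=1, and the function
+ * name is hypothetical):
+ *
+ *     void hypothetical_handler(void)
+ *     {
+ *         PSM2_LOG_DECLARE_BT_BUFFER();
+ *         // Spills the current backtrace only when its first 8 frames
+ *         // differ from the previously spilled one:
+ *         PSM2_LOG_BT(8, "handler invoked, count=%d", 1);
+ *     }
+ */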
+
+typedef enum
+{
+	PSM2_LOG_TX   = 0,
+	PSM2_LOG_RX   = 1,
+	PSM2_LOG_PEND = 2,
+} psmi_log_tx_rx_t;
+
+#ifdef PSM_LOG
+
+extern void psmi_log_initialize(void);
+
+/* defined in psm_utils.c */
+extern void psmi_log_message(const char *fileName,
+			     const char *functionName,
+			     int lineNumber,
+			     const char *format, ...);
+
+#ifdef PSM_LOG_FAST_IO
+extern void psmi_log_fini(void);
+#else
+#define psmi_log_fini() /* nothing */
+#endif
+
+#define PSM2_LOG_MSG(FORMAT , ...) psmi_log_message(__FILE__,__FUNCTION__,__LINE__,FORMAT, ## __VA_ARGS__)
+
+#define PSM2_LOG_BT_BUFFER_SIZE 100
+
+#define PSM2_LOG_DECLARE_BT_BUFFER() static void * psm_log_bt_buffer[PSM2_LOG_BT_BUFFER_SIZE]
+
+#define PSM2_LOG_DECLARE_BT_BUFFER_SZ(SIZE) static void * psm_log_bt_buffer[SIZE]
+
+#define PSM2_LOG_BT_MAGIC ((const char *)-1)
+
+#define PSM2_LOG_BT(NFRAMES,FORMAT , ...) psmi_log_message(__FILE__,__FUNCTION__,__LINE__,PSM2_LOG_BT_MAGIC,psm_log_bt_buffer,NFRAMES,FORMAT, ## __VA_ARGS__)
+
+#define PSM2_LOG_EPM_MAGIC ((const char *)-2)
+
+/* EPM is short for Emit Protocol Message to the log file.
+OPCODE is an int, and corresponds to one of the OPCODES declared in ptl_ips/ips_proto_header.h
+TXRX is an int, and should be one of the above two consts (PSM2_LOG_TX, or PSM2_LOG_RX).
+FROMEPID and TOEPID are uint64_t's and the fromepid should be the epid (end point id) of the sender   of the message
+                                   and the toepid   should be the epid (end point id) of the receiver of the message
+    */
+#define PSM2_LOG_EPM(OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...)				\
+	psmi_log_message(__FILE__,__FUNCTION__,__LINE__,				\
+			PSM2_LOG_EPM_MAGIC,OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,		\
+			## __VA_ARGS__)
+
+/* Just adds a condition to the PSM2_LOG_EPM() macro. */
+#define PSM2_LOG_EPM_COND(COND,OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...)			\
+	if (COND)									\
+		PSM2_LOG_EPM(OPCODE,TXRX,FROMEPID,TOEPID,FORMAT, ## __VA_ARGS__)
+
+#define PSM2_LOG_DUMP_MAGIC ((const char *)-3)
+
+#define PSM2_LOG_MSG_DUMP(ADDR,SIZE,FORMAT , ...)					\
+	psmi_log_message(__FILE__,__FUNCTION__,__LINE__,PSM2_LOG_DUMP_MAGIC,ADDR,SIZE,	\
+			 FORMAT, ## __VA_ARGS__)
+
+#define PSM2_LOG_PKT_STRM_MAGIC ((const char *)-4)
+
+#define PSM2_LOG_MIN_MAGIC PSM2_LOG_BT_MAGIC
+
+#define PSM2_LOG_MAX_MAGIC PSM2_LOG_PKT_STRM_MAGIC
+
+#define PSM2_LOG_PKT_STRM(TXRX,IPS_MSG_HDRP,FORMAT, ...)				\
+	psmi_log_message(__FILE__,__FUNCTION__,__LINE__,PSM2_LOG_PKT_STRM_MAGIC,TXRX,	\
+			 IPS_MSG_HDRP,FORMAT, ## __VA_ARGS__)
+
+#else
+
+#define psmi_log_initialize()                               /* nothing */
+
+#define PSM2_LOG_MSG(FORMAT , ...)                          /* nothing */
+
+#define psmi_log_fini()                                     /* nothing */
+
+#define PSM2_LOG_DECLARE_BT_BUFFER()                         /* nothing */
+
+#define PSM2_LOG_DECLARE_BT_BUFFER_SZ(SIZE)                  /* nothing */
+
+#define PSM2_LOG_BT(NFRAMES,FORMAT , ...)                    /* nothing */
+
+#define PSM2_LOG_EPM(OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) /* nothing */
+
+#define PSM2_LOG_EPM_COND(COND,OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) /* nothing */
+
+#define PSM2_LOG_MSG_DUMP(ADDR,SIZE,FORMAT , ...)                      /* nothing */
+
+#define PSM2_LOG_PKT_STRM(TXRX,IPS_MSG_HDRP,FORMAT, ...)               /* nothing */
+
+#endif /* #ifdef PSM_LOG */
+
+#endif /* #ifndef _PSMI_LOG_H */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_memcpy.c b/deps/libfabric/prov/psm3/psm3/psm_memcpy.c
new file mode 100644
index 0000000000000000000000000000000000000000..b7c7a89523ef078bdb7a21ac95e0022898a63e8c
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_memcpy.c
@@ -0,0 +1,68 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
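+/* Thin wrapper that routes through the MQ copy routine (psmi_mq_mtucpy),
+ * so any special copy handling there also applies here, while keeping
+ * memcpy()-style semantics of returning the destination pointer. */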
+void *psmi_memcpyo(void *dst, const void *src, size_t n)
+{
+	psmi_mq_mtucpy(dst, src, n);
+	return dst;
+}
diff --git a/deps/libfabric/prov/psm3/psm3/psm_mock.c b/deps/libfabric/prov/psm3/psm3/psm_mock.c
new file mode 100644
index 0000000000000000000000000000000000000000..bdcfd41909463e4c4dfcafbbb9645dc276715065
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_mock.c
@@ -0,0 +1,90 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2017 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm2_mock_testing.h"
+
+#ifdef PSM2_MOCK_TESTING
+void MOCKABLE(psmi_mockable_lock_init)(psmi_lock_t *pl)
+{
+	_PSMI_LOCK_INIT(*pl);
+}
+MOCK_DEF_EPILOGUE(psmi_mockable_lock_init);
+int MOCKABLE(psmi_mockable_lock_try)(psmi_lock_t *pl)
+{
+	int ret = _PSMI_LOCK_TRY(*pl);
+	return ret;
+}
+MOCK_DEF_EPILOGUE(psmi_mockable_lock_try);
+void MOCKABLE(psmi_mockable_lock)(psmi_lock_t *pl)
+{
+	_PSMI_LOCK(*pl);
+}
+MOCK_DEF_EPILOGUE(psmi_mockable_lock);
+void MOCKABLE(psmi_mockable_unlock)(psmi_lock_t *pl)
+{
+	_PSMI_UNLOCK(*pl);
+}
+MOCK_DEF_EPILOGUE(psmi_mockable_unlock);
+void MOCKABLE(psmi_mockable_lock_assert)(psmi_lock_t *pl)
+{
+	_PSMI_LOCK_ASSERT(*pl);
+}
+MOCK_DEF_EPILOGUE(psmi_mockable_lock_assert);
+void MOCKABLE(psmi_mockable_unlock_assert)(psmi_lock_t *pl)
+{
+	_PSMI_UNLOCK_ASSERT(*pl);
+}
+MOCK_DEF_EPILOGUE(psmi_mockable_unlock_assert);
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/psm_mpool.c b/deps/libfabric/prov/psm3/psm3/psm_mpool.c
new file mode 100644
index 0000000000000000000000000000000000000000..7da50befca09d61a79effd284ba6603837f2bb11
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_mpool.c
@@ -0,0 +1,576 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+
+#define PSMI_MPOOL_ALIGNMENT	64
+
+struct mpool_element {
+	union {
+		SLIST_ENTRY(mpool_element) me_next;
+		mpool_t me_mpool;
+	};
+
+	uint32_t me_gen_count;
+	uint32_t me_index;
+#ifdef PSM_DEBUG
+	uint32_t me_isused;
+#endif
+} __attribute__ ((aligned(16)));
+
+#ifdef PSM_DEBUG
+#  define me_mark_used(me)    ((me)->me_isused = 1)
+#  define me_mark_unused(me)  ((me)->me_isused = 0)
+#else
+#  define me_mark_used(me)
+#  define me_mark_unused(me)
+#endif
+
+struct mpool {
+	int mp_type;
+	int mp_flags;
+	int mp_vector_shift;
+
+	uint32_t mp_elm_vector_size;
+	uint32_t mp_elm_offset;
+	uint32_t mp_num_obj;
+	uint32_t mp_num_obj_inuse;
+	uint32_t mp_elm_size;
+	uint32_t mp_obj_size;
+	uint32_t mp_num_obj_per_chunk;
+	uint32_t mp_num_obj_max_total;
+	psmi_memtype_t mp_memtype;
+
+	 SLIST_HEAD(, mpool_element) mp_head;
+	struct mpool_element **mp_elm_vector;
+	struct mpool_element **mp_elm_vector_free;
+	non_empty_callback_fn_t mp_non_empty_cb;
+	void *mp_non_empty_cb_context;
+
+#ifdef PSM_CUDA
+	alloc_dealloc_callback_fn_t mp_alloc_dealloc_cb;
+	void *mp_alloc_dealloc_cb_context;
+#endif
+};
+
+static int psmi_mpool_allocate_chunk(mpool_t);
+
+/**
+ * psmi_mpool_create()
+ *
+ * Creates a memory pool and allocates <num_obj_per_chunk> objects of size
+ * <obj_size>.  If more memory is needed to accommodate psmi_mpool_get()
+ * requests, the memory pool allocates another chunk of
+ * <num_obj_per_chunk> objects, until it reaches the maximum number of objects
+ * it can allocate.
+ *
+ * <obj_size>		size of each individual object
+ * <num_obj_per_chunk>	number of objects to allocate per chunk (power of two)
+ * <num_obj_max_total>	total number of objects that may be allocated
+ *			at any given time. Must be a power of two no smaller
+ *			than <num_obj_per_chunk>.
+ *
+ * <flags>		flags to be applied to the memory pool (e.g. memory
+ *			alignment)
+ *
+ * <cb>			callback to be called when the memory pool has some
+ *			free objects available again (after running out of them)
+ * <context>		context pointer for the callback
+ *
+ * Returns the mpool on success, NULL on failure.
+ */
+mpool_t
+psmi_mpool_create_inner(size_t obj_size, uint32_t num_obj_per_chunk,
+			uint32_t num_obj_max_total, int flags,
+			psmi_memtype_t statstype,
+			non_empty_callback_fn_t cb, void *context)
+{
+	mpool_t mp;
+	int s;
+	size_t hdr_size;
+
+	if (!PSMI_POWEROFTWO(num_obj_per_chunk) ||
+	    !PSMI_POWEROFTWO(num_obj_max_total) ||
+	    num_obj_max_total < num_obj_per_chunk) {
+		return NULL;
+	}
+
+	mp = psmi_calloc(PSMI_EP_NONE, statstype, 1, sizeof(struct mpool));
+	if (mp == NULL) {
+		fprintf(stderr,
+			"Failed to allocate memory for memory pool: %s\n",
+			strerror(errno));
+		return NULL;
+	}
+
+	for (s = 1; s < num_obj_per_chunk; s <<= 1)
+		mp->mp_vector_shift++;
+
+	mp->mp_flags = flags;
+	mp->mp_num_obj_per_chunk = num_obj_per_chunk;
+	mp->mp_num_obj_max_total = num_obj_max_total;
+	mp->mp_non_empty_cb = cb;
+	mp->mp_non_empty_cb_context = context;
+
+	mp->mp_memtype = statstype;
+
+	SLIST_INIT(&mp->mp_head);
+	mp->mp_elm_vector_size = num_obj_max_total / num_obj_per_chunk;
+	mp->mp_elm_vector =
+	    psmi_calloc(PSMI_EP_NONE, statstype, mp->mp_elm_vector_size,
+			sizeof(struct mpool_element *));
+	if (mp->mp_elm_vector == NULL) {
+		fprintf(stderr,
+			"Failed to allocate memory for memory pool vector: "
+			"%s\n", strerror(errno));
+		psmi_free(mp);
+		return NULL;
+	}
+
+	mp->mp_elm_vector_free = mp->mp_elm_vector;
+
+	if (flags & PSMI_MPOOL_ALIGN) {
+		// TBD - this is broken, mp_elm_offset is not
+		// used all the places where it needs to be
+		// fortunately this flag is not used yet
+		psmi_assert_always(0);
+		/* User wants its block to start on a PSMI_MPOOL_ALIGNMENT
+		 * boundary. */
+		hdr_size = PSMI_ALIGNUP(sizeof(struct mpool_element),
+					PSMI_MPOOL_ALIGNMENT);
+		mp->mp_obj_size = PSMI_ALIGNUP(obj_size, PSMI_MPOOL_ALIGNMENT);
+		mp->mp_elm_size = hdr_size + mp->mp_obj_size;
+		mp->mp_elm_offset = hdr_size - sizeof(struct mpool_element);
+	} else {
+		hdr_size = sizeof(struct mpool_element);
+		mp->mp_obj_size = PSMI_ALIGNUP(obj_size, 8);
+		mp->mp_elm_size = hdr_size + mp->mp_obj_size;
+		mp->mp_elm_offset = 0;
+	}
+
+	return mp;
+}
+
+mpool_t
+MOCKABLE(psmi_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk,
+		  uint32_t num_obj_max_total, int flags,
+		  psmi_memtype_t statstype, non_empty_callback_fn_t cb,
+		  void *context)
+{
+	mpool_t mp;
+
+	mp = psmi_mpool_create_inner(obj_size, num_obj_per_chunk,
+					num_obj_max_total, flags, statstype,
+					cb, context);
+
+	if (mp == NULL)
+		return NULL;
+
+	if (psmi_mpool_allocate_chunk(mp) != PSM2_OK) {
+		psmi_mpool_destroy(mp);
+		return NULL;
+	}
+
+	return mp;
+}
+MOCK_DEF_EPILOGUE(psmi_mpool_create);
+
+#ifdef PSM_CUDA
+mpool_t
+psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk,
+			   uint32_t num_obj_max_total, int flags,
+			   psmi_memtype_t statstype,
+			   non_empty_callback_fn_t cb, void *context,
+			   alloc_dealloc_callback_fn_t ad_cb, void *ad_context)
+{
+	mpool_t mp;
+
+	mp = psmi_mpool_create_inner(obj_size, num_obj_per_chunk,
+					num_obj_max_total, flags, statstype,
+					cb, context);
+
+	if (mp == NULL)
+		return NULL;
+
+	mp->mp_alloc_dealloc_cb = ad_cb;
+	mp->mp_alloc_dealloc_cb_context = ad_context;
+
+	if (psmi_mpool_allocate_chunk(mp) != PSM2_OK) {
+		psmi_mpool_destroy(mp);
+		return NULL;
+	}
+
+	return mp;
+}
+#endif
+
+/**
+ * psmi_mpool_get()
+ *
+ * <mp>	    memory pool
+ *
+ * Requests an object from the memory pool.
+ *
+ * Returns NULL if the maximum number of objects has been allocated (refer to
+ * <num_obj_max_total> in psmi_mpool_create) or if running out of memory.
+ */
+void *psmi_mpool_get(mpool_t mp)
+{
+	struct mpool_element *me;
+	void *obj;
+
+	if (SLIST_EMPTY(&mp->mp_head)) {
+		if (psmi_mpool_allocate_chunk(mp) != PSM2_OK)
+			return NULL;
+	}
+
+	me = SLIST_FIRST(&mp->mp_head);
+	SLIST_REMOVE_HEAD(&mp->mp_head, me_next);
+
+	psmi_assert(!me->me_isused);
+	me_mark_used(me);
+
+	/* store a backpointer to the memory pool */
+	me->me_mpool = mp;
+	mp->mp_num_obj_inuse++;
+	psmi_assert(mp->mp_num_obj_inuse <= mp->mp_num_obj);
+
+	obj = (void *)((uintptr_t) me + sizeof(struct mpool_element));
+
+	return obj;
+}
+
+/**
+ * psmi_mpool_put()
+ *
+ * <obj>    object to return to the memory pool
+ *
+ * Returns an <obj> to the memory pool subsystem.  This object will be re-used
+ * to fulfill new psmi_mpool_get() requests.
+ */
+void psmi_mpool_put(void *obj)
+{
+	struct mpool_element *me;
+	int was_empty;
+	mpool_t mp;
+
+	me = (struct mpool_element *)
+	    ((uintptr_t) obj - sizeof(struct mpool_element));
+	me->me_gen_count++;
+
+	mp = me->me_mpool;
+
+	psmi_assert(mp != NULL);
+	psmi_assert(mp->mp_num_obj_inuse > 0);
+	psmi_assert(me->me_isused);
+	me_mark_unused(me);
+
+	was_empty = mp->mp_num_obj_inuse == mp->mp_num_obj_max_total;
+	SLIST_INSERT_HEAD(&mp->mp_head, me, me_next);
+
+	mp->mp_num_obj_inuse--;
+
+	/* tell the user that memory is available */
+	if (mp->mp_non_empty_cb && was_empty)
+		mp->mp_non_empty_cb(mp->mp_non_empty_cb_context);
+}
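+/* Usage sketch for the get/put cycle (illustrative only; 'my_obj_t' is
+ * hypothetical and 'statstype' stands in for a real psmi_memtype_t stats
+ * bucket):
+ *
+ *     mpool_t pool = psmi_mpool_create(sizeof(my_obj_t),
+ *                                      8,         // objects per chunk (pow2)
+ *                                      64,        // max total objects (pow2)
+ *                                      0,         // flags
+ *                                      statstype, // memory-stats bucket
+ *                                      NULL, NULL); // no non-empty callback
+ *     my_obj_t *o = (my_obj_t *)psmi_mpool_get(pool); // NULL if exhausted
+ *     ...
+ *     psmi_mpool_put(o);      // bumps the generation count, recycles slot
+ *     psmi_mpool_destroy(pool);
+ */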
+
+/**
+ * psmi_mpool_get_obj_index()
+ *
+ * <obj>    object in the memory pool
+ *
+ * Returns the index of the <obj> in the memory pool.
+ */
+
+int psmi_mpool_get_obj_index(void *obj)
+{
+	struct mpool_element *me = (struct mpool_element *)
+	    ((uintptr_t) obj - sizeof(struct mpool_element));
+
+	return me->me_index;
+}
+
+/**
+ * psmi_mpool_get_obj_gen_count()
+ *
+ * <obj>    object in the memory pool
+ *
+ * Returns the generation count of the <obj>.
+ */
+uint32_t psmi_mpool_get_obj_gen_count(void *obj)
+{
+	struct mpool_element *me = (struct mpool_element *)
+	    ((uintptr_t) obj - sizeof(struct mpool_element));
+
+	return me->me_gen_count;
+}
+
+/**
+ * psmi_mpool_get_obj_index_gen_count()
+ *
+ * <obj>    object in the memory pool
+ *
+ * Returns the index of the <obj> in <index>.
+ * Returns the generation count of the <obj> in <gen_count>.
+ */
+int
+psmi_mpool_get_obj_index_gen_count(void *obj, uint32_t *index,
+				   uint32_t *gen_count)
+{
+	struct mpool_element *me = (struct mpool_element *)
+	    ((uintptr_t) obj - sizeof(struct mpool_element));
+
+	*index = me->me_index;
+	*gen_count = me->me_gen_count;
+	return 0;
+}
+
+/**
+ * psmi_mpool_find_obj_by_index()
+ *
+ * <mp>	    memory pool
+ * <index>  index of the object
+ *
+ * Returns the object located at <index> in the memory pool or NULL if the
+ * <index> is invalid.
+ */
+void *psmi_mpool_find_obj_by_index(mpool_t mp, int index)
+{
+	struct mpool_element *me;
+
+	if_pf(index < 0 || index >= mp->mp_num_obj)
+	    return NULL;
+
+	me = (struct mpool_element *)
+	    ((uintptr_t) mp->mp_elm_vector[index >> mp->mp_vector_shift] +
+	     (index & (mp->mp_num_obj_per_chunk - 1)) * mp->mp_elm_size +
+	     mp->mp_elm_offset);
+
+	/* If this mpool doesn't require generation counts, it's illegal to find a
+	 * freed object */
+#ifdef PSM_DEBUG
+	if (mp->mp_flags & PSMI_MPOOL_NOGENERATION)
+		psmi_assert(!me->me_isused);
+#endif
+
+	return (void *)((uintptr_t) me + sizeof(struct mpool_element));
+}
+
+#ifdef PSM_CUDA
+/**
+ * psmi_mpool_chunk_dealloc()
+ * <mp>	    memory pool
+ * <i>	    index
+ * Calls the dealloc function on each element in the chunk.
+ */
+void psmi_mpool_chunk_dealloc(mpool_t mp, int idx)
+{
+	int j;
+	for (j = 0; j < mp->mp_num_obj_per_chunk; j++)
+		mp->mp_alloc_dealloc_cb(0 /* is not alloc */,
+					mp->mp_alloc_dealloc_cb_context,
+					((void *) mp->mp_elm_vector[idx]) +
+					j * mp->mp_elm_size +
+					sizeof(struct mpool_element));
+}
+#endif
+/**
+ * psmi_mpool_destroy()
+ *
+ * <mp>	    memory pool
+ *
+ * Destroy a previously allocated memory pool and reclaim its associated
+ * memory.  The behavior is undefined if some objects have not been returned
+ * to the memory pool with psmi_mpool_put().
+ */
+void psmi_mpool_destroy(mpool_t mp)
+{
+	int i = 0;
+	size_t nbytes = mp->mp_num_obj * mp->mp_elm_size;
+
+	for (i = 0; i < mp->mp_elm_vector_size; i++) {
+		if (mp->mp_elm_vector[i]) {
+#ifdef PSM_CUDA
+			if (mp->mp_alloc_dealloc_cb)
+				psmi_mpool_chunk_dealloc(mp, i);
+#endif
+			psmi_free(mp->mp_elm_vector[i]);
+		}
+	}
+	psmi_free(mp->mp_elm_vector);
+	nbytes += mp->mp_elm_vector_size * sizeof(struct mpool_element *);
+	psmi_free(mp);
+	nbytes += sizeof(struct mpool);
+}
+
+/**
+ * psmi_mpool_get_obj_info()
+ *
+ * <mp>	    memory pool
+ *
+ * Returns <num_obj_per_chunk> and <num_obj_max_total> for the pool.
+ */
+void
+MOCKABLE(psmi_mpool_get_obj_info)(mpool_t mp, uint32_t *num_obj_per_chunk,
+			uint32_t *num_obj_max_total)
+{
+	*num_obj_per_chunk = mp->mp_num_obj_per_chunk;
+	*num_obj_max_total = mp->mp_num_obj_max_total;
+	return;
+}
+MOCK_DEF_EPILOGUE(psmi_mpool_get_obj_info);
+
+static int psmi_mpool_allocate_chunk(mpool_t mp)
+{
+	struct mpool_element *elm;
+	void *chunk;
+	uint32_t i = 0, num_to_allocate;
+
+	num_to_allocate =
+	    mp->mp_num_obj + mp->mp_num_obj_per_chunk >
+	    mp->mp_num_obj_max_total ? 0 : mp->mp_num_obj_per_chunk;
+
+	psmi_assert(mp->mp_num_obj + num_to_allocate <=
+		    mp->mp_num_obj_max_total);
+
+	if (num_to_allocate == 0)
+		return PSM2_NO_MEMORY;
+
+#ifdef PSM_CUDA
+	if (mp->mp_alloc_dealloc_cb)
+		chunk = psmi_calloc(PSMI_EP_NONE, mp->mp_memtype,
+				    num_to_allocate, mp->mp_elm_size);
+	else
+		chunk = psmi_malloc(PSMI_EP_NONE, mp->mp_memtype,
+				    num_to_allocate * mp->mp_elm_size);
+#else
+	chunk = psmi_malloc(PSMI_EP_NONE, mp->mp_memtype,
+			    num_to_allocate * mp->mp_elm_size);
+#endif
+	if (chunk == NULL) {
+		fprintf(stderr,
+			"Failed to allocate memory for memory pool chunk: %s\n",
+			strerror(errno));
+		return PSM2_NO_MEMORY;
+	}
+
+	for (i = 0; i < num_to_allocate; i++) {
+#ifdef PSM_CUDA
+		if (mp->mp_alloc_dealloc_cb)
+			mp->mp_alloc_dealloc_cb(1 /* is alloc */,
+						mp->mp_alloc_dealloc_cb_context,
+						chunk + i * mp->mp_elm_size +
+						sizeof(struct mpool_element));
+#endif
+		elm = (struct mpool_element *)((uintptr_t) chunk +
+					       i * mp->mp_elm_size +
+					       mp->mp_elm_offset);
+		elm->me_gen_count = 0;
+		elm->me_index = mp->mp_num_obj + i;
+#ifdef PSM_DEBUG
+		elm->me_isused = 0;
+#endif
+		SLIST_INSERT_HEAD(&mp->mp_head, elm, me_next);
+#if 0
+		fprintf(stderr, "chunk%ld i=%d elm=%p user=%p next=%p\n",
+			(long)(mp->mp_elm_vector_free - mp->mp_elm_vector),
+			(int)i, elm,
+			(void *)((uintptr_t) elm +
+				 sizeof(struct mpool_element)), SLIST_NEXT(elm,
+									   me_next));
+#endif
+	}
+
+	psmi_assert((uintptr_t) mp->mp_elm_vector_free
+		    < ((uintptr_t) mp->mp_elm_vector) + mp->mp_elm_vector_size
+		    * sizeof(struct mpool_element *));
+
+	mp->mp_elm_vector_free[0] = chunk;
+	mp->mp_elm_vector_free++;
+	mp->mp_num_obj += num_to_allocate;
+
+	return PSM2_OK;
+}
+
+#if 0
+void psmi_mpool_dump(mpool_t mp)
+{
+	int i, j;
+	struct mpool_element *me;
+
+	fprintf(stderr, "Memory pool %p has %d elements per chunk.\n",
+		mp, mp->mp_num_obj_per_chunk);
+	for (i = 0; i < mp->mp_elm_vector_size; i++) {
+		if (mp->mp_elm_vector[i] != NULL) {
+			fprintf(stderr, "===========================\n");
+			fprintf(stderr, "mpool chunk #%d\n", i);
+
+			for (j = 0, me = mp->mp_elm_vector[i];
+			     j < mp->mp_num_obj_per_chunk;
+			     j++, me = (struct mpool_element *)
+			     ((uintptr_t) me + mp->mp_elm_size)) {
+				fprintf(stderr,
+					"obj=%p index=%d gen_count=%d\n",
+					(void *)((uintptr_t) me +
+						 sizeof(struct mpool_element)),
+					me->me_index, me->me_gen_count);
+			}
+			fprintf(stderr, "===========================\n");
+		}
+	}
+}
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/psm_mpool.h b/deps/libfabric/prov/psm3/psm3/psm_mpool.h
new file mode 100644
index 0000000000000000000000000000000000000000..8098f60ce7198c15bfdd5871f5079fa031b18b34
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_mpool.h
@@ -0,0 +1,107 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_IN_USER_H
+#error psm_mpool.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef PSM_MPOOL_H
+#define PSM_MPOOL_H
+
+/* mpool flags */
+#define PSMI_MPOOL_ALIGN_CACHE	0x1
+#define PSMI_MPOOL_ALIGN_PAGE   0x2
+#define PSMI_MPOOL_NOGENERATION 0x4
+
+/* Backwards compatibility */
+#define PSMI_MPOOL_ALIGN	PSMI_MPOOL_ALIGN_CACHE
+
+typedef struct mpool *mpool_t;
+typedef void (*non_empty_callback_fn_t) (void *context);
+typedef void (*alloc_dealloc_callback_fn_t) (int is_alloc, void *context,
+					     void *chunk);
+
+mpool_t
+MOCKABLE(psmi_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk,
+			  uint32_t num_obj_max_total, int flags,
+			  psmi_memtype_t statstype,
+			  non_empty_callback_fn_t cb, void *context);
+MOCK_DCL_EPILOGUE(psmi_mpool_create);
+
+mpool_t psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk,
+				   uint32_t num_obj_max_total, int flags,
+				   psmi_memtype_t statstype,
+				   non_empty_callback_fn_t cb, void *context,
+				   alloc_dealloc_callback_fn_t ad_cb,
+				   void *ad_context);
+
+void psmi_mpool_destroy(mpool_t mp);
+
+void
+MOCKABLE(psmi_mpool_get_obj_info)(mpool_t mp, uint32_t *num_obj_per_chunk,
+			     uint32_t *num_obj_max_total);
+MOCK_DCL_EPILOGUE(psmi_mpool_get_obj_info);
+
+void *psmi_mpool_get(mpool_t mp);
+void psmi_mpool_put(void *obj);
+
+int psmi_mpool_get_obj_index(void *obj);
+uint32_t psmi_mpool_get_obj_gen_count(void *obj);
+int psmi_mpool_get_obj_index_gen_count(void *obj,
+				       uint32_t *index, uint32_t *gen_count);
+
+void *psmi_mpool_find_obj_by_index(mpool_t mp, int index);
+
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/psm_mq.c b/deps/libfabric/prov/psm3/psm3/psm_mq.c
new file mode 100644
index 0000000000000000000000000000000000000000..660f188d379684d62d9dc50010e5855bfd6bdb9f
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_mq.c
@@ -0,0 +1,1645 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include <sched.h>
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "psm_mq_internal.h"
+#include "ips_proto_params.h"
+
+#ifdef PSM_CUDA
+#include "psm_gdrcpy.h"
+#endif
+
+/*
+ * Functions to manipulate the expected queue in mq_ep.
+ */
+
+/*
+ * Once the linked lists cross the size limit, this function will enable tag
+ * hashing and disable the non-hashing fastpath. We need to go back and insert
+ * reqs into the hash tables where the hashing searches will look for them.
+ */
+void
+psmi_mq_fastpath_disable(psm2_mq_t mq)
+{
+	psm2_mq_req_t *curp, cur;
+	struct mqq *qp;
+	unsigned hashvals[NUM_HASH_CONFIGS];
+	int t = PSM2_ANYTAG_ANYSRC;
+
+	mq->nohash_fastpath = 0;
+	/* Everything in the unexpected_q needs to be duplicated into
+	   each of the (three) unexpected hash tables. */
+	qp = &mq->unexpected_q;
+	for (curp = &qp->first; (cur = *curp) != NULL; curp = &cur->next[t]) {
+		mq->unexpected_hash_len++;
+		hashvals[PSM2_TAG_SRC] =
+			hash_64(cur->req_data.tag.tag64) % NUM_HASH_BUCKETS;
+		hashvals[PSM2_TAG_ANYSRC] =
+			hash_32(cur->req_data.tag.tag[0]) % NUM_HASH_BUCKETS;
+		hashvals[PSM2_ANYTAG_SRC] =
+			hash_32(cur->req_data.tag.tag[1]) % NUM_HASH_BUCKETS;
+		for (t = PSM2_TAG_SRC; t < PSM2_ANYTAG_ANYSRC; t++)
+			mq_qq_append_which(mq->unexpected_htab,
+					   t, hashvals[t], cur);
+	}
+
+	/* Everything in the expected_q needs to be moved into the
+	   (single) correct expected hash table. */
+	qp = &mq->expected_q;
+	for (curp = &qp->first; (cur = *curp) != NULL; /*curp = &cur->next*/) {
+		/* must read next ptr before remove */
+		curp = &cur->next[PSM2_ANYTAG_ANYSRC];
+		if ((cur->req_data.tagsel.tag[0] == 0xFFFFFFFF) &&
+		    (cur->req_data.tagsel.tag[1] == 0xFFFFFFFF)) {
+			/* hash tag0 and tag1 */
+			t = PSM2_TAG_SRC;
+			hashvals[t] = hash_64(cur->req_data.tag.tag64) % NUM_HASH_BUCKETS;
+			mq_qq_append_which(mq->expected_htab,
+					   t, hashvals[t], cur);
+		} else if (cur->req_data.tagsel.tag[0] == 0xFFFFFFFF) {
+			t = PSM2_TAG_ANYSRC;
+			hashvals[t] = hash_32(cur->req_data.tag.tag[0]) % NUM_HASH_BUCKETS;
+			mq_qq_append_which(mq->expected_htab,
+					   t, hashvals[t], cur);
+		} else if (cur->req_data.tagsel.tag[1] == 0xFFFFFFFF) {
+			t = PSM2_ANYTAG_SRC;
+			hashvals[t] = hash_32(cur->req_data.tag.tag[1]) % NUM_HASH_BUCKETS;
+			mq_qq_append_which(mq->expected_htab,
+					   t, hashvals[t], cur);
+		} else
+			continue; /* else, req must stay in ANY ANY */
+
+		mq->expected_list_len--;
+		mq->expected_hash_len++;
+		mq_qq_remove_which(cur, PSM2_ANYTAG_ANYSRC);
+	}
+}
+
+/* Thresholds for re-enabling the fastpath, from easy to aggressive:
+   |hash| == 0 && |list| < X;  |hash| + |list| < X;  or, easiest of all,
+   |hash| + |list| == 0, which is what is used below.  A less eager
+   approach might be better, to avoid constant bouncing between modes. */
+void psmi_mq_fastpath_try_reenable(psm2_mq_t mq)
+{
+	if_pf(mq->nohash_fastpath == 0 &&
+	      mq->unexpected_hash_len == 0 &&
+	      mq->expected_hash_len == 0 &&
+	      mq->unexpected_list_len == 0 &&
+	      mq->expected_list_len == 0){
+		mq->nohash_fastpath = 1;
+	}
+}
+
+/*! @brief PSM-exposed version to allow PTLs to match */
+
+/*! @brief Try to match against the MQ using a tag and tagsel
+ *
+ * @param[in] mq Message Queue
+ * @param[in] src Source (sender) epaddr, may be PSM2_MQ_ANY_ADDR.
+ * @param[in] tag Input Tag
+ * @param[in] tagsel Input Tag Selector
+ * @param[in] remove Non-zero to remove the req from the queue
+ *
+ * @returns NULL if no match or an mq request if there is a match
+ */
+static
+psm2_mq_req_t
+mq_req_match_with_tagsel(psm2_mq_t mq, psm2_epaddr_t src,
+			 psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel, int remove)
+{
+	psm2_mq_req_t *curp;
+	psm2_mq_req_t cur;
+	unsigned hashval;
+	int i, j = 0;
+	struct mqq *qp;
+
+	if_pt (mq->nohash_fastpath) {
+		i = j = PSM2_ANYTAG_ANYSRC;
+		qp = &mq->unexpected_q;
+	} else if ((tagsel->tag[0] == 0xFFFFFFFF) &&
+		   (tagsel->tag[1] == 0xFFFFFFFF)) {
+		i = PSM2_TAG_SRC;
+		hashval = hash_64(tag->tag64) % NUM_HASH_BUCKETS;
+		qp = &mq->unexpected_htab[i][hashval];
+	} else if (tagsel->tag[0] == 0xFFFFFFFF) {
+		i = PSM2_TAG_ANYSRC;
+		hashval = hash_32(tag->tag[0]) % NUM_HASH_BUCKETS;
+		qp = &mq->unexpected_htab[i][hashval];
+	} else if (tagsel->tag[1] == 0xFFFFFFFF) {
+		i = PSM2_ANYTAG_SRC;
+		hashval = hash_32(tag->tag[1]) % NUM_HASH_BUCKETS;
+		qp = &mq->unexpected_htab[i][hashval];
+	} else {
+		/* unhashable tag */
+		i = PSM2_ANYTAG_ANYSRC;
+		qp = &mq->unexpected_q;
+	}
+
+	for (curp = &qp->first; (cur = *curp) != NULL; curp = &cur->next[i]) {
+		psmi_assert(cur->req_data.peer != PSM2_MQ_ANY_ADDR);
+		if ((src == PSM2_MQ_ANY_ADDR || src == cur->req_data.peer) &&
+		    !((tag->tag[0] ^ cur->req_data.tag.tag[0]) & tagsel->tag[0]) &&
+		    !((tag->tag[1] ^ cur->req_data.tag.tag[1]) & tagsel->tag[1]) &&
+		    !((tag->tag[2] ^ cur->req_data.tag.tag[2]) & tagsel->tag[2])) {
+			/* match! */
+			if (remove) {
+				if_pt (i == PSM2_ANYTAG_ANYSRC)
+					mq->unexpected_list_len--;
+				else
+					mq->unexpected_hash_len--;
+				for (; j < NUM_MQ_SUBLISTS; j++)
+					mq_qq_remove_which(cur, j);
+				psmi_mq_fastpath_try_reenable(mq);
+			}
+			return cur;
+		}
+	}
+	return NULL;
+}
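+
+/* Matching sketch: a probe or receive posted with (src, tag, tagsel) matches
+ * a stored request iff the peer matches (or src == PSM2_MQ_ANY_ADDR) and
+ *
+ *   ((tag[i] ^ stored_tag[i]) & tagsel[i]) == 0   for i = 0, 1, 2
+ *
+ * A tagsel word of 0xFFFFFFFF thus demands exact equality on that word and a
+ * tagsel word of 0 ignores it; e.g. tagsel = {0xFFFFFFFF, 0, 0} matches on
+ * tag[0] alone.  Only the all-ones patterns can be routed to a hash table,
+ * which is exactly the case analysis above.
+ */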
+
+static void mq_add_to_expected_hashes(psm2_mq_t mq, psm2_mq_req_t req)
+{
+	unsigned hashval;
+	int i;
+
+	req->timestamp = mq->timestamp++;
+	if_pt (mq->nohash_fastpath) {
+		mq_qq_append(&mq->expected_q, req);
+		req->q[PSM2_ANYTAG_ANYSRC] = &mq->expected_q;
+		mq->expected_list_len++;
+		if_pf (mq->expected_list_len >= HASH_THRESHOLD)
+			psmi_mq_fastpath_disable(mq);
+	} else if ((req->req_data.tagsel.tag[0] == 0xFFFFFFFF) &&
+		   (req->req_data.tagsel.tag[1] == 0xFFFFFFFF)) {
+		i = PSM2_TAG_SRC;
+		hashval = hash_64(req->req_data.tag.tag64) % NUM_HASH_BUCKETS;
+		mq_qq_append_which(mq->expected_htab, i, hashval, req);
+		mq->expected_hash_len++;
+	} else if (req->req_data.tagsel.tag[0] == 0xFFFFFFFF) {
+		i = PSM2_TAG_ANYSRC;
+		hashval = hash_32(req->req_data.tag.tag[0]) % NUM_HASH_BUCKETS;
+		mq_qq_append_which(mq->expected_htab, i, hashval, req);
+		mq->expected_hash_len++;
+	} else if (req->req_data.tagsel.tag[1] == 0xFFFFFFFF) {
+		i = PSM2_ANYTAG_SRC;
+		hashval = hash_32(req->req_data.tag.tag[1]) % NUM_HASH_BUCKETS;
+		mq_qq_append_which(mq->expected_htab, i, hashval, req);
+		mq->expected_hash_len++;
+	} else {
+		mq_qq_append(&mq->expected_q, req);
+		req->q[PSM2_ANYTAG_ANYSRC] = &mq->expected_q;
+		mq->expected_list_len++;
+	}
+}
+
+/*! @brief Try to remove the req in the MQ
+ *
+ * @param[in] mq Message Queue
+ * @param[in] req MQ request
+ *
+ * @returns 1 if successfully removed, or 0 if req cannot be found.
+ */
+static
+int mq_req_remove_single(psm2_mq_t mq, psm2_mq_req_t req)
+{
+	int i;
+
+	/* item should only exist in one expected queue at a time */
+	psmi_assert((!!req->q[0] + !!req->q[1] + !!req->q[2] + !!req->q[3]) == 1);
+
+	for (i = 0; i < NUM_MQ_SUBLISTS; i++)
+		if (req->q[i]) /* found */
+			break;
+	switch (i) {
+	case PSM2_ANYTAG_ANYSRC:
+		mq->expected_list_len--;
+		break;
+	case PSM2_TAG_SRC:
+	case PSM2_TAG_ANYSRC:
+	case PSM2_ANYTAG_SRC:
+		mq->expected_hash_len--;
+		break;
+	default:
+		return 0;
+	}
+
+	mq_qq_remove_which(req, i);
+	psmi_mq_fastpath_try_reenable(mq);
+	return 1;
+}
+
+PSMI_ALWAYS_INLINE(
+psm2_mq_req_t
+psmi_mq_iprobe_inner(psm2_mq_t mq, psm2_epaddr_t src,
+		     psm2_mq_tag_t *tag,
+		     psm2_mq_tag_t *tagsel, int remove_req))
+{
+	psm2_mq_req_t req;
+
+	PSMI_LOCK(mq->progress_lock);
+	req = mq_req_match_with_tagsel(mq, src, tag, tagsel, remove_req);
+
+	if (req != NULL) {
+		PSMI_UNLOCK(mq->progress_lock);
+		return req;
+	}
+
+	psmi_poll_internal(mq->ep, 1);
+	/* try again */
+	req = mq_req_match_with_tagsel(mq, src, tag, tagsel, remove_req);
+
+	PSMI_UNLOCK(mq->progress_lock);
+	return req;
+}
+
+psm2_error_t
+__psm2_mq_iprobe2(psm2_mq_t mq, psm2_epaddr_t src,
+		 psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel,
+		 psm2_mq_status2_t *status)
+{
+	psm2_mq_req_t req;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ASSERT_INITIALIZED();
+
+	req = psmi_mq_iprobe_inner(mq, src, tag, tagsel, 0);
+	psmi_assert_req_not_internal(req);
+
+	if (req != NULL) {
+		if (status != NULL) {
+			mq_status2_copy(req, status);
+		}
+		PSM2_LOG_MSG("leaving");
+		return PSM2_OK;
+	}
+	PSM2_LOG_MSG("leaving");
+	return PSM2_MQ_NO_COMPLETIONS;
+}
+PSMI_API_DECL(psm2_mq_iprobe2)
+
+psm2_error_t
+__psm2_mq_iprobe(psm2_mq_t mq, uint64_t tag, uint64_t tagsel,
+		psm2_mq_status_t *status)
+{
+	psm2_mq_tag_t rtag;
+	psm2_mq_tag_t rtagsel;
+	psm2_mq_req_t req;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ASSERT_INITIALIZED();
+
+	rtag.tag64 = tag;
+#ifdef PSM_DEBUG
+	rtag.tag[2] = 0;
+#endif
+	rtagsel.tag64 = tagsel;
+	rtagsel.tag[2] = 0;
+
+	req = psmi_mq_iprobe_inner(mq, PSM2_MQ_ANY_ADDR, &rtag, &rtagsel, 0);
+	psmi_assert_req_not_internal(req);
+
+	if (req != NULL) {
+		if (status != NULL) {
+			mq_status_copy(req, status);
+		}
+		PSM2_LOG_MSG("leaving");
+		return PSM2_OK;
+	}
+
+	PSM2_LOG_MSG("leaving");
+
+	return PSM2_MQ_NO_COMPLETIONS;
+}
+PSMI_API_DECL(psm2_mq_iprobe)
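+
+/* Hypothetical caller sketch (application side, not part of this file):
+ * polling for a message with 64-bit tag 0x2a, matching all tag bits:
+ *
+ *   psm2_mq_status_t st;
+ *   while (psm2_mq_iprobe(mq, 0x2a, ~0ULL, &st) == PSM2_MQ_NO_COMPLETIONS)
+ *       ;  // each call advances progress once via psmi_poll_internal()
+ *   // st.msg_length now reports the pending message's size
+ */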
+
+psm2_error_t
+__psm2_mq_improbe2(psm2_mq_t mq, psm2_epaddr_t src,
+		  psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel,
+		  psm2_mq_req_t *reqo, psm2_mq_status2_t *status)
+{
+	psm2_mq_req_t req;
+
+	PSM2_LOG_MSG("entering");
+
+	PSMI_ASSERT_INITIALIZED();
+
+	req = psmi_mq_iprobe_inner(mq, src, tag, tagsel, 1);
+	if (req != NULL) {
+		if (status != NULL) {
+			mq_status2_copy(req, status);
+		}
+		*reqo = req;
+		PSM2_LOG_MSG("leaving");
+		return PSM2_OK;
+	}
+
+	*reqo = NULL;
+	PSM2_LOG_MSG("leaving");
+	return PSM2_MQ_NO_COMPLETIONS;
+}
+PSMI_API_DECL(psm2_mq_improbe2)
+
+psm2_error_t
+__psm2_mq_improbe(psm2_mq_t mq, uint64_t tag, uint64_t tagsel,
+		 psm2_mq_req_t *reqo, psm2_mq_status_t *status)
+{
+	psm2_mq_tag_t rtag;
+	psm2_mq_tag_t rtagsel;
+	psm2_mq_req_t req;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ASSERT_INITIALIZED();
+
+	rtag.tag64 = tag;
+#ifdef PSM_DEBUG
+	rtag.tag[2] = 0;
+#endif
+	rtagsel.tag64 = tagsel;
+	rtagsel.tag[2] = 0;
+
+	req = psmi_mq_iprobe_inner(mq, PSM2_MQ_ANY_ADDR, &rtag, &rtagsel, 1);
+	if (req != NULL) {
+		if (status != NULL) {
+			mq_status_copy(req, status);
+		}
+		*reqo = req;
+		PSM2_LOG_MSG("leaving");
+		return PSM2_OK;
+	}
+
+	*reqo = NULL;
+	PSM2_LOG_MSG("leaving");
+	return PSM2_MQ_NO_COMPLETIONS;
+}
+PSMI_API_DECL(psm2_mq_improbe)
+
+psm2_error_t __psm2_mq_cancel(psm2_mq_req_t *ireq)
+{
+	psm2_mq_req_t req = *ireq;
+	psm2_mq_t mq;
+	psm2_error_t err = PSM2_OK;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ASSERT_INITIALIZED();
+
+	if (req == NULL) {
+		PSM2_LOG_MSG("leaving");
+		return PSM2_MQ_NO_COMPLETIONS;
+	}
+
+	/* Cancelling a send would be a blocking and expensive operation, so it
+	 * is not supported: the code below rejects cancellation of any send
+	 * request, and only still-posted receive requests can be cancelled.
+	 */
+	mq = req->mq;
+	PSMI_LOCK(mq->progress_lock);
+
+	if (MQE_TYPE_IS_RECV(req->type)) {
+		if (req->state == MQ_STATE_POSTED) {
+			int rc;
+
+			rc = mq_req_remove_single(mq, req);
+			psmi_assert_always(rc);
+			req->state = MQ_STATE_COMPLETE;
+			mq_qq_append(&mq->completed_q, req);
+			err = PSM2_OK;
+		} else
+			err = PSM2_MQ_NO_COMPLETIONS;
+	} else {
+		err = psmi_handle_error(mq->ep, PSM2_PARAM_ERR,
+					"Cannot cancel send requests (req=%p)",
+					req);
+	}
+
+	PSMI_UNLOCK(mq->progress_lock);
+
+	PSM2_LOG_MSG("leaving");
+
+	return err;
+}
+PSMI_API_DECL(psm2_mq_cancel)
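+
+/* Hedged usage sketch: cancelling a preposted receive.  On PSM2_OK the
+ * request has been moved to the completed queue and must still be reaped,
+ * e.g. with psm2_mq_test(), to release it:
+ *
+ *   if (psm2_mq_cancel(&req) == PSM2_OK)
+ *       psm2_mq_test(&req, NULL);  // frees req, resets it to PSM2_MQ_REQINVALID
+ */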
+
+/* This is the only PSM function that blocks.
+ * We handle it in a special manner since we don't know what the user's
+ * execution environment is (threads, oversubscribing processes, etc).
+ *
+ * The status argument can be an instance of either type psm2_mq_status_t or
+ * psm2_mq_status2_t.  Depending on the type, a corresponding status copy
+ * routine should be passed in.
+ */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+psmi_mq_wait_inner(psm2_mq_req_t *ireq, void *status,
+		   psmi_mq_status_copy_t status_copy,
+		   int do_lock))
+{
+	psm2_error_t err = PSM2_OK;
+
+	psm2_mq_req_t req = *ireq;
+	if (req == PSM2_MQ_REQINVALID) {
+		return PSM2_OK;
+	}
+
+	if (do_lock)
+		PSMI_LOCK(req->mq->progress_lock);
+
+	if (req->state != MQ_STATE_COMPLETE) {
+		psm2_mq_t mq = req->mq;
+
+		/* We'll be waiting on this req, mark it as so */
+		req->type |= MQE_TYPE_WAITING;
+
+		_HFI_VDBG("req=%p, buf=%p, len=%d, waiting\n",
+			  req, req->req_data.buf, req->req_data.buf_len);
+
+		if (req->testwait_callback) {
+			err = req->testwait_callback(ireq);
+			if (do_lock)
+				PSMI_UNLOCK(req->mq->progress_lock);
+			if (status != NULL) {
+				status_copy(req, status);
+			}
+			return err;
+		}
+
+		PSMI_BLOCKUNTIL(mq->ep, err, req->state == MQ_STATE_COMPLETE);
+
+		if (err > PSM2_OK_NO_PROGRESS)
+			goto fail_with_lock;
+		else
+			err = PSM2_OK;
+	}
+
+	if(!psmi_is_req_internal(req))
+		mq_qq_remove(&req->mq->completed_q, req);
+
+	if (status != NULL) {
+		status_copy(req, status);
+	}
+
+	_HFI_VDBG("req=%p complete, buf=%p, len=%d, err=%d\n",
+		  req, req->req_data.buf, req->req_data.buf_len, req->req_data.error_code);
+
+	psmi_mq_req_free(req);
+	*ireq = PSM2_MQ_REQINVALID;
+
+fail_with_lock:
+	if (do_lock)
+		PSMI_UNLOCK(req->mq->progress_lock);
+	return err;
+}
+
+psm2_error_t
+__psm2_mq_wait2(psm2_mq_req_t *ireq, psm2_mq_status2_t *status)
+{
+	psm2_error_t rv;
+	PSM2_LOG_MSG("entering");
+	PSMI_ASSERT_INITIALIZED();
+	psmi_assert_req_not_internal(*ireq);
+
+	rv = psmi_mq_wait_inner(ireq, status,
+				  (psmi_mq_status_copy_t) mq_status2_copy, 1);
+	PSM2_LOG_MSG("leaving");
+	return rv;
+}
+PSMI_API_DECL(psm2_mq_wait2)
+
+psm2_error_t
+__psm2_mq_wait(psm2_mq_req_t *ireq, psm2_mq_status_t *status)
+{
+	psm2_error_t rv;
+	PSM2_LOG_MSG("entering");
+	PSMI_ASSERT_INITIALIZED();
+	psmi_assert_req_not_internal(*ireq);
+
+	rv = psmi_mq_wait_inner(ireq, status,
+				  (psmi_mq_status_copy_t) mq_status_copy, 1);
+	PSM2_LOG_MSG("leaving");
+	return rv;
+}
+PSMI_API_DECL(psm2_mq_wait)
+
+psm2_error_t psmi_mq_wait_internal(psm2_mq_req_t *ireq)
+{
+	return psmi_mq_wait_inner(ireq, NULL, NULL, 0);
+}
+
+/* The status argument can be an instance of either type psm2_mq_status_t or
+ * psm2_mq_status2_t.  Depending on the type, a corresponding status copy
+ * routine should be passed in.
+ */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+psmi_mq_test_inner(psm2_mq_req_t *ireq, void *status,
+		   psmi_mq_status_copy_t status_copy))
+{
+	psm2_mq_req_t req = *ireq;
+	psm2_error_t err = PSM2_OK;
+
+	PSMI_ASSERT_INITIALIZED();
+
+	if (req == PSM2_MQ_REQINVALID) {
+		return PSM2_OK;
+	}
+
+	if (req->state != MQ_STATE_COMPLETE) {
+		if (req->testwait_callback) {
+			PSMI_LOCK(req->mq->progress_lock);
+			err = req->testwait_callback(ireq);
+			if (status != NULL) {
+				status_copy(req, status);
+			}
+			PSMI_UNLOCK(req->mq->progress_lock);
+			return err;
+		} else
+			return PSM2_MQ_NO_COMPLETIONS;
+	}
+
+	if (status != NULL)
+		status_copy(req, status);
+
+	_HFI_VDBG
+	    ("req=%p complete, tag=%08x.%08x.%08x buf=%p, len=%d, err=%d\n",
+	     req, req->req_data.tag.tag[0], req->req_data.tag.tag[1],
+	     req->req_data.tag.tag[2], req->req_data.buf,
+	     req->req_data.buf_len, req->req_data.error_code);
+
+	PSMI_LOCK(req->mq->progress_lock);
+	mq_qq_remove(&req->mq->completed_q, req);
+	psmi_mq_req_free(req);
+	PSMI_UNLOCK(req->mq->progress_lock);
+
+	*ireq = PSM2_MQ_REQINVALID;
+
+	return err;
+}
+
+psm2_error_t
+__psm2_mq_test2(psm2_mq_req_t *ireq, psm2_mq_status2_t *status)
+{
+	psm2_error_t rv;
+	PSM2_LOG_MSG("entering");
+	rv = psmi_mq_test_inner(ireq, status,
+				  (psmi_mq_status_copy_t) mq_status2_copy);
+	PSM2_LOG_MSG("leaving");
+	return rv;
+}
+PSMI_API_DECL(psm2_mq_test2)
+
+psm2_error_t
+__psm2_mq_test(psm2_mq_req_t *ireq, psm2_mq_status_t *status)
+{
+	psm2_error_t rv;
+	PSM2_LOG_MSG("entering");
+	rv = psmi_mq_test_inner(ireq, status,
+				  (psmi_mq_status_copy_t) mq_status_copy);
+	PSM2_LOG_MSG("leaving");
+	return rv;
+
+}
+PSMI_API_DECL(psm2_mq_test)
+
+psm2_error_t
+__psm2_mq_isend2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags,
+		psm2_mq_tag_t *stag, const void *buf, uint32_t len,
+		void *context, psm2_mq_req_t *req)
+{
+	psm2_error_t err;
+
+	PSM2_LOG_MSG("entering");
+
+	PSMI_ASSERT_INITIALIZED();
+	psmi_assert(stag != NULL);
+
+	PSMI_LOCK(mq->progress_lock);
+	err =
+		dest->ptlctl->mq_isend(mq, dest, flags, PSMI_REQ_FLAG_NORMAL,
+				stag, buf, len, context, req);
+	PSMI_UNLOCK(mq->progress_lock);
+
+	psmi_assert(*req != NULL);
+	psmi_assert_req_not_internal(*req);
+
+	(*req)->req_data.peer = dest;
+
+	PSM2_LOG_MSG("leaving");
+
+	return err;
+}
+PSMI_API_DECL(psm2_mq_isend2)
+
+psm2_error_t
+__psm2_mq_isend(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag,
+	       const void *buf, uint32_t len, void *context, psm2_mq_req_t *req)
+{
+	psm2_error_t err;
+	psm2_mq_tag_t tag;
+
+	PSM2_LOG_MSG("entering");
+
+	tag.tag64 = stag;
+	tag.tag[2] = 0;
+
+	PSMI_ASSERT_INITIALIZED();
+
+	PSMI_LOCK(mq->progress_lock);
+	err = dest->ptlctl->mq_isend(mq, dest, flags, PSMI_REQ_FLAG_NORMAL,
+				&tag, buf, len, context, req);
+	PSMI_UNLOCK(mq->progress_lock);
+
+	psmi_assert(*req != NULL);
+	psmi_assert_req_not_internal(*req);
+
+	(*req)->req_data.peer = dest;
+
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_mq_isend)
+
+psm2_error_t
+__psm2_mq_send2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags,
+	       psm2_mq_tag_t *stag, const void *buf, uint32_t len)
+{
+	psm2_error_t err;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ASSERT_INITIALIZED();
+	psmi_assert(stag != NULL);
+
+	PSMI_LOCK(mq->progress_lock);
+	err = dest->ptlctl->mq_send(mq, dest, flags, stag, buf, len);
+	PSMI_UNLOCK(mq->progress_lock);
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_mq_send2)
+
+psm2_error_t
+__psm2_mq_send(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag,
+	      const void *buf, uint32_t len)
+{
+	psm2_error_t err;
+	psm2_mq_tag_t tag;
+
+	PSM2_LOG_MSG("entering stag: 0x%" PRIx64, stag);
+
+	tag.tag64 = stag;
+	tag.tag[2] = 0;
+
+	PSMI_ASSERT_INITIALIZED();
+
+	PSMI_LOCK(mq->progress_lock);
+	err = dest->ptlctl->mq_send(mq, dest, flags, &tag, buf, len);
+	PSMI_UNLOCK(mq->progress_lock);
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_mq_send)
+
+/*
+ * Common subroutine to psm2_mq_irecv2 and psm2_mq_imrecv.  This code assumes
+ * that the provided request has been matched, and begins copying message data
+ * that has already arrived into the user's buffer.  Any remaining data is
+ * copied by PSM polling until the message is complete.
+ * The caller has initialized req->is_buf_gpu_mem and req->user_gpu_buffer
+ * consistently with buf/len, which describe the application buffer, but
+ * req->req_data.buf and req->req_data.buf_len still point to the sysbuf
+ * where the data landed.
+ */
+static psm2_error_t
+psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len)
+{
+	uint32_t msglen;
+
+	PSM2_LOG_MSG("entering");
+	psmi_assert(MQE_TYPE_IS_RECV(req->type));
+
+	_HFI_VDBG("(req=%p) buf=%p len=%u req.state=%u\n", req, buf, len, req->state);
+
+	switch (req->state) {
+	case MQ_STATE_COMPLETE:
+		if (req->req_data.buf != NULL) {	/* 0-byte messages don't alloc a sysbuf */
+			msglen = mq_set_msglen(req, len, req->req_data.send_msglen);
+			psmi_mq_recv_copy(mq, req,
+#ifdef PSM_CUDA
+					req->is_buf_gpu_mem,
+#endif
+					buf, len, msglen);
+			psmi_mq_sysbuf_free(mq, req->req_data.buf);
+#ifdef PSM_CUDA
+		} else {
+			mq->stats.rx_sysbuf_cpu_num++;
+#endif
+		}
+		req->req_data.buf = buf;
+		req->req_data.buf_len = len;
+		mq_qq_append(&mq->completed_q, req);
+		break;
+
+	case MQ_STATE_UNEXP:	/* not done yet */
+		msglen = mq_set_msglen(req, len, req->req_data.send_msglen);
+		/* Copy what's been received so far, and make sure we don't
+		 * receive any more than copysz.  After that, swap the system
+		 * buffer for the user buffer.
+		 */
+		req->recv_msgoff = min(req->recv_msgoff, msglen);
+		psmi_mq_recv_copy(mq, req,
+#ifdef PSM_CUDA
+				req->is_buf_gpu_mem,
+#endif
+				buf, len, req->recv_msgoff);
+		psmi_mq_sysbuf_free(mq, req->req_data.buf);
+
+		req->state = MQ_STATE_MATCHED;
+		req->req_data.buf = buf;
+		req->req_data.buf_len = len;
+		break;
+
+	case MQ_STATE_UNEXP_RV:	/* rendez-vous ... */
+		msglen = mq_set_msglen(req, len, req->req_data.send_msglen);
+		/* Copy what's been received so far, and make sure we don't
+		 * receive any more than copysz.  After that, swap the system
+		 * buffer for the user buffer.
+		 */
+		req->recv_msgoff = min(req->recv_msgoff, msglen);
+		if (req->send_msgoff) {	// only have sysbuf if RTS w/payload
+			psmi_mq_recv_copy(mq, req,
+#ifdef PSM_CUDA
+					req->is_buf_gpu_mem,
+#endif
+					buf, len, req->recv_msgoff);
+			psmi_mq_sysbuf_free(mq, req->req_data.buf);
+		}
+
+		req->state = MQ_STATE_MATCHED;
+		req->req_data.buf = buf;
+		req->req_data.buf_len = len;
+		req->rts_callback(req, 0);
+		break;
+
+	default:
+		fprintf(stderr, "Unexpected state %d in req %p\n", req->state,
+			req);
+		fprintf(stderr, "type=%d, mq=%p, tag=%08x.%08x.%08x\n",
+			req->type, req->mq, req->req_data.tag.tag[0], req->req_data.tag.tag[1],
+			req->req_data.tag.tag[2]);
+		abort();
+	}
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK;
+}
+
+psm2_error_t
+__psm2_mq_fp_msg(psm2_ep_t ep, psm2_mq_t mq, psm2_epaddr_t addr, psm2_mq_tag_t *tag,
+		psm2_mq_tag_t *tagsel, uint32_t flags, void *buf, uint32_t len,
+		void *context, enum psm2_mq_fp_op fp_type, psm2_mq_req_t *req)
+{
+	psm2_error_t err = PSM2_OK;
+
+	PSM2_LOG_MSG("entering");
+
+	PSMI_ASSERT_INITIALIZED();
+
+	PSMI_LOCK_ASSERT(mq->progress_lock);
+
+	if (fp_type == PSM2_MQ_ISEND_FP) {
+		psmi_assert(tag != NULL);
+		err =
+			addr->ptlctl->mq_isend(mq, addr, flags, PSMI_REQ_FLAG_FASTPATH,
+					       tag, buf, len, context, req);
+
+		psmi_assert(*req != NULL);
+		psmi_assert_req_not_internal(*req);
+
+		(*req)->req_data.peer = addr;
+	} else if (fp_type == PSM2_MQ_IRECV_FP) {
+		psm2_mq_req_t recv_req;
+
+#ifdef PSM_CUDA
+		int gpu_mem = 0;
+		void *gpu_user_buffer = NULL;
+
+		if (len && PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(buf)) {
+			psmi_cuda_set_attr_sync_memops(buf);
+
+			gpu_mem = 1;
+			gpu_user_buffer = buf;
+		}
+#endif
+
+		/* First check the unexpected queue and remove the req if found */
+		recv_req = mq_req_match_with_tagsel(mq, addr, tag, tagsel, REMOVE_ENTRY);
+
+		if (recv_req == NULL) {
+			/* prepost before arrival, add to expected q */
+			recv_req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV);
+			if_pf(recv_req == NULL) {
+				err = PSM2_NO_MEMORY;
+				goto recv_ret;
+			}
+
+			recv_req->req_data.peer = addr;
+			recv_req->req_data.tag = *tag;
+			recv_req->req_data.tagsel = *tagsel;
+			recv_req->state = MQ_STATE_POSTED;
+			recv_req->req_data.buf = buf;
+			recv_req->req_data.buf_len = len;
+			recv_req->req_data.recv_msglen = len;
+			recv_req->recv_msgoff = 0;
+			recv_req->req_data.context = context;
+
+#ifdef PSM_CUDA
+			recv_req->is_buf_gpu_mem = gpu_mem;
+			recv_req->user_gpu_buffer = gpu_user_buffer;
+#endif
+
+			mq_add_to_expected_hashes(mq, recv_req);
+			_HFI_VDBG("buf=%p,len=%d,tag=%08x.%08x.%08x "
+				  " tagsel=%08x.%08x.%08x req=%p\n",
+				  buf, len, tag->tag[0], tag->tag[1], tag->tag[2],
+				  tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], recv_req);
+		} else {
+			_HFI_VDBG("unexpected buf=%p,len=%d,tag=%08x.%08x.%08x"
+				  " tagsel=%08x.%08x.%08x req=%p\n", buf, len,
+				  tag->tag[0], tag->tag[1], tag->tag[2],
+				  tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], recv_req);
+
+#ifdef PSM_CUDA
+			recv_req->is_buf_gpu_mem = gpu_mem;
+			recv_req->user_gpu_buffer = gpu_user_buffer;
+#endif
+
+			recv_req->req_data.context = context;
+
+			psm2_mq_irecv_inner(mq, recv_req, buf, len);
+		}
+recv_ret:
+		psmi_assert_req_not_internal(recv_req);
+		*req = recv_req;
+	} else {
+		err = PSM2_PARAM_ERR;
+	}
+
+	PSM2_LOG_MSG("leaving");
+
+	return err;
+}
+PSMI_API_DECL(psm2_mq_fp_msg)
+
+psm2_error_t
+__psm2_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src,
+		psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel,
+		uint32_t flags, void *buf, uint32_t len, void *context,
+		psm2_mq_req_t *reqo)
+{
+	psm2_error_t err = PSM2_OK;
+	psm2_mq_req_t req;
+
+#ifdef PSM_CUDA
+	int gpu_mem = 0;
+
+	if (len && PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(buf)) {
+		psmi_cuda_set_attr_sync_memops(buf);
+
+		gpu_mem = 1;
+	}
+#endif
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ASSERT_INITIALIZED();
+
+	PSMI_LOCK(mq->progress_lock);
+
+	/* First check the unexpected queue and remove the req if found */
+	req = mq_req_match_with_tagsel(mq, src, tag, tagsel, REMOVE_ENTRY);
+
+	if (req == NULL) {
+		/* prepost before arrival, add to expected q */
+		req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV);
+		if_pf(req == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto ret;
+		}
+
+		req->req_data.peer = src;
+		req->req_data.tag = *tag;
+		req->req_data.tagsel = *tagsel;
+		req->state = MQ_STATE_POSTED;
+		req->req_data.buf = buf;
+		req->req_data.buf_len = len;
+		req->req_data.recv_msglen = len;
+		req->recv_msgoff = 0;
+		req->req_data.context = context;
+
+#ifdef PSM_CUDA
+		req->is_buf_gpu_mem = gpu_mem;
+		if (gpu_mem)
+			req->user_gpu_buffer = buf;
+		else
+			req->user_gpu_buffer = NULL;
+#endif
+
+		mq_add_to_expected_hashes(mq, req);
+		_HFI_VDBG("buf=%p,len=%d,tag=%08x.%08x.%08x "
+			  " tagsel=%08x.%08x.%08x req=%p\n",
+			  buf, len, tag->tag[0], tag->tag[1], tag->tag[2],
+			  tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], req);
+	} else {
+		_HFI_VDBG("unexpected buf=%p,len=%d,tag=%08x.%08x.%08x"
+			  " tagsel=%08x.%08x.%08x req=%p\n", buf, len,
+			  tag->tag[0], tag->tag[1], tag->tag[2],
+			  tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], req);
+#ifdef PSM_CUDA
+		req->is_buf_gpu_mem = gpu_mem;
+		if (gpu_mem)
+			req->user_gpu_buffer = buf;
+		else
+			req->user_gpu_buffer = NULL;
+#endif
+
+		req->req_data.context = context;
+
+		psm2_mq_irecv_inner(mq, req, buf, len);
+	}
+
+ret:
+	PSMI_UNLOCK(mq->progress_lock);
+	psmi_assert_req_not_internal(req);
+	*reqo = req;
+	PSM2_LOG_MSG("leaving");
+
+	return err;
+}
+PSMI_API_DECL(psm2_mq_irecv2)
+
+psm2_error_t
+__psm2_mq_irecv(psm2_mq_t mq, uint64_t tag, uint64_t tagsel, uint32_t flags,
+	       void *buf, uint32_t len, void *context, psm2_mq_req_t *reqo)
+{
+	psm2_error_t rv;
+	psm2_mq_tag_t rtag;
+	psm2_mq_tag_t rtagsel;
+
+	*reqo = NULL;
+
+	PSM2_LOG_MSG("entering tag: 0x%" PRIx64, tag);
+
+	rtag.tag64 = tag;
+#ifdef PSM_DEBUG
+	rtag.tag[2] = 0;
+#endif
+	rtagsel.tag64 = tagsel;
+	rtagsel.tag[2] = 0;
+	rv = __psm2_mq_irecv2(mq, PSM2_MQ_ANY_ADDR, &rtag, &rtagsel,
+			       flags, buf, len, context, reqo);
+
+	psmi_assert_req_not_internal(*reqo);
+	PSM2_LOG_MSG("leaving");
+
+	return rv;
+}
+PSMI_API_DECL(psm2_mq_irecv)
+
+psm2_error_t
+__psm2_mq_imrecv(psm2_mq_t mq, uint32_t flags, void *buf, uint32_t len,
+		void *context, psm2_mq_req_t *reqo)
+{
+	psm2_error_t err = PSM2_OK;
+	psm2_mq_req_t req = *reqo;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ASSERT_INITIALIZED();
+
+	if (req == PSM2_MQ_REQINVALID) {
+		err = psmi_handle_error(mq->ep, PSM2_PARAM_ERR,
+					"Invalid request (req=%p)", req);
+	} else {
+		/* Message is already matched -- begin delivering message data to the
+		   user's buffer. */
+		req->req_data.context = context;
+
+#ifdef PSM_CUDA
+		if (len && PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(buf)) {
+			psmi_cuda_set_attr_sync_memops(buf);
+			req->is_buf_gpu_mem = 1;
+			req->user_gpu_buffer = buf;
+		} else {
+			req->is_buf_gpu_mem = 0;
+			req->user_gpu_buffer = NULL;
+		}
+#endif
+
+		PSMI_LOCK(mq->progress_lock);
+		psm2_mq_irecv_inner(mq, req, buf, len);
+		PSMI_UNLOCK(mq->progress_lock);
+	}
+
+	PSM2_LOG_MSG("leaving");
+
+	return err;
+}
+PSMI_API_DECL(psm2_mq_imrecv)
+
+/* The status argument can be an instance of either type psm2_mq_status_t or
+ * psm2_mq_status2_t.  Depending on the type, a corresponding status copy
+ * routine should be passed in.
+ */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+psmi_mq_ipeek_inner(psm2_mq_t mq, psm2_mq_req_t *oreq,
+		    void *status,
+		    psmi_mq_status_copy_t status_copy))
+{
+	psm2_mq_req_t req;
+
+	PSMI_ASSERT_INITIALIZED();
+
+	if ((req = mq->completed_q.first) == NULL) {
+		PSMI_LOCK(mq->progress_lock);
+		psmi_poll_internal(mq->ep, 1);
+		if ((req = mq->completed_q.first) == NULL) {
+			PSMI_UNLOCK(mq->progress_lock);
+			return PSM2_MQ_NO_COMPLETIONS;
+		}
+		PSMI_UNLOCK(mq->progress_lock);
+	}
+	/* something in the queue */
+	*oreq = req;
+	if (status != NULL)
+		status_copy(req, status);
+
+	return PSM2_OK;
+}
+
+psm2_error_t
+__psm2_mq_ipeek2(psm2_mq_t mq, psm2_mq_req_t *oreq, psm2_mq_status2_t *status)
+{
+	psm2_error_t rv;
+
+	*oreq = NULL;
+
+	PSM2_LOG_MSG("entering");
+	rv = psmi_mq_ipeek_inner(mq, oreq, status,
+				   (psmi_mq_status_copy_t) mq_status2_copy);
+
+	psmi_assert_req_not_internal(*oreq);
+	PSM2_LOG_MSG("leaving");
+	return rv;
+}
+PSMI_API_DECL(psm2_mq_ipeek2)
+
+psm2_error_t
+__psm2_mq_ipeek(psm2_mq_t mq, psm2_mq_req_t *oreq, psm2_mq_status_t *status)
+{
+	psm2_error_t rv;
+
+	*oreq = NULL;
+	PSM2_LOG_MSG("entering");
+	rv = psmi_mq_ipeek_inner(mq, oreq, status,
+				   (psmi_mq_status_copy_t) mq_status_copy);
+
+	psmi_assert_req_not_internal(*oreq);
+	PSM2_LOG_MSG("leaving");
+	return rv;
+}
+PSMI_API_DECL(psm2_mq_ipeek)
+
+psm2_error_t __psm2_mq_ipeek_dequeue_multi(psm2_mq_t mq, void *status_array,
+		psmi_mq_status_copy_user_t status_copy, int *count)
+{
+	psm2_mq_req_t req;
+	int read_count = *count;
+	int ret = 0;
+
+	PSMI_ASSERT_INITIALIZED();
+
+	*count = 0;
+	while (*count < read_count) {
+		PSMI_LOCK(mq->progress_lock);
+
+		if (mq->completed_q.first == NULL)
+			psmi_poll_internal(mq->ep, 1);
+
+		if ((req = mq->completed_q.first) == NULL) {
+			PSMI_UNLOCK(mq->progress_lock);
+			return PSM2_MQ_NO_COMPLETIONS;
+		}
+
+		mq_qq_remove(&mq->completed_q, req);
+		PSMI_UNLOCK(mq->progress_lock);
+
+		ret = status_copy(&req->req_data, status_array, *count);
+		psm2_mq_req_free(mq, req);
+
+		if (unlikely(ret < 0)) {
+			*count = ret;
+			return PSM2_INTERNAL_ERR;
+		} else if (ret == 0) {
+			continue;
+		}
+
+		*count = *count + 1;
+
+		if (ret > 1)
+			break;
+	}
+	return PSM2_OK;
+}
+PSMI_API_DECL(psm2_mq_ipeek_dequeue_multi)
+
+psm2_error_t __psm2_mq_ipeek_dequeue(psm2_mq_t mq, psm2_mq_req_t *oreq)
+{
+	psm2_mq_req_t req;
+
+	PSMI_ASSERT_INITIALIZED();
+	PSMI_LOCK(mq->progress_lock);
+	if (mq->completed_q.first == NULL)
+		psmi_poll_internal(mq->ep, 1);
+	if ((req = mq->completed_q.first) == NULL) {
+		PSMI_UNLOCK(mq->progress_lock);
+		return PSM2_MQ_NO_COMPLETIONS;
+	}
+	mq_qq_remove(&mq->completed_q, req);
+	PSMI_UNLOCK(mq->progress_lock);
+	*oreq = req;
+	return PSM2_OK;
+}
+PSMI_API_DECL(psm2_mq_ipeek_dequeue)
+
+psm2_error_t __psm2_mq_req_free(psm2_mq_t mq, psm2_mq_req_t req)
+{
+	PSMI_ASSERT_INITIALIZED();
+	if (req == NULL)
+		return PSM2_OK;
+	PSMI_LOCK(mq->progress_lock);
+	psmi_mq_req_free(req);
+	PSMI_UNLOCK(mq->progress_lock);
+
+	return PSM2_OK;
+}
+PSMI_API_DECL(psm2_mq_req_free)
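+
+/* Hypothetical consumer loop pairing the two calls above (a sketch, not
+ * upstream code): drain completions without per-request status copies, then
+ * return each request to its pool:
+ *
+ *   psm2_mq_req_t r;
+ *   while (psm2_mq_ipeek_dequeue(mq, &r) == PSM2_OK) {
+ *       // ... inspect the completed request (tag, peer, lengths) ...
+ *       psm2_mq_req_free(mq, r);
+ *   }
+ */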
+
+static
+psm2_error_t psmi_mqopt_ctl(psm2_mq_t mq, uint32_t key, void *value, int get)
+{
+	psm2_error_t err = PSM2_OK;
+	uint32_t val32;
+
+	switch (key) {
+	case PSM2_MQ_RNDV_HFI_SZ:
+		if (get)
+			*((uint32_t *) value) = mq->hfi_thresh_rv;
+		else {
+			val32 = *((uint32_t *) value);
+			mq->hfi_thresh_rv = val32;
+		}
+		_HFI_VDBG("RNDV_HFI_SZ = %d (%s)\n",
+			  mq->hfi_thresh_rv, get ? "GET" : "SET");
+		break;
+
+	case PSM2_MQ_RNDV_SHM_SZ:
+		if (get)
+			*((uint32_t *) value) = mq->shm_thresh_rv;
+		else {
+			val32 = *((uint32_t *) value);
+			mq->shm_thresh_rv = val32;
+		}
+		_HFI_VDBG("RNDV_SHM_SZ = %d (%s)\n",
+			  mq->shm_thresh_rv, get ? "GET" : "SET");
+		break;
+	case PSM2_MQ_MAX_SYSBUF_MBYTES:
+		/* Deprecated: this option no longer does anything. */
+		break;
+
+	default:
+		err =
+		    psmi_handle_error(NULL, PSM2_PARAM_ERR,
+				      "Unknown option key=%u", key);
+		break;
+	}
+	return err;
+}
+
+psm2_error_t __psm2_mq_getopt(psm2_mq_t mq, int key, void *value)
+{
+	psm2_error_t rv;
+	PSM2_LOG_MSG("entering");
+	PSMI_ERR_UNLESS_INITIALIZED(mq->ep);
+	rv = psmi_mqopt_ctl(mq, key, value, 1);
+	PSM2_LOG_MSG("leaving");
+	return rv;
+}
+PSMI_API_DECL(psm2_mq_getopt)
+
+psm2_error_t __psm2_mq_setopt(psm2_mq_t mq, int key, const void *value)
+{
+	psm2_error_t rv;
+	PSM2_LOG_MSG("entering");
+	PSMI_ERR_UNLESS_INITIALIZED(mq->ep);
+	rv = psmi_mqopt_ctl(mq, key, (void *)value, 0);
+	PSM2_LOG_MSG("leaving");
+	return rv;
+}
+PSMI_API_DECL(psm2_mq_setopt)
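+
+/* Hedged example: adjusting the NIC eager-to-rendezvous switchover at
+ * runtime through the option keys handled by psmi_mqopt_ctl() above:
+ *
+ *   uint32_t thresh = 256 * 1024;                       // example value
+ *   psm2_mq_setopt(mq, PSM2_MQ_RNDV_HFI_SZ, &thresh);   // set
+ *   psm2_mq_getopt(mq, PSM2_MQ_RNDV_HFI_SZ, &thresh);   // read back
+ */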
+
+#define TAB_SIZE 16
+#define STATS				\
+	STAT(rx_user_bytes)		\
+	STAT(rx_user_num)		\
+	STAT(rx_sys_bytes)		\
+	STAT(rx_sys_num)		\
+	STAT(tx_num)			\
+	STAT(tx_eager_num)		\
+	STAT(tx_eager_bytes)		\
+	STAT(tx_rndv_num)		\
+	STAT(tx_rndv_bytes)		\
+	STAT(tx_shm_num)		\
+	STAT(rx_shm_num)		\
+	STAT(rx_sysbuf_num)		\
+	STAT(rx_sysbuf_bytes)		\
+	STAT(comm_world_rank)
+
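+/* The STATS X-macro above is expanded twice below with different STAT()
+ * definitions: once in psmi_mq_print_stats_thread() to emit a header row of
+ * field names, and once per sampling interval in psmi_mq_print_stats() to
+ * emit the matching values, e.g. STAT(tx_num) becomes one fixed-width
+ * column for stats.tx_num. */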
+static
+void
+psmi_mq_print_stats(psm2_mq_t mq, FILE *perf_stats_fd)
+{
+	psm2_mq_stats_t stats;
+	char msg_buffer[MSG_BUFFER_LEN];
+
+	psm2_mq_get_stats(mq, &stats);
+
+#define STAT(x) \
+	snprintf(msg_buffer, MSG_BUFFER_LEN, "%*lu",TAB_SIZE, stats.x); \
+	fwrite(msg_buffer, sizeof(char), strlen(msg_buffer), perf_stats_fd);
+
+	STATS
+
+#undef STAT
+
+	fwrite("\n", sizeof(char), 1, perf_stats_fd);
+}
+
+
+static
+void
+*psmi_mq_print_stats_thread(void *_mq)
+{
+	psm2_mq_t mq = (psm2_mq_t)_mq;
+	char perf_file_name[MSG_BUFFER_LEN];
+	char msg_buffer[MSG_BUFFER_LEN];
+	int delta_t = 0;
+
+	snprintf(perf_file_name, MSG_BUFFER_LEN, "./psm3-perf-stat-ep-0x%" PRIx64 "-pid-%d",
+			(uint64_t)(mq->ep->epid),
+			getpid());
+	FILE *perf_stats_fd = fopen(perf_file_name, "w+");
+
+	if (!perf_stats_fd)
+	{
+		_HFI_ERROR("Failed to create fd for performance logging\n");
+		goto end;
+	}
+
+#define STAT(x) \
+	snprintf(msg_buffer, MSG_BUFFER_LEN, "%*s",TAB_SIZE, #x);\
+	fwrite(msg_buffer, sizeof(char), strlen(msg_buffer), perf_stats_fd);
+
+	STAT(delta_t)
+	STATS
+
+#undef STAT
+
+	fwrite("\n", sizeof(char), 1, perf_stats_fd);
+
+	/* Performance stats will be printed every $PSM3_MQ_PRINT_STATS seconds */
+	do {
+		snprintf(msg_buffer, MSG_BUFFER_LEN, "%*d",TAB_SIZE, delta_t);
+		fwrite(msg_buffer, sizeof(char), strlen(msg_buffer), perf_stats_fd);
+		psmi_mq_print_stats(mq, perf_stats_fd);
+		fflush(perf_stats_fd);
+		usleep(MICRO_SEC * mq->print_stats);
+		delta_t += mq->print_stats;
+	} while (mq->mq_perf_data.perf_print_stats);
+
+	fclose(perf_stats_fd);
+end:
+	pthread_exit(NULL);
+}
+
+static
+void
+psmi_mq_print_stats_init(psm2_mq_t mq)
+{
+	mq->mq_perf_data.perf_print_stats = 1;
+	if (pthread_create(&(mq->mq_perf_data.perf_print_thread), NULL,
+				psmi_mq_print_stats_thread, (void*)mq))
+	{
+		mq->mq_perf_data.perf_print_stats = 0;
+		_HFI_ERROR("Failed to create logging thread\n");
+	}
+}
+
+static
+void
+psmi_mq_print_stats_finalize(psm2_mq_t mq)
+{
+	if (mq->mq_perf_data.perf_print_stats)
+	{
+		mq->mq_perf_data.perf_print_stats = 0;
+		pthread_join(mq->mq_perf_data.perf_print_thread, NULL);
+	}
+}
+
+/*
+ * This is the API for the user.  We actually allocate the MQ much earlier,
+ * but the user can set options after obtaining an endpoint.
+ */
+psm2_error_t
+__psm2_mq_init(psm2_ep_t ep, uint64_t ignored,
+	      const struct psm2_optkey *opts, int numopts, psm2_mq_t *mqo)
+{
+	psm2_error_t err = PSM2_OK;
+
+	if (ep == NULL) {
+		err = PSM2_PARAM_ERR;
+		goto fail;
+	}
+
+	psm2_mq_t mq = ep->mq;
+	int i;
+
+	PSM2_LOG_MSG("entering");
+
+	PSMI_ERR_UNLESS_INITIALIZED(ep);
+
+	psmi_assert_always(mq != NULL);
+	psmi_assert_always(mq->ep != NULL);
+
+	mq->stats.comm_world_rank = hfi_get_myrank();
+
+	/* Process options */
+	for (i = 0; err == PSM2_OK && i < numopts; i++)
+		err = psmi_mqopt_ctl(mq, opts[i].key, opts[i].value, 0);
+	if (err != PSM2_OK)	/* error already handled */
+		goto fail;
+
+	/* Initialize the unexpected system buffer allocator */
+	psmi_mq_sysbuf_init(mq);
+	char buf[128];
+	psmi_mq_sysbuf_getinfo(mq, buf, sizeof buf);
+	_HFI_VDBG("%s", buf);
+
+	*mqo = mq;
+
+	if (mq->print_stats > 0)
+		psmi_mq_print_stats_init(mq);
+
+fail:
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_mq_init)
+
+psm2_error_t __psm2_mq_finalize(psm2_mq_t mq)
+{
+	psm2_error_t rv = PSM2_OK;
+
+	PSM2_LOG_MSG("entering");
+
+	PSMI_ERR_UNLESS_INITIALIZED(mq->ep);
+
+	if (mq->print_stats == -1)
+	{
+		mq->print_stats = 1;
+		psmi_mq_print_stats_init(mq);
+	}
+	if (mq->print_stats != 0)
+		psmi_mq_print_stats_finalize(mq);
+
+	PSM2_LOG_MSG("leaving");
+	return rv;
+}
+PSMI_API_DECL(psm2_mq_finalize)
+
+void __psm2_mq_get_stats(psm2_mq_t mq, psm2_mq_stats_t *stats)
+{
+	PSM2_LOG_MSG("entering");
+	memcpy(stats, &mq->stats, sizeof(psm2_mq_stats_t));
+	PSM2_LOG_MSG("leaving");
+}
+PSMI_API_DECL(psm2_mq_get_stats)
+
+psm2_error_t psmi_mq_initstats(psm2_mq_t mq, psm2_epid_t epid)
+{
+	 struct psmi_stats_entry entries[] = {
+		PSMI_STATS_DECL("COMM_WORLD_Rank",
+					MPSPAWN_STATS_REDUCTION_ALL, NULL,
+					&mq->stats.comm_world_rank),
+		PSMI_STATS_DECLU64("Total_count_sent", &mq->stats.tx_num),
+		PSMI_STATS_DECLU64("Eager_count_sent", &mq->stats.tx_eager_num),
+		PSMI_STATS_DECLU64("Eager_bytes_sent", &mq->stats.tx_eager_bytes),
+		PSMI_STATS_DECLU64("Rendezvous_count_sent", &mq->stats.tx_rndv_num),
+		PSMI_STATS_DECLU64("Rendezvous_bytes_sent", &mq->stats.tx_rndv_bytes),
+		PSMI_STATS_DECLU64("Expected_count_recv", &mq->stats.rx_user_num),
+		PSMI_STATS_DECLU64("Expected_bytes_recv", &mq->stats.rx_user_bytes),
+		PSMI_STATS_DECLU64("Unexpected_count_recv", &mq->stats.rx_sys_num),
+		PSMI_STATS_DECLU64("Unexpected_bytes_recv", &mq->stats.rx_sys_bytes),
+		PSMI_STATS_DECLU64("shm_count_sent", &mq->stats.tx_shm_num),
+		PSMI_STATS_DECLU64("shm_bytes_sent", &mq->stats.tx_shm_bytes),
+		PSMI_STATS_DECLU64("shm_count_recv", &mq->stats.rx_shm_num),
+		PSMI_STATS_DECLU64("shm_bytes_recv", &mq->stats.rx_shm_bytes),
+		PSMI_STATS_DECLU64("sysbuf_count_recv", &mq->stats.rx_sysbuf_num),
+		PSMI_STATS_DECLU64("sysbuf_bytes_recv", &mq->stats.rx_sysbuf_bytes),
+#ifdef PSM_CUDA
+		PSMI_STATS_DECLU64("Eager_cpu_count_sent", &mq->stats.tx_eager_cpu_num),
+		PSMI_STATS_DECLU64("Eager_cpu_bytes_sent", &mq->stats.tx_eager_cpu_bytes),
+		PSMI_STATS_DECLU64("Eager_gpu_count_sent", &mq->stats.tx_eager_gpu_num),
+		PSMI_STATS_DECLU64("Eager_gpu_bytes_sent", &mq->stats.tx_eager_gpu_bytes),
+		PSMI_STATS_DECLU64("sysbuf_cpu_count_recv", &mq->stats.rx_sysbuf_cpu_num),
+		PSMI_STATS_DECLU64("sysbuf_cpu_bytes_recv", &mq->stats.rx_sysbuf_cpu_bytes),
+		PSMI_STATS_DECLU64("sysbuf_gdrcopy_count_recv", &mq->stats.rx_sysbuf_gdrcopy_num),
+		PSMI_STATS_DECLU64("sysbuf_gdrcopy_bytes_recv", &mq->stats.rx_sysbuf_gdrcopy_bytes),
+		PSMI_STATS_DECLU64("sysbuf_cuCopy_count_recv", &mq->stats.rx_sysbuf_cuCopy_num),
+		PSMI_STATS_DECLU64("sysbuf_cuCopy_bytes_recv", &mq->stats.rx_sysbuf_cuCopy_bytes),
+#endif
+	};
+
+	return psmi_stats_register_type("MPI_Statistics_Summary",
+					PSMI_STATSTYPE_MQ,
+					entries,
+					PSMI_STATS_HOWMANY(entries),
+					epid, mq, NULL);
+}
+
+psm2_error_t psmi_mq_malloc(psm2_mq_t *mqo)
+{
+	psm2_error_t err = PSM2_OK;
+
+	psm2_mq_t mq =
+	    (psm2_mq_t) psmi_calloc(NULL, UNDEFINED, 1, sizeof(struct psm2_mq));
+	if (mq == NULL) {
+		err = psmi_handle_error(NULL, PSM2_NO_MEMORY,
+					"Couldn't allocate memory for mq endpoint");
+		goto fail;
+	}
+
+	mq->ep = NULL;
+	/*mq->unexpected_callback = NULL; */
+	mq->memmode = psmi_parse_memmode();
+
+	memset(mq->unexpected_htab, 0,
+	       NUM_HASH_CONFIGS * NUM_HASH_BUCKETS * sizeof(struct mqq));
+	memset(mq->expected_htab, 0,
+	       NUM_HASH_CONFIGS * NUM_HASH_BUCKETS * sizeof(struct mqq));
+	memset(&mq->expected_q, 0, sizeof(struct mqq));
+	memset(&mq->unexpected_q, 0, sizeof(struct mqq));
+	memset(&mq->completed_q, 0, sizeof(struct mqq));
+	memset(&mq->outoforder_q, 0, sizeof(struct mqq));
+	STAILQ_INIT(&mq->eager_q);
+
+
+	/* The values are overwritten in initialize_defaults; they're just set
+	 * to sensible defaults until then. */
+	if(psmi_cpu_model == CPUID_MODEL_PHI_GEN2 || psmi_cpu_model == CPUID_MODEL_PHI_GEN2M)
+	{
+		mq->hfi_thresh_rv = MQ_HFI_THRESH_RNDV_PHI2;
+		mq->hfi_base_window_rv = MQ_HFI_WINDOW_RNDV_PHI2;
+	} else {
+		mq->hfi_thresh_rv = MQ_HFI_THRESH_RNDV_XEON;
+		mq->hfi_base_window_rv = MQ_HFI_WINDOW_RNDV_XEON;
+	}
+	if (! (psmi_parse_rdmamode() & IPS_PROTOEXP_FLAG_ENABLED)) {
+		// TBD - when RDMA is disabled do we want to disable rendezvous?
+		// even without RDMA, the receiver controlled pacing helps scalability
+		mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous
+	}
+	mq->hfi_thresh_tiny = MQ_HFI_THRESH_TINY;
+#ifdef PSM_CUDA
+	if (PSMI_IS_CUDA_ENABLED)
+		mq->hfi_base_window_rv = MQ_HFI_WINDOW_RNDV_CUDA;
+#endif
+	mq->shm_thresh_rv = MQ_SHM_THRESH_RNDV;
+
+	memset(&mq->stats, 0, sizeof(psm2_mq_stats_t));
+	err = psmi_mq_req_init(mq);
+	if (err)
+		goto fail;
+
+	*mqo = mq;
+
+	return PSM2_OK;
+fail:
+	if (mq != NULL)
+		psmi_free(mq);
+	return err;
+}
+
+psm2_error_t psmi_mq_initialize_defaults(psm2_mq_t mq)
+{
+	union psmi_envvar_val env_hfitiny, env_rvwin, env_hfirv,
+		env_shmrv, env_stats;
+
+	psmi_getenv("PSM3_MQ_TINY_NIC_LIMIT",
+		    "NIC tiny packet limit (max 8, default 8)",
+		    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)mq->hfi_thresh_tiny, &env_hfitiny);
+	mq->hfi_thresh_tiny = min(env_hfitiny.e_uint, 8);
+
+	psmi_getenv("PSM3_MQ_RNDV_NIC_THRESH",
+		    "NIC eager-to-rendezvous switchover",
+		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)mq->hfi_thresh_rv, &env_hfirv);
+	mq->hfi_thresh_rv = env_hfirv.e_uint;
+
+	psmi_getenv("PSM3_MQ_RNDV_NIC_WINDOW",
+		    "NIC rendezvous window size, max 4M",
+		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)mq->hfi_base_window_rv, &env_rvwin);
+	mq->hfi_base_window_rv = min(4 * 1024 * 1024, env_rvwin.e_uint);
+
+	/* Re-evaluate this since it may have changed after initializing the shm
+	 * device */
+	mq->shm_thresh_rv = psmi_shm_mq_rv_thresh;
+	psmi_getenv("PSM3_MQ_RNDV_SHM_THRESH",
+		    "shm eager-to-rendezvous switchover",
+		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)mq->shm_thresh_rv, &env_shmrv);
+	mq->shm_thresh_rv = env_shmrv.e_uint;
+
+	psmi_getenv("PSM3_MQ_PRINT_STATS",
+		    "Prints MQ performance stats every n seconds to file "
+			"./psm3-perf-stat-ep-[epid]-pid-[pid] when set to -1 stats are "
+			"printed only once during finalization",
+		    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val) 0, &env_stats);
+	mq->print_stats = env_stats.e_uint;
+
+	mq->nohash_fastpath = 1;
+	return PSM2_OK;
+}
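+
+/* Example (environment, not code): the knobs read above can be set per run,
+ * e.g. PSM3_MQ_RNDV_NIC_THRESH=65536 switches to rendezvous at 64 KiB and
+ * PSM3_MQ_PRINT_STATS=5 dumps MQ stats every 5 seconds. */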
+
+psm2_error_t MOCKABLE(psmi_mq_free)(psm2_mq_t mq)
+{
+	psmi_mq_req_fini(mq);
+	psmi_mq_sysbuf_fini(mq);
+	psmi_stats_deregister_type(PSMI_STATSTYPE_MQ, mq);
+	psmi_free(mq);
+	return PSM2_OK;
+}
+MOCK_DEF_EPILOGUE(psmi_mq_free);
diff --git a/deps/libfabric/prov/psm3/psm3/psm_mq_internal.h b/deps/libfabric/prov/psm3/psm3/psm_mq_internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..e3d246b696d7e70d280aa0d61fa3ffa899624fa1
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_mq_internal.h
@@ -0,0 +1,637 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef MQ_INT_H
+#define MQ_INT_H
+
+/* Ugh. smmintrin.h eventually includes mm_malloc.h, which calls malloc */
+#ifdef malloc
+#undef malloc
+#endif
+#ifdef free
+#undef free
+#endif
+#include <smmintrin.h>
+#include "psm_user.h"
+#include "psm_sysbuf.h"
+
+#include "psm2_mock_testing.h"
+
+#if 0
+typedef psm2_error_t(*psm_mq_unexpected_callback_fn_t)
+	(psm2_mq_t mq, uint16_t mode, psm2_epaddr_t epaddr,
+	 uint64_t tag, uint32_t send_msglen, const void *payload,
+	 uint32_t paylen);
+#endif
+
+#define MICRO_SEC 1000000
+#define MSG_BUFFER_LEN 100
+
+struct psm2_mq_perf_data
+{
+	pthread_t perf_print_thread;
+	int perf_print_stats;
+};
+
+enum psm2_mq_tag_pattern {
+	PSM2_TAG_SRC = 0,
+	PSM2_TAG_ANYSRC,
+	PSM2_ANYTAG_SRC,
+	PSM2_ANYTAG_ANYSRC,
+};
+
+struct psm2_mq {
+	psm2_ep_t ep;		/**> ep back pointer */
+	mpool_t sreq_pool;
+	mpool_t rreq_pool;
+
+	struct mqq unexpected_htab[NUM_HASH_CONFIGS][NUM_HASH_BUCKETS];
+	struct mqq expected_htab[NUM_HASH_CONFIGS][NUM_HASH_BUCKETS];
+
+	/* in case the compiler can't figure out how to preserve the hashed values
+	between mq_req_match() and mq_add_to_unexpected_hashes() ... */
+	unsigned hashvals[NUM_HASH_CONFIGS];
+
+	/*psm_mq_unexpected_callback_fn_t unexpected_callback; */
+	struct mqq expected_q;		/**> Preposted (expected) queue */
+	struct mqq unexpected_q;	/**> Unexpected queue */
+	struct mqq completed_q;		/**> Completed queue */
+
+	struct mqq outoforder_q;	/**> OutofOrder queue */
+	STAILQ_HEAD(, psm2_mq_req) eager_q; /**> eager request queue */
+
+	uint32_t hfi_thresh_tiny;
+	uint32_t hfi_thresh_rv;
+	uint32_t shm_thresh_rv;
+	uint32_t hfi_base_window_rv;	/**> this is a base rndv window size,
+					     will be further trimmed down per-connection based
+					     on the peer's MTU */
+	int memmode;
+
+	uint64_t timestamp;
+
+	psm2_mq_stats_t stats;	/**> MQ stats, accumulated by each PTL */
+
+	int print_stats;
+	struct psm2_mq_perf_data mq_perf_data;
+
+	int nohash_fastpath;
+	unsigned unexpected_hash_len;
+	unsigned unexpected_list_len;
+	unsigned expected_hash_len;
+	unsigned expected_list_len;
+
+	psmi_mem_ctrl_t handler_index[MM_NUM_OF_POOLS];
+	int mem_ctrl_is_init;
+	uint64_t mem_ctrl_total_bytes;
+
+	psmi_lock_t progress_lock;
+};
+
+#define MQE_TYPE_IS_SEND(type)	((type) & MQE_TYPE_SEND)
+#define MQE_TYPE_IS_RECV(type)	((type) & MQE_TYPE_RECV)
+
+#define MQE_TYPE_SEND		0x1000
+#define MQE_TYPE_RECV		0x2000
+#define MQE_TYPE_FLAGMASK	0x0fff
+#define MQE_TYPE_WAITING	0x0001
+#define MQE_TYPE_WAITING_PEER	0x0004
+#define MQE_TYPE_EAGER_QUEUE	0x0008
+
+#define MQ_STATE_COMPLETE	0
+#define MQ_STATE_POSTED		1
+#define MQ_STATE_MATCHED	2
+#define MQ_STATE_UNEXP		3
+#define MQ_STATE_UNEXP_RV	4
+#define MQ_STATE_FREE		5
+
+/*
+ * These must match the ips protocol message opcode.
+ */
+#define MQ_MSG_TINY		0xc1
+#define MQ_MSG_SHORT		0xc2
+#define MQ_MSG_EAGER		0xc3
+#define MQ_MSG_LONGRTS		0xc4
+
+/*
+ * Descriptor allocation limits.
+ * The 'LIMITS' predefines fill in a psmi_rlimits_mpool structure
+ */
+#define MQ_SENDREQ_LIMITS {					\
+	    .env = "PSM3_MQ_SENDREQS_MAX",			\
+	    .descr = "Max num of isend requests in flight",	\
+	    .env_level = PSMI_ENVVAR_LEVEL_USER,		\
+	    .minval = 1,					\
+	    .maxval = ~0,					\
+	    .mode[PSMI_MEMMODE_NORMAL]  = { 1024, 1048576 },	\
+	    .mode[PSMI_MEMMODE_MINIMAL] = { 1024, 65536 },	\
+	    .mode[PSMI_MEMMODE_LARGE]   = { 8192, 16777216 }	\
+	}
+
+#define MQ_RECVREQ_LIMITS {					\
+	    .env = "PSM3_MQ_RECVREQS_MAX",			\
+	    .descr = "Max num of irecv requests in flight",	\
+	    .env_level = PSMI_ENVVAR_LEVEL_USER,		\
+	    .minval = 1,					\
+	    .maxval = ~0,					\
+	    .mode[PSMI_MEMMODE_NORMAL]  = { 1024, 1048576 },	\
+	    .mode[PSMI_MEMMODE_MINIMAL] = { 1024, 65536 },	\
+	    .mode[PSMI_MEMMODE_LARGE]   = { 8192, 16777216 }	\
+	}
+
+typedef psm2_error_t(*mq_rts_callback_fn_t) (psm2_mq_req_t req, int was_posted);
+typedef psm2_error_t(*mq_testwait_callback_fn_t) (psm2_mq_req_t *req);
+
+
+/* If a request is marked as internal, it will not be exposed to the user
+   and will not be added to mq->completed_q.  This flag is set if the
+   request is used internally, e.g. by MPI_SEND */
+#define PSMI_REQ_FLAG_IS_INTERNAL (1 << 0)
+/* Identifies req as part of fast path. */
+#define PSMI_REQ_FLAG_FASTPATH    (1 << 1)
+/* Identifies req as a NORMAL operation with no special cases.*/
+#define PSMI_REQ_FLAG_NORMAL      0
+
+#define psmi_is_req_internal(req) ((req)->flags_internal & PSMI_REQ_FLAG_IS_INTERNAL)
+
+#define psmi_assert_req_not_internal(req) psmi_assert(((req) == PSM2_MQ_REQINVALID) || \
+							(!psmi_is_req_internal(req)))
+
+/* receive mq_req, the default */
+struct psm2_mq_req {
+	struct psm2_mq_req_user req_data;
+
+	struct {
+		psm2_mq_req_t next[NUM_MQ_SUBLISTS];
+		psm2_mq_req_t prev[NUM_MQ_SUBLISTS];
+		STAILQ_ENTRY(psm2_mq_req) nextq; /* used for eager only */
+	};
+	struct mqq *q[NUM_MQ_SUBLISTS];
+	uint64_t timestamp;
+	uint32_t state;
+	uint32_t type;
+	psm2_mq_t mq;
+
+	/* Some PTLs want to get notified when there's a test/wait event */
+	mq_testwait_callback_fn_t testwait_callback;
+
+	uint16_t msg_seqnum;	/* msg seq num for mctxt */
+	uint32_t recv_msgoff;	/* Message offset into req_data.buf */
+	union {
+		uint32_t send_msgoff;	/* Bytes received so far.. can be larger than buf_len */
+		uint32_t recv_msgposted;
+	};
+	uint32_t rts_reqidx_peer;
+
+	uint32_t flags_user;
+	uint32_t flags_internal;
+
+	/* Used to keep track of unexpected rendezvous */
+	mq_rts_callback_fn_t rts_callback;
+	psm2_epaddr_t rts_peer;
+	uintptr_t rts_sbuf;
+
+	psm2_verbs_mr_t	mr;	// local registered memory for app buffer
+
+#ifdef PSM_CUDA
+	uint8_t* user_gpu_buffer;	/* for recv */
+	STAILQ_HEAD(sendreq_spec_, ips_cuda_hostbuf) sendreq_prefetch;
+	uint32_t prefetch_send_msgoff;
+	int cuda_hostbuf_used;
+	CUipcMemHandle cuda_ipc_handle;
+	uint8_t cuda_ipc_handle_attached;
+	uint32_t cuda_ipc_offset;
+	/*
+	 * is_sendbuf_gpu_mem - Used to always select TID path on the receiver
+	 * when send is on a device buffer
+	 */
+	uint8_t is_sendbuf_gpu_mem;
+	/*
+	 * is_buf_gpu_mem - used to indicate if the send or receive is issued
+	 * on a device/host buffer.
+	 */
+	uint8_t is_buf_gpu_mem;
+#endif
+
+	/* PTLs get to store their own per-request data.  MQ manages the allocation
+	 * by allocating psm2_mq_req so that ptl_req_data has enough space for all
+	 * possible PTLs.
+	 */
+	union {
+		void *ptl_req_ptr;	/* when used by ptl as pointer */
+		uint8_t ptl_req_data[0];	/* when used by ptl for "inline" data */
+	};
+};
+
+PSMI_ALWAYS_INLINE(
+unsigned
+hash_64(uint64_t a))
+{
+	return _mm_crc32_u64(0, a);
+}
+PSMI_ALWAYS_INLINE(
+unsigned
+hash_32(uint32_t a))
+{
+	return _mm_crc32_u32(0, a);
+}
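+
+/* Both hashes use the SSE4.2 CRC32 instruction (hence the smmintrin.h
+ * include above); callers reduce the result to a bucket index with
+ * "% NUM_HASH_BUCKETS". */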
+
+void MOCKABLE(psmi_mq_mtucpy)(void *vdest, const void *vsrc, uint32_t nchars);
+MOCK_DCL_EPILOGUE(psmi_mq_mtucpy);
+void psmi_mq_mtucpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars);
+
+#if defined(__x86_64__)
+void psmi_mq_mtucpy_safe(void *vdest, const void *vsrc, uint32_t nchars);
+#else
+#define psmi_mq_mtucpy_safe psmi_mq_mtucpy
+#endif
+
+/*
+ * Optimize for 0-8 byte case, but also handle others.
+ */
+PSMI_ALWAYS_INLINE(
+void
+mq_copy_tiny(uint32_t *dest, uint32_t *src, uint8_t len))
+{
+#ifdef PSM_CUDA
+	if (len && PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(dest) || PSMI_IS_CUDA_MEM(src))) {
+		PSMI_CUDA_CALL(cuMemcpy, (CUdeviceptr)dest, (CUdeviceptr)src, len);
+		return;
+	}
+#endif
+	switch (len) {
+	case 8: *dest++ = *src++;
+	/* fall through */
+	case 4: *dest++ = *src++;
+	/* fall through */
+	case 0:
+		return;
+	case 7:
+	case 6:
+	case 5:
+		*dest++ = *src++;
+		len -= 4;
+	/* fall through */
+	case 3:
+	case 2:
+	case 1:
+		break;
+	default:		/* greater than 8 */
+		psmi_mq_mtucpy(dest, src, len);
+		return;
+	}
+	uint8_t *dest1 = (uint8_t *) dest;
+	uint8_t *src1 = (uint8_t *) src;
+	switch (len) {
+	case 3: *dest1++ = *src1++;
+	/* fall through */
+	case 2: *dest1++ = *src1++;
+	/* fall through */
+	case 1: *dest1++ = *src1++;
+	}
+}
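+
+/* Worked example: len == 6 takes the 5..7 arm of the first switch (one
+ * 4-byte word copy, len -= 4), then the trailing byte switch copies the
+ * remaining 2 bytes. */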
+
+typedef void (*psmi_mtucpy_fn_t)(void *dest, const void *src, uint32_t len);
+typedef void (*psmi_copy_tiny_fn_t)(uint32_t *dest, uint32_t *src, uint8_t len);
+#ifdef PSM_CUDA
+
+PSMI_ALWAYS_INLINE(
+void
+mq_copy_tiny_host_mem(uint32_t *dest, uint32_t *src, uint8_t len))
+{
+	switch (len) {
+	case 8: *dest++ = *src++;
+	/* fall through */
+	case 4: *dest++ = *src++;
+	/* fall through */
+	case 0:
+		return;
+	case 7:
+	case 6:
+	case 5:
+		*dest++ = *src++;
+		len -= 4;
+	/* fall through */
+	case 3:
+	case 2:
+	case 1:
+		break;
+	default:		/* greater than 8 */
+		psmi_mq_mtucpy(dest, src, len);
+		return;
+	}
+	uint8_t *dest1 = (uint8_t *) dest;
+	uint8_t *src1 = (uint8_t *) src;
+	switch (len) {
+	case 3: *dest1++ = *src1++;
+	/* fall through */
+	case 2: *dest1++ = *src1++;
+	/* fall through */
+	case 1: *dest1++ = *src1++;
+	}
+}
+#endif
+
+/* Typedef describing a function to populate a psm2_mq_status(2)_t given a
+ * matched request.  The purpose of this typedef is to avoid duplicating
+ * code to handle both PSM v1 and v2 status objects.  Outer routines pass in
+ * either mq_status_copy or mq_status2_copy and the inner routine calls that
+ * provided routine to fill in the correct status type.
+ */
+typedef void (*psmi_mq_status_copy_t) (psm2_mq_req_t req, void *status);
+
+/*
+ * Given a req with buffer ubuf of length ubuf_len,
+ * fill in the req's status and return the number of bytes the request
+ * can receive.
+ *
+ * The function records truncation errors in the status, basically what
+ * MPI_Status does.
+ */
+PSMI_ALWAYS_INLINE(
+void
+mq_status_copy(psm2_mq_req_t req, psm2_mq_status_t *status))
+{
+	status->msg_tag = req->req_data.tag.tag64;
+	status->msg_length = req->req_data.send_msglen;
+	status->nbytes = req->req_data.recv_msglen;
+	status->error_code = (psm2_error_t)req->req_data.error_code;
+	status->context = req->req_data.context;
+}
+
+PSMI_ALWAYS_INLINE(
+void
+mq_status2_copy(psm2_mq_req_t req, psm2_mq_status2_t *status))
+{
+	status->msg_peer = req->req_data.peer;
+	status->msg_tag = req->req_data.tag;
+	status->msg_length = req->req_data.send_msglen;
+	status->nbytes = req->req_data.recv_msglen;
+	status->error_code = (psm2_error_t)req->req_data.error_code;
+	status->context = req->req_data.context;
+}
+
+PSMI_ALWAYS_INLINE(
+uint32_t
+mq_set_msglen(psm2_mq_req_t req, uint32_t recvlen, uint32_t sendlen))
+{
+	req->req_data.send_msglen = sendlen;
+	if (recvlen < sendlen) {
+		req->req_data.recv_msglen = recvlen;
+		req->req_data.error_code = PSM2_MQ_TRUNCATION;
+		return recvlen;
+	} else {
+		req->req_data.recv_msglen = sendlen;
+		req->req_data.error_code = PSM2_OK;
+		return sendlen;
+	}
+}
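+
+/* Worked example: a 64-byte send matched to a 32-byte receive buffer yields
+ * send_msglen = 64, recv_msglen = 32, error_code = PSM2_MQ_TRUNCATION, and a
+ * return value of 32 (the number of bytes that can actually be received). */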
+
+PSMI_ALWAYS_INLINE(
+int
+min_timestamp_4(psm2_mq_req_t *match))
+{
+	uint64_t oldest = -1;
+	int which = -1, i;
+	for (i = 0; i < 4; i++) {
+		if (match[i] && (match[i]->timestamp < oldest)) {
+			oldest = match[i]->timestamp;
+			which = i;
+		}
+	}
+	return which;
+}
+
+#ifndef PSM_DEBUG
+/*! Append to Queue */
+PSMI_ALWAYS_INLINE(void mq_qq_append(struct mqq *q, psm2_mq_req_t req))
+{
+	req->next[PSM2_ANYTAG_ANYSRC] = NULL;
+	req->prev[PSM2_ANYTAG_ANYSRC] = q->last;
+	if (q->last)
+		q->last->next[PSM2_ANYTAG_ANYSRC] = req;
+	else
+		q->first = req;
+	q->last = req;
+	req->q[PSM2_ANYTAG_ANYSRC] = q;
+}
+#else
+#define mq_qq_append(qq, req)						\
+	do {								\
+		psmi_assert_req_not_internal(req);			\
+		(req)->next[PSM2_ANYTAG_ANYSRC] = NULL;			\
+		(req)->prev[PSM2_ANYTAG_ANYSRC] = (qq)->last;		\
+		if ((qq)->last)						\
+			(qq)->last->next[PSM2_ANYTAG_ANYSRC] = (req);	\
+		else							\
+			(qq)->first = (req);				\
+		(qq)->last = (req);					\
+		(req)->q[PSM2_ANYTAG_ANYSRC] = (qq);			\
+		if (qq == &(req)->mq->completed_q)			\
+			_HFI_VDBG("Moving (req)=%p to completed queue on %s, %d\n", \
+				  (req), __FILE__, __LINE__);		\
+	} while (0)
+#endif
+PSMI_ALWAYS_INLINE(
+void mq_qq_append_which(struct mqq q[NUM_HASH_CONFIGS][NUM_HASH_BUCKETS],
+			int table, int bucket, psm2_mq_req_t req))
+{
+	req->next[table] = NULL;
+	req->prev[table] = q[table][bucket].last;
+	if (q[table][bucket].last)
+		q[table][bucket].last->next[table] = req;
+	else
+		q[table][bucket].first = req;
+	q[table][bucket].last = req;
+	req->q[table] = &q[table][bucket];
+}
+PSMI_ALWAYS_INLINE(void mq_qq_remove(struct mqq *q, psm2_mq_req_t req))
+{
+	if (req->next[PSM2_ANYTAG_ANYSRC] != NULL)
+		req->next[PSM2_ANYTAG_ANYSRC]->prev[PSM2_ANYTAG_ANYSRC] =
+			req->prev[PSM2_ANYTAG_ANYSRC];
+	else
+		q->last = req->prev[PSM2_ANYTAG_ANYSRC];
+	if (req->prev[PSM2_ANYTAG_ANYSRC])
+		req->prev[PSM2_ANYTAG_ANYSRC]->next[PSM2_ANYTAG_ANYSRC] =
+			req->next[PSM2_ANYTAG_ANYSRC];
+	else
+		q->first = req->next[PSM2_ANYTAG_ANYSRC];
+}
+PSMI_ALWAYS_INLINE(void mq_qq_remove_which(psm2_mq_req_t req, int table))
+{
+	struct mqq *q = req->q[table];
+
+	req->q[table] = NULL;
+	if (req->next[table] != NULL)
+		req->next[table]->prev[table] = req->prev[table];
+	else
+		q->last = req->prev[table];
+	if (req->prev[table])
+		req->prev[table]->next[table] = req->next[table];
+	else
+		q->first = req->next[table];
+}
+
+psm2_error_t psmi_mq_req_init(psm2_mq_t mq);
+psm2_error_t psmi_mq_req_fini(psm2_mq_t mq);
+psm2_mq_req_t MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type);
+MOCK_DCL_EPILOGUE(psmi_mq_req_alloc);
+#define      psmi_mq_req_free(req)  psmi_mpool_put(req)
+
+/*
+ * Main receive progress engine, for shmops and hfi, in mq.c
+ */
+psm2_error_t psmi_mq_malloc(psm2_mq_t *mqo);
+psm2_error_t psmi_mq_initialize_defaults(psm2_mq_t mq);
+psm2_error_t psmi_mq_initstats(psm2_mq_t mq, psm2_epid_t epid);
+
+psm2_error_t MOCKABLE(psmi_mq_free)(psm2_mq_t mq);
+MOCK_DCL_EPILOGUE(psmi_mq_free);
+
+/* Three functions that handle all MQ stuff */
+#define MQ_RET_MATCH_OK	0
+#define MQ_RET_UNEXP_OK 1
+#define MQ_RET_UNEXP_NO_RESOURCES 2
+#define MQ_RET_DATA_OK 3
+#define MQ_RET_DATA_OUT_OF_ORDER 4
+
+void psmi_mq_handle_rts_complete(psm2_mq_req_t req);
+int psmi_mq_handle_data(psm2_mq_t mq, psm2_mq_req_t req,
+			uint32_t offset, const void *payload, uint32_t paylen
+#ifdef PSM_CUDA
+			, int use_gdrcopy,
+			psm2_ep_t ep
+#endif
+			);
+int psmi_mq_handle_rts(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
+		       struct ptl_strategy_stats *stats,
+		       uint32_t msglen, const void *payload, uint32_t paylen,
+		       int msgorder, mq_rts_callback_fn_t cb,
+		       psm2_mq_req_t *req_o);
+int psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
+			    struct ptl_strategy_stats *stats,
+			    uint32_t msglen, uint32_t offset,
+			    const void *payload, uint32_t paylen, int msgorder,
+			    uint32_t opcode, psm2_mq_req_t *req_o);
+int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t req);
+
+// Perform the actual copy for a recv matching a sysbuf.  We copy from the
+// sysbuf (req->req_data.buf) to the actual user buffer (buf) and keep
+// statistics.
+// is_buf_gpu_mem - indicates whether buf is a GPU buffer
+// len - recv buffer size posted; we use this for any GDR copy pinning so we
+// 	can get future cache hits on other message sizes in the same buffer
+// (not needed: msglen - negotiated total message size)
+// copysz - actual amount to copy (<= msglen)
+#ifdef PSM_CUDA
+void psmi_mq_recv_copy(psm2_mq_t mq, psm2_mq_req_t req, uint8_t is_buf_gpu_mem,
+                                void *buf, uint32_t len, uint32_t copysz);
+#else
+PSMI_ALWAYS_INLINE(
+void psmi_mq_recv_copy(psm2_mq_t mq, psm2_mq_req_t req, void *buf,
+                                uint32_t len, uint32_t copysz))
+{
+	if (copysz)
+		psmi_mq_mtucpy(buf, (const void *)req->req_data.buf, copysz);
+}
+#endif
+
+#if 0   // unused code, specific to QLogic MPI
+void psmi_mq_stats_register(psm2_mq_t mq, mpspawn_stats_add_fn add_fn);
+#endif
+
+void psmi_mq_fastpath_disable(psm2_mq_t mq);
+void psmi_mq_fastpath_try_reenable(psm2_mq_t mq);
+
+PSMI_ALWAYS_INLINE(
+psm2_mq_req_t
+mq_ooo_match(struct mqq *q, void *msgctl, uint16_t msg_seqnum))
+{
+	psm2_mq_req_t *curp;
+	psm2_mq_req_t cur;
+
+	for (curp = &q->first; (cur = *curp) != NULL; curp = &cur->next[PSM2_ANYTAG_ANYSRC]) {
+		if (cur->ptl_req_ptr == msgctl && cur->msg_seqnum == msg_seqnum) {
+			/* match! */
+			mq_qq_remove(q, cur);
+			return cur;
+		}
+	}
+	return NULL; /* no match */
+}
+
+PSMI_ALWAYS_INLINE(
+psm2_mq_req_t
+mq_eager_match(psm2_mq_t mq, void *peer, uint16_t msg_seqnum))
+{
+	psm2_mq_req_t cur;
+
+	cur = STAILQ_FIRST(&mq->eager_q);
+	while (cur) {
+		if (cur->ptl_req_ptr == peer && cur->msg_seqnum == msg_seqnum)
+			return cur;
+		cur = STAILQ_NEXT(cur, nextq);
+	}
+	return NULL;		/* no match */
+}
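+
+// mq_eager_match() ties continuation packets of a multi-packet eager
+// message back to their request by (peer, msg_seqnum), using a simple
+// linear walk of mq->eager_q.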
+
+#if 0
+/* Not exposed in public psm, but may extend parts of PSM 2.1 to support
+ * this feature before 2.3 */
+psm_mq_unexpected_callback_fn_t
+psmi_mq_register_unexpected_callback(psm2_mq_t mq,
+				     psm_mq_unexpected_callback_fn_t fn);
+#endif
+
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/psm_mq_recv.c b/deps/libfabric/prov/psm3/psm3/psm_mq_recv.c
new file mode 100644
index 0000000000000000000000000000000000000000..0e938077e8939058e05dff90a426687f1cb1df13
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_mq_recv.c
@@ -0,0 +1,834 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "psm_mq_internal.h"
+#include "ptl_ips/ips_proto_header.h"
+
+#ifdef PSM_CUDA
+#include "psm_gdrcpy.h"
+#endif
+
+#if 0
+/* Not exposed in public psm, but may extend parts of PSM 2.1 to support
+ * this feature before 2.3 */
+psm_mq_unexpected_callback_fn_t
+psmi_mq_register_unexpected_callback(psm2_mq_t mq,
+				     psm_mq_unexpected_callback_fn_t fn)
+{
+	psm_mq_unexpected_callback_fn_t old_fn = mq->unexpected_callback;
+	mq->unexpected_callback = fn;
+	return old_fn;
+}
+#endif
+
+// the RTS/CTS sequence using TID is now complete
+// used on both sender and receiver side
+// LONG_DATA on sender ends up in ips_proto_mq_eager_complete
+// LONG_DATA on receiver ends up in psmi_mq_handle_data
+void psmi_mq_handle_rts_complete(psm2_mq_req_t req)
+{
+	psm2_mq_t mq = req->mq;
+
+	if (req->mr) {
+		_HFI_MMDBG("RTS complete, releasing MR: rkey: 0x%x\n", req->mr->rkey);
+		psm2_verbs_release_mr(req->mr);
+		req->mr = NULL;
+		ips_tid_mravail_callback(req->rts_peer->proto);
+	}
+
+	req->state = MQ_STATE_COMPLETE;
+	ips_barrier();
+	if(!psmi_is_req_internal(req))
+		mq_qq_append(&mq->completed_q, req);
+
+	_HFI_VDBG("RTS complete, req=%p, recv_msglen = %d\n",
+		  req, req->req_data.recv_msglen);
+	return;
+}
+
+static void
+psmi_mq_req_copy(psm2_mq_req_t req,
+		 uint32_t offset, const void *buf, uint32_t nbytes
+#ifdef PSM_CUDA
+		, int use_gdrcopy,
+		psm2_ep_t ep
+#endif
+		)
+{
+	/* recv_msglen may be changed by an unexpected receive into req_data.buf. */
+	uint32_t msglen_this, end;
+	uint8_t *msgptr = (uint8_t *) req->req_data.buf + offset;
+
+	/* out of receiving range. */
+	if (offset >= req->req_data.recv_msglen) {
+		req->send_msgoff += nbytes;
+		return;
+	}
+
+	end = offset + nbytes;
+	if (end > req->req_data.recv_msglen) {
+		msglen_this = req->req_data.recv_msglen - offset;
+		end = req->req_data.recv_msglen;
+	} else {
+		msglen_this = nbytes;
+	}
+#ifdef PSM_CUDA
+	if (use_gdrcopy) {
+		void *ubuf;
+		ubuf = gdr_convert_gpu_to_host_addr(GDR_FD,
+				(unsigned long)msgptr, msglen_this, 1,
+				ep);
+		if (! ubuf)
+			psmi_mq_mtucpy(msgptr, buf, msglen_this);
+		else
+			psmi_mq_mtucpy_host_mem(ubuf, buf, msglen_this);
+	} else
+#endif
+		psmi_mq_mtucpy(msgptr, buf, msglen_this);
+
+	if (req->recv_msgoff < end) {
+		req->recv_msgoff = end;
+	}
+
+	req->send_msgoff += nbytes;
+	return;
+}
+
+// This handles eager and LONG_DATA payload and completion for receiver
+// For ips eager, the caller will have already prepared for gdrcopy
+// For ips, LONG_DATA will not be used for GPU buffers unless RDMA disabled
+// So no need/opportunity to take advantage of gdrcopy here.
+int
+psmi_mq_handle_data(psm2_mq_t mq, psm2_mq_req_t req,
+		    uint32_t offset, const void *buf, uint32_t nbytes
+#ifdef PSM_CUDA
+		    , int use_gdrcopy,
+		    psm2_ep_t ep
+#endif
+		)
+{
+	psmi_assert(req != NULL);
+	int rc;
+
+	if (req->state == MQ_STATE_MATCHED)
+		rc = MQ_RET_MATCH_OK;
+	else {
+		psmi_assert(req->state == MQ_STATE_UNEXP);
+		// TBD - buf will be a sysbuf; we could tell psmi_mq_req_copy to
+		// use psmi_mq_mtucpy_host_mem by passing a function arg,
+		// but there is limited benefit for the eager/long protocol
+		rc = MQ_RET_UNEXP_OK;
+	}
+
+#ifdef PSM_CUDA
+	psmi_mq_req_copy(req, offset, buf, nbytes, use_gdrcopy, ep);
+#else
+	psmi_mq_req_copy(req, offset, buf, nbytes);
+#endif
+
+	/*
+	 * >= is used here because send_msgoff may include
+	 * DW (dword) padding.
+	 */
+	if (req->send_msgoff >= req->req_data.send_msglen) {
+		if (req->type & MQE_TYPE_EAGER_QUEUE) {
+			STAILQ_REMOVE(&mq->eager_q, req, psm2_mq_req, nextq);
+		}
+
+		if (req->state == MQ_STATE_MATCHED) {
+			psmi_assert(! req->mr);
+			req->state = MQ_STATE_COMPLETE;
+			ips_barrier();
+			mq_qq_append(&mq->completed_q, req);
+		} else {	/* MQ_STATE_UNEXP */
+			req->state = MQ_STATE_COMPLETE;
+		}
+	}
+
+	return rc;
+}
+
+static
+void mq_add_to_unexpected_hashes(psm2_mq_t mq, psm2_mq_req_t req)
+{
+	int table;
+	mq_qq_append(&mq->unexpected_q, req);
+	req->q[PSM2_ANYTAG_ANYSRC] = &mq->unexpected_q;
+	mq->unexpected_list_len++;
+	if_pt (mq->nohash_fastpath) {
+		if_pf (mq->unexpected_list_len >= HASH_THRESHOLD)
+			psmi_mq_fastpath_disable(mq);
+		return;
+	}
+
+	for (table = PSM2_TAG_SRC; table < PSM2_ANYTAG_ANYSRC; table++)
+		mq_qq_append_which(mq->unexpected_htab,
+				   table, mq->hashvals[table], req);
+	mq->unexpected_hash_len++;
+}
+
+
+psm2_mq_req_t
+mq_list_scan(struct mqq *q, psm2_epaddr_t src, psm2_mq_tag_t *tag, int which, uint64_t *time_threshold)
+{
+	psm2_mq_req_t *curp, cur;
+
+	for (curp = &q->first;
+	     ((cur = *curp) != NULL) && (cur->timestamp < *time_threshold);
+	     curp = &cur->next[which]) {
+		if ((cur->req_data.peer == PSM2_MQ_ANY_ADDR || src == cur->req_data.peer) &&
+		    !((tag->tag[0] ^ cur->req_data.tag.tag[0]) & cur->req_data.tagsel.tag[0]) &&
+		    !((tag->tag[1] ^ cur->req_data.tag.tag[1]) & cur->req_data.tagsel.tag[1]) &&
+		    !((tag->tag[2] ^ cur->req_data.tag.tag[2]) & cur->req_data.tagsel.tag[2])) {
+			*time_threshold = cur->timestamp;
+			return cur;
+		}
+	}
+	return NULL;
+}
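+
+// Tag matching treats req_data.tagsel as a bitmask: an incoming tag
+// matches a posted receive iff the two tags agree on every bit where
+// tagsel is 1.  Illustrative example: a receive posted with
+//   tag    = {0x50, 0, 0}
+//   tagsel = {0xF0, 0, 0}
+// matches any message whose tag[0] has 0x5 in bits 4-7, with tag[1] and
+// tag[2] fully wildcarded.  The *time_threshold in/out parameter narrows
+// successive scans to requests older than the best match found so far.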
+
+psm2_mq_req_t
+mq_req_match(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, int remove)
+{
+	psm2_mq_req_t match[4];
+	int table;
+	uint64_t best_ts = -1;
+
+	if (mq->nohash_fastpath) {
+		table = PSM2_ANYTAG_ANYSRC;
+		match[table] =
+			mq_list_scan(&mq->expected_q,
+				     src, tag, PSM2_ANYTAG_ANYSRC, &best_ts);
+		if (match[table] && remove) {
+			mq->expected_list_len--;
+			mq_qq_remove_which(match[table], table);
+		}
+		return match[table];
+	}
+
+	mq->hashvals[PSM2_TAG_SRC] = hash_64(tag->tag64) % NUM_HASH_BUCKETS;
+	mq->hashvals[PSM2_TAG_ANYSRC] = hash_32(tag->tag[0]) % NUM_HASH_BUCKETS;
+	mq->hashvals[PSM2_ANYTAG_SRC] = hash_32(tag->tag[1]) % NUM_HASH_BUCKETS;
+
+	for (table = PSM2_TAG_SRC; table < PSM2_ANYTAG_ANYSRC; table++)
+		match[table] =
+			mq_list_scan(&mq->expected_htab[table][mq->hashvals[table]],
+				     src, tag, table, &best_ts);
+	table = PSM2_ANYTAG_ANYSRC;
+	match[table] = mq_list_scan(&mq->expected_q, src, tag, table, &best_ts);
+
+	table = min_timestamp_4(match);
+	if (table == -1)
+		return NULL;
+
+	if (remove) {
+		if_pt (table == PSM2_ANYTAG_ANYSRC)
+			mq->expected_list_len--;
+		else
+			mq->expected_hash_len--;
+		mq_qq_remove_which(match[table], table);
+		psmi_mq_fastpath_try_reenable(mq);
+	}
+	return match[table];
+}
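+
+// Matching strategy above: while few receives are posted
+// (nohash_fastpath) a single linear list is scanned.  Once posting
+// crosses HASH_THRESHOLD, three hash tables (tag+src, tag+any-src,
+// any-tag+src) plus the wildcard any-tag+any-src list are all probed,
+// and min_timestamp_4() picks the oldest candidate so that
+// posted-receive ordering is preserved.
+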
+/*
+ * This handles the rendezvous MPI envelopes; the packet may carry the whole
+ * message payload or no payload at all.
+ * Our return value indicates whether we had a match.  If there is no match we
+ * prepare the req for future processing and a callback when a future MPI_recv
+ * call matches.  As a performance optimization, the first time we lack a match
+ * we ask for a REVISIT of the message to help the case where the MPI_recv
+ * arrives just slightly after the RTS did.
+ */
+int
+psmi_mq_handle_rts(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
+		   struct ptl_strategy_stats *stats,
+		   uint32_t send_msglen, const void *payload, uint32_t paylen,
+		   int msgorder, mq_rts_callback_fn_t cb, psm2_mq_req_t *req_o)
+{
+	psm2_mq_req_t req;
+	uint32_t msglen;
+	int rc;
+
+	PSMI_LOCK_ASSERT(mq->progress_lock);
+
+	_HFI_MMDBG("rts from 0x%"PRIx64" 0x%x,0x%x,0x%x",
+					src->epid, tag->tag0, tag->tag1, tag->tag2);
+	if (msgorder && (req = mq_req_match(mq, src, tag, 1))) {
+		/* we have a match, no need to callback */
+		msglen = mq_set_msglen(req, req->req_data.buf_len, send_msglen);
+		/* reset send_msglen because sender only sends this many */
+		req->req_data.send_msglen = msglen;
+		req->state = MQ_STATE_MATCHED;
+		req->req_data.peer = src;
+		req->req_data.tag = *tag;
+
+		if (paylen > msglen) paylen = msglen;
+		if (paylen) {
+			// payload of RTS can contain a single packet synchronous MPI msg
+			psmi_mq_mtucpy(req->req_data.buf, payload, paylen);
+#ifdef PSM_CUDA
+			if (req->is_buf_gpu_mem) {
+				stats->rndv_rts_cuCopy_recv++;
+				stats->rndv_rts_cuCopy_recv_bytes += paylen;
+			} else
+#endif
+			{
+				stats->rndv_rts_cpu_recv++;
+				stats->rndv_rts_cpu_recv_bytes += paylen;
+			}
+		}
+		req->recv_msgoff = req->send_msgoff = paylen;
+		*req_o = req;	/* yes match */
+		PSM2_LOG_EPM(OPCODE_LONG_RTS,PSM2_LOG_RX,src->epid,mq->ep->epid,
+			    "req->rts_reqidx_peer: %d",req->rts_reqidx_peer);
+		rc = MQ_RET_MATCH_OK;
+	} else if (msgorder > 1) {
+		/* There is NO request match and this is the first attempt
+		 * to process this packet; we leave the packet in the
+		 * hardware queue for a retry, in the hope that a matching
+		 * request is posted by then.  This is purely a performance
+		 * consideration.
+		 */
+		_HFI_MMDBG("no match 1st time - revisit msgorder=%d\n", msgorder);
+		rc = MQ_RET_UNEXP_NO_RESOURCES;
+	} else {		/* No match, keep track of callback */
+		/* this is the 2nd attempt so we need to put it on the unexpected
+		 * queue and move on.  A future MPI_Recv call will match it
+		 */
+		// TBD - on OPA for OSU latency we tend to hit the revisit queue and
+		// then match on the 2nd call.  On PSM UD we tend to hit the revisit
+		// queue, still not match on the 2nd attempt, and end up here.  Unclear
+		// why MPI_Recv gets posted a little slower.  Maybe RDMA Write acks
+		// occur a little slower than native OPA's explicit TID_COMPLETION,
+		// such that the sender does not get its MPI_Send done before the
+		// remote node completes its MPI_Recv and starts its next MPI_Send.
+		// May want to see whether REVISIT is providing value, or whether
+		// anything can be tuned to speed up RDMA Send completion (e.g. the
+		// ack which triggers it).
+		// Experimenting with skipping the revisit return above and always
+		// doing this needs more analysis, but has limited if any impact on
+		// native OPA.
+		_HFI_MMDBG("no match req queue msgorder=%d\n", msgorder);
+		req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV);
+		psmi_assert(req != NULL);
+		/* We don't know recv_msglen yet but we set it here for
+		 * mq_iprobe */
+		req->req_data.send_msglen = req->req_data.recv_msglen = send_msglen;
+		PSM2_LOG_EPM_COND(req->req_data.send_msglen > mq->hfi_thresh_rv,
+				 OPCODE_LONG_RTS,PSM2_LOG_RX,src->epid,mq->ep->epid,
+				    "req->rts_reqidx_peer: %d",req->rts_reqidx_peer);
+		req->state = MQ_STATE_UNEXP_RV;
+		req->req_data.peer = src;
+		req->req_data.tag = *tag;
+		req->rts_callback = cb;
+		if (paylen > send_msglen) paylen = send_msglen;
+		if (paylen) {
+			req->req_data.buf = psmi_mq_sysbuf_alloc(mq, paylen);
+			psmi_assert(paylen == 0 || req->req_data.buf != NULL);
+#ifdef PSM_CUDA
+			psmi_mq_mtucpy_host_mem(req->req_data.buf, payload, paylen);
+#else
+			psmi_mq_mtucpy(req->req_data.buf, payload, paylen);
+#endif
+			stats->rndv_rts_sysbuf_recv++;
+			stats->rndv_rts_sysbuf_recv_bytes += paylen;
+		}
+		req->recv_msgoff = req->send_msgoff = paylen;
+
+		if (msgorder) {
+			mq_add_to_unexpected_hashes(mq, req);
+		}
+		/* caller will handle out of order case */
+		*req_o = req;	/* no match, will callback */
+		rc = MQ_RET_UNEXP_OK;
+	}
+
+#ifdef PSM_DEBUG
+	if (req)
+		_HFI_VDBG("match=%s (req=%p) src=%s mqtag=%08x.%08x.%08x recvlen=%d "
+			  "sendlen=%d errcode=%d\n",
+			  rc == MQ_RET_MATCH_OK ? "YES" : "NO", req,
+			  psmi_epaddr_get_name(src->epid),
+			  req->req_data.tag.tag[0], req->req_data.tag.tag[1], req->req_data.tag.tag[2],
+			  req->req_data.recv_msglen, req->req_data.send_msglen, req->req_data.error_code);
+	else
+		_HFI_VDBG("match=%s (req=%p) src=%s\n",
+			  rc == MQ_RET_MATCH_OK ? "YES" : "NO", req,
+			  psmi_epaddr_get_name(src->epid));
+#endif /* #ifdef PSM_DEBUG */
+	return rc;
+}
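+
+// Return codes above, as seen by the caller:
+//   MQ_RET_MATCH_OK           - matched a posted receive; *req_o is valid
+//   MQ_RET_UNEXP_NO_RESOURCES - no match on the first in-order attempt;
+//                               the packet stays queued for a REVISIT
+//   MQ_RET_UNEXP_OK           - still no match (or out of order); req is
+//                               parked on the unexpected queue and
+//                               rts_callback fires once a receive matches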
+
+/*
+ * This handles the regular (i.e. non-rendezvous) MPI envelopes
+ */
+int
+psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
+			struct ptl_strategy_stats *stats,
+			uint32_t send_msglen, uint32_t offset,
+			const void *payload, uint32_t paylen, int msgorder,
+			uint32_t opcode, psm2_mq_req_t *req_o)
+{
+	psm2_mq_req_t req;
+	uint32_t msglen;
+	psmi_mtucpy_fn_t psmi_mtucpy_fn;
+#if defined(PSM_CUDA)
+	int use_gdrcopy = 0;
+#endif // PSM_CUDA
+
+	if (msgorder && (req = mq_req_match(mq, src, tag, 1))) {
+		/* we have a match */
+		void *user_buffer = req->req_data.buf;
+		psmi_assert(MQE_TYPE_IS_RECV(req->type));
+		req->req_data.peer = src;
+		req->req_data.tag = *tag;
+		msglen = mq_set_msglen(req, req->req_data.buf_len, send_msglen);
+
+		_HFI_VDBG("match=YES (req=%p) opcode=%x src=%s mqtag=%x.%x.%x"
+			  " msglen=%d paylen=%d\n", req, opcode,
+			  psmi_epaddr_get_name(src->epid),
+			  tag->tag[0], tag->tag[1], tag->tag[2], msglen,
+			  paylen);
+
+		switch (opcode) {
+		case MQ_MSG_TINY:
+			/* mq_copy_tiny() can handle zero byte */
+#ifdef PSM_CUDA
+			if (!req->is_buf_gpu_mem) {
+				mq_copy_tiny_host_mem((uint32_t *) user_buffer, (uint32_t *) payload, msglen);
+				stats->tiny_cpu_recv++;
+				stats->tiny_cpu_recv_bytes += msglen;
+			// conversion will round up to 64K so just use
+			// msglen here to protect against huge buf_len
+			} else if (PSMI_USE_GDR_COPY_RECV(msglen) &&
+				NULL != (user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
+								(unsigned long)req->req_data.buf,
+								msglen, 1, mq->ep))) {
+				mq_copy_tiny_host_mem((uint32_t *) user_buffer, (uint32_t *) payload, msglen);
+				stats->tiny_gdrcopy_recv++;
+				stats->tiny_gdrcopy_recv_bytes += msglen;
+			} else {
+				user_buffer = req->req_data.buf;
+#endif
+				mq_copy_tiny((uint32_t *) user_buffer, (uint32_t *) payload, msglen);
+#ifdef PSM_CUDA
+				stats->tiny_cuCopy_recv++;
+				stats->tiny_cuCopy_recv_bytes += msglen;
+			}
+#else
+			stats->tiny_cpu_recv++;
+			stats->tiny_cpu_recv_bytes += msglen;
+#endif
+
+			req->state = MQ_STATE_COMPLETE;
+			ips_barrier();
+			mq_qq_append(&mq->completed_q, req);
+			break;
+
+		case MQ_MSG_SHORT:	/* message fits in 1 payload */
+			psmi_mtucpy_fn = psmi_mq_mtucpy;
+#ifdef PSM_CUDA
+			if (!req->is_buf_gpu_mem) {
+				psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem;
+				stats->short_cpu_recv++;
+				stats->short_cpu_recv_bytes += msglen;
+			// conversion will round up to 64K so just use
+			// msglen here to protect against huge buf_len
+			} else if (PSMI_USE_GDR_COPY_RECV(msglen) &&
+				NULL != (user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
+							(unsigned long)req->req_data.buf,
+							msglen, 1, mq->ep))) {
+				psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem;
+				stats->short_gdrcopy_recv++;
+				stats->short_gdrcopy_recv_bytes += msglen;
+			} else {
+				user_buffer = req->req_data.buf;
+#endif
+#ifdef PSM_CUDA
+				stats->short_cuCopy_recv++;
+				stats->short_cuCopy_recv_bytes += msglen;
+			}
+#else
+			stats->short_cpu_recv++;
+			stats->short_cpu_recv_bytes += msglen;
+#endif
+			if (msglen <= paylen) {
+				psmi_mtucpy_fn(user_buffer, payload, msglen);
+			} else {
+				psmi_assert((msglen & ~0x3) == paylen);
+				psmi_mtucpy_fn(user_buffer, payload, paylen);
+				/*
+				 * there are nonDW bytes attached in header,
+				 * copy after the DW payload.
+				 */
+				mq_copy_tiny((uint32_t *)((uint8_t *)user_buffer + paylen),
+					(uint32_t *)&offset, msglen & 0x3);
+			}
+			req->state = MQ_STATE_COMPLETE;
+			ips_barrier();
+			mq_qq_append(&mq->completed_q, req);
+			break;
+
+		case MQ_MSG_EAGER:
+			req->state = MQ_STATE_MATCHED;
+			req->type |= MQE_TYPE_EAGER_QUEUE;
+			req->send_msgoff = req->recv_msgoff = 0;
+			STAILQ_INSERT_TAIL(&mq->eager_q, req, nextq);
+			_HFI_VDBG("exp MSG_EAGER of length %d bytes pay=%d\n",
+				  msglen, paylen);
+			// !offset -> only count recv msgs on 1st pkt in msg
+#ifdef PSM_CUDA
+			if (!req->is_buf_gpu_mem) {
+				if (!offset) stats->eager_cpu_recv++;
+				stats->eager_cpu_recv_bytes += paylen;
+			} else if (PSMI_USE_GDR_COPY_RECV(paylen)) {
+				req->req_data.buf = req->user_gpu_buffer;
+				use_gdrcopy = 1;
+				if (!offset) stats->eager_gdrcopy_recv++;
+				stats->eager_gdrcopy_recv_bytes += paylen;
+			} else {
+				req->req_data.buf = req->user_gpu_buffer;
+				if (!offset) stats->eager_cuCopy_recv++;
+				stats->eager_cuCopy_recv_bytes += paylen;
+			}
+#else
+			if (!offset) stats->eager_cpu_recv++;
+			stats->eager_cpu_recv_bytes += paylen;
+#endif
+			if (paylen > 0)
+				psmi_mq_handle_data(mq, req, offset, payload,
+#ifdef PSM_CUDA
+						    paylen, use_gdrcopy, mq->ep);
+#else
+						    paylen);
+#endif
+			break;
+
+		default:
+			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+					  "Internal error, unknown packet 0x%x",
+					  opcode);
+		}
+
+		mq->stats.rx_user_bytes += msglen;
+		mq->stats.rx_user_num++;
+
+		*req_o = req;	/* yes match */
+		return MQ_RET_MATCH_OK;
+	}
+
+	/* unexpected message or out of order message. */
+
+#if 0
+	/*
+	 * Keep a callback here in case we want to fit some other high-level
+	 * protocols over MQ (i.e. shmem).  These protocols would bypass the
+	 * normal message handling and go to higher-level message handlers.
+	 */
+	if (msgorder && mq->unexpected_callback) {
+		mq->unexpected_callback(mq, opcode, epaddr, req_data.tag, send_msglen,
+					payload, paylen);
+		*req_o = NULL;
+		return MQ_RET_UNEXP_OK;
+	}
+#endif
+
+	if (msgorder > 1) {
+		/* There is NO request match and this is the first attempt
+		 * to process this packet; we leave the packet in the
+		 * hardware queue for a retry, in the hope that a matching
+		 * request is posted by then.  This is purely a performance
+		 * consideration.
+		 */
+		return MQ_RET_UNEXP_NO_RESOURCES;
+	}
+
+	req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV);
+	psmi_assert(req != NULL);
+
+	req->req_data.peer = src;
+	req->req_data.tag = *tag;
+	req->recv_msgoff = 0;
+	// don't yet know recv buffer size, so use send_msglen for now
+	req->req_data.recv_msglen = req->req_data.send_msglen = req->req_data.buf_len = msglen =
+	    send_msglen;
+
+	_HFI_VDBG("match=NO (req=%p) opcode=%x src=%s mqtag=%08x.%08x.%08x"
+		  " send_msglen=%d\n", req, opcode,
+		  psmi_epaddr_get_name(src->epid),
+		  tag->tag[0], tag->tag[1], tag->tag[2], send_msglen);
+
+	switch (opcode) {
+	case MQ_MSG_TINY:
+		if (msglen > 0) {
+			req->req_data.buf = psmi_mq_sysbuf_alloc(mq, msglen);
+			psmi_assert(msglen == 0 || req->req_data.buf != NULL);
+#ifdef PSM_CUDA
+			mq_copy_tiny_host_mem((uint32_t *) req->req_data.buf,
+				     (uint32_t *) payload, msglen);
+#else
+			mq_copy_tiny((uint32_t *) req->req_data.buf,
+				     (uint32_t *) payload, msglen);
+#endif
+			stats->tiny_sysbuf_recv++;
+			stats->tiny_sysbuf_recv_bytes += msglen;
+		} else {
+			req->req_data.buf = NULL;
+			stats->tiny_sysbuf_recv++;	// 0 length
+		}
+		req->state = MQ_STATE_COMPLETE;
+		break;
+
+	case MQ_MSG_SHORT:
+		req->req_data.buf = psmi_mq_sysbuf_alloc(mq, msglen);
+		psmi_assert(msglen == 0 || req->req_data.buf != NULL);
+		if (msglen <= paylen) {
+#ifdef PSM_CUDA
+			psmi_mq_mtucpy_host_mem(req->req_data.buf, payload, msglen);
+#else
+			psmi_mq_mtucpy(req->req_data.buf, payload, msglen);
+#endif
+		} else {
+			psmi_assert((msglen & ~0x3) == paylen);
+#ifdef PSM_CUDA
+			psmi_mq_mtucpy_host_mem(req->req_data.buf, payload, paylen);
+#else
+			psmi_mq_mtucpy(req->req_data.buf, payload, paylen);
+#endif
+			/*
+			 * there are nonDW bytes attached in header,
+			 * copy after the DW payload.
+			 */
+#ifdef PSM_CUDA
+			mq_copy_tiny_host_mem((uint32_t *)(req->req_data.buf+paylen),
+				(uint32_t *)&offset, msglen & 0x3);
+#else
+			mq_copy_tiny((uint32_t *)(req->req_data.buf+paylen),
+				(uint32_t *)&offset, msglen & 0x3);
+#endif
+		}
+		stats->short_sysbuf_recv++;
+		stats->short_sysbuf_recv_bytes += msglen;
+		req->state = MQ_STATE_COMPLETE;
+		break;
+
+	case MQ_MSG_EAGER:
+		req->send_msgoff = 0;
+		req->req_data.buf = psmi_mq_sysbuf_alloc(mq, msglen);
+		psmi_assert(msglen == 0 || req->req_data.buf != NULL);
+		req->state = MQ_STATE_UNEXP;
+		req->type |= MQE_TYPE_EAGER_QUEUE;
+		STAILQ_INSERT_TAIL(&mq->eager_q, req, nextq);
+		_HFI_VDBG("unexp MSG_EAGER of length %d bytes pay=%d\n",
+			  msglen, paylen);
+		if (paylen > 0)
+#ifdef PSM_CUDA
+			psmi_mq_handle_data(mq, req, offset, payload, paylen, 0, NULL);
+#else
+			psmi_mq_handle_data(mq, req, offset, payload, paylen);
+#endif
+		stats->eager_sysbuf_recv++;
+		stats->eager_sysbuf_recv_bytes += paylen;
+		break;
+
+	default:
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+				  "Internal error, unknown packet 0x%x",
+				  opcode);
+	}
+
+	mq->stats.rx_sys_bytes += msglen;
+	mq->stats.rx_sys_num++;
+
+	if (msgorder) {
+		mq_add_to_unexpected_hashes(mq, req);
+	}
+	/* caller will handle out of order case */
+	*req_o = req;		/* no match, will callback */
+	return MQ_RET_UNEXP_OK;
+}
+
+#ifdef PSM_CUDA	// declared inline in psm_mq_internal.h for non-CUDA
+// Perform the actual copy for a psmi_mq_irecv_inner.  We copy from a sysbuf
+// (req->req_data.buf) to the actual user buffer (buf) and keep statistics.
+// is_buf_gpu_mem indicates whether buf is a GPU buffer
+// len - recv buffer size posted; we use this for any GDR copy pinning so
+// 	we can get future cache hits for other message sizes in the same buffer
+// msglen (not needed here) - negotiated total message size
+// copysz - actual amount to copy (<= msglen)
+void psmi_mq_recv_copy(psm2_mq_t mq, psm2_mq_req_t req, uint8_t is_buf_gpu_mem,
+				void *buf, uint32_t len, uint32_t copysz)
+{
+	psmi_mtucpy_fn_t psmi_mtucpy_fn = psmi_mq_mtucpy;
+	void *ubuf = buf;
+	if (! copysz) {
+		mq->stats.rx_sysbuf_cpu_num++; // zero length
+		return;
+	}
+	if (!is_buf_gpu_mem) {
+		psmi_assert(!PSMI_IS_CUDA_MEM(buf));
+		mq->stats.rx_sysbuf_cpu_num++;
+		mq->stats.rx_sysbuf_cpu_bytes += copysz;
+		psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem;
+	// len could be huge, so limit ourselves to gdr_copy_limit_recv
+	// Note to get here copysz <= gdr_copy_limit_recv
+	} else if (PSMI_USE_GDR_COPY_RECV(copysz) &&
+		NULL != (ubuf = gdr_convert_gpu_to_host_addr(GDR_FD, (unsigned long)buf,
+						    min(gdr_copy_limit_recv, len), 1,
+						    mq->ep))) {
+		psmi_assert(PSMI_IS_CUDA_MEM(buf));
+		psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem;
+		mq->stats.rx_sysbuf_gdrcopy_num++;
+		mq->stats.rx_sysbuf_gdrcopy_bytes += copysz;
+	} else {
+		psmi_assert(PSMI_IS_CUDA_MEM(buf));
+		ubuf = buf;
+		mq->stats.rx_sysbuf_cuCopy_num++;
+		mq->stats.rx_sysbuf_cuCopy_bytes += copysz;
+	}
+	if (copysz)
+		psmi_mtucpy_fn(ubuf, (const void *)req->req_data.buf, copysz);
+}
+#endif // PSM_CUDA
+
+// we landed an out of order message in a sysbuf and can now process it
+// ureq is where we landed it.  If found, ereq is the user posted receive.
+int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq)
+{
+	psm2_mq_req_t ereq;
+	uint32_t msglen;
+
+	ereq = mq_req_match(mq, ureq->req_data.peer, &ureq->req_data.tag, 1);
+	if (ereq == NULL) {
+		mq_add_to_unexpected_hashes(mq, ureq);
+		return 0;
+	}
+
+	psmi_assert(MQE_TYPE_IS_RECV(ereq->type));
+	ereq->req_data.peer = ureq->req_data.peer;
+	ereq->req_data.tag = ureq->req_data.tag;
+	msglen = mq_set_msglen(ereq, ereq->req_data.buf_len, ureq->req_data.send_msglen);
+
+	switch (ureq->state) {
+	case MQ_STATE_COMPLETE:
+		if (ureq->req_data.buf != NULL) {	/* 0-byte messages don't alloc a sysbuf */
+			psmi_mq_recv_copy(mq, ureq,
+#ifdef PSM_CUDA
+					ereq->is_buf_gpu_mem,
+#endif
+					ereq->req_data.buf,
+					ereq->req_data.buf_len, msglen);
+			psmi_mq_sysbuf_free(mq, ureq->req_data.buf);
+#ifdef PSM_CUDA
+		} else {
+			mq->stats.rx_sysbuf_cpu_num++; // zero length
+#endif
+		}
+		ereq->state = MQ_STATE_COMPLETE;
+		ips_barrier();
+		mq_qq_append(&mq->completed_q, ereq);
+		break;
+	case MQ_STATE_UNEXP:	/* not done yet */
+		ereq->state = MQ_STATE_MATCHED;
+		ereq->msg_seqnum = ureq->msg_seqnum;
+		ereq->ptl_req_ptr = ureq->ptl_req_ptr;
+		ereq->send_msgoff = ureq->send_msgoff;
+		ereq->recv_msgoff = min(ureq->recv_msgoff, msglen);
+		psmi_mq_recv_copy(mq, ureq,
+#ifdef PSM_CUDA
+				ereq->is_buf_gpu_mem,
+#endif
+				ereq->req_data.buf,
+			 	ereq->req_data.buf_len, ereq->recv_msgoff);
+		psmi_mq_sysbuf_free(mq, ureq->req_data.buf);
+		ereq->type = ureq->type;
+		STAILQ_INSERT_AFTER(&mq->eager_q, ureq, ereq, nextq);
+		STAILQ_REMOVE(&mq->eager_q, ureq, psm2_mq_req, nextq);
+		break;
+	case MQ_STATE_UNEXP_RV:	/* rendez-vous ... */
+		ereq->state = MQ_STATE_MATCHED;
+		ereq->rts_peer = ureq->rts_peer;
+		ereq->rts_sbuf = ureq->rts_sbuf;
+		ereq->send_msgoff = ureq->send_msgoff;
+		ereq->recv_msgoff = min(ureq->recv_msgoff, msglen);
+		if (ereq->send_msgoff) { // only have sysbuf if RTS w/payload
+			psmi_mq_recv_copy(mq, ureq,
+#ifdef PSM_CUDA
+					ereq->is_buf_gpu_mem,
+#endif
+					ereq->req_data.buf,
+			 		ereq->req_data.buf_len,
+					ereq->recv_msgoff);
+			psmi_mq_sysbuf_free(mq, ureq->req_data.buf);
+		}
+		ereq->rts_callback = ureq->rts_callback;
+		ereq->rts_reqidx_peer = ureq->rts_reqidx_peer;
+		ereq->type = ureq->type;
+		ereq->rts_callback(ereq, 0);
+		break;
+	default:
+		fprintf(stderr, "Unexpected state %d in req %p\n", ureq->state,
+			ureq);
+		fprintf(stderr, "type=%d, mq=%p, tag=%08x.%08x.%08x\n",
+			ureq->type, ureq->mq, ureq->req_data.tag.tag[0],
+			ureq->req_data.tag.tag[1], ureq->req_data.tag.tag[2]);
+		abort();
+	}
+
+	psmi_mq_req_free(ureq);
+	return 0;
+}
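+
+// The switch above hands the matched user receive (ereq) whatever state
+// the sysbuf request (ureq) had reached:
+//   MQ_STATE_COMPLETE - whole message already landed; copy and complete.
+//   MQ_STATE_UNEXP    - eager data still arriving; ereq is spliced into
+//                       the eager queue in ureq's place.
+//   MQ_STATE_UNEXP_RV - rendezvous pending; RTS state is handed over and
+//                       rts_callback is invoked to continue the rendezvous.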
diff --git a/deps/libfabric/prov/psm3/psm3/psm_mq_utils.c b/deps/libfabric/prov/psm3/psm3/psm_mq_utils.c
new file mode 100644
index 0000000000000000000000000000000000000000..7ea5bb78478ef14e46180ea6126dea00243fc653
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_mq_utils.c
@@ -0,0 +1,228 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
+/*
+ *
+ * MQ request allocator
+ *
+ */
+
+psm2_mq_req_t MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type)
+{
+	psm2_mq_req_t req;
+
+	psmi_assert(type == MQE_TYPE_RECV || type == MQE_TYPE_SEND);
+
+	if (type == MQE_TYPE_SEND)
+		req = psmi_mpool_get(mq->sreq_pool);
+	else
+		req = psmi_mpool_get(mq->rreq_pool);
+
+	if_pt(req != NULL) {
+		memset(req, 0, sizeof(struct psm2_mq_req));
+		req->type = type;
+		req->state = MQ_STATE_FREE;
+		req->mq = mq;
+
+		return req;
+	} else {	/* we're out of reqs */
+		int issend = (type == MQE_TYPE_SEND);
+		uint32_t reqmax, reqchunk;
+		psmi_mpool_get_obj_info(issend ? mq->sreq_pool : mq->rreq_pool,
+					&reqchunk, &reqmax);
+
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_PARAM_ERR,
+				  "Exhausted %d MQ %s request descriptors, which usually indicates "
+				  "a user program error or insufficient request descriptors (%s=%d)",
+				  reqmax, issend ? "isend" : "irecv",
+				  issend ? "PSM3_MQ_SENDREQS_MAX" :
+				  "PSM3_MQ_RECVREQS_MAX", reqmax);
+		return NULL;
+	}
+}
+MOCK_DEF_EPILOGUE(psmi_mq_req_alloc);
+
+psm2_error_t psmi_mq_req_init(psm2_mq_t mq)
+{
+	psm2_mq_req_t warmup_req;
+	psm2_error_t err = PSM2_OK;
+
+	_HFI_VDBG("mq element sizes are %d bytes\n",
+		  (int)sizeof(struct psm2_mq_req));
+
+	/*
+	 * Send MQ requests
+	 */
+	{
+		struct psmi_rlimit_mpool rlim = MQ_SENDREQ_LIMITS;
+		uint32_t maxsz, chunksz;
+
+		if ((err =
+		     psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz)))
+			goto fail;
+
+		if ((mq->sreq_pool =
+		     psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz,
+				       maxsz, 0, DESCRIPTORS, NULL,
+				       NULL)) == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+	}
+
+	/*
+	 * Receive MQ requests
+	 */
+	{
+		struct psmi_rlimit_mpool rlim = MQ_RECVREQ_LIMITS;
+		uint32_t maxsz, chunksz;
+
+		if ((err =
+		     psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz)))
+			goto fail;
+		if ((mq->rreq_pool =
+			psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz,
+				       maxsz, 0, DESCRIPTORS, NULL,
+				       NULL)) == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+	}
+
+	/* Warm up the allocators */
+	warmup_req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV);
+	psmi_assert_always(warmup_req != NULL);
+	psmi_mq_req_free(warmup_req);
+
+	warmup_req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+	psmi_assert_always(warmup_req != NULL);
+	psmi_mq_req_free(warmup_req);
+
+fail:
+	return err;
+}
+
+psm2_error_t psmi_mq_req_fini(psm2_mq_t mq)
+{
+	psmi_mpool_destroy(mq->rreq_pool);
+	psmi_mpool_destroy(mq->sreq_pool);
+	return PSM2_OK;
+}
+
+
+#if 0 // unused code, specific to QLogic MPI
+/*
+ * Hooks to plug into QLogic MPI stats
+ */
+
+static
+void psmi_mq_stats_callback(struct mpspawn_stats_req_args *args)
+{
+	uint64_t *entry = args->stats;
+	psm2_mq_t mq = (psm2_mq_t) args->context;
+	psm2_mq_stats_t mqstats;
+
+	psm2_mq_get_stats(mq, &mqstats);
+
+	if (args->num < 8)
+		return;
+
+	entry[0] = mqstats.tx_eager_num;
+	entry[1] = mqstats.tx_eager_bytes;
+	entry[2] = mqstats.tx_rndv_num;
+	entry[3] = mqstats.tx_rndv_bytes;
+
+	entry[4] = mqstats.rx_user_num;
+	entry[5] = mqstats.rx_user_bytes;
+	entry[6] = mqstats.rx_sys_num;
+	entry[7] = mqstats.rx_sys_bytes;
+}
+
+void psmi_mq_stats_register(psm2_mq_t mq, mpspawn_stats_add_fn add_fn)
+{
+	char *desc[8];
+	uint16_t flags[8];
+	int i;
+	struct mpspawn_stats_add_args mp_add;
+	/*
+	 * Hardcode flags until we correctly move mpspawn to its own repo.
+	 * flags[i] = MPSPAWN_REDUCTION_MAX | MPSPAWN_REDUCTION_MIN;
+	 */
+	for (i = 0; i < 8; i++)
+		flags[i] = MPSPAWN_STATS_REDUCTION_ALL;
+
+	desc[0] = "Eager count sent";
+	desc[1] = "Eager bytes sent";
+	desc[2] = "Rendezvous count sent";
+	desc[3] = "Rendezvous bytes sent";
+	desc[4] = "Expected count received";
+	desc[5] = "Expected bytes received";
+	desc[6] = "Unexpected count received";
+	desc[7] = "Unexpected bytes received";
+
+	mp_add.version = MPSPAWN_STATS_VERSION;
+	mp_add.num = 8;
+	mp_add.header = "MPI Statistics Summary (max,min @ rank)";
+	mp_add.req_fn = psmi_mq_stats_callback;
+	mp_add.desc = desc;
+	mp_add.flags = flags;
+	mp_add.context = mq;
+
+	add_fn(&mp_add);
+}
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/psm_netutils.h b/deps/libfabric/prov/psm3/psm3/psm_netutils.h
new file mode 100644
index 0000000000000000000000000000000000000000..316529bef8bcda86c826a6610f093fdcbcdf12dc
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_netutils.h
@@ -0,0 +1,87 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef _PSMI_NETUTILS_H
+#define _PSMI_NETUTILS_H
+
+#include <arpa/inet.h>		/* ipv4addr */
+#include <stdlib.h>
+#include <ifaddrs.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+
+// network function subset of psm_utils.c so that HAL can use this without
+// needing psm_ep_t and psm_epid_t from psm_user.h
+
+/*
+ * network address manipulation
+ */
+	// preferred size for psmi_sockaddr_ntop
+#define PSM_ADDRSTRLEN (INET6_ADDRSTRLEN+19+7)	// 16 digit sid, plus 3 for " 0x";
+												// 4 digit pkey, plus 3 for " 0x"
+const char *psmi_sockaddr_ntop(struct sockaddr* addr, char *dst, socklen_t size);
+const char *psmi_ipv4_ntop(uint32_t ip_addr, char *dst, socklen_t size);
+socklen_t psmi_sockaddr_len(struct sockaddr* addr);
+
+int psmi_count_high_bits(uint32_t netmask);
+// This converts a bit count generated by psmi_count_high_bits back into
+// an IPv4 netmask
+static inline uint32_t psmi_bit_count_to_mask(int count)
+{
+	return (uint32_t)(0xffffffff << (32-count));
+}
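+// e.g. psmi_bit_count_to_mask(24) == 0xffffff00, the usual /24 netmask.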
+
+int psmi_get_eth_netmask(__be32 ip_addr, __be32 *netmask);
+
+#endif /* _PSMI_NETUTILS_H */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_perf.c b/deps/libfabric/prov/psm3/psm3/psm_perf.c
new file mode 100644
index 0000000000000000000000000000000000000000..aaf3fd05213db88bffdf7ceb2b3df6cdccabed07
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_perf.c
@@ -0,0 +1,260 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2017 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef RDPMC_PERF_FRAMEWORK
+
+#include "psm_user.h"
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <sys/fcntl.h>
+#include <linux/perf_event.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <asm/unistd.h>
+#include <linux/perf_event.h>
+
+/* Configuration */
+
+#define RDPMC_PERF_DEFAULT_TYPE   (PERF_TYPE_HARDWARE)
+#define RDPMC_PERF_DEFAULT_CONFIG (PERF_COUNT_HW_CPU_CYCLES)
+
+struct rdpmc_ctx {
+	int fd;
+	struct perf_event_mmap_page *buf;
+};
+
+typedef unsigned long long u64;
+
+__thread struct rdpmc_ctx global_rdpmc_ctx;
+
+u64 global_rdpmc_begin[RDPMC_PERF_MAX_SLOT_NUMBER];
+u64 global_rdpmc_summ[RDPMC_PERF_MAX_SLOT_NUMBER];
+u64 global_rdpmc_number[RDPMC_PERF_MAX_SLOT_NUMBER];
+
+char global_rdpmc_slot_name[RDPMC_PERF_MAX_SLOT_NUMBER][RDPMC_PERF_MAX_SLOT_NAME];
+
+__thread unsigned int global_rdpmc_type   = RDPMC_PERF_DEFAULT_TYPE;
+__thread unsigned int global_rdpmc_config = RDPMC_PERF_DEFAULT_CONFIG;
+
+#if defined(__ICC) || defined(__INTEL_COMPILER)
+#include "immintrin.h"
+#endif
+
+/**
+ * DOC: Ring 3 counting for CPU performance counters
+ *
+ * This library allows accessing CPU performance counters from ring 3
+ * using the perf_events subsystem. This is useful to measure specific
+ * parts of programs (e.g. excluding initialization code)
+ *
+ * Requires a Linux 3.3+ kernel
+ */
+
+/**
+ * rdpmc_open_attr - initialize a raw ring 3 readable performance counter
+ * @attr: perf struct %perf_event_attr for the counter
+ * @ctx:  Pointer to struct %rdpmc_ctx that is initialized.
+ * @leader_ctx: context of group leader or NULL
+ *
+ * This allows more flexible setup with a custom &perf_event_attr.
+ * For simple uses rdpmc_open() should be used instead.
+ * Must be called for each thread using the counter.
+ * Must be closed with rdpmc_close()
+ */
+PSMI_ALWAYS_INLINE(int rdpmc_open_attr(struct perf_event_attr *attr, struct rdpmc_ctx *ctx,
+									   struct rdpmc_ctx *leader_ctx))
+{
+	ctx->fd = syscall(__NR_perf_event_open, attr, 0, -1,
+			  leader_ctx ? leader_ctx->fd : -1, 0);
+	if (ctx->fd < 0) {
+		perror("perf_event_open");
+		return -1;
+	}
+	ctx->buf = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, ctx->fd, 0);
+	if (ctx->buf == MAP_FAILED) {
+		close(ctx->fd);
+		perror("mmap on perf fd");
+		return -1;
+	}
+	return 0;
+}
+
+/**
+ * rdpmc_open - initialize a simple ring 3 readable performance counter
+ * @counter: Raw event descriptor (UUEE UU unit mask EE event)
+ * @ctx:     Pointer to struct &rdpmc_ctx that is initialized
+ *
+ * The counter will be set up to count CPU events excluding the kernel.
+ * Must be called for each thread using the counter.
+ * The caller must make sure counter is suitable for the running CPU.
+ * Only works in 3.3+ kernels.
+ * Must be closed with rdpmc_close()
+ */
+
+PSMI_ALWAYS_INLINE(int rdpmc_open(unsigned counter, struct rdpmc_ctx *ctx))
+{
+	struct perf_event_attr attr = {
+		.type = counter > 10 ? PERF_TYPE_RAW : PERF_TYPE_HARDWARE,
+		.size = PERF_ATTR_SIZE_VER0,
+		.config = counter,
+		.sample_type = PERF_SAMPLE_READ,
+		.exclude_kernel = 1,
+	};
+	return rdpmc_open_attr(&attr, ctx, NULL);
+}
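+
+// Typical usage of this API (illustrative sketch only):
+//   struct rdpmc_ctx ctx;
+//   if (rdpmc_open(PERF_COUNT_HW_CPU_CYCLES, &ctx) == 0) {
+//       unsigned long long start = rdpmc_read(&ctx);
+//       /* ... code under measurement ... */
+//       unsigned long long used = rdpmc_read(&ctx) - start;
+//       rdpmc_close(&ctx);
+//   }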
+
+/**
+ * rdpmc_close: free a ring 3 readable performance counter
+ * @ctx: Pointer to &rdpmc_ctx context.
+ *
+ * Must be called by each thread for each context it initialized.
+ */
+PSMI_ALWAYS_INLINE(void rdpmc_close(struct rdpmc_ctx *ctx))
+{
+	close(ctx->fd);
+	munmap(ctx->buf, sysconf(_SC_PAGESIZE));
+}
+
+static void psmi_rdpmc_perf_framework_init()
+{
+    int rdpmc_retval;
+
+    struct rdpmc_ctx *leader = NULL;
+
+    int env_result    = 1;
+    char * env_type = NULL;
+    char * env_config = NULL;
+
+    env_type = getenv("RDPMC_PERF_TYPE");
+
+    if (env_type)
+    {
+        global_rdpmc_type = (int)strtoll(env_type, NULL, 16);
+    }
+    else
+    {
+        env_result = 0;
+    }
+
+    env_config = getenv("RDPMC_PERF_CONFIG");
+
+    if (env_config)
+    {
+        global_rdpmc_config = (int)strtoll(env_config, NULL, 16);
+    }
+    else
+    {
+        env_result = 0;
+    }
+
+    if (env_result != 1)
+    {
+        global_rdpmc_type   = RDPMC_PERF_DEFAULT_TYPE;
+        global_rdpmc_config = RDPMC_PERF_DEFAULT_CONFIG;
+    }
+
+    struct perf_event_attr attr = {
+        .type = global_rdpmc_type,
+        .size = sizeof(struct perf_event_attr),
+        .config = global_rdpmc_config,
+        .sample_type = PERF_SAMPLE_READ,
+    };
+
+    rdpmc_retval = rdpmc_open_attr(&attr, &global_rdpmc_ctx, leader);
+
+    if (rdpmc_retval < 0)
+    {
+        printf("Unable to initialize RDPMC. Error: %d\n", rdpmc_retval);
+        exit(-1);
+    }
+}
+
+/**
+ * rdpmc_read: read a ring 3 readable performance counter
+ * @ctx: Pointer to initialized &rdpmc_ctx structure.
+ *
+ * Read the current value of a running performance counter.
+ */
+unsigned long long rdpmc_read(struct rdpmc_ctx *ctx)
+{
+	static __thread int rdpmc_perf_initialized = 0;
+
+	if_pf(!rdpmc_perf_initialized)
+	{
+		psmi_rdpmc_perf_framework_init();
+		rdpmc_perf_initialized = 1;
+	}
+
+	u64 val;
+	unsigned seq;
+	u64 offset = 0;
+
+	typeof (ctx->buf) buf = ctx->buf;
+	do {
+		seq = buf->lock;
+		ips_rmb();
+		if (buf->index <= 0)
+			return buf->offset;
+#if defined(__ICC) || defined(__INTEL_COMPILER)
+                val = _rdpmc(buf->index - 1);
+#else /* GCC */
+                val = __builtin_ia32_rdpmc(buf->index - 1);
+#endif
+		offset = buf->offset;
+		ips_rmb();
+	} while (buf->lock != seq);
+	return val + offset;
+}
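+
+// The loop above is a seqlock-style read: buf->lock is sampled before and
+// after reading index/offset, and the read is retried if the kernel updated
+// the mmapped page in between.  A zero buf->index means the counter is not
+// currently scheduled on this CPU, so only the kernel-maintained offset is
+// returned.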
+
+#endif /* RDPMC_PERF_FRAMEWORK */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_perf.h b/deps/libfabric/prov/psm3/psm3/psm_perf.h
new file mode 100644
index 0000000000000000000000000000000000000000..7233ba8d3987a2bab321e824707644f35bd99447
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_perf.h
@@ -0,0 +1,149 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2017 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* When the perf framework is enabled, GENERIC_PERF_BEGIN/END can be
+ * used to mark sections of code to be included in a given "slot number"
+ * for performance statistics.
+ * The PMU will be used to measure instruction cycles used between the BEGIN/END
+ * This permits precise statistics to be gathered for how much CPU is required
+ * to execute all the code in a given slot number during a given run.
+ * At the end of the run the statistics are output.
+ * This capability is only enabled when PSM is built with -DRDPMC_PERF_FRAMEWORK
+ */
+
+/* slot numbers for the counters we want */
+#define PSM_TX_SPEEDPATH_CTR 0
+#define PSM_RX_SPEEDPATH_CTR 1
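+
+/*
+ * Typical instrumentation pattern (sketch):
+ *
+ *   GENERIC_PERF_INIT();
+ *   GENERIC_PERF_SET_SLOT_NAME(PSM_TX_SPEEDPATH_CTR, "TX");
+ *   ...
+ *   GENERIC_PERF_BEGIN(PSM_TX_SPEEDPATH_CTR);
+ *   ... transmit speedpath ...
+ *   GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR);
+ *   ...
+ *   GENERIC_PERF_DUMP(stderr);
+ *
+ * When RDPMC_PERF_FRAMEWORK is not defined, all of these compile to
+ * nothing, so the instrumentation can stay in the code permanently.
+ */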
+
+#ifdef RDPMC_PERF_FRAMEWORK
+
+/* Configuration */
+
+#define RDPMC_PERF_MAX_SLOT_NUMBER (8)	// we only use 2, RX and TX
+#define RDPMC_PERF_MAX_SLOT_NAME   (256)
+
+/* RDPMC infrastructure */
+
+extern __thread struct rdpmc_ctx global_rdpmc_ctx;
+
+typedef unsigned long long u64;
+
+extern u64 global_rdpmc_begin[RDPMC_PERF_MAX_SLOT_NUMBER];
+extern u64 global_rdpmc_summ[RDPMC_PERF_MAX_SLOT_NUMBER];
+extern u64 global_rdpmc_number[RDPMC_PERF_MAX_SLOT_NUMBER];
+
+extern char global_rdpmc_slot_name[RDPMC_PERF_MAX_SLOT_NUMBER][RDPMC_PERF_MAX_SLOT_NAME];
+
+extern __thread unsigned int global_rdpmc_type;
+extern __thread unsigned int global_rdpmc_config;
+
+extern unsigned long long rdpmc_read(struct rdpmc_ctx *ctx);
+
+#define RDPMC_PERF_INIT() \
+{                         \
+    int i;                \
+    for (i = 0; i < RDPMC_PERF_MAX_SLOT_NUMBER; i++) \
+    {                                                \
+        global_rdpmc_begin[i]  = 0; \
+        global_rdpmc_summ[i]   = 0; \
+        global_rdpmc_number[i] = 0; \
+        global_rdpmc_slot_name[i][0] = '\0'; \
+    }                               \
+}
+
+/* There is no slot_number max range check */
+
+#define RDPMC_PERF_SET_SLOT_NAME(slot_number, name)  \
+{                                                    \
+    strncpy(global_rdpmc_slot_name[(slot_number)], (name), RDPMC_PERF_MAX_SLOT_NAME - 1); \
+    global_rdpmc_slot_name[(slot_number)][RDPMC_PERF_MAX_SLOT_NAME - 1] = '\0';           \
+}
+
+#define RDPMC_PERF_BEGIN(slot_number) \
+{                                     \
+    global_rdpmc_begin[(slot_number)] = rdpmc_read(&global_rdpmc_ctx); \
+}
+
+#define RDPMC_PERF_END(slot_number) \
+{                        \
+    global_rdpmc_summ[(slot_number)] += (rdpmc_read(&global_rdpmc_ctx) - global_rdpmc_begin[(slot_number)]); \
+    global_rdpmc_number[(slot_number)]++;                                                                    \
+}
+
+#define RDPMC_PERF_DUMP(stream) \
+{                         \
+    int i;                \
+    for (i = 0; i < RDPMC_PERF_MAX_SLOT_NUMBER; i++) \
+    {                                                \
+        if (global_rdpmc_slot_name[i][0])                  \
+        {                                            \
+            fprintf((stream), "RDPMC [%s] (%x, %04x) avg = %g (%llu times)\n", \
+                    global_rdpmc_slot_name[i], global_rdpmc_type, global_rdpmc_config, \
+                    (double)global_rdpmc_summ[i] / global_rdpmc_number[i], global_rdpmc_number[i]); \
+            fflush((stream));                                                 \
+        } \
+    }     \
+}
+
+#define GENERIC_PERF_INIT()                           RDPMC_PERF_INIT()
+#define GENERIC_PERF_SET_SLOT_NAME(slot_number, name) RDPMC_PERF_SET_SLOT_NAME(slot_number, name)
+#define GENERIC_PERF_BEGIN(slot_number)               RDPMC_PERF_BEGIN(slot_number)
+#define GENERIC_PERF_END(slot_number)                 RDPMC_PERF_END(slot_number)
+#define GENERIC_PERF_DUMP(stream)                     RDPMC_PERF_DUMP(stream)
+#else /* RDPMC_PERF_FRAMEWORK */
+#define GENERIC_PERF_INIT()
+#define GENERIC_PERF_SET_SLOT_NAME(slot_number, name)
+#define GENERIC_PERF_BEGIN(slot_number)
+#define GENERIC_PERF_END(slot_number)
+#define GENERIC_PERF_DUMP(stream)
+#endif /* RDPMC_PERF_FRAMEWORK */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_rndv_mod.c b/deps/libfabric/prov/psm3/psm3/psm_rndv_mod.c
new file mode 100644
index 0000000000000000000000000000000000000000..f21aa6c3bcfac24b038aaff87d44726c7284f59c
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_rndv_mod.c
@@ -0,0 +1,1266 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+#ifdef RNDV_MOD
+
+#include <stdint.h>
+#include <stddef.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+//#include <sys/socket.h>
+//#include <fcntl.h>
+//#include <poll.h>
+//#include <sched.h>		/* cpu_set */
+#include <ctype.h>		/* isalpha */
+//#include <netdb.h>
+#include <infiniband/verbs.h>
+#include "psm_user.h"	// get psmi_calloc and free
+#include "psm_rndv_mod.h"
+#include "ips_config.h"
+
+#include <sys/ioctl.h>
+#include <fcntl.h>
+
+// Intel Columbiaville (800 series NIC) specific udata for RV reg_mr ioctl
+// Mellanox and OPA ignore udata, so it doesn't matter what we pass them
+
+/* For CVL irdma device */
+/* nd_linux-lib_cpk_rdma/src/DRIVER_CORE/src/CORE/icrdma-abi.h */
+enum irdma_memreg_type {
+        IW_MEMREG_TYPE_MEM  = 0,
+        IW_MEMREG_TYPE_QP   = 1,
+        IW_MEMREG_TYPE_CQ   = 2,
+        IW_MEMREG_TYPE_RSVD = 3,
+        IW_MEMREG_TYPE_MW   = 4,
+};
+
+struct irdma_mem_reg_req {
+	uint16_t reg_type;  /* Memory, QP or CQ */
+	uint16_t cq_pages;
+	uint16_t rq_pages;
+	uint16_t sq_pages;
+};
+
+// we won't have an ep in the kernel API and won't have this memory tracking,
+// so just use EP_NONE
+#define my_calloc(nmemb, size) (psmi_calloc(PSMI_EP_NONE, PEER_RNDV, (nmemb), (size)))
+//#define my_calloc(nmemb, size) (psmi_calloc(PSMI_EP_NONE, NETWORK_BUFFERS, (nmemb), (size)))
+#define my_free(p) (psmi_free(p))
+
+#ifdef PSM_CUDA
+static int gpu_pin_check;	// PSM3_GPU_PIN_CHECK
+static uint64_t *gpu_bars;
+static int num_gpu_bars = 0;
+static uint64_t min_gpu_bar_size;
+
+// The second BAR address is where the GPU will map GPUDirect memory.
+// The beginning of this BAR is reserved for non-GPUDirect uses.
+// However, it has been observed that in some multi-process
+// pinning failures (HED-2035), nvidia_p2p_get_pages can foul up
+// its IOMMU, after which the next successful pin will incorrectly
+// return the 1st physical address of the BAR for the pinned pages.
+// In this case it will report this same physical address for other GPU virtual
+// addresses and cause RDMA to use the wrong memory.
+// As a workaround, we gather the Region 1 BAR address start for each
+// GPU and if we see this address returned as the phys_addr of a mmapped
+// GPUDirect Copy or the iova of a GPU MR we fail the job before it can
+// corrupt any more application data.
+static uint64_t get_nvidia_bar_addr(int domain, int bus, int slot)
+{
+	char sysfs[100];
+	int ret;
+	FILE *f;
+	unsigned long long start_addr, end_addr, bar_size;
+
+	ret = snprintf(sysfs, sizeof(sysfs),
+		"/sys/class/pci_bus/%04x:%02x/device/%04x:%02x:%02x.0/resource",
+		domain, bus, domain, bus, slot);
+	psmi_assert_always(ret < sizeof(sysfs));
+	f = fopen(sysfs, "r");
+	if (! f) {
+		if (gpu_pin_check) {
+			_HFI_ERROR("Unable to open %s for GPU BAR Address: %s\n",
+				sysfs, strerror(errno));
+			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+				"Unable to get GPU BAR address\n");
+		}
+		return 0;
+	}
+	// for each BAR region, start, end and flags are listed in hex
+	// nVidia uses the 2nd BAR region (aka Region #1) to map peer to peer
+	// accesses into its potentially larger GPU local memory space
+	ret = fscanf(f, "%*x %*x %*x %llx %llx", &start_addr, &end_addr);
+	if (ret != 2) {
+		if (gpu_pin_check) {
+			_HFI_ERROR("Unable to get GPU BAR Address from %s: %s\n",
+				sysfs, strerror(errno));
+			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+				"Unable to get GPU BAR address\n");
+		}
+		fclose(f);
+		return 0;
+	}
+	fclose(f);
+
+	bar_size = (end_addr - start_addr) + 1;
+	_HFI_DBG("GPU BAR Addr from %s is 0x%llx - 0x%llx (size 0x%llx)\n", sysfs, start_addr, end_addr, bar_size);
+	if (! min_gpu_bar_size || bar_size < min_gpu_bar_size)
+		min_gpu_bar_size = bar_size;
+	return start_addr;
+}
+
+void psm2_get_gpu_bars(void)
+{
+	int num_devices, dev;
+	union psmi_envvar_val env;
+
+	psmi_getenv("PSM3_GPU_PIN_CHECK",
+			"Enable sanity check of physical addresses mapped into GPU BAR space (Enabled by default)",
+			PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+			(union psmi_envvar_val)1, &env);
+	gpu_pin_check = env.e_int;
+
+	PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices);
+	gpu_bars = psmi_calloc(PSMI_EP_NONE, UNDEFINED, num_devices, sizeof(gpu_bars[0]));
+	if (! gpu_bars)
+		return;	// psmi_calloc will have exited for Out of Memory
+
+	if (gpu_pin_check)
+		num_gpu_bars = num_devices;
+
+	for (dev = 0; dev < num_devices; dev++) {
+		CUdevice device;
+		int domain, bus, slot;
+
+		PSMI_CUDA_CALL(cuDeviceGet, &device, dev);
+		PSMI_CUDA_CALL(cuDeviceGetAttribute,
+				&domain,
+				CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID,
+				device);
+		PSMI_CUDA_CALL(cuDeviceGetAttribute,
+				&bus,
+				CU_DEVICE_ATTRIBUTE_PCI_BUS_ID,
+				device);
+		PSMI_CUDA_CALL(cuDeviceGetAttribute,
+				&slot,
+				CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID,
+				device);
+		gpu_bars[dev] = get_nvidia_bar_addr(domain, bus, slot);
+	}
+}
+
+static psm2_error_t psm2_check_phys_addr(uint64_t phys_addr)
+{
+	int i;
+	for (i=0; i < num_gpu_bars; i++) {
+		if (phys_addr == gpu_bars[i]) {
+			_HFI_ERROR("Incorrect Physical Address (0x%"PRIx64") returned by nVidia driver.  PSM3 exiting to avoid data corruption.  Job may be rerun with PSM3_GPUDIRECT=0 to avoid this issue.\n",
+				phys_addr);
+			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+				"Incorrect Physical Address returned by nVidia driver\n");
+			psmi_assert_always(0);
+			return PSM2_INTERNAL_ERR;
+		}
+	}
+	return PSM2_OK;
+}
+
+uint64_t __psm2_min_gpu_bar_size(void)
+{
+	return min_gpu_bar_size;
+}
+#endif
+
+static int rv_map_event_ring(psm2_rv_t rv, struct rv_event_ring* ring,
+				int entries, int offset)
+{
+#ifdef RV_RING_ALLOC_LEN
+	ring->len = RV_RING_ALLOC_LEN(entries);
+#else /* older version of RV header */
+	ring->len = RING_ALLOC_LEN(entries);
+#endif
+
+	//printf("Calling mmap for offset: %d len:%d\n", offset, ring->len);
+
+	ring->hdr = (struct rv_ring_header *)mmap(0, ring->len,
+						PROT_READ | PROT_WRITE,
+						MAP_SHARED | MAP_POPULATE | MAP_LOCKED,
+						rv->fd, offset);
+	if (ring->hdr == MAP_FAILED) {	// mmap reports failure as MAP_FAILED, not NULL
+		ring->hdr = NULL;
+		ring->len = 0;
+		return -1;
+	}
+	ring->num = entries;
+	return 0;
+}
+
+static void rv_unmap_event_ring(psm2_rv_t rv, struct rv_event_ring* ring)
+{
+	if (ring->hdr)
+		if(munmap(ring->hdr, ring->len))
+			_HFI_ERROR("rv munmap event ring failed:%s (%d)\n", strerror(errno),errno);
+	ring->hdr = NULL;
+	ring->len = 0;
+	ring->num = 0;
+}
+
+// we call this once per ep (e.g. NIC) so we supply the local address
+// of our NIC for use in the IB CM bind, especially for Ethernet
+psm2_rv_t __psm2_rv_open(const char *devname, struct local_info *loc_info)
+{
+	psm2_rv_t rv = NULL;
+	struct rv_attach_params aparams;
+	struct rv_query_params_out qparams;
+	int ret;
+	int save_errno;
+
+	loc_info->capability = 0;
+	rv = (psm2_rv_t)my_calloc(1, sizeof(struct psm2_rv));
+	if (! rv) {
+		save_errno = ENOMEM;
+		goto fail;
+	}
+	//printf("XXXX 0x%lx %s\n", pthread_self(), __FUNCTION__);
+	rv->fd = open(RV_FILE_NAME, O_RDWR);
+	if (rv->fd == -1) {
+		save_errno = errno;
+		_HFI_ERROR("fd open failed %s: %s\n", RV_FILE_NAME, strerror(errno));
+		goto fail;
+	}
+
+	if ((ret = ioctl(rv->fd, RV_IOCTL_QUERY, &qparams)) != 0) {
+		save_errno = errno;
+		_HFI_ERROR("rv query ioctl failed ret:%s (%d)\n", strerror(errno), ret);
+		goto fail;
+	}
+	loc_info->major_rev = qparams.major_rev;
+	loc_info->minor_rev = qparams.minor_rev;
+	loc_info->capability = qparams.capability;
+
+#ifdef PSM_CUDA
+	loc_info->gpu_major_rev = qparams.gpu_major_rev;
+	loc_info->gpu_minor_rev = qparams.gpu_minor_rev;
+	if (loc_info->rdma_mode & RV_RDMA_MODE_GPU) {
+		if (!(qparams.capability & RV_CAP_GPU_DIRECT)) {
+			// caller will warn and avoid GPUDirect use
+			loc_info->rdma_mode &= ~(RV_RDMA_MODE_GPU|RV_RDMA_MODE_UPSIZE_GPU);
+		}
+		if (!(qparams.capability & RV_CAP_EVICT)) {
+			save_errno = ENOTSUP;
+			_HFI_ERROR("Error: rv lacks EVICT ioctl, needed for GPU Support\n");
+			goto fail;
+		}
+	}
+#endif
+	if ((loc_info->rdma_mode & RV_RDMA_MODE_MASK) == RV_RDMA_MODE_USER
+		&& !(qparams.capability & RV_CAP_USER_MR)) {
+		save_errno = ENOTSUP;
+		_HFI_ERROR("Error: rv lacks enable_user_mr capability\n");
+		goto fail;
+	}
+
+	memset(&aparams, 0, sizeof(aparams));
+	snprintf(aparams.in.dev_name, RV_MAX_DEV_NAME_LEN, "%s", devname);
+	aparams.in.mr_cache_size = loc_info->mr_cache_size;
+#ifdef PSM_CUDA
+	aparams.in.gpu_cache_size = loc_info->gpu_cache_size;
+#endif
+	aparams.in.rdma_mode = loc_info->rdma_mode;
+	aparams.in.port_num = loc_info->port_num;
+	aparams.in.num_conn = loc_info->num_conn;
+	aparams.in.loc_addr = loc_info->loc_addr;
+	aparams.in.index_bits = loc_info->index_bits;
+	aparams.in.loc_gid_index = loc_info->loc_gid_index;
+	memcpy(&aparams.in.loc_gid, &loc_info->loc_gid, sizeof(aparams.in.loc_gid));
+
+	if (loc_info->job_key_len > sizeof(aparams.in.job_key)) {
+		save_errno = EINVAL;
+		_HFI_ERROR("Error: job_key_len too long\n");
+		goto fail;
+	}
+	aparams.in.job_key_len = loc_info->job_key_len;
+	memcpy(&aparams.in.job_key, loc_info->job_key, loc_info->job_key_len);
+	// if 0 is specified, the kernel will pick a value for all jobs;
+	// otherwise PSM can specify a job-specific value, which must be the
+	// same in all processes of a given job
+	// ok if multiple PSM processes in different jobs all funnel
+	// through the same listener service id, as the job_key will differentiate them
+	aparams.in.service_id = loc_info->service_id;
+	aparams.in.context = (uint64_t)loc_info->context;
+	aparams.in.cq_entries = loc_info->cq_entries;
+	aparams.in.q_depth = loc_info->q_depth;
+	aparams.in.reconnect_timeout = loc_info->reconnect_timeout;
+	aparams.in.hb_interval = loc_info->hb_interval;
+
+	if ((ret = ioctl(rv->fd, RV_IOCTL_ATTACH, &aparams)) != 0) {
+		save_errno = errno;
+		_HFI_ERROR("rv attach ioctl failed (mode 0x%x) ret:%s (%d)\n", loc_info->rdma_mode, strerror(errno), ret);
+		goto fail;
+	}
+
+#ifdef PSM_CUDA
+	if (loc_info->rdma_mode & RV_RDMA_MODE_GPU) {
+		loc_info->rv_index = aparams.out_gpu.rv_index;
+		loc_info->mr_cache_size = aparams.out_gpu.mr_cache_size;
+		loc_info->q_depth = aparams.out_gpu.q_depth;
+		loc_info->reconnect_timeout = aparams.out_gpu.reconnect_timeout;
+		loc_info->gpu_cache_size = aparams.out_gpu.gpu_cache_size;
+	} else {
+#endif
+		loc_info->rv_index = aparams.out.rv_index;
+		loc_info->mr_cache_size = aparams.out.mr_cache_size;
+		loc_info->q_depth = aparams.out.q_depth;
+		loc_info->reconnect_timeout = aparams.out.reconnect_timeout;
+#ifdef PSM_CUDA
+	}
+#endif
+
+	//printf("XXXX 0x%lx %s fd:%d\n", pthread_self(), __FUNCTION__, rv->fd);
+	if (loc_info->cq_entries) {
+		if (rv_map_event_ring(rv, &rv->events, loc_info->cq_entries, 0)) {
+			save_errno = errno;
+			_HFI_ERROR("rv mmap event ring failed:%s (%d)\n", strerror(errno), errno);
+			goto fail;
+		}
+	}
+
+	return rv;
+fail:
+	if (rv) {
+		(void)__psm2_rv_close(rv);
+	}
+	errno = save_errno;
+	return NULL;
+}
+
+// 0 on success
+// -1 if rv invalid or not open and errno set
+int __psm2_rv_close(psm2_rv_t rv)
+{
+	if (! rv) {
+		errno = EINVAL;
+		return -1;
+	}
+	//printf("XXXX 0x%lx %s fd:%d\n", pthread_self(), __FUNCTION__, rv->fd);
+	rv_unmap_event_ring(rv, &rv->events);
+#if 0
+	if ((ret = ioctl(rv->fd, RV_IOCTL_DETACH, NULL)) != 0) {
+		perror("close failed\n");
+	}
+#endif
+	if (rv->fd != -1) {
+		close(rv->fd);
+	}
+
+	my_free(rv);
+	return 0;
+}
+
+int __psm2_rv_get_cache_stats(psm2_rv_t rv, struct psm2_rv_cache_stats *stats)
+{
+	struct rv_cache_stats_params_out sparams;
+	int ret;
+	int save_errno;
+
+	memset(&sparams, 0, sizeof(sparams));
+	if ((ret = ioctl(rv->fd, RV_IOCTL_GET_CACHE_STATS, &sparams)) != 0) {
+		save_errno = errno;
+		_HFI_ERROR("rv get_cache_stats failed ret:%d: %s\n", ret, strerror(errno));
+		goto fail;
+	}
+	stats->cache_size = sparams.cache_size;
+	stats->max_cache_size = sparams.max_cache_size;
+	stats->limit_cache_size = sparams.limit_cache_size;
+	stats->count = sparams.count;
+	stats->max_count = sparams.max_count;
+	stats->inuse = sparams.inuse;
+	stats->max_inuse = sparams.max_inuse;
+	stats->inuse_bytes = sparams.inuse_bytes;
+	stats->max_inuse_bytes = sparams.max_inuse_bytes;
+	stats->max_refcount = sparams.max_refcount;
+	stats->hit = sparams.hit;
+	stats->miss = sparams.miss;
+	stats->full = sparams.full;
+	stats->failed = sparams.failed;
+	stats->remove = sparams.remove;
+	stats->evict = sparams.evict;
+	return 0;
+fail:
+	errno = save_errno;
+	return -1;
+}
+
+#ifdef PSM_CUDA
+int __psm2_rv_gpu_get_cache_stats(psm2_rv_t rv, struct psm2_rv_gpu_cache_stats *stats)
+{
+	struct rv_gpu_cache_stats_params_out sparams;
+	int ret;
+	int save_errno;
+
+	memset(&sparams, 0, sizeof(sparams));
+	if ((ret = ioctl(rv->fd, RV_IOCTL_GPU_GET_CACHE_STATS, &sparams)) != 0) {
+		save_errno = errno;
+		_HFI_ERROR("rv gpu_get_cache_stats failed ret:%d: %s\n", ret, strerror(errno));
+		goto fail;
+	}
+	stats->cache_size = sparams.cache_size;
+	stats->cache_size_reg = sparams.cache_size_reg;
+	stats->cache_size_mmap = sparams.cache_size_mmap;
+	stats->cache_size_both = sparams.cache_size_both;
+	stats->max_cache_size = sparams.max_cache_size;
+	stats->max_cache_size_reg = sparams.max_cache_size_reg;
+	stats->max_cache_size_mmap = sparams.max_cache_size_mmap;
+	stats->max_cache_size_both = sparams.max_cache_size_both;
+	stats->limit_cache_size = sparams.limit_cache_size;
+	stats->count = sparams.count;
+	stats->count_reg = sparams.count_reg;
+	stats->count_mmap = sparams.count_mmap;
+	stats->count_both = sparams.count_both;
+	stats->max_count = sparams.max_count;
+	stats->max_count_reg = sparams.max_count_reg;
+	stats->max_count_mmap = sparams.max_count_mmap;
+	stats->max_count_both = sparams.max_count_both;
+	stats->inuse = sparams.inuse;
+	stats->inuse_reg = sparams.inuse_reg;
+	stats->inuse_mmap = sparams.inuse_mmap;
+	stats->inuse_both = sparams.inuse_both;
+	stats->max_inuse = sparams.max_inuse;
+	stats->max_inuse_reg = sparams.max_inuse_reg;
+	stats->max_inuse_mmap = sparams.max_inuse_mmap;
+	stats->max_inuse_both = sparams.max_inuse_both;
+	stats->max_refcount = sparams.max_refcount;
+	stats->max_refcount_reg = sparams.max_refcount_reg;
+	stats->max_refcount_mmap = sparams.max_refcount_mmap;
+	stats->max_refcount_both = sparams.max_refcount_both;
+	stats->inuse_bytes = sparams.inuse_bytes;
+	stats->inuse_bytes_reg = sparams.inuse_bytes_reg;
+	stats->inuse_bytes_mmap = sparams.inuse_bytes_mmap;
+	stats->inuse_bytes_both = sparams.inuse_bytes_both;
+	stats->max_inuse_bytes = sparams.max_inuse_bytes;
+	stats->max_inuse_bytes_reg = sparams.max_inuse_bytes_reg;
+	stats->max_inuse_bytes_mmap = sparams.max_inuse_bytes_mmap;
+	stats->max_inuse_bytes_both = sparams.max_inuse_bytes_both;
+	stats->hit = sparams.hit;
+	stats->hit_reg = sparams.hit_reg;
+	stats->hit_add_reg = sparams.hit_add_reg;
+	stats->hit_mmap = sparams.hit_mmap;
+	stats->hit_add_mmap = sparams.hit_add_mmap;
+	stats->miss = sparams.miss;
+	stats->miss_reg = sparams.miss_reg;
+	stats->miss_mmap = sparams.miss_mmap;
+	stats->full = sparams.full;
+	stats->full_reg = sparams.full_reg;
+	stats->full_mmap = sparams.full_mmap;
+	stats->failed_pin = sparams.failed_pin;
+	stats->failed_reg = sparams.failed_reg;
+	stats->failed_mmap = sparams.failed_mmap;
+	stats->remove = sparams.remove;
+	stats->remove_reg = sparams.remove_reg;
+	stats->remove_mmap = sparams.remove_mmap;
+	stats->remove_both = sparams.remove_both;
+	stats->evict = sparams.evict;
+	stats->evict_reg = sparams.evict_reg;
+	stats->evict_mmap = sparams.evict_mmap;
+	stats->evict_both = sparams.evict_both;
+	stats->inval_mr = sparams.inval_mr;
+	stats->post_write = sparams.post_write;
+	stats->post_write_bytes = sparams.post_write_bytes;
+	stats->gpu_post_write = sparams.gpu_post_write;
+	stats->gpu_post_write_bytes = sparams.gpu_post_write_bytes;
+	return 0;
+fail:
+	errno = save_errno;
+	return -1;
+}
+#endif
+
+// we have a little dance here to hide the RV connect REQ and RSP from PSM
+// without needing a callback into PSM.
+// We do this by creating the rv_conn object with the remote addressing
+// information before any connection activity.
+// PSM has its own connection REQ/RSP which will occur.
+// By creating the rv_conn object before PSM sends its REQ or RSP and not
+// starting the rv connection process until PSM is about to send a PSM RSP
+// (or receives a PSM RSP), we ensure that both sides
+// will have a rv_conn ready by the time RV's CM REQ arrives.
+// Inbound RV CM REQs can compare the REQ against expected remote addresses
+// and match the proper one.
+
+// For RV at the kernel level, we only need connections at the node level.
+// In the kernel a single rv_conn will be created per remote NIC; these
+// rv_conn will be shared among multiple PSM processes.  So they can be
+// identified by remote addr alone.
+//
+// For kernel RV, the REQ/RSP also needs to include the
+// job_key.  RC QPs are not shared across jobs.  Kernel RV will use the
+// job_key to select the proper set of rv and rv_conn objects.  If none are
+// found the connection is rejected (or discarded?  Which is better for
+// Denial of service protection?).
+
+// We implement a simple peer-to-peer connect model here, and
+// the listener side will also create a conn for inbound connect REQs;
+// thus PSM must call this function for both sides of a connection.
+// We will compare rem_addr against our local address (already
+// set in rv_open) to decide which side is the passive vs active side of IB CM
+// connection establishment.
+// See description above for more info on connection model
+psm2_rv_conn_t __psm2_rv_create_conn(psm2_rv_t rv,
+				     struct ibv_ah_attr *ah_attr, // for remote node
+					 uint32_t rem_addr)  // for simple compare to loc_addr
+{
+	psm2_rv_conn_t conn = NULL;
+	struct rv_conn_create_params param;
+	int save_errno;
+
+	conn = (psm2_rv_conn_t)my_calloc(1, sizeof(struct psm2_rv_conn));
+	if (! conn) {
+		save_errno = ENOMEM;
+		goto fail;
+	}
+	conn->rv = rv;
+	// call kernel, kernel will save off this info, will have a single
+	// shared rv_conn for all processes talking to a given remote node
+	// NO IB CM activity here, just save info in prep for rv_connect
+	// TBD, do we need rem_addr argument?  It can be figured out from
+	// ah_attr: for IB use dlid, for eth use low 32 bits of gid
+	// TBD should we specify PKey here for an additional inbound check
+	memset(&param, 0, sizeof(param));
+	memcpy(&param.in.ah, ah_attr, sizeof(param.in.ah));
+	param.in.rem_addr = rem_addr;
+	// while a user context could be supplied here that turns out to be
+	// expensive as the kernel must either search to find the right
+	// rv_user and rv_user_conn or the kernel must keep an array of 2^index_bits
+	// rv_user_conn pointers to find the right conn_context to supply in
+	// recv CQEs.  Given PSM is only using conn_context as a sanity check,
+	// we can have the CQE contain the rv_conn handle instead and eliminate
+	// the need for a kernel rv_user_conn altogether
+
+	if (ioctl(rv->fd, RV_IOCTL_CONN_CREATE, &param)) {
+		save_errno = errno;
+		goto fail;
+	}
+
+	/* Copy the params to conn for connection use */
+	conn->handle = param.out.handle;
+	conn->conn_handle = param.out.conn_handle;
+	return conn;
+
+fail:
+	if (conn)
+		my_free(conn);
+	errno = save_errno;
+	return NULL;
+}
+
+int __psm2_rv_connect(psm2_rv_conn_t conn, const struct ib_user_path_rec *path)
+{
+	struct rv_conn_connect_params_in param;
+	int ret;
+
+	if (!conn) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	// kernel will:
+	// 	compare conn->rem_addr and rv->loc_addr to pick passive and active side
+	// 	active side will start the IB CM connection (and return immediately)
+	// 	passive side will ensure listener is started on 1st create_conn for
+	// 	a given NIC
+	//
+	// 	on the listener, as inbound connections arrive their job_key directs
+	// 	them to the proper node level rv (shared by all local process rv_open
+	// 	with same job_key). (reject or ignore if no rv's match job key)
+	// 	The proper rv then compares the remote address and other info from
+	// 	CM REQ against conn->ah_attr to confirm it is coming from a node we
+	// 	expect to be part of the job; reject (or ignore) unmatched REQs
+	// 	(note ah_attr is a superset of rem_addr, so can just compare ah_attr)
+	// 	but note that ah_attr format is a little different for IB vs Eth
+	// 	Eth uses GID to hold IP address while IB will use LID
+	// 	TBD what we will enforce regarding SL, pkey, etc. for Eth;
+	// 	for IB/OPA they should match
+	// 	The loc_gid and dgid are available for use by the active side to
+	// 	satisfy IB CM.  The passive side can ignore these and simply use
+	// 	ah_attr to verify incoming connections.  Note on the passive side
+	// 	an incoming connection can arrive before this call, so it may not have
+	// 	the dgid available when the inbound connect request arrives.
+	//
+	// 	in either case, the connection process continues in background in
+	// 	kernel and PSM can poll for rv_connected to determine when it is done
+	//
+	// 	kernel will concurrently make progress on multiple connections
+	// 	active side may have a limit on how many it starts at once and may
+	// 	progress through the needed connections in "clumps"
+	// 	all connections are at node to node level and shared by all
+	// 	processes within the given job.
+	//
+	//return 0 on success, -1 w/errno on error
+	memset(&param, 0, sizeof(param));
+	param.handle = conn->handle;
+	memcpy(&param.path, path, sizeof(param.path));
+	ret = ioctl(conn->rv->fd, RV_IOCTL_CONN_CONNECT, &param);
+	if (ret)
+		conn->handle = 0;	// invalid handle, rv has freed uconn
+	return ret;
+}
+
+int __psm2_rv_connected(psm2_rv_conn_t conn)
+{
+	struct rv_conn_connected_params_in param;
+
+	if (! conn) {
+		errno = EINVAL;
+		return -1;
+	}
+	// verify if conn is now fully established
+	// 0=no
+	// 1=yes
+	// -1=error and errno set
+	memset(&param, 0, sizeof(param));
+	param.handle = conn->handle;
+	return ioctl(conn->rv->fd, RV_IOCTL_CONN_CONNECTED, &param);
+}
+
+// get connection count for specified sconn index within given conn
+// the count is incremented each time a successful (re)connection occurs
+// The advancement of the count can be used as a barrier to indicate
+// all transactions related to a previous QP prior to recovery are done
+// and drained.
+// returns -1 with EIO if connection cannot be recovered
+// returns 0 with latest conn_count if connected or being recovered
+int __psm2_rv_get_conn_count(psm2_rv_t rv, psm2_rv_conn_t conn,
+				uint8_t index, uint32_t *count)
+{
+	struct rv_conn_get_conn_count_params params;
+	int ret;
+	int save_errno;
+
+	memset(&params, 0, sizeof(params));
+	if (conn)
+		params.in.handle = conn->handle;
+	params.in.index = index;
+
+	if ((ret = ioctl(rv->fd, RV_IOCTL_CONN_GET_CONN_COUNT, &params)) != 0) {
+		save_errno = errno;
+		_HFI_ERROR("rv get_conn_count failed ret:%d: %s\n", ret, strerror(errno));
+		goto fail;
+	}
+	*count = params.out.count;
+	return 0;
+fail:
+	errno = save_errno;
+	return -1;
+}
+
+int __psm2_rv_get_conn_stats(psm2_rv_t rv, psm2_rv_conn_t conn,
+				uint8_t index, struct psm2_rv_conn_stats *stats)
+{
+	struct rv_conn_get_stats_params sparams;
+	int ret;
+	int save_errno;
+
+	memset(&sparams, 0, sizeof(sparams));
+	if (conn)
+		sparams.in.handle = conn->handle;
+	sparams.in.index = index;
+	if ((ret = ioctl(rv->fd, RV_IOCTL_CONN_GET_STATS, &sparams)) != 0) {
+		save_errno = errno;
+		_HFI_ERROR("rv get_conn_stats failed ret:%d: %s\n", ret, strerror(errno));
+		goto fail;
+	}
+	stats->index = sparams.out.index;
+	stats->flags = sparams.out.flags;
+	stats->num_conn = sparams.out.num_conn;
+
+	stats->req_error = sparams.out.req_error;
+	stats->req_recv = sparams.out.req_recv;
+	stats->rep_error = sparams.out.rep_error;
+	stats->rep_recv = sparams.out.rep_recv;
+	stats->rtu_recv = sparams.out.rtu_recv;
+	stats->established = sparams.out.established;
+	stats->dreq_error = sparams.out.dreq_error;
+	stats->dreq_recv = sparams.out.dreq_recv;
+	stats->drep_recv = sparams.out.drep_recv;
+	stats->timewait = sparams.out.timewait;
+	stats->mra_recv = sparams.out.mra_recv;
+	stats->rej_recv = sparams.out.rej_recv;
+	stats->lap_error = sparams.out.lap_error;
+	stats->lap_recv = sparams.out.lap_recv;
+	stats->apr_recv = sparams.out.apr_recv;
+	stats->unexp_event = sparams.out.unexp_event;
+	stats->req_sent = sparams.out.req_sent;
+	stats->rep_sent = sparams.out.rep_sent;
+	stats->rtu_sent = sparams.out.rtu_sent;
+	stats->rej_sent = sparams.out.rej_sent;
+	stats->dreq_sent = sparams.out.dreq_sent;
+	stats->drep_sent = sparams.out.drep_sent;
+	stats->wait_time = sparams.out.wait_time;
+	stats->resolve_time = sparams.out.resolve_time;
+	stats->connect_time = sparams.out.connect_time;
+	stats->connected_time = sparams.out.connected_time;
+	stats->resolve = sparams.out.resolve;
+	stats->resolve_fail = sparams.out.resolve_fail;
+	stats->conn_recovery = sparams.out.conn_recovery;
+	stats->rewait_time = sparams.out.rewait_time;
+	stats->reresolve_time = sparams.out.reresolve_time;
+	stats->reconnect_time = sparams.out.reconnect_time;
+	stats->max_rewait_time = sparams.out.max_rewait_time;
+	stats->max_reresolve_time = sparams.out.max_reresolve_time;
+	stats->max_reconnect_time = sparams.out.max_reconnect_time;
+	stats->reresolve = sparams.out.reresolve;
+	stats->reresolve_fail = sparams.out.reresolve_fail;
+
+	stats->post_write = sparams.out.post_write;
+	stats->post_write_fail = sparams.out.post_write_fail;
+	stats->post_write_bytes = sparams.out.post_write_bytes;
+	stats->outstand_send_write = sparams.out.outstand_send_write;
+	stats->send_write_cqe = sparams.out.send_write_cqe;
+	stats->send_write_cqe_fail = sparams.out.send_write_cqe_fail;
+
+	stats->recv_write_cqe = sparams.out.recv_write_cqe;
+	stats->recv_write_bytes = sparams.out.recv_write_bytes;
+	stats->recv_cqe_fail = sparams.out.recv_cqe_fail;
+
+	stats->post_hb = sparams.out.post_hb;
+	stats->post_hb_fail = sparams.out.post_hb_fail;
+	stats->send_hb_cqe = sparams.out.send_hb_cqe;
+	stats->send_hb_cqe_fail = sparams.out.send_hb_cqe_fail;
+	stats->recv_hb_cqe = sparams.out.recv_hb_cqe;
+	return 0;
+fail:
+	errno = save_errno;
+	return -1;
+}
+
+int __psm2_rv_get_event_stats(psm2_rv_t rv, struct psm2_rv_event_stats *stats)
+{
+	struct rv_event_stats_params_out sparams;
+	int ret;
+	int save_errno;
+
+	memset(&sparams, 0, sizeof(sparams));
+	if ((ret = ioctl(rv->fd, RV_IOCTL_GET_EVENT_STATS, &sparams)) != 0) {
+		save_errno = errno;
+		_HFI_ERROR("rv get_event_stats failed ret:%d: %s\n", ret, strerror(errno));
+		goto fail;
+	}
+	stats->send_write_cqe = sparams.send_write_cqe;
+	stats->send_write_cqe_fail = sparams.send_write_cqe_fail;
+	stats->send_write_bytes = sparams.send_write_bytes;
+
+	stats->recv_write_cqe = sparams.recv_write_cqe;
+	stats->recv_write_cqe_fail = sparams.recv_write_cqe_fail;
+	stats->recv_write_bytes = sparams.recv_write_bytes;
+	return 0;
+fail:
+	errno = save_errno;
+	return -1;
+}
+
+int __psm2_rv_disconnect(psm2_rv_conn_t conn)
+{
+	if (! conn) {
+		errno = EINVAL;
+		return -1;
+	}
+	// reduce reference count on kernel connection.
+	// When reference count hits 0, kernel can start IB CM disconnection
+	// said disconnect process may continue on past when the processes exit
+	// TBD - if PSM should wait for disconnect to finish, especially after
+	// final disconnect is called. - assume NO
+	// start disconnection
+	// return 0 on success
+	// return -1 and errno on error
+	// once disconnected an event will occur with id from original conn req
+	return 0;
+}
+
+void __psm2_rv_destroy_conn(psm2_rv_conn_t conn)
+{
+	if (! conn) {
+		// TBD - could have errno and return code here?
+		return;
+	}
+	//psm2_rv_t rv = conn->rv;
+	//TBD - tell kernel, it will cleanup and start disconnect if not already
+	//		started
+	//TBD - cleanup conn resources
+
+	my_free(conn);
+}
+
+psm2_rv_mr_t __psm2_rv_reg_mem(psm2_rv_t rv, int cmd_fd_int, struct ibv_pd *pd,
+				void *addr, uint64_t length, int access)
+{
+	psm2_rv_mr_t mr = NULL;
+	struct rv_mem_params mparams;
+	struct irdma_mem_reg_req req;
+	int save_errno;
+
+	if (!rv || (!pd && !(access & IBV_ACCESS_KERNEL))) {
+		save_errno = EINVAL;
+		goto fail;
+	}
+
+#ifdef PSM_CUDA
+#ifdef PSM_FI
+	if_pf((access & IBV_ACCESS_IS_GPU_ADDR) && PSMI_FAULTINJ_ENABLED()) {
+		PSMI_FAULTINJ_STATIC_DECL(fi_gpu_reg_mr, "gpu_reg_mr",
+					  "fail GPU reg_mr",
+					  1, IPS_FAULTINJ_GPU_REG_MR);
+		if_pf(PSMI_FAULTINJ_IS_FAULT(fi_gpu_reg_mr, "")) {
+			errno = ENOMEM;
+			return NULL;
+		}
+	}
+#endif
+#endif
+
+	mr = (psm2_rv_mr_t)my_calloc(1, sizeof(struct psm2_rv_mr));
+	if (! mr) {
+		save_errno = ENOMEM;
+		goto fail;
+	}
+
+	//printf("XXXX 0x%lx %s\n", pthread_self(), __FUNCTION__);
+	memset(&mparams, 0, sizeof(mparams));
+	if (pd)
+		mparams.in.ibv_pd_handle = pd->handle;
+	mparams.in.cmd_fd_int = cmd_fd_int;
+	mparams.in.addr = (uint64_t)addr;
+	mparams.in.length = length;
+	mparams.in.access = access;
+	memset(&req, 0, sizeof(req));
+	// driver specific data type
+	req.reg_type = IW_MEMREG_TYPE_MEM;
+	mparams.in.ulen = sizeof(req);
+	mparams.in.udata = &req;
+	if (ioctl(rv->fd, RV_IOCTL_REG_MEM, &mparams)) {
+		save_errno = errno;
+		goto fail;
+	}
+	mr->addr = (uint64_t)addr;
+	mr->length = length;
+	mr->access = access;
+	mr->handle = mparams.out.mr_handle;
+	mr->iova = mparams.out.iova;
+	mr->lkey = mparams.out.lkey;
+	mr->rkey = mparams.out.rkey;
+#ifdef PSM_CUDA
+	// mr must be filled in before this check so the error path can dereg it
+	if ((access & IBV_ACCESS_IS_GPU_ADDR)
+		&& PSM2_OK != psm2_check_phys_addr(mparams.out.iova)) {
+		(void)__psm2_rv_dereg_mem(rv, mr);
+		errno = EFAULT;
+		return NULL;
+	}
+#endif
+	//printf("XXXX 0x%lx %s pdh:0x%x cmd_fd_int:%d addr:0x%p len:%ld acc:0x%x lkey:0x%x rkey:0x%x mr:%d\n",
+	//	 pthread_self(), __FUNCTION__, pd->handle, cmd_fd_int, addr, length, access,
+	//	 mr->lkey, mr->rkey, mr->handle);
+
+	return mr;
+fail:
+	if (mr)
+		my_free(mr);
+	errno = save_errno;
+	return NULL;
+}
+
+int __psm2_rv_dereg_mem(psm2_rv_t rv, psm2_rv_mr_t mr)
+{
+	struct rv_dereg_params_in dparams;
+	int ret;
+
+	if (! rv || ! mr) {
+		errno = EINVAL;
+		return -1;
+	}
+	//printf("XXXX 0x%lx %s mr:%d\n", pthread_self(), __FUNCTION__, mr->handle);
+	dparams.mr_handle = mr->handle;
+	dparams.addr = mr->addr;
+	dparams.length = mr->length;
+	dparams.access = mr->access;
+	if ((ret = ioctl(rv->fd, RV_IOCTL_DEREG_MEM, &dparams)) != 0)
+		return ret;
+	my_free(mr);
+	return 0;
+}
+
+#ifdef PSM_CUDA
+
+void * __psm2_rv_pin_and_mmap(psm2_rv_t rv, uintptr_t pageaddr,
+				uint64_t pagelen, int access)
+{
+	struct rv_gpu_mem_params params;
+	int ret;
+
+#ifdef PSM_FI
+	if_pf(PSMI_FAULTINJ_ENABLED()) {
+		PSMI_FAULTINJ_STATIC_DECL(fi_gdrmmap, "gdrmmap",
+					  "fail GPU gdrcopy mmap",
+					  1, IPS_FAULTINJ_GDRMMAP);
+		if_pf(PSMI_FAULTINJ_IS_FAULT(fi_gdrmmap, "")) {
+			errno = ENOMEM;
+			return NULL;
+		}
+	}
+#endif
+
+	memset(&params, 0, sizeof(params));
+	params.in.gpu_buf_addr = pageaddr;
+	params.in.gpu_buf_size = pagelen;
+	params.in.access = access;
+
+	if ((ret = ioctl(rv->fd, RV_IOCTL_GPU_PIN_MMAP, &params)) != 0)
+		return NULL;
+
+	if (PSM2_OK != psm2_check_phys_addr(params.out.phys_addr)) {
+		(void)__psm2_rv_evict_exact(rv, (void*)pageaddr, pagelen, access);
+		errno = EFAULT;
+		return NULL;
+	}
+	// return mapped host address or NULL with errno set
+	return (void*)(uintptr_t)params.out.host_buf_addr;
+}
+#endif /* PSM_CUDA */
+
+// addr, length, access are what was used in a previous call to
+// __psm2_rv_reg_mem or __psm2_rv_pin_and_mmap
+// this will remove from the cache the matching entry if its
+// refcount is 0.  In the case of reg_mem, a matching call
+// to dereg_mem is required for this to be able to evict the entry
+// return number of bytes evicted (> 0) on success or -1 with errno
+// Reports ENOENT if entry not found in cache (may already be evicted)
+int64_t __psm2_rv_evict_exact(psm2_rv_t rv, void *addr, uint64_t length, int access)
+{
+#ifdef RV_IOCTL_EVICT
+	struct rv_evict_params params;
+	int ret;
+	int save_errno;
+
+	memset(&params, 0, sizeof(params));
+	params.in.type = RV_EVICT_TYPE_SEARCH_EXACT;
+	params.in.search.addr = (uint64_t)addr;
+	params.in.search.length = length;
+	params.in.search.access = access;
+
+	if ((ret = ioctl(rv->fd, RV_IOCTL_EVICT, &params)) != 0) {
+		if (errno != ENOENT) {
+			save_errno = errno;
+			perror("rv_evict_exact failed\n");
+			errno = save_errno;
+		}
+		return ret;
+	}
+
+	return params.out.bytes;
+#else
+	errno = EINVAL;
+	return -1;
+#endif
+}
+
+// this will remove from the cache all entries which include
+// addresses between addr and addr+length-1 inclusive if their
+// refcount is 0.  In the case of reg_mem, a matching call
+// to dereg_mem is required for this to be able to evict the entry
+// return number of bytes evicted (> 0) on success or -1 with errno
+// Reports ENOENT if no matching entries found in cache (may already be evicted)
+int64_t __psm2_rv_evict_range(psm2_rv_t rv, void *addr, uint64_t length)
+{
+#ifdef RV_IOCTL_EVICT
+	struct rv_evict_params params;
+	int ret;
+	int save_errno;
+
+	memset(&params, 0, sizeof(params));
+	params.in.type = RV_EVICT_TYPE_SEARCH_RANGE;
+	params.in.search.addr = (uint64_t)addr;
+	params.in.search.length = length;
+
+	if ((ret = ioctl(rv->fd, RV_IOCTL_EVICT, &params)) != 0) {
+		if (errno != ENOENT) {
+			save_errno = errno;
+			perror("rv_evict_range failed\n");
+			errno = save_errno;
+		}
+		return ret;
+	}
+
+	return params.out.bytes;
+#else
+	errno = EINVAL;
+	return -1;
+#endif
+}
+
+#ifdef PSM_CUDA
+// this will remove from the GPU cache all entries which include
+// addresses between addr and addr+length-1 inclusive if their
+// refcount is 0.  In the case of reg_mem, a matching call
+// to dereg_mem is required for this to be able to evict the entry
+// return number of bytes evicted (> 0) on success or -1 with errno
+// Reports ENOENT if no matching entries found in cache (may already be evicted)
+int64_t __psm2_rv_evict_gpu_range(psm2_rv_t rv, uintptr_t addr, uint64_t length)
+{
+#ifdef RV_IOCTL_EVICT
+	struct rv_evict_params params;
+	int ret;
+	int save_errno;
+
+	memset(&params, 0, sizeof(params));
+	params.in.type = RV_EVICT_TYPE_GPU_SEARCH_RANGE;
+	params.in.search.addr = addr;
+	params.in.search.length = length;
+
+	if ((ret = ioctl(rv->fd, RV_IOCTL_EVICT, &params)) != 0) {
+		if (errno != ENOENT) {
+			save_errno = errno;
+			perror("rv_evict_gpu_range failed\n");
+			errno = save_errno;
+		}
+		return ret;
+	}
+
+	return params.out.bytes;
+#else
+	errno = EINVAL;
+	return -1;
+#endif
+}
+#endif // PSM_CUDA
+
+// this will remove from the cache up to the amount specified
+// Only entries with a refcount of 0 are removed.
+// In the case of reg_mem, a matching call
+// to dereg_mem is required for this to be able to evict the entry
+// return number of bytes evicted (> 0) on success or -1 with errno
+// Reports ENOENT if no entries could be evicted
+int64_t __psm2_rv_evict_amount(psm2_rv_t rv, uint64_t bytes, uint32_t count)
+{
+#ifdef RV_IOCTL_EVICT
+	struct rv_evict_params params;
+	int ret;
+	int save_errno;
+
+	memset(&params, 0, sizeof(params));
+	params.in.type = RV_EVICT_TYPE_AMOUNT;
+	params.in.amount.bytes = bytes;
+	params.in.amount.count = count;
+
+	if ((ret = ioctl(rv->fd, RV_IOCTL_EVICT, &params)) != 0) {
+		if (errno != ENOENT) {
+			save_errno = errno;
+			perror("rv_evict_amount failed\n");
+			errno = save_errno;
+		}
+		return ret;
+	}
+
+	return params.out.bytes;
+#else
+	errno = EINVAL;
+	return -1;
+#endif
+}
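+
+/*
+ * Illustrative sketch (not part of the original source): per the comments
+ * above, a caller that cannot register more memory might free cache space
+ * and retry; bytes_needed and the count limit of 8 are hypothetical:
+ *
+ *	if (__psm2_rv_evict_amount(rv, bytes_needed, 8) < 0
+ *	    && errno != ENOENT)
+ *		_HFI_ERROR("evict failed: %s\n", strerror(errno));
+ */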
+
+#ifdef PSM_CUDA
+// this will remove from the GPU cache up to the amount specified
+// Only entries with a refcount of 0 are removed.
+// In the case of reg_mem, a matching call
+// to dereg_mem is required for this to be able to evict the entry
+// return number of bytes evicted (> 0) on success or -1 with errno
+// Reports ENOENT if no entries could be evicted
+int64_t __psm2_rv_evict_gpu_amount(psm2_rv_t rv, uint64_t bytes, uint32_t count)
+{
+#ifdef RV_IOCTL_EVICT
+	struct rv_evict_params params;
+	int ret;
+	int save_errno;
+
+	memset(&params, 0, sizeof(params));
+	params.in.type = RV_EVICT_TYPE_GPU_AMOUNT;
+	params.in.amount.bytes = bytes;
+	params.in.amount.count = count;
+
+	if ((ret = ioctl(rv->fd, RV_IOCTL_EVICT, &params)) != 0) {
+		if (errno != ENOENT) {
+			save_errno = errno;
+			perror("rv_evict_gpu_amount failed\n");
+			errno = save_errno;
+		}
+		return ret;
+	}
+
+	return params.out.bytes;
+#else
+	errno = EINVAL;
+	return -1;
+#endif
+}
+#endif // PSM_CUDA
+
+int __psm2_rv_post_rdma_write_immed(psm2_rv_t rv, psm2_rv_conn_t conn,
+				void *loc_buf, psm2_rv_mr_t loc_mr,
+				uint64_t rem_buf, uint32_t rkey,
+				uint64_t len, uint32_t immed, uint64_t wr_id,
+				uint8_t *sconn_index, uint32_t *conn_count)
+{
+	struct rv_post_write_params pparams;
+	int ret;
+
+	if (! rv || ! conn || ! loc_buf || ! loc_mr || ! rem_buf) {
+		errno = EINVAL;
+		return -1;
+	}
+	//printf("XXXX 0x%lx %s\n", pthread_self(), __FUNCTION__);
+	memset(&pparams, 0, sizeof(pparams));
+	pparams.in.handle = conn->handle;
+	pparams.in.loc_addr = (uint64_t)loc_buf;
+	pparams.in.loc_mr_handle = loc_mr->handle;
+	pparams.in.loc_mr_addr = loc_mr->addr;
+	pparams.in.loc_mr_length = loc_mr->length;
+	pparams.in.loc_mr_access = loc_mr->access;
+	pparams.in.rem_addr = rem_buf;
+	pparams.in.rkey = rkey;
+	pparams.in.length = len;
+	pparams.in.immed = immed;
+	pparams.in.wr_id = wr_id;
+	ret = ioctl(rv->fd, RV_IOCTL_POST_RDMA_WR_IMMED, &pparams);
+	*sconn_index = pparams.out.sconn_index;
+	*conn_count = pparams.out.conn_count;
+	return ret;
+}
+
+// Safely poll an event and consume it.
+// returns 0 if CQ empty, 1 if consumed an entry and -1 if error
+// given PSM locking model, we don't need to get a lock here, caller will
+// already hold progress_lock if needed
+int __psm2_rv_poll_cq(psm2_rv_t rv, struct rv_event *ev)
+{
+	uint32_t next;
+	// TBD - may want to skip error checks for datapath perf
+	if (! rv || ! rv->events.hdr) {
+		errno = EINVAL;
+		return -1;
+	}
+	next = rv->events.hdr->head;
+	if (next == rv->events.hdr->tail)
+		return 0;	// empty
+	// make sure the read of tail completes before fetching the event
+	{ asm volatile("lfence":::"memory"); }
+	*ev = rv->events.hdr->entries[next++];
+	// make sure the event is fully fetched before advancing head
+	{ asm volatile("sfence":::"memory"); }
+	if (next == rv->events.num)
+		next = 0;
+	rv->events.hdr->head = next;
+	return 1;
+}
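+
+/*
+ * Illustrative sketch (not part of the original source): a typical caller
+ * drains the ring under its own progress lock; handle_rv_event() is
+ * hypothetical:
+ *
+ *	struct rv_event ev;
+ *	int ret;
+ *	while ((ret = __psm2_rv_poll_cq(rv, &ev)) == 1)
+ *		handle_rv_event(&ev);	// inspect ev.event_type and ev.wc
+ *	if (ret < 0)
+ *		; // EINVAL: rv is NULL or the event ring is not mapped
+ */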
+
+// Safely scan CQ for an event without consuming anything.
+// returns 1 if matching successful CQ event found
+// returns 0 if CQ empty or no matching successful event found
+// Only messages on the CQ immediately prior to the call are scanned; new CQ
+// events arriving during or after this function are not scanned
+// given PSM locking model, we don't need to get a lock here, caller will
+// already hold progress_lock if needed
+int __psm2_rv_scan_cq(psm2_rv_t rv, uint8_t event_type,
+			uint32_t imm_mask, uint32_t imm_value)
+{
+	uint32_t next;
+	uint32_t tail;
+	struct rv_event *ev;
+
+	// TBD - may want to skip error checks for datapath perf
+	if (! rv || ! rv->events.hdr) {
+		errno = EINVAL;
+		return -1;
+	}
+	next = rv->events.hdr->head;
+	tail = rv->events.hdr->tail;
+	// make sure read of tail completes before read events
+	{ asm volatile("lfence":::"memory"); }
+	while (next != tail) {
+		ev =  &rv->events.hdr->entries[next++];
+		if (ev->event_type == event_type
+			&& ev->wc.status == 0
+			&& (ev->wc.imm_data & imm_mask) == imm_value)
+			return 1; // found
+		if (next == rv->events.num)
+			next = 0;
+	}
+	return 0; // not found
+}
+
+// check if CQ has ever overflowed.
+// returns 1 if CQ has overflowed in past
+// returns 0 if CQ has never overflowed
+// In the future we could use overflow_cnt to identify if the ring recently
+// overflowed (e.g. save overflow_cnt when checking) and trigger PSM recovery
+int __psm2_rv_cq_overflowed(psm2_rv_t rv)
+{
+	if (! rv || ! rv->events.hdr) {
+		errno = EINVAL;
+		return -1;
+	}
+#ifndef HAVE_NO_PSM3_RV_OVERFLOW_CNT
+	return (rv->events.hdr->overflow_cnt != 0);
+#else
+	return 0;
+#endif
+}
+#endif // RNDV_MOD
diff --git a/deps/libfabric/prov/psm3/psm3/psm_rndv_mod.h b/deps/libfabric/prov/psm3/psm3/psm_rndv_mod.h
new file mode 100644
index 0000000000000000000000000000000000000000..f5fa322e82d7e3e4bc774908f445f64c480193b8
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_rndv_mod.h
@@ -0,0 +1,248 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+#ifdef RNDV_MOD
+
+#ifndef _PSMI_RNDV_MOD_H
+#define _PSMI_RNDV_MOD_H
+
+#include <sys/types.h>
+//#include <sys/socket.h>
+//#include <rdma/rdma_verbs.h>
+#ifndef HAVE_OLD_RV_HEADER
+#include <rdma/rv_user_ioctls.h>
+#else
+#include <rv/rv_user_ioctls.h>
+#endif
+
+struct local_info {
+	uint32_t mr_cache_size;	// in MBs
+#ifdef PSM_CUDA
+	uint32_t gpu_cache_size;	// in MBs
+#endif
+	uint8_t rdma_mode;	// RV_RDMA_MODE_*
+
+	// additional information for RV_RDMA_MODE_KERNEL
+	uint8_t port_num;
+	uint8_t num_conn;		// # QPs between each pair of nodes
+	uint32_t loc_addr;		// our local address. (cpu byte order)
+							// for OPA/IB a 16 bit LID
+							// for ethernet a 32 bit IPv4 address
+	uint8_t index_bits;		// num high bits of immed data with rv index
+	uint16_t loc_gid_index;	// index for loc_gid
+	union ibv_gid loc_gid;	// our local GID for use in IB CM connections
+	uint16_t qos_class_sl;	// TBD if we will use
+							// indicated in ah_attr when create_conn
+	uint16_t job_key_len;
+	uint8_t *job_key;
+	uint64_t service_id;	// optional override to rv kernel param
+	void *context;
+	uint32_t cq_entries;	// rv event queue for PSM polling
+	uint32_t q_depth;		// depth of QP and CQ per QP
+	uint32_t reconnect_timeout;	// in seconds
+	uint32_t hb_interval;		// in milliseconds
+	// output from RNDV driver
+	uint16_t major_rev;		// driver ABI rev
+	uint16_t minor_rev;		// driver ABI rev
+#ifdef PSM_CUDA
+	uint16_t gpu_major_rev;		// driver GPU ABI rev
+	uint16_t gpu_minor_rev;		// driver GPU ABI rev
+#endif
+	uint64_t capability;
+	uint32_t rv_index;		// unique within job on given NIC
+};
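+
+/*
+ * Illustrative sketch (not part of the original header): minimal setup of
+ * the input fields before __psm2_rv_open; all values are hypothetical:
+ *
+ *	struct local_info loc = { 0 };
+ *	loc.mr_cache_size = 256;		// MBs
+ *	loc.rdma_mode = RV_RDMA_MODE_KERNEL;
+ *	loc.port_num = 1;
+ *	loc.num_conn = 4;
+ *	loc.job_key = key; loc.job_key_len = key_len;
+ *	loc.cq_entries = 1024;
+ *	psm2_rv_t rv = __psm2_rv_open(devname, &loc);
+ */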
+
+struct rv_event_ring {
+	struct rv_ring_header *hdr;
+	int len;
+	uint32_t num;
+};
+
+struct psm2_rv {
+	int fd; /* file handle used to issue ioctls to rv driver */
+	struct rv_event_ring events;
+};
+typedef struct psm2_rv *psm2_rv_t;
+
+struct psm2_rv_conn {
+	psm2_rv_t rv;	// our parent
+	uint64_t handle;	// rv_user_conn
+	uint64_t conn_handle;	// rv_conn
+	// ah, path and context are saved only in kernel
+};
+typedef struct psm2_rv_conn *psm2_rv_conn_t;
+
+// for simple sanity check
+static inline uint64_t psm2_rv_conn_get_conn_handle(psm2_rv_conn_t conn)
+{
+	return conn->conn_handle;
+}
+
+struct psm2_rv_mr {
+	uint64_t		addr;
+	uint64_t		length;
+	int			access;
+	uint64_t		handle;
+	uint64_t		iova;
+	uint32_t		lkey;
+	uint32_t		rkey;
+};
+typedef struct psm2_rv_mr *psm2_rv_mr_t;
+
+#define psm2_rv_cache_stats rv_cache_stats_params_out
+
+#ifdef PSM_CUDA
+#define psm2_rv_gpu_cache_stats rv_gpu_cache_stats_params_out
+#endif
+
+#define psm2_rv_conn_stats rv_conn_get_stats_params_out
+
+#define psm2_rv_event_stats rv_event_stats_params_out
+
+static inline uint16_t psm2_rv_get_user_major_bldtime_version(void)
+{
+	return RV_ABI_VER_MAJOR;
+}
+
+static inline uint16_t psm2_rv_get_user_minor_bldtime_version(void)
+{
+	return RV_ABI_VER_MINOR;
+}
+
+#ifdef NVIDIA_GPU_DIRECT
+static inline uint16_t psm2_rv_get_gpu_user_major_bldtime_version(void)
+{
+	return RV_GPU_ABI_VER_MAJOR;
+}
+
+static inline uint16_t psm2_rv_get_gpu_user_minor_bldtime_version(void)
+{
+	return RV_GPU_ABI_VER_MINOR;
+}
+
+extern uint64_t __psm2_min_gpu_bar_size(void);
+#endif
+
+extern psm2_rv_t __psm2_rv_open(const char *devname, struct local_info *loc_info);
+
+extern int __psm2_rv_close(psm2_rv_t rv);
+
+extern int __psm2_rv_get_cache_stats(psm2_rv_t rv,
+									struct psm2_rv_cache_stats *stats);
+
+#ifdef PSM_CUDA
+extern int __psm2_rv_gpu_get_cache_stats(psm2_rv_t rv,
+									struct psm2_rv_gpu_cache_stats *stats);
+#endif
+
+extern psm2_rv_conn_t __psm2_rv_create_conn(psm2_rv_t rv,
+		struct ibv_ah_attr *ah_attr, // for remote node
+		uint32_t rem_addr);  // for simple compare to loc_addr
+
+extern int __psm2_rv_connect(psm2_rv_conn_t conn,
+                    const struct ib_user_path_rec *path);
+
+extern int __psm2_rv_connected(psm2_rv_conn_t conn);
+
+extern int __psm2_rv_get_conn_count(psm2_rv_t rv, psm2_rv_conn_t conn,
+				uint8_t index, uint32_t *count);
+
+extern int __psm2_rv_get_conn_stats(psm2_rv_t rv, psm2_rv_conn_t conn,
+				uint8_t index, struct psm2_rv_conn_stats *stats);
+
+extern int __psm2_rv_get_event_stats(psm2_rv_t rv,
+									struct psm2_rv_event_stats *stats);
+
+extern int __psm2_rv_disconnect(psm2_rv_conn_t conn);
+
+extern void __psm2_rv_destroy_conn(psm2_rv_conn_t conn);
+
+extern psm2_rv_mr_t __psm2_rv_reg_mem(psm2_rv_t rv, int cmd_fd, struct ibv_pd *pd, void *addr,
+				uint64_t length, int access);
+
+extern int __psm2_rv_dereg_mem(psm2_rv_t rv, psm2_rv_mr_t mr);
+
+extern void * __psm2_rv_pin_and_mmap(psm2_rv_t rv, uintptr_t pageaddr,
+			uint64_t pagelen, int access);
+
+extern int64_t __psm2_rv_evict_exact(psm2_rv_t rv, void *addr,
+			uint64_t length, int access);
+
+extern int64_t __psm2_rv_evict_range(psm2_rv_t rv, void *addr, uint64_t length);
+
+extern int64_t __psm2_rv_evict_amount(psm2_rv_t rv, uint64_t bytes, uint32_t count);
+
+#ifdef PSM_CUDA
+extern int64_t __psm2_rv_evict_gpu_range(psm2_rv_t rv, uintptr_t addr,
+			uint64_t length);
+
+extern int64_t __psm2_rv_evict_gpu_amount(psm2_rv_t rv, uint64_t bytes,
+			uint32_t count);
+#endif
+
+extern int __psm2_rv_post_rdma_write_immed(psm2_rv_t rv, psm2_rv_conn_t conn,
+				void *loc_buf, psm2_rv_mr_t loc_mr,
+				uint64_t rem_buf, uint32_t rkey,
+				uint64_t len, uint32_t immed, uint64_t wr_id,
+				uint8_t *sconn_index, uint32_t *conn_count);
+
+extern int __psm2_rv_poll_cq(psm2_rv_t rv, struct rv_event *ev);
+
+extern int __psm2_rv_scan_cq(psm2_rv_t rv, uint8_t event_type,
+			uint32_t imm_mask, uint32_t imm_value);
+
+extern int __psm2_rv_cq_overflowed(psm2_rv_t rv);
+
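+/*
+ * Illustrative call-flow sketch (not part of the original header),
+ * assembled from the per-function comments in psm_rndv_mod.c; variable
+ * names are hypothetical:
+ *
+ *	psm2_rv_t rv = __psm2_rv_open(devname, &loc_info);
+ *	psm2_rv_conn_t conn = __psm2_rv_create_conn(rv, &ah_attr, rem_addr);
+ *	__psm2_rv_connect(conn, &path);		// kernel starts IB CM exchange
+ *	while (__psm2_rv_connected(conn) == 0)
+ *		;				// poll until established
+ *	psm2_rv_mr_t mr = __psm2_rv_reg_mem(rv, cmd_fd, pd, buf, len, access);
+ *	__psm2_rv_post_rdma_write_immed(rv, conn, buf, mr, rem_buf, rkey,
+ *					len, immed, wr_id, &sconn, &cnt);
+ *	// completions arrive via __psm2_rv_poll_cq(rv, &ev)
+ *	__psm2_rv_dereg_mem(rv, mr);
+ *	__psm2_rv_destroy_conn(conn);
+ *	__psm2_rv_close(rv);
+ */
+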
+#endif // _PSMI_RNDV_MOD_H
+#endif // RNDV_MOD
diff --git a/deps/libfabric/prov/psm3/psm3/psm_stats.c b/deps/libfabric/prov/psm3/psm3/psm_stats.c
new file mode 100644
index 0000000000000000000000000000000000000000..684b9f67565adec478da9bc9d38d0f49ade6a9a2
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_stats.c
@@ -0,0 +1,794 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include <sys/syscall.h>
+
+struct psmi_stats_type {
+	STAILQ_ENTRY(psmi_stats_type) next;
+	struct psmi_stats_entry *entries;
+
+	int num_entries;
+	const char *heading;
+	uint32_t statstype;
+	uint64_t id;	// identifier to include in output, typically epid
+	void *context;
+	char *info;
+	pid_t tid;	// thread id, useful for multi-ep
+};
+
+static STAILQ_HEAD(, psmi_stats_type) psmi_stats =
+STAILQ_HEAD_INITIALIZER(psmi_stats);
+
+pthread_spinlock_t psmi_stats_lock;	// protects psmi_stats list
+// stats output
+static int print_statsmask;
+static time_t stats_start;
+static char perf_file_name[PATH_MAX];
+static FILE *perf_stats_fd;
+// stats thread
+static int print_stats_freq;
+static int print_stats_running;
+static pthread_t perf_print_thread;
+
+// we attempt open only once and only output error once
+// this prevents multiple failures and also prevents reopen during finalize
+static void psmi_open_stats_fd(void)
+{
+	static int attempted_open;
+
+	if (! attempted_open && ! perf_stats_fd) {
+		perf_stats_fd = fopen(perf_file_name, "w+");
+		if (!perf_stats_fd)
+			_HFI_ERROR("Failed to create fd for performance logging\n");
+		attempted_open = 1;
+	}
+}
+
+// caller must get psmi_stats_lock
+static psm2_error_t
+psmi_stats_deregister_type_internal(uint32_t statstype,
+					 void *context)
+{
+	struct psmi_stats_type *type;
+
+	STAILQ_FOREACH(type, &psmi_stats, next) {
+		if (type->statstype == statstype && type->context == context) {
+			STAILQ_REMOVE(&psmi_stats, type, psmi_stats_type, next);
+			psmi_free(type->entries);
+			if (type->info)
+				psmi_free(type->info);
+			psmi_free(type);
+			return PSM2_OK;
+		}
+	}
+	return PSM2_INTERNAL_ERR;	// not found
+}
+
+static psm2_error_t
+psmi_stats_register_type_internal(const char *heading,
+			 uint32_t statstype,
+			 const struct psmi_stats_entry *entries_i,
+			 int num_entries, uint64_t id, void *context,
+			 const char *info, bool rereg)
+{
+	struct psmi_stats_entry *entries;
+	struct psmi_stats_type *type;
+	int i;
+	psm2_error_t err = PSM2_OK;
+
+	if (! heading || ! context || ! statstype || ! num_entries || ! entries_i)
+		return PSM2_PARAM_ERR;
+
+	entries =
+	    psmi_calloc(PSMI_EP_NONE, STATS, num_entries,
+			sizeof(struct psmi_stats_entry));
+	type =
+	    psmi_calloc(PSMI_EP_NONE, STATS, 1, sizeof(struct psmi_stats_type));
+	PSMI_CHECKMEM(err, entries);
+	PSMI_CHECKMEM(err, type);
+
+	type->entries = entries;
+	type->num_entries = num_entries;
+	type->statstype = statstype;
+	type->id = id;
+	type->context = context;
+	type->heading = heading;
+	if (info)
+		type->info = psmi_strdup(NULL, info);
+#ifdef SYS_gettid
+	type->tid = (long int)syscall(SYS_gettid); // gettid();
+#else
+	type->tid = 0;
+#endif
+
+	for (i = 0; i < num_entries; i++) {
+		type->entries[i].desc = entries_i[i].desc;
+		type->entries[i].flags = entries_i[i].flags;
+		type->entries[i].getfn = entries_i[i].getfn;
+		type->entries[i].u.val = entries_i[i].u.val;
+	}
+
+	pthread_spin_lock(&psmi_stats_lock);
+	if (rereg)
+		(void) psmi_stats_deregister_type_internal(statstype, context);
+	STAILQ_INSERT_TAIL(&psmi_stats, type, next);
+	pthread_spin_unlock(&psmi_stats_lock);
+	return err;
+
+fail:
+	if (entries)
+		psmi_free(entries);
+	if (type) {
+		if (type->info)
+			psmi_free(type->info);
+		psmi_free(type);
+	}
+	return err;
+}
+
+psm2_error_t
+psmi_stats_register_type(const char *heading,
+			 uint32_t statstype,
+			 const struct psmi_stats_entry *entries_i,
+			 int num_entries, uint64_t id, void *context,
+			 const char* info)
+{
+	return psmi_stats_register_type_internal(heading, statstype, entries_i,
+			 num_entries, id, context, info, 0);
+}
+
+psm2_error_t
+psmi_stats_reregister_type(const char *heading,
+			 uint32_t statstype,
+			 const struct psmi_stats_entry *entries_i,
+			 int num_entries, uint64_t id, void *context,
+			 const char *info)
+{
+	return psmi_stats_register_type_internal(heading, statstype, entries_i,
+			 num_entries, id, context, info, 1);
+}
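+
+/*
+ * Illustrative sketch (not part of the original source): a subsystem
+ * registers a counter table once; the entry fields mirror the copy loop
+ * in psmi_stats_register_type_internal, while the counter, statstype bit
+ * and context below are hypothetical:
+ *
+ *	static uint64_t n_sends;
+ *	static struct psmi_stats_entry e[] = {
+ *		{ .desc = "sends", .flags = MPSPAWN_STATS_SKIP_IF_ZERO,
+ *		  .getfn = NULL, .u.val = &n_sends },
+ *	};
+ *	psmi_stats_register_type("My Module", MY_STATSTYPE_BIT,
+ *				 e, 1, epid, my_context, NULL);
+ */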
+
+void psmi_stats_show(uint32_t statsmask)
+{
+	struct psmi_stats_type *type;
+	time_t now;
+	char buf[100];
+
+	pthread_spin_lock(&psmi_stats_lock);
+	psmi_open_stats_fd();
+	if (! perf_stats_fd)
+		goto unlock;
+
+	now = time(NULL);
+
+	fprintf(perf_stats_fd, "Time Delta %u seconds %s",
+		(unsigned)(now - stats_start), ctime_r(&now, buf));
+
+	STAILQ_FOREACH(type, &psmi_stats, next) {
+		int i;
+		struct psmi_stats_entry *entry;
+
+		if (! (type->statstype & statsmask))
+			continue;
+		// when id == 0, we expect 1 report of a given type per
+		// process, so we also omit tid.  In that case info is probably
+		// NULL, but show it if it was provided when stats_register was called.
+		if (type->id)
+			fprintf(perf_stats_fd, " %s id 0x%"PRIx64"%s%s tid %d\n",
+				type->heading, type->id, type->info?" ":"",
+				type->info?type->info:"", type->tid);
+		else
+			fprintf(perf_stats_fd, " %s%s%s\n",
+				type->heading, type->info?" ":"",
+				type->info?type->info:"");
+		for (i=0, entry=&type->entries[0]; i<type->num_entries; i++, entry++) {
+			uint64_t value;
+			value = (entry->getfn != NULL) ? entry->getfn(type->context)
+							: *entry->u.val;
+			if (value || ! (entry->flags & MPSPAWN_STATS_SKIP_IF_ZERO)
+					|| (statsmask & _PSMI_STATSTYPE_SHOWZERO))
+				fprintf(perf_stats_fd, "    %s %"PRIu64" (%"PRId64")\n", entry->desc,
+						 value, (int64_t)value - (int64_t)entry->old_value);
+			entry->old_value = value;
+		}
+	}
+	fprintf(perf_stats_fd, "\n");
+	fflush(perf_stats_fd);
+unlock:
+	pthread_spin_unlock(&psmi_stats_lock);
+}
+
+psm2_error_t psmi_stats_deregister_type(uint32_t statstype, void *context)
+{
+	psm2_error_t err;
+
+	pthread_spin_lock(&psmi_stats_lock);
+	err = psmi_stats_deregister_type_internal(statstype, context);
+	pthread_spin_unlock(&psmi_stats_lock);
+	return err;
+}
+
+psm2_error_t psmi_stats_deregister_all(void)
+{
+	struct psmi_stats_type *type;
+
+	/* Currently our MPI still reads stats after finalize, so this isn't
+	 * safe yet */
+	pthread_spin_lock(&psmi_stats_lock);
+	while ((type = STAILQ_FIRST(&psmi_stats)) != NULL) {
+		STAILQ_REMOVE_HEAD(&psmi_stats, next);
+		psmi_free(type->entries);
+		if (type->info)
+			psmi_free(type->info);
+		psmi_free(type);
+	}
+	pthread_spin_unlock(&psmi_stats_lock);
+
+	return PSM2_OK;
+}
+
+static
+void
+*psmi_print_stats_thread(void *unused)
+{
+	if (print_stats_freq <= 0)
+		goto end;
+
+	psmi_open_stats_fd();
+	if (!perf_stats_fd)
+		goto end;
+
+	/* Performance stats will be printed every $PSM3_PRINT_STATS seconds */
+	do {
+		psmi_stats_show(print_statsmask);
+		usleep(MICRO_SEC * print_stats_freq);
+	} while (print_stats_running);
+
+end:
+	pthread_exit(NULL);
+}
+
+static void
+psmi_print_stats_init_thread(void)
+{
+	print_stats_running = 1;
+	if (pthread_create(&perf_print_thread, NULL,
+				psmi_print_stats_thread, (void*)NULL))
+	{
+		print_stats_running = 0;
+		_HFI_ERROR("Failed to create logging thread\n");
+	}
+}
+
+psm2_error_t
+psmi_stats_initialize(void)
+{
+	union psmi_envvar_val env_stats;
+
+	psmi_getenv("PSM3_PRINT_STATS",
+			"Prints performance stats every n seconds to file "
+			"./psm3-perf-stat-[hostname]-pid-[pid]; when set to -1, stats are "
+			"printed only once on 1st ep close",
+			PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+			(union psmi_envvar_val) 0, &env_stats);
+	print_stats_freq = env_stats.e_uint;
+
+	psmi_getenv("PSM3_PRINT_STATSMASK",
+			"Mask of statistic types to print: "
+			"MQ=1, RCVTHREAD=0x100, IPS=0x200"
+			", RDMA=0x400, MRCache=0x800"
+#ifdef PSM_DEBUG
+			", MEMORY=0x1000"
+#endif
+#ifdef RNDV_MOD
+			", RVEvents=0x2000, RVRDMA=0x4000"
+#endif
+#ifdef PSM_FI
+			", FaultInj=0x8000"
+#endif
+			".  0x100000 causes zero values to also be shown",
+			PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+			(union psmi_envvar_val) PSMI_STATSTYPE_ALL, &env_stats);
+	print_statsmask = env_stats.e_uint;
+
+	pthread_spin_init(&psmi_stats_lock, PTHREAD_PROCESS_PRIVATE);
+	stats_start = time(NULL);
+
+	snprintf(perf_file_name, sizeof(perf_file_name),
+			"./psm3-perf-stat-%s-pid-%d",
+			psmi_gethostname(), getpid());
+
+	if (print_stats_freq > 0)
+		psmi_print_stats_init_thread();
+	return PSM2_OK;
+}
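+
+// Example (illustrative): PSM3_PRINT_STATS=10 prints all enabled stats every
+// 10 seconds; PSM3_PRINT_STATS=-1 defers a single report to the 1st ep close
+// (see psmi_stats_ep_close below).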
+
+void
+psmi_stats_finalize(void)
+{
+	if (print_stats_freq == -1) {
+		psmi_stats_show(print_statsmask);
+	} else if (print_stats_running) {
+		print_stats_running = 0;
+		pthread_join(perf_print_thread, NULL);
+	}
+	if (perf_stats_fd) {
+		fclose(perf_stats_fd);
+		perf_stats_fd = NULL;
+	}
+	psmi_stats_deregister_all();
+}
+
+// Called at the start of ep_close so we can emit a one-shot report while
+// most of the interesting stats are still available.
+// We only output if we have produced no previous output, so with multiple
+// EPs this reports only on the 1st EP close.
+void
+psmi_stats_ep_close(void)
+{
+	if (print_stats_freq == -1 && ! perf_stats_fd)
+		psmi_stats_show(print_statsmask);
+}
+
+#if 0   // unused code, specific to QLogic MPI
+static uint32_t typestring_to_type(const char *typestr)
+{
+	if (strncasecmp(typestr, "all", 4) == 0)
+		return PSMI_STATSTYPE_ALL;
+	else if (strncasecmp(typestr, "p2p", 4) == 0)
+		return PSMI_STATSTYPE_P2P;
+	else if (strncasecmp(typestr, "hfi", 6) == 0)
+		return PSMI_STATSTYPE_HFI;
+	else if (strncasecmp(typestr, "ips", 4) == 0)
+		return PSMI_STATSTYPE_IPSPROTO;
+	else if ((strncasecmp(typestr, "intr", 5) == 0) ||
+		 (strncasecmp(typestr, "thread", 7) == 0) ||
+		 (strncasecmp(typestr, "rcvthread", 10) == 0))
+		return PSMI_STATSTYPE_RCVTHREAD;
+	else if ((strncasecmp(typestr, "mq", 3) == 0) ||
+		 (strncasecmp(typestr, "mpi", 4) == 0))
+		return PSMI_STATSTYPE_MQ;
+	else if ((strncasecmp(typestr, "tid", 4) == 0) ||
+		 (strncasecmp(typestr, "tids", 5) == 0))
+		return PSMI_STATSTYPE_TIDS;
+	else if ((strncasecmp(typestr, "memory", 7) == 0) ||
+		 (strncasecmp(typestr, "alloc", 6) == 0) ||
+		 (strncasecmp(typestr, "malloc", 7) == 0))
+		return PSMI_STATSTYPE_MEMORY;
+	else
+		return 0;
+}
+
+static uint32_t stats_parse_enabled_mask(const char *stats_string)
+{
+	char *b = (char *)stats_string;
+	char *e = b;
+	char buf[128];
+
+	uint32_t stats_enabled_mask = 0;
+
+	while (*e) {
+		b = e;
+		while (*e && *e != ',' && *e != '+' && *e != '.' &&
+		       *e != '|' && *e != ':')
+			e++;
+		if (e > b) {	/* something new to parse */
+			int len = ((e - b) > (sizeof(buf) - 1)) ?
+			    (sizeof(buf) - 1) : (e - b);
+			strncpy(buf, b, len);
+			buf[len] = '\0';
+			stats_enabled_mask |= typestring_to_type(buf);
+		}
+		if (*e)
+			e++;	/* skip delimiter */
+	}
+	return stats_enabled_mask;
+}
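+
+/* e.g. a stats_string of "mq+memory" yields
+ * PSMI_STATSTYPE_MQ | PSMI_STATSTYPE_MEMORY */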
+
+static
+void psmi_stats_mpspawn_callback(struct mpspawn_stats_req_args *args)
+{
+	const struct psmi_stats_entry *entry;
+	struct psmi_stats_type *type = (struct psmi_stats_type *)args->context;
+	int i, num = args->num;
+	uint64_t *stats = args->stats;
+	uint64_t *c = NULL;
+	uint64_t *s = NULL;
+
+	psmi_assert(num == type->num_entries);
+
+	if (type->statstype == PSMI_STATSTYPE_MEMORY) {
+		for (i = 0; i < num; i++) {
+			entry = &type->entries[i];
+			stats[i] =
+			    *(uint64_t *) ((uintptr_t) &psmi_stats_memory +
+					   (uintptr_t) entry->u.off);
+		}
+	} else {
+		for (i = 0; i < num; i++) {
+			entry = &type->entries[i];
+			if (entry->getfn != NULL)
+				stats[i] = entry->getfn(type->context);
+			else
+				stats[i] = *entry->u.val;
+		}
+	}
+
+	if (c != NULL)
+		psmi_free(c);
+	if (s != NULL)
+		psmi_free(s);
+}
+
+static
+void
+stats_register_mpspawn_single(mpspawn_stats_add_fn add_fn,
+			      char *heading,
+			      int num_entries,
+			      struct psmi_stats_entry *entries,
+			      mpspawn_stats_req_fn req_fn, void *context)
+{
+	int i;
+	struct mpspawn_stats_add_args mp_add;
+
+	mp_add.version = MPSPAWN_STATS_VERSION;
+	mp_add.num = num_entries;
+	mp_add.header = heading;
+	mp_add.req_fn = req_fn;
+	mp_add.context = context;
+
+	mp_add.desc = (char **)alloca(sizeof(char *) * num_entries);
+
+	mp_add.flags = (uint16_t *) alloca(sizeof(uint16_t) * num_entries);
+
+	for (i = 0; i < num_entries; i++) {
+		mp_add.desc[i] = (char *)entries[i].desc;
+		mp_add.flags[i] = entries[i].flags;
+	}
+
+	/* Ignore return code, doesn't matter to *us* if register failed */
+	add_fn(&mp_add);
+
+	return;
+}
+
+static void stats_register_mem_stats(psm2_ep_t ep);
+static psm2_error_t psmi_stats_epaddr_register(struct mpspawn_stats_init_args
+					      *args);
+
+/*
+ * Downcall from QLogic MPI into PSM, so we can register stats
+ */
+void *psmi_stats_register(struct mpspawn_stats_init_args *args)
+{
+	struct psmi_stats_type *type;
+	uint32_t statsmask;
+
+	/*
+	 * Args has a version string in it, but we can ignore it since mpspawn
+	 * will decide if it supports *our* version
+	 */
+
+	/*
+	 * Eventually, parse the stats_types to add various "flavours" of stats
+	 */
+	if (args->stats_types == NULL)
+		return NULL;
+
+	statsmask = stats_parse_enabled_mask(args->stats_types);
+
+	/* MQ (MPI-level) statistics */
+	if (statsmask & PSMI_STATSTYPE_MQ)
+		psmi_mq_stats_register(args->mq, args->add_fn);
+
+
+	if (statsmask & PSMI_STATSTYPE_MEMORY)
+		stats_register_mem_stats(args->mq->ep);
+
+	/*
+	 * At this point all PSM and hfi-level components have registered stats
+	 * with the PSM stats interface.  We register with the mpspawn stats
+	 * interface with an upcall in add_fn
+	 */
+	STAILQ_FOREACH(type, &psmi_stats, next) {
+		if (type->statstype & statsmask)
+			stats_register_mpspawn_single(args->add_fn,
+						      type->heading,
+						      type->num_entries,
+						      type->entries,
+						      psmi_stats_mpspawn_callback,
+						      type);
+	}
+
+	/*
+	 * Special handling for per-endpoint statistics
+	 * Only MPI knows what the endpoint-addresses are in the running program,
+	 * PSM has no sense of MPI worlds.  In stats register, MPI tells PSM how
+	 * many endpoints it anticipates having and PSM simply reserves that amount
+	 * of stats entries X the amount of per-endpoint stats.
+	 */
+	if (statsmask & PSMI_STATSTYPE_P2P)
+		psmi_stats_epaddr_register(args);
+
+	return NULL;
+}
+
+struct stats_epaddr {
+	psm2_ep_t ep;
+	mpspawn_map_epaddr_fn epaddr_map_fn;
+	int num_ep;
+	int num_ep_stats;
+};
+
+static
+void psmi_stats_epaddr_callback(struct mpspawn_stats_req_args *args)
+{
+	int i, num, off;
+	uint64_t *statsp;
+	struct stats_epaddr *stats_ctx = (struct stats_epaddr *)args->context;
+	psm2_ep_t ep = stats_ctx->ep;
+	psm2_epaddr_t epaddr;
+
+	num = stats_ctx->num_ep * stats_ctx->num_ep_stats;
+
+	/* First always NAN the entire stats request */
+	for (i = 0; i < num; i++) {
+		if (args->flags[i] & MPSPAWN_STATS_TYPE_DOUBLE)
+			args->stats[i] = MPSPAWN_NAN;
+		else
+			args->stats[i] = MPSPAWN_NAN_U64;
+	}
+
+	for (i = 0; i < stats_ctx->num_ep; i++) {
+		statsp = args->stats + i * stats_ctx->num_ep_stats;
+		off = 0;
+		epaddr = stats_ctx->epaddr_map_fn(i);
+		if (epaddr == NULL)
+			continue;
+
+		/* Self */
+		if (&ep->ptl_self == epaddr->ptlctl) {
+			if (ep->ptl_self.epaddr_stats_get != NULL)
+				off +=
+				    ep->ptl_self.epaddr_stats_get(epaddr,
+								  statsp + off);
+		} else {
+			if (ep->ptl_self.epaddr_stats_num != NULL)
+				off += ep->ptl_self.epaddr_stats_num();
+		}
+
+		/* Shm */
+		if (&ep->ptl_amsh == epaddr->ptlctl) {
+			if (ep->ptl_amsh.epaddr_stats_get != NULL)
+				off +=
+				    ep->ptl_amsh.epaddr_stats_get(epaddr,
+								  statsp + off);
+		} else {
+			if (ep->ptl_amsh.epaddr_stats_num != NULL)
+				off += ep->ptl_amsh.epaddr_stats_num();
+		}
+
+		/* ips */
+		if (&ep->ptl_ips == epaddr->ptlctl) {
+			if (ep->ptl_ips.epaddr_stats_get != NULL)
+				off +=
+				    ep->ptl_ips.epaddr_stats_get(epaddr,
+								 statsp + off);
+		} else {
+			if (ep->ptl_ips.epaddr_stats_num != NULL)
+				off += ep->ptl_ips.epaddr_stats_num();
+		}
+	}
+	return;
+}
+
+static
+psm2_error_t
+psmi_stats_epaddr_register(struct mpspawn_stats_init_args *args)
+{
+	int i = 0, j;
+	int num_ep = args->num_epaddr;
+	int num_ep_stats = 0;
+	int nz;
+	char **desc, **desc_i;
+	uint16_t *flags, *flags_i;
+	char *p;
+	char buf[128];
+	psm2_ep_t ep;
+	struct mpspawn_stats_add_args mp_add;
+	struct stats_epaddr *stats_ctx;
+	psm2_error_t err = PSM2_OK;
+
+	if (args->mq == NULL)
+		return PSM2_OK;
+	ep = args->mq->ep;
+
+	/* Figure out how many stats there are in an endpoint from all devices */
+	if (ep->ptl_self.epaddr_stats_num != NULL)
+		num_ep_stats += ep->ptl_self.epaddr_stats_num();
+	if (ep->ptl_amsh.epaddr_stats_num != NULL)
+		num_ep_stats += ep->ptl_amsh.epaddr_stats_num();
+	if (ep->ptl_ips.epaddr_stats_num != NULL)
+		num_ep_stats += ep->ptl_ips.epaddr_stats_num();
+
+	/* Allocate desc and flags and let each device initialize their
+	 * descriptions and flags */
+	desc =
+	    psmi_malloc(ep, STATS,
+			sizeof(char *) * num_ep_stats * (num_ep + 1));
+	if (desc == NULL)
+		return PSM2_NO_MEMORY;
+	flags =
+	    psmi_malloc(ep, STATS,
+			sizeof(uint16_t) * num_ep_stats * (num_ep + 1));
+	if (flags == NULL) {
+		psmi_free(desc);
+		return PSM2_NO_MEMORY;
+	}
+
+	/* Get the descriptions/flags from each device */
+	i = 0;
+	i += ep->ptl_self.epaddr_stats_num != NULL ?
+	    ep->ptl_self.epaddr_stats_init(desc + i, flags + i) : 0;
+	i += ep->ptl_amsh.epaddr_stats_num != NULL ?
+	    ep->ptl_amsh.epaddr_stats_init(desc + i, flags + i) : 0;
+	i += ep->ptl_ips.epaddr_stats_num != NULL ?
+	    ep->ptl_ips.epaddr_stats_init(desc + i, flags + i) : 0;
+	psmi_assert_always(i == num_ep_stats);
+
+	/*
+	 * Clone the descriptions for each endpoint but append "rank %d" to it
+	 * beforehand.
+	 */
+	nz = (num_ep < 10 ? 1 : (num_ep < 100 ? 2 :	/* cheap log */
+				 (num_ep < 1000 ? 3 : (num_ep < 10000 ? 4 :
+						       (num_ep <
+							100000 ? 5 : 6)))));
+
+	desc_i = desc + num_ep_stats;
+	flags_i = flags + num_ep_stats;
+	memset(desc_i, 0, sizeof(char *) * num_ep * num_ep_stats);
+
+	for (i = 0; i < num_ep; i++) {
+		for (j = 0; j < num_ep_stats; j++) {
+			snprintf(buf, sizeof(buf) - 1, "<%*d> %s", nz, i,
+				 desc[j]);
+			buf[sizeof(buf) - 1] = '\0';
+			p = psmi_strdup(ep, buf);
+			if (p == NULL) {
+				err = PSM2_NO_MEMORY;
+				goto clean;
+			}
+			desc_i[i * num_ep_stats + j] = p;
+			flags_i[i * num_ep_stats + j] = flags[j];
+		}
+	}
+
+	mp_add.version = MPSPAWN_STATS_VERSION;
+	mp_add.num = num_ep_stats * num_ep;
+	mp_add.header = "Endpoint-to-Endpoint Stats (by <rank>)";
+	mp_add.req_fn = psmi_stats_epaddr_callback;
+	mp_add.desc = desc_i;
+	mp_add.flags = flags_i;
+	stats_ctx = psmi_malloc(ep, STATS, sizeof(struct stats_epaddr));
+	if (stats_ctx == NULL) {
+		err = PSM2_NO_MEMORY;
+		goto clean;
+	}
+	stats_ctx->ep = ep;
+	stats_ctx->epaddr_map_fn = args->epaddr_map_fn;
+	stats_ctx->num_ep = num_ep;
+	stats_ctx->num_ep_stats = num_ep_stats;
+	mp_add.context = stats_ctx;
+
+	args->add_fn(&mp_add);
+
+clean:
+	/* Now we can free all the descriptions */
+	for (i = 0; i < num_ep; i++) {
+		for (j = 0; j < num_ep_stats; j++)
+			if (desc_i[i * num_ep_stats + j])
+				psmi_free(desc_i[i * num_ep_stats + j]);
+	}
+
+	psmi_free(desc);
+	psmi_free(flags);
+
+	return err;
+}
+
+
+
+#undef _SDECL
+#define _SDECL(_desc, _param) {					\
+	    .desc  = _desc,					\
+	    .flags = MPSPAWN_STATS_REDUCTION_ALL		\
+		     | MPSPAWN_STATS_SKIP_IF_ZERO,		\
+	    .getfn = NULL,					\
+	    .u.off = offsetof(struct psmi_stats_malloc, _param)	\
+	}
+
+static
+void stats_register_mem_stats(psm2_ep_t ep)
+{
+	struct psmi_stats_entry entries[] = {
+		_SDECL("Total_(current)", m_all_total),
+		_SDECL("Total_(max)", m_all_max),
+		_SDECL("All_Peers_(current)", m_perpeer_total),
+		_SDECL("All_Peers_(max)", m_perpeer_max),
+		_SDECL("Network_Buffers_(current)", m_netbufs_total),
+		_SDECL("Network_Buffers_(max)", m_netbufs_max),
+		_SDECL("PSM_descriptors_(current)", m_descriptors_total),
+		_SDECL("PSM_descriptors_(max)", m_descriptors_max),
+		_SDECL("Unexp._Buffers_(current)", m_unexpbufs_total),
+		_SDECL("Unexp._Buffers_(max)", m_unexpbufs_max),
+#ifdef RNDV_MOD
+		_SDECL("Peer_Rndv_(current)", m_peerrndv_total),
+		_SDECL("Peer_Rndv_(max)", m_peerrndv_max),
+#endif
+		_SDECL("Other_(current)", m_undefined_total),
+		_SDECL("Other_(max)", m_undefined_max),
+	};
+
+	// TBD - these are global; we should register them only once and not
+	// provide an ep or device name
+	psmi_stats_register_type("PSM_memory_allocation_statistics",
+				 PSMI_STATSTYPE_MEMORY,
+				 entries, PSMI_STATS_HOWMANY(entries), ep,
+				 ep->dev_name);
+}
+#endif // 0   // unused code, specific to QLogic MPI
diff --git a/deps/libfabric/prov/psm3/psm3/psm_stats.h b/deps/libfabric/prov/psm3/psm3/psm_stats.h
new file mode 100644
index 0000000000000000000000000000000000000000..3143af4cd7c07300eb45ead982c5151a6c26d60e
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_stats.h
@@ -0,0 +1,163 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_IN_USER_H
+#error psm_stats.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSM_STATS_H
+#define _PSM_STATS_H
+
+#include "mpspawn_stats.h"
+
+#define PSMI_STATSTYPE_MQ	    	0x00001
+#ifdef PSM_CUDA
+#define PSMI_STATSTYPE_CUDA	    	0x00002 /* count of cuda calls */
+#endif
+#define PSMI_STATSTYPE_RCVTHREAD    0x00100	/* num_wakups, ratio, etc. */
+#define PSMI_STATSTYPE_IPSPROTO	    0x00200	/* acks,naks,err_chks */
+#define PSMI_STATSTYPE_TIDS	    	0x00400
+#if 0	// unused code, specific to QLogic MPI
+#define PSMI_STATSTYPE_P2P	    	0x00800	/* ep-to-ep details */
+#endif
+#define PSMI_STATSTYPE_MR_CACHE	    0x00800
+#define PSMI_STATSTYPE_MEMORY	    0x01000
+#ifdef RNDV_MOD
+#define PSMI_STATSTYPE_RV_EVENT	    0x02000	/* RV user event */
+#define PSMI_STATSTYPE_RV_RDMA	    0x04000	/* RV shared conn RDMA */
+#endif
+#define PSMI_STATSTYPE_FAULTINJ	    0x08000	/* fault injection - PSM_FI */
+#define PSMI_STATSTYPE_ALL	    	0xfffff
+#define _PSMI_STATSTYPE_SHOWZERO	0x100000
+
+#if 0	// unused code, specific to QLogic MPI
+#define PSMI_STATSTYPE_HFI	    (PSMI_STATSTYPE_RCVTHREAD|	\
+				     PSMI_STATSTYPE_IPSPROTO |  \
+				     PSMI_STATSTYPE_MEMORY |  \
+				     PSMI_STATSTYPE_TIDS)
+#endif
+
+/* Used to determine how many stats are in a static array declaration. */
+#define PSMI_STATS_HOWMANY(entries)	    \
+	    (sizeof(entries)/sizeof(entries[0]))
+
+#define PSMI_STATS_DECL(_desc, _flags, _getfn, _val)   \
+	{  .desc  = _desc,			    \
+	   .flags = _flags,			    \
+	   .getfn = _getfn,			    \
+	   .u.val = _val,			    \
+	}
+
+#define PSMI_STATS_DECLU64(_desc, _val)					  \
+	    PSMI_STATS_DECL(_desc,					  \
+		MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO, \
+		NULL,							  \
+		_val)
+
+#define PSMI_STATS_DECL_FUNC(_desc, _getfn)					  \
+	    PSMI_STATS_DECL(_desc,					  \
+		MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO, \
+		_getfn,							  \
+		NULL)
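+
+/* Illustrative sketch of a callback-backed entry (struct my_proto and
+ * get_inflight are hypothetical):
+ *
+ *	static uint64_t get_inflight(void *context)
+ *	{
+ *		return ((struct my_proto *)context)->inflight;
+ *	}
+ *
+ *	struct psmi_stats_entry e =
+ *		PSMI_STATS_DECL_FUNC("inflight_sends", get_inflight);
+ */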
+
+struct psmi_stats_entry {
+	const char *desc;
+	uint16_t flags;
+	uint64_t(*getfn) (void *context); /* optional fn ptr to get value */
+	union {
+		uint64_t *val;	/* where value is stored if getfn is NULL */
+		//uint64_t off;	/* or offset if that makes more sense */
+	} u;
+	uint64_t old_value;	/* value fetched from previous report */
+};
+
+static inline void
+psmi_stats_init_u64(struct psmi_stats_entry *e, const char *desc, uint64_t *val)
+{
+	e->desc = desc;
+	e->flags = MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO;
+	e->getfn = NULL;
+	e->u.val = val;
+	e->old_value = 0;
+}
+
+/*
+ * Copy the array of entries and keep track of the context.
+ * statstype and context form a unique key identifying the stats to deregister.
+ */
+psm2_error_t
+psmi_stats_register_type(const char *heading,
+			 uint32_t statstype,
+			 const struct psmi_stats_entry *entries,
+			 int num_entries, uint64_t id, void *context,
+			 const char *info);
+
+/* deregister the old copy and register a new one in its place */
+psm2_error_t
+psmi_stats_reregister_type(const char *heading,
+			 uint32_t statstype,
+			 const struct psmi_stats_entry *entries,
+			 int num_entries, uint64_t id, void *context,
+			 const char *info);
+
+psm2_error_t psmi_stats_deregister_type(uint32_t statstype, void *context);
+
+psm2_error_t  psmi_stats_initialize(void);
+
+void psmi_stats_finalize(void);
+
+void psmi_stats_ep_close(void);	// let stats react to 1st ep close if desired
+
+#endif /* PSM_STATS_H */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_sysbuf.c b/deps/libfabric/prov/psm3/psm3/psm_sysbuf.c
new file mode 100644
index 0000000000000000000000000000000000000000..234ba8f27884b4f098a0f53726e9ef04899b23e9
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_sysbuf.c
@@ -0,0 +1,225 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+
+/*
+ *
+ * System buffer (unexpected message) allocator
+ *
+ */
+
+#define MM_FLAG_NONE  0
+#define MM_FLAG_TRANSIENT  0x1
+
+struct psmi_mem_block_ctrl {
+	union {
+		psmi_mem_ctrl_t *mem_handler;
+		struct psmi_mem_block_ctrl *next;
+	};
+};
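+
+/* Each buffer handed out is preceded by one hidden psmi_mem_block_ctrl:
+ * while allocated it points back at the owning pool; while free it links
+ * the pool's free list.  psmi_mq_sysbuf_free() steps back one header to
+ * recover the pool in O(1). */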
+
+
+/* Per MQ allocators */
+void psmi_mq_sysbuf_init(psm2_mq_t mq)
+{
+    int i;
+    uint32_t block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, (uint32_t)-1};
+    uint32_t replenishing_rate[] = {128, 64, 32, 16, 8, 4, 0};
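+    /* the final (uint32_t)-1 pool is a catch-all: requests larger than the
+     * biggest fixed block size are served by one-off "transient" allocations
+     * that are freed outright instead of being cached on a free list */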
+
+    if (mq->mem_ctrl_is_init)
+        return;
+    mq->mem_ctrl_is_init = 1;
+
+    for (i=0; i < MM_NUM_OF_POOLS; i++) {
+        mq->handler_index[i].block_size = block_sizes[i];
+        mq->handler_index[i].current_available = 0;
+        mq->handler_index[i].free_list = NULL;
+        mq->handler_index[i].total_alloc = 0;
+        mq->handler_index[i].replenishing_rate = replenishing_rate[i];
+
+        if (block_sizes[i] == -1) {
+            psmi_assert_always(replenishing_rate[i] == 0);
+            mq->handler_index[i].flags = MM_FLAG_TRANSIENT;
+        }
+        else {
+            psmi_assert_always(replenishing_rate[i] > 0);
+            mq->handler_index[i].flags = MM_FLAG_NONE;
+        }
+    }
+
+    /* Hit once on each block size so we have a pool that's allocated */
+    for (i=0; i < MM_NUM_OF_POOLS; i++) {
+        void *ptr;
+        if (block_sizes[i] == -1)
+            continue;
+        ptr = psmi_mq_sysbuf_alloc(mq, block_sizes[i]);
+        psmi_mq_sysbuf_free(mq, ptr);
+    }
+    // undo counters from psmi_mq_sysbuf_alloc during init
+    mq->stats.rx_sysbuf_num = 0;
+    mq->stats.rx_sysbuf_bytes  = 0;
+}
+
+void psmi_mq_sysbuf_fini(psm2_mq_t mq)  // free all buffers that are currently unused
+{
+    struct psmi_mem_block_ctrl *block;
+    int i;
+
+    if (mq->mem_ctrl_is_init == 0)
+        return;
+
+    for (i=0; i < MM_NUM_OF_POOLS; i++) {
+        while ((block = mq->handler_index[i].free_list) != NULL) {
+            mq->handler_index[i].free_list = block->next;
+            psmi_free(block);
+        }
+    }
+    mq->mem_ctrl_is_init = 0;
+}
+
+void psmi_mq_sysbuf_getinfo(psm2_mq_t mq, char *buf, size_t len)
+{
+    snprintf(buf, len-1, "Sysbuf consumption: %"PRIu64" bytes\n",
+             mq->mem_ctrl_total_bytes);
+    buf[len-1] = '\0';
+    return;
+}
+
+void *psmi_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t alloc_size)
+{
+    psmi_mem_ctrl_t *mm_handler = mq->handler_index;
+    struct psmi_mem_block_ctrl *new_block;
+    int replenishing;
+
+    /* There is a timing race with ips initialization, fix later.  XXX */
+    if (!mq->mem_ctrl_is_init)
+        psmi_mq_sysbuf_init(mq);
+
+    mq->stats.rx_sysbuf_num++;
+    mq->stats.rx_sysbuf_bytes += alloc_size;
+
+    while (mm_handler->block_size < alloc_size)
+        mm_handler++;
+
+    replenishing = mm_handler->replenishing_rate;
+
+    if (mm_handler->current_available == 0) { // allocate more buffers
+        if (mm_handler->flags & MM_FLAG_TRANSIENT) {
+            uint32_t newsz = alloc_size + sizeof(struct psmi_mem_block_ctrl);
+            new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz);
+
+            if (new_block) {
+                new_block->mem_handler = mm_handler;
+                new_block++;
+                mm_handler->total_alloc++;
+                mq->mem_ctrl_total_bytes += newsz;
+            }
+            return new_block;
+        }
+
+        do {
+            uint32_t newsz = mm_handler->block_size + sizeof(struct psmi_mem_block_ctrl);
+
+            new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz);
+
+            if (new_block) {
+                /* only count bytes for allocations that actually succeeded */
+                mq->mem_ctrl_total_bytes += newsz;
+                mm_handler->current_available++;
+                mm_handler->total_alloc++;
+
+                new_block->next = mm_handler->free_list;
+                mm_handler->free_list = new_block;
+            }
+
+        } while (--replenishing && new_block);
+    }
+
+    if (mm_handler->current_available) {
+        mm_handler->current_available--;
+
+        new_block = mm_handler->free_list;
+        mm_handler->free_list = new_block->next;
+
+        new_block->mem_handler = mm_handler;
+        new_block++;
+
+        return new_block;
+    }
+    return NULL;
+}
+
+void psmi_mq_sysbuf_free(psm2_mq_t mq, void * mem_to_free)
+{
+    struct psmi_mem_block_ctrl * block_to_free;
+    psmi_mem_ctrl_t *mm_handler;
+
+    psmi_assert_always(mq->mem_ctrl_is_init);
+
+    block_to_free = (struct psmi_mem_block_ctrl *)mem_to_free - 1;
+    mm_handler = block_to_free->mem_handler;
+
+    if (mm_handler->flags & MM_FLAG_TRANSIENT) {
+        psmi_free(block_to_free);
+    } else {
+        block_to_free->next = mm_handler->free_list;
+        mm_handler->free_list = block_to_free;
+        mm_handler->current_available++;
+    }
+
+    return;
+}
diff --git a/deps/libfabric/prov/psm3/psm3/psm_sysbuf.h b/deps/libfabric/prov/psm3/psm3/psm_sysbuf.h
new file mode 100644
index 0000000000000000000000000000000000000000..07ab5939104f7f3a32ef32ef653244763af7fc7d
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_sysbuf.h
@@ -0,0 +1,81 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef SYSBUF_INT_H
+#define SYSBUF_INT_H
+
+#include "psm_user.h"
+
+#define MM_NUM_OF_POOLS 7
+
+typedef struct psmi_mem_ctrl {
+    struct psmi_mem_block_ctrl *free_list;
+    uint32_t total_alloc;
+    uint32_t current_available;
+    uint32_t block_size;
+    uint32_t flags;
+    uint32_t replenishing_rate;
+} psmi_mem_ctrl_t;
+
+/*
+ * MQ unexpected buffer management
+ */
+void  psmi_mq_sysbuf_init(psm2_mq_t mq);
+void  psmi_mq_sysbuf_fini(psm2_mq_t mq);
+void* psmi_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t nbytes);
+void  psmi_mq_sysbuf_free(psm2_mq_t mq, void *);
+void  psmi_mq_sysbuf_getinfo(psm2_mq_t mq, char *buf, size_t len);
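+
+/* Typical lifecycle (sketch; error handling elided):
+ *
+ *	psmi_mq_sysbuf_init(mq);
+ *	void *buf = psmi_mq_sysbuf_alloc(mq, nbytes);	// may return NULL
+ *	... stash the unexpected message payload in buf ...
+ *	psmi_mq_sysbuf_free(mq, buf);
+ *	psmi_mq_sysbuf_fini(mq);
+ */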
+
+#endif /* SYSBUF_INT_H */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_timer.c b/deps/libfabric/prov/psm3/psm3/psm_timer.c
new file mode 100644
index 0000000000000000000000000000000000000000..9a8dddd2889a21fa3d1a7d077690a49aa211da39
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_timer.c
@@ -0,0 +1,198 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+
+#if PSMI_TIMER_STATS
+#  define PSMI_TIMER_STATS_ADD_INSERTION(ctrl)	((ctrl)->num_insertions++)
+#  define PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl)	((ctrl)->num_traversals++)
+#else
+#  define PSMI_TIMER_STATS_ADD_INSERTION(ctrl)
+#  define PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl)
+#endif
+
+psm2_error_t psmi_timer_init(struct psmi_timer_ctrl *ctrl)
+{
+	ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE;
+
+#if PSMI_TIMER_STATS
+	ctrl->num_insertions = 0;
+	ctrl->num_traversals = 0;
+#endif
+
+	TAILQ_INIT(&ctrl->timerq);
+	return PSM2_OK;
+}
+
+psm2_error_t psmi_timer_fini(struct psmi_timer_ctrl *ctrl)
+{
+#if PSMI_TIMER_STATS
+	if (ctrl->num_insertions > 0) {
+		_HFI_INFO("avg elem traversals/insertion = %3.2f %%\n",
+			  100.0 * (double)ctrl->num_traversals /
+			  ctrl->num_insertions);
+	}
+#endif
+	return PSM2_OK;
+}
+
+void
+psmi_timer_request_always(struct psmi_timer_ctrl *ctrl,
+			  struct psmi_timer *t_insert, uint64_t t_cyc_expire)
+{
+	struct psmi_timer *t_cursor;
+
+	psmi_assert(!(t_insert->flags & PSMI_TIMER_FLAG_PENDING));
+
+	t_insert->t_timeout = t_cyc_expire;
+	t_insert->flags |= PSMI_TIMER_FLAG_PENDING;
+
+	/*
+	 * We keep the list sorted from latest expiration (head) to earliest
+	 * expiration (tail), on the assumption that insert and remove occur
+	 * much more often than searching (when a timer expires).  Newly added
+	 * timers are likely to expire later rather than sooner, so they land
+	 * near the head and the forward scan below stays short.
+	 */
+	PSMI_TIMER_STATS_ADD_INSERTION(ctrl);
+
+	if (TAILQ_EMPTY(&ctrl->timerq)) {	/* Common case */
+		TAILQ_INSERT_TAIL(&ctrl->timerq, t_insert, timer);
+		ctrl->t_cyc_next_expire = t_cyc_expire;
+		PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl);
+		return;
+	} else if (t_cyc_expire > PSMI_TIMER_PRIO_LAST) {
+		TAILQ_FOREACH(t_cursor, &ctrl->timerq, timer) {
+			if (t_cursor->t_timeout <= t_cyc_expire) {
+				TAILQ_INSERT_BEFORE(t_cursor, t_insert, timer);
+				return;
+			}
+			PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl);
+		}
+		/* Got to the end of the list -- We're the next to expire */
+		ctrl->t_cyc_next_expire = t_cyc_expire;
+		TAILQ_INSERT_TAIL(&ctrl->timerq, t_insert, timer);
+		return;
+	} else {
+		TAILQ_FOREACH_REVERSE(t_cursor, &ctrl->timerq, timerq, timer) {
+			if (t_cursor->t_timeout >= t_cyc_expire) {
+				TAILQ_INSERT_AFTER(&ctrl->timerq, t_cursor,
+						   t_insert, timer);
+				ctrl->t_cyc_next_expire =
+				    min(t_cyc_expire, ctrl->t_cyc_next_expire);
+				return;
+			}
+			PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl);
+		}
+		TAILQ_INSERT_HEAD(&ctrl->timerq, t_insert, timer);
+		/* No need to check if we inserted last, given first branch case */
+		/* if (TAILQ_LAST(&ctrl->timerq, timerq) == t_insert) */
+		/* ctrl->t_cyc_next_expire = t_cyc_expire; */
+		return;
+	}
+
+	return;
+}
+
+psm2_error_t
+psmi_timer_process_expired(struct psmi_timer_ctrl *ctrl, uint64_t t_cyc_expire)
+{
+	psm2_error_t err = PSM2_OK_NO_PROGRESS;
+	struct psmi_timer *t_cursor = TAILQ_LAST(&ctrl->timerq, timerq);
+
+	PSM2_LOG_MSG("entering");
+
+	while (t_cursor) {
+		if (t_cursor->t_timeout > t_cyc_expire)
+			break;
+
+		err = PSM2_OK;
+		psmi_assert(t_cursor->flags & PSMI_TIMER_FLAG_PENDING);
+		t_cursor->flags &= ~PSMI_TIMER_FLAG_PENDING;
+		TAILQ_REMOVE(&ctrl->timerq, t_cursor, timer);
+		t_cursor->expire_callback(t_cursor, t_cyc_expire);
+		t_cursor = TAILQ_PREV(t_cursor, timerq, timer);
+	}
+
+	if (TAILQ_EMPTY(&ctrl->timerq))
+		ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE;
+	else
+		ctrl->t_cyc_next_expire =
+		    TAILQ_LAST(&ctrl->timerq, timerq)->t_timeout;
+
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+
+void
+psmi_timer_cancel_inner(struct psmi_timer_ctrl *ctrl,
+			struct psmi_timer *t_remove)
+{
+
+	psmi_assert(t_remove->flags & PSMI_TIMER_FLAG_PENDING);
+
+	t_remove->flags &= ~PSMI_TIMER_FLAG_PENDING;
+	TAILQ_REMOVE(&ctrl->timerq, t_remove, timer);
+
+	/*
+	 * If we're removing the last entry, we need to reset the
+	 * expiration cycle time.
+	 */
+	if (TAILQ_EMPTY(&ctrl->timerq))
+		ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE;
+	else
+		ctrl->t_cyc_next_expire =
+		    TAILQ_LAST(&ctrl->timerq, timerq)->t_timeout;
+	return;
+}
diff --git a/deps/libfabric/prov/psm3/psm3/psm_timer.h b/deps/libfabric/prov/psm3/psm3/psm_timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c03d18729541db1a6f5cc86d04883606ae88e5e
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_timer.h
@@ -0,0 +1,160 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_IN_USER_H
+#error psm_timer.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_TIMER_H
+#define _PSMI_TIMER_H
+
+
+typedef struct psmi_timer psmi_timer;
+typedef psm2_error_t(*psmi_timer_expire_callback_t) (struct psmi_timer *,
+						    uint64_t);
+
+struct psmi_timer {
+	TAILQ_ENTRY(psmi_timer) timer;	/* opaque */
+	uint64_t t_timeout;	/* opaque */
+	uint8_t flags;		/* opaque */
+
+	psmi_timer_expire_callback_t expire_callback; /* user -- callback fn */
+	void *context;		/* user -- callback param */
+};
+
+struct psmi_timer_ctrl {
+	uint64_t t_cyc_next_expire;
+	TAILQ_HEAD(timerq, psmi_timer) timerq;
+
+#if PSMI_TIMER_STATS
+	uint64_t num_insertions;
+	uint64_t num_traversals;
+#endif
+};
+
+/*
+ * Some events need to be unconditionally enqueued at the beginning of the
+ * timerq -- they are not timers meant to expire but merely operations that
+ * need to be delayed.  For delayed operations, there are 5 levels of
+ * priority.
+ */
+#define PSMI_TIMER_PRIO_0	 0ULL
+#define PSMI_TIMER_PRIO_1	 1ULL
+#define PSMI_TIMER_PRIO_2	 2ULL
+#define PSMI_TIMER_PRIO_3	 3ULL
+#define PSMI_TIMER_PRIO_4	 4ULL
+#define PSMI_TIMER_PRIO_LAST	 PSMI_TIMER_PRIO_4
+
+#define PSMI_TIMER_INFINITE	 0xFFFFFFFFFFFFFFFFULL
+#define PSMI_TIMER_FLAG_PENDING  0x01
+
+/*
+ * Timer control initialization and finalization
+ */
+psm2_error_t psmi_timer_init(struct psmi_timer_ctrl *ctrl);
+psm2_error_t psmi_timer_fini(struct psmi_timer_ctrl *ctrl);
+
+/*
+ * Timer entry initialization (a timer must be initialized before it can be
+ * added to the timer request queue).
+ */
+
+PSMI_ALWAYS_INLINE(
+void
+psmi_timer_entry_init(struct psmi_timer *t_init,
+		      psmi_timer_expire_callback_t expire_fn,
+		      void *context))
+{
+	t_init->flags = 0;
+	t_init->expire_callback = expire_fn;
+	t_init->context = context;
+	return;
+}
+
+/*
+ * Timer requests, conditional (macro) or unconditional
+ */
+#define psmi_timer_request(ctrl, t_insert, t_cyc)			\
+	    if (!((t_insert)->flags & PSMI_TIMER_FLAG_PENDING))		\
+		psmi_timer_request_always((ctrl), (t_insert), (t_cyc))
+
+void psmi_timer_request_always(struct psmi_timer_ctrl *ctrl,
+			       struct psmi_timer *t_insert,
+			       uint64_t t_cyc_expire);
+
+/*
+ * Timer cancelations, conditional (macro) only (cancel_inner is internal)
+ */
+#define psmi_timer_cancel(ctrl, t_remove)		    \
+	    if ((t_remove)->flags & PSMI_TIMER_FLAG_PENDING) \
+		psmi_timer_cancel_inner(ctrl, t_remove)
+void psmi_timer_cancel_inner(struct psmi_timer_ctrl *ctrl,
+			     struct psmi_timer *t_remove);
+
+/*
+ * Timer processing, conditional or unconditional.
+ */
+#define psmi_timer_process_if_expired(ctrl, t_cyc_expire)		\
+	    (((ctrl)->t_cyc_next_expire <= (t_cyc_expire)) ?		\
+	     psmi_timer_process_expired(ctrl, t_cyc_expire) :           \
+	     PSM2_OK_NO_PROGRESS)
+
+#define psmi_timer_is_expired(ctrl, t_cyc_expire)			\
+	    ((ctrl)->t_cyc_next_expire <= (t_cyc_expire))
+
+psm2_error_t psmi_timer_process_expired(struct psmi_timer_ctrl *ctrl,
+				       uint64_t t_cyc_expire);
+
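+/* One-shot timer sketch (expire_cb, ctx and the cycle-count values are
+ * illustrative):
+ *
+ *	struct psmi_timer t;
+ *	psmi_timer_entry_init(&t, expire_cb, ctx);
+ *	psmi_timer_request(&ctrl, &t, t_cyc_now + t_cyc_delay);
+ *	...
+ *	psmi_timer_process_if_expired(&ctrl, t_cyc_now);
+ */
+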
+#endif /* _PSMI_TIMER_H */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_udp_ep.c b/deps/libfabric/prov/psm3/psm3/psm_udp_ep.c
new file mode 100644
index 0000000000000000000000000000000000000000..bd99a7260ff0ef4a4281e041bbdf21be8ef726f8
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_udp_ep.c
@@ -0,0 +1,54 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_udp_ep.h b/deps/libfabric/prov/psm3/psm3/psm_udp_ep.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a02ee9e6e400d7f8f66f4f4dbacd14ed71e1d14
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_udp_ep.h
@@ -0,0 +1,56 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+
diff --git a/deps/libfabric/prov/psm3/psm3/psm_user.h b/deps/libfabric/prov/psm3/psm3/psm_user.h
new file mode 100644
index 0000000000000000000000000000000000000000..d3765155ad37beee43ea1fc659d5628439bfd4a0
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_user.h
@@ -0,0 +1,737 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_USER_H
+#define _PSMI_USER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "psm_config.h"
+#include <inttypes.h>
+#include <pthread.h>
+
+#include <sched.h>
+#include <numa.h>
+#include <semaphore.h>
+#include <fcntl.h>
+#include <stdbool.h>
+
+#include "psm2.h"
+#include "psm2_mq.h"
+
+#include "ptl.h"
+
+#include "opa_user.h"
+#include "opa_queue.h"
+
+#include "psm_log.h"
+#include "psm_perf.h"
+
+#define PSMI_LOCK_NO_OWNER	((pthread_t)(-1))
+
+#define _PSMI_IN_USER_H
+
+/* Opaque hw context pointer used in HAL,
+   and defined by each HAL instance. */
+typedef void *psmi_hal_hw_context;
+
+#include "psm_help.h"
+#include "psm_error.h"
+#include "psm_context.h"
+#include "psm_utils.h"
+#include "psm_timer.h"
+#include "psm_mpool.h"
+#include "psm_ep.h"
+#include "psm_lock.h"
+#include "psm_stats.h"
+#include "psm2_mock_testing.h"
+
+#undef _PSMI_IN_USER_H
+
+#define PSMI_VERNO_MAKE(major, minor) ((((major)&0xff)<<8)|((minor)&0xff))
+#define PSMI_VERNO  PSMI_VERNO_MAKE(PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR)
+#define PSMI_VERNO_GET_MAJOR(verno) (((verno)>>8) & 0xff)
+#define PSMI_VERNO_GET_MINOR(verno) (((verno)>>0) & 0xff)
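+/* e.g. PSMI_VERNO_MAKE(2, 1) == 0x0201: major 2, minor 1 */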
+
+int psmi_verno_client();
+int psmi_verno_isinteroperable(uint16_t verno);
+int MOCKABLE(psmi_isinitialized)();
+MOCK_DCL_EPILOGUE(psmi_isinitialized);
+
+psm2_error_t psmi_poll_internal(psm2_ep_t ep, int poll_amsh);
+psm2_error_t psmi_mq_wait_internal(psm2_mq_req_t *ireq);
+
+int psmi_get_current_proc_location();
+
+extern int psmi_epid_ver;
+extern int psmi_allow_routers;
+extern uint32_t non_dw_mul_sdma;
+extern psmi_lock_t psmi_creation_lock;
+extern psm2_ep_t psmi_opened_endpoint;
+
+extern int psmi_affinity_shared_file_opened;
+extern uint64_t *shared_affinity_ptr;
+extern char *affinity_shm_name;
+
+extern sem_t *sem_affinity_shm_rw;
+extern int psmi_affinity_semaphore_open;
+extern char *sem_affinity_shm_rw_name;
+
+PSMI_ALWAYS_INLINE(
+int
+_psmi_get_epid_version()) {
+	return psmi_epid_ver;
+}
+
+#define PSMI_EPID_VERSION_SHM 				0
+#define PSMI_EPID_SHM_ONLY				1
+#define PSMI_EPID_IPS_SHM				0
+#define PSMI_EPID_VERSION 				_psmi_get_epid_version()
+#define PSMI_MAX_EPID_VERNO_SUPPORTED			4
+#define PSMI_MIN_EPID_VERNO_SUPPORTED			3
+#define PSMI_EPID_VERNO_DEFAULT				3	// allows 3 or 4 based on NIC
+#define PSMI_EPID_V3					3	// IB UD
+#define PSMI_EPID_V4					4	// Eth UD
+
+#define PSMI_EPID_GET_LID(epid) ((PSMI_EPID_GET_EPID_VERSION(epid) == PSMI_EPID_V3) ? \
+								 (int)PSMI_EPID_GET_LID_V3(epid)      \
+							   : (int)PSMI_EPID_GET_LID_V4(epid))
+// for V3 we use the low 16 bits and the next 16 should be zero
+// for V4 the network is in the low 32 bits
+#define PSMI_GET_SUBNET_ID(gid_hi) (gid_hi & 0xffffffff)
+
+
+/*
+ * Following is the definition of various lock implementations. The choice is
+ * made by defining specific lock type in relevant section of psm_config.h
+ */
+#ifdef PSMI_LOCK_IS_SPINLOCK
+#define _PSMI_LOCK_INIT(pl)	psmi_spin_init(&((pl).lock))
+#define _PSMI_LOCK_TRY(pl)	psmi_spin_trylock(&((pl).lock))
+#define _PSMI_LOCK(pl)		psmi_spin_lock(&((pl).lock))
+#define _PSMI_UNLOCK(pl)	psmi_spin_unlock(&((pl).lock))
+#define _PSMI_LOCK_ASSERT(pl)
+#define _PSMI_UNLOCK_ASSERT(pl)
+#define PSMI_LOCK_DISABLED	0
+
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG)
+
+PSMI_ALWAYS_INLINE(
+int
+_psmi_mutex_trylock_inner(pthread_mutex_t *mutex,
+			  const char *curloc, pthread_t *lock_owner))
+{
+	psmi_assert_always_loc(*lock_owner != pthread_self(),
+			       curloc);
+	int ret = pthread_mutex_trylock(mutex);
+	if (ret == 0)
+		*lock_owner = pthread_self();
+	return ret;
+}
+
+PSMI_ALWAYS_INLINE(
+int
+_psmi_mutex_lock_inner(pthread_mutex_t *mutex,
+		       const char *curloc, pthread_t *lock_owner))
+{
+	psmi_assert_always_loc(*lock_owner != pthread_self(),
+			       curloc);
+	int ret = pthread_mutex_lock(mutex);
+	psmi_assert_always_loc(ret != EDEADLK, curloc);
+	*lock_owner = pthread_self();
+	return ret;
+}
+
+PSMI_ALWAYS_INLINE(
+void
+_psmi_mutex_unlock_inner(pthread_mutex_t *mutex,
+			 const char *curloc, pthread_t *lock_owner))
+{
+	psmi_assert_always_loc(*lock_owner == pthread_self(),
+			       curloc);
+	*lock_owner = PSMI_LOCK_NO_OWNER;
+	psmi_assert_always_loc(pthread_mutex_unlock(mutex) !=
+			       EPERM, curloc);
+	return;
+}
+
+#define _PSMI_LOCK_INIT(pl)	/* static initialization */
+#define _PSMI_LOCK_TRY(pl)							\
+	    _psmi_mutex_trylock_inner(&((pl).lock), PSMI_CURLOC,		\
+					&((pl).lock_owner))
+#define _PSMI_LOCK(pl)								\
+	    _psmi_mutex_lock_inner(&((pl).lock), PSMI_CURLOC,			\
+                                        &((pl).lock_owner))
+#define _PSMI_UNLOCK(pl)							\
+	    _psmi_mutex_unlock_inner(&((pl).lock), PSMI_CURLOC,			\
+                                        &((pl).lock_owner))
+#define _PSMI_LOCK_ASSERT(pl)							\
+	psmi_assert_always((pl).lock_owner == pthread_self());
+#define _PSMI_UNLOCK_ASSERT(pl)							\
+	psmi_assert_always((pl).lock_owner != pthread_self());
+#define PSMI_LOCK_DISABLED	0
+
+#elif defined(PSMI_LOCK_IS_MUTEXLOCK)
+#define _PSMI_LOCK_INIT(pl)	/* static initialization */
+#define _PSMI_LOCK_TRY(pl)	pthread_mutex_trylock(&((pl).lock))
+#define _PSMI_LOCK(pl)		pthread_mutex_lock(&((pl).lock))
+#define _PSMI_UNLOCK(pl)	pthread_mutex_unlock(&((pl).lock))
+#define PSMI_LOCK_DISABLED	0
+#define _PSMI_LOCK_ASSERT(pl)
+#define _PSMI_UNLOCK_ASSERT(pl)
+
+#elif defined(PSMI_PLOCK_IS_NOLOCK)
+#define _PSMI_LOCK_TRY(pl)	0	/* 0 *only* so progress thread never succeeds */
+#define _PSMI_LOCK(pl)
+#define _PSMI_UNLOCK(pl)
+#define PSMI_LOCK_DISABLED	1
+#define _PSMI_LOCK_ASSERT(pl)
+#define _PSMI_UNLOCK_ASSERT(pl)
+#else
+#error No LOCK lock type declared
+#endif
+
+#define PSMI_YIELD(pl)							\
+	do { _PSMI_UNLOCK((pl)); sched_yield(); _PSMI_LOCK((pl)); } while (0)
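+
+/* PSMI_YIELD drops the lock around sched_yield() and reacquires it so other
+ * threads (e.g. a progress thread) can run while the caller waits.  Minimal
+ * sketch, assuming a hypothetical psmi_lock_t member named 'lock':
+ *	while (!done)
+ *		PSMI_YIELD(ep->lock);
+ */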
+
+#ifdef PSM2_MOCK_TESTING
+/* If this is a mocking tests build, all the operations on the locks
+ * are routed through functions which may be mocked, if necessary.  */
+void MOCKABLE(psmi_mockable_lock_init)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_lock_init);
+
+int MOCKABLE(psmi_mockable_lock_try)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_lock_try);
+
+void MOCKABLE(psmi_mockable_lock)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_lock);
+
+void MOCKABLE(psmi_mockable_unlock)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_unlock);
+
+void MOCKABLE(psmi_mockable_lock_assert)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_lock_assert);
+
+void MOCKABLE(psmi_mockable_unlock_assert)(psmi_lock_t *pl);
+MOCK_DCL_EPILOGUE(psmi_mockable_unlock_assert);
+
+#define PSMI_LOCK_INIT(pl)	psmi_mockable_lock_init(&(pl))
+#define PSMI_LOCK_TRY(pl)	psmi_mockable_lock_try(&(pl))
+#define PSMI_LOCK(pl)		psmi_mockable_lock(&(pl))
+#define PSMI_UNLOCK(pl)		psmi_mockable_unlock(&(pl))
+#define PSMI_LOCK_ASSERT(pl)	psmi_mockable_lock_assert(&(pl))
+#define PSMI_UNLOCK_ASSERT(pl)	psmi_mockable_unlock_assert(&(pl))
+#else
+#define PSMI_LOCK_INIT(pl)	_PSMI_LOCK_INIT(pl)
+#define PSMI_LOCK_TRY(pl)	_PSMI_LOCK_TRY(pl)
+#define PSMI_LOCK(pl)		_PSMI_LOCK(pl)
+#define PSMI_UNLOCK(pl)		_PSMI_UNLOCK(pl)
+#define PSMI_LOCK_ASSERT(pl)	_PSMI_LOCK_ASSERT(pl)
+#define PSMI_UNLOCK_ASSERT(pl)	_PSMI_UNLOCK_ASSERT(pl)
+#endif
+
+#ifdef PSM_PROFILE
+void psmi_profile_block() __attribute__ ((weak));
+void psmi_profile_unblock() __attribute__ ((weak));
+void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak));
+
+#define PSMI_PROFILE_BLOCK()		psmi_profile_block()
+#define PSMI_PROFILE_UNBLOCK()		psmi_profile_unblock()
+#define PSMI_PROFILE_REBLOCK(noprog)	psmi_profile_reblock(noprog)
+#else
+#define PSMI_PROFILE_BLOCK()
+#define PSMI_PROFILE_UNBLOCK()
+#define PSMI_PROFILE_REBLOCK(noprog)
+#endif
+
+#ifdef PSM_CUDA
+
+#ifndef PSM_CUDA_MOCK
+#include <cuda.h>
+#include <driver_types.h>
+
+#if CUDA_VERSION < 7000
+#error Please update CUDA driver, required minimum version is 7.0
+#endif
+#else
+// included in stand-alone unit test that does not use real CUDA functions
+#include "psmi_cuda_mock.h"
+#endif /* PSM_CUDA_MOCK */
+
+extern int is_cuda_enabled;
+extern int is_gdr_copy_enabled;
+extern int is_gpudirect_enabled; // only for use during parsing of other params
+extern int _device_support_unified_addr;
+extern int _device_support_gpudirect;
+extern int _gpu_p2p_supported;
+extern int my_gpu_device;
+extern int cuda_lib_version;
+
+extern CUcontext cu_ctxt;
+extern void *psmi_cuda_lib;
+
+extern CUresult (*psmi_cuInit)(unsigned int  Flags );
+extern CUresult (*psmi_cuCtxDetach)(CUcontext c);
+extern CUresult (*psmi_cuCtxGetCurrent)(CUcontext *c);
+extern CUresult (*psmi_cuCtxSetCurrent)(CUcontext c);
+extern CUresult (*psmi_cuPointerGetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p);
+extern CUresult (*psmi_cuPointerSetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p);
+extern CUresult (*psmi_cuDeviceCanAccessPeer)(int *canAccessPeer, CUdevice dev, CUdevice peerDev);
+extern CUresult (*psmi_cuDeviceGet)(CUdevice* device, int  ordinal);
+extern CUresult (*psmi_cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev);
+extern CUresult (*psmi_cuDriverGetVersion)(int* driverVersion);
+extern CUresult (*psmi_cuDeviceGetCount)(int* count);
+extern CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags);
+extern CUresult (*psmi_cuStreamDestroy)(CUstream phStream);
+extern CUresult (*psmi_cuStreamSynchronize)(CUstream phStream);
+extern CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags);
+extern CUresult (*psmi_cuEventDestroy)(CUevent hEvent);
+extern CUresult (*psmi_cuEventQuery)(CUevent hEvent);
+extern CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream);
+extern CUresult (*psmi_cuEventSynchronize)(CUevent hEvent);
+extern CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags);
+extern CUresult (*psmi_cuMemFreeHost)(void* p);
+extern CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
+extern CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
+extern CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount);
+extern CUresult (*psmi_cuMemcpyHtoD)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount);
+extern CUresult (*psmi_cuMemcpyDtoHAsync)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+extern CUresult (*psmi_cuMemcpyHtoDAsync)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream);
+extern CUresult (*psmi_cuIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr);
+extern CUresult (*psmi_cuIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags);
+extern CUresult (*psmi_cuIpcCloseMemHandle)(CUdeviceptr dptr);
+extern CUresult (*psmi_cuMemGetAddressRange)(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr);
+extern CUresult (*psmi_cuDevicePrimaryCtxGetState)(CUdevice dev, unsigned int* flags, int* active);
+extern CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev);
+extern CUresult (*psmi_cuCtxGetDevice)(CUdevice* device);
+extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device);
+
+extern uint64_t psmi_count_cuInit;
+extern uint64_t psmi_count_cuCtxDetach;
+extern uint64_t psmi_count_cuCtxGetCurrent;
+extern uint64_t psmi_count_cuCtxSetCurrent;
+extern uint64_t psmi_count_cuPointerGetAttribute;
+extern uint64_t psmi_count_cuPointerSetAttribute;
+extern uint64_t psmi_count_cuDeviceCanAccessPeer;
+extern uint64_t psmi_count_cuDeviceGet;
+extern uint64_t psmi_count_cuDeviceGetAttribute;
+extern uint64_t psmi_count_cuDriverGetVersion;
+extern uint64_t psmi_count_cuDeviceGetCount;
+extern uint64_t psmi_count_cuStreamCreate;
+extern uint64_t psmi_count_cuStreamDestroy;
+extern uint64_t psmi_count_cuStreamSynchronize;
+extern uint64_t psmi_count_cuEventCreate;
+extern uint64_t psmi_count_cuEventDestroy;
+extern uint64_t psmi_count_cuEventQuery;
+extern uint64_t psmi_count_cuEventRecord;
+extern uint64_t psmi_count_cuEventSynchronize;
+extern uint64_t psmi_count_cuMemHostAlloc;
+extern uint64_t psmi_count_cuMemFreeHost;
+extern uint64_t psmi_count_cuMemcpy;
+extern uint64_t psmi_count_cuMemcpyDtoD;
+extern uint64_t psmi_count_cuMemcpyDtoH;
+extern uint64_t psmi_count_cuMemcpyHtoD;
+extern uint64_t psmi_count_cuMemcpyDtoHAsync;
+extern uint64_t psmi_count_cuMemcpyHtoDAsync;
+extern uint64_t psmi_count_cuIpcGetMemHandle;
+extern uint64_t psmi_count_cuIpcOpenMemHandle;
+extern uint64_t psmi_count_cuIpcCloseMemHandle;
+extern uint64_t psmi_count_cuMemGetAddressRange;
+extern uint64_t psmi_count_cuDevicePrimaryCtxGetState;
+extern uint64_t psmi_count_cuDevicePrimaryCtxRetain;
+extern uint64_t psmi_count_cuCtxGetDevice;
+extern uint64_t psmi_count_cuDevicePrimaryCtxRelease;
+
+static int check_set_cuda_ctxt(void)
+{
+	CUresult err;
+	CUcontext tmpctxt = {0};
+
+	if (unlikely(!psmi_cuCtxGetCurrent || !psmi_cuCtxSetCurrent))
+		return 0;
+
+	err = psmi_cuCtxGetCurrent(&tmpctxt);
+	if (likely(!err)) {
+		if (unlikely(!tmpctxt && cu_ctxt)) {
+			err = psmi_cuCtxSetCurrent(cu_ctxt);
+			return !!err;
+		} else if (unlikely(tmpctxt && !cu_ctxt)) {
+			cu_ctxt = tmpctxt;
+		}
+	}
+	return 0;
+}
+
+
+#define PSMI_CUDA_CALL(func, args...) do {				\
+		CUresult cudaerr;					\
+		if (unlikely(check_set_cuda_ctxt())) {			\
+			psmi_handle_error(PSMI_EP_NORETURN,		\
+			PSM2_INTERNAL_ERR, "Failed to set/synchronize"	\
+			" CUDA context.\n");				\
+		}							\
+		psmi_count_##func++;					\
+		cudaerr = psmi_##func(args);				\
+		if (cudaerr != CUDA_SUCCESS) {				\
+			_HFI_ERROR(					\
+				"CUDA failure: %s() (at %s:%d)"		\
+				"returned %d\n",			\
+				#func, __FILE__, __LINE__, cudaerr);	\
+			psmi_handle_error(				\
+				PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,	\
+				"Error returned from CUDA function.\n");\
+		}							\
+	} while (0)
+
+PSMI_ALWAYS_INLINE(
+void verify_device_support_unified_addr())
+{
+	if (likely(_device_support_unified_addr > -1)) return;
+
+	int num_devices, dev;
+
+	/* Check if all devices support Unified Virtual Addressing. */
+	PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices);
+
+	_device_support_unified_addr = 1;
+
+	for (dev = 0; dev < num_devices; dev++) {
+		CUdevice device;
+		PSMI_CUDA_CALL(cuDeviceGet, &device, dev);
+		int unifiedAddressing;
+		PSMI_CUDA_CALL(cuDeviceGetAttribute,
+				&unifiedAddressing,
+				CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING,
+				device);
+
+		if (unifiedAddressing != 1) {
+			psmi_handle_error(PSMI_EP_NORETURN, PSM2_EP_DEVICE_FAILURE,
+				"CUDA device %d does not support Unified Virtual Addressing.\n",
+				dev);
+		}
+	}
+
+	return;
+}
+
+PSMI_ALWAYS_INLINE(
+int device_support_gpudirect())
+{
+	if (likely(_device_support_gpudirect > -1)) return _device_support_gpudirect;
+
+	int num_devices, dev;
+
+	/* Check if all devices meet the minimum compute capability for GPUDirect. */
+	PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices);
+
+	_device_support_gpudirect = 1;
+
+	for (dev = 0; dev < num_devices; dev++) {
+		CUdevice device;
+		PSMI_CUDA_CALL(cuDeviceGet, &device, dev);
+
+		int major;
+		PSMI_CUDA_CALL(cuDeviceGetAttribute,
+				&major,
+				CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+				device);
+		if (major < 3) {
+			_device_support_gpudirect = 0;
+			_HFI_INFO("CUDA device %d does not support GPUDirect RDMA (Non-fatal error)\n", dev);
+		}
+	}
+
+	return _device_support_gpudirect;
+}
+
+PSMI_ALWAYS_INLINE(
+int gpu_p2p_supported())
+{
+	if (likely(_gpu_p2p_supported > -1)) return _gpu_p2p_supported;
+
+	if (unlikely(!is_cuda_enabled)) {
+		_gpu_p2p_supported = 0;
+		return 0;
+	}
+
+	int num_devices, dev;
+	CUcontext c;
+
+	/* Check which devices the current device has p2p access to. */
+	CUdevice current_device;
+	PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices);
+	_gpu_p2p_supported = 0;
+
+	if (num_devices > 1) {
+		PSMI_CUDA_CALL(cuCtxGetCurrent, &c);
+		if (c == NULL) {
+			_HFI_INFO("Unable to find active CUDA context, assuming P2P not supported\n");
+			return 0;
+		}
+		PSMI_CUDA_CALL(cuCtxGetDevice, &current_device);
+	}
+
+	for (dev = 0; dev < num_devices; dev++) {
+		CUdevice device;
+		PSMI_CUDA_CALL(cuDeviceGet, &device, dev);
+
+		if (num_devices > 1 && device != current_device) {
+			int canAccessPeer = 0;
+			PSMI_CUDA_CALL(cuDeviceCanAccessPeer, &canAccessPeer,
+					current_device, device);
+
+			if (canAccessPeer != 1)
+				_HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev);
+			else
+				_gpu_p2p_supported |= (1 << device);
+		} else {
+			/* Always support p2p on the same GPU */
+			my_gpu_device = device;
+			_gpu_p2p_supported |= (1 << device);
+		}
+	}
+
+	return _gpu_p2p_supported;
+}
+
+/**
+ * Similar to PSMI_CUDA_CALL() except it does not error out
+ * when func(args) returns CUDA_SUCCESS or except_err.
+ *
+ * The invoker must provide 'CUresult cudaerr' in the invoking scope
+ * so it can inspect whether cudaerr == CUDA_SUCCESS or
+ * cudaerr == except_err after the expanded code has executed.
+ *
+ * As except_err is an allowed value, the message is printed at
+ * DBG level.
+ */
+#define PSMI_CUDA_CALL_EXCEPT(except_err, func, args...) do {		\
+		if (unlikely(check_set_cuda_ctxt())) {			\
+			psmi_handle_error(PSMI_EP_NORETURN,		\
+				PSM2_INTERNAL_ERR, "Failed to "		\
+				"set/synchronize CUDA context.\n");	\
+		}							\
+		psmi_count_##func++;					\
+		cudaerr = psmi_##func(args);				\
+		if (cudaerr != CUDA_SUCCESS && cudaerr != except_err) {	\
+			if (cu_ctxt == NULL)				\
+				_HFI_ERROR(				\
+				"Check if CUDA is initialized"	\
+				"before psm2_ep_open call \n");		\
+			_HFI_ERROR(					\
+				"CUDA failure: %s() (at %s:%d)"		\
+				"returned %d\n",			\
+				#func, __FILE__, __LINE__, cudaerr);	\
+			psmi_handle_error(				\
+				PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,	\
+				"Error returned from CUDA function.\n");\
+		} else if (cudaerr == except_err) { \
+			_HFI_DBG( \
+				"CUDA non-zero return value: %s() (at %s:%d)"		\
+				"returned %d\n",			\
+				#func, __FILE__, __LINE__, cudaerr);	\
+		} \
+	} while (0)
+
+#define PSMI_CUDA_CHECK_EVENT(event, cudaerr) do {			\
+		psmi_count_cuEventQuery++;				\
+		cudaerr = psmi_cuEventQuery(event);			\
+		if ((cudaerr != CUDA_SUCCESS) &&			\
+		    (cudaerr != CUDA_ERROR_NOT_READY)) {		\
+			_HFI_ERROR(					\
+				"CUDA failure: %s() returned %d\n",	\
+				"cuEventQuery", cudaerr);		\
+			psmi_handle_error(				\
+				PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,	\
+				"Error returned from CUDA function.\n");\
+		}							\
+	} while (0)
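+
+/* Usage sketch for PSMI_CUDA_CHECK_EVENT (illustrative only): the caller
+ * declares 'CUresult cudaerr' in scope; CUDA_ERROR_NOT_READY simply means
+ * the asynchronous copy tracked by the event has not finished yet:
+ *	CUresult cudaerr;
+ *	PSMI_CUDA_CHECK_EVENT(chb->copy_status, cudaerr);
+ *	if (cudaerr == CUDA_SUCCESS) { ... copy finished ... }
+ */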
+
+#define PSMI_CUDA_DLSYM(psmi_cuda_lib,func) do {                        \
+	psmi_##func = dlsym(psmi_cuda_lib, STRINGIFY(func));            \
+	if (!psmi_##func) {               				\
+		psmi_handle_error(PSMI_EP_NORETURN,                     \
+			       PSM2_INTERNAL_ERR,                       \
+			       " Unable to resolve %s symbol"		\
+			       " in CUDA libraries.\n",STRINGIFY(func));\
+	}                                                               \
+} while (0)
+
+PSMI_ALWAYS_INLINE(
+int
+_psmi_is_cuda_mem(const void *ptr))
+{
+	CUresult cres;
+	CUmemorytype mt;
+	unsigned uvm = 0;
+	psmi_count_cuPointerGetAttribute++;
+	cres = psmi_cuPointerGetAttribute(
+		&mt, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr) ptr);
+	if ((cres == CUDA_SUCCESS) && (mt == CU_MEMORYTYPE_DEVICE)) {
+		psmi_count_cuPointerGetAttribute++;
+		cres = psmi_cuPointerGetAttribute(
+			&uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr);
+		if ((cres == CUDA_SUCCESS) && (uvm == 0))
+			return 1;
+		else
+			return 0;
+	} else
+		return 0;
+}
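+
+/* _psmi_is_cuda_mem() returns 1 only for non-managed device memory: the
+ * first query classifies the pointer as CU_MEMORYTYPE_DEVICE, the second
+ * excludes managed (UVM) allocations, which the host can address directly. */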
+
+#define PSMI_IS_CUDA_ENABLED  likely(is_cuda_enabled)
+#define PSMI_IS_CUDA_DISABLED unlikely(!is_cuda_enabled)
+
+PSMI_ALWAYS_INLINE(
+int
+_psmi_is_gdr_copy_enabled())
+{
+	return is_gdr_copy_enabled;
+}
+
+#define PSMI_IS_GDR_COPY_ENABLED _psmi_is_gdr_copy_enabled()
+
+#define PSMI_IS_CUDA_MEM(p) _psmi_is_cuda_mem(p)
+extern void psm2_get_gpu_bars(void);
+
+struct ips_cuda_hostbuf {
+	STAILQ_ENTRY(ips_cuda_hostbuf) req_next;
+	STAILQ_ENTRY(ips_cuda_hostbuf) next;
+	uint32_t size, offset, bytes_read;
+	/* This flag indicates whether a chb is
+	 * pulled from an mpool or dynamically
+	 * allocated using calloc. */
+	uint8_t is_tempbuf;
+	CUevent copy_status;
+	psm2_mq_req_t req;
+	void *host_buf;
+	CUdeviceptr gpu_buf;
+};
+
+struct ips_cuda_hostbuf_mpool_cb_context {
+	unsigned bufsz;
+};
+void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj);
+
+#define CUDA_HOSTBUFFER_LIMITS {				\
+	    .env = "PSM3_CUDA_BOUNCEBUFFERS_MAX",		\
+	    .descr = "Max CUDA bounce buffers (in MB)",		\
+	    .env_level = PSMI_ENVVAR_LEVEL_HIDDEN,		\
+	    .minval = 1,					\
+	    .maxval = 1<<30,					\
+	    .mode[PSMI_MEMMODE_NORMAL]  = {  16, 256 },		\
+	    .mode[PSMI_MEMMODE_MINIMAL] = {   1,   1 },		\
+	    .mode[PSMI_MEMMODE_LARGE]   = {  32, 512 }		\
+	}
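+
+/* Each .mode[] pair above is { obj_max, obj_chunk } as consumed by
+ * psmi_parse_mpool_env(): the cap on the bounce-buffer pool and the chunk
+ * size used when growing it. */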
+
+extern uint32_t gpudirect_send_limit;
+extern uint32_t gpudirect_recv_limit;
+extern uint32_t cuda_thresh_rndv;
+/* This limit dictates when the sender turns off
+ * GDR Copy and uses SDMA. The limit must be less than or equal
+ * to the CUDA RNDV threshold.
+ * Set to 0 if GDR Copy is disabled.
+ */
+extern uint32_t gdr_copy_limit_send;
+/* This limit dictates when the receiver turns off
+ * GDR Copy. The limit must be less than or equal
+ * to the CUDA RNDV threshold.
+ * Set to 0 if GDR Copy is disabled.
+ */
+extern uint32_t gdr_copy_limit_recv;
+
+uint64_t gpu_cache_evict;
+
+// Only valid if called for a GPU buffer
+#define PSMI_USE_GDR_COPY_RECV(len) ((len) >= 1 && (len) <= gdr_copy_limit_recv)
+
+enum psm2_chb_match_type {
+	/* Complete data found in a single chb */
+	PSMI_CUDA_FULL_MATCH_FOUND = 0,
+	/* Data is spread across two chb's */
+	PSMI_CUDA_SPLIT_MATCH_FOUND = 1,
+	/* Data is only partially prefetched */
+	PSMI_CUDA_PARTIAL_MATCH_FOUND = 2,
+	PSMI_CUDA_CONTINUE = 3
+};
+typedef enum psm2_chb_match_type psm2_chb_match_type_t;
+
+/*
+ * CUDA documentation dictates the use of SYNC_MEMOPS attribute
+ * when the buffer pointer received into PSM has been allocated
+ * by the application. This guarantees that all memory operations
+ * to this region of memory (used by multiple layers of the stack)
+ * always synchronize.
+ */
+static inline
+void psmi_cuda_set_attr_sync_memops(const void *ubuf)
+{
+	int true_flag = 1;
+
+	PSMI_CUDA_CALL(cuPointerSetAttribute, &true_flag,
+		       CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr) ubuf);
+}
+
+#endif /* PSM_CUDA */
+
+#define COMPILE_TIME_ASSERT(NAME,COND) extern char NAME[1/(COND)]
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* _PSMI_USER_H */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_utils.c b/deps/libfabric/prov/psm3/psm3/psm_utils.c
new file mode 100644
index 0000000000000000000000000000000000000000..20f640aed52ae62e55fa7e67499d79324e417e43
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_utils.c
@@ -0,0 +1,3200 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <netdb.h>		/* gethostbyname */
+#include <malloc.h>             /* malloc_usable_size */
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "psm_am_internal.h"
+#include "psm_mq_internal.h"
+#include "ips_proto_params.h"
+#include <netinet/in.h>  // for sockaddr
+#include <fnmatch.h>
+
+
+int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid);
+
+struct psmi_epid_table psmi_epid_table;
+
+/* Iterator to access the epid table.
+ * 'ep' can be NULL if remote endpoints from all endpoint handles are requested
+ */
+void psmi_epid_itor_init(struct psmi_eptab_iterator *itor, psm2_ep_t ep)
+{
+	itor->i = 0;
+	itor->ep = ep;
+	pthread_mutex_lock(&psmi_epid_table.tablock);
+}
+
+void *psmi_epid_itor_next(struct psmi_eptab_iterator *itor)
+{
+	int i;
+	struct psmi_epid_tabentry *e;
+
+	if (itor->i >= psmi_epid_table.tabsize)
+		return NULL;
+	for (i = itor->i; i < psmi_epid_table.tabsize; i++) {
+		e = &psmi_epid_table.table[i];
+		if (!e->entry || e->entry == EPADDR_DELETED)
+			continue;
+		if (itor->ep && e->ep != itor->ep)
+			continue;
+		itor->i = i + 1;
+		return e->entry;
+	}
+	itor->i = psmi_epid_table.tabsize;	/* put at end of table */
+	return NULL;
+}
+
+void psmi_epid_itor_fini(struct psmi_eptab_iterator *itor)
+{
+	pthread_mutex_unlock(&psmi_epid_table.tablock);
+	itor->i = 0;
+}
+
+#define mix64(a, b, c) \
+{ \
+	a -= b; a -= c; a ^= (c>>43); \
+	b -= c; b -= a; b ^= (a<<9);  \
+	c -= a; c -= b; c ^= (b>>8);  \
+	a -= b; a -= c; a ^= (c>>38); \
+	b -= c; b -= a; b ^= (a<<23); \
+	c -= a; c -= b; c ^= (b>>5);  \
+	a -= b; a -= c; a ^= (c>>35); \
+	b -= c; b -= a; b ^= (a<<49); \
+	c -= a; c -= b; c ^= (b>>11); \
+	a -= b; a -= c; a ^= (c>>12); \
+	b -= c; b -= a; b ^= (a<<18); \
+	c -= a; c -= b; c ^= (b>>22); \
+}
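+
+/* mix64 is a Bob Jenkins-style 64-bit mixing step; hash_this() below feeds
+ * it the golden-ratio constant 0x9e3779b97f4a7c13 to spread (ep, epid)
+ * pairs across the table. */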
+
+psm2_error_t psmi_epid_init()
+{
+	pthread_mutexattr_t attr;
+	psmi_epid_table.table = NULL;
+	psmi_epid_table.tabsize = 0;
+	psmi_epid_table.tabsize_used = 0;
+	pthread_mutexattr_init(&attr);
+	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+	pthread_mutex_init(&psmi_epid_table.tablock, &attr);
+	pthread_mutexattr_destroy(&attr);
+	return PSM2_OK;
+}
+
+psm2_error_t psmi_epid_fini()
+{
+	if (psmi_epid_table.table != NULL) {
+		psmi_free(psmi_epid_table.table);
+		psmi_epid_table.table = NULL;
+	}
+	psmi_epid_table.tabsize = 0;
+	psmi_epid_table.tabsize_used = 0;
+	return PSM2_OK;
+}
+
+PSMI_ALWAYS_INLINE(
+uint64_t
+hash_this(const psm2_ep_t ep, const psm2_epid_t epid))
+{
+	uint64_t ep_i = (uint64_t) (uintptr_t) ep;
+	uint64_t epid_i = (uint64_t) epid;
+	uint64_t hash = 0x9e3779b97f4a7c13LL;
+	mix64(ep_i, epid_i, hash);
+	return hash;
+}
+
+PSMI_ALWAYS_INLINE(
+void *
+psmi_epid_lookup_inner(psm2_ep_t ep, psm2_epid_t epid, int remove))
+{
+	uint64_t key = hash_this(ep, epid);
+	struct psmi_epid_tabentry *e;
+	void *entry = NULL;
+	int idx;
+
+	pthread_mutex_lock(&psmi_epid_table.tablock);
+	if (!psmi_epid_table.table)
+		goto ret;
+	idx = (int)(key % psmi_epid_table.tabsize);
+	while (psmi_epid_table.table[idx].entry != NULL) {
+		/* An epid can be added twice if there's more than one opened endpoint,
+		 * but really we match on epid *and* on endpoint */
+		e = &psmi_epid_table.table[idx];
+		if (e->entry != EPADDR_DELETED && e->key == key) {
+			entry = e->entry;
+			if (remove)
+				psmi_epid_table.table[idx].entry =
+				    EPADDR_DELETED;
+			goto ret;
+		}
+		if (++idx == psmi_epid_table.tabsize)
+			idx = 0;
+	}
+ret:
+	pthread_mutex_unlock(&psmi_epid_table.tablock);
+	return entry;
+}
+
+void *psmi_epid_lookup(psm2_ep_t ep, psm2_epid_t epid)
+{
+	void *entry = psmi_epid_lookup_inner(ep, epid, 0);
+	if (PSMI_EP_HOSTNAME != ep)
+		_HFI_VDBG("lookup of (%p,%" PRIx64 ") returns %p\n", ep, epid,
+			  entry);
+	return entry;
+}
+
+void *psmi_epid_remove(psm2_ep_t ep, psm2_epid_t epid)
+{
+	if (PSMI_EP_HOSTNAME != ep)
+		_HFI_VDBG("remove of (%p,%" PRIx64 ")\n", ep, epid);
+	return psmi_epid_lookup_inner(ep, epid, 1);
+}
+
+void psmi_epid_remove_all(psm2_ep_t ep)
+{
+	size_t i;
+	struct psmi_epid_tabentry *e;
+
+	pthread_mutex_lock(&psmi_epid_table.tablock);
+
+	for (i = 0; i < psmi_epid_table.tabsize; i++) {
+		e = &psmi_epid_table.table[i];
+
+		if (e->entry == NULL || e->entry == EPADDR_DELETED)
+			continue;
+
+		if (e->ep == ep) {
+			/* unspecified fields implicitly zeroed */
+			*e = (struct psmi_epid_tabentry) {
+				.entry = EPADDR_DELETED
+			};
+		}
+	}
+
+	pthread_mutex_unlock(&psmi_epid_table.tablock);
+}
+
+psm2_error_t psmi_epid_add(psm2_ep_t ep, psm2_epid_t epid, void *entry)
+{
+	uint64_t key;
+	int idx, i, newsz;
+	struct psmi_epid_tabentry *e;
+	psm2_error_t err = PSM2_OK;
+
+	if (PSMI_EP_HOSTNAME != ep)
+		_HFI_VDBG("add of (%p,%" PRIx64 ") with entry %p\n", ep, epid,
+			  entry);
+	pthread_mutex_lock(&psmi_epid_table.tablock);
+	/* Leave this here, mostly for sanity and for the fact that the epid
+	 * table is currently not used in the critical path */
+	if (++psmi_epid_table.tabsize_used >
+	    (int)(psmi_epid_table.tabsize * PSMI_EPID_TABLOAD_FACTOR)) {
+		struct psmi_epid_tabentry *newtab;
+		newsz = psmi_epid_table.tabsize + PSMI_EPID_TABSIZE_CHUNK;
+		newtab = (struct psmi_epid_tabentry *)
+		    psmi_calloc(ep, PER_PEER_ENDPOINT,
+				newsz, sizeof(struct psmi_epid_tabentry));
+		if (newtab == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+		if (psmi_epid_table.table) {	/* rehash the table */
+			for (i = 0; i < psmi_epid_table.tabsize; i++) {
+				e = &psmi_epid_table.table[i];
+				if (e->entry == NULL)
+					continue;
+				/* When rehashing, mark deleted as free again */
+				if (e->entry == EPADDR_DELETED) {
+					psmi_epid_table.tabsize_used--;
+					continue;
+				}
+				idx = (int)(e->key % newsz);
+				while (newtab[idx].entry != NULL)
+					if (++idx == newsz)
+						idx = 0;
+				newtab[idx].entry = e->entry;
+				newtab[idx].key = e->key;
+				newtab[idx].ep = e->ep;
+				newtab[idx].epid = e->epid;
+			}
+			psmi_free(psmi_epid_table.table);
+		}
+		psmi_epid_table.table = newtab;
+		psmi_epid_table.tabsize = newsz;
+	}
+	key = hash_this(ep, epid);
+	idx = (int)(key % psmi_epid_table.tabsize);
+	e = &psmi_epid_table.table[idx];
+	while (e->entry && e->entry != EPADDR_DELETED) {
+		if (++idx == psmi_epid_table.tabsize)
+			idx = 0;
+		e = &psmi_epid_table.table[idx];
+	}
+	e->entry = entry;
+	e->key = key;
+	e->epid = epid;
+	e->ep = ep;
+
+fail:
+	pthread_mutex_unlock(&psmi_epid_table.tablock);
+	return err;
+}
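+
+/* The epid table is an open-addressing hash with linear probing:
+ * EPADDR_DELETED entries act as tombstones so probe chains stay intact, and
+ * psmi_epid_add() rehashes (reclaiming tombstones) once the load factor
+ * exceeds PSMI_EPID_TABLOAD_FACTOR.  Typical lifecycle (illustrative only):
+ *	psmi_epid_add(ep, epid, epaddr);
+ *	epaddr = psmi_epid_lookup(ep, epid);
+ *	psmi_epid_remove(ep, epid);
+ */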
+
+static psmi_lock_t psmi_gethostname_lock;
+
+static void __attribute__ ((constructor)) __psmi_gethostname_lock_constructor(void)
+{
+	psmi_init_lock(&psmi_gethostname_lock);
+}
+
+char *psmi_gethostname(void)
+{
+	static char hostname[80] = { '\0' };
+	char *c;
+
+	if (hostname[0] == '\0') {
+		PSMI_LOCK(psmi_gethostname_lock);
+		/* CRITICAL SECTION START */
+		if (hostname[0] == '\0') {
+			gethostname(hostname, sizeof(hostname));
+			hostname[sizeof(hostname) - 1] = '\0';	/* no guarantee of nul termination */
+			if ((c = strchr(hostname, '.')))
+				*c = '\0';
+		}
+		PSMI_UNLOCK(psmi_gethostname_lock);
+		/* CRITICAL SECTION END */
+	}
+
+	return hostname;
+}
+
+/*
+ * Hostname registration.  We only register the network portion of the epid,
+ * since all epids from the same nid are assumed to share the same hostname.
+ */
+psm2_error_t
+psmi_epid_set_hostname(uint64_t nid, const char *hostname, int overwrite)
+{
+	size_t hlen;
+	char *h;
+	psm2_error_t err = PSM2_OK;
+
+	if (hostname == NULL)
+		return PSM2_OK;
+	/* First see if a hostname already exists */
+	if ((h = psmi_epid_lookup(PSMI_EP_HOSTNAME, nid)) != NULL) {
+		if (!overwrite)
+			return PSM2_OK;
+
+		h = psmi_epid_remove(PSMI_EP_HOSTNAME, nid);
+		if (h != NULL)	/* free the previous hostname if one exists */
+			psmi_free(h);
+	}
+
+	hlen = min(PSMI_EP_HOSTNAME_LEN, strlen(hostname) + 1);
+	h = (char *)psmi_malloc(PSMI_EP_NONE, PER_PEER_ENDPOINT, hlen);
+	if (h == NULL)
+		return PSM2_NO_MEMORY;
+	snprintf(h, hlen, "%s", hostname);
+	h[hlen - 1] = '\0';
+	err = psmi_epid_add(PSMI_EP_HOSTNAME, nid, h);
+	return err;
+}
+
+/* XXX These three functions are not thread safe; we use a rotating-buffer
+ * trick to make them safe in practice because we really only have a few
+ * threads (assuming multi_ep has < 8 threads of its own) */
+/* this returns just the addressing */
+const char *psmi_epaddr_fmt_addr(psm2_epid_t epid)
+{
+	static char hostnamebufs[16][PSMI_EP_HOSTNAME_LEN];
+	static int bufno;
+	char *hostname;
+
+	hostname = hostnamebufs[bufno];
+	bufno = (bufno + 1) % 16;
+
+	char buf[INET_ADDRSTRLEN];
+	if (PSMI_EPID_GET_EPID_VERSION(epid) == PSMI_EPID_V4)
+		snprintf(hostname, PSMI_EP_HOSTNAME_LEN - 1, "IP=%s QP=%d",
+			psmi_ipv4_ntop((uint32_t)PSMI_EPID_GET_LID(epid), buf, sizeof(buf)),
+			(int)PSMI_EPID_GET_CONTEXT(epid));
+	else
+		snprintf(hostname, PSMI_EP_HOSTNAME_LEN - 1, "LID=%d QP=%d",
+			(int)PSMI_EPID_GET_LID(epid),
+			(int)PSMI_EPID_GET_CONTEXT(epid));
+	hostname[PSMI_EP_HOSTNAME_LEN - 1] = '\0';
+	return hostname;
+}
+
+/* this returns the simple name, if not known gives addressing */
+const char *psmi_epaddr_get_hostname(psm2_epid_t epid)
+{
+	uint64_t nid = psm2_epid_nid(epid);
+	char *h;
+
+	/* First, if we have registered a host for this epid, just return that, or
+	 * else try to return something with lid and context */
+	h = psmi_epid_lookup(PSMI_EP_HOSTNAME, nid);
+	if (h != NULL)
+		return h;
+	else {
+		return psmi_epaddr_fmt_addr(epid);
+	}
+}
+
+/* this returns the name and addressing */
+/* if not known just gives addressing */
+const char *psmi_epaddr_get_name(psm2_epid_t epid)
+{
+	static char hostnamebufs[4][PSMI_EP_HOSTNAME_LEN];
+	static int bufno;
+	char *h, *hostname;
+	hostname = hostnamebufs[bufno];
+	bufno = (bufno + 1) % 4;
+
+	h = psmi_epid_lookup(PSMI_EP_HOSTNAME, psm2_epid_nid(epid));
+	if (h == NULL)
+		return psmi_epaddr_get_hostname(epid);
+	else {
+		snprintf(hostname, PSMI_EP_HOSTNAME_LEN - 1, "%s (%s)", h,
+				psmi_epaddr_fmt_addr(epid));
+		hostname[PSMI_EP_HOSTNAME_LEN - 1] = '\0';
+	}
+	return hostname;
+}
+
+
+// superset of inet_ntop.  For AF_INET and AF_INET6 outputs address and port;
+// any other address family is reported as "Unsupported"
+const char *psmi_sockaddr_ntop(struct sockaddr* addr, char *dst, socklen_t size)
+{
+	if (! dst || size < PSM_ADDRSTRLEN) {
+		// be strict, keeps it simple
+		errno = ENOSPC;
+		return "ENOSPC";	// callers just use in a printf
+	}
+	*dst = '\0';	// be safe
+	if (! addr) {
+		snprintf(dst, size, "(nil)");
+		return dst;
+	}
+	// show network address and port (or sid)
+	switch (addr->sa_family) {
+	case AF_INET:
+	{
+		struct sockaddr_in* in_addr = ((struct sockaddr_in*)addr);
+		// we show the IPv4 address and port
+		inet_ntop(AF_INET,  &in_addr->sin_addr, dst, size);
+		snprintf(dst+strlen(dst), size-strlen(dst), " %u", be16toh(in_addr->sin_port));
+		return dst;
+	}
+	case AF_INET6:
+	{
+		struct sockaddr_in6* in_addr = ((struct sockaddr_in6*)addr);
+		// we show just the IPv6 address and port.
+		// could also show scope_id and flowinfo
+		inet_ntop(AF_INET6,  &in_addr->sin6_addr, dst, size);
+		snprintf(dst+strlen(dst), size-strlen(dst), " %u", be16toh(in_addr->sin6_port));
+		return dst;
+	}
+	default:
+		snprintf(dst, size, "Unsupported");
+		return dst;
+	}
+}
+
+// subset of inet_ntop.
+// formats an address or netmask (in host byte order)
+// into dst, which must have >= size bytes available.
+// returns a \0 terminated string suitable for use in printf such as:
+// { char buf[INET_ADDRSTRLEN];
+//		 printf("IP=%s\n", psmi_ipv4_ntop(ip_addr, buf, sizeof(buf))); }
+// on success the pointer returned will be dst.  For various errors a
+// constant string outside of dst may be returned so that the caller can
+// safely call printf (or similar functions) without checking the return value.
+// on errors, errno is also set.
+// Note: the IPv4 worst-case string length is INET_ADDRSTRLEN.
+const char *psmi_ipv4_ntop(uint32_t ip_addr, char *dst, socklen_t size)
+{
+	struct in_addr in_addr;
+	if (! dst || size < INET_ADDRSTRLEN) {
+		// be strict, keeps it simple
+		errno = ENOSPC;
+		return "ENOSPC";	// callers just use in a printf
+	}
+	*dst = '\0';	// be safe
+	in_addr.s_addr = __cpu_to_be32(ip_addr);
+	// we show just the IPv4 address (there is no port here)
+	inet_ntop(AF_INET, &in_addr, dst, size);
+	return dst;
+}
+
+socklen_t psmi_sockaddr_len(struct sockaddr* addr)
+{
+	switch (addr->sa_family) {
+	case AF_INET:
+		return (sizeof(struct sockaddr_in));
+	case AF_INET6:
+		return (sizeof(struct sockaddr_in6));
+	default:
+		// unknown
+		return 0;	// be conservative
+	}
+}
+
+// used for IPv4 netmask processing.  A valid netmask has a sequence of 1s
+// and then all other bits are 0.
+// This counts how many 1s are in the high end of the netmask and confirms
+// the remaining low bits are 0.
+int psmi_count_high_bits(uint32_t netmask)
+{
+	int i=0;
+	uint32_t mask = 0x80000000;
+	while (mask & netmask) {
+		i++; mask >>= 1;
+	}
+	// confirm all low bits of netmask are 0
+	if (netmask != psmi_bit_count_to_mask(i))
+			return -1;
+	return i;
+}
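+
+/* Worked example: 255.255.255.0 (0xffffff00) yields 24, while a
+ * non-contiguous mask such as 255.0.255.0 (0xff00ff00) fails the
+ * psmi_bit_count_to_mask() check and yields -1. */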
+
+// given an IPv4 address, figure out which ifconfig entry matches and
+// return the netmask
+int psmi_get_eth_netmask(__be32 ip_addr, __be32 *netmask)
+{
+	struct ifaddrs *ifap, *ifa;
+
+	if (getifaddrs(&ifap) == 0) {
+		for (ifa = ifap; ifa != NULL; ifa = ifa->ifa_next) {
+			struct sockaddr_in *addr = (struct sockaddr_in  *)ifa->ifa_addr;
+			struct sockaddr_in *nmask = (struct sockaddr_in  *)ifa->ifa_netmask;
+			__be32 nm;
+			char buf[INET_ADDRSTRLEN];
+			char buf2[INET_ADDRSTRLEN];
+
+			if (!nmask) continue;
+			if (addr->sin_family != AF_INET) continue;
+			if (addr->sin_addr.s_addr != ip_addr) continue;
+
+			nm = (__be32)nmask->sin_addr.s_addr;
+
+			if (_HFI_DBG_ON) {
+				_HFI_DBG("Related ifaddr[%s]: %s netmask %s\n",
+					ifa->ifa_name,
+                	psmi_ipv4_ntop(__be32_to_cpu(ip_addr), buf, sizeof(buf)),
+                	psmi_ipv4_ntop(__be32_to_cpu(nm), buf2, sizeof(buf2)));
+			}
+			*netmask = nm;
+			break;
+		}
+		(void)freeifaddrs(ifap);
+	} else {
+		return -1;
+	}
+	return 0;
+}
+
+/* Wrapper, in case we port to OS xyz that doesn't have sysconf */
+uintptr_t psmi_getpagesize(void)
+{
+	static uintptr_t pagesz = (uintptr_t) -1;
+	long sz;
+	if (pagesz != (uintptr_t) -1)
+		return pagesz;
+	sz = sysconf(_SC_PAGESIZE);
+	if (sz == -1) {
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+				  "Can't query system page size");
+	}
+
+	pagesz = (uintptr_t) sz;
+	return pagesz;
+}
+
+/* _CONSUMED_ALL() is a macro which indicates if strtol() consumed all
+   of the input passed to it. */
+#define _CONSUMED_ALL(CHAR_PTR) (((CHAR_PTR) != NULL) && (*(CHAR_PTR) == 0))
+
+/* parse env of the form 'val' or 'val:' or 'val:pattern'
+ * for PSM3_VERBOSE_ENV and PSM3_IDENTIFY
+ * if nothing provided or doesn't match current process, def is returned
+ * if syntax error, def_syntax is returned
+ */
+int psmi_parse_val_pattern(const char *env, int def, int def_syntax)
+{
+	int ret = def;
+
+	if (env && *env) {
+		char *e = psmi_strdup(NULL, env);
+		char *ep;
+		char *p;
+
+		psmi_assert_always(e != NULL);
+		if (e == NULL)	// for klocwork
+			goto done;
+		p = strchr(e, ':');
+		if (p)
+			*p = '\0';
+		int val = (int)strtol(e, &ep, 0);
+		if (! _CONSUMED_ALL(ep))
+			ret = def_syntax;
+		else
+			ret = val;
+		if (val && p) {
+			if (! *(p+1)) { // bare "val:" limits the value to rank 0
+				if (hfi_get_myrank() != 0)
+					ret = def;
+			} else if (0 != fnmatch(p+1, hfi_get_mylabel(),  0
+#ifdef FNM_EXTMATCH
+										| FNM_EXTMATCH
+#endif
+					))
+					ret = def;
+		}
+		psmi_free(e);
+	}
+done:
+	return ret;
+}
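+
+/* Accepted forms (illustrative): "1" enables everywhere, "1:" enables only
+ * on rank 0, and "1:node*" enables only on processes whose
+ * hfi_get_mylabel() matches the glob pattern; an unparsable value yields
+ * def_syntax. */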
+
+/* If PSM3_VERBOSE_ENV is set in the environment, we determine
+ * what its verbose level is and print the environment at "INFO"
+ * level if the environment's level matches the desired printlevel.
+ */
+static int psmi_getenv_verblevel = -1;
+static int psmi_getenv_is_verblevel(int printlevel)
+{
+	if (psmi_getenv_verblevel == -1) {
+		char *env = getenv("PSM3_VERBOSE_ENV");
+		int nlevel = PSMI_ENVVAR_LEVEL_USER;
+		psmi_getenv_verblevel = psmi_parse_val_pattern(env, 0, 2);
+		if (psmi_getenv_verblevel < 0 || psmi_getenv_verblevel > 3)
+			psmi_getenv_verblevel = 2;
+		if (psmi_getenv_verblevel > 0)
+			nlevel = 0; /* output at INFO level */
+		if (psmi_getenv_verblevel == 1)
+			_HFI_ENVDBG(0, " %-25s => '%s' (default was '%s')\n",
+				"PSM3_VERBOSE_ENV", env?env:"", "0");
+		else if (env && *env)
+			_HFI_ENVDBG(nlevel, " %-25s %-40s => '%s' (default was '%s')\n",
+				"PSM3_VERBOSE_ENV",
+				"Enable verbose output of environment variables. "
+				"(0 - none, 1 - changed w/o help, 2 - user help, "
+				"#: - limit output to rank 0, #:pattern - limit output "
+				"to processes whose label matches "
+#ifdef FNM_EXTMATCH
+				"extended "
+#endif
+				"glob pattern)",
+// don't document that 3 and 3: and 3:pattern can output hidden params
+				env, "0");
+		else	/* defaulted */
+			_HFI_ENVDBG(nlevel,
+				" %-25s %-40s => '%s'\n",
+				"PSM3_VERBOSE_ENV",
+				"Enable verbose output of environment variables. "
+				"(0 - none, 1 - changed w/o help, 2 - user help, "
+				"#: - limit output to rank 0, #:pattern - limit output "
+				"to processes whose label matches "
+#ifdef FNM_EXTMATCH
+				"extended "
+#endif
+				"glob pattern)",
+// don't document that 3 and 3: and 3:pattern can output hidden params
+				"0");
+	}
+	return ((printlevel <= psmi_getenv_verblevel
+			&& psmi_getenv_verblevel == 1)
+		|| printlevel <= psmi_getenv_verblevel-1);
+}
+
+#define GETENV_PRINTF(_level, _fmt, ...)				\
+	do {								\
+		if ((_level & PSMI_ENVVAR_LEVEL_NEVER_PRINT) == 0)	\
+		{							\
+			int nlevel = _level;				\
+			if (psmi_getenv_is_verblevel(nlevel))		\
+				nlevel = 0; /* output at INFO level */	\
+			_HFI_ENVDBG(nlevel, _fmt, ##__VA_ARGS__);	\
+		}							\
+	} while (0)
+
+int
+MOCKABLE(psmi_getenv)(const char *name, const char *descr, int level,
+	    int type, union psmi_envvar_val defval,
+	    union psmi_envvar_val *newval)
+{
+	int used_default = 0;
+	union psmi_envvar_val tval;
+	char *env = getenv(name);
+#if _HFI_DEBUGGING
+	int ishex = (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS ||
+		     type == PSMI_ENVVAR_TYPE_UINT_FLAGS);
+#endif
+
+	/* for verblevel 1 we only output non-default values with no help
+	 * for verblevel>1 we promote to info (verblevel=2 promotes USER,
+	 *		verblevel=3 promotes HIDDEN) and show help.
+	 * for verblevel< 1 we don't promote anything and show help
+	 */
+#define _GETENV_PRINT(used_default, fmt, val, defval) \
+	do {	\
+		(void)psmi_getenv_is_verblevel(level);			\
+		if (used_default && psmi_getenv_verblevel != 1)		\
+			GETENV_PRINTF(level, "%s%-25s %-40s =>%s" fmt	\
+				"\n", level > 1 ? "*" : " ", name,	\
+				descr, ishex ? "0x" : " ", val);	\
+		else if (! used_default && psmi_getenv_verblevel == 1)	\
+			GETENV_PRINTF(1, "%s%-25s =>%s"			\
+				fmt " (default was%s" fmt ")\n",	\
+				level > 1 ? "*" : " ", name,		\
+				ishex ? " 0x" : " ", val,		\
+				ishex ? " 0x" : " ", defval);		\
+		else if (! used_default && psmi_getenv_verblevel != 1)	\
+			GETENV_PRINTF(1, "%s%-25s %-40s =>%s"		\
+				fmt " (default was%s" fmt ")\n",	\
+				level > 1 ? "*" : " ", name, descr,	\
+				ishex ? " 0x" : " ", val,		\
+				ishex ? " 0x" : " ", defval);		\
+	} while (0)
+
+#define _CONVERT_TO_NUM(DEST,TYPE,STRTOL)						\
+	do {										\
+		char *ep;								\
+		/* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */	\
+		DEST = (TYPE)STRTOL(env, &ep, 10);					\
+		if (! _CONSUMED_ALL(ep)) {						\
+			DEST = (TYPE)STRTOL(env, &ep, 16);				\
+			if (! _CONSUMED_ALL(ep)) {					\
+				used_default = 1;					\
+				tval = defval;						\
+			}								\
+		}									\
+	} while (0)
+
+	switch (type) {
+	case PSMI_ENVVAR_TYPE_YESNO:
+		if (!env || *env == '\0') {
+			tval = defval;
+			used_default = 1;
+		} else if (env[0] == 'Y' || env[0] == 'y')
+			tval.e_int = 1;
+		else if (env[0] == 'N' || env[0] == 'n')
+			tval.e_int = 0;
+		else {
+			char *ep;
+			tval.e_ulong = strtoul(env, &ep, 0);
+			if (ep == env) {
+				used_default = 1;
+				tval = defval;
+			} else if (tval.e_ulong != 0)
+				tval.e_ulong = 1;
+		}
+		_GETENV_PRINT(used_default, "%s", tval.e_long ? "YES" : "NO",
+			      defval.e_int ? "YES" : "NO");
+		break;
+
+	case PSMI_ENVVAR_TYPE_STR:
+		if (!env || *env == '\0') {
+			tval = defval;
+			used_default = 1;
+		} else
+			tval.e_str = env;
+		_GETENV_PRINT(used_default, "'%s'", tval.e_str, defval.e_str);
+		break;
+
+	case PSMI_ENVVAR_TYPE_INT:
+		if (!env || *env == '\0') {
+			tval = defval;
+			used_default = 1;
+		} else {
+			_CONVERT_TO_NUM(tval.e_int,int,strtol);
+		}
+		_GETENV_PRINT(used_default, "%d", tval.e_int, defval.e_int);
+		break;
+
+	case PSMI_ENVVAR_TYPE_UINT:
+	case PSMI_ENVVAR_TYPE_UINT_FLAGS:
+		if (!env || *env == '\0') {
+			tval = defval;
+			used_default = 1;
+		} else {
+			_CONVERT_TO_NUM(tval.e_uint,unsigned int,strtoul);
+		}
+		if (type == PSMI_ENVVAR_TYPE_UINT_FLAGS)
+			_GETENV_PRINT(used_default, "%x", tval.e_uint,
+				      defval.e_uint);
+		else
+			_GETENV_PRINT(used_default, "%u", tval.e_uint,
+				      defval.e_uint);
+		break;
+
+	case PSMI_ENVVAR_TYPE_LONG:
+		if (!env || *env == '\0') {
+			tval = defval;
+			used_default = 1;
+		} else {
+			_CONVERT_TO_NUM(tval.e_long,long,strtol);
+		}
+		_GETENV_PRINT(used_default, "%ld", tval.e_long, defval.e_long);
+		break;
+	case PSMI_ENVVAR_TYPE_ULONG_ULONG:
+		if (!env || *env == '\0') {
+			tval = defval;
+			used_default = 1;
+		} else {
+			_CONVERT_TO_NUM(tval.e_ulonglong,unsigned long long,strtoull);
+		}
+		_GETENV_PRINT(used_default, "%llu",
+			      tval.e_ulonglong, defval.e_ulonglong);
+		break;
+	case PSMI_ENVVAR_TYPE_ULONG:
+	case PSMI_ENVVAR_TYPE_ULONG_FLAGS:
+	default:
+		if (!env || *env == '\0') {
+			tval = defval;
+			used_default = 1;
+		} else {
+			_CONVERT_TO_NUM(tval.e_ulong,unsigned long,strtoul);
+		}
+		if (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS)
+			_GETENV_PRINT(used_default, "%lx", tval.e_ulong,
+				      defval.e_ulong);
+		else
+			_GETENV_PRINT(used_default, "%lu", tval.e_ulong,
+				      defval.e_ulong);
+		break;
+	}
+#undef _GETENV_PRINT
+	*newval = tval;
+
+	return used_default;
+}
+MOCK_DEF_EPILOGUE(psmi_getenv);
+
+/*
+ * Parsing long parameters
+ * -1 -> parse error
+ */
+long psmi_parse_str_long(const char *string)
+{
+	char *ep;
+	long ret;
+
+	if (! string || ! *string)
+		return -1;
+	/* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */
+	ret = strtol(string, &ep, 10);
+	if (! _CONSUMED_ALL(ep)) {
+		ret = strtol(string, &ep, 16);
+		if (! _CONSUMED_ALL(ep))
+			return -1;
+	}
+	return ret;
+}
+
+/*
+ * Parsing int parameters set in string tuples.
+ * Output array int *vals should be able to store 'ntup' elements.
+ * Values are only overwritten if they are parsed.
+ * Tuples are always separated by colons ':'
+ */
+int psmi_parse_str_tuples(const char *string, int ntup, int *vals)
+{
+	char *b = (char *)string;
+	char *e = b;
+	int tup_i = 0;
+	int n_parsed = 0;
+	char *buf = psmi_strdup(NULL, string);
+	psmi_assert_always(buf != NULL);
+
+	while (*e && tup_i < ntup) {
+		b = e;
+		while (*e && *e != ':')
+			e++;
+		if (e > b) {	/* something to parse */
+			char *ep;
+			int len = e - b;
+			long int l;
+			strncpy(buf, b, len);
+			buf[len] = '\0';
+			l = strtol(buf, &ep, 0);
+			if (ep != buf) {	/* successful conversion */
+				vals[tup_i] = (int)l;
+				n_parsed++;
+			}
+		}
+		if (*e == ':')
+			e++;	/* skip delimiter */
+		tup_i++;
+	}
+	psmi_free(buf);
+	return n_parsed;
+}
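+
+/* Worked example: with ntup == 3 and vals == {1, 1, 1},
+ * psmi_parse_str_tuples("16::32", 3, vals) leaves vals == {16, 1, 32} and
+ * returns 2; the empty middle tuple is skipped, preserving the caller's
+ * default. */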
+
+/*
+ * Memory footprint/usage mode.
+ *
+ * This can be used for debug or for separating large installations from
+ * small/medium ones.  The default is to assume a medium installation.  Large
+ * is not that much larger in memory footprint, but we make a conscious effort
+ * at consuming only the amount of memory we need.
+ */
+int psmi_parse_memmode(void)
+{
+	union psmi_envvar_val env_mmode;
+	int used_default =
+	    psmi_getenv("PSM3_MEMORY", "Memory usage mode (min, normal or large)",
+			PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+			(union psmi_envvar_val)"normal", &env_mmode);
+	if (used_default || !strcasecmp(env_mmode.e_str, "normal"))
+		return PSMI_MEMMODE_NORMAL;
+	else if (!strcasecmp(env_mmode.e_str, "min"))
+		return PSMI_MEMMODE_MINIMAL;
+	else if (!strcasecmp(env_mmode.e_str, "large") ||
+		 !strcasecmp(env_mmode.e_str, "big"))
+		return PSMI_MEMMODE_LARGE;
+	else {
+		_HFI_PRDBG("PSM3_MEMORY env value %s unrecognized, "
+			   "using 'normal' memory mode instead\n",
+			   env_mmode.e_str);
+		return PSMI_MEMMODE_NORMAL;
+	}
+}
+
+#ifdef PSM_CUDA
+// we need GPUDIRECT config early to influence rdmamode defaults,
+// MR Cache mode and whether we need to open RV.
+// These functions are later used to confirm and finalize config for
+// ips_proto_init
+
+// value returned is 0/1 (disable/enable)
+unsigned psmi_parse_gpudirect(void)
+{
+	union psmi_envvar_val envval;
+	static int have_value = 0;
+	static unsigned saved;
+
+	// only parse once so doesn't appear in PSM3_VERBOSE_ENV multiple times
+	if (have_value)
+		return saved;
+
+	psmi_getenv("PSM3_GPUDIRECT",
+		"Use GPUDirect DMA and RDMA support to allow the NIC to directly read"
+		" from the GPU for send DMA and write to the GPU for recv RDMA."
+		" Also enable GPUDirect copy for more efficient CPU to/from GPU copies."
+		" Requires rv module support.(default is disabled i.e. 0)",
+		PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+		(union psmi_envvar_val)0, /* Disabled by default */
+		&envval);
+
+	saved = envval.e_uint;
+	have_value = 1;
+	return saved;
+}
+
+// value returned is limit >= 0, (0 disables GPUDIRECT Send RDMA)
+unsigned psmi_parse_gpudirect_send_limit(void)
+{
+	union psmi_envvar_val envval;
+	static int have_value = 0;
+	static unsigned saved;
+
+	// only parse once so doesn't appear in PSM3_VERBOSE_ENV multiple times
+	if (have_value)
+		return saved;
+
+	/* Default send threshold for GPUDirect is 30000 bytes */
+	psmi_getenv("PSM3_GPUDIRECT_SEND_LIMIT",
+		    "GPUDirect feature on send side will be switched off for messages larger than limit.",
+		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)30000, &envval);
+
+	saved = envval.e_uint;
+	have_value = 1;
+	return saved;
+}
+
+// value returned is limit >= 0, (0 disables GPUDIRECT Recv RDMA)
+unsigned psmi_parse_gpudirect_recv_limit(void)
+{
+	union psmi_envvar_val envval;
+	static int have_value = 0;
+	static unsigned saved;
+
+	// only parse once so doesn't appear in PSM3_VERBOSE_ENV multiple times
+	if (have_value)
+		return saved;
+
+	/* Default receive limit for GPUDirect is unlimited (UINT_MAX) */
+	psmi_getenv("PSM3_GPUDIRECT_RECV_LIMIT",
+		    "GPUDirect feature on receive side will be switched off for messages larger than limit.",
+		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)UINT_MAX, &envval);
+
+	saved = envval.e_uint;
+	have_value = 1;
+	return saved;
+}
+
+#endif	// PSM_CUDA
+
+/* Send DMA Enable */
+unsigned psmi_parse_senddma(void)
+{
+	union psmi_envvar_val envval;
+	static int have_value = 0;
+	static unsigned saved;
+
+	// only parse once so doesn't appear in PSM3_VERBOSE_ENV multiple times
+	if (have_value)
+		return saved;
+
+	psmi_getenv("PSM3_SDMA",
+		"UD send dma flags (0 disables send dma, 1 enables), default 0",
+		PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+		(union psmi_envvar_val)0, &envval);
+	saved = envval.e_uint;
+	have_value = 1;
+	return saved;
+}
+
+
+/* RDMA mode */
+// we need this early when setting defaults for RV thresholds in psmi_mq_malloc
+// and also want this available when creating the verbs_ep since it may affect
+// sizing of CQs and buffers.  But during mq_malloc we don't have an ep or proto
+// to save this into
+// The value returned is a bitmask of IPS_PROTOEXP_FLAG_* selections
+unsigned psmi_parse_rdmamode(void)
+{
+	union psmi_envvar_val env_rdma;
+	static int have_value = 0;
+	static unsigned saved_rdmamode;
+	unsigned default_rdma;
+#ifdef PSM_CUDA
+#ifdef RNDV_MOD
+	int gpudirect = 0;
+#endif
+#endif
+
+	// only parse once so doesn't appear in PSM3_VERBOSE_ENV multiple times
+	if (have_value)
+		return saved_rdmamode;
+
+	default_rdma = IPS_PROTOEXP_FLAGS_DEFAULT;
+
+#ifdef PSM_CUDA
+#ifdef RNDV_MOD
+	gpudirect = PSMI_IS_CUDA_ENABLED && psmi_parse_gpudirect();
+	// GPUDIRECT causes default of RDMA=1
+	if (gpudirect)
+		default_rdma = (default_rdma & ~IPS_PROTOEXP_FLAG_RDMA_MASK)
+				| IPS_PROTOEXP_FLAG_RDMA_KERNEL;
+#endif
+#endif
+	psmi_getenv("PSM3_RDMA",
+		    "RDMA proto control (0-no RDMA,"
+#ifdef RNDV_MOD
+			" 1-kernel RDMA,"
+#endif
+			" 2-user RDMA, 3-user RC send/RDMA) "
+			//" additional flags: 8-interleave, 0x10-serialize"
+			// IPS_PROTOEXP_FLAG_TID_DEBUG (0x4)      N/A
+			,
+		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+		    (union psmi_envvar_val)default_rdma,
+		    &env_rdma);
+#ifdef PSM_CUDA
+#ifdef RNDV_MOD
+#if 1 // remove this code when RV is ready for RDMA=2, 3 w/GPU Direct
+	if (gpudirect && IPS_PROTOEXP_FLAG_USER_RC_QP(env_rdma.e_uint)) {
+		_HFI_INFO("WARNING: GPUDIRECT only allowed with PSM3_RDMA=0 or 1, using %u\n", default_rdma);
+		env_rdma.e_uint = default_rdma;
+	}
+#endif
+#endif
+#endif
+#ifndef RNDV_MOD
+	if (IPS_PROTOEXP_FLAG_KERNEL_QP(env_rdma.e_uint)) {
+		static int logged = 0;
+		if (! logged) {
+			_HFI_INFO("WARNING: PSM built without rv module enabled, RDMA mode %d unavailable\n", IPS_PROTOEXP_FLAG_RDMA_KERNEL);
+			logged = 1;
+		}
+		env_rdma.e_uint = 0;
+	}
+#endif
+	saved_rdmamode = env_rdma.e_uint;
+	have_value = 1;
+	return saved_rdmamode;
+}
+
+/* PSM3_IDENTIFY */
+// we need this in multiple places
+int psmi_parse_identify(void)
+{
+	union psmi_envvar_val myenv;
+	static int have_value;
+	static int saved_identify;
+
+	// only parse once so doesn't appear in PSM3_VERBOSE_ENV multiple times
+	if (have_value)
+		return saved_identify;
+
+	psmi_getenv("PSM3_IDENTIFY", "Identify PSM version being run "
+				"(0 - disable, 1 - enable, 1: - limit output to rank 0, "
+				"1:pattern - limit output "
+				"to processes whose label matches "
+#ifdef FNM_EXTMATCH
+				"extended "
+#endif
+				"glob pattern)",
+		    	PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+		    	(union psmi_envvar_val)"0", &myenv);
+	saved_identify = psmi_parse_val_pattern(myenv.e_str, 0, 0);
+	have_value = 1;
+
+	return saved_identify;
+}
+
+static
+const char *psmi_memmode_string(int mode)
+{
+	psmi_assert(mode >= PSMI_MEMMODE_NORMAL && mode < PSMI_MEMMODE_NUM);
+	switch (mode) {
+	case PSMI_MEMMODE_NORMAL:
+		return "normal";
+	case PSMI_MEMMODE_MINIMAL:
+		return "minimal";
+	case PSMI_MEMMODE_LARGE:
+		return "large";
+	default:
+		return "unknown";
+	}
+}
+
+psm2_error_t
+psmi_parse_mpool_env(const psm2_mq_t mq, int level,
+		     const struct psmi_rlimit_mpool *rlim,
+		     uint32_t *valo, uint32_t *chunkszo)
+{
+	uint32_t val;
+	const char *env = rlim->env;
+	int mode = mq->memmode;
+	psm2_error_t err = PSM2_OK;
+	union psmi_envvar_val env_val;
+
+	psmi_assert_always(mode >= PSMI_MEMMODE_NORMAL
+			   && mode < PSMI_MEMMODE_NUM);
+
+	psmi_getenv(rlim->env, rlim->descr, rlim->env_level,
+		    PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)rlim->mode[mode].obj_max, &env_val);
+
+	val = env_val.e_uint;
+	if (val < rlim->minval || val > rlim->maxval) {
+		err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+					"Env. var %s=%u is invalid (valid settings in mode PSM3_MEMORY=%s"
+					" are inclusively between %u and %u)",
+					env, val, psmi_memmode_string(mode),
+					rlim->minval, rlim->maxval);
+		goto fail;
+	}
+
+	_HFI_VDBG("%s max=%u,chunk=%u (mode=%s(%u),min=%u,max=%u)\n",
+		  env, val, rlim->mode[mode].obj_chunk,
+		  psmi_memmode_string(mode), mode, rlim->minval, rlim->maxval);
+
+	*valo = val;
+	*chunkszo = rlim->mode[mode].obj_chunk;
+
+fail:
+	return err;
+}
+
+uint64_t psmi_cycles_left(uint64_t start_cycles, int64_t timeout_ns)
+{
+	if (timeout_ns < 0)
+		return 0ULL;
+	else if (timeout_ns == 0ULL || timeout_ns == ~0ULL)
+		return ~0ULL;
+	else {
+		uint64_t t_end = nanosecs_to_cycles(timeout_ns);
+		uint64_t t_now = get_cycles() - start_cycles;
+
+		if (t_now >= t_end)
+			return 0ULL;
+		else
+			return (t_end - t_now);
+	}
+}
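+
+/* Semantics: timeout_ns < 0 means the deadline has already passed (0 cycles
+ * left), 0 and ~0 both mean wait forever (~0ULL), and anything else yields
+ * the remaining cycle budget, clamped at 0 once t_now reaches t_end. */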
+
+uint32_t psmi_get_ipv4addr()
+{
+	struct hostent *he;
+	uint32_t addr = 0;
+
+	he = gethostbyname(psmi_gethostname());
+	if (he != NULL && he->h_addrtype == AF_INET && he->h_addr != NULL) {
+		memcpy(&addr, he->h_addr, sizeof(uint32_t));
+		return addr;
+	} else
+		return 0;
+}
+
+#define PSMI_EP_IS_PTR(ptr)	    ((ptr) != NULL && (ptr) < PSMI_EP_LOGEVENT)
+
+void
+psmi_syslog(psm2_ep_t ep, int to_console, int level, const char *format, ...)
+{
+	va_list ap;
+
+	/* If we've never syslogged anything from this ep at the PSM level, make
+	 * sure we log context information */
+	if (PSMI_EP_IS_PTR(ep) && !ep->did_syslog) {
+		char uuid_str[64];
+		ep->did_syslog = 1;
+
+		memset(&uuid_str, 0, sizeof(uuid_str));
+		uuid_unparse(ep->uuid, uuid_str);
+		hfi_syslog("PSM", 0, LOG_WARNING,
+			   "uuid_key=%s,unit=%d"
+			   ,
+			   uuid_str,
+			   ep->unit_id
+			   );
+	}
+
+	va_start(ap, format);
+	hfi_vsyslog("PSM", to_console, level, format, ap);
+	va_end(ap);
+}
+
+/* Table of CRCs of all 8-bit messages. */
+static uint32_t crc_table[256];
+
+/* Flag: has the table been computed? Initially false. */
+static int crc_table_computed;
+
+/* Make the table for a fast CRC. */
+static void make_crc_table(void)
+{
+	uint32_t c;
+	int n, k;
+
+	for (n = 0; n < 256; n++) {
+		c = (uint32_t) n;
+		for (k = 0; k < 8; k++) {
+			if (c & 1)
+				c = 0xedb88320 ^ (c >> 1);
+			else
+				c = c >> 1;
+		}
+		crc_table[n] = c;
+	}
+	crc_table_computed = 1;
+}
+
+/* Update a running CRC with the bytes buf[0..len-1]--the CRC
+ * should be initialized to all 1's, and the transmitted value
+ * is the 1's complement of the final running CRC (see the
+ * psmi_crc() routine below).
+ */
+
+static uint32_t update_crc(uint32_t crc, unsigned char *buf, int len)
+{
+	uint32_t c = crc;
+	int n;
+
+	if_pf(!crc_table_computed)
+	    make_crc_table();
+	for (n = 0; n < len; n++) {
+		c = crc_table[(c ^ buf[n]) & 0xff] ^ (c >> 8);
+	}
+	return c;
+}
+
+/* Return the CRC of the bytes buf[0..len-1]. */
+uint32_t psmi_crc(unsigned char *buf, int len)
+{
+	return update_crc(0xffffffff, buf, len) ^ 0xffffffff;
+}
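+
+/* This is the standard reflected CRC-32 (polynomial 0xedb88320), the
+ * familiar table-driven sample from the zlib/PNG documentation; psmi_crc()
+ * applies the customary pre- and post-inversion with 0xffffffff. */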
+
+int psmi_multi_ep_enabled = 0;
+void psmi_multi_ep_init()
+{
+	union psmi_envvar_val env_fi;
+
+	psmi_getenv("PSM3_MULTI_EP", "PSM3 Multiple Endpoints (yes/no)",
+		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_YESNO,
+		    PSMI_ENVVAR_VAL_YES, &env_fi);
+
+	psmi_multi_ep_enabled = env_fi.e_uint;
+}
+
+#ifdef PSM_FI
+
+int psmi_faultinj_enabled = 0;
+int psmi_faultinj_verbose = 0;
+char *psmi_faultinj_outfile = NULL;
+int psmi_faultinj_sec_rail = 0;
+
+static struct psmi_faultinj_spec psmi_faultinj_dummy;
+static STAILQ_HEAD(, psmi_faultinj_spec) psmi_faultinj_head =
+		STAILQ_HEAD_INITIALIZER(psmi_faultinj_head);
+int psmi_faultinj_num_entries;
+
+void psmi_faultinj_init()
+{
+	union psmi_envvar_val env_fi;
+
+	psmi_getenv("PSM3_FI", "PSM Fault Injection "
+				"(0 - disable, 1 - enable, 1: - limit to rank 0, "
+				"1:pattern - limit "
+				"to processes whose label matches "
+#ifdef FNM_EXTMATCH
+				"extended "
+#endif
+				"glob pattern)",
+		    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR,
+		    (union psmi_envvar_val)"0", &env_fi);
+	psmi_faultinj_enabled = psmi_parse_val_pattern(env_fi.e_str, 0, 0);
+
+	if (psmi_faultinj_enabled) {
+		char *def = NULL;
+		if (!psmi_getenv
+		    ("PSM3_FI_TRACEFILE", "PSM Fault Injection output file",
+		     PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR,
+		     (union psmi_envvar_val)def, &env_fi)) {
+			psmi_faultinj_outfile = psmi_strdup(NULL, env_fi.e_str);
+		}
+		if (!psmi_getenv
+		    ("PSM3_FI_VERBOSE", "PSM Fault verbose output",
+		     PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+		     (union psmi_envvar_val)0, &env_fi)) {
+			psmi_faultinj_verbose = env_fi.e_int;
+		}
+		if (!psmi_getenv
+		    ("PSM3_FI_RAIL", "PSM Fault Injection rail (0=all, 1=secondary only)",
+		     PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+		     (union psmi_envvar_val)0, &env_fi)) {
+			psmi_faultinj_sec_rail = env_fi.e_int;
+		}
+	}
+
+	return;
+}
+
+/* we only grow new entries, so if we fail to allocate, just ignore the request */
+static void psmi_faultinj_reregister_stats()
+{
+	struct psmi_stats_entry *entries;
+	struct psmi_stats_entry *e;
+	int num_entries = 0;
+	struct psmi_faultinj_spec *fi;
+
+	entries = psmi_calloc(PSMI_EP_NONE, STATS, psmi_faultinj_num_entries,
+			      sizeof(struct psmi_stats_entry));
+	if (! entries)
+		return;
+	e = entries;
+	STAILQ_FOREACH(fi, &psmi_faultinj_head, next) {
+		psmi_stats_init_u64(e, fi->spec_name, &fi->num_faults);
+		e++; num_entries++;
+	}
+
+	psmi_stats_reregister_type("Fault_Injection", PSMI_STATSTYPE_FAULTINJ,
+		entries, num_entries, 0, &psmi_faultinj_head, NULL);
+	psmi_free(entries);
+}
+
+void psmi_faultinj_fini()
+{
+	struct psmi_faultinj_spec *fi;
+	FILE *fp;
+	int do_fclose = 0;
+
+	if (!psmi_faultinj_enabled)
+		return;
+	psmi_stats_deregister_type(PSMI_STATSTYPE_FAULTINJ, &psmi_faultinj_head);
+
+	if (psmi_faultinj_outfile == NULL)
+		return;
+	if (strncmp(psmi_faultinj_outfile, "stdout", 7) == 0)
+		fp = stdout;
+	else if (strncmp(psmi_faultinj_outfile, "stderr", 7) == 0)
+		fp = stderr;
+	else {
+		char *c = psmi_faultinj_outfile;
+		char buf[192];
+		int append = 0;
+		if (*c == '+') {
+			append = 1;
+			++c;
+		}
+		do_fclose = 1;
+		snprintf(buf, sizeof(buf) - 1, "%s.%s", c, hfi_get_mylabel());
+		buf[sizeof(buf) - 1] = '\0';
+		fp = fopen(buf, append ? "a" : "w");
+	}
+
+	if (fp != NULL) {
+		STAILQ_FOREACH(fi, &psmi_faultinj_head, next) {
+			fprintf(fp, "%s:%s PSM3_FI_%-13s %2.3f%% => "
+				"%2.3f%% %10"PRIu64" faults/%10"PRIu64" events seed %10ld\n",
+				__progname, hfi_get_mylabel(), fi->spec_name,
+				(double)fi->num * 100.0 / fi->denom,
+				(fi->num_calls ?
+				(double)fi->num_faults * 100.0 / fi->num_calls
+				:(double)0.0),
+				fi->num_faults, fi->num_calls,
+				fi->initial_seed);
+		}
+		fflush(fp);
+		if (do_fclose)
+			fclose(fp);
+	}
+
+	psmi_free(psmi_faultinj_outfile);
+	return;
+}
+
+/*
+ * Intended to be used only once, not in the critical path
+ */
+struct psmi_faultinj_spec *psmi_faultinj_getspec(const char *spec_name,
+						 const char *help, int num,
+						 int denom)
+{
+	struct psmi_faultinj_spec *fi;
+
+	if (!psmi_faultinj_enabled)
+		return &psmi_faultinj_dummy;
+
+	STAILQ_FOREACH(fi, &psmi_faultinj_head, next) {
+		if (strcmp(fi->spec_name, spec_name) == 0)
+			return fi;
+	}
+
+	/* We got here, so no spec -- allocate one */
+	fi = psmi_malloc(PSMI_EP_NONE, UNDEFINED,
+			 sizeof(struct psmi_faultinj_spec));
+	psmi_assert_always(fi != NULL);
+	strncpy(fi->spec_name, spec_name, PSMI_FAULTINJ_SPEC_NAMELEN - 1);
+	fi->spec_name[PSMI_FAULTINJ_SPEC_NAMELEN - 1] = '\0';
+	fi->num = num;
+	fi->denom = denom;
+	fi->initial_seed = getpid();
+	fi->num_faults = 0;
+	fi->num_calls = 0;
+
+	/*
+	 * See if we get a hint from the environment.
+	 * Format is
+	 * <num:denom:initial_seed>
+	 *
+	 * By default, we choose the initial seed to be the 'pid'.  If users need
+	 * repeatability, they should set initial_seed to be the 'pid' when the
+	 * error was observed or force the initial_seed to be a constant number in
+	 * each running process.  Using 'pid' is useful because core dumps store
+	 * pids and our backtrace format does as well so if a crash is observed for
+	 * a specific seed, programs can reuse the 'pid' to regenerate the same
+	 * error condition.
+	 */
+	{
+		int fvals[3] = { num, denom, (int)getpid() };
+		union psmi_envvar_val env_fi;
+		char fvals_str[128];
+		char fname[128];
+		char fdesc[300];
+
+		snprintf(fvals_str, sizeof(fvals_str) - 1, "%d:%d:1", num,
+			 denom);
+		fvals_str[sizeof(fvals_str) - 1] = '\0';
+		snprintf(fname, sizeof(fname) - 1, "PSM3_FI_%s", spec_name);
+		fname[sizeof(fname) - 1] = '\0';
+		snprintf(fdesc, sizeof(fdesc) - 1, "Fault Injection - %s <%s>",
+			 help, fvals_str);
+
+		if (!psmi_getenv(fname, fdesc, PSMI_ENVVAR_LEVEL_HIDDEN,
+				 PSMI_ENVVAR_TYPE_STR,
+				 (union psmi_envvar_val)fvals_str, &env_fi)) {
+			/* not using default values */
+			int n_parsed =
+			    psmi_parse_str_tuples(env_fi.e_str, 3, fvals);
+			if (n_parsed >= 1)
+				fi->num = fvals[0];
+			if (n_parsed >= 2)
+				fi->denom = fvals[1];
+			if (n_parsed >= 3)
+				fi->initial_seed = (long int)fvals[2];
+		}
+	}
+	srand48_r(fi->initial_seed, &fi->drand48_data);
+
+	psmi_faultinj_num_entries++;
+	STAILQ_INSERT_TAIL(&psmi_faultinj_head, fi, next);
+	psmi_faultinj_reregister_stats();
+	return fi;
+}
+
+int psmi_faultinj_is_fault(struct psmi_faultinj_spec *fi)
+{
+	if (!psmi_faultinj_enabled)	/* never fault if disabled */
+		return 0;
+	if (fi->num == 0)
+		return 0;
+
+	fi->num_calls++;
+	long int rnum;
+	lrand48_r(&fi->drand48_data, &rnum);
+	if (((int) (rnum % INT_MAX)) % fi->denom <= fi->num) {
+		fi->num_faults++;
+		return 1;
+	} else
+		return 0;
+}
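+
+/*
+ * Typical call pattern (a sketch; the injector name "sendlost" is
+ * hypothetical):
+ *
+ *   static struct psmi_faultinj_spec *fi_sendlost;
+ *   if (!fi_sendlost)
+ *       fi_sendlost = psmi_faultinj_getspec("sendlost",
+ *                                           "drop an outbound packet",
+ *                                           1, 1000);
+ *   if (psmi_faultinj_is_fault(fi_sendlost))
+ *       return;  -- pretend the packet was dropped
+ */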
+
+#endif /* #ifdef PSM_FI */
+
+/* For memory allocation, we kind of break the PSM error handling rules.
+ * If the caller gets NULL, it has to assume that the error has been handled
+ * and should always return PSM2_NO_MEMORY */
+
+/*
+ * Log memory increments or decrements of type memstats_t.
+ */
+struct psmi_memtype_hdr {
+	struct {
+		uint64_t size:48;
+		uint64_t magic:8;
+		uint64_t type:8;
+	};
+	void *original_allocation;
+};
+
+// Memory stats will only be collected under debug builds
+
+#ifdef PSM_DEBUG
+#define psmi_stats_mask PSMI_STATSTYPE_MEMORY
+#else
+#define psmi_stats_mask 0
+#endif
+
+struct psmi_stats_malloc psmi_stats_memory;
+
+void psmi_mem_stats_register(void)
+{
+	struct psmi_stats_entry entries[] = {
+		PSMI_STATS_DECLU64("Total_(current)",
+				(uint64_t*)&psmi_stats_memory.m_all_total),
+		PSMI_STATS_DECLU64("Total_(max)",
+				(uint64_t*)&psmi_stats_memory.m_all_max),
+		PSMI_STATS_DECLU64("All_Peers_(current)",
+				(uint64_t*)&psmi_stats_memory.m_perpeer_total),
+		PSMI_STATS_DECLU64("All_Peers_(max)",
+				(uint64_t*)&psmi_stats_memory.m_perpeer_max),
+		PSMI_STATS_DECLU64("Network_Buffers_(current)",
+				(uint64_t*)&psmi_stats_memory.m_netbufs_total),
+		PSMI_STATS_DECLU64("Network_Buffers_(max)",
+				(uint64_t*)&psmi_stats_memory.m_netbufs_max),
+		PSMI_STATS_DECLU64("PSM_descriptors_(current)",
+				(uint64_t*)&psmi_stats_memory.m_descriptors_total),
+		PSMI_STATS_DECLU64("PSM_descriptors_(max)",
+				(uint64_t*)&psmi_stats_memory.m_descriptors_max),
+		PSMI_STATS_DECLU64("Unexp._Buffers_(current)",
+				(uint64_t*)&psmi_stats_memory.m_unexpbufs_total),
+		PSMI_STATS_DECLU64("Unexp._Buffers_(max)",
+				(uint64_t*)&psmi_stats_memory.m_unexpbufs_max),
+#ifdef RNDV_MOD
+		PSMI_STATS_DECLU64("Peer_Rndv_(current)",
+				(uint64_t*)&psmi_stats_memory.m_peerrndv_total),
+		PSMI_STATS_DECLU64("Peer_Rndv_(max)",
+				(uint64_t*)&psmi_stats_memory.m_peerrndv_max),
+#endif
+		PSMI_STATS_DECLU64("statistics_(current)",
+				(uint64_t*)&psmi_stats_memory.m_stats_total),
+		PSMI_STATS_DECLU64("statistics_(max)",
+				(uint64_t*)&psmi_stats_memory.m_stats_max),
+		PSMI_STATS_DECLU64("Other_(current)",
+				(uint64_t*)&psmi_stats_memory.m_undefined_total),
+		PSMI_STATS_DECLU64("Other_(max)",
+				(uint64_t*)&psmi_stats_memory.m_undefined_max),
+	};
+
+	if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) {
+		psmi_stats_register_type("PSM_memory_allocation_statistics",
+                    PSMI_STATSTYPE_MEMORY,
+                    entries,
+                    PSMI_STATS_HOWMANY(entries), 0, &psmi_stats_memory, NULL);
+	}
+}
+
+
+void psmi_log_memstats(psmi_memtype_t type, int64_t nbytes)
+{
+#define _add_max_total(type, nbytes)				\
+	psmi_stats_memory.m_ ## type ## _total += (nbytes);	\
+	psmi_stats_memory.m_ ## type ## _max = max(		\
+	    psmi_stats_memory.m_ ## type ## _total,		\
+	    psmi_stats_memory.m_ ## type ## _max);
+
+	switch (type) {
+	case PER_PEER_ENDPOINT:
+		_add_max_total(perpeer, nbytes);
+		break;
+	case NETWORK_BUFFERS:
+		_add_max_total(netbufs, nbytes);
+		break;
+	case DESCRIPTORS:
+		_add_max_total(descriptors, nbytes);
+		break;
+	case UNEXPECTED_BUFFERS:
+		_add_max_total(unexpbufs, nbytes);
+		break;
+	case STATS:
+		_add_max_total(stats, nbytes);
+		break;
+#ifdef RNDV_MOD
+	case PEER_RNDV:
+		_add_max_total(peerrndv, nbytes);
+		break;
+#endif
+	case UNDEFINED:
+		_add_max_total(undefined, nbytes);
+		break;
+	default:
+		psmi_assert_always(type == TOTAL);
+		break;
+	}
+	_add_max_total(all, nbytes);
+	psmi_stats_memory.m_all_max++;
+#undef _add_max_total
+
+	return;
+}
+
+#ifdef malloc
+#undef malloc
+#endif
+
+#ifdef PSM_HEAP_DEBUG
+
+/* PSM HEAP DEBUG documentation:
+
+   In the following code, the acronym: 'HD' is short for "Heap Debug".
+
+   Each actual heap allocation will have a header and a trailer surrounding it,
+   and the header itself may have some vacant space preceding it due to alignment
+   needs:
+
+   0. This area is the actual return value of posix_memalign and is due to
+      alignment requirements.  (This area does not exist for heap allocations
+      from malloc()).
+   1. HD HEADER
+   2. Actual allocation
+   3. HD TRAILER
+
+   malloc() / posix_memalign returns area 0 through 3 to the Heap Debug (HD) code,
+   then the HD code writes areas 1 and 3, and then returns a pointer to area 2 to
+   the caller.  Thereafter, the HD code will inspect areas 1 and 3 of all heap
+   allocations to make sure they have retained their integrity.
+
+   Surrounding the actual allocation like this enables:
+
+   1. Checking for heap overrun / underrun of all allocations.
+   2. Checking for double frees.
+   3. Use of an area that has been freed.
+   4. Identifying orphaned heap allocations.
+
+A constant no-man's-land value is written to areas that no one should be writing to:
+
+ */
+
+#define HD_NO_MANS_LAND -15
+
+/*   The following is the declaration of the HD header. */
+
+/* Heap debug header magic number type: */
+typedef char HD_Hdr_Magic_Type[8];
+
+typedef struct HD_Header_Struct
+{
+	HD_Hdr_Magic_Type        magic1;         /* Magic number to ensure this
+						    allocation has integrity.
+						    (guards against heap
+						    overrun from above). */
+	const char              *allocLoc;       /* Source file name/line
+						    number where this heap
+						    allocation was made. */
+	const char              *freeLoc;        /* Source filename/line number
+						    where this heap allocation
+						    was freed. */
+	struct HD_Header_Struct *nextHD_header;  /* Creates a singly-linked
+						    list of all heap
+						    allocations. */
+	uint64_t                 sizeOfAlloc;    /* size of this heap
+						    allocation. */
+	void                    *systemAlloc;    /* The actual return value
+						    from malloc()/posix_memalign(). */
+	uint64_t                 systemAllocSize;/* The size that is actually allocated
+						    by malloc()/posix_memalign(). */
+	HD_Hdr_Magic_Type        magic2;         /* Second magic number to
+						    ensure this allocation
+						    has integrity.
+						    (guards against heap
+						    underrun from the actual
+						    allocation that follows). */
+} __attribute__ ((packed)) HD_Header_Type;
+
+typedef struct HD_free_list_struct
+{
+	HD_Header_Type *freedStuct;
+	struct HD_free_list_struct *next_free_struct;
+} HD_Free_Struct_Type;
+
+static HD_Free_Struct_Type  *HD_free_list_root   = NULL;
+static HD_Free_Struct_Type **HD_free_list_bottom = &HD_free_list_root;
+
+typedef char HD_Trlr_Magic_Type[16];
+
+static const HD_Hdr_Magic_Type  HD_HDR_MGC_1 = "Eric";
+static const HD_Hdr_Magic_Type  HD_HDR_MGC_2 = "Emily";
+static const HD_Trlr_Magic_Type HD_TRLR_MGC  = "Erin&Elaine";
+
+/* Convert a pointer of an actual allocation to a pointer to its HD header: */
+static inline HD_Header_Type *HD_AA_TO_HD_HDR(void *aa)
+{
+	char *p = (char*)aa;
+	return (HD_Header_Type*)(p - sizeof(HD_Header_Type));
+}
+
+/* Convert a pointer to an HD header to the actual allocation: */
+static inline void *HD_HDR_TO_AA(HD_Header_Type *phdHdr)
+{
+	char *p = (char*)phdHdr;
+	return p + sizeof(HD_Header_Type);
+}
+
+/* Get the address of the trailer that follows the actual allocation: */
+static inline void *HD_GET_HD_TRLR(HD_Header_Type *phdr)
+{
+	char *p = (char*)HD_HDR_TO_AA(phdr);
+	return p + phdr->sizeOfAlloc;
+}
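+
+/* Layout sketch implied by the helpers above: for a caller-visible
+ * allocation AA of size sz,
+ *
+ *   AA - sizeof(HD_Header_Type)  ... HD header
+ *   AA                           ... caller-visible allocation
+ *   AA + sz                      ... HD trailer (sizeof(HD_TRLR_MGC) bytes)
+ */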
+
+static HD_Header_Type * HD_root_of_list = NULL;   /* Root of singly linked list
+						     of all heap allocations */
+static HD_Header_Type **HD_end_of_list = &HD_root_of_list;  /* Pointer to the
+	       last pointer of the singly linked list of all heap allocations. */
+
+/* Number of allocations in the list.  Maintained to assert the integrity
+   of the singly linked list of heap allocations. */
+static int n_allocations = 0;
+
+/* HD_check_one_struct() checks one heap allocation for integrity. */
+static inline void HD_check_one_struct(HD_Header_Type *p, int checkAA,const char *curloc)
+{
+	int s=0;
+
+	/* First check the magic values in the header and trailer: */
+	s |= memcmp(p->magic1,HD_HDR_MGC_1,sizeof(HD_HDR_MGC_1))       ? 1 : 0;
+	s |= memcmp(p->magic2,HD_HDR_MGC_2,sizeof(HD_HDR_MGC_2))       ? 2 : 0;
+	s |= memcmp(HD_GET_HD_TRLR(p),HD_TRLR_MGC,sizeof(HD_TRLR_MGC)) ? 4 : 0;
+
+	if (s != 0)
+	{
+		fprintf(stderr,"header/trailer error: checking location: %s, s: %d, p: %p, "
+			"p->allocLoc: %s\n",curloc,s,p,p->allocLoc);
+		fprintf(stderr,"actual allocation starts at: %p, length: %" PRIu64  "\n", (char*)HD_HDR_TO_AA(p),p->sizeOfAlloc);
+		fflush(0);
+		abort();
+	}
+
+	/* Next, check the area between systemAlloc and the start of the header */
+	signed char *pchr = (signed char *)p->systemAlloc;
+	while (pchr < (signed char*)p)
+	{
+		psmi_assert_always(*pchr == (signed char) HD_NO_MANS_LAND);
+		pchr++;
+	}
+
+	/* Lastly, check the actual allocation area if directed to do so: */
+	if (checkAA)
+	{
+		uint64_t i;
+		signed char *pchr = HD_HDR_TO_AA(p);
+		for (i=0;i < p->sizeOfAlloc;i++)
+			if (pchr[i] != (signed char) HD_NO_MANS_LAND)
+			{
+				fprintf(stderr,
+					"use after free; ptr: %p,\n"
+					" allocated from: %s,\n"
+					" validated from: %s\n"
+					" freed from: %s\n",
+					pchr+i,p->allocLoc,curloc,p->freeLoc);
+				fflush(0);
+				psmi_assert_always(0);
+			}
+	}
+}
+
+/* _psmi_heapdebug_val_heapallocs() walks the singly linked list and inspects all
+ *  heap allocations to ensure that all of them still have integrity. */
+void _psmi_heapdebug_val_heapallocs(const char *curloc)
+{
+	/* first check current allocation list: */
+	HD_Header_Type *p = HD_root_of_list;
+	int cnt = 0;
+
+	while (p)
+	{
+		HD_check_one_struct(p,0,curloc);
+		p = p->nextHD_header;
+		cnt++;
+	}
+	psmi_assert_always(cnt == n_allocations);
+	/* Next check free list */
+	HD_Free_Struct_Type *pfreestruct = HD_free_list_root;
+	while (pfreestruct)
+	{
+		HD_check_one_struct(pfreestruct->freedStruct,1,curloc);
+		pfreestruct = pfreestruct->next_free_struct;
+	}
+}
+
+/* psmi_heapdebug_finalize() validates the heap and then emits all of the
+   allocations to stdout to help debug heap memory leaks. */
+void psmi_heapdebug_finalize(void)
+{
+	/* First validate the existing heap allocations: */
+
+	psmi_heapdebug_val_heapallocs();
+
+	printf("orphaned heap allocations: %d\n", n_allocations);
+
+	if (n_allocations > 0)
+	{
+		/* Now, emit all of the allocations to stdout. */
+
+		HD_Header_Type *p = HD_root_of_list;
+
+		while (p)
+		{
+			printf("orphaned heap allocation: %p allocated at: %s, size: %lu\n",
+			       p, p->allocLoc, p->sizeOfAlloc);
+
+			p = p->nextHD_header;
+		}
+		fflush(0);
+		/* Abort if any allocations still exist: */
+		abort();
+	}
+}
+
+/* hd_est_hdr_trlr() appends the new allocation to the singly linked list and adds
+ * the header and trailer to the allocation.  Lastly, it validates the existing singly-linked
+ * list for integrity. */
+static void hd_est_hdr_trlr(HD_Header_Type *hd_alloc,
+			    void *systemAlloc,
+			    uint64_t systemSize,
+			    uint64_t actualSize,
+			    const char *curloc)
+{
+	/* First, write HD_NO_MANS_LAND to the entire allocation: */
+	memset(systemAlloc,HD_NO_MANS_LAND,systemSize);
+
+	/* Write the HD header info: */
+	memcpy(hd_alloc->magic1,HD_HDR_MGC_1,sizeof(HD_HDR_MGC_1));
+	hd_alloc->allocLoc = curloc;
+	hd_alloc->freeLoc = NULL;
+	hd_alloc->nextHD_header = NULL;
+	hd_alloc->sizeOfAlloc = actualSize;
+	hd_alloc->systemAlloc = systemAlloc;
+	hd_alloc->systemAllocSize = systemSize;
+	memcpy(hd_alloc->magic2,HD_HDR_MGC_2,sizeof(HD_HDR_MGC_2));
+	memcpy(HD_GET_HD_TRLR(hd_alloc),HD_TRLR_MGC,sizeof(HD_TRLR_MGC));
+	*HD_end_of_list = hd_alloc;
+	HD_end_of_list = &hd_alloc->nextHD_header;
+	n_allocations++;
+	psmi_heapdebug_val_heapallocs();
+}
+
+/* hd_malloc() is the heap debug version of malloc that will create the header and trailer
+ * and link the allocation into the singly linked list. */
+static inline void *hd_malloc(size_t sz, const char *curloc)
+{
+	const uint64_t wholeSize = sizeof(HD_Header_Type) + sz + sizeof(HD_TRLR_MGC);
+	HD_Header_Type *hd_alloc = (HD_Header_Type*)malloc(wholeSize);
+
+	if (!hd_alloc)
+		return NULL;
+	hd_est_hdr_trlr(hd_alloc,hd_alloc,wholeSize,sz,curloc);
+	return HD_HDR_TO_AA(hd_alloc);
+}
+
+/* hd_memalign() is the heap debug version of posix_memalign(). */
+static inline int hd_memalign(void **ptr,uint64_t alignment, size_t sz, const char *curloc)
+{
+	void *systemAlloc = NULL;
+	const uint64_t alignMask = alignment - 1;
+	uint64_t systemSize = sizeof(HD_Header_Type) + alignMask + sz + sizeof(HD_TRLR_MGC);
+	int rv = posix_memalign(&systemAlloc,alignment,systemSize);
+	char *actualAlloc = NULL;
+	const char *endOfSystemAlloc = ((char*)systemAlloc) + systemSize;
+
+	if (rv)
+		return rv;
+
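+	/* Round the first address past the HD header up to the next
+	   'alignment' boundary; the asserts below verify that the header
+	   still fits in front of the returned pointer and that the trailer
+	   fits inside the system allocation. */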
+	uint64_t actualAllocu64 = (uint64_t) systemAlloc;
+	actualAllocu64 += sizeof(HD_Header_Type) + alignMask;
+	actualAllocu64 &= ~ alignMask;
+	actualAlloc = (char*)actualAllocu64;
+	psmi_assert_always((actualAllocu64 & alignMask) == 0);
+	psmi_assert_always((actualAlloc+sz+sizeof(HD_TRLR_MGC)) <= endOfSystemAlloc);
+	psmi_assert_always((actualAlloc - (char*)systemAlloc) >= sizeof(HD_Header_Type));
+
+	hd_est_hdr_trlr(HD_AA_TO_HD_HDR(actualAlloc),systemAlloc,systemSize,sz,curloc);
+	*ptr = actualAlloc;
+	return rv;
+}
+
+/* hd_free() is the heap debug version of free().  First, hd_free() ensures that the ptr to be
+ * freed in fact is known by the HD code.  Next, hd_free() removes the ptr from the list. Then,
+ * hd_free scribbles to the ptr's area and actually frees the heap space. */
+static inline void hd_free(void *ptr,const char *curloc)
+{
+	HD_Header_Type *hd_alloc = HD_AA_TO_HD_HDR(ptr);
+	HD_Header_Type *p = HD_root_of_list, *q = NULL;
+
+	psmi_heapdebug_val_heapallocs();
+	while (p)
+	{
+		if (p == hd_alloc)
+		{
+			/* first, fix the next pointers: */
+			if (q)
+			{
+				q->nextHD_header = p->nextHD_header;
+			}
+			else
+			{
+				psmi_assert_always(p == HD_root_of_list);
+				HD_root_of_list = p->nextHD_header;
+			}
+			/* Now, handle the case of removing the last entry in the list. */
+			if (&p->nextHD_header == HD_end_of_list)
+			{
+				if (q)
+				{
+					q->nextHD_header = NULL;
+					HD_end_of_list = &q->nextHD_header;
+				}
+				else
+				{
+					HD_root_of_list = NULL;
+					HD_end_of_list = &HD_root_of_list;
+				}
+			}
+			/* Scribble to the actual allocation to make further access to the heap
+			   area unusable. */
+			n_allocations--;
+			memset(HD_HDR_TO_AA(hd_alloc),HD_NO_MANS_LAND,hd_alloc->sizeOfAlloc);
+			hd_alloc->freeLoc = curloc;
+			/* Add this allocation to the free list. */
+			HD_Free_Struct_Type *pfreestruct = (HD_Free_Struct_Type*)malloc(sizeof(HD_Free_Struct_Type));
+			*HD_free_list_bottom = pfreestruct;
+			HD_free_list_bottom = &pfreestruct->next_free_struct;
+			pfreestruct->freedStruct = hd_alloc;
+			pfreestruct->next_free_struct = NULL;
+			psmi_heapdebug_val_heapallocs();
+			return;
+		}
+		q = p;
+		p = p->nextHD_header;
+	}
+	/* trying to free a heap allocation that we did not allocate. */
+	psmi_assert_always(0);
+}
+
+size_t hd_malloc_usable_size(void *ptr,const char *curloc)
+{
+	HD_Header_Type *hd_alloc = HD_AA_TO_HD_HDR(ptr);
+	/* Report only the caller-visible size; including the HD header and
+	   trailer would let psmi_realloc copy past the end of the
+	   allocation. */
+	return hd_alloc->sizeOfAlloc;
+}
+
+#endif
+
+#ifdef PSM_HEAP_DEBUG
+
+/* For HD code, we retarget the malloc, memalign and free calls to the hd versions
+ * of the code. */
+
+#define my_malloc(SZ,CURLOC)              hd_malloc(SZ,CURLOC)
+#define my_memalign(PTR,ALIGN,SZ,CURLOC)  hd_memalign(PTR,ALIGN,SZ,CURLOC)
+#define my_free(PTR,CURLOC)               hd_free(PTR,CURLOC)
+#define my_malloc_usable_size(PTR,CURLOC) hd_malloc_usable_size(PTR,CURLOC)
+
+#else
+
+/* For non-HD code, we target the code to the usual functions: */
+#define my_malloc(SZ,CURLOC)              malloc(SZ)
+#define my_memalign(PTR,ALIGN,SZ,CURLOC)  posix_memalign(PTR,ALIGN,SZ)
+#define my_free(PTR,CURLOC)               free(PTR)
+#define my_malloc_usable_size(PTR,CURLOC) malloc_usable_size(PTR)
+
+#endif
+
+void *psmi_malloc_internal(psm2_ep_t ep, psmi_memtype_t type,
+			   size_t sz, const char *curloc)
+{
+	size_t newsz = sz;
+	void *newa;
+
+	if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY)
+	    newsz += sizeof(struct psmi_memtype_hdr);
+
+	newa = my_malloc(newsz,curloc);
+	if (newa == NULL) {
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY,
+				  "Out of memory for malloc at %s", curloc);
+		return NULL;
+	}
+
+	if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) {
+		struct psmi_memtype_hdr *hdr = (struct psmi_memtype_hdr *)newa;
+		hdr->size = newsz;
+		hdr->type = type;
+		hdr->magic = 0x8c;
+		hdr->original_allocation = newa;
+		psmi_log_memstats(type, newsz);
+		newa = (void *)(hdr + 1);
+		/* _HFI_INFO("alloc is %p\n", newa); */
+	}
+	return newa;
+}
+
+void *psmi_realloc_internal(psm2_ep_t ep, psmi_memtype_t type,
+			    void *ptr, size_t nsz, const char *curloc)
+{
+	if (ptr)
+	{
+		size_t existingSize = psmi_malloc_usable_size_internal(ptr,curloc);
+		if (nsz > existingSize)
+		{
+			void *newPtr = psmi_malloc_internal(ep,type,nsz,curloc);
+
+			memcpy(newPtr,ptr,existingSize);
+			psmi_free_internal(ptr,curloc);
+			return newPtr;
+		}
+		else
+			/* We will not support shrinking virtual space
+			   for performance reasons. */
+			return ptr;
+	}
+	else
+		return psmi_malloc_internal(ep,type,nsz,curloc);
+}
+
+#ifdef memalign
+#undef memalign
+#endif
+void *psmi_memalign_internal(psm2_ep_t ep, psmi_memtype_t type,
+			     size_t alignment, size_t sz, const char *curloc)
+{
+	size_t newsz = sz;
+	void *newa;
+	int ret, preambleSize = 0;
+
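+	/* When memory stats are enabled, reserve a preamble in front of the
+	   caller-visible block for the memtype header: sizeof(struct
+	   psmi_memtype_hdr) rounded up to a whole multiple of the alignment
+	   (e.g. a 16-byte header with 8-byte alignment gives 16), or one
+	   full alignment unit when the header is smaller. */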
+	if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY)
+	{
+		if (sizeof(struct psmi_memtype_hdr) > alignment)
+		{
+			int n = sizeof(struct psmi_memtype_hdr) / alignment;
+			int r = sizeof(struct psmi_memtype_hdr) % alignment;
+			if (r)
+				n++;
+			preambleSize = n * alignment;
+		}
+		else
+			preambleSize = alignment;
+		newsz += preambleSize;
+	}
+
+	ret = my_memalign(&newa, alignment, newsz, curloc);
+	if (ret) {
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY,
+				  "Out of memory for malloc at %s", curloc);
+		return NULL;
+	}
+
+	if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) {
+		void *rv = (void *)((uint8_t *)newa + preambleSize);
+		struct psmi_memtype_hdr *hdr = (struct psmi_memtype_hdr *)((uint8_t *)rv - sizeof(struct psmi_memtype_hdr));
+		hdr->size = newsz;
+		hdr->type = type;
+		hdr->magic = 0x8c;
+		hdr->original_allocation = newa;
+		psmi_log_memstats(type, newsz);
+		newa = rv;
+		/* _HFI_INFO("alloc is %p\n", newa); */
+	}
+	return newa;
+}
+
+#ifdef calloc
+#undef calloc
+#endif
+
+void *psmi_calloc_internal(psm2_ep_t ep, psmi_memtype_t type, size_t nelem,
+			   size_t elemsz, const char *curloc)
+{
+	void *newa = psmi_malloc_internal(ep, type, nelem * elemsz, curloc);
+	if (newa == NULL)	/* error handled above */
+		return NULL;
+	memset(newa, 0, nelem * elemsz);
+	return newa;
+}
+
+#ifdef strdup
+#undef strdup
+#endif
+
+void *psmi_strdup_internal(psm2_ep_t ep, const char *string, const char *curloc)
+{
+	size_t len = strlen(string) + 1;
+	void *newa = psmi_malloc_internal(ep, UNDEFINED, len, curloc);
+	if (newa == NULL)
+		return NULL;
+	memcpy(newa, string, len);	/* copy with \0 */
+	return newa;
+}
+
+#ifdef free
+#undef free
+#endif
+
+void MOCKABLE(psmi_free_internal)(void *ptr,const char *curloc)
+{
+	if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) {
+		struct psmi_memtype_hdr *hdr =
+		    (struct psmi_memtype_hdr *)ptr - 1;
+		/* _HFI_INFO("hdr is %p, ptr is %p\n", hdr, ptr); */
+		psmi_memtype_t type = hdr->type;
+		int64_t size = hdr->size;
+		int magic = (int)hdr->magic;
+		psmi_log_memstats(type, -size);
+		psmi_assert_always(magic == 0x8c);
+		ptr = hdr->original_allocation;
+	}
+	my_free(ptr,curloc);
+}
+MOCK_DEF_EPILOGUE(psmi_free_internal);
+
+#ifdef malloc_usable_size
+#undef malloc_usable_size
+#endif
+
+size_t psmi_malloc_usable_size_internal(void *ptr, const char *curLoc)
+{
+	return my_malloc_usable_size(ptr,curLoc);
+}
+
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+psmi_coreopt_ctl(const void *core_obj, int optname,
+		 void *optval, uint64_t *optlen, int get))
+{
+	psm2_error_t err = PSM2_OK;
+
+	switch (optname) {
+	case PSM2_CORE_OPT_DEBUG:
+		/* Sanity check length */
+		if (*optlen < sizeof(unsigned)) {
+			err =  psmi_handle_error(NULL,
+					PSM2_PARAM_ERR,
+					"Option value length error");
+			*optlen = sizeof(unsigned);
+			return err;
+		}
+
+		if (get) {
+			*((unsigned *)optval) = hfi_debug;
+		} else
+			hfi_debug = *(unsigned *)optval;
+		break;
+	case PSM2_CORE_OPT_EP_CTXT:
+		{
+			/* core object is epaddr */
+			psm2_epaddr_t epaddr = (psm2_epaddr_t) core_obj;
+
+			/* Sanity check epaddr */
+			if (!epaddr) {
+				return psmi_handle_error(NULL,
+						PSM2_PARAM_ERR,
+						"Invalid endpoint address");
+			}
+
+			/* Sanity check length */
+			if (*optlen < sizeof(unsigned long)) {
+				err =  psmi_handle_error(NULL,
+						PSM2_PARAM_ERR,
+						"Option value length error");
+				*optlen = sizeof(unsigned long);
+				return err;
+			}
+
+			if (get) {
+				*((unsigned long *)optval) =
+				    (unsigned long)epaddr->usr_ep_ctxt;
+			} else
+				epaddr->usr_ep_ctxt = optval;
+		}
+		break;
+	default:
+		/* Unknown/unrecognized option */
+		err = psmi_handle_error(NULL,
+				PSM2_PARAM_ERR,
+				"Unknown PSM3_CORE option %u.",
+				optname);
+		break;
+	}
+	return err;
+}
+
+psm2_error_t psmi_core_setopt(const void *core_obj, int optname,
+			     const void *optval, uint64_t optlen)
+{
+	return psmi_coreopt_ctl(core_obj, optname, (void *)optval, &optlen, 0);
+}
+
+psm2_error_t psmi_core_getopt(const void *core_obj, int optname,
+			     void *optval, uint64_t *optlen)
+{
+	return psmi_coreopt_ctl(core_obj, optname, optval, optlen, 1);
+}
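+
+/*
+ * These back the public psm2_setopt()/psm2_getopt() entry points for the
+ * CORE component.  A sketch, assuming the option API declared in psm2.h:
+ *
+ *   unsigned dbg = 1;
+ *   psm2_setopt(PSM2_COMPONENT_CORE, NULL, PSM2_CORE_OPT_DEBUG,
+ *               &dbg, sizeof(dbg));
+ */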
+
+/* PSM AM component option handling */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+psmi_amopt_ctl(const void *am_obj, int optname,
+	       void *optval, uint64_t *optlen, int get))
+{
+	psm2_error_t err = PSM2_OK;
+
+	/* AM object is a psm2_epaddr (or NULL for global minimum sz) */
+	/* psm2_epaddr_t epaddr = (psm2_epaddr_t) am_obj; */
+
+	/* All AM options are read-only. */
+	if (!get) {
+		return err =
+		    psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_OPT_READONLY,
+				      "Attempted to set read-only option value");
+	}
+
+	/* Sanity check length -- all AM options are uint32_t. */
+	if (*optlen < sizeof(uint32_t)) {
+		*optlen = sizeof(uint32_t);
+		return err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_PARAM_ERR,
+					       "Option value length error");
+	}
+
+	switch (optname) {
+	case PSM2_AM_OPT_FRAG_SZ:
+		*((uint32_t *) optval) = psmi_am_parameters.max_request_short;
+		break;
+	case PSM2_AM_OPT_NARGS:
+		*((uint32_t *) optval) = psmi_am_parameters.max_nargs;
+		break;
+	case PSM2_AM_OPT_HANDLERS:
+		*((uint32_t *) optval) = psmi_am_parameters.max_handlers;
+		break;
+	default:
+		err =
+		    psmi_handle_error(NULL, PSM2_PARAM_ERR,
+				      "Unknown PSM3_AM option %u.", optname);
+	}
+
+	return err;
+}
+
+psm2_error_t psmi_am_setopt(const void *am_obj, int optname,
+			   const void *optval, uint64_t optlen)
+{
+	return psmi_amopt_ctl(am_obj, optname, (void *)optval, &optlen, 0);
+}
+
+psm2_error_t psmi_am_getopt(const void *am_obj, int optname,
+			   void *optval, uint64_t *optlen)
+{
+	return psmi_amopt_ctl(am_obj, optname, optval, optlen, 1);
+}
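+
+/*
+ * Example (a sketch, assuming the option API declared in psm2.h): querying
+ * the maximum short-request fragment size:
+ *
+ *   uint32_t frag_sz;
+ *   uint64_t len = sizeof(frag_sz);
+ *   psm2_getopt(PSM2_COMPONENT_AM, NULL, PSM2_AM_OPT_FRAG_SZ,
+ *               &frag_sz, &len);
+ */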
+
+#ifdef PSM_LOG
+
+#include <execinfo.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include "ptl_ips/ips_proto_header.h"
+
+/* A treeNode is used to store the list of Function Name Lists that
+   are passed to the PSM_LOG facility via environment variables.
+   See psm_log.h for more information.
+
+   Note that treeNode is a node in a binary tree data structure. */
+typedef struct _treeNode
+{
+	const char *name;
+	int line1,line2;
+	struct _treeNode *left,*right;
+} treeNode;
+
+/* An epmTreeNode is used to track the number of protocol packets
+   that are sent/received, for a given opcode, from a source epid
+   to another epid. */
+typedef struct _epmTreeNode
+{
+	int opcode,count,txrx;
+	uint64_t fromepid,toepid;
+	struct _epmTreeNode *left,*right;
+} epmTreeNode;
+
+
+/* given a line range: [*line1 .. *line2], and another line, 'join' the
+   line to the range if the line immediately abuts or overlaps the range.
+   If the new line does not abut or overlap the existing range, return 0.
+   Else, return 1.
+
+   For example, take the line range [ 20 .. 30 ] and the line: 19.
+   Since 19 comes immediately before 20, the line range can be joined
+   resulting in the line range: [ 19 .. 30 ].  The function returns 1 for this
+   case.
+
+   The following other examples gives the new line range given the new line and
+   range [ 20 .. 30 ], and gives the return value:
+
+   31 [ 20 .. 31 ] 1
+   18 [ 20 .. 30 ] 0
+   32 [ 20 .. 30 ] 0
+   25 [ 20 .. 30 ] 1 */
+static int joinOverlap(int *line1,int *line2,int line)
+{
+	long long ll_line = line;
+
+	if (ll_line+1 >= *line1 && ll_line-1 <= *line2)
+	{
+		*line1 = min(*line1,line);
+		*line2 = max(*line2,line);
+		return 1;
+	}
+	return 0;
+}
+
+/* given two line ranges, determine the range that encompasses both line ranges
+   if an overlap has occurred.  Returns 0 if the two ranges do not overlap and
+   do not abut.
+
+   Some examples, if line1=20 and line2=30
+
+   [20 30] [20 30] 2
+   [19 30] [19 30] 2
+   [19 20] [19 30] 2
+   [10 15] [20 30] 0
+   [40 50] [20 30] 0 */
+static int joinOverlapRange(int *line1,int *line2,int l1,int l2)
+{
+	return joinOverlap(line1,line2,l1) + joinOverlap(line1,line2,l2);
+}
+
+/* inserts a new treeNode into the FNL tree, or, merges the lines that are already
+   present in the tree. */
+static void insertNodeInTree(treeNode **root,const char *name,int line1,int line2)
+{
+	if (*root)
+	{
+		int c = strcmp(name,(*root)->name);
+		if (c < 0)
+			insertNodeInTree(&((*root)->left),name,line1,line2);
+		else if (c > 0)
+			insertNodeInTree(&((*root)->right),name,line1,line2);
+		else
+		{
+			if (joinOverlapRange(&(*root)->line1,&(*root)->line2,line1,line2))
+				return;
+			else if (line1 < (*root)->line1)
+				insertNodeInTree(&((*root)->left),name,line1,line2);
+			else if (line2 > (*root)->line2)
+				insertNodeInTree(&((*root)->right),name,line1,line2);
+			else psmi_assert_always(0); /* should never happen. */
+		}
+	}
+	else
+	{
+		*root = malloc(sizeof(treeNode));
+		(*root)->name  = strdup(name);
+		(*root)->line1 = line1;
+		(*root)->line2 = line2;
+		(*root)->left  = (*root)->right  = NULL;
+	}
+}
+
+/* Returns -1 if the data in the node is less    than the data supplied as parameter, else
+   Returns  1 if the data in the node is greater than the data supplied as parameter, else
+   Returns  0.
+   */
+static int compareEpmNode(epmTreeNode *node,int opcode,int txrx,uint64_t fromepid,uint64_t toepid)
+{
+#define COMPARE_ONE(X) if (node->X != X) return node->X < X ? -1 : 1
+	COMPARE_ONE(opcode);
+	COMPARE_ONE(txrx);
+	COMPARE_ONE(fromepid);
+	COMPARE_ONE(toepid);
+	return 0;
+}
+
+/* Inserts a new node in the tree corresponding to the parameters, or, retrieves the node in the tree.
+   In either case, this code returns a pointer to the count in the node. */
+static int *insertNodeInEpmTree(epmTreeNode **root,int opcode,int txrx,uint64_t fromepid,uint64_t toepid)
+{
+	if (*root)
+	{
+		int a = compareEpmNode((*root),opcode,txrx,fromepid,toepid);
+		if (a < 0)
+			return insertNodeInEpmTree(&((*root)->left),opcode,txrx,fromepid,toepid);
+		else if (a > 0)
+			return insertNodeInEpmTree(&((*root)->right),opcode,txrx,fromepid,toepid);
+		else
+			return &((*root)->count);
+	}
+	else
+	{
+		*root = malloc(sizeof(epmTreeNode));
+		(*root)->opcode   = opcode;
+		(*root)->txrx     = txrx;
+		(*root)->count    = 0;
+		(*root)->fromepid = fromepid;
+		(*root)->toepid   = toepid;
+		(*root)->left     = (*root)->right  = NULL;
+		return &((*root)->count);
+	}
+}
+
+/* returns 0, if the node is present, non-zero if it is absent. */
+static int lookupNodeInTree(const treeNode *root,const char *name,int line)
+{
+	if (root)
+	{
+		int c = strcmp(name,root->name);
+		if (c < 0)
+			return lookupNodeInTree(root->left,name,line);
+		else if (c > 0)
+			return lookupNodeInTree(root->right,name,line);
+		else
+		{
+			if (line < root->line1)
+				return lookupNodeInTree(root->left,name,line);
+			else if (line > root->line2)
+				return lookupNodeInTree(root->right,name,line);
+			else /* line must be >= root->line1 and line must be <= root->line2. */
+				return 0;
+		}
+	}
+	else
+	{
+		return 1;
+	}
+}
+
+/* Declare a prototype for a parserFunc - referenced in the following code: */
+typedef void parserFunc(char *,int,int,void *);
+
+/* breaks down a string into 'c'-delimited substrings, and calls the parser func for each substring. */
+static void parseString(char *ps,char c,parserFunc pf,void *ctx)
+{
+	int idx,n=0;
+	char *p;
+
+	/* first, count the number of instances of c in ps, for use by the parser function: */
+	for (idx=0;ps[idx];idx++)
+		if (ps[idx] == c)
+			n++;
+	/* next, break down ps into 'c'-delimited substrings, and call parser function, pf for each substring: */
+	for (idx=0,p=ps;p && *p;idx++)
+	{
+		char *t = strchr(p,c);
+		if (!t)
+		{
+			break;
+		}
+		else
+		{
+			*t = 0;
+			pf(p,idx,n,ctx);
+			p = t+1;
+		}
+	}
+	/* finally, call pf on the final substring. */
+	pf(p,idx,n,ctx);
+}
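+
+/* For example, parseString("a:b:c", ':', pf, ctx) calls pf("a",0,2,ctx),
+   pf("b",1,2,ctx) and finally pf("c",2,2,ctx). */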
+
+/* fncNameCtx is the context used while parsing FNL's (see psm_log.h for more info) from the environment: */
+typedef struct
+{
+	const char *currentFuncName;
+	int firstLineNumber;
+	treeNode **root;
+} funcNameCtx;
+
+/* This is the start of the parser code for parsing FNL's.  Here is the grammar:
+
+  An FNL is a 'Function Name List' that is defined by the following grammar:
+
+  # A LINE1 is either a single line number of a range of line numbers:
+(1)  LINE1 :: lineNumber |
+(2)           lineNumber1 '-' lineNumber2
+
+  # LINES is a list of LINE1's separated by commas:
+(3)  LINES :: LINE1 |
+(4)           LINE1 ',' LINES
+
+  # An FN is either a function name, or a function name with a list of lines:
+(5)  FN :: functionName |
+(6)        functionName ';' LINES
+
+  # A FNL is a list of FN's separated by colons:
+(7)  FNL ::  FN |
+(8)          FN ':' FNL
+
+  # Examples:
+  foo:bar    the two functions foo and bar
+  foo;1-10   lines 1 to 10 of function foo.
+  bar;1,3,5  lines 1, 3 and 5 of function bar
+
+*/
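+
+/* For example (the function names are illustrative):
+
+     PSM3_LOG_INC_FUNCTION_NAMES="ips_proto_mq_send;1-200:psmi_mq_wait"
+
+   restricts logging to lines 1 through 200 of ips_proto_mq_send plus all
+   of psmi_mq_wait. */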
+
+/* p4() inserts a (function name, line number) pair or a (function name,
+   line number range) entry into the FNL tree. */
+static void p4(char *s,int idx,int n,void *ctx)
+{
+	funcNameCtx *pfnc = (funcNameCtx *)ctx;
+
+	if (n == 0) /* production (1) */
+	{
+		pfnc->firstLineNumber = atoi(s);
+		insertNodeInTree(pfnc->root,pfnc->currentFuncName,pfnc->firstLineNumber,pfnc->firstLineNumber);
+	}
+	else if (n == 1) /* production (2) */
+	{
+		if (idx == 0) /* lhs of production (2) */
+			pfnc->firstLineNumber = atoi(s);
+		else /* rhs of production (2). */
+			insertNodeInTree(pfnc->root,pfnc->currentFuncName,pfnc->firstLineNumber,atoi(s));
+	}
+}
+
+/* p3 puts an entry into the FNL tree for all of the lines of a given
+   function name, or it parses the list of line number ranges and uses p4
+   to spill each individual range (or just one line number) into the tree */
+static void p3(char *s,int idx,int n,void *ctx)
+{
+	funcNameCtx *pfnc = (funcNameCtx *)ctx;
+
+	if (n == 0 && *s == 0) /* production (5)/(7) */
+	{
+		insertNodeInTree(pfnc->root,pfnc->currentFuncName,0,INT_MAX);
+	}
+	else if (*s) /* production (2) */
+	{
+		/* break down the string into hyphen-delimited substrings, and further parse each substring with p4: */
+		parseString(s,'-',p4,ctx);
+	}
+}
+
+/* p2 parses the function name, and caches it into the context, and thereafter uses p3 to parse the line number range list. */
+static void p2(char *s,int idx,int n,void *ctx)
+{
+	funcNameCtx *pfnc = (funcNameCtx *)ctx;
+
+	if (n)
+	{
+		if (idx == 0)
+			pfnc->currentFuncName = s;
+		else
+		{
+			/* production (4) */
+			/* break down the string into comma-delimited substrings, and further parse each substring with p3: */
+			parseString(s,',',p3,ctx);
+		}
+	}
+	else
+	{
+		/* production (7)/(5). */
+		insertNodeInTree(pfnc->root,pfnc->currentFuncName=s,0,INT_MAX);
+	}
+}
+
+/* p1 parses each function name and line range list. */
+static void p1(char *s,int idx,int n,void *ctx)
+{
+	/* production (5)/(6)) */
+	/* break down the string into semicolon-delimited substrings, and further parse each substring with p2: */
+	parseString(s,';',p2,ctx);
+}
+
+static void parseAndInsertInTree(const char *buf,treeNode **root)
+{
+	funcNameCtx t;
+	t.root = root;
+	char *p = alloca(strlen(buf)+1);
+	strcpy(p,buf);
+	/* productions (7)/(8) */
+	/* separates the string into colon-separated strings, and then parses each substring in p1: */
+	parseString(p,':',p1,(void*)&t);
+}
+
+/* initialization code for the psmi log mechanism. */
+static inline void psmi_initialize(const char **plmf_fileName_kernel,
+				   const char **plmf_search_format_string,
+				   treeNode   **includeFunctionNamesTreeRoot,
+				   treeNode   **excludeFunctionNamesTreeRoot)
+{
+	static volatile int  plmf_initialized = 0;
+
+	if (!plmf_initialized)
+	{
+		static pthread_mutex_t plmf_init_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+		if (pthread_mutex_lock(&plmf_init_mutex))
+		{
+			perror("cannot lock mutex for psmi_log_message facility");
+			return;
+		}
+                /* CRITICAL SECTION BEGIN */
+		if (!plmf_initialized)
+		{
+			/* initializing psmi log message facility here. */
+			const char *env = getenv("PSM3_LOG_FILENAME");
+			if (env)
+				*plmf_fileName_kernel = env;
+			env = getenv("PSM3_LOG_SRCH_FORMAT_STRING");
+			if (env)
+			{
+				*plmf_search_format_string = env;
+			}
+			else
+			{
+				env = getenv("PSM3_LOG_INC_FUNCTION_NAMES");
+				if (env)
+				{
+					parseAndInsertInTree(env,includeFunctionNamesTreeRoot);
+				}
+				env = getenv("PSM3_LOG_EXC_FUNCTION_NAMES");
+				if (env)
+				{
+					parseAndInsertInTree(env,excludeFunctionNamesTreeRoot);
+				}
+			}
+			/* initialization of psmi log message facility is completed. */
+			plmf_initialized = 1;
+		}
+		/* CRITICAL SECTION END */
+		if (pthread_mutex_unlock(&plmf_init_mutex))
+		{
+			perror("cannot unlock mutex for psmi_log_message facility");
+			return;
+		}
+	}
+}
+
+/* Utility function to map the integer txrx value to the given strings for emitting to the log file. */
+static const char *TxRxString(int txrx)
+{
+	switch(txrx)
+	{
+	case PSM2_LOG_TX:	return "Sent";
+	case PSM2_LOG_RX:	return "Received";
+	case PSM2_LOG_PEND:	return "Pending";
+	default:		return "Unknown";
+	}
+}
+
+/* Utility function to map an integer opcode value to the given strings for emitting to the log file. */
+static const char *OpcodeString(int opcode)
+{
+	switch(opcode)
+	{
+	case OPCODE_LONG_RTS:          return "RTS";
+	case OPCODE_LONG_CTS:          return "CTS";
+	case OPCODE_LONG_DATA:         return "DATA";
+	case OPCODE_ERR_CHK_RDMA:      return "ERR_CHK_RDMA";
+	case OPCODE_ERR_CHK_RDMA_RESP: return "ERR_CHK_RDMA_RESP";
+	default:                       return "UNKNOWN";
+	}
+}
+
+static const char     *plmf_fileName_kernel         = "/tmp/psm2_log";
+static const char     *plmf_search_format_string    = NULL;
+static       treeNode *includeFunctionNamesTreeRoot = NULL;
+static       treeNode *excludeFunctionNamesTreeRoot = NULL;
+
+void psmi_log_initialize(void)
+{
+	/* If not initialized, then, initialize in a single thread of execution. */
+	psmi_initialize(&plmf_fileName_kernel,
+			&plmf_search_format_string,
+			&includeFunctionNamesTreeRoot,
+			&excludeFunctionNamesTreeRoot);
+}
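+
+/*
+ * Example (illustrative paths/patterns): write per-thread logs under
+ * /tmp/mylog and keep only messages whose format string matches a glob:
+ *
+ *   PSM3_LOG_FILENAME=/tmp/mylog PSM3_LOG_SRCH_FORMAT_STRING='*timeout*'
+ *
+ * Log files are named <PSM3_LOG_FILENAME>.<pid>.<thread-id>.
+ */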
+
+#ifdef PSM_LOG_FAST_IO
+
+struct psmi_log_io_thread_info
+	HD_Header_Type *freedStruct;
+	pthread_t thread_id;
+	char *buff;
+	unsigned long max_buff_length, curr_buff_length;
+	pthread_mutex_t flags_mutex;
+	volatile int flags;
+#define PSMI_LOG_IO_FLAG_IO_IN_PROGRESS 1  /* io is currently in progress */
+#define PSMI_LOG_IO_FLAG_IO_SHUTDOWN    2  /* we are shutting down logging. */
+};
+
+/* Please note that psmi_log_io_info is in thread local storage. */
+static __thread struct psmi_log_io_thread_info psmi_log_io_info =
+{
+	.thread_id        = 0,
+	.buff             = NULL,
+	.max_buff_length  = 0,
+	.curr_buff_length = 0,
+	.flags_mutex      = PTHREAD_MUTEX_INITIALIZER,
+	.flags            = 0
+};
+
+static struct
+{
+	unsigned int nTableEntries,maxTableEntries;
+	pthread_mutex_t table_mutex;
+	struct psmi_log_io_thread_info **table;
+} psmi_log_io_table =
+{
+	.nTableEntries   = 0,
+	.maxTableEntries = 0,
+	.table_mutex     = PTHREAD_MUTEX_INITIALIZER,
+	.table           = NULL
+};
+
+void psmi_log_fini()
+{
+	if (pthread_mutex_lock(&psmi_log_io_table.table_mutex))
+	{
+		perror("Cannot lock mutex for psmi_log_io_table");
+		return;
+	}
+	/* Start critical section. */
+
+	unsigned int i;
+	for (i=0;i < psmi_log_io_table.nTableEntries;i++)
+	{
+		if (psmi_log_io_table.table[i])
+		{
+			struct psmi_log_io_thread_info *pti = psmi_log_io_table.table[i];
+			int flags;
+
+			if (pthread_mutex_lock(&pti->flags_mutex))
+			{
+				perror("can't lock the flags mutex.");
+				continue;
+			}
+			/* critical section */
+			flags = (pti->flags |= PSMI_LOG_IO_FLAG_IO_SHUTDOWN);
+			/* end critical section */
+			pthread_mutex_unlock(&pti->flags_mutex);
+			/* if io is currently in progress, allow it to complete. */
+			while (flags & PSMI_LOG_IO_FLAG_IO_IN_PROGRESS)
+			{
+				sleep(1);
+				if (pthread_mutex_lock(&pti->flags_mutex))
+				{
+					perror("can't lock the flags mutex.");
+					continue;
+				}
+				flags = pti->flags;
+				pthread_mutex_unlock(&pti->flags_mutex);
+			}
+			if (pti->buff)
+			{
+				char logFileName[256];
+				FILE *fout;
+
+				snprintf(logFileName,sizeof(logFileName),"%s.%d.%ld",
+					 plmf_fileName_kernel,getpid(),pti->thread_id);
+				fout = fopen(logFileName,"w");
+				if (!fout)
+				{
+					perror(logFileName);
+					continue;
+				}
+				fwrite(pti->buff,pti->curr_buff_length,1,fout);
+				fclose(fout);
+			}
+		}
+		psmi_log_io_table.table[i] = NULL;
+	}
+	psmi_log_io_table.nTableEntries = 0;
+	psmi_free(psmi_log_io_table.table);
+	psmi_log_io_table.table = NULL;
+	psmi_log_io_table.maxTableEntries = 0;
+	/* End critical section. */
+	pthread_mutex_unlock(&psmi_log_io_table.table_mutex);
+}
+
+static int psmi_log_register_tls(void)
+{
+	if (psmi_log_io_info.thread_id != pthread_self())
+	{
+		psmi_log_io_info.thread_id = pthread_self();
+		if (pthread_mutex_lock(&psmi_log_io_table.table_mutex))
+		{
+			perror("cannot lock table mutex");
+			return -1;
+		}
+		/* critical section start. */
+		if (psmi_log_io_table.maxTableEntries < psmi_log_io_table.nTableEntries+1)
+		{
+			if (psmi_log_io_table.maxTableEntries == 0)
+			{
+				psmi_log_io_table.maxTableEntries = 2;
+				psmi_log_io_table.table = psmi_malloc(PSMI_EP_NONE,
+								      PER_PEER_ENDPOINT,
+								      psmi_log_io_table.maxTableEntries *
+								      sizeof(struct psmi_log_io_thread_info *));
+			}
+			else
+			{
+				psmi_log_io_table.maxTableEntries *= 2;
+				psmi_log_io_table.table = psmi_realloc(PSMI_EP_NONE,
+								       PER_PEER_ENDPOINT,
+								       psmi_log_io_table.table,
+								       psmi_log_io_table.maxTableEntries *
+								       sizeof(struct psmi_log_io_thread_info *));
+			}
+		}
+		psmi_log_io_table.table[psmi_log_io_table.nTableEntries] = &psmi_log_io_info;
+		psmi_log_io_table.nTableEntries++;
+		/* critical section end. */
+		pthread_mutex_unlock(&psmi_log_io_table.table_mutex);
+	}
+	if (pthread_mutex_lock(&psmi_log_io_info.flags_mutex))
+	{
+		perror("cannot lock table mutex");
+		return -1;
+	}
+	/* critical section start. */
+	int old_flags = psmi_log_io_info.flags;
+	int new_flags = old_flags;
+	if (0 == (old_flags & PSMI_LOG_IO_FLAG_IO_SHUTDOWN))
+		new_flags |= PSMI_LOG_IO_FLAG_IO_IN_PROGRESS;
+	psmi_log_io_info.flags = new_flags;
+	/* critical section end. */
+	pthread_mutex_unlock(&psmi_log_io_info.flags_mutex);
+	if (new_flags & PSMI_LOG_IO_FLAG_IO_IN_PROGRESS)
+		return 0;
+	return -1;
+}
+
+static void psmi_buff_fclose(int port)
+{
+	if (pthread_mutex_lock(&psmi_log_io_info.flags_mutex))
+	{
+		perror("cannot lock table mutex");
+		return;
+	}
+	/* critical section start. */
+	psmi_log_io_info.flags &= ~PSMI_LOG_IO_FLAG_IO_IN_PROGRESS;
+	/* critical section end. */
+	pthread_mutex_unlock(&psmi_log_io_info.flags_mutex);
+}
+
+static void growBuff(size_t minExcess)
+{
+	while (psmi_log_io_info.curr_buff_length+minExcess > psmi_log_io_info.max_buff_length)
+	{
+		if (!psmi_log_io_info.buff)
+			psmi_log_io_info.buff = (char *)psmi_malloc(PSMI_EP_NONE, PER_PEER_ENDPOINT,
+								    psmi_log_io_info.max_buff_length = 1 << 20);
+		else
+		{
+			psmi_log_io_info.max_buff_length *= 2;
+			psmi_log_io_info.buff = (char *)psmi_realloc(PSMI_EP_NONE, PER_PEER_ENDPOINT,
+								     psmi_log_io_info.buff,
+								     psmi_log_io_info.max_buff_length);
+		}
+	}
+}
+
+static int psmi_buff_vfprintf(int port, const char *format, va_list ap)
+{
+	int done = 0;
+	size_t excess = 1024;
+	int length;
+
+	while (!done)
+	{
+		va_list ap_copy;
+
+		growBuff(excess);
+
+		/* vsnprintf consumes its va_list, so work on a copy in case
+		   the buffer is too small and another pass is needed. */
+		va_copy(ap_copy, ap);
+		length = vsnprintf(psmi_log_io_info.buff + psmi_log_io_info.curr_buff_length,
+				   excess, format, ap_copy);
+		va_end(ap_copy);
+		if (length >= excess)
+			excess *= 2;
+		else
+			done = 1;
+	}
+	psmi_log_io_info.curr_buff_length += length;
+	return length;
+}
+
+static int psmi_buff_fprintf(int port,const char *format, ...)
+{
+	int length;
+	va_list ap;
+
+	va_start(ap, format);
+
+	length = psmi_buff_vfprintf(port,format,ap);
+
+	va_end(ap);
+	return length;
+}
+
+static int psmi_buff_fputc(int c, int port)
+{
+	growBuff(1024);
+	psmi_log_io_info.buff[psmi_log_io_info.curr_buff_length] = c;
+	psmi_log_io_info.curr_buff_length++;
+	return 1;
+}
+#endif
+
+
+#define IS_PSMI_LOG_MAGIC(S) ((((uint64_t)(S)) <= ((uint64_t)PSM2_LOG_MIN_MAGIC)) && \
+			      (((uint64_t)(S)) >= ((uint64_t)PSM2_LOG_MAX_MAGIC)))
+
+/* plmf is short for 'psm log message facility'.  All of the PSM_LOG macros defined in psm_log.h
+   are serviced from this back end. */
+void psmi_log_message(const char *fileName,
+		      const char *functionName,
+		      int         lineNumber,
+		      const char *format, ...)
+{
+	va_list ap;
+
+	va_start(ap, format);
+
+	/* Next, determine if this log message is signal or noise. */
+	if (plmf_search_format_string)
+	{
+		if (!IS_PSMI_LOG_MAGIC(format))
+		{
+			if (fnmatch(plmf_search_format_string, format, 0))
+			{
+				va_end(ap);
+				/* tis noise, return. */
+				return;
+			}
+		}
+	}
+	else
+	{
+		if (includeFunctionNamesTreeRoot)
+		{
+			if (lookupNodeInTree(includeFunctionNamesTreeRoot,functionName,lineNumber))
+			{
+				va_end(ap);
+				/* tis noise, return. */
+				return;
+			}
+		}
+
+		if (excludeFunctionNamesTreeRoot)
+		{
+			if (!lookupNodeInTree(excludeFunctionNamesTreeRoot,functionName,lineNumber))
+			{
+				va_end(ap);
+				/* tis noise, return. */
+				return;
+			}
+		}
+	}
+
+	/* At this point, we think that this may be a message that we want to emit to the log.
+	   But, there is one more test, to apply to the cases where the format is one of the
+	   special formats for backtrack, and packet stream for example. */
+	{
+		void           **voidarray   = NULL;
+		int              nframes     = 0;
+		const char      *newFormat   = format;
+		int              opcode      = 0;
+		psmi_log_tx_rx_t txrx        = 0;
+		uint64_t         fromepid    = 0;
+		uint64_t         toepid      = 0;
+		void            *dumpAddr[2] = {0};
+		size_t           dumpSize[2] = {0};
+
+#ifdef PSM_LOG_FAST_IO
+#define IO_PORT         0
+#define MY_FPRINTF      psmi_buff_fprintf
+#define MY_VFPRINTF     psmi_buff_vfprintf
+#define MY_FPUTC        psmi_buff_fputc
+#define MY_FCLOSE       psmi_buff_fclose
+#else
+		char logFileName[256];
+		FILE *fout;
+#define IO_PORT         fout
+#define MY_FPRINTF      fprintf
+#define MY_VFPRINTF     vfprintf
+#define MY_FPUTC        fputc
+#define MY_FCLOSE       fclose
+#endif
+		struct timespec tp;
+
+		/* Pop arguments for the alternative forms of PSM_LOG functionality: */
+		if (format == PSM2_LOG_BT_MAGIC)
+		{
+			voidarray = va_arg(ap,void **);
+			nframes   = va_arg(ap,int);
+			newFormat = va_arg(ap,const char *);
+		}
+		else if (format == PSM2_LOG_EPM_MAGIC)
+		{
+			opcode    = va_arg(ap,int);
+			txrx      = va_arg(ap,psmi_log_tx_rx_t);
+			fromepid  = va_arg(ap,uint64_t);
+			toepid    = va_arg(ap,uint64_t);
+			newFormat = va_arg(ap,const char *);
+		}
+		else if (format == PSM2_LOG_DUMP_MAGIC)
+		{
+			dumpAddr[0]  = va_arg(ap,void*);
+			dumpSize[0]  = va_arg(ap,size_t);
+			newFormat    = va_arg(ap,const char *);
+		}
+		else if (format == PSM2_LOG_PKT_STRM_MAGIC)
+		{
+			txrx        = va_arg(ap,psmi_log_tx_rx_t);
+			dumpAddr[0] = va_arg(ap,struct ips_message_header *);
+			if (txrx == PSM2_LOG_RX)
+			{
+				dumpAddr[1] = va_arg(ap,uint32_t *);
+				dumpSize[1] = sizeof(uint64_t);
+			}
+			newFormat   = va_arg(ap,const char *);
+			dumpSize[0] = sizeof(struct ips_message_header);
+		}
+
+		/* One last test to make sure that this message is signal: */
+		if (plmf_search_format_string && newFormat)
+		{
+			if (fnmatch(plmf_search_format_string, newFormat, 0))
+			{
+				va_end(ap);
+				/* tis noise, return. */
+				return;
+			}
+		}
+
+#ifdef PSM_LOG_FAST_IO
+		if (psmi_log_register_tls() != 0)
+		{
+			va_end(ap);
+			return;
+		}
+#else
+		/* At this point we know that the message is not noise, and it is going to be emitted to the log. */
+		snprintf(logFileName,sizeof(logFileName),"%s.%d.%ld",
+			 plmf_fileName_kernel,getpid(),
+			 pthread_self());
+		fout = fopen(logFileName,"a");
+		if (!fout)
+		{
+			va_end(ap);
+			return;
+		}
+#endif
+
+#define M1()	clock_gettime(CLOCK_REALTIME, &tp);				 	\
+			MY_FPRINTF(IO_PORT,"%f %s %s:%d: ",				\
+			   (double)tp.tv_sec + ((double)tp.tv_nsec/1000000000.0),	\
+			   functionName,fileName,lineNumber)
+
+		M1();
+
+		if (!IS_PSMI_LOG_MAGIC(format))
+		{
+			MY_VFPRINTF(IO_PORT,format,ap);
+			MY_FPUTC('\n',IO_PORT);
+		}
+		else if (format == PSM2_LOG_BT_MAGIC)
+		{
+			void *newframes[nframes];
+			int  newframecnt      = backtrace(newframes,nframes);
+			int  pframes          = min(newframecnt,nframes);
+
+			MY_VFPRINTF(IO_PORT,newFormat,ap);
+			MY_FPUTC('\n',IO_PORT);
+
+			if (memcmp(voidarray,newframes,pframes * sizeof(void*)))
+			{
+				int i;
+				char **strings;
+
+				memcpy(voidarray,newframes,sizeof(newframes));
+				M1();
+				MY_FPRINTF(IO_PORT,
+					   "backtrace() returned %d addresses\n",
+					   newframecnt);
+				strings = backtrace_symbols(voidarray, pframes);
+				if (strings == NULL)
+				{
+					perror("backtrace_symbols");
+					exit(EXIT_FAILURE);
+				}
+				for (i = 0; i < pframes; i++)
+				{
+					M1();
+					MY_FPRINTF(IO_PORT,"%s\n", strings[i]);
+				}
+#undef free
+				free(strings);
+			}
+		}
+		else if (format == PSM2_LOG_EPM_MAGIC)
+		{
+			static epmTreeNode *root = 0;
+			static pthread_mutex_t plmf_epm_mutex =
+				PTHREAD_MUTEX_INITIALIZER;
+			int *pcount = 0;
+			if (pthread_mutex_lock(&plmf_epm_mutex))
+			{
+				perror("cannot lock mutex for "
+				       "psmi_log_message facility");
+				va_end(ap);
+				return;
+			}
+			/* START OF CRITICAL SECTION */
+			pcount = insertNodeInEpmTree(&root,opcode,txrx,
+						     fromepid,toepid);
+			/* END OF CRITICAL SECTION */
+			if (pthread_mutex_unlock(&plmf_epm_mutex))
+			{
+				perror("cannot unlock mutex for "
+				       "psmi_log_message facility");
+				va_end(ap);
+				return;
+			}
+			(*pcount)++;
+			MY_FPRINTF(IO_PORT,"%s %s from: %" PRIx64
+				   ", to: %" PRIx64 ", count: %d, ",
+				   TxRxString(txrx),OpcodeString(opcode),
+				   fromepid,toepid,*pcount);
+			MY_VFPRINTF(IO_PORT,newFormat,ap);
+			MY_FPUTC('\n',IO_PORT);
+		}
+		else if (format == PSM2_LOG_PKT_STRM_MAGIC)
+		{
+			MY_FPRINTF(IO_PORT,"PKT_STRM: %s: imh: %p%s ", TxRxString(txrx),
+				   dumpAddr[0], (txrx == PSM2_LOG_RX) ? "," : "");
+			if (txrx == PSM2_LOG_RX)
+				MY_FPRINTF(IO_PORT,"rhf: %p ", dumpAddr[1]);
+			goto dumpit;
+		}
+		else if (format == PSM2_LOG_DUMP_MAGIC)
+		{
+			MY_VFPRINTF(IO_PORT,newFormat,ap);
+			MY_FPUTC('\n',IO_PORT);
+		dumpit:
+			M1();
+
+			uint8_t *pu8 = (uint8_t *)dumpAddr[0];
+			size_t   i,cnt=0;
+			for (i=0;i < dumpSize[0];i++)
+			{
+				if ((i != 0) && ((i % 8) == 0))
+				{
+					MY_FPRINTF(IO_PORT," (%d)\n",(int)(i-8));
+					M1();
+					cnt = 0;
+				}
+				else if (cnt)
+					MY_FPUTC(',',IO_PORT);
+				MY_FPRINTF(IO_PORT,"0x%02x", pu8[i]);
+				cnt++;
+			}
+			if (cnt)
+				MY_FPRINTF(IO_PORT," (%d)\n",(int)(i-8));
+			if (dumpSize[1])
+			{
+				dumpSize[0] = dumpSize[1];
+				dumpAddr[0] = dumpAddr[1];
+				dumpSize[1] = 0;
+				goto dumpit;
+			}
+		}
+		MY_FCLOSE(IO_PORT);
+	}
+
+	va_end(ap);
+}
+#endif /* #ifdef PSM_LOG */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_utils.h b/deps/libfabric/prov/psm3/psm3/psm_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..a27bb81e8826666435b3782d05178c86437bdeab
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_utils.h
@@ -0,0 +1,458 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef _PSMI_IN_USER_H
+#error psm_utils.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_UTILS_H
+#define _PSMI_UTILS_H
+
+#include <arpa/inet.h>		/* ipv4addr */
+#include <stdlib.h>		/* malloc/free */
+#include <psm_netutils.h>
+
+/*
+ * Endpoint 'id' hash table, with iterator interface
+ */
+struct psmi_epid_table {
+	struct psmi_epid_tabentry *table;
+	int tabsize;
+	int tabsize_used;
+	pthread_mutex_t tablock;
+};
+/*
+ * Endpoint address hash table
+ */
+struct psmi_epid_tabentry {
+	void *entry;
+	uint64_t key;
+	psm2_ep_t ep;
+	psm2_epid_t epid;
+};
+
+extern struct psmi_epid_table psmi_epid_table;
+#define EPADDR_DELETED	((void *)-1)	/* tag used to mark deleted entries */
+
+psm2_error_t psmi_epid_init();
+psm2_error_t psmi_epid_fini();
+void *psmi_epid_lookup(psm2_ep_t ep, psm2_epid_t epid);
+void *psmi_epid_remove(psm2_ep_t ep, psm2_epid_t epid);
+void psmi_epid_remove_all(psm2_ep_t ep);
+psm2_error_t psmi_epid_add(psm2_ep_t ep, psm2_epid_t epid, void *entry);
+#define PSMI_EP_HOSTNAME    ((psm2_ep_t) -1)	/* Special endpoint handle we use
+						 * to register hostnames */
+#define PSMI_EP_CROSSTALK   ((psm2_ep_t) -2)	/* Second special endpoint handle
+						 * to log which nodes we've seen
+						 * crosstalk from */
+struct psmi_eptab_iterator {
+	int i;			/* last index looked up */
+	psm2_ep_t ep;
+};
+void psmi_epid_itor_init(struct psmi_eptab_iterator *itor, psm2_ep_t ep);
+void *psmi_epid_itor_next(struct psmi_eptab_iterator *itor);
+void psmi_epid_itor_fini(struct psmi_eptab_iterator *itor);
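+/*
+ * Illustrative sketch (not part of the upstream header): walking the entries
+ * registered for one endpoint with the iterator interface above.  The entry
+ * type is whatever the caller stored via psmi_epid_add().
+ *
+ *   struct psmi_eptab_iterator itor;
+ *   void *entry;
+ *   psmi_epid_itor_init(&itor, ep);
+ *   while ((entry = psmi_epid_itor_next(&itor)) != NULL) {
+ *           // process entry
+ *   }
+ *   psmi_epid_itor_fini(&itor);
+ */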
+
+uint64_t psmi_epid_version(psm2_epid_t epid);
+
+/*
+ * Hostname manipulation
+ */
+char *psmi_gethostname(void);
+const char *psmi_epaddr_fmt_addr(psm2_epid_t epid);
+const char *psmi_epaddr_get_hostname(psm2_epid_t epid);
+const char *psmi_epaddr_get_name(psm2_epid_t epid);
+psm2_error_t psmi_epid_set_hostname(uint64_t nid, const char *hostname,
+				   int overwrite);
+
+/*
+ * Memory allocation, use macros only.
+ *
+ * In all calls, ep can be a specific endpoint (valid psm2_ep_t) or PSMI_EP_NONE
+ * if no endpoint is available.
+ *
+ *   psmi_malloc_usable_size(void *ptr)
+ *   psmi_malloc(ep, memtype, size)
+ *   psmi_realloc(ep, memtype, ptr, newsize)
+ *   psmi_memalign(ep, memtype, alignment, size)
+ *   psmi_calloc(ep, memtype, elemsz, numelems)
+ *   psmi_strdup(ep, memtype, ptr)
+ *   psmi_free(ptr)
+ *
+ */
+typedef enum psmi_memtype {
+	TOTAL = 0,		/* Logged automatically by malloc/calloc */
+	UNDEFINED,		/* For tracking "other types" of allocations */
+	PER_PEER_ENDPOINT,	/* For tracking "per peer" allocations */
+	NETWORK_BUFFERS,	/* For tracking network buffers */
+	DESCRIPTORS,		/* For tracking send/recv descriptors */
+	UNEXPECTED_BUFFERS,	/* For tracking unexpected recv buffers */
+	STATS,			/* For tracking stats-related allocs */
+#ifdef RNDV_MOD
+	// TBD, should we just tabulate this into PER_PEER_ENDPOINT
+	// maybe once debugged we should consolidate?
+	PEER_RNDV,			/* for tracking Rendezvous per RC QP resources */
+#endif
+} psmi_memtype_t;
+
+/*
+ * We track allocation stats.
+ */
+struct psmi_stats_malloc {
+	int64_t m_all_total;
+	int64_t m_all_max;
+	int64_t m_perpeer_total;
+	int64_t m_perpeer_max;
+	int64_t m_netbufs_total;
+	int64_t m_netbufs_max;
+	int64_t m_descriptors_total;
+	int64_t m_descriptors_max;
+	int64_t m_unexpbufs_total;
+	int64_t m_unexpbufs_max;
+	int64_t m_undefined_total;
+	int64_t m_undefined_max;
+	int64_t m_stats_total;
+	int64_t m_stats_max;
+#ifdef RNDV_MOD
+	int64_t m_peerrndv_total;
+	int64_t m_peerrndv_max;
+#endif
+};
+
+extern struct psmi_stats_malloc psmi_stats_memory;
+
+void psmi_mem_stats_register(void);
+
+void *psmi_malloc_internal(psm2_ep_t ep, psmi_memtype_t mt, size_t sz,
+			   const char *curloc);
+void *psmi_realloc_internal(psm2_ep_t ep, psmi_memtype_t mt, void *ptr,
+			    size_t newSz, const char *curloc);
+void *psmi_memalign_internal(psm2_ep_t ep, psmi_memtype_t mt, size_t alignment,
+			     size_t sz, const char *curloc);
+void *psmi_calloc_internal(psm2_ep_t ep, psmi_memtype_t mt, size_t num,
+			   size_t sz, const char *curloc);
+void *psmi_strdup_internal(psm2_ep_t ep, const char *string, const char *curloc);
+
+void MOCKABLE(psmi_free_internal)(void *ptr, const char *curLoc);
+MOCK_DCL_EPILOGUE(psmi_free_internal);
+
+size_t psmi_malloc_usable_size_internal(void *ptr, const char *curLoc);
+
+#ifdef PSM_HEAP_DEBUG
+/* When heap debugging is enabled, calls to psmi_heapdebug_val_heapallocs()
+   can be sprinkled throughout the code; each call examines all of the heap
+   allocations to verify their integrity. */
+void _psmi_heapdebug_val_heapallocs(const char *curloc);
+
+#define psmi_heapdebug_val_heapallocs() _psmi_heapdebug_val_heapallocs(PSMI_CURLOC)
+
+/* Finalize the heapdebug functionality after tear down of the psm
+   session, when you are certain that all heap allocations have been
+   freed. psmi_heapdebug_finalize() will emit all of the extant
+   heap allocations and abort if there are any.  This is to aid
+   in debugging heap leaks. */
+void psmi_heapdebug_finalize(void);
+
+#else
+
+#define psmi_heapdebug_val_heapallocs() /* nothing */
+#define psmi_heapdebug_finalize() /* nothing */
+
+#endif
+
+#define psmi_strdup(ep, string) psmi_strdup_internal(ep, string, PSMI_CURLOC)
+#define psmi_calloc(ep, mt, nelem, elemsz) \
+	psmi_calloc_internal(ep, mt, nelem, elemsz, PSMI_CURLOC)
+#define psmi_malloc(ep, mt, sz) psmi_malloc_internal(ep, mt, sz, PSMI_CURLOC)
+#define psmi_realloc(ep, mt, ptr, nsz) psmi_realloc_internal(ep, mt, ptr, nsz, PSMI_CURLOC)
+#define psmi_memalign(ep, mt, al, sz) \
+	psmi_memalign_internal(ep, mt, al, sz, PSMI_CURLOC)
+#define psmi_free(ptr)	psmi_free_internal(ptr, PSMI_CURLOC)
+#define psmi_malloc_usable_size(ptr) psmi_malloc_usable_size_internal(ptr, PSMI_CURLOC)
+#ifndef PSM_IS_TEST
+#define malloc(sz)        _use_psmi_malloc_instead_of_plain_malloc
+#define realloc(ptr,nsz)  _use_psmi_realloc_instead_of_plain_realloc
+#define memalign(algn,sz) _use_psmi_memalign_instead_of_plain_memalign
+#define calloc(sz, nelm)  _use_psmi_calloc_instead_of_plain_calloc
+#ifdef strdup
+#undef strdup
+#endif
+#define strdup(ptr)             _use_psmi_strdup_instead_of_plain_strdup
+#define free(ptr)               _use_psmi_free_instead_of_plain_free
+#define malloc_usable_size(ptr) _use_psmi_malloc_usable_size_instead_of_plain_malloc_usable_size
+#endif /* PSM_IS_TEST */
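+/*
+ * Illustrative sketch (not part of the upstream header): the wrappers take an
+ * endpoint (or PSMI_EP_NONE) plus a memtype so allocations are attributed to
+ * the statistics above; otherwise they behave like their libc counterparts.
+ *
+ *   char *buf = (char *)psmi_malloc(ep, UNDEFINED, 4096);
+ *   if (buf != NULL) {
+ *           // ... use buf ...
+ *           psmi_free(buf);
+ *   }
+ */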
+
+void psmi_log_memstats(psmi_memtype_t type, int64_t nbytes);
+
+/*
+ * Parse int parameters
+ * -1 -> parse error
+ */
+long psmi_parse_str_long(const char *str);
+
+/*
+ * Parsing int parameters set in string tuples.
+ */
+int psmi_parse_str_tuples(const char *str, int ntup, int *vals);
+
+/*
+ * Resource Limiting based on PSM memory mode.
+ */
+#define PSMI_MEMMODE_NORMAL  0
+#define PSMI_MEMMODE_MINIMAL 1
+#define PSMI_MEMMODE_LARGE   2
+#define PSMI_MEMMODE_NUM     3
+
+struct psmi_rlimit_mpool {
+	const char *env;
+	const char *descr;
+	int env_level;
+	uint32_t minval;
+	uint32_t maxval;
+	struct {
+		uint32_t obj_chunk;
+		uint32_t obj_max;
+	} mode[PSMI_MEMMODE_NUM];
+};
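+/*
+ * Illustrative sketch (not part of the upstream header); all names and values
+ * below are hypothetical.  One table entry gives per-memory-mode chunk and max
+ * object counts for an mpool, consumed by psmi_parse_mpool_env() below:
+ *
+ *   static const struct psmi_rlimit_mpool example_rlim = {
+ *           .env = "PSM3_EXAMPLE_POOL", .descr = "example pool size",
+ *           .env_level = PSMI_ENVVAR_LEVEL_HIDDEN,
+ *           .minval = 1, .maxval = 65536,
+ *           .mode[PSMI_MEMMODE_NORMAL]  = { .obj_chunk = 256, .obj_max = 4096 },
+ *           .mode[PSMI_MEMMODE_MINIMAL] = { .obj_chunk = 128, .obj_max = 1024 },
+ *           .mode[PSMI_MEMMODE_LARGE]   = { .obj_chunk = 512, .obj_max = 16384 },
+ *   };
+ */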
+psm2_error_t psmi_parse_mpool_env(const psm2_mq_t mq, int level,
+				 const struct psmi_rlimit_mpool *rlim,
+				 uint32_t *valo, uint32_t *chunkszo);
+int psmi_parse_memmode(void);
+int psmi_parse_identify(void);
+unsigned psmi_parse_senddma(void);
+unsigned psmi_parse_rdmamode(void);
+#ifdef PSM_CUDA
+unsigned psmi_parse_gpudirect(void);
+unsigned psmi_parse_gpudirect_send_limit(void);
+unsigned psmi_parse_gpudirect_recv_limit(void);
+#endif
+
+/*
+ * Parsing environment variables
+ */
+
+union psmi_envvar_val {
+	void *e_void;
+	char *e_str;
+	int e_int;
+	unsigned int e_uint;
+	long e_long;
+	unsigned long e_ulong;
+	unsigned long long e_ulonglong;
+};
+
+#define PSMI_ENVVAR_LEVEL_USER	         1
+#define PSMI_ENVVAR_LEVEL_HIDDEN         2
+#define PSMI_ENVVAR_LEVEL_NEVER_PRINT    4
+
+#define PSMI_ENVVAR_TYPE_YESNO		0
+#define PSMI_ENVVAR_TYPE_STR		1
+#define PSMI_ENVVAR_TYPE_INT		2
+#define PSMI_ENVVAR_TYPE_UINT		3
+#define PSMI_ENVVAR_TYPE_UINT_FLAGS	4
+#define PSMI_ENVVAR_TYPE_LONG		5
+#define PSMI_ENVVAR_TYPE_ULONG		6
+#define PSMI_ENVVAR_TYPE_ULONG_FLAGS	7
+#define PSMI_ENVVAR_TYPE_ULONG_ULONG    8
+
+#define PSMI_ENVVAR_VAL_YES ((union psmi_envvar_val) 1)
+#define PSMI_ENVVAR_VAL_NO  ((union psmi_envvar_val) 0)
+
+int
+MOCKABLE(psmi_getenv)(const char *name, const char *descr, int level,
+		int type, union psmi_envvar_val defval,
+		union psmi_envvar_val *newval);
+MOCK_DCL_EPILOGUE(psmi_getenv);
+int psmi_parse_val_pattern(const char *env, int def, int def_syntax);
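+/*
+ * Illustrative sketch (not part of the upstream header): reading a yes/no
+ * environment variable with a default of "no".  The variable name
+ * PSM3_EXAMPLE_FLAG is hypothetical.
+ *
+ *   union psmi_envvar_val val;
+ *   psmi_getenv("PSM3_EXAMPLE_FLAG", "hypothetical example flag",
+ *               PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_YESNO,
+ *               PSMI_ENVVAR_VAL_NO, &val);
+ *   if (val.e_int) {
+ *           // flag was set to yes
+ *   }
+ */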
+/*
+ * Misc functionality
+ */
+uintptr_t psmi_getpagesize(void);
+uint64_t psmi_cycles_left(uint64_t start_cycles, int64_t timeout_ns);
+uint32_t psmi_get_ipv4addr();
+void psmi_syslog(psm2_ep_t ep, int to_console, int level,
+		 const char *format, ...);
+void *psmi_memcpyo(void *dst, const void *src, size_t n);
+uint32_t psmi_crc(unsigned char *buf, int len);
+
+/*
+ * Internal CPUID detection
+ */
+#define CPUID_FAMILY_MASK       0x00000f00
+#define CPUID_MODEL_MASK        0x000000f0
+#define CPUID_EXMODEL_MASK      0x000f0000
+
+/*
+ * CPUID return values
+ */
+#define CPUID_FAMILY_XEON       0x00000600
+#define CPUID_MODEL_PHI_GEN2    87
+#define CPUID_MODEL_PHI_GEN2M   133
+/*
+ * cpuid function 0 returns "GenuineIntel" in EBX,EDX,ECX;
+ * due to little-endian storage and hex notation it is not so obvious
+ */
+#define CPUID_GENUINE_INTEL_EBX 0x756e6547 /* "uneG" - Little Endian "Genu" */
+#define CPUID_GENUINE_INTEL_ECX 0x6c65746e /* "letn" - Little Endian "ntel" */
+#define CPUID_GENUINE_INTEL_EDX 0x49656e69 /* "Ieni" - Little Endian "ineI" */
+
+/*
+ * These values are internal only, not real register values
+ */
+#define CPUID_GENUINE_INTEL     0xf0000000
+#define CPUID_MODEL_UNDEFINED   -1
+
+/*
+ * Global model so we can tune defaults better for specific CPUs
+ */
+extern uint32_t psmi_cpu_model;
+
+/*
+ * Diagnostics, all in psm_diags.c
+ */
+int psmi_diags(void);
+
+/*
+ * Multiple Endpoints
+ */
+extern int psmi_multi_ep_enabled;
+void psmi_multi_ep_init();
+
+#ifdef PSM_FI
+/*
+ * Fault injection
+ * Controlled by:
+ *	PSM3_FI=0/1 - enable
+ *	PSM3_FI_TRACEFILE - where to put summary stats at end of run
+ *		"stdout", "stderr", or a prefix for a per-process filename
+ *	PSM3_FI_VERBOSE - output to stdout when a fault is generated
+ *	PSM3_FI_RAIL - only generate faults for secondary EPs/Rails/QPs
+ *	PSM3_FI_X - for each fault type: num:denom:seed
+ *			inject faults on num out of denom events; seed makes the
+ *			randomness reproducible
+ *		recvlost - discard packet on receive before processing
+ *		rq_lkey - RQ WQE with bad lkey
+ *		rc_rdma_lkey - User RC SQ WQE with bad lkey
+ *		rc_rdma_rkey - User RC SQ WQE with bad rkey
+ *		rv_rdma_len - RV SQ WQE with bad len
+ *		rv_rdma_rkey - RV SQ WQE with bad rkey
+ *		sq_lkey - SQ WQE with bad lkey
+ *		sendlost - discard packet on send before sending
+ *		reg_mr - register MR failure (ENOMEM)
+ *		nonpri_reg_mr - non-priority register MR failure (ENOMEM)
+ *		pri_reg_mr - priority register MR failure (ENOMEM)
+ *		gdrmmap - GPU gdrcopy pin and mmap failure
+ */
+int psmi_faultinj_enabled;	/* use macro to test */
+int psmi_faultinj_verbose;	/* use IS_FAULT macro to test */
+int psmi_faultinj_sec_rail;	/* faults only on secondary rails or EPs */
+
+struct psmi_faultinj_spec {
+	STAILQ_ENTRY(psmi_faultinj_spec) next;
+	char spec_name[PSMI_FAULTINJ_SPEC_NAMELEN];
+
+	uint64_t num_faults;
+	uint64_t num_calls;
+
+	struct drand48_data drand48_data;
+	int num;
+	int denom;
+	long int initial_seed;
+};
+
+#define PSMI_FAULTINJ_ENABLED()	(!!psmi_faultinj_enabled)
+#define PSMI_FAULTINJ_ENABLED_EP(ep)	(PSMI_FAULTINJ_ENABLED() \
+		&& (!psmi_faultinj_sec_rail || ((ep)->mctxt_master != (ep))))
+
+int psmi_faultinj_is_fault(struct psmi_faultinj_spec *fi); // use macro instead
+#define PSMI_FAULTINJ_IS_FAULT(fi, fmt, ...) \
+	(psmi_faultinj_is_fault(fi)? \
+			psmi_faultinj_verbose? \
+				(printf("%s: injecting fault: %s" fmt "\n", hfi_get_mylabel(), fi->spec_name, ##__VA_ARGS__), fflush(stdout), 1) \
+				: 1 \
+			: 0)
+
+void psmi_faultinj_init();
+void psmi_faultinj_fini();
+struct psmi_faultinj_spec *psmi_faultinj_getspec(const char *spec_name,
+						 const char *help,
+						 int num, int denom);
+#define PSMI_FAULTINJ_STATIC_DECL(var, spec_name, help, num, denom)	\
+	static struct psmi_faultinj_spec *var;			\
+	if_pf(PSMI_FAULTINJ_ENABLED() && (var) == NULL)			\
+	    (var) = psmi_faultinj_getspec((spec_name), (help), (num), (denom));
+
+#else
+#define PSMI_FAULTINJ_ENABLED()	0
+#define PSMI_FAULTINJ_ENABLED_EP(ep)	0
+#define PSMI_FAULTINJ_IS_FAULT(fi, fmt, ...) 0
+#define PSMI_FAULTINJ_STATIC_DECL(var, spec_name, help, num, denom)
+#endif /* #ifdef PSM_FI */
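+/*
+ * Illustrative call-site sketch (not part of the upstream header); the names
+ * fi_example/"example" and the lkey corruption are hypothetical, mirroring
+ * the real rq_lkey injection in psm_verbs_ep.c:
+ *
+ *   if_pf(PSMI_FAULTINJ_ENABLED_EP(ep)) {
+ *           PSMI_FAULTINJ_STATIC_DECL(fi_example, "example",
+ *                           "hypothetical fault point", 0, 100);
+ *           if_pf(PSMI_FAULTINJ_IS_FAULT(fi_example, " QP %u", qp_num))
+ *                   lkey = 55;      // deliberately corrupt the lkey
+ *   }
+ */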
+/*
+ * PSM core component set/get options
+ */
+psm2_error_t psmi_core_setopt(const void *core_obj, int optname,
+			     const void *optval, uint64_t optlen);
+
+psm2_error_t psmi_core_getopt(const void *core_obj, int optname,
+			     void *optval, uint64_t *optlen);
+
+/*
+ * PSM AM component set/get options
+ */
+psm2_error_t psmi_am_setopt(const void *am_obj, int optname,
+			   const void *optval, uint64_t optlen);
+
+psm2_error_t psmi_am_getopt(const void *am_obj, int optname,
+			   void *optval, uint64_t *optlen);
+
+#endif /* _PSMI_UTILS_H */
diff --git a/deps/libfabric/prov/psm3/psm3/psm_verbs_ep.c b/deps/libfabric/prov/psm3/psm3/psm_verbs_ep.c
new file mode 100644
index 0000000000000000000000000000000000000000..15d957bece03b2f53fb816d40af69c6a0295d04d
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_verbs_ep.c
@@ -0,0 +1,2195 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sched.h>		/* cpu_set */
+#include <ctype.h>		/* isalpha */
+#include <netdb.h>
+//#include <infiniband/verbs.h>
+#include <ifaddrs.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+#ifdef RNDV_MOD
+#include "psm_rndv_mod.h"
+#endif
+#include "opa_byteorder.h"
+#include "ips_proto_params.h"
+#include "psm2_hal.h"
+#ifdef PSM_FI
+#include "ips_config.h"
+#endif
+
+
+#ifdef min
+#undef min
+#endif
+#define min(a, b) ((a) < (b) ? (a) : (b))
+
+#ifdef max
+#undef max
+#endif
+#define max(a, b) ((a) > (b) ? (a) : (b))
+
+// macros taken from IbAccess imath.h
+/* round up value to align, align must be a power of 2 */
+#ifndef ROUNDUPP2
+#define ROUNDUPP2(val, align)   \
+    (((uint32_t)(val) + (uint32_t)(align) - 1) & (~((uint32_t)(align)-1)))
+#endif
+/* force to use 64 bits in 32bit box */
+#ifndef ROUNDUP64P2
+#define ROUNDUP64P2(val, align)   \
+        (((uint64_t)(val) + (uint64_t)(align) - 1) & (~((uint64_t)(align)-1)))
+#endif
+
+/* round up value to align, align can be any value, less efficient than ROUNDUPP2 */
+#ifndef ROUNDUP
+#define ROUNDUP(val, align) \
+    ((( ((uint32_t)(val)) + (uint32_t)(align) -1) / (align) ) * (align))
+#endif
+
+/* round down value to align, align must be a power of 2 */
+#ifndef ROUNDDOWNP2
+#define ROUNDDOWNP2(val, align) \
+    (((uint32_t)(val)) & (~((uint32_t)(align)-1)))
+#endif
+
+/* round down value to align, align can be any value, less efficient than ROUNDDOWNP2 */
+#ifndef ROUNDDOWN
+#define ROUNDDOWN(val, align)   \
+    ((( ((uint32_t)(val))) / (align) ) * (align))
+#endif
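+/*
+ * Worked examples (illustrative, not part of the upstream source):
+ *   ROUNDUPP2(100, 64)   == 128   (align must be a power of 2)
+ *   ROUNDUP(100, 24)     == 120   (any align, costs a divide)
+ *   ROUNDDOWNP2(100, 64) == 64
+ *   ROUNDDOWN(100, 24)   == 96
+ */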
+
+
+
+// convert MTU enums to bytes
+// TBD - is there a way to specify MTU > 4K, such as 9000 byte jumbo
+#define MTU_FIX          (7) // mtu_ind of 1 (256) => 2^(7+1)
+#define MTU_SIZE(mtu_ind) (((uint64_t)1 << (MTU_FIX + mtu_ind)))
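+// Worked example (illustrative): the IB MTU enum encodes 256 bytes as 1,
+// so MTU_SIZE(1) == 1 << (7+1) == 256, and MTU_SIZE(5) == 4096.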
+
+static psm2_error_t verbs_open_dev(psm2_ep_t ep, int unit, int port, psm2_uuid_t const job_key);
+static psm2_error_t
+check_port_state(psm2_ep_t ep);
+static struct ibv_qp* ud_qp_create(psm2_ep_t ep);
+static psm2_error_t modify_ud_qp_to_init(psm2_ep_t ep, struct ibv_qp *qp);
+static psm2_error_t modify_ud_qp_to_rtr(psm2_ep_t ep, struct ibv_qp *qp);
+static psm2_error_t modify_ud_qp_to_rts(psm2_ep_t ep, struct ibv_qp *qp);
+static const char *link_layer_str(int8_t link_layer);
+static enum psm_ibv_rate verbs_get_rate(uint8_t width, uint8_t speed);
+
+void __psm2_ep_free_verbs(psm2_ep_t ep);
+#ifdef RNDV_MOD
+static void deregister_rv_conn_stats(psm2_ep_t ep);
+static void deregister_rv_event_stats(psm2_ep_t ep);
+#endif
+
+// initialize the ep->verbs_ep portion of the ep
+psm2_error_t
+__psm2_ep_open_verbs(psm2_ep_t ep, int unit, int port, psm2_uuid_t const job_key)
+{
+	int flags;
+
+	// make sure all fields are empty.
+	memset(&ep->verbs_ep,0,sizeof(ep->verbs_ep));
+
+	ep->verbs_ep.qkey = *(uint32_t*)job_key;	// use 1st 32 bits of job_key
+
+	if (_HFI_PRDBG_ON) {
+		char uuid_str[64];
+		memset(&uuid_str, 0, sizeof(uuid_str));
+		uuid_unparse(job_key, uuid_str);
+		_HFI_PRDBG("job key %s qkey=0x%x\n", uuid_str, ep->verbs_ep.qkey);
+	}
+
+	if (PSM2_OK != verbs_open_dev(ep, unit, port, job_key)) {
+		// verbs_open_dev already posted error.
+		goto fail;
+	}
+
+	// compute an appropriate PSM payload size based on the UD MTU
+	// and save result into ep->mtu
+	if (PSM2_OK != check_port_state(ep)) {
+		goto fail;
+	}
+
+	// we'll poll, so no need to allocate an event channel
+	// 		eg. ibv_create_comp_channel
+
+	ep->verbs_ep.pd = ibv_alloc_pd(ep->verbs_ep.context);
+	if (! ep->verbs_ep.pd) {
+		_HFI_ERROR( "Unable to alloc PD on %s: %s\n",
+						ep->dev_name, strerror(errno));
+		goto fail;
+	}
+
+	// planned QP sizes, also influences CQ sizes
+	// PSM3_NUM_SEND_WQES, PSM3_NUM_RECV_WQES
+
+	// we use ep as the cq_context (would be in callbacks if any)
+	// we don't setup a completion channel nor completion vector since we will
+	// poll
+	// we will never have more than hfi_num_send_wqes +  hfi_num_send_rdma
+	// so CQ only needs a little headroom to be safe (1000)
+	ep->verbs_ep.send_cq = ibv_create_cq(ep->verbs_ep.context, ep->hfi_num_send_wqes+ep->hfi_num_send_rdma + 1000, (void*)ep, NULL, 0);
+	if (! ep->verbs_ep.send_cq) {
+		_HFI_ERROR( "Unable to create send CQ of size %u on %s: %s\n",
+						ep->hfi_num_send_wqes+ep->hfi_num_send_rdma+1000,
+						ep->dev_name, strerror(errno));
+		goto fail;
+	}
+
+	ep->verbs_ep.recv_comp_channel = ibv_create_comp_channel(ep->verbs_ep.context);
+	if (! ep->verbs_ep.recv_comp_channel) {
+		_HFI_ERROR( "Unable to create recv CQ completion channel on %s: %s\n",
+						ep->dev_name, strerror(errno));
+		goto fail;
+	}
+	// change completion channel to non-blocking
+	flags = fcntl( ep->verbs_ep.recv_comp_channel->fd, F_GETFL);
+	if (0 > fcntl( ep->verbs_ep.recv_comp_channel->fd, F_SETFL, flags | O_NONBLOCK)) {
+		_HFI_ERROR( "Unable to change file descriptor of completion event channel for %s: %s\n",
+					ep->dev_name, strerror(errno));
+		goto fail;
+	}
+	// this gets done by __psm2_ep_poll_type
+	//if (ibv_req_notify_cq(ep->verbs_ep.recv_cq, 0)) {
+	//	_HFI_ERROR("Can't request RQ events from %s: %s\n",
+	//					ep->dev_name, strerror(errno));
+	//	goto fail;
+	//}
+
+	// TBD - should we pick an EQ number
+	// we use ep as the cq_context (would be in callbacks if any)
+	// we will never have more than hfi_num_recv_wqes+HFI_TF_NFLOWS
+	// inflight WQEs
+	// so CQ only needs a little headroom to be safe (1000)
+	// HFI_TF_NFLOWS (32) limits receiver side concurrent tidflows (aka inbound
+	// RDMA w/immed).
+	// For USER RC Eager we can have num_recv_wqes/FRACTION per QP
+	// in which case theoretical need could be huge.  We add 4000 as a
+	// swag to cover most cases and user can always tune higher as needed
+	if (! ep->hfi_num_recv_cqes) {
+		ep->hfi_num_recv_cqes = ep->hfi_num_recv_wqes+HFI_TF_NFLOWS+1000;
+		if ((ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC)
+			ep->hfi_num_recv_cqes += 4000;
+	}
+	ep->verbs_ep.recv_cq = ibv_create_cq(ep->verbs_ep.context,
+						 ep->hfi_num_recv_cqes,
+						 (void*)ep,  ep->verbs_ep.recv_comp_channel, 0);
+	if (! ep->verbs_ep.recv_cq) {
+		_HFI_ERROR( "Unable to create recv CQ of size %u on %s: %s\n",
+					ep->hfi_num_recv_cqes, ep->dev_name,
+					strerror(errno));
+		goto fail;
+	}
+
+	ep->verbs_ep.qp = ud_qp_create(ep);
+	if (! ep->verbs_ep.qp) {
+		_HFI_ERROR( "Unable to create UD QP on %s\n", ep->dev_name);
+		goto fail;
+	}
+
+	// rest of resources initialized by __psm2_ep_initialize_queues after we
+	// have processed PSM3_MTU configuration
+	return PSM2_OK;
+
+fail:
+	__psm2_ep_free_verbs(ep);
+	return PSM2_INTERNAL_ERR;
+}
+
+// ep->mtu is now max PSM payload, not including headers and perhaps decreased
+// via PSM3_MTU
+// initialize the buffer pools and move the UD QP to RTS
+psm2_error_t
+__psm2_ep_initialize_queues(psm2_ep_t ep)
+{
+
+	if (PSM2_OK != psm_verbs_alloc_send_pool(ep, ep->verbs_ep.pd, &ep->verbs_ep.send_pool, 
+				// save 1 send WQE just to be paranoid (should be unnecessary)
+				min(ep->hfi_num_send_wqes, ep->verbs_ep.qp_cap.max_send_wr-1),
+				// want to end up with multiple of cache line (64)
+				// ep->mtu+MAX_PSM_HEADER will be the power-of-2 verbs MTU
+				// be conservative (+BUFFER_HEADROOM)
+				ep->mtu + MAX_PSM_HEADER + BUFFER_HEADROOM
+		)) {
+		_HFI_ERROR( "Unable to allocate UD send buffer pool\n");
+		goto fail;
+	}
+	if (PSM2_OK != psm_verbs_init_send_allocator(&ep->verbs_ep.send_allocator, 
+					&ep->verbs_ep.send_pool)) {
+		_HFI_ERROR( "Unable to init UD send buffer allocator\n");
+		goto fail;
+	}
+
+	ep->verbs_ep.send_reap_thresh = min(ep->hfi_send_reap_thresh, ep->verbs_ep.send_pool.send_total/2);
+	_HFI_PRDBG("reaping when %u posted.\n", ep->verbs_ep.send_reap_thresh);
+
+	if (PSM2_OK != psm_verbs_alloc_recv_pool(ep, ep->verbs_ep.qp, &ep->verbs_ep.recv_pool, 
+				min(ep->hfi_num_recv_wqes, ep->verbs_ep.qp_cap.max_recv_wr),
+				// want to end up with multiple of cache line (64)
+				// ep->mtu+MAX_PSM_HEADER will be the power-of-2 verbs MTU
+				// be conservative (+BUFFER_HEADROOM)
+				ep->mtu + MAX_PSM_HEADER + BUFFER_HEADROOM
+		)) {
+		_HFI_ERROR( "Unable to allocate UD recv buffer pool\n");
+		goto fail;
+	}
+
+	if (PSM2_OK != modify_ud_qp_to_init(ep, ep->verbs_ep.qp)) {
+		goto fail;
+	}
+
+	if (PSM2_OK != __psm2_ep_verbs_prepost_recv(&ep->verbs_ep.recv_pool)) {
+		_HFI_ERROR( "Unable to prepost recv buffers on QP for %s port %u\n", ep->dev_name, ep->portnum);
+		goto fail;
+	}
+
+	// move QP to RTR and RTS
+	if(PSM2_OK != modify_ud_qp_to_rtr(ep, ep->verbs_ep.qp)) {
+		goto fail;
+	}
+	if(PSM2_OK != modify_ud_qp_to_rts(ep, ep->verbs_ep.qp)) {
+		goto fail;
+	}
+	_HFI_PRDBG("created QP %p (%u)\n", ep->verbs_ep.qp, ep->verbs_ep.qp->qp_num);
+	return PSM2_OK;
+
+fail:
+	psm_verbs_free_send_pool(&ep->verbs_ep.send_pool);
+	psm_verbs_free_recv_pool(&ep->verbs_ep.recv_pool);
+	return PSM2_INTERNAL_ERR;
+}
+
+int __psm2_ep_poll_type(int poll_type, psm2_ep_t ep)
+{
+	//if (poll_type == PSMI_HAL_POLL_TYPE_URGENT) {
+	if (poll_type) {
+		// set for event on solicited recv
+		_HFI_PRDBG("enable solicited event\n");
+		if (0 != ibv_req_notify_cq(ep->verbs_ep.recv_cq, 1)) {
+			_HFI_ERROR("Can't request solicited RQ events on %s: %s\n",
+							ep->dev_name, strerror(errno));
+			return -1;
+		}
+#if 0
+	} else if (poll_type == PSMI_HAL_POLL_TYPE_ANYRCV) {
+		// set for event on all recv completions
+		psmi_assert_always(0);	// not used by PSM
+		if (0 != ibv_req_notify_cq(ep->verbs_ep.recv_cq, 0)) {
+			_HFI_ERROR("Can't request all RQ events on %s: %s\n",
+							ep->dev_name, strerror(errno));
+			return -1;
+		}
+#endif
+	} else {
+		// no events for solicited or unsolicited recv
+		_HFI_PRDBG("disable solicited event - noop\n");
+		// this is only done once during PSM shutdown of rcvthread.
+		// Verbs events are one-shots.  No way to disable.  However once
+		// PSM stops rcvthread shortly after this call, no one will be
+		// polling for these events so worst case only 1 additional event
+		// occurs and does not get reenabled.
+	}
+	return 0;
+}
+
+// free resources in the ep->verbs_ep portion of the ep
+void __psm2_ep_free_verbs(psm2_ep_t ep)
+{
+	if (ep->verbs_ep.qp) {
+		ibv_destroy_qp(ep->verbs_ep.qp);
+		ep->verbs_ep.qp = NULL;
+	}
+	psm_verbs_free_send_pool(&ep->verbs_ep.send_pool);
+	psm_verbs_free_recv_pool(&ep->verbs_ep.recv_pool);
+	if (ep->verbs_ep.recv_cq) {
+		ibv_destroy_cq(ep->verbs_ep.recv_cq);
+		ep->verbs_ep.recv_cq = NULL;
+	}
+	if (ep->verbs_ep.recv_comp_channel) {
+		ibv_destroy_comp_channel(ep->verbs_ep.recv_comp_channel);
+		ep->verbs_ep.recv_comp_channel = NULL;
+	}
+
+	if (ep->verbs_ep.send_cq) {
+		ibv_destroy_cq(ep->verbs_ep.send_cq);
+		ep->verbs_ep.send_cq = NULL;
+	}
+	if (ep->verbs_ep.pd) {
+		ibv_dealloc_pd(ep->verbs_ep.pd);
+		ep->verbs_ep.pd = NULL;
+	}
+#ifdef RNDV_MOD
+	if (ep->verbs_ep.rv) {
+		if (IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode)) {
+			deregister_rv_conn_stats(ep);
+			deregister_rv_event_stats(ep);
+		}
+		__psm2_rv_close(ep->verbs_ep.rv);
+		ep->verbs_ep.rv = NULL;
+	}
+#endif
+	if (ep->verbs_ep.context) {
+		ibv_close_device(ep->verbs_ep.context);
+		ep->verbs_ep.context = NULL;
+	}
+	if (ep->dev_name) {
+		psmi_free((char*)ep->dev_name);
+		ep->dev_name = NULL;
+	}
+}
+
+// ep argument is only for calloc to associate memory statistics with ep
+// do NOT use ep->verbs_ep.*_pool in this function; use the pool argument
+// to access buffering fields.  This function will be called for other pools
+// which are tracked in other structures but still part of the ep's memory stats
+psm2_error_t psm_verbs_alloc_send_pool(psm2_ep_t ep, struct ibv_pd *pd,
+			psm2_verbs_send_pool_t pool,
+			uint32_t send_total, uint32_t send_buffer_size)
+{
+	memset(pool,0,sizeof(*pool));
+
+	// use what we got, but make sure it's a multiple of the coalesce count
+	// don't grow beyond requested, otherwise we could exceed CQ sizes
+	pool->send_total = ROUNDDOWN(send_total, VERBS_SEND_CQ_COALLESCE);
+
+	if (send_total && send_buffer_size) {
+		// allocate send buffers
+		int i;
+		pool->send_buffer_size = send_buffer_size;
+		pool->send_num_free = pool->send_total;
+		pool->send_buffers = (uint8_t *)psmi_memalign(ep, NETWORK_BUFFERS, CPU_PAGE_ALIGN,
+													 pool->send_total*pool->send_buffer_size);
+		if (! pool->send_buffers) {
+			_HFI_ERROR( "can't alloc send buffers");
+			goto fail;
+		}
+
+		_HFI_PRDBG("send pool: buffers: %p size %u\n",  pool->send_buffers, pool->send_buffer_size);
+		pool->send_bufs = (struct verbs_sbuf *)psmi_calloc(ep, NETWORK_BUFFERS,
+							 pool->send_total*sizeof(struct verbs_sbuf), 1);
+		if (! pool->send_bufs) {
+			_HFI_ERROR("can't alloc send buffers ctrl");
+			goto fail;
+		}
+		// prepare free list, put lower numbered buffers at head of free list
+		for (i=pool->send_total-1; i >= 0; i--) {
+			pool->send_bufs[i].buffer = &(pool->send_buffers[send_buffer_start(pool, i)]);
+			pool->send_bufs[i].next = pool->send_free;
+			pool->send_free = &(pool->send_bufs[i]);
+		}
+		_HFI_PRDBG("%u Send Buffers of %u bytes each allocated at %p.\n", pool->send_total, pool->send_buffer_size,
+			pool->send_buffers);
+
+		// UD doesn't support RDMA, so we just need local NIC to be able to
+		// access our buffers with kernel bypass (IBV_ACCESS_LOCAL_WRITE)
+		// technically we probably don't need LOCAL_WRITE for send buffers
+		pool->send_buffer_mr = ibv_reg_mr(
+						pd, pool->send_buffers,
+						pool->send_total*pool->send_buffer_size,
+						IBV_ACCESS_LOCAL_WRITE);
+		if (! pool->send_buffer_mr) {
+			_HFI_ERROR( "Unable to alloc send buffer MR on %s: %s\n",
+							ep->dev_name, strerror(errno));
+			goto fail;
+		}
+	}
+	return PSM2_OK;
+
+fail:
+	psm_verbs_free_send_pool(pool);
+	return PSM2_INTERNAL_ERR;
+}
+
+extern psm2_error_t psm_verbs_init_send_allocator(
+			psm2_verbs_send_allocator_t allocator,
+			psm2_verbs_send_pool_t pool)
+{
+
+	memset(allocator,0,sizeof(*allocator));
+	allocator->pool = pool;
+	allocator->send_num_til_coallesce = VERBS_SEND_CQ_COALLESCE;
+	return PSM2_OK;
+}
+
+
+// ep argument is only for calloc to associate memory statistics with ep
+// do NOT use ep->verbs_ep.*_pool in this function; use the pool argument
+// to access buffering fields.  This function will be called for other pools
+// which are tracked in other structures but still part of the ep's memory stats
+// For RC QPs receiving only RDMA Write with immediate, no buffer space is
+// needed.  Caller will specify recv_buffer_size==0 with a recv_total.
+psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp,
+			psm2_verbs_recv_pool_t pool,
+			uint32_t recv_total, uint32_t recv_buffer_size)
+{
+	memset(pool,0,sizeof(*pool));
+
+	pool->qp = qp;	// save a reference
+	pool->ep = ep;
+	pool->recv_total = recv_total;
+
+	if (recv_total ) {
+		int i;
+		if (recv_buffer_size) {
+			// allocate recv buffers
+			pool->recv_buffer_size = recv_buffer_size;
+			// beginning of UD QP Recv Buf always consumed with space for IB GRH
+			if (qp->qp_type == IBV_QPT_UD) {
+				// round up UD_ADDITION (40) to multiple of 64 for better
+				// cache alignment of buffers
+				pool->recv_buffer_size += ROUNDUP(UD_ADDITION, 64);
+				pool->addition = UD_ADDITION;
+			}
+			pool->recv_buffers = (uint8_t *)psmi_calloc(ep, NETWORK_BUFFERS,
+							 pool->recv_total*pool->recv_buffer_size, 1);
+			if (! pool->recv_buffers) {
+				_HFI_ERROR( "can't alloc recv buffers");
+				goto fail;
+			}
+			//printf("recv pool: buffers: %p size %u\n",  pool->recv_buffers, pool->recv_buffer_size);
+			pool->recv_bufs = (struct verbs_rbuf *)psmi_calloc(ep, NETWORK_BUFFERS,
+								 pool->recv_total*sizeof(struct verbs_rbuf), 1);
+			if (! pool->recv_bufs) {
+				_HFI_ERROR("can't alloc recv buffers ctrl");
+				goto fail;
+			}
+			// prepare rbuf handles for use as wr_id
+			for (i=0; i<pool->recv_total; i++) {
+				pool->recv_bufs[i].buffer = &(pool->recv_buffers[recv_buffer_start(pool, i)]);
+				pool->recv_bufs[i].pool = pool;
+			}
+			_HFI_PRDBG("%u Recv Buffers of %u bytes each allocated at %p.\n", pool->recv_total, pool->recv_buffer_size,
+				pool->recv_buffers);
+
+			// UD doesn't support RDMA, so we just need local NIC to be able to
+			// access our buffers with kernel bypass (IBV_ACCESS_LOCAL_WRITE)
+			pool->recv_buffer_mr = ibv_reg_mr(
+							qp->pd, pool->recv_buffers,
+							pool->recv_total*pool->recv_buffer_size,
+							IBV_ACCESS_LOCAL_WRITE);
+			if (! pool->recv_buffer_mr) {
+				_HFI_ERROR( "Unable to alloc recv buffer MR on %s: %s\n",
+								ep->dev_name, strerror(errno));
+				goto fail;
+			}
+		} else {
+			// we want a pool for RDMA Write w/immediate recv.  No buffers
+			psmi_assert(qp->qp_type != IBV_QPT_UD);
+			// we use exactly 1 rbuf so wr_id can lead us to pool and qp
+			pool->recv_bufs = (struct verbs_rbuf *)psmi_calloc(ep, NETWORK_BUFFERS,
+							 sizeof(struct verbs_rbuf), 1);
+			if (! pool->recv_bufs) {
+				_HFI_ERROR("can't alloc recv buffers ctrl");
+				goto fail;
+			}
+			// prepare rbuf handle for use as wr_id
+			pool->recv_bufs->pool = pool;
+			_HFI_PRDBG("%u Recv Buffers of %u bytes each allocated.\n", pool->recv_total, pool->recv_buffer_size);
+		}
+#if VERBS_RECV_QP_COALLESCE > 1
+		// prebuild as much as we can
+		for (i=0; i < VERBS_RECV_QP_COALLESCE; i++ ) {
+			struct ibv_recv_wr *wr = &(pool->recv_wr_list[i]);
+			wr->next = &(pool->recv_wr_list[i+1]);
+			if (recv_buffer_size) {
+				struct ibv_sge *list = &(pool->recv_sge_list[i]);
+				wr->sg_list = list;
+				list->length = pool->recv_buffer_size;
+				list->lkey = pool->recv_buffer_mr->lkey;
+				wr->num_sge = 1;	// size of sg_list
+			} else {
+				wr->sg_list = NULL;
+				wr->num_sge = 0;	// size of sg_list
+			}
+		}
+		// fixup end of list
+		pool->recv_wr_list[VERBS_RECV_QP_COALLESCE-1].next = NULL;
+		pool->next_recv_wqe = 0;
+#endif
+	}
+	return PSM2_OK;
+
+fail:
+	psm_verbs_free_recv_pool(pool);
+	return PSM2_INTERNAL_ERR;
+}
+
+void psm_verbs_free_send_pool(psm2_verbs_send_pool_t pool)
+{
+	if (pool->send_buffer_mr) {
+		ibv_dereg_mr(pool->send_buffer_mr);
+		pool->send_buffer_mr = NULL;
+	}
+	if (pool->send_bufs) {
+		psmi_free(pool->send_bufs);
+		pool->send_bufs = NULL;
+	}
+	if (pool->send_buffers) {
+		psmi_free(pool->send_buffers);
+		pool->send_buffers = NULL;
+	}
+	memset(pool,0,sizeof(*pool));	// in case anyone looks at other integers
+}
+
+// this is not allowed to access pool->qp, it may already be destroyed
+void psm_verbs_free_recv_pool(psm2_verbs_recv_pool_t pool)
+{
+	if (pool->recv_buffer_mr) {
+		ibv_dereg_mr(pool->recv_buffer_mr);
+		pool->recv_buffer_mr = NULL;
+	}
+	if (pool->recv_bufs) {
+		psmi_free(pool->recv_bufs);
+		pool->recv_bufs = NULL;
+	}
+	if (pool->recv_buffers) {
+		psmi_free(pool->recv_buffers);
+		pool->recv_buffers = NULL;
+	}
+	memset(pool,0,sizeof(*pool));	// in case anyone looks at other integers
+}
+
+// the allocator tries to reallocate recently freed send buffers
+// so we can tend to allocate a small set of buffers
+// to improve CPU, MMU and NIC MMU hit rates
+sbuf_t __psm2_ep_verbs_alloc_sbuf(psm2_verbs_send_allocator_t allocator)
+{
+	psm2_verbs_send_pool_t pool = allocator->pool;
+	sbuf_t sbuf = pool->send_free;
+	if_pt (sbuf) {
+		// take off head of free list
+		pool->send_free = sbuf->next;
+		pool->send_num_free--;
+		sbuf->next = NULL;
+		// keep a list of allocated buffers in order at alloc_head
+		// and put this one at the alloc_end of the list
+		if_pf (! allocator->send_alloc_head)	// unlikely when more than 1 posted
+			allocator->send_alloc_head = sbuf;
+		if_pt (allocator->send_alloc_end)	// likely when more than 1 posted
+			allocator->send_alloc_end->next = sbuf;
+		allocator->send_alloc_end = sbuf;
+		sbuf->allocator = allocator;
+	}
+	return sbuf;
+}
+
+// buffers must be freed in order; the fact that the SQ reports completions in
+// the same order as send WQEs ensures this.
+// this will free count buffers, with buf being the last one freed
+void __psm2_ep_verbs_free_sbuf(
+			sbuf_t buf, uint32_t count)
+{
+	psm2_verbs_send_allocator_t allocator = buf->allocator;
+	psm2_verbs_send_pool_t pool = allocator->pool;
+	sbuf_t b;
+	do {
+		// take 1st off allocated list
+		b = allocator->send_alloc_head;
+		allocator->send_alloc_head = b->next;
+		if_pf (allocator->send_alloc_end == b)	// unlikely last outstanding
+			allocator->send_alloc_end = NULL;
+		// put at head of free list
+		b->next =  pool->send_free;
+		pool->send_free = b;
+		pool->send_num_free++;
+#ifdef UD_DEBUG
+		printf("freed: %u num free: %u\n", 
+			(uint32_t)send_buffer_index(pool, b->buffer),
+			pool->send_num_free);
+#endif
+	} while (--count && b != buf);
+	// normally we will find buf just as we exhaust count (the coalesce amount).
+	// however when send error CQEs occur (such as flush) we may find fewer
+	// than count inflight ahead of buf
+	//psmi_assert_always(b == buf && count == 0);
+	psmi_assert_always(b == buf);
+}
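+/*
+ * Illustrative lifecycle sketch (not part of the upstream source): the send
+ * path allocates an sbuf, posts it with the sbuf pointer encoded in wr_id,
+ * and frees it when the matching CQE is reaped, as
+ * psm2_verbs_completion_update() does below:
+ *
+ *   sbuf_t s = __psm2_ep_verbs_alloc_sbuf(&ep->verbs_ep.send_allocator);
+ *   if (s) {
+ *           // fill sbuf_to_buffer(s) and post a send WQE referencing it ...
+ *           // later, on the send CQE carrying that wr_id:
+ *           __psm2_ep_verbs_free_sbuf((sbuf_t)(wr_id & ~VERBS_SQ_WR_ID_MASK),
+ *                                     VERBS_SEND_CQ_COALLESCE);
+ *   }
+ */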
+
+psm2_error_t __psm2_ep_verbs_post_recv(
+				rbuf_t buf)
+{
+	psm2_verbs_recv_pool_t pool = buf->pool;
+#if VERBS_RECV_QP_COALLESCE > 1
+	struct ibv_recv_wr *wr;
+#else
+	struct ibv_recv_wr wr;
+	struct ibv_sge list;
+#endif
+	struct ibv_recv_wr *bad_wr;
+
+	// only RC QPs doing just RDMA Write can have a zero buffer size
+	if (pool->recv_buffer_size) {
+		uint32_t index = recv_buffer_index(pool, rbuf_to_buffer(buf));
+		// make sure it's a buffer in our pool
+		psmi_assert_always(index < pool->recv_total);
+		// assert on index covers these 2 asserts
+		//psmi_assert_always(rbuf_to_buffer(buf) >= pool->recv_buffers);
+		//psmi_assert_always(rbuf_to_buffer(buf) <= pool->recv_buffers +
+		//						 pool->recv_total)*pool->recv_buffer_size);
+		// make sure buf is exactly at the start of a buffer in our pool
+		psmi_assert_always(rbuf_to_buffer(buf) == &(pool->recv_buffers[recv_buffer_start(pool, index)]));
+
+#if VERBS_RECV_QP_COALLESCE > 1
+		// put buf in wr at end of list
+		wr = &(pool->recv_wr_list[pool->next_recv_wqe]);
+		psmi_assert(wr->sg_list == &(pool->recv_sge_list[pool->next_recv_wqe]));
+		wr->sg_list->addr = (uintptr_t)rbuf_to_buffer(buf);
+		wr->wr_id = (uintptr_t)buf;	// we'll get this back in completion
+#ifdef PSM_FI
+		if_pf(PSMI_FAULTINJ_ENABLED_EP(pool->ep)) {
+			PSMI_FAULTINJ_STATIC_DECL(fi_rq_lkey, "rq_lkey",
+					"post UD "
+					"or RC "
+					"RQ WQE with bad lkey",
+					0, IPS_FAULTINJ_RQ_LKEY);
+			if_pf(PSMI_FAULTINJ_IS_FAULT(fi_rq_lkey, " QP %u", pool->qp->qp_num))
+				wr->sg_list->lkey = 55;
+		} else
+			wr->sg_list->lkey = pool->recv_buffer_mr->lkey;
+#endif // PSM_FI
+		if_pf (++pool->next_recv_wqe >= VERBS_RECV_QP_COALLESCE) {
+			// we have a batch ready to post
+			if_pf (ibv_post_recv(pool->qp, pool->recv_wr_list, &bad_wr)) {
+				_HFI_ERROR("failed to post RQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno));
+				return PSM2_INTERNAL_ERR;
+			}
+			//_HFI_VDBG("posted RQ, including buffer %u\n", index);
+			pool->next_recv_wqe = 0;
+		} else {
+			//_HFI_VDBG("preped RQE, buffer %u\n", index);
+		}
+#else
+		list.addr = (uintptr_t)rbuf_to_buffer(buf);
+		list.length = pool->recv_buffer_size;
+		list.lkey = pool->recv_buffer_mr->lkey;
+#ifdef PSM_FI
+		if_pf(PSMI_FAULTINJ_ENABLED_EP(pool->ep)) {
+			PSMI_FAULTINJ_STATIC_DECL(fi_rq_lkey, "rq_lkey",
+					"post UD "
+					"or RC "
+					"RQ WQE with bad lkey",
+					0, IPS_FAULTINJ_RQ_LKEY);
+			if_pf(PSMI_FAULTINJ_IS_FAULT(fi_rq_lkey, " QP %u", pool->qp->qp_num))
+				list.lkey = 55;
+		}
+#endif // PSM_FI
+		wr.next = NULL;	// just post 1
+		wr.wr_id = (uintptr_t)buf;	// we'll get this back in completion
+		wr.sg_list = &list;
+		wr.num_sge = 1;	// size of sg_list
+
+		if_pf (ibv_post_recv(pool->qp, &wr, &bad_wr)) {
+			_HFI_ERROR("failed to post RQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno));
+			return PSM2_INTERNAL_ERR;
+		}
+		//_HFI_VDBG("posted RQ, buffer %u\n", index);
+#endif
+	} else {
+#if VERBS_RECV_QP_COALLESCE > 1
+		// put buf in wr at end of list
+		wr = &(pool->recv_wr_list[pool->next_recv_wqe]);
+		psmi_assert(wr->sg_list == NULL);
+		wr->wr_id = (uintptr_t)buf;	// we'll get this back in completion
+		if_pf (++pool->next_recv_wqe >= VERBS_RECV_QP_COALLESCE) {
+			// we have a batch ready to post
+			if_pf (ibv_post_recv(pool->qp, pool->recv_wr_list, &bad_wr)) {
+				_HFI_ERROR("failed to post RQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno));
+				return PSM2_INTERNAL_ERR;
+			}
+			//_HFI_VDBG("posted RQ\n");
+			pool->next_recv_wqe = 0;
+		} else {
+			//_HFI_VDBG("preped RQE\n");
+		}
+#else
+		wr.next = NULL;	// just post 1
+		wr.wr_id = (uintptr_t)buf;	// we'll get this back in completion
+		wr.sg_list = NULL;
+		wr.num_sge = 0;	// size of sg_list
+
+		if_pf (ibv_post_recv(pool->qp, &wr, &bad_wr)) {
+		_HFI_ERROR("failed to post RQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno));
+			return PSM2_INTERNAL_ERR;
+		}
+		//_HFI_VDBG("posted RQ\n");
+#endif
+	}
+	return PSM2_OK;
+}
+
+psm2_error_t __psm2_ep_verbs_prepost_recv(
+						psm2_verbs_recv_pool_t pool)
+{
+	int i;
+
+	if (! pool->recv_total)
+		return PSM2_INTERNAL_ERR;
+	// prepare RQ
+	for (i=0; i< pool->recv_total; i++) {
+		rbuf_t buf;
+		if (pool->recv_buffer_size)
+			buf = &(pool->recv_bufs[i]);
+		else
+			buf = pool->recv_bufs;	// only 1, just to find pool and qp
+		if (PSM2_OK != __psm2_ep_verbs_post_recv(buf)) {
+			_HFI_ERROR( "Unable to post RQ on %s port %u\n", pool->ep->dev_name, pool->ep->portnum);
+			return PSM2_INTERNAL_ERR;
+		}
+	}
+	return PSM2_OK;
+}
+
+// only used when PSM3_RDMA enabled
+psm2_error_t psm2_verbs_post_rdma_write_immed(psm2_ep_t ep, struct ibv_qp *qp,
+				void *loc_buf, struct psm2_verbs_mr *loc_mr,
+				uint64_t rem_buf, uint32_t rkey,
+				size_t len, uint32_t immed, uint64_t wr_id)
+{
+	struct ibv_send_wr wr;
+	struct ibv_send_wr *bad_wr;
+	struct ibv_sge list;
+	psm2_error_t ret = PSM2_OK;
+
+	//printf("XXXX %s 0x%p %ld 0x%x\n", __FUNCTION__, loc_buf, len, loc_mr->lkey);
+	psmi_assert(IPS_PROTOEXP_FLAG_USER_RC_QP(ep->rdmamode));
+
+	list.addr = (uintptr_t)loc_buf;
+	list.length = len;
+	list.lkey = loc_mr->lkey;
+#ifdef PSM_FI
+	if_pf(PSMI_FAULTINJ_ENABLED_EP(ep)) {
+		PSMI_FAULTINJ_STATIC_DECL(fi_rc_rdma_lkey, "rc_rdma_lkey",
+				"post RC RDMA Write WQE with bad lkey",
+				0, IPS_FAULTINJ_RC_RDMA_LKEY);
+		if_pf(PSMI_FAULTINJ_IS_FAULT(fi_rc_rdma_lkey, " QP %u", qp->qp_num))
+			list.lkey = 55;
+	}
+#endif // PSM_FI
+	wr.next = NULL; // just post 1
+	psmi_assert(! (wr_id & VERBS_SQ_WR_ID_MASK));
+	wr.wr_id = wr_id | VERBS_SQ_WR_ID_RDMA_WRITE;
+	wr.sg_list = &list;
+	wr.num_sge = 1; // size of sg_list
+	wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+	wr.imm_data = immed;
+	wr.wr.rdma.remote_addr = rem_buf;
+	wr.wr.rdma.rkey = rkey;
+#ifdef PSM_FI
+	if_pf(PSMI_FAULTINJ_ENABLED_EP(ep)) {
+		PSMI_FAULTINJ_STATIC_DECL(fi_rc_rdma_rkey, "rc_rdma_rkey",
+				"post RC RDMA Write WQE with bad rkey",
+				0, IPS_FAULTINJ_RC_RDMA_RKEY);
+		if_pf(PSMI_FAULTINJ_IS_FAULT(fi_rc_rdma_rkey, " QP %u", qp->qp_num))
+			wr.wr.rdma.rkey = 55;
+	}
+#endif // PSM_FI
+	// RDMA Writes will tend to be larger and we want the completion
+	// to reflect the RDMA for a given CTS is completed
+	wr.send_flags = IBV_SEND_SIGNALED;  // get a completion
+	// no need for wr.send_flags |= IBV_SEND_SOLICITED
+	// these will be bigger sends, no need for inline
+	ep->verbs_ep.send_rdma_outstanding++;
+	if_pf (ibv_post_send(qp, &wr, &bad_wr)) {
+		if (errno != EBUSY && errno != EAGAIN && errno != ENOMEM)
+			_HFI_ERROR("failed to post RC SQ on %s port %u: %s",
+					ep->dev_name, ep->portnum, strerror(errno));
+		// caller will try again later when next send buffer freed
+		// or timer expires
+		ret = PSM2_TIMEOUT;
+		ep->verbs_ep.send_rdma_outstanding--;
+		goto done;
+	}
+	_HFI_VDBG("posted RDMA Write: from 0x%"PRIx64" to 0x%"PRIx64" len %u rkey 0x%x\n",
+		list.addr,  wr.wr.rdma.remote_addr, list.length,  wr.wr.rdma.rkey /* TBD rem QPN */ );
+#if 0
+	// we will not have many in flight at a time so
+	// normal progress calls should be sufficient
+	// no need to reap completions here
+	err = psm2_verbs_completion_update(ep);
+	if_pf (err != PSM2_OK)
+		return err;
+#endif
+done:
+	//printf("XXXX %s ret:%d\n", __FUNCTION__, ret);
+	return ret;
+}
+
+#ifdef RNDV_MOD
+psm2_error_t psm2_verbs_post_rv_rdma_write_immed(psm2_ep_t ep,
+				psm2_rv_conn_t conn,
+				void *loc_buf, struct psm2_verbs_mr *loc_mr,
+				uint64_t rem_buf, uint32_t rkey,
+				size_t len, uint32_t immed, uint64_t wr_id,
+				uint8_t *sconn_index, uint32_t *conn_count)
+{
+	psm2_error_t ret = PSM2_OK;
+
+	//printf("XXXX %s 0x%p %ld 0x%x\n", __FUNCTION__, loc_buf, len, loc_mr->lkey);
+	psmi_assert(IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode));
+
+	ep->verbs_ep.send_rdma_outstanding++;
+#ifdef PSM_FI
+	if_pf(PSMI_FAULTINJ_ENABLED_EP(ep)) {
+		PSMI_FAULTINJ_STATIC_DECL(fi_rv_rdma_len, "rv_rdma_len",
+				"post RV RDMA Write with bad len (may want RV build with RNDV_LOCAL_ERR_TEST)",
+				0, IPS_FAULTINJ_RV_RDMA_LEN);
+		if_pf(PSMI_FAULTINJ_IS_FAULT(fi_rv_rdma_len, ""))
+			len += 1000000000;
+	}
+	if_pf(PSMI_FAULTINJ_ENABLED_EP(ep)) {
+		PSMI_FAULTINJ_STATIC_DECL(fi_rv_rdma_rkey, "rv_rdma_rkey",
+				"post RV RDMA Write with bad rkey",
+				1, IPS_FAULTINJ_RV_RDMA_RKEY);
+		if_pf(PSMI_FAULTINJ_IS_FAULT(fi_rv_rdma_rkey, ""))
+			rkey = 55;
+	}
+#endif // PSM_FI
+	if (__psm2_rv_post_rdma_write_immed(ep->verbs_ep.rv, conn,
+                loc_buf, loc_mr->mr.rv_mr,
+                rem_buf, rkey,
+                len, immed, wr_id, sconn_index, conn_count)) {
+		switch (errno) {
+		case EIO:
+			// lost or failed connection
+			ret = PSM2_EPID_RV_CONNECT_ERROR;
+			break;
+		case EAGAIN:
+			// lost the connection and are recovering it
+			ret = PSM2_EPID_RV_CONNECT_RECOVERING;
+			break;
+		case ENOMEM:
+		case EBUSY:
+			// caller will try again later when next send buffer freed
+			// or timer expires
+			ret = PSM2_TIMEOUT;
+			break;
+		default:
+			ret = PSM2_INTERNAL_ERR;
+			break;
+		}
+		if (errno != EBUSY && errno != EAGAIN && errno != ENOMEM) {
+			_HFI_ERROR("failed to post RV RC SQ on %s port %u: %s",
+					ep->dev_name, ep->portnum, strerror(errno));
+			psmi_assert_always(errno != EINVAL);
+		}
+		ep->verbs_ep.send_rdma_outstanding--;
+		goto done;
+	}
+	_HFI_VDBG("posted RV RDMA Write: from 0x%"PRIx64" to 0x%"PRIx64" len %u rkey 0x%x\n",
+		(uint64_t)loc_buf,  rem_buf, (unsigned)len,  rkey /* TBD rem QPN */ );
+done:
+	//printf("XXXX %s ret:%d\n", __FUNCTION__, ret);
+	return ret;
+}
+#endif // RNDV_MOD
+
+extern int ips_protoexp_rdma_write_completion( uint64_t wr_id);
+
+// we structure this similar to ips_proto_dma_completion_update
+// this is non-blocking.  We reap what's available and then return
+psm2_error_t
+psm2_verbs_completion_update(psm2_ep_t ep)
+{
+	#define CQE_BATCH 10	// reap a few at a time, hopefully faster this way
+	//#define CQE_BATCH 8 or 18	// reap a few at a time, hopefully faster this way
+							// 18*COALLESCE > default reap threshold so we
+							// should get away with one poll_cq
+							// not sure if doing the exact math here would
+							// add clocks and hurt a bit more than approx math
+							// int batch = (send_reap_thresh/COALLESCE) + 2
+							// alloca(sizeof(ibv_wc) * batch)
+	struct ibv_wc wc[CQE_BATCH];
+	int ne;
+
+	PSMI_LOCK_ASSERT(ep->mq->progress_lock);
+	// TBD - when coalescing completions we'll tend to fall through to poll_cq
+	// this is only called when out of buffers or immediately after posting a send
+	// reduce the frequency of poll_cq by only checking once we have at least
+	// send_reap_thresh sends in flight
+	// for USE_RC this is imperfect, we can have a handful of unsignaled
+	// send WQEs on multiple RC QPs, in which case we may exceed the
+	// reap_thresh but not find any CQEs until we post more sends and
+	// hit the coalesce threshold.
+	if_pt ((! ep->verbs_ep.send_rdma_outstanding
+				 || IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode))
+		   && ep->verbs_ep.send_pool.send_num_free > ep->verbs_ep.send_pool.send_total - ep->verbs_ep.send_reap_thresh  )
+		return PSM2_OK;	// not ready to reap, return quickly
+
+	//if ( 0 != (ne = ibv_poll_cq(ep->verbs_ep.send_cq, CQE_BATCH, wc)))
+	while ( 0 != (ne = ibv_poll_cq(ep->verbs_ep.send_cq, CQE_BATCH, wc)))
+	{
+		unsigned i;
+		for (i=0; i<ne; i++) {
+			psmi_assert_always(wc[i].wr_id);
+			if_pf (wc[i].status) {
+				if (wc[i].status != IBV_WC_WR_FLUSH_ERR)
+					_HFI_ERROR("failed %s on %s port %u status: '%s' (%d) QP %u\n",
+						VERBS_SQ_WR_OP_STR(wc[i].wr_id),
+						ep->dev_name, ep->portnum,
+						ibv_wc_status_str(wc[i].status), (int)wc[i].status,
+						wc[i].qp_num);
+				// For user space RC QP, the QP is now in QPS_ERROR and we
+				// need to reset (or replace) and reconnect it.
+				// Upcoming async event will cause us to stop.
+				// Users wanting reliability for RDMA should use RV.
+				if (VERBS_SQ_WR_OP(wc[i].wr_id) == VERBS_SQ_WR_ID_SEND)
+					__psm2_ep_verbs_free_sbuf(
+								(sbuf_t)(wc[i].wr_id & ~VERBS_SQ_WR_ID_MASK),
+								VERBS_SEND_CQ_COALLESCE);
+				continue;
+			}
+			switch (wc[i].opcode) {
+			case IBV_WC_SEND:
+				// UD sends just mean it got onto the wire and we can reuse our buf
+				// no guarantees it made it to the remote side
+				// buffer address is in wc.wr_id
+				_HFI_VDBG("send done (%u bytes) sbuf index %lu\n", wc[i].byte_len,
+					send_buffer_index(&ep->verbs_ep.send_pool, sbuf_to_buffer((sbuf_t)(wc[i].wr_id))));
+				__psm2_ep_verbs_free_sbuf(
+							(sbuf_t)(wc[i].wr_id & ~VERBS_SQ_WR_ID_MASK),
+							VERBS_SEND_CQ_COALLESCE);
+				break;
+			case IBV_WC_RDMA_WRITE:
+				ep->verbs_ep.send_rdma_outstanding--;
+				ips_protoexp_rdma_write_completion(
+							 wc[i].wr_id & ~VERBS_SQ_WR_ID_MASK);
+				break;
+			default:
+				_HFI_ERROR("unexpected send completion on %s port %u opcode %d QP %u\n",
+							ep->dev_name, ep->portnum,
+							wc[i].opcode, wc[i].qp_num);
+				break;
+			}
+		}
+#if 0
+		// this is optional, especially if using "if" above instead of while
+		if (ne < CQE_BATCH)
+			break;	// we got fewer than we asked for; we are fast enough that
+					// there probably aren't any more in the CQ, so just let our
+					// next pass reap any that appear while we were processing
+#endif
+	}
+	return PSM2_OK;
+}
+
+int verbs_get_port_index2pkey(psm2_ep_t ep, int port, int index)
+{
+	__be16 pkey;
+
+	psmi_assert_always(ep->verbs_ep.context);
+	if (0 != ibv_query_pkey(ep->verbs_ep.context, port, index, &pkey)) {
+		_HFI_ERROR( "Can't query pkey index %d on %s port %u: %s\n", index,
+				ep->dev_name, port, strerror(errno));
+		return -1;
+	}
+	_HFI_PRDBG("got pkey 0x%x on %s port %u\n", __be16_to_cpu(pkey), ep->dev_name, port);
+	return __be16_to_cpu(pkey);
+}
+
+#ifdef RNDV_MOD
+// accessor functions for cm statistics
+#define EP_STAT_FUNC(func, stat) \
+	static uint64_t func(void *context) \
+	{ \
+		psm2_ep_t ep = (psm2_ep_t)context; \
+		return ep->stat; \
+	}
+
+EP_STAT_FUNC(rv_q_depth, rv_q_depth)
+EP_STAT_FUNC(rv_reconnect_timeout, rv_reconnect_timeout)
+EP_STAT_FUNC(rv_hb_interval, rv_hb_interval)
+#undef EP_STAT_FUNC
+
+static uint64_t rv_index(void *context)
+{
+	struct psm2_verbs_ep *vep = &((psm2_ep_t)context)->verbs_ep;
+	return vep->rv_index;
+}
+
+static uint64_t rv_conn_flags(void *context)
+{
+	struct psm2_verbs_ep *vep = &((psm2_ep_t)context)->verbs_ep;
+	if (vep->rv) {
+		// this is a little sly, we know the stats processing routines will
+		// call the accessors in the order from the entries list
+		// so we use the 1st of the rv statistics accessors to get
+		// the statistics from rv into the cache structure so other accessors
+		// can simply return the relevant value
+			// we get aggregated values instead of per conn
+		(void)__psm2_rv_get_conn_stats(vep->rv, NULL, 0, &vep->rv_conn_stats);
+	}
+	return vep->rv_conn_stats.flags;
+}
+
+#define RV_CM_STAT_FUNC(func, stat) \
+	static uint64_t func(void *context) \
+	{ \
+		struct psm2_verbs_ep *vep = &((psm2_ep_t)context)->verbs_ep; \
+		return vep->rv_conn_stats.stat; \
+	}
+
+RV_CM_STAT_FUNC(rv_conn_num_conn, num_conn)
+RV_CM_STAT_FUNC(rv_conn_req_error, req_error)
+RV_CM_STAT_FUNC(rv_conn_req_recv, req_recv)
+RV_CM_STAT_FUNC(rv_conn_rep_error, rep_error)
+RV_CM_STAT_FUNC(rv_conn_rep_recv, rep_recv)
+RV_CM_STAT_FUNC(rv_conn_rtu_recv, rtu_recv)
+RV_CM_STAT_FUNC(rv_conn_established, established)
+RV_CM_STAT_FUNC(rv_conn_dreq_error, dreq_error)
+RV_CM_STAT_FUNC(rv_conn_dreq_recv, dreq_recv)
+RV_CM_STAT_FUNC(rv_conn_drep_recv, drep_recv)
+RV_CM_STAT_FUNC(rv_conn_timewait, timewait)
+RV_CM_STAT_FUNC(rv_conn_mra_recv, mra_recv)
+RV_CM_STAT_FUNC(rv_conn_rej_recv, rej_recv)
+RV_CM_STAT_FUNC(rv_conn_lap_error, lap_error)
+RV_CM_STAT_FUNC(rv_conn_lap_recv, lap_recv)
+RV_CM_STAT_FUNC(rv_conn_apr_recv, apr_recv)
+RV_CM_STAT_FUNC(rv_conn_unexp_event, unexp_event)
+RV_CM_STAT_FUNC(rv_conn_req_sent, req_sent)
+RV_CM_STAT_FUNC(rv_conn_rep_sent, rep_sent)
+RV_CM_STAT_FUNC(rv_conn_rtu_sent, rtu_sent)
+RV_CM_STAT_FUNC(rv_conn_rej_sent, rej_sent)
+RV_CM_STAT_FUNC(rv_conn_dreq_sent, dreq_sent)
+RV_CM_STAT_FUNC(rv_conn_drep_sent, drep_sent)
+//RV_CM_STAT_FUNC(rv_conn_wait_time, wait_time)
+//RV_CM_STAT_FUNC(rv_conn_resolve_time, resolve_time)
+//RV_CM_STAT_FUNC(rv_conn_connect_time, connect_time)
+//RV_CM_STAT_FUNC(rv_conn_connected_time, connected_time)
+RV_CM_STAT_FUNC(rv_conn_resolve, resolve)
+RV_CM_STAT_FUNC(rv_conn_resolve_fail, resolve_fail)
+RV_CM_STAT_FUNC(rv_conn_conn_recovery, conn_recovery)
+//RV_CM_STAT_FUNC(rv_conn_rewait_time, rewait_time)
+//RV_CM_STAT_FUNC(rv_conn_reresolve_time, reresolve_time)
+//RV_CM_STAT_FUNC(rv_conn_reconnect_time, reconnect_time)
+//RV_CM_STAT_FUNC(rv_conn_max_rewait_time, max_rewait_time)
+//RV_CM_STAT_FUNC(rv_conn_max_reresolve_time, max_reresolve_time)
+//RV_CM_STAT_FUNC(rv_conn_max_reconnect_time, max_reconnect_time)
+RV_CM_STAT_FUNC(rv_conn_reresolve, reresolve)
+RV_CM_STAT_FUNC(rv_conn_reresolve_fail, reresolve_fail)
+//RV_CM_STAT_FUNC(rv_conn_post_write, post_write)
+//RV_CM_STAT_FUNC(rv_conn_post_write_fail, post_write_fail)
+//RV_CM_STAT_FUNC(rv_conn_post_write_bytes, post_write_bytes)
+RV_CM_STAT_FUNC(rv_conn_outstand_send_write, outstand_send_write)
+//RV_CM_STAT_FUNC(rv_conn_send_write_cqe, send_write_cqe)
+//RV_CM_STAT_FUNC(rv_conn_send_write_cqe_fail, send_write_cqe_fail)
+//RV_CM_STAT_FUNC(rv_conn_recv_write_cqe, recv_write_cqe)
+//RV_CM_STAT_FUNC(rv_conn_recv_write_bytes, recv_write_bytes)
+//RV_CM_STAT_FUNC(rv_conn_recv_cqe_fail, recv_cqe_fail)
+//RV_CM_STAT_FUNC(rv_conn_post_hb, post_hb)
+//RV_CM_STAT_FUNC(rv_conn_post_hb_fail, post_hb_fail)
+//RV_CM_STAT_FUNC(rv_conn_send_hb_cqe, send_hb_cqe)
+//RV_CM_STAT_FUNC(rv_conn_send_hb_cqe_fail, send_hb_cqe_fail)
+//RV_CM_STAT_FUNC(rv_conn_recv_hb_cqe, recv_hb_cqe)
+#undef RV_CM_STAT_FUNC
+
+static void register_rv_conn_stats(psm2_ep_t ep)
+{
+	struct psm2_rv_conn_stats *ep_rv_conn_stats = &ep->verbs_ep.rv_conn_stats;
+
+	struct psmi_stats_entry entries[] = {
+		PSMI_STATS_DECL("rv_q_depth", MPSPAWN_STATS_REDUCTION_ALL,
+				rv_q_depth, NULL),
+		PSMI_STATS_DECL("rv_reconnect_timeout", MPSPAWN_STATS_REDUCTION_ALL,
+				rv_reconnect_timeout, NULL),
+		PSMI_STATS_DECL("rv_hb_interval", MPSPAWN_STATS_REDUCTION_ALL,
+				rv_hb_interval, NULL),
+		PSMI_STATS_DECL("rv_index", MPSPAWN_STATS_REDUCTION_ALL,
+				rv_index, NULL),
+
+		PSMI_STATS_DECL("rv_conn_flags", MPSPAWN_STATS_REDUCTION_ALL,
+				rv_conn_flags, NULL),
+
+		PSMI_STATS_DECL_FUNC("num_conn", rv_conn_num_conn),
+		PSMI_STATS_DECL_FUNC("req_error", rv_conn_req_error),
+		PSMI_STATS_DECL_FUNC("req_recv", rv_conn_req_recv),
+		PSMI_STATS_DECL_FUNC("rep_error", rv_conn_rep_error),
+		PSMI_STATS_DECL_FUNC("rep_recv", rv_conn_rep_recv),
+		PSMI_STATS_DECL_FUNC("rtu_recv", rv_conn_rtu_recv),
+		PSMI_STATS_DECL_FUNC("established", rv_conn_established),
+		PSMI_STATS_DECL_FUNC("dreq_error", rv_conn_dreq_error),
+		PSMI_STATS_DECL_FUNC("dreq_recv", rv_conn_dreq_recv),
+		PSMI_STATS_DECL_FUNC("drep_recv", rv_conn_drep_recv),
+		PSMI_STATS_DECL_FUNC("timewait", rv_conn_timewait),
+		PSMI_STATS_DECL_FUNC("mra_recv", rv_conn_mra_recv),
+		PSMI_STATS_DECL_FUNC("rej_recv", rv_conn_rej_recv),
+		PSMI_STATS_DECL_FUNC("lap_error", rv_conn_lap_error),
+		PSMI_STATS_DECL_FUNC("lap_recv", rv_conn_lap_recv),
+		PSMI_STATS_DECL_FUNC("apr_recv", rv_conn_apr_recv),
+		PSMI_STATS_DECL_FUNC("unexp_event", rv_conn_unexp_event),
+		PSMI_STATS_DECL_FUNC("req_sent", rv_conn_req_sent),
+		PSMI_STATS_DECL_FUNC("rep_sent", rv_conn_rep_sent),
+		PSMI_STATS_DECL_FUNC("rtu_sent", rv_conn_rtu_sent),
+		PSMI_STATS_DECL_FUNC("rej_sent", rv_conn_rej_sent),
+		PSMI_STATS_DECL_FUNC("dreq_sent", rv_conn_dreq_sent),
+		PSMI_STATS_DECL_FUNC("drep_sent", rv_conn_drep_sent),
+		PSMI_STATS_DECLU64("wait_time", (uint64_t*)&ep_rv_conn_stats->wait_time),
+		PSMI_STATS_DECLU64("resolve_time", (uint64_t*)&ep_rv_conn_stats->resolve_time),
+		PSMI_STATS_DECLU64("connect_time", (uint64_t*)&ep_rv_conn_stats->connect_time),
+		PSMI_STATS_DECLU64("connected_time", (uint64_t*)&ep_rv_conn_stats->connected_time),
+		PSMI_STATS_DECL_FUNC("resolve", rv_conn_resolve),
+		PSMI_STATS_DECL_FUNC("resolve_fail", rv_conn_resolve_fail),
+		PSMI_STATS_DECL_FUNC("conn_recovery", rv_conn_conn_recovery),
+		PSMI_STATS_DECLU64("rewait_time", (uint64_t*)&ep_rv_conn_stats->rewait_time),
+		PSMI_STATS_DECLU64("reresolve_time", (uint64_t*)&ep_rv_conn_stats->reresolve_time),
+		PSMI_STATS_DECLU64("reconnect_time", (uint64_t*)&ep_rv_conn_stats->reconnect_time),
+		PSMI_STATS_DECLU64("max_rewait_time", (uint64_t*)&ep_rv_conn_stats->max_rewait_time),
+		PSMI_STATS_DECLU64("max_reresolve_time", (uint64_t*)&ep_rv_conn_stats->max_reresolve_time),
+		PSMI_STATS_DECLU64("max_reconnect_time", (uint64_t*)&ep_rv_conn_stats->max_reconnect_time),
+		PSMI_STATS_DECL_FUNC("reresolve", rv_conn_reresolve),
+		PSMI_STATS_DECL_FUNC("reresolve_fail", rv_conn_reresolve_fail),
+		PSMI_STATS_DECLU64("post_write", (uint64_t*)&ep_rv_conn_stats->post_write),
+		PSMI_STATS_DECLU64("post_write_fail", (uint64_t*)&ep_rv_conn_stats->post_write_fail),
+		PSMI_STATS_DECLU64("post_write_bytes", (uint64_t*)&ep_rv_conn_stats->post_write_bytes),
+		PSMI_STATS_DECL_FUNC("send_write_out", rv_conn_outstand_send_write),
+		PSMI_STATS_DECLU64("send_write_cqe", (uint64_t*)&ep_rv_conn_stats->send_write_cqe),
+		PSMI_STATS_DECLU64("send_write_cqe_fail", (uint64_t*)&ep_rv_conn_stats->send_write_cqe_fail),
+
+		PSMI_STATS_DECLU64("recv_write_cqe", (uint64_t*)&ep_rv_conn_stats->recv_write_cqe),
+		PSMI_STATS_DECLU64("recv_write_bytes", (uint64_t*)&ep_rv_conn_stats->recv_write_bytes),
+		PSMI_STATS_DECLU64("recv_cqe_fail", (uint64_t*)&ep_rv_conn_stats->recv_cqe_fail),
+
+		PSMI_STATS_DECLU64("post_hb", (uint64_t*)&ep_rv_conn_stats->post_hb),
+		PSMI_STATS_DECLU64("post_hb_fail", (uint64_t*)&ep_rv_conn_stats->post_hb_fail),
+		PSMI_STATS_DECLU64("send_hb_cqe", (uint64_t*)&ep_rv_conn_stats->send_hb_cqe),
+		PSMI_STATS_DECLU64("send_hb_cqe_fail", (uint64_t*)&ep_rv_conn_stats->send_hb_cqe_fail),
+		PSMI_STATS_DECLU64("recv_hb_cqe", (uint64_t*)&ep_rv_conn_stats->recv_hb_cqe),
+	};
+
+	psmi_stats_register_type("RV_Shared_Conn_RDMA_Statistics",
+					PSMI_STATSTYPE_RV_RDMA,
+					entries,
+					PSMI_STATS_HOWMANY(entries),
+					ep->epid, ep, ep->dev_name);
+}
+
+static void deregister_rv_conn_stats(psm2_ep_t ep)
+{
+	psmi_stats_deregister_type(PSMI_STATSTYPE_RV_RDMA, ep);
+}
+
+// accessor functions for event statistics
+static uint64_t rv_send_write_cqe(void *context)
+{
+	struct psm2_verbs_ep *vep = &((psm2_ep_t)context)->verbs_ep;
+	if (vep->rv) {
+		// This is a little sly: we know the stats processing routines
+		// call the accessors in the order they appear in the entries
+		// list, so we use the 1st of the rv statistics accessors to
+		// fetch the statistics from rv into the cache structure; the
+		// other accessors can then simply return the relevant value.
+		// Note we get aggregated values instead of per-conn values.
+		(void)__psm2_rv_get_event_stats(vep->rv, &vep->rv_event_stats);
+	}
+	return vep->rv_event_stats.send_write_cqe;
+}
+
+#define RV_EVENT_STAT_FUNC(func, stat) \
+	static uint64_t func(void *context) \
+	{ \
+		struct psm2_verbs_ep *vep = &((psm2_ep_t)context)->verbs_ep; \
+		return vep->rv_event_stats.stat; \
+	}
+
+//RV_EVENT_STAT_FUNC(rv_send_write_cqe_fail, send_write_cqe_fail)
+//RV_EVENT_STAT_FUNC(rv_send_write_bytes, send_write_bytes)
+
+//RV_EVENT_STAT_FUNC(rv_recv_write_cqe, recv_write_cqe)
+//RV_EVENT_STAT_FUNC(rv_recv_write_cqe_fail, recv_write_cqe_fail)
+//RV_EVENT_STAT_FUNC(rv_recv_write_bytes, recv_write_bytes)
+#undef RV_EVENT_STAT_FUNC
+
+static void register_rv_event_stats(psm2_ep_t ep)
+{
+	struct psm2_rv_event_stats *ep_rv_event_stats = &ep->verbs_ep.rv_event_stats;
+
+	struct psmi_stats_entry entries[] = {
+		PSMI_STATS_DECL_FUNC("send_write_cqe", rv_send_write_cqe),
+		PSMI_STATS_DECLU64("send_write_cqe_fail", (uint64_t*)&ep_rv_event_stats->send_write_cqe_fail),
+		PSMI_STATS_DECLU64("send_write_bytes", (uint64_t*)&ep_rv_event_stats->send_write_bytes),
+
+		PSMI_STATS_DECLU64("recv_write_cqe", (uint64_t*)&ep_rv_event_stats->recv_write_cqe),
+		PSMI_STATS_DECLU64("recv_write_cqe_fail", (uint64_t*)&ep_rv_event_stats->recv_write_cqe_fail),
+		PSMI_STATS_DECLU64("recv_write_bytes", (uint64_t*)&ep_rv_event_stats->recv_write_bytes),
+	};
+
+	psmi_stats_register_type("RV_User_Event_Statistics",
+					PSMI_STATSTYPE_RV_EVENT,
+					entries,
+					PSMI_STATS_HOWMANY(entries),
+					ep->epid, ep, ep->dev_name);
+}
+
+static void deregister_rv_event_stats(psm2_ep_t ep)
+{
+	psmi_stats_deregister_type(PSMI_STATSTYPE_RV_EVENT, ep);
+}
+
+static psm2_error_t open_rv(psm2_ep_t ep, psm2_uuid_t const job_key)
+{
+	struct local_info loc_info = { 0 };
+
+	// we always fill in everything we might need in loc_info
+	// in some modes, some of the fields are not used by RV
+	loc_info.mr_cache_size = ep->rv_mr_cache_size;
+#ifdef PSM_CUDA
+	/* gpu_cache_size ignored unless RV_RDMA_MODE_GPU */
+	loc_info.gpu_cache_size = ep->rv_gpu_cache_size;
+#endif
+	loc_info.rdma_mode = IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode)?
+					RV_RDMA_MODE_KERNEL: RV_RDMA_MODE_USER;
+#ifdef PSM_CUDA
+	if (PSMI_IS_CUDA_ENABLED) {
+		// when Cuda is enabled we will have larger window_sz and
+		// need to upsize the caches we will use for priority MRs
+		if (ep->rdmamode & IPS_PROTOEXP_FLAG_ENABLED) {
+			// priority window_sz reg_mr for CPU
+			loc_info.rdma_mode |= RV_RDMA_MODE_UPSIZE_CPU;
+		}
+		if (psmi_parse_gpudirect()) {
+			// When GPU Direct is enabled we need a GPU Cache
+			loc_info.rdma_mode |= RV_RDMA_MODE_GPU;
+			if ((ep->rdmamode & IPS_PROTOEXP_FLAG_ENABLED)
+				&& (psmi_parse_gpudirect_send_limit()
+				|| psmi_parse_gpudirect_recv_limit())) {
+				// priority window_sz reg_mr for GPU memory
+				loc_info.rdma_mode |= RV_RDMA_MODE_UPSIZE_GPU;
+			}
+		}
+	}
+#endif
+
+	// need portnum for rdma_mode KERNEL or USER|GPU
+	loc_info.port_num = ep->portnum;
+	// the rest of loc_info is really only needed for RV_RDMA_MODE_KERNEL
+	loc_info.num_conn = ep->rv_num_conn;
+	// caller computes our local EPID, but loc_addr must == PSMI_EPID_GET_LID
+	// for what will be established as our local epid by psmi_context_open.
+	// Later, rem_addr will be compared to this and is based on
+	// PSMI_EPID_GET_LID for a remote epid.
+	if (ep->verbs_ep.port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
+		// use IPv4 addr in lgid as local address
+		loc_info.loc_addr = ep->verbs_ep.ip_addr;
+	} else {
+		loc_info.loc_addr = ep->verbs_ep.port_attr.lid;
+	}
+	loc_info.index_bits = RV_INDEX_BITS;
+	loc_info.loc_gid_index = ep->verbs_ep.lgid_index;
+	loc_info.loc_gid = ep->verbs_ep.lgid;
+	// TBD qos_class_sl
+	loc_info.job_key_len = min(RV_MAX_JOB_KEY_LEN, sizeof(psm2_uuid_t));
+	loc_info.job_key = (uint8_t*)job_key;
+	loc_info.service_id = ep->service_id;
+	loc_info.context = ep;
+	if (IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode)) {
+		// HFI_TF_NFLOWS (32) limits recv side concurrent tidflows (aka
+		// inbound RDMA); for send we never have more than
+		// hfi_num_send_rdma RDMA outstanding
+		loc_info.cq_entries = ep->hfi_num_send_rdma + HFI_TF_NFLOWS + 32;
+	}
+	loc_info.q_depth = ep->rv_q_depth;
+	loc_info.reconnect_timeout = ep->rv_reconnect_timeout;
+	loc_info.hb_interval = ep->rv_hb_interval;
+
+	ep->verbs_ep.rv = __psm2_rv_open(ep->dev_name, &loc_info);
+	if (! ep->verbs_ep.rv) {
+		return PSM2_INTERNAL_ERR;
+	}
+	// parallel psm_hal_gen1/psm_hal_inline_i.h handling HFI1_CAP_GPUDIRECT_OT
+	// for OPA psm_context.c, treats CUDA driver w/non-CUDA PSM as fatal
+#ifndef RV_CAP_GPU_DIRECT
+#ifdef PSM_CUDA
+#error "Inconsistent build.  RV_CAP_GPU_DIRECT must be defined for CUDA builds."
+#else
+// lifted from rv_user_ioctls.h
+#define RV_CAP_GPU_DIRECT (1UL << 63)
+#endif
+#endif
+	if (psmi_parse_identify()) {
+		if (loc_info.capability & RV_CAP_GPU_DIRECT)
+#ifdef PSM_CUDA
+			printf("%s %s run-time rv interface v%d.%d%s gpu v%d.%d cuda\n",
+			       hfi_get_mylabel(), hfi_ident_tag,
+			       loc_info.major_rev,
+			       loc_info.minor_rev,
+			       (loc_info.capability & RV_CAP_USER_MR)?" mr":"",
+			       loc_info.gpu_major_rev,
+			       loc_info.gpu_minor_rev);
+#else
+			printf("%s %s run-time rv interface v%d.%d%s cuda\n",
+			       hfi_get_mylabel(), hfi_ident_tag,
+			       loc_info.major_rev,
+			       loc_info.minor_rev,
+			       (loc_info.capability & RV_CAP_USER_MR)?" mr":"");
+#endif
+		else
+			printf("%s %s run-time rv interface v%d.%d%s\n",
+			       hfi_get_mylabel(), hfi_ident_tag,
+			       loc_info.major_rev,
+			       loc_info.minor_rev,
+			       (loc_info.capability & RV_CAP_USER_MR)?" mr":"");
+	}
+	if (loc_info.capability & RV_CAP_USER_MR)
+		psmi_hal_add_cap(PSM_HAL_CAP_USER_MR);
+	if (loc_info.capability & RV_CAP_EVICT)
+		psmi_hal_add_cap(PSM_HAL_CAP_EVICT);
+	if (loc_info.capability & RV_CAP_GPU_DIRECT)
+		psmi_hal_add_cap(PSM_HAL_CAP_GPUDIRECT_OT);
+	ep->verbs_ep.rv_index = loc_info.rv_index;
+	ep->rv_mr_cache_size = loc_info.mr_cache_size;
+#ifdef PSM_CUDA
+	ep->rv_gpu_cache_size = loc_info.gpu_cache_size;
+#endif
+	ep->rv_q_depth = loc_info.q_depth;
+	ep->rv_reconnect_timeout = loc_info.reconnect_timeout;
+
+	return PSM2_OK;
+}
+#endif // RNDV_MOD
+
+// initialize verbs specific statistics
+void
+__psm2_ep_initstats_verbs(psm2_ep_t ep)
+{
+#ifdef RNDV_MOD
+	if (ep->verbs_ep.rv && IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode)) {
+		// only one set of conn stats per job_dev, so
+		// no use gathering for any extra QPs we open
+		if (ep->mctxt_master == ep)
+			register_rv_conn_stats(ep);
+		register_rv_event_stats(ep);
+	}
+#endif
+
+}
+
+static psm2_error_t verbs_open_dev(psm2_ep_t ep, int unit, int port, psm2_uuid_t const job_key)
+{
+	// similar to code in ifs-all/Topology, enumerates devices and picks one
+	int i, num_of_devices;
+	struct ibv_device **dev_list = NULL;
+	struct ibv_device *ib_dev = NULL;
+	int err = PSM2_OK;
+	const char *unitpath = sysfs_unit_path(unit);
+	uint64_t hi, lo;
+	int flags;
+
+	// callers tend not to set port, 0 means any
+	if (PSM3_NIC_PORT_ANY == port)
+		port = VERBS_PORT;
+	ep->portnum = port;
+	if (! unitpath) {
+		_HFI_ERROR( "NULL sysfs unitpath for unit %d\n", unit);
+		return PSM2_INTERNAL_ERR;
+	}
+
+	char *dev_name = strrchr(unitpath, '/');
+	if (dev_name == NULL) {
+		_HFI_ERROR( "invalid sysfs unitpath for unit %d\n", unit);
+		return PSM2_INTERNAL_ERR;
+	}
+	dev_name++; // Inc past last '/'
+
+	ep->dev_name = psmi_strdup(ep, dev_name);
+	if (! ep->dev_name) {
+		_HFI_ERROR( "can't alloc devname");
+		return PSM2_INTERNAL_ERR;
+	}
+
+	dev_list = ibv_get_device_list(&num_of_devices);
+	if (num_of_devices <= 0) {
+		_HFI_ERROR(" Did not detect any RDMA devices \n");
+		_HFI_ERROR(" If device exists, check if driver is up\n");
+		err = PSM2_INTERNAL_ERR;
+		goto fail;
+	}
+	if (!dev_list) {
+		_HFI_ERROR(" Internal error, exiting.\n");
+		err = PSM2_INTERNAL_ERR;
+		goto fail;
+	}
+
+	for (i = 0; i < num_of_devices; i++) {
+		if (!strcmp(ibv_get_device_name(dev_list[i]), ep->dev_name))
+			break;
+	}
+	if (i >= num_of_devices) {
+		_HFI_ERROR("Unit Id [%d] name %s not found, number of devices is %d\n",
+				   unit, ep->dev_name, num_of_devices);
+		err = PSM2_INTERNAL_ERR;
+		goto fail;
+	}
+	ep->unit_id = unit;
+	_HFI_PRDBG("Using unit_id[%d] %s.\n", ep->unit_id, ep->dev_name);
+
+	ib_dev = dev_list[i];	// device list order may differ from unit order
+	ep->verbs_ep.context = ibv_open_device(ib_dev);
+	if (! ep->verbs_ep.context) {
+		_HFI_ERROR( "Unable to open %s: %s\n", ep->dev_name,
+						strerror(errno));
+		err = PSM2_INTERNAL_ERR;
+		goto fail;
+	} else {
+		_HFI_PRDBG("Opened %s.\n",ep->dev_name);
+	}
+	// change async events to non-blocking
+	flags = fcntl( ep->verbs_ep.context->async_fd, F_GETFL);
+	if (0 > fcntl( ep->verbs_ep.context->async_fd, F_SETFL, flags | O_NONBLOCK)) {
+		_HFI_ERROR( "Unable to change file descriptor of async events for %s: %s\n",
+					ep->dev_name, strerror(errno));
+		err = PSM2_INTERNAL_ERR;
+		goto fail;
+	}
+
+	if (ibv_query_port(ep->verbs_ep.context, ep->portnum, &ep->verbs_ep.port_attr)) {
+		_HFI_ERROR( "Unable to query port %u of %s: %s\n", ep->portnum,
+						ep->dev_name, strerror(errno));
+		err = PSM2_INTERNAL_ERR;
+		goto fail;
+	} else {
+		_HFI_PRDBG("Queried %s.\n",ep->dev_name);
+	}
+
+	if (0 != psmi_hal_get_port_subnet(ep->unit_id, ep->portnum,
+			&ep->gid_hi, &ep->gid_lo,	// effective subnet and addr in subnet
+			&ep->verbs_ep.ip_addr, &ep->verbs_ep.ip_netmask,	// if eth
+			&ep->verbs_ep.lgid_index, &hi, &lo)) {
+		_HFI_ERROR( "Unable to get subnet for port %u of %s: %s\n", ep->portnum,
+						ep->dev_name, strerror(errno));
+		err = PSM2_INTERNAL_ERR;
+		goto fail;
+	} else {
+		ep->verbs_ep.lgid.global.subnet_prefix = __cpu_to_be64(hi);
+		ep->verbs_ep.lgid.global.interface_id = __cpu_to_be64(lo);
+		_HFI_PRDBG("Subnet for port %u of %s: 0x%"PRIx64" addr 0x%"PRIx64" gid 0x%"PRIx64":0x%"PRIx64"\n",
+					ep->portnum, ep->dev_name,
+					ep->gid_hi, ep->gid_lo, hi, lo);
+	}
+
+#ifdef RNDV_MOD
+	if (IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode)
+		|| ep->mr_cache_mode == MR_CACHE_MODE_KERNEL ) {
+		// open rendezvous module for the same port as our verbs device
+		err = open_rv(ep, job_key);
+		if (err != PSM2_OK) {
+			_HFI_ERROR( "Unable to open rendezvous module for port %u of %s.\n",
+				ep->portnum, ep->dev_name);
+			// TBD - could ignore error and proceed with UD mode
+			//err = PSM2_OK;
+			err = PSM2_INTERNAL_ERR;
+			goto fail;
+		}
+		if (ep->mr_cache_mode == MR_CACHE_MODE_KERNEL
+			&& ! psmi_hal_has_cap(PSM_HAL_CAP_USER_MR)) {
+			_HFI_ERROR( "Rendezvous module lacks enable_user_mr capability.\n");
+			// TBD - could ignore error and proceed with UD mode
+			//err = PSM2_OK;
+			err = PSM2_INTERNAL_ERR;
+			goto fail;
+		}
+	}
+#endif
+
+done:
+	if (dev_list)
+		ibv_free_device_list(dev_list);
+	return err;
+
+fail:
+	if (ep->verbs_ep.context) {
+		ibv_close_device(ep->verbs_ep.context);
+		ep->verbs_ep.context = NULL;
+	}
+	if (ep->dev_name) {
+		psmi_free((char*)ep->dev_name);
+		ep->dev_name = NULL;
+	}
+	goto done;
+}
+
+static psm2_error_t
+check_port_state(psm2_ep_t ep)
+{
+	uint32_t active_mtu;
+
+	active_mtu = MTU_SIZE(ep->verbs_ep.port_attr.active_mtu);
+	if (ep->verbs_ep.port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
+		_HFI_PRDBG("running on ethernet at %d MTU\n", active_mtu);
+	} else {
+		_HFI_PRDBG( "running on %s at %d MTU\n", link_layer_str(ep->verbs_ep.port_attr.link_layer), active_mtu);
+	}
+	if (strcmp("Unknown", link_layer_str(ep->verbs_ep.port_attr.link_layer)) == 0) {
+		_HFI_ERROR( "Link layer on port %d of %s is Unknown\n", ep->portnum,
+						ep->dev_name);
+		return PSM2_INTERNAL_ERR;
+	}
+	ep->verbs_ep.link_layer = ep->verbs_ep.port_attr.link_layer;
+
+	if (ep->verbs_ep.port_attr.state != IBV_PORT_ACTIVE) {
+		_HFI_ERROR( " Port state is not active for %s port %d: %d\n",
+						ep->dev_name, ep->portnum,
+						ep->verbs_ep.port_attr.state);
+		//_HFI_ERROR( " Port number %d on %s state is %s\n",
+				//params->ib_port, ep->dev_name,
+				//portStates[ep->verbs_ep.port_attr.state]);
+		return PSM2_INTERNAL_ERR;
+	}
+
+	// compute MTU.
+	// ep->mtu is the PSM payload size.  For OPA native mode, this did not
+	// include headers as OPA allowed up to an additional 128 bytes of headers.
+	// However all UD QP payloads (including PSM headers) are
+	// counted toward MTU in UD verbs.  So need to discount by PSM header size
+	ep->mtu = active_mtu - MAX_PSM_HEADER;
+	_HFI_PRDBG("Max PSM payload (aka MTU): %u\n", ep->mtu);
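+	// e.g. (illustrative): a 4096-byte active_mtu combined with
+	// MAX_PSM_HEADER (64) leaves ep->mtu == 4032 bytes of PSM payload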
+	// TBD - *act_mtu = defined constant, we can use an eager RC message size
+	// for PSM which is larger than packet MTU
+	ep->verbs_ep.active_rate = verbs_get_rate(
+									ep->verbs_ep.port_attr.active_width,
+									ep->verbs_ep.port_attr.active_speed);
+	return PSM2_OK;
+}
+
+static struct ibv_qp* ud_qp_create(psm2_ep_t ep)
+{
+	struct ibv_qp* qp = NULL;
+
+	struct ibv_qp_init_attr attr = { 0 };
+
+	attr.qp_context = ep;	// our own pointer
+	attr.send_cq = ep->verbs_ep.send_cq;
+	attr.recv_cq = ep->verbs_ep.recv_cq;
+	// one extra WQE to be safe in case verbs needs a spare WQE
+	attr.cap.max_send_wr  = ep->hfi_num_send_wqes+1;
+	attr.cap.max_send_sge = 2;
+	attr.cap.max_inline_data = ep->hfi_imm_size;
+
+	attr.srq = NULL;
+	attr.cap.max_recv_wr  = ep->hfi_num_recv_wqes;
+	attr.cap.max_recv_sge = 1;
+
+	attr.qp_type = IBV_QPT_UD;
+
+	qp = ibv_create_qp(ep->verbs_ep.pd, &attr);
+	if (qp == NULL && errno == ENOMEM) {
+		_HFI_ERROR( "Unable to create UD QP on %s: %s\n",
+					ep->dev_name, strerror(errno));
+		_HFI_ERROR( "Requested QP size might be too big. Try reducing TX depth and/or inline size.\n");
+		_HFI_ERROR( "Requested TX depth was %u and RX depth was %u.\n",
+					ep->hfi_num_send_wqes+1, ep->hfi_num_recv_wqes);
+	}
+
+	// attr reports what we actually got; double check and react in case
+	// the HW limited our request
+	ep->verbs_ep.qp_cap = attr.cap;
+
+	// QP adjusted values due to HW limits
+	if (ep->hfi_imm_size > attr.cap.max_inline_data) {
+		_HFI_PRDBG( "Limited to inline size of %d, requested %u\n",
+			attr.cap.max_inline_data, ep->hfi_imm_size);
+	} else {
+		_HFI_PRDBG("Inline Size: %u\n", attr.cap.max_inline_data);
+	}
+	if (ep->hfi_num_send_wqes+1 > attr.cap.max_send_wr) {
+		_HFI_PRDBG( "Limited to %d SQ WQEs, requested %u\n",
+			attr.cap.max_send_wr, ep->hfi_num_send_wqes+1);
+	} else {
+		_HFI_PRDBG("SQ WQEs: %u\n", attr.cap.max_send_wr);
+	}
+	if (2 > attr.cap.max_send_sge) {
+		_HFI_PRDBG( "Limited to %d SQ SGEs\n",
+			attr.cap.max_send_sge);
+	}
+	if (ep->hfi_num_recv_wqes > attr.cap.max_recv_wr) {
+		_HFI_PRDBG( "Limited to %d RQ WQEs, requested %u\n",
+			attr.cap.max_recv_wr, ep->hfi_num_recv_wqes);
+	} else {
+		_HFI_PRDBG("RQ WQEs: %u\n", attr.cap.max_recv_wr);
+	}
+	if (1 > attr.cap.max_recv_sge) {
+		_HFI_PRDBG( "Limited to %d RQ SGEs\n",
+			attr.cap.max_recv_sge);
+	}
+
+	return qp;
+}
+
+static psm2_error_t modify_ud_qp_to_init(psm2_ep_t ep, struct ibv_qp *qp)
+{
+	struct ibv_qp_attr attr = { 0 };
+	int flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY;
+
+	attr.qp_state = IBV_QPS_INIT;
+	attr.pkey_index = ep->network_pkey_index;
+	attr.port_num =  ep->portnum;
+	attr.qkey = ep->verbs_ep.qkey;
+	//attr.qp_access_flags N/A for UD
+	//flags |= IBV_QP_ACCESS_FLAGS;
+
+	if (ibv_modify_qp(qp, &attr,flags)) {
+		_HFI_ERROR( "Failed to modify UD QP to INIT on %s: %s\n",
+					ep->dev_name, strerror(errno));
+		return PSM2_INTERNAL_ERR;
+	}
+	return PSM2_OK;
+}
+
+static psm2_error_t modify_ud_qp_to_rtr(psm2_ep_t ep,struct ibv_qp *qp)
+{
+	struct ibv_qp_attr attr = { 0 };
+	int flags = IBV_QP_STATE;
+
+	attr.qp_state = IBV_QPS_RTR;
+
+	if (ibv_modify_qp(qp, &attr, flags)) {
+		_HFI_ERROR( "Failed to modify UD QP to RTR on %s: %s\n",
+					ep->dev_name, strerror(errno));
+		return PSM2_INTERNAL_ERR;
+	}
+	return PSM2_OK;
+}
+
+static psm2_error_t modify_ud_qp_to_rts(psm2_ep_t ep, struct ibv_qp *qp)
+{
+	struct ibv_qp_attr attr = { 0 };
+	int flags = IBV_QP_STATE | IBV_QP_SQ_PSN;
+
+	attr.qp_state = IBV_QPS_RTS;
+	attr.sq_psn = 0x1234;	// doesn't really matter for UD
+
+	if (ibv_modify_qp(qp, &attr, flags)) {
+		_HFI_ERROR( "Failed to modify UD QP to RTS on %s: %s\n",
+					ep->dev_name, strerror(errno));
+		return PSM2_INTERNAL_ERR;
+	}
+	return PSM2_OK;
+}
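+// Taken together, ud_qp_create() and the three modify helpers above walk the
+// UD QP through the standard verbs state sequence INIT -> RTR -> RTS; a QP
+// must reach RTS before sends can be posted on it.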
+
+struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, struct ibv_qp_cap *cap)
+{
+	struct ibv_qp* qp = NULL;
+
+	struct ibv_qp_init_attr attr;
+	memset(&attr, 0, sizeof(struct ibv_qp_init_attr));
+
+	attr.qp_context = context;
+	attr.send_cq = ep->verbs_ep.send_cq;
+	attr.recv_cq = ep->verbs_ep.recv_cq;
+	attr.srq = NULL;
+	// one extra WQE to be safe in case verbs needs a spare WQE
+	if ((ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) {
+		// need to be prepared in case all sends posted to same RC QP, so
+		// match the number of send buffers we plan to allocate
+		attr.cap.max_send_wr  = ep->hfi_num_send_wqes+ep->hfi_num_send_rdma+1;
+		attr.cap.max_send_sge = 2;
+		// inline data helps latency and message rate for small sends
+		// Later we may explore use of
+		// send SGEs pointing to application buffers, somewhat like WFR send DMA
+		attr.cap.max_inline_data = ep->hfi_imm_size;
+		attr.cap.max_recv_wr  = ep->hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION;// TBD
+		attr.cap.max_recv_sge = 1;
+	} else {
+		// only RDMA Write w/immediate
+		attr.cap.max_send_wr  = ep->hfi_num_send_rdma+1;
+		attr.cap.max_send_sge = 1;
+		attr.cap.max_inline_data = 0;
+		// incoming Write w/immediate consumes a RQ WQE but no buffer needed
+		attr.cap.max_recv_wr  = HFI_TF_NFLOWS+1;
+		attr.cap.max_recv_sge = 0;
+	}
+
+	attr.qp_type = IBV_QPT_RC;
+
+	qp = ibv_create_qp(ep->verbs_ep.pd, &attr);
+	if (qp == NULL) {
+		_HFI_ERROR( "Unable to create RC QP on %s: %s\n",
+					ep->dev_name, strerror(errno));
+		_HFI_ERROR( "Requested QP size might be too big. Try reducing TX depth and/or inline size.\n");
+		_HFI_ERROR( "Requested TX depth was %u and RX depth was %u.\n",
+					ep->hfi_num_send_wqes+1,
+					ep->hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION);
+		return NULL;
+	}
+
+// TBD - decide whether getting too-small resources should be fatal, or
+// whether to adjust our limits to the smaller granted values
+	if ((ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) {
+		// QP adjusted values due to HW limits
+		if (ep->hfi_imm_size > attr.cap.max_inline_data) {
+			_HFI_PRDBG( "Limited to inline size of %d, requested %u\n",
+				attr.cap.max_inline_data, ep->hfi_imm_size);
+		} else {
+			_HFI_PRDBG("Inline Size: %u\n", attr.cap.max_inline_data);
+		}
+		if (ep->hfi_num_send_wqes+ep->hfi_num_send_rdma+1 > attr.cap.max_send_wr) {
+			_HFI_PRDBG( "Limited to %d SQ WQEs, requested %u\n",
+				attr.cap.max_send_wr, ep->hfi_num_send_wqes+ep->hfi_num_send_rdma+1);
+		} else {
+			_HFI_PRDBG("SQ WQEs: %u\n", attr.cap.max_send_wr);
+		}
+		if (2 > attr.cap.max_send_sge) {
+			_HFI_PRDBG( "Limited to %d SQ SGEs\n",
+				attr.cap.max_send_sge);
+		}
+		if (ep->hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION > attr.cap.max_recv_wr) {
+			_HFI_PRDBG( "Limited to %d RQ WQEs, requested %u\n",
+				attr.cap.max_recv_wr, ep->hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION);
+		} else {
+			_HFI_PRDBG("RQ WQEs: %u\n", attr.cap.max_recv_wr);
+		}
+		if (1 > attr.cap.max_recv_sge) {
+			_HFI_PRDBG( "Limited to %d RQ SGEs\n",
+				attr.cap.max_recv_sge);
+		}
+	} else {
+		// QP adjusted values due to HW limits
+		if (ep->hfi_num_send_rdma+1 > attr.cap.max_send_wr) {
+			_HFI_PRDBG( "Limited to %d SQ WQEs, requested %u\n",
+				attr.cap.max_send_wr, ep->hfi_num_send_rdma+1);
+		} else {
+			_HFI_PRDBG("SQ WQEs: %u\n", attr.cap.max_send_wr);
+		}
+		if (1 > attr.cap.max_send_sge) {
+			_HFI_PRDBG( "Limited to %d SQ SGEs\n",
+				attr.cap.max_send_sge);
+		}
+		if (HFI_TF_NFLOWS+1 > attr.cap.max_recv_wr) {
+			_HFI_PRDBG( "Limited to %d RQ WQEs, requested %u\n",
+				attr.cap.max_recv_wr, HFI_TF_NFLOWS+1);
+		} else {
+			_HFI_PRDBG("RQ WQEs: %u\n", attr.cap.max_recv_wr);
+		}
+	}
+
+	if (cap)
+		*cap = attr.cap;
+	_HFI_MMDBG("created RC QP %d\n", qp->qp_num);
+	return qp;
+}
+
+void rc_qp_destroy(struct ibv_qp* qp)
+{
+	ibv_destroy_qp(qp);
+}
+
+psm2_error_t modify_rc_qp_to_init(psm2_ep_t ep, struct ibv_qp *qp)
+{
+	struct ibv_qp_attr attr = { 0 };
+	int flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT;
+
+	attr.qp_state        = IBV_QPS_INIT;
+	attr.pkey_index = ep->network_pkey_index;
+	attr.port_num =  ep->portnum;
+
+	//attr.qkey = ep->verbs_ep.qkey;
+	//flags |= IBV_QP_QKEY;	// only allowed for UD
+	attr.qp_access_flags = 0;
+	attr.qp_access_flags |= IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE;
+	//attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC;
+	flags |= IBV_QP_ACCESS_FLAGS;
+
+	if (ibv_modify_qp(qp, &attr, flags)) {
+		_HFI_ERROR( "Failed to modify RC QP to INIT on %s: %s\n",
+					ep->dev_name, strerror(errno));
+		return PSM2_INTERNAL_ERR;
+	}
+	_HFI_MMDBG("moved %d to INIT\n", qp->qp_num);
+	return PSM2_OK;
+}
+
+// initpsn is from packet we received
+// req_attr is from REQ or REP from other side
+psm2_error_t modify_rc_qp_to_rtr(psm2_ep_t ep, struct ibv_qp *qp,
+				const struct psm_rc_qp_attr *req_attr,
+				const ips_path_rec_t *path_rec, uint32_t initpsn)
+{
+	int flags = IBV_QP_STATE;
+	struct ibv_qp_attr attr = { 0 };
+
+	attr.qp_state = IBV_QPS_RTR;
+
+	ips_path_rec_to_ah_attr(ep, path_rec, &attr.ah_attr);
+	flags |= IBV_QP_AV;
+
+	// TBD - we already factored in req vs pr to update pr, so no need
+	// for modify_rc_qp_to_rtr to repeat it
+	// pr_mtu is max PSM payload in bytes and req_attr->mtu is an IB enum
+	attr.path_mtu = MIN(opa_mtu_int_to_enum(path_rec->pr_mtu), req_attr->mtu);
+	attr.dest_qp_num = req_attr->qpn;
+	attr.rq_psn = initpsn;
+	flags |= (IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN);
+
+	_HFI_PRDBG("set max_dest_rd_atomic to %u\n", attr.max_dest_rd_atomic);
+	attr.min_rnr_timer = 12;	// TBD well known
+	flags |= (IBV_QP_MIN_RNR_TIMER | IBV_QP_MAX_DEST_RD_ATOMIC);
+
+	if (ibv_modify_qp(qp, &attr, flags)) {
+		_HFI_ERROR( "Failed to modify RC QP to RTR on %s: %s\n",
+					ep->dev_name, strerror(errno));
+		return PSM2_INTERNAL_ERR;
+	}
+	_HFI_MMDBG("moved %d to RTR\n", qp->qp_num);
+
+	return PSM2_OK;
+}
+
+// initpsn is value we sent in our req and rep
+// req_attr is from REP we received from other side
+psm2_error_t modify_rc_qp_to_rts(psm2_ep_t ep, struct ibv_qp *qp,
+				const struct psm_rc_qp_attr *req_attr, uint32_t initpsn)
+{
+	int flags = IBV_QP_STATE;
+	struct ibv_qp_attr attr = { 0 };
+
+	attr.qp_state = IBV_QPS_RTS;
+
+	attr.sq_psn = initpsn;	// value we told other side
+	flags |= IBV_QP_SQ_PSN;
+
+	_HFI_PRDBG("set max_rd_atomic to %u\n", attr.max_rd_atomic);
+	flags |=  IBV_QP_MAX_QP_RD_ATOMIC;
+
+	attr.retry_cnt = ep->hfi_qp_retry;
+	attr.rnr_retry = ep->hfi_qp_retry;	// only for eager RC QP rdmamode
+	attr.timeout = ep->hfi_qp_timeout;
+	flags |= IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_TIMEOUT;
+
+	_HFI_MMDBG("moving %d to RTS\n", qp->qp_num);
+	if (ibv_modify_qp(qp, &attr, flags)) {
+		_HFI_ERROR( "Failed to modify RC QP to RTS on %s: %s\n",
+						ep->dev_name, strerror(errno));
+		return PSM2_INTERNAL_ERR;
+	}
+	//__psm2_dump_verbs_qp(qp);
+	return PSM2_OK;
+}
+
+/******************************************************************************
+ * Try to map verbs' link layer types to a descriptive string or "Unknown"
+ ******************************************************************************/
+static const char *link_layer_str(int8_t link_layer)
+{
+    switch (link_layer) {
+
+        case IBV_LINK_LAYER_UNSPECIFIED:
+        case IBV_LINK_LAYER_INFINIBAND:
+            return "IB";
+        case IBV_LINK_LAYER_ETHERNET:
+            return "Ethernet";
+        default:
+            return "Unknown";
+    }
+}
+
+int __psm2_nonzero_gid(const union ibv_gid *gid)
+{
+	static union ibv_gid zero_gid = { { 0 } };
+
+	return memcmp(gid, &zero_gid, sizeof(*gid)) != 0;
+}
+
+char *
+__psm2_dump_gid(union ibv_gid *gid, char *buf, size_t bufsize)
+{
+	snprintf(buf, bufsize, "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:"
+	                       "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x",
+		gid->raw[0], gid->raw[1], gid->raw[2], gid->raw[3],
+		gid->raw[4], gid->raw[5], gid->raw[6], gid->raw[7],
+		gid->raw[8], gid->raw[9], gid->raw[10], gid->raw[11],
+		gid->raw[12], gid->raw[13], gid->raw[14], gid->raw[15]);
+
+	return buf;
+}
+
+void
+__psm2_dump_verbs_ep(psm2_ep_t ep, unsigned igid)
+{
+	struct psm2_verbs_ep *vep = &(ep->verbs_ep);
+	union ibv_gid gid;
+
+	printf("ib_devname = %s\n", ep->dev_name);
+	printf("qp_num     = %u\n", vep->qp->qp_num);
+	printf("GID        = ");
+	if (0 == ibv_query_gid(vep->context, ep->portnum, igid, &gid)) {
+		char buf[80];
+		printf("%s\n", __psm2_dump_gid(&gid, buf, sizeof(buf)));
+	} else {
+		printf("unavailable.\n");
+	}
+}
+
+void
+__psm2_dump_verbs_qp(struct ibv_qp *qp)
+{
+	struct ibv_qp_attr attr;
+	struct ibv_qp_init_attr init_attr;
+	int mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_CAP
+			/*| IBV_QP_RATE_LIMIT*/ ;
+	if (qp->qp_type == IBV_QPT_RC) {
+		mask |= IBV_QP_ACCESS_FLAGS | IBV_QP_AV | IBV_QP_PATH_MTU
+				| IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY
+				| IBV_QP_RQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC
+				// | IBV_QP_ALT_PATH
+				| IBV_QP_MIN_RNR_TIMER | IBV_QP_SQ_PSN
+				| IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_PATH_MIG_STATE
+				| IBV_QP_DEST_QPN;
+	} else {
+		mask |= IBV_QP_QKEY;
+	}
+	if (ibv_query_qp(qp, &attr, mask, &init_attr)) {
+			printf("unable to query QP\n");
+			return;
+	}
+	// rate_limit field not available in some versions of verbs.h
+	//printf("QP %p (%u), type %u state %u PkeyIndx %u Port %u rate %u draining %u\n",
+	//		qp, qp->qp_num, qp->qp_type, attr.qp_state, attr.pkey_index,
+	//		attr.port_num, attr.rate_limit, attr.sq_draining);
+	printf("QP %p (%u), type %u state %u PkeyIndx %u Port %u draining %u\n",
+			qp, qp->qp_num, qp->qp_type, attr.qp_state, attr.pkey_index,
+			attr.port_num, attr.sq_draining);
+	printf("  send: wr %u sge %u inline %u recv: wr %u sge %u\n",
+			attr.cap.max_send_wr, attr.cap.max_send_sge, attr.cap.max_inline_data,
+			attr.cap.max_recv_wr, attr.cap.max_recv_sge);
+	printf("  context %p send_cq %p recv_cq %p srq %p sg_sig_all %u\n",
+			init_attr.qp_context, init_attr.send_cq, init_attr.recv_cq,
+			init_attr.srq, init_attr.sq_sig_all);
+	if (qp->qp_type == IBV_QPT_RC) {
+		char buf[80];
+		printf("  mtu %u mig %u rq_psn %u sq_psn %u dest_qp %u access %u\n",
+			attr.path_mtu, attr.path_mig_state, attr.rq_psn, attr.sq_psn,
+			attr.dest_qp_num, attr.qp_access_flags);
+		printf("  max_rd_atomic %u max_dest_rd_atomic %u\n",
+			attr.max_rd_atomic, attr.max_dest_rd_atomic);
+		printf("  min_rnr_timer %u timeout %u retry_cnt %u rnr_retry %u\n",
+			attr.min_rnr_timer, attr.timeout, attr.retry_cnt, attr.rnr_retry);
+		printf("  ah_attr:  port %u dlid %u sl %u src_path_bits %u rate %u global %u\n",
+			attr.ah_attr.port_num, attr.ah_attr.dlid,
+			attr.ah_attr.sl,
+			attr.ah_attr.src_path_bits, attr.ah_attr.static_rate,
+			attr.ah_attr.is_global);
+		if (attr.ah_attr.is_global) {
+			printf("           dgid: %s\n",
+				__psm2_dump_gid(&attr.ah_attr.grh.dgid, buf, sizeof(buf)));
+			printf("           flow %u sgid_idx %u hop %u tc %u\n",
+				attr.ah_attr.grh.flow_label, attr.ah_attr.grh.sgid_index,
+				attr.ah_attr.grh.hop_limit, attr.ah_attr.grh.traffic_class);
+		}
+		printf("  alt_ah_attr:  port %u dlid %u sl %u src_path_bits %u rate %u global %u\n",
+			attr.alt_ah_attr.port_num, attr.alt_ah_attr.dlid,
+			attr.alt_ah_attr.sl,
+			attr.alt_ah_attr.src_path_bits, attr.alt_ah_attr.static_rate,
+			attr.alt_ah_attr.is_global);
+		if (attr.alt_ah_attr.is_global) {
+			printf("              dgid: %s\n",
+				__psm2_dump_gid(&attr.alt_ah_attr.grh.dgid, buf, sizeof(buf)));
+			printf("              flow %u sgid_idx %u hop %u tc %u\n",
+				attr.alt_ah_attr.grh.flow_label, attr.alt_ah_attr.grh.sgid_index,
+				attr.alt_ah_attr.grh.hop_limit, attr.alt_ah_attr.grh.traffic_class);
+		}
+		printf("  alt pkey idx %u alt port %u alt timeout %u\n",
+			attr.alt_pkey_index, attr.alt_port_num, attr.alt_timeout);
+	} else {
+			printf("qkey: 0x%x\n", attr.qkey);
+	}
+	return;
+}
+
+static enum psm_ibv_rate verbs_get_rate(uint8_t width, uint8_t speed)
+{
+	switch (width) {
+	case 1: /* 1x */
+		switch (speed) {
+		case 1: return PSM_IBV_RATE_2_5_GBPS;
+		case 2: return PSM_IBV_RATE_5_GBPS;
+		case 4: /* fall through */
+		case 8: return PSM_IBV_RATE_10_GBPS;
+		case 16: return PSM_IBV_RATE_14_GBPS;
+		case 32: return PSM_IBV_RATE_25_GBPS;
+		case 64: return PSM_IBV_RATE_50_GBPS;
+		default:
+				_HFI_ERROR( "unknown link speed 0x%x\n", speed);
+				return PSM_IBV_RATE_100_GBPS;
+		}
+	case 2: /* 4x */
+		switch (speed) {
+		case 1: return PSM_IBV_RATE_10_GBPS;
+		case 2: return PSM_IBV_RATE_20_GBPS;
+		case 4: /* fall through */
+		case 8: return PSM_IBV_RATE_40_GBPS;
+		case 16: return PSM_IBV_RATE_56_GBPS;
+		case 32: return PSM_IBV_RATE_100_GBPS;
+		case 64: return PSM_IBV_RATE_200_GBPS;
+		default:
+				_HFI_ERROR( "unknown link speed 0x%x\n", speed);
+				return PSM_IBV_RATE_100_GBPS;
+		}
+	case 4: /* 8x */
+		switch (speed) {
+		case 1: return PSM_IBV_RATE_20_GBPS;
+		case 2: return PSM_IBV_RATE_40_GBPS;
+		case 4: /* fall through */
+		case 8: return PSM_IBV_RATE_80_GBPS;
+		case 16: return PSM_IBV_RATE_112_GBPS;
+		case 32: return PSM_IBV_RATE_200_GBPS;
+		case 64: return PSM_IBV_RATE_400_GBPS;
+		default:
+				_HFI_ERROR( "unknown link speed 0x%x\n", speed);
+				return PSM_IBV_RATE_100_GBPS;
+		}
+	case 8: /* 12x */
+		switch (speed) {
+		case 1: return PSM_IBV_RATE_30_GBPS;
+		case 2: return PSM_IBV_RATE_60_GBPS;
+		case 4: /* fall through */
+		case 8: return PSM_IBV_RATE_120_GBPS;
+		case 16: return PSM_IBV_RATE_168_GBPS;
+		case 32: return PSM_IBV_RATE_300_GBPS;
+		case 64: return PSM_IBV_RATE_600_GBPS;
+		default:
+				_HFI_ERROR( "unknown link speed 0x%x\n", speed);
+				return PSM_IBV_RATE_100_GBPS;
+		}
+	case 16: /* 2x */
+		switch (speed) {
+		case 1: return PSM_IBV_RATE_5_GBPS;
+		case 2: return PSM_IBV_RATE_10_GBPS;
+		case 4: /* fall through */
+		case 8: return PSM_IBV_RATE_20_GBPS;
+		case 16: return PSM_IBV_RATE_28_GBPS;
+		case 32: return PSM_IBV_RATE_50_GBPS;
+		case 64: return PSM_IBV_RATE_100_GBPS;
+		default:
+				_HFI_ERROR( "unknown link speed 0x%x\n", speed);
+				return PSM_IBV_RATE_100_GBPS;
+		}
+	default:
+		_HFI_ERROR( "unknown link width 0x%x\n", width);
+		return PSM_IBV_RATE_100_GBPS;
+	}
+}
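+// e.g. (illustrative): verbs_get_rate(2 /* 4x */, 32 /* ~25 Gb/s per lane */)
+// returns PSM_IBV_RATE_100_GBPS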
+
+// unfortunately ibv_rate_to_mult and mult_to_ibv_rate have a bug as they
+// omit 100g rate and some others, so we create our own
+static int my_ibv_rate_to_mult(enum psm_ibv_rate rate)
+{
+	switch (rate) {
+	case PSM_IBV_RATE_2_5_GBPS: return  1;
+	case PSM_IBV_RATE_5_GBPS:   return  2;
+	case PSM_IBV_RATE_10_GBPS:  return  4;
+	case PSM_IBV_RATE_20_GBPS:  return  8;
+	case PSM_IBV_RATE_30_GBPS:  return 12;
+	case PSM_IBV_RATE_40_GBPS:  return 16;
+	case PSM_IBV_RATE_60_GBPS:  return 24;
+	case PSM_IBV_RATE_80_GBPS:  return 32;
+	case PSM_IBV_RATE_120_GBPS: return 48;
+	case PSM_IBV_RATE_14_GBPS:	return 5;
+	case PSM_IBV_RATE_56_GBPS:	return 22;
+	case PSM_IBV_RATE_112_GBPS:	return 44;
+	case PSM_IBV_RATE_168_GBPS:	return 67;
+	case PSM_IBV_RATE_25_GBPS:	return 10;
+	case PSM_IBV_RATE_100_GBPS:	return 40;
+	case PSM_IBV_RATE_200_GBPS:	return 80;
+	case PSM_IBV_RATE_300_GBPS:	return 120;
+	case PSM_IBV_RATE_28_GBPS:  return 11;
+	case PSM_IBV_RATE_50_GBPS:  return 20;
+	case PSM_IBV_RATE_400_GBPS: return 160;
+	case PSM_IBV_RATE_600_GBPS: return 240;
+	default:           return 40;
+	}
+}
+
+static enum psm_ibv_rate my_mult_to_ibv_rate(int mult)
+{
+	switch (mult) {
+	case 1:  return PSM_IBV_RATE_2_5_GBPS;
+	case 2:  return PSM_IBV_RATE_5_GBPS;
+	case 4:  return PSM_IBV_RATE_10_GBPS;
+	case 8:  return PSM_IBV_RATE_20_GBPS;
+	case 12: return PSM_IBV_RATE_30_GBPS;
+	case 16: return PSM_IBV_RATE_40_GBPS;
+	case 24: return PSM_IBV_RATE_60_GBPS;
+	case 32: return PSM_IBV_RATE_80_GBPS;
+	case 48: return PSM_IBV_RATE_120_GBPS;
+	case 5:  return PSM_IBV_RATE_14_GBPS;
+	case 22: return PSM_IBV_RATE_56_GBPS;
+	case 44: return PSM_IBV_RATE_112_GBPS;
+	case 67: return PSM_IBV_RATE_168_GBPS;
+	case 10: return PSM_IBV_RATE_25_GBPS;
+	case 40: return PSM_IBV_RATE_100_GBPS;
+	case 80: return PSM_IBV_RATE_200_GBPS;
+	case 120: return PSM_IBV_RATE_300_GBPS;
+	case 11: return PSM_IBV_RATE_28_GBPS;
+	case 20: return PSM_IBV_RATE_50_GBPS;
+	case 160: return PSM_IBV_RATE_400_GBPS;
+	case 240: return PSM_IBV_RATE_600_GBPS;
+	default: return PSM_IBV_RATE_100_GBPS;
+	}
+}
+
+
+enum psm_ibv_rate min_rate(enum psm_ibv_rate a, enum psm_ibv_rate b)
+{
+	// unfortunately the ibv_rate enum is not sorted by link rate
+	// so we must convert to "mult" to compare then convert back
+	return my_mult_to_ibv_rate(min(my_ibv_rate_to_mult(a),
+                                 my_ibv_rate_to_mult(b)));
+}
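+// e.g. (illustrative): min_rate(PSM_IBV_RATE_100_GBPS, PSM_IBV_RATE_40_GBPS)
+// compares mults 40 vs 16 and returns PSM_IBV_RATE_40_GBPS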
+
diff --git a/deps/libfabric/prov/psm3/psm3/psm_verbs_ep.h b/deps/libfabric/prov/psm3/psm3/psm_verbs_ep.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e4749b1abbe6b1c609646b312d4413e8e10dbef
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_verbs_ep.h
@@ -0,0 +1,392 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+
+#ifndef _PSMI_IN_USER_H
+#error psm_verbs_ep.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_VERBS_EP_H
+#define _PSMI_VERBS_EP_H
+
+#include <infiniband/verbs.h>
+#ifdef RNDV_MOD
+#include <psm_rndv_mod.h>
+#endif
+#include "ptl_ips/ips_path_rec.h"
+
+#define MAX_PSM_HEADER 64			// sizeof(ips_lrh) == 56, round up to 64
+
+// defaults, these are reconfigurable with:
+// PSM3_SEND_IMMEDIATE_SIZE
+// PSM3_NUM_SEND_WQES
+// PSM3_NUM_RECV_WQES
+// PSM3_QP_TIMEOUT
+// PSM3_QP_RETRY
+#define VERBS_SEND_MAX_INLINE 64	// 56 is PSM header size
+#define VERBS_SEND_QP_ENTRIES 4080	// will round down to multiple of COALLESCE
+#define VERBS_NUM_SEND_RDMA    128	// max concurrent RDMA send WQEs per NIC
+#define VERBS_RECV_QP_ENTRIES 4095	// avoid CQ overflow, CVL may be limited to 4095?
+#define VERBS_QP_TIMEOUT 536870	// in microseconds (IB timeout code 17)
+#define VERBS_QP_RETRY 7	// limit on RC QP retries for rnr or timeout
+#define VERBS_QP_MAX_RETRY 7	// max allowed by verbs for QP_RETRY
+
+// hardcoded for now
+#define VERBS_RECV_QP_FRACTION 4	// size RC QPs as 1/FRACTION of the
+									// final UD RECV QP size
+		// only ask for a completion this often.
+		// If 1, ask for completion on every send.
+#define VERBS_SEND_CQ_COALLESCE 8
+									// For USE_RC, keep this modest as we
+									// could have up to this many -1 unsignaled
+									// WQEs per QP, which may consume send bufs
+									// for quite some time if the QP is only
+									// occasionally used
+									// if we have ~100 QPs and 1000s of send
+									// buffers, this should be ok
+#define VERBS_RECV_QP_COALLESCE 16	// gather and build this many recv WQEs
+									// before post on recv Q.
+									// Reduces verbs calls
+									// if 1, post as we recv them
+#define VERBS_SEND_CQ_REAP 256	// check for completions when this many unreaped
+#define VERBS_PORT 1			// default port if not specified
+#define VERBS_RECV_CQE_BATCH 1	// how many CQEs to ask for at a time
+#define UD_ADDITION (40)		// extra bytes at start of UD recv buffer
+								// defined in verbs API to accommodate IB GRH
+#define BUFFER_HEADROOM 0		// how much extra to allocate in buffers
+								// as a paranoid headroom for use of more than
+								// intended.  Was 64, but seems we can do
+								// without it and hence make buffers better
+								// page aligned
+								// value here should be a multiple of CPU
+								// cache size
+#define CPU_PAGE_ALIGN	PSMI_PAGESIZE	// boundary to align buffer pools to
+#include "psm_verbs_mr.h"
+
+// some older distros lack some of the rates, so define our own list here
+enum psm_ibv_rate {
+	//PSM_IBV_RATE_MAX		= 0,
+	PSM_IBV_RATE_2_5_GBPS	= 2,
+	PSM_IBV_RATE_5_GBPS		= 5,
+	PSM_IBV_RATE_10_GBPS	= 3,
+	PSM_IBV_RATE_20_GBPS	= 6,
+	PSM_IBV_RATE_30_GBPS	= 4,
+	PSM_IBV_RATE_40_GBPS	= 7,
+	PSM_IBV_RATE_60_GBPS	= 8,
+	PSM_IBV_RATE_80_GBPS	= 9,
+	PSM_IBV_RATE_120_GBPS	= 10,
+	PSM_IBV_RATE_14_GBPS	= 11,
+	PSM_IBV_RATE_56_GBPS	= 12,
+	PSM_IBV_RATE_112_GBPS	= 13,
+	PSM_IBV_RATE_168_GBPS	= 14,
+	PSM_IBV_RATE_25_GBPS	= 15,
+	PSM_IBV_RATE_100_GBPS	= 16,
+	PSM_IBV_RATE_200_GBPS	= 17,
+	PSM_IBV_RATE_300_GBPS	= 18,
+	PSM_IBV_RATE_28_GBPS	= 19,
+	PSM_IBV_RATE_50_GBPS	= 20,
+	PSM_IBV_RATE_400_GBPS	= 21,
+	PSM_IBV_RATE_600_GBPS	= 22,
+};
+
+// Per IBTA the wc.opcode is undefined in error CQEs
+// so we need to save that information in the wr_id.
+// Fortunately our wr_id's are well aligned pointers so
+// we can stash the flag in the low bits of wr_id
+#define VERBS_SQ_WR_ID_SEND		0x0
+#define VERBS_SQ_WR_ID_RDMA_WRITE	0x1
+#define VERBS_SQ_WR_ID_MASK		0x1
+#define VERBS_SQ_WR_OP(wr_id)		((wr_id)&VERBS_SQ_WR_ID_MASK)
+#define VERBS_SQ_WR_OP_STR(wr_id) (VERBS_SQ_WR_OP(wr_id)?"RDMA Write":"Send")
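+// Illustrative sketch (hypothetical variable names) of tagging on post and
+// untagging on completion, which works even for error CQEs:
+//   wr.wr_id = (uintptr_t)buf | VERBS_SQ_WR_ID_RDMA_WRITE;
+//   ...
+//   if (VERBS_SQ_WR_OP(wc.wr_id) == VERBS_SQ_WR_ID_RDMA_WRITE)
+//       buf = (void *)(uintptr_t)(wc.wr_id & ~(uint64_t)VERBS_SQ_WR_ID_MASK);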
+
+struct verbs_sbuf {
+	struct verbs_sbuf *next;
+	uint8_t *buffer;
+	struct psm2_verbs_send_allocator *allocator;
+};
+typedef struct verbs_sbuf *sbuf_t;
+#define sbuf_to_buffer(buf)	((buf)->buffer)
+#define sbuf_pool(ep, buf)	((buf)->allocator->pool)
+#define sbuf_lkey(ep, buf)	(sbuf_pool(ep, buf)->send_buffer_mr->lkey)
+
+
+// when we get a CQE we need to find the pool and the QP it came from
+// (pool has a reference to the qp).
+// unfortunately, the CQE has a qp_num but not an ibv_qp pointer.  So we need
+// to keep this information here and use this structure as the wr_id for our
+// RQ WQE.
+struct verbs_rbuf {
+	uint8_t *buffer;
+	struct psm2_verbs_recv_pool *pool;
+};
+typedef struct verbs_rbuf *rbuf_t;
+#define rbuf_to_buffer(buf)	((buf)->buffer)
+#define rbuf_addition(buf) ((buf)->pool->addition)
+#define rbuf_qp(ep, buf) ((buf)->pool->qp)
+
+static inline const char *qp_type_str(struct ibv_qp *qp) {
+	return (qp->qp_type == IBV_QPT_UD)?"UD":"RC";
+}
+
+// subset of RC QP attr which we need to exchange in PSM req/rep
+// when ! defined(USE_RC), it is zeroed to keep req/rep size consistent
+// this structure is also used in REQ/REP packet format and size can't change
+// list of fields comes from IB CM for RC QP connection
+// These fields are purely information about sender:
+//		qpn, srq, target_ack_delay
+// These fields are negotiated.
+// 		mtu, responder_resources, initiator_depth
+//		Each side sends their best possible value and the receiver picks
+//		the min of its own best and the REQ/REP received
+struct psm_rc_qp_attr {
+	uint32_t qpn:24;
+	uint32_t mtu:4;  // HW MTU for RC QP
+	uint32_t srq:1;   // using SRQ
+	uint32_t resv:3;
+	uint8_t target_ack_delay:5; // 5 bits for computing timeout - TBD if need
+	uint8_t resv2:3;
+	// these control how many concurrent RDMA reads/atomics are allowed per QP
+	// the initiator of the RDMA reads must issue no more than target can handle
+	// can be 0 if we don't plan to use RDMA read
+	// behavior here is based on PSM CM approach, which differs from IB CM
+	// IB CM REP would have the result of the negotiated value
+	// for PSM CM, sender puts same values in REQ and REP
+	// receiver will use the min of its preferred value and the received value
+	// sent REQ/REP indicates what we desire to use from sender perspective
+	//   responder_resources <= local CA max_qp_rd_atom
+	//   initiator_depth <= local CA max_qp_init_rd_atom
+	// REQ/REP recipient sets values as follows:
+	//   QP max_rd_atomic = MIN(our requested initiator_depth,
+	//   							received responder_resources)
+	//   QP max_dest_rd_atomic = MIN(our requested responder_resources,
+	//   							received initiator_depth)
+	// ibv_device_attr:
+	//   CA max_qp_rd_atom - max incoming RDMA Reads (responder)
+	//   CA max_qp_init_rd_atom -max outstanding outgoing RDMA Reads (initiator)
+	// ibv_qp_attr:
+	//   QP max_dest_rd_atomic - max incoming RDMA Reads (responder)
+	//   QP max_rd_atomic - max outstanding outgoing RDMA Reads (initiator)
+	uint8_t responder_resources;
+	uint8_t initiator_depth;
+	// QKey well known
+	// starting PSN - use initpsn in req/rep
+	// retry_cnt,rnr_retry_cnt - well known
+	// pkey - already known
+	// LID, GID, SL, etc - already known, same as UD QP
+	uint8_t resv3[17];	// future expansion, keeping struct mult of 64b
+} PACK_SUFFIX;
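+// Example (hypothetical values): if our preferred initiator_depth is 4 and
+// the received REQ/REP carries responder_resources 2, the recipient sets
+// QP max_rd_atomic = MIN(4, 2) = 2; max_dest_rd_atomic is derived the same
+// way from our responder_resources and the received initiator_depth.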
+
+// pool of send buffers
+// When USE_RC we allow multiple QPs to share the same send buffer pool.
+struct psm2_verbs_send_pool {
+	// our preregistered send buffers
+	uint32_t send_buffer_size;
+	uint32_t send_total;
+	uint32_t send_num_free;
+	uint8_t *send_buffers;				// aligned buffers for use
+	struct verbs_sbuf *send_bufs;
+	struct verbs_sbuf *send_free;	// head of free list
+	struct ibv_mr *send_buffer_mr;
+};
+typedef struct psm2_verbs_send_pool *psm2_verbs_send_pool_t;
+
+// track the list of allocated (aka inflight) send buffers so we
+// can coalesce completions and still find all the completed buffers
+// For USE_RC, we need an inflight list per QP to account for the unpredictable
+// order of send CQEs from different QPs.
+struct psm2_verbs_send_allocator {
+	psm2_verbs_send_pool_t pool;		// pool we allocate from
+	struct verbs_sbuf *send_alloc_head;	// head of allocated list
+	struct verbs_sbuf *send_alloc_end;	// end of allocated list
+	uint32_t send_num_til_coallesce;
+};
+typedef struct psm2_verbs_send_allocator *psm2_verbs_send_allocator_t;
+
+// receive buffer pool
+// we use the same basic mechanisms for UD and RC QP buffer pools
+// but sizes may differ
+// when USE_RC, we need a separate recv pool per QP so we can prepost bufs.
+struct psm2_verbs_recv_pool {
+	struct ibv_qp *qp;	// secondary reference to QP these buffers are for
+	psm2_ep_t ep;
+	// our preregistered recv buffers
+	uint32_t recv_buffer_size;
+	uint32_t recv_total;
+	uint8_t *recv_buffers;
+	struct ibv_mr *recv_buffer_mr;
+	uint32_t addition;	// UD_ADDITION for UD QP, 0 for RC QP
+#if VERBS_RECV_QP_COALLESCE > 1
+			// list of ready to post WQEs and SGEs
+	struct ibv_recv_wr recv_wr_list[VERBS_RECV_QP_COALLESCE];
+	struct ibv_sge recv_sge_list[VERBS_RECV_QP_COALLESCE];
+	uint32_t next_recv_wqe;	// next index in recv_wr_list/recv_sge_list to use
+#endif
+	struct verbs_rbuf *recv_bufs;
+};
+typedef struct psm2_verbs_recv_pool *psm2_verbs_recv_pool_t;
+
+// this structure can be part of psm2_ep
+// one instance of this per local end point (NIC)
+// we will create a single PD and UD QP with related resources to
+// permit an eager data movement mechanism
+// conceptually similar to a psmi_context_t which refers to an HFI context
+// TODO - later could optimize cache hit rates by putting some of the less
+// frequently used fields in a different part of psm2_ep struct
+struct psm2_verbs_ep {
+	//struct ibv_device *ib_dev;
+	struct ibv_context *context;
+	struct ibv_port_attr port_attr;
+	struct ibv_pd	*pd;
+	struct ibv_comp_channel *recv_comp_channel;
+	union  ibv_gid lgid;  // The GID to use when sending.
+	unsigned lgid_index; 
+	struct ibv_cq	*send_cq;
+	struct ibv_cq	*recv_cq;
+	struct ibv_qp	*qp;
+	struct ibv_qp_cap qp_cap;   // capabilities of QP we got
+	uint32_t qkey;
+	uint8_t link_layer;         // IBV_LINK_LAYER_ETHERNET or other
+	uint8_t active_rate;
+	uint32_t ip_addr;           // ip_addr (valid for link_layer == Eth)
+	uint32_t ip_netmask;        // netmask (valid for link_layer == Eth)
+	struct psm2_verbs_send_pool send_pool;
+	struct psm2_verbs_send_allocator send_allocator;
+	uint32_t send_rdma_outstanding;	// number of outstanding RDMAs
+	uint32_t send_reap_thresh;	// TBD if should be here or in pool
+	struct psm2_verbs_recv_pool recv_pool;
+#if VERBS_RECV_CQE_BATCH > 1
+	struct ibv_wc recv_wc_list[VERBS_RECV_CQE_BATCH];
+	int recv_wc_count;	// number left in recv_wc_list
+	int recv_wc_next;	// next index
+#else
+	// if asked to revisit a packet we save it here
+	rbuf_t revisit_buf;
+	uint32_t revisit_payload_size;
+#endif
+#ifdef RNDV_MOD
+	psm2_rv_t rv;	// rendezvous module open handle
+	uint32_t rv_index;
+	struct psm2_rv_conn_stats rv_conn_stats;
+	struct psm2_rv_event_stats rv_event_stats;
+#endif
+};
+
+// given index, return buffer start offset within the pool
+#define send_buffer_start(pool, i) ((pool)->send_buffer_size *(i))
+// given buffer start, return index
+#define send_buffer_index(pool, buf) (((buf)-(pool)->send_buffers)/(pool)->send_buffer_size)
+
+// given index, return buffer start offset within the pool
+#define recv_buffer_start(pool, i) ((pool)->recv_buffer_size *(i))
+// given buffer start, return index
+#define recv_buffer_index(pool, buf) (((buf)-(pool)->recv_buffers)/(pool)->recv_buffer_size)
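+// e.g. (illustrative): buffer i of a send pool starts at
+//   (pool)->send_buffers + send_buffer_start(pool, i)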
+
+extern psm2_error_t __psm2_ep_open_verbs(psm2_ep_t ep, int unit, int port, psm2_uuid_t const job_key);
+extern void __psm2_ep_initstats_verbs(psm2_ep_t ep);
+extern void __psm2_ep_free_verbs(psm2_ep_t ep);
+extern psm2_error_t __psm2_ep_initialize_queues(psm2_ep_t ep);
+extern struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context,
+							struct ibv_qp_cap *cap);
+extern void rc_qp_destroy(struct ibv_qp *qp);
+extern psm2_error_t modify_rc_qp_to_init(psm2_ep_t ep, struct ibv_qp *qp);
+extern psm2_error_t modify_rc_qp_to_rtr(psm2_ep_t ep, struct ibv_qp *qp,
+				const struct psm_rc_qp_attr *req_attr,
+				const ips_path_rec_t *path_rec, uint32_t initpsn);
+extern psm2_error_t modify_rc_qp_to_rts(psm2_ep_t ep, struct ibv_qp *qp,
+				const struct psm_rc_qp_attr *req_attr, uint32_t initpsn);
+extern int __psm2_ep_poll_type(int poll_type, psm2_ep_t ep);
+extern psm2_error_t psm_verbs_alloc_send_pool(psm2_ep_t ep, struct ibv_pd *pd,
+            psm2_verbs_send_pool_t pool,
+            uint32_t send_total, uint32_t send_buffer_size);
+extern psm2_error_t psm_verbs_init_send_allocator(
+            psm2_verbs_send_allocator_t allocator,
+            psm2_verbs_send_pool_t pool);
+extern psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp,
+            psm2_verbs_recv_pool_t pool,
+            uint32_t recv_total, uint32_t recv_buffer_size);
+extern void psm_verbs_free_send_pool(psm2_verbs_send_pool_t pool);
+extern void psm_verbs_free_recv_pool(psm2_verbs_recv_pool_t pool);
+extern sbuf_t __psm2_ep_verbs_alloc_sbuf(psm2_verbs_send_allocator_t allocator);
+extern void __psm2_ep_verbs_free_sbuf(
+				sbuf_t buf, uint32_t count);
+extern psm2_error_t __psm2_ep_verbs_post_recv(
+				rbuf_t buf);
+extern psm2_error_t __psm2_ep_verbs_prepost_recv(psm2_verbs_recv_pool_t pool);
+
+extern psm2_error_t psm2_verbs_post_rdma_write_immed(psm2_ep_t ep,
+				struct ibv_qp *qp,
+				void *loc_buf, struct psm2_verbs_mr *loc_mr,
+				uint64_t rem_buf, uint32_t rkey,
+				size_t len, uint32_t immed, uint64_t wr_id);
+
+#ifdef RNDV_MOD
+extern psm2_error_t psm2_verbs_post_rv_rdma_write_immed(psm2_ep_t ep,
+				psm2_rv_conn_t conn,
+				void *loc_buf, struct psm2_verbs_mr *loc_mr,
+				uint64_t rem_buf, uint32_t rkey,
+				size_t len, uint32_t immed, uint64_t wr_id,
+				uint8_t *sconn_index, uint32_t *conn_count);
+#endif
+
+extern psm2_error_t psm2_verbs_completion_update(psm2_ep_t ep);
+
+extern int __psm2_nonzero_gid(const union ibv_gid *gid);
+extern char *__psm2_dump_gid(union ibv_gid *gid, char *buf, size_t bufsize);
+extern void __psm2_dump_verbs_qp(struct ibv_qp *qp);
+extern enum psm_ibv_rate min_rate(enum psm_ibv_rate a, enum psm_ibv_rate b);
+#ifndef UD_SAMPLE
+extern int verbs_get_port_index2pkey(psm2_ep_t ep, int port, int index);
+#endif
+#endif // _PSMI_VERBS_EP_H
diff --git a/deps/libfabric/prov/psm3/psm3/psm_verbs_mr.c b/deps/libfabric/prov/psm3/psm3/psm_verbs_mr.c
new file mode 100644
index 0000000000000000000000000000000000000000..bb77cbb3b41a03169164f2c2f56ec75a199d05a1
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_verbs_mr.c
@@ -0,0 +1,1260 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+// This performs memory registration for RDMA Rendezvous
+// It also tracks MRs in use and allows existing MRs to be shared.
+
+// in cache_mode MR_CACHE_MODE_USER, as a PoC we keep the cache overly simple.
+// This approach is only viable for
+// some microbenchmarks and simple apps.  For more complex apps the lack of
+// invalidate hooks into memory free may lead to memory corruption.
+// However such hooks are not reliably possible until the 4.17+ kernels.
+// The kernel RV module hooks into mmu_notifiers for invalidate.  These are also
+// used by hypervisors and hence are complete and reliable.
+
+#include <sys/types.h>
+#include "psm_user.h"	// pulls in psm_verbs_ep.h and psm_verbs_mr.h
+#ifdef RNDV_MOD
+#include "psm_rndv_mod.h"
+#endif
+#include "psm2_hal.h"
+#ifdef PSM_FI
+#include "ips_config.h"
+#endif
+
+//#undef _HFI_MMDBG
+//#define _HFI_MMDBG printf
+
+#ifdef min
+#undef min
+#endif
+#define min(a, b) ((a) < (b) ? (a) : (b))
+
+#ifdef max
+#undef max
+#endif
+#define max(a, b) ((a) > (b) ? (a) : (b))
+
+#define MEGABYTE (1024*1024)
+
+#ifndef container_of
+/*
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr:        the pointer to the member.
+ * @type:       the type of the container struct this is embedded in.
+ * @member:     the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) \
+	((type *) ((uint8_t *)(ptr) - offsetof(type, member)))
+#endif
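+// e.g. (illustrative, hypothetical type): for struct item { int a; int b; },
+// container_of(&it.b, struct item, b) recovers &it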
+
+
+// Since rbtree.h and rbtree.c are designed to be included, and declare
+// some hardcoded type names (cl_map_item_t and cl_qmap_t), we must limit
+// our data type declarations which use those types to this .c file
+
+// this will be the payload of a cl_qmap_t
+struct psm2_mr_cache_map_pl {
+	uint32_t	nelems;	// number of elements in cache
+};
+
+// rbtree.h uses these 2 well known defines to create the payload for
+// cl_map_item_t and cl_qmap_t structures
+#define RBTREE_MI_PL  struct psm2_verbs_mr
+#define RBTREE_MAP_PL struct psm2_mr_cache_map_pl
+#include "psm3_rbtree.h"
+
+struct psm2_mr_cache {
+	uint32_t max_entries;
+	// limits to allow headroom for priority registrations
+	uint32_t limit_inuse;
+	uint64_t limit_inuse_bytes;
+#ifdef RNDV_MOD
+#ifdef PSM_CUDA
+	uint64_t limit_gpu_inuse_bytes;
+#endif
+	psm2_rv_t rv;
+	int cmd_fd;
+#endif
+	psm2_ep_t ep;
+	uint8_t cache_mode;	// MR_CACHE_MODE_*
+	cl_qmap_t map;
+	cl_map_item_t root;
+	cl_map_item_t nil_item;
+	// Below is the queue of cache entries available for reuse (refcount==0),
+	// only used when cache_mode==MR_CACHE_MODE_USER.
+	// Available entries are added at the end of the list and reused from
+	// the start, so cached entries age.
+	// Aging helps reduce some of the corruption risk,
+	// but is not a full solution.  Good enough for the PoC.
+	TAILQ_HEAD(avail_list, psm2_verbs_mr) avail_list;
+	mpool_t mr_pool;	// pool of MRs
+	// some statistics for user space
+	uint64_t hit;
+	uint64_t miss;
+	uint64_t rejected;		// rejected non-priority registration
+	uint64_t full;			// failed registration (tends to be priority)
+	uint64_t failed;		// other failures, should be none
+	uint32_t inuse;		// entry count in use
+	uint32_t max_inuse;
+	uint64_t inuse_bytes;
+	uint64_t max_inuse_bytes;
+#ifdef RNDV_MOD
+#ifdef PSM_CUDA
+	uint64_t gpu_inuse_bytes;
+	uint64_t max_gpu_inuse_bytes;
+#endif
+#endif
+	uint32_t max_nelems;
+	uint32_t max_refcount;
+#ifdef RNDV_MOD
+	struct psm2_rv_cache_stats rv_stats;	// statistics from rv module
+									// will remain 0 if rv not open
+#ifdef PSM_CUDA
+	struct psm2_rv_gpu_cache_stats rv_gpu_stats;	// GPU statistics from rv module
+									// will remain 0 if rv not open
+#endif
+#endif
+};
+
+static int mr_cache_key_cmp(const struct psm2_verbs_mr *a,
+							const struct psm2_verbs_mr *b)
+{
+	// to match, addr, length and access must all be equal
+	// we require an exact match to avoid the issue of releasing a larger
+	// MR while a smaller overlapping MR is still in use, just in case an
+	// allocator frees the extra memory not covered by the smaller MR
+	// this may be paranoid; TBD whether a smaller MR should be treated as
+	// a match of a larger MR that contains it
+	if (a->access < b->access)
+		return -1;
+	else if (a->access > b->access)
+		return 1;
+	if (a->addr < b->addr)
+		return -1;
+	else if (a->addr > b->addr)
+		return 1;
+	if (a->length < b->length)
+		return -1;
+	else if (a->length > b->length)
+		return 1;
+	return 0;
+}
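+// Editor's note (illustrative): a consequence of the exact-match compare
+// above is that a cached MR for [addr, addr+8192) does not satisfy a lookup
+// for [addr, addr+4096) even though the range is contained; such a lookup
+// is a miss and results in a separate registration.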
+
+// rbtree.c uses these defines to establish some of its code and
+// then provides all the rbtree manipulation functions.
+// We want to control the compare function, so we define RBTREE_CMP and thus
+// must define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR to avoid compiler errors.
+#define RBTREE_CMP(a,b) mr_cache_key_cmp((a), (b))
+#define RBTREE_ASSERT                     psmi_assert
+#define RBTREE_MAP_COUNT(PAYLOAD_PTR)     ((PAYLOAD_PTR)->nelems)
+#define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR
+#include "psm3_rbtree.c"
+
+// TBD - move to a utility macro header
+// taken from IbAccess imath.h and imath.c
+static uint32_t
+ones64(uint64_t x)
+{
+	x -= ((x >> 1) & 0x5555555555555555ULL);
+	x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
+	x = (((x >> 4) + x) & 0x0f0f0f0f0f0f0f0fULL);
+	x += (x >> 8);
+	x += (x >> 16);
+	x += (x >> 32);
+	return(x & 0x0000003f);
+}
+
+/* log2(x) truncated */
+uint32_t
+FloorLog2(uint64_t x)
+{
+	x |= (x >> 1);
+	x |= (x >> 2);
+	x |= (x >> 4);
+	x |= (x >> 8);
+	x |= (x >> 16);
+	x |= (x >> 32);
+	return(ones64(x >> 1));
+}
+
+/* log2(x) rounded up if x is not a power of 2 */
+uint32_t CeilLog2(uint64_t val)
+{
+	uint32_t floor2 = FloorLog2(val);
+	if ((1ULL << floor2) == val)
+		return (floor2);
+	else
+		return (floor2+1);
+}
+
+static inline uint32_t NextPower2(uint64_t x)
+{
+	return (1 << CeilLog2(x));
+}
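+// Editor's note, worked examples (illustrative): FloorLog2(100) == 6 and
+// CeilLog2(100) == 7, so NextPower2(100) == 128.  For an exact power of two
+// such as 64, CeilLog2(64) == 6 and NextPower2(64) == 64.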
+
+// accessor functions for statistics
+#define CACHE_STAT_FUNC(func, stat) \
+    static uint64_t func(void *context) \
+    { \
+		psm2_mr_cache_t cache = (psm2_mr_cache_t)context; \
+		return cache->stat; \
+    }
+
+
+CACHE_STAT_FUNC(mr_cache_mode, cache_mode)
+CACHE_STAT_FUNC(mr_cache_max_entries, max_entries)
+CACHE_STAT_FUNC(mr_cache_nelems, map.payload.nelems)
+CACHE_STAT_FUNC(mr_cache_max_nelems, max_nelems)
+CACHE_STAT_FUNC(mr_cache_limit_inuse, limit_inuse)
+CACHE_STAT_FUNC(mr_cache_inuse, inuse)
+CACHE_STAT_FUNC(mr_cache_max_inuse, max_inuse)
+CACHE_STAT_FUNC(mr_cache_max_refcount, max_refcount)
+#undef CACHE_STAT_FUNC
+
+static uint64_t mr_cache_hit_rate(void *context)
+{
+	psm2_mr_cache_t cache = (psm2_mr_cache_t)context;
+	if (cache->miss)	// all entries start with a miss, then get hits
+		return((cache->hit*100)/(cache->miss+cache->hit));
+	else
+		return 0;
+}
+
+static uint64_t mr_cache_miss_rate(void *context)
+{
+	psm2_mr_cache_t cache = (psm2_mr_cache_t)context;
+	if (cache->miss)	// all entries start with a miss, then get hits
+		return((cache->miss*100)/(cache->miss+cache->hit));
+	else
+		return 0;
+}
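+// Editor's note (illustrative): with hit == 300 and miss == 100, hit_%
+// reports (300*100)/400 == 75 and miss_% reports 25; both report 0 until
+// the first miss has occurred.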
+
+#ifdef RNDV_MOD
+static uint64_t mr_cache_rv_size(void *context)
+{
+	psm2_mr_cache_t cache = (psm2_mr_cache_t)context;
+	if (cache->rv) {
+		// this is a little sly: we know the stats processing routines
+		// call the accessors in the order they appear in the entries list,
+		// so we use the 1st of the rv statistics accessors to fetch
+		// the statistics from rv into the cache structure; the other
+		// accessors can then simply return the relevant value
+		(void)__psm2_rv_get_cache_stats(cache->rv, &cache->rv_stats);
+	}
+	return cache->rv_stats.cache_size/MEGABYTE;
+}
+
+#define CACHE_RV_STAT_FUNC(func, stat) \
+    static uint64_t func(void *context) \
+    { \
+		psm2_mr_cache_t cache = (psm2_mr_cache_t)context; \
+		return cache->rv_stats.stat; \
+    }
+
+CACHE_RV_STAT_FUNC(mr_cache_rv_max_size, max_cache_size/MEGABYTE)
+CACHE_RV_STAT_FUNC(mr_cache_rv_limit_size, limit_cache_size)
+CACHE_RV_STAT_FUNC(mr_cache_rv_nelems, count)
+CACHE_RV_STAT_FUNC(mr_cache_rv_max_nelems, max_count)
+CACHE_RV_STAT_FUNC(mr_cache_rv_inuse, inuse)
+CACHE_RV_STAT_FUNC(mr_cache_rv_max_inuse, max_inuse)
+CACHE_RV_STAT_FUNC(mr_cache_rv_max_refcount, max_refcount)
+#undef CACHE_RV_STAT_FUNC
+
+static uint64_t mr_cache_rv_hit_rate(void *context)
+{
+	psm2_mr_cache_t cache = (psm2_mr_cache_t)context;
+	if (cache->rv_stats.miss)	// all entries start with a miss, then get hits
+		return((cache->rv_stats.hit*100)/(cache->rv_stats.miss+cache->rv_stats.hit));
+	else
+		return 0;
+}
+
+static uint64_t mr_cache_rv_miss_rate(void *context)
+{
+	psm2_mr_cache_t cache = (psm2_mr_cache_t)context;
+	if (cache->rv_stats.miss)	// all entries start with a miss, then get hits
+		return((cache->rv_stats.miss*100)/(cache->rv_stats.miss+cache->rv_stats.hit));
+	else
+		return 0;
+}
+
+#ifdef PSM_CUDA
+static uint64_t mr_cache_rv_gpu_size(void *context)
+{
+	psm2_mr_cache_t cache = container_of(context, struct psm2_mr_cache, rv_gpu_stats);
+	if (cache->rv && PSMI_IS_CUDA_ENABLED ) {
+		// this is a little sly: we know the stats processing routines
+		// call the accessors in the order they appear in the entries list,
+		// so we use the 1st of the rv statistics accessors to fetch
+		// the statistics from rv into the cache structure; the other
+		// accessors can then simply return the relevant value
+		(void)__psm2_rv_gpu_get_cache_stats(cache->rv, &cache->rv_gpu_stats);
+	}
+	return cache->rv_gpu_stats.cache_size/MEGABYTE;
+}
+
+#define CACHE_RV_GPU_STAT_FUNC(func, stat) \
+    static uint64_t func(void *context) \
+    { \
+		psm2_mr_cache_t cache = container_of(context, struct psm2_mr_cache, rv_gpu_stats); \
+		return cache->rv_gpu_stats.stat; \
+    }
+
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_size_reg, cache_size_reg/MEGABYTE)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_size_mmap, cache_size_mmap/MEGABYTE)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_size_both, cache_size_both/MEGABYTE)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_max_size, max_cache_size/MEGABYTE)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_max_size_reg, max_cache_size_reg/MEGABYTE)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_max_size_mmap, max_cache_size_mmap/MEGABYTE)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_max_size_both, max_cache_size_both/MEGABYTE)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_limit_size, limit_cache_size)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_nelems, count)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_nelems_reg, count_reg)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_nelems_mmap, count_mmap)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_nelems_both, count_both)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_max_nelems, max_count)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_max_nelems_reg, max_count_reg)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_max_nelems_mmap, max_count_mmap)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_max_nelems_both, max_count_both)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_inuse, inuse)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_inuse_reg, inuse_reg)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_inuse_mmap, inuse_mmap)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_inuse_both, inuse_both)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_max_inuse, max_inuse)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_max_inuse_reg, max_inuse_reg)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_max_inuse_mmap, max_inuse_mmap)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_max_inuse_both, max_inuse_both)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_max_refcount, max_refcount)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_max_refcount_reg, max_refcount_reg)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_max_refcount_mmap, max_refcount_mmap)
+CACHE_RV_GPU_STAT_FUNC(mr_cache_rv_gpu_max_refcount_both, max_refcount_both)
+#undef CACHE_RV_GPU_STAT_FUNC
+
+/* any hit which found an entry, even if partial */
+static uint64_t mr_cache_rv_gpu_hit_rate(void *context)
+{
+	psm2_mr_cache_t cache = container_of(context, struct psm2_mr_cache, rv_gpu_stats);
+	if (cache->rv_gpu_stats.miss)	// all entries start with a miss, then get hits
+		return((cache->rv_gpu_stats.hit*100)/(cache->rv_gpu_stats.miss+cache->rv_gpu_stats.hit));
+	else
+		return 0;
+}
+
+/* pure hit, want MR and found entry w/ MR */
+static uint64_t mr_cache_rv_gpu_hit_rate_reg(void *context)
+{
+	psm2_mr_cache_t cache = container_of(context, struct psm2_mr_cache, rv_gpu_stats);
+	// all entries start with a miss or add_reg, then get hits
+	if (cache->rv_gpu_stats.miss_reg || cache->rv_gpu_stats.hit_add_reg)
+		return((cache->rv_gpu_stats.hit_reg*100)/(cache->rv_gpu_stats.miss_reg+cache->rv_gpu_stats.hit_reg+cache->rv_gpu_stats.hit_add_reg));
+	else
+		return 0;
+}
+
+/* partial hit, want MR and found pinned entry w/o MR */
+static uint64_t mr_cache_rv_gpu_hit_rate_add_reg(void *context)
+{
+	psm2_mr_cache_t cache = container_of(context, struct psm2_mr_cache, rv_gpu_stats);
+	// all entries start with a miss or add_reg, then get hits
+	if (cache->rv_gpu_stats.miss_reg || cache->rv_gpu_stats.hit_add_reg)
+		return((cache->rv_gpu_stats.hit_add_reg*100)/(cache->rv_gpu_stats.miss_reg+cache->rv_gpu_stats.hit_reg+cache->rv_gpu_stats.hit_add_reg));
+	else
+		return 0;
+}
+
+/* pure hit, want mmap and found entry w/ mmap */
+static uint64_t mr_cache_rv_gpu_hit_rate_mmap(void *context)
+{
+	psm2_mr_cache_t cache = container_of(context, struct psm2_mr_cache, rv_gpu_stats);
+	// all entries start with a miss or add_mmap, then get hits
+	if (cache->rv_gpu_stats.miss_mmap || cache->rv_gpu_stats.hit_add_mmap)
+		return((cache->rv_gpu_stats.hit_mmap*100)/(cache->rv_gpu_stats.miss_mmap+cache->rv_gpu_stats.hit_mmap+cache->rv_gpu_stats.hit_add_mmap));
+	else
+		return 0;
+}
+
+/* partial hit, want MR and found pinned entry w/o mmap */
+static uint64_t mr_cache_rv_gpu_hit_rate_add_mmap(void *context)
+{
+	psm2_mr_cache_t cache = container_of(context, struct psm2_mr_cache, rv_gpu_stats);
+	// all entries start with a miss or add_mmap, then get hits
+	if (cache->rv_gpu_stats.miss_mmap || cache->rv_gpu_stats.hit_add_mmap)
+		return((cache->rv_gpu_stats.hit_add_mmap*100)/(cache->rv_gpu_stats.miss_mmap+cache->rv_gpu_stats.hit_mmap+cache->rv_gpu_stats.hit_add_mmap));
+	else
+		return 0;
+}
+
+/* complete miss, no entry found */
+static uint64_t mr_cache_rv_gpu_miss_rate(void *context)
+{
+	psm2_mr_cache_t cache = container_of(context, struct psm2_mr_cache, rv_gpu_stats);
+	if (cache->rv_gpu_stats.miss)	// all entries start with a miss, then get hits
+		return((cache->rv_gpu_stats.miss*100)/(cache->rv_gpu_stats.miss+cache->rv_gpu_stats.hit));
+	else
+		return 0;
+}
+
+/* no entry found when want MR */
+static uint64_t mr_cache_rv_gpu_miss_rate_reg(void *context)
+{
+	psm2_mr_cache_t cache = container_of(context, struct psm2_mr_cache, rv_gpu_stats);
+	// all entries start with a miss or add_reg, then get hits
+	if (cache->rv_gpu_stats.miss_reg || cache->rv_gpu_stats.hit_add_reg)
+		return((cache->rv_gpu_stats.miss_reg*100)/(cache->rv_gpu_stats.miss_reg+cache->rv_gpu_stats.hit_reg+cache->rv_gpu_stats.hit_add_reg));
+	else
+		return 0;
+}
+
+/* no entry found when want mmap */
+static uint64_t mr_cache_rv_gpu_miss_rate_mmap(void *context)
+{
+	psm2_mr_cache_t cache = container_of(context, struct psm2_mr_cache, rv_gpu_stats);
+	// all entries start with a miss or add_reg, then get hits
+	if (cache->rv_gpu_stats.miss_mmap || cache->rv_gpu_stats.hit_add_mmap)
+		return((cache->rv_gpu_stats.miss_mmap*100)/(cache->rv_gpu_stats.miss_mmap+cache->rv_gpu_stats.hit_mmap+cache->rv_gpu_stats.hit_add_mmap));
+	else
+		return 0;
+}
+#endif // PSM_CUDA
+
+#endif // RNDV_MOD
+
+#define INC_STAT(cache, stat, max_stat) \
+	do { \
+		if (++((cache)->stat) > (cache)->max_stat) \
+			(cache)->max_stat = (cache)->stat; \
+	} while(0)
+
+#define ADD_STAT(cache, adder, stat, max_stat) \
+	do { \
+		if (((cache)->stat += (adder)) > (cache)->max_stat) \
+			(cache)->max_stat = (cache)->stat; \
+	} while(0)
+
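+// Editor's note (usage sketch): INC_STAT(cache, inuse, max_inuse) increments
+// the current counter and updates its high-water mark in one step, and
+// ADD_STAT(cache, length, inuse_bytes, max_inuse_bytes) does the same for a
+// byte total; see update_stats_inc_inuse() below.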
+
+// ep is used for RNDV_MOD, memory tracking and stats
+psm2_mr_cache_t psm2_verbs_alloc_mr_cache(psm2_ep_t ep,
+							uint32_t max_entries, uint8_t cache_mode,
+							uint32_t pri_entries, uint64_t pri_size
+#ifdef PSM_CUDA
+							, uint64_t gpu_pri_size
+#endif
+							)
+{
+	struct psm2_mr_cache *cache;
+
+	cache = (struct psm2_mr_cache *)psmi_calloc(ep, DESCRIPTORS,
+							sizeof(*cache), 1);
+	if (! cache)
+		return NULL;
+	// max_entries for a pool must be power of 2
+	max_entries = max(max_entries, pri_entries);
+	max_entries = NextPower2(max_entries);
+	cache->max_entries = max_entries;
+	cache->cache_mode = cache_mode;
+	// we leave headroom for priority registrations
+	cache->limit_inuse = max_entries - pri_entries;
+	cache->ep = ep;
+#ifdef RNDV_MOD
+	if (cache->cache_mode == MR_CACHE_MODE_KERNEL
+		|| cache->cache_mode == MR_CACHE_MODE_RV) {
+		// TBD - could make this a warning and set limit_inuse_bytes=0
+		// then depend on transfers queuing and retrying until
+		// reg_mr cache space is available
+		if ((uint64_t)ep->rv_mr_cache_size*MEGABYTE < pri_size) {
+			_HFI_ERROR("PSM3_RV_MR_CACHE_SIZE=%u too small, require >= %"PRIu64"\n",
+				ep->rv_mr_cache_size, (pri_size + MEGABYTE-1)/MEGABYTE);
+			return NULL;
+		}
+		cache->limit_inuse_bytes = (uint64_t)ep->rv_mr_cache_size*MEGABYTE - pri_size;
+#ifdef PSM_CUDA
+		if (PSMI_IS_CUDA_ENABLED) {
+			// For GPU, due to GdrCopy, we can't undersize cache.
+			// Otherwise RDMA MRs could consume all the
+			// cache space and leave a gdrcopy pin/mmap stuck
+			// retrying indefinitely.  If we want to allow undersize
+			// GPU cache, we need to have gdrcopy pin/mmap failures
+			// also invoke progress functions to release MRs
+			if (__psm2_min_gpu_bar_size()) {
+				uint64_t max_recommend = __psm2_min_gpu_bar_size() - 32*MEGABYTE;
+				if ((uint64_t)ep->rv_gpu_cache_size*MEGABYTE >= max_recommend) {
+					_HFI_INFO("Warning: PSM3_RV_GPU_CACHE_SIZE=%u too large for smallest GPU's BAR size of %"PRIu64" (< %"PRIu64" total of endpoint-rail-qp recommended)\n",
+						ep->rv_gpu_cache_size,
+						(__psm2_min_gpu_bar_size() + MEGABYTE-1)/MEGABYTE,
+						max_recommend/MEGABYTE);
+				}
+			}
+			if ((uint64_t)ep->rv_gpu_cache_size*MEGABYTE < gpu_pri_size) {
+				_HFI_ERROR("PSM3_RV_GPU_CACHE_SIZE=%u too small, require >= %"PRIu64"\n",
+					ep->rv_gpu_cache_size, (gpu_pri_size + MEGABYTE-1)/MEGABYTE);
+				return NULL;
+			}
+			cache->limit_gpu_inuse_bytes = (uint64_t)ep->rv_gpu_cache_size*MEGABYTE - gpu_pri_size;
+		}
+		_HFI_MMDBG("CPU cache %u GPU cache %u\n", ep->rv_mr_cache_size, ep->rv_gpu_cache_size);
+#endif
+	} else
+#endif // RNDV_MOD
+		cache->limit_inuse_bytes = UINT64_MAX;	// no limit, just count inuse
+#ifdef RNDV_MOD
+	cache->rv = ep->verbs_ep.rv;
+	cache->cmd_fd = ep->verbs_ep.context->cmd_fd;
+#endif // RNDV_MOD
+#if defined(RNDV_MOD) && defined(PSM_CUDA)
+	_HFI_MMDBG("cache alloc: max_entries=%u limit_inuse=%u limit_inuse_bytes=%"PRIu64" limit_gpu_inuse_bytes=%"PRIu64", pri_entries=%u pri_size=%"PRIu64" gpu_pri_size=%"PRIu64"\n",
+			cache->max_entries, cache->limit_inuse,
+			cache->limit_inuse_bytes, cache->limit_gpu_inuse_bytes,
+			pri_entries, pri_size, gpu_pri_size);
+#else
+	_HFI_MMDBG("cache alloc: max_entries=%u limit_inuse=%u limit_inuse_bytes=%"PRIu64", pri_entries=%u pri_size=%"PRIu64"\n",
+			cache->max_entries, cache->limit_inuse,
+			cache->limit_inuse_bytes, pri_entries, pri_size);
+#endif
+	// max_entries must be a power of 2 >= objects per chunk, which is also a power of 2
+	cache->mr_pool = psmi_mpool_create(sizeof(cl_map_item_t),
+						min(128, max_entries), max_entries, 0,
+						DESCRIPTORS, NULL, NULL);
+	if (! cache->mr_pool) {
+		psmi_free(cache);
+		return NULL;
+	}
+	//nil_item already zeroed by calloc
+	//memset(&cache->nil_item.payload, 0, sizeof(cache->nil_item.payload));
+	ips_cl_qmap_init(&cache->map, &cache->root, &cache->nil_item);
+	TAILQ_INIT(&cache->avail_list);
+
+	struct psmi_stats_entry entries[] = {
+		PSMI_STATS_DECL("cache_mode", MPSPAWN_STATS_REDUCTION_ALL,
+				mr_cache_mode, NULL),
+		PSMI_STATS_DECL_FUNC("limit_entries", mr_cache_max_entries),
+		PSMI_STATS_DECL_FUNC("nelems", mr_cache_nelems),
+		PSMI_STATS_DECL_FUNC("max_nelems", mr_cache_max_nelems),
+		PSMI_STATS_DECL("limit_inuse",
+				MPSPAWN_STATS_REDUCTION_ALL,
+				mr_cache_limit_inuse, NULL),
+		PSMI_STATS_DECL_FUNC("inuse", mr_cache_inuse),
+		PSMI_STATS_DECL_FUNC("max_inuse", mr_cache_max_inuse),
+		PSMI_STATS_DECL("limit_inuse_bytes",
+				MPSPAWN_STATS_REDUCTION_ALL,
+				NULL, &cache->limit_inuse_bytes),
+		PSMI_STATS_DECLU64("inuse_bytes", &cache->inuse_bytes),
+		PSMI_STATS_DECLU64("max_inuse_bytes", &cache->max_inuse_bytes),
+#ifdef RNDV_MOD
+#ifdef PSM_CUDA
+		PSMI_STATS_DECL("limit_gpu_inuse_bytes",
+				MPSPAWN_STATS_REDUCTION_ALL,
+				NULL, &cache->limit_gpu_inuse_bytes),
+		PSMI_STATS_DECLU64("gpu_inuse_bytes", &cache->gpu_inuse_bytes),
+		PSMI_STATS_DECLU64("max_gpu_inuse_bytes", &cache->max_gpu_inuse_bytes),
+#endif
+#endif
+		PSMI_STATS_DECL_FUNC("max_refcount", mr_cache_max_refcount),
+		PSMI_STATS_DECLU64("hit", &cache->hit),
+		PSMI_STATS_DECL("hit_%",MPSPAWN_STATS_REDUCTION_ALL,
+				mr_cache_hit_rate, NULL),
+		PSMI_STATS_DECLU64("miss", &cache->miss),
+		PSMI_STATS_DECL("miss_%", MPSPAWN_STATS_REDUCTION_ALL,
+				mr_cache_miss_rate, NULL),
+		PSMI_STATS_DECLU64("rejected", &cache->rejected),
+		PSMI_STATS_DECLU64("full", &cache->full),
+		PSMI_STATS_DECLU64("failed", &cache->failed),
+#ifdef RNDV_MOD
+		PSMI_STATS_DECL_FUNC("rv_size", mr_cache_rv_size),
+		PSMI_STATS_DECL_FUNC("rv_max_size", mr_cache_rv_max_size),
+		PSMI_STATS_DECL_FUNC("rv_limit", mr_cache_rv_limit_size),
+		PSMI_STATS_DECL_FUNC("rv_nelems", mr_cache_rv_nelems),
+		PSMI_STATS_DECL_FUNC("rv_max_nelems", mr_cache_rv_max_nelems),
+		PSMI_STATS_DECL_FUNC("rv_inuse", mr_cache_rv_inuse),
+		PSMI_STATS_DECL_FUNC("rv_max_inuse", mr_cache_rv_max_inuse),
+		PSMI_STATS_DECLU64("rv_inuse_bytes", (uint64_t*)&cache->rv_stats.inuse_bytes),
+		PSMI_STATS_DECLU64("rv_max_inuse_bytes", (uint64_t*)&cache->rv_stats.max_inuse_bytes),
+		PSMI_STATS_DECL_FUNC("rv_max_refcount", mr_cache_rv_max_refcount),
+		PSMI_STATS_DECLU64("rv_hit", (uint64_t*)&cache->rv_stats.hit),
+		PSMI_STATS_DECL("rv_hit_%", MPSPAWN_STATS_REDUCTION_ALL,
+				mr_cache_rv_hit_rate, NULL),
+		PSMI_STATS_DECLU64("rv_miss", (uint64_t*)&cache->rv_stats.miss),
+		PSMI_STATS_DECL("rv_miss_%", MPSPAWN_STATS_REDUCTION_ALL,
+				mr_cache_rv_miss_rate, NULL),
+		PSMI_STATS_DECLU64("rv_full", (uint64_t*)&cache->rv_stats.full),
+		PSMI_STATS_DECLU64("rv_failed", (uint64_t*)&cache->rv_stats.failed),
+		PSMI_STATS_DECLU64("rv_remove", (uint64_t*)&cache->rv_stats.remove),
+		PSMI_STATS_DECLU64("rv_evict", (uint64_t*)&cache->rv_stats.evict),
+#endif // RNDV_MOD
+	};
+	psmi_stats_register_type("MR_Cache_Statistics",
+					PSMI_STATSTYPE_MR_CACHE,
+					entries,
+					PSMI_STATS_HOWMANY(entries),
+					ep->epid, cache, ep->dev_name);
+#ifdef PSM_CUDA
+#ifdef RNDV_MOD
+	struct psmi_stats_entry gpu_entries[] = {
+		PSMI_STATS_DECL_FUNC("rv_gpu_size", mr_cache_rv_gpu_size),
+		PSMI_STATS_DECL_FUNC("rv_gpu_size_reg", mr_cache_rv_gpu_size_reg),
+		PSMI_STATS_DECL_FUNC("rv_gpu_size_mmap", mr_cache_rv_gpu_size_mmap),
+		PSMI_STATS_DECL_FUNC("rv_gpu_size_both", mr_cache_rv_gpu_size_both),
+		PSMI_STATS_DECL_FUNC("rv_gpu_max_size", mr_cache_rv_gpu_max_size),
+		PSMI_STATS_DECL_FUNC("rv_gpu_max_size_reg", mr_cache_rv_gpu_max_size_reg),
+		PSMI_STATS_DECL_FUNC("rv_gpu_max_size_mmap", mr_cache_rv_gpu_max_size_mmap),
+		PSMI_STATS_DECL_FUNC("rv_gpu_max_size_both", mr_cache_rv_gpu_max_size_both),
+		PSMI_STATS_DECL_FUNC("rv_gpu_limit", mr_cache_rv_gpu_limit_size),
+		PSMI_STATS_DECL_FUNC("rv_gpu_nelems", mr_cache_rv_gpu_nelems),
+		PSMI_STATS_DECL_FUNC("rv_gpu_nelems_reg", mr_cache_rv_gpu_nelems_reg),
+		PSMI_STATS_DECL_FUNC("rv_gpu_nelems_mmap", mr_cache_rv_gpu_nelems_mmap),
+		PSMI_STATS_DECL_FUNC("rv_gpu_nelems_both", mr_cache_rv_gpu_nelems_both),
+		PSMI_STATS_DECL_FUNC("rv_gpu_max_nelems", mr_cache_rv_gpu_max_nelems),
+		PSMI_STATS_DECL_FUNC("rv_gpu_max_nelems_reg", mr_cache_rv_gpu_max_nelems_reg),
+		PSMI_STATS_DECL_FUNC("rv_gpu_max_nelems_mmap", mr_cache_rv_gpu_max_nelems_mmap),
+		PSMI_STATS_DECL_FUNC("rv_gpu_max_nelems_both", mr_cache_rv_gpu_max_nelems_both),
+		PSMI_STATS_DECL_FUNC("rv_gpu_inuse", mr_cache_rv_gpu_inuse),
+		PSMI_STATS_DECL_FUNC("rv_gpu_inuse_reg", mr_cache_rv_gpu_inuse_reg),
+		PSMI_STATS_DECL_FUNC("rv_gpu_inuse_mmap", mr_cache_rv_gpu_inuse_mmap),
+		PSMI_STATS_DECL_FUNC("rv_gpu_inuse_both", mr_cache_rv_gpu_inuse_both),
+		PSMI_STATS_DECL_FUNC("rv_gpu_max_inuse", mr_cache_rv_gpu_max_inuse),
+		PSMI_STATS_DECL_FUNC("rv_gpu_max_inuse_reg", mr_cache_rv_gpu_max_inuse_reg),
+		PSMI_STATS_DECL_FUNC("rv_gpu_max_inuse_mmap", mr_cache_rv_gpu_max_inuse_mmap),
+		PSMI_STATS_DECL_FUNC("rv_gpu_max_inuse_both", mr_cache_rv_gpu_max_inuse_both),
+		PSMI_STATS_DECLU64("rv_gpu_inuse_bytes", (uint64_t*)&cache->rv_gpu_stats.inuse_bytes),
+		PSMI_STATS_DECLU64("rv_gpu_inuse_bytes_reg", (uint64_t*)&cache->rv_gpu_stats.inuse_bytes_reg),
+		PSMI_STATS_DECLU64("rv_gpu_inuse_bytes_mmap", (uint64_t*)&cache->rv_gpu_stats.inuse_bytes_mmap),
+		PSMI_STATS_DECLU64("rv_gpu_inuse_bytes_both", (uint64_t*)&cache->rv_gpu_stats.inuse_bytes_both),
+		PSMI_STATS_DECLU64("rv_gpu_max_inuse_bytes", (uint64_t*)&cache->rv_gpu_stats.max_inuse_bytes),
+		PSMI_STATS_DECLU64("rv_gpu_max_inuse_bytes_reg", (uint64_t*)&cache->rv_gpu_stats.max_inuse_bytes_reg),
+		PSMI_STATS_DECLU64("rv_gpu_max_inuse_bytes_mmap", (uint64_t*)&cache->rv_gpu_stats.max_inuse_bytes_mmap),
+		PSMI_STATS_DECLU64("rv_gpu_max_inuse_bytes_both", (uint64_t*)&cache->rv_gpu_stats.max_inuse_bytes_both),
+		PSMI_STATS_DECL_FUNC("rv_gpu_max_refcount", mr_cache_rv_gpu_max_refcount),
+		PSMI_STATS_DECL_FUNC("rv_gpu_max_refcount_reg", mr_cache_rv_gpu_max_refcount_reg),
+		PSMI_STATS_DECL_FUNC("rv_gpu_max_refcount_mmap", mr_cache_rv_gpu_max_refcount_mmap),
+		PSMI_STATS_DECL_FUNC("rv_gpu_max_refcount_both", mr_cache_rv_gpu_max_refcount_both),
+		PSMI_STATS_DECLU64("rv_gpu_hit", (uint64_t*)&cache->rv_gpu_stats.hit),
+		PSMI_STATS_DECL("rv_gpu_hit_%", MPSPAWN_STATS_REDUCTION_ALL,
+				mr_cache_rv_gpu_hit_rate, NULL),
+		PSMI_STATS_DECLU64("rv_gpu_hit_reg", (uint64_t*)&cache->rv_gpu_stats.hit_reg),
+		PSMI_STATS_DECL("rv_gpu_hit_reg_%", MPSPAWN_STATS_REDUCTION_ALL,
+				mr_cache_rv_gpu_hit_rate_reg, NULL),
+		PSMI_STATS_DECLU64("rv_gpu_hit_add_reg", (uint64_t*)&cache->rv_gpu_stats.hit_add_reg),
+		PSMI_STATS_DECL("rv_gpu_hit_add_reg_%", MPSPAWN_STATS_REDUCTION_ALL,
+				mr_cache_rv_gpu_hit_rate_add_reg, NULL),
+		PSMI_STATS_DECLU64("rv_gpu_hit_mmap", (uint64_t*)&cache->rv_gpu_stats.hit_mmap),
+		PSMI_STATS_DECL("rv_gpu_hit_mmap_%", MPSPAWN_STATS_REDUCTION_ALL,
+				mr_cache_rv_gpu_hit_rate_mmap, NULL),
+		PSMI_STATS_DECLU64("rv_gpu_hit_add_mmap", (uint64_t*)&cache->rv_gpu_stats.hit_add_mmap),
+		PSMI_STATS_DECL("rv_gpu_hit_add_mmap_%", MPSPAWN_STATS_REDUCTION_ALL,
+				mr_cache_rv_gpu_hit_rate_add_mmap, NULL),
+		PSMI_STATS_DECLU64("rv_gpu_miss", (uint64_t*)&cache->rv_gpu_stats.miss),
+		PSMI_STATS_DECL("rv_gpu_miss_%", MPSPAWN_STATS_REDUCTION_ALL,
+				mr_cache_rv_gpu_miss_rate, NULL),
+		PSMI_STATS_DECLU64("rv_gpu_miss_reg", (uint64_t*)&cache->rv_gpu_stats.miss_reg),
+		PSMI_STATS_DECL("rv_gpu_miss_reg_%", MPSPAWN_STATS_REDUCTION_ALL,
+				mr_cache_rv_gpu_miss_rate_reg, NULL),
+		PSMI_STATS_DECLU64("rv_gpu_miss_mmap", (uint64_t*)&cache->rv_gpu_stats.miss_mmap),
+		PSMI_STATS_DECL("rv_gpu_miss_mmap_%", MPSPAWN_STATS_REDUCTION_ALL,
+				mr_cache_rv_gpu_miss_rate_mmap, NULL),
+		PSMI_STATS_DECLU64("rv_gpu_full", (uint64_t*)&cache->rv_gpu_stats.full),
+		PSMI_STATS_DECLU64("rv_gpu_full_reg", (uint64_t*)&cache->rv_gpu_stats.full_reg),
+		PSMI_STATS_DECLU64("rv_gpu_full_mmap", (uint64_t*)&cache->rv_gpu_stats.full_mmap),
+		PSMI_STATS_DECLU64("rv_gpu_failed_pin", (uint64_t*)&cache->rv_gpu_stats.failed_pin),
+		PSMI_STATS_DECLU64("rv_gpu_failed_reg", (uint64_t*)&cache->rv_gpu_stats.failed_reg),
+		PSMI_STATS_DECLU64("rv_gpu_failed_mmap", (uint64_t*)&cache->rv_gpu_stats.failed_mmap),
+		PSMI_STATS_DECLU64("rv_gpu_remove", (uint64_t*)&cache->rv_gpu_stats.remove),
+		PSMI_STATS_DECLU64("rv_gpu_remove_reg", (uint64_t*)&cache->rv_gpu_stats.remove_reg),
+		PSMI_STATS_DECLU64("rv_gpu_remove_mmap", (uint64_t*)&cache->rv_gpu_stats.remove_mmap),
+		PSMI_STATS_DECLU64("rv_gpu_remove_both", (uint64_t*)&cache->rv_gpu_stats.remove_both),
+		PSMI_STATS_DECLU64("rv_gpu_evict", (uint64_t*)&cache->rv_gpu_stats.evict),
+		PSMI_STATS_DECLU64("rv_gpu_evict_reg", (uint64_t*)&cache->rv_gpu_stats.evict_reg),
+		PSMI_STATS_DECLU64("rv_gpu_evict_mmap", (uint64_t*)&cache->rv_gpu_stats.evict_mmap),
+		PSMI_STATS_DECLU64("rv_gpu_evict_both", (uint64_t*)&cache->rv_gpu_stats.evict_both),
+		PSMI_STATS_DECLU64("rv_gpu_inval_mr", (uint64_t*)&cache->rv_gpu_stats.inval_mr),
+		PSMI_STATS_DECLU64("rv_post_write", (uint64_t*)&cache->rv_gpu_stats.post_write),
+		PSMI_STATS_DECLU64("rv_post_write_bytes", (uint64_t*)&cache->rv_gpu_stats.post_write_bytes),
+		PSMI_STATS_DECLU64("rv_gpu_post_write", (uint64_t*)&cache->rv_gpu_stats.gpu_post_write),
+		PSMI_STATS_DECLU64("rv_gpu_post_write_bytes", (uint64_t*)&cache->rv_gpu_stats.gpu_post_write_bytes),
+	};
+	if (cache->rv && PSMI_IS_CUDA_ENABLED)
+		psmi_stats_register_type("MR_GPU_Cache_Statistics",
+					PSMI_STATSTYPE_MR_CACHE,
+					gpu_entries,
+					PSMI_STATS_HOWMANY(gpu_entries),
+					ep->epid, &cache->rv_gpu_stats,
+					ep->dev_name);
+#endif
+#endif
+
+	return cache;
+}
+
+int psm2_verbs_mr_cache_allows_user_mr(psm2_mr_cache_t cache)
+{
+	if (!cache)
+		return 0;
+	switch (cache->cache_mode) {
+		case MR_CACHE_MODE_NONE:
+			return 0;
+		case MR_CACHE_MODE_KERNEL:
+			return psmi_hal_has_cap(PSM_HAL_CAP_USER_MR);
+		case MR_CACHE_MODE_USER:
+			return 1;
+		case MR_CACHE_MODE_RV:
+			return psmi_hal_has_cap(PSM_HAL_CAP_USER_MR);
+		default:	// unexpected
+			return 0;
+	}
+}
+
+static void update_stats_inc_inuse(psm2_mr_cache_t cache, uint64_t length,
+					int access)
+{
+	INC_STAT(cache, inuse, max_inuse);
+#ifdef RNDV_MOD
+#ifdef PSM_CUDA
+	if (access & IBV_ACCESS_IS_GPU_ADDR)
+		ADD_STAT(cache, length, gpu_inuse_bytes, max_gpu_inuse_bytes);
+	else
+#endif
+#endif
+		ADD_STAT(cache, length, inuse_bytes, max_inuse_bytes);
+}
+
+static void update_stats_dec_inuse(psm2_mr_cache_t cache, uint64_t length,
+					int access)
+{
+	cache->inuse--;
+#ifdef RNDV_MOD
+#ifdef PSM_CUDA
+	if (access & IBV_ACCESS_IS_GPU_ADDR)
+		cache->gpu_inuse_bytes -= length;
+	else
+#endif
+#endif
+		cache->inuse_bytes -= length;
+}
+
+// checks for space for a non-priority registration
+static inline int have_space(psm2_mr_cache_t cache, uint64_t length, int access)
+{
+#ifdef RNDV_MOD
+#ifdef PSM_CUDA
+	if (access & IBV_ACCESS_IS_GPU_ADDR)
+		return (cache->inuse < cache->limit_inuse
+			&& cache->gpu_inuse_bytes + length < cache->limit_gpu_inuse_bytes);
+	else
+#endif
+#endif
+		return (cache->inuse < cache->limit_inuse
+			&& cache->inuse_bytes + length < cache->limit_inuse_bytes);
+}
+
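+// Editor's note (illustrative, numbers assumed): with max_entries == 1024 and
+// pri_entries == 160, limit_inuse == 864, so have_space() rejects a
+// non-priority registration once 864 entries are in use; the remaining
+// entries are headroom reserved for priority registrations.
+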
+#ifdef PSM_CUDA
+#ifdef RNDV_MOD
+// given an ep this returns the "next one".
+// It loops through all the multi-rail/multi-QP EPs in a given user opened EP
+// 1st, then it goes to the next user opened EP (multi-EP) and loops through
+// its multi-rail/multi-QP EPs.
+// When it hits the last rail of the last user opened EP, it goes back to
+// the 1st rail of the 1st user opened EP.
+// caller must hold creation_lock
+static psm2_ep_t next_ep(psm2_ep_t ep)
+{
+	//mctxt_next is the circular list of rails/QPs in a given user EP
+	//mctxt_master is the 1st in the list, when we get back to the 1st
+	//go to the next user EP
+	ep = ep->mctxt_next;
+	if (ep->mctxt_master != ep)
+		return ep;
+	//user_ep_next is a linked list of user opened EPs.  End of list is NULL
+	//when hit end of list, go back to 1st (psmi_opened_endpoint)
+	//for each user opened EP, the entry on this list is the 1st rail within
+	//the EP
+	ep = ep->user_ep_next;
+	if (ep)
+		return ep;
+	else
+		return psmi_opened_endpoint;
+}
+
+// determine if ep is still valid (can't dereference or trust ep given)
+// caller must hold creation_lock
+static int valid_ep(psm2_ep_t ep)
+{
+	psm2_ep_t e1 = psmi_opened_endpoint;
+
+	while (e1) {	// user opened ep's - linear list ending in NULL
+		psm2_ep_t e2 = e1;
+		//check mctxt list (multi-rail within user opened ep)
+		do {
+			if (e2 == ep)
+				return 1;
+			e2 = e2->mctxt_next;
+		} while (e2 != e1);	// circular list
+		e1 = e1->user_ep_next;
+	}
+	return 0;	// not found
+}
+
+// advance ep to the next.  However, it's possible ep is stale and
+// now closed/freed, so make sure it's good.  good_ep is a known good EP
+// and lets us avoid the search some of the time (or when there is only 1 EP)
+// caller must hold creation_lock
+static psm2_ep_t next_valid_ep(psm2_ep_t ep, psm2_ep_t good_ep)
+{
+	if (ep == good_ep || valid_ep(ep))
+		return next_ep(ep);
+	else
+		return good_ep;
+}
+
+/*
+ * Evict some space in given cache (only GPU needs this)
+ * If nvidia_p2p_get_pages reports out of BAR space (perhaps prematurely),
+ * we need to evict from other EPs too.
+ * So we rotate among all eps (rails or QPs) in our user opened EP for eviction.
+ * length - amount attempted in pin/register which just failed
+ * access - indicates if IS_GPU_ADDR or not (rest ignored)
+ * returns:
+ * 	>0 bytes evicted if some evicted
+ * 	-1 if nothing evicted (errno == ENOENT means nothing evictable found)
+ * 	ENOENT also used when access is not for GPU
+ * The caller will have the progress_lock, we need the creation_lock
+ * to walk the list of EPs outside our own MQ.  However creation_lock
+ * is above the progress_lock in the lock hierarchy, so we use a LOCK_TRY
+ * to avoid deadlock in the rare case where another thread
+ * has creation_lock and is trying to get progress_lock (such as during
+ * open_ep, close_ep or rcvthread).
+ */
+int64_t psm2_verbs_evict_some(psm2_ep_t ep, uint64_t length, int access)
+{
+	static __thread psm2_ep_t last_evict_ep;	// among all eps
+	static __thread psm2_ep_t last_evict_myuser_ep;	// in my user ep
+	int64_t evicted = 0;
+	int ret;
+
+	if (! (access & IBV_ACCESS_IS_GPU_ADDR)) {
+		errno = ENOENT;
+		return -1;	// only need evictions on GPU addresses
+	}
+	if (! last_evict_ep) {	// first call only
+		last_evict_ep = ep;
+		last_evict_myuser_ep = ep;
+	}
+	// 1st try to evict from 1st rail/QP in our opened EP (gdrcopy and MRs)
+	ret = __psm2_rv_evict_gpu_amount(ep->mctxt_master->verbs_ep.rv, max(gpu_cache_evict, length), 0);
+	if (ret > 0)
+		evicted = ret;
+
+	// next rotate among other rails/QPs in our opened ep (MRs)
+	last_evict_myuser_ep = last_evict_myuser_ep->mctxt_next;
+	if (last_evict_myuser_ep != ep->mctxt_master) {
+		ret = __psm2_rv_evict_gpu_amount(last_evict_myuser_ep->verbs_ep.rv, max(gpu_cache_evict, length), 0);
+		if (ret > 0)
+			evicted += ret;
+	}
+	if (evicted >= length)
+		return evicted;
+
+	// now try other opened EPs
+	if (PSMI_LOCK_TRY(psmi_creation_lock))
+		goto done;
+	// last_evict_ep could point to an ep which has since been closed/freed
+ 	last_evict_ep = next_valid_ep(last_evict_ep, ep);
+	if (last_evict_ep->mctxt_master != ep->mctxt_master) {
+		if (!PSMI_LOCK_TRY(last_evict_ep->mq->progress_lock)) {
+			ret = __psm2_rv_evict_gpu_amount(last_evict_ep->verbs_ep.rv, max(gpu_cache_evict, length), 0);
+			PSMI_UNLOCK(last_evict_ep->mq->progress_lock);
+			if (ret > 0)
+				evicted += ret;
+		}
+	} else {
+		ret = __psm2_rv_evict_gpu_amount(last_evict_ep->verbs_ep.rv, max(gpu_cache_evict, length), 0);
+		if (ret > 0 )
+			evicted += ret;
+	}
+	PSMI_UNLOCK(psmi_creation_lock);
+done:
+	if (! evicted) {
+		errno = ENOENT;
+		return -1;
+	}
+	return evicted;
+}
+#endif
+#endif
+
+// each attempt will increment exactly one of: hit, miss, rejected, full, failed
+struct psm2_verbs_mr * psm2_verbs_reg_mr(psm2_mr_cache_t cache,
+				bool priority, struct ibv_pd *pd,
+				void *addr, uint64_t length, int access)
+{
+	psm2_verbs_mr_t mrc;
+
+#ifdef PSM_FI
+	if_pf(PSMI_FAULTINJ_ENABLED_EP(cache->ep)) {
+		PSMI_FAULTINJ_STATIC_DECL(fi_reg_mr, "reg_mr",
+				"MR cache full, any request type",
+				1, IPS_FAULTINJ_REG_MR);
+		if_pf(PSMI_FAULTINJ_IS_FAULT(fi_reg_mr, "")) {
+			cache->failed++;
+			errno = ENOMEM;
+			return NULL;
+		}
+	}
+	if_pf(!priority && PSMI_FAULTINJ_ENABLED_EP(cache->ep)) {
+		PSMI_FAULTINJ_STATIC_DECL(fi_nonpri_reg_mr, "nonpri_reg_mr",
+				"MR cache full, non-priority request",
+				1, IPS_FAULTINJ_NONPRI_REG_MR);
+		if_pf(PSMI_FAULTINJ_IS_FAULT(fi_nonpri_reg_mr, "")) {
+			cache->failed++;
+			errno = ENOMEM;
+			return NULL;
+		}
+	}
+	if_pf(priority && PSMI_FAULTINJ_ENABLED_EP(cache->ep)) {
+		PSMI_FAULTINJ_STATIC_DECL(fi_pri_reg_mr, "pri_reg_mr",
+				"MR cache full, priority request",
+				1, IPS_FAULTINJ_PRI_REG_MR);
+		if_pf(PSMI_FAULTINJ_IS_FAULT(fi_pri_reg_mr, "")) {
+			cache->failed++;
+			errno = ENOMEM;
+			return NULL;
+		}
+	}
+#endif // PSM_FI
+	access |= IBV_ACCESS_LOCAL_WRITE;	// mandatory flag
+#ifndef RNDV_MOD
+	if (access & IBV_ACCESS_IS_GPU_ADDR) {
+		_HFI_ERROR("unsupported GPU memory registration\n");
+		cache->failed++;
+		errno = EINVAL;
+		return NULL;
+	}
+#else
+#ifdef PSM_CUDA
+	psmi_assert(!!(access & IBV_ACCESS_IS_GPU_ADDR) == (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(addr)));
+#endif
+#endif
+	struct psm2_verbs_mr key = { // our search key
+		.addr = addr,
+		.length = length,
+		.access = access
+	};
+	// for user QPs, can share entries with send DMA and send RDMA
+#ifdef RNDV_MOD
+	if (cache->cache_mode != MR_CACHE_MODE_RV)
+		key.access &= ~IBV_ACCESS_RDMA;
+#else
+	key.access &= ~IBV_ACCESS_RDMA;
+#endif
+
+	cl_map_item_t *p_item = ips_cl_qmap_searchv(&cache->map, &key);
+	if (p_item->payload.mr.mr_ptr) {
+		psmi_assert(p_item != cache->map.nil_item);
+		mrc = &p_item->payload;
+		if (! mrc->refcount) {
+			if (! priority && ! have_space(cache, length, access)) {
+				_HFI_MMDBG("cache has no headroom for non-priority hit addr %p len %"PRIu64" access 0x%x ptr %p\n",
+						addr, length, access, mrc);
+				cache->rejected++;
+				errno = ENOMEM;
+				return NULL;
+			}
+			// it was an entry on avail_list, take off list
+			TAILQ_REMOVE(&cache->avail_list, mrc, next);
+			update_stats_inc_inuse(cache, length, access);
+		}
+		cache->hit++;
+		mrc->refcount++;
+		_HFI_MMDBG("cache hit MR addr %p len %"PRIu64" access 0x%x ref %d ptr %p\n",
+			addr, length, access, mrc->refcount, mrc);
+		cache->max_refcount = max(cache->max_refcount, mrc->refcount);
+		return mrc;
+	}
+	psmi_assert(p_item == cache->map.nil_item);
+	if (! priority && ! have_space(cache, length, access)) {
+		_HFI_MMDBG("cache has no headroom for non-priority miss addr %p len %"PRIu64" access 0x%x\n",
+			addr, length, access);
+		cache->rejected++;
+		errno = ENOMEM;
+		return NULL;
+	}
+	// we only reuse entries from avail_list once the cache is full;
+	// this helps improve the cache hit rate.
+	// we only have items on avail_list when cache_mode==MR_CACHE_MODE_USER
+	if (cache->map.payload.nelems >= cache->max_entries) {
+		int ret;
+		mrc = TAILQ_FIRST(&cache->avail_list);
+		if (! mrc) {
+			_HFI_MMDBG("user space MR cache full\n");
+			cache->full++;
+			errno = ENOMEM;
+			return NULL;
+		}
+		p_item = container_of(mrc, cl_map_item_t, payload);
+		psmi_assert(mrc->mr.mr_ptr);
+		psmi_assert(! mrc->refcount);
+		_HFI_MMDBG("reuse avail MR addr %p len %"PRIu64" access 0x%x ptr %p\n",
+					addr, length, access, mrc);
+		ips_cl_qmap_remove_item(&mrc->cache->map, p_item);
+		TAILQ_REMOVE(&cache->avail_list, mrc, next);
+#ifdef RNDV_MOD
+		if (cache->cache_mode == MR_CACHE_MODE_KERNEL
+			|| cache->cache_mode == MR_CACHE_MODE_RV)	// should not happen
+			ret = __psm2_rv_dereg_mem(cache->rv, mrc->mr.rv_mr);
+		else
+#endif
+			ret = ibv_dereg_mr(mrc->mr.ibv_mr);
+		if (ret) {
+			_HFI_ERROR("unexpected dereg_mr failure: %s\n", strerror(errno));
+			cache->failed++;
+			errno = EIO;
+			// MR is fouled up, we leak the MR and free the cache entry
+			// caller will try again later
+			mrc->mr.mr_ptr = NULL;
+			psmi_mpool_put(p_item);
+			return NULL;
+		}
+		mrc->mr.mr_ptr = NULL;
+	} else {
+		// allocate a new item
+		p_item = (cl_map_item_t *)psmi_mpool_get(cache->mr_pool);
+		if (! p_item) {	// keep KW (Klocwork) happy; should not happen, we check max above
+			_HFI_ERROR("unexpected cache pool allocate failure\n");
+			cache->failed++;
+			return NULL;
+		}
+		mrc = &p_item->payload;
+		// we initialize mrc below
+		cache->max_nelems = max(cache->max_nelems, cache->map.payload.nelems+1);
+	}
+#ifdef RNDV_MOD
+	/* need cmd_fd for access to ucontext when converting user pd into kernel pd */
+	if (cache->cache_mode == MR_CACHE_MODE_KERNEL) {
+		// user space QPs for everything, drop IBV_ACCESS_RDMA flag
+		mrc->mr.rv_mr = __psm2_rv_reg_mem(cache->rv, cache->cmd_fd, pd, addr, length, access & ~IBV_ACCESS_RDMA);
+		if (! mrc->mr.rv_mr) {
+			int save_errno = errno;
+			if (errno == ENOMEM) {
+				cache->full++;
+#ifdef PSM_CUDA
+				if (priority)
+					(void)psm2_verbs_evict_some(cache->ep, length, access);
+#endif
+			} else {
+				_HFI_ERROR("reg_mr failed; %s acc 0x%x\n", strerror(errno), access);
+				cache->failed++;
+			}
+			psmi_mpool_put(p_item);
+			errno = save_errno;
+			return NULL;
+		}
+		mrc->iova = mrc->mr.rv_mr->iova;
+		mrc->lkey = mrc->mr.rv_mr->lkey;
+		mrc->rkey = mrc->mr.rv_mr->rkey;
+	} else if (cache->cache_mode == MR_CACHE_MODE_RV) {
+		// kernel QP for RDMA, user QP for send DMA
+		mrc->mr.rv_mr = __psm2_rv_reg_mem(cache->rv, cache->cmd_fd, (access&IBV_ACCESS_RDMA)?NULL:pd, addr, length, access);
+		if (! mrc->mr.rv_mr) {
+			int save_errno = errno;
+			if (errno == ENOMEM) {
+				cache->full++;
+#ifdef PSM_CUDA
+				if (priority)
+					(void)psm2_verbs_evict_some(cache->ep, length, access);
+#endif
+			} else {
+				_HFI_ERROR("reg_mr failed; %s acc 0x%x\n", strerror(errno), access);
+				cache->failed++;
+			}
+			psmi_mpool_put(p_item);
+			errno = save_errno;
+			return NULL;
+		}
+		mrc->iova = mrc->mr.rv_mr->iova;
+		mrc->lkey = mrc->mr.rv_mr->lkey;
+		mrc->rkey = mrc->mr.rv_mr->rkey;
+	} else
+#endif
+	{
+		// user space QPs for everything, drop IBV_ACCESS_RDMA flag
+		mrc->mr.ibv_mr = ibv_reg_mr(pd, addr, length, access & ~IBV_ACCESS_RDMA);
+		if (! mrc->mr.ibv_mr) {
+			int save_errno = errno;
+			if (errno == ENOMEM) {
+				cache->full++;
+			} else {
+				_HFI_ERROR("reg_mr failed; %s acc 0x%x\n", strerror(errno), access);
+				cache->failed++;
+			}
+			psmi_mpool_put(p_item);
+			errno = save_errno;
+			return NULL;
+		}
+		mrc->iova = (uintptr_t)addr;
+		mrc->lkey = mrc->mr.ibv_mr->lkey;
+		mrc->rkey = mrc->mr.ibv_mr->rkey;
+	}
+	cache->miss++;
+	mrc->cache = cache;
+	mrc->refcount = 1;
+	mrc->addr = addr;
+	mrc->length = length;
+#ifdef RNDV_MOD
+	if (cache->cache_mode != MR_CACHE_MODE_RV)
+		mrc->access = access & ~IBV_ACCESS_RDMA;
+	else
+		mrc->access = access;
+#else
+	mrc->access = access & ~IBV_ACCESS_RDMA;
+#endif
+	ips_cl_qmap_insert_item(&cache->map, p_item);
+	update_stats_inc_inuse(cache, length, access);
+	_HFI_MMDBG("registered new MR pri %d addr %p len %"PRIu64" access 0x%x ref %u ptr %p nelems %u\n",
+		priority, addr, length, access, mrc->refcount, mrc, cache->map.payload.nelems);
+	return mrc;
+}
+
+int psm2_verbs_release_mr(struct psm2_verbs_mr *mrc)
+{
+	int ret = 0;
+	if (! mrc) {
+		errno = EINVAL;
+		return -1;
+	}
+	if (! mrc->refcount) {
+		errno = ENXIO;
+		return -1;
+	}
+	_HFI_MMDBG("releasing MR addr %p len %"PRIu64" access 0x%x ref %u-- ptr %p\n",
+		mrc->addr, mrc->length, mrc->access, mrc->refcount, mrc);
+	mrc->refcount--;
+	if (!mrc->refcount) {
+		if (mrc->cache->cache_mode == MR_CACHE_MODE_USER) {
+			// if refcount now zero, put on avail_list to be reclaimed if needed
+			update_stats_dec_inuse(mrc->cache, mrc->length, mrc->access);
+			TAILQ_INSERT_TAIL(&mrc->cache->avail_list, mrc, next);
+		} else {
+			_HFI_MMDBG("freeing MR addr %p len %"PRIu64" access 0x%x ref %u ptr %p nelems %u\n",
+				mrc->addr, mrc->length, mrc->access, mrc->refcount, mrc,
+				mrc->cache->map.payload.nelems);
+			update_stats_dec_inuse(mrc->cache, mrc->length, mrc->access);
+			cl_map_item_t *p_item = container_of(mrc, cl_map_item_t, payload);
+			ips_cl_qmap_remove_item(&mrc->cache->map, p_item);
+#ifdef RNDV_MOD
+			if (mrc->cache->cache_mode == MR_CACHE_MODE_KERNEL
+					|| mrc->cache->cache_mode == MR_CACHE_MODE_RV)
+				ret = __psm2_rv_dereg_mem(mrc->cache->rv, mrc->mr.rv_mr);
+			else
+#endif
+				ret = ibv_dereg_mr(mrc->mr.ibv_mr);
+			if (ret) {
+				// nasty choice, do we leak the MR or leak the cache entry
+				// we chose to leak the MR and free the cache entry
+				_HFI_ERROR("unexpected dereg_mr failure on %s: %s\n", mrc->cache->ep->dev_name, strerror(errno));
+				errno = EIO;
+				ret = -1;
+			}
+			mrc->mr.mr_ptr = NULL;
+			psmi_mpool_put(p_item);
+		}
+	}
+	return ret;
+}
+
+void psm2_verbs_free_mr_cache(psm2_mr_cache_t cache)
+{
+#ifdef PSM_CUDA
+#ifdef RNDV_MOD
+	if (cache->rv && PSMI_IS_CUDA_ENABLED)
+		psmi_stats_deregister_type(PSMI_STATSTYPE_MR_CACHE,
+					&cache->rv_gpu_stats);
+#endif
+#endif
+	psmi_stats_deregister_type(PSMI_STATSTYPE_MR_CACHE, cache);
+	while (cache->map.payload.nelems) {
+		cl_map_item_t *p_item = __cl_map_root(&cache->map);
+		psmi_assert(p_item != cache->map.nil_item);
+		psm2_verbs_mr_t mrc = &p_item->payload;
+		psmi_assert(mrc->mr.mr_ptr);
+		if (mrc->mr.mr_ptr) {
+			int ret;
+			_HFI_MMDBG("free MR addr %p len %"PRIu64" access 0x%x ref %u ptr %p\n",
+				mrc->addr, mrc->length, mrc->access, mrc->refcount, mrc);
+			if (mrc->refcount) {
+				_HFI_ERROR("unreleased MR in psm2_verbs_free_mr_cache addr %p len %"PRIu64" access 0x%x\n",
+					mrc->addr, mrc->length, mrc->access);
+				return; // leak the rest, let process exit cleanup
+			}
+			mrc->refcount = 0;
+			cl_map_item_t *p_item = container_of(mrc, cl_map_item_t, payload);
+			ips_cl_qmap_remove_item(&cache->map, p_item);
+			TAILQ_REMOVE(&cache->avail_list, mrc, next);
+#ifdef RNDV_MOD
+			if (cache->cache_mode == MR_CACHE_MODE_KERNEL
+					|| cache->cache_mode == MR_CACHE_MODE_RV)
+				ret = __psm2_rv_dereg_mem(cache->rv, mrc->mr.rv_mr);
+			else
+#endif
+				ret = ibv_dereg_mr(mrc->mr.ibv_mr);
+			if (ret)
+				_HFI_ERROR("unexpected dereg_mr failure on %s: %s\n", mrc->cache->ep->dev_name, strerror(errno));
+			mrc->mr.mr_ptr = NULL;
+			psmi_mpool_put(p_item);
+		}
+	}
+	psmi_assert(TAILQ_EMPTY(&cache->avail_list));
+	psmi_assert(! cache->map.payload.nelems);
+
+	psmi_mpool_destroy(cache->mr_pool);
+	psmi_free(cache);
+}
diff --git a/deps/libfabric/prov/psm3/psm3/psm_verbs_mr.h b/deps/libfabric/prov/psm3/psm3/psm_verbs_mr.h
new file mode 100644
index 0000000000000000000000000000000000000000..98f4ada8b14449365f4cfdeb95ece1f1ff7ae9a5
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psm_verbs_mr.h
@@ -0,0 +1,183 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+
+#ifndef _PSMI_IN_USER_H
+#error psm_verbs_mr.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_VERBS_MR_H
+#define _PSMI_VERBS_MR_H
+
+#include <infiniband/verbs.h>
+#ifdef RNDV_MOD
+#include <psm_rndv_mod.h>
+#define IBV_ACCESS_RDMA IBV_ACCESS_KERNEL
+#else
+#define IBV_ACCESS_RDMA 0
+// pick a flag value unused by verbs.h
+#define IBV_ACCESS_IS_GPU_ADDR 0x10000000
+#endif
+
+#define MR_CACHE_MODE_NONE 0	// user space MRs, but no caching
+#define MR_CACHE_MODE_KERNEL 1	// kernel MR cache in rendezvous module
+#define MR_CACHE_MODE_USER 2	// user space MR cache (demo quality only)
+#define MR_CACHE_MODE_RV 3	// kernel MRs for kernel rendezvous module QPs
+#define MR_CACHE_MODE_VALID(mode) ((unsigned)(mode) <= 3)
+
+// This performs memory registration for RDMA Rendezvous when PSM3_RDMA is enabled.
+// Priority registration calls are those issued immediately before the data
+// transfer, hence delaying their registration directly delays IOs.
+// Non-priority calls are those registering the whole IO
+// prior to sending/receiving the CTS.  Delays in non-priority
+// calls have less direct impact on IO delays.
+// Numbers and size limits for priority registrations can be directly
+// estimated since limits on outstanding RDMAs and their sizes constrain them.
+// Non-priority registrations and sizes are a function of application design
+// and how many concurrent MPI_ISend/IRecv are outstanding and of what size.
+// Given defaults, priority registration limits will be 160 entries of
+// 128K each for a total of < 20MB of the kernel cache.
+// All non-priority registrations eventually get used for IOs and become
+// a priority use case, so attempting to track whether each entry in the cache
+// is a priority or non-priority entry is tricky, especially since there
+// can be both priority and non-priority references.  So instead of attempting
+// to track the number of priority and non-priority entries, we simply
+// track the current inuse entries and only allow non-priority registrations
+// when we have a reasonable amount of headroom.  This way most priority
+// registrations will succeed.
+//
+// access indicates the purpose and permissions for the MR
+// IBV_ACCESS_RDMA - send RDMA
+// IBV_ACCESS_RDMA|IBV_ACCESS_REMOTE_WRITE - recv RDMA
+// 0 - send DMA
+// | IBV_ACCESS_IS_GPU_ADDR - GPU variation of any of the above 3
+// When using RV kernel QPs, IBV_ACCESS_RDMA (== IBV_ACCESS_KERNEL) is passed
+// to RV.  Otherwise, it is omitted and a user space accessible MR is created.
+// When using only user space QPs, we allow send RDMA and send DMA to share MRs.
+
+// the pointer to psm2_verbs_mr itself is the handle for subsequent release
+struct psm2_verbs_mr {
+	// fields for use by caller
+	// TBD - review use, we have rkey/lkey here and in mr itself, don't need both
+	uint64_t iova;	// used by caller
+	uint32_t lkey;	// used by caller
+	uint32_t rkey;	// used by caller
+	// private fields below are not for use by caller
+	// for kernel rendezvous this might just be a kernel handle and this
+	// information may be private in the kernel
+	union {
+		void *mr_ptr;		// for simple test of != NULL or clearing to NULL
+#ifdef RNDV_MOD
+		// when cache_mode = MR_CACHE_MODE_KERNEL
+		psm2_rv_mr_t rv_mr;	// internally we can get addr, length and pd from here
+#endif
+		// when cache_mode = MR_CACHE_MODE_NONE or MR_CACHE_MODE_USER
+		struct ibv_mr *ibv_mr;	// internally we can get addr, length and pd from here
+	} mr;
+	struct psm2_mr_cache *cache;	// TBD could have caller pass to release
+	uint32_t refcount;
+	// this structure will be used as a search key too, so must include
+	// addr and length directly since search key object won't have an mr ptr
+	// also addr is used in callers to translate remote addr returned in CTS
+	void *addr;
+	uint64_t length;
+	uint32_t access;
+	// below is for the queue of cache entries available for reuse (refcount==0),
+	// only used when cache_mode==MR_CACHE_MODE_USER
+	TAILQ_ENTRY(psm2_verbs_mr) next;
+};
+typedef struct psm2_verbs_mr *psm2_verbs_mr_t;
+
+// cache is kept opaque since it has some rbtree fields in it
+struct psm2_mr_cache;
+typedef struct psm2_mr_cache *psm2_mr_cache_t;
+
+extern psm2_mr_cache_t psm2_verbs_alloc_mr_cache(psm2_ep_t ep,
+				uint32_t num_entries, uint8_t cache_mode,
+				uint32_t pri_entries, uint64_t pri_size
+#ifdef PSM_CUDA
+				, uint64_t gpu_pri_size
+#endif
+				);
+extern int psm2_verbs_mr_cache_allows_user_mr(psm2_mr_cache_t cache);
+
+#ifdef PSM_CUDA
+extern int64_t psm2_verbs_evict_some(psm2_ep_t ep, uint64_t length, int access);
+#endif
+
+// pd can be the verbs_ep.pd or NULL to use the RV module's kernel pd
+extern psm2_verbs_mr_t psm2_verbs_reg_mr(psm2_mr_cache_t cache,
+				bool priority, struct ibv_pd *pd,
+				void *addr, uint64_t length, int access);
+static inline psm2_verbs_mr_t psm2_verbs_ref_mr(psm2_verbs_mr_t mr) {
+	mr->refcount++;
+	_HFI_MMDBG("cache hit MR addr %p len %"PRIu64" access 0x%x ref %d ptr %p\n",
+		mr->addr, mr->length, mr->access, mr->refcount, mr);
+	return mr;
+}
+extern int psm2_verbs_release_mr(psm2_verbs_mr_t mrc);
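+// Editor's note (usage sketch; cache/pd/buf/len are assumed to be set up
+// elsewhere):
+//   psm2_verbs_mr_t mr = psm2_verbs_reg_mr(cache, 1 /* priority */, pd,
+//                                          buf, len, IBV_ACCESS_RDMA);
+//   if (mr) {
+//       /* use mr->lkey, mr->rkey and mr->iova in work requests */
+//       (void)psm2_verbs_release_mr(mr);
+//   }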
+#ifdef RNDV_MOD
+// can the given MR be used for user space send DMA
+static inline int psm2_verbs_user_space_mr(struct psm2_verbs_mr *mrc)
+{
+	psmi_assert(mrc);
+	psmi_assert(mrc->refcount);
+	return ! (mrc->access & IBV_ACCESS_KERNEL);
+}
+#endif
+extern void psm2_verbs_free_mr_cache(psm2_mr_cache_t cache);
+void ips_tid_mravail_callback(struct ips_proto *proto);
+
+#endif // _PSMI_VERBS_MR_H
diff --git a/deps/libfabric/prov/psm3/psm3/psmi_wrappers.c b/deps/libfabric/prov/psm3/psm3/psmi_wrappers.c
new file mode 100644
index 0000000000000000000000000000000000000000..ba2b0a6224e74b9e0860bf24b038d31830ea5ff9
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psmi_wrappers.c
@@ -0,0 +1,94 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <stdlib.h>
+#include <unistd.h>
+#include "psmi_wrappers.h"
+#include <sys/ioctl.h>
+
+/* The following indirection wrappers for external functions
+ * are only created if this is a mocking-tests build.
+ */
+#ifdef PSM2_MOCK_TESTING
+
+void MOCKABLE(psmi_exit)(int status)
+{
+	exit(status);
+}
+MOCK_DEF_EPILOGUE(psmi_exit);
+
+ssize_t MOCKABLE(psmi_write)(int fd, const void *buf, size_t count)
+{
+	return write(fd, buf, count);
+}
+MOCK_DEF_EPILOGUE(psmi_write);
+
+int MOCKABLE(psmi_ioctl)(int fd, unsigned int cmd, unsigned long arg)
+{
+	return ioctl(fd, cmd, arg);
+}
+MOCK_DEF_EPILOGUE(psmi_ioctl);
+
+int MOCKABLE(psmi_sigaction)(int signum, const struct sigaction *act, struct sigaction *oldact)
+{
+	return sigaction(signum, act, oldact);
+}
+MOCK_DEF_EPILOGUE(psmi_sigaction);
+
+void MOCKABLE(psmi_rmb)(void)
+{
+	ips_rmb();
+}
+MOCK_DEF_EPILOGUE(psmi_rmb);
+
+#endif /* def PSM2_MOCK_TESTING */
diff --git a/deps/libfabric/prov/psm3/psm3/psmi_wrappers.h b/deps/libfabric/prov/psm3/psm3/psmi_wrappers.h
new file mode 100644
index 0000000000000000000000000000000000000000..68f11c8109a16865c3740ae639c01fd33631dda6
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/psmi_wrappers.h
@@ -0,0 +1,98 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef _PSMI_WRAPPERS_H
+#define _PSMI_WRAPPERS_H
+
+#include <signal.h>
+#include "psm2_mock_testing.h"
+#include "opa_intf.h"
+
+#if defined( IB_IOCTL_MAGIC )
+#include <sys/ioctl.h>
+#endif
+
+/* If this is a mocking tests build, we introduce "incision points"
+ * through which we can easily mock external dependencies.
+ * For non-mocking-tests builds, we bypass those indirections
+ * for performance reasons.
+ */
+
+#ifdef PSM2_MOCK_TESTING
+void MOCKABLE(psmi_exit)(int status);
+MOCK_DCL_EPILOGUE(psmi_exit);
+
+ssize_t MOCKABLE(psmi_write)(int fd, const void *buf, size_t count);
+MOCK_DCL_EPILOGUE(psmi_write);
+
+int MOCKABLE(psmi_ioctl)(int fd, unsigned int cmd, unsigned long arg);
+MOCK_DCL_EPILOGUE(psmi_ioctl);
+
+int MOCKABLE(psmi_sigaction)(int signum, const struct sigaction *act, struct sigaction *oldact);
+MOCK_DCL_EPILOGUE(psmi_sigaction);
+
+void MOCKABLE(psmi_rmb)(void);
+MOCK_DCL_EPILOGUE(psmi_rmb);
+
+#else /* def PSM2_MOCK_TESTING */
+
+#define psmi_exit	exit
+#define psmi_write	write
+#define psmi_ioctl	ioctl
+#define psmi_sigaction	sigaction
+#define psmi_rmb 	ips_rmb
+
+#endif /* def PSM2_MOCK_TESTING */
+
+#endif // _PSMI_WRAPPERS_H
+
diff --git a/deps/libfabric/prov/psm3/psm3/ptl.h b/deps/libfabric/prov/psm3/psm3/ptl.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c2f1487b6214db80368363188645579a9334baa
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl.h
@@ -0,0 +1,405 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/* Interface implemented by Packet Transport layers such as
+ * ips and active messages.
+ *
+ * This interface can be volatile: it is never seen by PSM clients, and it
+ * will probably change as the AM ptl is developed.
+ */
+
+#ifndef PSM_PTL_H
+#define PSM_PTL_H
+#include <inttypes.h>
+#include <psm2.h>
+#include <psm2_mq.h>
+#include <psm2_am.h>
+#include <psm_help.h>
+
+/* We currently have 3 PTLs, 0 is reserved. */
+#define PTL_DEVID_IPS  1
+#define PTL_DEVID_AMSH 2
+#define PTL_DEVID_SELF 3
+
+/* We can currently initialize up to 3 PTLs */
+#define PTL_MAX_INIT	3
+
+/* struct ptl is an incomplete type that serves as a generic, opaque
+   container.  It should remain an incomplete type throughout the psm
+   source base.  Concrete ptl types need a suffix such as ptl_self or
+   ptl_ips (see the sketch after these forward declarations). */
+struct ptl;
+typedef struct ptl ptl_t;
+
+struct ptl_ctl;
+typedef struct ptl_ctl ptl_ctl_t;
+
+struct ptl_mq_req;
+typedef struct ptl_mq_req ptl_mq_req_t;
+
+struct ips_proto;
+typedef struct ips_proto ips_proto_t;
+
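+/* Illustrative sketch (not compiled): a concrete PTL defines struct ptl
+ * privately, so every other translation unit can only hold a ptl_t
+ * pointer.  The members shown are hypothetical.
+ */
+#if 0
+/* private to one PTL implementation: */
+struct ptl {
+	psm2_ep_t ep;
+	/* ...PTL-specific state... */
+};
+
+/* elsewhere, only the opaque pointer is visible: */
+void example_caller(ptl_t *ptl);
+#endif
+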
+/* To be filled in statically by all PTLs */
+struct ptl_ctl_init {
+	size_t(*sizeof_ptl) (void);
+
+	psm2_error_t(*init) (const psm2_ep_t ep, ptl_t *ptl, ptl_ctl_t *ctl);
+
+	psm2_error_t(*fini) (ptl_t *ptl, int force, uint64_t timeout_ns);
+
+	psm2_error_t
+	    (*setopt) (const void *component_obj, int optname,
+		       const void *optval, uint64_t optlen);
+
+	psm2_error_t
+	    (*getopt) (const void *component_obj, int optname,
+		       void *optval, uint64_t *optlen);
+};
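+
+/* Illustrative sketch (not compiled): each PTL fills in one statically
+ * initialized ptl_ctl_init.  The my_ptl_* entry points are hypothetical.
+ */
+#if 0
+static size_t my_ptl_sizeof(void) { return sizeof(struct ptl); }
+
+struct ptl_ctl_init my_ptl_init_table = {
+	.sizeof_ptl = my_ptl_sizeof,
+	.init = my_ptl_init,
+	.fini = my_ptl_fini,
+	.setopt = my_ptl_setopt,
+	.getopt = my_ptl_getopt,
+};
+#endif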
+
+struct ptl_ctl_rcvthread {
+	uint32_t(*is_enabled) (const ptl_t *ptl);
+	void(*transfer_ownership) (ptl_t *from_ptl, ptl_t *to_ptl);
+};
+
+typedef
+struct ptl_arg {
+	union {
+		struct {
+			uint16_t u16w3;
+			uint16_t u16w2;
+			uint16_t u16w1;
+			uint16_t u16w0;
+		} PACK_SUFFIX;
+		struct {
+			uint32_t u32w1;
+			uint32_t u32w0;
+		} PACK_SUFFIX;
+		uint64_t u64w0;
+		uint64_t u64;
+		void *uptr;
+	};
+} PACK_SUFFIX ptl_arg_t;
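+
+/* Illustrative sketch (not compiled): the union lets one 64-bit argument be
+ * reinterpreted as two 32-bit or four 16-bit words without shifting; which
+ * word aliases which half depends on PACK_SUFFIX and endianness.
+ */
+#if 0
+static void example_ptl_arg(void)
+{
+	ptl_arg_t a;
+	a.u64 = 0x1122334455667788ULL;
+	uint32_t w0 = a.u32w0;	/* one 32-bit half of a.u64 */
+	uint16_t h3 = a.u16w3;	/* one 16-bit quarter of a.u64 */
+	(void)w0; (void)h3;
+}
+#endif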
+
+/* Can be tracked per protocol.  Only fully tracked and reported
+ * for ips_proto at this time, but by defining it here we can later track
+ * it for the shm and maybe self protocols too, and we avoid a branch in
+ * psmi_mq_handle_envelope.
+ */
+struct ptl_strategy_stats {
+	uint64_t tiny_cpu_isend;
+	uint64_t tiny_cpu_isend_bytes;
+#ifdef PSM_CUDA
+	uint64_t tiny_gdrcopy_isend;
+	uint64_t tiny_gdrcopy_isend_bytes;
+	uint64_t tiny_cuCopy_isend;
+	uint64_t tiny_cuCopy_isend_bytes;
+#endif
+	uint64_t tiny_cpu_send;
+	uint64_t tiny_cpu_send_bytes;
+#ifdef PSM_CUDA
+	uint64_t tiny_gdrcopy_send;
+	uint64_t tiny_gdrcopy_send_bytes;
+	uint64_t tiny_cuCopy_send;
+	uint64_t tiny_cuCopy_send_bytes;
+#endif
+
+	uint64_t tiny_cpu_recv;
+	uint64_t tiny_cpu_recv_bytes;
+	uint64_t tiny_sysbuf_recv;	/* to unexpected Q sysbuf */ /* incl 0 byte */
+	uint64_t tiny_sysbuf_recv_bytes;
+#ifdef PSM_CUDA
+	uint64_t tiny_gdrcopy_recv;
+	uint64_t tiny_gdrcopy_recv_bytes;
+	uint64_t tiny_cuCopy_recv;
+	uint64_t tiny_cuCopy_recv_bytes;
+#endif
+
+	uint64_t short_copy_cpu_isend;
+	uint64_t short_copy_cpu_isend_bytes;
+	uint64_t short_dma_cpu_isend;
+	uint64_t short_dma_cpu_isend_bytes;
+#ifdef PSM_CUDA
+	uint64_t short_gdrcopy_isend;
+	uint64_t short_gdrcopy_isend_bytes;
+	uint64_t short_cuCopy_send;
+	uint64_t short_cuCopy_send_bytes;
+	uint64_t short_gdr_send;
+	uint64_t short_gdr_send_bytes;
+#endif
+	uint64_t short_copy_cpu_send;
+	uint64_t short_copy_cpu_send_bytes;
+	uint64_t short_dma_cpu_send;
+	uint64_t short_dma_cpu_send_bytes;
+
+#ifdef PSM_CUDA
+	uint64_t short_gdrcopy_send;
+	uint64_t short_gdrcopy_send_bytes;
+	uint64_t short_cuCopy_isend;
+	uint64_t short_cuCopy_isend_bytes;
+	uint64_t short_gdr_isend;
+	uint64_t short_gdr_isend_bytes;
+#endif
+
+	uint64_t short_cpu_recv;
+	uint64_t short_cpu_recv_bytes;
+	uint64_t short_sysbuf_recv;	/* to unexpected Q sysbuf */
+	uint64_t short_sysbuf_recv_bytes;
+#ifdef PSM_CUDA
+	uint64_t short_gdrcopy_recv;
+	uint64_t short_gdrcopy_recv_bytes;
+	uint64_t short_cuCopy_recv;
+	uint64_t short_cuCopy_recv_bytes;
+#endif
+
+	uint64_t eager_copy_cpu_isend;
+	uint64_t eager_copy_cpu_isend_bytes;
+	uint64_t eager_dma_cpu_isend;
+	uint64_t eager_dma_cpu_isend_bytes;
+	uint64_t eager_sysbuf_recv;	/* to unexpected Q sysbuf */
+	uint64_t eager_sysbuf_recv_bytes;
+#ifdef PSM_CUDA
+	uint64_t eager_cuCopy_isend;
+	uint64_t eager_cuCopy_isend_bytes;
+	uint64_t eager_gdr_isend;
+	uint64_t eager_gdr_isend_bytes;
+#endif
+	uint64_t eager_copy_cpu_send;
+	uint64_t eager_copy_cpu_send_bytes;
+	uint64_t eager_dma_cpu_send;
+	uint64_t eager_dma_cpu_send_bytes;
+#ifdef PSM_CUDA
+	uint64_t eager_cuCopy_send;
+	uint64_t eager_cuCopy_send_bytes;
+	uint64_t eager_gdr_send;
+	uint64_t eager_gdr_send_bytes;
+#endif
+
+	uint64_t eager_cpu_recv;
+	uint64_t eager_cpu_recv_bytes;
+#ifdef PSM_CUDA
+	uint64_t eager_gdrcopy_recv;
+	uint64_t eager_gdrcopy_recv_bytes;
+	uint64_t eager_cuCopy_recv;
+	uint64_t eager_cuCopy_recv_bytes;
+#endif
+
+	uint64_t rndv_cpu_isend;
+	uint64_t rndv_cpu_isend_bytes;
+#ifdef PSM_CUDA
+	uint64_t rndv_gpu_isend;
+	uint64_t rndv_gpu_isend_bytes;
+#endif
+	uint64_t rndv_cpu_send;
+	uint64_t rndv_cpu_send_bytes;
+#ifdef PSM_CUDA
+	uint64_t rndv_gpu_send;
+	uint64_t rndv_gpu_send_bytes;
+#endif
+
+	/* Payload in RTS for small sync send */
+	uint64_t rndv_rts_cpu_recv;
+	uint64_t rndv_rts_cpu_recv_bytes;
+	uint64_t rndv_rts_sysbuf_recv;
+	uint64_t rndv_rts_sysbuf_recv_bytes;
+#ifdef PSM_CUDA
+	uint64_t rndv_rts_cuCopy_recv;
+	uint64_t rndv_rts_cuCopy_recv_bytes;
+#endif
+
+	/* Payload in RTS approach used by sender */
+	/* this approach uses a LONG DATA CTS, but sends no more data */
+	uint64_t rndv_rts_copy_cpu_send;	/* per CTS  (1 per RTS) */
+	uint64_t rndv_rts_copy_cpu_send_bytes;
+
+	/* LONG DATA approach selected by receiver */
+	uint64_t rndv_long_cpu_recv;	/* per RTS */
+	uint64_t rndv_long_cpu_recv_bytes;
+	uint64_t rndv_long_gpu_recv;	/* per RTS */
+	uint64_t rndv_long_gpu_recv_bytes;
+#ifdef PSM_CUDA
+	uint64_t rndv_long_cuCopy_recv;
+	uint64_t rndv_long_cuCopy_recv_bytes;
+	uint64_t rndv_long_gdr_recv;
+	uint64_t rndv_long_gdr_recv_bytes;
+#endif
+
+	/* LONG DATA approach used by sender after LONG selected by receiver */
+	/* LONG DATA only uses 1 CTS per RTS */
+	uint64_t rndv_long_copy_cpu_send;	/* per CTS  (1 per RTS) */
+	uint64_t rndv_long_copy_cpu_send_bytes;
+	uint64_t rndv_long_dma_cpu_send;	/* per CTS  (1 per RTS) */
+	uint64_t rndv_long_dma_cpu_send_bytes;
+#ifdef PSM_CUDA
+	uint64_t rndv_long_cuCopy_send;	/* per CTS  (1 per RTS) */
+	uint64_t rndv_long_cuCopy_send_bytes;
+	uint64_t rndv_long_gdrcopy_send;	/* per CTS  (1 per RTS) */
+	uint64_t rndv_long_gdrcopy_send_bytes;
+	uint64_t rndv_long_gdr_send;	/* per CTS  (1 per RTS) */ /* SDMA */
+	uint64_t rndv_long_gdr_send_bytes;		/* SDMA */
+#endif
+
+	/* RDMA approach selected by receiver */
+	uint64_t rndv_rdma_cpu_recv;	/* per RTS */
+	uint64_t rndv_rdma_cpu_recv_bytes;
+#ifdef PSM_CUDA
+	uint64_t rndv_rdma_gdr_recv;	/* per RTS */
+	uint64_t rndv_rdma_gdr_recv_bytes;
+	uint64_t rndv_rdma_hbuf_recv;	/* per RTS */
+	uint64_t rndv_rdma_hbuf_recv_bytes;
+#endif
+
+	/* RDMA approach used by sender after RDMA selected by receiver */
+	/* RDMA may use >= 1 CTS per RTS */
+	uint64_t rndv_rdma_cpu_send;	/* per CTS */
+	uint64_t rndv_rdma_cpu_send_bytes;
+#ifdef PSM_CUDA
+	uint64_t rndv_rdma_gdr_send;	/* per CTS */
+	uint64_t rndv_rdma_gdr_send_bytes;
+	uint64_t rndv_rdma_hbuf_send;	/* per CTS */
+	uint64_t rndv_rdma_hbuf_send_bytes;
+#endif
+};
+
+#include "ptl_self/ptl_fwd.h"
+#include "ptl_ips/ptl_fwd.h"
+#include "ptl_am/ptl_fwd.h"
+
+/* To be filled in as part of ptl_init */
+struct ptl_ctl {
+	ptl_t *ptl;		/* pointer to ptl */
+	psm2_ep_t ep;		/* pointer to ep */
+
+	/* EP-specific stuff */
+	 psm2_error_t(*ep_poll) (ptl_t *ptl, int replyonly);
+
+	/* PTL-level connect
+	 *
+	 * This PTL-level connect is slightly different from the top-level PSM connect.
+	 *
+	 * pre 1: Caller has masked off epids in epid array that are already
+	 *        connected at the PSM level.
+	 *
+	 * post 0: PTL has allocated all epaddrs and whatever internal ptladdr
+	 *         the ptl needs.
+	 * post 1: PTL marks error[i] as UNREACHABLE if PTL can't get to epid[i]
+	 * post 2: PTL marks error[i] as UNKNOWN for all epid[i] that couldn't
+	 *         be connected before a timeout occurred.
+	 * post 3: PTL returns OK if all epids are either OK or UNREACHABLE
+	 * post 4: PTL defines the content of epaddr[i] only if epaddr[i] is OK.
+	 */
+	 psm2_error_t(*ep_connect) (ptl_t *ptl,
+				   int num_ep,
+				   const psm2_epid_t input_array_of_epid[],
+				   const int array_of_epid_mask[],
+				   psm2_error_t output_array_of_errors[],
+				   psm2_epaddr_t output_array_of_epddr[],
+				   uint64_t timeout_ns);
+
+	 psm2_error_t (*ep_disconnect)(ptl_t *ptl,
+				       int force,
+				       int num_ep,
+				       psm2_epaddr_t input_array_of_epaddr[],
+				       const int array_of_epaddr_mask[],
+				       psm2_error_t output_array_of_errors[],
+				       uint64_t timeout_ns);
+
+	/* MQ stuff */
+	 psm2_error_t(*mq_send) (psm2_mq_t mq, psm2_epaddr_t dest,
+				uint32_t flags, psm2_mq_tag_t *stag,
+				const void *buf, uint32_t len);
+	 psm2_error_t(*mq_isend) (psm2_mq_t mq, psm2_epaddr_t dest,
+				  uint32_t flags_user, uint32_t flags_internal,
+				  psm2_mq_tag_t *stag, const void *buf,
+				  uint32_t len, void *ctxt, psm2_mq_req_t *req);
+
+#if 0	// unused code, specific to QLogic MPI
+	int (*epaddr_stats_num) (void);
+	int (*epaddr_stats_init) (char *desc[], uint16_t *flags);
+	int (*epaddr_stats_get) (psm2_epaddr_t epaddr, uint64_t *stats);
+#endif
+
+	/* AM stuff */
+	 psm2_error_t(*am_get_parameters) (psm2_ep_t ep,
+					  struct psm2_am_parameters *
+					  parameters);
+	 psm2_error_t(*am_short_request) (psm2_epaddr_t epaddr,
+					 psm2_handler_t handler,
+					 psm2_amarg_t *args, int nargs,
+					 void *src, size_t len, int flags,
+					 psm2_am_completion_fn_t completion_fn,
+					 void *completion_ctxt);
+	 psm2_error_t(*am_short_reply) (psm2_am_token_t token,
+				       psm2_handler_t handler,
+				       psm2_amarg_t *args, int nargs, void *src,
+				       size_t len, int flags,
+				       psm2_am_completion_fn_t completion_fn,
+				       void *completion_ctxt);
+	/* Long messages currently unsupported */
+#if 0
+	 psm2_error_t(*am_long_request) (psm2_epaddr_t epaddr,
+					psm2_handler_t handler,
+					psm2_amarg_t *args, int nargs,
+					void *src, size_t len, void *dest,
+					int flags);
+	 psm2_error_t(*am_long_reply) (psm2_am_token_t token,
+				      psm2_handler_t handler, psm2_amarg_t *args,
+				      int nargs, void *src, size_t len,
+				      void *dest, int flags);
+#endif
+	psm2_error_t (*msg_size_thresh_query) (enum psm2_info_query_thresh_et,
+					       uint32_t *out, psm2_mq_t mq, psm2_epaddr_t);
+};
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_am/am_config.h b/deps/libfabric/prov/psm3/psm3/ptl_am/am_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..d887118273ee9324b9bc7b806782ba1ac65a4d6f
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_am/am_config.h
@@ -0,0 +1,82 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2018 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2018 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef PTL_AM_AM_CONFIG_H
+#define PTL_AM_AM_CONFIG_H
+
+#include "psm_config.h"
+
+/*
+ * Can change the rendezvous threshold based on usage of cma (or not)
+ */
+#define PSMI_MQ_RV_THRESH_CMA      16000
+
+/* If no kernel assisted copy is available this is the rendezvous threshold */
+#define PSMI_MQ_RV_THRESH_NO_KASSIST 16000
+
+#define AMSH_HAVE_CMA   0x1
+#define AMSH_HAVE_KASSIST 0x1
+
+/* Each block reserves some space at the beginning to store auxiliary data */
+#define AMSH_BLOCK_HEADER_SIZE  4096
+
+/* AMLONG_SZ is the total size in memory of a bulk packet, including an
+ * am_pkt_bulk_t header struct.
+ * AMLONG_MTU is the number of bytes available in a bulk packet for payload. */
+#define AMLONG_SZ   8192
+#define AMLONG_MTU (AMLONG_SZ-sizeof(am_pkt_bulk_t))
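+
+/* Illustrative sketch (not compiled): since AMLONG_MTU bytes of payload fit
+ * per bulk packet, a length-n payload needs ceil(n / AMLONG_MTU) bulk
+ * packets (illustration only; the real send path does its own chunking).
+ */
+#if 0
+static size_t example_npackets(size_t n)
+{
+	return (n + AMLONG_MTU - 1) / AMLONG_MTU;
+}
+#endif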
+
+#define PSMI_KASSIST_MODE_DEFAULT PSMI_KASSIST_CMA_GET
+#define PSMI_KASSIST_MODE_DEFAULT_STRING  "cma-get"
+
+#endif /* PTL_AM_AM_CONFIG_H */
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.c b/deps/libfabric/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.c
new file mode 100644
index 0000000000000000000000000000000000000000..cfc68f209c87e84bb8bfb9f6cd03aa5cc4e4cfd1
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.c
@@ -0,0 +1,492 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef PSM_CUDA
+
+#include "psm_user.h"
+#include "am_cuda_memhandle_cache.h"
+
+/*
+ * rbtree cruft
+ */
+struct _cl_map_item;
+
+typedef struct
+{
+	unsigned long		start;		 /* start virtual address */
+	CUipcMemHandle		cuda_ipc_handle; /* cuda ipc mem handle */
+	CUdeviceptr		cuda_ipc_dev_ptr;/* Cuda device pointer */
+	uint16_t		length;	 /* length */
+	psm2_epid_t             epid;
+	struct _cl_map_item*	i_prev;	 /* idle queue previous */
+	struct _cl_map_item*	i_next;	 /* idle queue next */
+}__attribute__ ((aligned (128))) rbtree_cuda_memhandle_cache_mapitem_pl_t;
+
+typedef struct {
+	uint32_t		nelems;	/* number of elements in the cache */
+} rbtree_cuda_memhandle_cache_map_pl_t;
+
+static psm2_error_t am_cuda_memhandle_mpool_init(uint32_t memcache_size);
+
+/*
+ * Custom comparator
+ */
+typedef rbtree_cuda_memhandle_cache_mapitem_pl_t cuda_cache_item;
+
+static int cuda_cache_key_cmp(const cuda_cache_item *a, const cuda_cache_item *b)
+{
+	// When multi-ep is disabled, cache can assume
+	//   1 epid == 1 remote process == 1 CUDA address space
+	// But when multi-ep is enabled, one process can have many epids, so in
+	// that case we cannot use the epid as part of the cache key.
+	if (!psmi_multi_ep_enabled) {
+		if (a->epid < b->epid)
+			return -1;
+		if (a->epid > b->epid)
+			return 1;
+	}
+
+	unsigned long a_end, b_end;
+	// normalize into inclusive upper bounds to handle
+	// 0-length entries
+	a_end = (a->start + a->length);
+	b_end = (b->start + b->length);
+	if (a->length > 0)
+		a_end--;
+
+	if (b->length > 0)
+		b_end--;
+
+	if (a_end < b->start)
+		return -1;
+	if (b_end < a->start)
+		return 1;
+
+	return 0;
+}
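+
+/* Illustrative sketch (not compiled): two items compare equal whenever their
+ * [start, start+length) ranges overlap (and, without multi-ep, their epids
+ * match), so a ranged rbtree search finds any cached entry it intersects.
+ */
+#if 0
+static void example_cmp(void)
+{
+	cuda_cache_item a = { .start = 0x1000, .length = 0x100, .epid = 1 };
+	cuda_cache_item b = { .start = 0x10f0, .length = 0x100, .epid = 1 };
+	/* [0x1000,0x1100) and [0x10f0,0x11f0) overlap, so they are "equal" */
+	int r = cuda_cache_key_cmp(&a, &b);	/* r == 0 */
+	(void)r;
+}
+#endif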
+
+
+/*
+ * Necessary rbtree cruft
+ */
+#define RBTREE_MI_PL  rbtree_cuda_memhandle_cache_mapitem_pl_t
+#define RBTREE_MAP_PL rbtree_cuda_memhandle_cache_map_pl_t
+#define RBTREE_CMP(a,b) cuda_cache_key_cmp((a), (b))
+#define RBTREE_ASSERT                     psmi_assert
+#define RBTREE_MAP_COUNT(PAYLOAD_PTR)     ((PAYLOAD_PTR)->nelems)
+#define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR
+
+#include "psm3_rbtree.h"
+#include "psm3_rbtree.c"
+
+/*
+ * Convenience rbtree cruft
+ */
+#define NELEMS			cuda_memhandle_cachemap.payload.nelems
+
+#define IHEAD			cuda_memhandle_cachemap.root
+#define LAST			IHEAD->payload.i_prev
+#define FIRST			IHEAD->payload.i_next
+#define INEXT(x)		x->payload.i_next
+#define IPREV(x)		x->payload.i_prev
+
+/*
+ * Actual module data
+ */
+static cl_qmap_t cuda_memhandle_cachemap; /* Global cache */
+static uint8_t cuda_memhandle_cache_enabled;
+static mpool_t cuda_memhandle_mpool;
+static uint32_t cuda_memhandle_cache_size;
+
+static uint64_t cache_hit_counter;
+static uint64_t cache_miss_counter;
+static uint64_t cache_evict_counter;
+static uint64_t cache_collide_counter;
+static uint64_t cache_clear_counter;
+
+static void print_cuda_memhandle_cache_stats(void)
+{
+	_HFI_DBG("enabled=%u,size=%u,hit=%lu,miss=%lu,evict=%lu,collide=%lu,clear=%lu\n",
+		cuda_memhandle_cache_enabled, cuda_memhandle_cache_size,
+		cache_hit_counter, cache_miss_counter,
+		cache_evict_counter, cache_collide_counter, cache_clear_counter);
+}
+
+/*
+ * This callback runs when the mempool is resized or destroyed.
+ * Cache fini destroys the mpool, which in turn invokes this
+ * callback to close all open memhandles.
+ */
+static void
+psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj)
+{
+	cl_map_item_t* memcache_item = (cl_map_item_t*)obj;
+	if (!is_alloc) {
+		if(memcache_item->payload.start)
+			PSMI_CUDA_CALL(cuIpcCloseMemHandle,
+				       memcache_item->payload.cuda_ipc_dev_ptr);
+	}
+}
+
+/*
+ * Creating mempool for cuda memhandle cache nodes.
+ */
+static psm2_error_t
+am_cuda_memhandle_mpool_init(uint32_t memcache_size)
+{
+	psm2_error_t err;
+	if (memcache_size < 1)
+		return PSM2_PARAM_ERR;
+
+	cuda_memhandle_cache_size = memcache_size;
+	/* Creating a memory pool of size PSM3_CUDA_MEMCACHE_SIZE
+	 * which includes the Root and NIL items
+	 */
+	cuda_memhandle_mpool = psmi_mpool_create_for_cuda(sizeof(cl_map_item_t),
+					cuda_memhandle_cache_size,
+					cuda_memhandle_cache_size, 0,
+					UNDEFINED, NULL, NULL,
+					psmi_cuda_memhandle_cache_alloc_func,
+					NULL);
+	if (cuda_memhandle_mpool == NULL) {
+		err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY,
+				"Couldn't allocate CUDA host receive buffer pool");
+		return err;
+	}
+	return PSM2_OK;
+}
+
+/*
+ * Initialize rbtree.
+ */
+psm2_error_t am_cuda_memhandle_cache_init(uint32_t memcache_size)
+{
+	psm2_error_t err = am_cuda_memhandle_mpool_init(memcache_size);
+	if (err != PSM2_OK)
+		return err;
+
+	cl_map_item_t *root, *nil_item;
+	root = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t));
+	if (root == NULL)
+		return PSM2_NO_MEMORY;
+	nil_item = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t));
+	if (nil_item == NULL) {
+		psmi_free(root);
+		return PSM2_NO_MEMORY;
+	}
+
+	nil_item->payload.start = 0;
+	nil_item->payload.epid = 0;
+	nil_item->payload.length = 0;
+	cuda_memhandle_cache_enabled = 1;
+	ips_cl_qmap_init(&cuda_memhandle_cachemap,root,nil_item);
+	NELEMS = 0;
+
+	cache_hit_counter = 0;
+	cache_miss_counter = 0;
+	cache_evict_counter = 0;
+	cache_collide_counter = 0;
+	cache_clear_counter = 0;
+
+	return PSM2_OK;
+}
+
+void am_cuda_memhandle_cache_map_fini()
+{
+	print_cuda_memhandle_cache_stats();
+
+	if (cuda_memhandle_cachemap.nil_item) {
+		psmi_free(cuda_memhandle_cachemap.nil_item);
+		cuda_memhandle_cachemap.nil_item = NULL;
+	}
+
+	if (cuda_memhandle_cachemap.root) {
+		psmi_free(cuda_memhandle_cachemap.root);
+		cuda_memhandle_cachemap.root = NULL;
+	}
+
+	if (cuda_memhandle_cache_enabled) {
+		psmi_mpool_destroy(cuda_memhandle_mpool);
+		cuda_memhandle_cache_enabled = 0;
+	}
+
+	cuda_memhandle_cache_size = 0;
+}
+
+/*
+ * Insert at the head of Idleq.
+ */
+static void
+am_cuda_idleq_insert(cl_map_item_t* memcache_item)
+{
+	if (FIRST == NULL) {
+		FIRST = memcache_item;
+		LAST = memcache_item;
+		return;
+	}
+	INEXT(FIRST) = memcache_item;
+	IPREV(memcache_item) = FIRST;
+	FIRST = memcache_item;
+	INEXT(FIRST) = NULL;
+	return;
+}
+
+/*
+ * Remove the least recently used element.
+ */
+static void
+am_cuda_idleq_remove_last(cl_map_item_t* memcache_item)
+{
+	if (!INEXT(memcache_item)) {
+		LAST = NULL;
+		FIRST = NULL;
+	} else {
+		LAST = INEXT(memcache_item);
+		IPREV(LAST) = NULL;
+	}
+	// Null-out now-removed memcache_item's next and prev pointers out of
+	// an abundance of caution
+	INEXT(memcache_item) = IPREV(memcache_item) = NULL;
+}
+
+static void
+am_cuda_idleq_remove(cl_map_item_t* memcache_item)
+{
+	if (LAST == memcache_item) {
+		am_cuda_idleq_remove_last(memcache_item);
+	} else if (FIRST == memcache_item) {
+		FIRST = IPREV(memcache_item);
+		INEXT(FIRST) = NULL;
+	} else {
+		INEXT(IPREV(memcache_item)) = INEXT(memcache_item);
+		IPREV(INEXT(memcache_item)) = IPREV(memcache_item);
+	}
+	// Null-out now-removed memcache_item's next and prev pointers out of
+	// an abundance of caution
+	INEXT(memcache_item) = IPREV(memcache_item) = NULL;
+}
+
+static void
+am_cuda_idleq_reorder(cl_map_item_t* memcache_item)
+{
+	if (FIRST == memcache_item && LAST == memcache_item ) {
+		return;
+	}
+	am_cuda_idleq_remove(memcache_item);
+	am_cuda_idleq_insert(memcache_item);
+	return;
+}
+
+/*
+ * After a successful cache hit, the item is validated by doing a
+ * memcmp on the stored handle and the handle we receive from the
+ * sender. If the validation fails, the item is removed from the idleq
+ * and the rbtree, put back into the mpool, and cuIpcCloseMemHandle
+ * is called.
+ */
+static psm2_error_t
+am_cuda_memhandle_cache_validate(cl_map_item_t* memcache_item,
+				 uintptr_t sbuf, CUipcMemHandle* handle,
+				 uint32_t length, psm2_epid_t epid)
+{
+	if ((0 == memcmp(handle, &memcache_item->payload.cuda_ipc_handle,
+			 sizeof(CUipcMemHandle)))
+			 && sbuf == memcache_item->payload.start
+			 && epid == memcache_item->payload.epid) {
+		return PSM2_OK;
+	}
+	_HFI_DBG("cache collision: new entry start=%lu,length=%u\n", sbuf, length);
+
+	cache_collide_counter++;
+	ips_cl_qmap_remove_item(&cuda_memhandle_cachemap, memcache_item);
+	PSMI_CUDA_CALL(cuIpcCloseMemHandle,
+		       memcache_item->payload.cuda_ipc_dev_ptr);
+	am_cuda_idleq_remove(memcache_item);
+	memset(memcache_item, 0, sizeof(*memcache_item));
+	psmi_mpool_put(memcache_item);
+	return PSM2_OK_NO_PROGRESS;
+}
+
+/*
+ * Current eviction policy: Least Recently Used.
+ */
+static void
+am_cuda_memhandle_cache_evict(void)
+{
+	cache_evict_counter++;
+	cl_map_item_t *p_item = LAST;
+	_HFI_VDBG("Removing (epid=%lu,start=%lu,length=%u,dev_ptr=0x%llX,it=%p) from cuda_memhandle_cachemap.\n",
+		p_item->payload.epid, p_item->payload.start, p_item->payload.length,
+		p_item->payload.cuda_ipc_dev_ptr, p_item);
+	ips_cl_qmap_remove_item(&cuda_memhandle_cachemap, p_item);
+	PSMI_CUDA_CALL(cuIpcCloseMemHandle, p_item->payload.cuda_ipc_dev_ptr);
+	am_cuda_idleq_remove_last(p_item);
+	memset(p_item, 0, sizeof(*p_item));
+	psmi_mpool_put(p_item);
+}
+
+static psm2_error_t
+am_cuda_memhandle_cache_register(uintptr_t sbuf, CUipcMemHandle* handle,
+				 uint32_t length, psm2_epid_t epid,
+				 CUdeviceptr cuda_ipc_dev_ptr)
+{
+	if (NELEMS == cuda_memhandle_cache_size)
+		am_cuda_memhandle_cache_evict();
+
+	cl_map_item_t* memcache_item = psmi_mpool_get(cuda_memhandle_mpool);
+	/* memcache_item cannot be NULL since we evict
+	 * before the call to mpool_get. The check is kept
+	 * to satisfy Klocwork static analysis.
+	 */
+	if (memcache_item == NULL)
+		return PSM2_NO_MEMORY;
+	memcache_item->payload.start = sbuf;
+	memcache_item->payload.cuda_ipc_handle = *handle;
+	memcache_item->payload.cuda_ipc_dev_ptr = cuda_ipc_dev_ptr;
+	memcache_item->payload.length = length;
+	memcache_item->payload.epid = epid;
+	ips_cl_qmap_insert_item(&cuda_memhandle_cachemap, memcache_item);
+	am_cuda_idleq_insert(memcache_item);
+	return PSM2_OK;
+}
+
+static void am_cuda_memhandle_cache_clear(void)
+{
+	_HFI_DBG("Closing all handles, clearing cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS);
+	while (NELEMS) {
+		am_cuda_memhandle_cache_evict();
+	}
+	_HFI_DBG("Closed all handles, cleared cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS);
+}
+
+/*
+ * The key used to search the cache is the sender's buf address pointer.
+ * Upon a successful hit in the cache, additional validation is required
+ * as multiple senders could potentially send the same buf address value.
+ */
+CUdeviceptr
+am_cuda_memhandle_acquire(uintptr_t sbuf, CUipcMemHandle* handle,
+				uint32_t length, psm2_epid_t epid)
+{
+	_HFI_VDBG("sbuf=%lu,handle=%p,length=%u,epid=%lu\n",
+		sbuf, handle, length, epid);
+
+	CUdeviceptr cuda_ipc_dev_ptr;
+	if(!cuda_memhandle_cache_enabled) {
+		PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr,
+				 *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
+		return cuda_ipc_dev_ptr;
+	}
+
+	cuda_cache_item key = {
+		.start = (unsigned long) sbuf,
+		.length= length,
+		.epid = epid
+	};
+
+	/*
+	 * preconditions:
+	 *  1) newrange [start,end) may or may not be in cachemap already
+	 *  2) there are no overlapping address ranges in cachemap
+	 * postconditions:
+	 *  1) newrange is in cachemap
+	 *  2) there are no overlapping address ranges in cachemap
+	 *
+	 * The key used to search the cache is the sender's buf address pointer.
+	 * Upon a successful hit in the cache, additional validation is required
+	 * as multiple senders could potentially send the same buf address value.
+	 */
+	cl_map_item_t *p_item = ips_cl_qmap_searchv(&cuda_memhandle_cachemap, &key);
+	while (p_item->payload.start) {
+		// Since a precondition is that there are no overlapping ranges in cachemap,
+		// an exact match implies no need to check further
+		if (am_cuda_memhandle_cache_validate(p_item, sbuf, handle, length, epid) == PSM2_OK) {
+			cache_hit_counter++;
+			am_cuda_idleq_reorder(p_item);
+			return p_item->payload.cuda_ipc_dev_ptr;
+		}
+
+		// newrange is not in the cache and overlaps at least one existing range.
+		// am_cuda_memhandle_cache_validate() closed and removed existing range.
+		// Continue searching for more overlapping ranges
+		p_item = ips_cl_qmap_searchv(&cuda_memhandle_cachemap, &key);
+	}
+	cache_miss_counter++;
+
+	CUresult cudaerr;
+	PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_ALREADY_MAPPED, cuIpcOpenMemHandle,
+		&cuda_ipc_dev_ptr, *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
+
+	if (cudaerr == CUDA_ERROR_ALREADY_MAPPED) {
+		// remote memory already mapped. Close all handles, clear cache,
+		// and try again
+		am_cuda_memhandle_cache_clear();
+		cache_clear_counter++;
+		PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr, *handle,
+			CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
+	}
+
+	am_cuda_memhandle_cache_register(sbuf, handle,
+					   length, epid, cuda_ipc_dev_ptr);
+	return cuda_ipc_dev_ptr;
+}
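+
+/* Illustrative sketch (not compiled): a receive path resolves the sender's
+ * (sbuf, handle, length, epid) tuple to a locally mapped device pointer,
+ * uses it, and releases it.  The copy call shown is a placeholder.
+ */
+#if 0
+static void example_recv(uintptr_t sbuf, CUipcMemHandle *handle,
+			 uint32_t len, psm2_epid_t epid)
+{
+	CUdeviceptr src = am_cuda_memhandle_acquire(sbuf, handle, len, epid);
+	/* ... copy len bytes from src, e.g. via cuMemcpyDtoH ... */
+	am_cuda_memhandle_release(src);
+}
+#endif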
+
+void
+am_cuda_memhandle_release(CUdeviceptr cuda_ipc_dev_ptr)
+{
+	if(!cuda_memhandle_cache_enabled)
+		PSMI_CUDA_CALL(cuIpcCloseMemHandle, cuda_ipc_dev_ptr);
+	return;
+}
+
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.h b/deps/libfabric/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.h
new file mode 100644
index 0000000000000000000000000000000000000000..2b1dbc05a4262d52ac792e2f8d6e94f18470d975
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.h
@@ -0,0 +1,84 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef PSM_CUDA
+
+#ifndef _AM_CUDA_MEMHANDLE_CACHE_H
+#define _AM_CUDA_MEMHANDLE_CACHE_H
+
+#include "psm_user.h"
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CUDA_MEMHANDLE_CACHE_SIZE 64
+
+psm2_error_t am_cuda_memhandle_cache_init(uint32_t memcache_size);
+
+CUdeviceptr
+am_cuda_memhandle_acquire(uintptr_t sbuf, CUipcMemHandle* handle,
+				uint32_t length, psm2_epid_t epid);
+void
+am_cuda_memhandle_release(CUdeviceptr cuda_ipc_dev_ptr);
+
+void am_cuda_memhandle_cache_map_fini();
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* _AM_CUDA_MEMHANDLE_CACHE_H */
+
+#endif /* PSM_CUDA */
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_am/am_reqrep.c b/deps/libfabric/prov/psm3/psm3/ptl_am/am_reqrep.c
new file mode 100644
index 0000000000000000000000000000000000000000..5f90ec7267eadbfa77f110b205a36ec4c2b2633d
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_am/am_reqrep.c
@@ -0,0 +1,118 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_am.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+
+psm2_error_t
+psmi_amsh_am_short_request(psm2_epaddr_t epaddr,
+			   psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+			   void *src, size_t len, int flags,
+			   psm2_am_completion_fn_t completion_fn,
+			   void *completion_ctxt)
+{
+	psm2_amarg_t req_args[NSHORT_ARGS + NBULK_ARGS];
+
+	/* All sends are synchronous. Ignore PSM2_AM_FLAG_ASYNC.
+	 * Treat PSM2_AM_FLAG_NOREPLY as "advisory". This was mainly
+	 * used to optimize the IPS path, though we could put a stricter interpretation
+	 * on it to disallow any replies.
+	 */
+
+	/* For now, nargs is limited to NSHORT_ARGS+NBULK_ARGS-1, since we use
+	 * the first arg to carry the handler index.
+	 */
+	psmi_assert(nargs <= (NSHORT_ARGS + NBULK_ARGS - 1));
+	psmi_assert(epaddr->ptlctl->ptl != NULL);
+
+	req_args[0].u32w0 = (uint32_t) handler;
+	psmi_mq_mtucpy((void *)&req_args[1], (const void *)args,
+		       (nargs * sizeof(psm2_amarg_t)));
+	psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr, am_handler_hidx,
+				req_args, nargs + 1, src, len, 0);
+
+	if (completion_fn)
+		completion_fn(completion_ctxt);
+
+	return PSM2_OK;
+}
+
+psm2_error_t
+psmi_amsh_am_short_reply(psm2_am_token_t tok,
+			 psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+			 void *src, size_t len, int flags,
+			 psm2_am_completion_fn_t completion_fn,
+			 void *completion_ctxt)
+{
+	psm2_amarg_t rep_args[NSHORT_ARGS + NBULK_ARGS];
+
+	/* For now, nargs is limited to NSHORT_ARGS+NBULK_ARGS-1, since we use
+	 * the first arg to carry the handler index.
+	 */
+	psmi_assert(nargs <= (NSHORT_ARGS + NBULK_ARGS - 1));
+	rep_args[0].u32w0 = (uint32_t) handler;
+	psmi_mq_mtucpy((void *)&rep_args[1], (const void *)args,
+		       (nargs * sizeof(psm2_amarg_t)));
+
+	psmi_amsh_short_reply((amsh_am_token_t *) tok, am_handler_hidx,
+			      rep_args, nargs + 1, src, len, 0);
+
+	if (completion_fn)
+		completion_fn(completion_ctxt);
+
+	return PSM2_OK;
+}
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c b/deps/libfabric/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c
new file mode 100644
index 0000000000000000000000000000000000000000..3990f9f9c500d454b020414ca9cbe1b6b48eeb11
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c
@@ -0,0 +1,2758 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>		/* shm_open and signal handling */
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <signal.h>
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+#include "cmarw.h"
+#include "psmi_wrappers.h"
+
+#ifdef PSM_CUDA
+#include "am_cuda_memhandle_cache.h"
+#endif
+
+int psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST;
+
+static const amsh_qinfo_t amsh_qcounts = {
+	.qreqFifoShort = 1024,
+	.qreqFifoLong = 256,
+	.qrepFifoShort = 1024,
+	.qrepFifoLong = 256
+};
+
+static const amsh_qinfo_t amsh_qelemsz = {
+	.qreqFifoShort = sizeof(am_pkt_short_t),
+	.qreqFifoLong = AMLONG_SZ,
+	.qrepFifoShort = sizeof(am_pkt_short_t),
+	.qrepFifoLong = AMLONG_SZ
+};
+
+ustatic struct {
+	void *addr;
+	size_t len;
+	struct sigaction SIGSEGV_old_act;
+	struct sigaction SIGBUS_old_act;
+} action_stash;
+
+static psm2_error_t amsh_poll(ptl_t *ptl, int replyonly);
+static void process_packet(ptl_t *ptl, am_pkt_short_t *pkt, int isreq);
+static void amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg,
+			      void *buf, size_t len);
+
+/* Kassist helper functions */
+#if _HFI_DEBUGGING
+static const char *psmi_kassist_getmode(int mode);
+#endif
+static int psmi_get_kassist_mode();
+int psmi_epaddr_pid(psm2_epaddr_t epaddr);
+
+static inline void
+am_ctl_qhdr_init(volatile am_ctl_qhdr_t *q, int elem_cnt, int elem_sz)
+{
+	pthread_spin_init(&q->lock, PTHREAD_PROCESS_SHARED);
+	q->head = 0;
+	q->tail = 0;
+	q->elem_cnt = elem_cnt;
+	q->elem_sz = elem_sz;
+}
+
+static void
+am_ctl_bulkpkt_init(am_pkt_bulk_t *base_ptr, size_t elemsz, int nelems)
+{
+	int i;
+	am_pkt_bulk_t *bulkpkt;
+	uintptr_t bulkptr = (uintptr_t) base_ptr;
+
+	for (i = 0; i < nelems; i++, bulkptr += elemsz) {
+		bulkpkt = (am_pkt_bulk_t *) bulkptr;
+		bulkpkt->idx = i;
+	}
+}
+
+#define _PA(type) PSMI_ALIGNUP(amsh_qcounts.q ## type * amsh_qelemsz.q ## type, \
+			       PSMI_PAGESIZE)
+static inline uintptr_t am_ctl_sizeof_block()
+{
+	return PSMI_ALIGNUP(
+			PSMI_ALIGNUP(AMSH_BLOCK_HEADER_SIZE, PSMI_PAGESIZE) +
+			/* reqctrl block */
+			PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE) +
+			_PA(reqFifoShort) + _PA(reqFifoLong) +
+			/*reqctrl block */
+			PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE) +
+			/* align to page size */
+			_PA(repFifoShort) + _PA(repFifoLong), PSMI_PAGESIZE);
+}
+
+#undef _PA
+
+static uint32_t create_extra_ep_data()
+{
+	uint32_t ret = getpid();
+
+#ifdef PSM_CUDA
+	/* PID is at maximum 22 bits */
+	ret |= my_gpu_device << 22;
+#endif
+
+	return ret;
+}
+
+static void read_extra_ep_data(uint32_t data, uint32_t *pid, uint32_t *gpu)
+{
+	uint32_t pid_mask = (1 << 22) - 1;
+
+	*pid = data & pid_mask;
+	*gpu = (data & ~pid_mask) >> 22;
+}
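+
+/* Illustrative sketch (not compiled): the extra ep data packs the PID into
+ * the low 22 bits and the GPU device index into the high bits, so
+ * read_extra_ep_data() inverts create_extra_ep_data().
+ */
+#if 0
+static void example_extra_ep_data(void)
+{
+	uint32_t pid, gpu;
+	read_extra_ep_data(create_extra_ep_data(), &pid, &gpu);
+	/* pid == (getpid() & 0x3fffff); gpu is the device index on CUDA builds */
+}
+#endif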
+
+static void am_update_directory(struct am_ctl_nodeinfo *);
+
+static
+void amsh_atexit()
+{
+	static ips_atomic_t atexit_once = { 0 };
+	psm2_ep_t ep;
+	struct ptl_am *ptl;
+
+	/* bail out if previous value is non-zero */
+	if (ips_atomic_cmpxchg(&atexit_once, 0, 1) != 0)
+		return;
+
+	ep = psmi_opened_endpoint;
+	while (ep) {
+		ptl = (struct ptl_am *)(ep->ptl_amsh.ptl);
+		if (ptl->self_nodeinfo &&
+		    ptl->amsh_keyname != NULL) {
+			_HFI_PRDBG("unlinking shm file %s\n",
+				  ptl->amsh_keyname);
+			shm_unlink(ptl->amsh_keyname);
+		}
+		ep = ep->user_ep_next;
+	}
+
+	return;
+}
+
+ustatic
+void amsh_mmap_fault(int signo, siginfo_t *siginfo, void *context)
+{
+	if ((unsigned long int) siginfo->si_addr >= (unsigned long int) action_stash.addr &&
+	    (unsigned long int) siginfo->si_addr <  (unsigned long int) action_stash.addr + (unsigned long int) action_stash.len) {
+
+		static char shm_errmsg[256];
+
+		snprintf(shm_errmsg, sizeof(shm_errmsg),
+			 "%s: Unable to allocate shared memory for intra-node messaging.\n"
+			 "%s: Delete stale shared memory files in /dev/shm.\n",
+			 psmi_gethostname(), psmi_gethostname());
+		amsh_atexit();
+		if (psmi_write(2, shm_errmsg, strlen(shm_errmsg) + 1) == -1)
+			psmi_exit(2);
+		else
+			psmi_exit(1); /* XXX revisit this... there's probably a better way to exit */
+	} else {
+		if (signo == SIGSEGV) {
+			if (action_stash.SIGSEGV_old_act.sa_sigaction == (void*) SIG_DFL) {
+				psmi_sigaction(SIGSEGV, &action_stash.SIGSEGV_old_act, NULL);
+				raise(SIGSEGV);
+				struct sigaction act;
+				act.sa_sigaction = amsh_mmap_fault;
+				act.sa_flags = SA_SIGINFO;
+				psmi_sigaction(SIGSEGV, &act, NULL);
+			} else if (action_stash.SIGSEGV_old_act.sa_sigaction == (void*) SIG_IGN) {
+				return;
+			} else {
+				action_stash.SIGSEGV_old_act.sa_sigaction(signo, siginfo, context);
+			}
+		} else if (signo == SIGBUS) {
+			if (action_stash.SIGBUS_old_act.sa_sigaction == (void*) SIG_DFL) {
+				psmi_sigaction(SIGBUS, &action_stash.SIGBUS_old_act, NULL);
+				raise(SIGBUS);
+				struct sigaction act;
+				act.sa_sigaction = amsh_mmap_fault;
+				act.sa_flags = SA_SIGINFO;
+				psmi_sigaction(SIGBUS, &act, NULL);
+			} else if (action_stash.SIGBUS_old_act.sa_sigaction == (void*) SIG_IGN) {
+				return;
+			} else {
+				action_stash.SIGBUS_old_act.sa_sigaction(signo, siginfo, context);
+			}
+		} else {
+			psmi_exit(signo);
+		}
+	}
+}
+
+/**
+ * Create endpoint shared-memory object, containing ep's info
+ * and message queues.
+ */
+psm2_error_t psmi_shm_create(ptl_t *ptl_gen)
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	psm2_ep_t ep = ptl->ep;
+	char shmbuf[256];
+	void *mapptr;
+	size_t segsz;
+	psm2_error_t err = PSM2_OK;
+	int shmfd = -1;
+	char *amsh_keyname = NULL;
+	int iterator;
+	/* Get which kassist mode to use. */
+	ptl->psmi_kassist_mode = psmi_get_kassist_mode();
+
+	if (_HFI_PRDBG_ON) {
+		_HFI_PRDBG_ALWAYS
+			("kassist_mode %d %s use_kassist %d\n",
+			ptl->psmi_kassist_mode,
+			psmi_kassist_getmode(ptl->psmi_kassist_mode),
+			(ptl->psmi_kassist_mode != PSMI_KASSIST_OFF));
+	}
+
+	segsz = am_ctl_sizeof_block();
+	for (iterator = 0; iterator <= INT_MAX; iterator++) {
+		snprintf(shmbuf,
+			 sizeof(shmbuf),
+			 "/psm3_shm.%ld%016lx%d",
+			 (long int) getuid(),
+			 ep->epid,
+			 iterator);
+		amsh_keyname = psmi_strdup(NULL, shmbuf);
+		if (amsh_keyname == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+		shmfd =
+		    shm_open(amsh_keyname, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+		if (shmfd < 0) {
+			psmi_free(amsh_keyname);
+			amsh_keyname = NULL;
+			if (errno == EACCES && iterator < INT_MAX)
+				continue;
+			else {
+				err = psmi_handle_error(NULL,
+							PSM2_SHMEM_SEGMENT_ERR,
+							"Error creating shared "
+							"memory object %s in "
+							"shm_open: %s",
+							amsh_keyname, strerror(errno));
+				goto fail;
+			}
+		} else {
+			struct stat st;
+			if (fstat(shmfd, &st) == -1) {
+				err = psmi_handle_error(NULL,
+							PSM2_SHMEM_SEGMENT_ERR,
+							"Error validating "
+							"shared memory object %s "
+							"with fstat: %s",
+							amsh_keyname, strerror(errno));
+				goto fail;
+			}
+			if (getuid() == st.st_uid) {
+				err = PSM2_OK;
+				break;
+			} else {
+				err = PSM2_SHMEM_SEGMENT_ERR;
+				close(shmfd);
+			}
+		}
+	}
+	if (err) {
+		if (amsh_keyname) psmi_free(amsh_keyname);
+		err = psmi_handle_error(NULL,
+					PSM2_SHMEM_SEGMENT_ERR,
+					"Error creating shared memory object "
+					"in shm_open: namespace exhausted.");
+		goto fail;
+	}
+
+	/* Now register the atexit handler for cleanup, whether master or slave */
+	atexit(amsh_atexit);
+
+	_HFI_PRDBG("Opened shmfile %s\n", amsh_keyname);
+
+	if (ftruncate(shmfd, segsz) != 0) {
+		err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+					"Error setting size of shared memory object to %u bytes in "
+					"ftruncate: %s\n",
+					(uint32_t) segsz,
+					strerror(errno));
+		goto fail;
+	}
+
+	mapptr = mmap(NULL, segsz,
+		      PROT_READ | PROT_WRITE, MAP_SHARED, shmfd, 0);
+	if (mapptr == MAP_FAILED) {
+		err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+					"Error mmapping shared memory: %s",
+					strerror(errno));
+		psmi_free(amsh_keyname);
+		goto fail;
+	}
+
+	memset((void *) mapptr, 0, segsz); /* touch all of my pages */
+
+	/* Our own ep's info for ptl_am resides at the start of the
+	   shm object.  Other processes need some of this info to
+	   understand the rest of the queue structure and other details. */
+	ptl->self_nodeinfo = (struct am_ctl_nodeinfo *) mapptr;
+	ptl->amsh_keyname = amsh_keyname;
+	ptl->self_nodeinfo->amsh_shmbase = (uintptr_t) mapptr;
+
+fail:
+	if (shmfd >= 0) close(shmfd);
+	return err;
+}
+
+psm2_error_t psmi_epdir_extend(ptl_t *ptl_gen)
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	struct am_ctl_nodeinfo *new = NULL;
+
+	new = (struct am_ctl_nodeinfo *)
+		psmi_memalign(ptl->ep, PER_PEER_ENDPOINT, 64,
+			      (ptl->am_ep_size + AMSH_DIRBLOCK_SIZE) *
+			      sizeof(struct am_ctl_nodeinfo));
+	if (new == NULL)
+		return PSM2_NO_MEMORY;
+
+	memcpy(new, ptl->am_ep,
+	       ptl->am_ep_size * sizeof(struct am_ctl_nodeinfo));
+	memset(new + ptl->am_ep_size, 0,
+	       AMSH_DIRBLOCK_SIZE * sizeof(struct am_ctl_nodeinfo));
+
+	psmi_free(ptl->am_ep);
+	ptl->am_ep = new;
+	ptl->am_ep_size += AMSH_DIRBLOCK_SIZE;
+
+	return PSM2_OK;
+}
+
+/**
+ * Unmap shm regions upon proper disconnect with other processes
+ */
+psm2_error_t psmi_do_unmap(uintptr_t shmbase)
+{
+	psm2_error_t err = PSM2_OK;
+	if (munmap((void *)shmbase, am_ctl_sizeof_block())) {
+		err =
+		    psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+				      "Error with munmap of shared segment: %s",
+				      strerror(errno));
+	}
+	return err;
+}
+
+/**
+ * Map a remote process' shared memory object.
+ *
+ * If the remote process has a shared memory object available, add it to our own
+ * directory and return the shmidx.  If the shared memory object does not exist,
+ * return -1, and the connect poll function will try to map again later.
+ *
+ * If force_remap is true, then clear the entry that matches the epid.
+ */
+psm2_error_t psmi_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shmidx_o, int force_remap)
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	int i;
+	int use_kassist;
+	uint16_t shmidx;
+	char shmbuf[256];
+	void *dest_mapptr;
+	size_t segsz;
+	psm2_error_t err = PSM2_OK;
+	int dest_shmfd;
+	struct am_ctl_nodeinfo *dest_nodeinfo;
+	int iterator;
+
+	shmidx = *shmidx_o = -1;
+
+	for (i = 0; i <= ptl->max_ep_idx; i++) {
+		if (ptl->am_ep[i].epid == epid) {
+			if (force_remap) {
+				ptl->am_ep[i].epaddr = NULL;
+				ptl->am_ep[i].epid = 0;
+				break;
+			}
+			*shmidx_o = shmidx = i;
+			return err;
+		}
+	}
+
+
+	use_kassist = (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF);
+
+	segsz = am_ctl_sizeof_block();
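+	/* Candidate object names encode uid, the peer's epid, and a retry
+	 * counter; an EACCES hit on a stale object owned by another user
+	 * just advances the counter to the next name. */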
+	for (iterator = 0; iterator <= INT_MAX; iterator++) {
+		snprintf(shmbuf,
+			 sizeof(shmbuf),
+			 "/psm3_shm.%ld%016lx%d",
+			 (long int) getuid(),
+			 epid,
+			 iterator);
+		dest_shmfd = shm_open(shmbuf, O_RDWR, S_IRWXU);
+		if (dest_shmfd < 0) {
+			if (errno == EACCES && iterator < INT_MAX)
+				continue;
+			else {
+				err = psmi_handle_error(NULL,
+							PSM2_SHMEM_SEGMENT_ERR,
+							"Error opening remote "
+							"shared memory object %s "
+							"in shm_open: %s",
+							shmbuf, strerror(errno));
+				goto fail;
+			}
+		} else {
+			struct stat st;
+			if (fstat(dest_shmfd, &st) == -1) {
+				err = psmi_handle_error(NULL,
+							PSM2_SHMEM_SEGMENT_ERR,
+							"Error validating "
+							"shared memory object %s "
+							"with fstat: %s",
+							shmbuf, strerror(errno));
+				close(dest_shmfd);
+				goto fail;
+			}
+			if (getuid() == st.st_uid) {
+				err = PSM2_OK;
+				break;
+			} else {
+				err = PSM2_SHMEM_SEGMENT_ERR;
+				close(dest_shmfd);
+			}
+		}
+	}
+	if (err) {
+		err = psmi_handle_error(NULL,
+					PSM2_SHMEM_SEGMENT_ERR,
+					"Error opening remote shared "
+					"memory object in shm_open: "
+					"namespace exhausted.");
+		goto fail;
+	}
+
+	dest_mapptr = mmap(NULL, segsz,
+		      PROT_READ | PROT_WRITE, MAP_SHARED, dest_shmfd, 0);
+	if (dest_mapptr == MAP_FAILED) {
+		err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+					"Error mmapping remote shared memory: %s",
+					strerror(errno));
+		close(dest_shmfd);
+		goto fail;
+	}
+	close(dest_shmfd);
+	dest_nodeinfo = (struct am_ctl_nodeinfo *)dest_mapptr;
+
+	/* Touching the new mapping can fault (SIGSEGV/SIGBUS) if the backing
+	   object disappears, so trap those signals while we access it. */
+	action_stash.addr = dest_mapptr;
+	action_stash.len = segsz;
+
+	struct sigaction act = { .sa_sigaction = amsh_mmap_fault, .sa_flags = SA_SIGINFO };
+
+	sigaction(SIGSEGV, &act, &action_stash.SIGSEGV_old_act);
+	sigaction(SIGBUS, &act, &action_stash.SIGBUS_old_act);
+
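+	/* Spin until the remote process publishes its nodeinfo page; the
+	 * fault handlers installed above cover the window where the segment
+	 * could vanish while we poll. */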
+	{
+		volatile uint16_t *is_init = &dest_nodeinfo->is_init;
+		while (*is_init == 0)
+			usleep(1);
+		ips_sync_reads();
+		_HFI_CONNDBG("Got a published remote dirpage page at "
+			   "%p, size=%d\n", dest_mapptr, (int)segsz);
+	}
+
+	shmidx = -1;
+	if ((ptl->max_ep_idx + 1) == ptl->am_ep_size) {
+		err = psmi_epdir_extend(ptl_gen);
+		if (err)
+			goto fail;
+
+		for (i = 0; i <= ptl->max_ep_idx; i++) {
+			if (ptl->am_ep[i].epid != 0)
+				am_update_directory(&ptl->am_ep[i]);
+		}
+	}
+	for (i = 0; i < ptl->am_ep_size; i++) {
+		psmi_assert(ptl->am_ep[i].epid != epid);
+		if (ptl->am_ep[i].epid == 0) {
+			ptl->am_ep[i].epid = epid;
+			ptl->am_ep[i].psm_verno = dest_nodeinfo->psm_verno;
+			ptl->am_ep[i].pid = dest_nodeinfo->pid;
+			if (use_kassist) {
+				/* If we are able to use CMA assume everyone
+				 * else on the node can also use it.
+				 * Advertise that CMA is active via the
+				 * feature flag.
+				 */
+
+				if (cma_available()) {
+					ptl->am_ep[i].amsh_features |=
+					    AMSH_HAVE_CMA;
+					psmi_shm_mq_rv_thresh =
+					    PSMI_MQ_RV_THRESH_CMA;
+				} else {
+					ptl->psmi_kassist_mode =
+					    PSMI_KASSIST_OFF;
+					use_kassist = 0;
+					psmi_shm_mq_rv_thresh =
+					    PSMI_MQ_RV_THRESH_NO_KASSIST;
+				}
+			} else
+				psmi_shm_mq_rv_thresh =
+				    PSMI_MQ_RV_THRESH_NO_KASSIST;
+			_HFI_CONNDBG("KASSIST MODE: %s\n",
+				   psmi_kassist_getmode(ptl->psmi_kassist_mode));
+			shmidx = *shmidx_o = i;
+			_HFI_CONNDBG("Mapped epid 0x%"PRIx64" into shmidx %d\n", epid, shmidx);
+			ptl->am_ep[i].amsh_shmbase = (uintptr_t) dest_mapptr;
+			ptl->am_ep[i].amsh_qsizes = dest_nodeinfo->amsh_qsizes;
+			if (i > ptl->max_ep_idx)
+				ptl->max_ep_idx = i;
+			break;
+		}
+	}
+
+	/* install the old sighandler back */
+	sigaction(SIGSEGV, &action_stash.SIGSEGV_old_act, NULL);
+	sigaction(SIGBUS, &action_stash.SIGBUS_old_act, NULL);
+
+	if (shmidx == (uint16_t)-1)
+		err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+					"Could not connect to local endpoint");
+fail:
+	return err;
+}
+
+/**
+ * Initialize pointer structure and locks for endpoint shared-memory AM.
+ */
+
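+/* Per-queue footprint: element size times element count, rounded up to
+ * PSMI_PAGESIZE so each queue region starts page-aligned. */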
+#define AMSH_QSIZE(type)                                                \
+	PSMI_ALIGNUP(amsh_qelemsz.q ## type * amsh_qcounts.q ## type,   \
+		     PSMI_PAGESIZE)
+
+static psm2_error_t amsh_init_segment(ptl_t *ptl_gen)
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	psm2_error_t err = PSM2_OK;
+
+	/* Preconditions */
+	psmi_assert_always(ptl != NULL);
+	psmi_assert_always(ptl->ep != NULL);
+	psmi_assert_always(ptl->epaddr != NULL);
+	psmi_assert_always(ptl->ep->epid != 0);
+
+	if ((err = psmi_shm_create(ptl_gen)))
+		goto fail;
+
+	ptl->self_nodeinfo->amsh_qsizes.qreqFifoShort = AMSH_QSIZE(reqFifoShort);
+	ptl->self_nodeinfo->amsh_qsizes.qreqFifoLong = AMSH_QSIZE(reqFifoLong);
+	ptl->self_nodeinfo->amsh_qsizes.qrepFifoShort = AMSH_QSIZE(repFifoShort);
+	ptl->self_nodeinfo->amsh_qsizes.qrepFifoLong = AMSH_QSIZE(repFifoLong);
+
+	/* As in psmi_shm_map_remote: accesses to the segment can fault, so
+	   guard the queue initialization below with fault handlers. */
+
+	struct sigaction act = {
+		.sa_sigaction = amsh_mmap_fault,
+		.sa_flags = SA_SIGINFO
+	};
+
+	sigaction(SIGSEGV, &act, &action_stash.SIGSEGV_old_act);
+	sigaction(SIGBUS, &act, &action_stash.SIGBUS_old_act);
+
+	/*
+	 * Now that we know our epid, update it in the shmidx array
+	 */
+	ptl->reqH.base = ptl->reqH.head = ptl->reqH.end = NULL;
+	ptl->repH.base = ptl->repH.head = ptl->repH.end = NULL;
+
+	am_update_directory(ptl->self_nodeinfo);
+
+	ptl->reqH.head = ptl->reqH.base = (am_pkt_short_t *)
+		(((uintptr_t)ptl->self_nodeinfo->qdir.qreqFifoShort));
+	ptl->reqH.end = (am_pkt_short_t *)
+		(((uintptr_t)ptl->self_nodeinfo->qdir.qreqFifoShort) +
+		 amsh_qcounts.qreqFifoShort * amsh_qelemsz.qreqFifoShort);
+
+	ptl->repH.head = ptl->repH.base = (am_pkt_short_t *)
+		(((uintptr_t)ptl->self_nodeinfo->qdir.qrepFifoShort));
+	ptl->repH.end = (am_pkt_short_t *)
+		(((uintptr_t)ptl->self_nodeinfo->qdir.qrepFifoShort) +
+		 amsh_qcounts.qrepFifoShort * amsh_qelemsz.qrepFifoShort);
+
+	am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qreqH->shortq,
+			 amsh_qcounts.qreqFifoShort,
+			 amsh_qelemsz.qreqFifoShort);
+	am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qreqH->longbulkq,
+			 amsh_qcounts.qreqFifoLong, amsh_qelemsz.qreqFifoLong);
+	am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qrepH->shortq,
+			 amsh_qcounts.qrepFifoShort,
+			 amsh_qelemsz.qrepFifoShort);
+	am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qrepH->longbulkq,
+			 amsh_qcounts.qrepFifoLong, amsh_qelemsz.qrepFifoLong);
+
+	/* Set bulkidx in every bulk packet */
+	am_ctl_bulkpkt_init(ptl->self_nodeinfo->qdir.qreqFifoLong,
+			    amsh_qelemsz.qreqFifoLong,
+			    amsh_qcounts.qreqFifoLong);
+	am_ctl_bulkpkt_init(ptl->self_nodeinfo->qdir.qrepFifoLong,
+			    amsh_qelemsz.qrepFifoLong,
+			    amsh_qcounts.qrepFifoLong);
+
+	/* install the old sighandler back */
+	sigaction(SIGSEGV, &action_stash.SIGSEGV_old_act, NULL);
+	sigaction(SIGBUS, &action_stash.SIGBUS_old_act, NULL);
+
+fail:
+	return err;
+}
+
+psm2_error_t psmi_shm_detach(ptl_t *ptl_gen)
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	psm2_error_t err = PSM2_OK;
+	uintptr_t shmbase;
+
+	if (ptl->self_nodeinfo == NULL)
+		return err;
+
+	_HFI_PRDBG("unlinking shm file %s\n", ptl->amsh_keyname + 1);
+	shmbase = ptl->self_nodeinfo->amsh_shmbase;
+	shm_unlink(ptl->amsh_keyname);
+	psmi_free(ptl->amsh_keyname);
+
+	if (munmap((void *)shmbase, am_ctl_sizeof_block())) {
+		err =
+		    psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
+				      "Error with munmap of shared segment: %s",
+				      strerror(errno));
+		goto fail;
+	}
+	ptl->self_nodeinfo = NULL;
+	return PSM2_OK;
+
+fail:
+	return err;
+}
+
+/**
+ * Update the local queue-pointer directory.  The directory must be
+ * rebuilt when a new epaddr is connected to, and for every epaddr already
+ * connected to whenever the shared memory segment is relocated via mremap.
+ *
+ * @param nodeinfo Shared nodeinfo block whose queue directory to rebuild.
+ */
+
+static
+void am_update_directory(struct am_ctl_nodeinfo *nodeinfo)
+{
+	uintptr_t base_this;
+
+	base_this = nodeinfo->amsh_shmbase +
+		AMSH_BLOCK_HEADER_SIZE;
+
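+	/* Layout within the block: request header, request short FIFO,
+	 * request bulk FIFO, then the same three regions for replies; each
+	 * pointer below starts where the previous region ends. */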
+	/* Request queues */
+	nodeinfo->qdir.qreqH = (am_ctl_blockhdr_t *) base_this;
+	nodeinfo->qdir.qreqFifoShort = (am_pkt_short_t *)
+	    ((uintptr_t) nodeinfo->qdir.qreqH +
+	     PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE));
+
+	nodeinfo->qdir.qreqFifoLong = (am_pkt_bulk_t *)
+	    ((uintptr_t) nodeinfo->qdir.qreqFifoShort +
+	     nodeinfo->amsh_qsizes.qreqFifoShort);
+
+	/* Reply queues */
+	nodeinfo->qdir.qrepH = (am_ctl_blockhdr_t *)
+	    ((uintptr_t) nodeinfo->qdir.qreqFifoLong +
+	     nodeinfo->amsh_qsizes.qreqFifoLong);
+
+	nodeinfo->qdir.qrepFifoShort = (am_pkt_short_t *)
+	    ((uintptr_t) nodeinfo->qdir.qrepH +
+	     PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE));
+	nodeinfo->qdir.qrepFifoLong = (am_pkt_bulk_t *)
+	    ((uintptr_t) nodeinfo->qdir.qrepFifoShort +
+	     nodeinfo->amsh_qsizes.qrepFifoShort);
+
+	_HFI_VDBG("epaddr=%p Request Hdr=%p,Pkt=%p,Long=%p\n",
+		  nodeinfo->epaddr,
+		  nodeinfo->qdir.qreqH,
+		  nodeinfo->qdir.qreqFifoShort,
+		  nodeinfo->qdir.qreqFifoLong);
+	_HFI_VDBG("epaddr=%p Reply   Hdr=%p,Pkt=%p,Long=%p\n",
+		  nodeinfo->epaddr,
+		  nodeinfo->qdir.qrepH,
+		  nodeinfo->qdir.qrepFifoShort,
+		  nodeinfo->qdir.qrepFifoLong);
+
+	/* Sanity check */
+	uintptr_t base_next =
+	    (uintptr_t) nodeinfo->qdir.qrepFifoLong +
+	    nodeinfo->amsh_qsizes.qrepFifoLong;
+
+	psmi_assert_always(base_next - base_this <= am_ctl_sizeof_block());
+}
+
+
+/* ep_epid_share_memory wrapper */
+static
+int amsh_epid_reachable(ptl_t *ptl_gen, psm2_epid_t epid)
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	int result;
+	psm2_error_t err;
+	err = psm2_ep_epid_share_memory(ptl->ep, epid, &result);
+	psmi_assert_always(err == PSM2_OK);
+	return result;
+}
+
+static
+psm2_error_t
+amsh_epaddr_add(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t shmidx, psm2_epaddr_t *epaddr_o)
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	psm2_epaddr_t epaddr;
+	am_epaddr_t *amaddr;
+	psm2_error_t err = PSM2_OK;
+
+	psmi_assert(psmi_epid_lookup(ptl->ep, epid) == NULL);
+
+	/* The self PTL handles loopback communication. */
+	psmi_assert(epid != ptl->epid);
+
+	/* note the size of the memory is am_epaddr_t */
+	epaddr = (psm2_epaddr_t) psmi_calloc(ptl->ep,
+					    PER_PEER_ENDPOINT, 1,
+					    sizeof(am_epaddr_t));
+	if (epaddr == NULL) {
+		return PSM2_NO_MEMORY;
+	}
+	psmi_assert_always(ptl->am_ep[shmidx].epaddr == NULL);
+
+	if ((err = psmi_epid_set_hostname(psm2_epid_nid(epid),
+					  psmi_gethostname(), 0)))
+		goto fail;
+
+	epaddr->ptlctl = ptl->ctl;
+	epaddr->epid = epid;
+
+	/* convert to am_epaddr_t */
+	amaddr = (am_epaddr_t *) epaddr;
+	/* tell the other endpoint their location in our directory */
+	amaddr->shmidx = shmidx;
+	/* we haven't connected yet, so we can't give them the same hint */
+	amaddr->return_shmidx = -1;
+	amaddr->cstate_outgoing = AMSH_CSTATE_OUTGOING_NONE;
+	amaddr->cstate_incoming = AMSH_CSTATE_INCOMING_NONE;
+
+	/* other setup */
+	ptl->am_ep[shmidx].epaddr = epaddr;
+	am_update_directory(&ptl->am_ep[shmidx]);
+	/* Finally, add to table */
+	if ((err = psmi_epid_add(ptl->ep, epid, epaddr)))
+		goto fail;
+	_HFI_CONNDBG("epaddr=%p %s added to ptl=%p\n",
+		  epaddr, psmi_epaddr_get_name(epid), ptl);
+	*epaddr_o = epaddr;
+	return PSM2_OK;
+fail:
+	if (epaddr != ptl->epaddr)
+		psmi_free(epaddr);
+	return err;
+}
+
+static
+void
+amsh_epaddr_update(ptl_t *ptl_gen, psm2_epaddr_t epaddr)
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	am_epaddr_t *amaddr;
+	uint16_t shmidx;
+	struct am_ctl_nodeinfo *nodeinfo;
+
+	amaddr = (am_epaddr_t *) epaddr;
+	shmidx = amaddr->shmidx;
+	nodeinfo = (struct am_ctl_nodeinfo *) ptl->am_ep[shmidx].amsh_shmbase;
+
+	/* restart the connection process */
+	amaddr->return_shmidx = -1;
+	amaddr->cstate_outgoing = AMSH_CSTATE_OUTGOING_NONE;
+
+	/* wait for the other process to init again */
+	{
+		volatile uint16_t *is_init = &nodeinfo->is_init;
+		while (*is_init == 0)
+			usleep(1);
+		ips_sync_reads();
+	}
+
+	/* get the updated values from the new nodeinfo page */
+	ptl->am_ep[shmidx].psm_verno = nodeinfo->psm_verno;
+	ptl->am_ep[shmidx].pid = nodeinfo->pid;
+	ptl->am_ep[shmidx].amsh_qsizes = nodeinfo->amsh_qsizes;
+	am_update_directory(&ptl->am_ep[shmidx]);
+	return;
+}
+
+struct ptl_connection_req {
+	int isdone;
+	int op;			/* connect or disconnect */
+	int numep;
+	int numep_left;
+	int phase;
+
+	int *epid_mask;
+	const psm2_epid_t *epids;	/* input epid list */
+	psm2_epaddr_t *epaddr;
+	psm2_error_t *errors;	/* inout errors */
+
+	/* Used for connect/disconnect */
+	psm2_amarg_t args[4];
+};
+
+static
+void amsh_free_epaddr(psm2_epaddr_t epaddr)
+{
+	psmi_epid_remove(epaddr->ptlctl->ep, epaddr->epid);
+	psmi_free(epaddr);
+	return;
+}
+
+#define PTL_OP_CONNECT      0
+#define PTL_OP_DISCONNECT   1
+#define PTL_OP_ABORT        2
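+
+/* Per-peer connect progress states (AMSH_CMASK_*): NONE (nothing to do),
+ * PREREQ (segment not yet mapped / request not yet sent), POSTREQ
+ * (request sent, awaiting reply), DONE (finished). */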
+
+static
+psm2_error_t
+amsh_ep_connreq_init(ptl_t *ptl_gen, int op, /* connect, disconnect or abort */
+		     int numep, const psm2_epid_t *array_of_epid, /* non-NULL on connect */
+		     const int array_of_epid_mask[],
+		     psm2_error_t *array_of_errors,
+		     psm2_epaddr_t *array_of_epaddr,
+		     struct ptl_connection_req **req_o)
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	int i, cstate;
+	psm2_epaddr_t epaddr;
+	psm2_epid_t epid;
+	struct ptl_connection_req *req = NULL;
+
+	req = (struct ptl_connection_req *)
+	    psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, 1,
+			sizeof(struct ptl_connection_req));
+	if (req == NULL)
+		return PSM2_NO_MEMORY;
+	req->isdone = 0;
+	req->op = op;
+	req->numep = numep;
+	req->numep_left = 0;
+	req->phase = ptl->connect_phase;
+	req->epid_mask = (int *)
+	    psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, numep, sizeof(int));
+	if (req->epid_mask == NULL) {
+		psmi_free(req);
+		return PSM2_NO_MEMORY;
+	}
+	req->epaddr = array_of_epaddr;
+	req->epids = array_of_epid;
+	req->errors = array_of_errors;
+
+	/* First check if there's really something to connect/disconnect
+	 * for this PTL */
+	for (i = 0; i < numep; i++) {
+		req->epid_mask[i] = AMSH_CMASK_NONE;	/* no connect by default */
+		if (!array_of_epid_mask[i])
+			continue;
+		if (op == PTL_OP_CONNECT) {
+			epid = array_of_epid[i];
+
+			/* Connect only to other processes reachable by shared memory.
+			   The self PTL handles loopback communication, so explicitly
+			   refuse to connect to self. */
+			if (!amsh_epid_reachable(ptl_gen, epid)
+			    || epid == ptl->epid) {
+				array_of_errors[i] = PSM2_EPID_UNREACHABLE;
+				array_of_epaddr[i] = NULL;
+				continue;
+			}
+
+			_HFI_CONNDBG("Connect epid 0x%"PRIx64"\n", epid);
+			epaddr = psmi_epid_lookup(ptl->ep, epid);
+			if (epaddr != NULL) {
+				if (epaddr->ptlctl->ptl != ptl_gen) {
+					array_of_errors[i] =
+					    PSM2_EPID_UNREACHABLE;
+					array_of_epaddr[i] = NULL;
+					continue;
+				}
+				cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing;
+				if (cstate == AMSH_CSTATE_OUTGOING_ESTABLISHED) {
+					array_of_epaddr[i] = epaddr;
+					array_of_errors[i] = PSM2_OK;
+				} else {
+					psmi_assert(cstate ==
+						    AMSH_CSTATE_OUTGOING_NONE);
+					array_of_errors[i] = PSM2_TIMEOUT;
+					array_of_epaddr[i] = epaddr;
+					req->epid_mask[i] = AMSH_CMASK_PREREQ;
+				}
+			} else {
+				req->epid_mask[i] = AMSH_CMASK_PREREQ;
+				array_of_epaddr[i] = NULL;
+			}
+		} else {	/* disc or abort */
+			epaddr = array_of_epaddr[i];
+			psmi_assert(epaddr != NULL);
+			if (epaddr->ptlctl->ptl != ptl_gen)
+				continue;
+
+			_HFI_CONNDBG("Disconnect force=%d epid 0x%"PRIx64"\n",
+				(op == PTL_OP_ABORT), epaddr->epid);
+			cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing;
+			if (cstate == AMSH_CSTATE_OUTGOING_ESTABLISHED) {
+				req->epid_mask[i] = AMSH_CMASK_PREREQ;
+				_HFI_VDBG
+				    ("Just set index %d to AMSH_CMASK_PREREQ\n",
+				     i);
+			}
+			/* XXX undef ? */
+		}
+		if (req->epid_mask[i] != AMSH_CMASK_NONE)
+			req->numep_left++;
+	}
+
+	if (req->numep_left == 0) {	/* nothing to do */
+		psmi_free(req->epid_mask);
+		psmi_free(req);
+		if (op != PTL_OP_ABORT) {
+			_HFI_CONNDBG("Nothing to connect, bump up phase\n");
+			ptl->connect_phase++;
+		}
+		*req_o = NULL;
+		return PSM2_OK;
+	} else {
+		*req_o = req;
+		return PSM2_OK_NO_PROGRESS;
+	}
+}
+
+static
+psm2_error_t
+amsh_ep_connreq_poll(ptl_t *ptl_gen, struct ptl_connection_req *req)
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	int i, j, cstate;
+	uint16_t shmidx = (uint16_t)-1;
+	psm2_error_t err = PSM2_OK;
+	psm2_epid_t epid;
+	psm2_epaddr_t epaddr;
+
+	if (req == NULL || req->isdone)
+		return PSM2_OK;
+
+	psmi_assert_always(ptl->connect_phase == req->phase);
+
+	if (req->op == PTL_OP_DISCONNECT || req->op == PTL_OP_ABORT) {
+		for (i = 0; i < req->numep; i++) {
+			if (req->epid_mask[i] == AMSH_CMASK_NONE ||
+			    req->epid_mask[i] == AMSH_CMASK_DONE)
+				continue;
+
+			epaddr = req->epaddr[i];
+			psmi_assert(epaddr != NULL);
+			if (req->epid_mask[i] == AMSH_CMASK_PREREQ) {
+				shmidx = ((am_epaddr_t *) epaddr)->shmidx;
+				/* Make sure the target of the disconnect is still there */
+				if (ptl->am_ep[shmidx].epid != epaddr->epid) {
+					req->numep_left--;
+					req->epid_mask[i] = AMSH_CMASK_DONE;
+					((am_epaddr_t *) epaddr)->cstate_outgoing =
+						AMSH_CSTATE_OUTGOING_NONE;
+				}
+			}
+
+			if (req->epid_mask[i] == AMSH_CMASK_PREREQ) {
+				req->args[0].u16w0 = PSMI_AM_DISC_REQ;
+				req->args[0].u16w1 = shmidx;
+				req->args[0].u32w1 = ptl->connect_phase;
+				req->args[1].u64w0 = (uint64_t) ptl->epid;
+				psmi_assert(shmidx != (uint16_t)-1);
+				req->args[2].u32w0 = create_extra_ep_data();
+				req->args[2].u32w1 = PSM2_OK;
+				if (req->op != PTL_OP_ABORT)
+					req->args[3].u64w0 =
+					    (uint64_t) (uintptr_t) &req->errors[i];
+				else
+					req->args[3].u64w0 = 0;
+				psmi_amsh_short_request(ptl_gen, epaddr,
+							amsh_conn_handler_hidx,
+							req->args, 4, NULL, 0,
+							0);
+				((am_epaddr_t *) epaddr)->cstate_outgoing =
+					AMSH_CSTATE_OUTGOING_DISC_REQUESTED;
+				/**
+				* Only munmap if we have nothing more to
+				* communicate with the other node, i.e. we
+				* already received a disconnect req from the
+				* other node.
+				*/
+				if (((am_epaddr_t *) epaddr)->cstate_incoming ==
+					AMSH_CSTATE_INCOMING_DISC_REQUESTED)
+					err = psmi_do_unmap(ptl->am_ep[shmidx].amsh_shmbase);
+				req->epid_mask[i] = AMSH_CMASK_POSTREQ;
+			} else if (req->epid_mask[i] == AMSH_CMASK_POSTREQ) {
+				cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing;
+				if (cstate == AMSH_CSTATE_OUTGOING_DISC_REPLIED) {
+					req->numep_left--;
+					req->epid_mask[i] = AMSH_CMASK_DONE;
+					((am_epaddr_t *) epaddr)->cstate_outgoing =
+						AMSH_CSTATE_OUTGOING_NONE;
+				}
+			}
+		}
+	} else {
+		/* First see if we've made progress on any postreqs */
+		int n_prereq = 0;
+		for (i = 0; i < req->numep; i++) {
+			int cstate;
+			if (req->epid_mask[i] != AMSH_CMASK_POSTREQ) {
+				if (req->epid_mask[i] == AMSH_CMASK_PREREQ)
+					n_prereq++;
+				continue;
+			}
+			epaddr = req->epaddr[i];
+			psmi_assert(epaddr != NULL);
+
+			/* detect if a race has occurred due to re-use of an
+			 * old shm file - if so, restart the connection */
+			shmidx = ((am_epaddr_t *) epaddr)->shmidx;
+			if (ptl->am_ep[shmidx].pid !=
+			    ((struct am_ctl_nodeinfo *) ptl->am_ep[shmidx].amsh_shmbase)->pid) {
+				req->epid_mask[i] = AMSH_CMASK_PREREQ;
+				((am_epaddr_t *) epaddr)->cstate_outgoing =
+					AMSH_CSTATE_OUTGOING_NONE;
+				n_prereq++;
+				amsh_epaddr_update(ptl_gen, epaddr);
+				continue;
+			}
+
+			cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing;
+			if (cstate == AMSH_CSTATE_OUTGOING_REPLIED) {
+				req->numep_left--;
+				((am_epaddr_t *) epaddr)->cstate_outgoing =
+					AMSH_CSTATE_OUTGOING_ESTABLISHED;
+				req->epid_mask[i] = AMSH_CMASK_DONE;
+				continue;
+			}
+		}
+		if (n_prereq > 0) {
+			psmi_assert(req->numep_left > 0);
+			/* Go through the list of peers we need to connect to
+			 * and check whether each shared ep is mapped into shm */
+			for (i = 0; i < req->numep; i++) {
+				if (req->epid_mask[i] != AMSH_CMASK_PREREQ)
+					continue;
+				epid = req->epids[i];
+				epaddr = req->epaddr[i];
+				/* Go through mapped epids and find the epid we're looking for */
+				for (shmidx = -1, j = 0;
+				     j <= ptl->max_ep_idx; j++) {
+					/* epid is connected and ready to go */
+					if (ptl->am_ep[j].epid == epid) {
+						shmidx = j;
+						break;
+					}
+				}
+				if (shmidx == (uint16_t)-1) {
+					/* Couldn't find peer's epid in dirpage.
+					   Check shmdir to see if epid is up now. */
+					if ((err = psmi_shm_map_remote(ptl_gen, epid, &shmidx, 0))) {
+						return err;
+					}
+					continue;
+				}
+				/* Before we even send the request out, check to see if
+				 * versions are interoperable */
+				if (!psmi_verno_isinteroperable
+				    (ptl->am_ep[shmidx].psm_verno)) {
+					char buf[32];
+					uint16_t their_verno =
+					    ptl->am_ep[shmidx].psm_verno;
+					snprintf(buf, sizeof(buf), "%d.%d",
+						 PSMI_VERNO_GET_MAJOR
+						 (their_verno),
+						 PSMI_VERNO_GET_MINOR
+						 (their_verno));
+
+					_HFI_INFO("Local endpoint id %" PRIx64
+						  " has version %s "
+						  "which is not supported by library version %d.%d",
+						  epid, buf, PSM2_VERNO_MAJOR,
+						  PSM2_VERNO_MINOR);
+					req->errors[i] =
+					    PSM2_EPID_INVALID_VERSION;
+					req->numep_left--;
+					req->epid_mask[i] = AMSH_CMASK_DONE;
+					continue;
+				}
+				if (epaddr != NULL) {
+					psmi_assert(((am_epaddr_t *) epaddr)->
+						    shmidx == shmidx);
+				} else if ((epaddr =
+					    psmi_epid_lookup(ptl->ep,
+							     epid)) == NULL) {
+					if ((err =
+					     amsh_epaddr_add(ptl_gen, epid, shmidx,
+							     &epaddr))) {
+						return err;
+					}
+					/* Remote pid is unknown at the moment */
+					((am_epaddr_t *) epaddr)->pid =
+						AMSH_PID_UNKNOWN;
+				}
+				req->epaddr[i] = epaddr;
+				req->args[0].u16w0 = PSMI_AM_CONN_REQ;
+				/* tell the other process its shmidx here */
+				req->args[0].u16w1 = shmidx;
+				req->args[0].u32w1 = ptl->connect_phase;
+				req->args[1].u64w0 = (uint64_t) ptl->epid;
+				req->args[2].u32w0 = create_extra_ep_data();
+				req->args[2].u32w1 = PSM2_OK;
+				req->args[3].u64w0 =
+				    (uint64_t) (uintptr_t) &req->errors[i];
+				req->epid_mask[i] = AMSH_CMASK_POSTREQ;
+				psmi_amsh_short_request(ptl_gen, epaddr,
+							amsh_conn_handler_hidx,
+							req->args, 4, NULL, 0,
+							0);
+				_HFI_CONNDBG("epaddr=%p, epid=0x%" PRIx64
+					   " at shmidx=%d\n", epaddr, epid,
+					   shmidx);
+			}
+		}
+	}
+
+	if (req->numep_left == 0) {	/* we're all done */
+		req->isdone = 1;
+		return PSM2_OK;
+	} else {
+		sched_yield();
+		return PSM2_OK_NO_PROGRESS;
+	}
+}
+
+static
+psm2_error_t
+amsh_ep_connreq_fini(ptl_t *ptl_gen, struct ptl_connection_req *req)
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	psm2_error_t err = PSM2_OK;
+	int i;
+
+	/* Wherever we are in the connect process, we've been instructed to
+	 * finish it now */
+	if (req == NULL)
+		return PSM2_OK;
+
+	/* This prevents future connect replies from referencing data structures
+	 * that have disappeared.  For abort we aren't waiting for DISC_REP, so
+	 * we keep the same phase and still accept replies after this function */
+	if (req->op != PTL_OP_ABORT)
+		ptl->connect_phase++;
+
+	/* First process any leftovers in postreq or prereq */
+	for (i = 0; i < req->numep; i++) {
+		if (req->epid_mask[i] == AMSH_CMASK_NONE
+			|| req->op == PTL_OP_ABORT)
+			continue;
+		else if (req->epid_mask[i] == AMSH_CMASK_POSTREQ) {
+			int cstate;
+			req->epid_mask[i] = AMSH_CMASK_DONE;
+			cstate = ((am_epaddr_t *) req->epaddr[i])->cstate_outgoing;
+			if (cstate == AMSH_CSTATE_OUTGOING_REPLIED) {
+				req->numep_left--;
+				((am_epaddr_t *) req->epaddr[i])->cstate_outgoing =
+					AMSH_CSTATE_OUTGOING_ESTABLISHED;
+			} else {	/* never actually got reply */
+				req->errors[i] = PSM2_TIMEOUT;
+			}
+		}
+		/* If we couldn't go from prereq to postreq, that means we couldn't
+		 * find the shmidx for an epid in time.  This can only be a case of
+		 * time out */
+		else if (req->epid_mask[i] == AMSH_CMASK_PREREQ) {
+			req->errors[i] = PSM2_TIMEOUT;
+			req->numep_left--;
+			req->epid_mask[i] = AMSH_CMASK_DONE;
+		}
+	}
+
+	/* Whatever is left can only be in DONE or NONE state */
+	for (i = 0; i < req->numep; i++) {
+		if (req->epid_mask[i] == AMSH_CMASK_NONE)
+			continue;
+		if (req->op == PTL_OP_ABORT
+			 && req->epid_mask[i] != AMSH_CMASK_DONE) {
+			req->epid_mask[i] = AMSH_CMASK_DONE;
+			continue;
+		}
+		psmi_assert(req->epid_mask[i] == AMSH_CMASK_DONE);
+
+		err = psmi_error_cmp(err, req->errors[i]);
+		/* XXX TODO: Report errors in connection. */
+		/* Only free epaddr if they have disconnected from us */
+		int cstate = ((am_epaddr_t *) req->epaddr[i])->cstate_incoming;
+		if (cstate == AMSH_CSTATE_INCOMING_DISC_REQUESTED) {
+			if (req->op == PTL_OP_DISCONNECT || req->op == PTL_OP_ABORT) {
+				psmi_assert(req->epaddr[i] != NULL);
+				amsh_free_epaddr(req->epaddr[i]);
+				req->epaddr[i] = NULL;
+			}
+		}
+	}
+
+	psmi_free(req->epid_mask);
+	psmi_free(req);
+
+	return err;
+}
+
+/* Wrapper for 2.0's use of connect/disconnect.  The plan is to move the
+ * init/poll/fini interface up to the PTL level for 2.2 */
+#define CONNREQ_ZERO_POLLS_BEFORE_YIELD  20
+static
+psm2_error_t
+amsh_ep_connreq_wrap(ptl_t *ptl_gen, int op,
+		     int numep,
+		     const psm2_epid_t *array_of_epid,
+		     const int array_of_epid_mask[],
+		     psm2_error_t *array_of_errors,
+		     psm2_epaddr_t *array_of_epaddr, uint64_t timeout_ns)
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	psm2_error_t err;
+	uint64_t t_start;
+	struct ptl_connection_req *req;
+	int num_polls_noprogress = 0;
+	static int shm_polite_attach = -1;
+
+	if (shm_polite_attach == -1) {
+		char *p = getenv("PSM3_SHM_POLITE_ATTACH");
+		if (p && *p && atoi(p) != 0) {
+			fprintf(stderr, "%s: Using Polite SHM segment attach\n",
+				psmi_gethostname());
+			shm_polite_attach = 1;
+		} else
+			shm_polite_attach = 0;
+	}
+
+	/* Initialize */
+	err = amsh_ep_connreq_init(ptl_gen, op, numep,
+				   array_of_epid, array_of_epid_mask,
+				   array_of_errors, array_of_epaddr, &req);
+	if (err != PSM2_OK_NO_PROGRESS)	/* Either we're all done with connect or
+					 * there was an error */
+		return err;
+
+	if (op == PTL_OP_ABORT) {
+		int i;
+		/* loop a couple times only, ignore timeout */
+		/* this will move from PREREQ to POSTREQ and check once
+		 * for reply, but not wait for reply
+		 */
+		for (i=0; i < 2; i++) {
+			psmi_poll_internal(ptl->ep, 1);
+			err = amsh_ep_connreq_poll(ptl_gen, req);
+			if (err != PSM2_OK && err != PSM2_OK_NO_PROGRESS) {
+				psmi_free(req->epid_mask);
+				psmi_free(req);
+				goto fail;
+			}
+		}
+		goto fini;
+	}
+
+	/* Poll until either
+	 * 1. We time out
+	 * 2. We are done with connecting
+	 */
+	t_start = get_cycles();
+	do {
+		psmi_poll_internal(ptl->ep, 1);
+		err = amsh_ep_connreq_poll(ptl_gen, req);
+		if (err == PSM2_OK)
+			break;	/* Finished before timeout */
+		else if (err != PSM2_OK_NO_PROGRESS) {
+			psmi_free(req->epid_mask);
+			psmi_free(req);
+			goto fail;
+		} else if (shm_polite_attach &&
+			   ++num_polls_noprogress ==
+			   CONNREQ_ZERO_POLLS_BEFORE_YIELD) {
+			num_polls_noprogress = 0;
+			PSMI_YIELD(ptl->ep->mq->progress_lock);
+		}
+	}
+	while (psmi_cycles_left(t_start, timeout_ns));
+
+fini:
+	err = amsh_ep_connreq_fini(ptl_gen, req);
+
+fail:
+	return err;
+}
+
+static
+psm2_error_t
+amsh_ep_connect(ptl_t *ptl,
+		int numep,
+		const psm2_epid_t *array_of_epid,
+		const int array_of_epid_mask[],
+		psm2_error_t *array_of_errors,
+		psm2_epaddr_t *array_of_epaddr, uint64_t timeout_ns)
+{
+	return amsh_ep_connreq_wrap(ptl, PTL_OP_CONNECT, numep, array_of_epid,
+				    array_of_epid_mask, array_of_errors,
+				    array_of_epaddr, timeout_ns);
+}
+
+static
+psm2_error_t
+amsh_ep_disconnect(ptl_t *ptl, int force, int numep,
+		   psm2_epaddr_t array_of_epaddr[],
+		   const int array_of_epaddr_mask[],
+		   psm2_error_t array_of_errors[], uint64_t timeout_ns)
+{
+	return amsh_ep_connreq_wrap(ptl,
+				    force ? PTL_OP_ABORT : PTL_OP_DISCONNECT,
+				    numep, NULL, array_of_epaddr_mask,
+				    array_of_errors,
+				    array_of_epaddr,
+				    timeout_ns);
+}
+
+#undef CSWAP
+PSMI_ALWAYS_INLINE(
+int32_t
+cswap(volatile int32_t *p, int32_t old_value, int32_t new_value))
+{
+	asm volatile ("lock cmpxchg %2, %0" :
+		      "+m" (*p), "+a"(old_value) : "r"(new_value) : "memory");
+	return old_value;
+}
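+
+/* CSWAP is #undef'd above, so am_ctl_getslot_pkt_inner takes the
+ * spinlock path below; the lock-free cmpxchg variant is kept for
+ * reference. */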
+
+PSMI_ALWAYS_INLINE(
+am_pkt_short_t *
+am_ctl_getslot_pkt_inner(volatile am_ctl_qhdr_t *shq, am_pkt_short_t *pkt0))
+{
+	am_pkt_short_t *pkt;
+	uint32_t idx;
+#ifndef CSWAP
+	pthread_spin_lock(&shq->lock);
+	idx = shq->tail;
+	pkt = (am_pkt_short_t *) ((uintptr_t) pkt0 + idx * shq->elem_sz);
+	if (pkt->flag == QFREE) {
+		ips_sync_reads();
+		pkt->flag = QUSED;
+		shq->tail += 1;
+		if (shq->tail == shq->elem_cnt)
+			shq->tail = 0;
+	} else {
+		pkt = 0;
+	}
+	pthread_spin_unlock(&shq->lock);
+#else
+	uint32_t idx_next;
+	do {
+		idx = shq->tail;
+		idx_next = (idx + 1 == shq->elem_cnt) ? 0 : idx + 1;
+	} while (cswap(&shq->tail, idx, idx_next) != idx);
+
+	pkt = (am_pkt_short_t *) ((uintptr_t) pkt0 + idx * shq->elem_sz);
+	while (cswap(&pkt->flag, QFREE, QUSED) != QFREE);
+#endif
+	return pkt;
+}
+
+/* This is safe because 'flag' is at the same offset on both pkt and bulkpkt */
+#define am_ctl_getslot_bulkpkt_inner(shq, pkt0) ((am_pkt_bulk_t *) \
+	am_ctl_getslot_pkt_inner(shq, (am_pkt_short_t *)(pkt0)))
+
+PSMI_ALWAYS_INLINE(
+am_pkt_short_t *
+am_ctl_getslot_pkt(ptl_t *ptl_gen, uint16_t shmidx, int is_reply))
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	volatile am_ctl_qhdr_t *shq;
+	am_pkt_short_t *pkt0;
+	if (!is_reply) {
+		shq = &(ptl->am_ep[shmidx].qdir.qreqH->shortq);
+		pkt0 = ptl->am_ep[shmidx].qdir.qreqFifoShort;
+	} else {
+		shq = &(ptl->am_ep[shmidx].qdir.qrepH->shortq);
+		pkt0 = ptl->am_ep[shmidx].qdir.qrepFifoShort;
+	}
+	return am_ctl_getslot_pkt_inner(shq, pkt0);
+}
+
+PSMI_ALWAYS_INLINE(
+am_pkt_bulk_t *
+am_ctl_getslot_long(ptl_t *ptl_gen, uint16_t shmidx, int is_reply))
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	volatile am_ctl_qhdr_t *shq;
+	am_pkt_bulk_t *pkt0;
+	if (!is_reply) {
+		shq = &(ptl->am_ep[shmidx].qdir.qreqH->longbulkq);
+		pkt0 = ptl->am_ep[shmidx].qdir.qreqFifoLong;
+	} else {
+		shq = &(ptl->am_ep[shmidx].qdir.qrepH->longbulkq);
+		pkt0 = ptl->am_ep[shmidx].qdir.qrepFifoLong;
+	}
+	return am_ctl_getslot_bulkpkt_inner(shq, pkt0);
+}
+
+psmi_handlertab_t psmi_allhandlers[] = {
+	{0},
+	{amsh_conn_handler},
+	{psmi_am_mq_handler},
+	{psmi_am_mq_handler_data},
+	{psmi_am_mq_handler_rtsmatch},
+	{psmi_am_mq_handler_rtsdone},
+	{psmi_am_handler}
+};
+
+PSMI_ALWAYS_INLINE(void advance_head(volatile am_ctl_qshort_cache_t *hdr))
+{
+	QMARKFREE(hdr->head);
+	hdr->head++;
+	if (hdr->head == hdr->end)
+		hdr->head = hdr->base;
+}
+
+#define AMSH_ZERO_POLLS_BEFORE_YIELD    64
+#define AMSH_POLLS_BEFORE_PSM_POLL      16
+
+/* XXX this can be made faster.  Instead of checking the flag of the head, keep
+ * a cached copy of the integer value of the tail and compare it against the
+ * previous one we saw.
+ */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+amsh_poll_internal_inner(ptl_t *ptl_gen, int replyonly,
+			 int is_internal))
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	psm2_error_t err = PSM2_OK_NO_PROGRESS;
+	/* poll replies */
+	if (!QISEMPTY(ptl->repH.head->flag)) {
+		do {
+			ips_sync_reads();
+			process_packet(ptl_gen, (am_pkt_short_t *) ptl->repH.head,
+				       0);
+			advance_head(&ptl->repH);
+			err = PSM2_OK;
+		} while (!QISEMPTY(ptl->repH.head->flag));
+	}
+
+	if (!replyonly) {
+		/* Request queue not enabled for 2.0; will be re-enabled to
+		 * support long replies */
+		if (!is_internal && ptl->psmi_am_reqq_fifo.first != NULL) {
+			psmi_am_reqq_drain(ptl_gen);
+			err = PSM2_OK;
+		}
+		if (!QISEMPTY(ptl->reqH.head->flag)) {
+			do {
+				ips_sync_reads();
+				process_packet(ptl_gen,
+					       (am_pkt_short_t *) ptl->reqH.head,
+					       1);
+				advance_head(&ptl->reqH);
+				err = PSM2_OK;
+			} while (!QISEMPTY(ptl->reqH.head->flag));
+		}
+	}
+
+	if (is_internal) {
+		if (err == PSM2_OK)	/* some progress, no yields */
+			ptl->zero_polls = 0;
+		else if (++ptl->zero_polls == AMSH_ZERO_POLLS_BEFORE_YIELD) {
+			/* no progress for AMSH_ZERO_POLLS_BEFORE_YIELD */
+			sched_yield();
+			ptl->zero_polls = 0;
+		}
+
+		if (++ptl->amsh_only_polls == AMSH_POLLS_BEFORE_PSM_POLL) {
+			psmi_poll_internal(ptl->ep, 0);
+			ptl->amsh_only_polls = 0;
+		}
+	}
+	return err;		/* if we actually did something */
+}
+
+/* non-inlined version */
+static
+psm2_error_t
+amsh_poll_internal(ptl_t *ptl, int replyonly)
+{
+	return amsh_poll_internal_inner(ptl, replyonly, 1);
+}
+
+#ifdef PSM_PROFILE
+#define AMSH_POLL_UNTIL(ptl, isreply, cond) \
+	do {								\
+		PSMI_PROFILE_BLOCK();					\
+		while (!(cond)) {					\
+			PSMI_PROFILE_REBLOCK(				\
+				amsh_poll_internal(ptl, isreply) ==	\
+					PSM2_OK_NO_PROGRESS);		\
+		}							\
+		PSMI_PROFILE_UNBLOCK();					\
+	} while (0)
+#else
+#define AMSH_POLL_UNTIL(ptl, isreply, cond)			\
+	do {							\
+		while (!(cond)) {				\
+			amsh_poll_internal(ptl, isreply);	\
+		}						\
+	} while (0)
+#endif
+
+static psm2_error_t amsh_poll(ptl_t *ptl, int replyonly)
+{
+	return amsh_poll_internal_inner(ptl, replyonly, 0);
+}
+
+PSMI_ALWAYS_INLINE(
+void
+am_send_pkt_short(ptl_t *ptl, uint32_t destidx, uint32_t returnidx,
+		  uint32_t bulkidx, uint16_t fmt, uint16_t nargs,
+		  uint16_t handleridx, psm2_amarg_t *args,
+		  const void *src, uint32_t len, int isreply))
+{
+	int i;
+	volatile am_pkt_short_t *pkt;
+	int copy_nargs;
+
+	AMSH_POLL_UNTIL(ptl, isreply,
+			(pkt =
+			 am_ctl_getslot_pkt(ptl, destidx, isreply)) != NULL);
+
+	/* got a free pkt... fill it in */
+	pkt->bulkidx = bulkidx;
+	pkt->shmidx = returnidx;
+	pkt->type = fmt;
+	pkt->nargs = nargs;
+	pkt->handleridx = handleridx;
+
+	/* Limit the number of args copied here to NSHORT_ARGS.  Additional args
+	   are carried in the bulkpkt. */
+	copy_nargs = nargs;
+	if (copy_nargs > NSHORT_ARGS) {
+		copy_nargs = NSHORT_ARGS;
+	}
+
+	for (i = 0; i < copy_nargs; i++)
+		pkt->args[i] = args[i];
+
+	if (fmt == AMFMT_SHORT_INLINE)
+		mq_copy_tiny((uint32_t *) &pkt->args[nargs], (uint32_t *) src,
+			     len);
+
+	_HFI_VDBG("pkt=%p fmt=%d bulkidx=%d,flag=%d,nargs=%d,"
+		  "buf=%p,len=%d,hidx=%d,value=%d\n", pkt, (int)fmt, bulkidx,
+		  pkt->flag, pkt->nargs, src, (int)len, (int)handleridx,
+		  src != NULL ? *((uint32_t *) src) : 0);
+	QMARKREADY(pkt);
+}
+
+#define amsh_shm_copy_short psmi_mq_mtucpy
+#define amsh_shm_copy_long  psmi_mq_mtucpy
+
+PSMI_ALWAYS_INLINE(
+int
+psmi_amsh_generic_inner(uint32_t amtype, ptl_t *ptl_gen, psm2_epaddr_t epaddr,
+			psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+			const void *src, size_t len, void *dst, int flags))
+{
+#ifdef PSM_DEBUG
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+#endif
+	uint16_t type;
+	uint32_t bulkidx;
+	uint16_t hidx = (uint16_t) handler;
+	int destidx = ((am_epaddr_t *) epaddr)->shmidx;
+	int returnidx = ((am_epaddr_t *) epaddr)->return_shmidx;
+	int is_reply = AM_IS_REPLY(amtype);
+	volatile am_pkt_bulk_t *bulkpkt;
+
+	_HFI_VDBG("%s epaddr=%s, shmidx=%d, type=%d\n",
+		  is_reply ? "reply" : "request",
+		  psmi_epaddr_get_name(epaddr->epid),
+		  ((am_epaddr_t *) epaddr)->shmidx, amtype);
+	psmi_assert(epaddr != ptl->epaddr);
+
+	switch (amtype) {
+	case AMREQUEST_SHORT:
+	case AMREPLY_SHORT:
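+		/* Inline iff the payload and all args fit within the short
+		 * packet's NSHORT_ARGS 8-byte argument slots. */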
+		if (len + (nargs << 3) <= (NSHORT_ARGS << 3)) {
+			/* Payload fits in args packet */
+			type = AMFMT_SHORT_INLINE;
+			bulkidx = len;
+		} else {
+			int i;
+
+			psmi_assert(len < amsh_qelemsz.qreqFifoLong);
+			psmi_assert(src != NULL || nargs > NSHORT_ARGS);
+			type = AMFMT_SHORT;
+
+			AMSH_POLL_UNTIL(ptl_gen, is_reply,
+					(bulkpkt =
+					 am_ctl_getslot_long(ptl_gen, destidx,
+							     is_reply)) !=
+					NULL);
+
+			bulkidx = bulkpkt->idx;
+			bulkpkt->len = len;
+			_HFI_VDBG("bulkpkt %p flag is %d from idx %d\n",
+				  bulkpkt, bulkpkt->flag, destidx);
+
+			for (i = 0; i < nargs - NSHORT_ARGS; i++) {
+				bulkpkt->args[i] = args[i + NSHORT_ARGS];
+			}
+
+			amsh_shm_copy_short((void *)bulkpkt->payload, src,
+					    (uint32_t) len);
+			QMARKREADY(bulkpkt);
+		}
+		am_send_pkt_short(ptl_gen, destidx, returnidx, bulkidx, type,
+				  nargs, hidx, args, src, len, is_reply);
+		break;
+
+	case AMREQUEST_LONG:
+	case AMREPLY_LONG:
+		{
+			uint32_t bytes_left = len;
+			uint8_t *src_this = (uint8_t *) src;
+			uint8_t *dst_this = (uint8_t *) dst;
+			uint32_t bytes_this;
+
+			type = AMFMT_LONG;
+
+			_HFI_VDBG("[long][%s] src=%p,dest=%p,len=%d,hidx=%d\n",
+				  is_reply ? "rep" : "req", src, dst,
+				  (uint32_t) len, hidx);
+			while (bytes_left) {
+				bytes_this = min(bytes_left, AMLONG_MTU);
+				AMSH_POLL_UNTIL(ptl_gen, is_reply,
+						(bulkpkt =
+						 am_ctl_getslot_long(ptl_gen,
+								     destidx,
+								     is_reply))
+						!= NULL);
+				bytes_left -= bytes_this;
+				if (bytes_left == 0)
+					type = AMFMT_LONG_END;
+				bulkidx = bulkpkt->idx;
+				amsh_shm_copy_long((void *)bulkpkt->payload,
+						   src_this, bytes_this);
+
+				bulkpkt->dest = (uintptr_t) dst;
+				bulkpkt->dest_off =
+				    (uint32_t) ((uintptr_t) dst_this -
+						(uintptr_t) dst);
+				bulkpkt->len = bytes_this;
+				QMARKREADY(bulkpkt);
+				am_send_pkt_short(ptl_gen, destidx, returnidx,
+						  bulkidx, type, nargs, hidx,
+						  args, NULL, 0, is_reply);
+				src_this += bytes_this;
+				dst_this += bytes_this;
+			}
+			break;
+		}
+	default:
+		break;
+	}
+	return 1;
+}
+
+/* A generic version that's not inlined */
+int
+psmi_amsh_generic(uint32_t amtype, ptl_t *ptl, psm2_epaddr_t epaddr,
+		  psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		  const void *src, size_t len, void *dst, int flags)
+{
+	return psmi_amsh_generic_inner(amtype, ptl, epaddr, handler, args,
+				       nargs, src, len, dst, flags);
+}
+
+int
+psmi_amsh_short_request(ptl_t *ptl, psm2_epaddr_t epaddr,
+			psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+			const void *src, size_t len, int flags)
+{
+	return psmi_amsh_generic_inner(AMREQUEST_SHORT, ptl, epaddr, handler,
+				       args, nargs, src, len, NULL, flags);
+}
+
+int
+psmi_amsh_long_request(ptl_t *ptl, psm2_epaddr_t epaddr,
+		       psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		       const void *src, size_t len, void *dest, int flags)
+{
+	return psmi_amsh_generic_inner(AMREQUEST_LONG, ptl, epaddr, handler,
+				       args, nargs, src, len, dest, flags);
+}
+
+void
+psmi_amsh_short_reply(amsh_am_token_t *tok,
+		      psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		      const void *src, size_t len, int flags)
+{
+	psmi_amsh_generic_inner(AMREPLY_SHORT, tok->ptl, tok->tok.epaddr_incoming,
+				handler, args, nargs, src, len, NULL, flags);
+	return;
+}
+
+void
+psmi_amsh_long_reply(amsh_am_token_t *tok,
+		     psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		     const void *src, size_t len, void *dest, int flags)
+{
+	psmi_amsh_generic_inner(AMREPLY_LONG, tok->ptl, tok->tok.epaddr_incoming,
+				handler, args, nargs, src, len, dest, flags);
+	return;
+}
+
+void psmi_am_reqq_init(ptl_t *ptl_gen)
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	ptl->psmi_am_reqq_fifo.first = NULL;
+	ptl->psmi_am_reqq_fifo.lastp = &ptl->psmi_am_reqq_fifo.first;
+}
+
+psm2_error_t psmi_am_reqq_drain(ptl_t *ptl_gen)
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	am_reqq_t *reqn = ptl->psmi_am_reqq_fifo.first;
+	am_reqq_t *req;
+	psm2_error_t err = PSM2_OK_NO_PROGRESS;
+
+	/* We're going to process the entire list, and running the generic handler
+	 * below can cause other requests to be enqueued in the queue that we're
+	 * processing. */
+	ptl->psmi_am_reqq_fifo.first = NULL;
+	ptl->psmi_am_reqq_fifo.lastp = &ptl->psmi_am_reqq_fifo.first;
+
+	while ((req = reqn) != NULL) {
+		err = PSM2_OK;
+		reqn = req->next;
+		_HFI_VDBG
+		    ("push of reqq=%p epaddr=%s localreq=%p remotereq=%p\n",
+		     req, psmi_epaddr_get_hostname(req->epaddr->epid),
+		     (void *)(uintptr_t) req->args[1].u64w0,
+		     (void *)(uintptr_t) req->args[0].u64w0);
+		psmi_amsh_generic(req->amtype, req->ptl, req->epaddr,
+				  req->handler, req->args, req->nargs, req->src,
+				  req->len, req->dest, req->amflags);
+		if (req->flags & AM_FLAG_SRC_TEMP)
+			psmi_free(req->src);
+		psmi_free(req);
+	}
+	return err;
+}
+
+void
+psmi_am_reqq_add(int amtype, ptl_t *ptl_gen, psm2_epaddr_t epaddr,
+		 psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		 void *src, size_t len, void *dest, int amflags)
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	int i;
+	int flags = 0;
+	am_reqq_t *nreq =
+	    (am_reqq_t *) psmi_malloc(ptl->ep, UNDEFINED, sizeof(am_reqq_t));
+	psmi_assert_always(nreq != NULL);
+	_HFI_VDBG("alloc of reqq=%p, to epaddr=%s, ptr=%p, len=%d, "
+		  "localreq=%p, remotereq=%p\n", nreq,
+		  psmi_epaddr_get_hostname(epaddr->epid), dest,
+		  (int)len, (void *)(uintptr_t) args[1].u64w0,
+		  (void *)(uintptr_t) args[0].u64w0);
+
+	psmi_assert(nargs <= 8);
+	nreq->next = NULL;
+	nreq->amtype = amtype;
+	nreq->ptl = ptl_gen;
+	nreq->epaddr = epaddr;
+	nreq->handler = handler;
+	for (i = 0; i < nargs; i++)
+		nreq->args[i] = args[i];
+	nreq->nargs = nargs;
+	if (AM_IS_LONG(amtype) && src != NULL &&
+	    len > 0 && !(amflags & AM_FLAG_SRC_ASYNC)) {
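+		/* This copy-to-temporary path is believed unreachable here;
+		 * fail loudly if it is ever taken. */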
+		abort();
+		flags |= AM_FLAG_SRC_TEMP;
+		nreq->src = psmi_malloc(ptl->ep, UNDEFINED, len);
+		psmi_assert_always(nreq->src != NULL);	/* XXX mem */
+		amsh_shm_copy_short(nreq->src, src, len);
+	} else
+		nreq->src = src;
+	nreq->len = len;
+	nreq->dest = dest;
+	nreq->amflags = amflags;
+	nreq->flags = flags;
+
+	nreq->next = NULL;
+	*(ptl->psmi_am_reqq_fifo.lastp) = nreq;
+	ptl->psmi_am_reqq_fifo.lastp = &nreq->next;
+}
+
+static
+void process_packet(ptl_t *ptl_gen, am_pkt_short_t *pkt, int isreq)
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	amsh_am_token_t tok;
+	psmi_handler_fn_t fn;
+	psm2_amarg_t *args = pkt->args;
+	uint16_t shmidx = pkt->shmidx;
+	int nargs = pkt->nargs;
+
+	tok.tok.epaddr_incoming = ((shmidx != (uint16_t)-1) ? ptl->am_ep[shmidx].epaddr : 0);
+	tok.ptl = ptl_gen;
+	tok.mq = ptl->ep->mq;
+	tok.shmidx = shmidx;
+
+	uint16_t hidx = (uint16_t) pkt->handleridx;
+	uint32_t bulkidx = pkt->bulkidx;
+	uintptr_t bulkptr;
+	am_pkt_bulk_t *bulkpkt;
+
+	fn = (psmi_handler_fn_t) psmi_allhandlers[hidx].fn;
+	psmi_assert(fn != NULL);
+	psmi_assert((uintptr_t) pkt > ptl->self_nodeinfo->amsh_shmbase);
+
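+	/* Inline payloads live directly after the args in the short packet;
+	 * every other format locates its bulk packet by bulkidx in the
+	 * corresponding long FIFO. */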
+	if (pkt->type == AMFMT_SHORT_INLINE) {
+		_HFI_VDBG
+		    ("%s inline flag=%d nargs=%d from_idx=%d pkt=%p hidx=%d\n",
+		     isreq ? "request" : "reply", pkt->flag, nargs, shmidx, pkt,
+		     hidx);
+
+		fn(&tok, args, nargs, pkt->length > 0 ?
+		   (void *)&args[nargs] : NULL, pkt->length);
+	} else {
+		int isend = 0;
+		switch (pkt->type) {
+		case AMFMT_LONG_END:
+			isend = 1;
+		/* fall through */
+		case AMFMT_LONG:
+		case AMFMT_SHORT:
+			if (isreq) {
+				bulkptr = (uintptr_t)
+				    ptl->self_nodeinfo->qdir.qreqFifoLong;
+				bulkptr += bulkidx * amsh_qelemsz.qreqFifoLong;
+			} else {
+				bulkptr = (uintptr_t)
+				    ptl->self_nodeinfo->qdir.qrepFifoLong;
+				bulkptr += bulkidx * amsh_qelemsz.qrepFifoLong;
+			}
+			break;
+		default:
+			bulkptr = 0;
+			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+					  "Unknown/unhandled packet type 0x%x",
+					  pkt->type);
+			return;
+		}
+
+		bulkpkt = (am_pkt_bulk_t *) bulkptr;
+		_HFI_VDBG("ep=%p mq=%p type=%d bulkidx=%d flag=%d/%d nargs=%d "
+			  "from_idx=%d pkt=%p/%p hidx=%d\n",
+			  ptl->ep, ptl->ep->mq, pkt->type, bulkidx, pkt->flag,
+			  bulkpkt->flag, nargs, shmidx, pkt, bulkpkt, hidx);
+		psmi_assert(bulkpkt->flag == QREADY);
+
+		if (nargs > NSHORT_ARGS || isend == 1) {
+			/* Either there are more args in the bulkpkt, or this is the last
+			   packet of a long payload.  In either case, copy the args. */
+			int i;
+			args =
+			    alloca((NSHORT_ARGS +
+				    NBULK_ARGS) * sizeof(psm2_amarg_t));
+
+			for (i = 0; i < NSHORT_ARGS; i++) {
+				args[i] = pkt->args[i];
+			}
+
+			for (; i < nargs; i++) {
+				args[i] = bulkpkt->args[i - NSHORT_ARGS];
+			}
+		}
+
+		if (pkt->type == AMFMT_SHORT) {
+			fn(&tok, args, nargs,
+			   (void *)bulkpkt->payload, bulkpkt->len);
+			QMARKFREE(bulkpkt);
+		} else {
+			amsh_shm_copy_long((void *)(bulkpkt->dest +
+						    bulkpkt->dest_off),
+					   bulkpkt->payload, bulkpkt->len);
+
+			/* If this is the last packet, copy args before running the
+			 * handler */
+			if (isend) {
+				void *dest = (void *)bulkpkt->dest;
+				size_t len =
+				    (size_t) (bulkpkt->dest_off + bulkpkt->len);
+				QMARKFREE(bulkpkt);
+				fn(&tok, args, nargs, dest, len);
+			} else
+				QMARKFREE(bulkpkt);
+		}
+	}
+	return;
+}
+
+static
+psm2_error_t
+amsh_mq_rndv(ptl_t *ptl, psm2_mq_t mq, psm2_mq_req_t req,
+	     psm2_epaddr_t epaddr, psm2_mq_tag_t *tag, const void *buf,
+	     uint32_t len)
+{
+	psm2_amarg_t args[5];
+	psm2_error_t err = PSM2_OK;
+
+	args[0].u32w0 = MQ_MSG_LONGRTS;
+	args[0].u32w1 = len;
+	args[1].u32w1 = tag->tag[0];
+	args[1].u32w0 = tag->tag[1];
+	args[2].u32w1 = tag->tag[2];
+	args[3].u64w0 = (uint64_t) (uintptr_t) req;
+	args[4].u64w0 = (uint64_t) (uintptr_t) buf;
+
+	psmi_assert(req != NULL);
+	req->type = MQE_TYPE_SEND;
+	req->req_data.buf = (void *)buf;
+	req->req_data.buf_len = len;
+	req->req_data.send_msglen = len;
+	req->send_msgoff = 0;
+
+#ifdef PSM_CUDA
+	/* If the send buffer is on gpu, we create a cuda IPC
+	 * handle and send it as payload in the RTS */
+	if (req->is_buf_gpu_mem) {
+		CUdeviceptr buf_base_ptr;
+		PSMI_CUDA_CALL(cuMemGetAddressRange, &buf_base_ptr, NULL, (CUdeviceptr)buf);
+
+		/* Offset in GPU buffer from which we copy data; we have to
+		 * send it separately because this offset is lost
+		 * when cuIpcGetMemHandle is called */
+		req->cuda_ipc_offset = buf - (void*)buf_base_ptr;
+		args[2].u32w0 = (uint32_t)req->cuda_ipc_offset;
+
+		PSMI_CUDA_CALL(cuIpcGetMemHandle,
+				&req->cuda_ipc_handle,
+				(CUdeviceptr) buf);
+		if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) {
+			psmi_am_reqq_add(AMREQUEST_SHORT, ptl,
+						epaddr, mq_handler_hidx,
+						args, 5, (void*)&req->cuda_ipc_handle,
+						sizeof(CUipcMemHandle), NULL, 0);
+		} else {
+			psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx,
+						args, 5, (void*)&req->cuda_ipc_handle,
+						sizeof(CUipcMemHandle), 0);
+		}
+		req->cuda_ipc_handle_attached = 1;
+	} else
+#endif
+	if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) {
+		psmi_am_reqq_add(AMREQUEST_SHORT, ptl, epaddr, mq_handler_hidx,
+					args, 5, NULL, 0, NULL, 0);
+	} else {
+		psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx,
+					args, 5, NULL, 0, 0);
+	}
+
+	mq->stats.tx_num++;
+	mq->stats.tx_shm_num++;
+	mq->stats.tx_rndv_num++;
+	// tx_rndv_bytes tabulated when get CTS
+
+	return err;
+}
+
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+amsh_mq_send_inner_eager(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr,
+			psm2_amarg_t *args, uint32_t flags_user, uint32_t flags_internal,
+			psm2_mq_tag_t *tag, const void *ubuf, uint32_t len))
+{
+	uint32_t bytes_left = len;
+	uint32_t bytes_this = 0;
+
+	psm2_handler_t handler = mq_handler_hidx;
+
+	args[1].u32w1 = tag->tag[0];
+	args[1].u32w0 = tag->tag[1];
+	args[2].u32w1 = tag->tag[2];
+	args[2].u32w0 = 0;
+
+	if (!flags_user && len <= AMLONG_MTU) {
+		if (len <= 32)
+			args[0].u32w0 = MQ_MSG_TINY;
+		else
+			args[0].u32w0 = MQ_MSG_SHORT;
+	} else {
+		args[0].u32w0 = MQ_MSG_EAGER;
+		args[0].u32w1 = len;
+	}
+
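+	/* Chunk the payload into AMLONG_MTU pieces: the first chunk goes to
+	 * the header handler, later chunks to the data handler, with
+	 * args[2].u32w0 carrying the running byte offset. */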
+	do {
+		args[2].u32w0 += bytes_this;
+		bytes_this = min(bytes_left, AMLONG_MTU);
+
+		/* Assume that shared-memory active messages are delivered in order */
+		if (flags_internal & PSMI_REQ_FLAG_FASTPATH) {
+			psmi_am_reqq_add(AMREQUEST_SHORT, epaddr->ptlctl->ptl,
+					epaddr, handler, args, 3, (void *)ubuf,
+					bytes_this, NULL, 0);
+		} else {
+			psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr,
+						handler, args, 3, ubuf, bytes_this, 0);
+		}
+
+		ubuf = (void *)((uint8_t *)ubuf + bytes_this);
+		bytes_left -= bytes_this;
+		handler = mq_handler_data_hidx;
+	} while(bytes_left);
+
+	/* All eager async sends are always "all done" */
+	if (req != NULL) {
+		req->state = MQ_STATE_COMPLETE;
+		mq_qq_append(&mq->completed_q, req);
+	}
+
+	mq->stats.tx_num++;
+	mq->stats.tx_shm_num++;
+	mq->stats.tx_shm_bytes += len;
+	mq->stats.tx_eager_num++;
+	mq->stats.tx_eager_bytes += len;
+
+	return PSM2_OK;
+}
+
+/*
+ * All shared-memory AM MQ sends; req may be NULL (blocking send)
+ */
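+/* Non-GPU selection: PSM2_MQ_FLAG_SENDSYNC or len > mq->shm_thresh_rv go
+ * through rendezvous; everything else is sent eagerly.  GPU buffers force
+ * rendezvous whenever peer-to-peer is supported between the endpoints. */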
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr,
+		   uint32_t flags_user, uint32_t flags_internal, psm2_mq_tag_t *tag,
+		   const void *ubuf, uint32_t len))
+{
+	psm2_amarg_t args[3];
+	psm2_error_t err = PSM2_OK;
+	int is_blocking = (req == NULL);
+
+#ifdef PSM_CUDA
+	int gpu_mem = 0;
+	int ep_supports_p2p = (1 << ((am_epaddr_t *) epaddr)->gpuid) & gpu_p2p_supported();
+
+	if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(ubuf)) {
+		gpu_mem = 1;
+
+		/* All sends from a gpu buffer use the rendezvous protocol if p2p is supported */
+		if (ep_supports_p2p) {
+			goto do_rendezvous;
+		}
+
+		/*
+		 * Use eager messages if P2P is unsupported between endpoints.
+		 * Potentially use rendezvous with blocking requests only.
+		 */
+		if (!is_blocking)
+			goto do_eager;
+	}
+#endif
+	if (flags_user & PSM2_MQ_FLAG_SENDSYNC)
+		goto do_rendezvous;
+
+	if (len <= mq->shm_thresh_rv)
+#ifdef PSM_CUDA
+do_eager:
+#endif
+		return amsh_mq_send_inner_eager(mq, req, epaddr, args, flags_user,
+						flags_internal, tag, ubuf, len);
+do_rendezvous:
+	if (is_blocking) {
+		req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+		if_pf(req == NULL)
+			return PSM2_NO_MEMORY;
+		req->req_data.send_msglen = len;
+		req->req_data.tag = *tag;
+
+		/* Since SEND command is blocking, this request is
+		 * entirely internal and we will not be exposed to user.
+		 * Setting as internal so it will not be added to
+		 * mq->completed_q */
+		req->flags_internal |= (flags_internal | PSMI_REQ_FLAG_IS_INTERNAL);
+	}
+#ifdef PSM_CUDA
+	void *host_buf = NULL;
+
+	req->is_buf_gpu_mem = gpu_mem;
+	if (req->is_buf_gpu_mem) {
+		psmi_cuda_set_attr_sync_memops(ubuf);
+
+		/* Use host buffer for blocking requests if GPU P2P is
+		 * unsupported between endpoints.
+		 * This will be only used with blocking requests. */
+		if (!ep_supports_p2p) {
+			host_buf = psmi_malloc(epaddr->ptlctl->ep, UNDEFINED, len);
+			PSMI_CUDA_CALL(cuMemcpyDtoH, host_buf, (CUdeviceptr)ubuf, len);
+
+			/* Reset is_buf_gpu_mem since host buffer is being used
+			 * instead of one from GPU. */
+			ubuf = host_buf;
+			req->is_buf_gpu_mem = 0;
+		}
+	}
+#endif
+
+	err = amsh_mq_rndv(epaddr->ptlctl->ptl, mq, req, epaddr, tag, ubuf, len);
+
+	if (err == PSM2_OK && is_blocking) {	/* wait... */
+		err = psmi_mq_wait_internal(&req);
+	}
+
+#ifdef PSM_CUDA
+	if (err == PSM2_OK && host_buf)
+		psmi_free(host_buf);
+#endif
+
+	return err;
+}
+
+static
+psm2_error_t
+amsh_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags_user,
+	      uint32_t flags_internal, psm2_mq_tag_t *tag, const void *ubuf,
+	      uint32_t len, void *context, psm2_mq_req_t *req_o)
+{
+	psm2_mq_req_t req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+	if_pf(req == NULL)
+	    return PSM2_NO_MEMORY;
+
+	req->req_data.send_msglen = len;
+	req->req_data.tag = *tag;
+	req->req_data.context = context;
+	req->flags_user = flags_user;
+	req->flags_internal = flags_internal;
+	_HFI_VDBG("[ishrt][%s->%s][n=0][b=%p][l=%d][t=%08x.%08x.%08x]\n",
+		  psmi_epaddr_get_name(epaddr->ptlctl->ep->epid),
+		  psmi_epaddr_get_name(epaddr->epid), ubuf, len,
+		  tag->tag[0], tag->tag[1], tag->tag[2]);
+
+	amsh_mq_send_inner(mq, req, epaddr, flags_user, flags_internal, tag, ubuf, len);
+
+	*req_o = req;
+	return PSM2_OK;
+}
+
+static
+psm2_error_t
+amsh_mq_send(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags,
+	     psm2_mq_tag_t *tag, const void *ubuf, uint32_t len)
+{
+	_HFI_VDBG("[shrt][%s->%s][n=0][b=%p][l=%d][t=%08x.%08x.%08x]\n",
+		  psmi_epaddr_get_name(epaddr->ptlctl->ep->epid),
+		  psmi_epaddr_get_name(epaddr->epid), ubuf, len,
+		  tag->tag[0], tag->tag[1], tag->tag[2]);
+
+	amsh_mq_send_inner(mq, NULL, epaddr, flags, PSMI_REQ_FLAG_NORMAL, tag, ubuf, len);
+
+	return PSM2_OK;
+}
+
+/* kassist-related handling */
+int psmi_epaddr_pid(psm2_epaddr_t epaddr)
+{
+	uint16_t shmidx = ((am_epaddr_t *) epaddr)->shmidx;
+	return ((struct ptl_am *)(epaddr->ptlctl->ptl))->am_ep[shmidx].pid;
+}
+#if _HFI_DEBUGGING
+static
+const char *psmi_kassist_getmode(int mode)
+{
+	switch (mode) {
+	case PSMI_KASSIST_OFF:
+		return "kassist off";
+	case PSMI_KASSIST_CMA_GET:
+		return "cma get";
+	case PSMI_KASSIST_CMA_PUT:
+		return "cma put";
+	default:
+		return "unknown";
+	}
+}
+#endif
+
+static
+int psmi_get_kassist_mode()
+{
+	/* Cuda PSM2 supports only KASSIST_CMA_GET */
+	int mode = PSMI_KASSIST_CMA_GET;
+#ifndef PSM_CUDA
+	union psmi_envvar_val env_kassist;
+
+	if (!psmi_getenv("PSM3_KASSIST_MODE",
+			 "PSM Shared memory kernel assist mode "
+			 "(cma-put, cma-get, none)",
+			 PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR,
+			 (union psmi_envvar_val)
+			 PSMI_KASSIST_MODE_DEFAULT_STRING, &env_kassist)) {
+		char *s = env_kassist.e_str;
+		if (strcasecmp(s, "cma-put") == 0)
+			mode = PSMI_KASSIST_CMA_PUT;
+		else if (strcasecmp(s, "cma-get") == 0)
+			mode = PSMI_KASSIST_CMA_GET;
+		else
+			mode = PSMI_KASSIST_OFF;
+	}
+#endif
+	return mode;
+}
+
+/* Connection handling for shared memory AM.
+ *
+ * arg0 => conn_op, return_shmidx, connect phase
+ * arg1 => epid (always)
+ * arg2 => extra ep data (pid, gpuid), result (PSM error type)
+ * arg3 => pointer to error for replies.
+ */
+static
+void
+amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf,
+		  size_t len)
+{
+	int op = args[0].u16w0;
+	int phase = args[0].u32w1;
+	psm2_epid_t epid = args[1].u64w0;
+	int16_t return_shmidx = args[0].u16w1;
+	psm2_error_t err = (psm2_error_t) args[2].u32w1;
+	psm2_error_t *perr = (psm2_error_t *) (uintptr_t) args[3].u64w0;
+	unsigned int pid;
+	unsigned int gpuid;
+	int force_remap = 0;
+
+	psm2_epaddr_t epaddr;
+	amsh_am_token_t *tok = (amsh_am_token_t *) toki;
+	uint16_t shmidx = tok->shmidx;
+	int is_valid;
+	struct ptl_am *ptl = (struct ptl_am *)(tok->ptl);
+	ptl_t *ptl_gen = tok->ptl;
+	int cstate;
+
+	/* Assert this up front: the code below assumes connection messages
+	 * carry no payload */
+	psmi_assert_always(buf == NULL && len == 0);
+	read_extra_ep_data(args[2].u32w0, &pid, &gpuid);
+
+	_HFI_CONNDBG("Conn op=%d, phase=%d, epid=0x%llx, err=%d\n",
+		  op, phase, (unsigned long long)epid, err);
+
+	switch (op) {
+	case PSMI_AM_CONN_REQ:
+		_HFI_CONNDBG("Connect from 0x%"PRIx64":%"PRIu64"\n",
+			  psm2_epid_nid(epid), psm2_epid_context(epid));
+		epaddr = psmi_epid_lookup(ptl->ep, epid);
+		if (epaddr && ((am_epaddr_t *) epaddr)->pid != pid) {
+			/* If old pid is unknown consider new pid the correct one */
+			if (((am_epaddr_t *) epaddr)->pid == AMSH_PID_UNKNOWN) {
+				((am_epaddr_t *) epaddr)->pid = pid;
+				((am_epaddr_t *) epaddr)->gpuid = gpuid;
+			} else {
+				psmi_epid_remove(ptl->ep, epid);
+				epaddr = NULL;
+				force_remap = 1;
+			}
+		}
+
+		if (shmidx == (uint16_t)-1) {
+			/* The sender doesn't know its slot in our directory
+			   yet (it sent shmidx -1), so map its segment now;
+			   psmi_shm_map_remote will look up or create the
+			   proper shmidx */
+			if ((err = psmi_shm_map_remote(ptl_gen, epid, &shmidx, force_remap))) {
+				psmi_handle_error(PSMI_EP_NORETURN, err,
+						  "Fatal error in "
+						  "connecting to shm segment");
+			}
+			am_update_directory(&ptl->am_ep[shmidx]);
+			tok->shmidx = shmidx;
+		}
+
+		if (epaddr == NULL) {
+			uintptr_t args_segoff =
+				(uintptr_t) args - ptl->self_nodeinfo->amsh_shmbase;
+			if ((err = amsh_epaddr_add(ptl_gen, epid, shmidx, &epaddr)))
+				/* Unfortunately, no way out of here yet */
+				psmi_handle_error(PSMI_EP_NORETURN, err,
+						  "Fatal error "
+						  "in connecting to shm segment");
+			args =
+			    (psm2_amarg_t *) (ptl->self_nodeinfo->amsh_shmbase +
+					     args_segoff);
+
+			((am_epaddr_t *) epaddr)->pid = pid;
+			((am_epaddr_t *) epaddr)->gpuid = gpuid;
+		}
+
+		/* Rewrite args */
+		ptl->connect_incoming++;
+		args[0].u16w0 = PSMI_AM_CONN_REP;
+		/* and return our shmidx for the connecting process */
+		args[0].u16w1 = shmidx;
+		args[1].u64w0 = (psm2_epid_t) ptl->epid;
+		args[2].u32w0 = create_extra_ep_data();
+		args[2].u32w1 = PSM2_OK;
+		((am_epaddr_t *) epaddr)->cstate_incoming =
+			AMSH_CSTATE_INCOMING_ESTABLISHED;
+		((am_epaddr_t *) epaddr)->return_shmidx = return_shmidx;
+		tok->tok.epaddr_incoming = epaddr;	/* adjust token */
+		psmi_amsh_short_reply(tok, amsh_conn_handler_hidx,
+				      args, narg, NULL, 0, 0);
+		break;
+
+	case PSMI_AM_CONN_REP:
+		if (ptl->connect_phase != phase) {
+			_HFI_CONNDBG("Out of phase connect reply exp %d got %d\n", ptl->connect_phase, phase);
+			return;
+		}
+		epaddr = ptl->am_ep[shmidx].epaddr;
+		/* check if a race has occurred on shm-file reuse.
+		 * if so, don't transition to the next state.
+		 * the next call to connreq_poll() will restart the
+		 * connection.
+		*/
+		if (ptl->am_ep[shmidx].pid !=
+		    ((struct am_ctl_nodeinfo *) ptl->am_ep[shmidx].amsh_shmbase)->pid)
+			break;
+
+		*perr = err;
+		((am_epaddr_t *) epaddr)->cstate_outgoing
+			= AMSH_CSTATE_OUTGOING_REPLIED;
+		((am_epaddr_t *) epaddr)->return_shmidx = return_shmidx;
+		ptl->connect_outgoing++;
+		_HFI_CONNDBG("CCC epaddr=%s connected to ptl=%p\n",
+			  psmi_epaddr_get_name(epaddr->epid), ptl);
+		break;
+
+	case PSMI_AM_DISC_REQ:
+		epaddr = psmi_epid_lookup(ptl->ep, epid);
+		if (!epaddr) {
+			_HFI_CONNDBG("Dropping disconnect request from an epid that we are not connected to 0x%"PRIx64"\n", epid);
+			return;
+		}
+		args[0].u16w0 = PSMI_AM_DISC_REP;
+		args[2].u32w1 = PSM2_OK;
+		((am_epaddr_t *) epaddr)->cstate_incoming =
+			AMSH_CSTATE_INCOMING_DISC_REQUESTED;
+		ptl->connect_incoming--;
+		/* Before sending the reply, make sure the process
+		 * is still connected */
+
+		is_valid = (ptl->am_ep[shmidx].epid == epaddr->epid);
+
+		if (is_valid) {
+			psmi_amsh_short_reply(tok, amsh_conn_handler_hidx,
+					      args, narg, NULL, 0, 0);
+			/*
+			 * Only munmap if we have nothing more to
+			 * communicate with the other node, i.e. we are
+			 * already disconnected from the other node
+			 * or have sent a disconnect request.
+			 */
+			cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing;
+			if (cstate == AMSH_CSTATE_OUTGOING_DISC_REQUESTED) {
+				err = psmi_do_unmap(ptl->am_ep[shmidx].amsh_shmbase);
+				psmi_epid_remove(epaddr->ptlctl->ep, epaddr->epid);
+			}
+		}
+		break;
+
+	case PSMI_AM_DISC_REP:
+		if (ptl->connect_phase != phase) {
+			_HFI_CONNDBG("Out of phase disconnect reply exp %d got %d\n", ptl->connect_phase, phase);
+			return;
+		}
+		if (perr)
+			*perr = err;
+		epaddr = tok->tok.epaddr_incoming;
+		((am_epaddr_t *) epaddr)->cstate_outgoing =
+			AMSH_CSTATE_OUTGOING_DISC_REPLIED;
+		ptl->connect_outgoing--;
+		break;
+
+	default:
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+				  "Unknown/unhandled connect handler op=%d",
+				  op);
+		break;
+	}
+	return;
+}
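+
+/* Handshake summary (editor's sketch derived from the handler above; not
+ * an upstream comment):
+ *
+ *   initiator                          target
+ *   ---------                          ------
+ *   CONN_REQ(epid, pid)   ---------->  map segment if needed, add epaddr,
+ *                                      cstate_incoming = ESTABLISHED
+ *   cstate_outgoing =     <----------  CONN_REP(epid, shmidx, err)
+ *     REPLIED
+ *   DISC_REQ              ---------->  cstate_incoming = DISC_REQUESTED
+ *   cstate_outgoing =     <----------  DISC_REP
+ *     DISC_REPLIED
+ */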
+
+static
+size_t amsh_sizeof(void)
+{
+	return sizeof(struct ptl_am);
+}
+
+/* Fill in AM capabilities parameters */
+psm2_error_t
+psmi_amsh_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters)
+{
+	if (parameters == NULL) {
+		return PSM2_PARAM_ERR;
+	}
+
+	parameters->max_handlers = PSMI_AM_NUM_HANDLERS;
+	parameters->max_nargs = PSMI_AM_MAX_ARGS;
+	parameters->max_request_short = AMLONG_MTU;
+	parameters->max_reply_short = AMLONG_MTU;
+
+	return PSM2_OK;
+}
+
+/**
+ * @param ep PSM Endpoint, guaranteed to have initialized epaddr and epid.
+ * @param ptl Pointer to caller-allocated space for PTL (fill in)
+ * @param ctl Pointer to caller-allocated space for PTL-control
+ *            structure (fill in)
+ */
+static
+psm2_error_t
+amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl)
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	psm2_error_t err = PSM2_OK;
+
+	/* Preconditions */
+	psmi_assert_always(ep != NULL);
+	psmi_assert_always(ep->epaddr != NULL);
+	psmi_assert_always(ep->epid != 0);
+
+	ptl->ep = ep;		/* back pointer */
+	ptl->epid = ep->epid;	/* cache epid */
+	ptl->epaddr = ep->epaddr;	/* cache a copy */
+	ptl->ctl = ctl;
+	ptl->zero_polls = 0;
+
+	ptl->connect_phase = 0;
+	ptl->connect_incoming = 0;
+	ptl->connect_outgoing = 0;
+
+	memset(&ptl->amsh_empty_shortpkt, 0, sizeof(ptl->amsh_empty_shortpkt));
+	memset(&ptl->psmi_am_reqq_fifo, 0, sizeof(ptl->psmi_am_reqq_fifo));
+
+	ptl->max_ep_idx = -1;
+	ptl->am_ep_size = AMSH_DIRBLOCK_SIZE;
+
+	ptl->am_ep = (struct am_ctl_nodeinfo *)
+		psmi_memalign(ptl->ep, PER_PEER_ENDPOINT, 64,
+			      ptl->am_ep_size * sizeof(struct am_ctl_nodeinfo));
+
+	if (ptl->am_ep == NULL) {
+		err = PSM2_NO_MEMORY;
+		goto fail;
+	}
+	memset(ptl->am_ep, 0, ptl->am_ep_size * sizeof(struct am_ctl_nodeinfo));
+
+	if ((err = amsh_init_segment(ptl_gen)))
+		goto fail;
+
+	ptl->self_nodeinfo->psm_verno = PSMI_VERNO;
+	if (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF) {
+		if (cma_available()) {
+			ptl->self_nodeinfo->amsh_features |=
+				AMSH_HAVE_CMA;
+			psmi_shm_mq_rv_thresh =
+				PSMI_MQ_RV_THRESH_CMA;
+		} else {
+			ptl->psmi_kassist_mode =
+				PSMI_KASSIST_OFF;
+			psmi_shm_mq_rv_thresh =
+				PSMI_MQ_RV_THRESH_NO_KASSIST;
+		}
+	} else {
+		psmi_shm_mq_rv_thresh =
+			PSMI_MQ_RV_THRESH_NO_KASSIST;
+	}
+	ptl->self_nodeinfo->pid = getpid();
+	ptl->self_nodeinfo->epid = ep->epid;
+	ptl->self_nodeinfo->epaddr = ep->epaddr;
+
+	ips_mb();
+	ptl->self_nodeinfo->is_init = 1;
+
+	psmi_am_reqq_init(ptl_gen);
+	memset(ctl, 0, sizeof(*ctl));
+
+	/* Fill in the control structure */
+	ctl->ep = ep;
+	ctl->ptl = ptl_gen;
+	ctl->ep_poll = amsh_poll;
+	ctl->ep_connect = amsh_ep_connect;
+	ctl->ep_disconnect = amsh_ep_disconnect;
+
+	ctl->mq_send = amsh_mq_send;
+	ctl->mq_isend = amsh_mq_isend;
+
+	ctl->am_get_parameters = psmi_amsh_am_get_parameters;
+	ctl->am_short_request = psmi_amsh_am_short_request;
+	ctl->am_short_reply = psmi_amsh_am_short_reply;
+
+#if 0	// unused code, specific to QLogic MPI
+	/* No stats in shm (for now...) */
+	ctl->epaddr_stats_num = NULL;
+	ctl->epaddr_stats_init = NULL;
+	ctl->epaddr_stats_get = NULL;
+#endif
+#ifdef PSM_CUDA
+	union psmi_envvar_val env_memcache_enabled;
+	psmi_getenv("PSM3_CUDA_MEMCACHE_ENABLED",
+		    "PSM cuda ipc memhandle cache enabled (default is enabled)",
+		     PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+		     (union psmi_envvar_val)
+		      1, &env_memcache_enabled);
+	if (PSMI_IS_CUDA_ENABLED && env_memcache_enabled.e_uint) {
+		union psmi_envvar_val env_memcache_size;
+		psmi_getenv("PSM3_CUDA_MEMCACHE_SIZE",
+			    "Size of the cuda ipc memhandle cache ",
+			    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+			    (union psmi_envvar_val)
+			    CUDA_MEMHANDLE_CACHE_SIZE, &env_memcache_size);
+		if ((err = am_cuda_memhandle_cache_init(env_memcache_size.e_uint)) != PSM2_OK)
+			goto fail;
+	}
+#endif
+fail:
+	return err;
+}
+
+static psm2_error_t amsh_fini(ptl_t *ptl_gen, int force, uint64_t timeout_ns)
+{
+	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
+	struct psmi_eptab_iterator itor;
+	psm2_epaddr_t epaddr;
+	psm2_error_t err = PSM2_OK;
+	psm2_error_t err_seg;
+	uint64_t t_start = get_cycles();
+	int i = 0;
+
+	/* Close whatever has been left open -- this will be factored out for 2.1 */
+	if (ptl->connect_outgoing > 0) {
+		int num_disc = 0;
+		int *mask;
+		psm2_error_t *errs;
+		psm2_epaddr_t *epaddr_array;
+
+		psmi_epid_itor_init(&itor, ptl->ep);
+		while ((epaddr = psmi_epid_itor_next(&itor))) {
+			if (epaddr->ptlctl->ptl != ptl_gen)
+				continue;
+			if (((am_epaddr_t *) epaddr)->cstate_outgoing ==
+			    AMSH_CSTATE_OUTGOING_ESTABLISHED)
+				num_disc++;
+		}
+		psmi_epid_itor_fini(&itor);
+		if (! num_disc)
+			goto poll;
+
+		mask =
+		    (int *)psmi_calloc(ptl->ep, UNDEFINED, num_disc,
+				       sizeof(int));
+		errs = (psm2_error_t *)
+		    psmi_calloc(ptl->ep, UNDEFINED, num_disc,
+				sizeof(psm2_error_t));
+		epaddr_array = (psm2_epaddr_t *)
+		    psmi_calloc(ptl->ep, UNDEFINED, num_disc,
+				sizeof(psm2_epaddr_t));
+
+		if (errs == NULL || epaddr_array == NULL || mask == NULL) {
+			if (epaddr_array)
+				psmi_free(epaddr_array);
+			if (errs)
+				psmi_free(errs);
+			if (mask)
+				psmi_free(mask);
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+		psmi_epid_itor_init(&itor, ptl->ep);
+		while ((epaddr = psmi_epid_itor_next(&itor))) {
+			if (epaddr->ptlctl->ptl == ptl_gen) {
+				if (((am_epaddr_t *) epaddr)->cstate_outgoing ==
+				    AMSH_CSTATE_OUTGOING_ESTABLISHED) {
+					mask[i] = 1;
+					epaddr_array[i] = epaddr;
+					i++;
+				}
+			}
+		}
+		psmi_epid_itor_fini(&itor);
+		psmi_assert(i == num_disc && num_disc > 0);
+		err = amsh_ep_disconnect(ptl_gen, force, num_disc, epaddr_array,
+					 mask, errs, timeout_ns);
+		psmi_free(mask);
+		psmi_free(errs);
+		psmi_free(epaddr_array);
+	}
+
+poll:
+	if (ptl->connect_incoming > 0 || ptl->connect_outgoing > 0) {
+		_HFI_CONNDBG("CCC polling disconnect from=%d,to=%d to=%"PRIu64" phase %d\n",
+			  ptl->connect_incoming, ptl->connect_outgoing, timeout_ns, ptl->connect_phase);
+		while (ptl->connect_incoming > 0 || ptl->connect_outgoing > 0) {
+			if (!psmi_cycles_left(t_start, timeout_ns)) {
+				err = PSM2_TIMEOUT;
+				_HFI_CONNDBG("CCC timed out with from=%d,to=%d\n",
+					  ptl->connect_incoming, ptl->connect_outgoing);
+				break;
+			}
+			psmi_poll_internal(ptl->ep, 1);
+		}
+		_HFI_CONNDBG("CCC done polling disconnect from=%d,to=%d\n",
+			  ptl->connect_incoming, ptl->connect_outgoing);
+	} else
+		_HFI_CONNDBG("CCC complete disconnect from=%d,to=%d\n",
+			  ptl->connect_incoming, ptl->connect_outgoing);
+
+	if ((err_seg = psmi_shm_detach(ptl_gen))) {
+		err = err_seg;
+		goto fail;
+	}
+
+	/* This prevents poll calls issued between now and the point where the
+	 * endpoint is deallocated from referencing memory that has disappeared */
+	ptl->repH.head = &ptl->amsh_empty_shortpkt;
+	ptl->reqH.head = &ptl->amsh_empty_shortpkt;
+
+	if (ptl->am_ep)
+		psmi_free(ptl->am_ep);
+
+#ifdef PSM_CUDA
+	if (PSMI_IS_CUDA_ENABLED)
+		am_cuda_memhandle_cache_map_fini();
+#endif
+	return PSM2_OK;
+fail:
+	return err;
+}
+
+static
+psm2_error_t
+amsh_setopt(const void *component_obj, int optname,
+	    const void *optval, uint64_t optlen)
+{
+	/* No options for AM PTL at the moment */
+	return psmi_handle_error(NULL, PSM2_PARAM_ERR,
+				 "Unknown AM ptl option %u.", optname);
+}
+
+static
+psm2_error_t
+amsh_getopt(const void *component_obj, int optname,
+	    void *optval, uint64_t *optlen)
+{
+	/* No options for AM PTL at the moment */
+	return psmi_handle_error(NULL, PSM2_PARAM_ERR,
+				 "Unknown AM ptl option %u.", optname);
+}
+
+/* Only symbol we expose out of here */
+struct ptl_ctl_init
+psmi_ptl_amsh = {
+	amsh_sizeof, amsh_init, amsh_fini, amsh_setopt, amsh_getopt
+};
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_am/cmarw.h b/deps/libfabric/prov/psm3/psm3/ptl_am/cmarw.h
new file mode 100644
index 0000000000000000000000000000000000000000..0317ed422b1b3df80cc1caf2b169fd8bc6d7201f
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_am/cmarw.h
@@ -0,0 +1,73 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <stdint.h>
+
+/*
+ * read from remote process pid
+ */
+int64_t cma_get(pid_t pid, const void *src, void *dst, int64_t n);
+
+/*
+ * write to remote process pid
+ */
+int64_t cma_put(const void *src, pid_t pid, void *dst, int64_t n);
+
+/*
+ * Test if CMA is available by trying a no-op call.
+ * Returns 1 if CMA is present, 0 if not.
+ */
+int cma_available(void);
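+
+/*
+ * Usage sketch (illustrative only, not part of this header): a receiver
+ * that knows the peer's pid and source address -- both exchanged over the
+ * shared-memory connect protocol -- can pull data directly:
+ *
+ *   char buf[4096];
+ *   if (cma_available()) {
+ *       int64_t n = cma_get(peer_pid, remote_src, buf, sizeof(buf));
+ *       if (n == -1)
+ *           ;  // fall back to the two-copy shared-memory path
+ *   }
+ *
+ * Both transfer calls loop internally until all n bytes have moved, so the
+ * only short return is -1 on error.
+ */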
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_am/cmarwu.c b/deps/libfabric/prov/psm3/psm3/ptl_am/cmarwu.c
new file mode 100644
index 0000000000000000000000000000000000000000..9c859da699e11243a952c9eae4b218cf41d99a9f
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_am/cmarwu.c
@@ -0,0 +1,207 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+#include "psm_user.h"
+#include "cmarw.h"
+
+/* An iovec looks like this:
+ * struct iovec {
+ *       void  *iov_base;    // Starting address
+ *       size_t iov_len;     // Number of bytes to transfer
+ * };
+ */
+
+#if 0
+#define __NR_process_vm_readv			310
+#define __NR_process_vm_writev			311
+
+#define process_vm_readv(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \
+	syscall(__NR_process_vm_readv, \
+		pid, local_iov, liovcnt, remote_iov, riovcnt, flags)
+
+#define process_vm_writev(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \
+	syscall(__NR_process_vm_writev, \
+		pid, local_iov, liovcnt, remote_iov, riovcnt, flags)
+#endif
+
+/* CMA syscall wrappers were added in glibc 2.15.  For anything older than
+   that we need to define our own: some glibc versions (2.12 from RHEL 6.3
+   definitely has this bug) only pass up to 5 arguments through the generic
+   syscall() function, while these CMA calls take 6.  So for now we work
+   around it by issuing the syscall directly from inline assembly.
+*/
+
+#if defined(__GLIBC__) && ((__GLIBC__ == 2) && (__GLIBC_MINOR__ < 15))
+
+#ifdef __x86_64__
+
+#define __NR_process_vm_readv			310
+#define __NR_process_vm_writev			311
+
+static inline ssize_t __x86_64_syscall6(int syscall,
+					pid_t pid,
+					const struct iovec *local_iov,
+					unsigned long liovcnt,
+					const struct iovec *remote_iov,
+					unsigned long riovcnt,
+					unsigned long flags)
+{
+	/*GCC inline ASM is annoying -- can't specify all the x86_64 registers
+	   directly, so declare register-specific variables and use them. */
+	register int64_t rax asm("rax") = syscall;
+	register int64_t rdi asm("rdi") = pid;
+	register int64_t rsi asm("rsi") = (intptr_t) local_iov;
+	register int64_t rdx asm("rdx") = liovcnt;
+	register int64_t r10 asm("r10") = (intptr_t) remote_iov;
+	register int64_t r8 asm("r8") = riovcnt;
+	register int64_t r9 asm("r9") = flags;
+
+	asm volatile ("syscall\n" : "=a" (rax)
+		      : "r"(rax), "r"(rdi), "r"(rsi), "r"(rdx), "r"(r10),
+		      "r"(r8), "r"(r9)
+		      : "%rcx", "%r11", "cc", "memory");
+	return rax;
+}
+
+#define process_vm_readv(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \
+	__x86_64_syscall6(__NR_process_vm_readv, \
+			  pid, local_iov, liovcnt, remote_iov, riovcnt, flags)
+
+#define process_vm_writev(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \
+	__x86_64_syscall6(__NR_process_vm_writev, \
+			  pid, local_iov, liovcnt, remote_iov, riovcnt, flags)
+
+#else /* ndef __x86_64__ */
+#error "Can't compile CMA support for this architecture."
+#endif /* __x86_64__ */
+#endif /* __GLIBC__ < 2.15 */
+
+int64_t cma_get(pid_t pid, const void *src, void *dst, int64_t n)
+{
+	int64_t nr, sum;
+	struct iovec local = {
+		.iov_base = dst,
+		.iov_len = n
+	};
+	struct iovec remote = {
+		.iov_base = (void *)src,
+		.iov_len = n
+	};
+	nr = sum = 0;
+	while (sum != n) {
+		nr = process_vm_readv(pid, &local, 1, &remote, 1, 0);
+		if (nr == -1) {
+			return -1;
+		}
+		sum += nr;
+		local.iov_base = (void *)((uint8_t *)local.iov_base + nr);
+		local.iov_len -= nr;
+		remote.iov_base = (void *)((uint8_t *)remote.iov_base + nr);
+		remote.iov_len -= nr;
+	}
+	return sum;
+}
+
+int64_t cma_put(const void *src, pid_t pid, void *dst, int64_t n)
+{
+	int64_t nr, sum;
+	struct iovec local = {
+		.iov_base = (void *)src,
+		.iov_len = n
+	};
+	struct iovec remote = {
+		.iov_base = dst,
+		.iov_len = n
+	};
+
+	nr = sum = 0;
+	while (sum != n) {
+		nr = process_vm_writev(pid, &local, 1, &remote, 1, 0);
+		if (nr == -1) {
+			return -1;
+		}
+		sum += nr;
+		local.iov_base = (void *)((uint8_t *)local.iov_base + nr);
+		local.iov_len -= nr;
+		remote.iov_base = (void *)((uint8_t *)remote.iov_base + nr);
+		remote.iov_len -= nr;
+	}
+	return sum;
+}
+
+/* Test if CMA is available by trying a no-op call. */
+int cma_available(void)
+{
+	/* Make a no-op CMA syscall. If CMA is present, 0 (bytes transferred)
+	 * is returned.  If not, the call fails with -1 and errno == ENOSYS. */
+
+	int ret = process_vm_readv(getpid(), NULL, 0, NULL, 0, 0);
+
+	if (ret == 0) {
+		/* CMA is available! */
+		return 1;
+	}
+
+	return 0;
+}
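+
+/* Note (editor's addition): a successful no-op probe only proves the
+ * syscall exists. A real cma_get()/cma_put() against a peer additionally
+ * requires ptrace-level permission on that process (PTRACE_MODE_ATTACH),
+ * so hardened systems (e.g. Yama ptrace_scope > 0) can still fail at
+ * transfer time with EPERM even though cma_available() returned 1 -- which
+ * is why the callers in ptl.c disable kassist dynamically on failure.
+ */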
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_am/psm_am_internal.h b/deps/libfabric/prov/psm3/psm3/ptl_am/psm_am_internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..c4c08a5f007f9c286568319a443b8feccd0b19b8
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_am/psm_am_internal.h
@@ -0,0 +1,448 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#ifndef PSMI_AM_H
+#define PSMI_AM_H
+
+#include "am_config.h"
+#include "../psm_am_internal.h"
+
+#define AMSH_DIRBLOCK_SIZE 128
+
+typedef
+struct am_epaddr {
+	/*
+	 * epaddr must be the first field to have the same address as this
+	 * structure
+	 */
+	struct psm2_epaddr epaddr;
+
+	uint16_t shmidx;
+	uint16_t return_shmidx;
+
+	uint32_t cstate_outgoing:3;
+	uint32_t cstate_incoming:3;
+	uint32_t pid:22;
+	/*
+	 * Device number of GPU used by given EP, only used when CUDA is
+	 * enabled. There is no gain from #ifdefing it out, since it does not
+	 * use any extra space.
+	 */
+	uint32_t gpuid:4;
+} am_epaddr_t;
+
+/* Up to NSHORT_ARGS are supported via am_pkt_short_t; the remaining
+   arguments are passed using space in am_pkt_bulk_t.  One additional argument
+   is added for passing the internal ptl_am handler index. */
+#define NSHORT_ARGS 6
+#define NBULK_ARGS  (PSMI_AM_MAX_ARGS - NSHORT_ARGS + 1)
+
+typedef
+struct amsh_am_token {
+	struct psmi_am_token tok;
+
+	ptl_t *ptl;	  /**> What PTL was it received on */
+	psm2_mq_t mq;	  /**> What matched queue is this for ? */
+	uint16_t shmidx;  /**> what shmidx sent this */
+} amsh_am_token_t;
+
+typedef void (*psmi_handler_fn_t) (void *token, psm2_amarg_t *args, int nargs,
+				   void *src, size_t len);
+
+typedef struct psmi_handlertab {
+	psmi_handler_fn_t fn;
+} psmi_handlertab_t;
+
+#define PSMI_AM_CONN_REQ    1
+#define PSMI_AM_CONN_REP    2
+#define PSMI_AM_DISC_REQ    3
+#define PSMI_AM_DISC_REP    4
+
+#define PSMI_KASSIST_OFF       0x0
+#define PSMI_KASSIST_CMA_GET   0x1
+#define PSMI_KASSIST_CMA_PUT   0x2
+
+#define PSMI_KASSIST_CMA       0x3
+#define PSMI_KASSIST_GET       0x1
+#define PSMI_KASSIST_PUT       0x2
+#define PSMI_KASSIST_MASK      0x3
+
+int psmi_epaddr_pid(psm2_epaddr_t epaddr);
+
+/*
+ * Eventually, we will allow users to register handlers as "don't reply", which
+ * may save on some of the buffering requirements
+ */
+#define PSMI_HANDLER_NEEDS_REPLY(handler)    1
+#define PSMI_VALIDATE_REPLY(handler)    assert(PSMI_HANDLER_NEEDS_REPLY(handler))
+
+int psmi_amsh_poll(ptl_t *ptl, int replyonly);
+
+/* Shared memory AM, forward decls */
+int
+psmi_amsh_short_request(ptl_t *ptl, psm2_epaddr_t epaddr,
+			psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+			const void *src, size_t len, int flags);
+
+void
+psmi_amsh_short_reply(amsh_am_token_t *tok,
+		      psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		      const void *src, size_t len, int flags);
+
+int
+psmi_amsh_long_request(ptl_t *ptl, psm2_epaddr_t epaddr,
+		       psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		       const void *src, size_t len, void *dest, int flags);
+
+void
+psmi_amsh_long_reply(amsh_am_token_t *tok,
+		     psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		     const void *src, size_t len, void *dest, int flags);
+
+void psmi_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf,
+			size_t len);
+void psmi_am_mq_handler_data(void *toki, psm2_amarg_t *args, int narg,
+			     void *buf, size_t len);
+void psmi_am_mq_handler_complete(void *toki, psm2_amarg_t *args, int narg,
+				 void *buf, size_t len);
+void psmi_am_mq_handler_rtsmatch(void *toki, psm2_amarg_t *args, int narg,
+				 void *buf, size_t len);
+void psmi_am_mq_handler_rtsdone(void *toki, psm2_amarg_t *args, int narg,
+				void *buf, size_t len);
+void psmi_am_handler(void *toki, psm2_amarg_t *args, int narg, void *buf,
+		     size_t len);
+
+/* AM over shared memory (forward decls) */
+psm2_error_t
+psmi_amsh_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters);
+
+psm2_error_t
+psmi_amsh_am_short_request(psm2_epaddr_t epaddr,
+			   psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+			   void *src, size_t len, int flags,
+			   psm2_am_completion_fn_t completion_fn,
+			   void *completion_ctxt);
+
+psm2_error_t
+psmi_amsh_am_short_reply(psm2_am_token_t tok,
+			 psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+			 void *src, size_t len, int flags,
+			 psm2_am_completion_fn_t completion_fn,
+			 void *completion_ctxt);
+
+#define amsh_conn_handler_hidx	 1
+#define mq_handler_hidx          2
+#define mq_handler_data_hidx     3
+#define mq_handler_rtsmatch_hidx 4
+#define mq_handler_rtsdone_hidx  5
+#define am_handler_hidx          6
+
+#define AMREQUEST_SHORT 0
+#define AMREQUEST_LONG  1
+#define AMREPLY_SHORT   2
+#define AMREPLY_LONG    3
+#define AM_IS_REPLY(x)     ((x)&0x2)
+#define AM_IS_REQUEST(x)   (!AM_IS_REPLY(x))
+#define AM_IS_LONG(x)      ((x)&0x1)
+#define AM_IS_SHORT(x)     (!AM_IS_LONG(x))
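+
+/* Worked example of the 2-bit encoding above (editor's note):
+ *   AMREPLY_LONG  == 3 == 0b11 -> AM_IS_REPLY(3) and AM_IS_LONG(3) hold
+ *   AMREQUEST_SHORT == 0       -> AM_IS_REQUEST(0) and AM_IS_SHORT(0) hold
+ * Bit 1 selects request vs. reply, bit 0 selects short vs. long.
+ */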
+
+#define AM_FLAG_SRC_ASYNC   0x1
+#define AM_FLAG_SRC_TEMP    0x2
+
+/*
+ * Request Fifo.
+ */
+typedef
+struct am_reqq {
+	struct am_reqq *next;
+
+	ptl_t *ptl;
+	psm2_epaddr_t epaddr;
+	int amtype;
+	psm2_handler_t handler;
+	psm2_amarg_t args[8];
+	int nargs;
+	uint32_t len;
+	void *src;
+	void *dest;
+	int amflags;
+	int flags;
+} am_reqq_t;
+
+struct am_reqq_fifo_t {
+	am_reqq_t *first;
+	am_reqq_t **lastp;
+};
+
+psm2_error_t psmi_am_reqq_drain(ptl_t *ptl);
+void psmi_am_reqq_add(int amtype, ptl_t *ptl, psm2_epaddr_t epaddr,
+		      psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		      void *src, size_t len, void *dest, int flags);
+
+/*
+ * Shared memory Active Messages, implementation derived from
+ * Lumetta, Mainwaring, Culler.  Multi-Protocol Active Messages on a Cluster of
+ * SMP's. Supercomputing 1997.
+ *
+ * We support multiple endpoints in shared memory, but we only support one
+ * shared memory context with up to AMSH_MAX_LOCAL_PROCS local endpoints. Some
+ * structures are endpoint specific (as denoted with amsh_ep_) and others are
+ * specific to the single shared memory context (amsh_ global variables).
+ *
+ * Each endpoint maintains a shared request block and a shared reply block.
+ * Each block is composed of queues for small, medium and large messages.
+ */
+
+#define QFREE      0
+#define QUSED      1
+#define QREADY     2
+#define QREADYMED  3
+#define QREADYLONG 4
+
+#define QISEMPTY(flag) (flag < QREADY)
+#if defined(__x86_64__) || defined(__i386__)
+#  define _QMARK_FLAG_FENCE()  asm volatile("" : : : "memory")	/* compilerfence */
+#else
+#  error No _QMARK_FLAG_FENCE() defined for this platform
+#endif
+
+#define _QMARK_FLAG(pkt_ptr, _flag)		\
+	do {					\
+		_QMARK_FLAG_FENCE();		\
+		(pkt_ptr)->flag = (_flag);	\
+	} while (0)
+
+#define QMARKFREE(pkt_ptr)  _QMARK_FLAG(pkt_ptr, QFREE)
+#define QMARKREADY(pkt_ptr) _QMARK_FLAG(pkt_ptr, QREADY)
+#define QMARKUSED(pkt_ptr)  _QMARK_FLAG(pkt_ptr, QUSED)
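+
+/* Ordering sketch (editor's note): a producer fills in the packet body
+ * first and flips the flag last; the compiler fence keeps the stores in
+ * program order:
+ *
+ *   pkt->nargs = nargs;   // payload and header fields ...
+ *   QMARKREADY(pkt);      // ... become visible before the flag does
+ *
+ * A consumer polls with QISEMPTY(pkt->flag) and calls QMARKFREE(pkt) when
+ * done to return the slot. On x86 the compiler fence suffices because the
+ * architecture does not reorder stores to write-back memory.
+ */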
+
+#define AMFMT_SYSTEM       1
+#define AMFMT_SHORT_INLINE 2
+#define AMFMT_SHORT        3
+#define AMFMT_LONG         4
+#define AMFMT_LONG_END     5
+
+#define AMSH_CMASK_NONE    0
+#define AMSH_CMASK_PREREQ  1
+#define AMSH_CMASK_POSTREQ 2
+#define AMSH_CMASK_DONE    3
+
+#define AMSH_CSTATE_OUTGOING_NONE 		1
+#define AMSH_CSTATE_OUTGOING_REPLIED 		2
+#define AMSH_CSTATE_OUTGOING_ESTABLISHED 	3
+#define AMSH_CSTATE_OUTGOING_DISC_REPLIED 	4
+#define AMSH_CSTATE_OUTGOING_DISC_REQUESTED 	5
+
+#define AMSH_CSTATE_INCOMING_NONE 		1
+#define AMSH_CSTATE_INCOMING_DISC_REQUESTED 	4
+#define AMSH_CSTATE_INCOMING_ESTABLISHED 	5
+
+#define AMSH_PID_UNKNOWN			0
+
+/**********************************
+ * Shared memory packet formats
+ **********************************/
+typedef
+struct am_pkt_short {
+	uint32_t flag;	      /**> Packet state */
+	union {
+		uint32_t bulkidx; /**> index in bulk packet queue */
+		uint32_t length;  /**> length when no bulkidx used */
+	};
+	uint16_t shmidx;      /**> index in shared segment */
+	uint16_t type;
+	uint16_t nargs;
+	uint16_t handleridx;
+
+	psm2_amarg_t args[NSHORT_ARGS];	/* AM arguments */
+
+	/* We will eventually expose up to 8 arguments, but this isn't implemented
+	 * for now.  >6 args will probably require a medium instead of a short */
+} __attribute__ ((aligned(64)))
+am_pkt_short_t;
+PSMI_STRICT_SIZE_DECL(am_pkt_short_t, 64);
+
+typedef struct am_pkt_bulk {
+	uint32_t flag;
+	uint32_t idx;
+	uintptr_t dest;		/* Destination pointer in "longs" */
+	uint32_t dest_off;	/* Destination pointer offset */
+	uint32_t len;		/* Destination length within offset */
+	psm2_amarg_t args[NBULK_ARGS];	/* Additional "spillover" for >6 args */
+	uint8_t payload[0];
+} am_pkt_bulk_t;
+/* No strict size decl, used for mediums and longs */
+
+/****************************************************
+ * Shared memory header and block control structures
+ ***************************************************/
+
+/* Each pkt queue has the same header format, although the queue
+ * consumers don't use the 'head' index in the same manner. */
+typedef struct am_ctl_qhdr {
+	uint32_t head;		/* Touched only by 1 consumer */
+	uint8_t _pad0[64 - 4];
+
+	pthread_spinlock_t lock;
+	uint32_t tail;		/* XXX candidate for fetch-and-incr */
+	uint32_t elem_cnt;
+	uint32_t elem_sz;
+	uint8_t _pad1[64 - 3 * 4 - sizeof(pthread_spinlock_t)];
+} am_ctl_qhdr_t;
+PSMI_STRICT_SIZE_DECL(am_ctl_qhdr_t, 128);
+
+/* Each process has a reply qhdr and a request qhdr */
+typedef struct am_ctl_blockhdr {
+	volatile am_ctl_qhdr_t shortq;
+	volatile am_ctl_qhdr_t longbulkq;
+} am_ctl_blockhdr_t;
+PSMI_STRICT_SIZE_DECL(am_ctl_blockhdr_t, 128 * 2);
+
+/* We cache the "shorts" because that's what we poll on in the critical path.
+ * We take care to always update these pointers whenever the segment is remapped.
+ */
+typedef struct am_ctl_qshort_cache {
+	volatile am_pkt_short_t *base;
+	volatile am_pkt_short_t *head;
+	volatile am_pkt_short_t *end;
+} am_ctl_qshort_cache_t;
+
+/******************************************
+ * Shared segment local directory (global)
+ ******************************************
+ *
+ * Each process keeps a directory for where request and reply structures are
+ * located at its peers.  This directory must be re-initialized every time the
+ * shared segment moves in the VM, and the segment moves every time we remap()
+ * for additional memory.
+ */
+struct amsh_qdirectory {
+	am_ctl_blockhdr_t *qreqH;
+	am_pkt_short_t *qreqFifoShort;
+	am_pkt_bulk_t *qreqFifoLong;
+
+	am_ctl_blockhdr_t *qrepH;
+	am_pkt_short_t *qrepFifoShort;
+	am_pkt_bulk_t *qrepFifoLong;
+} __attribute__ ((aligned(64)));
+
+/******************************************
+ * Shared fifo element counts and sizes
+ ******************************************
+ * These values are context-wide; they can only be set early on and can't be
+ * modified at runtime.  All endpoints are expected to use the same values.
+ */
+typedef
+struct amsh_qinfo {
+	int qreqFifoShort;
+	int qreqFifoLong;
+
+	int qrepFifoShort;
+	int qrepFifoLong;
+} amsh_qinfo_t;
+
+/******************************************
+ * Per-endpoint structures (ep-local)
+ ******************************************
+ * Each endpoint keeps its own information as to where it resides in the
+ * directory, and maintains its own cached copies of where the short header
+ * resides in shared memory.
+ *
+ * This structure is carefully arranged to optimize cache locality and
+ * performance.  Do not modify without careful and thorough analysis.
+ */
+struct am_ctl_nodeinfo {
+	uint16_t psm_verno;
+	volatile uint16_t is_init;
+	volatile pid_t pid;
+	psm2_epid_t epid;
+	psm2_epaddr_t epaddr;
+	uintptr_t amsh_shmbase;
+	amsh_qinfo_t amsh_qsizes;
+	uint32_t amsh_features;
+	struct amsh_qdirectory qdir;
+} __attribute__((aligned(64)));
+
+struct ptl_am {
+	psm2_ep_t ep;
+	psm2_epid_t epid;
+	psm2_epaddr_t epaddr;
+	ptl_ctl_t *ctl;
+
+	int connect_phase;
+	int connect_outgoing;
+	int connect_incoming;
+
+	int zero_polls;
+	int amsh_only_polls;
+	int max_ep_idx, am_ep_size;
+	int psmi_kassist_mode;
+	char *amsh_keyname;
+
+	/* These three items carefully picked to fit in one cache line. */
+	am_ctl_qshort_cache_t reqH;
+	am_ctl_qshort_cache_t repH;
+	struct am_reqq_fifo_t psmi_am_reqq_fifo;
+
+	am_pkt_short_t amsh_empty_shortpkt;
+
+	struct am_ctl_nodeinfo *self_nodeinfo;
+	struct am_ctl_nodeinfo *am_ep;
+} __attribute__((aligned(64)));
+
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_am/ptl.c b/deps/libfabric/prov/psm3/psm3/ptl_am/ptl.c
new file mode 100644
index 0000000000000000000000000000000000000000..f3ee1c3c594b1965a70dc6736ab11ee988dbad19
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_am/ptl.c
@@ -0,0 +1,398 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+#include "cmarw.h"
+
+#ifdef PSM_CUDA
+#include "am_cuda_memhandle_cache.h"
+#endif
+
+/* not reported yet, so just track in a global so we can pass a pointer to
+ * psmi_mq_handle_envelope and psmi_mq_handle_rts
+ */
+static struct ptl_strategy_stats strat_stats;
+
+/**
+ * Callback function when a receive request is matched with the
+ * tag obtained from the RTS packet.
+ */
+static
+psm2_error_t
+ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted,
+			    amsh_am_token_t *tok)
+{
+	psm2_amarg_t args[5];
+	psm2_epaddr_t epaddr = req->rts_peer;
+	struct ptl_am *ptl = (struct ptl_am *)(epaddr->ptlctl->ptl);
+	int cma_succeed = 0;
+	int pid = 0, cuda_ipc_send_completion = 0;
+
+	PSM2_LOG_MSG("entering.");
+	psmi_assert((tok != NULL && was_posted)
+		    || (tok == NULL && !was_posted));
+
+	_HFI_VDBG("[shm][rndv][recv] req=%p dest=%p len=%d tok=%p\n",
+		  req, req->req_data.buf, req->req_data.recv_msglen, tok);
+#ifdef PSM_CUDA
+	if (req->cuda_ipc_handle_attached) {
+
+		CUdeviceptr cuda_ipc_dev_ptr = am_cuda_memhandle_acquire(req->rts_sbuf - req->cuda_ipc_offset,
+						  (CUipcMemHandle*)&req->cuda_ipc_handle,
+								 req->req_data.recv_msglen,
+								 req->rts_peer->epid);
+		cuda_ipc_dev_ptr = cuda_ipc_dev_ptr + req->cuda_ipc_offset;
+		/* cuMemcpy into the receive side buffer
+		 * based on its location */
+		if (req->is_buf_gpu_mem) {
+			PSMI_CUDA_CALL(cuMemcpyDtoD, (CUdeviceptr)req->req_data.buf, cuda_ipc_dev_ptr,
+				       req->req_data.recv_msglen);
+			PSMI_CUDA_CALL(cuStreamSynchronize, 0);
+		} else
+			PSMI_CUDA_CALL(cuMemcpyDtoH, req->req_data.buf, cuda_ipc_dev_ptr,
+				       req->req_data.recv_msglen);
+		cuda_ipc_send_completion = 1;
+		am_cuda_memhandle_release(cuda_ipc_dev_ptr - req->cuda_ipc_offset);
+		req->cuda_ipc_handle_attached = 0;
+		goto send_cts;
+	}
+#endif
+
+	if ((ptl->psmi_kassist_mode & PSMI_KASSIST_GET)
+	    && req->req_data.recv_msglen > 0
+	    && (pid = psmi_epaddr_pid(epaddr))) {
+#ifdef PSM_CUDA
+		/* If the buffer on the send side is on the host,
+		 * we alloc a bounce buffer, use kassist and then
+		 * do a cuMemcpy if the buffer on the recv side
+		 * resides on the GPU
+		 */
+		if (req->is_buf_gpu_mem) {
+			void* cuda_ipc_bounce_buf = psmi_malloc(PSMI_EP_NONE, UNDEFINED, req->req_data.recv_msglen);
+			size_t nbytes = cma_get(pid, (void *)req->rts_sbuf,
+					cuda_ipc_bounce_buf, req->req_data.recv_msglen);
+			psmi_assert_always(nbytes == req->req_data.recv_msglen);
+			PSMI_CUDA_CALL(cuMemcpyHtoD, (CUdeviceptr)req->req_data.buf, cuda_ipc_bounce_buf,
+				       req->req_data.recv_msglen);
+			/* Recent CUDA library optimizations no longer
+			 * guarantee synchronous behavior for host-to-device
+			 * copies of messages smaller than 64k. The stream
+			 * synchronize call is there to guarantee completion.
+			 */
+			PSMI_CUDA_CALL(cuStreamSynchronize, 0);
+			psmi_free(cuda_ipc_bounce_buf);
+		} else {
+			/* cma can be done in handler context or not. */
+			size_t nbytes = cma_get(pid, (void *)req->rts_sbuf,
+						req->req_data.buf, req->req_data.recv_msglen);
+			psmi_assert_always(nbytes == req->req_data.recv_msglen);
+		}
+#else
+		/* cma can be done in handler context or not. */
+		int64_t nbytes = cma_get(pid, (void *)req->rts_sbuf,
+					 req->req_data.buf, req->req_data.recv_msglen);
+		if (nbytes == -1) {
+			ptl->psmi_kassist_mode = PSMI_KASSIST_OFF;
+			_HFI_ERROR("Reading from remote process' memory failed. Disabling CMA support\n");
+		}
+		else {
+			psmi_assert_always(nbytes == req->req_data.recv_msglen);
+			cma_succeed = 1;
+		}
+#endif
+	}
+
+#ifdef PSM_CUDA
+send_cts:
+#endif
+	args[0].u64w0 = (uint64_t) (uintptr_t) req->ptl_req_ptr;
+	args[1].u64w0 = (uint64_t) (uintptr_t) req;
+	args[2].u64w0 = (uint64_t) (uintptr_t) req->req_data.buf;
+	args[3].u32w0 = req->req_data.recv_msglen;
+	args[3].u32w1 = tok != NULL ? 1 : 0;
+	args[4].u32w0 = ptl->psmi_kassist_mode;		// pass current kassist mode to the peer process
+
+	if (tok != NULL) {
+		psmi_am_reqq_add(AMREQUEST_SHORT, tok->ptl,
+				 tok->tok.epaddr_incoming, mq_handler_rtsmatch_hidx,
+				 args, 5, NULL, 0, NULL, 0);
+	} else
+		psmi_amsh_short_request((struct ptl *)ptl, epaddr, mq_handler_rtsmatch_hidx,
+					args, 5, NULL, 0, 0);
+
+	req->mq->stats.rx_user_num++;
+	req->mq->stats.rx_user_bytes += req->req_data.recv_msglen;
+	req->mq->stats.rx_shm_num++;
+	req->mq->stats.rx_shm_bytes += req->req_data.recv_msglen;
+
+	/* 0-byte completion or we used kassist */
+	if (pid || cma_succeed ||
+		req->req_data.recv_msglen == 0 || cuda_ipc_send_completion == 1) {
+		psmi_mq_handle_rts_complete(req);
+	}
+	PSM2_LOG_MSG("leaving.");
+	return PSM2_OK;
+}
+
+static
+psm2_error_t
+ptl_handle_rtsmatch(psm2_mq_req_t req, int was_posted)
+{
+	/* was_posted == 0 allows us to assume that we're not running this callback
+	 * within am handler context (i.e. we can poll) */
+	psmi_assert(was_posted == 0);
+	return ptl_handle_rtsmatch_request(req, 0, NULL);
+}
+
+void
+psmi_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf,
+		   size_t len)
+{
+	amsh_am_token_t *tok = (amsh_am_token_t *) toki;
+	psm2_mq_req_t req;
+	psm2_mq_tag_t tag;
+	int rc;
+	uint32_t opcode = args[0].u32w0;
+	uint32_t msglen = opcode <= MQ_MSG_SHORT ? len : args[0].u32w1;
+
+	tag.tag[0] = args[1].u32w1;
+	tag.tag[1] = args[1].u32w0;
+	tag.tag[2] = args[2].u32w1;
+	psmi_assert(toki != NULL);
+	_HFI_VDBG("mq=%p opcode=%x, len=%d, msglen=%d\n",
+		  tok->mq, opcode, (int)len, msglen);
+
+	switch (opcode) {
+	case MQ_MSG_TINY:
+	case MQ_MSG_SHORT:
+	case MQ_MSG_EAGER:
+		rc = psmi_mq_handle_envelope(tok->mq, tok->tok.epaddr_incoming,
+					     &tag, &strat_stats, msglen, 0, buf,
+					     (uint32_t) len, 1, opcode, &req);
+
+		/* for eager matching */
+		req->ptl_req_ptr = (void *)tok->tok.epaddr_incoming;
+		req->msg_seqnum = 0;	/* using seqnum 0 */
+		req->mq->stats.rx_shm_num++;
+		// close enough, may not yet be matched,
+		//  don't know recv buf_len, so assume no truncation
+		req->mq->stats.rx_shm_bytes += msglen;
+		break;
+	default:{
+			void *sreq = (void *)(uintptr_t) args[3].u64w0;
+			uintptr_t sbuf = (uintptr_t) args[4].u64w0;
+			psmi_assert(narg == 5);
+			psmi_assert_always(opcode == MQ_MSG_LONGRTS);
+			rc = psmi_mq_handle_rts(tok->mq, tok->tok.epaddr_incoming,
+						&tag, &strat_stats, msglen, NULL, 0, 1,
+						ptl_handle_rtsmatch, &req);
+
+			req->rts_peer = tok->tok.epaddr_incoming;
+			req->ptl_req_ptr = sreq;
+			req->rts_sbuf = sbuf;
+#ifdef PSM_CUDA
+			/* A payload in the RTS means an IPC handle has been
+			 * sent, which in turn means the sender must be
+			 * sending from a GPU buffer.
+			 */
+			if (buf && len > 0) {
+				req->cuda_ipc_handle = *((CUipcMemHandle*)buf);
+				req->cuda_ipc_handle_attached = 1;
+				req->cuda_ipc_offset = args[2].u32w0;
+			}
+#endif
+
+			if (rc == MQ_RET_MATCH_OK)	/* we are in handler context, issue a reply */
+				ptl_handle_rtsmatch_request(req, 1, tok);
+			/* else will be called later */
+			break;
+		}
+	}
+	return;
+}
+
+void
+psmi_am_mq_handler_data(void *toki, psm2_amarg_t *args, int narg, void *buf,
+			size_t len)
+{
+	amsh_am_token_t *tok = (amsh_am_token_t *) toki;
+
+	psmi_assert(toki != NULL);
+
+	psm2_epaddr_t epaddr = (psm2_epaddr_t) tok->tok.epaddr_incoming;
+	psm2_mq_req_t req = mq_eager_match(tok->mq, epaddr, 0);	/* using seqnum 0 */
+	psmi_assert_always(req != NULL);
+#ifdef PSM_CUDA
+	psmi_mq_handle_data(tok->mq, req, args[2].u32w0, buf, len, 0, NULL);
+#else
+	psmi_mq_handle_data(tok->mq, req, args[2].u32w0, buf, len);
+#endif
+
+	return;
+}
+
+/**
+ * Function to handle CTS on the sender.
+ */
+void
+psmi_am_mq_handler_rtsmatch(void *toki, psm2_amarg_t *args, int narg, void *buf,
+			    size_t len)
+{
+	amsh_am_token_t *tok = (amsh_am_token_t *) toki;
+
+	psmi_assert(toki != NULL);
+
+	ptl_t *ptl = tok->ptl;
+	psm2_mq_req_t sreq = (psm2_mq_req_t) (uintptr_t) args[0].u64w0;
+#ifdef PSM_CUDA
+	/* If send side req has a cuda ipc handle attached then we can
+	 * assume the data has been copied as soon as we get a CTS
+	 */
+	if (sreq->cuda_ipc_handle_attached) {
+		sreq->cuda_ipc_handle_attached = 0;
+		sreq->mq->stats.tx_shm_bytes += sreq->req_data.send_msglen;
+		sreq->mq->stats.tx_rndv_bytes += sreq->req_data.send_msglen;
+		psmi_mq_handle_rts_complete(sreq);
+		return;
+	}
+#endif
+	void *dest = (void *)(uintptr_t) args[2].u64w0;
+	uint32_t msglen = args[3].u32w0;
+	psm2_amarg_t rarg[1];
+
+	_HFI_VDBG("[rndv][send] req=%p dest_req=%p src=%p dest=%p len=%d\n",
+		  sreq, (void *)(uintptr_t) args[1].u64w0, sreq->req_data.buf, dest,
+		  msglen);
+
+	if (msglen > 0) {
+		rarg[0].u64w0 = args[1].u64w0;	/* rreq */
+		int kassist_mode = ((struct ptl_am *)ptl)->psmi_kassist_mode;
+		int kassist_mode_peer = args[4].u32w0;
+		// In general, peer processes should have the same kassist mode set,
+		// but because CMA failure is detected dynamically, we must align local
+		// and remote state and let the protocol adapt to that change.
+		if (kassist_mode_peer == PSMI_KASSIST_OFF && (kassist_mode & PSMI_KASSIST_MASK)) {
+			((struct ptl_am *)ptl)->psmi_kassist_mode = PSMI_KASSIST_OFF;
+			goto no_kassist;
+		}
+
+		if (kassist_mode & PSMI_KASSIST_PUT) {
+			int pid = psmi_epaddr_pid(tok->tok.epaddr_incoming);
+			int64_t nbytes = cma_put(sreq->req_data.buf, pid, dest, msglen);
+			if (nbytes == -1) {
+				_HFI_ERROR("Writing to remote process' memory failed. Disabling CMA support\n");
+				((struct ptl_am *)ptl)->psmi_kassist_mode = PSMI_KASSIST_OFF;
+				goto no_kassist;
+			}
+
+			psmi_assert_always(nbytes == msglen);
+
+			/* Send response that PUT is complete */
+			psmi_amsh_short_reply(tok, mq_handler_rtsdone_hidx,
+					      rarg, 1, NULL, 0, 0);
+		} else if (!(kassist_mode & PSMI_KASSIST_MASK)) {
+			/* Only transfer if kassist is off, i.e. neither GET nor PUT. */
+no_kassist:
+			psmi_amsh_long_reply(tok, mq_handler_rtsdone_hidx, rarg,
+					     1, sreq->req_data.buf, msglen, dest, 0);
+		}
+	}
+	sreq->mq->stats.tx_shm_bytes += sreq->req_data.send_msglen;
+	sreq->mq->stats.tx_rndv_bytes += sreq->req_data.send_msglen;
+	psmi_mq_handle_rts_complete(sreq);
+}
+
+void
+psmi_am_mq_handler_rtsdone(void *toki, psm2_amarg_t *args, int narg, void *buf,
+			   size_t len)
+{
+	psm2_mq_req_t rreq = (psm2_mq_req_t) (uintptr_t) args[0].u64w0;
+	psmi_assert(narg == 1);
+	_HFI_VDBG("[rndv][recv] req=%p dest=%p len=%d\n", rreq, rreq->req_data.buf,
+		  rreq->req_data.recv_msglen);
+	psmi_mq_handle_rts_complete(rreq);
+}
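+
+/* Rendezvous flow across the handlers above (editor's summary):
+ *
+ *   1. the sender's RTS (MQ_MSG_LONGRTS) arrives in psmi_am_mq_handler;
+ *   2. on match, ptl_handle_rtsmatch_request sends a CTS back (the
+ *      receiver may instead pull the data itself via cma_get);
+ *   3. the sender's psmi_am_mq_handler_rtsmatch moves the data with
+ *      cma_put() or a long reply;
+ *   4. psmi_am_mq_handler_rtsdone completes the receive request.
+ */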
+
+void
+psmi_am_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, size_t len)
+{
+	amsh_am_token_t *tok = (amsh_am_token_t *) toki;
+	struct psm2_ep_am_handle_entry *hentry;
+
+	psmi_assert(toki != NULL);
+
+	hentry = psm_am_get_handler_function(tok->mq->ep,
+					  (psm2_handler_t) args[0].u32w0);
+
+	/* Note a guard here for hentry != NULL is not needed because at
+	 * initialization, a psmi_assert_always() assure the entry will be
+	 * non-NULL. */
+
+	/* Invoke handler function. For AM we do not support break functionality */
+	if (likely(hentry->version == PSM2_AM_HANDLER_V2)) {
+		psm2_am_handler_2_fn_t hfn2 =
+				(psm2_am_handler_2_fn_t)hentry->hfn;
+		hfn2(toki, args + 1, narg - 1, buf, len, hentry->hctx);
+	} else {
+		psm2_am_handler_fn_t hfn1 =
+				(psm2_am_handler_fn_t)hentry->hfn;
+		hfn1(toki, args + 1, narg - 1, buf, len);
+	}
+
+	return;
+}
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_am/ptl_fwd.h b/deps/libfabric/prov/psm3/psm3/ptl_am/ptl_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d0fec4073ab0e5092fa06dbff9c537cb1ef0240
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_am/ptl_fwd.h
@@ -0,0 +1,64 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#ifndef _PTL_FWD_AMSH_H
+#define _PTL_FWD_AMSH_H
+
+/* Symbol in am ptl */
+extern struct ptl_ctl_init psmi_ptl_amsh;
+
+extern int psmi_shm_mq_rv_thresh;
+
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_config.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..0017db3dd7ab8a21e1ac8f11f45617782c03ae24
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_config.h
@@ -0,0 +1,128 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2018 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2018 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef PTL_IPS_IPS_CONFIG_H
+#define PTL_IPS_IPS_CONFIG_H
+
+#include "psm_config.h"
+
+/* Allocate new epaddrs in chunks of 128 */
+#define PTL_EPADDR_ALLOC_CHUNK  128
+
+/* Generate an expected header every 16 packets */
+#define PSM_DEFAULT_EXPECTED_HEADER 16
+
+#define DF_OPP_LIBRARY "libopasadb.so.1.0.0"
+#define DATA_VFABRIC_OFFSET 8
+
+/* Send retransmission */
+#define IPS_PROTO_SPIO_RETRY_US_DEFAULT	2	/* in uS */
+
+#define IPS_PROTO_ERRCHK_MS_MIN_DEFAULT	160	/* in millisecs */
+#define IPS_PROTO_ERRCHK_MS_MAX_DEFAULT	640	/* in millisecs */
+#define IPS_PROTO_ERRCHK_FACTOR_DEFAULT 2
+#define PSM_TID_TIMEOUT_DEFAULT "160:640:2"	/* update from above params */
+
+#ifdef PSM_FI
+
+/* Fault injection, becomes parameters to psmi_faultinj_getspec so
+ * a comma-delimited list of
+ *   "spec_name", num, denom
+ * Where num/denom means fault num out of every denom.
+ * The defines set 'denom' and assume that num is set to 1.
+ *
+ * These values are all defaults, each is overridable via
+ * PSM3_FI_<spec_name> in the environment (and yes, spec_name is in lowercase
+ * *in the environment* just to minimize it appearing in the wild).  The format
+ * there is <num:denom:initial_seed> so the same thing except that one can set
+ * a specific seed to the random number generator.
+ */
+#define IPS_FAULTINJ_RECVLOST	5000	/* 1 every X pkts dropped at recv */
+#define IPS_FAULTINJ_SENDLOST	5000	/* 1 every X pkts dropped at send */
+#define IPS_FAULTINJ_RQ_LKEY	5000	/* 0 every X RQ WQE bad lkey */
+#define IPS_FAULTINJ_SQ_LKEY	5000	/* 0 every X SQ WQE bad lkey */
+#define IPS_FAULTINJ_RC_RDMA_LKEY 5000	/* 0 every X RC SQ RDMA bad lkey */
+#define IPS_FAULTINJ_RC_RDMA_RKEY 5000	/* 0 every X RC SQ RDMA bad rkey */
+#define IPS_FAULTINJ_RV_RDMA_LEN 5000	/* 0 every X RV SQ RDMA bad len */
+#define IPS_FAULTINJ_RV_RDMA_RKEY 5000	/* 1 every X RV SQ RDMA bad rkey */
+#define IPS_FAULTINJ_REG_MR	100	/* 1 every X reg_mr ENOMEM */
+#define IPS_FAULTINJ_NONPRI_REG_MR 50	/* 1 every X non-pri reg_mr ENOMEM */
+#define IPS_FAULTINJ_PRI_REG_MR	1000	/* 1 every X pri reg_mr ENOMEM */
+#ifdef PSM_CUDA
+#define IPS_FAULTINJ_GDRMMAP	100	/* 1 every X GPU pin and mmap ENOMEM */
+#define IPS_FAULTINJ_GPU_REG_MR	100	/* 1 every X GPU reg_mr */
+#endif
+
+#endif /* #ifdef PSM_FI */
+
+
+
+/* rcv thread */
+/* All in milliseconds */
+#define RCVTHREAD_TO_MIN_FREQ	    10	/* min of 10 polls per sec */
+#define RCVTHREAD_TO_MAX_FREQ	    100	/* max of 100 polls per sec */
+#define RCVTHREAD_TO_SHIFT	    1
+
+/* ptl.c */
+#define PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS	250
+
+/* ips_proto_recv.c */
+#define PSM_STRAY_WARN_INTERVAL_DEFAULT_SECS	30
+
+/*
+ * Easy switch to (say) _HFI_INFO if debugging in the expected protocol is
+ * needed
+ */
+#define _HFI_EXP _HFI_VDBG
+
+#endif /* PTL_IPS_IPS_CONFIG_H */
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_crc32.c b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_crc32.c
new file mode 100644
index 0000000000000000000000000000000000000000..589f3278d3b3a31b798dc1f44aca5d207f52b019
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_crc32.c
@@ -0,0 +1,93 @@
+/* The code in this file was derived from crc32.c in zlib 1.2.3, and
+   modified from its original form to suit our requirements. The zlib
+   license and crc32.c copyright and credits are preserved below. */
+
+/* zlib.h -- interface of the 'zlib' general purpose compression library
+  version 1.2.3, July 18th, 2005
+
+  Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  Jean-loup Gailly        Mark Adler
+  jloup@gzip.org          madler@alumni.caltech.edu
+
+  The data format used by the zlib library is described by RFCs (Request for
+  Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt
+  (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format).
+*/
+
+/* crc32.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
+ * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing
+ * tables for updating the shift register in one step with three exclusive-ors
+ * instead of four steps with four exclusive-ors.  This results in about a
+ * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
+ */
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/* Table of CRCs of all 8-bit messages. */
+static uint32_t crc_table[256];
+
+/* Flag: has the table been computed? Initially false. */
+static int crc_table_computed;
+
+/* Make the table for a fast CRC. */
+static void make_crc_table(void)
+{
+	uint32_t c;
+	int n, k;
+
+	for (n = 0; n < 256; n++) {
+		c = (uint32_t) n;
+		for (k = 0; k < 8; k++) {
+			if (c & 1)
+				c = 0xedb88320 ^ (c >> 1);
+			else
+				c = c >> 1;
+		}
+		crc_table[n] = c;
+	}
+	crc_table_computed = 1;
+}
+
+/* Update a running CRC with the bytes buf[0..len-1].  The CRC
+ * should be initialized to all 1's, and the transmitted value
+ * is the 1's complement of the final running CRC (the convention
+ * of zlib's crc() routine, from which this code derives).
+ */
+
+uint32_t ips_crc_calculate(uint32_t len, uint8_t *data, uint32_t crc)
+{
+	uint32_t c = crc;
+	uint32_t n;
+
+	if (!crc_table_computed) {
+		make_crc_table();
+	}
+	for (n = 0; n < len; n++) {
+		c = crc_table[(c ^ data[n]) & 0xff] ^ (c >> 8);
+	}
+	return c;
+}
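+
+/* Usage sketch (illustrative only; follows the zlib convention described
+ * above, which callers are free to deviate from):
+ *
+ *   uint32_t crc = ~ips_crc_calculate(len, data, ~(uint32_t)0);
+ */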
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_epstate.c b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_epstate.c
new file mode 100644
index 0000000000000000000000000000000000000000..12b80cfe9ecaa04b3d8842eef199814bc6f9699b
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_epstate.c
@@ -0,0 +1,174 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "ips_epstate.h"
+
+/* The indexes are used to map a particular endpoint to a structure at the
+ * receiver.  Although we take extra care to validate the identity of endpoints
+ * when packets are received, the communication index is at an offset selected
+ * by the endpoint that allocates the index.  This narrows the window in
+ * which two jobs communicating with the same set of indexes could suffer
+ * crosstalk.
+ */
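+/* The mapping used throughout this file (see ips_epstate_add/del and
+ * ips_epstate_lookup):
+ *
+ *   connidx   = (table_idx - eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX - 1)
+ *   table_idx = (connidx + eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX - 1)
+ */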
+
+psm2_error_t
+ips_epstate_init(struct ips_epstate *eps, const psmi_context_t *context)
+{
+	memset(eps, 0, sizeof(*eps));
+	eps->context = context;
+	eps->eps_base_idx = ((ips_epstate_idx)get_cycles()) &
+				(IPS_EPSTATE_CONNIDX_MAX-1);
+	return PSM2_OK;
+}
+
+psm2_error_t ips_epstate_fini(struct ips_epstate *eps)
+{
+	if (eps->eps_tab)
+		psmi_free(eps->eps_tab);
+	memset(eps, 0, sizeof(*eps));
+	return PSM2_OK;
+}
+
+/*
+ * Use this to debug issues involving the epstate table.
+ */
+void ips_epstate_dump(struct ips_epstate *eps)
+{
+	if (_HFI_DBG_ON) {
+		int i=0; 
+		_HFI_DBG_ALWAYS("eps_base_idx = 0x%x, eps_tabsize = %d, "
+			"eps_tabsizeused = %d, eps_tab_nextidx = %d\n",
+			eps->eps_base_idx, eps->eps_tabsize,
+			eps->eps_tabsizeused, eps->eps_tab_nextidx);
+		for (i=0; i<eps->eps_tabsize; i++) {
+			_HFI_DBG_ALWAYS("%03d: ipsaddr = %p, cstate-o: %u, cstate-i: %u\n", i,
+				eps->eps_tab[i].ipsaddr,
+				eps->eps_tab[i].ipsaddr->cstate_outgoing,
+				eps->eps_tab[i].ipsaddr->cstate_incoming);
+		}
+	}
+}
+
+/*
+ * Add ipsaddr with epid to the epstate table, return new index to caller in
+ * 'connidx'.
+ */
+psm2_error_t
+ips_epstate_add(struct ips_epstate *eps, struct ips_epaddr *ipsaddr,
+		ips_epstate_idx *connidx_o)
+{
+	int i, j;
+	ips_epstate_idx connidx;
+
+	if (++eps->eps_tabsizeused > eps->eps_tabsize) {	/* realloc */
+		struct ips_epstate_entry *newtab;
+		eps->eps_tabsize += PTL_EPADDR_ALLOC_CHUNK;
+		newtab = (struct ips_epstate_entry *)
+		    psmi_calloc(eps->context->ep, PER_PEER_ENDPOINT,
+				eps->eps_tabsize,
+				sizeof(struct ips_epstate_entry));
+		if (newtab == NULL)
+			return PSM2_NO_MEMORY;
+		else if (eps->eps_tab) {	/* NOT first alloc */
+			for (i = 0;
+			     i < eps->eps_tabsize - PTL_EPADDR_ALLOC_CHUNK; i++)
+				newtab[i] = eps->eps_tab[i];	/* deep copy */
+			psmi_free(eps->eps_tab);
+		}
+		eps->eps_tab = newtab;
+	}
+	/* Find the next free hole.  We can afford to do this since connect is not
+	 * in the critical path */
+	for (i = 0, j = eps->eps_tab_nextidx; i < eps->eps_tabsize; i++, j++) {
+		if (j == eps->eps_tabsize)
+			j = 0;
+		if (eps->eps_tab[j].ipsaddr == NULL) {
+			eps->eps_tab_nextidx = j + 1;
+			if (eps->eps_tab_nextidx == eps->eps_tabsize)
+				eps->eps_tab_nextidx = 0;
+			break;
+		}
+	}
+	psmi_assert_always(i != eps->eps_tabsize);
+	connidx = (j - eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1);
+	_HFI_VDBG("node %s gets connidx=%d (table idx %d)\n",
+		  psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), connidx,
+		  j);
+	eps->eps_tab[j].ipsaddr = ipsaddr;
+	if (j >= IPS_EPSTATE_CONNIDX_MAX) {
+		return psmi_handle_error(eps->context->ep,
+					 PSM2_TOO_MANY_ENDPOINTS,
+					 "Can't connect to more than %d non-local endpoints",
+					 IPS_EPSTATE_CONNIDX_MAX);
+	}
+	*connidx_o = connidx;
+	return PSM2_OK;
+}
+
+psm2_error_t ips_epstate_del(struct ips_epstate *eps, ips_epstate_idx connidx)
+{
+	ips_epstate_idx idx;
+	/* actual table index */
+	idx = (connidx + eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1);
+	psmi_assert_always(idx < eps->eps_tabsize);
+	_HFI_VDBG("connidx=%d, table_idx=%d\n", connidx, idx);
+	eps->eps_tab[idx].ipsaddr = NULL;
+	/* We may eventually want to release memory, but probably not */
+	eps->eps_tabsizeused--;
+	return PSM2_OK;
+}
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_epstate.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_epstate.h
new file mode 100644
index 0000000000000000000000000000000000000000..b63c2ce9f4cb09c13235cbaa97809da3d4d6b1c1
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_epstate.h
@@ -0,0 +1,103 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_EPSTATE_H
+#define _IPS_EPSTATE_H
+
+#include "psm_user.h"
+
+typedef uint32_t ips_epstate_idx;
+#define IPS_EPSTATE_CONNIDX_MAX (1<<26)
+
+struct ips_epaddr;
+
+struct ips_epstate_entry {
+	struct ips_epaddr *ipsaddr;
+};
+
+struct ips_epstate {
+	const psmi_context_t *context;
+	ips_epstate_idx eps_base_idx;
+	int eps_tabsize;
+	int eps_tabsizeused;
+	int eps_tab_nextidx;
+
+	struct ips_epstate_entry *eps_tab;
+};
+
+psm2_error_t ips_epstate_init(struct ips_epstate *eps,
+			     const psmi_context_t *context);
+psm2_error_t ips_epstate_fini(struct ips_epstate *eps);
+
+psm2_error_t ips_epstate_add(struct ips_epstate *eps,
+			    struct ips_epaddr *ipsaddr,
+			    ips_epstate_idx *connidx);
+psm2_error_t ips_epstate_del(struct ips_epstate *eps, ips_epstate_idx connidx);
+
+/* Use this to debug EP issues. */
+void ips_epstate_dump(struct ips_epstate *eps);
+
+PSMI_INLINE(
+struct ips_epstate_entry *
+ips_epstate_lookup(const struct ips_epstate *eps, ips_epstate_idx idx))
+{
+	idx = (idx + eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1);
+	if (idx < (ips_epstate_idx)eps->eps_tabsize)
+		return &eps->eps_tab[idx];
+	else
+		return NULL;
+}
+
+#endif /* _IPS_EPSTATE_H */
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_expected_proto.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_expected_proto.h
new file mode 100644
index 0000000000000000000000000000000000000000..d4dfb58abc8af746d18c0671849feace2942540b
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_expected_proto.h
@@ -0,0 +1,380 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+/*
+ * Control and state structure for one instance of the expected protocol.  The
+ * protocol depends on some upcalls from internal portions of the receive
+ * protocol (such as opcodes dedicated for expected protocol handling)
+ */
+
+/*
+ * Expected tid operations are carried out over "sessions".  One session is a
+ * collection of N tids where N is determined by the expected message window
+ * size (-W option or PSM3_MQ_RNDV_NIC_WINDOW).  Since naks can cause
+ * retransmissions, each session has a session index (_desc_idx) and a
+ * generation count (_desc_genc) to be able to identify if retransmitted
+ * packets reference the correct session.
+ *
+ * index and generation count are each 4 bytes encoded in one ptl_arg.  They
+ * could be compressed further but we have the header space, so we don't
+ * bother.
+ */
+
+#ifndef __IPS_EXPECTED_PROTO_H__
+
+#define __IPS_EXPECTED_PROTO_H__ 1
+
+#define _desc_idx   u32w0
+#define _desc_genc  u32w1
+
+/*
+ * For debug and/or other reasons, we can log the state of each tid and
+ * optionally associate it to a particular receive descriptor
+ */
+
+#define TIDSTATE_FREE	0
+#define TIDSTATE_USED	1
+
+struct ips_tidinfo {
+	uint32_t tid;
+	uint32_t state;
+	struct ips_tid_recv_desc *tidrecvc;
+};
+
+struct ips_protoexp {
+	const struct ptl *ptl;
+	struct ips_proto *proto;
+	struct psmi_timer_ctrl *timerq;
+	struct ips_tf tfc;
+
+	psm_transfer_type_t ctrl_xfer_type;
+	struct ips_scbctrl tid_scbc_rv;	// pool of SCBs for TID sends
+									// for OPA this includes: TIDEXP, CTS,
+									// EXPTID_COMPLETION
+									// For UD: CTS, ERR_CHK_RDMA,
+									// ERR_CHK_RDMA_RESP
+	mpool_t tid_desc_send_pool;
+	mpool_t tid_getreq_pool;
+	mpool_t tid_sreq_pool;	/* backptr into proto->ep->mq */
+	mpool_t tid_rreq_pool;	/* backptr into proto->ep->mq */
+	uint32_t tid_flags;
+
+	STAILQ_HEAD(ips_tid_send_pend,	/* pending exp. sends */
+		    ips_tid_send_desc) pend_sendq;
+	struct psmi_timer timer_send;
+
+	STAILQ_HEAD(ips_tid_get_pend, ips_tid_get_request) pend_getreqsq;	/* pending tid reqs */
+#ifdef RNDV_MOD
+	STAILQ_HEAD(ips_tid_err_resp_pend, ips_epaddr) pend_err_resp;	/* pending ERR CHK RDMA RESP */
+#endif
+	/* services pend_getreqsq and pend_err_chk_rdma_resp */
+	struct psmi_timer timer_getreqs;
+
+#ifdef PSM_CUDA
+	STAILQ_HEAD(ips_tid_get_cudapend, /* pending cuda transfers */
+		    ips_tid_get_request) cudapend_getreqsq;
+	struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_recv_cfg;
+	struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_small_recv_cfg;
+	mpool_t cuda_hostbuf_pool_recv;
+	mpool_t cuda_hostbuf_pool_small_recv;
+	CUstream cudastream_recv;
+#endif
+};
+
+/*
+ * TID member list format used in communication.
+ * Since the compiler does not guarantee bit-field ordering,
+ * we use the masks and shifts defined below.
+typedef struct {
+	uint32_t length:11;	// in page unit, max 1024 pages
+	uint32_t reserved:9;	// for future usage
+	uint32_t tidctrl:2;	// hardware defined tidctrl value
+	uint32_t tid:10;	// hardware only support 10bits
+}
+ips_tid_session_member;
+ */
+#define IPS_TIDINFO_LENGTH_SHIFT	0
+#define IPS_TIDINFO_LENGTH_MASK		0x7ff
+#define IPS_TIDINFO_TIDCTRL_SHIFT	20
+#define IPS_TIDINFO_TIDCTRL_MASK	0x3
+#define IPS_TIDINFO_TID_SHIFT		22
+#define IPS_TIDINFO_TID_MASK		0x3ff
+
+#define IPS_TIDINFO_GET_LENGTH(tidinfo)	\
+	(((tidinfo)>>IPS_TIDINFO_LENGTH_SHIFT)&IPS_TIDINFO_LENGTH_MASK)
+#define IPS_TIDINFO_GET_TIDCTRL(tidinfo) \
+	(((tidinfo)>>IPS_TIDINFO_TIDCTRL_SHIFT)&IPS_TIDINFO_TIDCTRL_MASK)
+#define IPS_TIDINFO_GET_TID(tidinfo) \
+	(((tidinfo)>>IPS_TIDINFO_TID_SHIFT)&IPS_TIDINFO_TID_MASK)
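+/* Illustrative round trip (not part of the original header): a word packed as
+ *
+ *   uint32_t tidinfo = (len  << IPS_TIDINFO_LENGTH_SHIFT)  |
+ *                      (ctrl << IPS_TIDINFO_TIDCTRL_SHIFT) |
+ *                      (tid  << IPS_TIDINFO_TID_SHIFT);
+ *
+ * satisfies IPS_TIDINFO_GET_LENGTH(tidinfo) == len, and likewise for the
+ * other two accessors, provided len <= 0x7ff, ctrl <= 0x3 and tid <= 0x3ff.
+ */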
+
+// This structure is used as CTS payload to describe TID receive
+// for UD it describes the destination for an RDMA Write
+// N/A for UDP
+typedef struct ips_tid_session_list_tag {
+	// TBD on how we will handle unaligned start/end at receiver
+	uint32_t tsess_srcoff;	/* source offset from beginning */
+	uint32_t tsess_length;	/* session length, including start/end */
+	uint64_t tsess_raddr;	/* RDMA virt addr of this part of receiver's buffer */
+							/* already adjusted for srcoff */
+	uint32_t tsess_rkey;	/* rkey for receiver's buffer */
+} PACK_SUFFIX  ips_tid_session_list;
+
+/*
+ * Send-side expected send descriptors.
+ *
+ * Descriptors are allocated when tid grant requests are received (the 'target'
+ * side of an RDMA get request).  Descriptors are added to a pending queue of
+ * expected sends and processed one at a time (scb's are requested and messages
+ * sent until all fragments of the descriptor's length are put on the wire).
+ *
+ */
+#define TIDSENDC_SDMA_VEC_DEFAULT	260
+
+struct ips_tid_send_desc {
+	struct ips_protoexp *protoexp;
+	 STAILQ_ENTRY(ips_tid_send_desc) next;
+
+	/* Filled in at allocation time */
+	ptl_arg_t sdescid;	/* sender descid */
+	ptl_arg_t rdescid;	/* receiver descid */
+	ips_epaddr_t *ipsaddr;
+	psm2_mq_req_t mqreq;
+
+	psm2_verbs_mr_t mr;
+
+	/* Iterated during send progress */
+	void *userbuf;		/* user provided buffer */
+	void *buffer;
+	uint32_t length;	/* total length, including start/end */
+
+
+	uint8_t is_complete:1;	// all packets for send queued, waiting CQE/response
+#ifdef RNDV_MOD
+	uint8_t rv_need_err_chk_rdma:1; // need to determine if a retry is required
+	uint8_t reserved:6;
+	uint8_t rv_sconn_index;	// sconn in rv we issued RDMA write on
+	uint32_t rv_conn_count;	// count of completed sconn connection establishments
+#else
+	uint8_t reserved:7;
+#endif
+
+#ifdef PSM_CUDA
+	/* Since the size of a cuda_hostbuf is less than or equal to the
+	 * window size, at most two host bufs ever need to be attached
+	 * to a tidsendc.
+	 */
+	struct ips_cuda_hostbuf *cuda_hostbuf[2];
+	/* Number of hostbufs attached */
+	uint8_t cuda_num_buf;
+#endif
+	// ips_tid_session_list is fixed sized for UD
+	// N/A to UDP
+	ips_tid_session_list tid_list;
+};
+
+#define TIDRECVC_STATE_FREE      0
+#define TIDRECVC_STATE_BUSY      1
+
+struct ips_expected_recv_stats {
+	uint32_t nSeqErr;
+	uint32_t nGenErr;
+	uint32_t nReXmit;
+	uint32_t nErrChkReceived;
+};
+
+struct ips_tid_recv_desc {
+	const psmi_context_t *context;
+	struct ips_protoexp *protoexp;
+
+	ptl_arg_t rdescid;	/* receiver descid */
+	ips_epaddr_t *ipsaddr;
+	struct ips_tid_get_request *getreq;
+
+	/* scb to send tid grant CTS */
+	ips_scb_t *grantscb;
+	psm2_verbs_mr_t mr;	// MR for this message window/chunk
+
+	/* TF protocol state (recv) */
+	uint32_t state;
+	// TBD - these next 3 fields are probably not needed for PSM_UD USE_RC
+	uint32_t tidflow_active_gen;
+	uint32_t tidflow_nswap_gen;
+	psmi_seqnum_t tidflow_genseq;
+
+#ifdef PSM_CUDA
+	struct ips_cuda_hostbuf *cuda_hostbuf;
+	uint8_t is_ptr_gpu_backed;
+#endif
+
+	void *buffer;
+	uint32_t recv_msglen;
+
+	struct ips_expected_recv_stats stats;
+
+	/* bitmap of queued control messages */
+	uint16_t ctrl_msg_queued;
+	// ips_tid_session_list is fixed sized for UD
+	// N/A to UDP
+	ips_tid_session_list tid_list;
+};
+
+/*
+ * Get requests, issued by MQ when there's a match on a large message.  Unlike
+ * an RDMA get, the initiator identifies the location of the data at the target
+ * using a 'send token' instead of a virtual address.  This, of course, assumes
+ * that the target has already registered the token and communicated it to the
+ * initiator beforehand (it actually sends the token as part of the initial
+ * MQ message that contains the MQ tag).
+ *
+ * The operation is semantically a two-sided RDMA get.
+ */
+typedef void (*ips_tid_completion_callback_t) (psm2_mq_req_t);
+
+struct ips_tid_get_request {
+	STAILQ_ENTRY(ips_tid_get_request) tidgr_next;
+	struct ips_protoexp *tidgr_protoexp;
+	psm2_epaddr_t tidgr_epaddr;
+
+	void *tidgr_lbuf;
+	uint32_t tidgr_length;
+	uint32_t tidgr_rndv_winsz;
+	uint32_t tidgr_sendtoken;
+	ips_tid_completion_callback_t tidgr_callback;
+	psm2_mq_req_t tidgr_req;
+
+	uint32_t tidgr_offset;	/* offset in bytes */
+	uint32_t tidgr_bytesdone;
+	uint32_t tidgr_flags;
+
+#ifdef PSM_CUDA
+	int cuda_hostbuf_used;
+	uint32_t tidgr_cuda_bytesdone;
+	STAILQ_HEAD(ips_tid_getreq_cuda_hostbuf_pend,	/* pending exp. sends */
+		    ips_cuda_hostbuf) pend_cudabuf;
+#endif
+};
+
+/*
+ * Descriptor limits, structure contents of struct psmi_rlimit_mpool for
+ * normal, min and large configurations.
+ */
+#define TID_SENDSESSIONS_LIMITS {				\
+	    .env = "PSM3_RDMA_SENDSESSIONS_MAX",			\
+	    .descr = "RDMA max send session descriptors",	\
+	    .env_level = PSMI_ENVVAR_LEVEL_USER,		\
+	    .minval = 1,					\
+	    .maxval = 1<<30,					\
+	    .mode[PSMI_MEMMODE_NORMAL]  = { 256,  8192 },	\
+	    .mode[PSMI_MEMMODE_MINIMAL] = {   1,     1 },	\
+	    .mode[PSMI_MEMMODE_LARGE]   = { 512, 16384 }	\
+	}
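+/* Reading the table above (the two values per mode are assumed to be the
+ * allocation chunk size and the pool maximum, per struct psmi_rlimit_mpool):
+ * under PSMI_MEMMODE_NORMAL the pool would grow in chunks of 256 up to 8192
+ * descriptors, with PSM3_RDMA_SENDSESSIONS_MAX able to override the maximum
+ * within [1, 2^30].
+ */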
+
+/*
+ * Expected send support
+ */
+/*
+ * The expsend token is currently always a pointer to a MQ request.  It is
+ * echoed on the wire throughout various phases of the expected send protocol
+ * to identify a particular send.
+ */
+psm2_error_t
+MOCKABLE(ips_protoexp_init)(const psmi_context_t *context,
+			      const struct ips_proto *proto,
+			      uint32_t protoexp_flags, int num_of_send_bufs,
+			      int num_of_send_desc,
+			      struct ips_protoexp **protoexp_o);
+MOCK_DCL_EPILOGUE(ips_protoexp_init);
+
+psm2_error_t ips_protoexp_fini(struct ips_protoexp *protoexp);
+
+int ips_protoexp_handle_immed_data(struct ips_proto *proto, uint64_t conn_ref,
+								int conn_type, uint32_t immed, uint32_t len);
+int ips_protoexp_rdma_write_completion( uint64_t wr_id);
+#ifdef RNDV_MOD
+int ips_protoexp_rdma_write_completion_error(psm2_ep_t ep, uint64_t wr_id,
+												enum ibv_wc_status wc_status);
+int ips_protoexp_process_err_chk_rdma(struct ips_recvhdrq_event *rcv_ev);
+int ips_protoexp_process_err_chk_rdma_resp(struct ips_recvhdrq_event *rcv_ev);
+#endif
+
+
+
+PSMI_ALWAYS_INLINE(
+void ips_protoexp_unaligned_copy(uint8_t *dst, uint8_t *src, uint16_t len))
+{
+	while (len) {
+		dst[len-1] = src[len-1];
+		len--;
+	}
+}
+
+/*
+ * Peer is waiting (blocked) for this request
+ */
+#define IPS_PROTOEXP_TIDGET_WAIT	0x1
+#define IPS_PROTOEXP_TIDGET_PEERWAIT	0x2
+psm2_error_t ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp,
+			    void *buf, uint32_t length,
+			    psm2_epaddr_t epaddr,
+			    uint32_t remote_tok, uint32_t flags,
+			    ips_tid_completion_callback_t
+			    callback, psm2_mq_req_t req);
+psm2_error_t
+ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp,
+			    ips_epaddr_t *ipsaddr, psm2_mq_req_t req,
+			    ptl_arg_t rdescid, uint32_t tidflow_genseq,
+			    ips_tid_session_list *tid_list,
+			    uint32_t tid_list_size);
+#endif /* #ifndef __IPS_EXPECTED_PROTO_H__ */
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_opp_path_rec.c b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_opp_path_rec.c
new file mode 100644
index 0000000000000000000000000000000000000000..8616250ab9a3ad1f9722c2697c92d23fc71b874e
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_opp_path_rec.c
@@ -0,0 +1,576 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "ips_proto.h"
+#include <dlfcn.h>
+
+/* SLID and DLID are in network byte order */
+static psm2_error_t
+ips_opp_get_path_rec(ips_path_type_t type, struct ips_proto *proto,
+		     uint16_t slid, uint16_t dlid,
+		     ips_path_rec_t **ppath_rec)
+{
+	psm2_error_t err = PSM2_OK;
+	ibta_path_rec_t query, opp_response;
+#ifdef _HFI_DEBUGGING
+	int opp_response_set = 0;
+#endif
+	ips_path_rec_t *path_rec;
+	int opp_err;
+	ENTRY elid, *epath = NULL;
+	char eplid[128];
+	uint64_t timeout_ack_ms;
+
+	/* Query path record query cache first */
+	bzero(&query, sizeof(query));
+	bzero(eplid, sizeof(eplid));
+
+	/* Bulk service ID is control service id + 1 */
+	switch (type) {
+	case IPS_PATH_LOW_PRIORITY:
+		query.service_id =
+		    __cpu_to_be64(proto->ep->service_id + DATA_VFABRIC_OFFSET);
+		break;
+	case IPS_PATH_NORMAL_PRIORITY:
+	case IPS_PATH_HIGH_PRIORITY:
+	default:
+		query.service_id = __cpu_to_be64(proto->ep->service_id);
+	}
+
+	query.slid = slid;
+	query.dlid = dlid;
+
+	snprintf(eplid, sizeof(eplid), "%s_%x_%x",
+		 (type == IPS_PATH_LOW_PRIORITY) ? "LOW" : "HIGH",
+		 query.slid, query.dlid);
+	elid.key = eplid;
+	hsearch_r(elid, FIND, &epath, &proto->ips_path_rec_hash);
+
+	if (!epath) {		/* Unable to find path record in cache */
+		elid.key =
+		    psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1);
+		path_rec = (ips_path_rec_t *)
+		    psmi_calloc(proto->ep, UNDEFINED, 1,
+				sizeof(ips_path_rec_t));
+		if (!elid.key || !path_rec) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+
+		/* Get path record between local LID and remote */
+		opp_err =
+		    proto->opp_fn.op_path_get_path_by_rec(proto->opp_ctxt,
+							  &query,
+							  &opp_response);
+		if (opp_err) {
+			err = PSM2_EPID_PATH_RESOLUTION;
+			goto fail;
+		}
+#ifdef _HFI_DEBUGGING
+		opp_response_set = 1;
+#endif
+		// This should not happen since we are using a LID-to-LID query,
+		// but at some point we need to figure out how to deal with
+		// virtualized IB environments where a GRH may be needed.
+		// A hop limit > 1 indicates a global route with a GRH.
+		if ((__be32_to_cpu(opp_response.hop_flow_raw) & 0xFF) > 1) {
+			_HFI_ERROR
+		    	("Global Routed Path Record not supported SLID 0x%x DLID 0x%x\n",
+				__be16_to_cpu(slid), __be16_to_cpu(dlid));
+			err = PSM2_EPID_PATH_RESOLUTION;
+			goto fail;
+		}
+		/* Create path record */
+		path_rec->pr_slid = opp_response.slid;
+		path_rec->pr_dlid = opp_response.dlid;
+		path_rec->pr_mtu =
+		    min(opa_mtu_enum_to_int(opp_response.mtu & 0x3f)
+				- MAX_PSM_HEADER
+			, proto->epinfo.ep_mtu);
+		path_rec->pr_pkey = ntohs(opp_response.pkey);
+		path_rec->pr_sl = ntohs(opp_response.qos_class_sl);
+		path_rec->pr_static_rate = opp_response.rate & 0x3f;
+
+		/* Setup CCA parameters for path */
+		if (path_rec->pr_sl > PSMI_SL_MAX) {
+			err = PSM2_INTERNAL_ERR;
+			goto fail;
+		}
+
+		/* Compute max timeout based on pkt life time for path */
+		timeout_ack_ms =
+		    ((4096UL * (1UL << (opp_response.pkt_life & 0x3f))) /
+		     1000000UL);
+		timeout_ack_ms =
+		    ms_2_cycles(IPS_PROTO_ERRCHK_MS_MIN_DEFAULT +
+				timeout_ack_ms);
+		if (proto->epinfo.ep_timeout_ack_max < timeout_ack_ms)
+			proto->epinfo.ep_timeout_ack_max = timeout_ack_ms;
+		err = ips_make_ah(proto->ep, path_rec);
+		if (err != PSM2_OK)
+			goto fail;
+
+		/* Add path record into cache */
+		strcpy(elid.key, eplid);
+		elid.data = (void *)path_rec;
+		hsearch_r(elid, ENTER, &epath, &proto->ips_path_rec_hash);
+	} else			/* Path record found in cache */
+		path_rec = (ips_path_rec_t *) epath->data;
+
+#ifdef _HFI_DEBUGGING
+	/* Dump path record stats */
+	_HFI_CONNDBG("Path Record ServiceID: %" PRIx64 " %x -----> %x\n",
+		   (uint64_t) __be64_to_cpu(query.service_id),
+		   __be16_to_cpu(slid), __be16_to_cpu(dlid));
+	if (opp_response_set)
+	{
+		_HFI_CONNDBG("MTU: %x, %x\n", (opp_response.mtu & 0x3f),
+			   path_rec->pr_mtu);
+		_HFI_CONNDBG("PKEY: 0x%04x\n", ntohs(opp_response.pkey));
+		_HFI_CONNDBG("SL: 0x%04x\n", ntohs(opp_response.qos_class_sl));
+		_HFI_CONNDBG("Rate: %x\n", (opp_response.rate & 0x3f));
+	}
+	_HFI_CONNDBG("Timeout Init.: 0x%" PRIx64 " Max: 0x%" PRIx64 "\n",
+		   proto->epinfo.ep_timeout_ack,
+		   proto->epinfo.ep_timeout_ack_max);
+#endif
+	/* Return the IPS path record */
+	*ppath_rec = path_rec;
+	return err;
+
+fail:
+	if (elid.key)
+		psmi_free(elid.key);
+	if (path_rec)
+		psmi_free(path_rec);
+	return err;
+}
+
+static psm2_error_t
+ips_opp_path_rec(struct ips_proto *proto,
+		 uint16_t slid, uint16_t dlid,
+		 uint16_t ip_hi,	// unused here, but must match API signature
+		 unsigned long timeout, ips_path_grp_t **ppathgrp)
+{
+	psm2_error_t err = PSM2_OK;
+	uint16_t pidx, cpath, num_path = (1 << proto->epinfo.ep_lmc);
+	ips_path_type_t path_type = IPS_PATH_NORMAL_PRIORITY;
+	ips_path_rec_t *path;
+	ips_path_grp_t *pathgrp;
+	uint16_t path_slid, path_dlid;
+	ENTRY elid, *epath = NULL;
+	char eplid[128];
+
+	/*
+	 * High Priority Path
+	 * ------------------
+	 *
+	 * Uses the "base" Service ID. For now there exists only 1 high priority
+	 * path between nodes even for non zero LMC fabrics.
+	 *
+	 * Normal/Low Priority Paths
+	 * -------------------------
+	 *
+	 * Currently these paths are the same i.e. they are queried for the same
+	 * Service ID/vFabric which is the Base Service ID for High Priority + 1.
+	 *
+	 * Use case Scenarios
+	 * ------------------
+	 *
+	 * Since with vFabrics we have the capability to define different QoS
+	 * parameters per vFabric it is envisioned that the IPS_PATH_HIGH_PRIORITY is
+	 * setup in a separate vFabric for high priority traffic. The NORMAL paths
+	 * are setup in a separate vFabric optimized for high bandwidth. This allows
+	 * us to potentially have control traffic (RTS, CTS etc.) not be bottlenecked
+	 * by bulk transfer data. All control messages (ACKs,NAKs, TID_GRANT etc.)
+	 * also use the high priority control vFabric.
+	 *
+	 * NOTE: In order to distinguish between the different vFabrics the user
+	 * specifies the service ID to use via mpirun (or environment variable).
+	 * This is the service ID for the high priority control traffic. The bulk
+	 * data vFabric is identified by service ID + 1. So for each MPI application
+	 * one should specify two service IDs for the high priority and bulk data.
+	 * Both these service IDs can be placed in the same vFabric which can be
+	 * configured for high priority or bandwidth traffic, giving us the default
+	 * behavior up to the InfiniPath 2.5 release.
+	 *
+	 * NOTE: All of the above would have really helped if the S20 silicon could
+	 * correctly support IBTA QoS features. Due to S20 design we can only have
+	 * high priority VLarb table (low priority VLarb table results in round
+	 * robin arbitration ignoring the weights!). But if this is fixed in a
+	 * subsequent chip respin then this may potentially help our scalability
+	 * on large fabrics.
+	 *
+	 * Mesh/Torus and DOR routed networks
+	 * ----------------------------------
+	 *
+	 * In a mesh/torus fabric we always have a non zero LMC (at least 1 can be
+	 * more). We would like to take advantage of dispersive routing on these
+	 * fabrics as well to obtain better "worst case/congested" bandwidth. For
+	 * these networks currently the base LIDs are used for UPDN routing which
+	 * is suboptimal on these networks. Higher order LIDs (+1 .. +N) use DOR
+	 * routing (Dimension Ordered Routing) to avoid deadlocks and provide
+	 * higher performance. If a fabric is disrupted then only the base UPDN
+	 * routing is available. PSM should continue to operate in this environment
+	 * albeit with degraded performance. In a disrupted fabric the OPP path
+	 * record queries may fail for some DOR routed LIDs, i.e. no path exists.
+	 * PSM should hence ignore path record failures, as they indicate a
+	 * disrupted fabric, and only use the valid paths that are returned from
+	 * the replica. This degenerates to using only the UPDN paths on disrupted
+	 * fabrics and the DOR routes on fully configured fabrics. Note: for a
+	 * clean fabric the base LIDs configured for the UPDN route will not exist
+	 * in the replica, as DOR routes are preferred. Hence we dispersively route
+	 * across the DOR routes, using the UPDN route only for disrupted fabrics.
+	 *
+	 * AS LONG AS ONE PATH EXISTS (for each of the priorities) COMMUNICATION CAN
+	 * TAKE PLACE.
+	 */
+
+	/* Check if this path grp is already in hash table */
+	snprintf(eplid, sizeof(eplid), "%x_%x", slid, dlid);
+	elid.key = eplid;
+	hsearch_r(elid, FIND, &epath, &proto->ips_path_grp_hash);
+
+	if (epath) {		/* Find path group in cache */
+		*ppathgrp = (ips_path_grp_t *) epath->data;
+		return err;
+	}
+
+	/* If only base lids are used then reset num_path to 1 */
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE)
+		num_path = 1;
+
+	/* Allocate a new pathgroup */
+	elid.key = psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1);
+	pathgrp = (ips_path_grp_t *)
+	    psmi_calloc(proto->ep, UNDEFINED, 1, sizeof(ips_path_grp_t) +
+			num_path * IPS_PATH_MAX_PRIORITY *
+			sizeof(ips_path_rec_t *));
+	if (!elid.key || !pathgrp) {
+		if (elid.key)
+			psmi_free(elid.key);
+		if (pathgrp)
+			psmi_free(pathgrp);
+		err = PSM2_NO_MEMORY;
+		goto fail;
+	}
+
+	/*
+	 * dlid is the peer base lid.
+	 * slid is the base lid for the local end point.
+	 * Store here in network byte order.
+	 */
+	pathgrp->pg_base_dlid = dlid;
+	pathgrp->pg_base_slid = slid;
+
+	pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] =
+	    pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] =
+	    pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] = 0;
+
+	/* For now there is always only one high priority path between nodes. */
+	for (pidx = 0, cpath = 0; pidx < num_path && cpath == 0; pidx++) {
+		path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
+		path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);
+
+		err = ips_opp_get_path_rec(IPS_PATH_HIGH_PRIORITY, proto,
+					   path_slid, path_dlid,
+					   &path);
+
+		if (err == PSM2_OK) {	/* Valid high priority path found */
+			/* Resolved high priority path successfully */
+			pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY]++;
+			pathgrp->pg_path[cpath][IPS_PATH_HIGH_PRIORITY] = path;
+
+			/* Increment current path index */
+			cpath++;
+
+			/* Log only on success: 'path' is left unset when the
+			 * path record query fails */
+			PSM2_LOG_MSG("path %p slid %hu dlid %hu\n",
+				      path,
+				      __be16_to_cpu(path->pr_slid),
+				      __be16_to_cpu(path->pr_dlid));
+		}
+	}
+
+	/* Make sure we have at least 1 high priority path */
+	if (pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] == 0) {
+		psmi_free(elid.key);
+		psmi_free(pathgrp);
+		err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION,
+					"OFED Plus path lookup failed. Unable to resolve high priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %"
+					PRIx64 " defined?", ntohs(slid),
+					ntohs(dlid),
+					(uint64_t) proto->ep->service_id);
+		goto fail;
+	}
+
+
+
+	/* Next, set up the bulk paths. If the subnet administrator has
+	 * misconfigured, or simply not configured, two separate service IDs,
+	 * we place the bulk paths in the same vFabric as the control paths.
+	 */
+
+	path_type = IPS_PATH_NORMAL_PRIORITY;
+	for (pidx = 0, cpath = 0; pidx < num_path; pidx++) {
+		path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
+		path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);
+
+retry_normal_path_res:
+		err = ips_opp_get_path_rec(path_type, proto,
+					   path_slid, path_dlid,
+					   &path);
+		if (err != PSM2_OK) {
+			if (path_type == IPS_PATH_NORMAL_PRIORITY) {
+				/* Subnet may only be configured for one service ID/vFabric. Default
+				 * to using the control vFabric/service ID for bulk data as well.
+				 */
+				path_type = IPS_PATH_HIGH_PRIORITY;
+				goto retry_normal_path_res;
+			}
+
+			/* Unable to resolve path for <path_slid, path_dlid>. This is possible
+			 * for disrupted fabrics using DOR routing, so continue to acquire paths
+			 */
+			err = PSM2_OK;
+			continue;
+		}
+
+		/* Valid path. */
+		pathgrp->pg_path[cpath][IPS_PATH_NORMAL_PRIORITY] = path;
+		pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY]++;
+		cpath++;
+	}
+
+	/* Make sure we have at least a single bulk data transfer path */
+	if (pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] == 0) {
+		psmi_free(elid.key);
+		psmi_free(pathgrp);
+		err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION,
+					"OFED Plus path lookup failed. Unable to resolve normal priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %"
+					PRIx64 " defined?", ntohs(slid),
+					ntohs(dlid),
+					(uint64_t) proto->ep->service_id);
+		goto fail;
+	}
+
+	path_type = IPS_PATH_LOW_PRIORITY;
+	for (pidx = 0, cpath = 0; pidx < num_path; pidx++) {
+		path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
+		path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);
+
+retry_low_path_res:
+		err = ips_opp_get_path_rec(path_type, proto,
+					   path_slid, path_dlid,
+					   &path);
+		if (err != PSM2_OK) {
+			if (path_type == IPS_PATH_LOW_PRIORITY) {
+				/* Subnet may only be configured for one service ID/vFabric. Default
+				 * to using the control vFabric/service ID for bulk data as well.
+				 */
+				path_type = IPS_PATH_HIGH_PRIORITY;
+				goto retry_low_path_res;
+			}
+
+			/* Unable to resolve path for <path_slid, path_dlid>. This is possible
+			 * for disrupted fabrics using DOR routing, so continue to acquire paths
+			 */
+			err = PSM2_OK;
+			continue;
+		}
+
+		/* Valid path. */
+		pathgrp->pg_path[cpath][IPS_PATH_LOW_PRIORITY] = path;
+		pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY]++;
+		cpath++;
+	}
+
+	/* Make sure we have at least a single bulk data transfer path */
+	if (pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] == 0) {
+		psmi_free(elid.key);
+		psmi_free(pathgrp);
+		err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION,
+					"OFED Plus path lookup failed. Unable to resolve low priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %"
+					PRIx64 " defined?", ntohs(slid),
+					ntohs(dlid),
+					(uint64_t) proto->ep->service_id);
+		goto fail;
+	}
+
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) {
+		pathgrp->pg_next_path[IPS_PATH_NORMAL_PRIORITY] =
+		    proto->epinfo.EP_HASH %
+		    pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY];
+		pathgrp->pg_next_path[IPS_PATH_LOW_PRIORITY] =
+		    proto->epinfo.EP_HASH %
+		    pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY];
+	}
+
+	/* Add path group into cache */
+	strcpy(elid.key, eplid);
+	elid.data = (void *)pathgrp;
+	hsearch_r(elid, ENTER, &epath, &proto->ips_path_grp_hash);
+
+	*ppathgrp = pathgrp;
+
+fail:
+	if (err != PSM2_OK)
+		_HFI_CONNDBG
+		    ("Unable to get path record for LID 0x%x <---> DLID 0x%x.\n",
+		     slid, dlid);
+	return err;
+}
+
+static psm2_error_t ips_opp_fini(struct ips_proto *proto)
+{
+	psm2_error_t err = PSM2_OK;
+
+	if (proto->opp_lib)
+		dlclose(proto->opp_lib);
+
+	return err;
+}
+
+psm2_error_t ips_opp_init(struct ips_proto *proto)
+{
+	psm2_error_t err = PSM2_OK;
+	char hfiName[32];
+
+	proto->opp_lib = dlopen(DF_OPP_LIBRARY, RTLD_NOW);
+	if (!proto->opp_lib) {
+		char *dlerr = dlerror();
+		_HFI_ERROR
+		    ("Unable to open OFED Plus Plus library %s. Error: %s\n",
+		     DF_OPP_LIBRARY, dlerr ? dlerr : "no dlerror()");
+		goto fail;
+	}
+
+	/* Resolve symbols that we require within opp library */
+	proto->opp_fn.op_path_find_hca =
+	    dlsym(proto->opp_lib, "op_path_find_hfi");
+	proto->opp_fn.op_path_open = dlsym(proto->opp_lib, "op_path_open");
+	proto->opp_fn.op_path_close = dlsym(proto->opp_lib, "op_path_close");
+	proto->opp_fn.op_path_get_path_by_rec =
+	    dlsym(proto->opp_lib, "op_path_get_path_by_rec");
+
+	/* If we can't resolve any symbol then fail to load the opp module */
+	if (!proto->opp_fn.op_path_find_hca || !proto->opp_fn.op_path_open ||
+	    !proto->opp_fn.op_path_close
+	    || !proto->opp_fn.op_path_get_path_by_rec) {
+		_HFI_ERROR
+		    ("Unable to resolve symbols in OPP library. Unloading.\n");
+		goto fail;
+	}
+
+	/* If PSM3_IDENTIFY is set display the OPP library location being used. */
+	if (psmi_parse_identify()) {
+		Dl_info info_opp;
+		printf
+		    ("PSM3 path record queries using OFED Plus Plus (%s) from %s\n",
+		     DF_OPP_LIBRARY,
+		     dladdr(proto->opp_fn.op_path_open, &info_opp)
+			? info_opp.dli_fname
+			: "Unknown/unsupported version of OPP library found!");
+	}
+
+	/* Obtain handle to hfi (requires verbs on node) */
+	snprintf(hfiName, sizeof(hfiName), "%s_%d",
+		 psmi_hal_get_hfi_name(),
+		 proto->ep->unit_id);
+	proto->hndl = proto->opp_fn.op_path_find_hca(hfiName, &proto->device);
+	if (!proto->hndl) {
+		_HFI_ERROR
+		    ("OPP: Unable to find NIC %s. Disabling OPP interface for path record queries.\n",
+		     hfiName);
+		goto fail;
+	}
+
+	/* Get OPP context */
+	proto->opp_ctxt = proto->opp_fn.op_path_open(proto->device, 1);
+	if (!proto->opp_ctxt) {
+		_HFI_ERROR
+		    ("OPP: Unable to obtain OPP context. Disabling OPP interface for path record queries.\n");
+		goto fail;
+	}
+
+	/* Setup default errorcheck timeout. OPP may change it later. */
+	proto->epinfo.ep_timeout_ack =
+	    ms_2_cycles(IPS_PROTO_ERRCHK_MS_MIN_DEFAULT);
+	proto->epinfo.ep_timeout_ack_max =
+	    ms_2_cycles(IPS_PROTO_ERRCHK_MS_MIN_DEFAULT);
+	proto->epinfo.ep_timeout_ack_factor = IPS_PROTO_ERRCHK_FACTOR_DEFAULT;
+
+	/* OPP initialized successfully */
+	proto->ibta.get_path_rec = ips_opp_path_rec;
+	proto->ibta.fini = ips_opp_fini;
+	proto->flags |= IPS_PROTO_FLAG_QUERY_PATH_REC;
+
+	return err;
+
+fail:
+	_HFI_ERROR("Make sure SM is running...\n");
+	_HFI_ERROR("Make sure service ibacm is running...\n");
+	_HFI_ERROR("to start ibacm: service ibacm start\n");
+	_HFI_ERROR("or enable it at boot time: iefsconfig -E ibacm\n\n");
+
+	err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION,
+				"Unable to initialize OFED Plus library successfully.\n");
+
+	if (proto->opp_lib)
+		dlclose(proto->opp_lib);
+
+	return err;
+}
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_path_rec.c b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_path_rec.c
new file mode 100644
index 0000000000000000000000000000000000000000..ac1530484917b0f158b758a19dd1b1ee47f59d38
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_path_rec.c
@@ -0,0 +1,554 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "ips_proto.h"
+
+/*
+ * These are the default values used in parsing the environment
+ * variable PSM3_PATH_NO_LMC_RANGE, which can be used to exclude
+ * a range of message sizes from the LMC LID assignments used to
+ * implement dispersive routing.
+ *
+ * This value is 2^32 - 1.
+ */
+#define DEF_LIMITS_STRING "4294967295:4294967295"
+#define DEF_LIMITS_VALUE 4294967295
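+/* Illustrative (hypothetical values): setting
+ *
+ *   PSM3_PATH_NO_LMC_RANGE=8192:65536
+ *
+ * would exclude message sizes from 8 KiB through 64 KiB from the LMC LID
+ * assignments, while the default above ("4294967295:4294967295") effectively
+ * excludes nothing.
+ */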
+
+
+
+
+
+static psm2_error_t
+ips_none_get_path_rec(struct ips_proto *proto,
+		      uint16_t slid, uint16_t dlid,
+		      uint16_t ip_hi,
+		      unsigned long timeout, ips_path_rec_t **ppath_rec)
+{
+	psm2_error_t err = PSM2_OK;
+	ips_path_rec_t *path_rec;
+	ENTRY elid, *epath = NULL;
+	char eplid[128];
+
+	/* Query the path record cache */
+	// TBD - slid same until have dispersive LMC-like, could just use dest
+	snprintf(eplid, sizeof(eplid), "%x_%x%04x", slid, ip_hi, dlid);
+	elid.key = eplid;
+	hsearch_r(elid, FIND, &epath, &proto->ips_path_rec_hash);
+
+	if (!epath) {
+		elid.key =
+		    psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1);
+		path_rec = (ips_path_rec_t *)
+		    psmi_calloc(proto->ep, UNDEFINED, 1,
+				sizeof(ips_path_rec_t));
+		if (!elid.key || !path_rec) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+
+		/* Create path record */
+		path_rec->pr_slid = slid;
+		path_rec->pr_dlid = dlid;
+		path_rec->pr_mtu = proto->epinfo.ep_mtu;
+		path_rec->pr_pkey = proto->epinfo.ep_pkey;
+		path_rec->pr_sl = proto->epinfo.ep_sl;
+		path_rec->pr_ip_hi = ip_hi;
+		path_rec->pr_static_rate = proto->epinfo.ep_link_rate;
+
+
+		/* Setup CCA parameters for path */
+		if (path_rec->pr_sl > PSMI_SL_MAX) {
+			err =  PSM2_INTERNAL_ERR;
+			goto fail;
+		}
+		err = ips_make_ah(proto->ep, path_rec);
+		if (err != PSM2_OK)
+			goto fail;
+
+		/* Add path record into cache */
+		strcpy(elid.key, eplid);
+		elid.data = (void *)path_rec;
+		hsearch_r(elid, ENTER, &epath, &proto->ips_path_rec_hash);
+	} else
+		path_rec = (ips_path_rec_t *) epath->data;
+
+	/* Return IPS path record */
+	*ppath_rec = path_rec;
+
+	return err;
+
+fail:
+	if (elid.key)
+		psmi_free(elid.key);
+	if (path_rec)
+		psmi_free(path_rec);
+	return err;
+}
+
+// This works for UD address vectors as well as the ah_attr in an RC QP attrs
+psm2_error_t ips_path_rec_to_ah_attr(psm2_ep_t ep,
+				const ips_path_rec_t *path_rec, struct ibv_ah_attr *ah_attr)
+{
+	memset(ah_attr, 0, sizeof *ah_attr);
+
+	// we keep PR in network byte order
+	// ah_attr is in CPU byte order except for GIDs which are always
+	// in network byte order
+	ah_attr->sl = path_rec->pr_sl;
+	ah_attr->port_num = ep->portnum;
+	ah_attr->static_rate = path_rec->pr_static_rate;
+	// for OPA/IB we use dlid and is_global=0, for eth use dgid and is_global=1
+	if (ep->verbs_ep.link_layer != IBV_LINK_LAYER_ETHERNET) {
+		// OPA or IB
+			// NIC/HCA/HFI will only look at low "LMC" worth of bits
+		ah_attr->src_path_bits = __be16_to_cpu(path_rec->pr_slid);
+		ah_attr->dlid = __be16_to_cpu(path_rec->pr_dlid);
+		ah_attr->is_global  = 0;
+		_HFI_CONNDBG("creating AH with DLID %u\n", ah_attr->dlid);
+	} else {
+		ah_attr->src_path_bits = 0;
+		ah_attr->dlid = 1;	// not used on ethernet, make non-zero
+		ah_attr->is_global  = 1;
+		ah_attr->grh.dgid = ep->verbs_ep.lgid;
+		ah_attr->grh.dgid.raw[12] =  (uint8_t)(__be16_to_cpu(path_rec->pr_ip_hi)>>8);
+		ah_attr->grh.dgid.raw[13] =  (uint8_t)(__be16_to_cpu(path_rec->pr_ip_hi));
+		ah_attr->grh.dgid.raw[14] =  (uint8_t)(__be16_to_cpu(path_rec->pr_dlid)>>8);
+		ah_attr->grh.dgid.raw[15] =  (uint8_t)(__be16_to_cpu(path_rec->pr_dlid));
+		ah_attr->grh.sgid_index = ep->verbs_ep.lgid_index;
+		ah_attr->grh.hop_limit = 0xFF;
+		ah_attr->grh.traffic_class = 0;
+		if (_HFI_CONNDBG_ON) {
+			char buf[80];
+			_HFI_CONNDBG("creating AH with DGID: %s\n",
+				__psm2_dump_gid(&ah_attr->grh.dgid, buf, sizeof(buf)));
+		}
+	}
+	return PSM2_OK;
+}
+
+psm2_error_t ips_make_ah(psm2_ep_t ep, ips_path_rec_t *path_rec)
+{
+	struct ibv_ah_attr ah_attr;
+
+	if (path_rec->ah) {
+		_HFI_CONNDBG("make_ah called second time on given path_rec, skipping\n");
+		return PSM2_OK;
+	}
+	if (PSM2_OK != ips_path_rec_to_ah_attr(ep, path_rec, &ah_attr)) {
+		_HFI_ERROR( "Unable to convert path_rec to AH for %s port %u\n", ep->dev_name, ep->portnum);
+		return PSM2_INTERNAL_ERR;
+	}
+	path_rec->ah = ibv_create_ah(ep->verbs_ep.pd, &ah_attr);
+	if (! path_rec->ah) {
+		int save_errno = errno;
+		_HFI_ERROR( "Unable to create AH for %s: %s (%d)\n", ep->dev_name, strerror(save_errno), save_errno);
+		if (save_errno == ETIMEDOUT)
+			return PSM2_EPID_PATH_RESOLUTION;
+		else
+			return PSM2_INTERNAL_ERR;
+	}
+	_HFI_CONNDBG("created AH %p\n", path_rec->ah);
+	// PSM doesn't free path_rec structures on shutdown, so this will
+	// simply leak and be cleaned up by the kernel close when we shutdown
+	return PSM2_OK;
+}
+
+#ifdef RNDV_MOD
+void ips_path_rec_to_ib_user_path_rec(psm2_ep_t ep,
+		const ips_path_rec_t *path_rec, union ibv_gid *dgid,
+		struct ib_user_path_rec *path)
+{
+	memset(path, 0, sizeof(*path));
+	memcpy(&path->sgid, &ep->verbs_ep.lgid, sizeof(path->sgid));
+	memcpy(&path->dgid, dgid, sizeof(path->dgid));
+	path->slid = path_rec->pr_slid; /* __be16 */
+	if (ep->verbs_ep.link_layer != IBV_LINK_LAYER_ETHERNET)
+		path->dlid = path_rec->pr_dlid; /* __be16 */
+	else
+		path->dlid = __cpu_to_be16(1);
+	//path->raw_traffic
+	//path->flow_label
+	path->reversible = 1;
+	path->mtu = opa_mtu_int_to_enum(path_rec->pr_mtu);
+	path->pkey = __cpu_to_be16(path_rec->pr_pkey); /* __be16 */
+	path->hop_limit = (ep->verbs_ep.link_layer == IBV_LINK_LAYER_ETHERNET)
+						?0xFF:0;	// indicates if need GRH
+	//path->traffic_class
+	path->numb_path = 1;
+	path->sl = path_rec->pr_sl;
+	path->mtu_selector = 2;  /* Exactly the given MTU */
+	path->rate_selector = 2; /* Exactly the given rate */
+	// ips_path_rec.pr_static_rate is negotiated in PSM REQ/REP
+	// then also use negotiated rate in user RC QP, ah_attr above and here
+	path->rate = path_rec->pr_static_rate;
+	path->packet_life_time_selector = 2; /* Exactly the given LT */
+	// the value supplied here will be increased by the CM based on ack_delay
+	// typically ack_delay will be small compared to packet_life_time
+	// in which case the CM will end up using packet_life_time+1 as the timeout
+	// so we pass timeout-1 here so final timeout is usually what was requested
+	path->packet_life_time = ep->hfi_qp_timeout - 1;
+	//path->preferences
+}
+#endif // RNDV_MOD
+
+static psm2_error_t
+ips_none_path_rec(struct ips_proto *proto,
+		  uint16_t slid, uint16_t dlid,
+		  uint16_t ip_hi,
+		  unsigned long timeout, ips_path_grp_t **ppathgrp)
+{
+	psm2_error_t err = PSM2_OK;
+	uint16_t pidx, num_path = (1 << proto->epinfo.ep_lmc);
+	uint16_t path_slid, path_dlid;
+	ips_path_rec_t *path;
+	ips_path_grp_t *pathgrp;
+	ENTRY elid, *epath = NULL;
+	char eplid[128];
+
+	num_path = 1;	// don't yet have multi-path dispersive routing
+					// maybe we use env to derive multiple sequential IP
+					// addresses, sort of like an LMC concept
+					// or use ECMP or other mechanism
+
+	/* For the "none" path record resolution all paths are assumed to be
+	 * of equal priority. However, since we want to isolate all control
+	 * traffic (ACKs, NAKs) on a separate path for non-zero LMC subnets,
+	 * the "first path" between a pair of endpoints is always the "higher"
+	 * priority path. The rest of the paths are the normal (and low
+	 * priority) paths.
+	 */
+
+	/* Query the path record cache */
+	// TBD - slid same until have dispersive LMC-like, could just use dest
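+	// The lookup key is the printable hex tuple "<slid>_<ip_hi><dlid>",
+	// giving one cached path group per local/remote address pair.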
+	snprintf(eplid, sizeof(eplid), "%x_%x%04x", slid, ip_hi, dlid);
+	elid.key = eplid;
+	hsearch_r(elid, FIND, &epath, &proto->ips_path_grp_hash);
+
+	if (epath) {		/* Find path group in cache */
+		*ppathgrp = (ips_path_grp_t *) epath->data;
+		return err;
+	}
+
+	/* If only base LIDs are used then reset num_path to 1 */
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE)
+		num_path = 1;
+
+	/* Allocate a new pathgroup */
+	elid.key = psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1);
+	pathgrp = (ips_path_grp_t *)
+	    psmi_calloc(proto->ep, UNDEFINED, 1, sizeof(ips_path_grp_t) +
+			num_path * IPS_PATH_MAX_PRIORITY *
+			sizeof(ips_path_rec_t *));
+	if (!elid.key || !pathgrp) {
+		if (elid.key)
+			psmi_free(elid.key);
+		if (pathgrp)
+			psmi_free(pathgrp);
+		err = PSM2_NO_MEMORY;
+		goto fail;
+	}
+
+	/*
+	 * dlid is the peer base lid.
+	 * slid is the base lid for the local end point.
+	 * Store in network byte order.
+	 */
+	pathgrp->pg_base_dlid = dlid;
+	pathgrp->pg_base_slid = slid;
+
+	if (num_path > 1) {
+		/* One control path and (num_path - 1) normal and low priority paths */
+		pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] = 1;
+		pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] = num_path - 1;
+		pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] = num_path - 1;
+	} else {
+		/* LMC of 0. Use the same path for all priorities */
+		pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] = 1;
+		pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] = 1;
+		pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] = 1;
+	}
+
+	/* For "none" path record we just setup 2^lmc paths. To get better load
+	 * balance
+	 */
+	for (pidx = 0; pidx < num_path; pidx++) {
+		path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
+		path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);
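+		// Each successive path offsets both LIDs by pidx, e.g. a base
+		// DLID of 0x10 with num_path 4 would use DLIDs 0x10..0x13
+		// (illustrative values; num_path is currently forced to 1).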
+
+		err =
+		    ips_none_get_path_rec(proto, path_slid, path_dlid,
+					  ip_hi,
+					  timeout, &path);
+		if (err != PSM2_OK) {
+			psmi_free(elid.key);
+			psmi_free(pathgrp);
+			goto fail;
+		}
+
+		if (num_path > 1) {
+			if (pidx == 0) {
+				/* First path is always the high priority path */
+				pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY] =
+				    path;
+			} else {
+				pathgrp->pg_path[pidx -
+						 1][IPS_PATH_NORMAL_PRIORITY] =
+				    path;
+				pathgrp->pg_path[pidx -
+						 1][IPS_PATH_LOW_PRIORITY] =
+				    path;
+			}
+		} else {
+			pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY] = path;
+			pathgrp->pg_path[0][IPS_PATH_NORMAL_PRIORITY] = path;
+			pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY] = path;
+		}
+		PSM2_LOG_MSG("path %p slid %hu dlid %hu ip_hi %hu\n",
+			     path,
+			     __be16_to_cpu(path->pr_slid),
+			     __be16_to_cpu(path->pr_dlid),
+			     __be16_to_cpu(path->pr_ip_hi));
+
+	}
+
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) {
+		pathgrp->pg_next_path[IPS_PATH_NORMAL_PRIORITY] =
+		    proto->epinfo.EP_HASH %
+		    pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY];
+		pathgrp->pg_next_path[IPS_PATH_LOW_PRIORITY] =
+		    proto->epinfo.EP_HASH %
+		    pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY];
+	}
+
+	/* Add path record into cache */
+	strcpy(elid.key, eplid);
+	elid.data = (void *)pathgrp;
+	hsearch_r(elid, ENTER, &epath, &proto->ips_path_grp_hash);
+
+	*ppathgrp = pathgrp;
+
+fail:
+	if (err != PSM2_OK)
+		_HFI_CONNDBG
+		    ("Unable to get path record for %s port %u LID %x <---> DLID %x.\n",
+		     proto->ep->dev_name, proto->ep->portnum, slid, dlid);
+	return err;
+}
+
+static psm2_error_t ips_none_path_rec_init(struct ips_proto *proto)
+{
+	psm2_error_t err = PSM2_OK;
+
+	/* Obtain the SL and PKEY to use from the environment (PSM3_NIC_SL & PSM_KEY) */
+	proto->epinfo.ep_sl = proto->ep->out_sl;
+	proto->epinfo.ep_pkey = (uint16_t) proto->ep->network_pkey;
+
+	/*
+	 * Parse the err_chk settings from the environment.
+	 * <min_timeout>:<max_timeout>:<timeout_factor>
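+	 * (min is presumably the initial retry timeout, max caps the backoff,
+	 * and factor scales the timeout on successive retries; this is a
+	 * reading of the field names, not verified against the retry code.)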
+	 */
+	{
+		union psmi_envvar_val env_to;
+		char *errchk_to = PSM_TID_TIMEOUT_DEFAULT;
+		int tvals[3] = {
+			IPS_PROTO_ERRCHK_MS_MIN_DEFAULT,
+			IPS_PROTO_ERRCHK_MS_MAX_DEFAULT,
+			IPS_PROTO_ERRCHK_FACTOR_DEFAULT
+		};
+
+		if (!psmi_getenv("PSM3_ERRCHK_TIMEOUT",
+				 "Errchk timeouts in mS <min:max:factor>",
+				 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+				 (union psmi_envvar_val)errchk_to, &env_to)) {
+			/* Not using default values, parse what we can */
+			errchk_to = env_to.e_str;
+			psmi_parse_str_tuples(errchk_to, 3, tvals);
+			/* Clamp max up to min; a max smaller than min would break things */
+			if (tvals[1] < tvals[0])
+				tvals[1] = tvals[0];
+		}
+
+		proto->epinfo.ep_timeout_ack = ms_2_cycles(tvals[0]);
+		proto->epinfo.ep_timeout_ack_max = ms_2_cycles(tvals[1]);
+		proto->epinfo.ep_timeout_ack_factor = tvals[2];
+	}
+
+	proto->ibta.get_path_rec = ips_none_path_rec;
+	proto->ibta.fini = NULL;
+
+
+	return err;
+}
+
+
+/* On link up/down we need to update some state */
+psm2_error_t ips_ibta_link_updown_event(struct ips_proto *proto)
+{
+	psm2_error_t err = PSM2_OK;
+
+	/* Get base lid, lmc and rate as these may have changed if the link bounced */
+	proto->epinfo.ep_base_lid =
+	    __cpu_to_be16((uint16_t) psm2_epid_nid(proto->ep->context.epid));
+
+	proto->epinfo.ep_lmc = 0; // No LMC for UD
+	proto->epinfo.ep_link_rate = proto->ep->verbs_ep.active_rate;
+	return err;
+}
+
+psm2_error_t
+MOCKABLE(ips_ibta_init)(struct ips_proto *proto)
+{
+	psm2_error_t err = PSM2_OK;
+	union psmi_envvar_val path_disable_lmc_interval;
+
+	proto->flags |= IPS_PROTO_FLAG_PPOLICY_ADAPTIVE;
+
+	/* Initialize path record/group hash table */
+
+	{
+		uint32_t lmc_disable_low, lmc_disable_high;
+		int sscanf_ret;
+
+		/* The default disable_low and disable_high values
+		 * are 2^32 - 1, the maximum allowable message size.
+		 * So by default all messages should be smaller than the
+		 * lower limit, and so will not have LMC dispersive
+		 * routing disabled.
+		 *
+		 * In addition, these limits are applied only to SDMA
+		 * and PIO messages, NOT TID messages, so this size is
+		 * bigger than any PIO size.
+		 */
+		psmi_getenv("PSM3_PATH_NO_LMC_RANGE",
+		            "Disable LMC route dispersion within this range, "
+		             "low_value:high_value\n",
+			    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR,
+			    (union psmi_envvar_val)DEF_LIMITS_STRING,
+			    &path_disable_lmc_interval);
+
+		sscanf_ret = sscanf(path_disable_lmc_interval.e_str, "%u:%u",
+		       		   &lmc_disable_low, &lmc_disable_high);
+
+		/*
+		 * It's "invalid" for the low end of the range to be
+		 * larger than the high end of the range, so revert
+		 * to the "maximum message size" (2^32 - 1).
+		 */
+		if ((sscanf_ret != 2) || (lmc_disable_low > lmc_disable_high)) {
+			lmc_disable_low = lmc_disable_high = DEF_LIMITS_VALUE;
+		}
+
+		PSM2_LOG_MSG("PSM3_PATH_NO_LMC_RANGE: "
+			     "lmc_disable_low %u lmc_disable_high %u\n",
+			     lmc_disable_low, lmc_disable_high);
+
+		/*
+		 * These specify the range of message sizes in bytes, of
+		 * the messages to disable LMC dynamic LID assignment.
+		 */
+		proto->ips_lmc_disable_low = lmc_disable_low;
+		proto->ips_lmc_disable_high = lmc_disable_high;
+	}
+
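+	// Both caches use glibc's reentrant hsearch tables (hcreate_r); note
+	// these tables are fixed-size, so DF_PATH_REC_HASH_SIZE and
+	// DF_PATH_GRP_HASH_SIZE bound the number of cached entries.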
+	hcreate_r(DF_PATH_REC_HASH_SIZE, &proto->ips_path_rec_hash);
+	hcreate_r(DF_PATH_GRP_HASH_SIZE, &proto->ips_path_grp_hash);
+
+	/* On startup treat it as a link up/down event to set up state. */
+	if ((err = ips_ibta_link_updown_event(proto)) != PSM2_OK)
+		goto fail;
+
+	/* Setup the appropriate query interface for the endpoint */
+	switch (proto->ep->path_res_type) {
+	case PSM2_PATH_RES_OPP:
+		err = ips_opp_init(proto);
+		if (err != PSM2_OK)
+			_HFI_ERROR
+			    ("Unable to use OFED Plus Plus for path record queries.\n");
+		break;
+	case PSM2_PATH_RES_UMAD:
+		_HFI_ERROR
+		    ("Path record queries using UMAD is not supported in PSM version %d.%dx\n",
+		     PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR);
+		err = PSM2_EPID_PATH_RESOLUTION;
+		break;
+	case PSM2_PATH_RES_NONE:
+	default:
+		err = ips_none_path_rec_init(proto);
+	}
+
+fail:
+	return err;
+}
+MOCK_DEF_EPILOGUE(ips_ibta_init);
+
+psm2_error_t ips_ibta_fini(struct ips_proto *proto)
+{
+	psm2_error_t err = PSM2_OK;
+
+	if (proto->ibta.fini)
+		err = proto->ibta.fini(proto);
+
+	/* Destroy the path record/group hash */
+	hdestroy_r(&proto->ips_path_rec_hash);
+	hdestroy_r(&proto->ips_path_grp_hash);
+
+	return err;
+}
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_path_rec.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_path_rec.h
new file mode 100644
index 0000000000000000000000000000000000000000..fc0a4331053125977fc9a8f93d198162ccc9ce07
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_path_rec.h
@@ -0,0 +1,202 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2009-2014 Intel Corporation. All rights reserved. */
+
+
+#ifndef _IPS_PATH_REC_H_
+#define _IPS_PATH_REC_H_
+
+#include <search.h>
+
+/* Default size of path record hash table */
+#define DF_PATH_REC_HASH_SIZE 2047
+
+/* Default size of path group hash table */
+#define DF_PATH_GRP_HASH_SIZE 255
+
+/* Default size of CCT table. Must be multiple of 64 */
+#define DF_CCT_TABLE_SIZE 128
+
+/* CCT max IPD delay. */
+#define DF_CCT_MAX_IPD_DELAY_US 21
+
+/* CCA divisor shift */
+#define CCA_DIVISOR_SHIFT 14
+
+/* CCA ipd mask */
+#define CCA_IPD_MASK 0x3FFF
+
+/* Many of these are IBTA-specific defines that are available in other header
+ * files. They are listed here to minimize dependencies on the PSM build
+ * process. Most of this is used to implement IBTA compliance features in PSM,
+ * such as path record query.
+ */
+
+enum opa_mtu {
+	IBTA_MTU_256  = 1,
+	IBTA_MTU_512  = 2,
+	IBTA_MTU_1024 = 3,
+	IBTA_MTU_2048 = 4,
+	IBTA_MTU_4096 = 5,
+	OPA_MTU_8192  = 6,
+	OPA_MTU_10240 = 7,
+	IBTA_MTU_MIN  = IBTA_MTU_256,
+	OPA_MTU_MIN   = IBTA_MTU_256,
+	OPA_MTU_MAX   = IBTA_MTU_4096,
+};
+
+typedef enum psm_ibv_rate opa_rate;
+
+static inline int opa_mtu_enum_to_int(enum opa_mtu mtu)
+{
+	switch (mtu) {
+	case IBTA_MTU_256:
+		return 256;
+	case IBTA_MTU_512:
+		return 512;
+	case IBTA_MTU_1024:
+		return 1024;
+	case IBTA_MTU_2048:
+		return 2048;
+	case IBTA_MTU_4096:
+		return 4096;
+	case OPA_MTU_8192:
+		return 8192;
+	case OPA_MTU_10240:
+		return 10240;
+	default:
+		return -1;
+	}
+}
+
+static inline enum opa_mtu opa_mtu_int_to_enum(int mtu)
+{
+	// the PSM mtu may be slightly less than wire MTU to allow for
+	// PSM headers, so round up to nearest MTU enum
+	if (mtu <= 256)
+		return IBTA_MTU_256;
+	else if (mtu <= 512)
+		return IBTA_MTU_512;
+	else if (mtu <= 1024)
+		return IBTA_MTU_1024;
+	else if (mtu <= 2048)
+		return IBTA_MTU_2048;
+	else if (mtu <= 4096)
+		return IBTA_MTU_4096;
+// TBD if we should allow these values on standard verbs
+	else if (mtu <= 8192)
+		return OPA_MTU_8192;
+	else
+		return OPA_MTU_10240;
+}
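+
+// Note the two helpers are not strict inverses for values that are not exact
+// wire MTUs: e.g. opa_mtu_int_to_enum(4000) == IBTA_MTU_4096, while
+// opa_mtu_enum_to_int(IBTA_MTU_4096) == 4096.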
+
+/* This is the same as ib_path_rec from ib_types.h. Listed here to be
+ * self-contained to minimize dependencies during build etc.
+ */
+typedef struct _ibta_path_rec {
+	uint64_t service_id;	/* net order */
+	uint8_t dgid[16];
+	uint8_t sgid[16];
+	uint16_t dlid;		/* net order */
+	uint16_t slid;		/* net order */
+	uint32_t hop_flow_raw;	/* net order */
+	uint8_t tclass;
+	uint8_t num_path;
+	uint16_t pkey;		/* net order */
+	uint16_t qos_class_sl;	/* net order */
+	uint8_t mtu;		/* IBTA encoded */
+	uint8_t rate;		/* IBTA encoded */
+	uint8_t pkt_life;	/* IBTA encoded */
+	uint8_t preference;
+	uint8_t resv2[6];
+} ibta_path_rec_t;
+
+/*
+ * PSM IPS path record components for endpoint.
+ *
+ * For Torus/non-zero LMC fabrics, pr_slid and pr_dlid may be different from
+ * the "base lid" values for this connection.
+ */
+struct ips_proto;
+
+typedef struct ips_path_rec {
+	uint16_t pr_slid;
+	uint16_t pr_dlid;
+	uint16_t pr_mtu;	/* PSM payload in bytes, < Path's MTU */
+	uint16_t pr_pkey;
+	uint8_t pr_sl;
+	uint8_t pr_static_rate;	// psm_ibv_rate enum
+	uint16_t pr_ip_hi;	// high 16 bits of IP address for ethernet
+						// and low 16 are in pr_dlid
+
+	// address handle for UD comms
+	struct ibv_ah *ah;
+#ifdef RNDV_MOD
+	psm2_rv_conn_t rv_conn;
+	uint8_t connecting;
+#endif
+
+} ips_path_rec_t;
+
+psm2_error_t ips_opp_init(struct ips_proto *proto);
+psm2_error_t ips_make_ah(psm2_ep_t ep, ips_path_rec_t *path_rec);
+psm2_error_t ips_path_rec_to_ah_attr(psm2_ep_t ep,
+                const ips_path_rec_t *path_rec, struct ibv_ah_attr *ah_attr);
+#ifdef RNDV_MOD
+void ips_path_rec_to_ib_user_path_rec(psm2_ep_t ep,
+		const ips_path_rec_t *path_rec, union ibv_gid *dgid,
+		struct ib_user_path_rec *path);
+#endif
+
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto.c b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto.c
new file mode 100644
index 0000000000000000000000000000000000000000..f0019798eea37c22b608c46adb78a5626cb27727
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto.c
@@ -0,0 +1,1827 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+/*
+ * IPS - Interconnect Protocol Stack.
+ */
+
+#include <assert.h>
+#include <sys/uio.h>		/* writev */
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "ips_proto_help.h"
+#include "psmi_wrappers.h"
+#include "psm_mq_internal.h"
+
+#ifdef PSM_CUDA
+#include "psm_gdrcpy.h"
+#endif
+
+/*
+ * Control message types have their own flag to determine whether a message of
+ * that type is queued or not.  These flags are kept in a state bitfield.
+ */
+#define CTRL_MSG_ACK_QUEUED                     0x0001
+#define CTRL_MSG_NAK_QUEUED                     0x0002
+#define CTRL_MSG_BECN_QUEUED                    0x0004
+#define CTRL_MSG_ERR_CHK_QUEUED                 0x0008
+// reserved                                     0x0010
+#define CTRL_MSG_CONNECT_REQUEST_QUEUED		0x0020
+#define CTRL_MSG_CONNECT_REPLY_QUEUED		0x0040
+#define CTRL_MSG_DISCONNECT_REQUEST_QUEUED	0x0080
+#define CTRL_MSG_DISCONNECT_REPLY_QUEUED	0x0100
+
+#ifdef PSM_CUDA
+uint32_t gpudirect_send_limit;
+uint32_t gpudirect_recv_limit;
+#endif
+
+static void ctrlq_init(struct ips_ctrlq *ctrlq, struct ips_proto *proto);
+static psm2_error_t proto_sdma_init(struct ips_proto *proto,
+				   const psmi_context_t *context);
+
+#ifdef PSM_CUDA
+void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj)
+{
+	struct ips_cuda_hostbuf *icb = (struct ips_cuda_hostbuf *)obj;
+	if (is_alloc) {
+		icb->host_buf = NULL;
+		icb->copy_status = NULL;
+	} else {
+		if (icb->host_buf != NULL) {
+			PSMI_CUDA_CALL(cuMemFreeHost, icb->host_buf);
+		}
+		if (icb->copy_status != NULL) {
+			PSMI_CUDA_CALL(cuEventDestroy, icb->copy_status);
+		}
+	}
+	return;
+}
+#endif
+
+static uint64_t verbs_ep_send_num_free(void *context)
+{
+	struct psm2_verbs_ep *vep = &((struct ips_proto *)context)->ep->verbs_ep;
+	return vep->send_allocator.pool->send_num_free;
+}
+
+static uint64_t verbs_ep_send_rdma_outstanding(void *context)
+{
+	struct psm2_verbs_ep *vep = &((struct ips_proto *)context)->ep->verbs_ep;
+	return vep->send_rdma_outstanding;
+}
+
+psm2_error_t
+ips_proto_init(const psmi_context_t *context, const ptl_t *ptl,
+	       int num_of_send_bufs, int num_of_send_desc, uint32_t imm_size,
+	       const struct psmi_timer_ctrl *timerq,
+	       const struct ips_epstate *epstate,
+	       void *spioc, struct ips_proto *proto)
+{
+	uint32_t protoexp_flags, cksum_sz;
+	union psmi_envvar_val env_tid, env_cksum, env_mtu;
+	psm2_error_t err = PSM2_OK;
+
+	/*
+	 * Checksum packets within PSM. Default is off.
+	 * This is heavy weight and done in software so not recommended for
+	 * production runs.
+	 */
+
+	psmi_getenv("PSM3_CHECKSUM",
+		    "Enable checksum of messages (0 disables checksum)",
+		    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+		    (union psmi_envvar_val)0, &env_cksum);
+
+	memset(proto, 0, sizeof(struct ips_proto));
+	proto->ptl = (ptl_t *) ptl;
+	proto->ep = context->ep;	/* cached */
+	proto->mq = context->ep->mq;	/* cached */
+	proto->pend_sends.proto = proto;
+	psmi_timer_entry_init(&proto->pend_sends.timer,
+			      ips_proto_timer_pendq_callback,
+			      &proto->pend_sends);
+	STAILQ_INIT(&proto->pend_sends.pendq);
+	proto->epstate = (struct ips_epstate *)epstate;
+	proto->timerq = (struct psmi_timer_ctrl *)timerq;
+	proto->spioc = spioc;
+
+	// hash for dispersive routing
+	proto->epinfo.ep_hash = context->ep->verbs_ep.qp->qp_num;// low 8b only
+
+	/* If checksums enabled we insert checksum at end of packet */
+	cksum_sz = env_cksum.e_uint ? PSM_CRC_SIZE_IN_BYTES : 0;
+	proto->epinfo.ep_mtu = context->ep->mtu;
+	/* Reserve room for the trailing checksum */
+	proto->epinfo.ep_mtu -= cksum_sz;
+
+	/* See if user specifies a lower MTU to use */
+	if (!psmi_getenv("PSM3_MTU",
+		"Upper bound on packet MTU (<=0 uses port MTU): 1-5,256,512,1024,2048,4096]",
+	     PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
+	     (union psmi_envvar_val)-1, &env_mtu)) {
+		if (env_mtu.e_int >= OPA_MTU_MIN && env_mtu.e_int <= OPA_MTU_MAX) //enum
+			env_mtu.e_int = opa_mtu_enum_to_int((enum opa_mtu)env_mtu.e_int);
+		else if (env_mtu.e_int < OPA_MTU_MIN) // pick default
+			env_mtu.e_int = 8192;
+		else // wash through enum to force round up to next valid MTU
+			env_mtu.e_int = opa_mtu_enum_to_int(opa_mtu_int_to_enum(env_mtu.e_int));
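+		// e.g. PSM3_MTU=3 selects 1024 bytes, while PSM3_MTU=5000 is
+		// rounded up through the enums to 8192.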
+		if (proto->epinfo.ep_mtu > env_mtu.e_int) {
+			proto->epinfo.ep_mtu = env_mtu.e_int;
+			proto->epinfo.ep_mtu -= MAX_PSM_HEADER;
+		}
+	}
+	// ep_mtu is our final choice of local PSM payload we can support, save it
+	// back to ep->mtu
+	proto->ep->mtu = proto->epinfo.ep_mtu;
+
+	// create and size the buffer pools based on the selected ep->mtu
+	err = __psm2_ep_initialize_queues(proto->ep);
+	if (err)
+		goto fail;
+
+
+	proto->timeout_send = us_2_cycles(IPS_PROTO_SPIO_RETRY_US_DEFAULT);
+	proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = ~0U;
+#ifdef PSM_CUDA
+	proto->iovec_gpu_thresh_eager = proto->iovec_gpu_thresh_eager_blocking = ~0U;
+#endif
+	proto->t_init = get_cycles();
+	proto->t_fini = 0;
+	proto->flags = env_cksum.e_uint ? IPS_PROTO_FLAG_CKSUM : 0;
+	proto->runid_key = getpid();
+
+	proto->num_connected_outgoing = 0;
+	proto->num_connected_incoming = 0;
+	proto->num_disconnect_requests = 0;
+	proto->stray_warn_interval = (uint64_t) -1;
+	proto->done_warning = 0;
+	proto->done_once = 0;
+	proto->num_bogus_warnings = 0;
+	proto->psmi_logevent_tid_send_reqs.interval_secs = 15;
+	proto->psmi_logevent_tid_send_reqs.next_warning = 0;
+	proto->psmi_logevent_tid_send_reqs.count = 0;
+
+	{
+		/* threshold for multirail load balancing */
+		union psmi_envvar_val env_thresh_load_balance;
+
+		psmi_getenv("PSM3_MULTIRAIL_THRESH_LOAD_BALANCE",
+			    "Min packet size at which load balance for multi-rail (default is 0)",
+			    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+			    (union psmi_envvar_val)0,
+			    &env_thresh_load_balance);
+		proto->multirail_thresh_load_balance = env_thresh_load_balance.e_uint;
+	}
+
+	/* Initialize IBTA related stuff (path record, SL2VL, CCA etc.) */
+	if ((err = ips_ibta_init(proto)))
+		goto fail;
+
+	{
+		/* Does the user ask for NIC loopback? */
+		union psmi_envvar_val env_loopback;
+
+		psmi_getenv("PSM3_NIC_LOOPBACK",
+			"PSM uses NIC loopback (default is disabled i.e. 0)",
+			PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+			(union psmi_envvar_val)0, /* Disabled by default */
+			&env_loopback);
+
+		if (env_loopback.e_uint)
+			proto->flags |= IPS_PROTO_FLAG_LOOPBACK;
+	}
+
+
+	{
+		/* Disable coalesced ACKs? */
+		union psmi_envvar_val env_coalesce_acks;
+
+		psmi_getenv("PSM3_COALESCE_ACKS", "Coalesce ACKs on the wire (default is enabled i.e. 1)", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)1,	/* Enabled by default */
+			    &env_coalesce_acks);
+
+		if (env_coalesce_acks.e_uint)
+			proto->flags |= IPS_PROTO_FLAG_COALESCE_ACKS;
+	}
+
+	{
+		/* Number of credits per flow */
+		union psmi_envvar_val env_flow_credits;
+		int df_flow_credits = min(PSM2_FLOW_CREDITS, num_of_send_desc);
+
+		psmi_getenv("PSM3_FLOW_CREDITS",
+			    "Number of unacked packets (credits) per flow (default is 64)",
+			    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+			    (union psmi_envvar_val)df_flow_credits,
+			    &env_flow_credits);
+		proto->flow_credits = env_flow_credits.e_uint;
+	}
+
+	/*
+	 * Pre-calculate the PSN mask to support 31 bit PSN.
+	 */
+	proto->psn_mask = 0x7FFFFFFF;
+
+	/*
+	 * Initialize SDMA, otherwise, turn on all PIO.
+	 */
+	// initialize sdma after PSM3_MR_CACHE_MODE
+	proto->flags |= IPS_PROTO_FLAG_SPIO;
+
+	/*
+	 * Setup the protocol wide short message ep flow.
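+	 * All protocol-wide control/short messages share this single
+	 * go-back-N PIO flow (as the EP_FLOW_GO_BACK_N_PIO name suggests).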
+	 */
+	proto->msgflowid = EP_FLOW_GO_BACK_N_PIO;
+
+	/*
+	 * Clone sendreq mpool configuration for pend sends config
+	 */
+	{
+		uint32_t chunks, maxsz;
+
+		psmi_assert_always(proto->ep->mq->sreq_pool != NULL);
+		psmi_mpool_get_obj_info(proto->ep->mq->sreq_pool, &chunks,
+					&maxsz);
+
+		proto->pend_sends_pool =
+		    psmi_mpool_create(sizeof(struct ips_pend_sreq), chunks,
+				      maxsz, 0, DESCRIPTORS, NULL, NULL);
+		if (proto->pend_sends_pool == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+	}
+
+	/*
+	 * Create a pool of CCA timers for path_rec. The number of timers
+	 * should not exceed the scb count, num_of_send_desc (default 4K).
+	 */
+	{
+		uint32_t chunks, maxsz;
+
+		chunks = 256;
+		maxsz = num_of_send_desc;
+
+		proto->timer_pool =
+		    psmi_mpool_create(sizeof(struct psmi_timer), chunks, maxsz,
+				      0, DESCRIPTORS, NULL, NULL);
+		if (proto->timer_pool == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+	}
+
+	/*
+	 * Register ips protocol statistics
+	 *
+	 * We put a (*) in the output to denote stats that may cause a drop in
+	 * performance.
+	 *
+	 * We put a (**) in the output of those stats that "should never happen"
+	 */
+	{
+
+		struct psmi_stats_entry entries[] = {
+			PSMI_STATS_DECLU64("pio_busy_count",
+					   &proto->stats.pio_busy_cnt),
+			PSMI_STATS_DECLU64("pio_no_flow_credits",
+					   &proto->stats.pio_no_flow_credits),
+			PSMI_STATS_DECLU64("post_send_fail",
+					   &proto->stats.post_send_fail),
+			PSMI_STATS_DECL_FUNC("ud_sbuf_free",
+					   verbs_ep_send_num_free),
+			PSMI_STATS_DECL_FUNC("send_rdma_outstanding",
+					   verbs_ep_send_rdma_outstanding),
+
+			/* Throttling by kernel */
+			PSMI_STATS_DECLU64("writev_busy_cnt",
+					   &proto->stats.writev_busy_cnt),
+			PSMI_STATS_DECLU64("scb_unavail_eager_count",
+					   &proto->stats.scb_egr_unavail_cnt),
+			PSMI_STATS_DECLU64("unknown_packets_(**)",	/* shouldn't happen */
+					   &proto->stats.unknown_packets),
+			PSMI_STATS_DECLU64("stray_packets_(*)",
+					   &proto->stats.stray_packets),
+			PSMI_STATS_DECLU64("err_chk_send",
+					   &proto->epaddr_stats.err_chk_send),
+			PSMI_STATS_DECLU64("err_chk_recv",
+					   &proto->epaddr_stats.err_chk_recv),
+#ifdef RNDV_MOD
+			PSMI_STATS_DECLU64("err_chk_rdma_send",
+					   &proto->epaddr_stats.err_chk_rdma_send),
+			PSMI_STATS_DECLU64("err_chk_rdma_recv",
+					   &proto->epaddr_stats.err_chk_rdma_recv),
+			PSMI_STATS_DECLU64("err_chk_rdma_resp_send",
+					   &proto->epaddr_stats.err_chk_rdma_resp_send),
+			PSMI_STATS_DECLU64("err_chk_rdma_resp_recv",
+					   &proto->epaddr_stats.err_chk_rdma_resp_recv),
+#endif
+			PSMI_STATS_DECLU64("nak_send",
+					   &proto->epaddr_stats.nak_send),
+			PSMI_STATS_DECLU64("nak_recv",
+					   &proto->epaddr_stats.nak_recv),
+			PSMI_STATS_DECLU64("connect_req_send",
+					   &proto->epaddr_stats.connect_req_send),
+			PSMI_STATS_DECLU64("connect_req_recv",
+					   &proto->epaddr_stats.connect_req_recv),
+			PSMI_STATS_DECLU64("connect_rep_send",
+					   &proto->epaddr_stats.connect_rep_send),
+			PSMI_STATS_DECLU64("connect_rep_recv",
+					   &proto->epaddr_stats.connect_rep_recv),
+			PSMI_STATS_DECLU64("disconnect_req_send",
+					   &proto->epaddr_stats.disconnect_req_send),
+			PSMI_STATS_DECLU64("disconnect_req_recv",
+					   &proto->epaddr_stats.disconnect_req_recv),
+			PSMI_STATS_DECLU64("disconnect_rep_send",
+					   &proto->epaddr_stats.disconnect_rep_send),
+			PSMI_STATS_DECLU64("disconnect_rep_recv",
+					   &proto->epaddr_stats.disconnect_rep_recv),
+			PSMI_STATS_DECLU64("rts_send",
+					   &proto->epaddr_stats.rts_send),
+			PSMI_STATS_DECLU64("rts_recv",
+					   &proto->epaddr_stats.rts_recv),
+			PSMI_STATS_DECLU64("cts_long_data_send",
+					   &proto->epaddr_stats.cts_long_data_send),
+			PSMI_STATS_DECLU64("cts_long_data_recv",
+					   &proto->epaddr_stats.cts_long_data_recv),
+			PSMI_STATS_DECLU64("cts_rdma_send",
+					   &proto->epaddr_stats.cts_rdma_send),
+			PSMI_STATS_DECLU64("cts_rdma_recv",
+					   &proto->epaddr_stats.cts_rdma_recv),
+			PSMI_STATS_DECLU64("send_rexmit",
+					   &proto->epaddr_stats.send_rexmit),
+#ifdef RNDV_MOD
+			PSMI_STATS_DECLU64("rdma_rexmit",
+					   &proto->epaddr_stats.rdma_rexmit),
+#endif
+			PSMI_STATS_DECLU64("tiny_cpu_isend",
+					   &proto->strat_stats.tiny_cpu_isend),
+			PSMI_STATS_DECLU64("tiny_cpu_isend_bytes",
+					   &proto->strat_stats.tiny_cpu_isend_bytes),
+#ifdef PSM_CUDA
+			PSMI_STATS_DECLU64("tiny_gdrcopy_isend",
+					   &proto->strat_stats.tiny_gdrcopy_isend),
+			PSMI_STATS_DECLU64("tiny_gdrcopy_isend_bytes",
+					   &proto->strat_stats.tiny_gdrcopy_isend_bytes),
+			PSMI_STATS_DECLU64("tiny_cuCopy_isend",
+					   &proto->strat_stats.tiny_cuCopy_isend),
+			PSMI_STATS_DECLU64("tiny_cuCopy_isend_bytes",
+					   &proto->strat_stats.tiny_cuCopy_isend_bytes),
+#endif
+			PSMI_STATS_DECLU64("tiny_cpu_send",
+					   &proto->strat_stats.tiny_cpu_send),
+			PSMI_STATS_DECLU64("tiny_cpu_send_bytes",
+					   &proto->strat_stats.tiny_cpu_send_bytes),
+#ifdef PSM_CUDA
+			PSMI_STATS_DECLU64("tiny_gdrcopy_send",
+					   &proto->strat_stats.tiny_gdrcopy_send),
+			PSMI_STATS_DECLU64("tiny_gdrcopy_send_bytes",
+					   &proto->strat_stats.tiny_gdrcopy_send_bytes),
+			PSMI_STATS_DECLU64("tiny_cuCopy_send",
+					   &proto->strat_stats.tiny_cuCopy_send),
+			PSMI_STATS_DECLU64("tiny_cuCopy_send_bytes",
+					   &proto->strat_stats.tiny_cuCopy_send_bytes),
+#endif
+			PSMI_STATS_DECLU64("tiny_cpu_recv",
+					   &proto->strat_stats.tiny_cpu_recv),
+			PSMI_STATS_DECLU64("tiny_cpu_recv_bytes",
+					   &proto->strat_stats.tiny_cpu_recv_bytes),
+			PSMI_STATS_DECLU64("tiny_sysbuf_recv",
+					   &proto->strat_stats.tiny_sysbuf_recv),
+			PSMI_STATS_DECLU64("tiny_sysbuf_recv_bytes",
+					   &proto->strat_stats.tiny_sysbuf_recv_bytes),
+#ifdef PSM_CUDA
+			PSMI_STATS_DECLU64("tiny_gdrcopy_recv",
+					   &proto->strat_stats.tiny_gdrcopy_recv),
+			PSMI_STATS_DECLU64("tiny_gdrcopy_recv_bytes",
+					   &proto->strat_stats.tiny_gdrcopy_recv_bytes),
+			PSMI_STATS_DECLU64("tiny_cuCopy_recv",
+					   &proto->strat_stats.tiny_cuCopy_recv),
+			PSMI_STATS_DECLU64("tiny_cuCopy_recv_bytes",
+					   &proto->strat_stats.tiny_cuCopy_recv_bytes),
+#endif
+
+			PSMI_STATS_DECLU64("short_copy_cpu_isend",
+					   &proto->strat_stats.short_copy_cpu_isend),
+			PSMI_STATS_DECLU64("short_copy_cpu_isend_bytes",
+					   &proto->strat_stats.short_copy_cpu_isend_bytes),
+			PSMI_STATS_DECLU64("short_dma_cpu_isend",
+					   &proto->strat_stats.short_dma_cpu_isend),
+			PSMI_STATS_DECLU64("short_dma_cpu_isend_bytes",
+					   &proto->strat_stats.short_dma_cpu_isend_bytes),
+#ifdef PSM_CUDA
+			PSMI_STATS_DECLU64("short_gdrcopy_isend",
+					   &proto->strat_stats.short_gdrcopy_isend),
+			PSMI_STATS_DECLU64("short_gdrcopy_isend_bytes",
+					   &proto->strat_stats.short_gdrcopy_isend_bytes),
+			PSMI_STATS_DECLU64("short_cuCopy_isend",
+					   &proto->strat_stats.short_cuCopy_isend),
+			PSMI_STATS_DECLU64("short_cuCopy_isend_bytes",
+					   &proto->strat_stats.short_cuCopy_isend_bytes),
+			PSMI_STATS_DECLU64("short_gdr_isend",
+					   &proto->strat_stats.short_gdr_isend),
+			PSMI_STATS_DECLU64("short_gdr_isend_bytes",
+					   &proto->strat_stats.short_gdr_isend_bytes),
+#endif
+			PSMI_STATS_DECLU64("short_copy_cpu_send",
+					   &proto->strat_stats.short_copy_cpu_send),
+			PSMI_STATS_DECLU64("short_copy_cpu_send_bytes",
+					   &proto->strat_stats.short_copy_cpu_send_bytes),
+			PSMI_STATS_DECLU64("short_dma_cpu_send",
+					   &proto->strat_stats.short_dma_cpu_send),
+			PSMI_STATS_DECLU64("short_dma_cpu_send_bytes",
+					   &proto->strat_stats.short_dma_cpu_send_bytes),
+#ifdef PSM_CUDA
+			PSMI_STATS_DECLU64("short_gdrcopy_send",
+					   &proto->strat_stats.short_gdrcopy_send),
+			PSMI_STATS_DECLU64("short_gdrcopy_send_bytes",
+					   &proto->strat_stats.short_gdrcopy_send_bytes),
+			PSMI_STATS_DECLU64("short_cuCopy_send",
+					   &proto->strat_stats.short_cuCopy_send),
+			PSMI_STATS_DECLU64("short_cuCopy_send_bytes",
+					   &proto->strat_stats.short_cuCopy_send_bytes),
+			PSMI_STATS_DECLU64("short_gdr_send",
+					   &proto->strat_stats.short_gdr_send),
+			PSMI_STATS_DECLU64("short_gdr_send_bytes",
+					   &proto->strat_stats.short_gdr_send_bytes),
+#endif
+
+			PSMI_STATS_DECLU64("short_cpu_recv",
+					   &proto->strat_stats.short_cpu_recv),
+			PSMI_STATS_DECLU64("short_cpu_recv_bytes",
+					   &proto->strat_stats.short_cpu_recv_bytes),
+			PSMI_STATS_DECLU64("short_sysbuf_recv",
+					   &proto->strat_stats.short_sysbuf_recv),
+			PSMI_STATS_DECLU64("short_sysbuf_recv_bytes",
+					   &proto->strat_stats.short_sysbuf_recv_bytes),
+#ifdef PSM_CUDA
+			PSMI_STATS_DECLU64("short_gdrcopy_recv",
+					   &proto->strat_stats.short_gdrcopy_recv),
+			PSMI_STATS_DECLU64("short_gdrcopy_recv_bytes",
+					   &proto->strat_stats.short_gdrcopy_recv_bytes),
+			PSMI_STATS_DECLU64("short_cuCopy_recv",
+					   &proto->strat_stats.short_cuCopy_recv),
+			PSMI_STATS_DECLU64("short_cuCopy_recv_bytes",
+					   &proto->strat_stats.short_cuCopy_recv_bytes),
+#endif
+
+			PSMI_STATS_DECLU64("eager_copy_cpu_isend",
+					   &proto->strat_stats.eager_copy_cpu_isend),
+			PSMI_STATS_DECLU64("eager_copy_cpu_isend_bytes",
+					   &proto->strat_stats.eager_copy_cpu_isend_bytes),
+			PSMI_STATS_DECLU64("eager_dma_cpu_isend",
+					   &proto->strat_stats.eager_dma_cpu_isend),
+			PSMI_STATS_DECLU64("eager_dma_cpu_isend_bytes",
+					   &proto->strat_stats.eager_dma_cpu_isend_bytes),
+#ifdef PSM_CUDA
+			PSMI_STATS_DECLU64("eager_cuCopy_isend",
+					   &proto->strat_stats.eager_cuCopy_isend),
+			PSMI_STATS_DECLU64("eager_cuCopy_isend_bytes",
+					   &proto->strat_stats.eager_cuCopy_isend_bytes),
+			PSMI_STATS_DECLU64("eager_gdr_isend",
+					   &proto->strat_stats.eager_gdr_isend),
+			PSMI_STATS_DECLU64("eager_gdr_isend_bytes",
+					   &proto->strat_stats.eager_gdr_isend_bytes),
+#endif
+			PSMI_STATS_DECLU64("eager_copy_cpu_send",
+					   &proto->strat_stats.eager_copy_cpu_send),
+			PSMI_STATS_DECLU64("eager_copy_cpu_send_bytes",
+					   &proto->strat_stats.eager_copy_cpu_send_bytes),
+			PSMI_STATS_DECLU64("eager_dma_cpu_send",
+					   &proto->strat_stats.eager_dma_cpu_send),
+			PSMI_STATS_DECLU64("eager_dma_cpu_send_bytes",
+					   &proto->strat_stats.eager_dma_cpu_send_bytes),
+#ifdef PSM_CUDA
+			PSMI_STATS_DECLU64("eager_cuCopy_send",
+					   &proto->strat_stats.eager_cuCopy_send),
+			PSMI_STATS_DECLU64("eager_cuCopy_send_bytes",
+					   &proto->strat_stats.eager_cuCopy_send_bytes),
+			PSMI_STATS_DECLU64("eager_gdr_send",
+					   &proto->strat_stats.eager_gdr_send),
+			PSMI_STATS_DECLU64("eager_gdr_send_bytes",
+					   &proto->strat_stats.eager_gdr_send_bytes),
+#endif
+
+			PSMI_STATS_DECLU64("eager_cpu_recv",
+					   &proto->strat_stats.eager_cpu_recv),
+			PSMI_STATS_DECLU64("eager_cpu_recv_bytes",
+					   &proto->strat_stats.eager_cpu_recv_bytes),
+			PSMI_STATS_DECLU64("eager_sysbuf_recv",
+					   &proto->strat_stats.eager_sysbuf_recv),
+			PSMI_STATS_DECLU64("eager_sysbuf_recv_bytes",
+					   &proto->strat_stats.eager_sysbuf_recv_bytes),
+#ifdef PSM_CUDA
+			PSMI_STATS_DECLU64("eager_gdrcopy_recv",
+					   &proto->strat_stats.eager_gdrcopy_recv),
+			PSMI_STATS_DECLU64("eager_gdrcopy_recv_bytes",
+					   &proto->strat_stats.eager_gdrcopy_recv_bytes),
+			PSMI_STATS_DECLU64("eager_cuCopy_recv",
+					   &proto->strat_stats.eager_cuCopy_recv),
+			PSMI_STATS_DECLU64("eager_cuCopy_recv_bytes",
+					   &proto->strat_stats.eager_cuCopy_recv_bytes),
+#endif
+
+			PSMI_STATS_DECLU64("rndv_cpu_isend",
+					   &proto->strat_stats.rndv_cpu_isend),
+			PSMI_STATS_DECLU64("rndv_cpu_isend_bytes",
+					   &proto->strat_stats.rndv_cpu_isend_bytes),
+#ifdef PSM_CUDA
+			PSMI_STATS_DECLU64("rndv_gpu_isend",
+					   &proto->strat_stats.rndv_gpu_isend),
+			PSMI_STATS_DECLU64("rndv_gpu_isend_bytes",
+					   &proto->strat_stats.rndv_gpu_isend_bytes),
+#endif
+			PSMI_STATS_DECLU64("rndv_cpu_send",
+					   &proto->strat_stats.rndv_cpu_send),
+			PSMI_STATS_DECLU64("rndv_cpu_send_bytes",
+					   &proto->strat_stats.rndv_cpu_send_bytes),
+#ifdef PSM_CUDA
+			PSMI_STATS_DECLU64("rndv_gpu_send",
+					   &proto->strat_stats.rndv_gpu_send),
+			PSMI_STATS_DECLU64("rndv_gpu_send_bytes",
+					   &proto->strat_stats.rndv_gpu_send_bytes),
+#endif
+
+			PSMI_STATS_DECLU64("rndv_rts_cpu_recv",
+					   &proto->strat_stats.rndv_rts_cpu_recv),
+			PSMI_STATS_DECLU64("rndv_rts_cpu_recv_bytes",
+					   &proto->strat_stats.rndv_rts_cpu_recv_bytes),
+			PSMI_STATS_DECLU64("rndv_rts_sysbuf_recv",
+					   &proto->strat_stats.rndv_rts_sysbuf_recv),
+			PSMI_STATS_DECLU64("rndv_rts_sysbuf_recv_bytes",
+					   &proto->strat_stats.rndv_rts_sysbuf_recv_bytes),
+#ifdef PSM_CUDA
+			PSMI_STATS_DECLU64("rndv_rts_cuCopy_recv",
+					   &proto->strat_stats.rndv_rts_cuCopy_recv),
+			PSMI_STATS_DECLU64("rndv_rts_cuCopy_recv_bytes",
+					   &proto->strat_stats.rndv_rts_cuCopy_recv_bytes),
+#endif
+			PSMI_STATS_DECLU64("rndv_rts_copy_cpu_send",
+					   &proto->strat_stats.rndv_rts_copy_cpu_send),
+			PSMI_STATS_DECLU64("rndv_rts_copy_cpu_send_bytes",
+					   &proto->strat_stats.rndv_rts_copy_cpu_send_bytes),
+
+			PSMI_STATS_DECLU64("rndv_long_cpu_recv",
+					   &proto->strat_stats.rndv_long_cpu_recv),
+			PSMI_STATS_DECLU64("rndv_long_cpu_recv_bytes",
+					   &proto->strat_stats.rndv_long_cpu_recv_bytes),
+#ifdef PSM_CUDA
+			PSMI_STATS_DECLU64("rndv_long_cuCopy_recv",
+					   &proto->strat_stats.rndv_long_cuCopy_recv),
+			PSMI_STATS_DECLU64("rndv_long_cuCopy_recv_bytes",
+					   &proto->strat_stats.rndv_long_cuCopy_recv_bytes),
+			PSMI_STATS_DECLU64("rndv_long_gdr_recv",
+					   &proto->strat_stats.rndv_long_gdr_recv),
+			PSMI_STATS_DECLU64("rndv_long_gdr_recv_bytes",
+					   &proto->strat_stats.rndv_long_gdr_recv_bytes),
+#endif
+
+			PSMI_STATS_DECLU64("rndv_long_copy_cpu_send",
+					   &proto->strat_stats.rndv_long_copy_cpu_send),
+			PSMI_STATS_DECLU64("rndv_long_copy_cpu_send_bytes",
+					   &proto->strat_stats.rndv_long_copy_cpu_send_bytes),
+			PSMI_STATS_DECLU64("rndv_long_dma_cpu_send",
+					   &proto->strat_stats.rndv_long_dma_cpu_send),
+			PSMI_STATS_DECLU64("rndv_long_dma_cpu_send_bytes",
+					   &proto->strat_stats.rndv_long_dma_cpu_send_bytes),
+#ifdef PSM_CUDA
+			PSMI_STATS_DECLU64("rndv_long_cuCopy_send",
+					   &proto->strat_stats.rndv_long_cuCopy_send),
+			PSMI_STATS_DECLU64("rndv_long_cuCopy_send_bytes",
+					   &proto->strat_stats.rndv_long_cuCopy_send_bytes),
+			PSMI_STATS_DECLU64("rndv_long_gdrcopy_send",
+					   &proto->strat_stats.rndv_long_gdrcopy_send),
+			PSMI_STATS_DECLU64("rndv_long_gdrcopy_send_bytes",
+					   &proto->strat_stats.rndv_long_gdrcopy_send_bytes),
+			PSMI_STATS_DECLU64("rndv_long_gdr_send",
+					   &proto->strat_stats.rndv_long_gdr_send),
+			PSMI_STATS_DECLU64("rndv_long_gdr_send_bytes",
+					   &proto->strat_stats.rndv_long_gdr_send_bytes),
+#endif
+
+			PSMI_STATS_DECLU64("rndv_rdma_cpu_recv",
+					   &proto->strat_stats.rndv_rdma_cpu_recv),
+			PSMI_STATS_DECLU64("rndv_rdma_cpu_recv_bytes",
+					   &proto->strat_stats.rndv_rdma_cpu_recv_bytes),
+#ifdef PSM_CUDA
+			PSMI_STATS_DECLU64("rndv_rdma_gdr_recv",
+					   &proto->strat_stats.rndv_rdma_gdr_recv),
+			PSMI_STATS_DECLU64("rndv_rdma_gdr_recv_bytes",
+					   &proto->strat_stats.rndv_rdma_gdr_recv_bytes),
+			PSMI_STATS_DECLU64("rndv_rdma_hbuf_recv",
+					   &proto->strat_stats.rndv_rdma_hbuf_recv),
+			PSMI_STATS_DECLU64("rndv_rdma_hbuf_recv_bytes",
+					   &proto->strat_stats.rndv_rdma_hbuf_recv_bytes),
+#endif
+			PSMI_STATS_DECLU64("rndv_rdma_cpu_send",
+					   &proto->strat_stats.rndv_rdma_cpu_send),
+			PSMI_STATS_DECLU64("rndv_rdma_cpu_send_bytes",
+					   &proto->strat_stats.rndv_rdma_cpu_send_bytes),
+#ifdef PSM_CUDA
+			PSMI_STATS_DECLU64("rndv_rdma_gdr_send",
+					   &proto->strat_stats.rndv_rdma_gdr_send),
+			PSMI_STATS_DECLU64("rndv_rdma_gdr_send_bytes",
+					   &proto->strat_stats.rndv_rdma_gdr_send_bytes),
+			PSMI_STATS_DECLU64("rndv_rdma_hbuf_send",
+					   &proto->strat_stats.rndv_rdma_hbuf_send),
+			PSMI_STATS_DECLU64("rndv_rdma_hbuf_send_bytes",
+					   &proto->strat_stats.rndv_rdma_hbuf_send_bytes),
+#endif
+		};
+
+		err =
+		    psmi_stats_register_type
+		    ("PSM_low-level_protocol_stats",
+		     PSMI_STATSTYPE_IPSPROTO, entries,
+		     PSMI_STATS_HOWMANY(entries), proto->ep->epid, proto,
+		     proto->ep->dev_name);
+		if (err != PSM2_OK)
+			goto fail;
+	}
+
+	/*
+	 * Control Queue and messaging
+	 */
+	ctrlq_init(&proto->ctrlq, proto);
+
+	/*
+	 * Receive-side handling
+	 */
+	if ((err = ips_proto_recv_init(proto)))
+		goto fail;
+
+	/* If progress thread is enabled, set the proto flag */
+	{
+		if (psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD))
+			proto->flags |= IPS_PROTO_FLAG_RCVTHREAD;
+	}
+
+	/*
+	 * Eager buffers.  We don't care to receive a callback when eager buffers
+	 * are newly released since we actively poll for new bufs.
+	 */
+	{
+		/* configure PSM bounce buffer size */
+		union psmi_envvar_val env_bbs;
+
+		psmi_getenv("PSM3_BOUNCE_SZ",
+			"PSM send bounce buffer size (default is 8192B)",
+			PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+			(union psmi_envvar_val)8192,
+			&env_bbs);
+
+		proto->scb_bufsize = env_bbs.e_uint;
+	}
+
+	if ((err = ips_scbctrl_init(context, num_of_send_desc,
+				    num_of_send_bufs, imm_size,
+				    proto->scb_bufsize, NULL, NULL,
+				    &proto->scbc_egr)))
+		goto fail;
+
+	/*
+	 * Expected protocol handling.
+	 * If we enable tid-based expected rendezvous, the expected protocol code
+	 * handles its own rv scb buffers.  If not, we have to enable eager-based
+	 * rendezvous and we allocate scb buffers for it.
+	 * For UD PSM3_RDMA (ep->rdmamode) controls our use of RDMA for Rendezvous
+	 * For STL100 PSM3_TID controls use of EXPTID for Rendezvous
+	 */
+	env_tid.e_uint = proto->ep->rdmamode;	// PSM3_RDMA
+	protoexp_flags = env_tid.e_uint;
+
+	// protoexp implements RDMA for UD and TID for STL100 native.  N/A to UDP
+	// when proto->protoexp is NULL, we will not attempt to use TID nor RDMA
+	{
+		// for UD, even when RDMA is enabled, we may fall back to LONG_DATA
+		// in which case we want the scbc_rv scb's so we don't exhaust the
+		// scbc_egr pool
+		proto->scbc_rv = (struct ips_scbctrl *)
+		    psmi_calloc(proto->ep, DESCRIPTORS,
+				1, sizeof(struct ips_scbctrl));
+		if (proto->scbc_rv == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+		/*
+		 * Rendezvous buffers. We want to get a callback for rendezvous bufs
+		 * since we asynchronously try to make progress on these sends and only
+		 * schedule them on the timerq if there are pending sends and available
+		 * bufs.
+		 */
+		if ((err =
+		     ips_scbctrl_init(context, num_of_send_desc,
+				      0 /* no bufs */ ,
+				      0, 0 /* bufsize==0 */ ,
+				      ips_proto_rv_scbavail_callback,
+				      proto, proto->scbc_rv)))
+			goto fail;
+	}
+	if (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) {
+#ifdef PSM_CUDA
+		proto->cudastream_send = NULL;
+#endif
+		if ((err = ips_protoexp_init(context, proto, protoexp_flags,
+					     num_of_send_bufs, num_of_send_desc,
+					     &proto->protoexp)))
+			goto fail;
+	} else {
+		proto->protoexp = NULL;
+	}
+
+
+	/* Active Message interface. AM requests compete with MQ for eager
+	 * buffers, since requests establish the amount of buffering in the
+	 * network (maximum number of requests in flight). The AM init function
+	 * does not allow the number of send buffers to be set separately from
+	 * the number of send descriptors, because otherwise it would have to
+	 * impose extremely arcane constraints on the relative amounts to avoid
+	 * a deadlock scenario. Thus, it handles it internally. The constraint
+	 * is: In a node pair, the number of reply send buffers on at least one
+	 * of the nodes must be at least double the number (optimal: double + 1)
+	 * of send descriptors on the other node. */
+	if ((err = ips_proto_am_init(proto,
+				     min(num_of_send_bufs, num_of_send_desc),
+				     imm_size,
+				     &proto->proto_am)))
+		goto fail;
+
+#if 0
+	if (!host_pid) {
+		char ipbuf[INET_ADDRSTRLEN], *p;
+		host_pid = (uint32_t) getpid();
+		host_ipv4addr = psmi_get_ipv4addr();	/* already be */
+		if (host_ipv4addr == 0) {
+			_HFI_DBG("Unable to obtain local IP address, "
+				 "not fatal but some features may be disabled\n");
+		} else if (host_ipv4addr == __cpu_to_be32(0x7f000001)) {
+			_HFI_INFO("Localhost IP address is set to the "
+				  "loopback address 127.0.0.1, "
+				  "not fatal but some features may be disabled\n");
+		} else {
+			p = (char *)inet_ntop(AF_INET,
+					      (const void *)&host_ipv4addr,
+					      ipbuf, sizeof(ipbuf));
+			_HFI_PRDBG("Ethernet Host IP=%s and PID=%d\n", p,
+				   host_pid);
+		}
+
+		/* Store in big endian for use in ERR_CHK */
+		host_pid = __cpu_to_be32(host_pid);
+	}
+#endif
+#ifdef PSM_CUDA
+	is_gpudirect_enabled = psmi_parse_gpudirect();
+	gpudirect_send_limit = psmi_parse_gpudirect_send_limit();
+	gpudirect_recv_limit = psmi_parse_gpudirect_recv_limit();
+
+	if (! is_gpudirect_enabled) {
+		gpudirect_send_limit = gpudirect_recv_limit = 0;
+	} else if (PSMI_IS_CUDA_DISABLED) {
+		// should not happen since we don't dynamically disable CUDA
+		_HFI_INFO("WARNING: Non-CUDA application, PSM3_GPUDIRECT option ignored\n");
+		is_gpudirect_enabled = 0;
+		gpudirect_send_limit = gpudirect_recv_limit = 0;
+	} else if (!device_support_gpudirect()) {
+		_HFI_INFO("WARNING: GPU device does not support GPU Direct, PSM3_GPUDIRECT option ignored\n");
+		is_gpudirect_enabled = 0;
+		gpudirect_send_limit = gpudirect_recv_limit = 0;
+	} else if (
+		PSMI_IS_DRIVER_GPUDIRECT_DISABLED) {
+		err = psmi_handle_error(PSMI_EP_NORETURN,
+				PSM2_INTERNAL_ERR,
+				"Unable to start run, PSM3_GPUDIRECT requires rv module with CUDA support.\n");
+	} else if (!(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) {
+		// only GDR Copy and GPU Send DMA allowed
+		gpudirect_send_limit = gpudirect_recv_limit = 0;
+	} else {
+		if (gpudirect_send_limit)
+			proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND;
+		if (gpudirect_recv_limit)
+			proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV;
+	}
+	// from here forward can't use psmi_parse_gpudirect,
+	// must use is_gpudirect_enabled
+
+	/* The following cases need to be handled:
+	 * 1) GPU DIRECT is turned off but GDR COPY is turned on by the user or
+	 *    by default - Turn off GDR COPY
+	 * 2) GPU DIRECT is turned on but App, GPU or RV doesn't support it
+	 *    (tested above) - Turn off GDR COPY
+	 * 3) GPU DIRECT is on but GDR COPY is turned off by the user - Leave
+	 *    this config as it is.
+	 */
+	if (!is_gpudirect_enabled)
+		is_gdr_copy_enabled = gdr_copy_limit_send =
+			gdr_copy_limit_recv = 0;
+
+	if (PSMI_IS_CUDA_ENABLED &&
+		 (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) {
+		struct psmi_rlimit_mpool rlim = CUDA_HOSTBUFFER_LIMITS;
+		uint32_t maxsz, chunksz, max_elements;
+
+		if ((err = psmi_parse_mpool_env(proto->mq, 1,
+						&rlim, &maxsz, &chunksz)))
+			goto fail;
+
+		/* the maxsz is the amount in MB, not the number of entries,
+		 * since the element size depends on the window size */
+		max_elements = (maxsz*1024*1024) / proto->mq->hfi_base_window_rv;
+		/* mpool requires max_elements to be power of 2. round down. */
+		max_elements = 1 << (31 - __builtin_clz(max_elements));
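+		/* e.g. 100 elements round down to 64, while 128 stays 128 */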
+		proto->cuda_hostbuf_send_cfg.bufsz = proto->mq->hfi_base_window_rv;
+		proto->cuda_hostbuf_pool_send =
+			psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf),
+						   chunksz, max_elements, 0,
+						   UNDEFINED, NULL, NULL,
+						   psmi_cuda_hostbuf_alloc_func,
+						   (void *)
+						   &proto->cuda_hostbuf_send_cfg);
+
+		if (proto->cuda_hostbuf_pool_send == NULL) {
+			err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
+						"Couldn't allocate CUDA host send buffer pool");
+			goto fail;
+		}
+
+		/* use the same number of elements for the small pool */
+		proto->cuda_hostbuf_small_send_cfg.bufsz = CUDA_SMALLHOSTBUF_SZ;
+		proto->cuda_hostbuf_pool_small_send =
+			psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf),
+						   chunksz, max_elements, 0,
+						   UNDEFINED, NULL, NULL,
+						   psmi_cuda_hostbuf_alloc_func,
+						   (void *)
+						   &proto->cuda_hostbuf_small_send_cfg);
+
+		if (proto->cuda_hostbuf_pool_small_send == NULL) {
+			err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
+						"Couldn't allocate CUDA host small send buffer pool");
+			goto fail;
+		}
+
+		/* Configure the amount of prefetching */
+		union psmi_envvar_val env_prefetch_limit;
+
+		psmi_getenv("PSM3_CUDA_PREFETCH_LIMIT",
+			    "How many RDMA windows to prefetch at RTS time(default is 2)",
+			    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+			    (union psmi_envvar_val)CUDA_WINDOW_PREFETCH_DEFAULT,
+			    &env_prefetch_limit);
+		proto->cuda_prefetch_limit = env_prefetch_limit.e_uint;
+	}
+#endif
+
+	// we allocate MR cache here (as opposed to in protoexp) in case we later
+	// decide to implement RC send for medium messages and use it to register
+	// medium sized user eager buffers (SDMA-like)
+	// We also need to know GPU Direct Copy sizes for pri_size
+	// if RDMA=0 with PSM3_SDMA or PSM3_GPUDIRECT_SDMA can still
+	// allocate cache for just send DMA and perhaps gdrcopy
+	if ((protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)
+		|| proto->ep->mr_cache_mode) {
+		union psmi_envvar_val env_mr_cache_size;
+		uint32_t default_cache_size;	// in entries
+		uint32_t cache_pri_entries;
+		uint64_t cache_pri_size;	// in bytes
+#ifdef PSM_CUDA
+		uint64_t cache_gpu_pri_size;	// in bytes
+		union psmi_envvar_val env_mr_cache_gpu_evict;
+#endif
+
+		// we can have at most HFI_TF_NFLOWS inbound RDMAs and hfi_num_send_rdma
+		// outbound RDMAs, each of which potentially needs an MR.
+		// So mr_cache_size should be >= HFI_TF_NFLOWS + ep->hfi_num_send_rdma,
+		// but we can survive with less since transfers are simply
+		// delayed until an MR is available.
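+		// e.g. a hypothetical HFI_TF_NFLOWS of 32 plus 128 send RDMAs
+		// yields 160 priority entries; the defaults below are then 16x
+		// that (2560) when caching or 8x (1280) when only reference
+		// counting.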
+		if (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) {
+			cache_pri_entries =  HFI_TF_NFLOWS + proto->ep->hfi_num_send_rdma;
+			cache_pri_size  = (uint64_t)cache_pri_entries * proto->mq->hfi_base_window_rv;
+			if (proto->ep->mr_cache_mode == MR_CACHE_MODE_USER) {
+				// we attempt to cache, so can benefit from more than inflight
+				default_cache_size = cache_pri_entries * 16;
+			} else {
+				// we only reference count
+				// could benefit from some extra so we can preregister MRs for
+				// transfers we don't yet have resources for
+				default_cache_size = cache_pri_entries * 8;
+			}
+		} else {
+			// just for non-priority send DMA
+			cache_pri_entries =  0;
+			cache_pri_size  = 0;
+			if (proto->ep->mr_cache_mode == MR_CACHE_MODE_USER) {
+				// we attempt to cache, so can benefit from more than inflight
+				default_cache_size = 128 * 16;
+			} else {
+				// we only reference count
+				default_cache_size = 128;
+			}
+		}
+		/* Size of user space MR Cache
+		 */
+		psmi_getenv("PSM3_MR_CACHE_SIZE",
+				"user space MR table/cache size (num MRs)",
+				PSMI_ENVVAR_LEVEL_USER,
+				PSMI_ENVVAR_TYPE_UINT,
+				(union psmi_envvar_val)default_cache_size, &env_mr_cache_size);
+
+#ifdef PSM_CUDA
+#ifndef ROUNDUP64P2
+#define ROUNDUP64P2(val, align)   \
+        (((uint64_t)(val) + (uint64_t)(align) - 1) & (~((uint64_t)(align)-1)))
+#endif
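+		// e.g. ROUNDUP64P2(5000, 4096) == 8192; align must be a power
+		// of 2 for the mask arithmetic to hold.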
+
+		// cache_gpu_pri_size is only used to confirm the RV GPU cache size.
+		// Without GPU Direct we will not register any GPU MRs.
+		// If we have GPU Direct w/o RDMA, there are no priority pin/MRs
+		// except for GDRCopy.
+		// Since GDRCopy doesn't use psm2_mr_cache, there is no need to
+		// grow pri_entries to account for it.
+		// Note cache_pri_size == 0 if rdmamode is not enabled.
+		cache_gpu_pri_size = 0;
+		if (PSMI_IS_CUDA_ENABLED && is_gpudirect_enabled) {
+			if (gpudirect_send_limit || gpudirect_recv_limit)
+				cache_gpu_pri_size = cache_pri_size;
+			if (gdr_copy_limit_send || gdr_copy_limit_recv) {
+				// min of one extra for GDRCopy
+				// largest recv with GDR copy is gdr_copy_limit_recv
+				// largest send with GDR copy is gdr_copy_limit_send
+				cache_gpu_pri_size +=
+					ROUNDUP64P2(max(proto->epinfo.ep_mtu,
+							max(gdr_copy_limit_recv,
+							gdr_copy_limit_send)),
+						PSMI_GPU_PAGESIZE);
+			}
+			psmi_getenv("PSM3_RV_GPU_CACHE_EVICT",
+				"Number of kilobytes to evict from GPU cache if can't pin memory for GPUDIRECT (0=just exact amount needed))",
+				PSMI_ENVVAR_LEVEL_USER,
+				PSMI_ENVVAR_TYPE_UINT,
+				(union psmi_envvar_val)0, &env_mr_cache_gpu_evict);
+			gpu_cache_evict = (uint64_t)env_mr_cache_gpu_evict.e_uint * 1024;
+		}
+
+#endif
+		proto->mr_cache = psm2_verbs_alloc_mr_cache(proto->ep,
+						env_mr_cache_size.e_uint, proto->ep->mr_cache_mode,
+						cache_pri_entries, cache_pri_size
+#ifdef PSM_CUDA
+						, cache_gpu_pri_size
+#endif
+						);
+		if (! proto->mr_cache) {
+			_HFI_ERROR( "Unable to allocate MR cache (%u entries)\n",
+					env_mr_cache_size.e_uint);
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+	}
+	// Send DMA only makes sense if we have an MR cache
+	if (proto->ep->mr_cache_mode) {
+		if ((err = proto_sdma_init(proto, context)))
+			goto fail;
+	} else {
+		if (psmi_parse_senddma())
+			_HFI_INFO("WARNING: Send DMA requires an MR Cache, disabling PSM3_SDMA\n");
+		proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking =
+		    ~0U;
+#ifdef PSM_CUDA
+		proto->iovec_gpu_thresh_eager = proto->iovec_gpu_thresh_eager_blocking =
+		    ~0U;
+#endif
+	}
+#ifdef PSM_CUDA
+	psmi_assert(proto->ep->mr_cache_mode || ! is_gdr_copy_enabled);
+#endif
+#ifdef PSM_CUDA
+	_HFI_DBG("Cuda %d GPU Direct support: driver %d GPU device %d\n",
+		is_cuda_enabled, is_driver_gpudirect_enabled, _device_support_gpudirect);
+	_HFI_DBG("GDR Copy: %d limit send=%u recv=%u cuda_rndv=%u GPU RDMA flags=0x%x limit send=%u recv=%u\n",
+		is_gdr_copy_enabled, gdr_copy_limit_send, gdr_copy_limit_recv,
+		cuda_thresh_rndv,
+		proto->flags & (IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV
+				|IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND),
+		gpudirect_send_limit, gpudirect_recv_limit);
+	_HFI_DBG("send dma thresh: %u %u GPU send DMA thresh %u %u\n",
+		proto->iovec_thresh_eager, proto->iovec_thresh_eager_blocking,
+		proto->iovec_gpu_thresh_eager,
+		proto->iovec_gpu_thresh_eager_blocking);
+#else
+	_HFI_DBG("send dma thresh: %u %u\n", proto->iovec_thresh_eager,
+		proto->iovec_thresh_eager_blocking);
+#endif
+	_HFI_DBG("rdma: %u MR cache %u\n", proto->ep->rdmamode,
+		proto->ep->mr_cache_mode);
+
+fail:
+	return err;
+}
+
+psm2_error_t
+ips_proto_fini(struct ips_proto *proto, int force, uint64_t timeout_in)
+{
+	struct psmi_eptab_iterator itor;
+	uint64_t t_start;
+	uint64_t t_grace_start, t_grace_time, t_grace_interval;
+	psm2_epaddr_t epaddr;
+	psm2_error_t err = PSM2_OK;
+	int i;
+	union psmi_envvar_val grace_intval;
+
+	/* Poll one more time to attempt to synchronize with the peer ep's. */
+	ips_ptl_poll(proto->ptl, 0);
+
+	psmi_getenv("PSM3_CLOSE_GRACE_PERIOD",
+		    "Additional grace period in seconds for closing end-point.",
+		    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)0, &grace_intval);
+
+	if (getenv("PSM3_CLOSE_GRACE_PERIOD")) {
+		t_grace_time = grace_intval.e_uint * SEC_ULL;
+	} else if (timeout_in > 0) {
+		/* default to half of the close time-out */
+		t_grace_time = timeout_in / 2;
+	} else {
+		/* propagate the infinite time-out case */
+		t_grace_time = 0;
+	}
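+	/* For example (illustrative): PSM3_CLOSE_GRACE_PERIOD=2 in the
+	 * environment forces t_grace_time to 2 * SEC_ULL, regardless of the
+	 * timeout_in-derived default above. */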
+
+	if (t_grace_time > 0 && t_grace_time < PSMI_MIN_EP_CLOSE_TIMEOUT)
+		t_grace_time = PSMI_MIN_EP_CLOSE_TIMEOUT;
+
+	/* At close we will busy wait for the grace interval to see if any
+	 * receive progress is made. If progress is made we will wait for
+	 * another grace interval, until either no progress is made or the
+	 * entire grace period has passed. If the grace interval is too low
+	 * we may miss traffic and exit too early. If the grace interval is
+	 * too large the additional time spent while closing the program
+	 * will become visible to the user. */
+	psmi_getenv("PSM3_CLOSE_GRACE_INTERVAL",
+		    "Grace interval in seconds for closing end-point.",
+		    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)0, &grace_intval);
+
+	if (getenv("PSM3_CLOSE_GRACE_INTERVAL")) {
+		t_grace_interval = grace_intval.e_uint * SEC_ULL;
+	} else {
+		/* A heuristic is used to scale up the timeout linearly with
+		 * the number of endpoints, and we allow one second per 1000
+		 * endpoints. */
+		t_grace_interval = (proto->ep->connections * SEC_ULL) / 1000;
+	}
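+	/* Illustrative: 2500 connections yield a 2.5 s interval from the
+	 * heuristic above, which is then clamped below into
+	 * [PSMI_MIN_EP_CLOSE_GRACE_INTERVAL, PSMI_MAX_EP_CLOSE_GRACE_INTERVAL]. */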
+
+	if (t_grace_interval < PSMI_MIN_EP_CLOSE_GRACE_INTERVAL)
+		t_grace_interval = PSMI_MIN_EP_CLOSE_GRACE_INTERVAL;
+	if (t_grace_interval > PSMI_MAX_EP_CLOSE_GRACE_INTERVAL)
+		t_grace_interval = PSMI_MAX_EP_CLOSE_GRACE_INTERVAL;
+
+	PSMI_LOCK_ASSERT(proto->mq->progress_lock);
+
+	t_start = proto->t_fini = get_cycles();
+
+	/* Close whatever has been left open */
+	if (proto->num_connected_outgoing > 0) {
+		int num_disc = 0;
+		int *mask;
+		psm2_error_t *errs;
+		psm2_epaddr_t *epaddr_array;
+
+		psmi_epid_itor_init(&itor, proto->ep);
+		while ((epaddr = psmi_epid_itor_next(&itor))) {
+			if (epaddr->ptlctl->ptl == proto->ptl)
+				num_disc++;
+		}
+		psmi_epid_itor_fini(&itor);
+		mask =
+		    (int *)psmi_calloc(proto->ep, UNDEFINED, num_disc,
+				       sizeof(int));
+		errs = (psm2_error_t *)
+		    psmi_calloc(proto->ep, UNDEFINED, num_disc,
+				sizeof(psm2_error_t));
+		epaddr_array = (psm2_epaddr_t *)
+		    psmi_calloc(proto->ep, UNDEFINED, num_disc,
+				sizeof(psm2_epaddr_t));
+
+		if (errs == NULL || epaddr_array == NULL || mask == NULL) {
+			if (epaddr_array)
+				psmi_free(epaddr_array);
+			if (errs)
+				psmi_free(errs);
+			if (mask)
+				psmi_free(mask);
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+		psmi_epid_itor_init(&itor, proto->ep);
+		i = 0;
+		while ((epaddr = psmi_epid_itor_next(&itor))) {
+			/*
+			 * if cstate_outgoing is CSTATE_NONE, then we know it
+			 * is an uni-directional connect, in that the peer
+			 * sent a connect request to us, but we never sent one
+			 * out to the peer epid. Ignore handling those in
+			 * ips_proto_disconnect() as we will do the right thing
+			 * when a disconnect request for the epaddr comes in from the peer.
+			 */
+			if (epaddr->ptlctl->ptl == proto->ptl &&
+				((ips_epaddr_t *) epaddr)->cstate_outgoing != CSTATE_NONE) {
+				mask[i] = 1;
+				epaddr_array[i] = epaddr;
+				i++;
+				IPS_MCTXT_REMOVE((ips_epaddr_t *) epaddr);
+			}
+		}
+		psmi_epid_itor_fini(&itor);
+		err = ips_proto_disconnect(proto, force, num_disc, epaddr_array,
+					   mask, errs, timeout_in);
+		psmi_free(mask);
+		psmi_free(errs);
+		psmi_free(epaddr_array);
+	}
+
+	t_grace_start = get_cycles();
+
+	while (psmi_cycles_left(t_grace_start, t_grace_time)) {
+		uint64_t t_grace_interval_start = get_cycles();
+		int num_disconnect_requests = proto->num_disconnect_requests;
+		PSMI_BLOCKUNTIL(
+			proto->ep, err,
+			proto->num_connected_incoming == 0 ||
+			(!psmi_cycles_left(t_start, timeout_in) &&
+			 (!psmi_cycles_left(t_grace_interval_start,
+					    t_grace_interval) ||
+			  !psmi_cycles_left(t_grace_start, t_grace_time))));
+		if (num_disconnect_requests == proto->num_disconnect_requests) {
+			/* nothing happened in this grace interval so break out early */
+			break;
+		}
+	}
+
+#if _HFI_DEBUGGING
+	if (_HFI_PRDBG_ON) {
+		uint64_t t_grace_finish = get_cycles();
+
+		_HFI_PRDBG_ALWAYS(
+			"Closing endpoint disconnect left to=%d,from=%d after %d millisec of grace (out of %d)\n",
+			proto->num_connected_outgoing, proto->num_connected_incoming,
+			(int)(cycles_to_nanosecs(t_grace_finish - t_grace_start) /
+			MSEC_ULL), (int)(t_grace_time / MSEC_ULL));
+	}
+#endif
+
+#ifdef PSM_CUDA
+	if (PSMI_IS_CUDA_ENABLED && proto->cudastream_send) {
+		PSMI_CUDA_CALL(cuStreamDestroy, proto->cudastream_send);
+	}
+#endif
+
+	if ((err = ips_ibta_fini(proto)))
+		goto fail;
+
+	if ((err = ips_proto_am_fini(&proto->proto_am)))
+		goto fail;
+
+	if ((err = ips_scbctrl_fini(&proto->scbc_egr)))
+		goto fail;
+
+	ips_proto_recv_fini(proto);
+
+	if (proto->protoexp) {
+		if ((err = ips_protoexp_fini(proto->protoexp)))
+			goto fail;
+	}
+	if (proto->scbc_rv) {
+		ips_scbctrl_fini(proto->scbc_rv);
+		psmi_free(proto->scbc_rv);
+	}
+
+	if (proto->mr_cache) {
+		psm2_verbs_free_mr_cache(proto->mr_cache);
+		proto->mr_cache = NULL;
+	}
+	psmi_stats_deregister_type(PSMI_STATSTYPE_IPSPROTO, proto);
+
+	psmi_mpool_destroy(proto->pend_sends_pool);
+	psmi_mpool_destroy(proto->timer_pool);
+
+
+fail:
+	proto->t_fini = proto->t_init = 0;
+	return err;
+}
+
+static
+psm2_error_t
+proto_sdma_init(struct ips_proto *proto, const psmi_context_t *context)
+{
+	union psmi_envvar_val env_sdma, env_hfiegr;
+	psm2_error_t err = PSM2_OK;
+
+	env_sdma.e_uint = psmi_parse_senddma();
+	if (!env_sdma.e_uint) {
+		proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking =
+		    ~0U;
+	} else if (! psm2_verbs_mr_cache_allows_user_mr(proto->mr_cache)) {
+		_HFI_INFO("WARNING: Cache does not allow user MRs, disabling PSM3_SDMA (check rv enable_user_mr)\n");
+		proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking =
+		    ~0U;
+	} else {
+		proto->iovec_thresh_eager = MQ_HFI_THRESH_EGR_SDMA_SQ;
+		proto->iovec_thresh_eager_blocking = MQ_HFI_THRESH_EGR_SDMA;
+
+		if (!psmi_getenv("PSM3_MQ_EAGER_SDMA_THRESH",
+				"UD copy-to-sdma eager switchover threshold",
+				PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+				(union psmi_envvar_val) proto->iovec_thresh_eager,
+				&env_hfiegr)) {
+			proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking =
+				 env_hfiegr.e_uint;
+		}
+	}
+
+#ifdef PSM_CUDA
+	if (! is_gpudirect_enabled)
+		env_sdma.e_uint = 0;
+	else 
+		psmi_getenv("PSM3_GPUDIRECT_SDMA",
+		    "UD GPU send dma flags (0 disables send dma, 1 enables), default 1",
+		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+		    (union psmi_envvar_val)1, &env_sdma);
+	if (!env_sdma.e_uint) {
+		proto->iovec_gpu_thresh_eager = proto->iovec_gpu_thresh_eager_blocking =
+		    ~0U;
+	} else if (! psm2_verbs_mr_cache_allows_user_mr(proto->mr_cache)) {
+		_HFI_INFO("WARNING: Cache does not allow user MRs, disabling PSM3_GPUDIRECT_SDMA (check rv enable_user_mr)\n");
+		proto->iovec_gpu_thresh_eager = proto->iovec_gpu_thresh_eager_blocking =
+		    ~0U;
+	} else {
+		proto->iovec_gpu_thresh_eager = MQ_HFI_THRESH_GPU_EGR_SDMA_SQ;
+		proto->iovec_gpu_thresh_eager_blocking = MQ_HFI_THRESH_GPU_EGR_SDMA;
+
+		if (!psmi_getenv("PSM3_GPU_MQ_EAGER_SDMA_THRESH",
+				"UD GPU copy-to-sdma eager switchover threshold",
+				PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+				(union psmi_envvar_val) proto->iovec_gpu_thresh_eager,
+				&env_hfiegr)) {
+			proto->iovec_gpu_thresh_eager = proto->iovec_gpu_thresh_eager_blocking =
+				 env_hfiegr.e_uint;
+		}
+	}
+#endif
+
+	return err;
+}
+
+static
+void ctrlq_init(struct ips_ctrlq *ctrlq, struct ips_proto *proto)
+{
+	/* clear the ctrl send queue */
+	memset(ctrlq, 0, sizeof(*ctrlq));
+
+	proto->message_type_to_index[OPCODE_ACK] = CTRL_MSG_ACK_QUEUED;
+	proto->message_type_to_index[OPCODE_NAK] = CTRL_MSG_NAK_QUEUED;
+	proto->message_type_to_index[OPCODE_BECN] = CTRL_MSG_BECN_QUEUED;
+	proto->message_type_to_index[OPCODE_ERR_CHK] = CTRL_MSG_ERR_CHK_QUEUED;
+	proto->message_type_to_index[OPCODE_CONNECT_REQUEST] =
+	    CTRL_MSG_CONNECT_REQUEST_QUEUED;
+	proto->message_type_to_index[OPCODE_CONNECT_REPLY] =
+	    CTRL_MSG_CONNECT_REPLY_QUEUED;
+	proto->message_type_to_index[OPCODE_DISCONNECT_REQUEST] =
+	    CTRL_MSG_DISCONNECT_REQUEST_QUEUED;
+	proto->message_type_to_index[OPCODE_DISCONNECT_REPLY] =
+	    CTRL_MSG_DISCONNECT_REPLY_QUEUED;
+
+	ctrlq->ctrlq_head = ctrlq->ctrlq_tail = 0;
+	ctrlq->ctrlq_overflow = 0;
+	ctrlq->ctrlq_proto = proto;
+
+	/*
+	 * We never enqueue ctrl messages with real payload. If we do,
+	 * the queue 'elem_payload' size needs to be big enough.
+	 * Note: enqueue nak/ack is very important for performance.
+	 */
+	proto->ctrl_msg_queue_enqueue =
+	    CTRL_MSG_ACK_QUEUED |
+	    CTRL_MSG_NAK_QUEUED |
+	    CTRL_MSG_BECN_QUEUED;
+
+	psmi_timer_entry_init(&ctrlq->ctrlq_timer,
+			      ips_proto_timer_ctrlq_callback, ctrlq);
+
+	return;
+}
+
+static __inline__ void _build_ctrl_message(struct ips_proto *proto,
+			struct ips_flow *flow, uint8_t message_type,
+			ips_scb_t *ctrlscb, uint32_t paylen)
+{
+	uint32_t tot_paywords = (sizeof(struct ips_message_header) +
+		HFI_CRC_SIZE_IN_BYTES + paylen) >> BYTE2DWORD_SHIFT;
+	uint32_t slid, dlid;
+	ips_epaddr_t *ipsaddr = flow->ipsaddr;
+	struct ips_message_header *p_hdr = &ctrlscb->ips_lrh;
+	ips_path_rec_t *ctrl_path =
+	    ipsaddr->pathgrp->pg_path[ipsaddr->
+				      hpp_index][IPS_PATH_HIGH_PRIORITY];
+
+	if ((proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) &&
+	    (++ipsaddr->hpp_index >=
+	     ipsaddr->pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY]))
+		ipsaddr->hpp_index = 0;
+
+	/*
+	 * If the size of the transfer is NOT within the "exclusion range",
+	 * then use the "dispersive routing" slid/dlid.  Otherwise
+	 * use the base LIDs.
+	 *
+	 * This is a control message, so it should never be a TID transfer.
+	 */
+	slid = ctrl_path->pr_slid;
+	dlid = ctrl_path->pr_dlid;
+	if (ctrlscb->scb_flags & IPS_SEND_FLAG_NO_LMC) {
+		slid = ipsaddr->pathgrp->pg_base_slid;
+		dlid = ipsaddr->pathgrp->pg_base_dlid;
+	}
+
+	/* Control messages go over the control path. */
+	p_hdr->lrh[0] = __cpu_to_be16(HFI_LRH_BTH |
+				      ((ctrl_path->pr_sl & HFI_LRH_SL_MASK) <<
+				       HFI_LRH_SL_SHIFT)
+					);
+	p_hdr->lrh[1] = dlid;
+	p_hdr->lrh[2] = __cpu_to_be16(tot_paywords & HFI_LRH_PKTLEN_MASK);
+	p_hdr->lrh[3] = slid;
+
+	p_hdr->bth[0] = __cpu_to_be32(ctrl_path->pr_pkey |
+				      (message_type << HFI_BTH_OPCODE_SHIFT));
+
+	p_hdr->bth[1] = __cpu_to_be32(flow->flowid << HFI_BTH_FLOWID_SHIFT);
+	flow->flags &= ~IPS_FLOW_FLAG_GEN_BECN;
+
+	/* p_hdr->bth[2] already set by caller, or don't care */
+	/* p_hdr->ack_seq_num already set by caller, or don't care */
+
+	p_hdr->connidx = ipsaddr->connidx_outgoing;
+	p_hdr->flags = 0;
+
+	p_hdr->khdr.kdeth0 = __cpu_to_le32(
+			(ctrlscb->scb_flags & IPS_SEND_FLAG_INTR) |
+			(IPS_PROTO_VERSION << HFI_KHDR_KVER_SHIFT));
+	p_hdr->khdr.kdeth1 = 0;
+
+	return;
+}
+
+psm2_error_t
+ips_proto_timer_ctrlq_callback(struct psmi_timer *timer, uint64_t t_cyc_expire)
+{
+	struct ips_ctrlq *ctrlq = (struct ips_ctrlq *)timer->context;
+	struct ips_proto *proto = ctrlq->ctrlq_proto;
+	struct ips_ctrlq_elem *cqe;
+	uint32_t have_cksum = proto->flags & IPS_PROTO_FLAG_CKSUM;
+	psm2_error_t err;
+
+	/* service ctrl send queue first */
+	while (ctrlq->ctrlq_cqe[ctrlq->ctrlq_tail].msg_queue_mask) {
+		cqe = &ctrlq->ctrlq_cqe[ctrlq->ctrlq_tail];
+		GENERIC_PERF_BEGIN(PSM_TX_SPEEDPATH_CTR); /* perf stats */
+		if (cqe->msg_scb.flow->transfer == PSM_TRANSFER_PIO) {
+			err = psmi_hal_spio_transfer_frame(proto,
+							   cqe->msg_scb.flow, &cqe->msg_scb,
+							   cqe->msg_scb.cksum, 0, PSMI_TRUE,
+							   have_cksum, cqe->msg_scb.cksum[0],
+							   proto->ep->context.psm_hw_ctxt
+#ifdef PSM_CUDA
+			       , 0
+#endif
+				);
+		} else {
+			psmi_assert_always(0);
+			err = PSM2_INTERNAL_ERR;
+		}
+		GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR); /* perf stats */
+
+		if (err == PSM2_OK) {
+			PSM2_LOG_PKT_STRM(PSM2_LOG_TX,&cqe->msg_scb.ips_lrh,"PKT_STRM: err: %d", err);
+			ips_proto_epaddr_stats_set(proto, cqe->message_type);
+			*cqe->msg_queue_mask &=
+			    ~message_type2index(proto, cqe->message_type);
+			cqe->msg_queue_mask = NULL;
+			ctrlq->ctrlq_tail =
+			    (ctrlq->ctrlq_tail + 1) % CTRL_MSG_QEUEUE_SIZE;
+		} else {
+			psmi_assert(err == PSM2_EP_NO_RESOURCES);
+
+			proto->stats.pio_busy_cnt++;
+			/* re-request a timer expiration */
+			psmi_timer_request(proto->timerq, &ctrlq->ctrlq_timer,
+					   PSMI_TIMER_PRIO_0);
+			return PSM2_OK;
+		}
+	}
+
+	return PSM2_OK;
+}
+
+/* Update a cqe, a single element of the pending control message queue */
+PSMI_ALWAYS_INLINE(
+void ips_proto_update_cqe(struct ips_ctrlq_elem *cqe, uint16_t *msg_queue_mask,
+			  struct ips_flow *flow, ips_scb_t *ctrlscb, uint8_t message_type)){
+
+	cqe->message_type = message_type;
+	cqe->msg_queue_mask = msg_queue_mask;
+	psmi_mq_mtucpy(&cqe->msg_scb.ips_lrh,
+		       &ctrlscb->ips_lrh, sizeof(ctrlscb->ips_lrh));
+	cqe->msg_scb.flow = flow;
+	cqe->msg_scb.cksum[0] = ctrlscb->cksum[0];
+}
+
+psm2_error_t
+ips_proto_send_ctrl_message(struct ips_flow *flow, uint8_t message_type,
+			uint16_t *msg_queue_mask, ips_scb_t *ctrlscb,
+			void *payload, uint32_t paylen)
+{
+	psm2_error_t err = PSM2_EP_NO_RESOURCES;
+	ips_epaddr_t *ipsaddr = flow->ipsaddr;
+	struct ips_proto *proto = ((psm2_epaddr_t) ipsaddr)->proto;
+	struct ips_ctrlq *ctrlq = &proto->ctrlq;
+	struct ips_ctrlq_elem *cqe = ctrlq->ctrlq_cqe;
+	uint32_t have_cksum;
+
+	psmi_assert(message_type >= OPCODE_ACK &&
+			message_type <= OPCODE_DISCONNECT_REPLY);
+	psmi_assert((paylen & 0x3) == 0);	/* require 4-byte multiple */
+	psmi_assert(flow->frag_size >=
+			(paylen + PSM_CRC_SIZE_IN_BYTES));
+
+	/* Drain queue if non-empty */
+	if (cqe[ctrlq->ctrlq_tail].msg_queue_mask)
+		ips_proto_timer_ctrlq_callback(&ctrlq->ctrlq_timer, 0ULL);
+
+	/* finish setup control message header */
+	ips_set_LMC_LID_choice(proto, ctrlscb, paylen);
+	_build_ctrl_message(proto, flow, message_type, ctrlscb, paylen);
+
+	/* If enabled checksum control message */
+	have_cksum = proto->flags & IPS_PROTO_FLAG_CKSUM;
+	if (have_cksum) {
+		ctrlscb->ips_lrh.flags |= IPS_SEND_FLAG_PKTCKSUM;
+		ips_do_cksum(proto, &ctrlscb->ips_lrh,
+				payload, paylen, ctrlscb->cksum);
+	}
+
+	/*
+	 * for ACK/NAK/BECN, we use the fast flow to send over, otherwise,
+	 * we use the original flow
+	 */
+	if (message_type == OPCODE_ACK ||
+	    message_type == OPCODE_NAK ||
+	    message_type == OPCODE_BECN)
+	{
+		psmi_assert(proto->msgflowid < EP_FLOW_LAST);
+		flow = &ipsaddr->flows[proto->msgflowid];
+	}
+
+	switch (flow->transfer) {
+	case PSM_TRANSFER_PIO:
+		GENERIC_PERF_BEGIN(PSM_TX_SPEEDPATH_CTR); /* perf stats */
+		err = psmi_hal_spio_transfer_frame(proto, flow,
+						   ctrlscb, payload, paylen,
+						   PSMI_TRUE, have_cksum, ctrlscb->cksum[0],
+						   proto->ep->context.psm_hw_ctxt
+#ifdef PSM_CUDA
+						   , 0
+#endif
+			     );
+		GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR); /* perf stats */
+		break;
+	default:
+		err = PSM2_INTERNAL_ERR;
+		break;
+	}
+
+	if (err == PSM2_OK)
+	{
+		PSM2_LOG_PKT_STRM(PSM2_LOG_TX,&ctrlscb->ips_lrh,"PKT_STRM: err: %d", err);
+		ips_proto_epaddr_stats_set(proto, message_type);
+	}
+
+	_HFI_VDBG("transfer_frame of opcode=%x,remote_lid=%d,"
+		  "src=%p,len=%d returns %d\n",
+		  (int)_get_proto_hfi_opcode(&ctrlscb->ips_lrh),
+		  __be16_to_cpu(ctrlscb->ips_lrh.lrh[1]), payload, paylen, err);
+
+	if (err != PSM2_EP_NO_RESOURCES)
+		return err;
+	proto->stats.pio_busy_cnt++;
+
+	if (proto->ctrl_msg_queue_enqueue & proto->
+	    message_type_to_index[message_type]) {
+		/* We only queue control msg without payload */
+		psmi_assert(paylen == 0);
+
+		if ((*msg_queue_mask) & proto->
+		    message_type_to_index[message_type]) {
+
+			if (message_type == OPCODE_ACK) {
+				/* Pending queue should contain latest ACK type message,
+				 * overwrite the previous one. */
+				ips_proto_update_cqe(&cqe[flow->ack_index], msg_queue_mask,
+						     flow, ctrlscb, message_type);
+			}
+
+			err = PSM2_OK;
+		} else if (cqe[ctrlq->ctrlq_head].msg_queue_mask == NULL) {
+			/* entry is free */
+			if (message_type == OPCODE_ACK) {
+				/* Track the index of last ACK type message in queue*/
+				flow->ack_index = ctrlq->ctrlq_head;
+			}
+
+			*msg_queue_mask |=
+			    message_type2index(proto, message_type);
+
+			ips_proto_update_cqe(&cqe[ctrlq->ctrlq_head], msg_queue_mask,
+					     flow, ctrlscb, message_type);
+
+			ctrlq->ctrlq_head =
+			    (ctrlq->ctrlq_head + 1) % CTRL_MSG_QEUEUE_SIZE;
+			/* _HFI_INFO("requesting ctrlq timer for msgtype=%d!\n", message_type); */
+			psmi_timer_request(proto->timerq, &ctrlq->ctrlq_timer,
+					   PSMI_TIMER_PRIO_0);
+
+			err = PSM2_OK;
+		} else {
+			proto->ctrl_msg_queue_overflow++;
+		}
+	}
+
+	return err;
+}
+
+void MOCKABLE(ips_proto_flow_enqueue)(struct ips_flow *flow, ips_scb_t *scb)
+{
+	ips_epaddr_t *ipsaddr = flow->ipsaddr;
+	struct ips_proto *proto = ((psm2_epaddr_t) ipsaddr)->proto;
+
+	ips_scb_prepare_flow_inner(proto, ipsaddr, flow, scb);
+	if ((proto->flags & IPS_PROTO_FLAG_CKSUM) &&
+	    (scb->tidctrl == 0) && (scb->nfrag == 1)) {
+		scb->ips_lrh.flags |= IPS_SEND_FLAG_PKTCKSUM;
+		ips_do_cksum(proto, &scb->ips_lrh,
+			     ips_scb_buffer(scb), scb->payload_size, &scb->cksum[0]);
+	}
+
+	/* If this is the first scb on flow, pull in both timers. */
+	if (flow->timer_ack == NULL) {
+		psmi_assert(flow->timer_send == NULL);
+		flow->timer_ack = scb->timer_ack;
+		flow->timer_send = scb->timer_send;
+	}
+	psmi_assert(flow->timer_ack != NULL);
+	psmi_assert(flow->timer_send != NULL);
+
+	/* Every flow has a pending head that points into the unacked queue.
+	 * If sends are already pending, process those first */
+	if (SLIST_EMPTY(&flow->scb_pend)) {
+		PSM2_LOG_PKT_STRM(PSM2_LOG_PEND,&scb->ips_lrh,"PKT_STRM: pkt in pend list");
+		SLIST_FIRST(&flow->scb_pend) = scb;
+	}
+
+	/* Insert scb into flow's unacked queue */
+	STAILQ_INSERT_TAIL(&flow->scb_unacked, scb, nextq);
+
+#ifdef PSM_DEBUG
+	/* update scb counters in flow. */
+	flow->scb_num_pending++;
+	flow->scb_num_unacked++;
+#endif
+}
+MOCK_DEF_EPILOGUE(ips_proto_flow_enqueue);
+
+/*
+ * This function attempts to flush the current list of pending
+ * packets through PIO.
+ *
+ * Recoverable errors:
+ * PSM2_OK: Packet triggered through PIO.
+ * PSM2_EP_NO_RESOURCES: No PIO bufs available or cable pulled.
+ *
+ * Unrecoverable errors:
+ * PSM2_EP_NO_NETWORK: No network, no lid, ...
+ * PSM2_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc.
+ */
+psm2_error_t
+ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed)
+{
+	struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto;
+	struct ips_scb_pendlist *scb_pend = &flow->scb_pend;
+	int num_sent = 0;
+	uint64_t t_cyc;
+	ips_scb_t *scb;
+	psm2_error_t err = PSM2_OK;
+
+	psmi_assert(!SLIST_EMPTY(scb_pend));
+
+	/* Out of credits - ACKs/NAKs will recredit the flow, or it is congested */
+	if_pf(flow->credits <= 0) {
+		if (nflushed)
+			*nflushed = 0;
+		return PSM2_EP_NO_RESOURCES;
+	}
+
+	while (!SLIST_EMPTY(scb_pend) && flow->credits > 0) {
+		scb = SLIST_FIRST(scb_pend);
+		psmi_assert(scb->nfrag == 1);
+		GENERIC_PERF_BEGIN(PSM_TX_SPEEDPATH_CTR); /* perf stats */
+		if ((err = psmi_hal_spio_transfer_frame(proto, flow, scb,
+							ips_scb_buffer(scb),
+							scb->payload_size,
+							PSMI_FALSE,
+							scb->ips_lrh.flags &
+							IPS_SEND_FLAG_PKTCKSUM,
+							scb->cksum[0],
+							proto->ep->context.psm_hw_ctxt
+#ifdef PSM_CUDA
+						   , IS_TRANSFER_BUF_GPU_MEM(scb)
+#endif
+			     ))
+		    == PSM2_OK) {
+			GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR); /* perf stats */
+			t_cyc = get_cycles();
+			scb->scb_flags &= ~IPS_SEND_FLAG_PENDING;
+			scb->ack_timeout = proto->epinfo.ep_timeout_ack;
+			scb->abs_timeout = proto->epinfo.ep_timeout_ack + t_cyc;
+			psmi_timer_request(proto->timerq, flow->timer_ack,
+					   scb->abs_timeout);
+			num_sent++;
+			flow->credits--;
+			SLIST_REMOVE_HEAD(scb_pend, next);
+#ifdef PSM_DEBUG
+			flow->scb_num_pending--;
+#endif
+			PSM2_LOG_PKT_STRM(PSM2_LOG_TX,&scb->ips_lrh,"PKT_STRM: err: %d", err);
+
+		} else {
+			GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR); /* perf stats */
+			break;
+		}
+	}
+
+	/* If out of flow credits re-schedule send timer */
+	if (!SLIST_EMPTY(scb_pend)) {
+		proto->stats.pio_no_flow_credits++;
+		psmi_timer_request(proto->timerq, flow->timer_send,
+				   get_cycles() + proto->timeout_send);
+	}
+
+	if (nflushed != NULL)
+		*nflushed = num_sent;
+
+	return err;
+}
+
+psm2_error_t
+ips_proto_timer_ack_callback(struct psmi_timer *current_timer,
+			     uint64_t current)
+{
+	struct ips_flow *flow = ((ips_scb_t *)current_timer->context)->flow;
+	struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto;
+	uint64_t t_cyc_next = get_cycles();
+	psmi_seqnum_t err_chk_seq;
+	ips_scb_t *scb, ctrlscb;
+	uint8_t message_type;
+
+	if (STAILQ_EMPTY(&flow->scb_unacked))
+		return PSM2_OK;
+
+	scb = STAILQ_FIRST(&flow->scb_unacked);
+
+	if (current >= scb->abs_timeout) {
+		int done_local = 0;
+
+		done_local = 1;	/* Always done for PIO flows */
+
+		scb->ack_timeout =
+		    min(scb->ack_timeout * proto->epinfo.ep_timeout_ack_factor,
+			proto->epinfo.ep_timeout_ack_max);
+		scb->abs_timeout = t_cyc_next + scb->ack_timeout;
+		if (done_local) {
+			_HFI_VDBG
+			    ("sending err_chk flow=%d with first=%d,last=%d\n",
+			     flow->flowid,
+			     STAILQ_FIRST(&flow->scb_unacked)->seq_num.psn_num,
+			     STAILQ_LAST(&flow->scb_unacked, ips_scb,
+					 nextq)->seq_num.psn_num);
+
+			ctrlscb.scb_flags = 0;
+			if (proto->flags & IPS_PROTO_FLAG_RCVTHREAD)
+				ctrlscb.scb_flags |= IPS_SEND_FLAG_INTR;
+
+			err_chk_seq = (SLIST_EMPTY(&flow->scb_pend)) ?
+					flow->xmit_seq_num :
+					SLIST_FIRST(&flow->scb_pend)->seq_num;
+
+			if (flow->protocol == PSM_PROTOCOL_TIDFLOW) {
+				// for UD we use RC QP instead of STL100's TIDFLOW HW
+				// UDP has no RDMA
+				psmi_assert_always(0);	// we don't allocate ips_flow for TID
+				message_type = OPCODE_ERR_CHK;	// keep Klocwork happy
+			} else {
+				PSM2_LOG_MSG("sending ERR_CHK message");
+				message_type = OPCODE_ERR_CHK;
+				err_chk_seq.psn_num = (err_chk_seq.psn_num - 1)
+					& proto->psn_mask;
+			}
+			ctrlscb.ips_lrh.bth[2] =
+					__cpu_to_be32(err_chk_seq.psn_num);
+
+			ips_proto_send_ctrl_message(flow, message_type,
+					&flow->ipsaddr->ctrl_msg_queued,
+					&ctrlscb, ctrlscb.cksum, 0);
+		}
+
+		t_cyc_next = get_cycles() + scb->ack_timeout;
+	} else
+		t_cyc_next += (scb->abs_timeout - current);
+
+	psmi_timer_request(proto->timerq, current_timer, t_cyc_next);
+
+	return PSM2_OK;
+}
+
+psm2_error_t
+ips_proto_timer_send_callback(struct psmi_timer *current_timer,
+			      uint64_t current)
+{
+	struct ips_flow *flow = ((ips_scb_t *)current_timer->context)->flow;
+
+	if (!SLIST_EMPTY(&flow->scb_pend))
+		flow->flush(flow, NULL);
+
+	return PSM2_OK;
+}
+
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a4b0825e0b2dde3f876db06267cf7043216787a
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto.h
@@ -0,0 +1,715 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PROTO_H
+#define _IPS_PROTO_H
+
+#include "ips_config.h"
+#include "psm_user.h"
+
+#include "ips_tid.h"
+#include "ips_recvhdrq.h"
+#include "ips_epstate.h"
+#include "ips_proto_am.h"
+#include "ips_tidflow.h"
+#include "ips_path_rec.h"
+
+typedef enum ips_path_type {
+	IPS_PATH_LOW_PRIORITY,
+	IPS_PATH_NORMAL_PRIORITY,
+	IPS_PATH_HIGH_PRIORITY,
+	IPS_PATH_MAX_PRIORITY
+} ips_path_type_t;
+
+/*
+ * Local Endpoint info.
+ *
+ * Contains information necessary for composing packets for the local endpoint
+ */
+struct ips_epinfo {
+	uint16_t ep_base_lid;
+	uint8_t ep_hash;	// for hashing adaptive dispersive routing
+#define EP_HASH ep_hash
+	uint8_t ep_lmc;
+	opa_rate ep_link_rate;
+	uint16_t ep_sl;		/* PSM3_NIC_SL only when path record not used */
+	uint16_t ep_mtu;	// PSM payload after potential hdr & PSM3_MTU decrease
+	uint16_t ep_pkey;	/* PSM3_PKEY only when path record not used */
+	uint64_t ep_timeout_ack;	/* PSM3_ERRCHK_TIMEOUT if no path record */
+	uint64_t ep_timeout_ack_max;
+	uint32_t ep_timeout_ack_factor;
+};
+
+/*
+ * This contains a path record table that enumerates the paths available
+ * between the local node and a remote node associated with an end point.
+ * It also maintains a state value for each message priority, indicating
+ * which path should be assigned to the next message of that priority.
+ *
+ * For LMC/Torus, keep a list of base and max dlid. Used for pkt verification.
+ *
+ * pg_base_dlid and pg_base_slid are in network byte order.
+ */
+#define IPS_MAX_PATH_LMC 3
+typedef struct ips_path_grp {
+	uint16_t pg_base_dlid;
+	uint16_t pg_base_slid;
+	uint8_t pg_num_paths[IPS_PATH_MAX_PRIORITY];
+	uint8_t pg_next_path[IPS_PATH_MAX_PRIORITY];
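+	/* variable-length tail (C89-style flexible array): one row of
+	 * IPS_PATH_MAX_PRIORITY path pointers per path, allocated together
+	 * with the struct */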
+	ips_path_rec_t *pg_path[0][IPS_PATH_MAX_PRIORITY];
+} ips_path_grp_t;
+
+/*
+ * Start and finish routines for constructing an ips_proto.
+ */
+struct ips_proto;
+psm2_error_t ips_proto_init(const psmi_context_t *context,
+			    const struct ptl *ptl,
+			    int num_of_send_bufs,
+			    int num_of_send_desc,
+			    uint32_t imm_size,
+			    const struct psmi_timer_ctrl *timerq, /* PTL's timerq */
+			    const struct ips_epstate *epstate,	  /* PTL's epstate */
+			    void *spioc,	                  /* PTL's opaque spio control */
+			    struct ips_proto *proto);	          /* output protocol */
+
+psm2_error_t ips_proto_fini(struct ips_proto *proto, int force,
+			   uint64_t timeout);
+
+/*
+ * Control message structures
+ *
+ * ips low-level control messages to ensure reliability of eager packets.
+ */
+#define CTRL_MSG_QEUEUE_SIZE 64	/* power of two */
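+/* Head and tail advance modulo this size; because the size is a power of two
+ * and the indices are unsigned, the compiler can reduce the '%' to a bitwise
+ * AND with (CTRL_MSG_QEUEUE_SIZE - 1). */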
+
+struct ips_ctrlq_elem {
+	uint8_t message_type;
+	uint16_t *msg_queue_mask;
+	ips_scb_t msg_scb;
+};
+
+struct ips_ctrlq {
+	/* Queued control messages, queued when pio is busy */
+	struct ips_proto *ctrlq_proto;
+
+	uint32_t ctrlq_head;
+	uint32_t ctrlq_tail;
+	uint32_t ctrlq_overflow;
+
+	struct ips_ctrlq_elem ctrlq_cqe[CTRL_MSG_QEUEUE_SIZE] PSMI_CACHEALIGN;
+	struct psmi_timer ctrlq_timer;	/* when in timerq */
+};
+
+/* Connect/disconnect, as implemented by ips */
+
+/*
+ * Connections are not pairwise but we keep a single 'epaddr' for messages-from
+ * and messages-to a remote 'epaddr'.  State transitions for connecting TO and
+ * FROM 'epaddrs' are the following:
+ * Connect TO (Connect OUTGOING):
+ *   NONE -> WAITING -> ESTABLISHED -> WAITING_DISC -> DISCONNECTED -> NONE
+ *
+ * Connect FROM (we receive a connect request - Connect INCOMING)
+ *   NONE -> ESTABLISHED -> NONE
+ */
+#define CSTATE_ESTABLISHED		1
+#define CSTATE_NONE			2
+#define CSTATE_OUTGOING_DISCONNECTED	3
+#define CSTATE_OUTGOING_WAITING		4
+#define CSTATE_OUTGOING_WAITING_DISC	5
+
+psm2_error_t ips_proto_connect(struct ips_proto *proto, int numep,
+			      const psm2_epid_t *array_of_epid,
+			      const int *array_of_epid_mask,
+			      psm2_error_t *array_of_errors,
+			      psm2_epaddr_t *array_of_epaddr,
+			      uint64_t timeout_in);
+
+psm2_error_t ips_proto_disconnect(struct ips_proto *proto, int force, int numep,
+				 psm2_epaddr_t array_of_epaddr[],
+				 const int array_of_epaddr_mask[],
+				 psm2_error_t array_of_errors[],
+				 uint64_t timeout_in);
+
+int ips_proto_isconnected(struct ips_epaddr *ipsaddr);
+
+/*
+ * Pending operation structures
+ */
+struct ips_pend_sreq {
+	STAILQ_ENTRY(ips_pend_sreq) next;
+	psm2_mq_req_t req;
+	uint32_t type;
+};
+
+#define IPS_PENDSEND_EAGER_DATA	1
+#define IPS_PENDSEND_EAGER_REQ	2
+#define IPS_PENDSEND_EXP_TIDS	3
+#define IPS_PENDSEND_EXP_SENDS	4
+
+STAILQ_HEAD(ips_pendsendq, ips_pend_sreq);
+
+struct ips_pend_sends {
+	struct ips_proto *proto;	/* back ptr */
+	struct psmi_timer timer;
+	struct ips_pendsendq pendq;
+};
+
+/*
+ * One instance of the protocol
+ */
+
+struct ips_protoexp;
+
+struct ips_proto_stats {
+	uint64_t pio_busy_cnt;
+	uint64_t pio_no_flow_credits;
+	uint64_t post_send_fail;
+	uint64_t writev_busy_cnt;
+	uint64_t scb_egr_unavail_cnt;
+	uint64_t unknown_packets;
+	uint64_t stray_packets;
+};
+
+
+/*
+ * Updates to these stats must be reflected in ips_ptl_epaddr_stats_init
+ */
+struct ips_proto_epaddr_stats {
+	uint64_t err_chk_send;
+	uint64_t err_chk_recv;
+#ifdef RNDV_MOD
+	uint64_t err_chk_rdma_send;
+	uint64_t err_chk_rdma_recv;
+	uint64_t err_chk_rdma_resp_send;
+	uint64_t err_chk_rdma_resp_recv;
+#endif
+	uint64_t nak_send;
+	uint64_t nak_recv;
+	uint64_t connect_req_send;
+	uint64_t connect_req_recv;
+	uint64_t connect_rep_send;
+	uint64_t connect_rep_recv;
+	uint64_t disconnect_req_send;
+	uint64_t disconnect_req_recv;
+	uint64_t disconnect_rep_send;
+	uint64_t disconnect_rep_recv;
+	uint64_t rts_send;
+	uint64_t rts_recv;
+	uint64_t cts_long_data_send;
+	uint64_t cts_long_data_recv;
+	uint64_t cts_rdma_send;
+	uint64_t cts_rdma_recv;
+	uint64_t send_rexmit;
+#ifdef RNDV_MOD
+	uint64_t rdma_rexmit;
+#endif
+};
+
+/* OPP support structure. */
+struct opp_api {
+	void *(*op_path_find_hca) (const char *name, void **device);
+	void *(*op_path_open) (void *device, int port_num);
+	void (*op_path_close) (void *context);
+	int (*op_path_get_path_by_rec) (void *context, ibta_path_rec_t *query,
+					ibta_path_rec_t *response);
+};
+
+struct ips_ibta_compliance_fn {
+	psm2_error_t(*get_path_rec) (struct ips_proto *proto, uint16_t slid,
+				    uint16_t dlid,
+				    uint16_t ip_hi,
+				    unsigned long timeout,
+				    ips_path_grp_t **ppathgrp);
+	psm2_error_t(*fini) (struct ips_proto *proto);
+};
+
+/* please don't change the flow id order */
+typedef enum ips_epaddr_flow {
+	EP_FLOW_GO_BACK_N_PIO,
+	EP_FLOW_TIDFLOW,	/* Can use either pio or dma for tidflow */
+	EP_FLOW_LAST		/* Keep this the last endpoint flow */
+} ips_epaddr_flow_t;
+
+typedef enum psm_transfer_type {
+	PSM_TRANSFER_PIO,
+	PSM_TRANSFER_LAST	/* Keep this the last transfer type */
+} psm_transfer_type_t;
+
+typedef enum psm_protocol_type {
+	PSM_PROTOCOL_GO_BACK_N,
+	PSM_PROTOCOL_TIDFLOW,
+	PSM_PROTOCOL_LAST	/* Keep this the last protocol type */
+} psm_protocol_type_t;
+
+struct ips_proto {
+	struct ptl *ptl;	/* cached */
+	psm2_ep_t ep;		/* cached, for errors */
+	psm2_mq_t mq;		/* cached, for mq handling */
+	/* Pending sends */
+	struct ips_pend_sends pend_sends;
+	struct ips_epstate *epstate;
+	struct psmi_timer_ctrl *timerq;
+
+	struct ips_protoexp *protoexp;
+	struct ips_scbctrl *scbc_rv;
+	struct ips_spio *spioc;
+	struct ips_scbctrl scbc_egr;
+	struct ips_epinfo epinfo;
+
+
+	uint64_t timeout_send;
+	uint32_t flags;
+	uint32_t iovec_thresh_eager;
+	uint32_t iovec_thresh_eager_blocking;
+#ifdef PSM_CUDA
+	uint32_t iovec_gpu_thresh_eager;
+	uint32_t iovec_gpu_thresh_eager_blocking;
+#endif
+	uint32_t psn_mask;
+	uint32_t scb_bufsize;
+	uint32_t multirail_thresh_load_balance;
+	uint16_t flow_credits;
+	mpool_t pend_sends_pool;
+	mpool_t timer_pool;
+	struct ips_ibta_compliance_fn ibta;
+	struct ips_proto_stats stats;
+	struct ips_proto_epaddr_stats epaddr_stats;
+	struct ptl_strategy_stats strat_stats;
+
+	struct ips_proto_am proto_am;
+
+	struct ips_ctrlq ctrlq;
+	/* pure sdma mode, use dma flow, otherwise, use pio flow */
+	ips_epaddr_flow_t msgflowid;
+
+	// mr_cache is only allocated and used when PSM3_RDMA enabled
+	psm2_mr_cache_t mr_cache;
+
+
+	uint64_t t_init;
+	uint64_t t_fini;
+	uint32_t runid_key;	/* we use our pid, not ideal */
+
+	int num_connected_outgoing;
+	int num_connected_incoming;
+	int num_disconnect_requests;
+
+	/* misc state variables. */
+
+	/* Smallest interval in cycles between which we warn about stray
+	 * messages.  This is a per-endpoint quantity, overridable with
+	 * PSM_STRAY_WARN_INTERVAL.  We use the same interval to send the
+	 * "die" message.
+	 */
+	uint64_t stray_warn_interval;
+	int done_warning;
+	int done_once;
+	int num_bogus_warnings;
+	struct {
+		uint32_t interval_secs;
+		uint64_t next_warning;
+		uint64_t count;
+	} psmi_logevent_tid_send_reqs;
+
+	/*
+	 * Disable the LMC based dispersive routing for all message
+	 * sizes in bytes between ips_lmc_disable_low and ips_lmc_disable_high,
+	 * inclusive.
+	 */
+	uint32_t ips_lmc_disable_low;
+	uint32_t ips_lmc_disable_high;
+	struct hsearch_data ips_path_rec_hash;
+	struct hsearch_data ips_path_grp_hash;
+	void *opp_lib;
+	void *hndl;
+	void *device;
+	void *opp_ctxt;
+	struct opp_api opp_fn;
+
+#ifdef PSM_CUDA
+	struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_send_cfg;
+	struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_small_send_cfg;
+	mpool_t cuda_hostbuf_pool_send;
+	mpool_t cuda_hostbuf_pool_small_send;
+	CUstream cudastream_send;
+	unsigned cuda_prefetch_limit;
+#endif
+
+/*
+ * Control message queue for pending messages.
+ *
+ * Control messages are queued as pending when no PIO is available for sending
+ * the message.  They are composed on the fly and do not need buffering.
+ *
+ * Variables here are write once (at init) and read afterwards (except the msg
+ * queue overflow counters).
+ */
+	uint32_t ctrl_msg_queue_overflow;
+	uint32_t ctrl_msg_queue_enqueue;
+	uint32_t message_type_to_index[256];
+#define message_type2index(proto, msg_type) (proto->message_type_to_index[(msg_type)])
+
+	time_t writevFailTime;
+};
+
+
+/*
+ * Test the payload length against the lmc_disable_low and lmc_disable_hi
+ * values, to determine if a transfer of this size should use LMC LIDs.
+ * Set the IPS_SEND_FLAG_NO_LMC flag in the scb.
+ */
+static inline void
+ips_set_LMC_LID_choice(struct ips_proto *proto, ips_scb_t *scb, uint32_t len)
+{
+	if ((len >= proto->ips_lmc_disable_low) &&
+	    (len <= proto->ips_lmc_disable_high)) {
+		PSM2_LOG_MSG("DISABLE LMC paylen %u\n", len);
+		scb->scb_flags |= IPS_SEND_FLAG_NO_LMC;
+	}
+
+	return;
+}
+
+/*
+ * Endpoint address, encapsulates per-endpoint protocol metadata
+ *
+ * Directly implements the ptl epaddr.
+ */
+typedef psm2_error_t(*ips_flow_flush_fn_t) (struct ips_flow *, int *nflushed);
+
+/**
+ * ips_flow is a structure that combines all information regarding a send
+ * from one endpoint to another one. Specifically, it is the place where
+ * the Maximum Transmission Unit for a send is calculated, given how many
+ * factors could possibly influence the MTU calculation. See ips_flow_init
+ * documentation for more details.
+ */
+struct ips_flow {
+	SLIST_ENTRY(ips_flow) next;	/* List of flows with pending acks */
+	ips_flow_flush_fn_t flush;	/* flush function for this flow */
+
+	struct ips_epaddr *ipsaddr;	/* back pointer, remote endpoint */
+	ips_path_rec_t *path;	/* Path to use for flow */
+
+	uint16_t frag_size;	/* this flow's fragment size, calculated as
+				   the minimum of all relevant MTUs involved */
+
+	uint16_t flowid:2;	/* flow id: pio(0) or dma(1) or tidflow(2) */
+	uint16_t transfer:3;	/* spio or sdma */
+	uint16_t protocol:3;	/* go-back-n or tidflow */
+	uint16_t flags:8;	/* flow state flags */
+
+	uint16_t cwin;		/* Size of congestion window */
+	uint16_t ack_interval;	/* interval to ack packets */
+	uint16_t ack_counter;	/* counter to ack packets */
+	int16_t  credits;	/* Current credits available to send on flow */
+	uint32_t ack_index;     /* Index of the last ACK message type in pending message queue */
+
+	psmi_seqnum_t xmit_seq_num;	/* transmit packet sequence number */
+	psmi_seqnum_t xmit_ack_num;	/* acked packet sequence number */
+	psmi_seqnum_t recv_seq_num;	/* received packet sequence number */
+
+	psmi_timer *timer_send;	/* timer for frames that got a busy PIO */
+	psmi_timer *timer_ack;	/* timer for unacked frames */
+
+	 STAILQ_HEAD(ips_scb_unackedq, ips_scb) scb_unacked;	/* unacked queue */
+	 SLIST_HEAD(ips_scb_pendlist, ips_scb) scb_pend;	/* pending queue */
+
+#ifdef PSM_DEBUG
+	uint32_t scb_num_pending;	/* pending scb counter */
+	uint32_t scb_num_unacked;	/* unacked scb counter */
+#endif
+};
+
+#define IPS_FLOW_MSG_TOGGLE_OOO_MASK	(1 << 0)	/* ooo msg check */
+#define IPS_FLOW_MSG_TOGGLE_UNEXP_MASK	(1 << 1)	/* unexp msg check */
+/*
+ * Make sure ips_epaddr_t and psm2_epaddr_t can be converted each other.
+ */
+struct ips_epaddr {
+	struct psm2_epaddr epaddr;	/* inlined psm level epaddr */
+	struct ips_msgctl *msgctl;	/* ips level msg control */
+
+	struct ips_epaddr *next;	/* linklist */
+
+	struct ips_flow flows[EP_FLOW_LAST - 1];	/* pio and dma */
+	ips_path_grp_t *pathgrp;	/* pointer to slid/dlid group in hash */
+
+	uint32_t connidx_outgoing;	/* peer's connection idx */
+	uint32_t connidx_incoming;	/* my connection idx */
+
+	uint16_t ctrl_msg_queued;	/* bitmap of queued control messages to be sent */
+	uint32_t window_rv;		/* RNDV window size per connection */
+
+	uint8_t  hpp_index;	/* high priority index */
+	uint8_t  msg_toggle;	/* only 2 bits used, 6 bits for future */
+	// on UD/UDP context only used for hashing adaptive dispersive routing
+	uint32_t remote_qpn;
+#define IPSADDR_HASH remote_qpn
+#ifdef RNDV_MOD
+	union  ibv_gid remote_gid;	/* GID of dest to use for IB CM  */
+	psm2_rv_conn_t rv_conn;
+	uint32_t remote_rv_index;	// RV index of dest to use for immed
+	// state of connection - need it here so we don't call kernel to poll
+	// ! conn - no connection
+	// conn && ! connected - connection processes started, but not done
+	// connected - connection established and usable (implies conn)
+	uint8_t rv_connected:1;
+	uint8_t reserved:4;
+	// During error recovery a receiver may be unable to allocate an scb to
+	// send the response.  In that case the information is stashed here and
+	// checked in ips_proto_timer_send_callback for the proto->msgflowid flow;
+	// when an scb becomes available, this info allows the response to be built.
+	// Since we can only stash one such info per ipsaddr, we limit senders
+	// to one outstanding err_chk_rdma at a time.  Recovery is infrequent
+	// and already slow due to QP reconnect, so this is a reasonable compromise.
+	// The idea of using the ctrlq (64 entries deep per proto) was explored,
+	// but it is not really meant for "level 2" reliability messages, so this
+	// approach was deemed simpler to implement and lower risk to mature code.
+	uint8_t rv_err_chk_rdma_outstanding:1; /* only one per requestor */
+	uint8_t rv_need_send_err_chk_rdma_resp:1; /* is resp info stashed */
+	uint8_t rv_err_chk_rdma_resp_need_resend:1; /* info for resp */
+	ptl_arg_t rv_err_chk_rdma_resp_rdesc_id; /* info for resp */
+	ptl_arg_t rv_err_chk_rdma_resp_sdesc_id; /* info for resp */
+	STAILQ_ENTRY(ips_epaddr) pend_err_resp_next; /* queue to send resp */
+#endif
+	// TBD - to reduce memory footprint, perhaps allocate a separate
+	// structure only when RC QP enabled and point to it here
+	struct ibv_qp *rc_qp;
+	struct psm2_verbs_recv_pool recv_pool;
+	uint32_t rc_qp_max_recv_wr;	// TBD if we allocated recv buffers sooner we
+							// wouldn't need this field
+	uint32_t rc_qp_max_inline_data;
+	struct psm2_verbs_send_allocator send_allocator;
+	// use_* help avoid if tests in post_send datapath
+	psm2_verbs_send_allocator_t use_allocator;	// points to verbs_ep until
+												// rc_connected
+	struct ibv_qp *use_qp;	// points to verbs_ep UD QP until
+							// rc_connected
+	uint32_t use_max_inline_data;	// verbs_ep UD QP value until connected
+	uint8_t rc_connected;
+
+	/* this portion is only for connect/disconnect */
+	uint64_t s_timeout;	/* used as a time in close */
+	uint32_t runid_key;	/* peer process pid */
+	uint32_t credit:2;	/* credit to connect/disconnect: 0 or 1 */
+	uint32_t cstate_outgoing:3;	/* connection state to, max 7 */
+	uint32_t cstate_incoming:3;	/* connection state from, max 7 */
+	uint32_t delay_in_ms:8;	/* disconnect delay in ms */
+	uint32_t cerror_outgoing:8;	/* error code during connection */
+	uint32_t cerror_incoming:8;	/* error code during connection */
+};
+
+static inline int
+ips_epaddr_connected(struct ips_epaddr *ipsaddr)
+{
+	if (ipsaddr->rc_connected)
+		return 1;
+#ifdef RNDV_MOD
+	if (ipsaddr->rv_connected)
+		return 1;
+#endif
+	return 0;
+}
+
+/*
+ * ips_msgctl_t is per connection struct.
+ */
+struct ips_msgctl {
+	struct ips_epaddr master_epaddr; /* Master rail's epaddr */
+
+	struct ips_epaddr *ipsaddr_next; /* next ipsaddr to send packet */
+	uint16_t mq_send_seqnum;	 /* next sending message sequence */
+	uint16_t mq_recv_seqnum;	 /* next receiving message sequence */
+	uint16_t am_send_seqnum;	 /* next sending message sequence */
+	uint16_t am_recv_seqnum;	 /* next receiving message sequence */
+	uint16_t ipsaddr_count;		 /* number of ipsaddr to use */
+	uint16_t outoforder_count;	 /* number of outoforder messages */
+};
+
+static inline __attribute__ ((unused))
+void IPS_MCTXT_APPEND(ips_epaddr_t *head, ips_epaddr_t *node)
+{
+	ips_epaddr_t *cur;
+
+	/* The new node is inserted before head. */
+	node->next = head;
+
+	/* Circle around the linked list to head's predecessor and update. */
+	for (cur = head; cur->next != head; cur = cur->next);
+	cur->next = node;
+}
+
+static inline __attribute__ ((unused))
+void IPS_MCTXT_REMOVE(ips_epaddr_t *node)
+{
+	ips_epaddr_t *cur;
+
+	/* Circle around to node's predecessor and update. */
+	for (cur = node; cur->next != node; cur = cur->next);
+	cur->next = node->next;
+	node->next = node;
+}
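+/* Both helpers assume a circular singly-linked list: a detached node points
+ * to itself (node->next == node), IPS_MCTXT_REMOVE leaves the removed node in
+ * that self-linked state, and each helper walks once around the ring (O(n))
+ * to find the predecessor. */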
+
+/*
+ * Initialize a flow, setting its attributes. Selects the path the flow will
+ * use as well as calculates the flow's fragment size defined as:
+ * - min(remote EP MTU, selected path's MTU, local EP MTU) for DMA sends
+ * - min(remote EP MTU, selected path's MTU, local EP MTU, local PIO bufsize) for PIO sends
+ */
+void MOCKABLE(ips_flow_init)(struct ips_flow *flow, struct ips_proto *proto,
+		   ips_epaddr_t *ipsaddr, psm_transfer_type_t transfer_type,
+		   psm_protocol_type_t protocol, ips_path_type_t path_type,
+		   uint32_t flow_index);
+MOCK_DCL_EPILOGUE(ips_flow_init);
+
+void ips_scb_prepare_flow(ips_scb_t *scb, ips_epaddr_t *ipsaddr,
+			  struct ips_flow *flow);
+
+void MOCKABLE(ips_proto_flow_enqueue)(struct ips_flow *flow, ips_scb_t *scb);
+MOCK_DCL_EPILOGUE(ips_proto_flow_enqueue);
+
+psm2_error_t ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed);
+
+/* Wrapper for enqueue + flush */
+psm2_error_t ips_proto_scb_pio_send(struct ips_flow *flow, ips_scb_t *scb);
+
+
+/*
+ * Protocol receive processing
+ *
+ */
+/* Error handling for unknown packet, packet is unknown when epid doesn't match
+ * in epstate table */
+int ips_proto_process_unknown(const struct ips_recvhdrq_event *rcv_ev);
+/* Exposed for fastpath only */
+int ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev);
+
+/*
+ * Protocol exception handling and frame dumps
+ */
+void ips_proto_show_header(struct ips_message_header *p_hdr, char *msg);
+void ips_proto_dump_frame(void *frame, int length, char *message);
+void ips_proto_dump_data(void *data, int data_length);
+void ips_proto_dump_eager(uint32_t *curr_rcv_hdr);
+
+/*
+ * Checksum of ips packets
+ */
+uint32_t ips_crc_calculate(uint32_t len, uint8_t *data, uint32_t crc);
+
+/*
+ * Matched-Queue processing and sends
+ */
+psm2_error_t ips_proto_mq_push_cts_req(struct ips_proto *proto,
+				      psm2_mq_req_t req);
+psm2_error_t ips_proto_mq_push_rts_data(struct ips_proto *proto,
+				       psm2_mq_req_t req);
+int ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_mq_handle_tiny(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_mq_handle_short(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev);
+void ips_proto_mq_handle_outoforder_queue(psm2_mq_t mq, ips_msgctl_t *msgctl);
+int ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev);
+
+psm2_error_t ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t epaddr,
+			      uint32_t flags, psm2_mq_tag_t *tag,
+			      const void *ubuf, uint32_t len);
+
+psm2_error_t ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr,
+				uint32_t flags_user, uint32_t flags_internal,
+				psm2_mq_tag_t *tag, const void *ubuf, uint32_t len,
+				void *context, psm2_mq_req_t *req_o);
+
+psm2_error_t ips_proto_msg_size_thresh_query (enum psm2_info_query_thresh_et,
+					      uint32_t *out, psm2_mq_t mq, psm2_epaddr_t);
+
+int ips_proto_am(struct ips_recvhdrq_event *rcv_ev);
+
+/*
+ * IPS packet service routine table.
+ */
+typedef int (*ips_packet_service_fn_t)(struct ips_recvhdrq_event *rcv_ev);
+extern ips_packet_service_fn_t
+	ips_packet_service_routine[OPCODE_FUTURE_FROM-OPCODE_RESERVED];
+
+psm2_error_t ips_ibta_link_updown_event(struct ips_proto *proto);
+
+psm2_error_t
+MOCKABLE(ips_ibta_init)(struct ips_proto *proto);
+MOCK_DCL_EPILOGUE(ips_ibta_init);
+
+psm2_error_t ips_ibta_fini(struct ips_proto *proto);
+
+
+#ifdef PSM_CUDA
+PSMI_ALWAYS_INLINE(
+uint32_t ips_cuda_next_window(uint32_t max_window, uint32_t offset,
+			      uint32_t len))
+{
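+	/* Returns the bytes remaining after 'offset', capped at 'max_window';
+	 * e.g. (illustrative) len=300000, offset=262144, max_window=131072
+	 * yields 37856. */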
+	uint32_t window_len;
+	window_len = len - offset;
+	if (window_len >= max_window)
+		window_len = max_window;
+	return window_len;
+}
+#endif
+
+
+#endif /* _IPS_PROTO_H */
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_am.c b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_am.c
new file mode 100644
index 0000000000000000000000000000000000000000..995c6862a77d4c2431c03f1a22b2c2971a061361
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_am.c
@@ -0,0 +1,618 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "psm2_am.h"
+#include "psm_am_internal.h"
+#include "psm_mq_internal.h"
+#include "ips_proto.h"
+#include "ips_expected_proto.h"
+#include "ips_proto_help.h"
+
+struct ips_am_token {
+	struct psmi_am_token tok;
+
+	/* ptl-specific token stuff */
+	struct ips_epaddr *epaddr_rail;
+	struct ips_proto_am *proto_am;
+};
+
+struct ips_am_message {
+	struct ips_message_header p_hdr;
+	struct ips_am_message *next;
+	struct ips_epaddr *ipsaddr;
+	struct ips_proto_am *proto_am;
+	uint64_t *payload;
+	uint32_t paylen;
+	uint16_t seqnum;
+};
+
+/* These variables are shared for all packet flows in a PSM process; they are
+ * shared across multiple rails.  There is no single AM object to hang these
+ * off of, so they are declared here as globals. */
+static struct {
+	struct ips_am_message head;
+	struct ips_am_message *tail;
+} ips_am_outoforder_q;
+
+static mpool_t ips_am_msg_pool;
+
+/* This calculation ensures that the number of reply slots is always at least
+ * twice the number of request slots, plus one. This is optimal: the bare
+ * minimum required is only twice as many, but that configuration is much
+ * slower. */
+#define calc_optimal_num_reply_slots(nslots) (((nslots)*2 / 3) + 1)
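+/* e.g. (illustrative) num_send_slots == 16: 16*2/3 + 1 == 11 reply slots,
+ * leaving 16 - 11 == 5 request slots; 11 == 2*5 + 1 meets the constraint
+ * exactly. */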
+
+psm2_error_t
+MOCKABLE(ips_proto_am_init)(struct ips_proto *proto,
+		  int num_send_slots,
+		  uint32_t imm_size,
+		  struct ips_proto_am *proto_am)
+{
+	psm2_error_t err = PSM2_OK;
+	int send_buf_size = proto->epinfo.ep_mtu;
+	int num_rep_slots = calc_optimal_num_reply_slots(num_send_slots);
+	int num_req_slots = num_send_slots - num_rep_slots;
+
+	proto_am->proto = proto;
+
+	/* In a node pair, the number of reply send buffers on at least one of
+	 * the nodes must be at least double the number (optimal: double + 1) of
+	 * send descriptors on the other node. While this constraint applies
+	 * only to the reply send buffers, allowing the caller to tune only the
+	 * number of request send buffers would be awkward, as they have no
+	 * knowledge of the subdivision of the memory into separate mempools for
+	 * requests and replies. It's an internal concern at this point. */
+	if ((err = ips_scbctrl_init(&proto->ep->context,
+				    num_req_slots,
+				    num_req_slots,
+				    imm_size,
+				    send_buf_size,
+				    NULL,
+				    NULL,
+				    &proto_am->scbc_request)))
+		goto fail;
+
+	if ((err = ips_scbctrl_init(&proto->ep->context,
+				    num_rep_slots,
+				    num_rep_slots,
+				    imm_size,
+				    send_buf_size,
+				    NULL,
+				    NULL,
+				    &proto_am->scbc_reply)))
+		goto fail;
+
+	if (ips_am_msg_pool == NULL) {
+		union psmi_envvar_val max_msgs;
+
+		ips_am_outoforder_q.head.next = NULL;
+		ips_am_outoforder_q.tail = &ips_am_outoforder_q.head;
+
+		psmi_getenv("PSM3_AM_MAX_OOO_MSGS",
+			"Maximum number of OOO Active Messages to queue before dropping.",
+			PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+			(union psmi_envvar_val)1024, &max_msgs);
+
+		ips_am_msg_pool = psmi_mpool_create(
+				sizeof(struct ips_am_message),
+				32, max_msgs.e_uint, 0, UNDEFINED, NULL, NULL);
+	}
+fail:
+	return err;
+}
+MOCK_DEF_EPILOGUE(ips_proto_am_init);
+
+psm2_error_t ips_proto_am_fini(struct ips_proto_am *proto_am)
+{
+	ips_scbctrl_fini(&proto_am->scbc_request);
+	ips_scbctrl_fini(&proto_am->scbc_reply);
+	if (ips_am_msg_pool != NULL) {
+		psmi_mpool_destroy(ips_am_msg_pool);
+		ips_am_msg_pool = NULL;
+	}
+
+	return PSM2_OK;
+}
+
+/* Fill in AM capabilities parameters */
+psm2_error_t
+ips_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters)
+{
+	int max_nargs = min(1 << IPS_AM_HDR_NARGS_BITS, PSMI_AM_MAX_ARGS);
+	int max_payload =
+		ep->mtu -
+		((max_nargs - IPS_AM_HDR_NARGS) * sizeof(psm2_amarg_t));
+
+	if (parameters == NULL) {
+		return PSM2_PARAM_ERR;
+	}
+
+	parameters->max_handlers = 1 << IPS_AM_HDR_HIDX_BITS;
+	parameters->max_nargs = max_nargs;
+	parameters->max_request_short = max_payload;
+	parameters->max_reply_short = max_payload;
+
+	return PSM2_OK;
+}
+
+static
+psm2_error_t
+am_short_reqrep(ips_scb_t *scb, struct ips_epaddr *ipsaddr,
+		psm2_amarg_t *args, int nargs, uint8_t opcode,
+		void *src, size_t len, int flags, int pad_bytes)
+{
+	int i, hdr_qwords = IPS_AM_HDR_NARGS;
+	struct ips_proto *proto = ((psm2_epaddr_t)ipsaddr)->proto;
+
+	psmi_assert(proto->msgflowid < EP_FLOW_LAST);
+
+	struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid];
+
+	/* There are a limited number of bits for nargs in the header, making
+	   overflow very easy.  Make sure the values match. */
+	psmi_assert(nargs == scb->ips_lrh.amhdr_nargs);
+
+	_HFI_VDBG("%s src=%p len=%d, nargs=%d\n",
+		  ((opcode == OPCODE_AM_REQUEST) ||
+		   (opcode == OPCODE_AM_REQUEST_NOREPLY)) ? "req" : "rep",
+		  src, (int)len, nargs);
+
+	if (nargs == 1) {	/* fastpath */
+		scb->ips_lrh.data[0].u64w0 = args[0].u64w0;
+		hdr_qwords--;
+	} else if (nargs > 1) {
+		/* Easily unrollable but leave as is in case we can increase
+		 * qwords on the chip in the near future */
+		for (i = 0; i < IPS_AM_HDR_NARGS; i++, hdr_qwords--)
+			scb->ips_lrh.data[i].u64w0 = args[i].u64w0;
+
+		if (nargs > IPS_AM_HDR_NARGS) {
+			/* Slow case -- we don't have iovec and not enough
+			 * space in the message header, so we have to copy the
+			 * user's arguments even if the payload is marked ASYNC
+			 */
+			uintptr_t bufp = (uintptr_t) ips_scb_buffer(scb);
+			size_t arg_payload_len =
+			    sizeof(psm2_amarg_t) * (nargs - IPS_AM_HDR_NARGS);
+
+			psmi_mq_mtucpy((void *)bufp,
+				       &args[IPS_AM_HDR_NARGS],
+				       arg_payload_len);
+			bufp += arg_payload_len;
+			scb->payload_size = arg_payload_len;
+
+			if (src != NULL && len > 0) {
+				psmi_mq_mtucpy((void *)bufp, src, len);
+				scb->payload_size += len;
+			}
+
+			psmi_assert(pad_bytes < (1 << IPS_AM_HDR_LEN_BITS));
+			scb->payload_size += pad_bytes;
+			scb->ips_lrh.amhdr_len = pad_bytes;
+			goto send_scb;
+		}
+	}
+
+	if (len == 0) {
+		scb->payload_size = 0;
+		scb->ips_lrh.amhdr_len = 0;
+	} else if (len <= (hdr_qwords << 3)) {
+		/* Inline the payload into the header. */
+		/* This path CANNOT handle length = 0 due to limited space
+		   in the header.  If IPS_SEND_FLAG_AMISTINY is set, an
+		   amhdr_len value of 0 means a full payload, i.e.
+		   1 << IPS_AM_HDR_LEN_BITS bytes of packed payload. */
+		psmi_assert(len > 0);
+
+		psmi_mq_mtucpy(&scb->ips_lrh.
+			       data[IPS_AM_HDR_NARGS - hdr_qwords], src, len);
+		scb->payload_size = 0;
+		psmi_assert(len <= (1 << IPS_AM_HDR_LEN_BITS));
+		scb->ips_lrh.amhdr_len = len & ((1 << IPS_AM_HDR_LEN_BITS) - 1);
+		scb->scb_flags |= IPS_SEND_FLAG_AMISTINY;
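+		/* Illustration: with IPS_AM_HDR_LEN_BITS == 4, a full 16-byte
+		 * tiny payload encodes amhdr_len as 16 & 0xf == 0; the receiver
+		 * maps 0 back to 16 when IPS_SEND_FLAG_AMISTINY is set (see
+		 * ips_am_run_handler). */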
+	} else { /* Whatever's left requires a separate payload */
+		if (ips_scb_buffer(scb) == NULL) /* Just attach the buffer */
+			ips_scb_buffer(scb) = src;
+		else /* May need to re-xmit user data, keep it around */
+			psmi_mq_mtucpy(ips_scb_buffer(scb), src, len);
+
+		psmi_assert(pad_bytes < (1 << IPS_AM_HDR_LEN_BITS));
+		scb->payload_size = len + pad_bytes;
+		scb->ips_lrh.amhdr_len = pad_bytes;
+	}
+
+send_scb:
+	ips_scb_opcode(scb) = opcode;
+	scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->am_send_seqnum++;
+	ips_proto_flow_enqueue(flow, scb);
+	flow->flush(flow, NULL);
+
+	return PSM2_OK;
+}
+
+static inline int
+calculate_pad_bytes(size_t len)
+{
+	/* Align to dword (4 bytes) */
+	size_t dword_aligned_len = (len + 3) & ~3;
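+	/* e.g. len = 13 -> dword_aligned_len = 16 -> 3 pad bytes;
+	 * len = 12 is already aligned -> 0 pad bytes */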
+	return dword_aligned_len - len;
+}
+
+static inline
+void
+ips_am_scb_init(ips_scb_t *scb, uint8_t handler, int nargs,
+		int pad_bytes,
+		psm2_am_completion_fn_t completion_fn, void *completion_ctxt)
+{
+	psmi_assert(pad_bytes < (1 << IPS_AM_HDR_LEN_BITS));
+
+	scb->completion_am = completion_fn;
+	scb->cb_param = completion_ctxt;
+	scb->ips_lrh.amhdr_hidx = handler;
+	scb->ips_lrh.amhdr_len = pad_bytes;
+	scb->ips_lrh.amhdr_nargs = nargs;
+	scb->ips_lrh.flags = 0;
+	if (completion_fn)
+		scb->scb_flags |= IPS_SEND_FLAG_ACKREQ;
+	return;
+}
+
+psm2_error_t
+ips_am_short_request(psm2_epaddr_t epaddr,
+		     psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		     void *src, size_t len, int flags,
+		     psm2_am_completion_fn_t completion_fn,
+		     void *completion_ctxt)
+{
+	struct ips_proto_am *proto_am = &epaddr->proto->proto_am;
+	psm2_error_t err;
+	ips_scb_t *scb;
+	ips_epaddr_t *ipsaddr;
+	int pad_bytes = calculate_pad_bytes(len);
+	int payload_sz = (nargs << 3);
+
+	if_pt(!(flags & PSM2_AM_FLAG_ASYNC))
+	    payload_sz += len;
+
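+	/* With PSM2_AM_FLAG_ASYNC the payload is not copied into the scb
+	 * (am_short_reqrep attaches the user buffer instead, except on the
+	 * overflow-args slow path), so len is excluded from the sizing. */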
+	if (payload_sz > (IPS_AM_HDR_NARGS << 3)) {
+		/* Payload can't fit in header, allocate buffer to carry data */
+		int arg_sz = (nargs > IPS_AM_HDR_NARGS) ?
+		    ((nargs - IPS_AM_HDR_NARGS) << 3) : 0;
+
+		/* len + pad_bytes + overflow_args */
+		PSMI_BLOCKUNTIL(epaddr->ptlctl->ep,
+				err,
+				((scb = ips_scbctrl_alloc(
+				      &proto_am->scbc_request,
+				      1,
+				      len + pad_bytes + arg_sz,
+				      IPS_SCB_FLAG_ADD_BUFFER)) != NULL));
+	} else {
+		PSMI_BLOCKUNTIL(epaddr->ptlctl->ep,
+				err,
+				((scb = ips_scbctrl_alloc_tiny(
+				      &proto_am->scbc_request)) != NULL));
+	}
+
+	psmi_assert_always(scb != NULL);
+	ips_am_scb_init(scb, handler, nargs, pad_bytes,
+			completion_fn, completion_ctxt);
+
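+	/* Large payloads are round-robined across rails via ipsaddr_next;
+	 * smaller ones stay on the addressed rail. */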
+	if (payload_sz >= epaddr->proto->multirail_thresh_load_balance) {
+		/* Select the next ipsaddr for multi-rail */
+		ipsaddr = ((ips_epaddr_t *)epaddr)->msgctl->ipsaddr_next;
+		ipsaddr->msgctl->ipsaddr_next = ipsaddr->next;
+	} else {
+		ipsaddr = (ips_epaddr_t *)epaddr;
+	}
+
+	return am_short_reqrep(scb, ipsaddr, args,
+			       nargs,
+			       (flags & PSM2_AM_FLAG_NOREPLY) ?
+			       OPCODE_AM_REQUEST_NOREPLY : OPCODE_AM_REQUEST,
+			       src, len, flags, pad_bytes);
+}
+
+psm2_error_t
+ips_am_short_reply(psm2_am_token_t tok,
+		   psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		   void *src, size_t len, int flags,
+		   psm2_am_completion_fn_t completion_fn, void *completion_ctxt)
+{
+	struct ips_am_token *token = (struct ips_am_token *)tok;
+	struct ips_proto_am *proto_am = token->proto_am;
+	struct ips_epaddr *ipsaddr = token->epaddr_rail;
+	int pad_bytes = calculate_pad_bytes(len);
+	int scb_flags = 0;
+	ips_scb_t *scb;
+
+	if (!token->tok.can_reply) {
+		_HFI_ERROR("Invalid AM reply for request!");
+		return PSM2_AM_INVALID_REPLY;
+	}
+
+	psmi_assert(ips_scbctrl_avail(&proto_am->scbc_reply));
+
+	if ((nargs << 3) + len <= (IPS_AM_HDR_NARGS << 3)) {
+		scb = ips_scbctrl_alloc_tiny(&proto_am->scbc_reply);
+	} else {
+		int payload_sz = (nargs << 3);
+
+		payload_sz += (flags & PSM2_AM_FLAG_ASYNC) ?
+			      0 : (len + pad_bytes);
+		scb_flags |= (payload_sz > (IPS_AM_HDR_NARGS << 3)) ?
+		    IPS_SCB_FLAG_ADD_BUFFER : 0;
+
+		scb =
+		    ips_scbctrl_alloc(&proto_am->scbc_reply, 1, payload_sz,
+				      scb_flags);
+	}
+
+	psmi_assert_always(scb != NULL);
+	ips_am_scb_init(scb, handler, nargs, pad_bytes,
+			completion_fn, completion_ctxt);
+	am_short_reqrep(scb, ipsaddr, args, nargs, OPCODE_AM_REPLY,
+			src, len, flags, pad_bytes);
+	return PSM2_OK;
+}
+
+/* Prepares and runs a handler from a receive event. */
+static int
+ips_am_run_handler(const struct ips_message_header *p_hdr,
+		struct ips_epaddr *ipsaddr, struct ips_proto_am *proto_am,
+		uint64_t *payload,
+		uint32_t paylen)
+{
+	struct ips_am_token token;
+	int nargs = p_hdr->amhdr_nargs;
+	int ret;
+	struct psm2_ep_am_handle_entry *hentry;
+	psm2_amarg_t *args = (psm2_amarg_t *)p_hdr->data;
+
+	token.tok.flags = p_hdr->flags;
+	token.tok.epaddr_incoming = (psm2_epaddr_t)&ipsaddr->msgctl->master_epaddr;
+	token.tok.can_reply =
+		(_get_proto_hfi_opcode(p_hdr) == OPCODE_AM_REQUEST);
+	token.epaddr_rail = ipsaddr;
+	token.proto_am = proto_am;
+
+	if (token.tok.flags & IPS_SEND_FLAG_AMISTINY) {
+		/* Payload is packed into header after args */
+		payload = (uint64_t *)&p_hdr->data[nargs].u64;
+		paylen = p_hdr->amhdr_len;
+		/* Interpret amhdr_len == 0 as 16 bytes of payload */
+		if (paylen == 0)
+			paylen = 1 << IPS_AM_HDR_LEN_BITS;
+	} else {
+		if (nargs > IPS_AM_HDR_NARGS) {
+			/* Args are split across header and payload */
+			int payload_args_len =
+				(nargs - IPS_AM_HDR_NARGS) *
+				sizeof(psm2_amarg_t);
+
+			args = alloca(PSMI_AM_MAX_ARGS * sizeof(psm2_amarg_t));
+
+			args[0].u64 = p_hdr->data[0].u64;
+			args[1].u64 = p_hdr->data[1].u64;
+
+			memcpy(&args[2], payload, payload_args_len);
+
+			payload += nargs - IPS_AM_HDR_NARGS;
+			paylen -= payload_args_len;
+		}
+
+		/* Subtract off padding bytes (dword padding) for non-TINY. */
+		paylen -= p_hdr->amhdr_len;
+	}
+
+	hentry = psm_am_get_handler_function(proto_am->proto->ep,
+			p_hdr->amhdr_hidx);
+
+	/* Note: a guard for hentry != NULL is not needed here because a
+	 * psmi_assert_always() at initialization assures the entry is
+	 * non-NULL. */
+
+	if (likely(hentry->version == PSM2_AM_HANDLER_V2)) {
+		psm2_am_handler_2_fn_t hfn2 =
+				(psm2_am_handler_2_fn_t)hentry->hfn;
+		ret = hfn2(&token, args, nargs, payload, paylen, hentry->hctx);
+	} else {
+		psm2_am_handler_fn_t hfn1 =
+				(psm2_am_handler_fn_t)hentry->hfn;
+		ret = hfn1(&token, args, nargs, payload, paylen);
+	}
+
+	return ret;
+}
+
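+/* Walk the out-of-order queue and deliver every queued message whose
+ * sequence number now matches its peer's expected am_recv_seqnum.  The
+ * dummy head element avoids a special case when unlinking the first
+ * entry. */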
+static int
+ips_proto_am_handle_outoforder_queue()
+{
+	struct ips_am_message *msg, *prev;
+	int ret = IPS_RECVHDRQ_CONTINUE;
+
+	prev = &ips_am_outoforder_q.head;
+	msg = ips_am_outoforder_q.head.next;
+
+	while (msg != NULL) {
+		struct ips_epaddr *ipsaddr = msg->ipsaddr;
+		if (ipsaddr->msgctl->am_recv_seqnum != msg->seqnum) {
+			prev = msg;
+			msg = msg->next;
+			continue;
+		}
+
+		ipsaddr->msgctl->am_recv_seqnum++;
+
+		if (ips_am_run_handler(&msg->p_hdr,
+					ipsaddr, msg->proto_am,
+					msg->payload, msg->paylen))
+			ret = IPS_RECVHDRQ_BREAK;
+
+		prev->next = msg->next;
+		if (prev->next == NULL)
+			ips_am_outoforder_q.tail = prev;
+
+		psmi_mq_sysbuf_free(msg->proto_am->proto->mq, msg->payload);
+		psmi_mpool_put(msg);
+
+		msg = prev->next;
+	}
+
+	return ret;
+}
+
+static void
+ips_proto_am_queue_msg(struct ips_am_message *msg)
+{
+	msg->next = NULL;
+	ips_am_outoforder_q.tail->next = msg;
+	ips_am_outoforder_q.tail = msg;
+}
+
+int ips_proto_am(struct ips_recvhdrq_event *rcv_ev)
+{
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	struct ips_epaddr *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_proto_am *proto_am = &rcv_ev->proto->proto_am;
+	ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr);
+	struct ips_flow *flow;
+	struct ips_am_message *msg = NULL;
+	int ret = IPS_RECVHDRQ_CONTINUE;
+	enum ips_msg_order msgorder;
+
+	psmi_assert(flowid < EP_FLOW_LAST);
+	flow = &ipsaddr->flows[flowid];
+	/*
+	 * Based on the AM request/reply traffic pattern, if we don't have a
+	 * reply scb slot then we can't process the request packet; we just
+	 * silently drop it, since processing it would lead to deadlock.  Note:
+	 * ips_proto_is_expected_or_nak() cannot be called in this case.
+	 */
+	if (_get_proto_hfi_opcode(p_hdr) == OPCODE_AM_REQUEST &&
+	    !ips_scbctrl_avail(&proto_am->scbc_reply))
+		return IPS_RECVHDRQ_CONTINUE;
+
+	if (!ips_proto_is_expected_or_nak(rcv_ev))
+		return IPS_RECVHDRQ_CONTINUE;
+
+	uint16_t send_msgseq =
+	    __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK;
+	msgorder = ips_proto_check_msg_order(ipsaddr, flow, send_msgseq,
+			&ipsaddr->msgctl->am_recv_seqnum);
+
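+	/* FUTURE: the packet stays in the receive queue, so ask the caller to
+	 * revisit it later.  FUTURE_RECV: copy the packet into a sysbuf and
+	 * queue it until the sequence gap is filled. */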
+	if (msgorder == IPS_MSG_ORDER_FUTURE)
+		return IPS_RECVHDRQ_REVISIT;
+	else if (msgorder == IPS_MSG_ORDER_FUTURE_RECV) {
+		uint64_t *msg_payload;
+		uint64_t *payload = ips_recvhdrq_event_payload(rcv_ev);
+		uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev);
+
+		psmi_assert(paylen == 0 || payload);
+		msg = psmi_mpool_get(ips_am_msg_pool);
+		if (unlikely(msg == NULL)) {
+			/* Out of memory, drop the packet. */
+			flow->recv_seq_num.psn_num =
+				(flow->recv_seq_num.psn_num - 1) &
+				rcv_ev->proto->psn_mask;
+			return IPS_RECVHDRQ_BREAK;
+		}
+		msg_payload = psmi_mq_sysbuf_alloc(
+				proto_am->proto->mq,
+				ips_recvhdrq_event_paylen(rcv_ev));
+		if (unlikely(msg_payload == NULL)) {
+			/* Out of memory, drop the packet. */
+			flow->recv_seq_num.psn_num =
+				(flow->recv_seq_num.psn_num - 1) &
+				rcv_ev->proto->psn_mask;
+			psmi_mpool_put(msg);
+			return IPS_RECVHDRQ_BREAK;
+		}
+
+		memcpy(&msg->p_hdr, p_hdr, sizeof(struct ips_message_header));
+		memcpy(msg_payload, payload, paylen);
+
+		msg->payload = msg_payload;
+		msg->ipsaddr = ipsaddr;
+		msg->proto_am = proto_am;
+		msg->paylen = paylen;
+		msg->seqnum =
+			__le32_to_cpu(p_hdr->khdr.kdeth0) &
+			HFI_KHDR_MSGSEQ_MASK;
+
+		ips_proto_am_queue_msg(msg);
+	} else if ((msgorder == IPS_MSG_ORDER_EXPECTED) ||
+		   (msgorder == IPS_MSG_ORDER_EXPECTED_MATCH)) {
+		uint64_t *payload = ips_recvhdrq_event_payload(rcv_ev);
+		uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev);
+
+		psmi_assert(paylen == 0 || payload);
+		if (ips_am_run_handler(p_hdr, ipsaddr, proto_am,
+					payload, paylen))
+			ret = IPS_RECVHDRQ_BREAK;
+
+		ips_proto_am_handle_outoforder_queue();
+	}
+
+	/* Check whether the handler replied; if it didn't, ack the request */
+	if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+	    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+	ips_proto_process_ack(rcv_ev);
+	return ret;
+}
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_am.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_am.h
new file mode 100644
index 0000000000000000000000000000000000000000..3e0a27172056d40d7d41e253037064a96dbf08e2
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_am.h
@@ -0,0 +1,93 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PROTO_AM_H
+#define _IPS_PROTO_AM_H
+
+#include "psm_user.h"
+#include "ips_scb.h"
+
+struct ips_proto_am {
+	struct ips_proto *proto;	/* back pointer */
+	struct ips_scbctrl scbc_request;
+	struct ips_scbctrl scbc_reply;
+};
+
+psm2_error_t
+ips_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters);
+
+psm2_error_t
+ips_am_short_reply(psm2_am_token_t tok,
+		   psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		   void *src, size_t len, int flags,
+		   psm2_am_completion_fn_t completion_fn, void *completion_ctxt);
+
+psm2_error_t
+ips_am_short_request(psm2_epaddr_t epaddr,
+		     psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		     void *src, size_t len, int flags,
+		     psm2_am_completion_fn_t completion_fn,
+		     void *completion_ctxt);
+
+psm2_error_t
+MOCKABLE(ips_proto_am_init)(struct ips_proto *proto,
+             int num_send_slots,
+             uint32_t imm_size,
+             struct ips_proto_am *proto_am);
+MOCK_DCL_EPILOGUE(ips_proto_am_init);
+
+psm2_error_t ips_proto_am_fini(struct ips_proto_am *proto_am);
+
+#endif /* _IPS_PROTO_AM_H */
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_connect.c b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_connect.c
new file mode 100644
index 0000000000000000000000000000000000000000..fecb3405ea117b92e96a03000573caa72eca554d
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_connect.c
@@ -0,0 +1,1947 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "ips_proto.h"
+#include "psm_mq_internal.h"
+#include "ips_proto_internal.h"
+#ifdef RNDV_MOD
+#include "psm_rndv_mod.h"
+#endif
+
+/*
+ * Define the connection protocol version. This is the basic version; an
+ * optimized version will be added later for scalability.
+ * The version is kept in two bytes in this format: 0xMMmm, MM=major,
+ * mm=minor version.
+ */
+// a little paranoid, as a UD or UDP connect can't reach an STL100 PSM recv
+// context, but we don't worry about UDP vs UD since they can't reach each
+// other either
+#define IPS_CONNECT_VERNO	  0x0100 // 1.0
+
+struct ips_connect_hdr {
+	uint16_t connect_verno;	/* should be ver IPS_CONNECT_VERNO */
+	uint16_t psm_verno;	/* should be 2.0 */
+	uint32_t connidx;	/* ignore if 0xffffffff */
+	uint64_t epid;		/* epid of connector process */
+} PACK_SUFFIX;
+
+struct ips_connect_reqrep {
+	uint16_t connect_verno;	/* should be ver IPS_CONNECT_VERNO */
+	uint16_t psm_verno;	/* should be 2.0 */
+	uint32_t connidx;	/* ignore if 0xffffffff */
+	uint64_t epid;		/* epid of connector process */
+	/* above should be same as ips_connect_hdr */
+
+	// fields below specific to CONNECT_REQUEST/REPLY
+	uint16_t connect_result;	/* error code */
+	uint16_t sl;		/* service level for matching */
+	uint16_t mtu;		/* receive payload */
+	uint16_t job_pkey;	/* partition key for verification */
+
+	uint32_t runid_key;	/* one-time stamp connect key */
+	uint32_t initpsn;	/* initial psn for flow */
+
+	char hostname[128];	/* sender's hostname string */
+	// fields below added as part of IPS_CONNECT_VERNO 1.0
+	// used for rndv and user space RC QP connection (CONNECT_REQUEST/REPLY)
+	uint8_t rdmamode;	/* IPS_PROTOEXP_FLAG_RDMA_MASK portion of rdmamode */
+	uint8_t static_rate;	/* ibv_rate enum */
+	uint8_t reserved[6+16];	// 1st 6 bytes keep fields below 64b aligned
+	// fields below can be zero depending on rdmamode
+
+	// for rndv module connection establishment only set for RNDV_MOD
+	union ibv_gid gid; /* sender's gid */	// zero if no rndv mod RDMA
+	uint32_t rv_index; /* senders process index */ // zero if no rndv mod RDMA
+	uint32_t resv;	// alignment
+	// for user space RC QP connection establishment only set for USE_RC
+	struct psm_rc_qp_attr qp_attr;	// zero if no user space RC QPs
+	// 8 bytes of subnet and 8 bytes of epid may follow for each of up to
+	// PSMI_MAX_QPS in case this is a multi-rail run and/or multiple QPs
+	// are opened per NIC.
+} PACK_SUFFIX;
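+// Illustrative trailing layout for a 2-rail CONNECT_REQUEST/REPLY (built in
+// ips_proto_build_connect_message): the fixed struct is followed by one
+// (gid_hi, epid) uint64_t pair per additional rail:
+//   [struct ips_connect_reqrep][rail-2 gid_hi][rail-2 epid]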
+
+/* Startup protocol in PSM/IPS
+ *
+ * Start timer.
+ *
+ * For all nodes to connect to:
+ *   Grab connect lock
+ *   Look up epid in table
+ *      MATCH.
+ *         assert cstate_outgoing != CONNECT_WAITING (no re-entrancy)
+ *         If cstate_outgoing == CONNECT_DONE
+ *            return the already connected address.
+ *         else
+ *            assert cstate_outgoing == CONNECT_NONE
+ *            assert cstate_incoming == CONNECT_DONE
+ *            cstate_outgoing := CONNECT_WAITING
+ *            assert connidx_outgoing != UNKNOWN && connidx_incoming != UNKNOWN
+ *            req->connidx := epaddr->connidx_incoming
+ *            add to list of pending connect.
+ *      NO MATCH
+ *         allocate epaddr and put in table
+ *         cstate_outgoing := CONNECT_WAITING
+ *         cstate_incoming := CONNECT_NONE
+ *         connidx_outgoing := UNKNOWN
+ *         req->connidx := epaddr->connidx_incoming := NEW connidx integer
+ *         add to list of pending connect
+ *   Release connect lock
+ *
+ * expected_connect_count = ep->total_connect_count + num_to_connect
+ * while (expected_connect_count != ep->total_connect_count)
+ *    check for timeout
+ *    progress();
+ *
+ * For all connection requests received (within progress loop)
+ *   If uuid doesn't match, NAK the connect and skip request
+ *   Grab connect lock
+ *   Look up epid in table
+ *      MATCH
+ *	   if cstate_incoming == CONNECT_DONE
+ *	      req->connidx := epaddr->connidx_incoming
+ *            compose reply and send again (this is a dupe request).
+ *         else
+ *            assert cstate_incoming == CONNECT_NONE
+ *            assert cstate_outgoing == (CONNECT_WAITING | CONNECT_DONE)
+ *            cstate_incoming := CONNECT_DONE
+ *            epaddr->connidx_outgoing := req->connidx
+ *            req->connidx := epaddr->connidx_incoming
+ *      NO MATCH
+ *         allocate epaddr and put in table
+ *         cstate_incoming := CONNECT_DONE
+ *         epaddr->connidx_outgoing = req->connidx;
+ *         rep->connidx := epaddr->connidx_incoming := NEW connidx integer
+ *         compose connect reply and send
+ *   Release connect lock
+ *
+ * For all connection replies received:
+ *    If connect_result != 0, process error and skip.
+ *    assert cstate_outgoing == CONNECT_WAITING
+ *    if cstate_incoming == CONNECT_DONE
+ *       assert rep->connidx == epaddr->connidx_outgoing
+ *    else
+ *	 epaddr->connidx_outgoing := rep->connidx
+ *    cstate_outgoing := CONNECT_DONE
+ *    ep->total_connect_count ++
+ *
+ *   * Fill in a connection request:
+ *      1. Set connect protocol version and PSM versions
+ *      2. Set the uuid attached to current endpoint and add the job_pkey
+ *         the node wishes to communicate post-connect.
+ *      3. Set our mtu, bitwidth and endianness to detect inconsistencies
+ *
+ */
+
+static int
+ips_proto_build_connect_message(struct ips_proto *proto,
+	ips_epaddr_t *ipsaddr, uint8_t opcode, void *payload,
+	size_t max_paylen);
+
+#ifdef RNDV_MOD
+/* Returns 1 if connected (or if no rv connection is needed), 0 if not yet
+ * connected, and -1 with errno as status: EIO is a connection error, other
+ * values are more serious (invalid call, etc.)
+ */
+static int is_rv_connected(ips_epaddr_t *ipsaddr)
+{
+	int ret;
+
+	/* ! rv_conn means we don't need a rv connection, otherwise
+	 * return status of the connection
+	 */
+	if (! ipsaddr->rv_conn || ipsaddr->rv_connected)
+		return 1;
+	ret = __psm2_rv_connected(ipsaddr->rv_conn);
+	if (ret < 0 && errno != EIO) {
+		int save_errno = errno;
+		perror("can't query rv connection\n");
+		errno = save_errno;
+	}
+	ipsaddr->rv_connected = (1 == ret);
+	return ret;
+}
+#else // RNDV_MOD
+static inline int is_rv_connected(ips_epaddr_t *ipsaddr) { return 1; }
+#endif // RNDV_MOD
+
+/**
+ * Configure flows for an ipsaddr.
+ *
+ * @arg ipsaddr - the ipsaddr to configure the flows for
+ * @arg proto - the protocol used
+ *
+ * @pre proto's flags must be set
+ *
+ * Flows should be configured:
+ * - immediately upon creation of an ipsaddr
+ * - whenever a connection is established and the receiver's characteristics
+ *   (e.g. mtu) become known
+ */
+ustatic
+void
+ips_ipsaddr_configure_flows(struct ips_epaddr *ipsaddr, struct ips_proto *proto)
+{
+	/* PIO flow uses the normal priority path, to separate low
+	 * priority path for bulk sdma data packets
+	 */
+	ips_flow_init(&ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO], proto,
+		      ipsaddr, PSM_TRANSFER_PIO, PSM_PROTOCOL_GO_BACK_N,
+		      IPS_PATH_NORMAL_PRIORITY, EP_FLOW_GO_BACK_N_PIO);
+
+}
+
+/*
+ * Teardown any unnecessary timers that could still be active and assign NULL
+ * to pointers in flow structs. We do this mainly for PIO and DMA flows.
+ * TidFlow teardowns are conducted in ips_protoexp_fini()
+ */
+static
+void
+ips_flow_fini(struct ips_epaddr *ipsaddr, struct ips_proto *proto)
+{
+	struct ips_flow *flow;
+	int i;
+
+	for (i = 0; i < EP_FLOW_LAST-1; i++) {
+		flow = &ipsaddr->flows[i];
+
+		/* Cancel any stale flow->timers in flight */
+		if (flow->timer_ack) {
+			psmi_timer_cancel(proto->timerq, flow->timer_ack);
+			flow->timer_ack = NULL;
+		}
+
+		if (flow->timer_send) {
+			psmi_timer_cancel(proto->timerq, flow->timer_send);
+			flow->timer_send = NULL;
+		}
+
+		flow->flush = NULL;
+		flow->path = NULL;
+		flow->ipsaddr = NULL;
+	}
+}
+
+static
+psm2_epaddr_t
+ips_alloc_epaddr(struct ips_proto *proto, int master, psm2_epid_t epid,
+		 const char *hostname,
+		 unsigned long timeout, psm2_error_t *err_out);
+
+/*
+ * Given a connection request, set mtu, communication index and hdr length
+ * parameters.
+ *
+ * The most subtle parameter is the mtu.  When set as 'req->mtu', the mtu
+ * is our connecting peer's declared mtu (which may not be the same as our
+ * mtu).  The approach is to take the smaller of both mtus when communicating
+ * with that peer.  Also, when using pio, the size can be further restricted by
+ * the pio send buffer sizes (i.e. 4K IB MTU but only 2K PIO buffers).
+ */
+static
+psm2_error_t
+ips_ipsaddr_set_req_params(struct ips_proto *proto,
+			   ips_epaddr_t *ipsaddr,
+			   const struct ips_connect_reqrep *req,
+			   uint32_t paylen)
+{
+	psm2_ep_t ep;
+	psm2_epaddr_t epaddr;
+	psm2_error_t err = PSM2_OK;
+	int i, start, count;
+	uint64_t *data;
+	psmi_assert_always(req->mtu > 0);
+	// common_mtu will be further reduced by pr_mtu to set frag_size and RC mtu
+	uint16_t common_mtu = min(req->mtu, proto->epinfo.ep_mtu);
+	psmi_assert_always(req->static_rate > 0);
+	enum psm_ibv_rate common_rate = min_rate(req->static_rate,
+						 proto->epinfo.ep_link_rate);
+	int ptype, pidx;
+
+	ipsaddr->window_rv = proto->mq->hfi_base_window_rv;
+
+	/*
+	 * For static routes i.e. "none" path resolution update all paths to
+	 * have the same profile (mtu, sl etc.).
+	 *
+	 * For path record queries the epr_mtu and epr_sl are setup correctly
+	 * from the path itself.
+	 */
+	for (ptype = IPS_PATH_LOW_PRIORITY;
+	     ptype < IPS_PATH_MAX_PRIORITY; ptype++)
+		for (pidx = 0;
+		     pidx < ipsaddr->pathgrp->pg_num_paths[ptype]; pidx++) {
+			if (proto->ep->path_res_type == PSM2_PATH_RES_NONE) {
+				ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu =
+					common_mtu;
+				ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_static_rate =
+					common_rate;
+			} else {
+				ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu =
+				    min(common_mtu,
+					ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu);
+				ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_static_rate =
+				    min_rate(common_rate,
+					ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_static_rate);
+			}
+		}
+
+	/*
+	 * We've got updated mtu/path records, need to re-initialize the flows to take
+	 * into account _real_ (updated) remote endpoint characteristics
+	 */
+	ips_ipsaddr_configure_flows(ipsaddr, proto);
+
+	/*
+	 * Save peer's info.
+	 */
+	ipsaddr->connidx_outgoing = req->connidx;
+	ipsaddr->runid_key = req->runid_key;
+	/* ipsaddr->initpsn = req->initpsn; */
+
+	err =
+	    psmi_epid_set_hostname(psm2_epid_nid(((psm2_epaddr_t) ipsaddr)->epid),
+				   (char *)req->hostname, 0);
+	if (err)
+		return err;
+
+#ifdef RNDV_MOD
+	ipsaddr->remote_gid = req->gid;
+	ipsaddr->remote_rv_index = req->rv_index;
+	if (ipsaddr->rv_conn) {
+		psmi_assert(IPS_PROTOEXP_FLAG_KERNEL_QP(proto->ep->rdmamode));
+		psmi_assert(proto->ep->verbs_ep.rv);
+		if (!  __psm2_nonzero_gid(&req->gid)) {
+			_HFI_ERROR("mismatched PSM3_RDMA config, remote end not in mode 1\n");
+			return PSM2_INTERNAL_ERR;
+			// TBD - if we wanted to allow mismatched config to run in UD mode
+			//__psm2_rv_destroy_conn(ipsaddr->rv_conn);
+			//ipsaddr->rv_conn = NULL;
+		} else {
+			// both sides are ready, so we can start rv_connect now
+			if (! ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY]->connecting) {
+				char buf[80];
+				struct ib_user_path_rec path;
+				_HFI_MMDBG("rv_connect to: %s\n", __psm2_dump_gid(&ipsaddr->remote_gid, buf, sizeof(buf)));
+				// pg_path has negotiated pr_mtu and pr_static_rate
+				ips_path_rec_to_ib_user_path_rec(proto->ep,
+					ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY],
+ 					&ipsaddr->remote_gid, &path);
+				if (__psm2_rv_connect(ipsaddr->rv_conn, &path)) {
+					_HFI_ERROR("rv_connect failed: %s\n", strerror(errno));
+					return PSM2_INTERNAL_ERR;
+				}
+				ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY]->connecting = 1;
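+				// 'connecting' lives on the shared path record, so only the
+				// first ipsaddr to a given remote node issues rv_connect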
+			}
+		}
+	// } else if (__psm2_nonzero_gid(&req->gid)) {
+	//	 We could fail here, but we just let remote end decide
+	//	_HFI_ERROR("mismatched PSM3_RDMA config, remote end in mode 1\n");
+	//	return PSM2_INTERNAL_ERR;
+	}
+#endif // RNDV_MOD
+	if (ipsaddr->rc_qp) {
+		psmi_assert(IPS_PROTOEXP_FLAG_USER_RC_QP(proto->ep->rdmamode));
+#ifdef RNDV_MOD
+		psmi_assert(proto->ep->verbs_ep.rv
+					|| proto->ep->mr_cache_mode != MR_CACHE_MODE_KERNEL);
+#endif
+		if (! req->qp_attr.qpn) {
+			_HFI_ERROR("mismatched PSM3_RDMA config, remote end not in mode 2 or 3\n");
+			return PSM2_INTERNAL_ERR;
+			// TBD - if we wanted to allow mismatched config to run in UD mode
+			//rc_qp_destroy(ipsaddr->rc_qp);
+			//ipsaddr->rc_qp = NULL;
+		} else {
+			// we got a REQ or a REP, we can move to RTR
+			// if we are only doing RDMA, we don't need any buffers, but we need a
+			// pool object for RQ coalesce, so we create a pool with 0-size buffers
+			if (PSM2_OK != psm_verbs_alloc_recv_pool(proto->ep, ipsaddr->rc_qp, &ipsaddr->recv_pool,
+					min(proto->ep->hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION, ipsaddr->rc_qp_max_recv_wr),
+				  (proto->ep->rdmamode == IPS_PROTOEXP_FLAG_RDMA_USER)? 0
+					// want to end up with multiple of cache line (64)
+					// pr_mtu is negotiated max PSM payload, not including hdrs
+					// pr_mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU
+					// be conservative (+BUFFER_HEADROOM)
+					: ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY]->pr_mtu
+							+ MAX_PSM_HEADER + BUFFER_HEADROOM
+			)) {
+				_HFI_ERROR("failed to alloc RC recv buffers\n");
+				return PSM2_INTERNAL_ERR;
+			}
+
+			if (modify_rc_qp_to_init(proto->ep, ipsaddr->rc_qp)) {
+				_HFI_ERROR("qp_to_init failed\n");
+				return PSM2_INTERNAL_ERR;
+			}
+			if (PSM2_OK != __psm2_ep_verbs_prepost_recv(&ipsaddr->recv_pool)) {
+				_HFI_ERROR("prepost failed\n");
+				return PSM2_INTERNAL_ERR;
+			}
+			// RC QP MTU will be set to min of req->qp_attr and pr_mtu
+			// TBD - we already factored in req vs pr to update pr no need
+			// for modify_cq_qp_to_rtr to repeat it
+			if (modify_rc_qp_to_rtr(proto->ep, ipsaddr->rc_qp, &req->qp_attr,
+					ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY], //TBD path_rec
+					req->initpsn)) {
+				_HFI_ERROR("qp_to_rtr failed\n");
+				return PSM2_INTERNAL_ERR;
+			}
+		}
+	// } else if (req->qp_attr.qpn) {
+	//	 We could fail here, but we just let remote end decide
+	//	_HFI_ERROR("mismatched PSM3_RDMA config, remote end in mode 2 or 3\n");
+	//	return PSM2_INTERNAL_ERR;
+	}
+
+	/*
+	 * Check whether there are other rails to set up.
+	 */
+	paylen -= sizeof(struct ips_connect_reqrep);
+	if (paylen == 0)
+		return PSM2_OK;
+
+	/*
+	 * Yes, the other rails' gid/epid data is attached.
+	 */
+	if (paylen % (sizeof(uint64_t) + sizeof(psm2_epid_t))) {
+		return PSM2_INTERNAL_ERR;
+	}
+	count = paylen / (sizeof(uint64_t) + sizeof(psm2_epid_t));
+	if (count > PSMI_MAX_QPS)
+		return PSM2_INTERNAL_ERR;
+
+	/*
+	 * Both sides are ordered, so just search from small to big.
+	 */
+	start = 0;
+	data = (uint64_t *) (req + 1);
+	ep = proto->ep->mctxt_next;
+
+	struct drand48_data drand48_data;
+	srand48_r((long int)(ipsaddr->epaddr.epid + proto->ep->epid), &drand48_data);
+
+	/* Loop over all slave endpoints */
+	while (ep != ep->mctxt_master) {
+		for (i = start; i < count; i++) {
+
+			/* There is a gid match, create the epaddr */
+			if (data[2 * i] == ep->gid_hi
+			// allow_routers only applied to ethernet epids (V4)
+			|| (psmi_allow_routers && PSMI_EPID_GET_EPID_VERSION(data[2*i+1]) == PSMI_EPID_V4)
+			 ) {
+
+				epaddr =
+					ips_alloc_epaddr(&((struct ptl_ips *)(ep->ptl_ips.ptl))->proto, 0,
+							 data[2 * i + 1], NULL,
+							  5000, &err);
+				if (epaddr == NULL)
+					return err;
+
+				/* link the ipsaddr */
+				IPS_MCTXT_APPEND(ipsaddr,
+						 (ips_epaddr_t *) epaddr);
+
+				/* Setup message control info to the same struct */
+				((ips_epaddr_t *) epaddr)->msgctl =
+				    ipsaddr->msgctl;
+				ipsaddr->msgctl->ipsaddr_count++;
+
+				/* randomize the rail to start traffic */
+				long int rnum;
+				lrand48_r(&drand48_data, &rnum);
+				if ((rnum % count) == i) {
+					ipsaddr->msgctl->ipsaddr_next =
+					    (ips_epaddr_t *) epaddr;
+				}
+
+				/* update the starting point,
+				 * all previous ones are not valid anymore */
+				start = i + 1;
+				break;
+			}
+		}
+
+		ep = ep->mctxt_next;
+	}
+
+	return PSM2_OK;
+}
+
+static psm2_error_t
+ips_proto_send_ctrl_message_request(struct ips_proto *proto,
+				    struct ips_flow *flow, uint8_t message_type,
+				    uint16_t *msg_queue_mask, uint64_t timeout)
+{
+	psm2_error_t err = PSM2_OK;
+	ips_scb_t ctrlscb;
+
+	/* msg header plus gid+epid for all rails plus checksum */
+	char payload[sizeof(struct ips_connect_reqrep) +
+		16*PSMI_MAX_QPS + PSM_CRC_SIZE_IN_BYTES];
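+	/* 16 bytes per rail: one uint64_t gid_hi plus one uint64_t epid */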
+	uint32_t paylen;
+
+	ctrlscb.scb_flags = 0;
+	paylen = ips_proto_build_connect_message(proto,
+		flow->ipsaddr, message_type, payload, sizeof(payload));
+	psmi_assert_always(paylen <= sizeof(payload));
+
+	do {
+		err = ips_proto_send_ctrl_message(flow, message_type,
+				msg_queue_mask, &ctrlscb, payload, paylen);
+		if (err == PSM2_OK) {
+			break;
+		}
+		if ((err = psmi_err_only(psmi_poll_internal(proto->ep, 1)))) {
+			break;
+		}
+	} while (get_cycles() < timeout);
+
+	return err;
+}
+
+static psm2_error_t
+ips_proto_send_ctrl_message_reply(struct ips_proto *proto,
+				    struct ips_flow *flow, uint8_t message_type,
+				    uint16_t *msg_queue_mask)
+{
+	/* This will try up to 100 times until the message is sent. The code
+	 * is persistent because dropping replies will lead to a lack of
+	 * overall progress on the connection/disconnection. We do not want
+	 * to poll from here, and we cannot afford a lengthy timeout, since
+	 * this is called from the receive path.
+	 */
+	psm2_error_t err = PSM2_OK;
+	int i;
+	ips_scb_t ctrlscb;
+	/* msg header plus gid+epid for all rails plus checksum */
+	char payload[sizeof(struct ips_connect_reqrep) +
+		16*PSMI_MAX_QPS + PSM_CRC_SIZE_IN_BYTES];
+	uint32_t paylen;
+
+	ctrlscb.scb_flags = 0;
+	paylen = ips_proto_build_connect_message(proto,
+		flow->ipsaddr, message_type, payload, sizeof(payload));
+	psmi_assert_always(paylen <= sizeof(payload));
+
+	for (i = 0; i < 100; i++) {
+		err = ips_proto_send_ctrl_message(flow, message_type,
+				msg_queue_mask, &ctrlscb, payload, paylen);
+		if (err == PSM2_OK) {
+			break;
+		}
+	}
+
+	return err;
+}
+
+static int
+ips_proto_build_connect_message(struct ips_proto *proto,
+	ips_epaddr_t *ipsaddr, uint8_t opcode, void *payload,
+	size_t max_paylen)
+{
+	struct ips_connect_hdr *hdr = (struct ips_connect_hdr *)payload;
+	struct ips_connect_reqrep *req = (struct ips_connect_reqrep *)payload;
+	uint32_t paylen = 0;
+
+	psmi_assert_always(proto != NULL);
+
+	hdr->connect_verno = IPS_CONNECT_VERNO;
+	hdr->psm_verno = PSMI_VERNO;
+	hdr->connidx = (uint32_t) ipsaddr->connidx_incoming;
+	hdr->epid = proto->ep->epid;
+
+	switch (opcode) {
+	case OPCODE_CONNECT_REPLY:
+	case OPCODE_CONNECT_REQUEST:
+		if (opcode == OPCODE_CONNECT_REQUEST) {
+			req->connect_result = PSM2_OK;
+			req->runid_key = proto->runid_key;
+		} else {
+			req->connect_result = ipsaddr->cerror_incoming;
+			req->runid_key = ipsaddr->runid_key;
+		}
+
+		req->sl = proto->epinfo.ep_sl;
+		// we keep this simple and send our local PSM payload (MTU)
+		// after connection negotiation of a common_mtu, the MTU will be
+		// further reduced by pr_mtu to set frag_size and RC QP mtu
+		req->mtu = proto->epinfo.ep_mtu;
+		req->job_pkey = proto->epinfo.ep_pkey;
+
+		strncpy(req->hostname, psmi_gethostname(),
+			sizeof(req->hostname) - 1);
+		req->hostname[sizeof(req->hostname) - 1] = '\0';
+		req->rdmamode = proto->ep->rdmamode & IPS_PROTOEXP_FLAG_RDMA_MASK;
+		req->static_rate = proto->epinfo.ep_link_rate;
+		memset(&req->reserved, 0, sizeof(req->reserved));
+#ifdef RNDV_MOD
+		// only supply gid if we want to use kernel rv
+		if (IPS_PROTOEXP_FLAG_KERNEL_QP(proto->ep->rdmamode)
+				&& proto->ep->verbs_ep.rv) {
+			req->gid = proto->ep->verbs_ep.lgid;
+			req->rv_index = proto->ep->verbs_ep.rv_index;
+		} else
+#endif
+		{
+			memset(&req->gid, 0, sizeof(req->gid));
+			req->rv_index = 0;
+		}
+		if (ipsaddr->rc_qp) {
+			psmi_assert(IPS_PROTOEXP_FLAG_USER_RC_QP(proto->ep->rdmamode));
+			req->initpsn = proto->runid_key;// pid, not ideal, better than const
+			req->qp_attr.qpn = ipsaddr->rc_qp->qp_num;
+			req->qp_attr.mtu = opa_mtu_int_to_enum(req->mtu);
+			req->qp_attr.srq = 0;
+			req->qp_attr.resv = 0;
+			req->qp_attr.target_ack_delay = 0; // TBD; - from local device
+			req->qp_attr.resv2 = 0;
+			req->qp_attr.responder_resources = 0;
+			req->qp_attr.initiator_depth = 0;
+			memset(&req->qp_attr.resv3, 0, sizeof(req->qp_attr.resv3));
+		} else
+			memset(&req->qp_attr, 0, sizeof(req->qp_attr));
+
+		paylen = sizeof(struct ips_connect_reqrep);
+
+		/* Attach all multi-context subnetids and epids. */
+		if (proto->ep->mctxt_master == proto->ep) {
+			psm2_ep_t ep = proto->ep->mctxt_next;
+			uint64_t *data = (uint64_t *) (req + 1);
+			while (ep != proto->ep) {
+				*data = ep->gid_hi;
+				paylen += sizeof(uint64_t);
+				data++;
+				*data = ep->epid;
+				paylen += sizeof(uint64_t);
+				data++;
+				psmi_assert_always(paylen <= max_paylen);
+				ep = ep->mctxt_next;
+			}
+		}
+
+		break;
+
+	case OPCODE_DISCONNECT_REQUEST:
+	case OPCODE_DISCONNECT_REPLY:
+		paylen = sizeof(struct ips_connect_hdr);
+		// TBD - this is redundant if transfer_frame uses UD for all
+		// control messages, but it also makes sure we stop using
+		// RC for any non-control messages (should be none) after disconnect
+		// use the UD QP's allocator and inline now and going forward
+		ipsaddr->use_allocator =  &proto->ep->verbs_ep.send_allocator;
+		ipsaddr->use_qp =  proto->ep->verbs_ep.qp;
+		ipsaddr->use_max_inline_data = proto->ep->verbs_ep.qp_cap.max_inline_data;
+		_HFI_MMDBG("RC discon\n");
+		// ultimately we will free ipsaddr
+		// so that will free RC QP and its buffers
+		break;
+
+	default:
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+				  "Unexpected/unhandled connection opcode 0x%x\n",
+				  opcode);
+		break;
+	}
+
+	return paylen;
+}
+
+void
+MOCKABLE(ips_flow_init)(struct ips_flow *flow, struct ips_proto *proto,
+	      ips_epaddr_t *ipsaddr, psm_transfer_type_t transfer_type,
+	      psm_protocol_type_t protocol, ips_path_type_t path_type,
+	      uint32_t flow_index)
+{
+	psmi_assert_always(protocol < PSM_PROTOCOL_LAST);
+	psmi_assert_always(flow_index < EP_FLOW_LAST);
+
+	SLIST_NEXT(flow, next) = NULL;
+	psmi_assert(transfer_type == PSM_TRANSFER_PIO);
+	flow->flush = ips_proto_flow_flush_pio;
+
+	flow->path =
+	    ips_select_path(proto, path_type, ipsaddr, ipsaddr->pathgrp);
+
+	/* Select the fragment size for this flow. Flow is the common
+	 * denominator between the local endpoint, the remote endpoint,
+	 * the path between those and whether it's a PIO or DMA send.
+	 * Hence, it "owns" the maximum transmission unit in its frag_size
+	 * member.
+	 */
+
+	/* min of local MTU and path MTU */
+	flow->frag_size = min(proto->epinfo.ep_mtu, flow->path->pr_mtu);
+	_HFI_CONNDBG("[ipsaddr=%p] UD flow->frag_size: %u = min("
+		"proto->epinfo.ep_mtu(%u), flow->path->pr_mtu(%u))\n",
+		ipsaddr, flow->frag_size, proto->epinfo.ep_mtu,
+		flow->path->pr_mtu);
+
+	flow->ipsaddr = ipsaddr;
+	flow->transfer = transfer_type;
+	flow->protocol = protocol;
+	flow->flowid = flow_index;
+	flow->xmit_seq_num.psn_val = 0;
+	flow->recv_seq_num.psn_val = 0;
+	flow->xmit_ack_num.psn_val = 0;
+	flow->flags = 0;
+	flow->credits = flow->cwin = proto->flow_credits;
+	flow->ack_interval = max((proto->flow_credits >> 2) - 1, 1);
+	flow->ack_counter = 0;
+#ifdef PSM_DEBUG
+	flow->scb_num_pending = 0;
+	flow->scb_num_unacked = 0;
+#endif
+
+	flow->timer_ack = NULL;
+	flow->timer_send = NULL;
+
+	STAILQ_INIT(&flow->scb_unacked);
+	SLIST_INIT(&flow->scb_pend);
+	return;
+}
+MOCK_DEF_EPILOGUE(ips_flow_init);
+
+static
+psm2_epaddr_t
+ips_alloc_epaddr(struct ips_proto *proto, int master, psm2_epid_t epid,
+		 const char *hostname,
+		 unsigned long timeout, psm2_error_t *err_out)
+{
+	psm2_error_t err = PSM2_OK;
+	psm2_epaddr_t epaddr;
+	ips_epaddr_t *ipsaddr;
+	ips_path_grp_t *pathgrp;
+	uint16_t lid;
+	uint16_t ip_hi;
+
+	/* The PSM/PTL-level epaddr, ips-level epaddr, and per-peer msgctl
+	 * structures are collocated in memory for performance reasons -- this is
+	 * why ips allocates memory for all three together.
+	 *
+	 * The PSM/PTL structure data is filled in upon successful ep connect in
+	 * ips_ptl_connect().
+	 */
+	if (master) {
+		struct ips_msgctl *msgctl;
+
+		/* Although an ips_msgctl is allocated here, it can be safely cast to
+		   both an ips_epaddr and a psm2_epaddr.  It is eventually freed as an
+		   ips_epaddr. */
+		msgctl =
+		    (struct ips_msgctl *)psmi_calloc(proto->ep,
+						     PER_PEER_ENDPOINT, 1,
+						     sizeof(struct ips_msgctl));
+		if (msgctl == NULL) {
+			*err_out = PSM2_NO_MEMORY;
+			return NULL;
+		}
+
+		ipsaddr = &msgctl->master_epaddr;
+		epaddr = (psm2_epaddr_t) ipsaddr;
+
+		_HFI_CONNDBG("ips_alloc_epaddr %p for EPID= 0x%"PRIx64" %s\n",
+				epaddr, epid, hostname?hostname:"unknown");
+		ipsaddr->msgctl = msgctl;
+
+		/* initialize items in ips_msgctl_t */
+		msgctl->ipsaddr_next = ipsaddr;
+		msgctl->mq_send_seqnum = 0;
+		msgctl->mq_recv_seqnum = 0;
+		msgctl->am_send_seqnum = 0;
+		msgctl->am_recv_seqnum = 0;
+		msgctl->ipsaddr_count = 1;
+		msgctl->outoforder_count = 0;
+	} else {
+		epaddr =
+		    (psm2_epaddr_t) psmi_calloc(proto->ep, PER_PEER_ENDPOINT, 1,
+					       sizeof(struct ips_epaddr));
+		if (!epaddr) {
+			*err_out = PSM2_NO_MEMORY;
+			return NULL;
+		}
+		ipsaddr = (ips_epaddr_t *) epaddr;
+	}
+
+	epaddr->ptlctl = ((struct ptl_ips *)(proto->ptl))->ctl;
+	epaddr->proto = proto;
+	epaddr->epid = epid;
+
+	/* IPS-level epaddr */
+	ipsaddr->next = ipsaddr;
+
+	ipsaddr->ctrl_msg_queued = 0;
+	ipsaddr->msg_toggle = 0;
+
+	ipsaddr->remote_qpn = PSMI_EPID_GET_CONTEXT(epid);
+
+	/* Get path record for <service, slid, dlid> tuple */
+	lid = PSMI_EPID_GET_LID(epid);
+	ip_hi = PSMI_EPID_GET_LID(epid) >> 16;
+	_HFI_CONNDBG("qpn=0x%x lid=0x%x ip_hi=0x%x\n", ipsaddr->remote_qpn, lid, ip_hi);
+
+	err = proto->ibta.get_path_rec(proto, proto->epinfo.ep_base_lid,
+				       __cpu_to_be16(lid),
+				       __cpu_to_be16(ip_hi),
+				       timeout,
+				       &pathgrp);
+	if (err != PSM2_OK) {
+		goto fail;
+	}
+	ipsaddr->pathgrp = pathgrp;
+
+	/* Setup high priority path index, control messages use the high
+	 * priority CONTROL path.
+	 */
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE)
+		ipsaddr->hpp_index = 0;
+	else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST)
+		ipsaddr->hpp_index = ipsaddr->IPSADDR_HASH %
+		    ipsaddr->pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY];
+	else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC)
+		ipsaddr->hpp_index = proto->epinfo.EP_HASH %
+		    ipsaddr->pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY];
+	else			/* Base LID  */
+		ipsaddr->hpp_index = 0;
+
+	if (IPS_PROTOEXP_FLAG_USER_RC_QP(proto->ep->rdmamode)
+#ifdef RNDV_MOD
+		// if verbs_ep allows us to open w/o rv_open then we can't use RC QP
+		&& (proto->ep->verbs_ep.rv
+			|| proto->ep->mr_cache_mode != MR_CACHE_MODE_KERNEL)
+#endif
+		) {
+		struct ibv_qp_cap qp_cap;
+		ipsaddr->rc_qp = rc_qp_create(proto->ep, ipsaddr, &qp_cap);
+		if (! ipsaddr->rc_qp) {
+			_HFI_ERROR("unable to create RC QP\n");
+			err = PSM2_INTERNAL_ERR;
+			goto fail;
+		}
+		if ((proto->ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) {
+			// we need to make sure we can't overflow send Q
+			if (qp_cap.max_send_wr < proto->ep->verbs_ep.send_pool.send_total) {
+				_HFI_ERROR("RC QP Send Q too small\n");
+				err = PSM2_INTERNAL_ERR;
+				goto fail;
+			}
+		}
+		ipsaddr->rc_qp_max_recv_wr = qp_cap.max_recv_wr;
+		ipsaddr->rc_qp_max_inline_data = qp_cap.max_inline_data;
+		if (PSM2_OK != psm_verbs_init_send_allocator(&ipsaddr->send_allocator,
+							&proto->ep->verbs_ep.send_pool)) {
+			_HFI_ERROR("can't init RC QP send allocator\n");
+			err = PSM2_INTERNAL_ERR;
+			goto fail;
+		}
+	}
+	// until our QP is connected, use the UD QP's allocator and inline
+	ipsaddr->use_allocator =  &proto->ep->verbs_ep.send_allocator;
+	ipsaddr->use_qp =  proto->ep->verbs_ep.qp;
+	ipsaddr->use_max_inline_data = proto->ep->verbs_ep.qp_cap.max_inline_data;
+
+#ifdef RNDV_MOD
+	if (IPS_PROTOEXP_FLAG_KERNEL_QP(proto->ep->rdmamode)
+			&& proto->ep->verbs_ep.rv) {
+		struct ibv_ah_attr ah_attr;
+
+		ipsaddr->rv_connected = 0; // redundant since we calloc above
+		// Right now we are not doing multi-pathing or multi-priority,
+		// so using path 0 in LOW PRIORITY (TID RDMA) is ok.  We share
+		// the same path with all processes, so we don't want to apply
+		// dispersive routing.  Hence we use neither
+		//ips_select_path(proto, IPS_PATH_LOW_PRIORITY, ipsaddr, ipsaddr->pathgrp);
+		// nor the ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].path, which
+		// configure_flows will have set up similarly.
+		// We only need one conn per remote node and can share the same
+		// conn for all ipsaddr going to that node, so we track rv_conn
+		// at the path record level.
+		if (ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY]->rv_conn) {
+			ipsaddr->rv_conn = ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY]->rv_conn;
+		} else {
+			err = ips_path_rec_to_ah_attr(proto->ep,
+				ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY], // ????
+				&ah_attr);
+			if (PSM2_OK != err) {
+				_HFI_ERROR("unable to get ah from path\n");
+				goto fail;
+			}
+			ipsaddr->rv_conn = __psm2_rv_create_conn(proto->ep->verbs_ep.rv,
+							 &ah_attr, (uint32_t)PSMI_EPID_GET_LID(epid));
+			ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY]->rv_conn = ipsaddr->rv_conn;
+			if (! ipsaddr->rv_conn) {
+				_HFI_ERROR("rv_create_conn failed: %s\n", strerror(errno));
+				err = PSM2_INTERNAL_ERR;
+//TBD - should we make this non-fatal?  Just regress to UD mode and output ERROR
+				goto fail;
+			}
+		}
+	}
+#endif // RNDV_MOD
+
+	/*
+	 * Set up the flows on this ipsaddr
+	 */
+	ips_ipsaddr_configure_flows(ipsaddr, proto);
+
+	/* clear connection state. */
+	ipsaddr->cstate_outgoing = CSTATE_NONE;
+	ipsaddr->cstate_incoming = CSTATE_NONE;
+
+	/* Add epaddr to PSM's epid table */
+	psmi_epid_add(proto->ep, epaddr->epid, epaddr);
+	psmi_assert(psmi_epid_lookup(proto->ep, epaddr->epid) == epaddr);
+
+	*err_out = PSM2_OK;
+	return epaddr;
+
+fail:
+	if (ipsaddr->rc_qp) {
+		rc_qp_destroy(ipsaddr->rc_qp);
+		ipsaddr->rc_qp = NULL;
+	}
+	psmi_free(epaddr);
+	*err_out = err;
+	return NULL;
+}
+
+static
+void ips_free_epaddr(psm2_epaddr_t epaddr, struct ips_proto *proto)
+{
+	ips_epaddr_t *ipsaddr = (ips_epaddr_t *) epaddr;
+	ips_flow_fini(ipsaddr, proto);
+
+	_HFI_CONNDBG("epaddr=%p connidx_incoming=%d epid=0x%"PRIx64"\n",
+			epaddr, ipsaddr->connidx_incoming, epaddr->epid);
+	IPS_MCTXT_REMOVE(ipsaddr);
+#ifdef RNDV_MOD
+	_HFI_MMDBG("free_epaddr\n");
+	if (ipsaddr->rv_conn) {
+		//__psm2_rv_destroy_conn(ipsaddr->rv_conn);
+		// TBD - call rv_disconnect or maybe rv_destroy_conn
+		// TBD disconnect and free rv_conn
+		// TBD - can we do this in a synchronous manner?
+		// below we free epaddr, so we will lose track of rv_conn
+		// but maybe rndv module will track it enough that we don't have to
+		// here, provided we don't confuse ourselves with a discon resp
+		// because the rv_conn's content we will get in that callback
+		// may be pointing to a freed rv_conn or freed epaddr
+		// maybe just call rndv_mod to set context to 0?  But could
+		// be races for callbacks and events already queued
+	}
+#endif // RNDV_MOD
+	if (ipsaddr->rc_qp) {
+		rc_qp_destroy(ipsaddr->rc_qp);
+		ipsaddr->rc_qp = NULL;
+	}
+	psm_verbs_free_recv_pool(&ipsaddr->recv_pool);
+	psmi_epid_remove(epaddr->proto->ep, epaddr->epid);
+	ips_epstate_del(epaddr->proto->epstate, ipsaddr->connidx_incoming);
+	psmi_free(epaddr);
+	return;
+}
+
+static
+psm2_error_t
+ptl_handle_connect_req(struct ips_proto *proto,
+		       psm2_epaddr_t epaddr, struct ips_connect_reqrep *req,
+		       uint32_t paylen);
+
+psm2_error_t
+ips_proto_process_connect(struct ips_proto *proto, uint8_t opcode,
+			  struct ips_message_header *p_hdr, void *payload,
+			  uint32_t paylen)
+{
+	struct ips_connect_hdr *hdr = (struct ips_connect_hdr *)payload;
+	psm2_epaddr_t epaddr;
+	ips_epaddr_t *ipsaddr;
+	psm2_error_t err = PSM2_OK;
+
+	PSMI_LOCK_ASSERT(proto->mq->progress_lock);
+
+	epaddr = psmi_epid_lookup(proto->ep, hdr->epid);
+	ipsaddr = epaddr ? (ips_epaddr_t *) epaddr : NULL;
+
+	_HFI_CONNDBG("Conn Pkt Rcv'd: op=0x%02x from: 0x%lx to: 0x%lx\n",
+		opcode, hdr->epid, proto->ep->epid);
+	switch (opcode) {
+	case OPCODE_CONNECT_REQUEST:
+		proto->epaddr_stats.connect_req_recv++;
+		err = ptl_handle_connect_req(proto, epaddr,
+					     (struct ips_connect_reqrep *)hdr,
+					     paylen);
+		break;
+
+	case OPCODE_CONNECT_REPLY:
+		{
+			struct ips_connect_reqrep *req =
+			    (struct ips_connect_reqrep *)payload;
+
+			proto->epaddr_stats.connect_rep_recv++;
+			if (!ipsaddr || req->runid_key != proto->runid_key) {
+				_HFI_PRDBG
+				    ("Unknown connectrep (ipsaddr=%p, %d,%d) from epid 0x%"PRIx64": %s\n",
+				     ipsaddr, req->runid_key, proto->runid_key,
+				     hdr->epid, psmi_epaddr_fmt_addr(hdr->epid));
+			} else if (ipsaddr->cstate_outgoing != CSTATE_OUTGOING_WAITING) {
+				/* possible dupe */
+				_HFI_CONNDBG("connect dupe, expected %d got %d\n",
+					  CSTATE_OUTGOING_WAITING,
+					  ipsaddr->cstate_outgoing);
+			} else {
+				/* Reply to our request for connection (i.e. outgoing connection) */
+				if (ipsaddr->cstate_incoming != CSTATE_ESTABLISHED) {
+					err =
+					    ips_ipsaddr_set_req_params(proto,
+								       ipsaddr,
+								       req,
+								       paylen);
+					if (err)
+						goto fail;
+				}
+				if (ipsaddr->rc_qp) {
+					psmi_assert(IPS_PROTOEXP_FLAG_USER_RC_QP(proto->ep->rdmamode));
+					psmi_assert(req->qp_attr.qpn); // checked in set_req_params
+					// we got a REP, we can move to RTS
+					if (modify_rc_qp_to_rts(proto->ep, ipsaddr->rc_qp,
+						&req->qp_attr, proto->runid_key)) { // initpsn we sent
+						_HFI_ERROR("qp_to_rts failed\n");
+						return PSM2_INTERNAL_ERR;
+					}
+					if ((proto->ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) {
+						// use RC QPs for eager and RDMA
+						// now we can use our own send Q and send allocator
+						ipsaddr->use_allocator =  &ipsaddr->send_allocator;
+						ipsaddr->use_qp =  ipsaddr->rc_qp;
+						ipsaddr->use_max_inline_data = ipsaddr->rc_qp_max_inline_data;
+						_HFI_MMDBG("RC enabled\n");
+					}
+					ipsaddr->rc_connected = 1;
+				}
+				ipsaddr->cstate_outgoing = CSTATE_ESTABLISHED;
+				ipsaddr->cerror_outgoing = req->connect_result;
+			}
+		}
+		break;
+
+	case OPCODE_DISCONNECT_REQUEST:
+		{
+			ips_epaddr_t ipsaddr_f;	/* fake a ptl addr */
+			int epaddr_do_free = 0;
+			psmi_assert_always(paylen ==
+					   sizeof(struct ips_connect_hdr));
+			_HFI_CONNDBG("Got a disconnect from %s\n",
+				  psmi_epaddr_get_name(hdr->epid));
+			proto->num_disconnect_requests++;
+			proto->epaddr_stats.disconnect_req_recv++;
+			/* It's possible to get a disconnection request on an ipsaddr that
+			 * we've since removed if the request is a dupe.  Instead of
+			 * silently dropping the packet, we "echo" the request in the
+			 * reply. */
+			if (ipsaddr == NULL) {
+				ips_path_grp_t *pathgrp;
+				uint16_t lid;
+				uint16_t ip_hi;
+
+				ipsaddr = &ipsaddr_f;
+				memset(&ipsaddr_f, 0, sizeof(ips_epaddr_t));
+				ipsaddr_f.IPSADDR_HASH =
+				    PSMI_EPID_GET_CONTEXT(hdr->epid);
+
+				/* Get path record for peer */
+				lid = PSMI_EPID_GET_LID(hdr->epid);
+				ip_hi = PSMI_EPID_GET_LID(hdr->epid) >> 16;
+				err = proto->ibta.get_path_rec(proto,
+							       proto->epinfo.
+								   ep_base_lid,
+							       __cpu_to_be16(lid),
+							       __cpu_to_be16(ip_hi),
+								   3000, &pathgrp);
+				if (err != PSM2_OK)
+					goto fail;
+
+				ipsaddr_f.pathgrp = pathgrp;
+				((psm2_epaddr_t) &ipsaddr_f)->ptlctl =
+					((struct ptl_ips *)(proto->ptl))->ctl;
+				((psm2_epaddr_t) &ipsaddr_f)->proto = proto;
+				/* If the send fails because of pio_busy, don't let ips queue
+				 * the request on an invalid ipsaddr, just drop the reply */
+				ipsaddr_f.ctrl_msg_queued = ~0;
+
+				psmi_assert_always(proto->msgflowid < EP_FLOW_LAST);
+
+				ips_flow_init(&ipsaddr_f.
+					      flows[proto->msgflowid], proto,
+					      &ipsaddr_f, PSM_TRANSFER_PIO,
+					      PSM_PROTOCOL_GO_BACK_N,
+					      IPS_PATH_LOW_PRIORITY,
+					      EP_FLOW_GO_BACK_N_PIO);
+				_HFI_CONNDBG
+				    ("Disconnect on unknown epaddr, just echo request\n");
+			} else if (ipsaddr->cstate_incoming != CSTATE_NONE) {
+				ipsaddr->cstate_incoming = CSTATE_NONE;
+				proto->num_connected_incoming--;
+				if (ipsaddr->cstate_outgoing == CSTATE_NONE) {
+					epaddr_do_free = 1;
+				}
+			}
+
+			psmi_assert_always(proto->msgflowid < EP_FLOW_LAST);
+			// TBD - this is redundant if transfer_frame uses UD for all
+			// control messages, but it also makes sure we stop using
+			// RC for any non-control messages (should be none) after disconnect
+			// use the UD QP's allocator and inline now and going forward
+			ipsaddr->use_allocator =  &proto->ep->verbs_ep.send_allocator;
+			ipsaddr->use_qp =  proto->ep->verbs_ep.qp;
+			ipsaddr->use_max_inline_data = proto->ep->verbs_ep.qp_cap.max_inline_data;
+			_HFI_MMDBG("RC discon\n");
+			// we will free ipsaddr below for all but "fake ipsaddr" case
+			// so that will free RC QP and its buffers
+
+			ips_proto_send_ctrl_message_reply(proto,
+							  &ipsaddr->flows[proto->msgflowid],
+							  OPCODE_DISCONNECT_REPLY,
+							  &ipsaddr->ctrl_msg_queued);
+			/* We can safely free the ipsaddr if required since disconnect
+			 * messages are never enqueued so no reference to ipsaddr is kept */
+			if (epaddr_do_free) {
+				ips_free_epaddr(epaddr, proto);
+				epaddr = NULL;
+			}
+		}
+		break;
+
+	case OPCODE_DISCONNECT_REPLY:
+		proto->epaddr_stats.disconnect_rep_recv++;
+		if (!ipsaddr) {
+			_HFI_CONNDBG
+			    ("Unknown disconnect reply from epid 0x%"PRIx64": %s\n",
+			     hdr->epid, psmi_epaddr_fmt_addr(hdr->epid));
+			break;
+		} else if (ipsaddr->cstate_outgoing == CSTATE_OUTGOING_WAITING_DISC) {
+			ipsaddr->cstate_outgoing = CSTATE_OUTGOING_DISCONNECTED;
+			/* Freed in disconnect() if cstate_incoming == NONE */
+		}		/* else dupe reply */
+		break;
+
+	default:
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+				  "Unexpected/unhandled connect opcode 0x%x\n",
+				  opcode);
+	}
+
+fail:
+	return err;
+}
+
+static
+psm2_error_t
+ptl_handle_connect_req(struct ips_proto *proto, psm2_epaddr_t epaddr,
+		       struct ips_connect_reqrep *req, uint32_t paylen)
+{
+	ips_epaddr_t *ipsaddr;
+	psm2_error_t err = PSM2_OK;
+	uint16_t connect_result;
+	int newconnect = 0;
+
+	if (req->epid == proto->ep->epid) {
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_EPID_NETWORK_ERROR,
+				  "Network connectivity problem: Locally detected duplicate "
+				  "LIDs 0x%04x on hosts %s and %s (%s port %u). (Exiting)",
+				  (uint32_t) psm2_epid_nid(req->epid),
+				  psmi_epaddr_get_hostname(req->epid),
+				  psmi_gethostname(), proto->ep->dev_name, proto->ep->portnum);
+		/* XXX no return */
+		abort();
+	} else if (epaddr == NULL) {	/* new ep connect before we call into connect */
+		newconnect = 1;
+		if ((epaddr =
+		     ips_alloc_epaddr(proto, 1, req->epid, req->hostname,
+					      5000, &err)) == NULL) {
+			goto fail;
+		}
+	} else if (((ips_epaddr_t *) epaddr)->cstate_incoming == CSTATE_ESTABLISHED) {
+		ipsaddr = (ips_epaddr_t *) epaddr;
+		/* Duplicate lid detection.  */
+		if (ipsaddr->runid_key == req->runid_key)
+			goto do_reply;	/* duplicate request, not duplicate lid */
+		else {		/* Some out of context message.  Just drop it */
+			if (!proto->done_warning) {
+				psmi_syslog(proto->ep, 1, LOG_INFO,
+					    "Non-fatal connection problem: Received an out-of-context "
+					    "connection message from host %s LID=0x%x context=%d. (Ignoring)",
+					    req->hostname,
+					    (int)psm2_epid_nid(req->epid),
+					    psm2_epid_context(req->epid));
+				proto->done_warning = 1;
+			}
+			goto no_reply;
+		}
+	} else if (((ips_epaddr_t *) epaddr)->cstate_outgoing == CSTATE_NONE) {
+		/* pre-created epaddr in multi-rail */
+		psmi_assert_always(epaddr->proto->ep !=
+				   epaddr->proto->ep->mctxt_master);
+		newconnect = 1;
+	}
+
+	ipsaddr = (ips_epaddr_t *) epaddr;
+	psmi_assert_always(ipsaddr->cstate_incoming == CSTATE_NONE);
+
+	/* Check connect version and psm version */
+	// for now we are strict about major rev; if we add additional optional
+	// features they can be minor revs and may need more sophisticated handling
+	if ((req->connect_verno >>8) != (IPS_CONNECT_VERNO >>8)) {
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_EPID_INVALID_VERSION,
+				  "Connect protocol (%x,%x) is incompatible with %x.%x",
+				  (req->connect_verno >> 8) & 0xff,
+				  req->connect_verno & 0xff,
+				  (IPS_CONNECT_VERNO >> 8) & 0xff,
+				  IPS_CONNECT_VERNO & 0xff);
+		connect_result = PSM2_EPID_INVALID_CONNECT;
+	} else if (!psmi_verno_isinteroperable(req->psm_verno)) {
+		connect_result = PSM2_EPID_INVALID_VERSION;
+	} else if (!(proto->flags & IPS_PROTO_FLAG_QUERY_PATH_REC) &&
+		   proto->epinfo.ep_pkey != HFI_DEFAULT_P_KEY &&
+		   proto->epinfo.ep_pkey != req->job_pkey) {
+		connect_result = PSM2_EPID_INVALID_PKEY;
+	} else if (req->sl != proto->epinfo.ep_sl) {
+		connect_result = PSM2_EPID_INVALID_CONNECT;
+		_HFI_ERROR("Connection error: Service Level mismatch (local:%d, remote:%d)\n", proto->epinfo.ep_sl, req->sl);
+	} else if (req->rdmamode != (proto->ep->rdmamode & IPS_PROTOEXP_FLAG_RDMA_MASK) ) {
+		connect_result = PSM2_EPID_INVALID_CONNECT;
+		_HFI_ERROR("Connection error: RDMA Mode mismatch (local:%d, remote:%d)\n",
+			(proto->ep->rdmamode & IPS_PROTOEXP_FLAG_RDMA_MASK), req->rdmamode);
+	} else {
+		connect_result = PSM2_OK;
+		if (ipsaddr->cstate_outgoing == CSTATE_NONE) {
+			ips_epstate_idx idx;
+			psmi_assert_always(newconnect == 1);
+			err = ips_epstate_add(proto->epstate, ipsaddr, &idx);
+			if (err)
+				goto fail;
+			ipsaddr->connidx_incoming = idx;
+		}
+	}
+
+	/* Incoming connection request */
+	if (ipsaddr->cstate_outgoing != CSTATE_ESTABLISHED) {
+		err = ips_ipsaddr_set_req_params(proto, ipsaddr, req, paylen);
+		if (err)
+			goto fail;
+	}
+	ipsaddr->cstate_incoming = CSTATE_ESTABLISHED;
+	ipsaddr->cerror_incoming = connect_result;
+
+	ipsaddr->runid_key = req->runid_key;
+
+	proto->num_connected_incoming++;
+
+do_reply:
+	_HFI_CONNDBG("Conn Pkt Sent: op=0x%02x from: 0x%lx to: 0x%lx\n",
+		OPCODE_CONNECT_REPLY, proto->ep->epid, ipsaddr->epaddr.epid);
+	psmi_assert_always(proto->msgflowid < EP_FLOW_LAST);
+	ips_proto_send_ctrl_message_reply(proto,
+					  &ipsaddr->flows[proto->msgflowid],
+					  OPCODE_CONNECT_REPLY,
+					  &ipsaddr->ctrl_msg_queued);
+no_reply:
+fail:
+	return err;
+}
+
+psm2_error_t
+ips_proto_connect(struct ips_proto *proto, int numep,
+		  const psm2_epid_t *array_of_epid,
+		  const int *array_of_epid_mask, psm2_error_t *array_of_errors,
+		  psm2_epaddr_t *array_of_epaddr, uint64_t timeout_in)
+{
+	int i, n, n_first;
+	psm2_error_t err = PSM2_OK;
+	psm2_epaddr_t epaddr;
+	ips_epaddr_t *ipsaddr;
+	ips_epstate_idx idx;
+	int numep_toconnect = 0, numep_left;
+	union psmi_envvar_val credits_intval;
+	int connect_credits;
+
+	psmi_getenv("PSM3_CONNECT_CREDITS",
+		    "End-point connect request credits.",
+		    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)100, &credits_intval);
+
+	connect_credits = credits_intval.e_uint;
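+	/* Illustrative note (behavior inferred from the credit handling in
+	 * the polling loop below, not a documented contract): with the
+	 * default of 100 credits, at most 100 peers can have a
+	 * CONNECT_REQUEST outstanding at once; each credit is returned when
+	 * that peer reaches CSTATE_ESTABLISHED, and only then is the next
+	 * request issued. */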
+
+	PSMI_LOCK_ASSERT(proto->mq->progress_lock);
+
+	/* All timeout values are in cycles */
+	uint64_t t_start = get_cycles();
+	/* Print a timeout at the warning interval */
+	union psmi_envvar_val warn_intval;
+	uint64_t to_warning_interval;
+	uint64_t to_warning_next;
+
+	/* Setup warning interval */
+	psmi_getenv("PSM3_CONNECT_WARN_INTERVAL",
+		    "Period in seconds to warn if connections are not completed. "
+		    "Default is 300 seconds, 0 to disable",
+		    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)300, &warn_intval);
+
+	to_warning_interval = nanosecs_to_cycles(warn_intval.e_uint * SEC_ULL);
+	to_warning_next = t_start + to_warning_interval;
+
+	/* Some sanity checks */
+	psmi_assert_always(array_of_epid_mask != NULL);
+
+	/* First pass: make sure array of errors is at least fully defined */
+	for (i = 0; i < numep; i++) {
+		_HFI_CONNDBG("epid-connect=%s connect to epid 0x%"PRIx64": %s\n",
+			  array_of_epid_mask[i] ? "YES" : " NO",
+			  array_of_epid[i],
+			  psmi_epaddr_fmt_addr(array_of_epid[i]));
+		if (array_of_epid_mask[i]) {
+			array_of_errors[i] = PSM2_EPID_UNKNOWN;
+			array_of_epaddr[i] = NULL;
+		}
+	}
+
+	/* Second pass: see what to connect and what is connectable. */
+	for (i = 0, numep_toconnect = 0; i < numep; i++) {
+		if (!array_of_epid_mask[i])
+			continue;
+
+		/* Can't send to epid on same lid if not loopback */
+		if ((psm2_epid_nid(proto->ep->epid) ==
+		    psm2_epid_nid(array_of_epid[i])) &&
+		    !(proto->flags & IPS_PROTO_FLAG_LOOPBACK)) {
+			array_of_errors[i] = PSM2_EPID_UNREACHABLE;
+			continue;
+		}
+
+		if ((PSMI_EPID_VERSION == PSMI_EPID_V3
+				|| (PSMI_EPID_VERSION == PSMI_EPID_V4 && ! psmi_allow_routers))
+			 && (PSMI_GET_SUBNET_ID(proto->ep->gid_hi) !=
+			 	 PSMI_EPID_GET_SUBNET_ID(array_of_epid[i]))) {
+			char buf1[INET_ADDRSTRLEN];
+			char buf2[INET_ADDRSTRLEN];
+			if (PSMI_EPID_VERSION == PSMI_EPID_V3)
+					psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+					  " Trying to connect from %s port %u to a node (subnet id - %"PRIx64") on a"
+					  " different subnet - %"PRIx64"\n",
+					  proto->ep->dev_name, proto->ep->portnum,
+					  PSMI_GET_SUBNET_ID(proto->ep->gid_hi),
+					  (uint64_t)PSMI_EPID_GET_SUBNET_ID(array_of_epid[i]));
+			else // V4
+					psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+					  " Trying to connect from %s to a node (subnet %s) on a"
+					  " different subnet %s\n",
+					  proto->ep->dev_name,
+					  psmi_ipv4_ntop((uint32_t)PSMI_GET_SUBNET_ID(proto->ep->gid_hi), buf1, sizeof(buf1)),
+					  psmi_ipv4_ntop((uint32_t)PSMI_EPID_GET_SUBNET_ID(array_of_epid[i]), buf2, sizeof(buf2)));
+		}
+
+		epaddr = psmi_epid_lookup(proto->ep, array_of_epid[i]);
+		if (epaddr == NULL) {
+			/* We're sending a connect request message before some other node
+			 * has sent its connect message */
+			// so we lack its hostname, rv and qpn info
+			epaddr = ips_alloc_epaddr(proto, 1, array_of_epid[i],
+						  NULL,
+						  (timeout_in / 1000000UL), &err);
+			if (epaddr == NULL) {
+				_HFI_ERROR("Unable to issue connect from %s to %s: %s\n",
+						proto->ep->dev_name, psmi_epaddr_get_name(array_of_epid[i]),
+						psm2_error_get_string(err));
+				goto fail;
+			}
+			ipsaddr = (ips_epaddr_t *) epaddr;
+			err = ips_epstate_add(proto->epstate, ipsaddr, &idx);
+			if (err)
+				goto fail;
+			ipsaddr->connidx_incoming = idx;
+		} else if (((ips_epaddr_t *) epaddr)->cstate_outgoing != CSTATE_NONE) {	/* already connected */
+			psmi_assert_always(((ips_epaddr_t *) epaddr)->
+					   cstate_outgoing == CSTATE_ESTABLISHED);
+			array_of_errors[i] = PSM2_EPID_ALREADY_CONNECTED;
+			array_of_epaddr[i] = epaddr;
+			continue;
+		} else if (((ips_epaddr_t *) epaddr)->cstate_incoming ==
+			   CSTATE_NONE) {
+			/* pre-created epaddr in multi-rail */
+			psmi_assert_always(epaddr->proto->ep !=
+					   epaddr->proto->ep->mctxt_master);
+			ipsaddr = (ips_epaddr_t *) epaddr;
+			err = ips_epstate_add(proto->epstate, ipsaddr, &idx);
+			if (err)
+				goto fail;
+			ipsaddr->connidx_incoming = idx;
+		} else {
+			/* We've already received a connect request message from a remote
+			 * peer, it's time to send our own. */
+			ipsaddr = (ips_epaddr_t *) epaddr;
+			/* Re-entrancy sanity check: make sure we are not connecting
+			 * twice (caller's precondition) */
+			psmi_assert(ipsaddr->cstate_outgoing == CSTATE_NONE);
+			psmi_assert(ipsaddr->cstate_incoming != CSTATE_NONE);
+		}
+
+		ipsaddr->cstate_outgoing = CSTATE_OUTGOING_WAITING;
+		ipsaddr->cerror_outgoing = PSM2_OK;
+		array_of_epaddr[i] = epaddr;
+		ipsaddr->s_timeout = get_cycles();
+		ipsaddr->delay_in_ms = 1;
+		ipsaddr->credit = 0;
+		numep_toconnect++;
+	}
+
+	/* Third pass: do the actual connect.
+	 * PSM2_EPID_UNKNOWN: Not connected yet.
+	 * PSM2_EPID_UNREACHABLE: Not to be connected.
+	 * PSM2_OK: Successfully connected.
+	 * Start sending connect messages at a random index between 0 and numep-1
+	 */
+	numep_left = numep_toconnect;
+	n_first = ((uint32_t) get_cycles()) % numep;
+	while (numep_left > 0) {
+		for (n = 0; n < numep; n++) {
+			int keep_polling = 1;
+			i = (n_first + n) % numep;
+			if (!array_of_epid_mask[i])
+				continue;
+			switch (array_of_errors[i]) {
+			case PSM2_EPID_UNREACHABLE:
+			case PSM2_EPID_ALREADY_CONNECTED:
+			case PSM2_OK:
+				continue;
+			default:
+				break;
+			}
+			psmi_assert_always(array_of_epaddr[i] != NULL);
+			ipsaddr = (ips_epaddr_t *) array_of_epaddr[i];
+			if (ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED) {
+				if (ipsaddr->credit) {
+					connect_credits++;
+					ipsaddr->credit = 0;
+				}
+				switch (is_rv_connected(ipsaddr)) {
+				case 1:
+					/* This is not the real error code, we only set OK here
+					 * so we know to stop polling for the reply. The actual
+					 * error is in ipsaddr->cerror_outgoing */
+					array_of_errors[i] = PSM2_OK;
+					numep_left--;
+					continue;
+					break;
+				case 0:
+					// fall through to "keep_polling" loop below to check timers
+					break;
+				default:
+					/* This is not the real error code, we only set OK here
+					 * so we know to stop polling for the reply. The actual
+					 * error is in ipsaddr->cerror_outgoing */
+					array_of_errors[i] = PSM2_OK;
+					numep_left--;
+					if (ipsaddr->cerror_outgoing == PSM2_OK)
+						ipsaddr->cerror_outgoing = PSM2_EPID_RV_CONNECT_ERROR;
+					// EIO is connect error
+					if (errno != EIO) {
+						err = PSM2_INTERNAL_ERR;
+						goto fail;	// serious error
+					}
+					continue;
+					break;
+				}
+			}
+			while (keep_polling) {
+				if (!psmi_cycles_left(t_start, timeout_in)) {
+					err = PSM2_TIMEOUT;
+					goto err_timeout;
+				}
+				if (to_warning_interval
+				    && get_cycles() >= to_warning_next) {
+#if _HFI_DEBUGGING
+					uint64_t waiting_time = 0;
+					if (_HFI_INFO_ON) {
+					    waiting_time = cycles_to_nanosecs(
+								get_cycles() -
+								t_start) / SEC_ULL;
+					}
+#endif
+					const char *first_name = NULL;
+					int num_waiting = 0;
+
+					for (i = 0; i < numep; i++) {
+						if (!array_of_epid_mask[i] ||
+						    array_of_errors[i] !=
+						    PSM2_EPID_UNKNOWN)
+							continue;
+						if (!first_name)
+							first_name =
+							    psmi_epaddr_get_name
+							    (array_of_epid[i]);
+						num_waiting++;
+					}
+					if (_HFI_INFO_ON) {
+						if (first_name) {
+						_HFI_INFO_ALWAYS
+						    ("Couldn't connect to %s (and %d others). "
+						     "Time elapsed %02i:%02i:%02i. Still trying...\n",
+						     first_name, num_waiting,
+						     (int)(waiting_time / 3600),
+						     (int)((waiting_time / 60) -
+							   ((waiting_time /
+							     3600) * 60)),
+						     (int)(waiting_time -
+							   ((waiting_time /
+							     60) * 60)));
+						}
+					}
+					to_warning_next =
+					    get_cycles() + to_warning_interval;
+				}
+				if (ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED) {
+					// just waiting for rv to be connected
+					if ((err = psmi_err_only(psmi_poll_internal(proto->ep, 1))))
+						goto fail;
+					break;	// let outer loop start another REQ
+				}
+
+				if (get_cycles() > ipsaddr->s_timeout) {
+					if (!ipsaddr->credit && connect_credits) {
+						ipsaddr->credit = 1;
+						connect_credits--;
+					}
+					if (ipsaddr->credit) {
+						_HFI_CONNDBG("Conn Pkt Sent: op=0x%02x from: 0x%lx to: 0x%lx\n",
+							OPCODE_CONNECT_REQUEST, proto->ep->epid, ipsaddr->epaddr.epid);
+					    psmi_assert_always(proto->msgflowid < EP_FLOW_LAST);
+					    if (ips_proto_send_ctrl_message_request(proto,
+						     &ipsaddr->flows[proto->msgflowid],
+						     OPCODE_CONNECT_REQUEST,
+						     &ipsaddr->ctrl_msg_queued,
+						     0) == PSM2_OK) {
+							keep_polling = 0;
+							ipsaddr->delay_in_ms =
+							    min(100, ipsaddr->delay_in_ms << 1);
+							ipsaddr->s_timeout = get_cycles() +
+							    nanosecs_to_cycles(
+								ipsaddr->delay_in_ms * MSEC_ULL);
+						}
+						/* If not, send got "busy", keep trying */
+					} else {
+						keep_polling = 0;
+					}
+				}
+
+				if ((err = psmi_err_only(psmi_poll_internal(proto->ep, 1))))
+					goto fail;
+
+				if (ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED) {
+					connect_credits++;
+					ipsaddr->credit = 0;
+					switch (is_rv_connected(ipsaddr)) {
+					case 1:
+						/* This is not the real error code, we only set OK here
+						 * so we know to stop polling for the reply. The actual
+						 * error is in ipsaddr->cerror_outgoing */
+						array_of_errors[i] = PSM2_OK;
+						numep_left--;
+						break;
+					case 0:
+						break;
+					default:
+						/* This is not the real error code, we only set OK here
+						 * so we know to stop polling for the reply. The actual
+						 * error is in ipsaddr->cerror_outgoing */
+						array_of_errors[i] = PSM2_OK;
+						numep_left--;
+						if (ipsaddr->cerror_outgoing == PSM2_OK)
+							ipsaddr->cerror_outgoing = PSM2_EPID_RV_CONNECT_ERROR;
+						// EIO is connect error
+						if (errno != EIO) {
+							err = PSM2_INTERNAL_ERR;
+							goto fail;	// serious error
+						}
+						break;
+					}
+					// even if ! rv_connected, let outer loop start next REQ
+					break;
+				}
+			}
+		}
+	}
+
+err_timeout:
+	/* Find the worst error to report */
+	for (i = 0; i < numep; i++) {
+		if (!array_of_epid_mask[i])
+			continue;
+		switch (array_of_errors[i]) {
+			/* These are benign */
+		case PSM2_EPID_UNREACHABLE:
+		case PSM2_EPID_ALREADY_CONNECTED:
+			break;
+		case PSM2_EPID_UNKNOWN:
+			array_of_errors[i] = PSM2_TIMEOUT;
+			err = psmi_error_cmp(err, PSM2_TIMEOUT);
+			_HFI_CONNDBG("EP has timed out on connect.\n");
+			break;
+		case PSM2_OK:
+			/* Restore the real connect error */
+			ipsaddr = (ips_epaddr_t *) array_of_epaddr[i];
+			array_of_errors[i] = psmi_error_cmp(ipsaddr->cerror_outgoing,
+							    ipsaddr->cerror_incoming);
+			psmi_assert_always(ipsaddr->cstate_outgoing ==
+					   CSTATE_ESTABLISHED);
+			if (array_of_errors[i] != PSM2_OK) {
+				err = psmi_error_cmp(err, array_of_errors[i]);
+				ips_free_epaddr(array_of_epaddr[i], proto);
+				array_of_epaddr[i] = NULL;
+			} else {
+				proto->num_connected_outgoing++;
+				psmi_assert_always(ipsaddr->pathgrp->
+						   pg_path[0]
+						   [IPS_PATH_HIGH_PRIORITY]->
+						   pr_mtu > 0);
+			}
+			break;
+		default:
+			_HFI_CONNDBG("EP has error code %d\n", array_of_errors[i]);
+			break;
+		}
+	}
+
+fail:
+	return err;
+}
+
+/* Repercussions on MQ.
+ *
+ * If num_connected==0, everything that exists in the posted queue should
+ * complete and the error must be marked epid_was_closed.
+ *
+ */
+
+psm2_error_t
+ips_proto_disconnect(struct ips_proto *proto, int force, int numep,
+		     psm2_epaddr_t array_of_epaddr[],
+		     const int array_of_epaddr_mask[],
+		     psm2_error_t array_of_errors[], uint64_t timeout_in)
+{
+	ips_epaddr_t *ipsaddr;
+	int numep_left, numep_todisc, i, n;
+	int n_first;
+	int has_pending;
+	uint64_t timeout;
+	psm2_error_t err = PSM2_OK;
+	uint64_t reqs_sent = 0;
+	union psmi_envvar_val credits_intval;
+	int disconnect_credits;
+	uint64_t t_warning, t_start;
+	union psmi_envvar_val warn_intval;
+	unsigned warning_secs;
+
+	/* In case of a forced close, we cancel whatever timers are pending
+	 * on the proto so that we don't have zombie timers coming back
+	 * after the internal structures of PSM2 have been destroyed
+	 */
+	if (force) {
+		struct psmi_timer *t_cursor;
+		TAILQ_FOREACH(t_cursor, &proto->timerq->timerq, timer) {
+			psmi_timer_cancel(proto->timerq, t_cursor);
+		}
+	}
+
+	psmi_assert_always(numep > 0);
+
+	psmi_getenv("PSM3_DISCONNECT_CREDITS",
+		    "End-point disconnect request credits.",
+		    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)100, &credits_intval);
+
+	disconnect_credits = credits_intval.e_uint;
+
+	/* Setup warning interval */
+	psmi_getenv("PSM3_DISCONNECT_WARN_INTERVAL",
+		    "Period in seconds to warn if disconnections are not completed. "
+		    "Default is 300 seconds, 0 to disable.",
+		    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)300, &warn_intval);
+
+	warning_secs = warn_intval.e_uint;
+
+	PSMI_LOCK_ASSERT(proto->mq->progress_lock);
+
+	/* First pass: see what to disconnect and what is disconnectable */
+	for (i = 0, numep_todisc = 0; i < numep; i++) {
+		if (!array_of_epaddr_mask[i])
+			continue;
+		psmi_assert_always(array_of_epaddr[i]->ptlctl->ptl ==
+				   proto->ptl);
+		ipsaddr = (ips_epaddr_t *) array_of_epaddr[i];
+		ipsaddr->credit = 0;
+		if (ipsaddr->cstate_outgoing == CSTATE_NONE) {
+			array_of_errors[i] = PSM2_OK;
+			continue;
+		} else {
+			psmi_assert_always(ipsaddr->cstate_outgoing ==
+					   CSTATE_ESTABLISHED);
+		}
+		_HFI_CONNDBG("disconnecting %p force=%d EPID= 0x%"PRIx64" %s\n",
+				ipsaddr, force, ((psm2_epaddr_t)ipsaddr)->epid,
+				psmi_epaddr_get_hostname(((psm2_epaddr_t)ipsaddr)->epid));
+		array_of_errors[i] = PSM2_EPID_UNKNOWN;
+		numep_todisc++;
+	}
+	if (numep_todisc == 0)
+		goto success;
+
+	/* Wait for everyone to ack previous packets before sending disconnects */
+	if (timeout_in == 0)
+		timeout = ~0ULL;
+	else
+		timeout = get_cycles() + nanosecs_to_cycles(timeout_in);
+
+	t_start = get_cycles();
+	t_warning = t_start + nanosecs_to_cycles(warning_secs * SEC_ULL);
+
+	n_first = ((uint32_t) get_cycles()) % numep;
+	if (!force) {
+		numep_left = numep_todisc;
+		do {
+			for (n = 0; n < numep; n++) {
+				i = (n_first + n) % numep;
+				if (!array_of_epaddr_mask[i]
+				    || array_of_errors[i] == PSM2_OK)
+					continue;
+				ipsaddr = (ips_epaddr_t *) array_of_epaddr[i];
+				switch (ipsaddr->cstate_outgoing) {
+				case CSTATE_OUTGOING_DISCONNECTED:
+					array_of_errors[i] = PSM2_OK;
+					numep_left--;
+					disconnect_credits++;
+					ipsaddr->credit = 0;
+					continue;
+				case CSTATE_OUTGOING_WAITING_DISC:
+					if (ipsaddr->s_timeout > get_cycles())
+						continue;
+					ipsaddr->delay_in_ms =
+					    min(100, ipsaddr->delay_in_ms << 1);
+					ipsaddr->s_timeout = get_cycles() +
+					    nanosecs_to_cycles(ipsaddr->
+							       delay_in_ms *
+							       MSEC_ULL);
+					psmi_assert_always(proto->msgflowid < EP_FLOW_LAST);
+					ips_proto_send_ctrl_message_request
+					    (proto,
+					     &ipsaddr->flows[proto->msgflowid],
+					     OPCODE_DISCONNECT_REQUEST,
+					     &ipsaddr->ctrl_msg_queued,
+					     timeout);
+					reqs_sent++;
+					break;
+				case CSTATE_ESTABLISHED:
+					/* Still pending acks, hold off for now */
+					has_pending =
+					    !STAILQ_EMPTY(&ipsaddr->flows
+							  [EP_FLOW_GO_BACK_N_PIO].
+							  scb_unacked)
+						;
+					if (has_pending)
+						continue;
+					if (!ipsaddr->credit
+					    && disconnect_credits) {
+						ipsaddr->credit = 1;
+						disconnect_credits--;
+					}
+					if (!ipsaddr->credit)
+						continue;
+					ipsaddr->delay_in_ms = 1;
+					ipsaddr->cstate_outgoing =
+					    CSTATE_OUTGOING_WAITING_DISC;
+					ipsaddr->s_timeout =
+					    get_cycles() +
+					    nanosecs_to_cycles(MSEC_ULL);
+					psmi_assert_always(proto->msgflowid < EP_FLOW_LAST);
+					ips_proto_send_ctrl_message_request
+					    (proto,
+					     &ipsaddr->flows[proto->msgflowid],
+					     OPCODE_DISCONNECT_REQUEST,
+					     &ipsaddr->ctrl_msg_queued,
+					     timeout);
+					reqs_sent++;
+					break;
+				default:
+					psmi_handle_error(PSMI_EP_NORETURN,
+							  PSM2_INTERNAL_ERR,
+							  "Unhandled/unknown close state %d",
+							  ipsaddr->cstate_outgoing);
+					break;
+				}
+			}
+			if (numep_left == 0)
+				break;
+
+			if ((err =
+			     psmi_err_only(psmi_poll_internal(proto->ep, 1))))
+				goto fail;
+
+			if (warning_secs && get_cycles() > t_warning) {
+				_HFI_INFO
+				    ("graceful close in progress for %d/%d peers "
+				     "(elapsed=%d millisecs,timeout=%d millisecs,reqs=%lld)\n",
+				     numep_left, numep_todisc,
+				     (int)(cycles_to_nanosecs
+					   (get_cycles() - t_start) / MSEC_ULL),
+				     (int)(timeout_in / MSEC_ULL),
+				     (unsigned long long)reqs_sent);
+				t_warning =
+				    get_cycles() +
+				    nanosecs_to_cycles(warning_secs * SEC_ULL);
+			}
+		}
+		while (timeout > get_cycles());
+
+		if (numep_left > 0) {
+			err = PSM2_TIMEOUT;
+			for (i = 0; i < numep; i++) {
+				if (!array_of_epaddr_mask[i])
+					continue;
+				if (array_of_errors[i] == PSM2_EPID_UNKNOWN) {
+					array_of_errors[i] = PSM2_TIMEOUT;
+					_HFI_CONNDBG
+					    ("disc timeout on index %d, epaddr %s\n",
+					     i,
+					     psmi_epaddr_get_name
+					     (array_of_epaddr[i]->epid));
+				}
+			}
+			_HFI_PRDBG("graceful close incomplete for %d/%d peers "
+				   "(elapsed=%d millisecs,timeout=%d millisecs,reqs=%lld)\n",
+				   numep_left, numep_todisc,
+				   (int)(cycles_to_nanosecs
+					 (get_cycles() - t_start) / MSEC_ULL),
+				   (int)(timeout_in / MSEC_ULL),
+				   (unsigned long long)reqs_sent);
+		} else
+			_HFI_PRDBG
+			    ("graceful close complete from %d peers in %d millisecs, reqs_sent=%lld\n",
+			     numep_todisc,
+			     (int)(cycles_to_nanosecs(get_cycles() - t_start) /
+				   MSEC_ULL), (unsigned long long)reqs_sent);
+	} else {
+		psmi_assert_always(proto->msgflowid < EP_FLOW_LAST);
+		for (n = 0; n < numep; n++) {
+			i = (n_first + n) % numep;
+			if (!array_of_epaddr_mask[i])
+				continue;
+			ipsaddr = (ips_epaddr_t *) array_of_epaddr[i];
+			psmi_assert_always(ipsaddr->cstate_outgoing ==
+					   CSTATE_ESTABLISHED);
+			ips_proto_send_ctrl_message_request(proto, &ipsaddr->
+						    flows[proto->msgflowid],
+						    OPCODE_DISCONNECT_REQUEST,
+						    &ipsaddr->ctrl_msg_queued,
+						    0);
+			/* Force state to DISCONNECTED */
+			ipsaddr->cstate_outgoing = CSTATE_OUTGOING_DISCONNECTED;
+			array_of_errors[i] = PSM2_OK;
+		}
+		_HFI_CONNDBG("non-graceful close complete from %d peers\n", numep);
+	}
+
+	for (i = 0; i < numep; i++) {
+		if (!array_of_epaddr_mask[i] || array_of_errors[i] != PSM2_OK)
+			continue;
+		ipsaddr = (ips_epaddr_t *) array_of_epaddr[i];
+		if (ipsaddr->cstate_outgoing == CSTATE_NONE)
+			continue;
+		psmi_assert_always(ipsaddr->cstate_outgoing ==
+				   CSTATE_OUTGOING_DISCONNECTED);
+		proto->num_connected_outgoing--;
+		/* Remote disconnect req arrived already, remove this epid.  If it
+		 * hasn't arrived yet, that's okay, we'll pick it up later and just
+		 * mark our connect-to status as being "none". */
+		if (ipsaddr->cstate_incoming == CSTATE_NONE) {
+			ips_free_epaddr(array_of_epaddr[i], proto);
+			array_of_epaddr[i] = NULL;
+		} else
+			ipsaddr->cstate_outgoing = CSTATE_NONE;
+	}
+
+fail:
+success:
+	return err;
+}
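+/* Caller-side sketch (hypothetical values; assumes SEC_ULL nanoseconds per
+ * second, as used above): a graceful close polls peers up to the timeout,
+ * while a forced close fires one DISCONNECT_REQUEST per peer and returns:
+ *
+ *     err = ips_proto_disconnect(proto, 0, numep, epaddrs, mask, errs,
+ *                                5 * SEC_ULL);	// graceful, ~5s budget
+ *     err = ips_proto_disconnect(proto, 1, numep, epaddrs, mask, errs, 0);
+ */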
+
+int ips_proto_isconnected(ips_epaddr_t *ipsaddr)
+{
+	if (ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED ||
+	    ipsaddr->cstate_incoming == CSTATE_ESTABLISHED)
+		return 1;
+	else
+		return 0;
+}
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_dump.c b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_dump.c
new file mode 100644
index 0000000000000000000000000000000000000000..927a8bde48ac2d9127a7f94290abd8f8eb1cf973
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_dump.c
@@ -0,0 +1,150 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "ips_proto.h"
+#include "ips_expected_proto.h"
+#include "ips_proto_help.h"
+
+void ips_proto_dump_frame(void *frame, int length, char *message)
+{
+	uint8_t *raw_frame = frame;
+	int counter;
+	char default_message[] = "<UNKNOWN>";
+
+	if (!message)
+		message = default_message;
+
+	printf("\nHex dump of %i bytes at %p from %s\n", length, frame,
+	       message);
+
+	for (counter = 0; counter < length; counter++) {
+		if ((counter % 16) == 0)
+			printf("\n");
+
+		if ((counter % 4) == 0)
+			printf("   ");
+
+		printf("%02X ", raw_frame[counter]);
+	}
+	printf("\n");
+}
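+/* Example use (hypothetical call site and arguments):
+ *     ips_proto_dump_frame(payload, 64, "inbound frame");
+ * prints the 64 bytes 16 per line, grouped into 4-byte words. */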
+
+void ips_proto_dump_data(void *data, int data_length)
+{
+	int counter;
+	uint8_t *payload = (uint8_t *) data;
+
+	printf("\nHex dump of data, length = %i\n", data_length);
+
+	for (counter = 0; counter < data_length; counter++) {
+		if ((counter % 16) == 0)
+			printf("\n %04d: ", counter);
+
+		if ((counter % 4) == 0)
+			printf("   ");
+
+		printf("%02X ", payload[counter]);
+	}
+	printf("\n");
+}
+
+void ips_proto_show_header(struct ips_message_header *p_hdr, char *msg)
+{
+	psmi_seqnum_t ack_seq_num;
+
+	printf("\nHeader decoding in hex: %s\n", msg ? msg : "");
+
+	printf("LRH: VL4-LVer4-SL4-Res2-LNH2: %x\n",
+	       __be16_to_cpu(p_hdr->lrh[0]));
+	printf("LRH: DLID %x\n", __be16_to_cpu(p_hdr->lrh[1]));
+	printf("LRH: Res4-PktLen12 %x\n", __be16_to_cpu(p_hdr->lrh[2]));
+	printf("LRH: SLID %x\n", __be16_to_cpu(p_hdr->lrh[3]));
+
+	printf("BTH: OpCode8-SE1-M1-PC2-TVer4-Pkey16 %x\n",
+	       __be32_to_cpu(p_hdr->bth[0]));
+	printf("BTH: Res24-Flow8 %x\n", __be32_to_cpu(p_hdr->bth[1]));
+	printf("BTH: A1-PSN31 %x\n", __be32_to_cpu(p_hdr->bth[2]));
+
+	printf("IPH: jkey-hcrc %x\n", __le32_to_cpu(p_hdr->khdr.kdeth1));
+	printf("IPH: kver-sh-intr-tidctrl-tid-om-offset %x\n",
+	       __le32_to_cpu(p_hdr->khdr.kdeth0));
+
+	printf("opcode %x\n", _get_proto_hfi_opcode(p_hdr));
+
+	ack_seq_num.psn_num = p_hdr->ack_seq_num;
+	if (GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0)))
+		printf("TidFlow Flow: %x, Gen: %x, Seq: %x\n",
+		       (__be32_to_cpu(p_hdr->bth[1]) >>
+			HFI_BTH_FLOWID_SHIFT) & HFI_BTH_FLOWID_MASK,
+		       (__be32_to_cpu(p_hdr->bth[2]) >>
+			HFI_BTH_GEN_SHIFT) & HFI_BTH_GEN_MASK,
+		       (__be32_to_cpu(p_hdr->bth[2]) >>
+			HFI_BTH_SEQ_SHIFT) & HFI_BTH_SEQ_MASK);
+	else if (ips_proto_flowid(p_hdr) == EP_FLOW_TIDFLOW)
+		printf("ack_seq_num gen %x, seq %x\n",
+		       ack_seq_num.psn_gen, ack_seq_num.psn_seq);
+	else
+		printf("ack_seq_num %x\n", ack_seq_num.psn_num);
+
+	printf("src_rank/connidx %x\n", p_hdr->connidx);
+	if (GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0)))
+		printf("tid_session_gen %d\n", p_hdr->exp_rdescid_genc);
+	printf("flags %x\n", p_hdr->flags);
+}
+
+
+
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_expected.c b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_expected.c
new file mode 100644
index 0000000000000000000000000000000000000000..643a271f5ed91176897eb78cec40616bcb9daca2
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_expected.c
@@ -0,0 +1,2520 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2016 Intel Corporation. All rights reserved. */
+
+// This file implements the TID protocol for STL100 and the RDMA
+// protocol for UD mode.  The majority of functions in this file (perhaps all)
+// are not used when TID/RDMA is disabled via PSM3_TID or PSM3_RDMA respectively.
+// RDMA is N/A for UDP, so it will behave as if PSM3_RDMA is disabled
+// and not use functions in this file.
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+
+#include "ips_scb.h"
+#include "ips_tid.h"
+#include "ips_tidflow.h"
+#include "ips_proto.h"
+#include "ips_expected_proto.h"
+#include "ips_proto_help.h"
+#include "psm_mq_internal.h"
+
+/*
+ * Timer callbacks.  When we need work to be done out of the receive process
+ * loop, we schedule work on timers to be done at a later time.
+ */
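+/*
+ * A minimal sketch of the deferral pattern (same calls as the callbacks
+ * below use): rather than sending from inside the receive path, we queue
+ * the work and arm a timer, and the callback drains the queue later:
+ *
+ *     STAILQ_INSERT_TAIL(&protoexp->pend_sendq, tidsendc, next);
+ *     psmi_timer_request(protoexp->timerq, &protoexp->timer_send,
+ *                        PSMI_TIMER_PRIO_1);
+ */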
+static psm2_error_t
+ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current);
+
+static psm2_error_t
+ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current);
+
+#ifdef RNDV_MOD
+static void ips_protoexp_send_err_chk_rdma_resp(struct ips_flow *flow);
+static void ips_tid_reissue_rdma_write(struct ips_tid_send_desc *tidsendc);
+#endif
+
+
+static void ips_tid_scbavail_callback(struct ips_scbctrl *scbc, void *context);
+static void ips_tidflow_avail_callback(struct ips_tf *tfc, void *context);
+
+
+static psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc);
+static psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc);
+
+#ifdef PSM_CUDA
+static
+void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp,
+			      struct ips_tid_send_desc *tidsendc);
+static void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp,
+					psm2_mq_req_t req,
+					struct ips_tid_send_desc *tidsendc,
+					struct ips_cuda_hostbuf *chb_prev,
+					uint32_t tsess_srcoff,
+					uint32_t tsess_length,
+					uint32_t tsess_unaligned_start,
+					psm2_chb_match_type_t type);
+#endif
+
+psm2_error_t
+MOCKABLE(ips_protoexp_init)(const psmi_context_t *context,
+		  const struct ips_proto *proto,
+		  uint32_t protoexp_flags,
+		  int num_of_send_bufs,
+		  int num_of_send_desc, struct ips_protoexp **protoexp_o)
+{
+	struct ips_protoexp *protoexp = NULL;
+	psm2_error_t err = PSM2_OK;
+
+	protoexp = (struct ips_protoexp *)
+	    psmi_calloc(context->ep, UNDEFINED, 1, sizeof(struct ips_protoexp));
+	if (protoexp == NULL) {
+		err = PSM2_NO_MEMORY;
+		goto fail;
+	}
+	*protoexp_o = protoexp;
+
+	protoexp->ptl = (const struct ptl *)proto->ptl;
+	protoexp->proto = (struct ips_proto *)proto;
+	protoexp->timerq = proto->timerq;
+	protoexp->tid_flags = protoexp_flags;
+
+	if (context->ep->memmode == PSMI_MEMMODE_MINIMAL) {
+		protoexp->tid_flags |= IPS_PROTOEXP_FLAG_CTS_SERIALIZED;
+	}
+
+
+	/* Must be initialized already */
+	/* Commented out because of a Klocwork scanning critical error. CQ 11/16/2012
+	   psmi_assert_always(proto->ep != NULL && proto->ep->mq != NULL &&
+	   proto->ep->mq->rreq_pool != NULL &&
+	   proto->ep->mq->sreq_pool != NULL);
+	 */
+	psmi_assert_always(proto->timerq != NULL);
+
+	/* These request pools are managed by the MQ component */
+	protoexp->tid_sreq_pool = proto->ep->mq->sreq_pool;
+	protoexp->tid_rreq_pool = proto->ep->mq->rreq_pool;
+
+	protoexp->ctrl_xfer_type = PSM_TRANSFER_PIO;
+
+	/* Initialize tid flow control. */
+	err = ips_tf_init(protoexp, context, &protoexp->tfc,
+			       ips_tidflow_avail_callback);
+	if (err != PSM2_OK)
+		goto fail;
+
+
+	if ((err = ips_scbctrl_init(context, num_of_send_desc, 0,
+				    0, 0, ips_tid_scbavail_callback,
+				    protoexp, &protoexp->tid_scbc_rv)))
+		goto fail;
+
+
+	{
+		union psmi_envvar_val env_rts_cts_interleave;
+
+		psmi_getenv("PSM3_RTS_CTS_INTERLEAVE",
+			    "Interleave the handling of RTS to provide a fair distribution between multiple senders",
+			    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+			    (union psmi_envvar_val)0, &env_rts_cts_interleave);
+		if (env_rts_cts_interleave.e_uint)
+			protoexp->tid_flags |= IPS_PROTOEXP_FLAG_RTS_CTS_INTERLEAVE;
+	}
+
+	/* Send descriptors.
+	 *
+	 * There can be up to 2^32 of these send descriptors.  We conservatively
+	 * allocate 256 but large node configurations can allocate up to sdesc_num
+	 * of these (they are about 2k each).
+	 * We impose a theoretical limit of 2^30.
+	 */
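+	/* Rough sizing under the figures above (assumed, not measured): the
+	 * default of 256 descriptors at ~2k each is ~512k; the 2^30 cap is a
+	 * sanity bound only and is never allocated up front. */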
+	{
+		struct psmi_rlimit_mpool rlim = TID_SENDSESSIONS_LIMITS;
+		uint32_t maxsz, chunksz;
+
+		if ((err = psmi_parse_mpool_env(protoexp->proto->mq, 1,
+						&rlim, &maxsz, &chunksz)))
+			goto fail;
+
+		protoexp->tid_desc_send_pool =
+		    psmi_mpool_create(sizeof(struct ips_tid_send_desc), chunksz,
+				      maxsz, 0, DESCRIPTORS, NULL, NULL);
+
+		if (protoexp->tid_desc_send_pool == NULL) {
+			err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
+						"Couldn't allocate tid descriptor memory pool");
+			goto fail;
+		}
+	}
+
+	/* Receive descriptors are an array in tidflow structure. */
+
+	/* This pool can never be smaller than the max number of rreqs that can be
+	 * allocated. */
+	{
+		uint32_t rreq_per_chunk, rreq_max;
+
+		psmi_assert_always(protoexp->proto->mq->rreq_pool != NULL);
+
+		psmi_mpool_get_obj_info(protoexp->proto->mq->rreq_pool,
+					&rreq_per_chunk, &rreq_max);
+
+		protoexp->tid_getreq_pool =
+		    psmi_mpool_create(sizeof(struct ips_tid_get_request),
+				      rreq_per_chunk, rreq_max, 0, DESCRIPTORS,
+				      NULL, NULL);
+
+		if (protoexp->tid_getreq_pool == NULL) {
+			err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
+						"Couldn't allocate getreq descriptor memory pool");
+			goto fail;
+		}
+	}
+
+	/* Timers to handle requeueing of work out of the receive path */
+	psmi_timer_entry_init(&protoexp->timer_send,
+			      ips_tid_pendsend_timer_callback, protoexp);
+	STAILQ_INIT(&protoexp->pend_sendq);
+	psmi_timer_entry_init(&protoexp->timer_getreqs,
+			      ips_tid_pendtids_timer_callback, protoexp);
+	STAILQ_INIT(&protoexp->pend_getreqsq);
+#ifdef RNDV_MOD
+	STAILQ_INIT(&protoexp->pend_err_resp);
+#endif
+
+
+
+#ifdef PSM_CUDA
+	{
+		if (PSMI_IS_CUDA_ENABLED &&
+			 !(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) {
+			struct psmi_rlimit_mpool rlim = CUDA_HOSTBUFFER_LIMITS;
+			uint32_t maxsz, chunksz, max_elements;
+
+			if ((err = psmi_parse_mpool_env(protoexp->proto->mq, 1,
+							&rlim, &maxsz, &chunksz)))
+				goto fail;
+
+			/* the maxsz is the amount in MB, not the number of entries,
+			 * since the element size depends on the window size */
+			max_elements = (maxsz*1024*1024) / proto->mq->hfi_base_window_rv;
+			/* mpool requires max_elements to be a power of 2; round down. */
+			max_elements = 1 << (31 - __builtin_clz(max_elements));
+			protoexp->cuda_hostbuf_recv_cfg.bufsz =
+				proto->mq->hfi_base_window_rv;
+
+			protoexp->cuda_hostbuf_pool_recv =
+				psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf),
+							   chunksz, max_elements, 0,
+							   UNDEFINED, NULL, NULL,
+							   psmi_cuda_hostbuf_alloc_func,
+							   (void *)
+							   &protoexp->cuda_hostbuf_recv_cfg);
+
+			if (protoexp->cuda_hostbuf_pool_recv == NULL) {
+				err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
+							"Couldn't allocate CUDA host receive buffer pool");
+				goto fail;
+			}
+
+			protoexp->cuda_hostbuf_small_recv_cfg.bufsz =
+				CUDA_SMALLHOSTBUF_SZ;
+			protoexp->cuda_hostbuf_pool_small_recv =
+				psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf),
+							   chunksz, max_elements, 0,
+							   UNDEFINED, NULL, NULL,
+							   psmi_cuda_hostbuf_alloc_func,
+							   (void *)
+							   &protoexp->cuda_hostbuf_small_recv_cfg);
+
+			if (protoexp->cuda_hostbuf_pool_small_recv == NULL) {
+				err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
+							"Couldn't allocate CUDA host small receive buffer pool");
+				goto fail;
+			}
+
+			protoexp->cudastream_recv = NULL;
+			STAILQ_INIT(&protoexp->cudapend_getreqsq);
+		} else {
+			protoexp->cuda_hostbuf_pool_recv = NULL;
+			protoexp->cuda_hostbuf_pool_small_recv = NULL;
+		}
+	}
+#endif
+	psmi_assert(err == PSM2_OK);
+	return err;
+
+fail:
+#ifdef PSM_CUDA
+	if (protoexp != NULL && protoexp->cuda_hostbuf_pool_recv != NULL)
+		psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_recv);
+	if (protoexp != NULL && protoexp->cuda_hostbuf_pool_small_recv != NULL)
+		psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_small_recv);
+#endif
+	if (protoexp != NULL && protoexp->tid_getreq_pool != NULL)
+		psmi_mpool_destroy(protoexp->tid_getreq_pool);
+	if (protoexp != NULL && protoexp->tid_desc_send_pool != NULL)
+		psmi_mpool_destroy(protoexp->tid_desc_send_pool);
+	if (protoexp != NULL)
+		ips_scbctrl_fini(&protoexp->tid_scbc_rv);
+	if (protoexp != NULL)
+		psmi_free(protoexp);
+	return err;
+}
+MOCK_DEF_EPILOGUE(ips_protoexp_init);
+
+psm2_error_t ips_protoexp_fini(struct ips_protoexp *protoexp)
+{
+	psm2_error_t err = PSM2_OK;
+
+#ifdef PSM_CUDA
+	if(PSMI_IS_CUDA_ENABLED &&
+		 !(protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) {
+		psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_small_recv);
+		psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_recv);
+		if (protoexp->cudastream_recv != NULL) {
+			PSMI_CUDA_CALL(cuStreamDestroy, protoexp->cudastream_recv);
+		}
+	}
+#endif
+	psmi_mpool_destroy(protoexp->tid_getreq_pool);
+	psmi_mpool_destroy(protoexp->tid_desc_send_pool);
+
+	if ((err = ips_scbctrl_fini(&protoexp->tid_scbc_rv)))
+		goto fail;
+
+
+	/* finalize tid flow control. */
+	if ((err = ips_tf_fini(&protoexp->tfc)))
+		goto fail;
+
+
+	psmi_free(protoexp);
+
+fail:
+	return err;
+}
+
+/* New scbs now available.  If we have pending sends or pending get requests,
+ * turn on the timer so they can be processed. */
+/* For RDMA we can also use this routine when an MR is freed; scbc is not
+ * used in that case. */
+static
+void ips_tid_scbavail_callback(struct ips_scbctrl *scbc, void *context)
+{
+	struct ips_protoexp *protoexp = (struct ips_protoexp *)context;
+
+	if (!STAILQ_EMPTY(&protoexp->pend_sendq))
+		psmi_timer_request(protoexp->timerq,
+				   &protoexp->timer_send, PSMI_TIMER_PRIO_1);
+	if (!STAILQ_EMPTY(&protoexp->pend_getreqsq)
+#ifdef RNDV_MOD
+		|| !STAILQ_EMPTY(&protoexp->pend_err_resp)
+#endif
+		)
+		psmi_timer_request(protoexp->timerq,
+				   &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1);
+	return;
+}
+
+void ips_tid_mravail_callback(struct ips_proto *proto)
+{
+	// if we have Send DMA but not RDMA, no proto->protoexp
+	if (proto->protoexp)
+		ips_tid_scbavail_callback(NULL, proto->protoexp);
+}
+
+
+// On STL100 ips_tf is a user space control for the HW tidflow which
+// would fully process most valid inbound EXPTID packets within an RV Window.
+// For UD we maintain the user space control to help manage each active
+// RV window.
+// There is one CTS per RV window (typically 128K).
+// For UD with RV, RDMA is used instead of EXPTID, with 1 RDMA per RV window.
+// Typically there are 32 (HFI_TF_NFLOWS) configured.
+// The 32 is hard-coded; it could be made tunable.
+// The tidflow provides a natural pacing mechanism and limits the total amount
+// of inflight EXPTID or RDMA incoming to a given receiver.
+// In addition, on STL100 there is an upper bound on TIDs which limits total
+// inbound DMA for a receiver to about 4MB. For smaller messages tidflow
+// count may be the limit; for larger messages TIDs would be the limit.
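+// Worked example (typical values assumed): 32 tidflows * 128K RV window
+// = 4M of EXPTID/RDMA data inflight to one receiver, consistent with the
+// ~4MB TID bound noted above.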
+
+/* New Tid Flows are available. If there are pending get requests put the
+ * get timer on the timerq so it can be processed. */
+static
+void ips_tidflow_avail_callback(struct ips_tf *tfc, void *context)
+{
+	struct ips_protoexp *protoexp = (struct ips_protoexp *)context;
+
+	if (!STAILQ_EMPTY(&protoexp->pend_getreqsq))
+	{
+		psmi_timer_request(protoexp->timerq,
+				   &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1);
+	}
+	return;
+}
+
+// this is called from ips_proto_mq_rts_match_callback when an RTS is matched
+// and we choose to use the TID receive mechanism.
+// It kicks off the receiver side protocol for preparing TIDs and issuing a
+// CTS which requests use of TID.
+/*
+ * The tid get request is always issued from within the receive progress loop,
+ * which is why we always enqueue the request instead of issuing it directly.
+ * Eventually, if we expose tid_get to users, we will want to differentiate
+ * when the request comes from the receive progress loop from cases where the
+ * tid_get is issued directly from user code.
+ *
+ */
+psm2_error_t
+ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp,
+				void *buf,
+				uint32_t length,
+				psm2_epaddr_t epaddr,
+				uint32_t remote_tok,
+				uint32_t flags,
+				ips_tid_completion_callback_t callback,
+				psm2_mq_req_t req)
+{
+	struct ips_tid_get_request *getreq;
+	int count;
+	int tidflows;
+	uint64_t nbytes;
+
+	PSM2_LOG_MSG("entering");
+	psmi_assert((((ips_epaddr_t *) epaddr)->window_rv % PSMI_PAGESIZE) == 0);
+	getreq = (struct ips_tid_get_request *)
+	    psmi_mpool_get(protoexp->tid_getreq_pool);
+
+	/* We can't *really* run out of these here because we always allocate as
+	 * many as there are available receive reqs */
+	if_pf(getreq == NULL)
+	{
+		PSM2_LOG_MSG("leaving");
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+			      "Ran out of 'getreq' descriptors");
+	}
+
+	getreq->tidgr_protoexp = protoexp;
+	getreq->tidgr_epaddr = epaddr;
+	getreq->tidgr_lbuf = buf;
+	getreq->tidgr_length = length;
+	getreq->tidgr_sendtoken = remote_tok;
+	getreq->tidgr_req = req;
+	getreq->tidgr_callback = callback;
+	getreq->tidgr_offset = 0;
+	getreq->tidgr_bytesdone = 0;
+	getreq->tidgr_flags = flags;
+
+#ifdef PSM_CUDA
+	if ((req->is_buf_gpu_mem &&
+	    !(protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) ||
+	    ((req->is_buf_gpu_mem &&
+	     (protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) &&
+	     (length > gpudirect_recv_limit
+		|| length & 0x03 || (uintptr_t)buf & 0x03
+ 		)))) {
+		getreq->cuda_hostbuf_used = 1;
+		getreq->tidgr_cuda_bytesdone = 0;
+		STAILQ_INIT(&getreq->pend_cudabuf);
+		protoexp->proto->strat_stats.rndv_rdma_hbuf_recv++;
+		protoexp->proto->strat_stats.rndv_rdma_hbuf_recv_bytes += length;
+	} else {
+		getreq->cuda_hostbuf_used = 0;
+		if (req->is_buf_gpu_mem) {
+			protoexp->proto->strat_stats.rndv_rdma_gdr_recv++;
+			protoexp->proto->strat_stats.rndv_rdma_gdr_recv_bytes += length;
+		} else {
+#endif
+			protoexp->proto->strat_stats.rndv_rdma_cpu_recv++;
+			protoexp->proto->strat_stats.rndv_rdma_cpu_recv_bytes += length;
+#ifdef PSM_CUDA
+		}
+	}
+#endif
+
+	/* nbytes is the bytes each channel should transfer. */
+	count = ((ips_epaddr_t *) epaddr)->msgctl->ipsaddr_count;
+#ifdef PSM_CUDA
+	if (req->is_buf_gpu_mem)
+		nbytes = PSMI_ALIGNUP((length + count - 1) / count, PSMI_GPU_PAGESIZE);
+	else
+#endif
+		nbytes = PSMI_ALIGNUP((length + count - 1) / count, PSMI_PAGESIZE);
+	getreq->tidgr_rndv_winsz =
+	    min(nbytes, ((ips_epaddr_t *) epaddr)->window_rv);
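+	/* Worked example (values assumed): length = 1M split over count = 2
+	 * rails gives nbytes = 512K (page aligned); with window_rv = 128K the
+	 * transfer then proceeds in 128K windows per rail. */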
+	_HFI_MMDBG("posting TID get request: nbytes=%"PRIu64" winsz=%u len=%u\n",
+				 nbytes, getreq->tidgr_rndv_winsz, getreq->tidgr_length);
+	// we have now computed the size of each TID sequence (tidgr_rndv_winsz)
+
+	STAILQ_INSERT_TAIL(&protoexp->pend_getreqsq, getreq, tidgr_next);
+	// by using tidflows we also constrain the amount of concurrent RDMA to our NIC
+	tidflows = ips_tf_available(&protoexp->tfc);
+	_HFI_MMDBG("available tidflow %u\n", tidflows);
+
+	if (tidflows > 0)
+		// get the actual TIDs and tidflows and send the CTS
+		ips_tid_pendtids_timer_callback(&protoexp->timer_getreqs, 0);
+	else if (tidflows != -1)
+		// out of TIDs, set a timer to try again later
+		psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs,
+				   PSMI_TIMER_PRIO_1);
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK;
+}
+
+/* List of perf events */
+#define _ips_logeventid_tid_send_reqs	0	/* out of tid send descriptors */
+
+#define ips_logevent_id(event)	 _ips_logeventid_ ## event
+#define ips_logevent(proto, event, ptr) ips_logevent_inner(proto, ips_logevent_id(event), ptr)
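+/* Typical use (illustrative): ips_logevent(proto, tid_send_reqs, epaddr)
+ * expands to ips_logevent_inner(proto, 0, epaddr). */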
+
+static
+void ips_logevent_inner(struct ips_proto *proto, int eventid, void *context)
+{
+	uint64_t t_now = get_cycles();
+
+	switch (eventid) {
+	case ips_logevent_id(tid_send_reqs):{
+			psm2_epaddr_t epaddr = (psm2_epaddr_t) context;
+			proto->psmi_logevent_tid_send_reqs.count++;
+
+			if (t_now >=
+			    proto->psmi_logevent_tid_send_reqs.next_warning) {
+				psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_OK,
+						  "Non-fatal temporary exhaustion of send rdma descriptors "
+						  "(elapsed=%.3fs, source LID=0x%x/context=%d, count=%lld)",
+						  (double)cycles_to_nanosecs(t_now - proto->t_init) / 1.0e9,
+						  (int)psm2_epid_nid(epaddr->epid),
+						  (int)psm2_epid_context(epaddr->epid),
+						  (long long)proto->psmi_logevent_tid_send_reqs.count);
+				proto->psmi_logevent_tid_send_reqs.next_warning =
+				    t_now +
+				    sec_2_cycles(proto->psmi_logevent_tid_send_reqs.interval_secs);
+			}
+		}
+		break;
+
+	default:
+		break;
+	}
+
+	return;
+}
+
+/*
+ * Expected Protocol.
+ *
+ * We're granted tids (as part of a tid get request) and expected to fulfill
+ * the request by associating the request's sendtoken to a tid send descriptor.
+ *
+ * It's possible to be out of tid send descriptors when somehow all allocated
+ * descriptors can't complete all of their sends.  For example, the targets of
+ * the sends may be busy in computation loops and not processing incoming
+ * packets.
+ */
+
+// build and issue CTS
+void
+ips_protoexp_send_tid_grant(struct ips_tid_recv_desc *tidrecvc)
+{
+	ips_epaddr_t *ipsaddr = tidrecvc->ipsaddr;
+	struct ips_proto *proto = tidrecvc->protoexp->proto;
+	psmi_assert(proto->msgflowid < EP_FLOW_LAST);
+	struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid];
+	ips_scb_t *scb;
+
+	scb = tidrecvc->grantscb;
+	ips_scb_opcode(scb) = OPCODE_LONG_CTS;
+	scb->ips_lrh.khdr.kdeth0 = 0;
+	scb->ips_lrh.mdata = tidrecvc->tidflow_genseq.psn_val;
+	scb->ips_lrh.data[0] = tidrecvc->rdescid;
+	scb->ips_lrh.data[1].u32w1 = tidrecvc->getreq->tidgr_length;
+	scb->ips_lrh.data[1].u32w0 = tidrecvc->getreq->tidgr_sendtoken;
+
+	ips_scb_buffer(scb) = (void *)&tidrecvc->tid_list;
+	ips_scb_length(scb) = sizeof(tidrecvc->tid_list);
+	_HFI_MMDBG("sending CTS\n");
+
+	PSM2_LOG_EPM(OPCODE_LONG_CTS,PSM2_LOG_TX, proto->ep->epid,
+		    flow->ipsaddr->epaddr.epid ,"tidrecvc->getreq->tidgr_sendtoken; %d",
+		    tidrecvc->getreq->tidgr_sendtoken);
+	proto->epaddr_stats.cts_rdma_send++;
+
+	ips_proto_flow_enqueue(flow, scb);
+	flow->flush(flow, NULL);
+}
+
+
+#ifdef PSM_CUDA
+static
+void psmi_deallocate_chb(struct ips_cuda_hostbuf* chb)
+{
+	PSMI_CUDA_CALL(cuMemFreeHost, chb->host_buf);
+	PSMI_CUDA_CALL(cuEventDestroy, chb->copy_status);
+	psmi_free(chb);
+	return;
+}
+#endif
+
+// indicate the given tidsendc has been completed and cleanup after it
+static void
+ips_protoexp_tidsendc_complete(struct ips_tid_send_desc *tidsendc)
+{
+	struct ips_protoexp *protoexp = tidsendc->protoexp;
+	psm2_mq_req_t req = tidsendc->mqreq;
+
+	_HFI_MMDBG("ips_protoexp_tidsendc_complete\n");
+	PSM2_LOG_MSG("entering");
+
+	req->send_msgoff += tidsendc->length;
+
+	if (tidsendc->mr) {
+		_HFI_MMDBG("send chunk complete, releasing MR: rkey: 0x%x\n", tidsendc->mr->rkey);
+		psm2_verbs_release_mr(tidsendc->mr);
+		tidsendc->mr = NULL;
+	}
+
+#ifdef PSM_CUDA
+	if (req->cuda_hostbuf_used) {
+		if (tidsendc->cuda_num_buf == 1) {
+			tidsendc->cuda_hostbuf[0]->bytes_read +=
+				tidsendc->tid_list.tsess_length;
+			if(tidsendc->cuda_hostbuf[0]->bytes_read ==
+				tidsendc->cuda_hostbuf[0]->size){
+				STAILQ_REMOVE(&req->sendreq_prefetch,
+					      tidsendc->cuda_hostbuf[0],
+					      ips_cuda_hostbuf, req_next);
+				if (tidsendc->cuda_hostbuf[0]->is_tempbuf)
+					psmi_deallocate_chb(tidsendc->cuda_hostbuf[0]);
+				else {
+					tidsendc->cuda_hostbuf[0]->req = NULL;
+					tidsendc->cuda_hostbuf[0]->offset = 0;
+					tidsendc->cuda_hostbuf[0]->bytes_read = 0;
+					psmi_mpool_put(tidsendc->cuda_hostbuf[0]);
+				}
+				psmi_cuda_run_prefetcher(protoexp, tidsendc);
+			}
+		} else
+			psmi_free(tidsendc->userbuf);
+	}
+#endif
+	/* Check if we can complete the send request. */
+	_HFI_MMDBG("ips_protoexp_tidsendc_complete off %u req len %u\n",
+		req->send_msgoff, req->req_data.send_msglen);
+	if (req->send_msgoff >= req->req_data.send_msglen) {
+		psmi_mq_handle_rts_complete(req);
+	}
+
+	psmi_mpool_put(tidsendc);
+	/* We freed an MR.  If we have pending sends or pending get requests,
+	 * turn on the timer so it can be processed. */
+	ips_tid_mravail_callback(protoexp->proto);
+
+	PSM2_LOG_MSG("leaving");
+}
+
+// our RDMA Write has completed on our send Q (RV or user space RC QP)
+// This is called by the send CQE polling which might be within a send
+// so it cannot issue any sends directly; otherwise we would recurse,
+// potentially deeply if more send CQEs are found.
+// key notes in this regard:
+//	OPA100 code which may send acks here is ifdef'ed out since N/A to RC QP RDMA
+//	psmi_mq_handle_rts_complete - sets flags in req and queues it, no callbacks
+//	psmi_mpool_put(tidsendc) - tid_desc_send_pool has no callback configured
+//	ips_tid_mravail_callback - psmi_timer_request call queues timer for future
+//							callback  (no immediate callback)
+//	psmi_mpool_put(tidsendc->cuda_hostbuf[0]) - cuda_hostbuf_pool_send has a
+//							callback of psmi_cuda_hostbuf_alloc_func which
+//							manages cuda buffers but does not issue any sends
+
+int
+ips_protoexp_rdma_write_completion(uint64_t wr_id)
+{
+	struct ips_tid_send_desc *tidsendc = (struct ips_tid_send_desc *)(uintptr_t)wr_id;
+
+	_HFI_MMDBG("ips_protoexp_rdma_write_completion\n");
+	PSM2_LOG_MSG("entering");
+
+	ips_protoexp_tidsendc_complete(tidsendc);
+
+	PSM2_LOG_MSG("leaving");
+	return IPS_RECVHDRQ_CONTINUE;
+}
+
+#ifdef RNDV_MOD
+// our RV RDMA Write has completed with error on our send Q
+// This is called by the send CQE polling which might be within a send
+// so it cannot issue any sends directly; otherwise we would recurse,
+// potentially deeply if more send CQEs are found.
+// key notes in this regard:
+// if we don't return PSM2_OK, caller will consider it an unrecoverable error
+int
+ips_protoexp_rdma_write_completion_error(psm2_ep_t ep, uint64_t wr_id,
+												enum ibv_wc_status wc_status)
+{
+	struct ips_tid_send_desc *tidsendc = (struct ips_tid_send_desc *)(uintptr_t)wr_id;
+	struct ips_protoexp *protoexp;
+
+	PSM2_LOG_MSG("entering");
+	if (! tidsendc) {
+		psmi_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+			"rv RDMA Write with invalid tidsendc: status: '%s' (%d)\n",
+			ibv_wc_status_str(wc_status),(int)wc_status);
+		goto fail_ret;
+	}
+	protoexp = tidsendc->protoexp;
+	_HFI_MMDBG("failed rv RDMA Write on %s to %s status: '%s' (%d)\n",
+			ep->dev_name,
+			psmi_epaddr_get_name(tidsendc->ipsaddr->epaddr.epid),
+			ibv_wc_status_str(wc_status),(int)wc_status);
+
+	if (! protoexp->proto->ep->rv_reconnect_timeout)
+		goto fail; /* reconnect disabled, can't recover */
+
+	// perhaps depending on wc_status
+	// IBV_WC_REM_ACCESS_ERR and others unrecoverable
+	// IBV_WC_RETRY_EXC_ERR may be recoverable
+	// IBV_WC_RNR_RETRY_EXC_ERR may be recoverable
+	// IBV_WC_RESP_TIMEOUT_ERR may be recoverable (is this applicable?)
+	// any others?  IB_WC_GENERAL_ERR?
+
+	tidsendc->rv_need_err_chk_rdma = 1;
+	tidsendc->is_complete = 0;	// status of send of err_chk_rdma
+
+	/* Add as a pending op and ring up the timer */
+	/* ips_tid_pendsend_timer_callback timer will issue ERR_CHK_RDMA */
+	STAILQ_INSERT_TAIL(&protoexp->pend_sendq, tidsendc, next);
+	psmi_timer_request(protoexp->timerq, &protoexp->timer_send,
+		   PSMI_TIMER_PRIO_1);
+
+	return PSM2_OK;
+
+fail:
+	psmi_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+			"failed rv RDMA Write on %s to %s status: '%s' (%d)\n",
+			ep->dev_name,
+			psmi_epaddr_get_name(tidsendc->ipsaddr->epaddr.epid),
+			ibv_wc_status_str(wc_status),(int)wc_status);
+fail_ret:
+	PSM2_LOG_MSG("leaving");
+	return PSM2_INTERNAL_ERR;
+}
+#endif // RNDV_MOD
+
+#ifdef RNDV_MOD
+static psm2_error_t ips_protoexp_send_err_chk_rdma(struct ips_tid_send_desc *tidsendc)
+{
+	ips_scb_t *scb = NULL;
+	struct ips_protoexp *protoexp = tidsendc->protoexp;
+	struct ips_proto *proto = protoexp->proto;
+	ips_epaddr_t *ipsaddr = tidsendc->ipsaddr;
+	struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid];
+	psm2_error_t err = PSM2_OK;
+	uint32_t conn_count;
+
+	PSM2_LOG_MSG("entering");
+	_HFI_MMDBG("ips_protoexp_send_err_chk_rdma\n");
+
+	// we delay our sending of err chk rdma until after the connection is
+	// restored as reflected by an increment of conn_count relative to when
+	// tidsendc issued the rdma_write.  This also forms a barrier to
+	// ensure our err chk rdma does not arrive at receiver prior to the
+	// rdma completion (e.g. in case we timed out waiting for the RC QP ack
+	// but the receiver got the full rdma write).
+	if (__psm2_rv_get_conn_count(proto->ep->verbs_ep.rv, ipsaddr->rv_conn,
+			tidsendc->rv_sconn_index, &conn_count)) {
+		psmi_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+			"send_err_chk_rdma: Connect unrecoverable on %s to %s\n",
+			proto->ep->dev_name,
+			psmi_epaddr_get_name(ipsaddr->epaddr.epid));
+		err = PSM2_TIMEOUT; /* force a resend reschedule */
+		goto done;
+	}
+
+	// conn_count only advances, so we only need to test for equality.
+	// a 32-bit conn_count is sufficient for ~13 years of constant
+	// reconnects at 100ms intervals (e.g. RV_DELAY) before wrapping
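+	// (sanity check: 2^32 reconnects * 0.1s ~= 4.3e8 s ~= 13.6 years)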
+	if (conn_count == tidsendc->rv_conn_count) {
+		err = PSM2_TIMEOUT; /* force a resend reschedule */
+		goto done;
+	}
+
+	// limit to 1 outstanding per remote connection.
+	// receiver can only queue 1 response if it's low on scb's
+	if (ipsaddr->rv_err_chk_rdma_outstanding) {
+		err = PSM2_TIMEOUT; /* force a resend reschedule */
+		goto done;
+	}
+
+	scb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0);
+	if (scb == NULL) {
+		// ips_tid_scbavail_callback will trigger pend_sendq again
+		// and call ips_tid_pendsend_timer_callback
+		err = PSM2_EP_NO_RESOURCES;
+		goto done;
+	}
+
+	_HFI_MMDBG("sending ERR_CHK_RDMA\n");
+	PSM2_LOG_EPM(OPCODE_ERR_CHK_RDMA,PSM2_LOG_TX, proto->ep->epid,
+			ipsaddr->epaddr.epid,
+			"psmi_mpool_get_obj_index(tidsendc->mqreq): %d, tidsendc->rdescid. _desc_genc %d _desc_idx: %d, tidsendc->sdescid._desc_idx: %d",
+			psmi_mpool_get_obj_index(tidsendc->mqreq),
+			tidsendc->rdescid._desc_genc,tidsendc->rdescid._desc_idx,
+			tidsendc->sdescid._desc_idx);
+
+	ips_scb_opcode(scb) = OPCODE_ERR_CHK_RDMA;
+	scb->ips_lrh.khdr.kdeth0 = 0;
+	// providing our REQ index gives receiver an extra sanity check
+	scb->ips_lrh.mdata = psmi_mpool_get_obj_index(tidsendc->mqreq);
+	scb->ips_lrh.data[0] = tidsendc->rdescid;
+	scb->ips_lrh.data[1] = tidsendc->sdescid;
+	/* path is having issue, ask for ack */
+	scb->scb_flags |= IPS_SEND_FLAG_ACKREQ;
+	/* INTR makes sure remote end works on it immediately */
+	if (proto->flags & IPS_PROTO_FLAG_RCVTHREAD)
+		scb->scb_flags |= IPS_SEND_FLAG_INTR;
+
+	ipsaddr->rv_err_chk_rdma_outstanding = 1;
+	tidsendc->is_complete = 1;	// status of send of err_chk_rdma
+
+	proto->epaddr_stats.err_chk_rdma_send++;
+
+	ips_proto_flow_enqueue(flow, scb);
+	flow->flush(flow, NULL);
+
+	/* inbound ack will free scb */
+done:
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+#endif // RNDV_MOD
+
+#ifdef RNDV_MOD
+// scan all alternate addresses for "expected" (multi-QP and multi-EP)
+// to see if a match for "got" can be found
+static
+int ips_protoexp_ipsaddr_match(ips_epaddr_t *expected, ips_epaddr_t *got)
+{
+	ips_epaddr_t *p = expected;
+
+	do {
+		if (p == got)
+			return 1;
+		p = p->next;
+	} while (p != expected);
+
+	return 0;
+}
+#endif // RNDV_MOD
+
+#ifdef RNDV_MOD
+int ips_protoexp_process_err_chk_rdma(struct ips_recvhdrq_event *rcv_ev)
+{
+	struct ips_proto *proto = rcv_ev->proto;
+	struct ips_protoexp *protoexp = proto->protoexp;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	__u32 sendtoken = p_hdr->mdata;
+	ptl_arg_t rdesc_id = p_hdr->data[0];
+	ptl_arg_t sdesc_id = p_hdr->data[1];
+	struct ips_tid_recv_desc *tidrecvc;
+	psmi_assert(proto->msgflowid < EP_FLOW_LAST);
+	struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid];
+
+	PSM2_LOG_MSG("entering");
+	_HFI_MMDBG("ips_protoexp_process_err_chk_rdma\n");
+
+	/* normal packet reliability protocol handling */
+	if (!ips_proto_is_expected_or_nak(rcv_ev))
+		goto done;
+
+	/* processing specific to err chk rdma packet */
+	proto->epaddr_stats.err_chk_rdma_recv++;
+
+	_HFI_MMDBG("received ERR_CHK_RDMA\n");
+	PSM2_LOG_EPM(OPCODE_ERR_CHK_RDMA,PSM2_LOG_RX,ipsaddr->epaddr.epid,
+			proto->ep->epid,
+			"rdescid._desc_genc %d _desc_idx: %d, sdescid._desc_idx: %d",
+			rdesc_id._desc_genc,rdesc_id._desc_idx, sdesc_id._desc_idx);
+
+	if (ipsaddr->rv_need_send_err_chk_rdma_resp) {
+		/* sender has >1 err chk rdma outstanding: protocol violation */
+		psmi_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+			"process_err_chk_rdma: Protocol Violation: > 1 outstanding from remote node %s on %s\n",
+			psmi_epaddr_get_name(ipsaddr->epaddr.epid),
+			proto->ep->dev_name);
+		goto do_acks;
+	}
+
+	/* Get receive descriptor */
+	psmi_assert(rdesc_id._desc_idx < HFI_TF_NFLOWS);
+	tidrecvc = &protoexp->tfc.tidrecvc[rdesc_id._desc_idx];
+
+	tidrecvc->stats.nErrChkReceived++;
+
+	// stash information to build resp in ipsaddr
+	psmi_assert(! ipsaddr->rv_need_send_err_chk_rdma_resp);
+	ipsaddr->rv_need_send_err_chk_rdma_resp = 1;
+	ipsaddr->rv_err_chk_rdma_resp_rdesc_id = rdesc_id;
+	ipsaddr->rv_err_chk_rdma_resp_sdesc_id = sdesc_id;
+
+	// for the rare case that err_chk_rdma has a rdescid which we completed
+	// a while ago, we need to sanity check not only rdescid, but also
+	// the identity of the sender and the sendtoken for the sender's RTS.
+	// this protects us in case rdescid generation has wrapped
+	if (tidrecvc->rdescid._desc_genc != rdesc_id._desc_genc
+		|| tidrecvc->state != TIDRECVC_STATE_BUSY
+		|| ! ips_protoexp_ipsaddr_match(tidrecvc->ipsaddr, ipsaddr)
+		|| tidrecvc->getreq->tidgr_sendtoken != sendtoken
+		) {
+		/* Receive descriptor mismatch in time and space.
+		 * Must have completed recv for this RDMA
+		 * (eg. sender timeout waiting for RC QP ack)
+		 */
+		ipsaddr->rv_err_chk_rdma_resp_need_resend = 0;
+	} else if (__psm2_rv_scan_cq(proto->ep->verbs_ep.rv, RV_WC_RECV_RDMA_WITH_IMM,
+				RDMA_IMMED_DESC_MASK,
+				RDMA_PACK_IMMED(tidrecvc->rdescid._desc_genc,
+								tidrecvc->rdescid._desc_idx, 0))) {
+		// the CQ scan above solves a very rare race where the receiving QP is
+		// very slow to issue CQEs and PSM happens to poll the UD QP and find
+		// the err chk rdma before finding a successful RDMA Write received.
+		// Due to reconnection essentially being a barrier, we know the
+		// CQE must be processed in RV drain prior to the new connection and
+		// hence prior to the err chk rdma on UD QP.  So we scan the RV CQ
+		// to close the race, if we find a matching completion we can
+		// respond with resend_needed=0 and know we will process the CQE
+		// soon to fully complete the RDMA receipt.
+		// We ignore RV_IDX in this scan; it should always match us, and it
+		// is better to not ask for a resend and fail when we process the
+		// completion than to ask for a resend into a freed buffer
+		ipsaddr->rv_err_chk_rdma_resp_need_resend = 0;
+	} else {
+		tidrecvc->stats.nReXmit++;
+		ipsaddr->rv_err_chk_rdma_resp_need_resend = 1;
+	}
+
+	// try to send it now, will remain "queued" until we can send
+	ips_protoexp_send_err_chk_rdma_resp(flow);
+	if (ipsaddr->rv_need_send_err_chk_rdma_resp)
+		// ips_tid_scbavail_callback will trigger pend_err_resp again
+		// and call ips_tid_pendtids_timer_callback
+		STAILQ_INSERT_TAIL(&protoexp->pend_err_resp, ipsaddr, pend_err_resp_next);
+
+do_acks:
+	if (__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ)
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq,
+					&ipsaddr->flows[ips_proto_flowid(p_hdr)]);
+
+	ips_proto_process_ack(rcv_ev);
+done:
+	PSM2_LOG_MSG("leaving");
+	return IPS_RECVHDRQ_CONTINUE;
+}
+#endif // RNDV_MOD
+
+
+#ifdef RNDV_MOD
+static
+void ips_protoexp_send_err_chk_rdma_resp(struct ips_flow *flow)
+{
+	ips_epaddr_t *ipsaddr = flow->ipsaddr;
+	struct ips_proto *proto = ipsaddr->epaddr.proto;
+	struct ips_protoexp *protoexp = proto->protoexp;
+	ips_scb_t *scb;
+
+	PSM2_LOG_MSG("entering");
+	_HFI_MMDBG("ips_protoexp_send_err_chk_rdma_resp\n");
+	psmi_assert(ipsaddr->rv_need_send_err_chk_rdma_resp);
+	scb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0);
+	if (scb == NULL) {
+		/* ips_tid_scbavail_callback() will reschedule */
+		return;
+	}
+
+	_HFI_MMDBG("sending ERR_CHK_RDMA_RESP\n");
+	PSM2_LOG_EPM(OPCODE_ERR_CHK_RDMA,PSM2_LOG_TX, proto->ep->epid,
+			ipsaddr->epaddr.epid,
+			"need_resend %d rdescid. _desc_genc %d _desc_idx: %d, sdescid._desc_idx: %d",
+			ipsaddr->rv_err_chk_rdma_resp_need_resend,
+			ipsaddr->rv_err_chk_rdma_resp_rdesc_id._desc_genc,
+			ipsaddr->rv_err_chk_rdma_resp_rdesc_id._desc_idx,
+			ipsaddr->rv_err_chk_rdma_resp_sdesc_id._desc_idx);
+
+	ips_scb_opcode(scb) = OPCODE_ERR_CHK_RDMA_RESP;
+	scb->ips_lrh.khdr.kdeth0 = 0;
+	scb->ips_lrh.mdata = ipsaddr->rv_err_chk_rdma_resp_need_resend;
+	scb->ips_lrh.data[0] = ipsaddr->rv_err_chk_rdma_resp_rdesc_id;
+	scb->ips_lrh.data[1] = ipsaddr->rv_err_chk_rdma_resp_sdesc_id;
+	/* path is having issue, ask for ack */
+	scb->scb_flags |= IPS_SEND_FLAG_ACKREQ;
+	/* INTR makes sure remote end works on it immediately */
+	if (proto->flags & IPS_PROTO_FLAG_RCVTHREAD)
+		scb->scb_flags |= IPS_SEND_FLAG_INTR;
+
+	// The scb will own reliable transmission of resp, we can clear flag
+	ipsaddr->rv_need_send_err_chk_rdma_resp = 0;
+
+	proto->epaddr_stats.err_chk_rdma_resp_send++;
+
+	ips_proto_flow_enqueue(flow, scb);
+	flow->flush(flow, NULL);
+
+	PSM2_LOG_MSG("leaving");
+	return;
+}
+#endif // RNDV_MOD
+
+#ifdef RNDV_MOD
+int ips_protoexp_process_err_chk_rdma_resp(struct ips_recvhdrq_event *rcv_ev)
+{
+	struct ips_protoexp *protoexp = rcv_ev->proto->protoexp;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_tid_send_desc *tidsendc;
+	uint32_t need_resend = p_hdr->mdata;
+	//ptl_arg_t rdesc_id = p_hdr->data[0];
+	ptl_arg_t sdesc_id = p_hdr->data[1];
+
+	PSM2_LOG_MSG("entering");
+	_HFI_MMDBG("ips_protoexp_process_err_chk_rdma_resp\n");
+
+	/* normal packet reliability protocol handling */
+	if (!ips_proto_is_expected_or_nak(rcv_ev))
+		goto done;
+
+	/* processing specific to err chk rdma resp packet */
+
+	protoexp->proto->epaddr_stats.err_chk_rdma_resp_recv++;
+
+	_HFI_MMDBG("received ERR_CHK_RDMA_RESP\n");
+	PSM2_LOG_EPM(OPCODE_ERR_CHK_RDMA,PSM2_LOG_RX,ipsaddr->epaddr.epid,
+			protoexp->proto->ep->epid,
+			"rdescid. _desc_genc %d _desc_idx: %d, sdescid._desc_idx: %d",
+			p_hdr->data[0]._desc_genc,p_hdr->data[0]._desc_idx,
+			sdesc_id._desc_idx);
+	/* Get the session send descriptor
+	 * a subset of get_tidflow in ips_proto_recv.c since we don't
+	 * have tidflow sequence numbers to check
+	 */
+	tidsendc = (struct ips_tid_send_desc *)
+		psmi_mpool_find_obj_by_index(protoexp->tid_desc_send_pool,
+					sdesc_id._desc_idx);
+	_HFI_VDBG("desc_id=%d (%p)\n", sdesc_id._desc_idx, tidsendc);
+	if (tidsendc == NULL) {
+		_HFI_ERROR("err_chk_rdma_resp: Index %d is out of range\n",
+					sdesc_id._desc_idx);
+		goto do_acks;
+	} else {
+		ptl_arg_t desc_tidsendc;
+
+		psmi_mpool_get_obj_index_gen_count(tidsendc,
+						 &desc_tidsendc._desc_idx, &desc_tidsendc._desc_genc);
+
+		_HFI_VDBG("sdesc_req:id=%d,gen=%d desc_sendc:id=%d,gen=%d\n",
+				sdesc_id._desc_idx, sdesc_id._desc_genc,
+				desc_tidsendc._desc_idx, desc_tidsendc._desc_genc);
+
+		/* See if the reference is still live and valid */
+		if (desc_tidsendc.u64 != sdesc_id.u64) {
+			_HFI_ERROR("err_chk_rdma_resp: Genc %d does not match\n",
+				sdesc_id._desc_genc);
+			goto do_acks;
+		}
+	}
+
+	ipsaddr->rv_err_chk_rdma_outstanding = 0;
+	tidsendc->rv_need_err_chk_rdma = 0;
+	if (need_resend)
+		ips_tid_reissue_rdma_write(tidsendc);
+	else
+		ips_protoexp_tidsendc_complete(tidsendc);
+
+do_acks:
+	if (__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ)
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq,
+					&ipsaddr->flows[ips_proto_flowid(p_hdr)]);
+
+	ips_proto_process_ack(rcv_ev);
+done:
+	PSM2_LOG_MSG("leaving");
+	return IPS_RECVHDRQ_CONTINUE;
+}
+#endif // RNDV_MOD
+
+// Intermediate STL100 EXTID packets can be delivered to software when
+// acks are requested.
+// The final packet in a STL100 EXTID flow is also delivered to software
+// to indicate the completion of the flow and can contain unaligned data.
+// for RDMA Write we will simply use immediate data in the write
+// to indicate the completed receive of the RDMA Write
+// if we use RDMA Read, the local SQ Completion will indicate this
+// could build and pass an ips_recvhdrq_event or pass struct ips_recvhdrq
+// but all we really need is proto and len
+// conn indicates where we received RDMA Write, just for quick sanity check
+// 	for RV module conn will be the psm2_rv_conn_t
+// 	for user RC QPs conn will be the RC struct ibv_qp*
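+// Illustration only: the RDMA_PACK_IMMED/RDMA_UNPACK_IMMED_* macros behave
+// as if the 32-bit immediate were built from bitfields, roughly like the
+// hypothetical sketch below (field widths assumed; the authoritative layout
+// lives with the macro definitions):
+//
+//	static inline uint32_t example_pack_immed(uint32_t genc, uint32_t idx,
+//						uint32_t rv_index)
+//	{
+//		/* hypothetical widths, for illustration only */
+//		return (rv_index << 24) | ((genc & 0xff) << 16) | (idx & 0xffff);
+//	}
+//
+// RDMA_IMMED_DESC_MASK then selects just the genc+idx fields so a CQ scan
+// (see ips_protoexp_process_err_chk_rdma) can match a descriptor while
+// ignoring the rv_index.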
+int ips_protoexp_handle_immed_data(struct ips_proto *proto, uint64_t conn_ref,
+									int conn_type, uint32_t immed, uint32_t len)
+{
+	struct ips_tid_recv_desc *tidrecvc;
+	struct ips_protoexp *protoexp = proto->protoexp;
+	ptl_arg_t desc_id;
+	_HFI_MMDBG("ips_protoexp_immed_data\n");
+	PSM2_LOG_MSG("entering");
+	desc_id._desc_genc = RDMA_UNPACK_IMMED_GENC(immed);
+	desc_id._desc_idx = RDMA_UNPACK_IMMED_IDX(immed);
+
+	tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx];
+
+	if ((tidrecvc->rdescid._desc_genc & IPS_HDR_RDESCID_GENC_MASK)
+		!= desc_id._desc_genc) {
+		_HFI_ERROR("stale inbound rv RDMA generation: expected %u got %u\n",
+				tidrecvc->rdescid._desc_genc, desc_id._desc_genc);
+		tidrecvc->stats.nGenErr++;
+		PSM2_LOG_MSG("leaving");
+		return IPS_RECVHDRQ_CONTINUE;		/* skip */
+	}
+
+	// maybe this should be an assert so we don't add a test in production code
+	if (tidrecvc->state != TIDRECVC_STATE_BUSY) {
+		_HFI_ERROR("stale inbound rv RDMA (tidrecvc not busy)\n");
+		PSM2_LOG_MSG("leaving");
+		return IPS_RECVHDRQ_CONTINUE;		/* skip */
+	}
+	// some sanity checks
+	// maybe this should be an assert so we don't add a test in production code
+	if (len != tidrecvc->recv_msglen) {
+		// RDMA Write does not match what we asked for in CTS
+		_HFI_ERROR("incorrect RDMA Write Len: expected %u got %u\n",
+				tidrecvc->recv_msglen, len);
+		// TBD - what to do?
+	}
+	psmi_assert(IPS_PROTOEXP_FLAG_ENABLED & tidrecvc->protoexp->proto->ep->rdmamode);
+#ifdef RNDV_MOD
+	if (conn_type == RDMA_IMMED_RV
+		&& RDMA_UNPACK_IMMED_RV_IDX(immed) != proto->ep->verbs_ep.rv_index) {
+		// RV module should not have delivered this CQE to us
+		_HFI_ERROR("incorrect RDMA RV Index: expected %u got %u\n",
+				proto->ep->verbs_ep.rv_index, RDMA_UNPACK_IMMED_RV_IDX(immed));
+		return IPS_RECVHDRQ_CONTINUE;		/* skip */
+	}
+#endif
+	// For User RC conn_ref is context we set in rc_qp_create (*ipsaddr)
+	// For Kernel RC, conn_ref is the conn handle (psm2_rv_conn_get_conn_handle)
+	// maybe this should be an assert so we don't add a test in production code
+	if ((conn_type == RDMA_IMMED_USER_RC)
+			&& (uint64_t)tidrecvc->ipsaddr != conn_ref) {
+		// RDMA Write is not on expected RC QP from remote node
+		_HFI_ERROR("RDMA Write on Wrong User QP 0x%"PRIx64", expect 0x%"PRIx64"\n",
+				 	conn_ref, (uint64_t)tidrecvc->ipsaddr);
+		// TBD - what to do?
+	}
+#ifdef RNDV_MOD
+	if (conn_type == RDMA_IMMED_RV
+		&& psm2_rv_conn_get_conn_handle(tidrecvc->ipsaddr->rv_conn)
+					 != conn_ref) {
+		// RDMA Write is not on expected RV QP from remote node
+		_HFI_ERROR("RDMA Write on Wrong RV QP 0x%"PRIx64", expect 0x%"PRIx64"\n",
+				 	conn_ref,
+		 			psm2_rv_conn_get_conn_handle(tidrecvc->ipsaddr->rv_conn));
+		// TBD - what to do?
+	}
+#endif
+	if (_HFI_PDBG_ON) {
+#ifdef PSM_CUDA
+		if (tidrecvc->is_ptr_gpu_backed)
+			_HFI_PDBG_DUMP_GPU(tidrecvc->buffer, len);
+		else
+#endif
+			_HFI_PDBG_DUMP(tidrecvc->buffer, len);
+	}
+
+	/* Reset the swapped generation count as we received a valid packet */
+	tidrecvc->tidflow_nswap_gen = 0;
+
+	/* Do some sanity checking */
+	psmi_assert_always(tidrecvc->state == TIDRECVC_STATE_BUSY);
+	// STL100 does this at the end of ips_protoexp_send_tid_completion
+	// TBD - seems like this should be done after ips_tid_recv_free
+	// so we have more likelihood of getting freshly freed resources?
+	if (tidrecvc->protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) {
+		/* Let the next CTS be processed */
+		tidrecvc->ipsaddr->flows[protoexp->proto->msgflowid].flags &=
+			~IPS_FLOW_FLAG_SKIP_CTS;
+		/* and make explicit progress for it */
+		ips_tid_pendtids_timer_callback(&tidrecvc->protoexp->timer_getreqs, 0);
+	}
+
+	/* Mark receive as done */
+	ips_tid_recv_free(tidrecvc);
+	_HFI_MMDBG("tidrecv done\n");
+	PSM2_LOG_MSG("leaving");
+
+	return IPS_RECVHDRQ_CONTINUE;
+}
+
+
+
+#ifdef PSM_CUDA
+static
+psm2_error_t
+psmi_cuda_reclaim_hostbufs(struct ips_tid_get_request *getreq)
+{
+	struct ips_protoexp *protoexp = getreq->tidgr_protoexp;
+	struct ips_tid_getreq_cuda_hostbuf_pend *cmemcpyhead =
+		&getreq->pend_cudabuf;
+	struct ips_cuda_hostbuf *chb;
+	CUresult status;
+
+	/* Get the getreq's first memcpy op */
+	while (!STAILQ_EMPTY(cmemcpyhead)) {
+		chb = STAILQ_FIRST(cmemcpyhead);
+		PSMI_CUDA_CHECK_EVENT(chb->copy_status, status);
+		if (status != CUDA_SUCCESS) {
+			/* At least one of the copies is still
+			 * in progress. Schedule the timer,
+			 * then leave the CUDA progress phase
+			 * and check for other pending TID work.
+			 */
+			psmi_timer_request(protoexp->timerq,
+					   &protoexp->timer_getreqs,
+					   PSMI_TIMER_PRIO_1);
+			return PSM2_OK_NO_PROGRESS;
+		}
+		/* The getreq's oldest cudabuf is done. Reclaim it. */
+		getreq->tidgr_cuda_bytesdone += chb->size;
+		STAILQ_REMOVE_HEAD(cmemcpyhead, next);
+		psmi_mpool_put(chb);
+	}
+	return PSM2_OK;
+}
+
+static
+struct ips_cuda_hostbuf* psmi_allocate_chb(uint32_t window_len)
+{
+	struct ips_cuda_hostbuf* chb = (struct ips_cuda_hostbuf*)
+						psmi_calloc(PSMI_EP_NONE,
+							    UNDEFINED, 1,
+							    sizeof(struct ips_cuda_hostbuf));
+	if (chb == NULL) {
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY,
+						"Couldn't allocate cuda host buffers ");
+	}
+	PSMI_CUDA_CALL(cuMemHostAlloc,
+			       (void **) &chb->host_buf,
+			       window_len,
+			       CU_MEMHOSTALLOC_PORTABLE);
+	PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT);
+	return chb;
+}
+
+static
+void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp,
+			      struct ips_tid_send_desc *tidsendc)
+{
+	struct ips_proto *proto = protoexp->proto;
+	struct ips_cuda_hostbuf *chb = NULL;
+	psm2_mq_req_t req = tidsendc->mqreq;
+	uint32_t offset, window_len;
+
+	/* try to push the prefetcher forward */
+	if (req->prefetch_send_msgoff < req->req_data.send_msglen) {
+		/* some data remains to be sent */
+		offset = req->prefetch_send_msgoff;
+		window_len =
+			ips_cuda_next_window(tidsendc->ipsaddr->window_rv,
+					     offset, req->req_data.buf_len);
+		unsigned bufsz;
+		if (window_len <= CUDA_SMALLHOSTBUF_SZ) {
+			chb = (struct ips_cuda_hostbuf *) psmi_mpool_get(
+				proto->cuda_hostbuf_pool_small_send);
+			bufsz = proto->cuda_hostbuf_small_send_cfg.bufsz;
+		}
+		if (chb == NULL) {
+			chb = (struct ips_cuda_hostbuf *) psmi_mpool_get(
+				proto->cuda_hostbuf_pool_send);
+			bufsz = proto->cuda_hostbuf_send_cfg.bufsz;
+		}
+		/* were any buffers available for the prefetcher? */
+		if (chb == NULL)
+			return;
+		req->prefetch_send_msgoff += window_len;
+		chb->offset = offset;
+		chb->size = window_len;
+		chb->req = req;
+		chb->gpu_buf = (CUdeviceptr) req->req_data.buf + offset;
+		chb->bytes_read = 0;
+
+		if (proto->cudastream_send == NULL) {
+			PSMI_CUDA_CALL(cuStreamCreate,
+				   &proto->cudastream_send, CU_STREAM_NON_BLOCKING);
+		}
+		if (chb->host_buf == NULL) {
+			PSMI_CUDA_CALL(cuMemHostAlloc,
+				       (void **) &chb->host_buf,
+				       bufsz,
+				       CU_MEMHOSTALLOC_PORTABLE);
+		}
+		if (chb->copy_status == NULL) {
+			PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT);
+		}
+		PSMI_CUDA_CALL(cuMemcpyDtoHAsync,
+			       chb->host_buf, chb->gpu_buf,
+			       window_len,
+			       proto->cudastream_send);
+		PSMI_CUDA_CALL(cuEventRecord, chb->copy_status,
+			       proto->cudastream_send);
+
+		STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next);
+		return;
+	}
+	return;
+}
+
+static
+void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp,
+				 psm2_mq_req_t req,
+				 struct ips_tid_send_desc *tidsendc,
+				 struct ips_cuda_hostbuf *chb_prev,
+				 uint32_t tsess_srcoff,
+				 uint32_t tsess_length,
+				 uint32_t tsess_unaligned_start,
+				 psm2_chb_match_type_t type)
+{
+	struct ips_proto *proto = protoexp->proto;
+	struct ips_cuda_hostbuf *chb = NULL;
+	uint32_t offset, window_len, attached=0;
+
+	/* try to push the prefetcher forward */
+	while (req->prefetch_send_msgoff < tsess_srcoff + tsess_length) {
+		/* some data remains to be sent */
+		offset = req->prefetch_send_msgoff;
+		window_len =
+			ips_cuda_next_window(tidsendc->ipsaddr->window_rv,
+					     offset, req->req_data.buf_len);
+		unsigned bufsz;
+		if (window_len <= CUDA_SMALLHOSTBUF_SZ) {
+			chb = (struct ips_cuda_hostbuf *) psmi_mpool_get(
+				proto->cuda_hostbuf_pool_small_send);
+			bufsz = proto->cuda_hostbuf_small_send_cfg.bufsz;
+		}
+		if (chb == NULL) {
+			chb = (struct ips_cuda_hostbuf *) psmi_mpool_get(
+				proto->cuda_hostbuf_pool_send);
+			bufsz = proto->cuda_hostbuf_send_cfg.bufsz;
+		}
+
+		/* were any buffers available? If not force allocate */
+		if (chb == NULL) {
+			chb = psmi_allocate_chb(window_len);
+			psmi_assert(chb);
+			chb->is_tempbuf = 1;
+		}
+		req->prefetch_send_msgoff += window_len;
+		chb->offset = offset;
+		chb->size = window_len;
+		chb->req = req;
+		chb->gpu_buf = (CUdeviceptr) req->req_data.buf + offset;
+		chb->bytes_read = 0;
+		if (proto->cudastream_send == NULL) {
+			PSMI_CUDA_CALL(cuStreamCreate,
+				   &proto->cudastream_send, CU_STREAM_NON_BLOCKING);
+		}
+		if (chb->host_buf == NULL) {
+			PSMI_CUDA_CALL(cuMemHostAlloc,
+				       (void **) &chb->host_buf,
+				       bufsz,
+				       CU_MEMHOSTALLOC_PORTABLE);
+		}
+		if (chb->copy_status == NULL) {
+			PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT);
+		}
+		PSMI_CUDA_CALL(cuMemcpyDtoHAsync,
+			       chb->host_buf, chb->gpu_buf,
+			       window_len,
+			       proto->cudastream_send);
+		PSMI_CUDA_CALL(cuEventRecord, chb->copy_status,
+			       proto->cudastream_send);
+
+		STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next);
+		if (type == PSMI_CUDA_PARTIAL_MATCH_FOUND) {
+			if ((tsess_srcoff < chb->offset)
+			     && ((tsess_srcoff + tsess_length) > chb->offset)) {
+				tidsendc->cuda_hostbuf[0] = chb_prev;
+				tidsendc->cuda_hostbuf[1] = chb;
+				tidsendc->cuda_num_buf = 2;
+				void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED,
+						tsess_length);
+				tidsendc->userbuf =
+					(void *)((uintptr_t) buffer);
+				tidsendc->buffer =
+					(void *)((uintptr_t)tidsendc->userbuf +
+						tsess_unaligned_start);
+				return;
+			}
+		} else {
+			if (attached) {
+				tidsendc->cuda_hostbuf[0] = chb_prev;
+				tidsendc->cuda_hostbuf[1] = chb;
+				tidsendc->cuda_num_buf = 2;
+				void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED,
+						tsess_length);
+				tidsendc->userbuf =
+					(void *)((uintptr_t) buffer);
+				tidsendc->buffer =
+					(void *)((uintptr_t)tidsendc->userbuf +
+						tsess_unaligned_start);
+				attached = 0;
+				return;
+			}
+			if ((tsess_srcoff > chb->offset)
+			    && (tsess_srcoff < (chb->offset + chb->size))
+			     && ((tsess_srcoff + tsess_length) > (chb->offset + chb->size))) {
+				chb_prev = chb;
+				attached = 1;
+				chb = NULL;
+				continue;
+			} else if ((chb->offset <= tsess_srcoff) &&
+				  ((tsess_srcoff + tsess_length) <=
+				   (chb->offset+chb->size))) {
+				tidsendc->cuda_hostbuf[0] = chb;
+				tidsendc->cuda_hostbuf[1] = NULL;
+				tidsendc->cuda_num_buf = 1;
+				tidsendc->userbuf =
+					(void *)((uintptr_t) chb->host_buf +
+						tsess_srcoff - chb->offset);
+				tidsendc->buffer =
+					(void *)((uintptr_t)tidsendc->userbuf +
+							tsess_unaligned_start );
+				return;
+			} else
+				chb = NULL;
+		}
+	}
+}
+
+static
+psm2_chb_match_type_t psmi_find_match_in_prefeteched_chb(struct ips_cuda_hostbuf* chb,
+				       ips_tid_session_list *tid_list,
+				       uint32_t prefetch_send_msgoff)
+{
+	/* To get a match:
+	 * 1. Tid list offset + length is contained within a chb
+	 * 2. Tid list offset + length is contained within
+	 * the prefetched offset of this req.
+	 * 3. Tid list offset + length is partially prefetched
+	 * within one chb. (A partial match)
+	 */
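+	/* Worked example (sizes assumed for illustration): with chb->offset = 0,
+	 * chb->size = 128K and prefetch_send_msgoff = 256K:
+	 *  - srcoff 16K, len 64K ends at 80K inside this chb -> FULL match
+	 *  - srcoff 96K, len 64K ends at 160K in the next chb, but is already
+	 *    fully prefetched -> SPLIT match
+	 *  - srcoff 96K, len 192K ends at 288K, beyond what has been
+	 *    prefetched so far -> PARTIAL match
+	 */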
+	if (chb->offset <= tid_list->tsess_srcoff) {
+		if ((chb->offset + chb->size) >=
+		    (tid_list->tsess_srcoff + tid_list->tsess_length)) {
+			return PSMI_CUDA_FULL_MATCH_FOUND;
+		} else {
+			if ((chb->offset + chb->size) > tid_list->tsess_srcoff) {
+				if (((chb->offset + (2 * chb->size)) >
+				     (tid_list->tsess_srcoff + tid_list->tsess_length)) &&
+				    (prefetch_send_msgoff >=
+				     (tid_list->tsess_srcoff + tid_list->tsess_length))) {
+					return PSMI_CUDA_SPLIT_MATCH_FOUND;
+				} else if ((tid_list->tsess_srcoff + tid_list->tsess_length)
+						> prefetch_send_msgoff) {
+					return PSMI_CUDA_PARTIAL_MATCH_FOUND;
+				}
+			}
+		}
+	}
+	return PSMI_CUDA_CONTINUE;
+}
+#endif
+
+// sender handling of a CTS which indicates use of TID protocol
+psm2_error_t
+ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp,
+			   ips_epaddr_t *ipsaddr,
+			   psm2_mq_req_t req,
+			   ptl_arg_t rdescid,
+			   uint32_t tidflow_genseq,
+			   ips_tid_session_list *tid_list,
+			   uint32_t tid_list_size)
+{
+	struct ips_tid_send_desc *tidsendc;
+	_HFI_MMDBG("ips_tid_send_handle_tidreq\n");
+
+	PSM2_LOG_MSG("entering");
+	psmi_assert(tid_list_size == sizeof(ips_tid_session_list));
+
+	tidsendc = (struct ips_tid_send_desc *)
+	    psmi_mpool_get(protoexp->tid_desc_send_pool);
+	if (tidsendc == NULL) {
+		PSM2_LOG_MSG("leaving");
+		ips_logevent(protoexp->proto, tid_send_reqs, ipsaddr);
+		return PSM2_EP_NO_RESOURCES;
+	}
+
+	req->ptl_req_ptr = (void *)tidsendc;
+	tidsendc->protoexp = protoexp;
+
+	/* Uniquely identify this send descriptor in space and time */
+	tidsendc->sdescid._desc_idx = psmi_mpool_get_obj_index(tidsendc);
+	tidsendc->sdescid._desc_genc = psmi_mpool_get_obj_gen_count(tidsendc);
+	tidsendc->rdescid = rdescid;
+	tidsendc->ipsaddr = ipsaddr;
+	tidsendc->mqreq = req;
+
+	/* Copy received tidinfo to local tidsendc buffer. */
+	psmi_mq_mtucpy_host_mem(&tidsendc->tid_list, tid_list,
+				sizeof(ips_tid_session_list));
+	// for UD we do not need a ips_flow since we will use the RC QP and
+	// then will use our main flow for the final RV completion control msg
+	// The path record for use by RDMA will be selected when the connection
+	// is established
+	tidsendc->mr = NULL;	// be safe, but should be NULL since cleared on release
+	_HFI_VDBG("recv'd CTS: rkey 0x%x srcoff %u raddr 0x%"PRIx64" len %u\n",
+		tid_list->tsess_rkey, tid_list->tsess_srcoff, tid_list->tsess_raddr,
+		tid_list->tsess_length);
+
+	tidsendc->userbuf =
+	    (void *)((uintptr_t) req->req_data.buf + tid_list->tsess_srcoff);
+	tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf);
+	tidsendc->length = tid_list->tsess_length;
+	_HFI_MMDBG("tidsendc created userbuf %p buffer %p length %u\n",
+			tidsendc->userbuf,  tidsendc->buffer, tidsendc->length);
+
+#ifdef PSM_CUDA
+	/* Matching on previous prefetches and initiating next prefetch */
+	struct ips_cuda_hostbuf *chb = NULL, *chb_next = NULL;
+	psm2_chb_match_type_t rc = PSMI_CUDA_CONTINUE;
+
+	/* check if the prefetcher has a buffer ready to use */
+	tidsendc->cuda_hostbuf[0] = NULL;
+	tidsendc->cuda_hostbuf[1] = NULL;
+	tidsendc->cuda_num_buf = 0;
+	if (req->cuda_hostbuf_used) {
+		/* To get a match:
+		 * 1. Tid list offset + length is contained within a chb
+		 * 2. Tid list offset + length is contained within
+		 * the prefetched offset of this req.
+		 * 3. Tid list offset + length is partially prefetched
+		 * within one chb. (A partial match)
+		 */
+		STAILQ_FOREACH(chb, &req->sendreq_prefetch, req_next) {
+			rc = psmi_find_match_in_prefeteched_chb(chb,
+								tid_list,
+								req->prefetch_send_msgoff);
+			if (rc < PSMI_CUDA_CONTINUE)
+				break;
+		}
+		if (rc == PSMI_CUDA_FULL_MATCH_FOUND) {
+			tidsendc->userbuf =
+				(void *)((uintptr_t) chb->host_buf+
+					 tid_list->tsess_srcoff - chb->offset);
+			tidsendc->buffer =
+				(void *)((uintptr_t)tidsendc->userbuf);
+			/* now associate the buffer with the tidsendc */
+			tidsendc->cuda_hostbuf[0] = chb;
+			tidsendc->cuda_hostbuf[1] = NULL;
+			tidsendc->cuda_num_buf = 1;
+		} else if (rc == PSMI_CUDA_SPLIT_MATCH_FOUND){
+			void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED,
+					tid_list->tsess_length);
+			tidsendc->userbuf =
+				(void *)((uintptr_t) buffer);
+			tidsendc->buffer =
+				(void *)((uintptr_t)tidsendc->userbuf);
+			chb_next = STAILQ_NEXT(chb, req_next);
+			tidsendc->cuda_hostbuf[0] = chb;
+			tidsendc->cuda_hostbuf[1] = chb_next;
+			tidsendc->cuda_num_buf = 2;
+		} else if (rc == PSMI_CUDA_PARTIAL_MATCH_FOUND) {
+			psmi_attach_chb_to_tidsendc(protoexp, req,
+						    tidsendc,
+						    chb,
+						    tid_list->tsess_srcoff,
+						    tid_list->tsess_length,
+							0,
+						    rc);
+		} else {
+			psmi_attach_chb_to_tidsendc(protoexp, req,
+						    tidsendc,
+						    NULL,
+						    tid_list->tsess_srcoff,
+						    tid_list->tsess_length,
+							0,
+						    PSMI_CUDA_CONTINUE);
+		}
+		protoexp->proto->strat_stats.rndv_rdma_hbuf_send++;
+		protoexp->proto->strat_stats.rndv_rdma_hbuf_send_bytes += tid_list->tsess_length;
+	} else if (req->is_buf_gpu_mem) {
+		protoexp->proto->strat_stats.rndv_rdma_gdr_send++;
+		protoexp->proto->strat_stats.rndv_rdma_gdr_send_bytes += tid_list->tsess_length;
+	} else
+#endif // PSM_CUDA
+	{
+		protoexp->proto->strat_stats.rndv_rdma_cpu_send++;
+		protoexp->proto->strat_stats.rndv_rdma_cpu_send_bytes += tid_list->tsess_length;
+	}
+
+	tidsendc->is_complete = 0;
+	tidsendc->reserved = 0;
+#ifdef RNDV_MOD
+	tidsendc->rv_need_err_chk_rdma = 0;
+	tidsendc->rv_sconn_index = 0;
+	tidsendc->rv_conn_count = 0;
+#endif
+
+
+	_HFI_EXP
+	    ("alloc tidsend=%4d tidrecv=%4d srcoff=%6d length=%6d"
+		"\n",
+	     tidsendc->sdescid._desc_idx, rdescid._desc_idx,
+	     tid_list->tsess_srcoff, tid_list->tsess_length
+		);
+
+	// start sending TIDEXP packets
+	ips_tid_send_exp(tidsendc);
+
+	/* Add as a pending op and ring up the timer */
+	if (tidsendc->is_complete == 0) {
+		STAILQ_INSERT_TAIL(&protoexp->pend_sendq, tidsendc, next);
+		psmi_timer_request(protoexp->timerq, &protoexp->timer_send,
+			   PSMI_TIMER_PRIO_1);
+	}
+
+	PSM2_LOG_MSG("leaving");
+	/* Consider breaking out of progress engine here */
+	return PSM2_OK;
+}
+
+
+/*
+ * Returns:
+ *
+ * PSM2_OK: scb was allocated for at least one frame, the packet may be queued
+ *         or actually sent.
+ *
+ * PSM2_OK_NO_PROGRESS: Reached a limit on the maximum number of sends we allow
+ *		       to be enqueued before polling receive queue.
+ *
+ * PSM2_EP_NO_RESOURCES: No scbs available; a callback will be issued when more
+ *                      scbs become available.
+ *
+ * PSM2_TIMEOUT: PIO-busy or DMA-busy, stop trying to send for now.
+ *
+ */
+
+// issue RDMA Write in response to a CTS
+psm2_error_t ips_tid_issue_rdma_write(struct ips_tid_send_desc *tidsendc)
+{
+	struct ips_protoexp *protoexp = tidsendc->protoexp;
+	struct ips_proto *proto = protoexp->proto;
+	psm2_error_t err = PSM2_OK;
+
+	// for STL100 native we would loop on ips_scb_prepare_tid_sendctrl and
+	// ips_proto_flow_enqueue to prepare EXPTID scbs for the TIDFLOW protocol
+	// and queue and issue them.  Once they were all posted the is_complete
+	// flag would be set.  For larger messages, it might take multiple
+	// attempts to get resources to queue everything in which case callbacks
+	// and timers ensure progress
+	// For verbs we are delegating the RC Write "flow" to the NIC's RC QP
+	// it will manage segmentation, sequence numbers and acks for the flow
+	// so our job is done here after one call.
+	// we use immediate data with the rdescid to trigger a CQE on receiver
+	// so it knows when RDMA is done
+	// if too many RDMA in flight retry later when next RDMA completes
+	if (protoexp->proto->ep->verbs_ep.send_rdma_outstanding
+				 >= protoexp->proto->ep->hfi_num_send_rdma) {
+		err = PSM2_EP_NO_RESOURCES; // try again on next RDMA completion
+	} else if (tidsendc->mr) {
+		// registered or referenced in previous failed ips_tid_send_exp attempt
+		// no need to register again
+		err = PSM2_OK;
+	} else if (
+#ifdef PSM_CUDA
+		! tidsendc->mqreq->cuda_hostbuf_used &&
+#endif
+			// separate MR cache's per EP, so this confirms we have the same EP
+		tidsendc->mqreq->mr && tidsendc->mqreq->mr->cache == proto->mr_cache) {
+		// we can use the same MR as the whole mqreq
+		_HFI_MMDBG("CTS send chunk reference send: %p %u bytes via %p %"PRIu64"\n",
+			tidsendc->buffer, tidsendc->length, tidsendc->mqreq->mr->addr, tidsendc->mqreq->mr->length);
+		tidsendc->mr = psm2_verbs_ref_mr(tidsendc->mqreq->mr);
+	} else {
+		// we need an MR for this chunk
+		_HFI_MMDBG("CTS send chunk register send: %p %u bytes\n", tidsendc->buffer , tidsendc->length);
+		tidsendc->mr = psm2_verbs_reg_mr(proto->mr_cache, 1,
+                         proto->ep->verbs_ep.pd,
+                         tidsendc->buffer, tidsendc->length, IBV_ACCESS_RDMA
+#ifdef PSM_CUDA
+						| ((tidsendc->mqreq->is_buf_gpu_mem
+								 && !tidsendc->mqreq->cuda_hostbuf_used)
+							?IBV_ACCESS_IS_GPU_ADDR:0)
+#endif
+						);
+		if (! tidsendc->mr)
+			err = PSM2_TIMEOUT;	/* force a resend reschedule */
+	}
+
+	// if post_send fails below, we'll try again later
+	// completion handler decides how to handle any WQE/CQE errors
+	_HFI_MMDBG("tidsendc prior to post userbuf %p buffer %p length %u err %d outstanding %u\n",
+			tidsendc->userbuf,  tidsendc->buffer, tidsendc->length,
+			err, protoexp->proto->ep->verbs_ep.send_rdma_outstanding);
+#ifdef RNDV_MOD
+	if (err == PSM2_OK) {
+		psmi_assert(IPS_PROTOEXP_FLAG_ENABLED & protoexp->proto->ep->rdmamode);
+
+		if (IPS_PROTOEXP_FLAG_KERNEL_QP(protoexp->proto->ep->rdmamode))
+			err = psm2_verbs_post_rv_rdma_write_immed(
+				protoexp->proto->ep,
+				tidsendc->ipsaddr->rv_conn,
+				tidsendc->buffer, tidsendc->mr,
+				tidsendc->tid_list.tsess_raddr, tidsendc->tid_list.tsess_rkey,
+				tidsendc->tid_list.tsess_length,
+				RDMA_PACK_IMMED(tidsendc->rdescid._desc_genc,
+							 tidsendc->rdescid._desc_idx,
+							 tidsendc->ipsaddr->remote_rv_index),
+				(uintptr_t)tidsendc,
+				&tidsendc->rv_sconn_index, &tidsendc->rv_conn_count);
+		else if (IPS_PROTOEXP_FLAG_USER_RC_QP(protoexp->proto->ep->rdmamode))
+			err = psm2_verbs_post_rdma_write_immed(
+				protoexp->proto->ep,
+				tidsendc->ipsaddr->rc_qp,
+				tidsendc->buffer, tidsendc->mr,
+				tidsendc->tid_list.tsess_raddr, tidsendc->tid_list.tsess_rkey,
+				tidsendc->tid_list.tsess_length,
+				RDMA_PACK_IMMED(tidsendc->rdescid._desc_genc,
+							 tidsendc->rdescid._desc_idx, 0),
+				(uintptr_t)tidsendc);
+	}
+	if (err == PSM2_OK) {
+		if (_HFI_PDBG_ON) {
+#ifdef PSM_CUDA
+			if (tidsendc->mqreq->is_buf_gpu_mem && !tidsendc->mqreq->cuda_hostbuf_used)
+				_HFI_PDBG_DUMP_GPU(tidsendc->buffer, tidsendc->tid_list.tsess_length);
+			else
+#endif
+				_HFI_PDBG_DUMP(tidsendc->buffer, tidsendc->tid_list.tsess_length);
+		}
+		tidsendc->is_complete = 1;	// send queued
+	} else
+		_HFI_MMDBG("after posted IBV Write: err %d\n", err);
+
+#else // RNDV_MOD
+	if (err == PSM2_OK) {
+		psmi_assert(IPS_PROTOEXP_FLAG_ENABLED & protoexp->proto->ep->rdmamode);
+		if (IPS_PROTOEXP_FLAG_USER_RC_QP(protoexp->proto->ep->rdmamode))
+			err = psm2_verbs_post_rdma_write_immed(
+				protoexp->proto->ep,
+				tidsendc->ipsaddr->rc_qp,
+				tidsendc->buffer, tidsendc->mr,
+				tidsendc->tid_list.tsess_raddr, tidsendc->tid_list.tsess_rkey,
+				tidsendc->tid_list.tsess_length,
+				RDMA_PACK_IMMED(tidsendc->rdescid._desc_genc,
+							 tidsendc->rdescid._desc_idx, 0),
+				(uintptr_t)tidsendc);
+	}
+	if (err == PSM2_OK) {
+		if (_HFI_PDBG_ON) {
+#ifdef PSM_CUDA
+			if (tidsendc->mqreq->is_buf_gpu_mem && !tidsendc->mqreq->cuda_hostbuf_used)
+				_HFI_PDBG_DUMP_GPU(tidsendc->buffer, tidsendc->tid_list.tsess_length);
+			else
+#endif
+				_HFI_PDBG_DUMP(tidsendc->buffer, tidsendc->tid_list.tsess_length);
+		}
+		tidsendc->is_complete = 1;	// send queued
+	} else
+		_HFI_MMDBG("after posted IBV Write 2: err %d\n", err);
+#endif // RNDV_MOD
+	return err;
+}
+
+/*
+ * Returns:
+ *
+ * PSM2_OK: scb was allocated for at least one frame, the packet may be queued
+ *         or actually sent.
+ *
+ * PSM2_OK_NO_PROGRESS: Reached a limit on the maximum number of sends we allow
+ *		       to be enqueued before polling receive queue.
+ *
+ * PSM2_EP_NO_RESOURCES: No scbs available; a callback will be issued when more
+ *                      scbs become available.
+ *
+ * PSM2_TIMEOUT: PIO-busy or DMA-busy, stop trying to send for now.
+ *
+ */
+
+// we got a CTS and processed it.  Now we can start sending EXPTID packets.
+// For UD we will use RDMA instead of EXPTID
+static
+psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc)
+{
+	psm2_error_t err = PSM2_OK;
+#if   defined(PSM_CUDA)
+	struct ips_protoexp *protoexp = tidsendc->protoexp;
+#endif
+
+	_HFI_MMDBG("ips_tid_send_exp\n");
+#ifdef PSM_CUDA
+	struct ips_cuda_hostbuf *chb, *chb_next;
+	CUresult chb_status;
+	uint32_t offset_in_chb, i;
+	for (i = 0; i < tidsendc->cuda_num_buf; i++) {
+		chb = tidsendc->cuda_hostbuf[i];
+		if (chb) {
+			PSMI_CUDA_CHECK_EVENT(chb->copy_status, chb_status);
+			if (chb_status != CUDA_SUCCESS) {
+				err = PSM2_OK_NO_PROGRESS;
+				PSM2_LOG_MSG("leaving");
+				return err;
+			}
+		}
+	}
+
+	if (tidsendc->cuda_num_buf == 2) {
+		chb = tidsendc->cuda_hostbuf[0];
+		chb_next = tidsendc->cuda_hostbuf[1];
+		offset_in_chb = tidsendc->tid_list.tsess_srcoff - chb->offset;
+		/* Copying data from multiple cuda
+		 * host buffers into a bounce buffer.
+		 */
+		memcpy(tidsendc->buffer, chb->host_buf +
+			offset_in_chb, chb->size-offset_in_chb);
+		memcpy(tidsendc->buffer + chb->size -
+			offset_in_chb, chb_next->host_buf,
+			tidsendc->tid_list.tsess_srcoff +
+			tidsendc->tid_list.tsess_length - chb_next->offset);
+
+		chb->bytes_read += chb->size - offset_in_chb;
+		chb_next->bytes_read += tidsendc->tid_list.tsess_srcoff +
+				  tidsendc->tid_list.tsess_length -
+				  chb_next->offset;
+		if (chb->bytes_read == chb->size) {
+			STAILQ_REMOVE(&tidsendc->mqreq->sendreq_prefetch, chb,
+				       ips_cuda_hostbuf, req_next);
+			if (chb->is_tempbuf)
+				psmi_deallocate_chb(chb);
+			else {
+				chb->req = NULL;
+				chb->offset = 0;
+				chb->bytes_read = 0;
+				psmi_mpool_put(chb);
+			}
+			psmi_cuda_run_prefetcher(protoexp, tidsendc);
+		}
+		if (chb_next->bytes_read == chb_next->size) {
+			STAILQ_REMOVE(&tidsendc->mqreq->sendreq_prefetch, chb_next,
+				       ips_cuda_hostbuf, req_next);
+			if (chb_next->is_tempbuf)
+				psmi_deallocate_chb(chb_next);
+			else {
+				chb_next->req = NULL;
+				chb_next->offset = 0;
+				chb_next->bytes_read = 0;
+				psmi_mpool_put(chb_next);
+			}
+			psmi_cuda_run_prefetcher(protoexp, tidsendc);
+		}
+		/* Clean Up tidsendc ref's to split cuda hostbufs when no longer needed */
+		tidsendc->cuda_num_buf = 0;
+		tidsendc->cuda_hostbuf[0] = NULL;
+		tidsendc->cuda_hostbuf[1] = NULL;
+	}
+#endif
+	err = ips_tid_issue_rdma_write(tidsendc);
+
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+
+#ifdef RNDV_MOD
+// Used when err chk rdma resp indicates we must resend the rdma
+static
+void ips_tid_reissue_rdma_write(struct ips_tid_send_desc *tidsendc)
+{
+	struct ips_protoexp *protoexp = tidsendc->protoexp;
+
+	_HFI_MMDBG("ips_tid_reissue_rdma_write\n");
+
+	PSM2_LOG_MSG("entering");
+	protoexp->proto->epaddr_stats.rdma_rexmit++;
+	tidsendc->is_complete = 0;	// sends not yet queued
+
+	ips_tid_issue_rdma_write(tidsendc);
+
+	/* Add as a pending op and ring up the timer */
+	if (tidsendc->is_complete == 0) {
+		STAILQ_INSERT_TAIL(&protoexp->pend_sendq, tidsendc, next);
+		psmi_timer_request(protoexp->timerq, &protoexp->timer_send,
+			   PSMI_TIMER_PRIO_1);
+	}
+
+	PSM2_LOG_MSG("leaving");
+}
+#endif // RNDV_MOD
+
+static
+psm2_error_t
+ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current)
+{
+	struct ips_protoexp *protoexp = (struct ips_protoexp *)timer->context;
+	struct ips_tid_send_pend *phead = &protoexp->pend_sendq;
+	struct ips_tid_send_desc *tidsendc;
+	psm2_error_t err = PSM2_OK;
+	_HFI_MMDBG("ips_tid_pendsend_timer_callback\n");
+
+	while (!STAILQ_EMPTY(phead)) {
+		tidsendc = STAILQ_FIRST(phead);
+
+		// we have some scb's and can use them to queue some more EXPTID packets
+#ifdef RNDV_MOD
+		if (tidsendc->rv_need_err_chk_rdma)
+			err = ips_protoexp_send_err_chk_rdma(tidsendc);
+		else
+#endif
+			err = ips_tid_send_exp(tidsendc);
+
+		if (tidsendc->is_complete)
+			STAILQ_REMOVE_HEAD(phead, next);
+
+		if (err == PSM2_OK) {
+			/* Was able to complete the send, keep going */
+		} else if (err == PSM2_EP_NO_RESOURCES) {
+			/* No more sendbufs available, sendbuf callback will requeue this
+			 * timer */
+			break;
+		} else if (err == PSM2_TIMEOUT
+				  || err == PSM2_EPID_RV_CONNECT_RECOVERING
+				  || err == PSM2_EPID_RV_CONNECT_ERROR) {
+			/* Always a case of try later:
+			 * On PIO flow, means no send pio bufs available
+			 * On DMA flow, means kernel can't queue request or would have to block
+			 * On RV or user RDMA QP is full or connection recovery/issues
+			 */
+			psmi_timer_request(protoexp->proto->timerq,
+					   &protoexp->timer_send,
+					   get_cycles() +
+					   protoexp->proto->timeout_send);
+			break;
+		} else {
+			/* Forced to reschedule later so we can check receive queue */
+			psmi_assert(err == PSM2_OK_NO_PROGRESS);
+			psmi_timer_request(protoexp->proto->timerq,
+					   &protoexp->timer_send,
+					   PSMI_TIMER_PRIO_1);
+			break;
+		}
+	}
+
+	return PSM2_OK;
+}
+
+/* Right now, in the kernel we are allowing for virtually non-contiguous pages,
+   in a single call, and we are therefore locking one page at a time, but since
+   the intended use of this routine is for a single group of
+   virtually contiguous pages, that should change to improve
+   performance.  That means possibly changing the calling MPI code.
+   Doing so gets rid of some of the loop stuff here, and in the driver,
+   and allows for a single call to the core VM code in the kernel,
+   rather than one per page, definitely improving performance. */
+
+
+static
+psm2_error_t
+ips_tid_recv_alloc(struct ips_protoexp *protoexp,
+		   ips_epaddr_t *ipsaddr,
+		   const struct ips_tid_get_request *getreq,
+		   uint32_t nbytes_this, struct ips_tid_recv_desc **ptidrecvc)
+{
+	psm2_error_t err;
+	ips_scb_t *grantscb;
+	psm2_mq_req_t req = getreq->tidgr_req;
+	struct ips_proto *proto = protoexp->proto;
+
+	struct ips_tid_recv_desc *tidrecvc;
+
+	PSM2_LOG_MSG("entering");
+	/* Allocate all necessary resources. */
+	_HFI_MMDBG("tid_recv_alloc\n");
+
+	// allocate what we need to handle TID or RDMA on receive side
+	// we do this before we issue CTS
+
+	/* 1. allocate a tid grant (CTS) scb. */
+	grantscb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0);
+	if (grantscb == NULL) {
+		_HFI_MMDBG("Wait: NO GRANT SCB\n");
+		/* ips_tid_scbavail_callback() will reschedule */
+		PSM2_LOG_MSG("leaving");
+		return PSM2_EP_NO_RESOURCES;
+	}
+
+
+	/* 2. allocate a tid flow entry. */
+	err = ips_tf_allocate(&protoexp->tfc, &tidrecvc);
+	if (err != PSM2_OK) {
+		_HFI_MMDBG("Wait: NO tid flow\n");
+		ips_scbctrl_free(grantscb);
+		/* Unable to get a tidflow for expected protocol. */
+		psmi_timer_request(protoexp->timerq,
+			&protoexp->timer_getreqs, PSMI_TIMER_PRIO_1);
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+	tidrecvc->mr = NULL;	// be safe, but should be NULL since cleared on release
+
+#ifdef PSM_CUDA
+	if (req->is_buf_gpu_mem)
+		tidrecvc->is_ptr_gpu_backed = !getreq->cuda_hostbuf_used;
+	else
+		tidrecvc->is_ptr_gpu_backed = req->is_buf_gpu_mem;
+
+	/* 3. allocate a cuda bounce buffer, if required */
+	struct ips_cuda_hostbuf *chb = NULL;
+	if (getreq->cuda_hostbuf_used) {
+		unsigned bufsz;
+		if (nbytes_this <= CUDA_SMALLHOSTBUF_SZ) {
+			chb = (struct ips_cuda_hostbuf *)
+				psmi_mpool_get(
+					protoexp->cuda_hostbuf_pool_small_recv);
+			bufsz = protoexp->cuda_hostbuf_small_recv_cfg.bufsz;
+		}
+		if (chb == NULL) {
+			chb = (struct ips_cuda_hostbuf *)
+				psmi_mpool_get(
+					protoexp->cuda_hostbuf_pool_recv);
+			bufsz = protoexp->cuda_hostbuf_recv_cfg.bufsz;
+		}
+		if (chb == NULL) {
+			/* Unable to get a cudahostbuf for TID.
+			 * Release the resources we're holding and reschedule.*/
+			ips_tf_deallocate(&protoexp->tfc,
+					  tidrecvc->rdescid._desc_idx);
+			ips_scbctrl_free(grantscb);
+			psmi_timer_request(protoexp->timerq,
+					   &protoexp->timer_getreqs,
+					   PSMI_TIMER_PRIO_1);
+			PSM2_LOG_MSG("leaving");
+			return PSM2_EP_NO_RESOURCES;
+		}
+
+		if (chb->host_buf == NULL) {
+			PSMI_CUDA_CALL(cuMemHostAlloc,
+				       (void **) &chb->host_buf,
+				       bufsz,
+				       CU_MEMHOSTALLOC_PORTABLE);
+		}
+		tidrecvc->cuda_hostbuf = chb;
+		tidrecvc->buffer = chb->host_buf;
+		chb->size = 0;
+		chb->gpu_buf = (CUdeviceptr) getreq->tidgr_lbuf +
+					getreq->tidgr_offset;
+	} else {
+		chb = NULL;
+		tidrecvc->buffer = (void *)((uintptr_t) getreq->tidgr_lbuf +
+					    getreq->tidgr_offset);
+		tidrecvc->cuda_hostbuf = NULL;
+	}
+#else // PSM_CUDA
+	tidrecvc->buffer =
+	    (void *)((uintptr_t) getreq->tidgr_lbuf + getreq->tidgr_offset);
+#endif // PSM_CUDA
+
+	// separate MR cache's per EP, so this confirms we have the same EP
+	if (
+#ifdef PSM_CUDA
+		! getreq->cuda_hostbuf_used &&
+#endif
+		req->mr && req->mr->cache == proto->mr_cache) {
+		_HFI_MMDBG("CTS chunk reference recv: %p %u bytes via %p %"PRIu64"\n", tidrecvc->buffer, nbytes_this, req->mr->addr, req->mr->length);
+		tidrecvc->mr = psm2_verbs_ref_mr(req->mr);
+	} else {
+		_HFI_MMDBG("CTS chunk register recv: %p %u bytes\n", tidrecvc->buffer, nbytes_this);
+		tidrecvc->mr = psm2_verbs_reg_mr(proto->mr_cache, 1,
+                        proto->ep->verbs_ep.pd,
+                        tidrecvc->buffer, nbytes_this, IBV_ACCESS_RDMA|IBV_ACCESS_REMOTE_WRITE
+#ifdef PSM_CUDA
+               			| (tidrecvc->is_ptr_gpu_backed?IBV_ACCESS_IS_GPU_ADDR:0)
+#endif
+						);
+		if (! tidrecvc->mr) {
+#ifdef PSM_CUDA
+			if (chb)
+				psmi_mpool_put(chb);
+#endif
+			ips_tf_deallocate(&protoexp->tfc, tidrecvc->rdescid._desc_idx);
+			ips_scbctrl_free(grantscb);
+			/* Unable to register MR */
+			psmi_timer_request(protoexp->timerq,
+				&protoexp->timer_getreqs, PSMI_TIMER_PRIO_1);
+			PSM2_LOG_MSG("leaving");
+			return PSM2_TIMEOUT;	// make sure we try again
+		}
+		_HFI_MMDBG("CTS chunk registered: addr %p len %d rkey 0x%x\n",  tidrecvc->buffer , nbytes_this, tidrecvc->mr->rkey);
+	}
+
+	tidrecvc->recv_msglen = nbytes_this;
+
+	/* Initialize recv descriptor */
+	tidrecvc->ipsaddr = ipsaddr;
+	tidrecvc->getreq = (struct ips_tid_get_request *)getreq;
+
+
+	tidrecvc->tidflow_nswap_gen = 0;
+	tidrecvc->tidflow_genseq.psn_gen = tidrecvc->tidflow_active_gen;
+	/* Always start the sequence number at 0 (zero), in order to
+	 * prevent wraparound sequence numbers */
+	tidrecvc->tidflow_genseq.psn_seq = 0;
+
+	tidrecvc->tid_list.tsess_srcoff = getreq->tidgr_offset;
+	tidrecvc->tid_list.tsess_length = tidrecvc->recv_msglen;
+	// when using kernel PD/MR for kernel rendezvous QP, we need to xlat
+	// our buffer to the kernel PD/MR iova space.
+	// for user space PD/MR iova == addr and xlat is a noop
+	tidrecvc->tid_list.tsess_rkey = tidrecvc->mr->rkey;
+	tidrecvc->tid_list.tsess_raddr = tidrecvc->mr->iova + ((uintptr_t)tidrecvc->buffer -  (uintptr_t)tidrecvc->mr->addr);
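+	// e.g. (illustrative) if buffer sits 8K into the registered region
+	// (buffer == mr->addr + 8K), then tsess_raddr == mr->iova + 8K: the
+	// offset within the MR is preserved across the addr->iova translation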
+
+	tidrecvc->state = TIDRECVC_STATE_BUSY;
+
+	tidrecvc->stats.nSeqErr = 0;
+	tidrecvc->stats.nGenErr = 0;
+	tidrecvc->stats.nReXmit = 0;
+	tidrecvc->stats.nErrChkReceived = 0;
+
+	_HFI_EXP("alloc tidrecv=%d\n",
+		 tidrecvc->rdescid._desc_idx);
+
+	tidrecvc->grantscb = grantscb;
+
+	*ptidrecvc = tidrecvc; /* return to caller */
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK;
+}
+
+// process receiver side TID queue
+// If we have TID resources, we will acquire them, setup TID HW,
+// prepare a CTS and send it
+// If we run out of resources with more to do, we reschedule ourselves on timer
+// called directly or on timer
+static
+psm2_error_t
+ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current)
+{
+	struct ips_tid_get_pend *phead =
+	    &((struct ips_protoexp *)timer->context)->pend_getreqsq;
+	struct ips_protoexp *protoexp;
+	struct ips_tid_get_request *getreq;
+	struct ips_tid_recv_desc *tidrecvc;
+	ips_epaddr_t *ipsaddr;
+	uint32_t nbytes_this, count;
+#ifdef RNDV_MOD
+	struct ips_tid_err_resp_pend *phead_resp =
+	    &((struct ips_protoexp *)timer->context)->pend_err_resp;
+#endif
+	int ret;
+
+	PSM2_LOG_MSG("entering");
+	_HFI_MMDBG("ips_tid_pendtids_timer_callback\n");
+
+#ifdef RNDV_MOD
+	while (!STAILQ_EMPTY(phead_resp)) {
+		ipsaddr = STAILQ_FIRST(phead_resp);
+		protoexp = ipsaddr->epaddr.proto->protoexp;
+		psmi_assert(ipsaddr->rv_need_send_err_chk_rdma_resp);
+		ips_protoexp_send_err_chk_rdma_resp(&ipsaddr->flows[protoexp->proto->msgflowid]);
+		if (! ipsaddr->rv_need_send_err_chk_rdma_resp)
+			STAILQ_REMOVE_HEAD(phead_resp, pend_err_resp_next);
+		else
+			break; // ips_tid_scbavail_callback will trigger us again
+	}
+#endif
+
+#ifdef PSM_CUDA
+	if (
+	    1	/* due to unaligned recv using hostbuf, must always do this */
+	) {
+		/* Before processing pending TID requests, first try to free up
+		 * any CUDA host buffers that are now idle. */
+		struct ips_tid_get_cudapend *cphead =
+			&((struct ips_protoexp *)timer->context)->cudapend_getreqsq;
+		psm2_error_t err;
+
+		/* See if any CUDA memcpys are in progress. Grab the first getreq... */
+		while (!STAILQ_EMPTY(cphead)) {
+			getreq = STAILQ_FIRST(cphead);
+
+			err = psmi_cuda_reclaim_hostbufs(getreq);
+			if (err == PSM2_OK_NO_PROGRESS)
+				goto cudapend_exit;
+
+			/* This pending cuda getreq has no more CUDA ops queued up.
+			 * Either it's completely done, or the CUDA copies have caught
+			 * up with the TID data xfer, but the TID xfer itself is not
+			 * finished.
+			 */
+			if (getreq->tidgr_cuda_bytesdone == getreq->tidgr_length) {
+				/* TID xfer is done.
+				 * We should only get here if:
+				 * this involved a cuda copy, and
+				 * the TID xfer is done.
+				 */
+				psmi_assert(getreq->cuda_hostbuf_used);
+				psmi_assert(getreq->tidgr_length ==
+					    getreq->tidgr_offset);
+
+				/* Remove from the cudapend list, and reclaim */
+				getreq->tidgr_protoexp = NULL;
+				getreq->tidgr_epaddr = NULL;
+				STAILQ_REMOVE_HEAD(cphead, tidgr_next);
+
+				/* mark the req as done */
+				if (getreq->tidgr_callback)
+					getreq->tidgr_callback(getreq->tidgr_req);
+				psmi_mpool_put(getreq);
+			} else
+				break; /* CUDA xfers in progress. Leave. */
+		}
+	}
+cudapend_exit:
+#endif
+
+	while (!STAILQ_EMPTY(phead)) {
+		getreq = STAILQ_FIRST(phead);
+		ipsaddr = (ips_epaddr_t *) (getreq->tidgr_epaddr);
+		count = ipsaddr->msgctl->ipsaddr_count;
+
+ipsaddr_next:
+		// always stripe for rendezvous, ignore multirail_thresh_load_balance
+		// TBD - for multi-rail does this eliminate any advantages of
+		// registering the MR in ips_proto_mq_rts_match_callback
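+		// round-robin across rails: advance to the next ipsaddr and
+		// rotate the msgctl list head so successive windows stripe
+		// across the available connections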
+		ipsaddr = ipsaddr->msgctl->ipsaddr_next;
+		ipsaddr->msgctl->ipsaddr_next = ipsaddr->next;
+		protoexp = ((psm2_epaddr_t) ipsaddr)->proto->protoexp;
+
+		if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) {
+			psmi_assert(protoexp->proto->msgflowid < EP_FLOW_LAST);
+			struct ips_flow *flow = &ipsaddr->flows[protoexp->proto->msgflowid];
+			if (flow->flags & IPS_FLOW_FLAG_SKIP_CTS) {
+				break;                                    /* skip sending next CTS */
+			}
+		}
+
+#ifdef PSM_CUDA
+		if (getreq->cuda_hostbuf_used) {
+			/* If this is a large transfer, we may be able to
+			 * start reclaiming before all of the data is sent. */
+			psmi_cuda_reclaim_hostbufs(getreq);
+		}
+#endif
+		/*
+		 * Calculate the next window size, avoid the last
+		 * window too small.
+		 */
+		nbytes_this = getreq->tidgr_length - getreq->tidgr_offset;
+		if (nbytes_this >= 2 * getreq->tidgr_rndv_winsz)
+			nbytes_this = getreq->tidgr_rndv_winsz;
+		else if (nbytes_this > getreq->tidgr_rndv_winsz)
+			nbytes_this /= 2;
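+		/* e.g. (illustrative) with winsz = 4MB: 10MB remaining -> send
+		 * 4MB now; 6MB remaining -> send 3MB now so the final window
+		 * is also 3MB rather than an undersized 2MB. */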
+		_HFI_MMDBG("ips_tid_pendtids_timer_callback: getreq len %u offset %u nbytes_this %u\n", getreq->tidgr_length, getreq->tidgr_offset, nbytes_this);
+
+		/*
+		 * If there is a next window and the next window
+		 * length is greater than PAGESIZE, make sure the window
+		 * starts on a page boundary.
+		 */
+#ifdef PSM_CUDA
+		psm2_mq_req_t req = getreq->tidgr_req;
+		if (req->is_buf_gpu_mem){
+			if (((getreq->tidgr_offset + nbytes_this) <
+					getreq->tidgr_length) &&
+					nbytes_this > PSMI_GPU_PAGESIZE) {
+				uint32_t pageoff =
+					(((uintptr_t)getreq->tidgr_lbuf) &
+						(PSMI_GPU_PAGESIZE - 1)) +
+					getreq->tidgr_offset + nbytes_this;
+				nbytes_this -= pageoff & (PSMI_GPU_PAGESIZE - 1);
+			}
+		} else
+#endif
+// TBD - we may not need this page alignment of nbytes_this
+		{
+			if ((getreq->tidgr_offset + nbytes_this) <
+					getreq->tidgr_length &&
+					nbytes_this > PSMI_PAGESIZE) {
+				uint32_t pageoff =
+					(((uintptr_t)getreq->tidgr_lbuf) &
+						(PSMI_PAGESIZE - 1)) +
+					getreq->tidgr_offset + nbytes_this;
+				nbytes_this -= pageoff & (PSMI_PAGESIZE - 1);
+			}
+		}
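+		/* e.g. (illustrative) PSMI_PAGESIZE = 4K, lbuf starts 1K into
+		 * a page, tidgr_offset = 0, nbytes_this = 64K: pageoff = 65K,
+		 * trim 65K & 4095 = 1K, leaving nbytes_this = 63K so the next
+		 * window starts page-aligned at lbuf + 63K. */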
+		_HFI_MMDBG("ips_tid_pendtids_timer_callback: page align nbytes_this %u\n", nbytes_this);
+
+		psmi_assert(nbytes_this >= 4);
+
+		// for STL native the tids and tidflows available pace incoming TIDs
+		// for UD we still use tidflows available to pace incoming RDMA
+		if ((ret = ips_tf_available(&protoexp->tfc)) <= 0) {
+			/* We're out of tidflow. If this process used all the resource,
+			 * the free callback will reschedule the operation, otherwise,
+			 * we reschedule it here */
+			if (ret == 0) {
+				psmi_timer_request(protoexp->timerq,
+						   &protoexp->timer_getreqs,
+						   PSMI_TIMER_PRIO_1);
+			}
+		} else if (ips_tid_recv_alloc(protoexp, ipsaddr,
+			      getreq, nbytes_this, &tidrecvc) == PSM2_OK) {
+			// send the actual CTS
+			ips_protoexp_send_tid_grant(tidrecvc);
+			_HFI_VDBG("GRANT sent tididx=%d srcoff=%d nbytes=%d/%d\n",
+				  tidrecvc->rdescid._desc_idx,
+				  getreq->tidgr_offset, tidrecvc->recv_msglen,
+				  getreq->tidgr_length);
+
+			if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) {
+				/*
+				 * Once the CTS is sent, we mark the 'flow' object so as
+				 * not to proceed with further CTSes until that one is done.
+				 */
+				struct ips_proto *proto = tidrecvc->protoexp->proto;
+				psmi_assert(proto->msgflowid < EP_FLOW_LAST);
+				struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid];
+				flow->flags |= IPS_FLOW_FLAG_SKIP_CTS;
+			}
+
+			/*
+			 * nbytes_this is the requested length for this session;
+			 * ips_tid_recv_alloc() might register fewer pages, and the
+			 * actual length is in tidrecvc->recv_msglen.
+			 */
+			getreq->tidgr_offset += tidrecvc->recv_msglen;
+			psmi_assert(getreq->tidgr_offset <=
+				    getreq->tidgr_length);
+
+			if (getreq->tidgr_offset == getreq->tidgr_length) {
+#ifdef PSM_CUDA
+				if (getreq->cuda_hostbuf_used) {
+					/* this completes the tid xfer setup.
+					   move to the pending cuda ops queue,
+					   set the timer to catch completion */
+					STAILQ_REMOVE_HEAD(phead, tidgr_next);
+					STAILQ_INSERT_TAIL(
+						&getreq->tidgr_protoexp->cudapend_getreqsq,
+						getreq, tidgr_next);
+					psmi_timer_request(getreq->tidgr_protoexp->timerq,
+							   &getreq->tidgr_protoexp->timer_getreqs,
+							   PSMI_TIMER_PRIO_1);
+					continue;
+				}
+#endif
+				getreq->tidgr_protoexp = NULL;
+				getreq->tidgr_epaddr = NULL;
+				STAILQ_REMOVE_HEAD(phead, tidgr_next);
+				continue;	/* try next grant request */
+			}
+			else if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_RTS_CTS_INTERLEAVE) {
+				/* In the multi-rail case, PSM sends one CTS per request
+				 * per card, after which the request is moved to the end
+				 * of the queue.
+				 */
+				count--;
+				if (count)
+					goto ipsaddr_next;
+				STAILQ_REMOVE_HEAD(phead, tidgr_next);
+				STAILQ_INSERT_TAIL(phead, getreq, tidgr_next);
+				continue;
+			}
+
+			/* created a tidrecvc, reset count */
+			count = ipsaddr->msgctl->ipsaddr_count;
+			goto ipsaddr_next;	/* try next fragment on next ipsaddr */
+		}
+
+		/*
+		 * We need to loop until we can't get a tidrecvc on any
+		 * ipsaddr; then the callbacks on the home protoexp where
+		 * getreq is linked can resume this routine. Otherwise, we
+		 * might leave this getreq orphaned and cause a deadlock.
+		 */
+		count--;
+		if (count)
+			goto ipsaddr_next;
+		break;
+	}
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK;		/* XXX err-broken */
+}
+
+#ifdef PSM_CUDA
+static
+void psmi_cudamemcpy_tid_to_device(struct ips_tid_recv_desc *tidrecvc)
+{
+	struct ips_protoexp *protoexp = tidrecvc->protoexp;
+	struct ips_cuda_hostbuf *chb;
+
+	chb = tidrecvc->cuda_hostbuf;
+	chb->size += tidrecvc->recv_msglen;
+
+	if (protoexp->cudastream_recv == NULL) {
+		PSMI_CUDA_CALL(cuStreamCreate,
+			&protoexp->cudastream_recv,
+			CU_STREAM_NON_BLOCKING);
+	}
+
+	PSMI_CUDA_CALL(cuMemcpyHtoDAsync,
+		       chb->gpu_buf, chb->host_buf,
+		       tidrecvc->recv_msglen,
+		       protoexp->cudastream_recv);
+
+	if (chb->copy_status == NULL) {
+		PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT);
+	}
+	PSMI_CUDA_CALL(cuEventRecord, chb->copy_status,
+		       protoexp->cudastream_recv);
+
+	STAILQ_INSERT_TAIL(&tidrecvc->getreq->pend_cudabuf, chb, next);
+	tidrecvc->cuda_hostbuf = NULL;
+	ips_tid_pendtids_timer_callback(&tidrecvc->getreq->tidgr_protoexp->timer_getreqs,0);
+}
+#endif
+
+// we have completed receipt of the TIDs for a given CTS
+// For RC QP, this is indicated by RDMA completion w/immediate
+static
+psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc)
+{
+	struct ips_protoexp *protoexp = tidrecvc->protoexp;
+	struct ips_tid_get_request *getreq = tidrecvc->getreq;
+	psm2_error_t err = PSM2_OK;
+
+	psmi_assert(getreq != NULL);
+	psmi_assert(tidrecvc->state == TIDRECVC_STATE_BUSY);
+
+#ifdef PSM_CUDA
+	if (tidrecvc->cuda_hostbuf)
+		psmi_cudamemcpy_tid_to_device(tidrecvc);
+#endif
+
+	if (tidrecvc->mr) {
+		_HFI_MMDBG("CTS recv chunk complete, releasing MR: rkey: 0x%x\n", tidrecvc->mr->rkey);
+		psm2_verbs_release_mr(tidrecvc->mr);
+		tidrecvc->mr = NULL;
+	}
+
+	getreq->tidgr_bytesdone += tidrecvc->recv_msglen;
+
+	_HFI_EXP("req=%p bytes=%d/%d\n",
+		 getreq->tidgr_req,
+		 getreq->tidgr_bytesdone, getreq->tidgr_length);
+
+	tidrecvc->state = TIDRECVC_STATE_FREE;
+
+	/* finally free the tidflow */
+	ips_tf_deallocate(&protoexp->tfc, tidrecvc->rdescid._desc_idx);
+
+	if (getreq->tidgr_bytesdone == getreq->tidgr_length) {
+#ifdef PSM_CUDA
+		/* if cuda, we handle callbacks when the cuda xfer is done */
+		if (!getreq->cuda_hostbuf_used) {
+			if (getreq->tidgr_callback)
+				getreq->tidgr_callback(getreq->tidgr_req);
+			psmi_mpool_put(getreq);
+		}
+#else
+		if (getreq->tidgr_callback)
+			getreq->tidgr_callback(getreq->tidgr_req);
+		psmi_mpool_put(getreq);
+#endif
+	} else {
+		/* We just released some tids.
+		 * If requests are waiting on tids to be
+		 * freed, queue up the timer */
+		if (getreq->tidgr_offset < getreq->tidgr_length) {
+			ips_tid_pendtids_timer_callback(&getreq->
+							tidgr_protoexp->
+							timer_getreqs, 0);
+		}
+	}
+
+	/* We freed an MR.  If we have pending sends or pending get requests,
+	 * turn on the timer so they can be processed. */
+	ips_tid_mravail_callback(protoexp->proto);
+
+	return err;
+}
+
+
+
+
+
+
+
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_header.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_header.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c4ae1d6df97822b6ba7616b87cbca84ea3aab76
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_header.h
@@ -0,0 +1,202 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PROTO_HEADER_H
+#define _IPS_PROTO_HEADER_H
+
+/* Although defined as macros, the *_BITS values below are NOT meant to be
+   changed.  They are defined this way so that their values are written in
+   exactly one place.  These macros are used in struct ips_message_header
+   below, as well as in the active messages code for the purpose of
+   establishing how many arguments/handlers are supported, and to assert that
+   values written into the header fields are not too large for the number of
+   bits available. The preprocessor check below ensures less than 32 bits are
+   used.
+ */
+
+/* Number of bits to use for the amhdr_len field. */
+#define IPS_AM_HDR_LEN_BITS 4
+
+/* Number of bits to use for the amhdr_hidx field.  Bounds the number of
+ * handlers supported (1 << IPS_AM_HDR_HIDX_BITS). */
+#define IPS_AM_HDR_HIDX_BITS 8
+
+/* Number of bits to use for the amhdr_nargs field.  Bounds the number of
+   arguments supported (1 << IPS_AM_HDR_NARGS_BITS). */
+#define IPS_AM_HDR_NARGS_BITS 4
+
+#if (IPS_AM_HDR_LEN_BITS + IPS_AM_HDR_HIDX_BITS + IPS_AM_HDR_NARGS_BITS) > 32
+#error "Bad IPS header definition: AM fields must use 32 bits or less"
+#endif
+
+/* Number of AM arguments that can be packed into struct ips_message_header.
+   Remaining arguments up to the max (1 << IPS_AM_HDR_NARGS_BITS) are placed in
+   the data payload. */
+#define IPS_AM_HDR_NARGS  \
+	(sizeof(((struct ips_message_header *)0)->data) / sizeof(psm2_amarg_t))
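+/* With the 16-byte data[] union in the header below and the 8-byte
+   psm2_amarg_t, this evaluates to 2: two arguments ride in the header and
+   the rest (up to 1 << IPS_AM_HDR_NARGS_BITS == 16 total) go in the
+   payload. */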
+
+/* The actual size of the message header is determined by three parameters:
+ * IPS_HEADER_QUEUE_IWORDS (fixed at 5 by hardware)
+ *    OPA words contain LRH and BTH
+ * IPS_HEADER_QUEUE_HWORDS (fixed at 2 by ips protocol)
+ *    IPS hardware words contain ips-protocol-specific data
+ * IPS_HEADER_QUEUE_UWORDS (fixed at 7 by ips protocol)
+ *    IPS user words contain ips-protocol-specific data
+ *
+ * The message header size is computed as IWORDS + HWORDS + UWORDS
+ */
+struct ips_message_header {
+	__be16 lrh[4];
+	__be32 bth[3];
+
+	// 32b alignment
+	/* fields below this point are in host byte order */
+	struct hfi_kdeth khdr;
+
+	// 32b alignment
+	struct {
+		__u32 flags:6;
+		__u32 connidx:26;	/* connection idx */
+	} PACK_SUFFIX;
+
+	// 64b alignment
+	union {
+		struct {
+			struct {
+				__u32 ack_seq_num:31;
+				__u32 reserved:1;
+			} PACK_SUFFIX;
+
+			// 32b alignment
+			union {
+				struct {	/* for active message */
+					__u32 amhdr_len:IPS_AM_HDR_LEN_BITS;
+					__u32 amhdr_nargs:IPS_AM_HDR_NARGS_BITS;
+					__u32 amhdr_hidx:IPS_AM_HDR_HIDX_BITS;
+				} PACK_SUFFIX;
+				__u32 mdata;	/* for misc data */
+			};
+
+			// 64b alignment
+			/* Inline arguments and/or message payload  */
+			union {
+				ptl_arg_t data[2];
+				__u32 uwords[4];
+			};
+		} PACK_SUFFIX;
+
+		/* for message header packet only */
+		struct {
+			__u32 pad1;
+			__u32 tag[3];	/* 96 bits psm tag */
+			ptl_arg_t hdr_data;
+		} PACK_SUFFIX;
+
+		/* for expected tid packet only */
+		struct {
+			__u8	  exp_ustart[3]; /* unaligned start bytes */
+			__u8	  exp_uend[3];   /* unaligned end bytes */
+			__u16	  exp_rdescid_genc; /* tidrecvc gen count */
+			ptl_arg_t exp_sdescid;  /* sender descriptor id */
+			__u32     exp_cksum;	/* optional checksum */
+			__u32     exp_offset;	/* packet offset */
+		} PACK_SUFFIX;
+	};
+} PACK_SUFFIX;
+/* desc_genc is up to 32 bits, but EXPTID header (and RDMA immediate data)
+ * only has room for 16 bits
+ */
+#define IPS_HDR_RDESCID_GENC_MASK 0xffff
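+/* Consequently only the low 16 bits of the generation count travel on the
+   wire (exp_rdescid_genc above), so any generation comparison on receipt
+   must mask with IPS_HDR_RDESCID_GENC_MASK. */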
+
+/*
+ * OpCodes in BTH[0], 24-31 bits. Order is important!!!
+ */
+#define OPCODE_RESERVED			0xC0	/* reserved */
+/* TINY to EXPTID_COMPLETION/ERR_CHK_RDMA_RESP are level 2 packets */
+/* sending queue keeps a copy and resends if timeout waiting for ack */
+/* order and reliability maintained */
+#define OPCODE_TINY			0xC1	/* 0 <= msglen <= 8 */
+#define OPCODE_SHORT			0xC2	/* 8 < msglen <= MTU */
+#define OPCODE_EAGER			0xC3	/* eager packet */
+#define OPCODE_LONG_RTS			0xC4	/* ready to send */
+#define OPCODE_LONG_CTS			0xC5	/* confirm to send */
+#define OPCODE_LONG_DATA		0xC6	/* long data packets */
+#define OPCODE_ERR_CHK_RDMA		0xC7	/* RDMA error recovery */
+#define OPCODE_ERR_CHK_RDMA_RESP 0xC8	/* RDMA error recovery response */
+/* ACK to ERR_CHK_GEN are "level 0 control packets" state machine driven send */
+/* reissue if given state persists */
+/* duplicates can occur with no consequences */
+#define OPCODE_ACK			0xC9	/* explicit ACK packet */
+#define OPCODE_NAK			0xCA	/* explicit NAK packet */
+#define OPCODE_BECN			0xCB	/* congestion control */
+#define OPCODE_ERR_CHK			0xCC	/* query eager receiving */
+//					0xCD	/* reserved */
+/* CONNECT_REQUEST to DISCONNECT_REPLY are "level 1 control packets" */
+/* timer based resend, but rebuild on fly when resend */
+/* consumer must deal with duplicates */
+#define OPCODE_CONNECT_REQUEST		0xCE	/* connect request */
+#define OPCODE_CONNECT_REPLY		0xCF	/* connect reply */
+#define OPCODE_DISCONNECT_REQUEST	0xD0	/* disconnect request */
+#define OPCODE_DISCONNECT_REPLY		0xD1	/* disconnect reply */
+/* AM_REQUEST_NOREPLY to AM_REPLY are level 2 packets */
+/* sending queue keeps a copy and resends if timeout waiting for ack */
+/* order and reliability maintained */
+#define OPCODE_AM_REQUEST_NOREPLY	0xD2	/* AM request w/o reply */
+#define OPCODE_AM_REQUEST		0xD3	/* AM request */
+#define OPCODE_AM_REPLY			0xD4	/* AM reply */
+#define OPCODE_FUTURE_FROM		0xD5	/* reserved for expansion */
+#define OPCODE_FUTURE_TO		0xDF	/* reserved for expansion */
+
+#endif /* _IPS_PROTO_HEADER_H */
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_help.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_help.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a3ebeede2795bc0fe5c99bb8ea2e0417fa00002
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_help.h
@@ -0,0 +1,558 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2017 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2017 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PROTO_HELP_H
+#define _IPS_PROTO_HELP_H
+
+#include "ptl_ips.h"
+
+/* hfi_opcode is not the ips-level opcode. */
+PSMI_ALWAYS_INLINE(
+uint8_t
+_get_proto_hfi_opcode(const struct ips_message_header *p_hdr))
+{
+	return ((__be32_to_cpu(p_hdr->bth[0]) >>
+		 HFI_BTH_OPCODE_SHIFT) & HFI_BTH_OPCODE_MASK);
+}
+
+PSMI_ALWAYS_INLINE(
+uint8_t
+ips_flow_gen_ackflags(ips_scb_t *scb, struct ips_flow *flow))
+{
+	/*
+	 * Set the ACK request flag if more than ack_interval packets
+	 * have gone by without an ACK being requested
+	 */
+	if (scb->scb_flags & IPS_SEND_FLAG_ACKREQ || scb->nfrag > 1) {
+		flow->ack_counter = 0;
+	} else {
+		flow->ack_counter++;
+		if (flow->ack_counter > flow->ack_interval) {
+			flow->ack_counter = 0;
+			scb->scb_flags |= IPS_SEND_FLAG_ACKREQ;
+		}
+	}
+
+	/* Bottom 6 bits wind up in protocol header fields, other bits
+	 * control other aspects of packet composition */
+	return (uint8_t) (scb->scb_flags & IPS_SEND_FLAG_PROTO_OPTS);
+}
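+/* e.g. with ack_interval == 63, every 64th single-fragment scb that does
+   not already request an ACK gets IPS_SEND_FLAG_ACKREQ set; scbs that
+   already request one, and multi-fragment scbs, just reset the counter. */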
+
+PSMI_ALWAYS_INLINE(
+ips_epaddr_flow_t
+ips_proto_flowid(struct ips_message_header *p_hdr))
+{
+	return (ips_epaddr_flow_t) ((__be32_to_cpu(p_hdr->bth[1]) >>
+				     HFI_BTH_FLOWID_SHIFT) &
+				    HFI_BTH_FLOWID_MASK);
+}
+
+PSMI_ALWAYS_INLINE(
+int
+ips_do_cksum(struct ips_proto *proto, struct ips_message_header *p_hdr,
+	     void *payload, uint32_t paylen, uint32_t *cksum))
+{
+	uint16_t paywords;
+
+	/* Update the payload words in header */
+	paywords = (sizeof(struct ips_message_header) + paylen +
+		    PSM_CRC_SIZE_IN_BYTES + HFI_CRC_SIZE_IN_BYTES) >>
+	    BYTE2DWORD_SHIFT;
+	p_hdr->lrh[2] = __cpu_to_be16(paywords & HFI_LRH_PKTLEN_MASK);
+
+	/* Need to regenerate KDETH checksum after updating payload length */
+	/* ips_kdeth_cksum(p_hdr); */
+
+	*cksum = 0xffffffff;
+
+	/* Checksum header */
+	*cksum = ips_crc_calculate(sizeof(struct ips_message_header),
+				   (uint8_t *) p_hdr, *cksum);
+
+	/* Checksum payload (if any) */
+	if (paylen) {
+		psmi_assert_always(payload);
+		*cksum = ips_crc_calculate(paylen, (uint8_t *) payload, *cksum);
+	}
+
+	return 0;
+}
+
+
+PSMI_ALWAYS_INLINE(
+void
+ips_proto_hdr(struct ips_proto *proto, struct ips_epaddr *ipsaddr,
+	      struct ips_flow *flow, ips_scb_t *scb, uint8_t flags))
+{
+	uint16_t slid, dlid;
+	uint32_t paywords = (sizeof(struct ips_message_header) +
+			     scb->payload_size + HFI_CRC_SIZE_IN_BYTES) >>
+	    BYTE2DWORD_SHIFT;
+	struct ips_message_header *p_hdr = &scb->ips_lrh;
+#if 0
+	/*
+	 * This scb has been used by this connection last time,
+	 * so some of the header fields are already set.
+	 */
+	if (scb->flow == flow) {
+		p_hdr->lrh[2] = __cpu_to_be16(paywords & HFI_LRH_PKTLEN_MASK);
+
+		p_hdr->bth[0] = __cpu_to_be32(flow->path->pr_pkey |
+					      (scb->
+					       opcode << BTH_OPCODE_SHIFT) |
+					      (extra_bytes <<
+					       BTH_EXTRA_BYTE_SHIFT));
+		p_hdr->bth[2] =
+		    __cpu_to_be32(flow->xmit_seq_num.
+				  psn | (scb->scb_flags & IPS_SEND_FLAG_ACKREQ));
+
+		p_hdr->khdr.kdeth0 = __cpu_to_le32(scb->offset |
+						   (scb->
+						    offset_mode <<
+						    HFI_KHDR_OM_SHIFT) | (scb->
+									  tid <<
+									  HFI_KHDR_TID_SHIFT)
+						   | (scb->
+						      tidctrl <<
+						      HFI_KHDR_TIDCTRL_SHIFT) |
+						   (scb->
+						    flags & IPS_SEND_FLAG_INTR)
+						   | (scb->
+						      flags &
+						      IPS_SEND_FLAG_HDR_SUPPRESS)
+						   | (IPS_PROTO_VERSION <<
+						      HFI_KHDR_KVER_SHIFT));
+
+		/* ips_kdeth_cksum(p_hdr); // Generate KDETH checksum */
+
+		p_hdr->ack_seq_num = flow->recv_seq_num.psn;
+		p_hdr->flags = flags;
+
+		return;
+	}
+#endif
+	slid = flow->path->pr_slid;
+	dlid = flow->path->pr_dlid;
+	if (scb->scb_flags & IPS_SEND_FLAG_NO_LMC) {
+		slid = ipsaddr->pathgrp->pg_base_slid;
+		dlid = ipsaddr->pathgrp->pg_base_dlid;
+	}
+
+	/* Setup LRH fields */
+	p_hdr->lrh[0] = __cpu_to_be16(HFI_LRH_BTH |
+				      ((flow->path->pr_sl & HFI_LRH_SL_MASK) <<
+				       HFI_LRH_SL_SHIFT)
+					);
+	p_hdr->lrh[1] = dlid;
+	p_hdr->lrh[2] = __cpu_to_be16(paywords & HFI_LRH_PKTLEN_MASK);
+	p_hdr->lrh[3] = slid;
+
+	/* Setup BTH fields */
+	p_hdr->bth[0] = __cpu_to_be32(flow->path->pr_pkey |
+			      (scb->opcode << HFI_BTH_OPCODE_SHIFT));
+	p_hdr->bth[2] = __cpu_to_be32(flow->xmit_seq_num.psn_num |
+				      (scb->scb_flags & IPS_SEND_FLAG_ACKREQ));
+
+	if (scb->tidctrl) {	/* expected receive packet */
+		psmi_assert(scb->tidsendc != NULL);
+		p_hdr->bth[1] = __cpu_to_be32((scb->tidsendc->
+						rdescid._desc_idx
+						 << HFI_BTH_FLOWID_SHIFT));
+
+		/* Setup KHDR fields */
+		p_hdr->khdr.kdeth0 = __cpu_to_le32(p_hdr->khdr.kdeth0 |
+						   (scb->tidctrl <<
+						    HFI_KHDR_TIDCTRL_SHIFT) |
+						   (scb->scb_flags &
+							IPS_SEND_FLAG_INTR)
+						   | (IPS_PROTO_VERSION <<
+						    HFI_KHDR_KVER_SHIFT));
+	} else {		/* eager receive packet */
+		p_hdr->bth[1] = __cpu_to_be32((flow->flowid
+						 << HFI_BTH_FLOWID_SHIFT));
+		/* Setup KHDR fields */
+		p_hdr->khdr.kdeth0 = __cpu_to_le32(p_hdr->khdr.kdeth0 |
+						   (scb->scb_flags &
+							IPS_SEND_FLAG_INTR)
+						   | (IPS_PROTO_VERSION <<
+						      HFI_KHDR_KVER_SHIFT));
+
+		p_hdr->ack_seq_num = flow->recv_seq_num.psn_num;
+	}
+
+	p_hdr->khdr.job_key = 0;
+	p_hdr->connidx = ipsaddr->connidx_outgoing;
+	p_hdr->flags = flags;
+
+	scb->flow = flow;
+
+	return;
+}
+
+/*
+ * Assumes that the following fields are already set in scb:
+ * payload
+ * payload_size
+ * flags
+ */
+PSMI_INLINE(
+void
+ips_scb_prepare_flow_inner(struct ips_proto *proto, struct ips_epaddr *ipsaddr,
+			   struct ips_flow *flow, ips_scb_t *scb))
+{
+	// ips_ptl_mq_rndv can allow small odd sized payload in RTS
+	psmi_assert(scb->payload_size <= 3 || ! (scb->payload_size & 3));
+	ips_proto_hdr(proto, ipsaddr, flow, scb,
+		      ips_flow_gen_ackflags(scb, flow));
+
+	scb->ack_timeout = proto->epinfo.ep_timeout_ack;
+	scb->abs_timeout = TIMEOUT_INFINITE;
+	scb->scb_flags |= IPS_SEND_FLAG_PENDING;
+
+	if (flow->protocol == PSM_PROTOCOL_TIDFLOW) {
+		flow->xmit_seq_num.psn_seq += scb->nfrag;
+		scb->seq_num = flow->xmit_seq_num;
+		scb->seq_num.psn_seq--;
+	} else {
+		flow->xmit_seq_num.psn_num =
+		    (flow->xmit_seq_num.psn_num + scb->nfrag) & proto->psn_mask;
+		scb->seq_num.psn_num =
+		    (flow->xmit_seq_num.psn_num - 1) & proto->psn_mask;
+	}
+
+	return;
+}
+
+PSMI_ALWAYS_INLINE(
+void
+ips_proto_epaddr_stats_set(struct ips_proto *proto, uint8_t msgtype))
+{
+	switch (msgtype) {
+	case OPCODE_ACK:
+		break;
+	case OPCODE_ERR_CHK:
+		proto->epaddr_stats.err_chk_send++;
+		break;
+	case OPCODE_NAK:
+		proto->epaddr_stats.nak_send++;
+		break;
+	case OPCODE_CONNECT_REQUEST:
+		proto->epaddr_stats.connect_req_send++;
+		break;
+	case OPCODE_CONNECT_REPLY:
+		proto->epaddr_stats.connect_rep_send++;
+		break;
+	case OPCODE_DISCONNECT_REQUEST:
+		proto->epaddr_stats.disconnect_req_send++;
+		break;
+	case OPCODE_DISCONNECT_REPLY:
+		proto->epaddr_stats.disconnect_rep_send++;
+		break;
+	default:
+		break;
+	}
+	return;
+}
+
+/*
+ * Exported here solely so is_expected_or_nak and the mq_tiny handling
+ * can be inlined
+ */
+extern
+psm2_error_t ips_proto_send_ctrl_message(struct ips_flow *flow,
+		uint8_t message_type, uint16_t *msg_queue_mask,
+		ips_scb_t *ctrlscb, void *payload, uint32_t paylen);
+
+PSMI_ALWAYS_INLINE(
+void
+ips_proto_send_ack(struct ips_recvhdrq *recvq, struct ips_flow *flow))
+{
+	if_pt(recvq->proto->flags & IPS_PROTO_FLAG_COALESCE_ACKS) {
+		if (flow->flags & IPS_FLOW_FLAG_PENDING_NAK) {
+			flow->flags &= ~IPS_FLOW_FLAG_PENDING_NAK;	/* ACK clears NAK */
+		} else if (!(flow->flags & IPS_FLOW_FLAG_PENDING_ACK)) {
+			SLIST_INSERT_HEAD(&recvq->pending_acks, flow, next);
+		}
+
+		flow->flags |= IPS_FLOW_FLAG_PENDING_ACK;
+	}
+	else {
+		ips_scb_t ctrlscb;
+
+		ctrlscb.scb_flags = 0;
+		ctrlscb.ips_lrh.ack_seq_num = flow->recv_seq_num.psn_num;
+		/* Coalesced ACKs disabled. Send ACK immediately */
+		ips_proto_send_ctrl_message(flow, OPCODE_ACK,
+					    &flow->ipsaddr->ctrl_msg_queued,
+					    &ctrlscb, ctrlscb.cksum, 0);
+	}
+}
+
+PSMI_ALWAYS_INLINE(
+void
+ips_proto_send_nak(struct ips_recvhdrq *recvq, struct ips_flow *flow))
+{
+	if_pt(recvq->proto->flags & IPS_PROTO_FLAG_COALESCE_ACKS) {
+		if (flow->flags & IPS_FLOW_FLAG_PENDING_ACK) {
+			flow->flags &= ~IPS_FLOW_FLAG_PENDING_ACK;	/* NAK clears ACK */
+		} else if (!(flow->flags & IPS_FLOW_FLAG_PENDING_NAK)) {
+			SLIST_INSERT_HEAD(&recvq->pending_acks, flow, next);
+		}
+
+		flow->flags |= IPS_FLOW_FLAG_PENDING_NAK;
+	}
+	else {
+		ips_scb_t ctrlscb;
+
+		ctrlscb.scb_flags = 0;
+		ctrlscb.ips_lrh.ack_seq_num = flow->recv_seq_num.psn_num;
+		/* Coalesced ACKs disabled. Send NAK immediately */
+		ips_proto_send_ctrl_message(flow, OPCODE_NAK,
+					    &flow->ipsaddr->ctrl_msg_queued,
+					    &ctrlscb, ctrlscb.cksum, 0);
+	}
+}
+
+/* Return 1 if the packet is the next expected in the flow.
+ * Return 0 if it is not the next expected (a NAK is sent for packets
+ * that arrive ahead of the expected PSN).
+ */
+PSMI_ALWAYS_INLINE(
+int
+ips_proto_is_expected_or_nak(struct ips_recvhdrq_event *rcv_ev))
+{
+	struct ips_proto *proto = rcv_ev->proto;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr);
+	struct ips_flow *flow;
+	psmi_seqnum_t sequence_num;
+
+	psmi_assert(flowid == EP_FLOW_GO_BACK_N_PIO);
+	flow = &ipsaddr->flows[flowid];
+
+	sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]);
+	if_pf(flow->recv_seq_num.psn_num == sequence_num.psn_num) {
+		flow->flags &= ~IPS_FLOW_FLAG_NAK_SEND;
+
+		flow->recv_seq_num.psn_num =
+		    (flow->recv_seq_num.psn_num + 1) & proto->psn_mask;
+
+		/* don't process ack, caller will do it. */
+		return 1;
+
+	}
+
+	int16_t diff = (int16_t) (sequence_num.psn_num -
+			       flow->recv_seq_num.psn_num);
+	if (diff > 0) {
+		if (!(flow->flags & IPS_FLOW_FLAG_NAK_SEND)) {
+			/* Queue/Send NAK to peer  */
+			ips_proto_send_nak((struct ips_recvhdrq *)
+					   rcv_ev->recvq, flow);
+			flow->flags |= IPS_FLOW_FLAG_NAK_SEND;
+		}
+	}
+
+	/* process ack if packet is not in sequence. */
+	ips_proto_process_ack(rcv_ev);
+
+	return 0;
+}
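+/* The int16_t cast makes the PSN comparison wraparound-safe: a packet
+   arriving 3 PSNs ahead of recv_seq_num yields diff == 3 (a future packet,
+   NAKed once), while one 3 PSNs behind yields diff == -3 (an old duplicate,
+   no NAK). */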
+
+/*
+ * Note, some code depends on the literal values specified in this enum.
+ */
+enum ips_msg_order {
+	IPS_MSG_ORDER_PAST  = 3,	/* Old message, recv & drop */
+	IPS_MSG_ORDER_EXPECTED_MATCH = 2, /* Expected message, recv on match */
+	IPS_MSG_ORDER_EXPECTED = 1,	/* Expected message, always recv */
+	IPS_MSG_ORDER_FUTURE_RECV = 0,	/* Future message, buffer in OOO Q */
+	IPS_MSG_ORDER_FUTURE = -1,	/* Future message, leave on RHQ */
+};
+
+PSMI_ALWAYS_INLINE(
+enum ips_msg_order
+ips_proto_check_msg_order(ips_epaddr_t *ipsaddr,
+			 struct ips_flow *flow,
+			 uint16_t send_seqnum,
+			 uint16_t *recv_seqnum))
+
+{
+	int16_t diff = (int16_t) (*recv_seqnum - send_seqnum);
+
+	if (likely(diff == 0)) {
+		*recv_seqnum += 1;
+
+		ipsaddr->msg_toggle ^= IPS_FLOW_MSG_TOGGLE_UNEXP_MASK;
+		if (ipsaddr->msg_toggle & IPS_FLOW_MSG_TOGGLE_UNEXP_MASK)
+			return IPS_MSG_ORDER_EXPECTED_MATCH;
+
+		return IPS_MSG_ORDER_EXPECTED;
+	} else if (diff > 0) {
+		return IPS_MSG_ORDER_PAST;
+	}
+
+	ipsaddr->msg_toggle ^= IPS_FLOW_MSG_TOGGLE_OOO_MASK;
+	if (!(ipsaddr->msg_toggle & IPS_FLOW_MSG_TOGGLE_OOO_MASK)) {
+		/*
+		 * Second time we see the same OOO message: receive it and
+		 * put it into the OOO queue.
+		 */
+		return IPS_MSG_ORDER_FUTURE_RECV;
+	}
+
+	/* The first time we see an OOO message, leave it there and try
+	 * again next time. But we need to revert the receiving flow PSN. */
+	uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask;
+	flow->recv_seq_num.psn_num =
+		(flow->recv_seq_num.psn_num - 1) & psn_mask;
+	return IPS_MSG_ORDER_FUTURE;
+}
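+/* The msg_toggle XOR gives every out-of-order message one free pass: the
+   first sighting returns IPS_MSG_ORDER_FUTURE (leave it on the RHQ, rewind
+   the flow PSN), the second returns IPS_MSG_ORDER_FUTURE_RECV (receive it
+   into the OOO queue). */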
+
+PSMI_INLINE(
+int
+ips_proto_process_packet(const struct ips_recvhdrq_event *rcv_ev))
+{
+	uint32_t index;
+
+#ifdef PSM_FI
+
+	if_pf(PSMI_FAULTINJ_ENABLED_EP(rcv_ev->proto->ep)) {
+		PSMI_FAULTINJ_STATIC_DECL(fi_recv, "recvlost",
+					  "drop "
+					  "RC eager or any UD "
+					  "packet at recv",
+					   1, IPS_FAULTINJ_RECVLOST);
+		if_pf(PSMI_FAULTINJ_IS_FAULT(fi_recv, ""))
+			return IPS_RECVHDRQ_CONTINUE;
+	}
+#endif // PSM_FI
+	/* see file ips_proto_header.h for details */
+	index = _get_proto_hfi_opcode(rcv_ev->p_hdr) - OPCODE_RESERVED;
+	if (index >= (OPCODE_FUTURE_FROM - OPCODE_RESERVED))
+		index = 0;
+
+	return ips_packet_service_routine[index]
+			((struct ips_recvhdrq_event *)rcv_ev);
+}
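+/* e.g. OPCODE_EAGER (0xC3) dispatches to ips_packet_service_routine[3],
+   while any opcode at or beyond OPCODE_FUTURE_FROM (0xD5) folds back to
+   index 0, the reserved/unknown-opcode slot. */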
+
+/*
+ * This breaks header encapsulation, but it is needed in mq sends so we
+ * can pay "near-equal" attention to putting sends on the wire and
+ * servicing the receive queue.
+ */
+
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+ips_recv_progress_if_busy(ptl_t *ptl_gen, psm2_error_t err))
+{
+	struct ptl_ips *ptl = (struct ptl_ips *) ptl_gen;
+
+	if (err == PSM2_EP_NO_RESOURCES) {
+		ptl->ctl->ep_poll(ptl_gen, 0);
+		return PSM2_OK;
+	} else
+		return err;
+}
+
+/* Find the next lowest power of two for a 32-bit number */
+PSMI_ALWAYS_INLINE(
+unsigned int
+ips_next_low_pow2(unsigned int v))
+{
+
+	const unsigned int b[] = { 0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000 };
+	const unsigned int S[] = { 1, 2, 4, 8, 16 };
+	register unsigned int r = 1;
+	int i;
+
+	for (i = 4; i >= 0; i--) {
+		if (v & b[i]) {
+			v >>= S[i];
+			r <<= S[i];
+		}
+	}
+
+	return r;
+}
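+/* e.g. ips_next_low_pow2(48) == 32 and ips_next_low_pow2(64) == 64: each
+   set bit group shifts v down and r up, leaving r as the largest power of
+   two not exceeding v (for v >= 1). */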
+
+PSMI_ALWAYS_INLINE(
+ips_path_rec_t *
+ips_select_path(struct ips_proto *proto, ips_path_type_t path_type,
+		ips_epaddr_t *ipsaddr, ips_path_grp_t *pathgrp))
+{
+	uint32_t path_idx;
+
+	if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) {
+		/* If dispersive routes are configured then select the routes
+		 * in round robin order. We may want to use congestion
+		 * information to select the least lightly loaded path.
+		 */
+		path_idx = pathgrp->pg_next_path[path_type];
+		if (++pathgrp->pg_next_path[path_type] >=
+		    pathgrp->pg_num_paths[path_type])
+			pathgrp->pg_next_path[path_type] = 0;
+	} else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST)
+		path_idx =	/* Key on destination context */
+		    ipsaddr->IPSADDR_HASH % pathgrp->pg_num_paths[path_type];
+	else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC)
+		path_idx =	/* Key off src context */
+		    proto->epinfo.EP_HASH % pathgrp->pg_num_paths[path_type];
+	else			/* Base LID routed - Default in Infinipath 2.5 (Oct 09). */
+		path_idx = 0;
+
+	return pathgrp->pg_path[path_idx][path_type];
+}
+
+#endif /* _IPS_PROTO_HELP_H */
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_internal.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..917e098842a09d5fd1741444e1caf00a40bc9fc3
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_internal.h
@@ -0,0 +1,85 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PROTO_INTERNAL_H
+#define _IPS_PROTO_INTERNAL_H
+
+#include "ips_expected_proto.h"
+#include "ips_proto_help.h"
+
+/*
+ * Connect protocol.
+ *
+ * On receive, handled by upcalling into the connect interface.
+ * On send, handled by ips_proto by having connect compose the message.
+ */
+psm2_error_t ips_proto_process_connect(struct ips_proto *proto,
+				      uint8_t opcode,
+				      struct ips_message_header *p_hdr,
+				      void *payload, uint32_t paylen);
+psm2_error_t ips_proto_timer_ack_callback(struct psmi_timer *, uint64_t);
+psm2_error_t ips_proto_timer_send_callback(struct psmi_timer *, uint64_t);
+psm2_error_t ips_proto_timer_ctrlq_callback(struct psmi_timer *, uint64_t);
+psm2_error_t ips_proto_timer_pendq_callback(struct psmi_timer *, uint64_t);
+void ips_proto_rv_scbavail_callback(struct ips_scbctrl *scbc, void *context);
+
+psm2_error_t ips_proto_recv_init(struct ips_proto *proto);
+psm2_error_t ips_proto_recv_fini(struct ips_proto *proto);
+
+int ips_proto_process_err_chk(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_connect_disconnect(struct ips_recvhdrq_event *rcv_ev);
+int ips_proto_process_unknown_opcode(struct ips_recvhdrq_event *rcv_ev);
+
+#endif /* _IPS_PROTO_INTERNAL_H */
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_mq.c b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_mq.c
new file mode 100644
index 0000000000000000000000000000000000000000..15256d472b46fd69d5bd20cc24e1ee1de4e35989
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_mq.c
@@ -0,0 +1,2497 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#ifdef PSM_CUDA
+#include "psm_gdrcpy.h"
+#endif
+#include "ips_scb.h"
+#include "ips_proto.h"
+#include "psm_mq_internal.h"
+#include "ips_expected_proto.h"
+#include "ips_proto_help.h"
+
+PSMI_NEVER_INLINE(ips_scb_t *
+		  ips_poll_scb(struct ips_proto *proto,
+			       int npkts, int len, uint32_t flags, int istiny))
+{
+	ips_scb_t *scb = NULL;
+	psmi_assert(npkts > 0);
+	psm2_error_t err;
+
+	proto->stats.scb_egr_unavail_cnt++;
+
+	PSMI_BLOCKUNTIL(proto->ep, err,
+			((scb =
+			  (istiny ?
+			   ips_scbctrl_alloc_tiny(&proto->scbc_egr) :
+			   ips_scbctrl_alloc(&proto->scbc_egr, npkts, len,
+					     flags))) != NULL));
+	psmi_assert(scb != NULL);
+	return scb;
+}
+
+PSMI_ALWAYS_INLINE(ips_scb_t *mq_alloc_tiny(struct ips_proto *proto))
+{
+	ips_scb_t *scb = ips_scbctrl_alloc_tiny(&proto->scbc_egr);
+	/* common case should branch right through */
+	if_pt(scb != NULL)
+		return scb;
+	else
+		return ips_poll_scb(proto, 1, 0, 0, 1);
+}
+
+PSMI_ALWAYS_INLINE(
+ips_scb_t *
+mq_alloc_pkts(struct ips_proto *proto, int npkts, int len, uint32_t flags))
+{
+	psmi_assert(npkts > 0);
+	ips_scb_t *scb = ips_scbctrl_alloc(&proto->scbc_egr, npkts, len, flags);
+	if_pt(scb != NULL) {
+		return scb;
+	}
+	else {
+		return ips_poll_scb(proto, npkts, len, flags,
+				    0 /* not tiny scb */);
+	}
+}
+
+static
+int ips_proto_scb_mr_complete(void *context, uint32_t nbytes)
+{
+	ips_scb_t *scb = (ips_scb_t *)context;
+	if (scb->mr) {
+		_HFI_MMDBG("SDMA complete, releasing MR: lkey: 0x%x\n", scb->mr->lkey);
+		psm2_verbs_release_mr(scb->mr);
+		scb->mr = NULL;
+		ips_tid_mravail_callback(scb->flow->ipsaddr->epaddr.proto);
+	}
+	return IPS_RECVHDRQ_CONTINUE;
+}
+
+// handle end to end completion of eager and LONG_DATA sends
+static
+int ips_proto_mq_eager_complete(void *reqp, uint32_t nbytes)
+{
+	psm2_mq_req_t req = (psm2_mq_req_t) reqp;
+
+	/* This code path is executed when the send is on a device buffer
+	 * and the receive is completed using eager buffers. As there is no
+	 * completion notification sent to the sender, this is the only place
+	 * where send side chb's can be freed and put back into the mpool.
+	 */
+#ifdef PSM_CUDA
+	struct ips_cuda_hostbuf *chb;
+	if (req->cuda_hostbuf_used) {
+		while (!STAILQ_EMPTY(&req->sendreq_prefetch)) {
+			/* If any prefetched buffers weren't used, they
+			   must be reclaimed here. */
+			chb = STAILQ_FIRST(&req->sendreq_prefetch);
+			STAILQ_REMOVE_HEAD(&req->sendreq_prefetch,
+						   req_next);
+			psmi_mpool_put(chb);
+		}
+	}
+#endif
+
+	req->send_msgoff += nbytes;
+	/*
+	 * we use >= because nbytes may include DW padding.
+	 */
+	if (req->send_msgoff >= req->req_data.send_msglen) {
+		// If we predicted use of RDMA and pre-registered our buffer when we
+		// sent RTS, and receiver chose LONG_DATA in CTS, we can end up here
+		// and need to release our MR
+		if (req->mr) {
+			_HFI_MMDBG("RTS complete, releasing MR: rkey: 0x%x\n", req->mr->rkey);
+			psm2_verbs_release_mr(req->mr);
+			req->mr = NULL;
+			ips_tid_mravail_callback(req->rts_peer->proto);
+		}
+		req->state = MQ_STATE_COMPLETE;
+		ips_barrier();
+		if(!psmi_is_req_internal(req))
+			mq_qq_append(&req->mq->completed_q, req);
+	}
+	return IPS_RECVHDRQ_CONTINUE;
+}
+
+static
+void ips_proto_mq_rv_complete(psm2_mq_req_t req)
+{
+	psmi_mq_handle_rts_complete(req);
+}
+
+PSMI_ALWAYS_INLINE(
+void
+ips_shortcpy(void *vdest, const void *vsrc, uint32_t nchars))
+{
+	unsigned char *dest = vdest;
+	const unsigned char *src = vsrc;
+
+#ifdef PSM_CUDA
+	if (PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(vdest) || PSMI_IS_CUDA_MEM((void *) vsrc))) {
+		PSMI_CUDA_CALL(cuMemcpy,
+			       (CUdeviceptr)vdest, (CUdeviceptr)vsrc, nchars);
+		return;
+	}
+#endif
+
+	if (nchars >> 2)
+		hfi_dwordcpy((uint32_t *) dest, (uint32_t *) src, nchars >> 2);
+	dest += (nchars >> 2) << 2;
+	src += (nchars >> 2) << 2;
+	switch (nchars & 0x03) {
+	case 3: *dest++ = *src++;
+	/* fall through */
+	case 2: *dest++ = *src++;
+	/* fall through */
+	case 1: *dest++ = *src++;
+	}
+	return;
+}
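+/* e.g. nchars == 11 copies two dwords via hfi_dwordcpy and the remaining
+   3-byte tail through the fall-through switch above. */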
+
+#ifdef PSM_CUDA
+PSMI_ALWAYS_INLINE(
+void
+ips_shortcpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars))
+{
+	unsigned char *dest = vdest;
+	const unsigned char *src = vsrc;
+
+	if (nchars >> 2)
+		hfi_dwordcpy((uint32_t *) dest, (uint32_t *) src, nchars >> 2);
+	dest += (nchars >> 2) << 2;
+	src += (nchars >> 2) << 2;
+	switch (nchars & 0x03) {
+	case 3: *dest++ = *src++;
+	/* fall through */
+	case 2: *dest++ = *src++;
+	/* fall through */
+	case 1: *dest++ = *src++;
+	}
+	return;
+}
+#endif
+
+extern psm2_error_t ips_ptl_poll(ptl_t *ptl, int _ignored);
+
+/*
+ * Mechanism to capture PIO-ing or DMA-ing the MQ message envelope
+ *
+ * Recoverable errors:
+ * PSM2_OK: If PIO, envelope is sent.
+ *	   If DMA, all queued up packets on flow were flushed.
+ *
+ * Recoverable errors converted to PSM2_OK just before return:
+ * PSM2_OK_NO_PROGRESS: DMA-only, flushed 1 but not all queued packets.
+ * PSM2_EP_NO_RESOURCES:
+ *	   If PIO, no pio available or cable currently pulled.
+ *	   If DMA, it can be that no scbs are available to handle unaligned
+ *	           packets, or writev returned a recoverable error (no mem for
+ *	           descriptors, dma interrupted, or no space left in dma queue).
+ *
+ * Unrecoverable errors (PIO or DMA).
+ * PSM2_EP_DEVICE_FAILURE: Unexpected error calling writev(), chip failure,
+ *			  rxe/txe parity error.
+ * PSM2_EP_NO_NETWORK: No network, no lid, ...
+ */
+PSMI_ALWAYS_INLINE(
+psm2_error_t
+ips_mq_send_envelope(struct ips_proto *proto, struct ips_flow *flow,
+		     struct ips_scb *scb, int do_flush))
+{
+	psm2_error_t err = PSM2_OK;
+
+	ips_proto_flow_enqueue(flow, scb);
+
+	if ((flow->transfer == PSM_TRANSFER_PIO) || do_flush)
+		err = flow->flush(flow, NULL);
+
+	if (do_flush)
+		err = ips_recv_progress_if_busy(proto->ptl, err);
+
+	/* As per the PSM error model (or lack thereof), PSM clients expect to see
+	 * only PSM2_OK as a recoverable error */
+	if (err == PSM2_EP_NO_RESOURCES || err == PSM2_OK_NO_PROGRESS)
+		err = PSM2_OK;
+	return err;
+}
+
+/*
+ * We don't use message striping for the middle-message protocol;
+ * tests on Sandy Bridge with two HFIs showed lower bandwidth when
+ * message striping was used.
+ */
+ustatic
+psm2_error_t
+ips_ptl_mq_eager(struct ips_proto *proto, psm2_mq_req_t req,
+		 struct ips_flow *flow, psm2_mq_tag_t *tag, const void *ubuf,
+		 uint32_t len)
+{
+	ips_epaddr_t *ipsaddr = flow->ipsaddr;
+	psm2_error_t err = PSM2_OK;
+	uintptr_t buf = (uintptr_t) ubuf;
+	uint32_t nbytes_left, pktlen, offset, chunk_size;
+	uint16_t msgseq, padding;
+	ips_scb_t *scb;
+	uint32_t is_non_dw_mul_allowed = 0;
+
+	psmi_assert(len > 0);
+	psmi_assert(req != NULL);
+
+	chunk_size = flow->frag_size;
+	msgseq = ipsaddr->msgctl->mq_send_seqnum++;
+
+	nbytes_left = len;
+	offset = 0;
+	do {
+		if (is_non_dw_mul_allowed) {
+			/* No need to care about padding if non-double-word-
+			 * multiple message sizes are allowed.
+			 */
+			padding = 0;
+		} else {
+			padding = nbytes_left & 0x3;
+		}
+
+		if (padding) {
+			psmi_assert(nbytes_left > flow->frag_size);
+			/* over-reading is OK on the sender because the
+			 * padding area is within the whole buffer; the
+			 * receiver will discard the extra bytes via the
+			 * padcnt in the packet header
+			 */
+			padding = 4 - padding;
+			pktlen = flow->frag_size - padding;
+		} else {
+			pktlen = min(chunk_size, nbytes_left);
+			psmi_assert(((pktlen & 0x3) == 0) || (is_non_dw_mul_allowed));
+		}
+
+		scb = mq_alloc_pkts(proto, 1, 0, 0);
+		psmi_assert(scb != NULL);
+		ips_scb_opcode(scb) = OPCODE_EAGER;
+		ips_set_LMC_LID_choice(proto, scb, len);
+		scb->ips_lrh.khdr.kdeth0 = msgseq;
+		ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
+		scb->ips_lrh.hdr_data.u32w1 = len;
+		scb->ips_lrh.hdr_data.u32w0 = offset;	/* initial offset */
+
+		_HFI_VDBG
+		    ("payload=%p, thislen=%d, frag_size=%d, nbytes_left=%d\n",
+		     (void *)buf, pktlen, flow->frag_size, nbytes_left);
+		ips_scb_buffer(scb) = (void *)buf;
+		if (req->mr) {
+			scb->mr = req->mr;
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_SEND_MR;
+		}
+
+#ifdef PSM_CUDA
+		if (req->is_buf_gpu_mem) {
+			// flags will get handled in pio transfer_frame
+			// but use cuMemcpy instead of GDRCopy
+			if (!req->mr)
+				ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU;
+			// TBD USER_BUF_GPU only useful for RTS
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
+		}
+#endif // PSM_CUDA
+
+		buf += pktlen;
+		offset += pktlen;
+		nbytes_left -= pktlen;
+
+		pktlen += padding;
+		psmi_assert(((pktlen & 0x3) == 0) || (is_non_dw_mul_allowed));
+
+		scb->frag_size = flow->frag_size;
+		scb->nfrag = (pktlen + flow->frag_size - 1) / flow->frag_size;
+		if (scb->nfrag > 1) {
+			ips_scb_length(scb) = flow->frag_size;
+			scb->nfrag_remaining = scb->nfrag;
+			scb->chunk_size =
+				scb->chunk_size_remaining = pktlen;
+		} else
+			ips_scb_length(scb) = pktlen;
+
+		if (nbytes_left == 0) {	/* last segment/packet */
+			ips_scb_cb(scb) = ips_proto_mq_eager_complete;
+			ips_scb_cb_param(scb) = req;
+
+			/* Set ACKREQ if there is a single packet per scb.
+			 * For multiple packets per scb it is SDMA; the driver
+			 * will set ACKREQ in the last packet, and we only
+			 * need an ACK for the last packet.
+			 */
+			if (scb->nfrag == 1)
+				ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ;
+		} else {
+			req->send_msgoff += pktlen;
+		}
+		ips_proto_flow_enqueue(flow, scb);
+
+		if (flow->transfer == PSM_TRANSFER_PIO) {
+			/* we need to flush the pio pending queue as quick as possible */
+			err = flow->flush(flow, NULL);
+		}
+
+	} while (nbytes_left);
+
+
+	/* Before returning, try to make some progress as long as the operation
+	 * is not a fast-path isend. If this is a fast-path isend we cannot call
+	 * progress functions, since that would recurse into recvhdrq_progress
+	 * and cause messages to be lost. Instead, for the fast path, if the
+	 * operation was successfully enqueued but flush returned
+	 * PSM2_OK_NO_PROGRESS, we return PSM2_OK since the user will progress
+	 * the queue once the fast-path call is complete.
+	 */
+	if (err == PSM2_EP_NO_RESOURCES || err == PSM2_OK_NO_PROGRESS) {
+		if (likely(!(req->flags_internal & PSMI_REQ_FLAG_FASTPATH))) {
+			err = ips_recv_progress_if_busy(proto->ptl, PSM2_EP_NO_RESOURCES);
+		} else if (err == PSM2_EP_NO_RESOURCES) {
+			err = PSM2_OK;
+		}
+	}
+
+	return err;
+}
+
+static
+psm2_error_t
+ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req,
+		ips_epaddr_t *ipsaddr, const void *buf, uint32_t len)
+{
+	psmi_assert(proto->msgflowid < EP_FLOW_LAST);
+	struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid];
+	psm2_error_t err = PSM2_OK;
+	ips_scb_t *scb;
+
+	PSM2_LOG_MSG("entering");
+	req->req_data.buf = (void *)buf;
+	req->req_data.buf_len = len;
+	req->req_data.send_msglen = len;
+	req->recv_msgoff = 0;
+	req->rts_peer = (psm2_epaddr_t) ipsaddr;
+
+	scb = mq_alloc_pkts(proto, 1, 0, 0);
+	psmi_assert(scb);
+	ips_scb_opcode(scb) = OPCODE_LONG_RTS;
+	ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ;
+	if (req->type & MQE_TYPE_WAITING)
+		ips_scb_flags(scb) |= IPS_SEND_FLAG_BLOCKING;
+	scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->mq_send_seqnum++;
+	ips_scb_copy_tag(scb->ips_lrh.tag, req->req_data.tag.tag);
+	scb->ips_lrh.hdr_data.u32w1 = len;
+	scb->ips_lrh.hdr_data.u32w0 = psmi_mpool_get_obj_index(req);
+
+	// small well aligned synchronous payload is sent in RTS itself
+	// CTS becomes the synchronous ACK
+	if (len <= flow->frag_size &&
+#ifdef PSM_CUDA
+	    !req->is_buf_gpu_mem &&
+#endif
+	    (len <= 3 || !(len & 0x3))) {
+		ips_scb_buffer(scb) = (void *)buf;
+		ips_scb_length(scb) = len;
+		req->send_msgoff = len;
+		req->mq->stats.tx_rndv_bytes += len;
+	} else {
+		ips_scb_length(scb) = 0;
+		req->send_msgoff = 0;
+	}
+
+#ifdef PSM_CUDA
+	/* Used to indicate to the receiver that the send
+	 * is issued on a device buffer. This helps the
+	 * receiver select TID instead of using eager buffers.
+	 */
+	if (req->is_buf_gpu_mem) {
+		ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
+		scb->mq_req = req;	/* request comes from GPU domain (device) ... */
+	}
+	req->cuda_hostbuf_used = 0;
+	if ((!(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND) &&
+	   req->is_buf_gpu_mem &&
+	    (len > GPUDIRECT_THRESH_RV)) ||
+	    ((proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND)  &&
+	    req->is_buf_gpu_mem &&
+	    (len > gpudirect_send_limit))) {
+		/* send from intermediate host buffer */
+		struct ips_cuda_hostbuf *chb;
+		uint32_t offset, window_len;
+		int prefetch_lookahead = 0;
+
+		STAILQ_INIT(&req->sendreq_prefetch);
+		offset = 0;
+		req->cuda_hostbuf_used = 1;
+		/* start prefetching */
+		req->prefetch_send_msgoff = 0;
+		while ((offset < len) &&
+		       (prefetch_lookahead < proto->cuda_prefetch_limit)) {
+			chb = NULL;
+			window_len =
+				ips_cuda_next_window(ipsaddr->window_rv,
+						     offset, len);
+
+			unsigned bufsz;
+			if (window_len <= CUDA_SMALLHOSTBUF_SZ) {
+				chb = (struct ips_cuda_hostbuf *)
+					psmi_mpool_get(
+					proto->cuda_hostbuf_pool_small_send);
+				bufsz = proto->cuda_hostbuf_small_send_cfg.bufsz;
+			}
+			if (chb == NULL) {
+				chb = (struct ips_cuda_hostbuf *)
+					psmi_mpool_get(
+					proto->cuda_hostbuf_pool_send);
+				bufsz = proto->cuda_hostbuf_send_cfg.bufsz;
+			}
+
+			/* any buffers available? */
+			if (chb == NULL)
+				break;
+
+			req->prefetch_send_msgoff += window_len;
+
+			chb->offset = offset;
+			chb->size = window_len;
+			chb->req = req;
+			chb->gpu_buf = (CUdeviceptr) buf + offset;
+			chb->bytes_read = 0;
+
+			if (proto->cudastream_send == NULL) {
+				PSMI_CUDA_CALL(cuStreamCreate,
+					   &proto->cudastream_send, CU_STREAM_NON_BLOCKING);
+			}
+			if (chb->host_buf == NULL) {
+				PSMI_CUDA_CALL(cuMemHostAlloc,
+					       (void **) &chb->host_buf,
+					       bufsz,
+					       CU_MEMHOSTALLOC_PORTABLE);
+			}
+			if (chb->copy_status == NULL) {
+				PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT);
+			}
+			PSMI_CUDA_CALL(cuMemcpyDtoHAsync,
+				       chb->host_buf, chb->gpu_buf,
+				       window_len,
+				       proto->cudastream_send);
+			PSMI_CUDA_CALL(cuEventRecord,
+				       chb->copy_status,
+				       proto->cudastream_send);
+
+			STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb,
+					   req_next);
+			offset += window_len;
+			prefetch_lookahead++;
+		}
+	}
+#endif
+
+	PSM2_LOG_EPM_COND((len > proto->mq->hfi_thresh_rv) &&
+			  proto->protoexp,
+			  OPCODE_LONG_RTS,PSM2_LOG_TX,proto->ep->epid, req->rts_peer->epid,
+			    "scb->ips_lrh.hdr_data.u32w0: %d",scb->ips_lrh.hdr_data.u32w0);
+	proto->epaddr_stats.rts_send++;
+
+	_HFI_VDBG("sending with rndv %u\n", len);
+	/* If this is a fast-path isend, then we cannot poll or allow
+	 * progressing of the mq from within the fast-path call, otherwise
+	 * messages will be lost. Therefore, given the fast path, we avoid
+	 * calling poll_internal and do not pass PSMI_TRUE, which would
+	 * call ips_recv_progress_if_busy.
+	 */
+	if ((err = ips_mq_send_envelope(proto, flow, scb,
+					!unlikely(req->flags_internal & PSMI_REQ_FLAG_FASTPATH))))
+		goto fail;
+// TBD - we may want to include odd bytes at the start and end of the
+// message in the RTS itself, as opposed to the last EXPTID payload
+// packet's header; then the RDMA Write can be better aligned and may
+// perform better
+	// Start registering memory for the anticipated CTS requesting RDMA.
+	// TBD - we could reduce the duration of the memory pin by doing this
+	// only once we receive the CTS, but that would put this call in the
+	// critical path.  If done after getting the CTS we don't have to
+	// predict whether the remote end will choose the RDMA vs LONG DATA
+	// approach (e.g. the tests of length, etc. below).
+	//
+	// Register the buffer we will use as the source for the RDMA Write.
+	// For PSM_CUDA, a group of host bounce buffers may be used above.
+	// ips_scb_buffer catches when the RTS contains the data, in which
+	// case there is no need for memory registration.  While unlikely, we
+	// also skip registration for zero-length sync messages.
+	// PSM3_RDMA, if disabled, causes proto->protoexp == NULL
+	if (! ips_scb_buffer(scb) && len
+			&& len > proto->mq->hfi_thresh_rv
+			&& proto->protoexp	/* expected tid receive enabled */
+			&& ips_epaddr_connected(ipsaddr)
+#ifdef PSM_CUDA
+			&& len > GPUDIRECT_THRESH_RV
+			&& ! req->cuda_hostbuf_used
+#endif
+		) {
+		req->mr = psm2_verbs_reg_mr(proto->mr_cache, 0, proto->ep->verbs_ep.pd,
+						 req->req_data.buf, req->req_data.send_msglen, IBV_ACCESS_RDMA
+#ifdef PSM_CUDA
+						| (req->is_buf_gpu_mem?IBV_ACCESS_IS_GPU_ADDR:0)
+#endif
+						);
+		// if we failed to register memory we will try again when
+		// we get the CTS.
+	}
+
+	if_pt (! (req->flags_internal & PSMI_REQ_FLAG_FASTPATH)) {
+		/* Assume that we already put a few rndv requests in flight.  This helps
+		 * for bibw microbenchmarks and doesn't hurt the 'blocking' case since
+		 * we're going to poll anyway */
+		psmi_poll_internal(proto->ep, 1);
+	}
+
+fail:
+	_HFI_VDBG
+	    ("[rndv][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p/%d]: %s\n",
+	     psmi_epaddr_get_name(proto->ep->epid),
+	     psmi_epaddr_get_name(req->rts_peer->epid), buf, len,
+	     req->req_data.tag.tag[0], req->req_data.tag.tag[1], req->req_data.tag.tag[2], req,
+	     psmi_mpool_get_obj_index(req), psm2_error_get_string(err));
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+
+#ifdef PSM_CUDA
+static inline
+int psmi_cuda_is_buffer_gpu_mem(void *ubuf)
+{
+	return (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(ubuf));
+}
+
+static inline
+int psmi_cuda_is_needed_rendezvous(struct ips_proto *proto, uint32_t len)
+{
+	if (len > cuda_thresh_rndv) {
+		return 1;
+	}
+
+	return 0;
+}
+#endif
+
+
+psm2_error_t ips_proto_msg_size_thresh_query (enum psm2_info_query_thresh_et qt,
+					      uint32_t *out, psm2_mq_t mq, psm2_epaddr_t epaddr)
+{
+	struct ptl_ips *ptl = (struct ptl_ips *) epaddr->ptlctl->ptl;
+	psm2_error_t rv = PSM2_INTERNAL_ERR;
+
+	switch (qt)
+	{
+	case PSM2_INFO_QUERY_THRESH_IPS_PIO_DMA:
+		*out = ptl->proto.iovec_thresh_eager;
+		rv = PSM2_OK;
+		break;
+	case PSM2_INFO_QUERY_THRESH_IPS_TINY:
+		*out = mq->hfi_thresh_tiny;
+		rv = PSM2_OK;
+		break;
+	case PSM2_INFO_QUERY_THRESH_IPS_PIO_FRAG_SIZE:
+		{
+			ips_epaddr_t *ipsaddr = ((ips_epaddr_t *) epaddr)->msgctl->ipsaddr_next;
+			*out = ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].frag_size;
+		}
+		rv = PSM2_OK;
+		break;
+	case PSM2_INFO_QUERY_THRESH_IPS_DMA_FRAG_SIZE:
+		*out = 0;
+		rv = PSM2_OK;
+		break;
+	case PSM2_INFO_QUERY_THRESH_IPS_RNDV:
+		*out = mq->hfi_thresh_rv;
+		rv = PSM2_OK;
+		break;
+	default:
+		break;
+	}
+
+	return rv;
+}
+
+psm2_error_t
+ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user,
+		   uint32_t flags_internal, psm2_mq_tag_t *tag, const void *ubuf,
+		   uint32_t len, void *context, psm2_mq_req_t *req_o)
+{
+	psm2_error_t err = PSM2_OK;
+	struct ips_proto *proto;
+	struct ips_flow *flow;
+	ips_epaddr_t *ipsaddr;
+	ips_scb_t *scb;
+	psm2_mq_req_t req;
+#if defined(PSM_CUDA)
+	int gpu_mem = 0;
+#endif // PSM_CUDA
+
+	req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+	if_pf(req == NULL)
+		return PSM2_NO_MEMORY;
+
+	_HFI_VDBG("(req=%p) ubuf=%p len=%u, flags_user=0x%x\n", req, ubuf, len, flags_user);
+
+	req->flags_user = flags_user;
+	req->flags_internal = flags_internal;
+	if (len >= mepaddr->proto->multirail_thresh_load_balance) {
+		ipsaddr = ((ips_epaddr_t *) mepaddr)->msgctl->ipsaddr_next;
+		ipsaddr->msgctl->ipsaddr_next = ipsaddr->next;
+	} else {
+		ipsaddr = (ips_epaddr_t *)mepaddr;
+	}
+	psmi_assert(ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED);
+
+	proto = ((psm2_epaddr_t) ipsaddr)->proto;
+
+	psmi_assert(proto->msgflowid < EP_FLOW_LAST);
+	req->req_data.send_msglen = len;
+	req->req_data.tag = *tag;
+	req->req_data.context = context;
+
+#ifdef PSM_CUDA
+	req->is_buf_gpu_mem = len && psmi_cuda_is_buffer_gpu_mem((void*)ubuf);
+	req->cuda_hostbuf_used = 0;
+	if (req->is_buf_gpu_mem) {
+		gpu_mem = 1;
+		psmi_cuda_set_attr_sync_memops(ubuf);
+		if (psmi_cuda_is_needed_rendezvous(proto, len))
+			goto do_rendezvous;
+	}
+#endif
+	flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO];
+
+	if (flags_user & PSM2_MQ_FLAG_SENDSYNC) {
+		goto do_rendezvous;
+	} else if (len <= mq->hfi_thresh_tiny) {
+		scb = mq_alloc_tiny(proto);
+		psmi_assert(scb);
+		ips_scb_opcode(scb) = OPCODE_TINY;
+		ips_set_LMC_LID_choice(proto, scb, len);
+		scb->ips_lrh.khdr.kdeth0 =
+		    ((len & HFI_KHDR_TINYLEN_MASK) << HFI_KHDR_TINYLEN_SHIFT) |
+		    ipsaddr->msgctl->mq_send_seqnum++;
+		ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
+
+		const void *user_buffer = ubuf;
+#ifdef PSM_CUDA
+		if (!req->is_buf_gpu_mem) {
+			mq_copy_tiny_host_mem((uint32_t *) &scb->ips_lrh.hdr_data,
+							  (uint32_t *) user_buffer, len);
+			proto->strat_stats.tiny_cpu_isend++;
+			proto->strat_stats.tiny_cpu_isend_bytes += len;
+		} else {
+			// TBD USER_BUF_GPU only useful for RTS
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
+			/* The following function pins the GPU pages
+			 * and mmaps them into the process virtual
+			 * address space. This allows PSM to issue a
+			 * standard memcpy to move data between HFI
+			 * resources and the GPU.
+			 */
+			if (len <= gdr_copy_limit_send &&
+				NULL != (user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
+						(unsigned long)ubuf, len, 0, proto->ep))) {
+				mq_copy_tiny_host_mem((uint32_t *) &scb->ips_lrh.hdr_data,
+							  (uint32_t *) user_buffer, len);
+				proto->strat_stats.tiny_gdrcopy_isend++;
+				proto->strat_stats.tiny_gdrcopy_isend_bytes += len;
+			} else {
+				user_buffer = ubuf;
+#endif // PSM_CUDA
+				mq_copy_tiny((uint32_t *) &scb->ips_lrh.hdr_data,
+						 (uint32_t *) user_buffer, len);
+#ifdef PSM_CUDA
+				proto->strat_stats.tiny_cuCopy_isend++;
+				proto->strat_stats.tiny_cuCopy_isend_bytes += len;
+			}
+		}
+#else
+		proto->strat_stats.tiny_cpu_isend++;
+		proto->strat_stats.tiny_cpu_isend_bytes += len;
+#endif
+
+		/* If this is a fast-path isend, we cannot allow the mq to
+		 * progress from within the fast-path call, otherwise messages
+		 * will be lost. Therefore, for the fast path we pass PSMI_FALSE,
+		 * which prevents the call to ips_recv_progress_if_busy.
+		 */
+		err = ips_mq_send_envelope(proto, flow, scb, !(flags_internal & PSMI_REQ_FLAG_FASTPATH));
+		if (err != PSM2_OK)
+			return err;
+
+		/* We can mark this op complete since all the data is now copied
+		 * into an SCB that remains live until it is remotely acked */
+		req->state = MQ_STATE_COMPLETE;
+		mq_qq_append(&mq->completed_q, req);
+		_HFI_VDBG
+		    ("[itiny][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p]\n",
+		     psmi_epaddr_get_name(mq->ep->epid),
+		     psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf,
+		     len, tag->tag[0], tag->tag[1], tag->tag[2], req);
+	} else if (len <= flow->frag_size) {
+		uint32_t paylen = len & ~0x3;
+
+		scb = mq_alloc_pkts(proto, 1, 0, 0);
+		psmi_assert(scb);
+		ips_scb_opcode(scb) = OPCODE_SHORT;
+		ips_set_LMC_LID_choice(proto, scb, len);
+		scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->mq_send_seqnum++;
+		scb->ips_lrh.hdr_data.u32w1 = len;
+		ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
+		const void * user_buffer = ubuf;
+		int used_send_dma = 0;
+#ifdef PSM_CUDA
+		if (req->is_buf_gpu_mem) {
+			// TBD USER_BUF_GPU only useful for RTS
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
+			if (len <= gdr_copy_limit_send &&
+				NULL != (user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
+					(unsigned long)ubuf, len , 0, proto->ep))) {
+				/* init req so ips_proto_mq_eager_complete can unmap */
+				req->req_data.buf = (uint8_t*)ubuf;
+				req->req_data.buf_len = len;
+				req->req_data.send_msglen = len;
+				proto->strat_stats.short_gdrcopy_isend++;
+				proto->strat_stats.short_gdrcopy_isend_bytes += len;
+			} else {
+				user_buffer = ubuf;
+				if (len > proto->iovec_gpu_thresh_eager) {
+					scb->mr = req->mr = psm2_verbs_reg_mr(proto->mr_cache, 0,
+							proto->ep->verbs_ep.pd,
+							(void*)user_buffer, len,
+							IBV_ACCESS_IS_GPU_ADDR);
+				}
+				if (req->mr) {
+					ips_scb_flags(scb) |= IPS_SEND_FLAG_SEND_MR;
+					req->rts_peer = (psm2_epaddr_t) ipsaddr;
+					ips_scb_cb(scb) = ips_proto_mq_eager_complete;
+					ips_scb_cb_param(scb) = req;
+					used_send_dma = 1;
+					proto->strat_stats.short_gdr_isend++;
+					proto->strat_stats.short_gdr_isend_bytes += len;
+				} else
+				{
+					ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU;
+					// TBD for OPA flow_type could be DMA
+					proto->strat_stats.short_cuCopy_isend++;
+					proto->strat_stats.short_cuCopy_isend_bytes += len;
+				}
+			}
+		} else
+#endif // PSM_CUDA
+		{
+			if (len > proto->iovec_thresh_eager) {
+				scb->mr = req->mr = psm2_verbs_reg_mr(
+						proto->mr_cache, 0,
+						proto->ep->verbs_ep.pd,
+						(void*)user_buffer, len, 0);
+			}
+			if (req->mr) {
+				ips_scb_flags(scb) |= IPS_SEND_FLAG_SEND_MR;
+				req->rts_peer = (psm2_epaddr_t) ipsaddr;
+				ips_scb_cb(scb) = ips_proto_mq_eager_complete;
+				ips_scb_cb_param(scb) = req;
+				used_send_dma = 1;
+				proto->strat_stats.short_dma_cpu_isend++;
+				proto->strat_stats.short_dma_cpu_isend_bytes += len;
+			} else
+			{
+				// TBD for OPA flow_type could be DMA
+				proto->strat_stats.short_copy_cpu_isend++;
+				proto->strat_stats.short_copy_cpu_isend_bytes += len;
+			}
+		}
+
+		ips_scb_buffer(scb) = (void *)user_buffer;
+
+		ips_scb_length(scb) = paylen;
+		if (len > paylen) {
+			/* there are nonDW bytes, copy to header */
+			mq_copy_tiny
+				((uint32_t *)&scb->ips_lrh.hdr_data.u32w0,
+				(uint32_t *)((uintptr_t)ubuf + paylen),
+				len - paylen);
+
+			/* for complete callback */
+			req->send_msgoff = len - paylen;
+		} else {
+			req->send_msgoff = 0;
+		}
+
+		/*
+		 * Need ack for send side completion because we
+		 * send from user buffer.  ACK will trigger scb callback
+		 */
+		ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ;
+
+		/* If this is a fast-path isend, we cannot allow the mq to
+		 * progress from within the fast-path call, otherwise messages
+		 * will be lost. Therefore, for the fast path we pass PSMI_FALSE,
+		 * which prevents the call to ips_recv_progress_if_busy.
+		 */
+		err = ips_mq_send_envelope(proto, flow, scb, !(flags_internal & PSMI_REQ_FLAG_FASTPATH));
+		if (err != PSM2_OK)
+			return err;
+
+		/*
+		 * It is safe to check whether the buffer address in
+		 * 'scb' has changed: when this scb is done, the
+		 * address is set to NULL as the scb is put back in the
+		 * scb pool. Even if the same scb is re-used, it cannot
+		 * be set to this 'buf' address because the app has not
+		 * yet had a chance to start another IO.  TBD - possible
+		 * odd scenario if the app started this IO in the middle
+		 * of a buffer on which it also had a multi-packet eager
+		 * IO working; then we could see the same user_buffer
+		 * from two IOs here.
+		 */
+		if (used_send_dma) {
+			// noop, callback already setup
+		} else
+		// TBD - could avoid this if/else code by always marking
+		// callback above, but may be less efficient for msgrate
+		if (ips_scb_buffer(scb) == (void *)user_buffer) {
+			/* continue to send from user buffer */
+			ips_scb_cb(scb) = ips_proto_mq_eager_complete;
+			ips_scb_cb_param(scb) = req;
+		} else {
+			/* mark the message done */
+			req->state = MQ_STATE_COMPLETE;
+			mq_qq_append(&mq->completed_q, req);
+		}
+		_HFI_VDBG
+		    ("[ishrt][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p]\n",
+		     psmi_epaddr_get_name(mq->ep->epid),
+		     psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf,
+		     len, tag->tag[0], tag->tag[1], tag->tag[2], req);
+	} else if (len <= mq->hfi_thresh_rv) {
+		req->send_msgoff = 0;
+		req->rts_peer = (psm2_epaddr_t) ipsaddr;
+#ifdef PSM_CUDA
+		if (req->is_buf_gpu_mem) {
+			// TBD - no upper bound for send DMA here
+			// non-priority MR and will fallback if can't register
+			if (len > proto->iovec_gpu_thresh_eager) {
+				req->mr = psm2_verbs_reg_mr(proto->mr_cache, 0,
+						proto->ep->verbs_ep.pd,
+						(void*)ubuf, len, IBV_ACCESS_IS_GPU_ADDR);
+			}
+			if (req->mr) {
+				proto->strat_stats.eager_gdr_isend++;
+				proto->strat_stats.eager_gdr_isend_bytes += len;
+			} else
+			{
+				proto->strat_stats.eager_cuCopy_isend++;
+				proto->strat_stats.eager_cuCopy_isend_bytes += len;
+			}
+		} else
+#endif
+		{
+			// TBD - no upper bound for send DMA here
+			// non-priority MR and will fallback if can't register
+			if (len > proto->iovec_thresh_eager) {
+				req->mr = psm2_verbs_reg_mr(proto->mr_cache, 0,
+						proto->ep->verbs_ep.pd,
+						(void*)ubuf, len, 0);
+			}
+			if (req->mr) {
+				proto->strat_stats.eager_dma_cpu_isend++;
+				proto->strat_stats.eager_dma_cpu_isend_bytes += len;
+			} else
+			{
+				// TBD for OPA flow_type could be DMA
+				proto->strat_stats.eager_copy_cpu_isend++;
+				proto->strat_stats.eager_copy_cpu_isend_bytes += len;
+			}
+		}
+		err = ips_ptl_mq_eager(proto, req, flow, tag, ubuf, len);
+		if (err != PSM2_OK)
+			return err;
+
+		_HFI_VDBG
+		    ("[ilong][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p]\n",
+		     psmi_epaddr_get_name(mq->ep->epid),
+		     psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf,
+		     len, tag->tag[0], tag->tag[1], tag->tag[2], req);
+	} else {		/* skip eager accounting below */
+do_rendezvous:
+#ifdef PSM_CUDA
+		if (gpu_mem) {
+			proto->strat_stats.rndv_gpu_isend++;
+			proto->strat_stats.rndv_gpu_isend_bytes += len;
+		} else {
+#endif
+			proto->strat_stats.rndv_cpu_isend++;
+			proto->strat_stats.rndv_cpu_isend_bytes += len;
+#ifdef PSM_CUDA
+		}
+#endif
+
+		mq->stats.tx_num++;
+		mq->stats.tx_rndv_num++;
+		// we count tx_rndv_bytes as we get CTS
+
+		err = ips_ptl_mq_rndv(proto, req, ipsaddr, ubuf, len);
+		*req_o = req;
+		return err;
+	}
+
+	*req_o = req;
+	mq->stats.tx_num++;
+	mq->stats.tx_eager_num++;
+	mq->stats.tx_eager_bytes += len;
+#ifdef PSM_CUDA
+	if (gpu_mem) {
+		mq->stats.tx_eager_gpu_num++;
+		mq->stats.tx_eager_gpu_bytes += len;
+	} else {
+		mq->stats.tx_eager_cpu_num++;
+		mq->stats.tx_eager_cpu_bytes += len;
+	}
+#endif
+
+	return err;
+}
+
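+/*
+ * Blocking MQ send. Strategy selection mirrors ips_proto_mq_isend above,
+ * but completion is awaited in place: a short send either copies into a
+ * bounce buffer or blocks until the scb releases the user buffer, while
+ * eager and rendezvous sends wait via psmi_mq_wait_internal.
+ */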
+psm2_error_t
+ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags,
+		  psm2_mq_tag_t *tag, const void *ubuf, uint32_t len)
+{
+	psm2_error_t err = PSM2_OK;
+	struct ips_proto *proto;
+	struct ips_flow *flow;
+	ips_epaddr_t *ipsaddr;
+	ips_scb_t *scb;
+#if defined(PSM_OPA) || defined(PSM_CUDA)
+	int gpu_mem = 0;
+#endif
+
+	_HFI_VDBG("ubuf=%p len=%u flags=0x%x\n", ubuf, len, flags);
+
+	if (len >= mepaddr->proto->multirail_thresh_load_balance) {
+		ipsaddr = ((ips_epaddr_t *) mepaddr)->msgctl->ipsaddr_next;
+		ipsaddr->msgctl->ipsaddr_next = ipsaddr->next;
+	} else {
+		ipsaddr = (ips_epaddr_t *)mepaddr;
+	}
+	psmi_assert(ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED);
+
+	proto = ((psm2_epaddr_t) ipsaddr)->proto;
+
+	psmi_assert(proto->msgflowid < EP_FLOW_LAST);
+
+#ifdef PSM_CUDA
+	gpu_mem = len && psmi_cuda_is_buffer_gpu_mem((void*)ubuf);
+	if (gpu_mem) {
+		psmi_cuda_set_attr_sync_memops(ubuf);
+		if (psmi_cuda_is_needed_rendezvous(proto, len))
+			goto do_rendezvous;
+	}
+#endif
+	flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO];
+
+	if (flags & PSM2_MQ_FLAG_SENDSYNC) {
+		goto do_rendezvous;
+	} else if (len <= mq->hfi_thresh_tiny) {
+		scb = mq_alloc_tiny(proto);
+		psmi_assert(scb);
+		ips_scb_opcode(scb) = OPCODE_TINY;
+		ips_set_LMC_LID_choice(proto, scb, len);
+		scb->ips_lrh.khdr.kdeth0 =
+		    ((len & HFI_KHDR_TINYLEN_MASK) << HFI_KHDR_TINYLEN_SHIFT) |
+		    ipsaddr->msgctl->mq_send_seqnum++;
+		ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
+#ifdef PSM_CUDA
+		const void *user_buffer = ubuf;
+		if (!gpu_mem) {
+			mq_copy_tiny_host_mem((uint32_t *) &scb->ips_lrh.hdr_data,
+							  (uint32_t *) user_buffer, len);
+			proto->strat_stats.tiny_cpu_send++;
+			proto->strat_stats.tiny_cpu_send_bytes += len;
+		} else {
+			// TBD USER_BUF_GPU only useful for RTS
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
+			/* The following function pins the GPU pages
+			 * and mmaps them into the process virtual
+			 * address space. This allows PSM to issue a
+			 * standard memcpy to move data between HFI
+			 * resources and the GPU.
+			 */
+			if (len <= gdr_copy_limit_send &&
+				NULL != (user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
+						(unsigned long)ubuf, len, 0, proto->ep))) {
+				mq_copy_tiny_host_mem((uint32_t *) &scb->ips_lrh.hdr_data,
+							  (uint32_t *) user_buffer, len);
+				proto->strat_stats.tiny_gdrcopy_send++;
+				proto->strat_stats.tiny_gdrcopy_send_bytes += len;
+			} else {
+				user_buffer = ubuf;
+#endif // PSM_CUDA
+				mq_copy_tiny
+					((uint32_t *) &scb->ips_lrh.hdr_data,
+					     (uint32_t *) ubuf, len);
+#ifdef PSM_CUDA
+				proto->strat_stats.tiny_cuCopy_send++;
+				proto->strat_stats.tiny_cuCopy_send_bytes += len;
+			}
+		}
+#else
+		proto->strat_stats.tiny_cpu_send++;
+		proto->strat_stats.tiny_cpu_send_bytes += len;
+#endif
+		err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE);
+		if (err != PSM2_OK)
+			return err;
+
+		_HFI_VDBG("[tiny][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x]\n",
+			  psmi_epaddr_get_name(mq->ep->epid),
+			  psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid),
+			  ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]);
+	} else if (len <= flow->frag_size) {
+		uint32_t paylen = len & ~0x3;
+
+		scb = mq_alloc_pkts(proto, 1, 0, 0);
+		psmi_assert(scb);
+		ips_scb_opcode(scb) = OPCODE_SHORT;
+		ips_set_LMC_LID_choice(proto, scb, len);
+		scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->mq_send_seqnum++;
+		scb->ips_lrh.hdr_data.u32w1 = len;
+		ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag);
+
+		const void * user_buffer = ubuf;
+#ifdef PSM_CUDA
+		int converted = 0;
+		if (gpu_mem) {
+			// TBD USER_BUF_GPU only useful for RTS
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
+			/* will use PIO */
+			if (len <= gdr_copy_limit_send &&
+				NULL != (user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
+						(unsigned long)ubuf, len, 0, proto->ep))) {
+				converted = 1;
+				proto->strat_stats.short_gdrcopy_send++;
+				proto->strat_stats.short_gdrcopy_send_bytes += len;
+			} else {
+				user_buffer = ubuf;
+				if (len > proto->iovec_gpu_thresh_eager_blocking) {
+					scb->mr = psm2_verbs_reg_mr(proto->mr_cache, 0,
+						proto->ep->verbs_ep.pd,
+						(void*)user_buffer, len, IBV_ACCESS_IS_GPU_ADDR);
+				} else
+					scb->mr = NULL;
+				if (scb->mr) {
+					ips_scb_flags(scb) |= IPS_SEND_FLAG_SEND_MR;
+					ips_scb_cb(scb) = ips_proto_scb_mr_complete;
+					ips_scb_cb_param(scb) = scb;
+					proto->strat_stats.short_gdr_send++;
+					proto->strat_stats.short_gdr_send_bytes += len;
+				} else
+				{
+					ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU;
+					// TBD for OPA flow_type could be DMA
+					proto->strat_stats.short_cuCopy_send++;
+					proto->strat_stats.short_cuCopy_send_bytes += len;
+				}
+			}
+		} else
+#endif // PSM_CUDA
+		{
+			if (len > proto->iovec_thresh_eager_blocking) {
+				scb->mr = psm2_verbs_reg_mr(proto->mr_cache, 0,
+						proto->ep->verbs_ep.pd,
+						(void*)user_buffer, len, 0);
+			} else
+				scb->mr = NULL;
+			if (scb->mr) {
+				ips_scb_flags(scb) |= IPS_SEND_FLAG_SEND_MR;
+				ips_scb_cb(scb) = ips_proto_scb_mr_complete;
+				ips_scb_cb_param(scb) = scb;
+				proto->strat_stats.short_dma_cpu_send++;
+				proto->strat_stats.short_dma_cpu_send_bytes += len;
+			} else
+			{
+				// TBD for OPA flow_type could be DMA
+				proto->strat_stats.short_copy_cpu_send++;
+				proto->strat_stats.short_copy_cpu_send_bytes += len;
+			}
+		}
+
+		ips_scb_buffer(scb) = (void *)user_buffer;
+		ips_scb_length(scb) = paylen;
+		if (len > paylen) {
+			/* there are nonDW bytes, copy to header */
+			mq_copy_tiny
+				((uint32_t *)&scb->ips_lrh.hdr_data.u32w0,
+				(uint32_t *)((uintptr_t)ubuf + paylen),
+				len - paylen);
+		}
+
+		/*
+		 * Need ack for send side completion because we
+		 * send from user buffer. ACK will trigger scb callback
+		 */
+		ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ;
+		err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE);
+		if (err != PSM2_OK)
+			return err;
+
+		/*
+		 * It is safe to check whether the buffer address in
+		 * 'scb' has changed: when this scb is done, the
+		 * address is set to NULL as the scb is put back in the
+		 * scb pool. Even if the same scb is re-used, it cannot
+		 * be set to this 'ubuf' address because the app has not
+		 * yet had a chance to start another IO.  TBD - possible
+		 * odd scenario if the app started this IO in the middle
+		 * of a buffer on which it also had a multi-packet eager
+		 * IO working; then we could see the same user_buffer
+		 * from two IOs here.
+		 */
+		if (ips_scb_buffer(scb) == (void *)user_buffer) {
+			if ((ips_scb_flags(scb) & IPS_SEND_FLAG_SEND_MR) ||
+			    paylen > proto->scb_bufsize ||
+			    !ips_scbctrl_bufalloc(scb)) {
+				/* sdma transfer (can't change user buffer),
+				 * or, payload is larger than bounce buffer,
+				 * or, can't allocate bounce buffer,
+				 * send from user buffer till complete */
+				PSMI_BLOCKUNTIL(mq->ep, err,
+					ips_scb_buffer(scb) != (void*)user_buffer);
+				if (err > PSM2_OK_NO_PROGRESS)
+					return err;
+				err = PSM2_OK;
+			} else {
+				/* PIO and now have a bounce buffer */
+				/* copy to bounce buffer */
+#ifdef PSM_CUDA
+				if (!gpu_mem || converted) {
+					// host address
+					ips_shortcpy_host_mem
+						(ips_scb_buffer(scb),
+					 	(void*)user_buffer, paylen);
+				} else {
+					// cuda address - undo flags so PIO
+					// doesn't cuMemcpy too
+					ips_scb_flags(scb) &= ~IPS_SEND_FLAG_PAYLOAD_BUF_GPU;
+					// TBD - could call cuMemcpy directly
+					ips_shortcpy(ips_scb_buffer(scb),
+					 	(void*)user_buffer, paylen);
+				}
+#else
+				ips_shortcpy(ips_scb_buffer(scb),
+					 (void*)user_buffer, paylen);
+#endif
+			}
+		}
+		_HFI_VDBG("[shrt][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x]\n",
+			  psmi_epaddr_get_name(mq->ep->epid),
+			  psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid),
+			  ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]);
+
+	} else if (len <= mq->hfi_thresh_rv) {
+		psm2_mq_req_t req;
+
+		/* Block until we can get a req */
+		PSMI_BLOCKUNTIL(mq->ep, err,
+				(req =
+				 psmi_mq_req_alloc(mq, MQE_TYPE_SEND)));
+		if (err > PSM2_OK_NO_PROGRESS)
+			return err;
+
+#ifdef PSM_CUDA
+		req->cuda_hostbuf_used = 0;
+		if (gpu_mem) {
+			req->is_buf_gpu_mem = 1;
+			// TBD - no upper bound for send DMA here
+			// non-priority MR and will fallback if can't register
+			if (len > proto->iovec_gpu_thresh_eager_blocking) {
+				req->mr = psm2_verbs_reg_mr(proto->mr_cache, 0,
+						proto->ep->verbs_ep.pd,
+						(void*)ubuf, len, IBV_ACCESS_IS_GPU_ADDR);
+			}
+			if (req->mr) {
+				proto->strat_stats.eager_gdr_send++;
+				proto->strat_stats.eager_gdr_send_bytes += len;
+			} else
+			{
+				proto->strat_stats.eager_cuCopy_send++;
+				proto->strat_stats.eager_cuCopy_send_bytes += len;
+			}
+		} else {
+			req->is_buf_gpu_mem = 0;
+#else
+		{
+#endif // PSM_CUDA
+			// TBD - no upper bound for send DMA here
+			// non-priority MR and will fallback if can't register
+			if (len > proto->iovec_thresh_eager_blocking) {
+				req->mr = psm2_verbs_reg_mr(proto->mr_cache, 0,
+						proto->ep->verbs_ep.pd,
+						(void*)ubuf, len, 0);
+			}
+			if (req->mr) {
+				proto->strat_stats.eager_dma_cpu_send++;
+				proto->strat_stats.eager_dma_cpu_send_bytes += len;
+			} else
+			{
+				// TBD for OPA flow_type could be DMA
+				proto->strat_stats.eager_copy_cpu_send++;
+				proto->strat_stats.eager_copy_cpu_send_bytes += len;
+			}
+		}
+		req->type |= MQE_TYPE_WAITING;
+		req->req_data.send_msglen = len;
+		req->req_data.tag = *tag;
+		req->send_msgoff = 0;
+		req->flags_user = flags;
+		req->flags_internal |= PSMI_REQ_FLAG_IS_INTERNAL;
+		req->rts_peer = (psm2_epaddr_t) ipsaddr;
+
+		err = ips_ptl_mq_eager(proto, req, flow, tag, ubuf, len);
+		if (err != PSM2_OK)
+			return err;
+
+		psmi_mq_wait_internal(&req);
+
+		_HFI_VDBG("[long][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x]\n",
+			  psmi_epaddr_get_name(mq->ep->epid),
+			  psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid),
+			  ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]);
+	} else {
+		psm2_mq_req_t req;
+do_rendezvous:
+		/* Block until we can get a req */
+		PSMI_BLOCKUNTIL(mq->ep, err,
+				(req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND)));
+		if (err > PSM2_OK_NO_PROGRESS)
+			return err;
+
+		req->type |= MQE_TYPE_WAITING;
+		req->req_data.tag = *tag;
+		req->flags_user = flags;
+		req->flags_internal |= PSMI_REQ_FLAG_IS_INTERNAL;
+
+#ifdef PSM_CUDA
+		if (gpu_mem) {
+			req->is_buf_gpu_mem = 1;
+			proto->strat_stats.rndv_gpu_send++;
+			proto->strat_stats.rndv_gpu_send_bytes += len;
+		} else {
+			req->is_buf_gpu_mem = 0;
+#endif
+			proto->strat_stats.rndv_cpu_send++;
+			proto->strat_stats.rndv_cpu_send_bytes += len;
+#ifdef PSM_CUDA
+		}
+#endif
+
+		mq->stats.tx_num++;
+		mq->stats.tx_rndv_num++;
+		// we count tx_rndv_bytes as we get CTS
+
+		err = ips_ptl_mq_rndv(proto, req, ipsaddr, ubuf, len);
+		if (err != PSM2_OK)
+			return err;
+		psmi_mq_wait_internal(&req);
+		return err;	/* skip accounting, done separately at completion time */
+	}
+
+	mq->stats.tx_num++;
+	mq->stats.tx_eager_num++;
+	mq->stats.tx_eager_bytes += len;
+#ifdef PSM_CUDA
+	if (gpu_mem) {
+		mq->stats.tx_eager_gpu_num++;
+		mq->stats.tx_eager_gpu_bytes += len;
+	} else {
+		mq->stats.tx_eager_cpu_num++;
+		mq->stats.tx_eager_cpu_bytes += len;
+	}
+#endif
+
+	return err;
+}
+
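+/*
+ * Receiver-side callback invoked once an RTS matches a posted or
+ * unexpected receive. It decides how the bulk data will move: if the
+ * message already arrived as RTS payload, is below the rendezvous
+ * threshold, the peer is not connected, or expected TID receive is
+ * disabled, reply with a CTS requesting LONG_DATA; otherwise register
+ * the receive buffer and issue a TID RDMA get.
+ */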
+static
+psm2_error_t
+ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted)
+{
+	psm2_epaddr_t epaddr = req->rts_peer;
+	struct ips_proto *proto = epaddr->proto;
+
+	/* We have a match.
+	 * The data may already have arrived with the first packet.
+	 * If we're doing eager-based rendezvous, just send back the sreq and
+	 * length and have the sender complete the send.
+	 */
+	PSM2_LOG_MSG("entering");
+	_HFI_MMDBG("rts_match_callback\n");
+	// while matching RTS we set both recv and send msglen to min of the two
+	psmi_assert(req->req_data.recv_msglen == req->req_data.send_msglen);
+	req->mq->stats.rx_user_num++;
+	req->mq->stats.rx_user_bytes += req->req_data.recv_msglen;
+#ifdef PSM_CUDA
+	/* Cases where we do not use TIDs:
+	 * 0) Received full message as payload to RTS, CTS is just an ack
+	 * 1) Recv on a host buffer, Send on a gpu buffer and len is less than 3 bytes
+	 * 2) Recv on a host buffer, Send on a host buffer and len is less than hfi_thresh_rv
+	 * 3) Recv on gpu buf and len is less than 3 bytes
+	 * 4) Expected protocol not initialized.
+	 */
+	if (req->recv_msgoff >= req->req_data.recv_msglen
+	    || (!req->is_buf_gpu_mem && ((req->is_sendbuf_gpu_mem &&
+	     req->req_data.recv_msglen <= GPUDIRECT_THRESH_RV)||
+	    (!req->is_sendbuf_gpu_mem &&
+	     req->req_data.recv_msglen <= proto->mq->hfi_thresh_rv))) ||
+	    (req->is_buf_gpu_mem && req->req_data.recv_msglen <= GPUDIRECT_THRESH_RV) ||
+		! ips_epaddr_connected((ips_epaddr_t *) epaddr) ||
+	    proto->protoexp == NULL) {	/* no expected tid receive */
+#else // PSM_CUDA
+	if (req->recv_msgoff >= req->req_data.recv_msglen ||
+		! ips_epaddr_connected((ips_epaddr_t *) epaddr) ||
+	    req->req_data.recv_msglen <= proto->mq->hfi_thresh_rv || /* below rv threshold */
+	    proto->protoexp == NULL) {  /* no expected tid receive */
+#endif // PSM_CUDA
+//do_long_data:
+		// send CTS asking for use of LONG_DATA send of large message
+
+		/* There is no ordering requirement; try to push the CTS request
+		 * directly, and queue it for a later retry if that fails. */
+		_HFI_VDBG("pushing CTS recv off %u len %u"
+#ifdef PSM_CUDA
+			" rGPU %u sGPU %u"
+#endif
+			" rv thresh %u"
+			" conn %u"
+			" epaddr %p RDMA %u\n",
+			req->recv_msgoff, req->req_data.recv_msglen,
+#ifdef PSM_CUDA
+			req->is_buf_gpu_mem, req->is_sendbuf_gpu_mem,
+#endif
+			proto->mq->hfi_thresh_rv,
+			ips_epaddr_connected((ips_epaddr_t *) epaddr),
+			epaddr, proto->protoexp != NULL);
+
+		if (req->recv_msgoff < req->req_data.recv_msglen) {
+			// RTS did not have the message as payload
+#ifdef PSM_CUDA
+			if (req->is_buf_gpu_mem) {
+				proto->strat_stats.rndv_long_gpu_recv++;
+				proto->strat_stats.rndv_long_gpu_recv_bytes += req->req_data.recv_msglen;
+			} else {
+#endif
+				proto->strat_stats.rndv_long_cpu_recv++;
+				proto->strat_stats.rndv_long_cpu_recv_bytes += req->req_data.recv_msglen;
+#ifdef PSM_CUDA
+			}
+#endif
+		}
+		if (ips_proto_mq_push_cts_req(proto, req) != PSM2_OK) {
+			struct ips_pend_sends *pends = &proto->pend_sends;
+			struct ips_pend_sreq *sreq =
+			    psmi_mpool_get(proto->pend_sends_pool);
+			psmi_assert(sreq != NULL);
+			if (sreq == NULL)
+			{
+				PSM2_LOG_MSG("leaving");
+				return PSM2_NO_MEMORY;
+			}
+			sreq->type = IPS_PENDSEND_EAGER_REQ;
+			sreq->req = req;
+
+			STAILQ_INSERT_TAIL(&pends->pendq, sreq, next);
+			psmi_timer_request(proto->timerq, &pends->timer,
+					   PSMI_TIMER_PRIO_1);
+		}
+	} else {
+		// send CTS asking for use of TID send of large message
+		// register buffer we will use as destination for remote RDMA Write
+		// We choose not to register memory when recv is posted since
+		// that could pin memory for a long time waiting for a tag match
+		// and recv buffers could be much larger than the messages they tag
+		// match with, resulting in unnecessary MR registration.
+		// req->req_data.buf is app buffer
+		// req->req_data.buf_len is app buffer length
+		// req->req_data.send_msglen is agreed amount to transfer (<= buf_len)
+		// TBD - if we were tight on MR resources, this could tie up more
+		// resources than needed, in which case skipping this and registering
+		// per CTS below could be better
+		// TBD - it might help MR cache hit rate if we registered the whole
+		// receive buffer (req->req_data.buf_len), this way large receive
+		// buffers which match smaller messages can get MR cache hit for
+		// various sized messages which may arrive in the buffer
+		psmi_assert(req->req_data.send_msglen);	// 0 len uses LONG_DATA above
+#ifdef PSM_CUDA
+		// for a GPU receive buffer we need to sort things out at a lower level
+		// since we may use a host bounce buffer for RDMA and need to register it
+		if (! req->is_buf_gpu_mem) {
+#else
+		{
+#endif
+			req->mr = psm2_verbs_reg_mr(proto->mr_cache, 0,
+					proto->ep->verbs_ep.pd,
+					req->req_data.buf, req->req_data.send_msglen,
+					IBV_ACCESS_RDMA|IBV_ACCESS_REMOTE_WRITE);
+			if (! req->mr) {
+				// ips_protoexp_tid_get_from_token will try to get MR again
+				// and will retry via ips_tid_pendtids_timer_callback.  So we
+				// can just fall through with req->mr == NULL.
+				// The alternative would be to goto and force use of LONG_DATA
+				//goto do_long_data;
+			} else {
+				_HFI_MMDBG("rbuf registered: addr %p len %d rkey 0x%x\n",  req->req_data.buf, req->req_data.send_msglen, req->mr->rkey);
+			}
+		}
+		_HFI_VDBG("matched rts, trying TID\n");
+		ips_protoexp_tid_get_from_token(proto->protoexp, req->req_data.buf,
+						req->req_data.recv_msglen, epaddr,
+						req->rts_reqidx_peer,
+						req->type & MQE_TYPE_WAITING_PEER ?
+						IPS_PROTOEXP_TIDGET_PEERWAIT :
+						0, ips_proto_mq_rv_complete,
+						req);
+	}
+
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK;
+}
+
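+/*
+ * Send an OPCODE_LONG_CTS with no TID payload; the sender treats an empty
+ * CTS as a request to push the message as LONG_DATA packets (see
+ * ips_proto_mq_handle_cts). Returns PSM2_OK_NO_PROGRESS when no scb is
+ * available so the caller can queue the request for a timer retry.
+ */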
+psm2_error_t
+ips_proto_mq_push_cts_req(struct ips_proto *proto, psm2_mq_req_t req)
+{
+	ips_epaddr_t *ipsaddr = (ips_epaddr_t *) (req->rts_peer);
+	struct ips_flow *flow;
+	ips_scb_t *scb;
+	ptl_arg_t *args;
+
+	PSM2_LOG_MSG("entering");
+	psmi_assert(proto->msgflowid < EP_FLOW_LAST);
+	flow = &ipsaddr->flows[proto->msgflowid];
+	scb = ips_scbctrl_alloc(&proto->scbc_egr, 1, 0, 0);
+	if (scb == NULL)
+	{
+		PSM2_LOG_MSG("leaving");
+		return PSM2_OK_NO_PROGRESS;
+	}
+	args = (ptl_arg_t *) scb->ips_lrh.data;
+
+	ips_scb_opcode(scb) = OPCODE_LONG_CTS;
+	scb->ips_lrh.khdr.kdeth0 = 0;
+	args[0].u32w0 = psmi_mpool_get_obj_index(req);
+	args[1].u32w1 = req->req_data.recv_msglen;
+	args[1].u32w0 = req->rts_reqidx_peer;
+
+	PSM2_LOG_EPM(OPCODE_LONG_CTS,PSM2_LOG_TX, proto->ep->epid,
+		    flow->ipsaddr->epaddr.epid ,"req->rts_reqidx_peer: %d",
+		    req->rts_reqidx_peer);
+	proto->epaddr_stats.cts_long_data_send++;
+
+	ips_proto_flow_enqueue(flow, scb);
+	flow->flush(flow, NULL);
+
+	/* have already received enough bytes */
+	if (req->recv_msgoff == req->req_data.recv_msglen) {
+		ips_proto_mq_rv_complete(req);
+	}
+
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK;
+}
+
+// rendezvous using LONG DATA "eager push" instead of TID
+// If we run out of resources (scbs), this is called again to continue
+psm2_error_t
+ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req)
+{
+	psm2_error_t err = PSM2_OK;
+	uintptr_t buf = (uintptr_t) req->req_data.buf + req->recv_msgoff;
+	ips_epaddr_t *ipsaddr = (ips_epaddr_t *) (req->rts_peer);
+	uint32_t nbytes_left = req->req_data.send_msglen - req->recv_msgoff;
+	uint32_t nbytes_sent = 0;
+	uint32_t nbytes_this, chunk_size;
+	uint16_t frag_size, unaligned_bytes;
+#ifdef PSM_CUDA
+	int converted = 0;
+#endif
+	struct ips_flow *flow;
+	ips_scb_t *scb;
+	int dostats = !req->recv_msgoff; // if continuing, don't update stats
+
+	psmi_assert(nbytes_left > 0);
+
+	PSM2_LOG_MSG("entering.");
+	{
+		/* use PIO transfer */
+		flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO];
+		chunk_size = frag_size = flow->frag_size;
+#ifdef PSM_CUDA
+		if (req->is_buf_gpu_mem) {
+			// rare, but when RV connection not available, we
+			// can select LONG DATA for a GPU send buffer.  Normally
+			// won't happen for GPU send >3 unless RDMA disabled
+			// or RV not connected
+			// TBD - no upper bound for send DMA here
+			// non-priority MR and will fallback if can't register
+			if (!req->mr && req->req_data.send_msglen > proto->iovec_gpu_thresh_eager) {
+				req->mr = psm2_verbs_reg_mr(proto->mr_cache, 0,
+					proto->ep->verbs_ep.pd,
+					req->req_data.buf, req->req_data.send_msglen, 
+					IBV_ACCESS_IS_GPU_ADDR);
+			}
+			if (req->mr) {
+				proto->strat_stats.rndv_long_gdr_send += dostats;
+				proto->strat_stats.rndv_long_gdr_send_bytes += dostats*req->req_data.send_msglen;
+			} else
+#ifdef PSM_CUDA
+				// for GPU send buffer <= 3, receiver can select
+				// LONG DATA and we can use GDRCopy
+				// must repin per attempt
+			if (req->req_data.send_msglen <= gdr_copy_limit_send &&
+				0 != (buf =  (uintptr_t)gdr_convert_gpu_to_host_addr(GDR_FD,
+					(unsigned long)req->req_data.buf,
+					req->req_data.send_msglen, 0, proto->ep))) {
+				converted = 1;
+				proto->strat_stats.rndv_long_gdrcopy_send += dostats;
+				proto->strat_stats.rndv_long_gdrcopy_send_bytes += dostats*req->req_data.send_msglen;
+			} else {
+				buf = (uintptr_t) req->req_data.buf + req->recv_msgoff;
+#else
+			{
+#endif
+				proto->strat_stats.rndv_long_cuCopy_send += dostats;
+				proto->strat_stats.rndv_long_cuCopy_send_bytes += dostats*req->req_data.send_msglen;
+			}
+		} else {
+#endif
+			// TBD - no upper bound for send DMA here
+			// non-priority MR and will fallback if can't register
+			if (!req->mr && req->req_data.send_msglen > proto->iovec_thresh_eager) {
+				req->mr = psm2_verbs_reg_mr(proto->mr_cache, 0,
+					proto->ep->verbs_ep.pd,
+					req->req_data.buf,
+					req->req_data.send_msglen, 0);
+			}
+			if (req->mr) {
+				proto->strat_stats.rndv_long_dma_cpu_send += dostats;
+				proto->strat_stats.rndv_long_dma_cpu_send_bytes += dostats*req->req_data.send_msglen;
+			} else
+			{
+				proto->strat_stats.rndv_long_copy_cpu_send += dostats;
+				proto->strat_stats.rndv_long_copy_cpu_send_bytes += dostats*req->req_data.send_msglen;
+			}
+#ifdef PSM_CUDA
+		}
+#endif
+	}
+
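+	/*
+	 * Push the remaining bytes as a chain of LONG_DATA scbs: each scb
+	 * covers up to chunk_size bytes split into frag_size packets, any
+	 * 1-3 unaligned leading bytes ride in the packet header (mdata),
+	 * and the final scb gets the eager-complete callback (plus ACKREQ
+	 * for a single-packet scb).
+	 */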
+	do {
+		/*
+		 * Don't call a progression routine such as
+		 * ips_recv_progress_if_busy() in this loop;
+		 * it would recursively invoke this function.
+		 */
+
+		/*
+		 * When tid code path is enabled, we don't allocate scbc_rv
+		 * objects. If the message is less than the hfi_thresh_rv,
+		 * we normally use eager protocol to do the transfer.
+		 * However, if it is sync send, we use the rendezvous
+		 * rts/cts/rts-data protocol.
+		 * In this case, because scbc_rv is null,
+		 * we use scbc_egr instead.
+		 */
+
+		scb = ips_scbctrl_alloc(proto->scbc_rv ? proto->scbc_rv
+					: &proto->scbc_egr, 1, 0, 0);
+		if (scb == NULL) {
+			err = PSM2_OK_NO_PROGRESS;
+			break;
+		}
+		ips_scb_opcode(scb) = OPCODE_LONG_DATA;
+		scb->ips_lrh.khdr.kdeth0 = 0;
+		scb->ips_lrh.data[0].u32w0 = req->rts_reqidx_peer;
+		scb->ips_lrh.data[1].u32w1 = req->req_data.send_msglen;
+
+		/* attach unaligned bytes to the packet header */
+		unaligned_bytes = nbytes_left & 0x3;
+		if (unaligned_bytes) {
+#ifdef PSM_CUDA
+			if (!req->is_buf_gpu_mem || converted)
+				mq_copy_tiny_host_mem((uint32_t *)&scb->ips_lrh.mdata,
+					(uint32_t *)buf, unaligned_bytes);
+			else
+#endif
+			mq_copy_tiny((uint32_t *)&scb->ips_lrh.mdata,
+				(uint32_t *)buf, unaligned_bytes);
+
+			/* position to send */
+			buf += unaligned_bytes;
+			req->recv_msgoff += unaligned_bytes;
+			psmi_assert(req->recv_msgoff < 4);
+
+			/* for complete callback */
+			req->send_msgoff += unaligned_bytes;
+
+			nbytes_left -= unaligned_bytes;
+			nbytes_sent += unaligned_bytes;
+		}
+		scb->ips_lrh.data[1].u32w0 = req->recv_msgoff;
+		ips_scb_buffer(scb) = (void *)buf;
+		if (req->mr) {
+			scb->mr = req->mr;
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_SEND_MR;
+		}
+#ifdef PSM_CUDA
+		// SDMA identifies GPU buffers itself. But PIO path needs flags
+		if (req->is_buf_gpu_mem) {
+			if (!req->mr && !converted)
+				ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU;
+			// TBD USER_BUF_GPU only useful for RTS
+			ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
+		}
+#endif
+
+		scb->frag_size = frag_size;
+		nbytes_this = min(chunk_size, nbytes_left);
+		if (nbytes_this > 0)
+			scb->nfrag = (nbytes_this + frag_size - 1) / frag_size;
+		else
+			scb->nfrag = 1;
+
+		if (scb->nfrag > 1) {
+			ips_scb_length(scb) = frag_size;
+			scb->nfrag_remaining = scb->nfrag;
+			scb->chunk_size =
+				scb->chunk_size_remaining = nbytes_this;
+		} else
+			ips_scb_length(scb) = nbytes_this;
+
+		buf += nbytes_this;
+		req->recv_msgoff += nbytes_this;
+		nbytes_sent += nbytes_this;
+		nbytes_left -= nbytes_this;
+		if (nbytes_left == 0) {
+			/* because of scb callback, use eager complete */
+			ips_scb_cb(scb) = ips_proto_mq_eager_complete;
+			ips_scb_cb_param(scb) = req;
+
+			/* Set ACKREQ if single packet per scb. For multi
+			 * packets per scb, it is SDMA, driver will set
+			 * ACKREQ in last packet, we only need ACK for
+			 * last packet.
+			 */
+			if (scb->nfrag == 1)
+				ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ;
+		} else {
+			req->send_msgoff += nbytes_this;
+		}
+		ips_proto_flow_enqueue(flow, scb);
+		if (flow->transfer == PSM_TRANSFER_PIO) {
+			/* we need to flush the pio pending queue as quick as possible */
+			flow->flush(flow, NULL);
+		}
+
+	} while (nbytes_left);
+
+
+	PSM2_LOG_MSG("leaving.");
+
+	return err;
+}
+
+// received a CTS
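+// A CTS carrying a tid session list as payload selects TID RDMA; a CTS
+// without payload asks us to push the data as LONG_DATA packets.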
+int
+ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev)
+{
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	struct ips_proto *proto = rcv_ev->proto;
+	psm2_mq_t mq = proto->ep->mq;
+	struct ips_flow *flow;
+	psm2_mq_req_t req;
+	uint32_t paylen;
+
+	/*
+	 * if PSN does not match, drop the packet.
+	 */
+	PSM2_LOG_MSG("entering");
+	if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev))
+	{
+		PSM2_LOG_MSG("leaving");
+		return IPS_RECVHDRQ_CONTINUE;
+	}
+	req = psmi_mpool_find_obj_by_index(mq->sreq_pool, p_hdr->data[1].u32w0);
+	psmi_assert(req != NULL);
+
+	/*
+	 * if there is payload, it is expected tid protocol
+	 * with tid session info as the payload.
+	 */
+	paylen = ips_recvhdrq_event_paylen(rcv_ev);
+	if (paylen > 0) {
+		// we will use TID RDMA
+		ips_tid_session_list *payload =
+			ips_recvhdrq_event_payload(rcv_ev);
+		psmi_assert(paylen == 0 || payload);
+		PSM2_LOG_EPM(OPCODE_LONG_CTS,PSM2_LOG_RX,rcv_ev->ipsaddr->epaddr.epid,
+			    mq->ep->epid,"p_hdr->data[1].u32w0 %d",
+			    p_hdr->data[1].u32w0);
+		proto->epaddr_stats.cts_rdma_recv++;
+
+#ifdef PSM_CUDA
+		psmi_assert(p_hdr->data[1].u32w1 > min(cuda_thresh_rndv, mq->hfi_thresh_rv));	// msglen
+#else
+		psmi_assert(p_hdr->data[1].u32w1 > mq->hfi_thresh_rv);	// msglen
+#endif
+		psmi_assert(proto->protoexp != NULL);
+
+		/* ptl_req_ptr will be set to each tidsendc */
+		if (req->ptl_req_ptr == NULL) {
+			req->req_data.send_msglen = p_hdr->data[1].u32w1;
+			req->mq->stats.tx_rndv_bytes += req->req_data.send_msglen;
+		}
+		psmi_assert(req->req_data.send_msglen == p_hdr->data[1].u32w1);
+
+		if (! req->mr
+#ifdef PSM_CUDA
+			&& ! req->cuda_hostbuf_used
+#endif
+			) {
+			// we predicted use of LONG DATA and remote side chose RDMA
+			// or we failed to register memory previously.
+			req->mr = psm2_verbs_reg_mr(proto->mr_cache, 0,
+							proto->ep->verbs_ep.pd,
+							req->req_data.buf, req->req_data.send_msglen, IBV_ACCESS_RDMA
+#ifdef PSM_CUDA
+								| (req->is_buf_gpu_mem?IBV_ACCESS_IS_GPU_ADDR:0)
+#endif
+							);
+			// if we still don't have an MR, we will try again later
+		}
+		_HFI_MMDBG("ips_proto_mq_handle_cts for TID CTS\n");
+		if (ips_tid_send_handle_tidreq(proto->protoexp,
+					       rcv_ev->ipsaddr, req, p_hdr->data[0],
+					       p_hdr->mdata, payload, paylen) == 0) {
+			proto->psmi_logevent_tid_send_reqs.next_warning = 0;
+		} else {
+			flow = &rcv_ev->ipsaddr->flows[ips_proto_flowid(p_hdr)];
+			flow->recv_seq_num.psn_num -= 1;                            /* Decrement seq number to NAK proper CTS */
+			ips_proto_send_nak((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+			static unsigned int msg_cnt = 0;
+			if (msg_cnt++ == 0) {                                       /* Report the message only once */
+				_HFI_INFO("PSM3 memory shortage detected. Please consider modifying PSM3_MEMORY setting\n");
+			}
+			return PSM2_EP_NO_RESOURCES;
+		}
+	} else {
+		// we will use LONG DATA push
+		PSM2_LOG_EPM(OPCODE_LONG_CTS,PSM2_LOG_RX,rcv_ev->ipsaddr->epaddr.epid,
+			    mq->ep->epid, "long data");
+		proto->epaddr_stats.cts_long_data_recv++;
+		req->rts_reqidx_peer = p_hdr->data[0].u32w0; /* eager receive only */
+		req->req_data.send_msglen = p_hdr->data[1].u32w1;
+
+		if (req->send_msgoff >= req->req_data.send_msglen) {
+// TBD - should cleanup from pin as needed
+			/* already sent enough bytes, may truncate so using >= */
+			/* RTS payload is only used for CPU memory */
+			proto->strat_stats.rndv_rts_copy_cpu_send++;
+			proto->strat_stats.rndv_rts_copy_cpu_send_bytes += req->req_data.send_msglen;
+			ips_proto_mq_rv_complete(req);
+		} else {
+			req->mq->stats.tx_rndv_bytes += (req->req_data.send_msglen - req->send_msgoff);
+#ifdef RNDV_MOD
+			// If we have an MR due to incorrect prediction of RDMA
+			// release it if can't be used for send DMA or don't
+			// want send DMA.  push_rts_data will attempt to use
+			// for send DMA if req->mr != NULL.
+			if (req->mr &&
+				(!psm2_verbs_user_space_mr(req->mr)
+#ifdef PSM_CUDA
+				|| (req->is_buf_gpu_mem && req->req_data.send_msglen <= proto->iovec_gpu_thresh_eager)
+				|| (!req->is_buf_gpu_mem && req->req_data.send_msglen <= proto->iovec_thresh_eager)
+#else
+				|| (req->req_data.send_msglen <= proto->iovec_thresh_eager)
+#endif
+				)) {
+
+				_HFI_MMDBG("Using LONG_DATA, releasing RV RDMA MR: %p rkey: 0x%x\n", req->mr, req->mr->rkey);
+				psm2_verbs_release_mr(req->mr);
+				req->mr = NULL;
+				ips_tid_mravail_callback(req->rts_peer->proto);
+			}
+#endif
+
+			if (ips_proto_mq_push_rts_data(proto, req) != PSM2_OK) {
+				/* There is no ordering requirement; we tried to push the
+				 * RTS data directly and did not finish, so queue it for
+				 * a later retry. */
+				struct ips_pend_sreq *sreq =
+					psmi_mpool_get(proto->pend_sends_pool);
+				psmi_assert(sreq != NULL);
+
+				sreq->type = IPS_PENDSEND_EAGER_DATA;
+				sreq->req = req;
+				STAILQ_INSERT_TAIL(&proto->pend_sends.pendq, sreq, next);
+				/* Make sure it's processed by timer */
+				psmi_timer_request(proto->timerq, &proto->pend_sends.timer,
+						   PSMI_TIMER_PRIO_1);
+			}
+		}
+	}
+
+	flow = &rcv_ev->ipsaddr->flows[ips_proto_flowid(p_hdr)];
+	if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+	    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+	ips_proto_process_ack(rcv_ev);
+
+	PSM2_LOG_MSG("leaving");
+	return IPS_RECVHDRQ_CONTINUE;
+}
+
+// received an RTS
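+// Match it against posted receives. An unmatched RTS is revisited on the
+// next poll loop in the hope a slightly late recv gets posted, and
+// out-of-order arrivals are parked on the outoforder_q until their
+// sequence number comes up.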
+int
+ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev)
+{
+	int ret = IPS_RECVHDRQ_CONTINUE;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)];
+	psm2_mq_t mq = rcv_ev->proto->mq;
+	ips_msgctl_t *msgctl = ipsaddr->msgctl;
+	enum ips_msg_order msgorder;
+	char *payload;
+	uint32_t paylen;
+	psm2_mq_req_t req;
+
+	/*
+	 * if PSN does not match, drop the packet.
+	 */
+	PSM2_LOG_MSG("entering");
+	_HFI_MMDBG("got rts\n");
+	if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev))
+	{
+		PSM2_LOG_MSG("leaving");
+		return IPS_RECVHDRQ_CONTINUE;
+	}
+
+	msgorder = ips_proto_check_msg_order(ipsaddr, flow,
+		__le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK,
+		&ipsaddr->msgctl->mq_recv_seqnum);
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE))
+	{
+		PSM2_LOG_MSG("leaving");
+		return IPS_RECVHDRQ_REVISIT;
+	}
+
+	payload = ips_recvhdrq_event_payload(rcv_ev);
+	paylen = ips_recvhdrq_event_paylen(rcv_ev);
+	/* either no payload or whole message */
+	psmi_assert(paylen == 0 || paylen >= p_hdr->data[1].u32w1);
+
+	/*
+	 * We can't have past message sequence here. For eager message,
+	 * it must always have an eager queue matching because even in
+	 * truncation case the code logic will wait till all packets
+	 * have been received.
+	 */
+	psmi_assert(msgorder != IPS_MSG_ORDER_PAST);
+
+	_HFI_VDBG("tag=%llx reqidx_peer=%d, msglen=%d\n",
+		  (long long)p_hdr->data[0].u64,
+		  p_hdr->data[1].u32w0, p_hdr->data[1].u32w1);
+
+	int rc = psmi_mq_handle_rts(mq,
+				    (psm2_epaddr_t) &ipsaddr->msgctl->
+				    master_epaddr,
+				    (psm2_mq_tag_t *) p_hdr->tag,
+				    &rcv_ev->proto->strat_stats,
+				    p_hdr->data[1].u32w1, payload, paylen,
+				    msgorder, ips_proto_mq_rts_match_callback,
+				    &req);
+	if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) {
+		// as a performance optimization, the 1st time we process an
+		// unmatched RTS, we ask to REVISIT it next poll loop hoping for
+		// a match due to a slightly late MPI_recv call
+		uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask;
+
+		flow->recv_seq_num.psn_num =
+			(flow->recv_seq_num.psn_num - 1) & psn_mask;
+		ipsaddr->msgctl->mq_recv_seqnum--;
+
+		PSM2_LOG_MSG("leaving");
+		return IPS_RECVHDRQ_REVISIT;
+	}
+
+	rcv_ev->proto->epaddr_stats.rts_recv++;
+
+	req->rts_peer = (psm2_epaddr_t) ipsaddr;
+	req->rts_reqidx_peer = p_hdr->data[1].u32w0;
+	if (req->req_data.send_msglen > mq->hfi_thresh_rv)
+	{
+		PSM2_LOG_EPM(OPCODE_LONG_RTS,PSM2_LOG_RX,req->rts_peer->epid,mq->ep->epid,
+			    "req->rts_reqidx_peer: %d",req->rts_reqidx_peer);
+	}
+	if (p_hdr->flags & IPS_SEND_FLAG_BLOCKING)
+		req->type |= MQE_TYPE_WAITING_PEER;
+
+#ifdef PSM_CUDA
+	if (p_hdr->flags & IPS_SEND_FLAG_USER_BUF_GPU)
+		req->is_sendbuf_gpu_mem = 1;
+	else
+		req->is_sendbuf_gpu_mem = 0;
+#endif
+
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) {
+		/* for out of order matching only */
+		req->msg_seqnum =
+		    __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK;
+		req->ptl_req_ptr = (void *)msgctl;
+
+		msgctl->outoforder_count++;
+		mq_qq_append(&mq->outoforder_q, req);
+
+		ret = IPS_RECVHDRQ_BREAK;
+	} else {
+		ipsaddr->msg_toggle = 0;
+		if (rc == MQ_RET_MATCH_OK)
+			ips_proto_mq_rts_match_callback(req, 1);
+		/* XXX if blocking, break out of progress loop */
+
+		if (msgctl->outoforder_count)
+			ips_proto_mq_handle_outoforder_queue(mq, msgctl);
+
+		if (rc == MQ_RET_UNEXP_OK)
+			ret = IPS_RECVHDRQ_BREAK;
+	}
+
+	if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+	    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+	ips_proto_process_ack(rcv_ev);
+
+	PSM2_LOG_MSG("leaving");
+	return ret;
+}
+
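+/*
+ * TINY packet handler: the entire payload travels in the header's
+ * hdr_data words, with the length encoded in the kdeth0 TINYLEN field.
+ */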
+int
+ips_proto_mq_handle_tiny(struct ips_recvhdrq_event *rcv_ev)
+{
+	int ret = IPS_RECVHDRQ_CONTINUE;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)];
+	psm2_mq_t mq = rcv_ev->proto->mq;
+	ips_msgctl_t *msgctl = ipsaddr->msgctl;
+	enum ips_msg_order msgorder;
+	char *payload;
+	uint32_t paylen;
+	psm2_mq_req_t req;
+
+	/*
+	 * if PSN does not match, drop the packet.
+	 */
+	if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev))
+		return IPS_RECVHDRQ_CONTINUE;
+
+	msgorder = ips_proto_check_msg_order(ipsaddr, flow,
+		__le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK,
+		&ipsaddr->msgctl->mq_recv_seqnum);
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE))
+		return IPS_RECVHDRQ_REVISIT;
+
+	payload = (void *)&p_hdr->hdr_data;
+	paylen = (__le32_to_cpu(p_hdr->khdr.kdeth0) >>
+		  HFI_KHDR_TINYLEN_SHIFT) & HFI_KHDR_TINYLEN_MASK;
+
+	/*
+	 * We can't have past message sequence here. For eager message,
+	 * it must always have an eager queue matching because even in
+	 * truncation case the code logic will wait till all packets
+	 * have been received.
+	 */
+	psmi_assert(msgorder != IPS_MSG_ORDER_PAST);
+
+	_HFI_VDBG("tag=%08x.%08x.%08x opcode=%x, msglen=%d\n",
+		  p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2],
+		  OPCODE_TINY, (p_hdr->khdr.kdeth0 >> HFI_KHDR_TINYLEN_SHIFT)
+				& HFI_KHDR_TINYLEN_MASK);
+
+	/* store in req below too! */
+	int rc = psmi_mq_handle_envelope(mq,
+				(psm2_epaddr_t) &ipsaddr->msgctl->master_epaddr,
+				(psm2_mq_tag_t *) p_hdr->tag,
+				&rcv_ev->proto->strat_stats,  paylen, 0,
+				payload, paylen, msgorder, OPCODE_TINY, &req);
+	if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) {
+		uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask;
+
+		flow->recv_seq_num.psn_num =
+			(flow->recv_seq_num.psn_num - 1) & psn_mask;
+		ipsaddr->msgctl->mq_recv_seqnum--;
+
+		return IPS_RECVHDRQ_REVISIT;
+	}
+
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) {
+		/* for out of order matching only */
+		req->msg_seqnum =
+		    __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK;
+		req->ptl_req_ptr = (void *)msgctl;
+
+		msgctl->outoforder_count++;
+		mq_qq_append(&mq->outoforder_q, req);
+
+		ret = IPS_RECVHDRQ_BREAK;
+	} else {
+		ipsaddr->msg_toggle = 0;
+
+		if (msgctl->outoforder_count)
+			ips_proto_mq_handle_outoforder_queue(mq, msgctl);
+
+		if (rc == MQ_RET_UNEXP_OK)
+			ret = IPS_RECVHDRQ_BREAK;
+	}
+
+	if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+	    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+	ips_proto_process_ack(rcv_ev);
+
+	return ret;
+}
+
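+/*
+ * SHORT packet handler: a single packet carries the dword-aligned payload;
+ * any 1-3 unaligned tail bytes arrive in hdr_data.u32w0 alongside the
+ * message length in hdr_data.u32w1.
+ */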
+int
+ips_proto_mq_handle_short(struct ips_recvhdrq_event *rcv_ev)
+{
+	int ret = IPS_RECVHDRQ_CONTINUE;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)];
+	psm2_mq_t mq = rcv_ev->proto->mq;
+	ips_msgctl_t *msgctl = ipsaddr->msgctl;
+	enum ips_msg_order msgorder;
+	char *payload;
+	uint32_t paylen;
+	psm2_mq_req_t req;
+
+	/*
+	 * if PSN does not match, drop the packet.
+	 */
+	if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev))
+		return IPS_RECVHDRQ_CONTINUE;
+
+	msgorder = ips_proto_check_msg_order(ipsaddr, flow,
+		__le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK,
+		&ipsaddr->msgctl->mq_recv_seqnum);
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE))
+		return IPS_RECVHDRQ_REVISIT;
+
+	payload = ips_recvhdrq_event_payload(rcv_ev);
+	paylen = ips_recvhdrq_event_paylen(rcv_ev);
+	psmi_assert(paylen == 0 || payload);
+
+	/*
+	 * We can't have past message sequence here. For eager message,
+	 * it must always have an eager queue matching because even in
+	 * truncation case the code logic will wait till all packets
+	 * have been received.
+	 */
+	psmi_assert(msgorder != IPS_MSG_ORDER_PAST);
+
+	_HFI_VDBG("tag=%08x.%08x.%08x opcode=%x, msglen=%d\n",
+		  p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2],
+		  OPCODE_SHORT, p_hdr->hdr_data.u32w1);
+
+	/* store in req below too! */
+	int rc = psmi_mq_handle_envelope(mq,
+				(psm2_epaddr_t) &ipsaddr->msgctl->master_epaddr,
+				(psm2_mq_tag_t *) p_hdr->tag,
+				&rcv_ev->proto->strat_stats,
+				p_hdr->hdr_data.u32w1, p_hdr->hdr_data.u32w0,
+				payload, paylen, msgorder, OPCODE_SHORT, &req);
+	if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) {
+		uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask;
+
+		flow->recv_seq_num.psn_num =
+			(flow->recv_seq_num.psn_num - 1) & psn_mask;
+		ipsaddr->msgctl->mq_recv_seqnum--;
+
+		return IPS_RECVHDRQ_REVISIT;
+	}
+
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) {
+		/* for out of order matching only */
+		req->msg_seqnum =
+		    __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK;
+		req->ptl_req_ptr = (void *)msgctl;
+
+		msgctl->outoforder_count++;
+		mq_qq_append(&mq->outoforder_q, req);
+
+		ret = IPS_RECVHDRQ_BREAK;
+	} else {
+		ipsaddr->msg_toggle = 0;
+
+		if (msgctl->outoforder_count)
+			ips_proto_mq_handle_outoforder_queue(mq, msgctl);
+
+		if (rc == MQ_RET_UNEXP_OK)
+			ret = IPS_RECVHDRQ_BREAK;
+	}
+
+	if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+	    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+	ips_proto_process_ack(rcv_ev);
+
+	return ret;
+}
+
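+/*
+ * EAGER packet handler: only the first fragment of a message goes through
+ * envelope matching; later fragments find their request via mq_eager_match
+ * and are absorbed directly by psmi_mq_handle_data.
+ */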
+int
+ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev)
+{
+	int ret = IPS_RECVHDRQ_CONTINUE;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)];
+	psm2_mq_t mq = rcv_ev->proto->mq;
+	ips_msgctl_t *msgctl = ipsaddr->msgctl;
+	enum ips_msg_order msgorder;
+	char *payload;
+	uint32_t paylen;
+	psm2_mq_req_t req;
+
+	/*
+	 * if PSN does not match, drop the packet.
+	 */
+	if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev))
+		return IPS_RECVHDRQ_CONTINUE;
+
+	msgorder = ips_proto_check_msg_order(ipsaddr, flow,
+		__le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK,
+		&ipsaddr->msgctl->mq_recv_seqnum);
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE))
+		return IPS_RECVHDRQ_REVISIT;
+
+	payload = ips_recvhdrq_event_payload(rcv_ev);
+	paylen = ips_recvhdrq_event_paylen(rcv_ev);
+	psmi_assert(paylen == 0 || payload);
+
+	if (msgorder == IPS_MSG_ORDER_PAST ||
+			msgorder == IPS_MSG_ORDER_FUTURE_RECV) {
+		req = mq_eager_match(mq, msgctl,
+		    __le32_to_cpu(p_hdr->khdr.kdeth0)&HFI_KHDR_MSGSEQ_MASK);
+		/*
+		 * This is a future or past message sequence with a matching
+		 * request in the eager queue: handle the packet data and
+		 * return, rather than continuing on to envelope matching.
+		 * A past message sequence must always have a match; the
+		 * error is caught below.
+		 */
+		if (req) {
+			// u32w0 is the offset - only count recv msgs on 1st pkt in msg
+#ifdef PSM_CUDA
+			int use_gdrcopy = 0;
+			if (!req->is_buf_gpu_mem) {
+				if (req->state == MQ_STATE_UNEXP) {
+					if (p_hdr->data[1].u32w0<4) rcv_ev->proto->strat_stats.eager_sysbuf_recv++;
+					rcv_ev->proto->strat_stats.eager_sysbuf_recv_bytes += paylen;
+				} else {
+					if (p_hdr->data[1].u32w0<4) rcv_ev->proto->strat_stats.eager_cpu_recv++;
+					rcv_ev->proto->strat_stats.eager_cpu_recv_bytes += paylen;
+				}
+			} else if (PSMI_USE_GDR_COPY_RECV(paylen)) {
+				use_gdrcopy = 1;
+				if (p_hdr->data[1].u32w0<4) rcv_ev->proto->strat_stats.eager_gdrcopy_recv++;
+				rcv_ev->proto->strat_stats.eager_gdrcopy_recv_bytes += paylen;
+			} else {
+				if (p_hdr->data[1].u32w0<4) rcv_ev->proto->strat_stats.eager_cuCopy_recv++;
+				rcv_ev->proto->strat_stats.eager_cuCopy_recv_bytes += paylen;
+			}
+			psmi_mq_handle_data(mq, req,
+				p_hdr->data[1].u32w0, payload, paylen,
+				use_gdrcopy,
+				rcv_ev->proto->ep);
+#else
+			if (req->state == MQ_STATE_UNEXP) {
+				if (p_hdr->data[1].u32w0<4) rcv_ev->proto->strat_stats.eager_sysbuf_recv++;
+				rcv_ev->proto->strat_stats.eager_sysbuf_recv_bytes += paylen;
+			} else {
+				if (p_hdr->data[1].u32w0<4) rcv_ev->proto->strat_stats.eager_cpu_recv++;
+				rcv_ev->proto->strat_stats.eager_cpu_recv_bytes += paylen;
+			}
+			psmi_mq_handle_data(mq, req,
+				p_hdr->data[1].u32w0, payload, paylen);
+#endif // PSM_CUDA
+
+			if (msgorder == IPS_MSG_ORDER_FUTURE_RECV)
+				ret = IPS_RECVHDRQ_BREAK;
+
+			if ((__be32_to_cpu(p_hdr->bth[2]) &
+			    IPS_SEND_FLAG_ACKREQ) ||
+			    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+				ips_proto_send_ack((struct ips_recvhdrq *)
+					rcv_ev->recvq, flow);
+
+			ips_proto_process_ack(rcv_ev);
+
+			return ret;
+		}
+
+		psmi_assert(msgorder == IPS_MSG_ORDER_FUTURE_RECV);
+		/*
+		 * For future message sequence, since there is no eager
+		 * queue matching yet, this must be the first packet for
+		 * the message sequence. And of course, expected message
+		 * sequence is always the first packet for the sequence.
+		 */
+	}
+
+	/*
+	 * We can't have past message sequence here. For eager message,
+	 * it must always have an eager queue matching because even in
+	 * truncation case the code logic will wait till all packets
+	 * have been received.
+	 */
+	psmi_assert(msgorder != IPS_MSG_ORDER_PAST);
+
+	_HFI_VDBG("tag=%08x.%08x.%08x opcode=%x, msglen=%d\n",
+		p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2],
+		OPCODE_EAGER, p_hdr->hdr_data.u32w1);
+
+	/* store in req below too! */
+	int rc = psmi_mq_handle_envelope(mq,
+				(psm2_epaddr_t) &ipsaddr->msgctl->master_epaddr,
+				(psm2_mq_tag_t *) p_hdr->tag,
+				&rcv_ev->proto->strat_stats,
+				p_hdr->hdr_data.u32w1, p_hdr->hdr_data.u32w0,
+				payload, paylen, msgorder, OPCODE_EAGER, &req);
+	if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) {
+		uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask;
+
+		flow->recv_seq_num.psn_num =
+			(flow->recv_seq_num.psn_num - 1) & psn_mask;
+		ipsaddr->msgctl->mq_recv_seqnum--;
+
+		return IPS_RECVHDRQ_REVISIT;
+	}
+
+	/* for both outoforder matching and eager matching */
+	req->msg_seqnum =
+		    __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK;
+	req->ptl_req_ptr = (void *)msgctl;
+
+	if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) {
+		msgctl->outoforder_count++;
+		mq_qq_append(&mq->outoforder_q, req);
+
+		ret = IPS_RECVHDRQ_BREAK;
+	} else {
+		ipsaddr->msg_toggle = 0;
+
+		if (msgctl->outoforder_count)
+			ips_proto_mq_handle_outoforder_queue(mq, msgctl);
+
+		if (rc == MQ_RET_UNEXP_OK)
+			ret = IPS_RECVHDRQ_BREAK;
+	}
+
+	if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+	    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+	ips_proto_process_ack(rcv_ev);
+
+	return ret;
+}
+
+/*
+ * Progress the out of order queue to see if any message matches
+ * current receiving sequence number.
+ */
+void
+ips_proto_mq_handle_outoforder_queue(psm2_mq_t mq, ips_msgctl_t *msgctl)
+{
+	psm2_mq_req_t req;
+
+	do {
+		req =
+		    mq_ooo_match(&mq->outoforder_q, msgctl,
+				 msgctl->mq_recv_seqnum);
+		if (req == NULL)
+			return;
+
+		msgctl->outoforder_count--;
+		msgctl->mq_recv_seqnum++;
+
+		psmi_mq_handle_outoforder(mq, req);
+
+	} while (msgctl->outoforder_count > 0);
+
+	return;
+}
+
+// LONG_DATA packet handler
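+// data[0].u32w0 names the receiver's request, data[1].u32w0 is the byte
+// offset of this packet; an offset below 4 marks the first packet, whose
+// unaligned leading bytes were stashed in the header's mdata field.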
+int
+ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev)
+{
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	psm2_mq_t mq = rcv_ev->proto->mq;
+	char *payload;
+	uint32_t paylen;
+	psm2_mq_req_t req;
+	struct ips_flow *flow;
+#if defined(PSM_CUDA)
+	int use_gdrcopy = 0;
+	struct ips_proto *proto = rcv_ev->proto;
+#endif // PSM_CUDA
+	psmi_copy_tiny_fn_t psmi_copy_tiny_fn = mq_copy_tiny;
+
+
+	/*
+	 * if PSN does not match, drop the packet.
+	 */
+	if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev))
+		return IPS_RECVHDRQ_CONTINUE;
+
+	req = psmi_mpool_find_obj_by_index(mq->rreq_pool, p_hdr->data[0].u32w0);
+	psmi_assert(req != NULL);
+	// while matching RTS we set both recv and send msglen to min of the two
+	psmi_assert(req->req_data.recv_msglen == req->req_data.send_msglen);
+	psmi_assert(p_hdr->data[1].u32w1 == req->req_data.send_msglen);
+
+	payload = ips_recvhdrq_event_payload(rcv_ev);
+	paylen = ips_recvhdrq_event_paylen(rcv_ev);
+	psmi_assert(paylen == 0 || payload);
+
+#ifdef PSM_CUDA
+	// cpu stats already tracked when sent CTS
+	if (req->is_buf_gpu_mem) {
+		req->req_data.buf = req->user_gpu_buffer;
+		// 1st packet with any unaligned data we handle here
+		if (p_hdr->data[1].u32w0 < 4) {
+			void *buf;
+			if (PSMI_USE_GDR_COPY_RECV(paylen + p_hdr->data[1].u32w0) &&
+				NULL != (buf = gdr_convert_gpu_to_host_addr(GDR_FD,
+						(unsigned long)req->user_gpu_buffer,
+						paylen + p_hdr->data[1].u32w0, 1, proto->ep))) {
+				req->req_data.buf = buf;
+				psmi_copy_tiny_fn = mq_copy_tiny_host_mem;
+				proto->strat_stats.rndv_long_gdr_recv++;
+				proto->strat_stats.rndv_long_gdr_recv_bytes += paylen;
+			} else {
+				proto->strat_stats.rndv_long_cuCopy_recv++;
+				proto->strat_stats.rndv_long_cuCopy_recv_bytes += paylen;
+			}
+		} else if (PSMI_USE_GDR_COPY_RECV(paylen)) {
+			// let mq_handle_data do the conversion
+			use_gdrcopy = 1;
+			//proto->strat_stats.rndv_long_gdr_recv++;
+			proto->strat_stats.rndv_long_gdr_recv_bytes += paylen;
+		} else {
+			if (p_hdr->data[1].u32w0 < 4) proto->strat_stats.rndv_long_cuCopy_recv++;
+			proto->strat_stats.rndv_long_cuCopy_recv_bytes += paylen;
+		}
+	}
+#endif
+	/*
+	 * if a packet has a very small offset, it must have unaligned data
+	 * attached in the packet header, and this must be the first packet
+	 * for that message.
+	 */
+	if (p_hdr->data[1].u32w0 < 4 && p_hdr->data[1].u32w0 > 0) {
+		psmi_assert(p_hdr->data[1].u32w0 == (req->req_data.send_msglen&0x3));
+		psmi_copy_tiny_fn((uint32_t *)req->req_data.buf,
+				(uint32_t *)&p_hdr->mdata,
+				p_hdr->data[1].u32w0);
+		req->send_msgoff += p_hdr->data[1].u32w0;
+	}
+
+	psmi_mq_handle_data(mq, req, p_hdr->data[1].u32w0, payload, paylen
+#ifdef PSM_CUDA
+				, use_gdrcopy, rcv_ev->proto->ep);
+#else
+				);
+#endif
+
+	flow = &rcv_ev->ipsaddr->flows[ips_proto_flowid(p_hdr)];
+	if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ||
+	    (flow->flags & IPS_FLOW_FLAG_GEN_BECN))
+		ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow);
+
+	ips_proto_process_ack(rcv_ev);
+
+	return IPS_RECVHDRQ_CONTINUE;
+}
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_params.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_params.h
new file mode 100644
index 0000000000000000000000000000000000000000..ffdfd229dfdece6a36a439b2b7ac1414190b41de
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_params.h
@@ -0,0 +1,248 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PROTO_PARAMS_H
+#define _IPS_PROTO_PARAMS_H
+
+/*
+ * send method: dma, pio;
+ * recv method: tid, egr;
+ *
+ * send-recv mode combinations: 1=on, 0=off
+ * A: dma:1, pio=1, tid=1, egr=1;
+ * B: dma:0, pio=1, tid=1, egr=1;
+ * C: dma:1, pio=0, tid=1, egr=1;
+ * D: dma:1, pio=1, tid=0, egr=1;
+ * E: dma:0, pio=1, tid=0, egr=1;
+ * F: dma:1, pio=0, tid=0, egr=1;
+ *
+ * message packet type:
+ * T: tiny; S: short; E: eager;
+ * LR: long rts; LC: long cts; LD: long data;
+ * ED: expected data; EC: expected completion;
+ * C: ctrl msg;
+ *
+ * send,recv method for each packet type and each send-recv mode
+ * -------------------------------------------------------------------
+ * |    |  A       | B       | C       | D       | E       | F       |
+ * -------------------------------------------------------------------
+ * | T  |  pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |
+ * -------------------------------------------------------------------
+ * | S  |  pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |
+ * -------------------------------------------------------------------
+ * | E  |  pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |<threshold
+ * -------------------------------------------------------------------
+ * | E  |  dma,egr | pio,egr | dma,egr | dma,egr | pio,egr | dma,egr |>threshold
+ * -------------------------------------------------------------------
+ * | LR |  pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |
+ * -------------------------------------------------------------------
+ * | LC |  pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |
+ * -------------------------------------------------------------------
+ * | LD |  x       | x       | x       | pio,egr | pio,egr | dma,egr |<threshold
+ * -------------------------------------------------------------------
+ * | LD |  x       | x       | x       | dma,egr | pio,egr | dma,egr |>threshold
+ * -------------------------------------------------------------------
+ * | ED |  dma,tid | pio,tid | dma,tid | x       | x       | x       |
+ * -------------------------------------------------------------------
+ * | EC |  pio,egr | pio,egr | dma,egr | x       | x       | x       |
+ * -------------------------------------------------------------------
+ * | C  |  pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |
+ * -------------------------------------------------------------------
+ */
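+
+/* Example reading of the table: in mode B (pio-only sends), expected data
+ * (ED) is sent pio and received via tid, while the other packet types valid
+ * in that mode all use pio,egr regardless of the eager threshold. */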
+
+/* Constants */
+#define BYTE2DWORD_SHIFT 2
+#define LOWER_16_BITS 0xFFFF
+#define PSM_CACHE_LINE_BYTES 64
+#define PSM2_FLOW_CREDITS 64
+#define PSM_CRC_SIZE_IN_BYTES 8
+
+/*
+ * version of protocol header (known to chip also).
+ * This value for OPA is defined in spec.
+ */
+#define IPS_PROTO_VERSION 0x1
+
+/* time conversion macros */
+#define us_2_cycles(us) nanosecs_to_cycles(1000ULL*(us))
+#define ms_2_cycles(ms)  nanosecs_to_cycles(1000000ULL*(ms))
+#define sec_2_cycles(sec) nanosecs_to_cycles(1000000000ULL*(sec))
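+/* e.g. ms_2_cycles(250) yields the timer-cycle count for a 250 ms timeout */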
+
+/* Per-flow flags */
+#define IPS_FLOW_FLAG_NAK_SEND	    0x01
+#define IPS_FLOW_FLAG_PENDING_ACK   0x02
+#define IPS_FLOW_FLAG_PENDING_NAK   0x04
+#define IPS_FLOW_FLAG_GEN_BECN      0x08
+#define IPS_FLOW_FLAG_SKIP_CTS      0x20
+
+/* tid session expected send flags  */
+#define EXP_SEND_FLAG_CLEAR_ALL 0x00
+#define EXP_SEND_FLAG_FREE_TIDS 0x01
+
+#define TIMEOUT_INFINITE 0xFFFFFFFFFFFFFFFFULL	/* 64 bit all-one's  */
+
+/*
+ * scb flags for wire,
+ * Only the lower 6 bits are wire-protocol options
+ */
+#define IPS_SEND_FLAG_NONE              0x00
+#define IPS_SEND_FLAG_BLOCKING		0x01	/* blocking send */
+#define IPS_SEND_FLAG_PKTCKSUM          0x02	/* Has packet checksum */
+#define IPS_SEND_FLAG_AMISTINY		0x04	/* AM is tiny, exclusive */
+
+#ifdef PSM_CUDA
+/* This flag is used to indicate to the receiver when
+ * the send is issued on a device buffer. This helps in
+ * selecting the TID path on the receive side regardless
+ * of the receive buffer's locality. It is used
+ * in a special case where the send is on a device
+ * buffer and the receive is on a host buffer.
+ */
+#define IPS_SEND_FLAG_USER_BUF_GPU      0x08
+#endif
+
+#define IPS_SEND_FLAG_PROTO_OPTS        0x3f	/* only 6bits wire flags */
+
+/* scb flags */
+#define IPS_SEND_FLAG_PENDING		0x0100
+#define IPS_SEND_FLAG_PERSISTENT	0x0200
+#define IPS_SEND_FLAG_NO_LMC		0x0400
+
+#ifdef PSM_CUDA
+/* This flag is used to indicate if the send is on
+ * a GPU buffer. This helps the PIO/SDMA paths detect
+ * whether the payload is a GPU buffer without having
+ * to query the CUDA pointer attributes.
+ */
+#define IPS_SEND_FLAG_PAYLOAD_BUF_GPU   0x0800
+#endif
+
+#define IPS_SEND_FLAG_SEND_MR          0x1000
+
+
+/* 0x10000000, interrupt when done */
+#define IPS_SEND_FLAG_INTR		(1<<HFI_KHDR_INTR_SHIFT)
+
+
+/* 0x80000000, request ack (normal) */
+#define IPS_SEND_FLAG_ACKREQ		(1<<HFI_BTH_ACK_SHIFT)
+
+/* proto flags */
+#define IPS_PROTO_FLAG_SPIO		0x02	/* all spio, no dma */
+#define IPS_PROTO_FLAG_RCVTHREAD	0x04	/* psm recv thread is on */
+#define IPS_PROTO_FLAG_LOOPBACK		0x08	/* psm loopback over hfi */
+#define IPS_PROTO_FLAG_CKSUM            0x10	/* psm checksum is on */
+
+/* Coalesced ACKs (On by default) */
+#define IPS_PROTO_FLAG_COALESCE_ACKS    0x20
+
+/* Use Path Record query (off by default) */
+#define IPS_PROTO_FLAG_QUERY_PATH_REC   0x40
+
+/* Path selection policies:
+ *
+ * (a) Adaptive - Dynamically determine the least loaded paths using various
+ * feedback mechanism - Completion time via ACKs, NAKs, CCA using BECNs.
+ *
+ * (b) Static schemes  -
+ *     (i) static_src  - Use path keyed off source context
+ *    (ii) static_dest - Use path keyed off destination context
+ *   (iii) static_base - Use only the base lid path - default till Oct'09.
+ *
+ * The default is adaptive. If a zero-LMC network is used, there exists
+ * just one path between endpoints, i.e., the (b)(iii) case above.
+ *
+ */
+
+#define IPS_PROTO_FLAG_PPOLICY_ADAPTIVE 0x200
+#define IPS_PROTO_FLAG_PPOLICY_STATIC_SRC 0x400
+#define IPS_PROTO_FLAG_PPOLICY_STATIC_DST 0x800
+#define IPS_PROTO_FLAG_PPOLICY_STATIC_BASE 0x1000
+
+/* All static policies */
+#define IPS_PROTO_FLAG_PPOLICY_STATIC 0x1c00
+
+
+#ifdef PSM_CUDA
+/* Use RNDV (TID) for all message sizes */
+//#define IPS_PROTO_FLAG_ALWAYS_RNDV		0x10000	// unused
+/* Use GPUDirect RDMA for SDMA */
+#define IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND	0x20000
+/* Use GPUDirect RDMA for TID */
+#define IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV	0x40000
+#endif
+
+// These flags select the TID/RDMA expected protocol options.
+// Most are single-bit flags, except for the low 2 bits (RDMA_MASK),
+// which select one of 4 possible modes (including 0 == disable).
+// Proper use of FLAG_ENABLED is (flags & FLAG_ENABLED), which
+// will be true if any of the 3 RDMA modes is selected.
+// The low 2 bits are shared in the connection establishment protocol, so
+// if they change meaning, the ips_proto_connect.c protocol and version must
+// be reviewed and updated accordingly.
+#define IPS_PROTOEXP_FLAG_RDMA_MASK	     0x03
+/* 0 disables RDMA */
+#define IPS_PROTOEXP_FLAG_RDMA_KERNEL    0x01	/* kernel RV module RDMA */
+#define IPS_PROTOEXP_FLAG_RDMA_USER	     0x02	/* user RC QP for RDMA only */
+#define IPS_PROTOEXP_FLAG_RDMA_USER_RC   0x03	/* user RC QP eager & RDMA */
+#define IPS_PROTOEXP_FLAG_ENABLED	     0x03	/* any of the 3 modes */
+#define IPS_PROTOEXP_FLAG_USER_RC_QP(flag) ((flag)&0x02) /* either RC QP mode */
+#define IPS_PROTOEXP_FLAG_KERNEL_QP(flag) \
+		(((flag)&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_KERNEL)
+#define IPS_PROTOEXP_FLAG_RTS_CTS_INTERLEAVE 0x08	/* Interleave RTS handling. */
+#define IPS_PROTOEXP_FLAG_CTS_SERIALIZED 0x10	/* CTS serialized */
+#define IPS_PROTOEXP_FLAGS_DEFAULT	     0x00
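+
+/* Illustrative decoding (using the names defined above):
+ *   flags & IPS_PROTOEXP_FLAG_RDMA_MASK   -> mode 0..3 (0 == RDMA disabled)
+ *   flags & IPS_PROTOEXP_FLAG_ENABLED     -> nonzero for any of modes 1-3
+ *   IPS_PROTOEXP_FLAG_USER_RC_QP(flags)   -> nonzero for modes 2 and 3
+ *   IPS_PROTOEXP_FLAG_KERNEL_QP(flags)    -> true only for mode 1 (kernel RV)
+ */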
+
+#endif /* _IPS_PROTO_PARAMS_H */
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_recv.c b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_recv.c
new file mode 100644
index 0000000000000000000000000000000000000000..e5a5a94f0d4c9f20c32c2d8a028aaeb0f6eb0e13
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_proto_recv.c
@@ -0,0 +1,814 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/* receive service routine for each packet opcode */
+ips_packet_service_fn_t
+ips_packet_service_routine[OPCODE_FUTURE_FROM-OPCODE_RESERVED] = {
+ips_proto_process_unknown_opcode,	/* 0xC0 */
+ips_proto_mq_handle_tiny,		/* OPCODE_TINY */
+ips_proto_mq_handle_short,
+ips_proto_mq_handle_eager,
+ips_proto_mq_handle_rts,                /* RTS */
+ips_proto_mq_handle_cts,                /* CTS */
+ips_proto_mq_handle_data,               /* DATA */
+#ifdef RNDV_MOD
+ips_protoexp_process_err_chk_rdma,		/* ERR_CHK_RDMA */
+ips_protoexp_process_err_chk_rdma_resp,		/* ERR_CHK_RDMA_RESP */
+#else
+ips_proto_process_unknown_opcode,
+ips_proto_process_unknown_opcode,
+#endif
+/* these are control packets */
+ips_proto_process_ack,
+ips_proto_process_nak,
+ips_proto_process_unknown_opcode,		/* BECN */
+ips_proto_process_err_chk,
+// ERR_CHK_GEN only valid for STL100 HW TIDFLOW
+ips_proto_process_unknown_opcode,		/* ERR_CHK_GEN */
+ips_proto_connect_disconnect,
+ips_proto_connect_disconnect,
+ips_proto_connect_disconnect,
+ips_proto_connect_disconnect,
+/* rest are not control packets */
+ips_proto_am,
+ips_proto_am,
+ips_proto_am				/* OPCODE_AM_REPLY */
+};
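+
+/* Dispatch sketch: the receive path indexes this table with
+ * opcode - OPCODE_RESERVED, so given the 0xC0 base noted above, an
+ * OPCODE_TINY (0xC1) packet is handled by ips_proto_mq_handle_tiny. */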
+
+static void ips_report_strays(struct ips_proto *proto);
+
+#define INC_TIME_SPEND(timer)
+
+psm2_error_t ips_proto_recv_init(struct ips_proto *proto)
+{
+	uint32_t interval_secs;
+	union psmi_envvar_val env_stray;
+
+	psmi_getenv("PSM3_STRAY_WARNINTERVAL",
+		    "min secs between stray process warnings",
+		    PSMI_ENVVAR_LEVEL_HIDDEN,
+		    PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)PSM_STRAY_WARN_INTERVAL_DEFAULT_SECS,
+		    &env_stray);
+	interval_secs = env_stray.e_uint;
+	if (interval_secs > 0)
+		proto->stray_warn_interval = sec_2_cycles(interval_secs);
+	else
+		proto->stray_warn_interval = 0;
+
+	return PSM2_OK;
+}
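+
+/* Usage sketch: setting PSM3_STRAY_WARNINTERVAL=60 in the environment
+ * enforces at least 60 seconds between stray-process warnings, while 0
+ * leaves stray_warn_interval at 0 (no interval-based throttling). */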
+
+psm2_error_t ips_proto_recv_fini(struct ips_proto *proto)
+{
+	ips_report_strays(proto);
+	return PSM2_OK;
+}
+
+#define cycles_to_sec_f(cycles)		    \
+	(((double)cycles_to_nanosecs(cycles)) / 1000000000.0)
+
+struct ips_stray_epid {
+	psm2_epid_t epid;
+	uint32_t err_check_bad_sent;
+	uint32_t ipv4_addr;
+	uint32_t pid;
+	uint32_t num_messages;
+	uint64_t t_warn_next;
+	uint64_t t_first;
+	uint64_t t_last;
+};
+
+static
+void ips_report_strays(struct ips_proto *proto)
+{
+	struct ips_stray_epid *sepid;
+	struct psmi_eptab_iterator itor;
+	psmi_epid_itor_init(&itor, PSMI_EP_CROSSTALK);
+
+#if _HFI_DEBUGGING
+	double t_first = 0;
+	double t_last = 0;
+	double t_runtime = 0;
+	if (_HFI_INFO_ON) {
+		t_runtime = cycles_to_sec_f(proto->t_fini - proto->t_init);
+	}
+#endif
+
+	while ((sepid = psmi_epid_itor_next(&itor))) {
+		char ipbuf[INET_ADDRSTRLEN], *ip = NULL;
+		char bufpid[32];
+		uint32_t lid = psm2_epid_nid(sepid->epid);
+#if _HFI_DEBUGGING
+		if (_HFI_INFO_ON) {
+			t_first =
+				cycles_to_sec_f(sepid->t_first - proto->t_init);
+			t_last =
+				cycles_to_sec_f(sepid->t_last - proto->t_init);
+		}
+#endif
+		if (sepid->ipv4_addr)
+			ip = (char *)
+			    inet_ntop(AF_INET, &sepid->ipv4_addr, ipbuf,
+				      sizeof(ipbuf));
+		if (!ip)
+			snprintf(ipbuf, sizeof(ipbuf), "%d (%x)", lid, lid);
+
+		if (sepid->pid)
+			snprintf(bufpid, sizeof(bufpid), "PID=%d", sepid->pid);
+		else
+			snprintf(bufpid, sizeof(bufpid), "PID unknown");
+
+		if (_HFI_INFO_ON) {
+			_HFI_INFO_ALWAYS
+				("Process %s on host %s=%s sent %d stray message(s) and "
+				"was told so %d time(s) (first stray message at %.1fs "
+				"(%d%%), last at %.1fs (%d%%) into application run)\n",
+				bufpid, ip ? "IP" : "LID", ipbuf, sepid->num_messages,
+				sepid->err_check_bad_sent, t_first,
+				(int)(t_first * 100.0 / t_runtime), t_last,
+				(int)(t_last * 100.0 / t_runtime));
+		}
+
+		psmi_epid_remove(PSMI_EP_CROSSTALK, sepid->epid);
+		psmi_free(sepid);
+	}
+	psmi_epid_itor_fini(&itor);
+	return;
+}
+
+/* New scbs now available.  If we have pending sends because we were out of
+ * scbs, put the pendq on the timerq so it can be processed. */
+void ips_proto_rv_scbavail_callback(struct ips_scbctrl *scbc, void *context)
+{
+	struct ips_proto *proto = (struct ips_proto *)context;
+	struct ips_pend_sreq *sreq = STAILQ_FIRST(&proto->pend_sends.pendq);
+	if (sreq != NULL)
+		psmi_timer_request(proto->timerq,
+				   &proto->pend_sends.timer, PSMI_TIMER_PRIO_1);
+	return;
+}
+
+psm2_error_t
+ips_proto_timer_pendq_callback(struct psmi_timer *timer, uint64_t current)
+{
+	psm2_error_t err = PSM2_OK;
+	struct ips_pend_sends *pend_sends =
+	    (struct ips_pend_sends *)timer->context;
+	struct ips_pendsendq *phead = &pend_sends->pendq;
+	struct ips_proto *proto = (struct ips_proto *)pend_sends->proto;
+	struct ips_pend_sreq *sreq;
+
+	while (!STAILQ_EMPTY(phead)) {
+		sreq = STAILQ_FIRST(phead);
+		switch (sreq->type) {
+		case IPS_PENDSEND_EAGER_REQ:
+			err = ips_proto_mq_push_cts_req(proto, sreq->req);
+			break;
+		case IPS_PENDSEND_EAGER_DATA:
+			err = ips_proto_mq_push_rts_data(proto, sreq->req);
+			break;
+
+		default:
+			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+					  "Unknown pendq state %d\n",
+					  sreq->type);
+		}
+
+		if (err == PSM2_OK) {
+			STAILQ_REMOVE_HEAD(phead, next);
+			psmi_mpool_put(sreq);
+		} else {	/* out of scbs. wait for the next scb_avail callback */
+			/* printf("!!!!! breaking out of pendq progress\n"); */
+			break;
+		}
+	}
+
+	return err;
+}
+
+PSMI_INLINE(
+int
+between(int first_seq, int last_seq, int seq))
+{
+	if (last_seq >= first_seq) {
+		if (seq < first_seq || seq > last_seq) {
+			return 0;
+		}
+	} else {
+		if (seq > last_seq && seq < first_seq) {
+			return 0;
+		}
+	}
+	return 1;
+}
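+
+/* Example: in an 8-bit PSN space, between(250, 4, 252) == 1 because the
+ * window [250..4] wraps past 255, while between(250, 4, 10) == 0. */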
+
+PSMI_INLINE(
+int
+pio_dma_ack_valid(struct ips_proto *proto, struct ips_flow *flow,
+		  psmi_seqnum_t ack_seq_num))
+{
+	uint32_t last_num;
+	struct ips_scb_unackedq *unackedq = &flow->scb_unacked;
+
+	if (STAILQ_EMPTY(unackedq))
+		return 0;
+
+	/* scb_pend will be moved back when a nak is received, but
+	 * the packet may actually be received and acked after the nak,
+	 * so we use the tail of the unacked queue, which may include
+	 * packets not yet sent out; this is more than necessary, but
+	 * it should be OK. */
+	last_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num.psn_num;
+
+	return between(flow->xmit_ack_num.psn_num,
+				last_num, ack_seq_num.psn_num);
+}
+
+
+
+/* NAK post process for dma flow */
+void ips_dmaflow_nak_post_process(struct ips_proto *proto,
+				  struct ips_flow *flow)
+{
+	ips_scb_t *scb;
+	uint32_t first_num, ack_num;
+	uint16_t padding = 0;
+
+	scb = STAILQ_FIRST(&flow->scb_unacked);
+	first_num = __be32_to_cpu(scb->ips_lrh.bth[2]) & proto->psn_mask;
+	ack_num = (flow->xmit_ack_num.psn_num - 1) & proto->psn_mask;
+
+
+	/* If the ack PSN falls into a multi-packet scb,
+	 * don't re-send the packets already acked. */
+	psmi_assert(scb->nfrag > 1);
+	if (between(first_num, scb->seq_num.psn_num, ack_num)) {
+		uint32_t npkt, pktlen, nbytes;
+
+		/* how many packets acked in this scb */
+		npkt = ((ack_num - first_num) & proto->psn_mask) + 1;
+
+		/* how many bytes are already acked in this scb; for eager
+		 * receive packets, every payload is frag_size bytes except
+		 * the last packet, which is not acked yet */
+		pktlen = scb->frag_size;
+		nbytes = (((ack_num - first_num) &
+			proto->psn_mask) + 1) * pktlen;
+
+		/* 0. update scb info */
+		psmi_assert(scb->nfrag_remaining > npkt);
+		scb->nfrag_remaining -= npkt;
+		psmi_assert(scb->chunk_size_remaining > nbytes);
+		scb->chunk_size_remaining -= nbytes;
+		ips_scb_buffer(scb) = (void *)((char *)ips_scb_buffer(scb) + nbytes);
+
+		/* 1. if last packet in sequence, set IPS_SEND_FLAG_ACKREQ */
+		if (scb->chunk_size_remaining <= scb->frag_size) {
+			psmi_assert(scb->nfrag_remaining == 1);
+			scb->scb_flags |= IPS_SEND_FLAG_ACKREQ;
+
+			/* the last packet is all that remains */
+			/* check if padding is required */
+			padding = scb->chunk_size_remaining & 0x3;
+			if_pf(padding) {
+				/* how much to pad with also equals how many bytes we need
+				 * to rewind the source buffer offset by to keep it dw aligned */
+				padding = 4 - padding;
+				ips_scb_buffer(scb) = (void *)((char*)ips_scb_buffer(scb) - padding);
+				scb->chunk_size_remaining += padding;
+			}
+			pktlen = scb->chunk_size_remaining;
+		}
+
+		/* 2. set new packet sequence number */
+		scb->ips_lrh.bth[2] = __cpu_to_be32(
+			((ack_num + 1) & proto->psn_mask) |
+			(scb->scb_flags & IPS_SEND_FLAG_ACKREQ));
+
+		/* 3. set new packet offset adjusted with padding */
+		scb->ips_lrh.hdr_data.u32w0 += nbytes - padding;
+
+		/* 4. if packet length is changed, set new length */
+		if (scb->payload_size != pktlen) {
+			scb->payload_size = pktlen;
+			scb->ips_lrh.lrh[2] = __cpu_to_be16((
+				(scb->payload_size +
+				sizeof(struct ips_message_header) +
+				HFI_CRC_SIZE_IN_BYTES) >>
+				BYTE2DWORD_SHIFT) & HFI_LRH_PKTLEN_MASK);
+		}
+	}
+}
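+
+/* Worked example (illustrative numbers): if first_num == 100 and ack_num ==
+ * 102, then npkt == 3 and nbytes == 3 * frag_size, so the scb skips past the
+ * three acked fragments and retransmission resumes at PSN 103. */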
+
+/* process an incoming ack message.  Separate function to allow */
+/* for better optimization by compiler */
+int
+ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev)
+{
+	struct ips_proto *proto = rcv_ev->proto;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	struct ips_flow *flow = NULL;
+	struct ips_scb_unackedq *unackedq;
+	struct ips_scb_pendlist *scb_pend;
+	psmi_seqnum_t ack_seq_num, last_seq_num;
+	ips_epaddr_flow_t flowid;
+	ips_scb_t *scb;
+	uint32_t tidctrl;
+
+	ack_seq_num.psn_num = p_hdr->ack_seq_num;
+	tidctrl = GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0));
+	if (!tidctrl && ((flowid = ips_proto_flowid(p_hdr)) < EP_FLOW_TIDFLOW)) {
+		ack_seq_num.psn_num =
+		    (ack_seq_num.psn_num - 1) & proto->psn_mask;
+		psmi_assert(flowid < EP_FLOW_LAST);
+		flow = &ipsaddr->flows[flowid];
+		if (!pio_dma_ack_valid(proto, flow, ack_seq_num))
+			goto ret;
+	} else {
+		// we don't use tidflow on UD nor UDP, shouldn't get ACKs about it
+		_HFI_ERROR("Got ack for TID flow, not allowed for UD\n");
+		goto ret;
+	}
+	flow->xmit_ack_num.psn_num = p_hdr->ack_seq_num;
+
+	unackedq = &flow->scb_unacked;
+	scb_pend = &flow->scb_pend;
+
+	if (STAILQ_EMPTY(unackedq))
+		goto ret;
+
+	last_seq_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num;
+
+	INC_TIME_SPEND(TIME_SPEND_USER2);
+
+	/* For tidflow, psn_gen matches. So for all flows, tid/pio/dma,
+	 * we can use the general psn_num to compare the PSN. */
+	while (between((scb = STAILQ_FIRST(unackedq))->seq_num.psn_num,
+		       last_seq_num.psn_num, ack_seq_num.psn_num)
+	    ) {
+
+		/* take it out of the xmit queue and ..  */
+		if (scb == SLIST_FIRST(scb_pend)) {
+#ifdef PSM_DEBUG
+			flow->scb_num_pending--;
+#endif
+			SLIST_REMOVE_HEAD(scb_pend, next);
+		}
+
+		STAILQ_REMOVE_HEAD(unackedq, nextq);
+#ifdef PSM_DEBUG
+		flow->scb_num_unacked--;
+		psmi_assert(flow->scb_num_unacked >= flow->scb_num_pending);
+#endif
+		flow->credits += scb->nfrag;
+
+
+		if (scb->callback)
+			(*scb->callback) (scb->cb_param, scb->nfrag > 1 ?
+					  scb->chunk_size : scb->payload_size);
+
+		if (!(scb->scb_flags & IPS_SEND_FLAG_PERSISTENT))
+			ips_scbctrl_free(scb);
+
+		/* set all index pointers to NULL if all frames have been
+		 * acked */
+		if (STAILQ_EMPTY(unackedq)) {
+			psmi_timer_cancel(proto->timerq, flow->timer_ack);
+			flow->timer_ack = NULL;
+			psmi_timer_cancel(proto->timerq, flow->timer_send);
+			flow->timer_send = NULL;
+
+			SLIST_FIRST(scb_pend) = NULL;
+			psmi_assert(flow->scb_num_pending == 0);
+			/* Reset congestion window - all packets ACK'd */
+			flow->credits = flow->cwin = proto->flow_credits;
+			flow->ack_interval = max((flow->credits >> 2) - 1, 1);
+			goto ret;
+		} else if (flow->timer_ack == scb->timer_ack) {
+			/*
+			 * Exchange timers with the last scb on the unackedq.
+			 * The timer in this scb is used by the flow; cancelling
+			 * the current timer and then requesting a new one takes
+			 * more time, so instead we exchange the timers between
+			 * the scb being freed and the last scb on the unacked
+			 * queue.
+			 */
+			psmi_timer *timer;
+			ips_scb_t *last = STAILQ_LAST(unackedq, ips_scb, nextq);
+
+			timer = scb->timer_ack;
+			scb->timer_ack = last->timer_ack;
+			last->timer_ack = timer;
+			timer = scb->timer_send;
+			scb->timer_send = last->timer_send;
+			last->timer_send = timer;
+
+			scb->timer_ack->context = scb;
+			scb->timer_send->context = scb;
+			last->timer_ack->context = last;
+			last->timer_send->context = last;
+		}
+	}
+
+	psmi_assert(!STAILQ_EMPTY(unackedq));	/* sanity for above loop */
+
+	{
+		/* Increase congestion window if flow is not congested */
+		if_pf(flow->cwin < proto->flow_credits) {
+			flow->credits +=
+			    min(flow->cwin << 1,
+				proto->flow_credits) - flow->cwin;
+			flow->cwin = min(flow->cwin << 1, proto->flow_credits);
+			flow->ack_interval = max((flow->credits >> 2) - 1, 1);
+		}
+	}
+
+	/* Reclaimed some credits - attempt to flush flow */
+	if (!SLIST_EMPTY(scb_pend))
+		flow->flush(flow, NULL);
+
+	/*
+	 * If the next packet has not even been put on the wire, cancel the
+	 * retransmission timer since we're still presumably waiting on free
+	 * pio bufs
+	 */
+	if (STAILQ_FIRST(unackedq)->abs_timeout == TIMEOUT_INFINITE)
+		psmi_timer_cancel(proto->timerq, flow->timer_ack);
+
+ret:
+	return IPS_RECVHDRQ_CONTINUE;
+}
+
+/* process an incoming nack message.  Separate function to allow */
+/* for better optimization by compiler */
+int ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev)
+{
+	struct ips_proto *proto = rcv_ev->proto;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	struct ips_flow *flow = NULL;
+	struct ips_scb_unackedq *unackedq;
+	struct ips_scb_pendlist *scb_pend;
+	psmi_seqnum_t ack_seq_num, last_seq_num;
+	psm_protocol_type_t protocol;
+	ips_epaddr_flow_t flowid;
+	ips_scb_t *scb;
+	uint32_t tidctrl;
+
+	INC_TIME_SPEND(TIME_SPEND_USER3);
+
+	ack_seq_num.psn_num = p_hdr->ack_seq_num;
+	tidctrl = GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0));
+	if (!tidctrl && ((flowid = ips_proto_flowid(p_hdr)) < EP_FLOW_TIDFLOW)) {
+		protocol = PSM_PROTOCOL_GO_BACK_N;
+		psmi_assert(flowid < EP_FLOW_LAST);
+		flow = &ipsaddr->flows[flowid];
+		if (!pio_dma_ack_valid(proto, flow, ack_seq_num))
+			goto ret;
+		ack_seq_num.psn_num =
+		    (ack_seq_num.psn_num - 1) & proto->psn_mask;
+		flow->xmit_ack_num.psn_num = p_hdr->ack_seq_num;
+	} else {
+		// we don't use tidflow on UD nor UDP, shouldn't get NAKs about it
+		_HFI_ERROR("Got nak for TID flow, not allowed for UD\n");
+		goto ret;	/* Invalid ack for flow */
+#if 0	/* leftover tidflow handling, unreachable on UD/UDP */
+		ack_seq_num.psn_seq--;
+
+		psmi_assert(flow->xmit_seq_num.psn_gen == ack_seq_num.psn_gen);
+		psmi_assert(flow->xmit_ack_num.psn_gen == ack_seq_num.psn_gen);
+		/* Update xmit_ack_num with both new generation and new
+		 * acked sequence; update xmit_seq_num with the new flow
+		 * generation, don't change the sequence number. */
+		flow->xmit_ack_num = (psmi_seqnum_t) p_hdr->data[1].u32w0;
+		flow->xmit_seq_num.psn_gen = flow->xmit_ack_num.psn_gen;
+		psmi_assert(flow->xmit_seq_num.psn_gen != ack_seq_num.psn_gen);
+#endif
+	}
+
+	unackedq = &flow->scb_unacked;
+	scb_pend = &flow->scb_pend;
+
+	if (STAILQ_EMPTY(unackedq))
+		goto ret;
+
+	last_seq_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num;
+
+	proto->epaddr_stats.nak_recv++;
+
+	_HFI_VDBG("got a nack %d on flow %d, "
+		  "first is %d, last is %d\n", ack_seq_num.psn_num,
+		  flow->flowid,
+		  STAILQ_EMPTY(unackedq) ? -1 : STAILQ_FIRST(unackedq)->seq_num.psn_num,
+		  STAILQ_EMPTY(unackedq) ? -1 : STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num.psn_num);
+
+	/* For tidflow, psn_gen matches. So for all flows, tid/pio/dma,
+	 * we can use the general psn_num to compare the PSN. */
+	while (between((scb = STAILQ_FIRST(unackedq))->seq_num.psn_num,
+		       last_seq_num.psn_num, ack_seq_num.psn_num)
+	    ) {
+		/* take it out of the xmit queue and ..  */
+		if (scb == SLIST_FIRST(scb_pend)) {
+#ifdef PSM_DEBUG
+			flow->scb_num_pending--;
+#endif
+			SLIST_REMOVE_HEAD(scb_pend, next);
+		}
+
+		STAILQ_REMOVE_HEAD(unackedq, nextq);
+#ifdef PSM_DEBUG
+		flow->scb_num_unacked--;
+		psmi_assert(flow->scb_num_unacked >= flow->scb_num_pending);
+#endif
+
+
+		if (scb->callback)
+			(*scb->callback) (scb->cb_param, scb->nfrag > 1 ?
+					  scb->chunk_size : scb->payload_size);
+
+		if (!(scb->scb_flags & IPS_SEND_FLAG_PERSISTENT))
+			ips_scbctrl_free(scb);
+
+		/* set all index pointers to NULL if all frames have been acked */
+		if (STAILQ_EMPTY(unackedq)) {
+			psmi_timer_cancel(proto->timerq, flow->timer_ack);
+			flow->timer_ack = NULL;
+			psmi_timer_cancel(proto->timerq, flow->timer_send);
+			flow->timer_send = NULL;
+
+			SLIST_FIRST(scb_pend) = NULL;
+			psmi_assert(flow->scb_num_pending == 0);
+			/* Reset congestion window if all packets acknowledged */
+			flow->credits = flow->cwin = proto->flow_credits;
+			flow->ack_interval = max((flow->credits >> 2) - 1, 1);
+			goto ret;
+		} else if (flow->timer_ack == scb->timer_ack) {
+			/*
+			 * Exchange timers with the last scb on the unackedq.
+			 * The timer in this scb is used by the flow; cancelling
+			 * the current timer and then requesting a new one takes
+			 * more time, so instead we exchange the timers between
+			 * the scb being freed and the last scb on the unacked
+			 * queue.
+			 */
+			psmi_timer *timer;
+			ips_scb_t *last = STAILQ_LAST(unackedq, ips_scb, nextq);
+
+			timer = scb->timer_ack;
+			scb->timer_ack = last->timer_ack;
+			last->timer_ack = timer;
+			timer = scb->timer_send;
+			scb->timer_send = last->timer_send;
+			last->timer_send = timer;
+
+			scb->timer_ack->context = scb;
+			scb->timer_send->context = scb;
+			last->timer_ack->context = last;
+			last->timer_send->context = last;
+		}
+	}
+
+	psmi_assert(!STAILQ_EMPTY(unackedq));	/* sanity for above loop */
+
+	if (protocol == PSM_PROTOCOL_TIDFLOW)
+		_HFI_ERROR("post processing, Got nak for TID flow, not allowed for UD\n");
+	else if (scb->nfrag > 1)
+		ips_dmaflow_nak_post_process(proto, flow);
+
+	/* Always cancel ACK timer as we are going to restart the flow */
+	psmi_timer_cancel(proto->timerq, flow->timer_ack);
+
+	/* What's now pending is all that was unacked */
+	SLIST_FIRST(scb_pend) = scb;
+#ifdef PSM_DEBUG
+	flow->scb_num_pending = flow->scb_num_unacked;
+#endif
+	while (scb && !(scb->scb_flags & IPS_SEND_FLAG_PENDING)) {
+
+		scb->scb_flags |= IPS_SEND_FLAG_PENDING;
+		scb = SLIST_NEXT(scb, next);
+	}
+
+	{
+		int num_resent = 0;
+
+		/* Reclaim all credits up to the congestion window only */
+		flow->credits = flow->cwin;
+		flow->ack_interval = max((flow->credits >> 2) - 1, 1);
+
+		/* Flush pending scb's */
+		flow->flush(flow, &num_resent);
+
+		proto->epaddr_stats.send_rexmit += num_resent;
+	}
+
+ret:
+	return IPS_RECVHDRQ_CONTINUE;
+}
+
+int
+ips_proto_process_err_chk(struct ips_recvhdrq_event *rcv_ev)
+{
+	struct ips_recvhdrq *recvq = (struct ips_recvhdrq *)rcv_ev->recvq;
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+	ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr);
+	struct ips_flow *flow;
+	psmi_seqnum_t seq_num;
+	int16_t seq_off;
+
+	INC_TIME_SPEND(TIME_SPEND_USER4);
+	PSM2_LOG_MSG("entering");
+	psmi_assert(flowid < EP_FLOW_LAST);
+	flow = &ipsaddr->flows[flowid];
+	recvq->proto->epaddr_stats.err_chk_recv++;
+
+	seq_num.psn_val = __be32_to_cpu(p_hdr->bth[2]);
+	seq_off = (int16_t) (flow->recv_seq_num.psn_num - seq_num.psn_num);
+
+	if_pf(seq_off <= 0) {
+		_HFI_VDBG("naking for seq=%d, off=%d on flowid  %d\n",
+			  seq_num.psn_num, seq_off, flowid);
+
+		if (seq_off < -flow->ack_interval)
+			flow->flags |= IPS_FLOW_FLAG_GEN_BECN;
+
+		ips_proto_send_nak(recvq, flow);
+		flow->flags |= IPS_FLOW_FLAG_NAK_SEND;
+	}
+	else {
+		ips_scb_t ctrlscb;
+
+		ctrlscb.scb_flags = 0;
+		ctrlscb.ips_lrh.ack_seq_num = flow->recv_seq_num.psn_num;
+		ips_proto_send_ctrl_message(flow, OPCODE_ACK,
+					    &ipsaddr->ctrl_msg_queued,
+					    &ctrlscb, ctrlscb.cksum, 0);
+	}
+
+	PSM2_LOG_MSG("leaving");
+	return IPS_RECVHDRQ_CONTINUE;
+}
+
+
+
+static void ips_bad_opcode(uint8_t op_code, struct ips_message_header *proto)
+{
+	_HFI_DBG("Discarding message with bad opcode 0x%x\n", op_code);
+
+	if (hfi_debug & __HFI_DBG) {
+		ips_proto_show_header(proto, "received bad opcode");
+		ips_proto_dump_frame(proto, sizeof(struct ips_message_header),
+				     "Opcode error protocol header dump");
+	}
+}
+
+int
+ips_proto_process_unknown_opcode(struct ips_recvhdrq_event *rcv_ev)
+{
+	struct ips_message_header *protocol_header = rcv_ev->p_hdr;
+	struct ips_proto *proto = rcv_ev->proto;
+
+	proto->stats.unknown_packets++;
+	ips_bad_opcode(_get_proto_hfi_opcode(protocol_header), protocol_header);
+
+	return IPS_RECVHDRQ_CONTINUE;
+}
+
+int
+ips_proto_connect_disconnect(struct ips_recvhdrq_event *rcv_ev)
+{
+	psm2_error_t err = PSM2_OK;
+	char *payload = ips_recvhdrq_event_payload(rcv_ev);
+	uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev);
+
+	psmi_assert(payload);
+	err = ips_proto_process_connect(rcv_ev->proto,
+					_get_proto_hfi_opcode(rcv_ev->p_hdr),
+					rcv_ev->p_hdr,
+					payload,
+					paylen);
+	if (err != PSM2_OK)
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+			"Process connect/disconnect error: %d, opcode %x\n",
+			err, _get_proto_hfi_opcode(rcv_ev->p_hdr));
+
+	return IPS_RECVHDRQ_CONTINUE;
+}
+
+/* Return 1 if packet is ok. */
+/* Return 0 if packet should be skipped */
+int ips_proto_process_unknown(const struct ips_recvhdrq_event *rcv_ev)
+{
+	struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+	struct ips_proto *proto = rcv_ev->proto;
+	int opcode = (int)_get_proto_hfi_opcode(p_hdr);
+
+	/*
+	 * If the protocol is disabled or not yet enabled, no processing
+	 * happens. We set t_init to 0 when disabling the protocol.
+	 */
+	if (proto->t_init == 0)
+		return IPS_RECVHDRQ_CONTINUE;
+
+	/* Connect messages don't have to be from a known epaddr */
+	switch (opcode) {
+	case OPCODE_CONNECT_REQUEST:
+	case OPCODE_CONNECT_REPLY:
+	case OPCODE_DISCONNECT_REQUEST:
+	case OPCODE_DISCONNECT_REPLY:
+		ips_proto_connect_disconnect(
+				(struct ips_recvhdrq_event *)rcv_ev);
+		return IPS_RECVHDRQ_CONTINUE;
+	default:
+		break;
+	}
+
+	/* Packet from "unknown" peer. Log the packet and payload at the
+	 * appropriate verbosity level.
+	 */
+	{
+		char *payload = ips_recvhdrq_event_payload(rcv_ev);
+		uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) +
+		    ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3);
+
+		if (hfi_debug & __HFI_PKTDBG) {
+			ips_proto_dump_frame(rcv_ev->p_hdr,
+					     HFI_MESSAGE_HDR_SIZE, "header");
+			if (paylen)
+				ips_proto_dump_frame(payload, paylen, "data");
+		}
+	}
+
+
+	proto->stats.stray_packets++;
+
+	/* If we have debug mode, print the complete packet every time */
+	if (hfi_debug & __HFI_PKTDBG)
+		ips_proto_show_header(p_hdr, "invalid connidx");
+
+	psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_EPID_NETWORK_ERROR,
+			 "Received message(s) opcode=%x from an unknown process", opcode);
+
+	return 0;		/* Always skip this packet unless the above call was a noreturn
+				 * call */
+}
+
+
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_recvhdrq.c b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_recvhdrq.c
new file mode 100644
index 0000000000000000000000000000000000000000..b71a15d9319ab7c84349f08a57571f8df6b9037c
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_recvhdrq.c
@@ -0,0 +1,406 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+
+#include "ips_epstate.h"
+#include "ips_proto.h"
+#include "ips_expected_proto.h"
+#include "ips_proto_help.h"
+#include "ips_proto_internal.h"
+
+/*
+ * Receive header queue initialization.
+ */
+psm2_error_t
+ips_recvhdrq_init(const psmi_context_t *context,
+		  const struct ips_epstate *epstate,
+		  const struct ips_proto *proto,
+		  const struct ips_recvhdrq_callbacks *callbacks,
+		  struct ips_recvhdrq *recvq
+		)
+{
+	psm2_error_t err = PSM2_OK;
+
+	memset(recvq, 0, sizeof(*recvq));
+	recvq->proto = (struct ips_proto *)proto;
+	recvq->context = context;
+	pthread_spin_init(&recvq->hdrq_lock, PTHREAD_PROCESS_SHARED);
+
+	recvq->epstate = epstate;
+	recvq->recvq_callbacks = *callbacks;	/* deep copy */
+	SLIST_INIT(&recvq->pending_acks);
+
+	return err;
+}
+
+
+/* flush the eager buffers by setting the eager index head to the eager index
+   tail if the eager buffer queue is full.
+
+   Called when we had eager buffer overflows (ERR_TID/HFI_RHF_H_TIDERR
+   was set in RHF errors), and no good eager packets were received, so
+   that eager head wasn't advanced.
+*/
+#if 0
+static void ips_flush_egrq_if_required(struct ips_recvhdrq *recvq)
+{
+	const uint32_t tail = ips_recvq_tail_get(&recvq->egrq);
+	const uint32_t head = ips_recvq_head_get(&recvq->egrq);
+	uint32_t egr_cnt = recvq->egrq.elemcnt;
+
+	if ((head % egr_cnt) == ((tail + 1) % egr_cnt)) {
+		_HFI_DBG("eager array full after overflow, flushing "
+			 "(head %llx, tail %llx)\n",
+			 (long long)head, (long long)tail);
+		recvq->proto->stats.egr_overflow++;
+	}
+	return;
+}
+#endif
+
+/*
+ * Helpers for ips_recvhdrq_progress.
+ */
+
+
+
+
+#ifdef PSM_DEBUG
+#endif
+
+
+PSMI_ALWAYS_INLINE(
+void
+process_pending_acks(struct ips_recvhdrq *recvq))
+{
+	ips_scb_t ctrlscb;
+	struct ips_message_header *msg_hdr = NULL;
+
+	/* If any pending acks, dispatch them now */
+	while (!SLIST_EMPTY(&recvq->pending_acks)) {
+		struct ips_flow *flow = SLIST_FIRST(&recvq->pending_acks);
+
+		SLIST_REMOVE_HEAD(&recvq->pending_acks, next);
+		SLIST_NEXT(flow, next) = NULL;
+
+		ctrlscb.scb_flags = 0;
+		msg_hdr = &ctrlscb.ips_lrh;
+		msg_hdr->ack_seq_num = flow->recv_seq_num.psn_num;
+
+		if (flow->flags & IPS_FLOW_FLAG_PENDING_ACK) {
+			psmi_assert_always((flow->
+					    flags & IPS_FLOW_FLAG_PENDING_NAK)
+					   == 0);
+
+			flow->flags &= ~IPS_FLOW_FLAG_PENDING_ACK;
+			ips_proto_send_ctrl_message(flow, OPCODE_ACK,
+						    &flow->ipsaddr->
+						    ctrl_msg_queued,
+						    &ctrlscb, ctrlscb.cksum, 0);
+		} else {
+			psmi_assert_always(flow->
+					   flags & IPS_FLOW_FLAG_PENDING_NAK);
+
+			flow->flags &= ~IPS_FLOW_FLAG_PENDING_NAK;
+			ips_proto_send_ctrl_message(flow, OPCODE_NAK,
+						    &flow->ipsaddr->
+						    ctrl_msg_queued,
+						    &ctrlscb, ctrlscb.cksum, 0);
+		}
+	}
+}
+
+
+#ifdef RNDV_MOD
+// check for and process RV RDMA sends and RDMA recv
+psm2_error_t check_rv_completion(psm2_ep_t ep, struct ips_proto *proto)
+{
+	struct rv_event ev;
+	psm2_error_t ret = PSM2_OK_NO_PROGRESS;
+
+	if (! IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode))
+		return ret;
+
+	while (__psm2_rv_poll_cq(ep->verbs_ep.rv, &ev) > 0) {
+		ret = PSM2_OK;
+		switch (ev.event_type) {
+			case RV_WC_RDMA_WRITE:
+				ep->verbs_ep.send_rdma_outstanding--;
+				_HFI_MMDBG("got RV RDMA Write SQ CQE status %u outstanding %u\n", ev.wc.status, ep->verbs_ep.send_rdma_outstanding);
+				if_pf (ev.wc.status || ev.wc.wr_id == 0) {
+					if (PSM2_OK != ips_protoexp_rdma_write_completion_error(
+								ep, ev.wc.wr_id, ev.wc.status))
+						return PSM2_INTERNAL_ERR;
+
+				} else {
+					ips_protoexp_rdma_write_completion( ev.wc.wr_id);
+				}
+				break;
+			case RV_WC_RECV_RDMA_WITH_IMM:
+				if_pf (ev.wc.status) {
+					if (ep->rv_reconnect_timeout)
+						break;	/* let sender handle errors */
+					psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+							"failed rv recv RDMA '%s' (%d) on %s port %u epid 0x%lx\n",
+							ibv_wc_status_str(ev.wc.status), (int)ev.wc.status, ep->dev_name, ep->portnum, ep->epid);
+					return PSM2_INTERNAL_ERR;
+				}
+				_HFI_MMDBG("got RV RDMA Write Immediate RQ CQE %u bytes\n",
+							ev.wc.byte_len);
+				ips_protoexp_handle_immed_data(proto,
+							ev.wc.conn_handle, RDMA_IMMED_RV,
+							ev.wc.imm_data, ev.wc.byte_len);
+				break;
+			default:
+				psmi_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+					"unexpected rv event %d status '%s' (%d) on %s port %u epid 0x%lx\n",
+					ev.event_type, ibv_wc_status_str(ev.wc.status),
+					(int)ev.wc.status, ep->dev_name, ep->portnum, ep->epid);
+				break;
+			}
+	}
+	return ret;
+}
+#endif // RNDV_MOD
+
+psm2_error_t ips_recvhdrq_progress(struct ips_recvhdrq *recvq)
+{
+	GENERIC_PERF_BEGIN(PSM_RX_SPEEDPATH_CTR); /* perf stats */
+
+	int ret = IPS_RECVHDRQ_CONTINUE;
+	struct ips_epstate_entry *epstaddr;
+	psm2_ep_t ep = recvq->proto->ep;
+	PSMI_CACHEALIGN struct ips_recvhdrq_event rcv_ev = {
+		.proto = recvq->proto,
+		.recvq = recvq,
+		//.ptype = RCVHQ_RCV_TYPE_ERROR
+	};
+	rbuf_t buf;
+	uint32_t num_done = 0;
+	int err;
+
+#ifdef RNDV_MOD
+	// RV completions are for larger RDMAs and should be infrequent; give
+	// them first chance
+	switch (check_rv_completion(ep, recvq->proto)) {
+	case PSM2_OK:
+		num_done = 1;	// triggers PSM2_OK return below
+		break;
+	case PSM2_OK_NO_PROGRESS:
+		break;
+	default:
+		goto fail;
+		break;
+	}
+#endif
+#if VERBS_RECV_CQE_BATCH > 1
+	int done = 0;
+	do {
+		struct ibv_wc *wc;
+// a little atypical, but it keeps the ifdef scope small
+#undef WC
+#define WC(field) ((wc)->field)
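+// WC(status) expands to wc->status in this batched branch and to wc.status in
+// the non-batched branch below, so the code shared after the ifdefs can read
+// completion fields without caring whether wc is a pointer or a struct.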
+		if (! ep->verbs_ep.recv_wc_count) {
+			// TBD - negative error return is possible but unlikely
+			if (0 == (err = ibv_poll_cq(ep->verbs_ep.recv_cq, VERBS_RECV_CQE_BATCH, ep->verbs_ep.recv_wc_list)))
+				break;
+			else if_pf (err < 0) {
+				if (errno == EAGAIN || errno == EWOULDBLOCK
+				    || errno == EBUSY || errno == EINTR)
+					break;
+				_HFI_ERROR("failed ibv_poll_cq '%s' (%d) on %s port %u epid 0x%lx\n",
+					strerror(errno), errno, ep->dev_name, ep->portnum, ep->epid);
+				goto fail;
+			}
+			ep->verbs_ep.recv_wc_count = err;
+			ep->verbs_ep.recv_wc_next = 0;
+			// once drained break out of loop w/o polling CQ again
+			// don't worry about small race of new pkt arriving while we
+			// process the CQEs.  poll_cq is expensive so avoid doing it
+			// an extra time because it will usually be empty
+			done = (ep->verbs_ep.recv_wc_count < VERBS_RECV_CQE_BATCH);
+		}
+		// consume next wc
+		wc = &(ep->verbs_ep.recv_wc_list[ep->verbs_ep.recv_wc_next++]);
+		ep->verbs_ep.recv_wc_count--;
+		{
+#else	// VERBS_RECV_CQE_BATCH > 1
+	while (1) {
+		struct ibv_wc wc;
+// a little atypical, but it keeps the ifdef scope small
+#undef WC
+#define WC(field) ((wc).field)
+		// TBD really only need to check this on 1st loop
+		if_pf (ep->verbs_ep.revisit_buf) {
+			buf = ep->verbs_ep.revisit_buf;
+			ep->verbs_ep.revisit_buf = NULL;
+			rcv_ev.payload_size = ep->verbs_ep.revisit_payload_size;
+		} else if (0 == (err = ibv_poll_cq(ep->verbs_ep.recv_cq, 1, &wc))) {
+			break;
+		} else if_pf (err < 0) {
+			if (errno == EAGAIN || errno == EWOULDBLOCK
+			    || errno == EBUSY || errno == EINTR)
+				break;
+			_HFI_ERROR("failed ibv_poll_cq '%s' (%d) on %s port %u epid 0x%lx\n",
+				strerror(errno), errno, ep->dev_name, ep->portnum, ep->epid);
+			goto fail;
+		} else {
+#endif	// VERBS_RECV_CQE_BATCH > 1
+			psmi_assert_always(WC(wr_id));
+			buf = (rbuf_t)WC(wr_id);
+			if_pf (WC(status)) {
+				if (WC(status) != IBV_WC_WR_FLUSH_ERR)
+					_HFI_ERROR("failed recv '%s' (%d) on %s port %u epid 0x%lx QP %u\n",
+						ibv_wc_status_str(WC(status)), (int)WC(status), ep->dev_name, ep->portnum, ep->epid, WC(qp_num));
+				goto fail;
+			}
+			switch (WC(opcode)) {
+			case IBV_WC_RECV_RDMA_WITH_IMM:
+				_HFI_MMDBG("got RDMA Write Immediate RQ CQE %u bytes\n",
+							WC(byte_len));
+				// wc.byte_len is len of inbound rdma write not including immed
+				// wc.qp_num - local QP
+				ips_protoexp_handle_immed_data(rcv_ev.proto,
+						(uint64_t)(rbuf_qp(ep, buf)->qp_context),
+						RDMA_IMMED_USER_RC, WC(imm_data), WC(byte_len));
+				goto repost;
+				break;
+			default:
+				_HFI_ERROR("unexpected recv opcode %d on %s port %u epid 0x%lx QP %u\n",
+					WC(opcode), ep->dev_name, ep->portnum, ep->epid, WC(qp_num));
+				goto repost;
+				break;
+			case IBV_WC_RECV:
+				_HFI_VDBG("got CQE %u bytes\n", WC(byte_len));
+				// wc.byte_len is length of data including rbuf_addition
+				// actual data starts after rbuf_addition in posted recv buffer
+				// if we need it wc has:
+				//		qp_num - local QP
+				// 		src_qp - remote QP
+				// 		slid - remote SLID
+				// 		probably have GRH at start of buffer with remote GID
+				if_pf (_HFI_PDBG_ON)
+					_HFI_PDBG_DUMP(rbuf_to_buffer(buf), WC(byte_len));
+				if_pf (WC(byte_len) < rbuf_addition(buf)+sizeof(struct ips_message_header)) {
+					_HFI_ERROR( "unexpected small recv: %u on %s port %u\n", WC(byte_len), ep->dev_name, ep->portnum);
+					goto repost;
+				}
+				rcv_ev.payload_size = WC(byte_len) - rbuf_addition(buf) - sizeof(struct ips_message_header);
+				break;
+			}
+			// fall through to process recv pkt in buf of rcv_ev.payload_size
+		}
+		rcv_ev.p_hdr = (struct ips_message_header *)(rbuf_to_buffer(buf)+rbuf_addition(buf));
+		rcv_ev.payload = (rbuf_to_buffer(buf) + rbuf_addition(buf) + sizeof(struct ips_message_header));
+		_HFI_VDBG("%s receive - opcode %x\n", qp_type_str(rbuf_qp(ep, buf)),
+			_get_proto_hfi_opcode(rcv_ev.p_hdr));
+
+		epstaddr = ips_epstate_lookup(recvq->epstate, rcv_ev.p_hdr->connidx);
+
+		if_pf((epstaddr == NULL) || (epstaddr->ipsaddr == NULL)) {
+			rcv_ev.ipsaddr = NULL;
+			recvq->recvq_callbacks.callback_packet_unknown(&rcv_ev);
+		} else {
+			rcv_ev.ipsaddr = epstaddr->ipsaddr;
+			ret = ips_proto_process_packet(&rcv_ev);
+			if_pf (ret == IPS_RECVHDRQ_REVISIT)
+			{
+				// try processing on next progress call
+				_HFI_VDBG("REVISIT returned on process_packet\n");
+				// process this CQE again next time called
+#if VERBS_RECV_CQE_BATCH > 1
+				ep->verbs_ep.recv_wc_next--;
+				ep->verbs_ep.recv_wc_count++;
+#else
+				ep->verbs_ep.revisit_buf = buf;
+				ep->verbs_ep.revisit_payload_size = rcv_ev.payload_size;
+#endif
+				GENERIC_PERF_END(PSM_RX_SPEEDPATH_CTR); /* perf stats */
+				return PSM2_OK_NO_PROGRESS;
+			}
+		}
+repost:
+		num_done++;
+		// buffer processing is done, we can requeue it on QP
+		if_pf (PSM2_OK != __psm2_ep_verbs_post_recv(buf))
+			_HFI_ERROR("unable to post recv on %s port %u\n", ep->dev_name, ep->portnum); // leaks the buffer
+
+		// if we can't process this now (such as an RTS we revisited and
+		// ended up queueing on the unexpected queue) we're told to stop
+		// processing; we'll look at the rest later
+		if_pf (ret == IPS_RECVHDRQ_BREAK) {
+			_HFI_VDBG("stop rcvq\n");
+			break;
+		}
+#if VERBS_RECV_CQE_BATCH > 1
+	} while(! done);
+#else
+	}
+#endif
+
+	/* Process any pending acks before exiting */
+	process_pending_acks(recvq);
+	GENERIC_PERF_END(PSM_RX_SPEEDPATH_CTR); /* perf stats */
+
+	return num_done ? PSM2_OK : PSM2_OK_NO_PROGRESS;
+
+fail:
+	GENERIC_PERF_END(PSM_RX_SPEEDPATH_CTR); /* perf stats */
+	return PSM2_INTERNAL_ERR;
+}
+
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_recvhdrq.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_recvhdrq.h
new file mode 100644
index 0000000000000000000000000000000000000000..9fcbc1126690867fd8ea222cd65fe57c8dfc5136
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_recvhdrq.h
@@ -0,0 +1,166 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "ips_proto_params.h"
+#include "ips_proto_header.h"
+
+#ifndef _IPS_RECVHDRQ_H
+#define _IPS_RECVHDRQ_H
+
+struct ips_recvhdrq;
+struct ips_recvhdrq_state;
+struct ips_epstate;
+
+/* process current packet, continue on next packet */
+#define IPS_RECVHDRQ_CONTINUE   0
+/* process current packet, break and return to caller */
+#define IPS_RECVHDRQ_BREAK      1
+/* keep current packet, revisit the same packet next time */
+#define IPS_RECVHDRQ_REVISIT	2
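+
+/* For example, a handler that queues a packet for later matching returns
+ * IPS_RECVHDRQ_BREAK to stop the progress loop, while one that is out of
+ * resources returns IPS_RECVHDRQ_REVISIT so the same completion is
+ * reprocessed on the next progress call (see ips_recvhdrq_progress). */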
+
+
+struct ips_recvhdrq_event {
+	struct ips_proto *proto;
+	const struct ips_recvhdrq *recvq;	/* where message received */
+	struct ips_message_header *p_hdr;	/* protocol header in rcv_hdr */
+	struct ips_epaddr *ipsaddr;	/* peer ipsaddr, if available */
+	// we point to the payload part of our recv buffer
+	uint8_t *payload;
+	uint32_t payload_size;
+};
+
+struct ips_recvhdrq_callbacks {
+	int (*callback_packet_unknown) (const struct ips_recvhdrq_event *);
+};
+
+psm2_error_t
+ips_recvhdrq_init(const psmi_context_t *context,
+		  const struct ips_epstate *epstate,
+		  const struct ips_proto *proto,
+		  const struct ips_recvhdrq_callbacks *callbacks,
+		  struct ips_recvhdrq *recvq
+		);
+
+psm2_error_t ips_recvhdrq_progress(struct ips_recvhdrq *recvq);
+
+
+/*
+ * Structure containing state for recvhdrq reading. This is logically
+ * part of ips_recvhdrq but needs to be separated out for context
+ * sharing so that it can be put in a shared memory page and hence
+ * be available to all processes sharing the context. Generally, do not
+ * put pointers in here since the address map of each process can be
+ * different.
+ */
+#define NO_EAGER_UPDATE ~0U
+struct ips_recvhdrq_state {
+};
+
+/*
+ * Structure to read from recvhdrq
+ */
+struct ips_recvhdrq {
+	struct ips_proto *proto;
+	const psmi_context_t *context;	/* error handling, epid id, etc. */
+	/* Header queue handling */
+	pthread_spinlock_t hdrq_lock;	/* Lock for thread-safe polling */
+	/* Lookup endpoints: epid -> ptladdr (rank) */
+	const struct ips_epstate *epstate;
+
+	/* Callbacks to handle recvq events */
+	struct ips_recvhdrq_callbacks recvq_callbacks;
+
+	/* List of flows with pending acks for receive queue */
+	SLIST_HEAD(pending_flows, ips_flow) pending_acks;
+
+};
+
+
+PSMI_INLINE(
+void *
+ips_recvhdrq_event_payload(const struct ips_recvhdrq_event *rcv_ev))
+{
+	psmi_assert(rcv_ev);
+	return rcv_ev->payload;
+}
+
+PSMI_INLINE(
+uint32_t
+ips_recvhdrq_event_paylen(const struct ips_recvhdrq_event *rcv_ev))
+{
+	psmi_assert(rcv_ev);
+	return rcv_ev->payload_size;
+}
+
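+/*
+ * Note on the wrappers below: pthread_spin_trylock(), pthread_spin_lock()
+ * and pthread_spin_unlock() return 0 on success, so each wrapper inverts
+ * the result and returns nonzero (true) when the operation succeeded.
+ */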
+PSMI_INLINE(int ips_recvhdrq_trylock(struct ips_recvhdrq *recvq))
+{
+	int ret = pthread_spin_trylock(&recvq->hdrq_lock);
+	return !ret;
+}
+
+PSMI_INLINE(int ips_recvhdrq_lock(struct ips_recvhdrq *recvq))
+{
+	int ret = pthread_spin_lock(&recvq->hdrq_lock);
+	return !ret;
+}
+
+PSMI_INLINE(int ips_recvhdrq_unlock(struct ips_recvhdrq *recvq))
+{
+	int ret = pthread_spin_unlock(&recvq->hdrq_lock);
+	return !ret;
+}
+
+#endif /* _IPS_RECVHDRQ_H */
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_recvq.c b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_recvq.c
new file mode 100644
index 0000000000000000000000000000000000000000..be7248db7be2395cf9477dc6b04d718acbdd6516
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_recvq.c
@@ -0,0 +1,92 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "ips_recvq.h"
+
+/* We return a table of buffer pointers, indexed by eager buffer index.
+ *
+ * From the point of view of the returned pointer, index -1 always points to
+ * the address to call psmi_free on (since we force page-alignment).
+ */
+void **ips_recvq_egrbuf_table_alloc(psm2_ep_t ep, void *baseptr,
+				    uint32_t bufnum, uint32_t bufsize)
+{
+	unsigned i;
+	void *ptr_alloc;
+	uintptr_t *buft;
+	uintptr_t base = (uintptr_t) baseptr;
+
+	ptr_alloc = psmi_malloc(ep, UNDEFINED,
+				PSMI_PAGESIZE + sizeof(uintptr_t) * (bufnum +
+								     1));
+	if (ptr_alloc == NULL)
+		return NULL;
+	/* buft[-1] holds the actual allocated address, so we can free it later;
+	 * buft[0] is the first table entry, placed on the page boundary.
+	 */
+	buft = (uintptr_t *) PSMI_ALIGNUP((uint8_t *)ptr_alloc + 1, PSMI_PAGESIZE);
+	buft[-1] = (uintptr_t) ptr_alloc;
+	for (i = 0; i < bufnum; i++)
+		buft[i] = (uintptr_t) ((char *)base + i * bufsize);
+	return (void **)buft;
+}
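+
+/*
+ * Illustrative usage of the index -1 trick above (hypothetical values, not
+ * taken from any actual caller):
+ *
+ *   void **egrbufs = ips_recvq_egrbuf_table_alloc(ep, base, 512, 4096);
+ *   void *buf7 = egrbufs[7];               // == (char *)base + 7 * 4096
+ *   ips_recvq_egrbuf_table_free(egrbufs);  // frees ((uintptr_t *)egrbufs)[-1]
+ */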
+
+void ips_recvq_egrbuf_table_free(void **buftable)
+{
+	uintptr_t *buft = (uintptr_t *) buftable;
+	void *ptr_alloc = (void *)buft[-1];
+	psmi_free(ptr_alloc);
+}
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_recvq.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_recvq.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d1a990d43379c9ab36f68a35215f08391cf8e3c
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_recvq.h
@@ -0,0 +1,73 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_RECVQ_H
+#define _IPS_RECVQ_H
+
+#include "psm_user.h"
+
+/*
+ * Tables to map eager indexes into their buffer addresses
+ *
+ * If function returns NULL, no memory has been allocated and the error handler
+ * has been executed on 'ep' and hence assume status PSM2_NO_MEMORY.
+ */
+void **ips_recvq_egrbuf_table_alloc(psm2_ep_t ep,
+				    void *base, uint32_t bufnum,
+				    uint32_t bufsize);
+void ips_recvq_egrbuf_table_free(void **buftable);
+
+
+#endif /* _IPS_RECVQ_H */
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_scb.c b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_scb.c
new file mode 100644
index 0000000000000000000000000000000000000000..4ba1dfac06507d2c0f4ebe823a8d03affb7ce723
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_scb.c
@@ -0,0 +1,350 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "ips_proto.h"
+#include "ips_scb.h"
+#include "ips_proto_internal.h"
+
+psm2_error_t
+ips_scbctrl_init(const psmi_context_t *context,
+		 uint32_t numscb, uint32_t numbufs,
+		 uint32_t imm_size, uint32_t bufsize,
+		 ips_scbctrl_avail_callback_fn_t scb_avail_callback,
+		 void *scb_avail_context, struct ips_scbctrl *scbc)
+{
+	int i;
+	struct ips_scb *scb;
+	size_t scb_size;
+	size_t alloc_sz;
+	uintptr_t base, imm_base;
+	psm2_ep_t ep = context->ep;
+	/* scbc->context = context; */
+	psm2_error_t err = PSM2_OK;
+
+	psmi_assert_always(numscb > 0);
+	scbc->sbuf_num = scbc->sbuf_num_cur = numbufs;
+	SLIST_INIT(&scbc->sbuf_free);
+	scbc->sbuf_buf_size = bufsize;
+	scbc->sbuf_buf_base = NULL;
+	scbc->sbuf_buf_alloc = NULL;
+	scbc->sbuf_buf_last = NULL;
+
+	/* Send buffers are not mandatory, but when allocating them make sure
+	 * the buffer area starts on a page boundary */
+	if (numbufs > 0) {
+		struct ips_scbbuf *sbuf;
+
+		bufsize = PSMI_ALIGNUP(bufsize, 64);
+
+		alloc_sz = numbufs * bufsize + PSMI_PAGESIZE;
+		scbc->sbuf_buf_alloc =
+		    psmi_calloc(ep, NETWORK_BUFFERS, 1, alloc_sz);
+		if (scbc->sbuf_buf_alloc == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+		base = (uintptr_t) scbc->sbuf_buf_alloc;
+		base = PSMI_ALIGNUP(base, PSMI_PAGESIZE);
+		scbc->sbuf_buf_base = (void *)base;
+		scbc->sbuf_buf_last = (void *)(base + bufsize * (numbufs - 1));
+		_HFI_VDBG
+		    ("sendbufs=%d, (size=%d),base=[%p..%p)\n",
+		     numbufs,  bufsize,
+		     (void *)scbc->sbuf_buf_base, (void *)scbc->sbuf_buf_last);
+
+		for (i = 0; i < numbufs; i++) {
+			sbuf = (struct ips_scbbuf *)(base + bufsize * i);
+			SLIST_NEXT(sbuf, next) = NULL;
+			SLIST_INSERT_HEAD(&scbc->sbuf_free, sbuf, next);
+		}
+	}
+
+	imm_base = 0;
+	scbc->scb_imm_size = imm_size;
+	if (scbc->scb_imm_size) {
+		scbc->scb_imm_size = PSMI_ALIGNUP(imm_size, 64);
+		alloc_sz = numscb * scbc->scb_imm_size + 64;
+		scbc->scb_imm_buf = psmi_memalign(ep, NETWORK_BUFFERS, 64,
+						  alloc_sz);
+
+		if (scbc->scb_imm_buf == NULL) {
+			err = PSM2_NO_MEMORY;
+			goto fail;
+		}
+
+		memset(scbc->scb_imm_buf, 0, alloc_sz);
+		imm_base = PSMI_ALIGNUP(scbc->scb_imm_buf, 64);
+	} else
+		scbc->scb_imm_buf = NULL;
+
+	scbc->scb_num = scbc->scb_num_cur = numscb;
+	SLIST_INIT(&scbc->scb_free);
+
+	scb_size = PSMI_ALIGNUP(sizeof(*scb), 64);
+	alloc_sz = numscb * scb_size;
+
+	scbc->scb_base = psmi_memalign(ep, NETWORK_BUFFERS, 64, alloc_sz);
+	if (scbc->scb_base == NULL) {
+		err = PSM2_NO_MEMORY;
+		goto fail;
+	}
+
+	memset(scbc->scb_base, 0, alloc_sz);
+	base = (uintptr_t) scbc->scb_base;
+
+	/*
+	 * Allocate ack/send timer for each scb object.
+	 */
+	scbc->timers = (struct psmi_timer *)
+		psmi_calloc(ep, UNDEFINED, 2*numscb,
+		sizeof(struct psmi_timer));
+	if (scbc->timers == NULL) {
+		err = PSM2_NO_MEMORY;
+		goto fail;
+	}
+
+	for (i = 0; i < numscb; i++) {
+		scb = (struct ips_scb *)(base + i * scb_size);
+
+		scb->scbc = scbc;
+		if (scbc->scb_imm_buf)
+			scb->imm_payload =
+			    (void *)(imm_base + (i * scbc->scb_imm_size));
+		else
+			scb->imm_payload = NULL;
+
+		SLIST_INSERT_HEAD(&scbc->scb_free, scb, next);
+
+		/*
+		 * Initialize the timers and associate them with each scb.
+		 * The association is not fixed: PSM may later exchange the
+		 * timers between scbs when a timer is still in use by a
+		 * flow but its scb is about to be freed. See the ack/nak
+		 * processing in ips_proto_recv.c.
+		 */
+		scb->timer_ack = &scbc->timers[2*i];
+		psmi_timer_entry_init(scb->timer_ack,
+				ips_proto_timer_ack_callback, scb);
+
+		scb->timer_send = &scbc->timers[2*i+1];
+		psmi_timer_entry_init(scb->timer_send,
+				ips_proto_timer_send_callback, scb);
+	}
+	scbc->scb_avail_callback = scb_avail_callback;
+	scbc->scb_avail_context = scb_avail_context;
+
+
+fail:
+	return err;
+}
+
+psm2_error_t ips_scbctrl_fini(struct ips_scbctrl *scbc)
+{
+	if (scbc->scb_base != NULL) {
+		psmi_free(scbc->scb_base);
+	}
+	if (scbc->sbuf_buf_alloc) {
+		psmi_free(scbc->sbuf_buf_alloc);
+	}
+	if (scbc->timers != NULL) {
+		psmi_free(scbc->timers);
+	}
+	if (scbc->scb_imm_buf) {
+		psmi_free(scbc->scb_imm_buf);
+	}
+	return PSM2_OK;
+}
+
+int ips_scbctrl_bufalloc(ips_scb_t *scb)
+{
+	struct ips_scbctrl *scbc = scb->scbc;
+
+	psmi_assert(scbc->sbuf_num > 0);
+	psmi_assert(!((ips_scb_buffer(scb) >= scbc->sbuf_buf_base) &&
+			     (ips_scb_buffer(scb) <= scbc->sbuf_buf_last)));
+	psmi_assert(scb->payload_size <= scbc->sbuf_buf_size);
+
+	if (scb->payload_size <= scbc->scb_imm_size) {
+		/* Attach immediate buffer */
+		ips_scb_buffer(scb) = scb->imm_payload;
+		return 1;
+	}
+
+	if (SLIST_EMPTY(&scbc->sbuf_free))
+		return 0;
+	else {
+		psmi_assert(scbc->sbuf_num_cur);
+		ips_scb_buffer(scb) = SLIST_FIRST(&scbc->sbuf_free);
+		scbc->sbuf_num_cur--;
+
+		/* If under memory pressure request ACK for packet to reclaim
+		 * credits.
+		 */
+		if (scbc->sbuf_num_cur < (scbc->sbuf_num >> 1))
+			scb->scb_flags |= IPS_SEND_FLAG_ACKREQ;
+
+		SLIST_REMOVE_HEAD(&scbc->sbuf_free, next);
+		return 1;
+	}
+}
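+
+/*
+ * Worked example of the watermark above (hypothetical numbers): with
+ * sbuf_num == 64, the ACKREQ flag starts being set once sbuf_num_cur drops
+ * below 32, i.e. once more than half of the bounce buffers are in flight,
+ * so acks come back and credits are reclaimed before the pool is exhausted.
+ */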
+
+int ips_scbctrl_avail(struct ips_scbctrl *scbc)
+{
+	return (!SLIST_EMPTY(&scbc->scb_free) && scbc->sbuf_num_cur > 0);
+}
+
+ips_scb_t *MOCKABLE(ips_scbctrl_alloc)(struct ips_scbctrl *scbc, int scbnum, int len,
+				uint32_t flags)
+{
+	ips_scb_t *scb, *scb_head = NULL;
+
+	psmi_assert(flags & IPS_SCB_FLAG_ADD_BUFFER ? (scbc->sbuf_num > 0) : 1);
+	psmi_assert(scbc->sbuf_buf_size >= len);
+
+	while (scbnum--) {
+		if (SLIST_EMPTY(&scbc->scb_free))
+			break;
+		scb = SLIST_FIRST(&scbc->scb_free);
+		/* Need to set this here as bufalloc may request
+		 * an ACK under memory pressure
+		 */
+		scb->scb_flags = 0;
+		if (flags & IPS_SCB_FLAG_ADD_BUFFER) {
+			scb->payload_size = len;
+			if (!ips_scbctrl_bufalloc(scb))
+				break;
+		} else {
+			ips_scb_buffer(scb) = NULL;
+			scb->payload_size = 0;
+		}
+
+		scb->tidsendc = NULL;
+		scb->callback = NULL;
+		scb->tidctrl = 0;
+		scb->nfrag = 1;
+		scb->frag_size = 0;
+#ifdef PSM_CUDA
+		scb->mq_req = NULL;
+#endif
+		scb->mr = NULL;
+
+		scbc->scb_num_cur--;
+		if (scbc->scb_num_cur < (scbc->scb_num >> 1))
+			scb->scb_flags |= IPS_SEND_FLAG_ACKREQ;
+
+		SLIST_REMOVE_HEAD(&scbc->scb_free, next);
+		SLIST_NEXT(scb, next) = scb_head;
+		scb_head = scb;
+	}
+	return scb_head;
+}
+MOCK_DEF_EPILOGUE(ips_scbctrl_alloc);
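+
+/* Note: ips_scbctrl_alloc() returns the scbs as a singly-linked list with
+ * the most recently allocated scb at the head, and may return fewer than
+ * scbnum entries -- or NULL -- when scbs or send buffers run out, so
+ * callers must handle a short (or empty) result. */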
+
+void ips_scbctrl_free(ips_scb_t *scb)
+{
+	struct ips_scbctrl *scbc = scb->scbc;
+	if (scbc->sbuf_num && (ips_scb_buffer(scb) >= scbc->sbuf_buf_base) &&
+	    (ips_scb_buffer(scb) <= scbc->sbuf_buf_last)) {
+		scbc->sbuf_num_cur++;
+		SLIST_INSERT_HEAD(&scbc->sbuf_free, scb->sbuf, next);
+	}
+
+	ips_scb_buffer(scb) = NULL;
+	scb->tidsendc = NULL;
+	scb->mr = NULL;
+	scb->payload_size = 0;
+	scbc->scb_num_cur++;
+	if (SLIST_EMPTY(&scbc->scb_free)) {
+		SLIST_INSERT_HEAD(&scbc->scb_free, scb, next);
+		if (scbc->scb_avail_callback != NULL)
+			scbc->scb_avail_callback(scbc, scbc->scb_avail_context);
+	} else
+		SLIST_INSERT_HEAD(&scbc->scb_free, scb, next);
+
+	return;
+}
+
+ips_scb_t *MOCKABLE(ips_scbctrl_alloc_tiny)(struct ips_scbctrl *scbc)
+{
+	ips_scb_t *scb;
+	if (SLIST_EMPTY(&scbc->scb_free))
+		return NULL;
+	scb = SLIST_FIRST(&scbc->scb_free);
+
+	SLIST_REMOVE_HEAD(&scbc->scb_free, next);
+	SLIST_NEXT(scb, next) = NULL;
+
+	ips_scb_buffer(scb) = NULL;
+	scb->payload_size = 0;
+	scb->scb_flags = 0;
+	scb->tidsendc = NULL;
+	scb->callback = NULL;
+	scb->tidctrl = 0;
+	scb->nfrag = 1;
+	scb->frag_size = 0;
+#ifdef PSM_CUDA
+	scb->mq_req = NULL;
+#endif
+	scb->mr = NULL;
+
+	scbc->scb_num_cur--;
+	if (scbc->scb_num_cur < (scbc->scb_num >> 1))
+		scb->scb_flags |= IPS_SEND_FLAG_ACKREQ;
+	return scb;
+}
+MOCK_DEF_EPILOGUE(ips_scbctrl_alloc_tiny);
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_scb.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_scb.h
new file mode 100644
index 0000000000000000000000000000000000000000..8446f7318845b3bab1b639d20e70cb00d22eca38
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_scb.h
@@ -0,0 +1,201 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_SCB_H
+#define _IPS_SCB_H
+
+#include "psm2_mock_testing.h"
+#include "psm_user.h"
+#include "ips_proto_header.h"
+
+/* ips_alloc_scb flags */
+#define IPS_SCB_FLAG_NONE	0x0
+#define IPS_SCB_FLAG_ADD_BUFFER 0x1
+
+/* macros to update scb */
+#define ips_scb_opcode(scb)    scb->opcode
+#define ips_scb_buffer(scb)    scb->payload
+#define ips_scb_length(scb)    scb->payload_size
+#define ips_scb_flags(scb)     scb->scb_flags
+#define ips_scb_dma_cntr(scb)  scb->dma_cntr
+#define ips_scb_epaddr(scb)    scb->epaddr
+#define ips_scb_cb(scb)        scb->callback
+#define ips_scb_cb_param(scb)  scb->cb_param
+
+#define ips_scb_copy_tag(dst, src)			\
+				(dst)[0] = (src)[0];	\
+				(dst)[1] = (src)[1];	\
+				(dst)[2] = (src)[2];
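+
+/* Note: ips_scb_copy_tag expands to three separate statements, so it must
+ * not be used as the unbraced body of an if/else. A minimal sketch of a
+ * safe call (hypothetical variables):
+ *
+ *   uint32_t dst_tag[3], src_tag[3] = { 1, 2, 3 };
+ *   ips_scb_copy_tag(dst_tag, src_tag);
+ */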
+
+struct ips_scbbuf;
+struct ips_scb;
+struct ips_scbctrl;
+struct ips_tid_send_desc;
+
+typedef void (*ips_scbctrl_avail_callback_fn_t) (struct ips_scbctrl *,
+						 void *context);
+
+STAILQ_HEAD(ips_scb_stailq, ips_scb);
+SLIST_HEAD(ips_scb_slist, ips_scb);
+
+struct ips_scbctrl {
+	/* const psmi_context_t *context; */
+
+	/* Send control blocks for each send */
+	uint32_t scb_num;
+	uint32_t scb_num_cur;
+	SLIST_HEAD(scb_free, ips_scb) scb_free;
+	void *scb_base;
+	ips_scbctrl_avail_callback_fn_t scb_avail_callback;
+	void *scb_avail_context;
+
+	/* Immediate data for send buffers */
+	uint32_t scb_imm_size;
+	void *scb_imm_buf;
+	psmi_timer *timers;	/* ack/send timers */
+
+	/*
+	 * Send buffers (or bounce buffers) to keep user data if we need to
+	 * retransmit.
+	 */
+	uint32_t sbuf_num;
+	uint32_t sbuf_num_cur;
+	SLIST_HEAD(sbuf_free, ips_scbbuf) sbuf_free;
+	void *sbuf_buf_alloc;
+	uint32_t sbuf_buf_size;
+	void *sbuf_buf_base;
+	void *sbuf_buf_last;
+};
+
+struct ips_scbbuf {
+	SLIST_ENTRY(ips_scbbuf) next;
+};
+
+typedef struct ips_scb ips_scb_t;
+
+struct ips_scb {
+	union {
+		SLIST_ENTRY(ips_scb) next;
+		STAILQ_ENTRY(ips_scb) nextq;
+	};
+	union {
+		void *payload;				// used for UD and UDP
+		struct ips_scbbuf *sbuf;	// linkage for free scb's
+	};
+	uint64_t ack_timeout;	/* in cycles  */
+	uint64_t abs_timeout;	/* in cycles  */
+
+	psmi_timer *timer_send;	/* for sending packets */
+	psmi_timer *timer_ack;	/* for acking packets */
+
+	/* Used when composing packet */
+	psmi_seqnum_t seq_num;
+	uint32_t cksum[2];
+	uint32_t scb_flags;
+	uint32_t payload_size;	/* remaining first packet size */
+	uint32_t chunk_size;	/* total buffer size if nfrag > 1 */
+	/* initially chunk_size_remaining = chunk_size. */
+	uint32_t chunk_size_remaining; /* buffer size to re-transmit */
+	uint16_t nfrag;		/* total packets in sequence */
+	/* initially nfrag_remaining = nfrag */
+	uint16_t nfrag_remaining; /* number packets to re-transmit */
+	uint16_t dma_complete;
+	uint16_t tidctrl;
+	uint16_t frag_size;	/* max packet size in sequence */
+	uint16_t opcode;
+	psm2_verbs_mr_t mr;
+	struct ips_flow *flow;
+	struct ips_tid_send_desc *tidsendc;
+
+	struct ips_scbctrl *scbc;
+	void *imm_payload;
+
+	union {
+		int (*callback) (void *, uint32_t);
+		psm2_am_completion_fn_t completion_am;
+	};
+	void *cb_param;
+#ifdef PSM_CUDA
+	psm2_mq_req_t mq_req;		/* back pointer to original request */
+#endif
+	struct {
+		struct ips_message_header ips_lrh;
+	} PSMI_CACHEALIGN;
+};
+
+
+#ifdef PSM_CUDA
+#define IS_TRANSFER_BUF_GPU_MEM(scb) (ips_scb_flags(scb) & IPS_SEND_FLAG_PAYLOAD_BUF_GPU)
+#endif
+
+void ips_scbctrl_free(ips_scb_t *scb);
+int ips_scbctrl_bufalloc(ips_scb_t *scb);
+int ips_scbctrl_avail(struct ips_scbctrl *scbc);
+ips_scb_t *MOCKABLE(ips_scbctrl_alloc)(struct ips_scbctrl *scbc,
+				int scbnum, int len, uint32_t flags);
+MOCK_DCL_EPILOGUE(ips_scbctrl_alloc);
+ips_scb_t *MOCKABLE(ips_scbctrl_alloc_tiny)(struct ips_scbctrl *scbc);
+MOCK_DCL_EPILOGUE(ips_scbctrl_alloc_tiny);
+
+psm2_error_t ips_scbctrl_init(const psmi_context_t *context,
+			     uint32_t numscb, uint32_t numbufs,
+			     uint32_t imm_size, uint32_t bufsize,
+			     ips_scbctrl_avail_callback_fn_t,
+			     void *avail_context, struct ips_scbctrl *);
+psm2_error_t ips_scbctrl_fini(struct ips_scbctrl *);
+
+psm2_error_t ips_scbctrl_writev(struct ips_scb_slist *slist, int fd);
+
+#endif /* _IPS_SCB_H */
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_stats.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_stats.h
new file mode 100644
index 0000000000000000000000000000000000000000..046e0c38a2fe220597f479b8f31f3fdd614d90c5
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_stats.h
@@ -0,0 +1,83 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_STATS_H
+#define _IPS_STATS_H
+
+struct psm2_epaddr;		/* for non-PSM clients */
+
+/* Old stats */
+typedef struct {
+	uint64_t err_chk_send;
+	uint64_t err_chk_recv;
+	uint64_t send_failed;
+	uint64_t recv_dropped;
+	union {
+		uint64_t recv_copied;	/* obsolete */
+		uint64_t nak_sent;
+	};
+	uint64_t nak_recv;
+	uint64_t total_send_eager;
+	uint64_t total_send_exp;
+	uint64_t acks_sent;
+	uint64_t retransmits;
+	uint64_t recv_matched;
+	uint64_t recv_unmatched;
+	uint64_t scb_alloc_yields;
+} ips_sess_stat;
+
+int ips_get_stat(struct psm2_epaddr *epaddr, ips_sess_stat *stats);
+
+#endif /* _IPS_STATS_H */
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_subcontext.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_subcontext.h
new file mode 100644
index 0000000000000000000000000000000000000000..4f5afcbab0e534b8770edd926f48204b260c4a70
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_subcontext.h
@@ -0,0 +1,79 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef __IPS_SUBCONTEXT_H
+#define __IPS_SUBCONTEXT_H
+
+#include "psm_user.h"
+#include "ips_recvhdrq.h"
+#include "ips_writehdrq.h"
+
+/* This data structure is allocated in the ureg page of each subcontext process */
+
+struct ips_subcontext_ureg {
+	/* head/eager head/tail register storage, one per cacheline
+	 * (member is unused by PSM, but needed here to match driver structures). */
+	uint64_t subcontext_uregbase[40 /* i.e. ur_maxreg * 8 */];
+	struct ips_writehdrq_state writeq_state;	/* used in all ureg pages */
+} __attribute__ ((aligned(64)));
+
+struct ips_hwcontext_ctrl {
+	pthread_spinlock_t context_lock;	/* lock shared by all subctxts */
+	struct ips_recvhdrq_state recvq_state;	/* state shared by all subctxts */
+	uint32_t rx_hdrq_rhf_seq;               /* rhf seq for the hw hdrq shared
+						   by all subctxts */
+} __attribute__ ((aligned(64)));
+
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_tid.c b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_tid.c
new file mode 100644
index 0000000000000000000000000000000000000000..e7349dde133780340334b0f029e15d348382a984
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_tid.c
@@ -0,0 +1,55 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_tid.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_tid.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d31defc872e6302c40574bf1fad150f1d234cf7
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_tid.h
@@ -0,0 +1,61 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/* included header files  */
+
+#ifndef _IPS_TID_H
+#define _IPS_TID_H
+
+#endif /* _IPS_TID_H */
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_tidcache.c b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_tidcache.c
new file mode 100644
index 0000000000000000000000000000000000000000..f7588b83fe020e067348045e4a52df669b2ef28d
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_tidcache.c
@@ -0,0 +1,53 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_tidcache.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_tidcache.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d31284427e2720b5486f4f1bcb0a5cd6931d9b9
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_tidcache.h
@@ -0,0 +1,158 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef _IPS_TIDCACHE_H
+#define _IPS_TIDCACHE_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+
+/*
+ * Design notes.
+ *
+ * PSM needs to call into the driver to program receive buffer pages into
+ * HFI gen1 hardware; each tid can be programmed with physically contiguous
+ * power-of-two page ranges, from 1 page to 512 pages. This procedure is
+ * expensive.
+ *
+ * Many applications tend to re-use the same receive buffers, so caching
+ * the programmed tids in the user-space process saves time and improves
+ * application performance.
+ *
+ * This PSM tid registration caching design requires cooperation between
+ * PSM and the driver. Here is what happens between the two:
+ *
+ * 1. PSM calls into the driver with a chunk of buffer, identified by
+ *    virtual address and length.
+ * 2. The driver pins the buffer pages, programs the hardware with the
+ *    physical pages, and obtains a list of tids.
+ * 3. The driver caches each tid with its corresponding user-space virtual
+ *    address, and returns the list of tids back to PSM.
+ * 4. PSM also caches the list of tids with the corresponding virtual
+ *    address for each tid, and uses the list of tids for transmission.
+ * 5. When the process frees a buffer, the kernel VM catches the event and
+ *    calls the callback in the driver to notify it that the virtual
+ *    address range is gone from the process.
+ * 6. The driver searches its cache for the tids with the removed virtual
+ *    address, puts these tids in an invalidation queue, and notifies PSM
+ *    of the event.
+ * 7. PSM picks up the event and removes the tids from its own cache as
+ *    well.
+ * 8. PSM must check for such invalidation events every time before
+ *    searching its cache to match tids for a 'new' buffer chunk.
+ * 9. When the cache is full and a new buffer chunk must be registered,
+ *    PSM picks a victim to evict.
+ */
+
+typedef struct
+{
+	unsigned long		start;		/* start virtual address */
+	uint32_t		tidinfo;	/* tid encoding */
+	uint16_t		length;		/* length in pages */
+	uint16_t		invalidate;	/* invalidate flag */
+	uint16_t		refcount;	/* usage reference count */
+	uint16_t		i_prev;		/* idle queue previous */
+	uint16_t		i_next;		/* idle queue next */
+} rbtree_tidcache_mapitem_pl_t;
+
+typedef struct {
+	uint32_t		ntid;		/* number of tids cached */
+	uint32_t		nidle;		/* number of idle tids */
+} rbtree_tidcache_map_pl_t;
+
+#define RBTREE_MI_PL  rbtree_tidcache_mapitem_pl_t
+#define RBTREE_MAP_PL rbtree_tidcache_map_pl_t
+
+#include "psm3_rbtree.h"
+
+/*
+ * Macro definitions for convenient access to the map payload fields.
+ */
+
+#define NTID			p_map->payload.ntid
+#define REFCNT(x)		p_map->root[x].payload.refcount
+#define INVALIDATE(x)		p_map->root[x].payload.invalidate
+
+#define LENGTH(x)		p_map->root[x].payload.length
+#define START(x)		p_map->root[x].payload.start
+#define END(x)			(START(x) + (LENGTH(x)<<12))
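+
+/* LENGTH(x) is measured in 4 KB pages, so END(x) shifts by 12 to convert
+ * pages to bytes when computing the end address. */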
+
+/*
+ * Macro for idle tid queue management.
+ */
+#define NIDLE			p_map->payload.nidle
+#define IHEAD			0
+#define INEXT(x)		p_map->root[x].payload.i_next
+#define IPREV(x)		p_map->root[x].payload.i_prev
+
+#define IDLE_REMOVE(x)		do {					\
+					INEXT(IPREV(x)) = INEXT(x);	\
+					IPREV(INEXT(x)) = IPREV(x);	\
+					NIDLE--;			\
+				} while (0)
+
+#define	IDLE_INSERT(x)		do {					\
+					INEXT(x) = INEXT(IHEAD);	\
+					IPREV(x) = IHEAD;		\
+					IPREV(INEXT(IHEAD)) = x;	\
+					INEXT(IHEAD) = x;		\
+					NIDLE++;			\
+				} while (0)
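+
+/*
+ * The idle queue is a circular doubly-linked list threaded through the
+ * rbtree nodes by index, with node 0 (IHEAD) as the sentinel. A small
+ * sketch with hypothetical indexes:
+ *
+ *   IDLE_INSERT(5);   // 0 <-> 5 <-> 0,        NIDLE == 1
+ *   IDLE_INSERT(9);   // 0 <-> 9 <-> 5 <-> 0,  NIDLE == 2
+ *   IDLE_REMOVE(5);   // 0 <-> 9 <-> 0,        NIDLE == 1
+ */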
+
+extern void ips_tidcache_map_init(cl_qmap_t		*p_map,
+				  cl_map_item_t* const	root,
+				  cl_map_item_t* const	nil_item);
+
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_tidflow.c b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_tidflow.c
new file mode 100644
index 0000000000000000000000000000000000000000..61d97adaa60a20811e6320387c1bec7023e25a26
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_tidflow.c
@@ -0,0 +1,274 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "ips_proto.h"
+#include "ips_expected_proto.h"
+#include "ips_tidflow.h"
+
+psm2_error_t ips_tf_init(struct ips_protoexp *protoexp,
+			const psmi_context_t *context,
+			struct ips_tf *tfc,
+			ips_tf_avail_cb_fn_t cb)
+{
+	int tf_idx;
+
+#if TF_ADD
+	struct psmi_stats_entry entries[] = {
+		PSMI_STATS_DECL("tidflow_update_count",
+				MPSPAWN_STATS_REDUCTION_ALL,
+				NULL, &tfc->tf_num_total),
+	};
+#endif
+
+	tfc->context = context;
+	tfc->tf_num_total = 0;
+	tfc->tf_num_inuse = 0;
+	tfc->tf_avail_cb = cb;
+	tfc->tf_avail_context = (void *)protoexp;
+	tfc->tf_gen_mask = 0xFFFFF;
+
+	/* Allocate and initialize the tidrecvc array. */
+	tfc->tidrecvc = (struct ips_tid_recv_desc *)
+		psmi_calloc(context->ep, UNDEFINED, 1,
+			sizeof(struct ips_tid_recv_desc)*HFI_TF_NFLOWS);
+	if (tfc->tidrecvc == NULL)
+		return PSM2_NO_MEMORY;
+
+	for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) {
+		tfc->tidrecvc[tf_idx].context = context;
+		tfc->tidrecvc[tf_idx].protoexp = protoexp;
+		tfc->tidrecvc[tf_idx].rdescid._desc_idx = tf_idx;
+		tfc->tidrecvc[tf_idx].rdescid._desc_genc = tf_idx;
+	}
+
+	/* Shared control structure: with context sharing it lives in shared
+	 * memory; otherwise we calloc() it here */
+	tfc->tf_ctrl = (struct ips_tf_ctrl *)context->tf_ctrl;
+	if (!tfc->tf_ctrl) {
+		tfc->tf_ctrl = (struct ips_tf_ctrl *)
+		    psmi_calloc(context->ep, UNDEFINED, 1,
+				sizeof(struct ips_tf_ctrl));
+		if (tfc->tf_ctrl == NULL) {
+			return PSM2_NO_MEMORY;
+		}
+	}
+
+	/*
+	 * Only the master process can initialize.
+	 */
+	{
+		pthread_spin_init(&tfc->tf_ctrl->tf_ctrl_lock,
+					PTHREAD_PROCESS_SHARED);
+		tfc->tf_ctrl->tf_num_max = HFI_TF_NFLOWS;
+		tfc->tf_ctrl->tf_num_avail = HFI_TF_NFLOWS;
+
+		for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) {
+// USE_RC TBD: this is bizarre. In native mode it works fine, but in
+// UD/UDP mode it crashes at the next_free assignment below on some
+// systems. Adding a debug print here, or moving the next_free assignment
+// to a separate loop, avoids the crash. A compiler issue would be really
+// odd, but no other explanation presents itself; we should be
+// single-threaded here. Enabling the empty call to tidflow_reset doesn't
+// help, and stubbing tidflow_reset out in native mode works fine, so
+// neither the crash nor the workaround is explained.
+			/* Update flow state */
+			tfc->tf_ctrl->tf[tf_idx].state = TF_STATE_DEALLOCATED;
+			tfc->tf_ctrl->tf[tf_idx].tf_idx = tf_idx;
+			tfc->tf_ctrl->tf[tf_idx].next_gen = 0;
+#if 0
+			tfc->tf_ctrl->tf[tf_idx].next_free = tf_idx + 1;
+#endif
+
+		}
+#if 1
+		for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) {
+			tfc->tf_ctrl->tf[tf_idx].next_free = tf_idx + 1;
+		}
+#endif
+		tfc->tf_ctrl->tf_head = 0;
+	}
+
+#if TF_ADD
+	/* TF_ADD: Add a new stats type for tid flows in psm_stats.h */
+	return psmi_stats_register_type("TID_Flow_Statistics",
+					PSMI_STATSTYPE_TIDS,
+					entries,
+					PSMI_STATS_HOWMANY(entries),
+					protoexp->proto->ep->epid, tfc,
+					protoexp->proto->ep->dev_name);
+#else
+	return PSM2_OK;
+#endif
+}
+
+psm2_error_t ips_tf_fini(struct ips_tf *tfc)
+{
+	psmi_stats_deregister_type(PSMI_STATSTYPE_TIDS, tfc);
+	if (!tfc->context->tf_ctrl)
+		psmi_free(tfc->tf_ctrl);
+	psmi_free(tfc->tidrecvc);
+	return PSM2_OK;
+}
+
+/* Allocate a tidflow */
+psm2_error_t ips_tf_allocate(struct ips_tf *tfc,
+		struct ips_tid_recv_desc **tidrecvc)
+{
+	struct ips_tf_ctrl *ctrl = tfc->tf_ctrl;
+	struct ips_tf_entry *entry;
+
+	if (tfc->context->tf_ctrl)
+		pthread_spin_lock(&ctrl->tf_ctrl_lock);
+
+	if (!ctrl->tf_num_avail) {
+		psmi_assert(ctrl->tf_head == HFI_TF_NFLOWS);
+		*tidrecvc = NULL;
+
+		if (tfc->context->tf_ctrl)
+			pthread_spin_unlock(&ctrl->tf_ctrl_lock);
+
+		return PSM2_EP_NO_RESOURCES;
+	}
+
+	entry = &ctrl->tf[ctrl->tf_head];
+	ctrl->tf_head = entry->next_free;
+	ctrl->tf_num_avail--;
+
+	if (tfc->context->tf_ctrl)
+		pthread_spin_unlock(&ctrl->tf_ctrl_lock);
+
+	tfc->tf_num_total++;
+	tfc->tf_num_inuse++;
+
+	psmi_assert(entry->state == TF_STATE_DEALLOCATED);
+	entry->state = TF_STATE_ALLOCATED;
+
+	*tidrecvc = &(tfc->tidrecvc[entry->tf_idx]);
+	/* initial tidflow generation */
+	(*tidrecvc)->tidflow_active_gen = entry->next_gen;
+
+	psmi_assert((*tidrecvc)->rdescid._desc_idx == entry->tf_idx);
+	psmi_assert_always(entry->next_gen < tfc->tf_gen_mask);
+
+	entry->next_gen++;
+	if (entry->next_gen == tfc->tf_gen_mask)
+		entry->next_gen = 0;
+
+	return PSM2_OK;
+}
+
+/* Deallocate a tidflow */
+psm2_error_t ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx)
+{
+	struct ips_tf_ctrl *ctrl = tfc->tf_ctrl;
+	struct ips_tf_entry *entry;
+
+	psmi_assert(tf_idx < HFI_TF_NFLOWS);
+	psmi_assert(tf_idx >= 0);
+
+	entry = &ctrl->tf[tf_idx];
+	psmi_assert(entry->state == TF_STATE_ALLOCATED);
+	entry->state = TF_STATE_DEALLOCATED;
+
+	/* We track all 32 bits to improve err_chk_rdma's ability to
+	 * distinguish completed requests from rdescid reuse; however, only
+	 * the low 16 bits are used in the RDMA immediate data.
+	 */
+	tfc->tidrecvc[tf_idx].rdescid.u32w1++;
+
+
+	if (tfc->context->tf_ctrl)
+		pthread_spin_lock(&ctrl->tf_ctrl_lock);
+
+	entry->next_free = ctrl->tf_head;
+	ctrl->tf_head = tf_idx;
+	ctrl->tf_num_avail++;
+
+	if (tfc->context->tf_ctrl)
+		pthread_spin_unlock(&ctrl->tf_ctrl_lock);
+
+	tfc->tf_num_inuse--;
+	/* If the flow pool had been exhausted and an avail callback is
+	 * registered, invoke it */
+	if (((tfc->tf_num_inuse + 1) == ctrl->tf_num_max) && tfc->tf_avail_cb)
+		tfc->tf_avail_cb(tfc, tfc->tf_avail_context);
+
+	return PSM2_OK;
+}
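+
+/*
+ * Sketch of a typical allocate/deallocate pairing (hypothetical caller;
+ * the real callers live in the expected-protocol code):
+ *
+ *   struct ips_tid_recv_desc *tidrecvc;
+ *   if (ips_tf_allocate(tfc, &tidrecvc) == PSM2_OK) {
+ *           ... receive using the flow ...
+ *           ips_tf_deallocate(tfc, tidrecvc->rdescid._desc_idx);
+ *   }
+ */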
+
+/* Allocate a generation for a flow */
+psm2_error_t ips_tfgen_allocate(struct ips_tf *tfc,
+			       uint32_t tf_idx, uint32_t *tfgen)
+{
+	struct ips_tf_entry *entry;
+	int ret = PSM2_OK;
+
+	psmi_assert(tf_idx < HFI_TF_NFLOWS);
+	psmi_assert(tf_idx >= 0);
+
+	entry = &tfc->tf_ctrl->tf[tf_idx];
+	psmi_assert(entry->state == TF_STATE_ALLOCATED);
+
+	*tfgen = entry->next_gen;
+
+	entry->next_gen++;
+	if (entry->next_gen == tfc->tf_gen_mask)
+		entry->next_gen = 0;
+
+	psmi_assert_always(*tfgen < tfc->tf_gen_mask);
+
+	return ret;
+}
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_tidflow.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_tidflow.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a93fb7b090ba06e5e2da0d26010dc63263df3b9
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_tidflow.h
@@ -0,0 +1,131 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2016 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2016 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_TIDFLOW_H
+#define _IPS_TIDFLOW_H
+
+#include "psm_user.h"
+
+#define TF_ADD 1	// enable code for tidflow statistics
+
+struct ips_tf;
+struct ips_protoexp;
+
+typedef void (*ips_tf_avail_cb_fn_t) (struct ips_tf *, void *context);
+typedef enum {
+	TF_STATE_INVALID = 0,
+	TF_STATE_ALLOCATED = 1,
+	TF_STATE_DEALLOCATED = 2
+} tf_state_t;
+
+struct ips_tf_entry {
+	tf_state_t state;
+	uint32_t tf_idx;
+	uint32_t next_gen;
+	uint32_t next_free;
+};
+
+struct ips_tf_ctrl {
+	pthread_spinlock_t tf_ctrl_lock;
+	uint32_t tf_num_max;
+	uint32_t tf_num_avail;
+	uint32_t tf_head;
+	struct ips_tf_entry tf[HFI_TF_NFLOWS];
+} __attribute__ ((aligned(64)));
+
+struct ips_tf {
+	const psmi_context_t *context;
+	ips_tf_avail_cb_fn_t tf_avail_cb;
+	void *tf_avail_context;
+	struct ips_tf_ctrl *tf_ctrl;
+
+	uint64_t tf_num_total;
+	uint32_t tf_num_inuse;
+	uint32_t tf_gen_mask;
+
+	/* Pointer to array of size HFI_TF_NFLOWS */
+	struct ips_tid_recv_desc *tidrecvc;
+};
+
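+/* Distinguishes "no flow free because this context holds them all" (-1)
+ * from "no flow free right now, still held by a sharing context" (0);
+ * otherwise returns the number of free tidflows. */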
+PSMI_ALWAYS_INLINE(int ips_tf_available(struct ips_tf *tf))
+{
+	if (tf->tf_ctrl->tf_num_avail == 0) {
+		if (tf->tf_ctrl->tf_num_max == tf->tf_num_inuse)
+			return -1;
+		else
+			return 0;
+	}
+
+	return tf->tf_ctrl->tf_num_avail;
+}
+
+psm2_error_t ips_tf_init(struct ips_protoexp *protoexp,
+			const psmi_context_t *context,
+			struct ips_tf *tfc,
+			ips_tf_avail_cb_fn_t cb);
+psm2_error_t ips_tf_fini(struct ips_tf *tfc);
+
+/* Allocate a tidflow */
+psm2_error_t ips_tf_allocate(struct ips_tf *tfc,
+			struct ips_tid_recv_desc **tidrecvc);
+
+/* Deallocate a tidflow */
+psm2_error_t ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx);
+
+/* Allocate a generation for a flow */
+psm2_error_t ips_tfgen_allocate(struct ips_tf *tfc,
+			uint32_t tf_idx, uint32_t *tfgen);
+
+#endif
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_writehdrq.c b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_writehdrq.c
new file mode 100644
index 0000000000000000000000000000000000000000..fc30d546d5ed7d49813f717cc57cfd716cb5adb9
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_writehdrq.c
@@ -0,0 +1,61 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "ips_writehdrq.h"
+#include "ips_proto_params.h"
+
+
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_writehdrq.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_writehdrq.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ad489a3805c1d37642a21f75ddd33d1aae92d65
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ips_writehdrq.h
@@ -0,0 +1,95 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_WRITEHDRQ_H
+#define _IPS_WRITEHDRQ_H
+
+#include "psm_user.h"
+#include "ips_recvq.h"
+
+/*
+ * Structure containing state for writehdrq writing. This is logically
+ * part of ips_writehdrq but needs to be separated out for context
+ * sharing so that it can be put in a shared memory page and hence
+ * be available to all processes sharing the port. Generally, do not
+ * put pointers in here since the address map of each process can be
+ * different.
+ */
+struct ips_writehdrq_state {
+	uint32_t hdrq_rhf_seq;	/* last seq */
+	uint32_t egrq_offset;	/* in bytes unit, not 64B */
+	uint32_t enabled;	/* enables writing */
+};
+
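+/*
+ * Illustrative sketch (not actual API; shared_page_base, state_offset and
+ * wq are illustrative names): each process maps the shared page and points
+ * its private ips_writehdrq at the state living there, e.g.
+ *
+ *   struct ips_writehdrq_state *st =
+ *       (struct ips_writehdrq_state *)(shared_page_base + state_offset);
+ *   wq->state = st;              (private struct, shared state)
+ *
+ * Storing plain integers (sequence numbers, byte offsets) instead of
+ * pointers keeps the state valid in every process's address space.
+ */
+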
+struct ips_writehdrq {
+	const psmi_context_t *context;
+	struct ips_writehdrq_state *state;
+	uint32_t hdrq_elemlast;
+};
+
+
+#endif /* _IPS_WRITEHDRQ_H */
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ptl.c b/deps/libfabric/prov/psm3/psm3/ptl_ips/ptl.c
new file mode 100644
index 0000000000000000000000000000000000000000..66dd6788107335c670d47bccf5ef3b0b863ed037
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ptl.c
@@ -0,0 +1,626 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/* This file implements the PSM PTL for ips */
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "ptl_ips.h"
+#include "psm_mq_internal.h"
+
+
+static size_t ips_ptl_sizeof(void)
+{
+	return sizeof(struct ptl_ips);
+}
+
+#if 0	// unused code, specific to QLogic MPI
+static
+int ips_ptl_epaddr_stats_num(void)
+{
+	return sizeof(struct ips_proto_epaddr_stats) / sizeof(uint64_t);
+}
+
+static
+int ips_ptl_epaddr_stats_init(char **desc, uint16_t *flags)
+{
+	int num_stats =
+	    sizeof(struct ips_proto_epaddr_stats) / sizeof(uint64_t);
+	int i;
+	int j=0;
+
+	/* All stats are uint64_t */
+	for (i = 0; i < num_stats; i++)
+		flags[i] = MPSPAWN_STATS_REDUCTION_ALL |
+		    MPSPAWN_STATS_SKIP_IF_ZERO;
+
+	desc[j++] = "errchecks send";
+	desc[j++] = "errchecks recv";
+#ifdef RNDV_MOD
+	desc[j++] = "err_chk_rdma send";
+	desc[j++] = "err_chk_rdma recv";
+#endif
+	desc[j++] = "nak send";
+	desc[j++] = "nak recv";
+	desc[j++] = "connect req send";
+	desc[j++] = "connect req recv";
+	desc[j++] = "connect rep send";
+	desc[j++] = "connect rep recv";
+	desc[j++] = "disconnect req send";
+	desc[j++] = "disconnect req recv";
+	desc[j++] = "disconnect rep send";
+	desc[j++] = "disconnect rep recv";
+	desc[j++] = "tid grants send";
+	desc[j++] = "tid grants recv";
+	desc[j++] = "send rexmit";
+#ifdef RNDV_MOD
+	desc[j++] = "rdma rexmit";
+#endif
+	desc[j++] = "congestion packets";
+
+	psmi_assert(num_stats == j);
+	return num_stats;
+}
+
+int ips_ptl_epaddr_stats_get(psm2_epaddr_t epaddr, uint64_t *stats_o)
+{
+	int i, num_stats =
+	    sizeof(struct ips_proto_epaddr_stats) / sizeof(uint64_t);
+	uint64_t *stats_i = (uint64_t *) &epaddr->proto->epaddr_stats;
+
+	for (i = 0; i < num_stats; i++)
+		stats_o[i] = stats_i[i];
+
+	return num_stats;
+}
+#endif // 0	// unused code, specific to QLogic MPI
+
+static
+psm2_error_t
+psmi_context_check_status_callback(struct psmi_timer *t, uint64_t current)
+{
+	struct ptl_ips *ptl = (struct ptl_ips *)t->context;
+	const uint64_t current_count = get_cycles();
+	psm2_error_t err;
+
+	err = psmi_context_check_status(ptl->context);
+	if (err == PSM2_OK || err == PSM2_OK_NO_PROGRESS)
+	{
+		int rc = psmi_hal_spio_process_events((struct ptl *)ptl);
+		err = rc >= 0 ? PSM2_OK : PSM2_INTERNAL_ERR;
+	}
+	psmi_timer_request_always(&ptl->timerq, &ptl->status_timer,
+				  current_count + ptl->status_cyc_timeout);
+
+	return err;
+}
+
+static
+psm2_error_t ips_ptl_init(const psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl)
+{
+	struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen;
+	psm2_error_t err = PSM2_OK;
+	uint32_t num_of_send_bufs = ep->hfi_num_sendbufs;
+	uint32_t num_of_send_desc = ep->hfi_num_descriptors;
+	uint32_t imm_size = ep->hfi_imm_size;
+	const psmi_context_t *context = &ep->context;
+	const int enable_shcontexts = 0;
+	const uint64_t current_count = get_cycles();
+
+	/* Preconditions */
+	psmi_assert_always(ep != NULL);
+	psmi_assert_always(ep->epaddr != NULL);
+	psmi_assert_always(ep->epid != 0);
+	psmi_assert_always(ep->hfi_num_sendbufs > 0);
+
+	memset(ptl, 0, sizeof(struct ptl_ips));
+
+	ptl->ep = ep;		/* back pointer */
+	ptl->epid = ep->epid;	/* cache epid */
+	ptl->epaddr = ep->epaddr;	/* cache a copy */
+	ptl->ctl = ctl;
+	ptl->context = context;
+
+	memset(ctl, 0, sizeof(*ctl));
+	/* Fill in the control structure */
+	ctl->ep = ep;
+	ctl->ptl = ptl_gen;
+	ctl->ep_poll = ips_ptl_poll;
+	ctl->ep_connect = ips_ptl_connect;
+	ctl->ep_disconnect = ips_ptl_disconnect;
+	ctl->mq_send = ips_proto_mq_send;
+	ctl->mq_isend = ips_proto_mq_isend;
+
+	ctl->am_get_parameters = ips_am_get_parameters;
+
+	ctl->am_short_request = ips_am_short_request;
+	ctl->am_short_reply = ips_am_short_reply;
+
+#if 0	// unused code, specific to QLogic MPI
+	ctl->epaddr_stats_num = ips_ptl_epaddr_stats_num;
+	ctl->epaddr_stats_init = ips_ptl_epaddr_stats_init;
+	ctl->epaddr_stats_get = ips_ptl_epaddr_stats_get;
+#endif
+
+	ctl->msg_size_thresh_query = ips_proto_msg_size_thresh_query;
+
+	/*
+	 * Runtime flags in 'ptl' are different from runtime flags in 'context'.
+	 * In 'context', runtime flags reflect what the driver is capable of.
+	 * In 'ptl', runtime flags reflect the features we can and want to use
+	 *           from the set the driver supports.
+	 */
+
+	/*
+	 * This timer is to be used to check the context's status at every
+	 * PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS.  This is useful to detect when
+	 * the link transitions from the DOWN state to the UP state.  We can thus
+	 * stop aggregating link failure messages once we detect that the link is
+	 * up.
+	 */
+	psmi_timer_entry_init(&ptl->status_timer,
+			      psmi_context_check_status_callback, ptl);
+
+	/* cache the context's status timeout in cycles */
+	ptl->status_cyc_timeout =
+	    ms_2_cycles(PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS);
+
+	/*
+	 * Retransmissions and pending operations are kept in a timer structure
+	 * (queue).  The timerq is shared to various internal IPS interfaces so
+	 * that they too may schedule events on the timer queue.  The timerq is
+	 * drained in the progress function.
+	 */
+	if ((err = psmi_timer_init(&ptl->timerq)))
+		goto fail;
+
+	/* start the context's status timer */
+	psmi_timer_request_always(&ptl->timerq, &ptl->status_timer,
+				  current_count + ptl->status_cyc_timeout);
+
+	/*
+	 * Epstate maps endpoint ids (epid integers) to ipsaddr (structs). Mappings
+	 * are added/removed by the connect portion of the ips protocol and lookup
+	 * is made by the receive queue processing component.
+	 */
+	if ((err = ips_epstate_init(&ptl->epstate, context)))
+		goto fail;
+
+
+	/*
+	 * Actual ips protocol handling.
+	 */
+	if ((err =
+	     ips_proto_init(context, ptl_gen, num_of_send_bufs, num_of_send_desc,
+			    imm_size, &ptl->timerq, &ptl->epstate, ptl->spioc,
+			    &ptl->proto)))
+		goto fail;
+
+	/*
+	 * Hardware receive hdr/egr queue, services incoming packets and issues
+	 * callbacks for protocol handling in proto_recv.  It uses the epstate
+	 * interface to determine if a packet is known or unknown.
+	 */
+	if (!enable_shcontexts) {
+		struct ips_recvhdrq_callbacks recvq_callbacks;
+		recvq_callbacks.callback_packet_unknown =
+		    ips_proto_process_unknown;
+		if ((err =
+		     ips_recvhdrq_init(context, &ptl->epstate, &ptl->proto,
+				       &recvq_callbacks,
+				       &ptl->recvq
+						)))
+			goto fail;
+	}
+
+	/*
+	 * Receive thread: always initialized, but it does not necessarily
+	 * create a pthread.
+	 */
+	if ((err = ips_ptl_rcvthread_init(ptl_gen, &ptl->recvq)))
+		goto fail;
+fail:
+	return err;
+}
+
+static psm2_error_t ips_ptl_fini(ptl_t *ptl_gen, int force, uint64_t timeout_in)
+{
+	struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen;
+	psm2_error_t err = PSM2_OK;
+
+	if ((err = ips_proto_fini(&ptl->proto, force, timeout_in)))
+		goto fail;
+
+	/* We have to cancel the thread after terminating the protocol because
+	 * connect/disconnect packets use interrupts and the kernel doesn't
+	 * like to have no pollers waiting */
+	if ((err = ips_ptl_rcvthread_fini(ptl_gen)))
+		goto fail;
+
+	if ((err = ips_epstate_fini(&ptl->epstate)))
+		goto fail;
+
+
+	if ((err = psmi_timer_fini(&ptl->timerq)))
+		goto fail;
+
+
+
+fail:
+	return err;
+}
+
+static
+psm2_error_t
+ips_ptl_optctl(const void *core_obj, int optname,
+	       void *optval, uint64_t *optlen, int get)
+{
+	psm2_error_t err = PSM2_OK;
+
+	switch (optname) {
+	case PSM2_IB_OPT_EP_SL:
+		{
+			/* Core object is psm2_epaddr */
+			psm2_epaddr_t epaddr = (psm2_epaddr_t) core_obj;
+			ips_epaddr_t *ipsaddr = (ips_epaddr_t *) epaddr;
+
+			/* If endpoint does not use IB ignore for set, complain for get */
+			if (epaddr->ptlctl->ep_connect != ips_ptl_connect) {
+				if (get)
+					err =
+					    psmi_handle_error(PSMI_EP_LOGEVENT,
+							      PSM2_PARAM_ERR,
+							      "Invalid EP transport");
+				goto exit_fn;
+			}
+
+			/* Sanity check option length */
+			if (*optlen < sizeof(uint8_t)) {
+				err =
+				    psmi_handle_error(PSMI_EP_LOGEVENT,
+						      PSM2_PARAM_ERR,
+						      "Option value length error");
+				*optlen = sizeof(uint8_t);
+				goto exit_fn;
+			}
+
+			if (get) {
+				/* Get returns the SL for the PIO flow */
+				*((uint8_t *) optval) =
+				    (uint8_t) ipsaddr->
+				    flows[EP_FLOW_GO_BACK_N_PIO].path->pr_sl;
+			} else {
+				uint16_t new_sl;
+
+				/* Sanity check if SL is within range */
+				new_sl = (uint16_t) *(uint8_t *) optval;
+				if (new_sl > PSMI_SL_MAX) {
+					err =
+					    psmi_handle_error(PSMI_EP_LOGEVENT,
+						      PSM2_PARAM_ERR,
+						      "Invalid SL value %u. %d <= SL <= %d.",
+						      new_sl, PSMI_SL_MIN, PSMI_SL_MAX);
+					goto exit_fn;
+				}
+
+				/* Set new SL for all flows */
+				ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].path->
+				    pr_sl = new_sl;
+			}
+		}
+		break;
+	case PSM2_IB_OPT_DF_SL:
+		{
+			/* Set default SL to be used by an endpoint for all communication */
+			/* Core object is psm2_epaddr */
+			psm2_ep_t ep = (psm2_ep_t) core_obj;
+
+			/* Make sure ep is specified */
+			if (!ep) {
+				err =
+				    psmi_handle_error(PSMI_EP_LOGEVENT,
+						      PSM2_PARAM_ERR,
+						      "Invalid PSM Endpoint");
+				goto exit_fn;
+			}
+
+			/* Sanity check option length */
+			if (*optlen < sizeof(uint8_t)) {
+				err =
+				    psmi_handle_error(PSMI_EP_LOGEVENT,
+						      PSM2_PARAM_ERR,
+						      "Option value length error");
+				*optlen = sizeof(uint8_t);
+				goto exit_fn;
+			}
+
+			if (get) {
+				*((uint8_t *) optval) =
+					((struct ptl_ips *)(ep->ptl_ips.ptl))->proto.epinfo.ep_sl;
+			} else {
+				uint16_t new_sl;
+
+				/* Sanity check if SL is within range */
+				new_sl = (uint16_t) *(uint8_t *) optval;
+				if (new_sl > PSMI_SL_MAX) {
+					err =
+					    psmi_handle_error(PSMI_EP_LOGEVENT,
+						      PSM2_PARAM_ERR,
+						      "Invalid SL value %u. %d <= SL <= %d.",
+						      new_sl, PSMI_SL_MIN, PSMI_SL_MAX);
+					goto exit_fn;
+				}
+
+				((struct ptl_ips *)(ep->ptl_ips.ptl))->proto.epinfo.ep_sl =
+				    (uint8_t) new_sl;
+			}
+		}
+		break;
+	default:
+		err =
+		    psmi_handle_error(NULL, PSM2_PARAM_ERR,
+				      "Unknown PSM3_IB option %u.", optname);
+	}
+
+exit_fn:
+	return err;
+}
+
+static
+psm2_error_t
+ips_ptl_setopt(const void *component_obj, int optname,
+	       const void *optval, uint64_t optlen)
+{
+	return ips_ptl_optctl(component_obj, optname, (void *)optval, &optlen,
+			      0);
+}
+
+static
+psm2_error_t
+ips_ptl_getopt(const void *component_obj, int optname,
+	       void *optval, uint64_t *optlen)
+{
+	return ips_ptl_optctl(component_obj, optname, optval, optlen, 1);
+}
+
+static
+uint32_t
+ips_ptl_rcvthread_is_enabled(const ptl_t *ptl)
+{
+	return psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED);
+}
+
+psm2_error_t ips_ptl_poll(ptl_t *ptl_gen, int _ignored)
+{
+	struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen;
+	const uint64_t current_count = get_cycles();
+	const int do_lock = PSMI_LOCK_DISABLED &&
+		psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED);
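+	/* When the global progress lock is compiled out (PSMI_LOCK_DISABLED)
+	 * and a receive thread is running, guard the recvq with its own lock;
+	 * otherwise the caller's progress lock already serializes access. */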
+	psm2_error_t err = PSM2_OK_NO_PROGRESS;
+	psm2_error_t err2;
+
+	if (do_lock && !ips_recvhdrq_trylock(&ptl->recvq))
+		return err;
+	err = ips_recvhdrq_progress(&ptl->recvq);
+	if (do_lock)
+		ips_recvhdrq_unlock(&ptl->recvq);
+	if_pf(err > PSM2_OK_NO_PROGRESS)
+		return err;
+	err2 = psmi_timer_process_if_expired(&(ptl->timerq), current_count);
+	if (err2 != PSM2_OK_NO_PROGRESS)
+		return err2;
+	else {
+		// TBD - where to best poll for this
+		(void)psm2_verbs_completion_update(ptl->ep);
+		return err;
+	}
+}
+
+
+
+
+/*
+ * Legacy ips_get_stat -- do nothing.
+ */
+int ips_get_stat(psm2_epaddr_t epaddr, ips_sess_stat *stats)
+{
+	memset(stats, 0, sizeof(ips_sess_stat));
+	return 0;
+}
+
+
+
+psm2_error_t
+ips_ptl_connect(ptl_t *ptl_gen, int numep, const psm2_epid_t *array_of_epid,
+		const int *array_of_epid_mask, psm2_error_t *array_of_errors,
+		psm2_epaddr_t *array_of_epaddr, uint64_t timeout_in)
+{
+	struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen;
+	psm2_error_t err;
+	psm2_ep_t ep;
+	psm2_epid_t *epid_array = NULL;
+	psm2_error_t *error_array = NULL;
+	psm2_epaddr_t *epaddr_array = NULL;
+	ips_epaddr_t *ipsaddr_master, *ipsaddr;
+	int *mask_array = NULL;
+	int i;
+
+	PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock);
+	err = ips_proto_connect(&ptl->proto, numep, array_of_epid,
+				array_of_epid_mask, array_of_errors,
+				array_of_epaddr, timeout_in);
+	if (err)
+		return err;
+
+	psmi_assert_always(ptl->ep->mctxt_master == ptl->ep);
+	if (ptl->ep->mctxt_next == ptl->ep)
+		return err;
+
+	/* Make the additional multi-context connections. */
+	epid_array = (psm2_epid_t *)
+	    psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm2_epid_t) * numep);
+	mask_array = (int *)
+	    psmi_malloc(ptl->ep, UNDEFINED, sizeof(int) * numep);
+	error_array = (psm2_error_t *)
+	    psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm2_error_t) * numep);
+	epaddr_array = (psm2_epaddr_t *)
+	    psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm2_epaddr_t) * numep);
+	if (!epid_array || !mask_array || !error_array || !epaddr_array) {
+		err = PSM2_NO_MEMORY;
+		goto fail;
+	}
+
+	ep = ptl->ep->mctxt_next;
+	while (ep != ep->mctxt_master) {
+
+		/* Setup the mask array and epid array. */
+		for (i = 0; i < numep; i++) {
+			if (array_of_epid_mask[i]
+			    && array_of_errors[i] == PSM2_OK) {
+				ipsaddr_master =
+				    (ips_epaddr_t *) array_of_epaddr[i];
+				ipsaddr = ipsaddr_master->next;
+				mask_array[i] = 0;
+				while (ipsaddr != ipsaddr_master) {
+					if (((psm2_epaddr_t) ipsaddr)->proto->
+					    ep == ep) {
+						mask_array[i] = 1;
+						epid_array[i] =
+						    ((psm2_epaddr_t) ipsaddr)->
+						    epid;
+						break;
+					}
+					ipsaddr = ipsaddr->next;
+				}
+			} else {
+				mask_array[i] = 0;
+			}
+		}
+
+		/* Make the real protocol connections. */
+		err =
+			ips_proto_connect(&((struct ptl_ips *)(ep->ptl_ips.ptl))->proto,
+					  numep, epid_array, mask_array, error_array,
+					  epaddr_array, timeout_in);
+		if (err)
+			goto fail;
+
+		ep = ep->mctxt_next;
+	}
+
+fail:
+	if (epid_array)
+		psmi_free(epid_array);
+	if (mask_array)
+		psmi_free(mask_array);
+	if (error_array)
+		psmi_free(error_array);
+	if (epaddr_array)
+		psmi_free(epaddr_array);
+
+	return err;
+}
+
+psm2_error_t
+ips_ptl_disconnect(ptl_t *ptl_gen, int force, int numep,
+		   psm2_epaddr_t array_of_epaddr[],
+		   const int array_of_epaddr_mask[],
+		   psm2_error_t array_of_errors[], uint64_t timeout_in)
+{
+	struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen;
+	int *array_of_epaddr_mask_internal, i;
+	psm2_error_t err;
+
+	/*
+	 * Copy the true values from array_of_epaddr_mask, provided that
+	 * their respective epaddr is an ips one.  The newly created mask
+	 * is then used for the protocol disconnect call instead of the
+	 * caller-supplied one.
+	 */
+	PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock);
+	array_of_epaddr_mask_internal = psmi_calloc(ptl->ep, UNDEFINED,
+						    sizeof(int), numep);
+	if (!array_of_epaddr_mask_internal)
+		return PSM2_NO_MEMORY;
+
+	for (i = 0; i < numep; ++i) {
+		if (array_of_epaddr_mask[i] && array_of_epaddr[i]
+		    && array_of_epaddr[i]->ptlctl->ptl == ptl_gen) {
+			array_of_epaddr_mask_internal[i] = 1;
+		}
+	}
+
+	err = ips_proto_disconnect(&ptl->proto, force, numep, array_of_epaddr,
+				   array_of_epaddr_mask_internal,
+				   array_of_errors, timeout_in);
+
+	psmi_free(array_of_epaddr_mask_internal);
+	return err;
+}
+
+/* Only symbol we expose out of here */
+struct ptl_ctl_init
+psmi_ptl_ips = {
+	ips_ptl_sizeof, ips_ptl_init, ips_ptl_fini, ips_ptl_setopt,
+	    ips_ptl_getopt
+};
+
+struct ptl_ctl_rcvthread
+psmi_ptl_ips_rcvthread = {
+	ips_ptl_rcvthread_is_enabled,
+	ips_ptl_rcvthread_transfer_ownership,
+};
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ptl_fwd.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ptl_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7742609f3ab7a9dbf003f0922da1b91a6a7a66d
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ptl_fwd.h
@@ -0,0 +1,67 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PTL_FWD_IPS_H
+#define _PTL_FWD_IPS_H
+#include "ptl.h"
+
+typedef struct ips_epaddr ips_epaddr_t;
+typedef struct ips_msgctl ips_msgctl_t;
+
+/* Symbol in ips ptl */
+extern struct ptl_ctl_init psmi_ptl_ips;
+
+extern struct ptl_ctl_rcvthread psmi_ptl_ips_rcvthread;
+#endif /* _PTL_FWD_IPS_H */
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ptl_ips.h b/deps/libfabric/prov/psm3/psm3/ptl_ips/ptl_ips.h
new file mode 100644
index 0000000000000000000000000000000000000000..185f0ec0791162953cdb8d3b8a11dd39f3ae9e66
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ptl_ips.h
@@ -0,0 +1,172 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _IPS_PTL_H
+#define _IPS_PTL_H
+
+#include "psm_user.h"
+
+#include "ips_proto.h"
+#include "ips_stats.h"
+#include "ips_subcontext.h"
+
+struct ptl_shared;
+
+/*
+ * PTL at the ips level (for OPA)
+ *
+ * This PTL structure glues all the ips components together.
+ *
+ * * ips timer, shared by various components, allows each component to
+ *   schedule time-based expiration callbacks on the timerq.
+ * * HW receive queue
+ * * send control block to handle eager messages
+ * * instantiation of the ips protocol
+ * * endpoint state, to map endpoint indexes into structures
+ *
+ *   Receive-side
+ *
+ *          ----[   proto    ]
+ *         /       ^      ^
+ *        |        |      |
+ *        |     packet  packet
+ *        |	known   unknown
+ *   add_endpt      \ /
+ *        |          |
+ *        `----> [epstate]
+ *                   ^
+ *                   |
+ *               lookup_endpt
+ *                   |
+ *                [recvq]
+ *                   |
+ *                 poll
+ *
+ */
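+/*
+ * A minimal sketch of the receive path implied by the diagram above
+ * (simplified names, not the actual internal signatures):
+ *
+ *   pkt = recvq_next();                  <- poll on [recvq]
+ *   ipsaddr = epstate_lookup(pkt.epid);  <- lookup_endpt
+ *   if (ipsaddr)
+ *       proto_process_packet(&pkt);      <- packet known
+ *   else
+ *       callbacks.packet_unknown(&pkt);  <- packet unknown
+ */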
+/* Updates to this struct must be reflected in PTL_IPS_SIZE in ptl_fwd.h */
+/* IPS knows it functions as a PTL whenever ptl->ep is non-NULL */
+struct ptl_ips {
+	psm2_ep_t ep;		/* back ptr */
+	psm2_epid_t epid;	/* cached from ep */
+	psm2_epaddr_t epaddr;	/* cached from ep */
+	ips_epaddr_t *ipsaddr;	/* cached from epaddr */
+	ptl_ctl_t *ctl;		/* cached from init */
+	const psmi_context_t *context;	/* cached from init */
+
+	void *spioc;	        /* PIO send control (opaque ptr) */
+	struct ips_proto proto;	/* protocol instance: timerq, epstate, spio */
+
+	struct psmi_timer_ctrl timerq;
+	struct ips_epstate epstate;	/* map incoming packets */
+	struct ips_recvhdrq_state recvq_state;
+	struct ips_recvhdrq recvq;	/* HW recvq: epstate, proto */
+
+	/* timer to check the context's status */
+	struct psmi_timer status_timer;
+
+	/* context's status check timeout in cycles -- cached */
+	uint64_t status_cyc_timeout;
+	/* Shared contexts context */
+	struct ptl_shared *recvshc;
+	/* Rcv thread context */
+	struct ptl_rcvthread *rcvthread;
+}
+#ifndef PACK_STRUCT_STL
+#define PACK_STRUCT_STL /* nothing */
+#endif
+ __attribute__ ((PACK_STRUCT_STL aligned(16)));
+
+
+/*
+ * Connect/disconnect are wrappers around psm proto's connect/disconnect,
+ * mostly to abstract away PSM-specific stuff from ips internal structures
+ */
+psm2_error_t ips_ptl_connect(ptl_t *ptl, int numep,
+			    const psm2_epid_t *array_of_epid,
+			    const int *array_of_epid_mask,
+			    psm2_error_t *array_of_errors,
+			    psm2_epaddr_t *array_of_epaddr,
+			    uint64_t timeout_in);
+
+psm2_error_t ips_ptl_disconnect(ptl_t *ptl, int force, int numep,
+			       psm2_epaddr_t array_of_epaddr[],
+			       const int array_of_epaddr_mask[],
+			       psm2_error_t array_of_errors[],
+			       uint64_t timeout_in);
+
+/*
+ * Generic Poll function for ips-level ptl
+ */
+psm2_error_t ips_ptl_poll(ptl_t *ptl, int _ignored);
+
+/*
+ * Support for receive thread
+ */
+psm2_error_t ips_ptl_rcvthread_init(ptl_t *ptl, struct ips_recvhdrq *recvq);
+psm2_error_t ips_ptl_rcvthread_fini(ptl_t *ptl);
+void ips_ptl_rcvthread_transfer_ownership(ptl_t *from_ptl, ptl_t *to_ptl);
+
+#endif /* _IPS_PTL_H */
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c b/deps/libfabric/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c
new file mode 100644
index 0000000000000000000000000000000000000000..7d1d9987403d728d78b863888b69d0843ba8af27
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c
@@ -0,0 +1,638 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#include <sys/poll.h>
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "psm_mq_internal.h"
+#include "ptl_ips.h"
+#include "ips_proto.h"
+
+struct ptl_rcvthread;
+
+static void *ips_ptl_pollintr(void *recvthreadc);
+static psm2_error_t rcvthread_initstats(ptl_t *ptl);
+static psm2_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc);
+
+struct ptl_rcvthread {
+	const psmi_context_t *context;
+	const ptl_t *ptl;
+	struct ips_recvhdrq *recvq;
+
+	pthread_t hdrq_threadid;
+	uint64_t t_start_cyc;
+	int pipefd[2];
+
+	/* stats and some for scheduling */
+	uint64_t pollcnt;
+	uint64_t pollcnt_to;
+	uint64_t pollcyc;
+	uint64_t pollok;
+
+	/* For scheduling interrupt thread */
+	int timeout_period_min;
+	int timeout_period_max;
+	int timeout_shift;
+	uint64_t pollok_last;
+	uint64_t pollcnt_last;
+	uint32_t last_timeout;
+};
+
+#ifdef PSM_CUDA
+/* This is a global cuda context (extern declaration in psm_user.h),
+ * stored to provide hints during a cuda failure due to a null cuda
+ * context.
+ */
+CUcontext cu_ctxt;
+#endif
+
+/*
+ * The receive thread knows about the ptl interface, so it can muck with it
+ * directly.
+ */
+psm2_error_t ips_ptl_rcvthread_init(ptl_t *ptl_gen, struct ips_recvhdrq *recvq)
+{
+	struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen;
+	psm2_error_t err = PSM2_OK;
+	struct ptl_rcvthread *rcvc;
+
+	ptl->rcvthread =
+	    psmi_calloc(ptl->ep, UNDEFINED, 1, sizeof(struct ptl_rcvthread));
+	if (ptl->rcvthread == NULL) {
+		err = PSM2_NO_MEMORY;
+		goto fail;
+	}
+	rcvc = ptl->rcvthread;
+
+	rcvc->recvq = recvq;
+	rcvc->ptl = ptl_gen;
+	rcvc->context = ptl->context;
+	rcvc->t_start_cyc = get_cycles();
+
+#ifdef PSM_CUDA
+	if (PSMI_IS_CUDA_ENABLED)
+		PSMI_CUDA_CALL(cuCtxGetCurrent, &cu_ctxt);
+#endif
+
+	if (psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD) &&
+	    (!psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED))){
+
+		if ((err = rcvthread_initsched(rcvc)))
+			goto fail;
+
+		/* Create a pipe so we can synchronously terminate the thread */
+		if (pipe(rcvc->pipefd) != 0) {
+			err = psmi_handle_error(ptl->ep, PSM2_EP_DEVICE_FAILURE,
+						"Cannot create a pipe for receive thread: %s\n",
+						strerror(errno));
+			goto fail;
+		}
+
+		psmi_hal_add_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED);
+		if (pthread_create(&rcvc->hdrq_threadid, NULL,
+				   ips_ptl_pollintr, ptl->rcvthread)) {
+			close(rcvc->pipefd[0]);
+			close(rcvc->pipefd[1]);
+			err = psmi_handle_error(ptl->ep, PSM2_EP_DEVICE_FAILURE,
+						"Cannot start receive thread: %s\n",
+						strerror(errno));
+			goto fail;
+		}
+		if ((err = rcvthread_initstats(ptl_gen)))
+			goto fail;
+	}
+
+
+fail:
+	return err;
+}
+
+psm2_error_t ips_ptl_rcvthread_fini(ptl_t *ptl_gen)
+{
+	struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen;
+	struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)ptl->rcvthread;
+	uint64_t t_now;
+	psm2_error_t err = PSM2_OK;
+
+	PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock);
+
+	if (ptl->rcvthread == NULL)
+		return err;
+
+	psmi_stats_deregister_type(PSMI_STATSTYPE_RCVTHREAD, rcvc);
+	if (rcvc->hdrq_threadid && psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED)) {
+		t_now = get_cycles();
+
+		/* Disable interrupts then kill the receive thread */
+		if (psmi_context_interrupt_isenabled
+		    ((psmi_context_t *) ptl->context))
+			if ((err =
+			     psmi_context_interrupt_set((psmi_context_t *) ptl->
+							context, 0)))
+				goto fail;
+
+		/* Close the pipe so we can have the thread synchronously exit.
+		   On Linux just closing the pipe does not wake up the receive
+		   thread.
+		 */
+		if (write(rcvc->pipefd[1], (const void *)&t_now,
+			  sizeof(uint64_t)) == -1 ||
+		    close(rcvc->pipefd[1]) == -1) {
+			_HFI_VDBG
+			    ("unable to close pipe to receive thread cleanly\n");
+		}
+		pthread_join(rcvc->hdrq_threadid, NULL);
+		psmi_hal_sub_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED);
+		rcvc->hdrq_threadid = 0;
+		if (_HFI_PRDBG_ON) {
+			_HFI_PRDBG_ALWAYS
+				("rcvthread poll success %lld/%lld times, "
+				 "thread cancelled in %.3f us\n",
+				(long long)rcvc->pollok, (long long)rcvc->pollcnt,
+				(double)cycles_to_nanosecs(get_cycles() - t_now) / 1e3);
+		}
+	}
+
+	psmi_free(ptl->rcvthread);
+	ptl->rcvthread = NULL;
+fail:
+	return err;
+}
+
+void ips_ptl_rcvthread_transfer_ownership(ptl_t *from_ptl_gen, ptl_t *to_ptl_gen)
+{
+	struct ptl_rcvthread *rcvc;
+
+	psmi_hal_sub_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED);
+	struct ptl_ips *from_ptl = (struct ptl_ips *)from_ptl_gen;
+	struct ptl_ips *to_ptl   = (struct ptl_ips *)to_ptl_gen;
+	to_ptl->rcvthread = from_ptl->rcvthread;
+	from_ptl->rcvthread = NULL;
+
+	rcvc = to_ptl->rcvthread;
+
+	rcvc->recvq = &to_ptl->recvq;
+	rcvc->context = to_ptl->context;
+	rcvc->ptl = to_ptl_gen;
+}
+
+static psm2_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc)
+{
+	union psmi_envvar_val env_to;
+	char buf[192];
+	char *rcv_freq = buf;
+	int no_timeout = 0;
+	int tvals[3] = { RCVTHREAD_TO_MIN_FREQ,
+		RCVTHREAD_TO_MAX_FREQ,
+		RCVTHREAD_TO_SHIFT
+	};
+	snprintf(buf, sizeof(buf) - 1, "%d:%d:%d", RCVTHREAD_TO_MIN_FREQ,
+		 RCVTHREAD_TO_MAX_FREQ, RCVTHREAD_TO_SHIFT);
+	buf[sizeof(buf) - 1] = '\0';
+
+	if (!psmi_getenv("PSM3_RCVTHREAD_FREQ",
+			 "Recv Thread frequency (per sec) <min_freq[:max_freq[:shift_freq]]>",
+			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+			 (union psmi_envvar_val)rcv_freq, &env_to)) {
+		/* not using default values */
+		int nparsed = psmi_parse_str_tuples(env_to.e_str, 3, tvals);
+		int invalid = 0;
+
+		if (nparsed < 1 || (nparsed > 0 && tvals[0] == 0) ||
+		    (nparsed > 1 && tvals[1] == 0)) {
+			no_timeout = 1;
+		} else {
+			if (nparsed > 0 && tvals[0] > 1000)
+				invalid = 1;
+			if (nparsed > 1
+			    && (tvals[1] > 1000 || tvals[1] < tvals[0]))
+				invalid = 1;
+			if (nparsed > 2 && tvals[2] > 10)
+				invalid = 1;
+		}
+
+		if (invalid) {
+			_HFI_INFO
+			    ("Overriding invalid request for RcvThread frequency"
+			     " settings of %s to be <%d:%d:%d>\n", env_to.e_str,
+			     RCVTHREAD_TO_MIN_FREQ, RCVTHREAD_TO_MAX_FREQ,
+			     RCVTHREAD_TO_SHIFT);
+			tvals[0] = RCVTHREAD_TO_MIN_FREQ;
+			tvals[1] = RCVTHREAD_TO_MAX_FREQ;
+			tvals[2] = RCVTHREAD_TO_SHIFT;
+		}
+	}
+
+	if (no_timeout) {
+		rcvc->last_timeout = -1;
+		_HFI_PRDBG("PSM3_RCVTHREAD_FREQ set to only interrupt "
+			   "(no timeouts)\n");
+	} else {
+		/* Convert freq to period in milliseconds (for poll()) */
+		rcvc->timeout_period_max = 1000 / tvals[0];
+		rcvc->timeout_period_min = 1000 / tvals[1];
+		rcvc->timeout_shift = tvals[2];
+		/* Start in the middle of min and max */
+		rcvc->last_timeout = (rcvc->timeout_period_min +
+				      rcvc->timeout_period_max) / 2;
+		_HFI_PRDBG("PSM3_RCVTHREAD_FREQ converted to period "
+			   "min=%dms,max=%dms,shift=%d\n",
+			   rcvc->timeout_period_min, rcvc->timeout_period_max,
+			   rcvc->timeout_shift);
+	}
+	return PSM2_OK;
+}
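+
+/*
+ * Worked example (hypothetical values): PSM3_RCVTHREAD_FREQ=10:100:1 gives
+ * poll timeouts between 1000/100 = 10 ms (min period) and 1000/10 = 100 ms
+ * (max period), starting at (10 + 100) / 2 = 55 ms; each adjustment in
+ * rcvthread_next_timeout() then halves or doubles it (a shift of 1).
+ */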
+
+static
+int rcvthread_next_timeout(struct ptl_rcvthread *rcvc)
+{
+	uint64_t pollok_diff = rcvc->pollok - rcvc->pollok_last;
+
+	if (pollok_diff > 0) {
+		if (rcvc->last_timeout > rcvc->timeout_period_min)
+			/* By default, be less aggressive, but there's a more aggressive
+			 * alternative if need be */
+#if 1
+			rcvc->last_timeout >>= rcvc->timeout_shift;
+#else
+			rcvc->last_timeout = rcvc->timeout_period_min;
+#endif
+	} else {		/* we had less progress */
+		if (rcvc->last_timeout < rcvc->timeout_period_max)
+			rcvc->last_timeout <<= rcvc->timeout_shift;
+	}
+
+	rcvc->pollok_last = rcvc->pollok;
+	rcvc->pollcnt_last = rcvc->pollcnt;
+	return (int)rcvc->last_timeout;
+}
+
+extern int ips_in_rcvthread;
+
+static void process_async_event(psm2_ep_t ep)
+{
+	struct ibv_async_event async_event;
+	const char* errstr = NULL;
+
+	if (ibv_get_async_event(ep->verbs_ep.context, &async_event)) {
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+			"Receive thread ibv_get_async_event() error on %s port %u: %s", ep->dev_name, ep->portnum, strerror(errno));
+	}
+	/* Ack the event */
+	ibv_ack_async_event(&async_event);
+
+	switch (async_event.event_type) {
+	case IBV_EVENT_CQ_ERR:
+		if (async_event.element.cq == ep->verbs_ep.send_cq)
+			errstr = "Send CQ";
+		else if (async_event.element.cq == ep->verbs_ep.recv_cq)
+			errstr = "Recv CQ";
+		else
+			errstr = "CQ";
+		break;
+	case IBV_EVENT_QP_FATAL:
+	case IBV_EVENT_QP_REQ_ERR:
+	case IBV_EVENT_QP_ACCESS_ERR:
+		if (async_event.element.qp == ep->verbs_ep.qp)
+			errstr = "UD QP";
+		else
+			errstr = "RC QP";	// qp->context will be an ipsaddr
+		break;
+	case IBV_EVENT_DEVICE_FATAL:
+		errstr = "NIC";
+		break;
+	default:
+		// be silent about other events
+		break;
+	}
+	if (errstr)
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+			  "Fatal %s Async Event on %s port %u: %s", errstr, ep->dev_name, ep->portnum,
+				ibv_event_type_str(async_event.event_type));
+}
+
+static void rearm_cq_event(psm2_ep_t ep)
+{
+	struct ibv_cq *ev_cq;
+	void *ev_ctx;
+
+	_HFI_VDBG("rcvthread got solicited event\n");
+	if (ibv_get_cq_event(ep->verbs_ep.recv_comp_channel, &ev_cq, &ev_ctx)) {
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+			  "Receive thread ibv_get_cq_event() error on %s port %u: %s",
+			  ep->dev_name, ep->portnum, strerror(errno));
+	}
+
+	/* Ack the event */
+	ibv_ack_cq_events(ev_cq, 1);
+	psmi_assert_always(ev_cq == ep->verbs_ep.recv_cq);
+	psmi_assert_always(ev_ctx == ep);
+	// we only use solicited, so just reenable it
+	// TBD - during shutdown events get disabled and we could check
+	// psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED)
+	// to make sure we still want enabled.  But given verbs events
+	// are one-shots, that seems like overkill
+	if (ibv_req_notify_cq(ep->verbs_ep.recv_cq, 1)) {
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+			  "Receive thread ibv_req_notify_cq() error on %s port %u: %s",
+			  ep->dev_name, ep->portnum, strerror(errno));
+	}
+}
+
+// poll for async events for all rails/QPs within a given end user opened EP
+static void poll_async_events(psm2_ep_t ep)
+{
+	struct pollfd pfd[PSMI_MAX_QPS];
+	psm2_ep_t pep[PSMI_MAX_QPS];
+	int num_ep = 0;
+	psm2_ep_t first;
+	int ret;
+	int i;
+
+	first = ep;
+	do {
+#ifdef RNDV_MOD
+		if (IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode)
+		    && __psm2_rv_cq_overflowed(ep->verbs_ep.rv))
+			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+				  "RV event ring overflow for %s port %u", ep->dev_name, ep->portnum);
+#endif
+		pfd[num_ep].fd = ep->verbs_ep.context->async_fd;
+		pfd[num_ep].events = POLLIN;
+		pfd[num_ep].revents = 0;
+		pep[num_ep++] = ep;
+		ep = ep->mctxt_next;
+	} while (ep != first);
+
+	ret = poll(pfd, num_ep, 0);
+	if_pf(ret < 0) {
+		if (errno == EINTR)
+			_HFI_DBG("got signal, keep polling\n");
+		else
+			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+				  "Receive thread poll() error: %s", strerror(errno));
+	} else if_pf (ret > 0) {
+		for (i=0; i < num_ep; i++) {
+			if (pfd[i].revents & POLLIN)
+				process_async_event(pep[i]);
+		}
+	}
+}
+
+/*
+ * Receiver thread support.
+ *
+ * By default, polling in the driver asks the chip to generate an interrupt on
+ * every packet.  When the driver supports POLLURG we can switch the poll mode
+ * to one that requests interrupts only for packets that contain an urgent bit
+ * (and optionally enable interrupts for hdrq overflow events).  When poll
+ * returns an event, we *try* to make progress on the receive queue but simply
+ * go back to sleep if we notice that the main thread is already making
+ * progress.
+ */
+static
+void *ips_ptl_pollintr(void *rcvthreadc)
+{
+	struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)rcvthreadc;
+	struct ips_recvhdrq *recvq = rcvc->recvq;
+	int fd_pipe = rcvc->pipefd[0];
+	psm2_ep_t ep;
+	struct pollfd pfd[3];
+	int ret;
+	int next_timeout = rcvc->last_timeout;
+	uint64_t t_cyc;
+	psm2_error_t err;
+
+#ifdef PSM_CUDA
+	if (PSMI_IS_CUDA_ENABLED && cu_ctxt != NULL)
+		PSMI_CUDA_CALL(cuCtxSetCurrent, cu_ctxt);
+#endif
+
+	PSM2_LOG_MSG("entering");
+	/* No reason to have many of these, keep this as a backup in case the
+	 * recvhdrq init function is misused */
+	psmi_assert_always(psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED));
+
+	/* Switch driver to a mode where it can interrupt on urgent packets */
+	if (psmi_context_interrupt_set((psmi_context_t *)
+				       rcvc->context, 1) == PSM2_EP_NO_RESOURCES) {
+		_HFI_PRDBG
+		    ("poll_type feature not present in driver, turning "
+		     "off internal progress thread\n");
+		return NULL;
+	}
+
+	_HFI_PRDBG("Enabled communication thread on URG packets\n");
+
+	while (1) {
+		// pfd[0] is for urgent inbound packets (NAK, urgent ACK, etc)
+		// pfd[1] is for rcvthread termination
+		// pfd[2] is for verbs async events (PSM_UD only)
+		// on timeout (poll() returns 0), we do background process checks
+		//		for non urgent inbound packets
+		pfd[0].fd = rcvc->context->ep->verbs_ep.recv_comp_channel->fd;
+		pfd[0].events = POLLIN;
+		pfd[0].revents = 0;
+		pfd[1].fd = fd_pipe;
+		pfd[1].events = POLLIN;
+		pfd[1].revents = 0;
+		pfd[2].fd = rcvc->context->ep->verbs_ep.context->async_fd;
+		pfd[2].events = POLLIN;
+		pfd[2].revents = 0;
+
+		ret = poll(pfd, 3, next_timeout);
+		t_cyc = get_cycles();
+		if_pf(ret < 0) {
+			if (errno == EINTR)
+				_HFI_DBG("got signal, keep polling\n");
+			else
+				psmi_handle_error(PSMI_EP_NORETURN,
+						  PSM2_INTERNAL_ERR,
+						  "Receive thread poll() error: %s",
+						  strerror(errno));
+		} else if (pfd[1].revents) {
+			/* Any type of event on this fd means exit, should be POLLHUP */
+			_HFI_DBG("close thread: revents=0x%x\n", pfd[1].revents);
+			close(fd_pipe);
+			break;
+		} else {
+			// we got an async event
+			if (pfd[2].revents & POLLIN)
+				process_async_event(rcvc->context->ep);
+
+			// we got here due to a CQ event (as opposed to timeout)
+			// consume the event and rearm, we'll poll cq below
+			if (pfd[0].revents & POLLIN)
+				rearm_cq_event(rcvc->context->ep);
+
+			rcvc->pollcnt++;
+			if (!PSMI_LOCK_TRY(psmi_creation_lock)) {
+				if (ret == 0 || pfd[0].revents & (POLLIN | POLLERR)) {
+					if (PSMI_LOCK_DISABLED) {
+						// this path is not supported: running the rcvthread
+						// with PSMI_PLOCK_IS_NOLOCK defined is not allowed.
+						// TBD - would be good if we could quickly
+						// check for ep->verbs_ep.recv_wc_count == 0
+						//	&& nothing on CQ without doing a ibv_poll_cq
+						// ibv_poll_cq(cq, 0, NULL) always returns 0, so that
+						// doesn't help
+						// ibv_poll_cq would consume a CQE and require a lock so
+						// must call our main recv progress function below
+						// maybe if we open the can on HW verbs driver we could
+						// quickly check Q without polling.  Main benefit would
+						// be avoiding spinlock contention with main PSM
+						// thread and perhaps using the trylock style inside
+						// poll_cq much like we do for WFR
+						if (!ips_recvhdrq_trylock(recvq))
+							continue;
+						err = ips_recvhdrq_progress(recvq);
+						if (err == PSM2_OK)
+							rcvc->pollok++;
+						else
+							rcvc->pollcyc += get_cycles() - t_cyc;
+						ips_recvhdrq_unlock(recvq);
+					} else {
+
+						ep = psmi_opened_endpoint;
+
+						/* Go through all master endpoints. */
+						do{
+							if (!PSMI_LOCK_TRY(ep->mq->progress_lock)) {
+								/* If we timed out, we service shm and NIC.
+								 * If not, we assume we received an urgent
+								 * packet and service only the NIC.
+								 */
+								err = psmi_poll_internal(ep,
+											 ret == 0 ? PSMI_TRUE : PSMI_FALSE);
+
+								if (err == PSM2_OK)
+									rcvc->pollok++;
+								else
+									rcvc->pollcyc += get_cycles() - t_cyc;
+								PSMI_UNLOCK(ep->mq->progress_lock);
+							}
+							poll_async_events(ep);
+
+							/* get next endpoint from multi endpoint list */
+							ep = ep->user_ep_next;
+						} while(NULL != ep);
+					}
+				}
+				PSMI_UNLOCK(psmi_creation_lock);
+			}
+			if (ret == 0) { /* change timeout only on timed out poll */
+				rcvc->pollcnt_to++;
+				next_timeout = rcvthread_next_timeout(rcvc);
+			}
+		}
+	}
+
+	PSM2_LOG_MSG("leaving");
+	return NULL;
+}
+
+static uint64_t rcvthread_stats_pollok(void *context)
+{
+	struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)context;
+	//double ratio = 0.0;
+	uint64_t ratio_u = 0;
+	if (rcvc->pollcnt > 0)
+		//ratio = (double)rcvc->pollok * 100.0 / rcvc->pollcnt;
+		ratio_u = (uint64_t)((double)rcvc->pollok * 100.0 / rcvc->pollcnt);
+	//memcpy(&ratio_u, &ratio, sizeof(uint64_t));
+	return ratio_u;
+}
+
+static uint64_t rcvthread_stats_pollcyc(void *context)
+{
+	struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)context;
+	/* log in milliseconds */
+	return (uint64_t) ((double)cycles_to_nanosecs(rcvc->pollcyc) / 1.0e6);
+}
+
+static psm2_error_t rcvthread_initstats(ptl_t *ptl_gen)
+{
+	struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen;
+	struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)ptl->rcvthread;
+	struct psmi_stats_entry entries[] = {
+		PSMI_STATS_DECLU64("intrthread_schedule_count", &rcvc->pollcnt),
+		PSMI_STATS_DECL("intrthread_schedule_success_(%)",
+				MPSPAWN_STATS_REDUCTION_ALL,
+				rcvthread_stats_pollok, NULL),
+		PSMI_STATS_DECLU64("intrthread_timeout_count", &rcvc->pollcnt_to),
+		PSMI_STATS_DECL("intrthread_wasted_time_(ms)",
+				MPSPAWN_STATS_REDUCTION_ALL,
+				rcvthread_stats_pollcyc, NULL)
+	};
+
+	/* If the receive thread was not started, still register the counters
+	 * but report them as NaN instead */
+	if (!psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED)) {
+		int i;
+		static uint64_t ctr_nan = MPSPAWN_NAN;
+		for (i = 0; i < (int)PSMI_STATS_HOWMANY(entries); i++) {
+			entries[i].getfn = NULL;
+			entries[i].u.val = &ctr_nan;
+		}
+	}
+
+	// one rcvThread per process, so omit id (ptl->ep->epid) and
+	// info (ptl->ep->dev_name)
+	return psmi_stats_register_type("RcvThread_statistics",
+					PSMI_STATSTYPE_RCVTHREAD,
+					entries,
+					PSMI_STATS_HOWMANY(entries), 0, rcvc, NULL);
+}
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_self/ptl.c b/deps/libfabric/prov/psm3/psm3/ptl_self/ptl.c
new file mode 100644
index 0000000000000000000000000000000000000000..6d2fc2d53ba5c386ac718304045c9a8b692828e5
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_self/ptl.c
@@ -0,0 +1,424 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+/*
+ * This file implements the PSM PTL for self (loopback)
+ */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+
+struct ptl_self {
+	psm2_ep_t ep;
+	psm2_epid_t epid;
+	psm2_epaddr_t epaddr;
+	ptl_ctl_t *ctl;
+} __attribute__((aligned(16)));
+
+/* not reported yet, so just track these in a global so we can pass a
+ * pointer to psmi_mq_handle_envelope and psmi_mq_handle_rts
+ */
+static struct ptl_strategy_stats strat_stats;
+
+static
+psm2_error_t
+ptl_handle_rtsmatch(psm2_mq_req_t recv_req, int was_posted)
+{
+	psm2_mq_req_t send_req = (psm2_mq_req_t) recv_req->ptl_req_ptr;
+
+	if (recv_req->req_data.recv_msglen > 0) {
+		psmi_mq_mtucpy(recv_req->req_data.buf, send_req->req_data.buf,
+			       recv_req->req_data.recv_msglen);
+	}
+
+	recv_req->mq->stats.rx_user_num++;
+	recv_req->mq->stats.rx_user_bytes += recv_req->req_data.recv_msglen;
+	psmi_mq_handle_rts_complete(recv_req);
+
+	send_req->mq->stats.tx_rndv_bytes += send_req->req_data.send_msglen;
+	/* If the send is already marked complete, that's because it was internally
+	 * buffered. */
+	if (send_req->state == MQ_STATE_COMPLETE) {
+		if (send_req->req_data.buf != NULL && send_req->req_data.send_msglen > 0)
+			psmi_mq_sysbuf_free(send_req->mq, send_req->req_data.buf);
+		/* req was left "live" even though the sender was told that the
+		 * send was done */
+		psmi_mq_req_free(send_req);
+	} else
+		psmi_mq_handle_rts_complete(send_req);
+
+	_HFI_VDBG("[self][complete][b=%p][sreq=%p][rreq=%p]\n",
+		  recv_req->req_data.buf, send_req, recv_req);
+	return PSM2_OK;
+}
+
+static
+psm2_error_t self_mq_send_testwait(psm2_mq_req_t *ireq)
+{
+	uint8_t *ubuf;
+	psm2_mq_req_t req = *ireq;
+
+	PSMI_LOCK_ASSERT(req->mq->progress_lock);
+
+	/* We're waiting on a send request, and the matching receive has not been
+	 * posted yet.  This is a deadlock condition in MPI but we accommodate it
+	 * here in the "self ptl" by using system-allocated memory.
+	 */
+	req->testwait_callback = NULL;	/* no more calls here */
+
+	ubuf = req->req_data.buf;
+	if (ubuf != NULL && req->req_data.send_msglen > 0) {
+		req->req_data.buf = psmi_mq_sysbuf_alloc(req->mq, req->req_data.send_msglen);
+		if (req->req_data.buf == NULL)
+			return PSM2_NO_MEMORY;
+		psmi_mq_mtucpy(req->req_data.buf, ubuf, req->req_data.send_msglen);
+	}
+
+	/* Mark it complete but don't free the req, it's freed when the receiver
+	 * does the match */
+	req->state = MQ_STATE_COMPLETE;
+	*ireq = PSM2_MQ_REQINVALID;
+	return PSM2_OK;
+}
+
+/* Self is different.  We do everything as rendezvous. */
+static
+psm2_error_t
+self_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags_user,
+	      uint32_t flags_internal, psm2_mq_tag_t *tag, const void *ubuf,
+	      uint32_t len, void *context, psm2_mq_req_t *req_o)
+{
+	psm2_mq_req_t send_req;
+	psm2_mq_req_t recv_req;
+	int rc;
+
+	send_req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+	if_pf(send_req == NULL)
+	    return PSM2_NO_MEMORY;
+
+#ifdef PSM_CUDA
+	if (len && PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(ubuf)) {
+		psmi_cuda_set_attr_sync_memops(ubuf);
+		send_req->is_buf_gpu_mem = 1;
+	} else
+		send_req->is_buf_gpu_mem = 0;
+#endif
+
+	mq->stats.tx_num++;
+	mq->stats.tx_rndv_num++;
+
+	rc = psmi_mq_handle_rts(mq, epaddr, tag, &strat_stats,
+				len, NULL, 0, 1,
+				ptl_handle_rtsmatch, &recv_req);
+	send_req->req_data.tag = *tag;
+	send_req->req_data.buf = (void *)ubuf;
+	send_req->req_data.send_msglen = len;
+	send_req->req_data.context = context;
+	recv_req->ptl_req_ptr = (void *)send_req;
+	recv_req->rts_sbuf = (uintptr_t) ubuf;
+	recv_req->rts_peer = epaddr;
+	if (rc == MQ_RET_MATCH_OK)
+		ptl_handle_rtsmatch(recv_req, 1);
+	else
+		send_req->testwait_callback = self_mq_send_testwait;
+
+	_HFI_VDBG("[self][b=%p][m=%d][t=%08x.%08x.%08x][match=%s][req=%p]\n",
+		  ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2],
+		  rc == MQ_RET_MATCH_OK ? "YES" : "NO", send_req);
+	*req_o = send_req;
+	return PSM2_OK;
+}
+
+static
+psm2_error_t
+self_mq_send(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags,
+	     psm2_mq_tag_t *tag, const void *ubuf, uint32_t len)
+{
+	psm2_error_t err;
+	psm2_mq_req_t req;
+	err = self_mq_isend(mq, epaddr, flags, PSMI_REQ_FLAG_NORMAL, tag, ubuf, len, NULL, &req);
+	psmi_mq_wait_internal(&req);
+	return err;
+}
+
+/* Fill in AM capabilities parameters */
+static psm2_error_t
+self_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters)
+{
+	if (parameters == NULL) {
+		return PSM2_PARAM_ERR;
+	}
+
+	/* Self is just a loop-back and has no restrictions. */
+	parameters->max_handlers = INT_MAX;
+	parameters->max_nargs = INT_MAX;
+	parameters->max_request_short = INT_MAX;
+	parameters->max_reply_short = INT_MAX;
+
+	return PSM2_OK;
+}
+
+static
+psm2_error_t
+self_am_short_request(psm2_epaddr_t epaddr,
+		      psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		      void *src, size_t len, int flags,
+		      psm2_am_completion_fn_t completion_fn,
+		      void *completion_ctxt)
+{
+	struct psm2_ep_am_handle_entry *hentry;
+	psm2_ep_t ep = ((struct ptl_self *)(epaddr->ptlctl->ptl))->ep;
+	struct psmi_am_token tok;
+
+	tok.epaddr_incoming = epaddr;
+
+	hentry = psm_am_get_handler_function(ep, handler);
+
+	/* Note: a guard for hentry != NULL is not needed here because a
+	 * psmi_assert_always() at initialization assures the entry will be
+	 * non-NULL. */
+
+	if (likely(hentry->version == PSM2_AM_HANDLER_V2)) {
+		psm2_am_handler_2_fn_t hfn2 =
+				(psm2_am_handler_2_fn_t)hentry->hfn;
+		hfn2(&tok, args, nargs, src, len, hentry->hctx);
+	} else {
+		psm2_am_handler_fn_t hfn1 =
+				(psm2_am_handler_fn_t)hentry->hfn;
+		hfn1(&tok, args, nargs, src, len);
+	}
+
+	if (completion_fn) {
+		completion_fn(completion_ctxt);
+	}
+
+	return PSM2_OK;
+}
+
+static
+psm2_error_t
+self_am_short_reply(psm2_am_token_t token,
+		    psm2_handler_t handler, psm2_amarg_t *args, int nargs,
+		    void *src, size_t len, int flags,
+		    psm2_am_completion_fn_t completion_fn, void *completion_ctxt)
+{
+	struct psm2_ep_am_handle_entry *hentry;
+	struct psmi_am_token *tok = token;
+	struct ptl_self *ptl = (struct ptl_self *)tok->epaddr_incoming->ptlctl->ptl;
+	psm2_ep_t ep = ptl->ep;
+
+	hentry = psm_am_get_handler_function(ep, handler);
+
+	/* Note: a guard for hentry != NULL is not needed here because a
+	 * psmi_assert_always() at initialization assures the entry will be
+	 * non-NULL. */
+
+	if (likely(hentry->version == PSM2_AM_HANDLER_V2)) {
+		psm2_am_handler_2_fn_t hfn2 =
+				(psm2_am_handler_2_fn_t)hentry->hfn;
+		hfn2(token, args, nargs, src, len, hentry->hctx);
+	} else {
+		psm2_am_handler_fn_t hfn1 =
+				(psm2_am_handler_fn_t)hentry->hfn;
+		hfn1(token, args, nargs, src, len);
+	}
+
+	if (completion_fn) {
+		completion_fn(completion_ctxt);
+	}
+
+	return PSM2_OK;
+}
+
+static
+psm2_error_t
+self_connect(ptl_t *ptl_gen,
+	     int numep,
+	     const psm2_epid_t array_of_epid[],
+	     const int array_of_epid_mask[],
+	     psm2_error_t array_of_errors[],
+	     psm2_epaddr_t array_of_epaddr[], uint64_t timeout_ns)
+{
+	struct ptl_self *ptl = (struct ptl_self *)ptl_gen;
+	psmi_assert_always(ptl->epaddr != NULL);
+	psm2_error_t err = PSM2_OK;
+	int i;
+
+	PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock);
+
+	for (i = 0; i < numep; i++) {
+		if (!array_of_epid_mask[i])
+			continue;
+
+		if (array_of_epid[i] == ptl->epid) {
+			_HFI_CONNDBG("connect self\n");
+			array_of_epaddr[i] = ptl->epaddr;
+			array_of_epaddr[i]->ptlctl = ptl->ctl;
+			array_of_epaddr[i]->epid = ptl->epid;
+			if (psmi_epid_set_hostname(psm2_epid_nid(ptl->epid),
+						   psmi_gethostname(), 0)) {
+				err = PSM2_NO_MEMORY;
+				goto fail;
+			}
+			psmi_epid_add(ptl->ep, ptl->epid, ptl->epaddr);
+			array_of_errors[i] = PSM2_OK;
+		} else {
+			array_of_epaddr[i] = NULL;
+			array_of_errors[i] = PSM2_EPID_UNREACHABLE;
+		}
+	}
+
+fail:
+	return err;
+}
+
+static
+psm2_error_t
+self_disconnect(ptl_t *ptl_gen, int force, int numep,
+		   psm2_epaddr_t array_of_epaddr[],
+		   const int array_of_epaddr_mask[],
+		   psm2_error_t array_of_errors[], uint64_t timeout_in)
+{
+	struct ptl_self *ptl = (struct ptl_self *)ptl_gen;
+	int i;
+	for (i = 0; i < numep; i++) {
+		if (array_of_epaddr_mask[i] == 0)
+			continue;
+
+		if (array_of_epaddr[i] == ptl->epaddr) {
+			_HFI_CONNDBG("disconnect self\n");
+			psmi_epid_remove(ptl->ep, ptl->epid);
+			array_of_errors[i] = PSM2_OK;
+		}
+	}
+	return PSM2_OK;
+}
+
+static
+size_t self_ptl_sizeof(void)
+{
+	return sizeof(struct ptl_self);
+}
+
+ustatic
+psm2_error_t self_ptl_init(const psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl)
+{
+	struct ptl_self *ptl = (struct ptl_self *)ptl_gen;
+	psmi_assert_always(ep != NULL);
+	psmi_assert_always(ep->epaddr != NULL);
+	psmi_assert_always(ep->epid != 0);
+
+	ptl->ep = ep;
+	ptl->epid = ep->epid;
+	ptl->epaddr = ep->epaddr;
+	ptl->ctl = ctl;
+
+	memset(ctl, 0, sizeof(*ctl));
+	/* Fill in the control structure */
+	ctl->ptl = ptl_gen;
+	ctl->ep = ep;
+	ctl->ep_poll = NULL;
+	ctl->ep_connect = self_connect;
+	ctl->ep_disconnect = self_disconnect;
+
+	ctl->mq_send = self_mq_send;
+	ctl->mq_isend = self_mq_isend;
+
+	ctl->am_get_parameters = self_am_get_parameters;
+	ctl->am_short_request = self_am_short_request;
+	ctl->am_short_reply = self_am_short_reply;
+
+#if 0	// unused code, specific to QLogic MPI
+	/* No stats in self */
+	ctl->epaddr_stats_num = NULL;
+	ctl->epaddr_stats_init = NULL;
+	ctl->epaddr_stats_get = NULL;
+#endif
+
+	return PSM2_OK;
+}
+
+static psm2_error_t self_ptl_fini(ptl_t *ptl, int force, uint64_t timeout_ns)
+{
+	return PSM2_OK;		/* nothing to do */
+}
+
+static
+psm2_error_t
+self_ptl_setopt(const void *component_obj, int optname,
+		const void *optval, uint64_t optlen)
+{
+	/* No options for SELF PTL at the moment */
+	return psmi_handle_error(NULL, PSM2_PARAM_ERR,
+				 "Unknown SELF ptl option %u.", optname);
+}
+
+static
+psm2_error_t
+self_ptl_getopt(const void *component_obj, int optname,
+		void *optval, uint64_t *optlen)
+{
+	/* No options for SELF PTL at the moment */
+	return psmi_handle_error(NULL, PSM2_PARAM_ERR,
+				 "Unknown SELF ptl option %u.", optname);
+}
+
+/* Only symbol we expose out of here */
+struct ptl_ctl_init
+psmi_ptl_self = {
+	self_ptl_sizeof, self_ptl_init, self_ptl_fini, self_ptl_setopt,
+	self_ptl_getopt
+};
diff --git a/deps/libfabric/prov/psm3/psm3/ptl_self/ptl_fwd.h b/deps/libfabric/prov/psm3/psm3/ptl_self/ptl_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ee6b732a3ff06843de5a907aa143edae1e81d5f
--- /dev/null
+++ b/deps/libfabric/prov/psm3/psm3/ptl_self/ptl_fwd.h
@@ -0,0 +1,62 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2015 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2015 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */
+
+#ifndef _PTL_FWD_SELF_H
+#define _PTL_FWD_SELF_H
+
+/* Symbol in am ptl */
+extern struct ptl_ctl_init psmi_ptl_self;
+
+#endif
diff --git a/deps/libfabric/prov/psm3/shared b/deps/libfabric/prov/psm3/shared
new file mode 120000
index 0000000000000000000000000000000000000000..929cb3dc9ba04d657e86ebde2d29661b78bdd85c
--- /dev/null
+++ b/deps/libfabric/prov/psm3/shared
@@ -0,0 +1 @@
+../../src
\ No newline at end of file
diff --git a/deps/libfabric/prov/psm3/src/.gitignore b/deps/libfabric/prov/psm3/src/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..9a799b2e58479775aa5c7527c1d603efc4e9eb28
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/.gitignore
@@ -0,0 +1,2 @@
+psm3_revision.c
+psm3_src_chksum.h
diff --git a/deps/libfabric/prov/psm3/src/psm3_revision.c.in b/deps/libfabric/prov/psm3/src/psm3_revision.c.in
new file mode 100644
index 0000000000000000000000000000000000000000..3229fa9e79b5c9612ed5b2ae43fc21f6ed8121b6
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psm3_revision.c.in
@@ -0,0 +1,39 @@
+#include "psmx3.h"
+#include "psm3_src_chksum.h"
+
+#ifndef PSMX3_IFS_VERSION
+#define PSMX3_IFS_VERSION	"@PSM3_IFS_VERSION@"
+#endif
+
+#ifndef PSMX3_BUILD_TIMESTAMP
+#define PSMX3_BUILD_TIMESTAMP	"@PSM3_BUILD_TIMESTAMP@"
+#endif
+
+#ifndef PSMX3_SRC_CHECKSUM
+#define PSMX3_SRC_CHECKSUM	"@PSM3_SRC_CHECKSUM@"
+#endif
+
+#ifndef PSMX3_GIT_CHECKSUM
+#define PSMX3_GIT_CHECKSUM	"@PSM3_GIT_HASH@"
+#endif
+
+char psmi_hfi_IFS_version[] = PSMX3_IFS_VERSION;
+char psmi_hfi_build_timestamp[] = PSMX3_BUILD_TIMESTAMP;
+char psmi_hfi_sources_checksum[] = PSMX3_SRC_CHECKSUM;
+char psmi_hfi_git_checksum[] = PSMX3_GIT_CHECKSUM;
+
+#define PSM3_PROV_VER_MAJOR @PSM3_PROV_VER_MAJOR@
+#define PSM3_PROV_VER_MINOR @PSM3_PROV_VER_MINOR@
+#define PSM3_PROV_VER_MAINT @PSM3_PROV_VER_MAINT@
+#define PSM3_PROV_VER_PATCH @PSM3_PROV_VER_PATCH@
+
+/* Leave last digit open for special use */
+#define PSM3_PROV_VER(major, minor, maint, patch) \
+	( ( ( ((major) * 100) + (minor)) << 16)	| ( ( ((maint) * 1000) + ((patch) * 10)) & 0xFFFF) )
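+/* Example (illustrative, not in the original source): with major=3,
+ * minor=0, maint=0, patch=1 this yields
+ * ((3 * 100 + 0) << 16) | (0 * 1000 + 1 * 10) == 0x012C000A. */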
+
+
+static uint32_t psm3_provider_version =
+	PSM3_PROV_VER(PSM3_PROV_VER_MAJOR, PSM3_PROV_VER_MINOR, PSM3_PROV_VER_MAINT, PSM3_PROV_VER_PATCH);
+uint32_t get_psm3_provider_version(void)
+{
+	return psm3_provider_version;
+}
diff --git a/deps/libfabric/prov/psm3/src/psmx3.h b/deps/libfabric/prov/psm3/src/psmx3.h
new file mode 100644
index 0000000000000000000000000000000000000000..af04942e917baeb5a97b8e6b6c9524ae8072105a
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psmx3.h
@@ -0,0 +1,1232 @@
+/*
+ * Copyright (c) 2013-2019 Intel Corporation. All rights reserved.
+ * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _FI_PSM2_H
+#define _FI_PSM2_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <limits.h>
+#include <complex.h>
+#include <rdma/fabric.h>
+#include <rdma/fi_domain.h>
+#include <rdma/fi_endpoint.h>
+#include <rdma/fi_tagged.h>
+#include <rdma/fi_rma.h>
+#include <rdma/fi_atomic.h>
+#include <rdma/fi_trigger.h>
+#include <rdma/fi_cm.h>
+#include <rdma/fi_errno.h>
+#include "ofi.h"
+#include "ofi_atomic.h"
+#include "ofi_enosys.h"
+#include "ofi_list.h"
+#include "ofi_util.h"
+#include "ofi_mem.h"
+#include "rbtree.h"
+#include "version.h"
+#include "psm_config.h"
+
+#ifdef FABRIC_DIRECT_ENABLED
+#define DIRECT_FN __attribute__((visibility ("default")))
+#define STATIC
+#else
+#define DIRECT_FN
+#define STATIC static
+#endif
+
+extern struct fi_provider psmx3_prov;
+
+#define PSMX3_OP_FLAGS	(FI_INJECT | FI_MULTI_RECV | FI_COMPLETION | \
+			 FI_TRIGGER | FI_INJECT_COMPLETE | \
+			 FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE)
+
+#define PSMX3_TX_CAPS (OFI_TX_MSG_CAPS | FI_TAGGED | OFI_TX_RMA_CAPS | FI_ATOMICS | \
+		       FI_NAMED_RX_CTX | FI_TRIGGER)
+#define PSMX3_RX_CAPS (FI_SOURCE | FI_SOURCE_ERR | FI_RMA_EVENT | OFI_RX_MSG_CAPS | \
+		       FI_TAGGED | OFI_RX_RMA_CAPS | FI_ATOMICS | FI_DIRECTED_RECV | \
+		       FI_MULTI_RECV | FI_TRIGGER)
+#define PSMX3_DOM_CAPS	(FI_SHARED_AV | FI_LOCAL_COMM | FI_REMOTE_COMM)
+#define PSMX3_CAPS (PSMX3_TX_CAPS | PSMX3_RX_CAPS | PSMX3_DOM_CAPS)
+
+#define PSMX3_RMA_TX_CAPS (PSMX3_TX_CAPS & ~(FI_TAGGED | FI_MSG | FI_SEND))
+#define PSMX3_RMA_RX_CAPS (PSMX3_RX_CAPS & ~(FI_TAGGED | FI_MSG | FI_RECV | \
+			   FI_DIRECTED_RECV | FI_MULTI_RECV))
+#define PSMX3_RMA_CAPS (PSMX3_RMA_TX_CAPS | PSMX3_RMA_RX_CAPS | PSMX3_DOM_CAPS)
+
+#define PSMX3_SUB_CAPS	(FI_SEND | FI_RECV | FI_READ | FI_WRITE | \
+			 FI_REMOTE_READ | FI_REMOTE_WRITE)
+
+#define PSMX3_ALL_TRX_CTXT	((void *)-1)
+#define PSMX3_MAX_MSG_SIZE	((0x1ULL << 32) - 1)
+#define PSMX3_RMA_ORDER_SIZE	(4096)
+#define PSMX3_MSG_ORDER		(FI_ORDER_SAS | OFI_ORDER_RAR_SET | OFI_ORDER_RAW_SET | \
+				 OFI_ORDER_WAR_SET | OFI_ORDER_WAW_SET)
+#define PSMX3_COMP_ORDER	FI_ORDER_NONE
+
+/*
+ * Four bits are reserved from the 64-bit tag space as flags to identify the
+ * type and properties of the messages.
+ *
+ * To conserve tag bits, we use a couple of otherwise invalid bit combinations
+ * to distinguish RMA long reads from long writes and distinguish iovec
+ * payloads from regular messages.
+ *
+ * We never match on the immediate bit. Regular tagged and untagged messages
+ * do not match on the iov bit, but the iov and imm bits are checked when we
+ * process completions.
+ *
+ *                   MSG RMA IOV IMM
+ * tagged message    0   0   x   x
+ * untagged message  1   0   x   x
+ * rma long read     0   1   0   x
+ * rma long write    0   1   1   x
+ * iov payload       1   1   x   x
+ */
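+/* Example (illustrative annotation): per the table above, a tagged message
+ * carrying immediate data sets only PSMX3_IMM_BIT (MSG=0, RMA=0), while the
+ * payload of an iovec send travels with both PSMX3_MSG_BIT and
+ * PSMX3_RMA_BIT set. */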
+
+#define PSMX3_MSG_BIT		(0x80000000)
+#define PSMX3_RMA_BIT		(0x40000000)
+#define PSMX3_IOV_BIT		(0x20000000)
+#define PSMX3_IMM_BIT		(0x10000000)
+
+/* Top two bits of the flag are the message type */
+#define PSMX3_TYPE_TAGGED	(0)
+#define PSMX3_TYPE_MSG		PSMX3_MSG_BIT
+#define PSMX3_TYPE_RMA		PSMX3_RMA_BIT
+#define PSMX3_TYPE_IOV_PAYLOAD	(PSMX3_MSG_BIT | PSMX3_RMA_BIT)
+#define PSMX3_TYPE_MASK		(PSMX3_MSG_BIT | PSMX3_RMA_BIT)
+
+/*
+ * For RMA protocol, use the IOV bit to distinguish between long RMA write
+ * and long RMA read. This prevents tag collisions between reads/writes issued
+ * locally and the writes/reads issued by peers. RMA doesn't use this bit for
+ * IOV support so it's safe to do so.
+ */
+#define PSMX3_RMA_TYPE_READ	PSMX3_TYPE_RMA
+#define PSMX3_RMA_TYPE_WRITE	(PSMX3_TYPE_RMA | PSMX3_IOV_BIT)
+#define PSMX3_RMA_TYPE_MASK	(PSMX3_TYPE_MASK | PSMX3_IOV_BIT)
+
+/* IOV header is only possible when the RMA bit is 0 */
+#define PSMX3_IOV_HEADER_MASK		(PSMX3_IOV_BIT | PSMX3_RMA_BIT)
+
+#define PSMX3_IS_IOV_HEADER(flags)	(((flags) & PSMX3_IOV_HEADER_MASK) == PSMX3_IOV_BIT)
+#define PSMX3_IS_IOV_PAYLOAD(flags)	(((flags) & PSMX3_TYPE_MASK) == PSMX3_TYPE_IOV_PAYLOAD)
+#define PSMX3_IS_RMA(flags)		(((flags) & PSMX3_TYPE_MASK) == PSMX3_TYPE_RMA)
+#define PSMX3_IS_MSG(flags)		(((flags) & PSMX3_TYPE_MASK) == PSMX3_TYPE_MSG)
+#define PSMX3_IS_TAGGED(flags)		(((flags) & PSMX3_TYPE_MASK) == PSMX3_TYPE_TAGGED)
+#define PSMX3_HAS_IMM(flags)		((flags) & PSMX3_IMM_BIT)
+
+/* Set a bit conditionally without branching.  Flag must be 1 or 0. */
+#define PSMX3_MSG_BIT_SET(flag) (-(uint32_t)flag & PSMX3_MSG_BIT)
+#define PSMX3_RMA_BIT_SET(flag) (-(uint32_t)flag & PSMX3_RMA_BIT)
+#define PSMX3_IOV_BIT_SET(flag) (-(uint32_t)flag & PSMX3_IOV_BIT)
+#define PSMX3_IMM_BIT_SET(flag) (-(uint32_t)flag & PSMX3_IMM_BIT)
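+/* Usage sketch (illustrative; 'have_data' is a hypothetical 0/1 value):
+ *   uint32_t flags = PSMX3_MSG_BIT_SET(1) | PSMX3_IMM_BIT_SET(have_data);
+ * composes the flag nibble for an untagged message without branching. */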
+
+/*
+ * Different ways to use the 96 bit tag:
+ * 	TAG60: 32/4/60 for data/flags/tag
+ * 	TAG64: 4/28/64 for flags/data/tag
+ * 	RUNTIME:  make the choice at runtime
+ */
+#define PSMX3_TAG_LAYOUT_RUNTIME	0
+#define PSMX3_TAG_LAYOUT_TAG60		1
+#define PSMX3_TAG_LAYOUT_TAG64		2
+
+#ifndef PSMX3_TAG_LAYOUT
+#define PSMX3_TAG_LAYOUT	PSMX3_TAG_LAYOUT_RUNTIME
+#elif (PSMX3_TAG_LAYOUT < 0 || PSMX3_TAG_LAYOUT > 2)
+#warning "Invalid PSMX3_TAG_LAYOUT definition, using default."
+#undef PSMX3_TAG_LAYOUT
+#define PSMX3_TAG_LAYOUT	PSMX3_TAG_LAYOUT_RUNTIME
+#endif
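+/* Build-time sketch (assumption, not from the original source): compiling
+ * with -DPSMX3_TAG_LAYOUT=1 (TAG60) or -DPSMX3_TAG_LAYOUT=2 (TAG64) fixes
+ * the layout at compile time and drops the runtime selection below. */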
+
+#define PSMX3_TAG_MASK_60	(0x0FFFFFFFFFFFFFFFULL)
+#define PSMX3_TAG_UPPER_MASK_60	((uint32_t)0x0FFFFFFF)
+#define PSMX3_DATA_MASK_60	((uint32_t)0xFFFFFFFF)
+#define PSMX3_FLAGS_IDX_60	(1)
+
+#define PSMX3_TAG_MASK_64	(0xFFFFFFFFFFFFFFFFULL)
+#define PSMX3_TAG_UPPER_MASK_64	((uint32_t)0xFFFFFFFF)
+#define PSMX3_DATA_MASK_64	((uint32_t)0x0FFFFFFF)
+#define PSMX3_FLAGS_IDX_64	(2)
+
+#if (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_TAG60)
+#define PSMX3_TAG_MASK		PSMX3_TAG_MASK_60
+#define PSMX3_TAG_UPPER_MASK	PSMX3_TAG_UPPER_MASK_60
+#define PSMX3_DATA_MASK		PSMX3_DATA_MASK_60
+#define PSMX3_FLAGS_IDX		PSMX3_FLAGS_IDX_60
+#endif
+
+#if (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_TAG64)
+#define PSMX3_TAG_MASK		PSMX3_TAG_MASK_64
+#define PSMX3_TAG_UPPER_MASK	PSMX3_TAG_UPPER_MASK_64
+#define PSMX3_DATA_MASK		PSMX3_DATA_MASK_64
+#define PSMX3_FLAGS_IDX		PSMX3_FLAGS_IDX_64
+#endif
+
+#if (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_RUNTIME)
+#define PSMX3_TAG_MASK		psmx3_tag_mask
+#define PSMX3_TAG_UPPER_MASK	psmx3_tag_upper_mask
+#define PSMX3_DATA_MASK		psmx3_data_mask
+#define PSMX3_FLAGS_IDX		psmx3_flags_idx
+extern uint64_t	psmx3_tag_mask;
+extern uint32_t	psmx3_tag_upper_mask;
+extern uint32_t	psmx3_data_mask;
+extern int	psmx3_flags_idx;
+extern int	psmx3_tag_layout_locked;
+#endif
+
+#define PSMX3_FLAGS_MASK	((uint32_t)0xF0000000)
+
+#define PSMX3_MAX_TAG		PSMX3_TAG_MASK
+#define PSMX3_MATCH_ALL		(-1ULL)
+#define PSMX3_MATCH_NONE	(0ULL)
+
+#define PSMX3_PRINT_TAG(tag96) \
+	printf("%s: %08x %08x %08x\n", __func__, tag96.tag0, tag96.tag1, tag96.tag2)
+
+/*
+ * psm2_mq_tag_t is a union type of 96 bits. These functions are used to
+ * access the first 64 bits without generating the warning "dereferencing
+ * type-punned pointer will break strict-aliasing rules". This is faster
+ * than combining two 32-bit values with bit operations.
+ *
+ * Notice:
+ * (1) *(uint64_t *)tag96 works, but *(uint64_t *)tag96->tag doesn't;
+ * (2) putting these statements directly inside the macros won't work.
+ */
+__attribute__((always_inline))
+static inline void psmx3_set_tag64(psm2_mq_tag_t *tag96, uint64_t tag64)
+{
+	tag96->tag64 = tag64;
+}
+
+__attribute__((always_inline))
+static inline uint64_t psmx3_get_tag64(psm2_mq_tag_t *tag96)
+{
+	return tag96->tag64;
+}
+
+#define PSMX3_SET_TAG_INTERNAL(tag96,_tag_,cq_data,flags) \
+	do { \
+		psmx3_set_tag64(&(tag96),(_tag_) & PSMX3_TAG_MASK); \
+		(tag96).tag2 = ((cq_data) & PSMX3_DATA_MASK); \
+		(tag96).tag[PSMX3_FLAGS_IDX] |= (flags); \
+	} while (0)
+
+#define PSMX3_SET_TAG(tag96,tag,cq_data,flags) \
+	PSMX3_SET_TAG_INTERNAL(tag96,tag,cq_data,flags)
+
+#define PSMX3_SET_MASK(tagsel96,tag_mask,flag_mask) \
+	PSMX3_SET_TAG_INTERNAL(tagsel96,tag_mask,0,flag_mask)
+
+#define PSMX3_GET_TAG64(tag96)	(psmx3_get_tag64(&(tag96)) & PSMX3_TAG_MASK)
+#define PSMX3_GET_FLAGS(tag96)	((tag96).tag[PSMX3_FLAGS_IDX] & PSMX3_FLAGS_MASK)
+#define PSMX3_GET_CQDATA(tag96)	((tag96).tag2 & PSMX3_DATA_MASK)
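+/* Round-trip sketch (illustrative; t96, tag and cq_data are hypothetical
+ * caller variables):
+ *   PSMX3_SET_TAG(t96, tag, cq_data, PSMX3_IMM_BIT);
+ *   assert(PSMX3_GET_TAG64(t96) == (tag & PSMX3_TAG_MASK));
+ *   assert(PSMX3_HAS_IMM(PSMX3_GET_FLAGS(t96)));
+ */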
+
+#define PSMX3_MAX_RX_CTX_BITS	(12)
+#define PSMX3_ADDR_IDX_MASK	(0x000FFFFFFFFFFFFFUL)
+#define PSMX3_ADDR_CTXT_MASK	(0xFFF0000000000000UL)
+#define PSMX3_ADDR_IDX(addr)	((addr) & PSMX3_ADDR_IDX_MASK)
+#define PSMX3_ADDR_CTXT(addr, ctxt_bits) \
+				(((addr) & PSMX3_ADDR_CTXT_MASK) >> (64-(ctxt_bits)))
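+/* Layout note (annotation): for table-based AVs an fi_addr_t packs the
+ * receive-context index of a scalable endpoint into the top 12 bits and the
+ * AV table slot into the low 52 bits; PSMX3_ADDR_CTXT(addr, 12) and
+ * PSMX3_ADDR_IDX(addr) recover the two parts. */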
+
+/* Bits 60 .. 63 of the flag are provider specific */
+#define PSMX3_NO_COMPLETION	(1ULL << 60)
+
+enum psmx3_context_type {
+	PSMX3_NOCOMP_SEND_CONTEXT = 1,
+	PSMX3_NOCOMP_RECV_CONTEXT,
+	PSMX3_NOCOMP_TSEND_CONTEXT,
+	PSMX3_NOCOMP_TRECV_CONTEXT,
+	PSMX3_NOCOMP_WRITE_CONTEXT,
+	PSMX3_NOCOMP_READ_CONTEXT,
+	PSMX3_SEND_CONTEXT,
+	PSMX3_RECV_CONTEXT,
+	PSMX3_MULTI_RECV_CONTEXT,
+	PSMX3_TSEND_CONTEXT,
+	PSMX3_TRECV_CONTEXT,
+	PSMX3_WRITE_CONTEXT,
+	PSMX3_READ_CONTEXT,
+	PSMX3_REMOTE_WRITE_CONTEXT,
+	PSMX3_REMOTE_READ_CONTEXT,
+	PSMX3_SENDV_CONTEXT,
+	PSMX3_IOV_SEND_CONTEXT,
+	PSMX3_IOV_RECV_CONTEXT,
+	PSMX3_MAX_CONTEXT_TYPE
+};
+
+union psmx3_pi {
+	void	*p;
+	uint32_t i[2];
+};
+
+#define PSMX3_CTXT_REQ(fi_context)	((fi_context)->internal[0])
+#define PSMX3_CTXT_TYPE(fi_context)	(((union psmx3_pi *)&(fi_context)->internal[1])->i[0])
+#define PSMX3_CTXT_SIZE(fi_context)	(((union psmx3_pi *)&(fi_context)->internal[1])->i[1])
+#define PSMX3_CTXT_USER(fi_context)	((fi_context)->internal[2])
+#define PSMX3_CTXT_EP(fi_context)	((fi_context)->internal[3])
+
+/*
+ * Use per-protocol versioning to avoid unnecessary version checking. Only perform
+ * version checking when the current version is greater than zero.
+ */
+#define PSMX3_AM_RMA_VERSION		0
+#define PSMX3_AM_ATOMIC_VERSION		0
+#define PSMX3_AM_SEP_VERSION		1
+#define PSMX3_AM_TRX_CTXT_VERSION	0
+
+#define PSMX3_AM_RMA_HANDLER		0
+#define PSMX3_AM_ATOMIC_HANDLER		1
+#define PSMX3_AM_SEP_HANDLER		2
+#define PSMX3_AM_TRX_CTXT_HANDLER	3
+
+#define PSMX3_AM_OP_MASK	0x000000FF
+#define PSMX3_AM_FLAG_MASK	0xFF000000
+#define PSMX3_AM_VER_MASK	0x00FF0000
+#define PSMX3_AM_VER_SHIFT	16
+#define PSMX3_AM_EOM		0x40000000
+#define PSMX3_AM_DATA		0x20000000
+#define PSMX3_AM_FORCE_ACK	0x10000000
+
+#define PSMX3_AM_SET_OP(u32w0,op)	do {(u32w0) &= ~PSMX3_AM_OP_MASK; (u32w0) |= (op);} while (0)
+#define PSMX3_AM_SET_FLAG(u32w0,flag)	do {(u32w0) &= ~PSMX3_AM_FLAG_MASK; (u32w0) |= (flag);} while (0)
+#define PSMX3_AM_SET_VER(u32w0,ver)	do {(u32w0) &= ~PSMX3_AM_VER_MASK; (u32w0) |= (ver << PSMX3_AM_VER_SHIFT);} while (0)
+#define PSMX3_AM_GET_OP(u32w0)		((u32w0) & PSMX3_AM_OP_MASK)
+#define PSMX3_AM_GET_FLAG(u32w0)	((u32w0) & PSMX3_AM_FLAG_MASK)
+#define PSMX3_AM_GET_VER(u32w0)		(((u32w0) & PSMX3_AM_VER_MASK) >> PSMX3_AM_VER_SHIFT)
+
+enum {
+	PSMX3_AM_REQ_WRITE = 1,
+	PSMX3_AM_REQ_WRITE_LONG,
+	PSMX3_AM_REP_WRITE,
+	PSMX3_AM_REQ_READ,
+	PSMX3_AM_REQ_READ_LONG,
+	PSMX3_AM_REP_READ,
+	PSMX3_AM_REQ_ATOMIC_WRITE,
+	PSMX3_AM_REP_ATOMIC_WRITE,
+	PSMX3_AM_REQ_ATOMIC_READWRITE,
+	PSMX3_AM_REP_ATOMIC_READWRITE,
+	PSMX3_AM_REQ_ATOMIC_COMPWRITE,
+	PSMX3_AM_REP_ATOMIC_COMPWRITE,
+	PSMX3_AM_REQ_WRITEV,
+	PSMX3_AM_REQ_READV,
+	PSMX3_AM_REQ_SEP_QUERY,
+	PSMX3_AM_REP_SEP_QUERY,
+	PSMX3_AM_REQ_TRX_CTXT_DISCONNECT,
+};
+
+struct psmx3_am_request {
+	int op;
+	union {
+		struct {
+			uint8_t	*buf;
+			size_t	len;
+			uint64_t addr;
+			uint64_t key;
+			void	*context;
+			void	*peer_addr;
+			uint64_t data;
+		} write;
+		struct {
+			union {
+				uint8_t	*buf;	   /* for read */
+				size_t	iov_count; /* for readv */
+			};
+			size_t	len;
+			uint64_t addr;
+			uint64_t key;
+			void	*context;
+			void	*peer_addr;
+			size_t	len_read;
+		} read;
+		struct {
+			union {
+				uint8_t	*buf;	   /* for result_count == 1 */
+				size_t	iov_count; /* for result_count > 1 */
+			};
+			size_t	len;
+			uint64_t addr;
+			uint64_t key;
+			void	*context;
+			uint8_t *result;
+			int	datatype;
+		} atomic;
+	};
+	uint64_t		cq_flags;
+	struct fi_context	fi_context;
+	struct psmx3_fid_ep	*ep;
+	int			no_event;
+	int			error;
+	struct slist_entry	list_entry;
+	union {
+		struct iovec	*iov;	/* for readv */
+		struct fi_ioc	*ioc;	/* for atomic read */
+	};
+	void			*tmpbuf;
+};
+
+#define PSMX3_IOV_PROTO_PACK	0
+#define PSMX3_IOV_PROTO_MULTI	1
+#define PSMX3_IOV_MAX_SEQ_NUM 	0x7fffffff
+#define PSMX3_IOV_BUF_SIZE	64
+#define PSMX3_IOV_MAX_COUNT	(PSMX3_IOV_BUF_SIZE / sizeof(uint32_t) - 3)
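+/* Sizing note (annotation): with a 64-byte buffer and 32-bit fields this
+ * gives 64/4 - 3 = 13 iov entries, so struct psmx3_iov_info below (three
+ * header words plus the len[] array) fits exactly in PSMX3_IOV_BUF_SIZE. */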
+
+struct psmx3_iov_info {
+	uint32_t	seq_num;
+	uint32_t	total_len;
+	uint32_t	count;
+	uint32_t	len[PSMX3_IOV_MAX_COUNT];
+};
+
+struct psmx3_sendv_request {
+	struct fi_context	fi_context;
+	struct fi_context	fi_context_iov;
+	void			*user_context;
+	int			iov_protocol;
+	int			no_completion;
+	int			comp_flag;
+	uint32_t		iov_done;
+	psm2_mq_tag_t		tag;
+	union {
+		struct psmx3_iov_info	iov_info;
+		char			buf[PSMX3_IOV_BUF_SIZE];
+	};
+};
+
+struct psmx3_sendv_reply {
+	struct fi_context	fi_context;
+	int			no_completion;
+	int			multi_recv;
+	psm2_mq_tag_t		tag;
+	uint8_t			*buf;
+	void			*user_context;
+	size_t			iov_done;
+	size_t			bytes_received;
+	size_t			msg_length;
+	int			error_code;
+	int			comp_flag;
+	struct psmx3_iov_info	iov_info;
+};
+
+struct psmx3_req_queue {
+	fastlock_t	lock;
+	struct slist	list;
+};
+
+struct psmx3_multi_recv {
+	psm2_epaddr_t	src_addr;
+	psm2_mq_tag_t	tag;
+	psm2_mq_tag_t	tagsel;
+	uint8_t		*buf;
+	size_t		len;
+	size_t		offset;
+	int		min_buf_size;
+	int		flag;
+	void		*context;
+};
+
+struct psmx3_fid_fabric {
+	struct util_fabric	util_fabric;
+	psm2_uuid_t		uuid;
+	struct util_ns		name_server;
+
+	/* list of all opened domains */
+	fastlock_t		domain_lock;
+	struct dlist_entry	domain_list;
+};
+
+#define PSMX3_TX	(1)
+#define PSMX3_RX	(2)
+#define PSMX3_TX_RX	(PSMX3_TX | PSMX3_RX)
+
+struct psmx3_trx_ctxt {
+	psm2_ep_t		psm2_ep;
+	psm2_epid_t		psm2_epid;
+	psm2_mq_t		psm2_mq;
+	int			am_initialized;
+	int			am_progress;
+	int			am_poll_count;
+	int			id;
+	int			usage_flags;
+	struct psm2_am_parameters psm2_am_param;
+
+	struct psmx3_fid_domain	*domain;
+	struct psmx3_fid_ep	*ep;
+
+	/* triggered operations that are ready to be processed */
+	struct psmx3_req_queue	trigger_queue;
+
+	/* request pool for RMA/atomic ops */
+	struct ofi_bufpool	*am_req_pool;
+	fastlock_t		am_req_pool_lock;
+
+	/* lock to prevent the sequence of psm2_mq_ipeek and psm2_mq_test from
+	 * being interleaved in a multithreaded environment.
+	 */
+	fastlock_t		poll_lock;
+
+	/* list of peers connected to this tx/rx context */
+	struct dlist_entry	peer_list;
+	fastlock_t		peer_lock;
+
+	/* number of paths through which this tx/rx context can be polled;
+	 * these include CQs and counters, as well as domain->trx_ctxt_list.
+	 */
+	ofi_atomic32_t		poll_refcnt;
+	int			poll_active;
+
+	psm2_uuid_t		uuid;
+
+	struct dlist_entry	entry;
+};
+
+typedef void	(*psmx3_lock_fn_t) (fastlock_t *lock, int lock_level);
+typedef int	(*psmx3_trylock_fn_t) (fastlock_t *lock, int lock_level);
+typedef void	(*psmx3_unlock_fn_t) (fastlock_t *lock, int lock_level);
+
+struct psmx3_fid_domain {
+	struct util_domain	util_domain;
+	struct psmx3_fid_fabric	*fabric;
+	uint64_t		mode;
+	uint64_t		caps;
+
+	enum fi_mr_mode		mr_mode;
+	fastlock_t		mr_lock;
+	uint64_t		mr_reserved_key;
+	RbtHandle		mr_map;
+
+	/* list of hw contexts opened for this domain */
+	fastlock_t		trx_ctxt_lock;
+	struct dlist_entry	trx_ctxt_list;
+
+	ofi_atomic32_t		sep_cnt;
+	fastlock_t		sep_lock;
+	struct dlist_entry	sep_list;
+
+	int			progress_thread_enabled;
+	pthread_t		progress_thread;
+
+	int			addr_format;
+	uint32_t		max_atomic_size;
+
+	struct dlist_entry	entry;
+
+	/* Lock/Unlock function pointers set based on FI_THREAD model */
+	psmx3_lock_fn_t		av_lock_fn;
+	psmx3_unlock_fn_t	av_unlock_fn;
+	psmx3_lock_fn_t		am_req_pool_lock_fn;
+	psmx3_unlock_fn_t	am_req_pool_unlock_fn;
+	psmx3_lock_fn_t		trx_ctxt_lock_fn;
+	psmx3_unlock_fn_t	trx_ctxt_unlock_fn;
+	psmx3_lock_fn_t		rma_queue_lock_fn;
+	psmx3_unlock_fn_t	rma_queue_unlock_fn;
+	psmx3_lock_fn_t		trigger_queue_lock_fn;
+	psmx3_unlock_fn_t	trigger_queue_unlock_fn;
+	psmx3_lock_fn_t		peer_lock_fn;
+	psmx3_unlock_fn_t	peer_unlock_fn;
+	psmx3_lock_fn_t		sep_lock_fn;
+	psmx3_unlock_fn_t	sep_unlock_fn;
+	psmx3_lock_fn_t		trigger_lock_fn;
+	psmx3_unlock_fn_t	trigger_unlock_fn;
+	psmx3_lock_fn_t		cq_lock_fn;
+	psmx3_unlock_fn_t	cq_unlock_fn;
+	psmx3_lock_fn_t		mr_lock_fn;
+	psmx3_unlock_fn_t	mr_unlock_fn;
+	psmx3_lock_fn_t		context_lock_fn;
+	psmx3_unlock_fn_t	context_unlock_fn;
+	psmx3_trylock_fn_t	poll_trylock_fn;
+	psmx3_unlock_fn_t	poll_unlock_fn;
+};
+
+#define PSMX3_EP_REGULAR	0
+#define PSMX3_EP_SCALABLE	1
+#define PSMX3_EP_SRC_ADDR	2
+
+#define PSMX3_RESERVED_EPID	(0xFFFFULL)
+#define PSMX3_DEFAULT_UNIT	(-1)
+#define PSMX3_DEFAULT_PORT	0
+#define PSMX3_ANY_SERVICE	0
+
+struct psmx3_ep_name {
+	psm2_epid_t		epid;
+	uint8_t			type;
+	union {
+		uint8_t		sep_id;		/* for scalable ep */
+		int8_t		unit;		/* for src addr. start from 0. -1 means any */
+	};
+	uint8_t			port;		/* for src addr. start from 1, 0 means any */
+	uint8_t			padding;
+	uint32_t		service;	/* for src addr. 0 means any */
+};
+
+#define PSMX3_MAX_STRING_NAME_LEN	64	/* "fi_addr_psmx3://<uint64_t>:<uint64_t>"  */
+
+struct psmx3_status_data {
+	struct psmx3_fid_cq	*poll_cq;
+	struct psmx3_trx_ctxt	*trx_ctxt;
+	fi_addr_t		*src_addr;
+	void			*event_buffer;
+};
+
+struct psmx3_cq_event {
+	union {
+		struct fi_cq_entry		context;
+		struct fi_cq_msg_entry		msg;
+		struct fi_cq_data_entry		data;
+		struct fi_cq_tagged_entry	tagged;
+		struct fi_cq_err_entry		err;
+	} cqe;
+	int			error;
+	int8_t			source_is_valid;
+	uint8_t			source_sep_id;
+	psm2_epaddr_t		source;
+	struct psmx3_fid_av	*source_av;
+	struct slist_entry	list_entry;
+};
+
+#define PSMX3_ERR_DATA_SIZE		64	/* large enough to hold a string address */
+
+struct psmx3_poll_ctxt {
+	struct psmx3_trx_ctxt		*trx_ctxt;
+	struct slist_entry		list_entry;
+};
+
+struct psmx3_fid_cq {
+	struct fid_cq			cq;
+	struct psmx3_fid_domain		*domain;
+	struct slist			poll_list;
+	int 				format;
+	int				entry_size;
+	size_t				event_count;
+	struct slist			event_queue;
+	struct slist			free_list;
+	fastlock_t			lock;
+	struct psmx3_cq_event		*pending_error;
+	struct util_wait		*wait;
+	int				wait_cond;
+	int				wait_is_local;
+	ofi_atomic32_t			signaled;
+	uint8_t				error_data[PSMX3_ERR_DATA_SIZE];
+};
+
+struct psmx3_trigger;
+
+struct psmx3_fid_cntr {
+	union {
+		struct fid_cntr		cntr;
+		struct util_cntr	util_cntr; /* for util_poll_run */
+	};
+	struct psmx3_fid_domain	*domain;
+	struct slist		poll_list;
+	int			poll_all;
+	int			events;
+	uint64_t		flags;
+	ofi_atomic64_t		counter;
+	ofi_atomic64_t		error_counter;
+	int			error_avail;
+	int			wait_is_local;
+	struct util_wait	*wait;
+	struct psmx3_trigger	*trigger;
+	fastlock_t		trigger_lock;
+};
+
+#define PSMX3_AV_DEFAULT_SIZE	64
+
+#define PSMX3_AV_TABLE_SIZE(count, shared) \
+		(sizeof(struct psmx3_av_hdr) + \
+		 ((shared) ? (count) * sizeof(fi_addr_t) : 0) + \
+		 (count) * sizeof(struct psmx3_av_addr))
+
+struct psmx3_av_hdr {
+	uint64_t		size;
+	uint64_t		last;
+};
+
+struct psmx3_av_addr {
+	psm2_epid_t		epid;
+	uint8_t			type;
+	uint8_t			sep_id;
+	uint8_t			valid;
+};
+
+struct psmx3_av_sep {
+	int			ctxt_cnt;
+	psm2_epid_t		*epids;
+};
+
+struct psmx3_av_conn {
+	struct psmx3_trx_ctxt	*trx_ctxt;
+	psm2_epaddr_t		*epaddrs;
+	psm2_epaddr_t		**sepaddrs;
+};
+
+struct psmx3_fid_av {
+	struct fid_av		av;
+	int			type;
+	struct psmx3_fid_domain	*domain;
+	struct fid_eq		*eq;
+	int			addr_format;
+	int			rx_ctx_bits;
+	int			max_trx_ctxt;
+	int			shared;
+	uint64_t		flags;
+	size_t			addrlen;
+	size_t			count;
+	fastlock_t		lock;
+	struct psmx3_trx_ctxt	*av_map_trx_ctxt;
+	struct util_shm		shm;
+	struct psmx3_av_hdr	*hdr;	/* shared AV header */
+	fi_addr_t		*map;	/* shared AV address mapping */
+	struct psmx3_av_addr	*table;	/* shared AV address table */
+	struct psmx3_av_sep	*sep_info;
+	struct psmx3_av_conn	conn_info[];
+};
+
+struct psmx3_fid_ep {
+	struct fid_ep		ep;
+	int			type;
+	struct psmx3_fid_domain	*domain;
+	/* above fields are common with sep */
+
+	struct psmx3_trx_ctxt	*tx;
+	struct psmx3_trx_ctxt	*rx;
+	struct psmx3_fid_ep	*base_ep;
+	struct psmx3_fid_stx	*stx;
+	struct psmx3_fid_av	*av;
+	struct psmx3_fid_cq	*send_cq;
+	struct psmx3_fid_cq	*recv_cq;
+	struct psmx3_fid_cntr	*send_cntr;
+	struct psmx3_fid_cntr	*recv_cntr;
+	struct psmx3_fid_cntr	*write_cntr;
+	struct psmx3_fid_cntr	*read_cntr;
+	struct psmx3_fid_cntr	*remote_write_cntr;
+	struct psmx3_fid_cntr	*remote_read_cntr;
+	unsigned		send_selective_completion:1;
+	unsigned		recv_selective_completion:1;
+	unsigned		enabled:1;
+	uint64_t		tx_flags;
+	uint64_t		rx_flags;
+	uint64_t		caps;
+	ofi_atomic32_t		ref;
+	struct fi_context	nocomp_send_context;
+	struct fi_context	nocomp_tsend_context;
+
+	PSMX3_EP_DECL_OP_CONTEXT
+
+	size_t			min_multi_recv;
+	uint32_t		iov_seq_num;
+	int			service;
+	int			sep_id;
+};
+
+struct psmx3_sep_ctxt {
+	struct psmx3_trx_ctxt	*trx_ctxt;
+	struct psmx3_fid_ep	*ep;
+};
+
+struct psmx3_fid_sep {
+	struct fid_ep		ep;
+	int			type;
+	struct psmx3_fid_domain	*domain;
+	/* above fields are common with regular ep */
+
+	struct dlist_entry	entry;
+
+	ofi_atomic32_t		ref;
+	int			service;
+	uint8_t			id;
+	uint8_t			enabled;
+	size_t			ctxt_cnt;
+	struct psmx3_sep_ctxt	ctxts[]; /* must be last element */
+};
+
+struct psmx3_fid_stx {
+	struct fid_stx		stx;
+	struct psmx3_fid_domain	*domain;
+	struct psmx3_trx_ctxt	*tx;
+	ofi_atomic32_t		ref;
+};
+
+struct psmx3_fid_mr {
+	struct fid_mr		mr;
+	struct psmx3_fid_domain	*domain;
+	struct psmx3_fid_cntr	*cntr;
+	uint64_t		access;
+	uint64_t		flags;
+	uint64_t		offset;
+	size_t			iov_count;
+	struct iovec		iov[];	/* must be the last field */
+};
+
+struct psmx3_epaddr_context {
+	struct psmx3_trx_ctxt	*trx_ctxt;
+	psm2_epid_t		epid;
+	psm2_epaddr_t		epaddr;
+	struct dlist_entry	entry;
+};
+
+struct psmx3_env {
+	int	name_server;
+	int	tagged_rma;
+	char	*uuid;
+	int	uuid_override;
+	int	delay;
+	int	timeout;
+	int	conn_timeout;
+	int	prog_interval;
+	char	*prog_affinity;
+	int	multi_ep;
+	int	inject_size;
+	int	lock_level;
+	int	lazy_conn;
+	int	disconnect;
+#if (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_RUNTIME)
+	char	*tag_layout;
+#endif
+};
+
+#define PSMX3_MAX_UNITS	PSMI_MAX_RAILS /* from psm_config.h */
+struct psmx3_hfi_info {
+	int max_trx_ctxt;
+	int free_trx_ctxt;
+	int num_units;
+	int num_active_units;
+	int active_units[PSMX3_MAX_UNITS];
+	int unit_is_active[PSMX3_MAX_UNITS];
+	int unit_nctxts[PSMX3_MAX_UNITS];
+	int unit_nfreectxts[PSMX3_MAX_UNITS];
+	char default_domain_name[PSMX3_MAX_UNITS * NAME_MAX]; /* hfi1_0;hfi1_1;...;hfi1_n */
+};
+
+extern struct fi_ops_mr		psmx3_mr_ops;
+extern struct fi_ops_cm		psmx3_cm_ops;
+extern struct fi_ops_tagged	psmx3_tagged_ops;
+extern struct fi_ops_tagged	psmx3_tagged_ops_no_flag_directed;
+extern struct fi_ops_tagged	psmx3_tagged_ops_no_event_directed;
+extern struct fi_ops_tagged	psmx3_tagged_ops_no_send_event_directed;
+extern struct fi_ops_tagged	psmx3_tagged_ops_no_recv_event_directed;
+extern struct fi_ops_tagged	psmx3_tagged_ops_no_flag_undirected;
+extern struct fi_ops_tagged	psmx3_tagged_ops_no_event_undirected;
+extern struct fi_ops_tagged	psmx3_tagged_ops_no_send_event_undirected;
+extern struct fi_ops_tagged	psmx3_tagged_ops_no_recv_event_undirected;
+extern struct fi_ops_tagged	psmx3_tagged_ops_no_flag_directed_av_map;
+extern struct fi_ops_tagged	psmx3_tagged_ops_no_event_directed_av_map;
+extern struct fi_ops_tagged	psmx3_tagged_ops_no_send_event_directed_av_map;
+extern struct fi_ops_tagged	psmx3_tagged_ops_no_recv_event_directed_av_map;
+extern struct fi_ops_tagged	psmx3_tagged_ops_no_flag_undirected_av_map;
+extern struct fi_ops_tagged	psmx3_tagged_ops_no_event_undirected_av_map;
+extern struct fi_ops_tagged	psmx3_tagged_ops_no_send_event_undirected_av_map;
+extern struct fi_ops_tagged	psmx3_tagged_ops_no_recv_event_undirected_av_map;
+extern struct fi_ops_msg	psmx3_msg_ops;
+extern struct fi_ops_msg	psmx3_msg2_ops;
+extern struct fi_ops_rma	psmx3_rma_ops;
+extern struct fi_ops_atomic	psmx3_atomic_ops;
+extern struct psmx3_env		psmx3_env;
+extern struct psmx3_hfi_info	psmx3_hfi_info;
+extern struct psmx3_fid_fabric	*psmx3_active_fabric;
+
+/*
+ * Lock levels:
+ *     0 -- always lock
+ *     1 -- lock needed if there is more than one thread (including internal threads)
+ *     2 -- lock needed if more than one thread accesses the same psm2 ep
+ */
+static inline void psmx3_lock(fastlock_t *lock, int lock_level)
+{
+	if (psmx3_env.lock_level >= lock_level)
+		fastlock_acquire(lock);
+}
+
+static inline int psmx3_trylock(fastlock_t *lock, int lock_level)
+{
+	if (psmx3_env.lock_level >= lock_level)
+		return fastlock_tryacquire(lock);
+	else
+		return 0;
+}
+
+static inline void psmx3_unlock(fastlock_t *lock, int lock_level)
+{
+	if (psmx3_env.lock_level >= lock_level)
+		fastlock_release(lock);
+}
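+
+/* Usage sketch (illustrative; 'cq' is a hypothetical object): a caller that
+ * only needs mutual exclusion when multiple threads touch the same psm2 ep
+ * would write
+ *   psmx3_lock(&cq->lock, 2); ... psmx3_unlock(&cq->lock, 2);
+ * and the lock is skipped at run time when psmx3_env.lock_level < 2. */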
+
+/* Specialized lock functions used based on FI_THREAD model */
+
+static inline void psmx3_lock_disabled(fastlock_t *lock, int lock_level)
+{
+	return;
+}
+
+static inline int psmx3_trylock_disabled(fastlock_t *lock, int lock_level)
+{
+	return 0;
+}
+
+static inline void psmx3_lock_enabled(fastlock_t *lock, int lock_level)
+{
+	fastlock_acquire(lock);
+}
+
+static inline void psmx3_unlock_enabled(fastlock_t *lock, int lock_level)
+{
+	fastlock_release(lock);
+}
+
+static inline int psmx3_trylock_enabled(fastlock_t *lock, int lock_level)
+{
+	return fastlock_tryacquire(lock);
+}
+
+int	psmx3_init_prov_info(const struct fi_info *hints, struct fi_info **info);
+void	psmx3_update_prov_info(struct fi_info *info,
+			       struct psmx3_ep_name *src_addr,
+			       struct psmx3_ep_name *dest_addr);
+int	psmx3_check_prov_info(uint32_t api_version, const struct fi_info *hints,
+			      struct fi_info **info);
+void	psmx3_alter_prov_info(uint32_t api_version, const struct fi_info *hints,
+			      struct fi_info *info);
+
+void	psmx3_init_tag_layout(struct fi_info *info);
+
+int	psmx3_fabric(struct fi_fabric_attr *attr,
+		     struct fid_fabric **fabric, void *context);
+int	psmx3_domain_open(struct fid_fabric *fabric, struct fi_info *info,
+			  struct fid_domain **domain, void *context);
+int	psmx3_ep_open(struct fid_domain *domain, struct fi_info *info,
+		      struct fid_ep **ep, void *context);
+int	psmx3_sep_open(struct fid_domain *domain, struct fi_info *info,
+		       struct fid_ep **sep, void *context);
+int	psmx3_stx_ctx(struct fid_domain *domain, struct fi_tx_attr *attr,
+		      struct fid_stx **stx, void *context);
+int	psmx3_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr,
+		      struct fid_cq **cq, void *context);
+int	psmx3_av_open(struct fid_domain *domain, struct fi_av_attr *attr,
+		      struct fid_av **av, void *context);
+int	psmx3_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr,
+		        struct fid_cntr **cntr, void *context);
+int	psmx3_wait_open(struct fid_fabric *fabric, struct fi_wait_attr *attr,
+			struct fid_wait **waitset);
+int	psmx3_wait_trywait(struct fid_fabric *fabric, struct fid **fids,
+			   int count);
+int	psmx3_query_atomic(struct fid_domain *domain, enum fi_datatype datatype,
+			   enum fi_op op, struct fi_atomic_attr *attr,
+			   uint64_t flags);
+
+static inline void psmx3_fabric_acquire(struct psmx3_fid_fabric *fabric)
+{
+	ofi_atomic_inc32(&fabric->util_fabric.ref);
+}
+
+static inline void psmx3_fabric_release(struct psmx3_fid_fabric *fabric)
+{
+	ofi_atomic_dec32(&fabric->util_fabric.ref);
+}
+
+static inline void psmx3_domain_acquire(struct psmx3_fid_domain *domain)
+{
+	ofi_atomic_inc32(&domain->util_domain.ref);
+}
+
+static inline void psmx3_domain_release(struct psmx3_fid_domain *domain)
+{
+	ofi_atomic_dec32(&domain->util_domain.ref);
+}
+
+int	psmx3_domain_enable_ep(struct psmx3_fid_domain *domain, struct psmx3_fid_ep *ep);
+
+void	psmx3_trx_ctxt_free(struct psmx3_trx_ctxt *trx_ctxt, int usage_flags);
+struct	psmx3_trx_ctxt *psmx3_trx_ctxt_alloc(struct psmx3_fid_domain *domain,
+					     struct psmx3_ep_name *src_addr,
+					     int sep_ctxt_idx, int usage_flags,
+					     uint8_t *uuid);
+
+static inline
+int	psmx3_ns_service_cmp(void *svc1, void *svc2)
+{
+	int service1 = *(int *)svc1, service2 = *(int *)svc2;
+	if (service1 == PSMX3_ANY_SERVICE ||
+	    service2 == PSMX3_ANY_SERVICE)
+		return 0;
+	return (service1 < service2) ?
+		-1 : (service1 > service2);
+}
+static inline
+int	psmx3_ns_is_service_wildcard(void *svc)
+{
+	return (*(int *)svc == PSMX3_ANY_SERVICE);
+}
+void	psmx3_get_uuid(psm2_uuid_t uuid);
+int	psmx3_override_uuid(void);
+int	psmx3_uuid_to_port(psm2_uuid_t uuid);
+char	*psmx3_uuid_to_string(psm2_uuid_t uuid);
+void	*psmx3_ep_name_to_string(const struct psmx3_ep_name *name, size_t *len);
+struct	psmx3_ep_name *psmx3_string_to_ep_name(const void *s);
+int	psmx3_errno(int err);
+void	psmx3_query_mpi(void);
+
+void	psmx3_cq_enqueue_event(struct psmx3_fid_cq *cq, struct psmx3_cq_event *event);
+struct	psmx3_cq_event *psmx3_cq_create_event(struct psmx3_fid_cq *cq,
+					      void *op_context, void *buf,
+					      uint64_t flags, size_t len,
+					      uint64_t data, uint64_t tag,
+					      size_t olen, int err);
+int	psmx3_cq_poll_mq(struct psmx3_fid_cq *cq, struct psmx3_trx_ctxt *trx_ctxt,
+			 struct psmx3_cq_event *event, int count, fi_addr_t *src_addr);
+
+void	psmx3_epid_to_epaddr(struct psmx3_trx_ctxt *trx_ctxt,
+			     psm2_epid_t epid, psm2_epaddr_t *epaddr);
+
+int	psmx3_av_add_trx_ctxt(struct psmx3_fid_av *av, struct psmx3_trx_ctxt *trx_ctxt);
+
+void	psmx3_av_remove_conn(struct psmx3_fid_av *av, struct psmx3_trx_ctxt *trx_ctxt,
+			     psm2_epaddr_t epaddr);
+
+int	psmx3_av_query_sep(struct psmx3_fid_av *av, struct psmx3_trx_ctxt *trx_ctxt,
+			   size_t idx);
+
+static inline
+psm2_epaddr_t psmx3_av_translate_addr(struct psmx3_fid_av *av,
+				      struct psmx3_trx_ctxt *trx_ctxt,
+				      fi_addr_t addr,
+				      int av_type)
+{
+	psm2_epaddr_t epaddr;
+	size_t idx;
+	int ctxt;
+
+	if (av_type == FI_AV_MAP)
+		return (psm2_epaddr_t) addr;
+
+	av->domain->av_lock_fn(&av->lock, 1);
+
+	idx = PSMX3_ADDR_IDX(addr);
+	assert(idx < av->hdr->last && av->table[idx].valid);
+
+	if (OFI_UNLIKELY(av->table[idx].type == PSMX3_EP_SCALABLE)) {
+		if (OFI_UNLIKELY(!av->sep_info[idx].epids)) {
+			psmx3_av_query_sep(av, trx_ctxt, idx);
+			assert(av->sep_info[idx].epids);
+		}
+
+		if (OFI_UNLIKELY(!av->conn_info[trx_ctxt->id].sepaddrs[idx])) {
+			av->conn_info[trx_ctxt->id].sepaddrs[idx] =
+				calloc(av->sep_info[idx].ctxt_cnt, sizeof(psm2_epaddr_t));
+			assert(av->conn_info[trx_ctxt->id].sepaddrs[idx]);
+		}
+
+		ctxt = PSMX3_ADDR_CTXT(addr, av->rx_ctx_bits);
+		assert(ctxt < av->sep_info[idx].ctxt_cnt);
+
+		if (OFI_UNLIKELY(!av->conn_info[trx_ctxt->id].sepaddrs[idx][ctxt]))
+			 psmx3_epid_to_epaddr(trx_ctxt,
+					      av->sep_info[idx].epids[ctxt],
+					      &av->conn_info[trx_ctxt->id].sepaddrs[idx][ctxt]);
+		epaddr = av->conn_info[trx_ctxt->id].sepaddrs[idx][ctxt];
+	} else {
+		if (OFI_UNLIKELY(!av->conn_info[trx_ctxt->id].epaddrs[idx]))
+			psmx3_epid_to_epaddr(trx_ctxt, av->table[idx].epid,
+					     &av->conn_info[trx_ctxt->id].epaddrs[idx]);
+		epaddr = av->conn_info[trx_ctxt->id].epaddrs[idx];
+	}
+
+	av->domain->av_unlock_fn(&av->lock, 1);
+	return epaddr;
+}
+
+void	psmx3_am_global_init(void);
+void	psmx3_am_global_fini(void);
+int	psmx3_am_init(struct psmx3_trx_ctxt *trx_ctxt);
+void	psmx3_am_fini(struct psmx3_trx_ctxt *trx_ctxt);
+int	psmx3_am_progress(struct psmx3_trx_ctxt *trx_ctxt);
+int	psmx3_am_process_send(struct psmx3_trx_ctxt *trx_ctxt,
+			      struct psmx3_am_request *req);
+int	psmx3_am_process_rma(struct psmx3_trx_ctxt *trx_ctxt,
+			     struct psmx3_am_request *req);
+int	psmx3_am_rma_handler(psm2_am_token_t token, psm2_amarg_t *args,
+			     int nargs, void *src, uint32_t len,
+			     void *hctx);
+int	psmx3_am_atomic_handler(psm2_am_token_t token,
+				psm2_amarg_t *args, int nargs, void *src,
+				uint32_t len, void *hctx);
+int	psmx3_am_sep_handler(psm2_am_token_t token, psm2_amarg_t *args, int nargs,
+			     void *src, uint32_t len, void *hctx);
+int	psmx3_am_trx_ctxt_handler(psm2_am_token_t token,
+				  psm2_amarg_t *args, int nargs, void *src, uint32_t len,
+				  void *hctx);
+void	psmx3_atomic_global_init(void);
+void	psmx3_atomic_global_fini(void);
+
+void	psmx3_am_ack_rma(struct psmx3_am_request *req);
+
+static inline
+struct psmx3_am_request *psmx3_am_request_alloc(struct psmx3_trx_ctxt *trx_ctxt)
+{
+	struct psmx3_am_request *req;
+
+	trx_ctxt->domain->am_req_pool_lock_fn(&trx_ctxt->am_req_pool_lock, 0);
+	req = ofi_buf_alloc(trx_ctxt->am_req_pool);
+	trx_ctxt->domain->am_req_pool_unlock_fn(&trx_ctxt->am_req_pool_lock, 0);
+
+	if (req)
+		memset(req, 0, sizeof(*req));
+
+	return req;
+}
+
+static inline void psmx3_am_request_free(struct psmx3_trx_ctxt *trx_ctxt,
+					 struct psmx3_am_request *req)
+{
+	trx_ctxt->domain->am_req_pool_lock_fn(&trx_ctxt->am_req_pool_lock, 0);
+	ofi_buf_free(req);
+	trx_ctxt->domain->am_req_pool_unlock_fn(&trx_ctxt->am_req_pool_lock, 0);
+}
+
+struct	psmx3_fid_mr *psmx3_mr_get(struct psmx3_fid_domain *domain, uint64_t key);
+int	psmx3_mr_validate(struct psmx3_fid_mr *mr, uint64_t addr, size_t len, uint64_t access);
+void	psmx3_cntr_check_trigger(struct psmx3_fid_cntr *cntr);
+void	psmx3_cntr_add_trigger(struct psmx3_fid_cntr *cntr, struct psmx3_trigger *trigger);
+
+int	psmx3_handle_sendv_req(struct psmx3_fid_ep *ep, PSMX3_STATUS_TYPE *status,
+			       int multi_recv);
+
+static inline void psmx3_cntr_inc(struct psmx3_fid_cntr *cntr, int error)
+{
+	if (OFI_UNLIKELY(error)) {
+		ofi_atomic_inc64(&cntr->error_counter);
+		cntr->error_avail = 1;
+	} else {
+		ofi_atomic_inc64(&cntr->counter);
+	}
+	psmx3_cntr_check_trigger(cntr);
+	if (cntr->wait)
+		cntr->wait->signal(cntr->wait);
+}
+
+fi_addr_t psmx3_av_translate_source(struct psmx3_fid_av *av,
+				    psm2_epaddr_t source, int source_sep_id);
+
+static inline void psmx3_get_source_name(psm2_epaddr_t source,
+					 int source_sep_id,
+					 struct psmx3_ep_name *name)
+{
+	memset(name, 0, sizeof(*name));
+	psm2_epaddr_to_epid(source, &name->epid);
+	name->sep_id = source_sep_id;
+	name->type = source_sep_id ? PSMX3_EP_SCALABLE : PSMX3_EP_REGULAR;
+}
+
+static inline void psmx3_get_source_string_name(psm2_epaddr_t source,
+						int source_sep_id,
+						char *name, size_t *len)
+{
+	struct psmx3_ep_name ep_name;
+
+	memset(&ep_name, 0, sizeof(ep_name));
+	psm2_epaddr_to_epid(source, &ep_name.epid);
+	ep_name.sep_id = source_sep_id;
+	ep_name.type = source_sep_id ? PSMX3_EP_SCALABLE : PSMX3_EP_REGULAR;
+
+	ofi_straddr(name, len, FI_ADDR_PSMX3, &ep_name);
+}
+
+static inline void psmx3_progress(struct psmx3_trx_ctxt *trx_ctxt)
+{
+	if (trx_ctxt && trx_ctxt->poll_active) {
+		psmx3_cq_poll_mq(NULL, trx_ctxt, NULL, 1, NULL);
+		if (trx_ctxt->am_progress)
+			psmx3_am_progress(trx_ctxt);
+	}
+}
+
+static inline void psmx3_progress_all(struct psmx3_fid_domain *domain)
+{
+	struct dlist_entry *item;
+	struct psmx3_trx_ctxt *trx_ctxt;
+
+	domain->trx_ctxt_lock_fn(&domain->trx_ctxt_lock, 1);
+	dlist_foreach(&domain->trx_ctxt_list, item) {
+		trx_ctxt = container_of(item, struct psmx3_trx_ctxt, entry);
+		psmx3_progress(trx_ctxt);
+	}
+	domain->trx_ctxt_unlock_fn(&domain->trx_ctxt_lock, 1);
+}
+
+/*
+ * There is a limitation in the PSM2 AM implementation that can cause significant
+ * delay if too many AM requests are enqueued in a row without progress calls
+ * being made in between. As a workaround, call this function after each AM
+ * request is enqueued whenever possible.
+ */
+#define PSMX3_AM_POLL_INTERVAL	64
+static inline void psmx3_am_poll(struct psmx3_trx_ctxt *trx_ctxt)
+{
+	if (OFI_UNLIKELY(++trx_ctxt->am_poll_count > PSMX3_AM_POLL_INTERVAL)) {
+		trx_ctxt->am_poll_count = 0;
+		psm2_poll(trx_ctxt->psm2_ep);
+	}
+}
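+
+/*
+ * Illustrative sketch of the pattern followed by the request paths in
+ * psmx3_atomic.c and the other AM-based files:
+ *
+ *	err = psm2_am_request_short(...);
+ *	if (err)
+ *		return psmx3_errno(err);
+ *	psmx3_am_poll(ep_priv->tx);
+ */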
+
+static inline int psmx3_peer_match(struct dlist_entry *item, const void *arg)
+{
+	struct psmx3_epaddr_context *peer;
+
+	peer = container_of(item, struct psmx3_epaddr_context, entry);
+	return (peer->epaddr == arg);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/deps/libfabric/prov/psm3/src/psmx3_am.c b/deps/libfabric/prov/psm3/src/psmx3_am.c
new file mode 100644
index 0000000000000000000000000000000000000000..680a5fabf4cef41ac36080022b4f2f90b72e1021
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psmx3_am.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2013-2018 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psmx3.h"
+#include "psmx3_trigger.h"
+
+int psmx3_am_progress(struct psmx3_trx_ctxt *trx_ctxt)
+{
+	struct slist_entry *item;
+	struct psmx3_trigger *trigger;
+
+	trx_ctxt->domain->trigger_queue_lock_fn(&trx_ctxt->trigger_queue.lock, 2);
+	while (!slist_empty(&trx_ctxt->trigger_queue.list)) {
+		item = slist_remove_head(&trx_ctxt->trigger_queue.list);
+		trigger = container_of(item, struct psmx3_trigger, list_entry);
+		trx_ctxt->domain->trigger_queue_unlock_fn(&trx_ctxt->trigger_queue.lock, 2);
+		psmx3_process_trigger(trx_ctxt, trigger);
+		trx_ctxt->domain->trigger_queue_lock_fn(&trx_ctxt->trigger_queue.lock, 2);
+	}
+	trx_ctxt->domain->trigger_queue_unlock_fn(&trx_ctxt->trigger_queue.lock, 2);
+
+	return 0;
+}
+
+int psmx3_am_init(struct psmx3_trx_ctxt *trx_ctxt)
+{
+	psm2_am_handler_2_fn_t psmx3_am_handlers[4];
+	struct psmx3_trx_ctxt *hctx[4];
+	int psmx3_am_handlers_idx[4];
+	int num_handlers = 4;
+
+	psm2_ep_t psm2_ep = trx_ctxt->psm2_ep;
+	size_t size;
+	int err = 0;
+	uint32_t max_atomic_size;
+
+	FI_INFO(&psmx3_prov, FI_LOG_CORE, "epid %016lx\n", trx_ctxt->psm2_epid);
+
+	if (!trx_ctxt->am_initialized) {
+		err = psm2_am_get_parameters(psm2_ep, &trx_ctxt->psm2_am_param,
+					     sizeof(struct psm2_am_parameters),
+					     &size);
+		if (err)
+			return psmx3_errno(err);
+
+		max_atomic_size = trx_ctxt->psm2_am_param.max_request_short;
+		if (trx_ctxt->domain->max_atomic_size > max_atomic_size)
+			trx_ctxt->domain->max_atomic_size = max_atomic_size;
+
+		psmx3_am_handlers[0] = psmx3_am_rma_handler;
+		hctx[0] = trx_ctxt;
+		psmx3_am_handlers[1] = psmx3_am_atomic_handler;
+		hctx[1] = trx_ctxt;
+		psmx3_am_handlers[2] = psmx3_am_sep_handler;
+		hctx[2] = trx_ctxt;
+		psmx3_am_handlers[3] = psmx3_am_trx_ctxt_handler;
+		hctx[3] = trx_ctxt;
+
+		err = psm2_am_register_handlers_2(psm2_ep, psmx3_am_handlers,
+						num_handlers, (void **)hctx, psmx3_am_handlers_idx);
+		if (err)
+			return psmx3_errno(err);
+
+		if ((psmx3_am_handlers_idx[0] != PSMX3_AM_RMA_HANDLER) ||
+		    (psmx3_am_handlers_idx[1] != PSMX3_AM_ATOMIC_HANDLER) ||
+		    (psmx3_am_handlers_idx[2] != PSMX3_AM_SEP_HANDLER) ||
+		    (psmx3_am_handlers_idx[3] != PSMX3_AM_TRX_CTXT_HANDLER)) {
+			FI_WARN(&psmx3_prov, FI_LOG_CORE,
+				"failed to register one or more AM handlers "
+				"at indecies %d, %d, %d, %d\n", PSMX3_AM_RMA_HANDLER,
+				PSMX3_AM_ATOMIC_HANDLER, PSMX3_AM_SEP_HANDLER,
+				PSMX3_AM_TRX_CTXT_HANDLER);
+			return -FI_EBUSY;
+		}
+
+		trx_ctxt->am_initialized = 1;
+	}
+
+	return err;
+}
+
+void psmx3_am_fini(struct psmx3_trx_ctxt *trx_ctxt)
+{
+	/* there is no way to unregister AM handlers */
+}
+
diff --git a/deps/libfabric/prov/psm3/src/psmx3_atomic.c b/deps/libfabric/prov/psm3/src/psmx3_atomic.c
new file mode 100644
index 0000000000000000000000000000000000000000..639377e559bda0feb79597b8e92e0f46cacee473
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psmx3_atomic.c
@@ -0,0 +1,2101 @@
+/*
+ * Copyright (c) 2013-2019 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psmx3.h"
+#include "psmx3_trigger.h"
+
+/* Atomics protocol:
+ *
+ * Atomics REQ:
+ *	args[0].u32w0	cmd
+ *	args[0].u32w1	count
+ *	args[1].u64	req
+ *	args[2].u64	addr
+ *	args[3].u64	key
+ *	args[4].u32w0	datatype
+ *	args[4].u32w1	op
+ *
+ * Atomics REP:
+ *	args[0].u32w0	cmd
+ *	args[0].u32w1	error
+ *	args[1].u64	req
+ */
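+
+/*
+ * Illustrative sketch of the REQ layout above, as packed by the request
+ * paths later in this file:
+ *
+ *	args[0].u32w0 = PSMX3_AM_REQ_ATOMIC_WRITE;	(cmd)
+ *	args[0].u32w1 = count;				(element count)
+ *	args[1].u64   = (uint64_t)(uintptr_t)req;	(echoed back in the REP)
+ *	args[2].u64   = addr;
+ *	args[3].u64   = key;
+ *	args[4].u32w0 = datatype;
+ *	args[4].u32w1 = op;
+ */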
+
+static fastlock_t psmx3_atomic_lock;
+
+void psmx3_atomic_global_init(void)
+{
+	fastlock_init(&psmx3_atomic_lock);
+}
+
+void psmx3_atomic_global_fini(void)
+{
+	fastlock_destroy(&psmx3_atomic_lock);
+}
+
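+/* Gather an ioc vector into a flat buffer, copying at most len bytes. */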
+static inline void psmx3_ioc_read(const struct fi_ioc *ioc, size_t count,
+				  int datatype, uint8_t *buf, size_t len)
+{
+	int i;
+	size_t copy_len;
+
+	for (i=0; i<count && len; i++) {
+		copy_len = ofi_datatype_size(datatype) * ioc[i].count;
+		if (copy_len > len)
+			copy_len = len;
+		memcpy(buf, ioc[i].addr, copy_len);
+		buf += copy_len;
+		len -= copy_len;
+	}
+}
+
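+/* Scatter a flat buffer back into an ioc vector, writing at most len bytes. */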
+static inline void psmx3_ioc_write(struct fi_ioc *ioc, size_t count,
+				   int datatype, const uint8_t *buf, size_t len)
+{
+	int i;
+	size_t copy_len;
+
+	for (i=0; i<count && len; i++) {
+		copy_len = ofi_datatype_size(datatype) * ioc[i].count;
+		if (copy_len > len)
+			copy_len = len;
+		memcpy(ioc[i].addr, buf, copy_len);
+		buf += copy_len;
+		len -= copy_len;
+	}
+}
+
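+/* Total size in bytes of the data described by an ioc vector. */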
+static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count,
+				    int datatype)
+{
+	int i;
+	size_t len = 0;
+
+	for (i=0; i<count; i++)
+		len += ofi_datatype_size(datatype) * ioc[i].count;
+
+	return len;
+}
+
+#define CASE_INT_TYPE(FUNC,...) \
+		case FI_INT8:	FUNC(__VA_ARGS__,int8_t); break; \
+		case FI_UINT8:	FUNC(__VA_ARGS__,uint8_t); break; \
+		case FI_INT16:	FUNC(__VA_ARGS__,int16_t); break; \
+		case FI_UINT16: FUNC(__VA_ARGS__,uint16_t); break; \
+		case FI_INT32:	FUNC(__VA_ARGS__,int32_t); break; \
+		case FI_UINT32: FUNC(__VA_ARGS__,uint32_t); break; \
+		case FI_INT64:	FUNC(__VA_ARGS__,int64_t); break; \
+		case FI_UINT64: FUNC(__VA_ARGS__,uint64_t); break;
+
+#define CASE_FP_TYPE(FUNC,...) \
+		case FI_FLOAT:	FUNC(__VA_ARGS__,float); break; \
+		case FI_DOUBLE:	FUNC(__VA_ARGS__,double); break; \
+		case FI_LONG_DOUBLE: FUNC(__VA_ARGS__,long double); break;
+
+#define CASE_COMPLEX_TYPE(FUNC,...) \
+		case FI_FLOAT_COMPLEX:	FUNC(__VA_ARGS__,float complex); break; \
+		case FI_DOUBLE_COMPLEX:	FUNC(__VA_ARGS__,double complex); break; \
+		case FI_LONG_DOUBLE_COMPLEX: FUNC(__VA_ARGS__,long double complex); break;
+
+#define SWITCH_INT_TYPE(type,...) \
+		switch (type) { \
+		CASE_INT_TYPE(__VA_ARGS__) \
+		default: return -FI_EOPNOTSUPP; \
+		}
+
+#define SWITCH_ORD_TYPE(type,...) \
+		switch (type) { \
+		CASE_INT_TYPE(__VA_ARGS__) \
+		CASE_FP_TYPE(__VA_ARGS__) \
+		default: return -FI_EOPNOTSUPP; \
+		}
+
+#define SWITCH_ALL_TYPE(type,...) \
+		switch (type) { \
+		CASE_INT_TYPE(__VA_ARGS__) \
+		CASE_FP_TYPE(__VA_ARGS__) \
+		CASE_COMPLEX_TYPE(__VA_ARGS__) \
+		default: return -FI_EOPNOTSUPP; \
+		}
+
+#define PSMX3_MIN(dst,src)	if ((dst) > (src)) (dst) = (src)
+#define PSMX3_MAX(dst,src)	if ((dst) < (src)) (dst) = (src)
+#define PSMX3_SUM(dst,src)	(dst) += (src)
+#define PSMX3_PROD(dst,src)	(dst) *= (src)
+#define PSMX3_LOR(dst,src)	(dst) = (dst) || (src)
+#define PSMX3_LAND(dst,src)	(dst) = (dst) && (src)
+#define PSMX3_BOR(dst,src)	(dst) |= (src)
+#define PSMX3_BAND(dst,src)	(dst) &= (src)
+#define PSMX3_LXOR(dst,src)	(dst) = ((dst) && !(src)) || (!(dst) && (src))
+#define PSMX3_BXOR(dst,src)	(dst) ^= (src)
+#define PSMX3_COPY(dst,src)	(dst) = (src)
+
+#define PSMX3_ATOMIC_READ(dst,res,cnt,TYPE) \
+		do { \
+			int i; \
+			TYPE *d = (dst); \
+			TYPE *r = (res); \
+			psmx3_lock(&psmx3_atomic_lock, 1); \
+			for (i=0; i<(cnt); i++) \
+				r[i] = d[i]; \
+			psmx3_unlock(&psmx3_atomic_lock, 1); \
+		} while (0)
+
+#define PSMX3_ATOMIC_WRITE(dst,src,cnt,OP,TYPE) \
+		do { \
+			int i; \
+			TYPE *d = (dst); \
+			TYPE *s = (src); \
+			psmx3_lock(&psmx3_atomic_lock, 1); \
+			for (i=0; i<(cnt); i++) \
+				OP(d[i],s[i]); \
+			psmx3_unlock(&psmx3_atomic_lock, 1); \
+		} while (0)
+
+#define PSMX3_ATOMIC_READWRITE(dst,src,res,cnt,OP,TYPE) \
+		do { \
+			int i; \
+			TYPE *d = (dst); \
+			TYPE *s = (src); \
+			TYPE *r = (res); \
+			psmx3_lock(&psmx3_atomic_lock, 1); \
+			for (i=0; i<(cnt); i++) {\
+				r[i] = d[i]; \
+				OP(d[i],s[i]); \
+			} \
+			psmx3_unlock(&psmx3_atomic_lock, 1); \
+		} while (0)
+
+#define PSMX3_ATOMIC_CSWAP(dst,src,cmp,res,cnt,CMP_OP,TYPE) \
+		do { \
+			int i; \
+			TYPE *d = (dst); \
+			TYPE *s = (src); \
+			TYPE *c = (cmp); \
+			TYPE *r = (res); \
+			psmx3_lock(&psmx3_atomic_lock, 1); \
+			for (i=0; i<(cnt); i++) { \
+				r[i] = d[i]; \
+				if (c[i] CMP_OP d[i]) \
+					d[i] = s[i]; \
+			} \
+			psmx3_unlock(&psmx3_atomic_lock, 1); \
+		} while (0)
+
+#define PSMX3_ATOMIC_MSWAP(dst,src,cmp,res,cnt,TYPE) \
+		do { \
+			int i; \
+			TYPE *d = (dst); \
+			TYPE *s = (src); \
+			TYPE *c = (cmp); \
+			TYPE *r = (res); \
+			psmx3_lock(&psmx3_atomic_lock, 1); \
+			for (i=0; i<(cnt); i++) { \
+				r[i] = d[i]; \
+				d[i] = (s[i] & c[i]) | (d[i] & ~c[i]); \
+			} \
+			psmx3_unlock(&psmx3_atomic_lock, 1); \
+		} while (0)
+
+static int psmx3_atomic_do_write(void *dest, void *src,
+				 int datatype, int op, int count)
+{
+	switch (op) {
+	case FI_MIN:
+		SWITCH_ORD_TYPE(datatype,PSMX3_ATOMIC_WRITE,
+				dest,src,count,PSMX3_MIN);
+		break;
+
+	case FI_MAX:
+		SWITCH_ORD_TYPE(datatype,PSMX3_ATOMIC_WRITE,
+				dest,src,count,PSMX3_MAX);
+		break;
+
+	case FI_SUM:
+		SWITCH_ALL_TYPE(datatype,PSMX3_ATOMIC_WRITE,
+				dest,src,count,PSMX3_SUM);
+		break;
+
+	case FI_PROD:
+		SWITCH_ALL_TYPE(datatype,PSMX3_ATOMIC_WRITE,
+				dest,src,count,PSMX3_PROD);
+		break;
+
+	case FI_LOR:
+		SWITCH_INT_TYPE(datatype,PSMX3_ATOMIC_WRITE,
+				dest,src,count,PSMX3_LOR);
+		break;
+
+	case FI_LAND:
+		SWITCH_INT_TYPE(datatype,PSMX3_ATOMIC_WRITE,
+				dest,src,count,PSMX3_LAND);
+		break;
+
+	case FI_BOR:
+		SWITCH_INT_TYPE(datatype,PSMX3_ATOMIC_WRITE,
+				dest,src,count,PSMX3_BOR);
+		break;
+
+	case FI_BAND:
+		SWITCH_INT_TYPE(datatype,PSMX3_ATOMIC_WRITE,
+				dest,src,count,PSMX3_BAND);
+		break;
+
+	case FI_LXOR:
+		SWITCH_INT_TYPE(datatype,PSMX3_ATOMIC_WRITE,
+				dest,src,count,PSMX3_LXOR);
+		break;
+
+	case FI_BXOR:
+		SWITCH_INT_TYPE(datatype,PSMX3_ATOMIC_WRITE,
+				dest,src,count,PSMX3_BXOR);
+		break;
+
+	case FI_ATOMIC_WRITE:
+		SWITCH_ALL_TYPE(datatype,PSMX3_ATOMIC_WRITE,
+				dest,src,count,PSMX3_COPY);
+		break;
+
+	default:
+		return -FI_EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int psmx3_atomic_do_readwrite(void *dest, void *src, void *result,
+				     int datatype, int op, int count)
+{
+	switch (op) {
+	case FI_MIN:
+		SWITCH_ORD_TYPE(datatype,PSMX3_ATOMIC_READWRITE,
+				dest,src,result,count,PSMX3_MIN);
+		break;
+
+	case FI_MAX:
+		SWITCH_ORD_TYPE(datatype,PSMX3_ATOMIC_READWRITE,
+				dest,src,result,count,PSMX3_MAX);
+		break;
+
+	case FI_SUM:
+		SWITCH_ALL_TYPE(datatype,PSMX3_ATOMIC_READWRITE,
+				dest,src,result,count,PSMX3_SUM);
+		break;
+
+	case FI_PROD:
+		SWITCH_ALL_TYPE(datatype,PSMX3_ATOMIC_READWRITE,
+				dest,src,result,count,PSMX3_PROD);
+		break;
+
+	case FI_LOR:
+		SWITCH_INT_TYPE(datatype,PSMX3_ATOMIC_READWRITE,
+				dest,src,result,count,PSMX3_LOR);
+		break;
+
+	case FI_LAND:
+		SWITCH_INT_TYPE(datatype,PSMX3_ATOMIC_READWRITE,
+				dest,src,result,count,PSMX3_LAND);
+		break;
+
+	case FI_BOR:
+		SWITCH_INT_TYPE(datatype,PSMX3_ATOMIC_READWRITE,
+				dest,src,result,count,PSMX3_BOR);
+		break;
+
+	case FI_BAND:
+		SWITCH_INT_TYPE(datatype,PSMX3_ATOMIC_READWRITE,
+				dest,src,result,count,PSMX3_BAND);
+		break;
+
+	case FI_LXOR:
+		SWITCH_INT_TYPE(datatype,PSMX3_ATOMIC_READWRITE,
+				dest,src,result,count,PSMX3_LXOR);
+		break;
+
+	case FI_BXOR:
+		SWITCH_INT_TYPE(datatype,PSMX3_ATOMIC_READWRITE,
+				dest,src,result,count,PSMX3_BXOR);
+		break;
+
+	case FI_ATOMIC_READ:
+		SWITCH_ALL_TYPE(datatype,PSMX3_ATOMIC_READ,
+				dest,result,count);
+		break;
+
+	case FI_ATOMIC_WRITE:
+		SWITCH_ALL_TYPE(datatype,PSMX3_ATOMIC_READWRITE,
+				dest,src,result,count,PSMX3_COPY);
+		break;
+
+	default:
+		return -FI_EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int psmx3_atomic_do_compwrite(void *dest, void *src, void *compare,
+				     void *result, int datatype, int op,
+				     int count)
+{
+	switch (op) {
+	case FI_CSWAP:
+		SWITCH_ALL_TYPE(datatype,PSMX3_ATOMIC_CSWAP,
+				dest,src,compare,result,count,==);
+		break;
+
+	case FI_CSWAP_NE:
+		SWITCH_ALL_TYPE(datatype,PSMX3_ATOMIC_CSWAP,
+				dest,src,compare,result,count,!=);
+		break;
+
+	case FI_CSWAP_LE:
+		SWITCH_ORD_TYPE(datatype,PSMX3_ATOMIC_CSWAP,
+				dest,src,compare,result,count,<=);
+		break;
+
+	case FI_CSWAP_LT:
+		SWITCH_ORD_TYPE(datatype,PSMX3_ATOMIC_CSWAP,
+				dest,src,compare,result,count,<);
+		break;
+
+	case FI_CSWAP_GE:
+		SWITCH_ORD_TYPE(datatype,PSMX3_ATOMIC_CSWAP,
+				dest,src,compare,result,count,>=);
+		break;
+
+	case FI_CSWAP_GT:
+		SWITCH_ORD_TYPE(datatype,PSMX3_ATOMIC_CSWAP,
+				dest,src,compare,result,count,>);
+		break;
+
+	case FI_MSWAP:
+		SWITCH_INT_TYPE(datatype,PSMX3_ATOMIC_MSWAP,
+				dest,src,compare,result,count);
+		break;
+
+	default:
+		return -FI_EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
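+/*
+ * AM handler for the atomics protocol, registered at index
+ * PSMX3_AM_ATOMIC_HANDLER by psmx3_am_init(). REQ commands apply the
+ * operation to the local MR and send a REP; REP commands complete the
+ * originating request (CQ event, counters, request free).
+ */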
+int psmx3_am_atomic_handler(psm2_am_token_t token,
+				psm2_amarg_t *args, int nargs, void *src,
+				uint32_t len, void *hctx)
+{
+	psm2_amarg_t rep_args[8];
+	int count;
+	uint8_t *addr;
+	uint64_t key;
+	int datatype, op;
+	int err = 0;
+	int op_error = 0;
+	struct psmx3_am_request *req;
+	struct psmx3_cq_event *event;
+	struct psmx3_fid_mr *mr;
+	struct psmx3_fid_cntr *cntr = NULL;
+	struct psmx3_fid_cntr *mr_cntr = NULL;
+	void *tmp_buf;
+	psm2_epaddr_t epaddr;
+	int cmd;
+	struct psmx3_trx_ctxt *rx;
+
+	psm2_am_get_source(token, &epaddr);
+	cmd = PSMX3_AM_GET_OP(args[0].u32w0);
+
+	switch (cmd) {
+	case PSMX3_AM_REQ_ATOMIC_WRITE:
+		rx = (struct psmx3_trx_ctxt *)hctx;
+		count = args[0].u32w1;
+		addr = (uint8_t *)(uintptr_t)args[2].u64;
+		key = args[3].u64;
+		datatype = args[4].u32w0;
+		op = args[4].u32w1;
+		assert(len == ofi_datatype_size(datatype) * count);
+
+		mr = psmx3_mr_get(rx->domain, key);
+		op_error = mr ?
+			psmx3_mr_validate(mr, (uint64_t)addr, len, FI_REMOTE_WRITE) :
+			-FI_EINVAL;
+
+		if (!op_error) {
+			addr += mr->offset;
+			psmx3_atomic_do_write(addr, src, datatype, op, count);
+
+			if (rx->ep->caps & FI_RMA_EVENT) {
+				cntr = rx->ep->remote_write_cntr;
+				mr_cntr = mr->cntr;
+
+				if (cntr)
+					psmx3_cntr_inc(cntr, 0);
+
+				if (mr_cntr && mr_cntr != cntr)
+					psmx3_cntr_inc(mr_cntr, 0);
+			}
+		}
+
+		rep_args[0].u32w0 = PSMX3_AM_REP_ATOMIC_WRITE;
+		rep_args[0].u32w1 = op_error;
+		rep_args[1].u64 = args[1].u64;
+		err = psm2_am_reply_short(token, PSMX3_AM_ATOMIC_HANDLER,
+					  rep_args, 2, NULL, 0, 0,
+					  NULL, NULL );
+		break;
+
+	case PSMX3_AM_REQ_ATOMIC_READWRITE:
+		rx = (struct psmx3_trx_ctxt *)hctx;
+		count = args[0].u32w1;
+		addr = (uint8_t *)(uintptr_t)args[2].u64;
+		key = args[3].u64;
+		datatype = args[4].u32w0;
+		op = args[4].u32w1;
+
+		if (op == FI_ATOMIC_READ)
+			len = ofi_datatype_size(datatype) * count;
+
+		assert(len == ofi_datatype_size(datatype) * count);
+
+		mr = psmx3_mr_get(rx->domain, key);
+		op_error = mr ?
+			psmx3_mr_validate(mr, (uint64_t)addr, len,
+					  FI_REMOTE_READ|FI_REMOTE_WRITE) :
+			-FI_EINVAL;
+
+		if (!op_error) {
+			addr += mr->offset;
+			tmp_buf = malloc(len);
+			if (tmp_buf)
+				psmx3_atomic_do_readwrite(addr, src, tmp_buf,
+							  datatype, op, count);
+			else
+				op_error = -FI_ENOMEM;
+
+			if (rx->ep->caps & FI_RMA_EVENT) {
+				if (op == FI_ATOMIC_READ) {
+					cntr = rx->ep->remote_read_cntr;
+				} else {
+					cntr = rx->ep->remote_write_cntr;
+					mr_cntr = mr->cntr;
+				}
+
+				if (cntr)
+					psmx3_cntr_inc(cntr, 0);
+
+				if (mr_cntr && mr_cntr != cntr)
+					psmx3_cntr_inc(mr_cntr, 0);
+			}
+		} else {
+			tmp_buf = NULL;
+		}
+
+		rep_args[0].u32w0 = PSMX3_AM_REP_ATOMIC_READWRITE;
+		rep_args[0].u32w1 = op_error;
+		rep_args[1].u64 = args[1].u64;
+		err = psm2_am_reply_short(token, PSMX3_AM_ATOMIC_HANDLER,
+					  rep_args, 2, tmp_buf,
+					  (tmp_buf ? len : 0),
+					  0, free, tmp_buf );
+		break;
+
+	case PSMX3_AM_REQ_ATOMIC_COMPWRITE:
+		rx = (struct psmx3_trx_ctxt *)hctx;
+		count = args[0].u32w1;
+		addr = (uint8_t *)(uintptr_t)args[2].u64;
+		key = args[3].u64;
+		datatype = args[4].u32w0;
+		op = args[4].u32w1;
+		len /= 2;
+		assert(len == ofi_datatype_size(datatype) * count);
+
+		mr = psmx3_mr_get(rx->domain, key);
+		op_error = mr ?
+			psmx3_mr_validate(mr, (uint64_t)addr, len,
+					  FI_REMOTE_READ|FI_REMOTE_WRITE) :
+			-FI_EINVAL;
+
+		if (!op_error) {
+			addr += mr->offset;
+			tmp_buf = malloc(len);
+			if (tmp_buf)
+				psmx3_atomic_do_compwrite(addr, src, (uint8_t *)src + len,
+							  tmp_buf, datatype,
+							  op, count);
+			else
+				op_error = -FI_ENOMEM;
+
+			if (rx->ep->caps & FI_RMA_EVENT) {
+				cntr = rx->ep->remote_write_cntr;
+				mr_cntr = mr->cntr;
+
+				if (cntr)
+					psmx3_cntr_inc(cntr, 0);
+
+				if (mr_cntr && mr_cntr != cntr)
+					psmx3_cntr_inc(mr_cntr, 0);
+			}
+		} else {
+			tmp_buf = NULL;
+		}
+
+		rep_args[0].u32w0 = PSMX3_AM_REP_ATOMIC_READWRITE;
+		rep_args[0].u32w1 = op_error;
+		rep_args[1].u64 = args[1].u64;
+		err = psm2_am_reply_short(token, PSMX3_AM_ATOMIC_HANDLER,
+					  rep_args, 2, tmp_buf,
+					  (tmp_buf ? len : 0),
+					  0, free, tmp_buf );
+		break;
+
+	case PSMX3_AM_REP_ATOMIC_WRITE:
+		req = (struct psmx3_am_request *)(uintptr_t)args[1].u64;
+		op_error = (int)args[0].u32w1;
+		assert(req->op == PSMX3_AM_REQ_ATOMIC_WRITE);
+		if (req->ep->send_cq && (!req->no_event || op_error)) {
+			event = psmx3_cq_create_event(
+					req->ep->send_cq,
+					req->atomic.context,
+					req->atomic.buf,
+					req->cq_flags,
+					req->atomic.len,
+					0, /* data */
+					0, /* tag */
+					0, /* olen */
+					op_error);
+			if (event)
+				psmx3_cq_enqueue_event(req->ep->send_cq, event);
+			else
+				err = -FI_ENOMEM;
+		}
+
+		if (req->ep->write_cntr)
+			psmx3_cntr_inc(req->ep->write_cntr, op_error);
+
+		free(req->tmpbuf);
+		psmx3_am_request_free(req->ep->tx, req);
+		break;
+
+	case PSMX3_AM_REP_ATOMIC_READWRITE:
+	case PSMX3_AM_REP_ATOMIC_COMPWRITE:
+		req = (struct psmx3_am_request *)(uintptr_t)args[1].u64;
+		op_error = (int)args[0].u32w1;
+		assert(op_error || req->atomic.len == len);
+
+		if (!op_error) {
+			if (req->atomic.result)
+				memcpy(req->atomic.result, src, len);
+			else
+				psmx3_ioc_write(req->ioc, req->atomic.iov_count,
+						req->atomic.datatype, src, len);
+		}
+
+		if (req->ep->send_cq && (!req->no_event || op_error)) {
+			event = psmx3_cq_create_event(
+					req->ep->send_cq,
+					req->atomic.context,
+					req->atomic.buf,
+					req->cq_flags,
+					req->atomic.len,
+					0, /* data */
+					0, /* tag */
+					0, /* olen */
+					op_error);
+			if (event)
+				psmx3_cq_enqueue_event(req->ep->send_cq, event);
+			else
+				err = -FI_ENOMEM;
+		}
+
+		if (req->ep->read_cntr)
+			psmx3_cntr_inc(req->ep->read_cntr, op_error);
+
+		free(req->tmpbuf);
+		psmx3_am_request_free(req->ep->tx, req);
+		break;
+
+	default:
+		err = -FI_EINVAL;
+	}
+	return err;
+}
+
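+/*
+ * Loopback path: the target address resolves to the local endpoint, so
+ * perform the atomic operation directly instead of issuing an AM request.
+ */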
+static int psmx3_atomic_self(int am_cmd,
+			     struct psmx3_fid_ep *ep,
+			     const void *buf,
+			     size_t count, void *desc,
+			     const void *compare, void *compare_desc,
+			     void *result, void *result_desc,
+			     uint64_t addr, uint64_t key,
+			     enum fi_datatype datatype,
+			     enum fi_op op, void *context,
+			     uint64_t flags)
+{
+	struct psmx3_fid_mr *mr;
+	struct psmx3_cq_event *event;
+	struct psmx3_fid_cntr *cntr = NULL;
+	struct psmx3_fid_cntr *mr_cntr = NULL;
+	void *tmp_buf;
+	size_t len;
+	int no_event;
+	int err = 0;
+	int op_error;
+	int access;
+	uint64_t cq_flags = 0;
+
+	if (am_cmd == PSMX3_AM_REQ_ATOMIC_WRITE)
+		access = FI_REMOTE_WRITE;
+	else
+		access = FI_REMOTE_READ | FI_REMOTE_WRITE;
+
+	len = ofi_datatype_size(datatype) * count;
+	mr = psmx3_mr_get(ep->domain, key);
+	op_error = mr ?  psmx3_mr_validate(mr, addr, len, access) : -FI_EINVAL;
+
+	if (op_error)
+		goto gen_local_event;
+
+	addr += mr->offset;
+
+	switch (am_cmd) {
+	case PSMX3_AM_REQ_ATOMIC_WRITE:
+		err = psmx3_atomic_do_write((void *)addr, (void *)buf,
+					    (int)datatype, (int)op, (int)count);
+		cq_flags = FI_WRITE | FI_ATOMIC;
+		break;
+
+	case PSMX3_AM_REQ_ATOMIC_READWRITE:
+		if (result != buf) {
+			err = psmx3_atomic_do_readwrite((void *)addr, (void *)buf,
+							(void *)result, (int)datatype,
+							(int)op, (int)count);
+		} else {
+			tmp_buf = malloc(len);
+			if (tmp_buf) {
+				memcpy(tmp_buf, result, len);
+				err = psmx3_atomic_do_readwrite((void *)addr, (void *)buf,
+								tmp_buf, (int)datatype,
+								(int)op, (int)count);
+				memcpy(result, tmp_buf, len);
+				free(tmp_buf);
+			} else {
+				err = -FI_ENOMEM;
+			}
+		}
+		if (op == FI_ATOMIC_READ)
+			cq_flags = FI_READ | FI_ATOMIC;
+		else
+			cq_flags = FI_WRITE | FI_ATOMIC;
+		break;
+
+	case PSMX3_AM_REQ_ATOMIC_COMPWRITE:
+		if (result != buf && result != compare) {
+			err = psmx3_atomic_do_compwrite((void *)addr, (void *)buf,
+							(void *)compare, (void *)result,
+							(int)datatype, (int)op, (int)count);
+		} else {
+			tmp_buf = malloc(len);
+			if (tmp_buf) {
+				memcpy(tmp_buf, result, len);
+				err = psmx3_atomic_do_compwrite((void *)addr, (void *)buf,
+								(void *)compare, tmp_buf,
+								(int)datatype, (int)op, (int)count);
+				memcpy(result, tmp_buf, len);
+				free(tmp_buf);
+			} else {
+				err = -FI_ENOMEM;
+			}
+		}
+		cq_flags = FI_WRITE | FI_ATOMIC;
+		break;
+	}
+
+	if (ep->caps & FI_RMA_EVENT) {
+		if (op == FI_ATOMIC_READ) {
+			cntr = ep->remote_read_cntr;
+		} else {
+			cntr = ep->remote_write_cntr;
+			mr_cntr = mr->cntr;
+		}
+
+		if (cntr)
+			psmx3_cntr_inc(cntr, 0);
+
+		if (mr_cntr && mr_cntr != cntr)
+			psmx3_cntr_inc(mr_cntr, 0);
+	}
+
+	op_error = err;
+
+gen_local_event:
+	no_event = ((flags & PSMX3_NO_COMPLETION) ||
+		    (ep->send_selective_completion && !(flags & FI_COMPLETION)));
+	if (ep->send_cq && (!no_event || op_error)) {
+		event = psmx3_cq_create_event(
+				ep->send_cq,
+				context,
+				(void *)buf,
+				cq_flags,
+				len,
+				0, /* data */
+				0, /* tag */
+				0, /* olen */
+				op_error);
+		if (event)
+			psmx3_cq_enqueue_event(ep->send_cq, event);
+		else
+			err = -FI_ENOMEM;
+	}
+
+	switch (am_cmd) {
+	case PSMX3_AM_REQ_ATOMIC_WRITE:
+		if (ep->write_cntr)
+			psmx3_cntr_inc(ep->write_cntr, op_error);
+		break;
+	case PSMX3_AM_REQ_ATOMIC_READWRITE:
+	case PSMX3_AM_REQ_ATOMIC_COMPWRITE:
+		if (ep->read_cntr)
+			psmx3_cntr_inc(ep->read_cntr, op_error);
+		break;
+	}
+
+	return err;
+}
+
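+/*
+ * Generic atomic write: triggered operations are queued, loopback targets
+ * go through psmx3_atomic_self(), and remote targets are sent as a single
+ * short AM request (the payload must fit within max_request_short).
+ */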
+ssize_t psmx3_atomic_write_generic(struct fid_ep *ep,
+				   const void *buf,
+				   size_t count, void *desc,
+				   fi_addr_t dest_addr,
+				   uint64_t addr, uint64_t key,
+				   enum fi_datatype datatype,
+				   enum fi_op op, void *context,
+				   uint64_t flags)
+{
+	struct psmx3_fid_ep *ep_priv;
+	struct psmx3_fid_av *av;
+	struct psmx3_am_request *req;
+	psm2_amarg_t args[8];
+	psm2_epaddr_t psm2_epaddr;
+	psm2_epid_t psm2_epid;
+	int am_flags = PSM2_AM_FLAG_ASYNC;
+	int chunk_size, len;
+	int err;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	if (flags & FI_TRIGGER)
+		return psmx3_trigger_queue_atomic_write(ep, buf, count, desc,
+							dest_addr, addr, key,
+							datatype, op, context,
+							flags);
+
+	assert(buf);
+	assert((int)datatype >= 0 && (int)datatype < FI_DATATYPE_LAST);
+	assert((int)op >= 0 && (int)op < FI_ATOMIC_OP_LAST);
+
+	av = ep_priv->av;
+	assert(av);
+
+	psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type);
+	psm2_epaddr_to_epid(psm2_epaddr, &psm2_epid);
+
+	if (psm2_epid == ep_priv->tx->psm2_epid)
+		return psmx3_atomic_self(PSMX3_AM_REQ_ATOMIC_WRITE, ep_priv,
+					 buf, count, desc, NULL, NULL, NULL,
+					 NULL, addr, key, datatype, op,
+					 context, flags);
+
+	chunk_size = ep_priv->tx->psm2_am_param.max_request_short;
+	len = ofi_datatype_size(datatype) * count;
+	if (len > chunk_size)
+		return -FI_EMSGSIZE;
+
+	req = psmx3_am_request_alloc(ep_priv->tx);
+	if (!req)
+		return -FI_ENOMEM;
+
+	if (flags & FI_INJECT) {
+		req->tmpbuf = malloc(len);
+		if (!req->tmpbuf) {
+			psmx3_am_request_free(ep_priv->tx, req);
+			return -FI_ENOMEM;
+		}
+
+		memcpy(req->tmpbuf, (void *)buf, len);
+		buf = req->tmpbuf;
+	}
+
+	req->no_event = (flags & PSMX3_NO_COMPLETION) ||
+			(ep_priv->send_selective_completion && !(flags & FI_COMPLETION));
+
+	req->op = PSMX3_AM_REQ_ATOMIC_WRITE;
+	req->atomic.buf = (void *)buf;
+	req->atomic.len = len;
+	req->atomic.addr = addr;
+	req->atomic.key = key;
+	req->atomic.context = context;
+	req->atomic.datatype = datatype;
+	req->ep = ep_priv;
+	req->cq_flags = FI_WRITE | FI_ATOMIC;
+
+	args[0].u32w0 = PSMX3_AM_REQ_ATOMIC_WRITE;
+	args[0].u32w1 = count;
+	args[1].u64 = (uint64_t)(uintptr_t)req;
+	args[2].u64 = addr;
+	args[3].u64 = key;
+	args[4].u32w0 = datatype;
+	args[4].u32w1 = op;
+	err = psm2_am_request_short(psm2_epaddr,
+				    PSMX3_AM_ATOMIC_HANDLER, args, 5,
+				    (void *)buf, len, am_flags, NULL, NULL);
+	if (err) {
+		free(req->tmpbuf);
+		psmx3_am_request_free(ep_priv->tx, req);
+		return psmx3_errno(err);
+	}
+
+	psmx3_am_poll(ep_priv->tx);
+	return 0;
+}
+
+ssize_t psmx3_atomic_writev_generic(struct fid_ep *ep,
+				   const struct fi_ioc *iov,
+				   void **desc, size_t count,
+				   fi_addr_t dest_addr,
+				   uint64_t addr, uint64_t key,
+				   enum fi_datatype datatype,
+				   enum fi_op op, void *context,
+				   uint64_t flags)
+{
+	struct psmx3_fid_ep *ep_priv;
+	struct psmx3_fid_av *av;
+	struct psmx3_am_request *req;
+	psm2_amarg_t args[8];
+	psm2_epaddr_t psm2_epaddr;
+	psm2_epid_t psm2_epid;
+	int am_flags = PSM2_AM_FLAG_ASYNC;
+	int chunk_size;
+	size_t len;
+	uint8_t *buf;
+	int err;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	if (flags & FI_TRIGGER)
+		return psmx3_trigger_queue_atomic_writev(ep, iov, desc, count,
+							 dest_addr, addr, key,
+							 datatype, op, context,
+							 flags);
+
+	assert(iov);
+	assert(count);
+	assert((int)datatype >= 0 && (int)datatype < FI_DATATYPE_LAST);
+	assert((int)op >= 0 && (int)op < FI_ATOMIC_OP_LAST);
+
+	while (count && !iov[count-1].count)
+		count--;
+
+	av = ep_priv->av;
+	assert(av);
+
+	len = psmx3_ioc_size(iov, count, datatype);
+
+	psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type);
+	psm2_epaddr_to_epid(psm2_epaddr, &psm2_epid);
+
+	if (psm2_epid == ep_priv->tx->psm2_epid) {
+		buf = malloc(len);
+		if (!buf)
+			return -FI_ENOMEM;
+
+		psmx3_ioc_read(iov, count, datatype, buf, len);
+
+		err = psmx3_atomic_self(PSMX3_AM_REQ_ATOMIC_WRITE, ep_priv,
+					buf, len / ofi_datatype_size(datatype),
+					NULL, NULL, NULL, NULL, NULL, addr,
+					key, datatype, op, context, flags);
+
+		free(buf);
+		return err;
+	}
+
+	chunk_size = ep_priv->tx->psm2_am_param.max_request_short;
+	if (len > chunk_size)
+		return -FI_EMSGSIZE;
+
+	req = psmx3_am_request_alloc(ep_priv->tx);
+	if (!req)
+		return -FI_ENOMEM;
+
+	if (count > 1) {
+		req->tmpbuf = malloc(len);
+		if (!req->tmpbuf) {
+			psmx3_am_request_free(ep_priv->tx, req);
+			return -FI_ENOMEM;
+		}
+
+		buf = req->tmpbuf;
+		psmx3_ioc_read(iov, count, datatype, buf, len);
+	} else {
+		buf = iov[0].addr;
+	}
+
+	req->no_event = (flags & PSMX3_NO_COMPLETION) ||
+			(ep_priv->send_selective_completion && !(flags & FI_COMPLETION));
+
+	req->op = PSMX3_AM_REQ_ATOMIC_WRITE;
+	req->atomic.buf = (void *)buf;
+	req->atomic.len = len;
+	req->atomic.addr = addr;
+	req->atomic.key = key;
+	req->atomic.context = context;
+	req->atomic.datatype = datatype;
+	req->ep = ep_priv;
+	req->cq_flags = FI_WRITE | FI_ATOMIC;
+
+	args[0].u32w0 = PSMX3_AM_REQ_ATOMIC_WRITE;
+	args[0].u32w1 = len / ofi_datatype_size(datatype);
+	args[1].u64 = (uint64_t)(uintptr_t)req;
+	args[2].u64 = addr;
+	args[3].u64 = key;
+	args[4].u32w0 = datatype;
+	args[4].u32w1 = op;
+	err = psm2_am_request_short(psm2_epaddr,
+				    PSMX3_AM_ATOMIC_HANDLER, args, 5,
+				    (void *)buf, len, am_flags, NULL, NULL);
+	if (err) {
+		free(req->tmpbuf);
+		psmx3_am_request_free(ep_priv->tx, req);
+		return psmx3_errno(err);
+	}
+
+	psmx3_am_poll(ep_priv->tx);
+	return 0;
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_atomic_write(struct fid_ep *ep,
+				  const void *buf,
+				  size_t count, void *desc,
+				  fi_addr_t dest_addr,
+				  uint64_t addr, uint64_t key,
+				  enum fi_datatype datatype,
+				  enum fi_op op, void *context)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+	return psmx3_atomic_write_generic(ep, buf, count, desc, dest_addr,
+					  addr, key, datatype, op, context,
+					  ep_priv->tx_flags);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_atomic_writemsg(struct fid_ep *ep,
+				const struct fi_msg_atomic *msg,
+				uint64_t flags)
+{
+	assert(msg);
+	assert(msg->iov_count);
+	assert(msg->msg_iov);
+	assert(msg->rma_iov);
+	assert(msg->rma_iov_count == 1);
+
+	if (msg->iov_count > 1)
+		return psmx3_atomic_writev_generic(ep, msg->msg_iov, msg->desc,
+						   msg->iov_count, msg->addr,
+						   msg->rma_iov[0].addr,
+						   msg->rma_iov[0].key,
+						   msg->datatype, msg->op,
+						   msg->context, flags);
+
+	return psmx3_atomic_write_generic(ep, msg->msg_iov[0].addr,
+					  msg->msg_iov[0].count,
+					  msg->desc ? msg->desc[0] : NULL,
+					  msg->addr, msg->rma_iov[0].addr,
+					  msg->rma_iov[0].key, msg->datatype,
+					  msg->op, msg->context, flags);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_atomic_writev(struct fid_ep *ep,
+				   const struct fi_ioc *iov,
+				   void **desc, size_t count,
+				   fi_addr_t dest_addr,
+				   uint64_t addr, uint64_t key,
+				   enum fi_datatype datatype,
+				   enum fi_op op, void *context)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	assert(iov);
+	assert(count);
+
+	if (count > 1)
+		return psmx3_atomic_writev_generic(ep, iov, desc, count,
+						   dest_addr, addr, key,
+						   datatype, op, context,
+						   ep_priv->tx_flags);
+
+	return psmx3_atomic_write_generic(ep, iov->addr, iov->count,
+					  desc ? desc[0] : NULL, dest_addr,
+					  addr, key, datatype, op, context,
+					  ep_priv->tx_flags);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_atomic_inject(struct fid_ep *ep,
+				   const void *buf,
+				   size_t count, /*void *desc,*/
+				   fi_addr_t dest_addr,
+				   uint64_t addr, uint64_t key,
+				   enum fi_datatype datatype,
+				   enum fi_op op)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+	return psmx3_atomic_write_generic(ep, buf, count, NULL/*desc*/,
+					  dest_addr, addr, key,
+					  datatype, op, NULL,
+					  ep_priv->tx_flags | FI_INJECT | PSMX3_NO_COMPLETION);
+}
+
+ssize_t psmx3_atomic_readwrite_generic(struct fid_ep *ep,
+				       const void *buf,
+				       size_t count, void *desc,
+				       void *result, void *result_desc,
+				       fi_addr_t dest_addr,
+				       uint64_t addr, uint64_t key,
+				       enum fi_datatype datatype,
+				       enum fi_op op, void *context,
+				       uint64_t flags)
+{
+	struct psmx3_fid_ep *ep_priv;
+	struct psmx3_fid_av *av;
+	struct psmx3_am_request *req;
+	psm2_amarg_t args[8];
+	psm2_epaddr_t psm2_epaddr;
+	psm2_epid_t psm2_epid;
+	int am_flags = PSM2_AM_FLAG_ASYNC;
+	int chunk_size, len;
+	int err;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	if (flags & FI_TRIGGER)
+		return psmx3_trigger_queue_atomic_readwrite(ep, buf, count,
+							    desc, result,
+							    result_desc,
+							    dest_addr, addr,
+							    key, datatype, op,
+							    context, flags);
+
+	assert(buf || op == FI_ATOMIC_READ);
+	assert((int)datatype >= 0 && (int)datatype < FI_DATATYPE_LAST);
+	assert((int)op >= 0 && (int)op < FI_ATOMIC_OP_LAST);
+
+	av = ep_priv->av;
+	assert(av);
+
+	psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type);
+	psm2_epaddr_to_epid(psm2_epaddr, &psm2_epid);
+
+	if (psm2_epid == ep_priv->tx->psm2_epid)
+		return psmx3_atomic_self(PSMX3_AM_REQ_ATOMIC_READWRITE, ep_priv,
+					 buf, count, desc, NULL, NULL, result,
+					 result_desc, addr, key, datatype, op,
+					 context, flags);
+
+	chunk_size = ep_priv->tx->psm2_am_param.max_request_short;
+	len = ofi_datatype_size(datatype) * count;
+	if (len > chunk_size)
+		return -FI_EMSGSIZE;
+
+	req = psmx3_am_request_alloc(ep_priv->tx);
+	if (!req)
+		return -FI_ENOMEM;
+
+	if ((flags & FI_INJECT) && op != FI_ATOMIC_READ) {
+		req->tmpbuf = malloc(len);
+		if (!req->tmpbuf) {
+			psmx3_am_request_free(ep_priv->tx, req);
+			return -FI_ENOMEM;
+		}
+
+		memcpy(req->tmpbuf, (void *)buf, len);
+		buf = req->tmpbuf;
+	}
+
+	req->no_event = (flags & PSMX3_NO_COMPLETION) ||
+			(ep_priv->send_selective_completion && !(flags & FI_COMPLETION));
+
+	req->op = PSMX3_AM_REQ_ATOMIC_READWRITE;
+	req->atomic.buf = (void *)buf;
+	req->atomic.len = len;
+	req->atomic.addr = addr;
+	req->atomic.key = key;
+	req->atomic.context = context;
+	req->atomic.result = result;
+	req->atomic.datatype = datatype;
+	req->ep = ep_priv;
+	if (op == FI_ATOMIC_READ)
+		req->cq_flags = FI_READ | FI_ATOMIC;
+	else
+		req->cq_flags = FI_WRITE | FI_ATOMIC;
+
+	args[0].u32w0 = PSMX3_AM_REQ_ATOMIC_READWRITE;
+	args[0].u32w1 = count;
+	args[1].u64 = (uint64_t)(uintptr_t)req;
+	args[2].u64 = addr;
+	args[3].u64 = key;
+	args[4].u32w0 = datatype;
+	args[4].u32w1 = op;
+	err = psm2_am_request_short(psm2_epaddr,
+				    PSMX3_AM_ATOMIC_HANDLER, args, 5,
+				    (void *)buf, (buf ? len : 0), am_flags, NULL,
+				    NULL);
+	if (err) {
+		free(req->tmpbuf);
+		psmx3_am_request_free(ep_priv->tx, req);
+		return psmx3_errno(err);
+	}
+
+	psmx3_am_poll(ep_priv->tx);
+	return 0;
+}
+
+ssize_t psmx3_atomic_readwritev_generic(struct fid_ep *ep,
+					const struct fi_ioc *iov,
+					void **desc, size_t count,
+					struct fi_ioc *resultv,
+					void **result_desc,
+					size_t result_count,
+					fi_addr_t dest_addr,
+					uint64_t addr, uint64_t key,
+					enum fi_datatype datatype,
+					enum fi_op op, void *context,
+					uint64_t flags)
+{
+	struct psmx3_fid_ep *ep_priv;
+	struct psmx3_fid_av *av;
+	struct psmx3_am_request *req;
+	psm2_amarg_t args[8];
+	psm2_epaddr_t psm2_epaddr;
+	psm2_epid_t psm2_epid;
+	int am_flags = PSM2_AM_FLAG_ASYNC;
+	int chunk_size;
+	size_t len, result_len, iov_size;
+	uint8_t *buf, *result;
+	void *desc0, *result_desc0;
+	int err;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	if (flags & FI_TRIGGER)
+		return psmx3_trigger_queue_atomic_readwritev(ep, iov, desc,
+							     count, resultv,
+							     result_desc,
+							     result_count,
+							     dest_addr, addr,
+							     key, datatype, op,
+							     context, flags);
+
+	assert((iov && count) || op == FI_ATOMIC_READ);
+	assert(resultv);
+	assert(result_count);
+	assert((int)datatype >= 0 && (int)datatype < FI_DATATYPE_LAST);
+	assert((int)op >= 0 && (int)op < FI_ATOMIC_OP_LAST);
+
+	if (iov) {
+		while (count && !iov[count-1].count)
+			count--;
+	}
+
+	while (result_count && !resultv[result_count-1].count)
+		result_count--;
+
+	result_len = psmx3_ioc_size(resultv, result_count, datatype);
+
+	if (op != FI_ATOMIC_READ) {
+		buf = iov[0].addr; /* as default for count == 1 */
+		len = psmx3_ioc_size(iov, count, datatype);
+		desc0 = desc ? desc[0] : NULL;
+	} else {
+		buf = NULL;
+		len = result_len;
+		desc0 = NULL;
+	}
+
+	assert(result_len >= len);
+
+	av = ep_priv->av;
+	assert(av);
+
+	psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type);
+	psm2_epaddr_to_epid(psm2_epaddr, &psm2_epid);
+
+	if (psm2_epid == ep_priv->tx->psm2_epid) {
+		if (buf && count > 1) {
+			buf = malloc(len);
+			if (!buf)
+				return -FI_ENOMEM;
+			psmx3_ioc_read(iov, count, datatype, buf, len);
+			desc0 = NULL;
+		}
+
+		if (result_count > 1) {
+			result = malloc(len);
+			if (!result) {
+				if (buf && count > 1)
+					free(buf);
+				return -FI_ENOMEM;
+			}
+			result_desc0 = result_desc ? result_desc[0] : NULL;
+		} else {
+			result = resultv[0].addr;
+			result_desc0 = NULL;
+		}
+
+		err = psmx3_atomic_self(PSMX3_AM_REQ_ATOMIC_READWRITE, ep_priv,
+					buf, len / ofi_datatype_size(datatype),
+					desc0, NULL, NULL, result, result_desc0,
+					addr, key, datatype, op, context, flags);
+
+		if (result_count > 1) {
+			psmx3_ioc_write(resultv, result_count, datatype, result, len);
+			free(result);
+		}
+
+		if (buf && count > 1)
+			free(buf);
+
+		return err;
+	}
+
+	chunk_size = ep_priv->tx->psm2_am_param.max_request_short;
+	if (len > chunk_size)
+		return -FI_EMSGSIZE;
+
+	iov_size = result_count > 1 ? result_count * sizeof(struct fi_ioc) : 0;
+
+	req = psmx3_am_request_alloc(ep_priv->tx);
+	if (!req)
+		return -FI_ENOMEM;
+
+	if (((flags & FI_INJECT) || count > 1) && op != FI_ATOMIC_READ) {
+		req->tmpbuf = malloc(iov_size + len);
+		if (!req->tmpbuf) {
+			psmx3_am_request_free(ep_priv->tx, req);
+			return -FI_ENOMEM;
+		}
+
+		buf = (uint8_t *)req->tmpbuf + iov_size;
+		psmx3_ioc_read(iov, count, datatype, buf, len);
+	} else {
+		req->tmpbuf = malloc(iov_size);
+		if (!req->tmpbuf) {
+			psmx3_am_request_free(ep_priv->tx, req);
+			return -FI_ENOMEM;
+		}
+	}
+
+	req->ioc = req->tmpbuf;
+	if (iov_size) {
+		memcpy(req->ioc, resultv, iov_size);
+		req->atomic.iov_count = result_count;
+		req->atomic.result = NULL;
+	} else {
+		req->atomic.buf = buf;
+		req->atomic.result = resultv[0].addr;
+	}
+
+	req->no_event = (flags & PSMX3_NO_COMPLETION) ||
+			(ep_priv->send_selective_completion && !(flags & FI_COMPLETION));
+
+	req->op = PSMX3_AM_REQ_ATOMIC_READWRITE;
+	req->atomic.buf = (void *)buf;
+	req->atomic.len = len;
+	req->atomic.addr = addr;
+	req->atomic.key = key;
+	req->atomic.context = context;
+	req->atomic.datatype = datatype;
+	req->ep = ep_priv;
+	if (op == FI_ATOMIC_READ)
+		req->cq_flags = FI_READ | FI_ATOMIC;
+	else
+		req->cq_flags = FI_WRITE | FI_ATOMIC;
+
+	args[0].u32w0 = PSMX3_AM_REQ_ATOMIC_READWRITE;
+	args[0].u32w1 = len / ofi_datatype_size(datatype);
+	args[1].u64 = (uint64_t)(uintptr_t)req;
+	args[2].u64 = addr;
+	args[3].u64 = key;
+	args[4].u32w0 = datatype;
+	args[4].u32w1 = op;
+	err = psm2_am_request_short(psm2_epaddr,
+				    PSMX3_AM_ATOMIC_HANDLER, args, 5,
+				    (void *)buf, (buf ? len : 0), am_flags, NULL,
+				    NULL);
+	if (err) {
+		free(req->tmpbuf);
+		psmx3_am_request_free(ep_priv->tx, req);
+		return psmx3_errno(err);
+	}
+
+	psmx3_am_poll(ep_priv->tx);
+	return 0;
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_atomic_readwrite(struct fid_ep *ep,
+				      const void *buf,
+				      size_t count, void *desc,
+				      void *result, void *result_desc,
+				      fi_addr_t dest_addr,
+				      uint64_t addr, uint64_t key,
+				      enum fi_datatype datatype,
+				      enum fi_op op, void *context)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+	return psmx3_atomic_readwrite_generic(ep, buf, count, desc,
+					      result, result_desc, dest_addr,
+					      addr, key, datatype, op,
+					      context, ep_priv->tx_flags);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_atomic_readwritemsg(struct fid_ep *ep,
+					 const struct fi_msg_atomic *msg,
+					 struct fi_ioc *resultv,
+					 void **result_desc,
+					 size_t result_count,
+					 uint64_t flags)
+{
+	void *buf;
+	size_t count;
+	void *desc;
+
+	assert(msg);
+	assert(msg->rma_iov);
+	assert(msg->rma_iov_count == 1);
+	assert(resultv);
+	assert(result_count);
+	assert((msg->msg_iov && msg->iov_count) || msg->op == FI_ATOMIC_READ);
+
+	if ((msg->op != FI_ATOMIC_READ && msg->iov_count > 1) ||
+	    result_count > 1)
+		return psmx3_atomic_readwritev_generic(ep, msg->msg_iov, msg->desc,
+						       msg->iov_count, resultv,
+						       result_desc, result_count,
+						       msg->addr,
+						       msg->rma_iov[0].addr,
+						       msg->rma_iov[0].key,
+						       msg->datatype, msg->op,
+						       msg->context, flags);
+
+	if (msg->op == FI_ATOMIC_READ) {
+		buf = NULL;
+		count = resultv[0].count;
+		desc = result_desc ? result_desc[0] : NULL;
+	} else {
+		buf = msg->msg_iov[0].addr;
+		count = msg->msg_iov[0].count;
+		desc = msg->desc ? msg->desc[0] : NULL;
+	}
+
+	return psmx3_atomic_readwrite_generic(ep, buf, count, desc, resultv[0].addr,
+					      result_desc ? result_desc[0] : NULL,
+					      msg->addr, msg->rma_iov[0].addr,
+					      msg->rma_iov[0].key, msg->datatype,
+					      msg->op, msg->context, flags);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_atomic_readwritev(struct fid_ep *ep,
+				       const struct fi_ioc *iov,
+				       void **desc, size_t count,
+				       struct fi_ioc *resultv,
+				       void **result_desc, size_t result_count,
+				       fi_addr_t dest_addr,
+				       uint64_t addr, uint64_t key,
+				       enum fi_datatype datatype,
+				       enum fi_op op, void *context)
+{
+	struct psmx3_fid_ep *ep_priv;
+	void *buf;
+	void *src_desc;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	assert(resultv);
+	assert(result_count);
+	assert((iov && count) || op == FI_ATOMIC_READ);
+
+	if ((op != FI_ATOMIC_READ && count > 1) || result_count > 1)
+		return psmx3_atomic_readwritev_generic(ep, iov, desc, count,
+					      resultv, result_desc, result_count,
+					      dest_addr, addr, key, datatype, op,
+					      context, ep_priv->tx_flags);
+
+	if (op == FI_ATOMIC_READ) {
+		buf = NULL;
+		count = resultv[0].count;
+		src_desc = result_desc ? result_desc[0] : NULL;
+	} else {
+		buf = iov[0].addr;
+		count = iov[0].count;
+		src_desc = desc ? desc[0] : NULL;
+	}
+
+	return psmx3_atomic_readwrite_generic(ep, buf, count, src_desc, resultv[0].addr,
+					      result_desc ? result_desc[0] : NULL,
+					      dest_addr, addr, key, datatype, op,
+					      context, ep_priv->tx_flags);
+}
+
+ssize_t psmx3_atomic_compwrite_generic(struct fid_ep *ep,
+				       const void *buf,
+				       size_t count, void *desc,
+				       const void *compare, void *compare_desc,
+				       void *result, void *result_desc,
+				       fi_addr_t dest_addr,
+				       uint64_t addr, uint64_t key,
+				       enum fi_datatype datatype,
+				       enum fi_op op, void *context,
+				       uint64_t flags)
+{
+	struct psmx3_fid_ep *ep_priv;
+	struct psmx3_fid_av *av;
+	struct psmx3_am_request *req;
+	psm2_amarg_t args[8];
+	psm2_epaddr_t psm2_epaddr;
+	psm2_epid_t psm2_epid;
+	int am_flags = PSM2_AM_FLAG_ASYNC;
+	int chunk_size, len;
+	int err;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	if (flags & FI_TRIGGER)
+		return psmx3_trigger_queue_atomic_compwrite(ep, buf, count,
+							    desc, compare,
+							    compare_desc,
+							    result, result_desc,
+							    dest_addr, addr,
+							    key, datatype, op,
+							    context, flags);
+
+	assert(buf);
+	assert((int)datatype >= 0 && (int)datatype < FI_DATATYPE_LAST);
+	assert((int)op >= 0 && (int)op < FI_ATOMIC_OP_LAST);
+
+	av = ep_priv->av;
+	assert(av);
+
+	psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type);
+	psm2_epaddr_to_epid(psm2_epaddr, &psm2_epid);
+
+	if (psm2_epid == ep_priv->tx->psm2_epid)
+		return psmx3_atomic_self(PSMX3_AM_REQ_ATOMIC_COMPWRITE, ep_priv,
+					 buf, count, desc, compare,
+					 compare_desc, result, result_desc,
+					 addr, key, datatype, op,
+					 context, flags);
+
+	chunk_size = ep_priv->tx->psm2_am_param.max_request_short;
+	len = ofi_datatype_size(datatype) * count;
+	if (len * 2 > chunk_size)
+		return -FI_EMSGSIZE;
+
+	req = psmx3_am_request_alloc(ep_priv->tx);
+	if (!req)
+		return -FI_ENOMEM;
+
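+	/*
+	 * The COMPWRITE payload carries the operand and compare buffers as
+	 * one contiguous 2*len-byte region (the handler reads the compare
+	 * data at src + len), so stage both in tmpbuf unless the caller's
+	 * buffers are already adjacent.
+	 */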
+	if ((flags & FI_INJECT) ||
+	    ((uintptr_t)compare != (uintptr_t)buf + len)) {
+		req->tmpbuf = malloc(len * 2);
+		if (!req->tmpbuf) {
+			psmx3_am_request_free(ep_priv->tx, req);
+			return -FI_ENOMEM;
+		}
+		memcpy(req->tmpbuf, buf, len);
+		memcpy((uint8_t *)req->tmpbuf + len, compare, len);
+		buf = req->tmpbuf;
+		compare = (uint8_t *)buf + len;
+	}
+
+	req->no_event = (flags & PSMX3_NO_COMPLETION) ||
+			(ep_priv->send_selective_completion && !(flags & FI_COMPLETION));
+
+	req->op = PSMX3_AM_REQ_ATOMIC_COMPWRITE;
+	req->atomic.buf = (void *)buf;
+	req->atomic.len = len;
+	req->atomic.addr = addr;
+	req->atomic.key = key;
+	req->atomic.context = context;
+	req->atomic.result = result;
+	req->atomic.datatype = datatype;
+	req->ep = ep_priv;
+	req->cq_flags = FI_WRITE | FI_ATOMIC;
+
+	args[0].u32w0 = PSMX3_AM_REQ_ATOMIC_COMPWRITE;
+	args[0].u32w1 = count;
+	args[1].u64 = (uint64_t)(uintptr_t)req;
+	args[2].u64 = addr;
+	args[3].u64 = key;
+	args[4].u32w0 = datatype;
+	args[4].u32w1 = op;
+	err = psm2_am_request_short(psm2_epaddr,
+				    PSMX3_AM_ATOMIC_HANDLER, args, 5,
+				    (void *)buf, len * 2, am_flags,
+				    NULL, NULL);
+	if (err) {
+		free(req->tmpbuf);
+		psmx3_am_request_free(ep_priv->tx, req);
+		return psmx3_errno(err);
+	}
+
+	psmx3_am_poll(ep_priv->tx);
+	return 0;
+}
+
+ssize_t psmx3_atomic_compwritev_generic(struct fid_ep *ep,
+					const struct fi_ioc *iov,
+					void **desc, size_t count,
+					const struct fi_ioc *comparev,
+					void **compare_desc,
+					size_t compare_count,
+					struct fi_ioc *resultv,
+					void **result_desc,
+					size_t result_count,
+					fi_addr_t dest_addr,
+					uint64_t addr, uint64_t key,
+					enum fi_datatype datatype,
+					enum fi_op op, void *context,
+					uint64_t flags)
+{
+	struct psmx3_fid_ep *ep_priv;
+	struct psmx3_fid_av *av;
+	struct psmx3_am_request *req;
+	psm2_amarg_t args[8];
+	psm2_epaddr_t psm2_epaddr;
+	psm2_epid_t psm2_epid;
+	int am_flags = PSM2_AM_FLAG_ASYNC;
+	int chunk_size;
+	size_t len, iov_size;
+	uint8_t *buf, *compare, *result;
+	void *desc0, *compare_desc0, *result_desc0;
+	int err;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	if (flags & FI_TRIGGER)
+		return psmx3_trigger_queue_atomic_compwritev(ep, iov, desc,
+							     count, comparev,
+							     compare_desc,
+							     compare_count,
+							     resultv,
+							     result_desc,
+							     result_count,
+							     dest_addr, addr,
+							     key, datatype, op,
+							     context, flags);
+
+	assert(iov);
+	assert(count);
+	assert(comparev);
+	assert(compare_count);
+	assert(resultv);
+	assert(result_count);
+	assert((int)datatype >= 0 && (int)datatype < FI_DATATYPE_LAST);
+	assert((int)op >= 0 && (int)op < FI_ATOMIC_OP_LAST);
+
+	while (count && !iov[count-1].count)
+		count--;
+
+	while (compare_count && !comparev[compare_count-1].count)
+		compare_count--;
+
+	while (result_count && !resultv[result_count-1].count)
+		result_count--;
+
+	len = psmx3_ioc_size(iov, count, datatype);
+
+	assert(psmx3_ioc_size(comparev, compare_count, datatype) >= len);
+	assert(psmx3_ioc_size(resultv, result_count, datatype) >= len);
+
+	av = ep_priv->av;
+	assert(av);
+
+	psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type);
+	psm2_epaddr_to_epid(psm2_epaddr, &psm2_epid);
+
+	if (psm2_epid == ep_priv->tx->psm2_epid) {
+		if (count > 1) {
+			buf = malloc(len);
+			if (!buf)
+				return -FI_ENOMEM;
+			psmx3_ioc_read(iov, count, datatype, buf, len);
+			desc0 = NULL;
+		} else {
+			buf = iov[0].addr;
+			desc0 = desc ? desc[0] : NULL;
+		}
+
+		if (compare_count > 1) {
+			compare = malloc(len);
+			if (!compare) {
+				if (count > 1)
+					free(buf);
+				return -FI_ENOMEM;
+			}
+			psmx3_ioc_read(comparev, compare_count, datatype, compare, len);
+			compare_desc0 = NULL;
+		} else {
+			compare = comparev[0].addr;
+			compare_desc0 = compare_desc ? compare_desc[0] : NULL;
+		}
+
+		if (result_count > 1) {
+			result = malloc(len);
+			if (!result) {
+				if (compare_count > 1)
+					free(compare);
+				if (count > 1)
+					free(buf);
+				return -FI_ENOMEM;
+			}
+			result_desc0 = NULL;
+		} else {
+			result = resultv[0].addr;
+			result_desc0 = result_desc ? result_desc[0] : NULL;
+		}
+
+		err = psmx3_atomic_self(PSMX3_AM_REQ_ATOMIC_COMPWRITE, ep_priv,
+					buf, len / ofi_datatype_size(datatype), desc0,
+					compare, compare_desc0, result, result_desc0,
+					addr, key, datatype, op, context, flags);
+
+		if (result_count > 1) {
+			psmx3_ioc_write(resultv, result_count, datatype, result, len);
+			free(result);
+		}
+
+		if (compare_count > 1)
+			free(compare);
+
+		if (count > 1)
+			free(buf);
+
+		return err;
+	}
+
+	chunk_size = ep_priv->tx->psm2_am_param.max_request_short;
+	if (len * 2 > chunk_size)
+		return -FI_EMSGSIZE;
+
+	iov_size = result_count > 1 ? result_count * sizeof(struct fi_ioc) : 0;
+
+	req = psmx3_am_request_alloc(ep_priv->tx);
+	if (!req)
+		return -FI_ENOMEM;
+
+	if ((flags & FI_INJECT) || count > 1 || compare_count > 1 ||
+	    (uintptr_t)comparev[0].addr != (uintptr_t)iov[0].addr + len) {
+		req->tmpbuf = malloc(iov_size + len + len);
+		if (!req->tmpbuf) {
+			psmx3_am_request_free(ep_priv->tx, req);
+			return -FI_ENOMEM;
+		}
+		buf = (uint8_t *)req->tmpbuf + iov_size;
+		psmx3_ioc_read(iov, count, datatype, buf, len);
+		psmx3_ioc_read(comparev, compare_count, datatype, buf + len, len);
+	} else {
+		req->tmpbuf = malloc(iov_size);
+		if (!req->tmpbuf) {
+			psmx3_am_request_free(ep_priv->tx, req);
+			return -FI_ENOMEM;
+		}
+		buf = iov[0].addr;
+	}
+
+	req->ioc = req->tmpbuf;
+	if (iov_size) {
+		memcpy(req->ioc, resultv, iov_size);
+		req->atomic.iov_count = result_count;
+		req->atomic.result = NULL;
+	} else {
+		req->atomic.buf = buf;
+		req->atomic.result = resultv[0].addr;
+	}
+
+	req->no_event = (flags & PSMX3_NO_COMPLETION) ||
+			(ep_priv->send_selective_completion && !(flags & FI_COMPLETION));
+
+	req->op = PSMX3_AM_REQ_ATOMIC_COMPWRITE;
+	req->atomic.len = len;
+	req->atomic.addr = addr;
+	req->atomic.key = key;
+	req->atomic.context = context;
+	req->atomic.datatype = datatype;
+	req->ep = ep_priv;
+	req->cq_flags = FI_WRITE | FI_ATOMIC;
+
+	args[0].u32w0 = PSMX3_AM_REQ_ATOMIC_COMPWRITE;
+	args[0].u32w1 = len / ofi_datatype_size(datatype);
+	args[1].u64 = (uint64_t)(uintptr_t)req;
+	args[2].u64 = addr;
+	args[3].u64 = key;
+	args[4].u32w0 = datatype;
+	args[4].u32w1 = op;
+	err = psm2_am_request_short(psm2_epaddr,
+				    PSMX3_AM_ATOMIC_HANDLER, args, 5,
+				    buf, len * 2, am_flags, NULL, NULL);
+	if (err) {
+		free(req->tmpbuf);
+		psmx3_am_request_free(ep_priv->tx, req);
+		return psmx3_errno(err);
+	}
+
+	psmx3_am_poll(ep_priv->tx);
+	return 0;
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_atomic_compwrite(struct fid_ep *ep,
+				      const void *buf,
+				      size_t count, void *desc,
+				      const void *compare, void *compare_desc,
+				      void *result, void *result_desc,
+				      fi_addr_t dest_addr,
+				      uint64_t addr, uint64_t key,
+				      enum fi_datatype datatype,
+				      enum fi_op op, void *context)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+	return psmx3_atomic_compwrite_generic(ep, buf, count, desc,
+					      compare, compare_desc,
+					      result, result_desc,
+					      dest_addr, addr, key,
+					      datatype, op, context, ep_priv->tx_flags);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_atomic_compwritemsg(struct fid_ep *ep,
+					 const struct fi_msg_atomic *msg,
+					 const struct fi_ioc *comparev,
+					 void **compare_desc,
+					 size_t compare_count,
+					 struct fi_ioc *resultv,
+					 void **result_desc,
+					 size_t result_count,
+					 uint64_t flags)
+{
+	assert(msg);
+	assert(msg->msg_iov);
+	assert(msg->iov_count);
+	assert(msg->rma_iov);
+	assert(msg->rma_iov_count == 1);
+	assert(comparev);
+	assert(compare_count);
+	assert(resultv);
+	assert(result_count);
+
+	if (msg->iov_count > 1 || compare_count > 1 || result_count > 1)
+		return psmx3_atomic_compwritev_generic(ep, msg->msg_iov, msg->desc,
+						       msg->iov_count, comparev,
+						       compare_desc, compare_count,
+						       resultv, result_desc, result_count,
+						       msg->addr, msg->rma_iov[0].addr,
+						       msg->rma_iov[0].key, msg->datatype,
+						       msg->op, msg->context, flags);
+
+	return psmx3_atomic_compwrite_generic(ep, msg->msg_iov[0].addr,
+					      msg->msg_iov[0].count,
+					      msg->desc ? msg->desc[0] : NULL,
+					      comparev[0].addr,
+					      compare_desc ? compare_desc[0] : NULL,
+					      resultv[0].addr,
+					      result_desc ? result_desc[0] : NULL,
+					      msg->addr, msg->rma_iov[0].addr,
+					      msg->rma_iov[0].key, msg->datatype,
+					      msg->op, msg->context, flags);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_atomic_compwritev(struct fid_ep *ep,
+				       const struct fi_ioc *iov,
+				       void **desc, size_t count,
+				       const struct fi_ioc *comparev,
+				       void **compare_desc,
+				       size_t compare_count,
+				       struct fi_ioc *resultv,
+				       void **result_desc,
+				       size_t result_count,
+				       fi_addr_t dest_addr,
+				       uint64_t addr, uint64_t key,
+				       enum fi_datatype datatype,
+				       enum fi_op op, void *context)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	assert(iov);
+	assert(count);
+	assert(comparev);
+	assert(compare_count);
+	assert(resultv);
+	assert(result_count);
+
+	if (count > 1 || compare_count > 1 || result_count > 1)
+		return psmx3_atomic_compwritev_generic(ep, iov, desc, count,
+						       comparev, compare_desc,
+						       compare_count, resultv,
+						       result_desc, result_count,
+						       dest_addr, addr, key,
+						       datatype, op, context,
+						       ep_priv->tx_flags);
+
+	return psmx3_atomic_compwrite_generic(ep, iov->addr, iov->count,
+					      desc ? desc[0] : NULL,
+					      comparev[0].addr,
+					      compare_desc ? compare_desc[0] : NULL,
+					      resultv[0].addr,
+					      result_desc ? result_desc[0] : NULL,
+					      dest_addr, addr, key, datatype, op,
+					      context, ep_priv->tx_flags);
+}
+
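+/*
+ * Validity checks shared by the fi_*atomicvalid() entry points and
+ * psmx3_query_atomic(): accept the datatype/op combination and report how
+ * many elements fit into a single short AM request.
+ */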
+static int psmx3_atomic_writevalid_internal(size_t chunk_size,
+					    enum fi_datatype datatype,
+					    enum fi_op op, size_t *count)
+{
+	if (datatype >= FI_DATATYPE_LAST)
+		return -FI_EOPNOTSUPP;
+
+	switch (op) {
+	case FI_MIN:
+	case FI_MAX:
+	case FI_SUM:
+	case FI_PROD:
+	case FI_LOR:
+	case FI_LAND:
+	case FI_BOR:
+	case FI_BAND:
+	case FI_LXOR:
+	case FI_BXOR:
+	case FI_ATOMIC_WRITE:
+		break;
+
+	default:
+		return -FI_EOPNOTSUPP;
+	}
+
+	if (count)
+		*count = chunk_size / ofi_datatype_size(datatype);
+
+	return 0;
+}
+
+static int psmx3_atomic_readwritevalid_internal(size_t chunk_size,
+						enum fi_datatype datatype,
+						enum fi_op op, size_t *count)
+{
+	if (datatype >= FI_DATATYPE_LAST)
+		return -FI_EOPNOTSUPP;
+
+	switch (op) {
+	case FI_MIN:
+	case FI_MAX:
+	case FI_SUM:
+	case FI_PROD:
+	case FI_LOR:
+	case FI_LAND:
+	case FI_BOR:
+	case FI_BAND:
+	case FI_LXOR:
+	case FI_BXOR:
+	case FI_ATOMIC_READ:
+	case FI_ATOMIC_WRITE:
+		break;
+
+	default:
+		return -FI_EOPNOTSUPP;
+	}
+
+	if (count)
+		*count = chunk_size / ofi_datatype_size(datatype);
+
+	return 0;
+}
+
+static int psmx3_atomic_compwritevalid_internal(size_t chunk_size,
+						enum fi_datatype datatype,
+						enum fi_op op, size_t *count)
+{
+	if (datatype >= FI_DATATYPE_LAST)
+		return -FI_EOPNOTSUPP;
+
+	switch (op) {
+	case FI_CSWAP:
+	case FI_CSWAP_NE:
+		break;
+
+	case FI_CSWAP_LE:
+	case FI_CSWAP_LT:
+	case FI_CSWAP_GE:
+	case FI_CSWAP_GT:
+		if (datatype == FI_FLOAT_COMPLEX ||
+		    datatype == FI_DOUBLE_COMPLEX ||
+		    datatype == FI_LONG_DOUBLE_COMPLEX)
+			return -FI_EOPNOTSUPP;
+		break;
+
+	case FI_MSWAP:
+		if (datatype == FI_FLOAT_COMPLEX ||
+		    datatype == FI_DOUBLE_COMPLEX ||
+		    datatype == FI_LONG_DOUBLE_COMPLEX ||
+		    datatype == FI_FLOAT ||
+		    datatype == FI_DOUBLE ||
+		    datatype == FI_LONG_DOUBLE)
+			return -FI_EOPNOTSUPP;
+		break;
+
+	default:
+		return -FI_EOPNOTSUPP;
+	}
+
+	if (count)
+		*count = chunk_size / (2 * ofi_datatype_size(datatype));
+
+	return 0;
+}
+
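+/*
+ * Worked example (illustrative chunk size, not tied to any particular
+ * hardware): with chunk_size = 8192 and FI_DOUBLE (8 bytes), the write
+ * and read-write checks above report 8192 / 8 = 1024 elements per call,
+ * while the compare-write check reports 8192 / (2 * 8) = 512, since a
+ * compare-and-swap request must carry both the compare and the swap
+ * buffers in the same AM payload.
+ */
+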
+DIRECT_FN
+STATIC int psmx3_atomic_writevalid(struct fid_ep *ep,
+				   enum fi_datatype datatype,
+				   enum fi_op op, size_t *count)
+{
+	struct psmx3_fid_ep *ep_priv;
+	size_t chunk_size;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+	chunk_size = ep_priv->tx->psm2_am_param.max_request_short;
+	return psmx3_atomic_writevalid_internal(chunk_size, datatype, op, count);
+}
+
+DIRECT_FN
+STATIC int psmx3_atomic_readwritevalid(struct fid_ep *ep,
+				       enum fi_datatype datatype,
+				       enum fi_op op, size_t *count)
+{
+	struct psmx3_fid_ep *ep_priv;
+	size_t chunk_size;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+	chunk_size = ep_priv->tx->psm2_am_param.max_request_short;
+	return psmx3_atomic_readwritevalid_internal(chunk_size, datatype, op, count);
+}
+
+DIRECT_FN
+STATIC int psmx3_atomic_compwritevalid(struct fid_ep *ep,
+				       enum fi_datatype datatype,
+				       enum fi_op op, size_t *count)
+{
+	struct psmx3_fid_ep *ep_priv;
+	size_t chunk_size;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+	chunk_size = ep_priv->tx->psm2_am_param.max_request_short;
+	return psmx3_atomic_compwritevalid_internal(chunk_size, datatype, op, count);
+}
+
+DIRECT_FN
+int psmx3_query_atomic(struct fid_domain *domain, enum fi_datatype datatype,
+		       enum fi_op op, struct fi_atomic_attr *attr, uint64_t flags)
+{
+	struct psmx3_fid_domain *domain_priv;
+	size_t chunk_size;
+	size_t count;
+	int ret;
+
+	domain_priv = container_of(domain, struct psmx3_fid_domain, util_domain.domain_fid);
+	chunk_size = domain_priv->max_atomic_size;
+
+	if (flags & FI_TAGGED)
+		return -FI_EOPNOTSUPP;
+
+	if (flags & FI_COMPARE_ATOMIC) {
+		if (flags & FI_FETCH_ATOMIC)
+			return -FI_EINVAL;
+		ret = psmx3_atomic_compwritevalid_internal(chunk_size, datatype,
+							   op, &count);
+	} else if (flags & FI_FETCH_ATOMIC) {
+		ret = psmx3_atomic_readwritevalid_internal(chunk_size, datatype,
+							   op, &count);
+	} else {
+		ret = psmx3_atomic_writevalid_internal(chunk_size, datatype,
+						       op, &count);
+	}
+
+	if (attr && !ret) {
+		attr->size = ofi_datatype_size(datatype);
+		attr->count = count;
+	}
+
+	return ret;
+}
+
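+/*
+ * Usage sketch (hypothetical application code, not from this file):
+ * querying whether a fetching FI_SUM on FI_UINT64 is supported and how
+ * many elements fit in one call:
+ *
+ *	struct fi_atomic_attr attr;
+ *	int ret = fi_query_atomic(domain, FI_UINT64, FI_SUM, &attr,
+ *				  FI_FETCH_ATOMIC);
+ *	if (ret == 0)
+ *		printf("up to %zu elements of %zu bytes\n",
+ *		       attr.count, attr.size);
+ */
+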
+struct fi_ops_atomic psmx3_atomic_ops = {
+	.size = sizeof(struct fi_ops_atomic),
+	.write = psmx3_atomic_write,
+	.writev = psmx3_atomic_writev,
+	.writemsg = psmx3_atomic_writemsg,
+	.inject = psmx3_atomic_inject,
+	.readwrite = psmx3_atomic_readwrite,
+	.readwritev = psmx3_atomic_readwritev,
+	.readwritemsg = psmx3_atomic_readwritemsg,
+	.compwrite = psmx3_atomic_compwrite,
+	.compwritev = psmx3_atomic_compwritev,
+	.compwritemsg = psmx3_atomic_compwritemsg,
+	.writevalid = psmx3_atomic_writevalid,
+	.readwritevalid = psmx3_atomic_readwritevalid,
+	.compwritevalid = psmx3_atomic_compwritevalid,
+};
+
diff --git a/deps/libfabric/prov/psm3/src/psmx3_attr.c b/deps/libfabric/prov/psm3/src/psmx3_attr.c
new file mode 100644
index 0000000000000000000000000000000000000000..8d74f0f0a15b0cf6d26736f698b60a133a0bb151
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psmx3_attr.c
@@ -0,0 +1,631 @@
+/*
+ * Copyright (c) 2013-2019 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psmx3.h"
+
+/*
+ * Default provider attributes are defined for:
+ *
+ * 	full set of capabilities
+ * 	ep type = FI_EP_RDM
+ * 	addr format = FI_ADDR_PSMX3
+ * 	cq_data_size = 0
+ *
+ * This is used as a template to create actual provider info, which will
+ * have some fields modified for different configurations and some fields
+ * updated to environment settings.
+ */
+
+static struct fi_tx_attr psmx3_tx_attr = {
+	.caps			= PSMX3_TX_CAPS, /* PSMX3_RMA_TX_CAPS */
+	.mode			= FI_CONTEXT, /* 0 */
+	.op_flags		= PSMX3_OP_FLAGS,
+	.msg_order		= PSMX3_MSG_ORDER,
+	.comp_order		= PSMX3_COMP_ORDER,
+	.inject_size		= 64, /* psmx3_env.inject_size */
+	.size			= UINT64_MAX,
+	.iov_limit		= PSMX3_IOV_MAX_COUNT,
+	.rma_iov_limit		= 1,
+};
+
+static struct fi_rx_attr psmx3_rx_attr = {
+	.caps			= PSMX3_RX_CAPS, /* PSMX3_RMA_RX_CAPS */
+	.mode			= FI_CONTEXT, /* 0 */
+	.op_flags		= PSMX3_OP_FLAGS,
+	.msg_order		= PSMX3_MSG_ORDER,
+	.comp_order		= PSMX3_COMP_ORDER,
+	.total_buffered_recv	= UINT64_MAX,
+	.size			= UINT64_MAX,
+	.iov_limit		= 1,
+};
+
+static struct fi_ep_attr psmx3_ep_attr = {
+	.type			= FI_EP_RDM, /* FI_EP_DGRAM */
+	.protocol		= FI_PROTO_PSMX3,
+	.protocol_version	= PSM2_VERNO,
+	.max_msg_size		= PSMX3_MAX_MSG_SIZE & ~0x0FFF,
+	.msg_prefix_size	= 0,
+	.max_order_raw_size	= PSMX3_RMA_ORDER_SIZE,
+	.max_order_war_size	= PSMX3_RMA_ORDER_SIZE,
+	.max_order_waw_size	= PSMX3_RMA_ORDER_SIZE,
+	.mem_tag_format		= FI_TAG_GENERIC, /* >>= 4 */
+	.tx_ctx_cnt		= 1,
+	.rx_ctx_cnt		= 1,
+	.auth_key_size		= sizeof(psm2_uuid_t),
+	.auth_key		= NULL,
+};
+
+static struct fi_domain_attr psmx3_domain_attr = {
+	.domain			= NULL,
+	.name			= PSMX3_DOMAIN_NAME,
+	.threading		= FI_THREAD_SAFE,
+	.control_progress	= FI_PROGRESS_AUTO,
+	.data_progress		= FI_PROGRESS_AUTO,
+	.resource_mgmt		= FI_RM_ENABLED,
+	.av_type		= FI_AV_UNSPEC,
+	.mr_mode		= FI_MR_SCALABLE | FI_MR_BASIC,
+	.mr_key_size		= sizeof(uint64_t),
+	.cq_data_size		= 0, /* 4, 8 */
+	.cq_cnt			= 65535,
+	.ep_cnt			= 65535,
+	.tx_ctx_cnt		= 1, /* psmx3_hfi_info.free_trx_ctxt */
+	.rx_ctx_cnt		= 1, /* psmx3_hfi_info.free_trx_ctxt */
+	.max_ep_tx_ctx		= 1, /* psmx3_hfi_info.max_trx_ctxt */
+	.max_ep_rx_ctx		= 1, /* psmx3_hfi_info.max_trx_ctxt */
+	.max_ep_stx_ctx		= 1, /* psmx3_hfi_info.max_trx_ctxt */
+	.max_ep_srx_ctx		= 0,
+	.cntr_cnt		= 65535,
+	.mr_iov_limit		= 65535,
+	.caps			= PSMX3_DOM_CAPS,
+	.mode			= 0,
+	.auth_key		= NULL,
+	.auth_key_size		= sizeof(psm2_uuid_t),
+	.max_err_data		= PSMX3_ERR_DATA_SIZE,
+	.mr_cnt			= 65535,
+};
+
+static struct fi_fabric_attr psmx3_fabric_attr = {
+	.name			= PSMX3_FABRIC_NAME,
+};
+
+static struct fi_info psmx3_prov_info = {
+	.next			= NULL,
+	.caps			= PSMX3_CAPS, /* PSMX3_RMA_CAPS */
+	.mode			= FI_CONTEXT, /* 0 */
+	.addr_format		= FI_ADDR_PSMX3, /* FI_ADDR_STR */
+	.src_addrlen		= sizeof(struct psmx3_ep_name),
+	.dest_addrlen		= sizeof(struct psmx3_ep_name),
+	.src_addr		= NULL,
+	.dest_addr		= NULL,
+	.handle			= NULL,
+	.tx_attr		= &psmx3_tx_attr,
+	.rx_attr		= &psmx3_rx_attr,
+	.ep_attr		= &psmx3_ep_attr,
+	.domain_attr		= &psmx3_domain_attr,
+	.fabric_attr		= &psmx3_fabric_attr,
+};
+
+#ifdef HAVE_PSM3_DL
+static struct fi_info *psmx3_allocinfo_internal(void)
+{
+	struct fi_info *info;
+
+	info = calloc(1, sizeof(*info));
+	if (!info)
+		return NULL;
+
+	info->tx_attr = calloc(1, sizeof(*info->tx_attr));
+	info->rx_attr = calloc(1, sizeof(*info->rx_attr));
+	info->ep_attr = calloc(1, sizeof(*info->ep_attr));
+	info->domain_attr = calloc(1, sizeof(*info->domain_attr));
+	info->fabric_attr = calloc(1, sizeof(*info->fabric_attr));
+	if (!info->tx_attr || !info->rx_attr || !info->ep_attr ||
+	    !info->domain_attr || !info->fabric_attr)
+		goto err;
+
+	return info;
+err:
+	fi_freeinfo(info);
+	return NULL;
+}
+static struct fi_info *psmx3_dupinfo(const struct fi_info *info)
+{
+	struct fi_info *dup;
+	int ret;
+
+	if (!info)
+		return psmx3_allocinfo_internal();
+
+	dup = mem_dup(info, sizeof(*dup));
+	if (dup == NULL) {
+		return NULL;
+	}
+	dup->src_addr = NULL;
+	dup->dest_addr = NULL;
+	dup->tx_attr = NULL;
+	dup->rx_attr = NULL;
+	dup->ep_attr = NULL;
+	dup->domain_attr = NULL;
+	dup->fabric_attr = NULL;
+	dup->next = NULL;
+
+	if (info->src_addr != NULL) {
+		dup->src_addr = mem_dup(info->src_addr, info->src_addrlen);
+		if (dup->src_addr == NULL)
+			goto fail;
+	}
+	if (info->dest_addr != NULL) {
+		dup->dest_addr = mem_dup(info->dest_addr, info->dest_addrlen);
+		if (dup->dest_addr == NULL)
+			goto fail;
+	}
+	if (info->tx_attr != NULL) {
+		dup->tx_attr = mem_dup(info->tx_attr, sizeof(*info->tx_attr));
+		if (dup->tx_attr == NULL)
+			goto fail;
+	}
+	if (info->rx_attr != NULL) {
+		dup->rx_attr = mem_dup(info->rx_attr, sizeof(*info->rx_attr));
+		if (dup->rx_attr == NULL)
+			goto fail;
+	}
+	if (info->ep_attr != NULL) {
+		dup->ep_attr = mem_dup(info->ep_attr, sizeof(*info->ep_attr));
+		if (dup->ep_attr == NULL)
+			goto fail;
+		if (info->ep_attr->auth_key != NULL) {
+			dup->ep_attr->auth_key =
+				mem_dup(info->ep_attr->auth_key,
+					info->ep_attr->auth_key_size);
+			if (dup->ep_attr->auth_key == NULL)
+				goto fail;
+		}
+	}
+	if (info->domain_attr) {
+		dup->domain_attr = mem_dup(info->domain_attr,
+					   sizeof(*info->domain_attr));
+		if (dup->domain_attr == NULL)
+			goto fail;
+		dup->domain_attr->name = NULL;
+		dup->domain_attr->auth_key = NULL;
+		if (info->domain_attr->name != NULL) {
+			dup->domain_attr->name = strdup(info->domain_attr->name);
+			if (dup->domain_attr->name == NULL)
+				goto fail;
+		}
+		if (info->domain_attr->auth_key != NULL) {
+			dup->domain_attr->auth_key =
+				mem_dup(info->domain_attr->auth_key,
+					info->domain_attr->auth_key_size);
+			if (dup->domain_attr->auth_key == NULL)
+				goto fail;
+		}
+	}
+	if (info->fabric_attr) {
+		dup->fabric_attr = mem_dup(info->fabric_attr,
+					   sizeof(*info->fabric_attr));
+		if (dup->fabric_attr == NULL)
+			goto fail;
+		dup->fabric_attr->name = NULL;
+		dup->fabric_attr->prov_name = NULL;
+		if (info->fabric_attr->name != NULL) {
+			dup->fabric_attr->name = strdup(info->fabric_attr->name);
+			if (dup->fabric_attr->name == NULL)
+				goto fail;
+		}
+		if (info->fabric_attr->prov_name != NULL) {
+			dup->fabric_attr->prov_name = strdup(info->fabric_attr->prov_name);
+			if (dup->fabric_attr->prov_name == NULL)
+				goto fail;
+		}
+	}
+
+	if (info->nic) {
+		ret = fi_control(&info->nic->fid, FI_DUP, &dup->nic);
+		if (ret && ret != -FI_ENOSYS)
+			goto fail;
+	}
+
+	return dup;
+
+fail:
+	fi_freeinfo(dup);
+	return NULL;
+}
+#else
+#define psmx3_dupinfo fi_dupinfo
+#endif /* HAVE_PSM3_DL */
+
+#ifdef PSM_CUDA
+/* mimic parsing functionality of psmi_getenv */
+static long get_psm3_env(const char *var, int default_value) {
+	char *ep;
+	long val;
+	char *e = getenv(var);
+
+	if (!e || !*e)
+		return default_value; /* no value supplied */
+
+	val = strtol(e, &ep, 10);
+	if (!ep || *ep) { /* parse error - didn't consume all */
+		val = strtol(e, &ep, 16); /* try hex */
+		if (!ep || *ep)
+			return default_value;
+	}
+	return val;
+}
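+
+/*
+ * Usage sketch (hypothetical values): get_psm3_env("PSM3_CUDA", 0)
+ * returns 1 for PSM3_CUDA=1, 16 for PSM3_CUDA=0x10, and falls back to
+ * the default (0) when the variable is unset, empty, or parses as
+ * neither decimal nor hex.
+ */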
+#endif
+static uint64_t psmx3_check_fi_hmem_cap(void) {
+#ifdef PSM_CUDA
+	if ((get_psm3_env("PSM3_CUDA", 0) || get_psm3_env("PSM3_GPUDIRECT", 0)) &&
+	    !ofi_hmem_p2p_disabled())
+		return FI_HMEM;
+#endif
+	return 0;
+}
+
+/*
+ * Possible provider variations:
+ *
+ *  (1)  FI_ADDR_PSMX3, FI_EP_RDM,   tag64 (cq_data_size 0, FI_CONTEXT)
+ *  (2)  FI_ADDR_PSMX3, FI_EP_RDM,   tag60 (cq_data_size 4, FI_CONTEXT)
+ *  (3)  FI_ADDR_PSMX3, FI_EP_RDM,   rma   (cq_data_size 8)
+ *  (4)  FI_ADDR_PSMX3, FI_EP_DGRAM, tag64 (cq_data_size 0, FI_CONTEXT)
+ *  (5)  FI_ADDR_PSMX3, FI_EP_DGRAM, tag60 (cq_data_size 4, FI_CONTEXT)
+ *  (6)  FI_ADDR_PSMX3, FI_EP_DGRAM, rma   (cq_data_size 8)
+ *  (7)  FI_ADDR_STR,   FI_EP_RDM,   tag64 (cq_data_size 0, FI_CONTEXT)
+ *  (8)  FI_ADDR_STR,   FI_EP_RDM,   tag60 (cq_data_size 4, FI_CONTEXT)
+ *  (9)  FI_ADDR_STR,   FI_EP_RDM,   rma   (cq_data_size 8)
+ *  (10) FI_ADDR_STR,   FI_EP_DGRAM, tag64 (cq_data_size 0, FI_CONTEXT)
+ *  (11) FI_ADDR_STR,   FI_EP_DGRAM, tag60 (cq_data_size 4, FI_CONTEXT)
+ *  (12) FI_ADDR_STR,   FI_EP_DGRAM, rma   (cq_data_size 8)
+ *
+ * To avoid returning all 12 provider variations for an unrestricted query,
+ * "addr_format" and "ep_type" are checked first and a single value is set
+ * for each of them. As a result, at most three provider instances (tag64,
+ * tag60, rma) are returned.
+ *
+ * This also bypasses queries that are obviously unsuitable for this
+ * provider and avoids unnecessary initialization steps.
+ */
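+
+/*
+ * Sketch of a restricted query (hypothetical application code, shown
+ * only to illustrate the filtering described above):
+ *
+ *	hints = fi_allocinfo();
+ *	hints->ep_attr->type = FI_EP_RDM;
+ *	hints->addr_format = FI_ADDR_PSMX3;
+ *	hints->domain_attr->cq_data_size = 4;
+ *	fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, hints, &info);
+ *
+ * Here only the FI_EP_RDM / FI_ADDR_PSMX3 variations are considered,
+ * and the non-zero cq_data_size suppresses the tag64 instance, so at
+ * most the tag60 and rma instances are returned.
+ */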
+
+int psmx3_init_prov_info(const struct fi_info *hints, struct fi_info **info)
+{
+	struct fi_fabric_attr *fabric_attr = &psmx3_fabric_attr;
+	struct fi_info *prov_info = &psmx3_prov_info;
+	struct fi_info *info_out, *info_new;
+	int addr_format = FI_ADDR_PSMX3;
+	int addr_format2 = FI_ADDR_STR;
+	int ep_type = FI_EP_RDM;
+	int ep_type2 = FI_EP_DGRAM;
+	uint64_t extra_caps = 0;
+
+	if (!hints)
+		goto alloc_info;
+
+	if (hints->fabric_attr && hints->fabric_attr->name &&
+	    strcasecmp(hints->fabric_attr->name, fabric_attr->name)) {
+		FI_INFO(&psmx3_prov, FI_LOG_CORE, "Unknown fabric name\n");
+		OFI_INFO_NAME(&psmx3_prov, fabric_attr, hints->fabric_attr);
+		return -FI_ENODATA;
+	}
+
+	if (hints->ep_attr) {
+		switch (hints->ep_attr->type) {
+		case FI_EP_UNSPEC:
+		case FI_EP_RDM:
+			break;
+		case FI_EP_DGRAM:
+			ep_type = FI_EP_DGRAM;
+			break;
+		default:
+			FI_INFO(&psmx3_prov, FI_LOG_CORE,
+				"Unsupported endpoint type\n");
+			FI_INFO(&psmx3_prov, FI_LOG_CORE, "Supported: %s\n",
+				fi_tostr(&ep_type, FI_TYPE_EP_TYPE));
+			FI_INFO(&psmx3_prov, FI_LOG_CORE, "Supported: %s\n",
+				fi_tostr(&ep_type2, FI_TYPE_EP_TYPE));
+			FI_INFO(&psmx3_prov, FI_LOG_CORE, "Requested: %s\n",
+				fi_tostr(&hints->ep_attr->type, FI_TYPE_EP_TYPE));
+			return -FI_ENODATA;
+		}
+	}
+
+	switch (hints->addr_format) {
+	case FI_FORMAT_UNSPEC:
+	case FI_ADDR_PSMX3:
+		break;
+	case FI_ADDR_STR:
+		addr_format = FI_ADDR_STR;
+		break;
+	default:
+		FI_INFO(&psmx3_prov, FI_LOG_CORE,
+			"Unsupported address format\n");
+		FI_INFO(&psmx3_prov, FI_LOG_CORE, "Supported: %s\n",
+			fi_tostr(&addr_format, FI_TYPE_ADDR_FORMAT));
+		FI_INFO(&psmx3_prov, FI_LOG_CORE, "Supported: %s\n",
+			fi_tostr(&addr_format2, FI_TYPE_ADDR_FORMAT));
+		FI_INFO(&psmx3_prov, FI_LOG_CORE, "Requested: %s\n",
+			fi_tostr(&hints->addr_format, FI_TYPE_ADDR_FORMAT));
+		return -FI_ENODATA;
+	}
+
+	/* Check if CUDA is enabled */
+	extra_caps |= psmx3_check_fi_hmem_cap();
+
+	prov_info->caps |= extra_caps;
+	prov_info->tx_attr->caps |= extra_caps;
+	prov_info->rx_attr->caps |= extra_caps;
+	prov_info->domain_attr->caps |= extra_caps;
+
+	if ((hints->caps & prov_info->caps) != hints->caps) {
+		FI_INFO(&psmx3_prov, FI_LOG_CORE, "caps not supported\n");
+		OFI_INFO_CHECK(&psmx3_prov, prov_info, hints, caps, FI_TYPE_CAPS);
+		return -FI_ENODATA;
+	}
+
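+	/*
+	 * Instances below are prepended to the list as they are created
+	 * (rma first, then tag60, then tag64), so the final order handed
+	 * back to the caller is tag64, tag60, rma.
+	 */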
+alloc_info:
+	psmx3_prov_info.fabric_attr->prov_version = get_psm3_provider_version();
+	info_out = NULL;
+	if (!hints || !(hints->caps & (FI_TAGGED | FI_MSG))) {
+		info_new = psmx3_dupinfo(&psmx3_prov_info);
+		if (info_new) {
+			/* rma only, 64 bit CQ data */
+			info_new->addr_format = addr_format;
+			info_new->ep_attr->type = ep_type;
+			info_new->caps = PSMX3_RMA_CAPS | extra_caps;
+			info_new->mode = 0;
+			info_new->tx_attr->caps = PSMX3_RMA_TX_CAPS | extra_caps;
+			info_new->tx_attr->mode = 0;
+			info_new->rx_attr->caps = PSMX3_RMA_RX_CAPS | extra_caps;
+			info_new->rx_attr->mode = 0;
+			info_new->domain_attr->cq_data_size = 8;
+			info_out = info_new;
+			FI_INFO(&psmx3_prov, FI_LOG_CORE,
+				"RMA only instance included\n");
+		}
+	}
+
+	info_new = psmx3_dupinfo(&psmx3_prov_info);
+	if (info_new) {
+		/* 60 bit tag, 32 bit CQ data */
+		info_new->addr_format = addr_format;
+		info_new->ep_attr->type = ep_type;
+		info_new->ep_attr->mem_tag_format >>= 4;
+		info_new->domain_attr->cq_data_size = 4;
+		info_new->next = info_out;
+		info_out = info_new;
+		FI_INFO(&psmx3_prov, FI_LOG_CORE,
+			"TAG60 instance included\n");
+	}
+
+	if (!hints || !hints->domain_attr ||
+	    !hints->domain_attr->cq_data_size) {
+		info_new = psmx3_dupinfo(&psmx3_prov_info);
+		if (info_new) {
+			/* 64 bit tag, no CQ data */
+			info_new->addr_format = addr_format;
+			info_new->ep_attr->type = ep_type;
+			info_new->next = info_out;
+			info_out = info_new;
+			FI_INFO(&psmx3_prov, FI_LOG_CORE,
+				"TAG64 instance included\n");
+		}
+	}
+
+	*info = info_out;
+	return info_out ? 0 : -FI_ENODATA;
+}
+
+static void psmx3_dup_addr(int format, struct psmx3_ep_name *addr,
+			   void **addr_out, size_t *len)
+{
+	if (!addr)
+		return;
+
+	if (format == FI_ADDR_STR) {
+		*addr_out = psmx3_ep_name_to_string(addr, len);
+	} else {
+		*addr_out = mem_dup(addr, sizeof(*addr));
+		*len = sizeof(*addr);
+	}
+}
+
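+/*
+ * Expand entries whose source address carries the wildcard
+ * PSMX3_DEFAULT_UNIT: with a single active unit the entry is rewritten
+ * in place; otherwise one duplicate per active unit is spliced in after
+ * the entry, and the saved 'next' pointer is restored at the end so the
+ * walk resumes at the original successor.
+ */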
+static void psmx3_expand_default_unit(struct fi_info *info)
+{
+	struct fi_info *p, *next;
+	struct psmx3_ep_name *src_addr;
+	int i;
+
+	p = info;
+	while (p) {
+		next = p->next;
+		src_addr = p->src_addr;
+		if (src_addr->unit == PSMX3_DEFAULT_UNIT) {
+			if (psmx3_hfi_info.num_active_units == 1) {
+				src_addr->unit = psmx3_hfi_info.active_units[0];
+			} else {
+				for (i = 0; i < psmx3_hfi_info.num_active_units; i++) {
+					p->next = psmx3_dupinfo(p);
+					if (!p->next) {
+						FI_WARN(&psmx3_prov, FI_LOG_CORE,
+							"Failed to duplicate info for HFI unit %d\n",
+							psmx3_hfi_info.active_units[i]);
+						break;
+					}
+					p = p->next;
+					src_addr = p->src_addr;
+					src_addr->unit = psmx3_hfi_info.active_units[i];
+				}
+			}
+		}
+		p->next = next;
+		p = next;
+	}
+}
+
+void psmx3_update_prov_info(struct fi_info *info,
+			    struct psmx3_ep_name *src_addr,
+			    struct psmx3_ep_name *dest_addr)
+{
+	struct fi_info *p;
+
+	for (p = info; p; p = p->next) {
+		psmx3_dup_addr(p->addr_format, src_addr,
+			       &p->src_addr, &p->src_addrlen);
+		psmx3_dup_addr(p->addr_format, dest_addr,
+			       &p->dest_addr, &p->dest_addrlen);
+	}
+
+	psmx3_expand_default_unit(info);
+
+	for (p = info; p; p = p->next) {
+		int unit = ((struct psmx3_ep_name *)p->src_addr)->unit;
+
+		if (unit == PSMX3_DEFAULT_UNIT || !psmx3_env.multi_ep) {
+			p->domain_attr->tx_ctx_cnt = psmx3_hfi_info.free_trx_ctxt;
+			p->domain_attr->rx_ctx_cnt = psmx3_hfi_info.free_trx_ctxt;
+			p->domain_attr->max_ep_tx_ctx = psmx3_hfi_info.max_trx_ctxt;
+			p->domain_attr->max_ep_rx_ctx = psmx3_hfi_info.max_trx_ctxt;
+			p->domain_attr->max_ep_stx_ctx = psmx3_hfi_info.max_trx_ctxt;
+		} else {
+			p->domain_attr->tx_ctx_cnt = psmx3_hfi_info.unit_nfreectxts[unit];
+			p->domain_attr->rx_ctx_cnt = psmx3_hfi_info.unit_nfreectxts[unit];
+			p->domain_attr->max_ep_tx_ctx = psmx3_hfi_info.unit_nctxts[unit];
+			p->domain_attr->max_ep_rx_ctx = psmx3_hfi_info.unit_nctxts[unit];
+			p->domain_attr->max_ep_stx_ctx = psmx3_hfi_info.unit_nctxts[unit];
+		}
+
+		free(p->domain_attr->name);
+		if (unit == PSMX3_DEFAULT_UNIT)
+			p->domain_attr->name = strdup(psmx3_hfi_info.default_domain_name);
+		else {
+			char unit_name[NAME_MAX];
+			psm2_info_query_arg_t args[2];
+
+			args[0].unit = unit;
+			args[1].length = sizeof(unit_name);
+
+			if (PSM2_OK != psm2_info_query(PSM2_INFO_QUERY_UNIT_NAME,
+				unit_name, 2, args)) {
+				FI_WARN(&psmx3_prov, FI_LOG_CORE,
+					"Failed to read unit name for NIC unit %d\n", unit);
+				if (asprintf(&p->domain_attr->name, "UNKNOWN") < 0) {
+					FI_WARN(&psmx3_prov, FI_LOG_CORE,
+						"Failed to allocate memory for unit name for NIC unit %d\n", unit);
+				}
+			} else {
+				if (asprintf(&p->domain_attr->name, "%s", unit_name) < 0) {
+					FI_WARN(&psmx3_prov, FI_LOG_CORE,
+						"Failed to allocate memory for unit name for NIC unit %d\n", unit);
+				}
+			}
+		}
+
+		p->tx_attr->inject_size = psmx3_env.inject_size;
+	}
+}
+
+int psmx3_check_prov_info(uint32_t api_version,
+			  const struct fi_info *hints,
+			  struct fi_info **info)
+{
+	struct util_prov util_prov = { .prov = &psmx3_prov };
+	struct fi_info *next;
+	struct fi_info *prev = NULL;
+	struct fi_info *curr = *info;
+	struct fi_info *new_info = *info;
+
+	while (curr) {
+		next = curr->next;
+		if (ofi_check_info(&util_prov, curr, api_version, hints)) {
+			if (prev)
+				prev->next = next;
+			else
+				new_info = next;
+			curr->next = NULL;
+			fi_freeinfo(curr);
+		} else {
+			prev = curr;
+		}
+		curr = next;
+	}
+
+	*info = new_info;
+	return new_info ? 0 : -FI_ENODATA;
+}
+
+void psmx3_alter_prov_info(uint32_t api_version,
+			   const struct fi_info *hints,
+			   struct fi_info *info)
+{
+	int cnt = 0;
+	int cq_data_cnt = 0;
+
+	ofi_alter_info(info, hints, api_version);
+
+	/*
+	 * Some of the default values are set to simplify info
+	 * checking. Now change them back to the preferred values.
+	 */
+	for (; info; info = info->next) {
+		if (!hints || !hints->domain_attr ||
+		    !hints->domain_attr->control_progress)
+			info->domain_attr->control_progress =
+				FI_PROGRESS_MANUAL;
+
+		if (!hints || !hints->domain_attr ||
+		    !hints->domain_attr->data_progress)
+			info->domain_attr->data_progress =
+				FI_PROGRESS_MANUAL;
+
+		if (info->domain_attr->mr_mode == (FI_MR_BASIC | FI_MR_SCALABLE))
+			info->domain_attr->mr_mode = FI_MR_SCALABLE;
+
+		/*
+		 * Avoid automatically adding secondary caps that may negatively
+		 * impact performance.
+		 */
+		if (hints && hints->caps && !(hints->caps & FI_TRIGGER))
+			info->caps &= ~FI_TRIGGER;
+
+		if (info->domain_attr->cq_data_size)
+			cq_data_cnt++;
+
+		cnt++;
+	}
+
+	FI_INFO(&psmx3_prov, FI_LOG_CORE,
+		"%d instances available, %d with CQ data flag set\n",
+		cnt, cq_data_cnt);
+}
+
diff --git a/deps/libfabric/prov/psm3/src/psmx3_av.c b/deps/libfabric/prov/psm3/src/psmx3_av.c
new file mode 100644
index 0000000000000000000000000000000000000000..22374e5cd13dd0525bfc0f579249fb8b0ceecea3
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psmx3_av.c
@@ -0,0 +1,1194 @@
+/*
+ * Copyright (c) 2013-2019 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psmx3.h"
+
+/*
+ * SEP address query protocol:
+ *
+ * SEP Query REQ:
+ *	args[0].u32w0	cmd, version
+ *	args[0].u32w1	id
+ *	args[1].u64	sep_info
+ *	args[2].u64	status
+ *
+ * SEP Query REP:
+ *	args[0].u32w0	cmd, version
+ *	args[0].u32w1	error
+ *	args[1].u64	sep_info
+ *	args[2].u64	status
+ *	args[3].u64	n
+ *	data		epids
+ */
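+
+/*
+ * Flow summary: the requester (psmx3_av_query_sep) passes the address
+ * of its psmx3_av_sep slot and of a status atomic in args[1]/args[2],
+ * then polls until the atomic leaves the pending state; the responder
+ * looks the SEP up by id, echoes args[1]/args[2] back, and attaches the
+ * epid array of the SEP contexts so the reply handler can fill in
+ * sep_info and release the waiter.
+ */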
+
+static int psmx3_am_sep_match(struct dlist_entry *entry, const void *arg)
+{
+	struct psmx3_fid_sep *sep;
+
+	sep = container_of(entry, struct psmx3_fid_sep, entry);
+	return ((uintptr_t)sep->id == (uintptr_t)arg);
+}
+
+static void psmx3_am_sep_completion(void *buf)
+{
+	free(buf);
+}
+
+int psmx3_am_sep_handler(psm2_am_token_t token, psm2_amarg_t *args,
+			 int nargs, void *src, uint32_t len, void *hctx)
+{
+	struct psmx3_fid_domain *domain;
+	psm2_amarg_t rep_args[4];
+	int op_error = 0;
+	int err = 0;
+	int cmd, version;
+	int n, i, j;
+	uint8_t sep_id;
+	struct psmx3_fid_sep *sep;
+	struct psmx3_av_sep *sep_info;
+	ofi_atomic32_t *status;
+	psm2_epid_t *epids;
+	psm2_epid_t *buf = NULL;
+	int buflen;
+	struct dlist_entry *entry;
+	struct psmx3_trx_ctxt *trx_ctxt = hctx;
+
+	cmd = PSMX3_AM_GET_OP(args[0].u32w0);
+	version = PSMX3_AM_GET_VER(args[0].u32w0);
+	if (version != PSMX3_AM_SEP_VERSION) {
+		FI_WARN(&psmx3_prov, FI_LOG_AV,
+			"AM SEP protocol version mismatch: request %d handler %d\n",
+			version, PSMX3_AM_SEP_VERSION);
+		return -FI_EINVAL;
+	}
+
+	domain = trx_ctxt->domain;
+
+	switch (cmd) {
+	case PSMX3_AM_REQ_SEP_QUERY:
+		sep_id = args[0].u32w1;
+		domain->sep_lock_fn(&domain->sep_lock, 1);
+		entry = dlist_find_first_match(&domain->sep_list, psmx3_am_sep_match,
+					       (void *)(uintptr_t)sep_id);
+		if (!entry) {
+			op_error = PSM2_EPID_UNKNOWN;
+			n = 0;
+			buflen = 0;
+		} else {
+			sep = container_of(entry, struct psmx3_fid_sep, entry);
+			n = sep->ctxt_cnt;
+			buflen = n * sizeof(psm2_epid_t);
+			if (n) {
+				buf = malloc(buflen);
+				if (!buf) {
+					op_error = PSM2_NO_MEMORY;
+					buflen = 0;
+					n = 0;
+				}
+				for (i = 0; i < n; i++)
+					buf[i] = sep->ctxts[i].trx_ctxt->psm2_epid;
+			}
+		}
+		domain->sep_unlock_fn(&domain->sep_lock, 1);
+
+		rep_args[0].u32w0 = PSMX3_AM_REP_SEP_QUERY;
+		PSMX3_AM_SET_VER(rep_args[0].u32w0, PSMX3_AM_SEP_VERSION);
+		rep_args[0].u32w1 = op_error;
+		rep_args[1].u64 = args[1].u64;
+		rep_args[2].u64 = args[2].u64;
+		rep_args[3].u64 = n;
+		err = psm2_am_reply_short(token, PSMX3_AM_SEP_HANDLER,
+					  rep_args, 4, buf, buflen, 0,
+					  psmx3_am_sep_completion, buf);
+		break;
+
+	case PSMX3_AM_REP_SEP_QUERY:
+		op_error = args[0].u32w1;
+		sep_info = (struct psmx3_av_sep *)(uintptr_t)args[1].u64;
+		status = (void *)(uintptr_t)args[2].u64;
+		if (op_error) {
+			ofi_atomic_set32(status, psmx3_errno(op_error));
+		} else {
+			n = args[3].u64;
+			epids = malloc(n * sizeof(psm2_epid_t));
+			if (!epids) {
+				ofi_atomic_set32(status, -FI_ENOMEM);
+			} else {
+				for (j=0; j<n; j++)
+					epids[j] = ((psm2_epid_t *)src)[j];
+				/*
+				 * the sender of the SEP query request should
+				 * have acquired the lock and is waiting for
+				 * the response. see psmx3_av_query_sep().
+				 */
+				sep_info->ctxt_cnt = n;
+				sep_info->epids = epids;
+				ofi_atomic_set32(status, 0);
+			}
+		}
+		break;
+
+	default:
+		err = -FI_EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+static void psmx3_set_epaddr_context(struct psmx3_trx_ctxt *trx_ctxt,
+				     psm2_epid_t epid, psm2_epaddr_t epaddr)
+{
+	struct psmx3_epaddr_context *context;
+	struct psmx3_epaddr_context *old_context = NULL;
+
+	context = (void *)psm2_epaddr_getctxt(epaddr);
+	if (context) {
+		if (context->trx_ctxt != trx_ctxt || context->epid != epid) {
+			FI_WARN(&psmx3_prov, FI_LOG_AV,
+				"trx_ctxt or epid doesn't match\n");
+			old_context = context;
+			context = NULL;
+		}
+	}
+
+	if (context)
+		return;
+
+	context = malloc(sizeof *context);
+	if (!context) {
+		FI_WARN(&psmx3_prov, FI_LOG_AV,
+			"cannot allocate context\n");
+		return;
+	}
+
+	context->trx_ctxt = trx_ctxt;
+	context->epid = epid;
+	context->epaddr = epaddr;
+	psm2_epaddr_setctxt(epaddr, context);
+	free(old_context);
+
+	trx_ctxt->domain->peer_lock_fn(&trx_ctxt->peer_lock, 2);
+	dlist_insert_before(&context->entry, &trx_ctxt->peer_list);
+	trx_ctxt->domain->peer_unlock_fn(&trx_ctxt->peer_lock, 2);
+}
+
+void psmx3_epid_to_epaddr(struct psmx3_trx_ctxt *trx_ctxt,
+			  psm2_epid_t epid, psm2_epaddr_t *epaddr)
+{
+	int err;
+	psm2_error_t errors;
+	psm2_epconn_t epconn;
+	struct psmx3_epaddr_context *context;
+
+	err = psm2_ep_epid_lookup2(trx_ctxt->psm2_ep, epid, &epconn);
+	if (err == PSM2_OK) {
+		context = psm2_epaddr_getctxt(epconn.addr);
+		if (context && context->epid == epid) {
+			*epaddr = epconn.addr;
+			return;
+		}
+	}
+
+	err = psm2_ep_connect(trx_ctxt->psm2_ep, 1, &epid, NULL, &errors, epaddr,
+			      (int64_t) psmx3_env.conn_timeout * 1000000000LL);
+	if (err == PSM2_OK || err == PSM2_EPID_ALREADY_CONNECTED) {
+		psmx3_set_epaddr_context(trx_ctxt, epid, *epaddr);
+		return;
+	}
+
+	/* call fi_log() directly to always generate the output */
+	if (err == PSM2_TIMEOUT)
+		fi_log(&psmx3_prov, FI_LOG_WARN, FI_LOG_AV, __func__, __LINE__,
+			"psm2_ep_connect returned error %s, remote epid=%lx. "
+			"Try setting FI_PSM3_CONN_TIMEOUT "
+			"to a larger value (current: %d seconds).\n",
+			psm2_error_get_string(err), epid, psmx3_env.conn_timeout);
+	else
+		fi_log(&psmx3_prov, FI_LOG_WARN, FI_LOG_AV, __func__, __LINE__,
+			"psm2_ep_connect returned error %s, remote epid=%lx.\n",
+			psm2_error_get_string(err), epid);
+
+	abort();
+}
+
+/*
+ * Must be called with av->lock held
+ */
+static int psmx3_av_check_space(struct psmx3_fid_av *av, size_t count)
+{
+	psm2_epaddr_t *new_epaddrs;
+	psm2_epaddr_t **new_sepaddrs;
+	struct psmx3_av_hdr *new_hdr;
+	struct psmx3_av_sep *new_sep_info;
+	size_t new_count;
+	size_t old_table_size, new_table_size;
+	int i;
+
+	new_count = av->count;
+	while (new_count < av->hdr->last + count)
+		new_count = new_count * 2;
+
+	if ((new_count <= av->count) && av->table)
+		return 0;
+
+	old_table_size = PSMX3_AV_TABLE_SIZE(av->count, av->shared);
+	new_table_size = PSMX3_AV_TABLE_SIZE(new_count, av->shared);
+	if (av->shared) {
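+		/*
+		 * flags == 0 (no MREMAP_MAYMOVE): the kernel must grow the
+		 * mapping in place or fail, so av->hdr keeps its address and
+		 * the map handed out via attr->map_addr remains valid.
+		 */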
+		new_hdr = mremap(av->hdr, old_table_size, new_table_size, 0);
+		if (new_hdr == MAP_FAILED)
+			return -FI_ENOMEM;
+		av->hdr = new_hdr;
+		av->map = (fi_addr_t *)(av->hdr + 1);
+		av->table = (struct psmx3_av_addr *)(av->map + new_count);
+		for (i = 0; i < new_count; i++)
+			av->map[i] = i;
+	} else {
+		new_hdr = realloc(av->hdr, new_table_size);
+		if (!new_hdr)
+			return -FI_ENOMEM;
+		av->hdr = new_hdr;
+		av->table = (struct psmx3_av_addr *)(av->hdr + 1);
+	}
+
+	new_sep_info = realloc(av->sep_info, new_count * sizeof(*new_sep_info));
+	if (!new_sep_info)
+		return -FI_ENOMEM;
+	av->sep_info = new_sep_info;
+
+	for (i = 0; i < av->max_trx_ctxt; i++) {
+		if (!av->conn_info[i].trx_ctxt)
+			continue;
+
+		new_epaddrs = realloc(av->conn_info[i].epaddrs,
+				      new_count * sizeof(*new_epaddrs));
+		if (!new_epaddrs)
+			return -FI_ENOMEM;
+		memset(new_epaddrs + av->hdr->last, 0,
+		       (new_count - av->hdr->last)  * sizeof(*new_epaddrs));
+		av->conn_info[i].epaddrs = new_epaddrs;
+
+		new_sepaddrs = realloc(av->conn_info[i].sepaddrs,
+				       new_count * sizeof(*new_sepaddrs));
+		if (!new_sepaddrs)
+			return -FI_ENOMEM;
+		memset(new_sepaddrs + av->hdr->last, 0,
+		       (new_count - av->hdr->last)  * sizeof(*new_sepaddrs));
+		av->conn_info[i].sepaddrs = new_sepaddrs;
+	}
+
+	av->count = av->hdr->size = new_count;
+	return 0;
+}
+
+static void psmx3_av_post_completion(struct psmx3_fid_av *av, void *context,
+				     uint64_t data, int prov_errno)
+{
+	if (prov_errno) {
+		struct fi_eq_err_entry entry;
+		entry.fid = &av->av.fid;
+		entry.context = context;
+		entry.data = data;
+		entry.err = -psmx3_errno(prov_errno);
+		entry.prov_errno = prov_errno;
+		entry.err_data = NULL;
+		entry.err_data_size = 0;
+		fi_eq_write(av->eq, FI_AV_COMPLETE, &entry, sizeof(entry),
+			    UTIL_FLAG_ERROR);
+	} else {
+		struct fi_eq_entry entry;
+		entry.fid = &av->av.fid;
+		entry.context = context;
+		entry.data = data;
+		fi_eq_write(av->eq, FI_AV_COMPLETE, &entry, sizeof(entry), 0);
+	}
+}
+
+/*
+ * Must be called with av->lock held
+ */
+int psmx3_av_query_sep(struct psmx3_fid_av *av,
+		       struct psmx3_trx_ctxt *trx_ctxt,
+		       size_t idx)
+{
+	ofi_atomic32_t status; /* 1: pending, 0: succ, <0: error */
+	psm2_amarg_t args[3];
+	int error;
+
+	if (!av->conn_info[trx_ctxt->id].epaddrs[idx])
+		psmx3_epid_to_epaddr(trx_ctxt, av->table[idx].epid,
+				     &av->conn_info[trx_ctxt->id].epaddrs[idx]);
+
+	psmx3_am_init(trx_ctxt); /* check AM handler installation */
+
+	ofi_atomic_initialize32(&status, 1);
+
+	args[0].u32w0 = PSMX3_AM_REQ_SEP_QUERY;
+	PSMX3_AM_SET_VER(args[0].u32w0, PSMX3_AM_SEP_VERSION);
+	args[0].u32w1 = av->table[idx].sep_id;
+	args[1].u64 = (uint64_t)(uintptr_t)&av->sep_info[idx];
+	args[2].u64 = (uint64_t)(uintptr_t)&status;
+	error = psm2_am_request_short(av->conn_info[trx_ctxt->id].epaddrs[idx],
+				      PSMX3_AM_SEP_HANDLER, args, 3, NULL,
+				      0, 0, NULL, NULL);
+
+	if (error)
+		return error;
+
+	/*
+	 * make sure AM is progressed promptly. don't call
+	 * psmx3_progress() which may call functions that
+	 * need to access the address vector.
+	 */
+	while (ofi_atomic_get32(&status) == 1)
+		psm2_poll(trx_ctxt->psm2_ep);
+
+	error = (int)(int32_t)ofi_atomic_get32(&status);
+
+	return error;
+}
+
+int psmx3_av_add_trx_ctxt(struct psmx3_fid_av *av,
+			  struct psmx3_trx_ctxt *trx_ctxt)
+{
+	int id;
+	int err = 0;
+
+	av->domain->av_lock_fn(&av->lock, 1);
+
+	if (av->type == FI_AV_MAP) {
+		av->av_map_trx_ctxt = trx_ctxt;
+		goto out;
+	}
+
+	id = trx_ctxt->id;
+	if (id >= av->max_trx_ctxt) {
+		FI_WARN(&psmx3_prov, FI_LOG_AV,
+			"trx_ctxt->id(%d) exceeds av->max_trx_ctxt(%d).\n",
+			id, av->max_trx_ctxt);
+		err = -FI_EINVAL;
+		goto out;
+	}
+
+	if (av->conn_info[id].trx_ctxt) {
+		if (av->conn_info[id].trx_ctxt == trx_ctxt) {
+			FI_INFO(&psmx3_prov, FI_LOG_AV,
+				"trx_ctxt(%p) with id(%d) already added.\n",
+				trx_ctxt, id);
+			goto out;
+		} else {
+			FI_INFO(&psmx3_prov, FI_LOG_AV,
+				"different trx_ctxt(%p) with same id(%d) already added.\n",
+				trx_ctxt, id);
+			err = -FI_EINVAL;
+			goto out;
+		}
+	}
+
+	av->conn_info[id].epaddrs = (psm2_epaddr_t *) calloc(av->count,
+							  sizeof(psm2_epaddr_t));
+	if (!av->conn_info[id].epaddrs) {
+		err = -FI_ENOMEM;
+		goto out;
+	}
+
+	av->conn_info[id].sepaddrs = (psm2_epaddr_t **)calloc(av->count,
+							   sizeof(psm2_epaddr_t *));
+	if (!av->conn_info[id].sepaddrs) {
+		err = -FI_ENOMEM;
+		goto out;
+	}
+
+	av->conn_info[id].trx_ctxt = trx_ctxt;
+
+out:
+	av->domain->av_unlock_fn(&av->lock, 1);
+	return err;
+}
+
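+/*
+ * Table-based insert only records the peer addresses; no psm2
+ * connection is made here (connections are established on demand via
+ * psmx3_epid_to_epaddr), which is why the errors array can only hold
+ * the default PSM2_OK for every entry.
+ */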
+DIRECT_FN
+STATIC int psmx3_av_insert(struct fid_av *av, const void *addr,
+			   size_t count, fi_addr_t *fi_addr,
+			   uint64_t flags, void *context)
+{
+	struct psmx3_fid_av *av_priv;
+	struct psmx3_ep_name *ep_name;
+	const struct psmx3_ep_name *names = addr;
+	const char **string_names = (void *)addr;
+	psm2_error_t *errors = NULL;
+	int error_count = 0;
+	int i, idx, ret;
+
+	assert(addr || !count);
+
+	av_priv = container_of(av, struct psmx3_fid_av, av);
+
+	av_priv->domain->av_lock_fn(&av_priv->lock, 1);
+
+	if ((av_priv->flags & FI_EVENT) && !av_priv->eq) {
+		ret = -FI_ENOEQ;
+		goto out;
+	}
+
+	if (av_priv->flags & FI_READ) {
+		ret = -FI_EINVAL;
+		goto out;
+	}
+
+	if (psmx3_av_check_space(av_priv, count)) {
+		ret = -FI_ENOMEM;
+		goto out;
+	}
+
+	errors = calloc(count, sizeof(*errors));
+	if (!errors) {
+		ret = -FI_ENOMEM;
+		goto out;
+	}
+
+	/* save the peer address information */
+	for (i = 0; i < count; i++) {
+		idx = av_priv->hdr->last + i;
+		if (av_priv->addr_format == FI_ADDR_STR) {
+			ep_name = psmx3_string_to_ep_name(string_names[i]);
+			if (!ep_name) {
+				ret = -FI_EINVAL;
+				goto out;
+			}
+			av_priv->table[idx].type = ep_name->type;
+			av_priv->table[idx].epid = ep_name->epid;
+			av_priv->table[idx].sep_id = ep_name->sep_id;
+			av_priv->table[idx].valid = 1;
+			free(ep_name);
+		} else {
+			av_priv->table[idx].type = names[i].type;
+			av_priv->table[idx].epid = names[i].epid;
+			av_priv->table[idx].sep_id = names[i].sep_id;
+			av_priv->table[idx].valid = 1;
+		}
+		av_priv->sep_info[idx].ctxt_cnt = 1;
+		av_priv->sep_info[idx].epids = NULL;
+	}
+
+	if (fi_addr) {
+		for (i = 0; i < count; i++) {
+			idx = av_priv->hdr->last + i;
+			if (errors[i] != PSM2_OK)
+				fi_addr[i] = FI_ADDR_NOTAVAIL;
+			else
+				fi_addr[i] = idx;
+		}
+	}
+
+	av_priv->hdr->last += count;
+
+	if (av_priv->flags & FI_EVENT) {
+		if (error_count) {
+			for (i = 0; i < count; i++)
+				psmx3_av_post_completion(av_priv, context, i, errors[i]);
+		}
+		psmx3_av_post_completion(av_priv, context, count - error_count, 0);
+		ret = 0;
+	} else {
+		if (flags & FI_SYNC_ERR) {
+			int *fi_errors = context;
+			for (i=0; i<count; i++)
+				fi_errors[i] = psmx3_errno(errors[i]);
+		}
+		ret = count - error_count;
+	}
+
+out:
+	free(errors);
+	av_priv->domain->av_unlock_fn(&av_priv->lock, 1);
+	return ret;
+}
+
+DIRECT_FN
+STATIC int psmx3_av_map_insert(struct fid_av *av, const void *addr,
+			       size_t count, fi_addr_t *fi_addr,
+			       uint64_t flags, void *context)
+{
+	struct psmx3_fid_av *av_priv;
+	struct psmx3_trx_ctxt *trx_ctxt;
+	struct psmx3_ep_name *ep_name;
+	const struct psmx3_ep_name *names = addr;
+	const char **string_names = (void *)addr;
+	psm2_epid_t *epids = NULL;
+	psm2_epaddr_t *epaddrs = NULL;
+	psm2_error_t *errors = NULL;
+	int error_count = 0;
+	int i, ret, err = 0;
+
+	assert(addr || !count);
+
+	av_priv = container_of(av, struct psmx3_fid_av, av);
+
+	av_priv->domain->av_lock_fn(&av_priv->lock, 1);
+
+	if (!count)
+		goto out;
+
+	epids = calloc(count, sizeof(*epids));
+	errors = calloc(count, sizeof(*errors));
+	if (!epids || !errors) {
+		err = -FI_ENOMEM;
+		goto out;
+	}
+
+	for (i=0; i<count; i++) {
+		if (av_priv->addr_format == FI_ADDR_STR) {
+			ep_name = psmx3_string_to_ep_name(string_names[i]);
+			if (!ep_name) {
+				err = -FI_EINVAL;
+				goto out;
+			}
+			epids[i] = ep_name->epid;
+			free(ep_name);
+		} else {
+			epids[i] = names[i].epid;
+		}
+	}
+
+	epaddrs = (psm2_epaddr_t *)fi_addr;
+
+	trx_ctxt = av_priv->av_map_trx_ctxt;
+	if (!trx_ctxt) {
+		FI_WARN(&psmx3_prov, FI_LOG_AV,
+			"unable to map address without AV-EP binding\n");
+		err = -FI_ENODEV;
+		goto out;
+	}
+
+	psm2_ep_connect(trx_ctxt->psm2_ep, count, epids, NULL, errors, epaddrs,
+			(int64_t) psmx3_env.conn_timeout * count * 1000000000LL);
+
+	for (i=0; i<count; i++) {
+		if (errors[i] == PSM2_EPID_ALREADY_CONNECTED)
+			errors[i] = PSM2_OK;
+
+		if (errors[i] == PSM2_OK)
+			psmx3_set_epaddr_context(trx_ctxt, epids[i], epaddrs[i]);
+		else
+			error_count++;
+	}
+
+out:
+	if (av_priv->flags & FI_EVENT) {
+		if (!err) {
+			if (error_count) {
+				for (i = 0; i < count; i++)
+					psmx3_av_post_completion(av_priv, context, i, errors[i]);
+			}
+			psmx3_av_post_completion(av_priv, context, count - error_count, 0);
+		}
+		ret = err;
+	} else {
+		if (flags & FI_SYNC_ERR) {
+			int *fi_errors = context;
+			for (i=0; i<count; i++)
+				fi_errors[i] = err ? err : psmx3_errno(errors[i]);
+		}
+		ret = err ? 0 : count - error_count;
+	}
+
+	if (count) {
+		free(errors);
+		free(epids);
+	}
+
+	av_priv->domain->av_unlock_fn(&av_priv->lock, 1);
+
+	return ret;
+}
+
+static int psmx3_av_disconnect_addr(int trx_ctxt_id, psm2_epid_t epid,
+				    psm2_epaddr_t epaddr)
+{
+	struct psmx3_epaddr_context *epaddr_context;
+	struct psmx3_trx_ctxt *trx_ctxt;
+	psm2_error_t errors;
+	int err;
+
+	if (!epaddr)
+		return 0;
+
+	FI_INFO(&psmx3_prov, FI_LOG_AV,
+		"trx_ctxt_id %d epid %lx epaddr %p\n", trx_ctxt_id, epid, epaddr);
+
+	epaddr_context = psm2_epaddr_getctxt(epaddr);
+	if (!epaddr_context)
+		return -FI_EINVAL;
+
+	trx_ctxt = epaddr_context->trx_ctxt;
+	if (trx_ctxt_id != trx_ctxt->id)
+		return -FI_EINVAL;
+
+	if (epid != epaddr_context->epid)
+		return -FI_EINVAL;
+
+	trx_ctxt->domain->peer_lock_fn(&trx_ctxt->peer_lock, 2);
+	dlist_remove_first_match(&trx_ctxt->peer_list,
+				 psmx3_peer_match, epaddr);
+	trx_ctxt->domain->peer_unlock_fn(&trx_ctxt->peer_lock, 2);
+
+	psm2_epaddr_setctxt(epaddr, NULL);
+
+	err = psm2_ep_disconnect2(trx_ctxt->psm2_ep, 1, &epaddr,
+				  NULL, &errors, PSM2_EP_DISCONNECT_FORCE, 0);
+
+	free(epaddr_context);
+	return psmx3_errno(err);
+}
+
+DIRECT_FN
+STATIC int psmx3_av_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count,
+			   uint64_t flags)
+{
+	struct psmx3_fid_av *av_priv;
+	int idx, i, j, k;
+	int err;
+
+	av_priv = container_of(av, struct psmx3_fid_av, av);
+
+	av_priv->domain->av_lock_fn(&av_priv->lock, 1);
+
+	for (i = 0; i < count; i++) {
+		idx = PSMX3_ADDR_IDX(fi_addr[i]);
+		if (idx >= av_priv->hdr->last) {
+			FI_WARN(&psmx3_prov, FI_LOG_AV,
+				"AV index out of range: fi_addr %lx idx %d last %ld\n",
+				fi_addr[i], idx, av_priv->hdr->last);
+			continue;
+		}
+
+		if (av_priv->table[idx].type == PSMX3_EP_REGULAR) {
+			for (j = 0; j < av_priv->max_trx_ctxt; j++) {
+				if (!av_priv->conn_info[j].trx_ctxt)
+					continue;
+
+				err = psmx3_av_disconnect_addr(
+						j, av_priv->table[idx].epid,
+						av_priv->conn_info[j].epaddrs[idx]);
+				if (!err)
+					av_priv->conn_info[j].epaddrs[idx] = NULL;
+			}
+			av_priv->table[idx].epid = 0;
+		} else {
+			if (!av_priv->sep_info[idx].epids)
+				continue;
+
+			for (j = 0; j < av_priv->max_trx_ctxt; j++) {
+				if (!av_priv->conn_info[j].trx_ctxt)
+					continue;
+
+				if (!av_priv->conn_info[j].sepaddrs[idx])
+					continue;
+
+				for (k = 0; k < av_priv->sep_info[idx].ctxt_cnt; k++) {
+					err = psmx3_av_disconnect_addr(
+							j, av_priv->sep_info[idx].epids[k],
+							av_priv->conn_info[j].sepaddrs[idx][k]);
+					if (!err)
+						av_priv->conn_info[j].sepaddrs[idx][k] = NULL;
+				}
+			}
+			free(av_priv->sep_info[idx].epids);
+			av_priv->sep_info[idx].epids = NULL;
+		}
+		av_priv->table[idx].valid = 0;
+	}
+
+	av_priv->domain->av_unlock_fn(&av_priv->lock, 1);
+
+	return 0;
+}
+
+DIRECT_FN
+STATIC int psmx3_av_map_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count,
+			       uint64_t flags)
+{
+	struct psmx3_fid_av *av_priv;
+	struct psmx3_trx_ctxt *trx_ctxt;
+	psm2_error_t *errors;
+	int i;
+
+	av_priv = container_of(av, struct psmx3_fid_av, av);
+
+	if (!count)
+		return 0;
+
+	trx_ctxt = av_priv->av_map_trx_ctxt;
+	if (!trx_ctxt)
+		return -FI_ENODEV;
+
+	errors = calloc(count, sizeof(*errors));
+	if (!errors)
+		return -FI_ENOMEM;
+
+	trx_ctxt->domain->peer_lock_fn(&trx_ctxt->peer_lock, 2);
+	for (i=0; i<count; i++) {
+		dlist_remove_first_match(&trx_ctxt->peer_list,
+					 psmx3_peer_match,
+					 (psm2_epaddr_t)(fi_addr[i]));
+	}
+	trx_ctxt->domain->peer_unlock_fn(&trx_ctxt->peer_lock, 2);
+
+	for (i=0; i<count; i++)
+		psm2_epaddr_setctxt((psm2_epaddr_t)(fi_addr[i]), NULL);
+
+	psm2_ep_disconnect2(trx_ctxt->psm2_ep, count, (psm2_epaddr_t *)fi_addr,
+			    NULL, errors, PSM2_EP_DISCONNECT_FORCE, 0);
+
+	free(errors);
+	return 0;
+}
+
+DIRECT_FN
+STATIC int psmx3_av_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr,
+			   size_t *addrlen)
+{
+	struct psmx3_fid_av *av_priv;
+	struct psmx3_ep_name name;
+	int idx = PSMX3_ADDR_IDX(fi_addr);
+	int err = 0;
+
+	assert(addr);
+	assert(addrlen);
+
+	av_priv = container_of(av, struct psmx3_fid_av, av);
+
+	memset(&name, 0, sizeof(name));
+
+	av_priv->domain->av_lock_fn(&av_priv->lock, 1);
+
+	if (idx >= av_priv->hdr->last) {
+		err = -FI_EINVAL;
+		goto out;
+	}
+
+	if (!av_priv->table[idx].valid) {
+		err = -FI_EINVAL;
+		goto out;
+	}
+
+	name.type = av_priv->table[idx].type;
+	name.epid = av_priv->table[idx].epid;
+	name.sep_id = av_priv->table[idx].sep_id;
+
+	if (av_priv->addr_format == FI_ADDR_STR) {
+		ofi_straddr(addr, addrlen, FI_ADDR_PSMX3, &name);
+	} else {
+		memcpy(addr, &name, MIN(*addrlen, sizeof(name)));
+		*addrlen = sizeof(name);
+	}
+
+out:
+	av_priv->domain->av_unlock_fn(&av_priv->lock, 1);
+	return err;
+}
+
+DIRECT_FN
+STATIC int psmx3_av_map_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr,
+			       size_t *addrlen)
+{
+	struct psmx3_fid_av *av_priv;
+	struct psmx3_ep_name name;
+
+	assert(addr);
+	assert(addrlen);
+
+	av_priv = container_of(av, struct psmx3_fid_av, av);
+
+	memset(&name, 0, sizeof(name));
+	psm2_epaddr_to_epid((psm2_epaddr_t)fi_addr, &name.epid);
+	name.type = PSMX3_EP_REGULAR;
+
+	if (av_priv->addr_format == FI_ADDR_STR) {
+		ofi_straddr(addr, addrlen, FI_ADDR_PSMX3, &name);
+	} else {
+		memcpy(addr, &name, MIN(*addrlen, sizeof(name)));
+		*addrlen = sizeof(name);
+	}
+
+	return 0;
+}
+
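+/*
+ * Reverse lookup: map the psm2 source address of an incoming message
+ * back to an fi_addr_t. The table is scanned from the most recently
+ * inserted entry backwards; for scalable endpoints the per-context
+ * epid list is fetched lazily via psmx3_av_query_sep() on first use.
+ */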
+fi_addr_t psmx3_av_translate_source(struct psmx3_fid_av *av,
+				    psm2_epaddr_t source, int source_sep_id)
+{
+	psm2_epid_t epid;
+	fi_addr_t ret;
+	int i, j, found;
+	int ep_type = source_sep_id ? PSMX3_EP_SCALABLE : PSMX3_EP_REGULAR;
+
+	if (av->type == FI_AV_MAP)
+		return (fi_addr_t) source;
+
+	psm2_epaddr_to_epid(source, &epid);
+
+	av->domain->av_lock_fn(&av->lock, 1);
+
+	ret = FI_ADDR_NOTAVAIL;
+	found = 0;
+	for (i = av->hdr->last - 1; i >= 0 && !found; i--) {
+		if (!av->table[i].valid)
+			continue;
+
+		if (av->table[i].type == PSMX3_EP_REGULAR) {
+			if (ep_type == PSMX3_EP_SCALABLE)
+				continue;
+			if (av->table[i].epid == epid) {
+				ret = (fi_addr_t)i;
+				found = 1;
+			}
+		} else {
+			/*
+			 * scalable endpoint must match sep_id exactly.
+			 * regular endpoint can match a context of any
+			 * scalable endpoint.
+			 */
+			if (ep_type == PSMX3_EP_SCALABLE &&
+			    av->table[i].sep_id != source_sep_id)
+				continue;
+
+			if (!av->sep_info[i].epids) {
+				for (j = 0; j < av->max_trx_ctxt; j++) {
+					if (av->conn_info[j].trx_ctxt)
+						break;
+				}
+				if (j >= av->max_trx_ctxt)
+					continue;
+				psmx3_av_query_sep(av, av->conn_info[j].trx_ctxt, i);
+				if (!av->sep_info[i].epids)
+					continue;
+			}
+
+			for (j=0; j<av->sep_info[i].ctxt_cnt; j++) {
+				if (av->sep_info[i].epids[j] == epid) {
+					ret = fi_rx_addr((fi_addr_t)i, j,
+							 av->rx_ctx_bits);
+					found = 1;
+					break;
+				}
+			}
+		}
+	}
+
+	av->domain->av_unlock_fn(&av->lock, 1);
+	return ret;
+}
+
+void psmx3_av_remove_conn(struct psmx3_fid_av *av,
+			  struct psmx3_trx_ctxt *trx_ctxt,
+			  psm2_epaddr_t epaddr)
+{
+	psm2_epid_t epid;
+	int i, j;
+
+	if (av->type == FI_AV_MAP)
+		return;
+
+	psm2_epaddr_to_epid(epaddr, &epid);
+
+	av->domain->av_lock_fn(&av->lock, 1);
+
+	for (i = 0; i < av->hdr->last; i++) {
+		if (!av->table[i].valid)
+			continue;
+		if (av->table[i].type == PSMX3_EP_REGULAR) {
+			if (av->table[i].epid == epid &&
+			    av->conn_info[trx_ctxt->id].epaddrs[i] == epaddr)
+				av->conn_info[trx_ctxt->id].epaddrs[i] = NULL;
+		} else {
+			if (!av->sep_info[i].epids)
+				continue;
+			for (j=0; j<av->sep_info[i].ctxt_cnt; j++) {
+				if (av->sep_info[i].epids[j] == epid &&
+				    av->conn_info[trx_ctxt->id].sepaddrs[i] &&
+				    av->conn_info[trx_ctxt->id].sepaddrs[i][j] == epaddr)
+					    av->conn_info[trx_ctxt->id].sepaddrs[i][j] = NULL;
+			}
+		}
+	}
+
+	av->domain->av_unlock_fn(&av->lock, 1);
+}
+
+DIRECT_FN
+STATIC const char *psmx3_av_straddr(struct fid_av *av, const void *addr,
+				    char *buf, size_t *len)
+{
+	return ofi_straddr(buf, len, FI_ADDR_PSMX3, addr);
+}
+
+static int psmx3_av_close(fid_t fid)
+{
+	struct psmx3_fid_av *av;
+	int i, j;
+	int err;
+
+	av = container_of(fid, struct psmx3_fid_av, av.fid);
+	psmx3_domain_release(av->domain);
+	fastlock_destroy(&av->lock);
+
+	if (av->type == FI_AV_MAP)
+		goto out;
+
+	for (i = 0; i < av->max_trx_ctxt; i++) {
+		if (!av->conn_info[i].trx_ctxt)
+			continue;
+		free(av->conn_info[i].epaddrs);
+		if (av->conn_info[i].sepaddrs) {
+			for (j = 0; j < av->hdr->last; j++)
+				free(av->conn_info[i].sepaddrs[j]);
+		}
+		free(av->conn_info[i].sepaddrs);
+	}
+	if (av->shared) {
+		err = ofi_shm_unmap(&av->shm);
+		if (err)
+			FI_INFO(&psmx3_prov, FI_LOG_AV,
+				"Failed to unmap shared AV: %s.\n",
+				strerror(ofi_syserr()));
+	} else {
+		free(av->hdr);
+	}
+
+	free(av->sep_info);
+out:
+	free(av);
+	return 0;
+}
+
+DIRECT_FN
+STATIC int psmx3_av_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
+{
+	struct psmx3_fid_av *av;
+
+	av = container_of(fid, struct psmx3_fid_av, av.fid);
+
+	assert(bfid);
+
+	switch (bfid->fclass) {
+	case FI_CLASS_EQ:
+		av->eq = (struct fid_eq *)bfid;
+		break;
+
+	default:
+		return -FI_ENOSYS;
+	}
+
+	return 0;
+}
+
+static struct fi_ops psmx3_fi_ops = {
+	.size = sizeof(struct fi_ops),
+	.close = psmx3_av_close,
+	.bind = psmx3_av_bind,
+	.control = fi_no_control,
+	.ops_open = fi_no_ops_open,
+};
+
+static struct fi_ops_av psmx3_av_ops = {
+	.size = sizeof(struct fi_ops_av),
+	.insert = psmx3_av_insert,
+	.insertsvc = fi_no_av_insertsvc,
+	.insertsym = fi_no_av_insertsym,
+	.remove = psmx3_av_remove,
+	.lookup = psmx3_av_lookup,
+	.straddr = psmx3_av_straddr,
+};
+
+static struct fi_ops_av psmx3_av_map_ops = {
+	.size = sizeof(struct fi_ops_av),
+	.insert = psmx3_av_map_insert,
+	.insertsvc = fi_no_av_insertsvc,
+	.insertsym = fi_no_av_insertsym,
+	.remove = psmx3_av_map_remove,
+	.lookup = psmx3_av_map_lookup,
+	.straddr = psmx3_av_straddr,
+};
+
+DIRECT_FN
+int psmx3_av_open(struct fid_domain *domain, struct fi_av_attr *attr,
+		  struct fid_av **av, void *context)
+{
+	struct psmx3_fid_domain *domain_priv;
+	struct psmx3_fid_av *av_priv;
+	size_t count = PSMX3_AV_DEFAULT_SIZE;
+	uint64_t flags = 0;
+	int shared = 0;
+	int rx_ctx_bits = PSMX3_MAX_RX_CTX_BITS;
+	size_t conn_size;
+	size_t table_size;
+	int av_type = FI_AV_TABLE;
+	int err;
+	int i;
+
+	domain_priv = container_of(domain, struct psmx3_fid_domain,
+				   util_domain.domain_fid);
+
+	if (attr) {
+		if (attr->type == FI_AV_MAP) {
+			if (psmx3_env.multi_ep) {
+				FI_INFO(&psmx3_prov, FI_LOG_AV,
+					"FI_AV_MAP requested, but forcing FI_AV_TABLE for multi-EP support\n");
+			} else if (psmx3_env.lazy_conn) {
+				FI_INFO(&psmx3_prov, FI_LOG_AV,
+					"FI_AV_MAP requested, but forcing FI_AV_TABLE for lazy connection\n");
+			} else if (attr->name) {
+				FI_INFO(&psmx3_prov, FI_LOG_AV,
+					"FI_AV_MAP requested, but forcing FI_AV_TABLE for shared AV\n");
+			} else {
+				FI_INFO(&psmx3_prov, FI_LOG_AV,
+					"FI_AV_MAP requested and granted\n");
+				av_type = FI_AV_MAP;
+			}
+		}
+
+		if (attr->count)
+			count = attr->count;
+
+		if (attr->name)
+			shared = 1;
+
+		flags = attr->flags;
+		if (flags & FI_SYMMETRIC) {
+			FI_INFO(&psmx3_prov, FI_LOG_AV,
+				"FI_SYMMETRIC flag is not supported\n");
+			return -FI_ENOSYS;
+		}
+
+		if (attr->rx_ctx_bits > PSMX3_MAX_RX_CTX_BITS) {
+			FI_INFO(&psmx3_prov, FI_LOG_AV,
+				"attr->rx_ctx_bits=%d, maximum allowed is %d\n",
+				attr->rx_ctx_bits, PSMX3_MAX_RX_CTX_BITS);
+			return -FI_ENOSYS;
+		}
+
+		rx_ctx_bits = attr->rx_ctx_bits;
+	}
+
+	if (av_type == FI_AV_MAP)
+		conn_size = 0;
+	else
+		conn_size = psmx3_hfi_info.max_trx_ctxt * sizeof(struct psmx3_av_conn);
+
+	av_priv = (struct psmx3_fid_av *) calloc(1, sizeof(*av_priv) + conn_size);
+	if (!av_priv)
+		return -FI_ENOMEM;
+
+	if (av_type == FI_AV_MAP)
+		goto init_lock;
+
+	av_priv->sep_info = calloc(count, sizeof(struct psmx3_av_sep));
+	if (!av_priv->sep_info) {
+		err = -FI_ENOMEM;
+		goto errout_free;
+	}
+
+	table_size = PSMX3_AV_TABLE_SIZE(count, shared);
+	if (attr && attr->name) {
+		err = ofi_shm_map(&av_priv->shm, attr->name, table_size,
+				  flags & FI_READ, (void**)&av_priv->hdr);
+		if (err || av_priv->hdr == MAP_FAILED) {
+			FI_WARN(&psmx3_prov, FI_LOG_AV,
+				"failed to map shared AV: %s\n", attr->name);
+			err = -FI_EINVAL;
+			goto errout_free;
+		}
+
+		if (flags & FI_READ) {
+			if (av_priv->hdr->size != count) {
+				FI_WARN(&psmx3_prov, FI_LOG_AV,
+					"AV size mismatch: shared %ld, requested %ld\n",
+					av_priv->hdr->size, count);
+				err = -FI_EINVAL;
+				goto errout_free;
+			}
+		} else {
+			av_priv->hdr->size = count;
+			av_priv->hdr->last = 0;
+		}
+		av_priv->shared = 1;
+		av_priv->map = (fi_addr_t *)(av_priv->hdr + 1);
+		av_priv->table = (struct psmx3_av_addr *)(av_priv->map + count);
+		for (i = 0; i < count; i++)
+			av_priv->map[i] = i;
+	} else {
+		av_priv->hdr = calloc(1, table_size);
+		if (!av_priv->hdr) {
+			err = -FI_ENOMEM;
+			goto errout_free;
+		}
+		av_priv->hdr->size = count;
+		av_priv->table = (struct psmx3_av_addr *)(av_priv->hdr + 1);
+	}
+
+init_lock:
+	fastlock_init(&av_priv->lock);
+
+	psmx3_domain_acquire(domain_priv);
+
+	av_priv->domain = domain_priv;
+	av_priv->addrlen = sizeof(psm2_epaddr_t);
+	av_priv->count = count;
+	av_priv->flags = flags;
+	av_priv->rx_ctx_bits = rx_ctx_bits;
+	av_priv->max_trx_ctxt = psmx3_hfi_info.max_trx_ctxt;
+	av_priv->addr_format = domain_priv->addr_format;
+	av_priv->type = av_type;
+
+	av_priv->av.fid.fclass = FI_CLASS_AV;
+	av_priv->av.fid.context = context;
+	av_priv->av.fid.ops = &psmx3_fi_ops;
+	if (av_type == FI_AV_MAP)
+		av_priv->av.ops = &psmx3_av_map_ops;
+	else
+		av_priv->av.ops = &psmx3_av_ops;
+
+	*av = &av_priv->av;
+	if (attr) {
+		attr->type = av_type;
+		if (shared)
+			attr->map_addr = av_priv->map;
+	}
+
+	return 0;
+
+errout_free:
+	free(av_priv->sep_info);
+	free(av_priv);
+	return err;
+}
+
diff --git a/deps/libfabric/prov/psm3/src/psmx3_cm.c b/deps/libfabric/prov/psm3/src/psmx3_cm.c
new file mode 100644
index 0000000000000000000000000000000000000000..c5e2d3d40c1c84369f68ec2fb7ba38c0688ec633
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psmx3_cm.c
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2013-2018 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psmx3.h"
+
+DIRECT_FN
+STATIC int psmx3_cm_getname(fid_t fid, void *addr, size_t *addrlen)
+{
+	struct psmx3_fid_ep *ep;
+	struct psmx3_fid_sep *sep;
+	struct psmx3_ep_name epname;
+	size_t	addr_size;
+	int err = 0;
+
+	ep = container_of(fid, struct psmx3_fid_ep, ep.fid);
+	if (!ep->domain)
+		return -FI_EBADF;
+
+	memset(&epname, 0, sizeof(epname));
+
+	if (ep->type == PSMX3_EP_REGULAR) {
+		epname.epid = ep->rx ? ep->rx->psm2_epid : 0;
+		epname.type = ep->type;
+	} else {
+		sep = (struct psmx3_fid_sep *)ep;
+		epname.epid = sep->ctxts[0].trx_ctxt->psm2_epid;
+		epname.sep_id = sep->id;
+		epname.type = sep->type;
+	}
+
+	if (ep->domain->addr_format == FI_ADDR_STR) {
+		addr_size = *addrlen;
+		ofi_straddr(addr, &addr_size, FI_ADDR_PSMX3, &epname);
+	} else {
+		addr_size = sizeof(epname);
+		memcpy(addr, &epname, MIN(*addrlen, addr_size));
+	}
+
+	if (*addrlen < addr_size)
+		err = -FI_ETOOSMALL;
+
+	*addrlen = addr_size;
+	return err;
+}
+
+struct fi_ops_cm psmx3_cm_ops = {
+	.size = sizeof(struct fi_ops_cm),
+	.setname = fi_no_setname,
+	.getname = psmx3_cm_getname,
+	.getpeer = fi_no_getpeer,
+	.connect = fi_no_connect,
+	.listen = fi_no_listen,
+	.accept = fi_no_accept,
+	.reject = fi_no_reject,
+	.shutdown = fi_no_shutdown,
+	.join = fi_no_join,
+};
+
diff --git a/deps/libfabric/prov/psm3/src/psmx3_cntr.c b/deps/libfabric/prov/psm3/src/psmx3_cntr.c
new file mode 100644
index 0000000000000000000000000000000000000000..a1a92d9bb4b2670ebb2062de9a0ffa75f39454a6
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psmx3_cntr.c
@@ -0,0 +1,443 @@
+/*
+ * Copyright (c) 2013-2019 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psmx3.h"
+#include "psmx3_trigger.h"
+
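+/*
+ * Move triggered operations whose threshold has been reached off the
+ * counter's trigger list. Ready triggers are queued on the owning tx/rx
+ * context's trigger queue for later processing, or processed inline when
+ * active messages are not initialized yet.
+ */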
+void psmx3_cntr_check_trigger(struct psmx3_fid_cntr *cntr)
+{
+	struct psmx3_trigger *trigger;
+	struct psmx3_trx_ctxt *trx_ctxt;
+	struct psmx3_fid_ep *ep;
+
+	if (!cntr->trigger)
+		return;
+
+	cntr->domain->trigger_lock_fn(&cntr->trigger_lock, 2);
+
+	trigger = cntr->trigger;
+	while (trigger) {
+		if (ofi_atomic_get64(&cntr->counter) < trigger->threshold)
+			break;
+
+		cntr->trigger = trigger->next;
+
+		/* 'ep' is the first field of the union regardless of the op type */
+		ep = container_of(trigger->send.ep, struct psmx3_fid_ep, ep);
+
+		switch (trigger->op) {
+		case PSMX3_TRIGGERED_RECV:
+		case PSMX3_TRIGGERED_TRECV:
+			trx_ctxt = ep->rx;
+			break;
+		default:
+			trx_ctxt = ep->tx;
+			break;
+		}
+
+		if (trx_ctxt->am_initialized) {
+			cntr->domain->trigger_queue_lock_fn(&trx_ctxt->trigger_queue.lock, 2);
+			slist_insert_tail(&trigger->list_entry,
+					  &trx_ctxt->trigger_queue.list);
+			cntr->domain->trigger_queue_unlock_fn(&trx_ctxt->trigger_queue.lock, 2);
+		} else {
+			psmx3_process_trigger(trx_ctxt, trigger);
+		}
+
+		trigger = cntr->trigger;
+	}
+
+	cntr->domain->trigger_unlock_fn(&cntr->trigger_lock, 2);
+}
+
+void psmx3_cntr_add_trigger(struct psmx3_fid_cntr *cntr,
+			    struct psmx3_trigger *trigger)
+{
+	struct psmx3_trigger *p, *q;
+
+	cntr->domain->trigger_lock_fn(&cntr->trigger_lock, 2);
+
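+	/* Insert in ascending threshold order, after existing entries with an
+	 * equal threshold, so triggers with the same threshold fire in FIFO order.
+	 */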
+	q = NULL;
+	p = cntr->trigger;
+	while (p && p->threshold <= trigger->threshold) {
+		q = p;
+		p = p->next;
+	}
+	if (q)
+		q->next = trigger;
+	else
+		cntr->trigger = trigger;
+	trigger->next = p;
+
+	cntr->domain->trigger_unlock_fn(&cntr->trigger_lock, 2);
+
+	psmx3_cntr_check_trigger(cntr);
+}
+
+DIRECT_FN
+STATIC uint64_t psmx3_cntr_read(struct fid_cntr *cntr)
+{
+	struct psmx3_fid_cntr *cntr_priv;
+	struct psmx3_poll_ctxt *poll_ctxt;
+	struct slist_entry *item, *prev;
+
+	cntr_priv = container_of(cntr, struct psmx3_fid_cntr, cntr);
+
+	if (cntr_priv->poll_all) {
+		psmx3_progress_all(cntr_priv->domain);
+	} else {
+		slist_foreach(&cntr_priv->poll_list, item, prev) {
+			poll_ctxt = container_of(item,
+						 struct psmx3_poll_ctxt,
+						 list_entry);
+			psmx3_progress(poll_ctxt->trx_ctxt);
+			(void) prev; /* suppress compiler warning */
+		}
+	}
+
+	return ofi_atomic_get64(&cntr_priv->counter);
+}
+
+DIRECT_FN
+STATIC uint64_t psmx3_cntr_readerr(struct fid_cntr *cntr)
+{
+	struct psmx3_fid_cntr *cntr_priv;
+
+	cntr_priv = container_of(cntr, struct psmx3_fid_cntr, cntr);
+	cntr_priv->error_avail = 0;
+
+	return ofi_atomic_get64(&cntr_priv->error_counter);
+}
+
+DIRECT_FN
+STATIC int psmx3_cntr_add(struct fid_cntr *cntr, uint64_t value)
+{
+	struct psmx3_fid_cntr *cntr_priv;
+
+	cntr_priv = container_of(cntr, struct psmx3_fid_cntr, cntr);
+	ofi_atomic_add64(&cntr_priv->counter, value);
+
+	psmx3_cntr_check_trigger(cntr_priv);
+
+	if (cntr_priv->wait)
+		cntr_priv->wait->signal(cntr_priv->wait);
+
+	return 0;
+}
+
+DIRECT_FN
+STATIC int psmx3_cntr_set(struct fid_cntr *cntr, uint64_t value)
+{
+	struct psmx3_fid_cntr *cntr_priv;
+
+	cntr_priv = container_of(cntr, struct psmx3_fid_cntr, cntr);
+	ofi_atomic_set64(&cntr_priv->counter, value);
+
+	psmx3_cntr_check_trigger(cntr_priv);
+
+	if (cntr_priv->wait)
+		cntr_priv->wait->signal(cntr_priv->wait);
+
+	return 0;
+}
+
+DIRECT_FN
+STATIC int psmx3_cntr_adderr(struct fid_cntr *cntr, uint64_t value)
+{
+	struct psmx3_fid_cntr *cntr_priv;
+
+	cntr_priv = container_of(cntr, struct psmx3_fid_cntr, cntr);
+	ofi_atomic_add64(&cntr_priv->error_counter, value);
+	cntr_priv->error_avail = 1;
+
+	psmx3_cntr_check_trigger(cntr_priv);
+
+	if (cntr_priv->wait)
+		cntr_priv->wait->signal(cntr_priv->wait);
+
+	return 0;
+}
+
+DIRECT_FN
+STATIC int psmx3_cntr_seterr(struct fid_cntr *cntr, uint64_t value)
+{
+	struct psmx3_fid_cntr *cntr_priv;
+
+	cntr_priv = container_of(cntr, struct psmx3_fid_cntr, cntr);
+	ofi_atomic_set64(&cntr_priv->error_counter, value);
+	cntr_priv->error_avail = 1;
+
+	psmx3_cntr_check_trigger(cntr_priv);
+
+	if (cntr_priv->wait)
+		cntr_priv->wait->signal(cntr_priv->wait);
+
+	return 0;
+}
+
+DIRECT_FN
+STATIC int psmx3_cntr_wait(struct fid_cntr *cntr, uint64_t threshold, int timeout)
+{
+	struct psmx3_fid_cntr *cntr_priv;
+	struct psmx3_poll_ctxt *poll_ctxt;
+	struct slist_entry *item, *prev;
+	struct timespec ts0, ts;
+	int msec_passed = 0;
+	int ret = 0;
+
+	cntr_priv = container_of(cntr, struct psmx3_fid_cntr, cntr);
+
+	clock_gettime(CLOCK_REALTIME, &ts0);
+
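+	/* Make progress (or block on the wait object) until the counter reaches
+	 * the threshold, an error is posted, or the timeout expires.
+	 */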
+	while (ofi_atomic_get64(&cntr_priv->counter) < threshold) {
+		if (cntr_priv->error_avail) {
+			ret = -FI_EAVAIL;
+			break;
+		}
+
+		if (cntr_priv->wait) {
+			ret = fi_wait((struct fid_wait *)cntr_priv->wait,
+				      timeout - msec_passed);
+			if (ret == -FI_ETIMEDOUT)
+				break;
+		} else if (cntr_priv->poll_all) {
+			psmx3_progress_all(cntr_priv->domain);
+		} else {
+			slist_foreach(&cntr_priv->poll_list, item, prev) {
+				poll_ctxt = container_of(item,
+							 struct psmx3_poll_ctxt,
+							 list_entry);
+				psmx3_progress(poll_ctxt->trx_ctxt);
+				(void) prev; /* suppress compiler warning */
+			}
+		}
+
+		if (cntr_priv->error_avail) {
+			ret = -FI_EAVAIL;
+			break;
+		}
+
+		if (ofi_atomic_get64(&cntr_priv->counter) >= threshold)
+			break;
+
+		if (timeout < 0)
+			continue;
+
+		clock_gettime(CLOCK_REALTIME, &ts);
+		msec_passed = (ts.tv_sec - ts0.tv_sec) * 1000 +
+			      (ts.tv_nsec - ts0.tv_nsec) / 1000000;
+
+		if (msec_passed >= timeout) {
+			ret = -FI_ETIMEDOUT;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int psmx3_cntr_close(fid_t fid)
+{
+	struct psmx3_fid_cntr *cntr;
+	struct psmx3_poll_ctxt *item;
+	struct slist_entry *entry;
+
+	cntr = container_of(fid, struct psmx3_fid_cntr, cntr.fid);
+
+	while (!slist_empty(&cntr->poll_list)) {
+		entry = slist_remove_head(&cntr->poll_list);
+		item = container_of(entry, struct psmx3_poll_ctxt, list_entry);
+		if (!ofi_atomic_dec32(&item->trx_ctxt->poll_refcnt))
+			free(item->trx_ctxt);
+		free(item);
+	}
+
+	if (cntr->wait) {
+		fi_poll_del(&cntr->wait->pollset->poll_fid, &cntr->cntr.fid, 0);
+		if (cntr->wait_is_local)
+			fi_close(&cntr->wait->wait_fid.fid);
+	}
+
+	fastlock_destroy(&cntr->trigger_lock);
+	psmx3_domain_release(cntr->domain);
+	free(cntr);
+
+	return 0;
+}
+
+static int psmx3_cntr_control(fid_t fid, int command, void *arg)
+{
+	struct psmx3_fid_cntr *cntr;
+	int ret = 0;
+
+	cntr = container_of(fid, struct psmx3_fid_cntr, cntr.fid);
+
+	switch (command) {
+	case FI_SETOPSFLAG:
+		cntr->flags = *(uint64_t *)arg;
+		break;
+
+	case FI_GETOPSFLAG:
+		if (!arg)
+			return -FI_EINVAL;
+		*(uint64_t *)arg = cntr->flags;
+		break;
+
+	case FI_GETWAIT:
+		if (cntr->wait)
+			ret = fi_control(&cntr->wait->wait_fid.fid, FI_GETWAIT, arg);
+		else
+			return -FI_EINVAL;
+		break;
+	default:
+		return -FI_ENOSYS;
+	}
+
+	return ret;
+}
+
+static struct fi_ops psmx3_fi_ops = {
+	.size = sizeof(struct fi_ops),
+	.close = psmx3_cntr_close,
+	.bind = fi_no_bind,
+	.control = psmx3_cntr_control,
+	.ops_open = fi_no_ops_open,
+};
+
+static struct fi_ops_cntr psmx3_cntr_ops = {
+	.size = sizeof(struct fi_ops_cntr),
+	.read = psmx3_cntr_read,
+	.readerr = psmx3_cntr_readerr,
+	.add = psmx3_cntr_add,
+	.set = psmx3_cntr_set,
+	.wait = psmx3_cntr_wait,
+	.adderr = psmx3_cntr_adderr,
+	.seterr = psmx3_cntr_seterr,
+};
+
+DIRECT_FN
+int psmx3_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr,
+			struct fid_cntr **cntr, void *context)
+{
+	struct psmx3_fid_domain *domain_priv;
+	struct psmx3_fid_cntr *cntr_priv;
+	struct fid_wait *wait = NULL;
+	struct fi_wait_attr wait_attr;
+	int wait_is_local = 0;
+	int events;
+	uint64_t flags;
+	int err;
+
+	flags = 0;
+	domain_priv = container_of(domain, struct psmx3_fid_domain,
+				   util_domain.domain_fid);
+
+	switch (attr->events) {
+	case FI_CNTR_EVENTS_COMP:
+		events = attr->events;
+		break;
+
+	default:
+		FI_INFO(&psmx3_prov, FI_LOG_CQ,
+			"attr->events=%d, supported=%d\n",
+			attr->events, FI_CNTR_EVENTS_COMP);
+		return -FI_EINVAL;
+	}
+
+	switch (attr->wait_obj) {
+	case FI_WAIT_NONE:
+	case FI_WAIT_UNSPEC:
+		break;
+
+	case FI_WAIT_SET:
+		if (!attr->wait_set) {
+			FI_INFO(&psmx3_prov, FI_LOG_CQ,
+				"FI_WAIT_SET is specified but attr->wait_set is NULL\n");
+			return -FI_EINVAL;
+		}
+		wait = attr->wait_set;
+		break;
+
+	case FI_WAIT_FD:
+	case FI_WAIT_MUTEX_COND:
+		wait_attr.wait_obj = attr->wait_obj;
+		wait_attr.flags = 0;
+		err = fi_wait_open(&domain_priv->fabric->util_fabric.fabric_fid,
+				      &wait_attr, (struct fid_wait **)&wait);
+		if (err)
+			return err;
+		wait_is_local = 1;
+		break;
+
+	default:
+		FI_INFO(&psmx3_prov, FI_LOG_CQ,
+			"attr->wait_obj=%d, supported=%d...%d\n",
+			attr->wait_obj, FI_WAIT_NONE, FI_WAIT_MUTEX_COND);
+		return -FI_EINVAL;
+	}
+
+	cntr_priv = (struct psmx3_fid_cntr *) calloc(1, sizeof *cntr_priv);
+	if (!cntr_priv) {
+		err = -FI_ENOMEM;
+		goto fail;
+	}
+
+	cntr_priv->domain = domain_priv;
+	cntr_priv->events = events;
+	if (wait)
+		cntr_priv->wait = container_of(wait, struct util_wait, wait_fid);
+	cntr_priv->wait_is_local = wait_is_local;
+	cntr_priv->flags = flags;
+	cntr_priv->cntr.fid.fclass = FI_CLASS_CNTR;
+	cntr_priv->cntr.fid.context = context;
+	cntr_priv->cntr.fid.ops = &psmx3_fi_ops;
+	cntr_priv->cntr.ops = &psmx3_cntr_ops;
+	ofi_atomic_initialize64(&cntr_priv->counter, 0);
+	ofi_atomic_initialize64(&cntr_priv->error_counter, 0);
+
+	slist_init(&cntr_priv->poll_list);
+	fastlock_init(&cntr_priv->trigger_lock);
+
+	if (wait)
+		fi_poll_add(&cntr_priv->wait->pollset->poll_fid,
+			    &cntr_priv->cntr.fid, 0);
+
+	psmx3_domain_acquire(domain_priv);
+	*cntr = &cntr_priv->cntr;
+	return 0;
+fail:
+	if (wait && wait_is_local)
+		fi_close(&wait->fid);
+	return err;
+}
+
diff --git a/deps/libfabric/prov/psm3/src/psmx3_cq.c b/deps/libfabric/prov/psm3/src/psmx3_cq.c
new file mode 100644
index 0000000000000000000000000000000000000000..759fa2ae03c983cec1d3b27fbd83e82a58512392
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psmx3_cq.c
@@ -0,0 +1,1299 @@
+/*
+ * Copyright (c) 2013-2018 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psmx3.h"
+
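+/*
+ * CQ events live on a lock-protected slist; a per-CQ free list recycles
+ * event structs so the fast path rarely has to call calloc().
+ */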
+void psmx3_cq_enqueue_event(struct psmx3_fid_cq *cq,
+			    struct psmx3_cq_event *event)
+{
+	cq->domain->cq_lock_fn(&cq->lock, 2);
+	slist_insert_tail(&event->list_entry, &cq->event_queue);
+	cq->event_count++;
+	cq->domain->cq_unlock_fn(&cq->lock, 2);
+
+	if (cq->wait)
+		cq->wait->signal(cq->wait);
+}
+
+static struct psmx3_cq_event *psmx3_cq_dequeue_event(struct psmx3_fid_cq *cq)
+{
+	struct slist_entry *entry;
+
+	cq->domain->cq_lock_fn(&cq->lock, 2);
+	if (slist_empty(&cq->event_queue)) {
+		cq->domain->cq_unlock_fn(&cq->lock, 2);
+		return NULL;
+	}
+	entry = slist_remove_head(&cq->event_queue);
+	cq->event_count--;
+	cq->domain->cq_unlock_fn(&cq->lock, 2);
+
+	return container_of(entry, struct psmx3_cq_event, list_entry);
+}
+
+static struct psmx3_cq_event *psmx3_cq_alloc_event(struct psmx3_fid_cq *cq)
+{
+	struct psmx3_cq_event *event;
+
+	cq->domain->cq_lock_fn(&cq->lock, 2);
+	if (!slist_empty(&cq->free_list)) {
+		event = container_of(slist_remove_head(&cq->free_list),
+				     struct psmx3_cq_event, list_entry);
+		cq->domain->cq_unlock_fn(&cq->lock, 2);
+		return event;
+	}
+
+	cq->domain->cq_unlock_fn(&cq->lock, 2);
+	event = calloc(1, sizeof(*event));
+	if (!event)
+		FI_WARN(&psmx3_prov, FI_LOG_CQ, "out of memory.\n");
+
+	return event;
+}
+
+static void psmx3_cq_free_event(struct psmx3_fid_cq *cq,
+				struct psmx3_cq_event *event)
+{
+	memset(event, 0, sizeof(*event));
+
+	cq->domain->cq_lock_fn(&cq->lock, 2);
+	slist_insert_tail(&event->list_entry, &cq->free_list);
+	cq->domain->cq_unlock_fn(&cq->lock, 2);
+}
+
+struct psmx3_cq_event *psmx3_cq_create_event(struct psmx3_fid_cq *cq,
+					     void *op_context, void *buf,
+					     uint64_t flags, size_t len,
+					     uint64_t data, uint64_t tag,
+					     size_t olen, int err)
+{
+	struct psmx3_cq_event *event;
+
+	event = psmx3_cq_alloc_event(cq);
+	if (!event)
+		return NULL;
+
+	if ((event->error = !!err)) {
+		event->cqe.err.op_context = op_context;
+		event->cqe.err.err = -err;
+		event->cqe.err.data = data;
+		event->cqe.err.tag = tag;
+		event->cqe.err.olen = olen;
+		event->cqe.err.flags = flags;
+		event->cqe.err.prov_errno = PSM2_INTERNAL_ERR;
+		goto out;
+	}
+
+	switch (cq->format) {
+	case FI_CQ_FORMAT_CONTEXT:
+		event->cqe.context.op_context = op_context;
+		break;
+
+	case FI_CQ_FORMAT_MSG:
+		event->cqe.msg.op_context = op_context;
+		event->cqe.msg.flags = flags;
+		event->cqe.msg.len = len;
+		break;
+
+	case FI_CQ_FORMAT_DATA:
+		event->cqe.data.op_context = op_context;
+		event->cqe.data.buf = buf;
+		event->cqe.data.flags = flags;
+		event->cqe.data.len = len;
+		event->cqe.data.data = data;
+		break;
+
+	case FI_CQ_FORMAT_TAGGED:
+		event->cqe.tagged.op_context = op_context;
+		event->cqe.tagged.buf = buf;
+		event->cqe.tagged.flags = flags;
+		event->cqe.tagged.len = len;
+		event->cqe.tagged.data = data;
+		event->cqe.tagged.tag = tag;
+		break;
+
+	default:
+		FI_WARN(&psmx3_prov, FI_LOG_CQ,
+			"unsupported CQ format %d\n", cq->format);
+		psmx3_cq_free_event(cq, event);
+		return NULL;
+	}
+
+out:
+	return event;
+}
+
+static uint64_t psmx3_comp_flags[PSMX3_MAX_CONTEXT_TYPE] = {
+	[PSMX3_NOCOMP_SEND_CONTEXT]	= FI_SEND | FI_MSG,
+	[PSMX3_NOCOMP_RECV_CONTEXT]	= FI_RECV | FI_MSG,
+	[PSMX3_NOCOMP_TSEND_CONTEXT]	= FI_SEND | FI_TAGGED,
+	[PSMX3_NOCOMP_TRECV_CONTEXT]	= FI_RECV | FI_TAGGED,
+	[PSMX3_NOCOMP_WRITE_CONTEXT]	= FI_WRITE | FI_RMA,
+	[PSMX3_NOCOMP_READ_CONTEXT]	= FI_READ | FI_RMA,
+	[PSMX3_SEND_CONTEXT]		= FI_SEND | FI_MSG,
+	[PSMX3_RECV_CONTEXT]		= FI_RECV | FI_MSG,
+	[PSMX3_MULTI_RECV_CONTEXT]	= FI_RECV | FI_MSG,
+	[PSMX3_TSEND_CONTEXT]		= FI_SEND | FI_TAGGED,
+	[PSMX3_TRECV_CONTEXT]		= FI_RECV | FI_TAGGED,
+	[PSMX3_WRITE_CONTEXT]		= FI_WRITE | FI_RMA,
+	[PSMX3_READ_CONTEXT]		= FI_READ | FI_RMA,
+	[PSMX3_REMOTE_WRITE_CONTEXT]	= FI_REMOTE_WRITE | FI_RMA,
+	[PSMX3_REMOTE_READ_CONTEXT]	= FI_REMOTE_READ | FI_RMA,
+	[PSMX3_SENDV_CONTEXT]		= FI_SEND,
+	[PSMX3_IOV_SEND_CONTEXT]	= FI_SEND,
+	[PSMX3_IOV_RECV_CONTEXT]	= FI_RECV,
+};
+
+/*
+ * Translate "status" into completion event. A few factors determine where to
+ * save the event.
+ *
+ * If:
+ *
+ * (1) the CQE is for the CQ being polled; and
+ * (2) event buffer is supplied (event_in != NULL); and
+ * (3) the CQE is not an error entry,
+ *
+ * then the event is written to the event buffer directly. Otherwise a CQE is
+ * allocated on the corresponding CQ.
+ *
+ * The function doesn't use PSMX3_STATUS_CONTEXT(status) because the context
+ * field could refer to an allocated descriptor that may have already been
+ * freed. All the information that depends on that field is obtained in
+ * advance and passed in as separate parameters ("op_context", "buf",
+ * "flags", "data", and "is_recv").
+ *
+ * The flag "event_saved" tells the caller that the event was written to the
+ * user-provided buffer; otherwise the event was an error entry or has been
+ * queued on the comp_cq slist.
+ */
+
+__attribute__((always_inline))
+static inline int psmx3_cq_any_complete(struct psmx3_fid_cq *poll_cq,
+					struct psmx3_fid_cq *comp_cq,
+					struct psmx3_fid_av *av,
+					PSMX3_STATUS_TYPE *status,
+					void *op_context,
+					void *buf,
+					uint64_t flags,
+					uint64_t data,
+					struct psmx3_cq_event *event_in,
+					int *event_saved,
+					fi_addr_t *src_addr,
+					int is_recv)
+{
+	struct psmx3_cq_event *event = event_in;
+
+	*event_saved = 1;
+
+	if (OFI_UNLIKELY(PSMX3_STATUS_ERROR(status))) {
+		*event_saved = 0;
+		event = psmx3_cq_alloc_event(comp_cq);
+		if (!event)
+			return -FI_ENOMEM;
+
+		event->error = 1;
+		event->cqe.err.op_context = op_context;
+		event->cqe.err.flags = flags;
+		event->cqe.err.err = -psmx3_errno(PSMX3_STATUS_ERROR(status));
+		event->cqe.err.prov_errno = PSMX3_STATUS_ERROR(status);
+		event->cqe.err.tag = PSMX3_GET_TAG64(PSMX3_STATUS_TAG(status));
+		event->cqe.err.olen = PSMX3_STATUS_SNDLEN(status) - PSMX3_STATUS_RCVLEN(status);
+		event->cqe.err.data = data;
+
+		psmx3_cq_enqueue_event(comp_cq, event);
+		return 0;
+	}
+
+	if (OFI_UNLIKELY(poll_cq != comp_cq || !event)) {
+		*event_saved = 0;
+		event = psmx3_cq_alloc_event(comp_cq);
+		if (!event)
+			return -FI_ENOMEM;
+
+		event->error = 0;
+	}
+
+	if (is_recv) {
+		psm2_epaddr_t source = PSMX3_STATUS_PEER(status);
+		int source_sep_id = (flags & FI_REMOTE_CQ_DATA) ? 0 : data;
+
+		if (event == event_in) {
+			if (src_addr) {
+				src_addr[0] = psmx3_av_translate_source(av, source,
+									source_sep_id);
+				if (src_addr[0] == FI_ADDR_NOTAVAIL) {
+					*event_saved = 0;
+					event = psmx3_cq_alloc_event(comp_cq);
+					if (!event)
+						return -FI_ENOMEM;
+
+					event->cqe = event_in->cqe;
+					event->cqe.err.err = FI_EADDRNOTAVAIL;
+					event->cqe.err.err_data = &comp_cq->error_data;
+					event->error = !!event->cqe.err.err;
+					if (av->addr_format == FI_ADDR_STR) {
+						event->cqe.err.err_data_size = PSMX3_ERR_DATA_SIZE;
+						psmx3_get_source_string_name(
+							source, source_sep_id,
+							(void *)&comp_cq->error_data,
+							&event->cqe.err.err_data_size);
+					} else {
+						psmx3_get_source_name(
+							source, source_sep_id,
+							(void *)&comp_cq->error_data);
+						event->cqe.err.err_data_size = sizeof(struct psmx3_ep_name);
+					}
+				}
+			}
+		} else {
+			event->source_is_valid = 1;
+			event->source_sep_id = source_sep_id;
+			event->source = source;
+			event->source_av = av;
+		}
+	}
+
+	switch (comp_cq->format) {
+	case FI_CQ_FORMAT_CONTEXT:
+		event->cqe.context.op_context = op_context;
+		break;
+
+	case FI_CQ_FORMAT_MSG:
+		event->cqe.msg.op_context = op_context;
+		event->cqe.msg.flags = flags;
+		event->cqe.msg.len = PSMX3_STATUS_RCVLEN(status);
+		break;
+
+	case FI_CQ_FORMAT_DATA:
+		event->cqe.data.op_context = op_context;
+		event->cqe.data.buf = buf;
+		event->cqe.data.flags = flags;
+		event->cqe.data.len = PSMX3_STATUS_RCVLEN(status);
+		event->cqe.data.data = data;
+		break;
+
+	case FI_CQ_FORMAT_TAGGED:
+		event->cqe.tagged.op_context = op_context;
+		event->cqe.tagged.buf = buf;
+		event->cqe.tagged.flags = flags;
+		event->cqe.tagged.len = PSMX3_STATUS_RCVLEN(status);
+		event->cqe.tagged.data = data;
+		event->cqe.tagged.tag = PSMX3_GET_TAG64(PSMX3_STATUS_TAG(status));
+		break;
+
+	default:
+		FI_WARN(&psmx3_prov, FI_LOG_CQ,
+			"unsupported CQ format %d\n", comp_cq->format);
+		if (event != event_in)
+			psmx3_cq_free_event(comp_cq, event);
+		return -FI_EINVAL;
+	}
+
+	if (OFI_UNLIKELY(event != event_in))
+		psmx3_cq_enqueue_event(comp_cq, event);
+
+	return 0;
+}
+
+static inline int psmx3_cq_tx_complete(struct psmx3_fid_cq *poll_cq,
+				       struct psmx3_fid_cq *comp_cq,
+				       struct psmx3_fid_av *av,
+				       PSMX3_STATUS_TYPE *status,
+				       void *op_context,
+				       void *buf,
+				       uint64_t flags,
+				       uint64_t data,
+				       struct psmx3_cq_event *event_in,
+				       int *event_saved)
+{
+	return psmx3_cq_any_complete(poll_cq, comp_cq, av, status,
+				     op_context, buf, flags, data,
+				     event_in, event_saved, NULL, 0);
+}
+
+static inline int psmx3_cq_rx_complete(struct psmx3_fid_cq *poll_cq,
+				       struct psmx3_fid_cq *comp_cq,
+				       struct psmx3_fid_av *av,
+				       PSMX3_STATUS_TYPE *status,
+				       void *op_context,
+				       void *buf,
+				       uint64_t flags,
+				       uint64_t data,
+				       struct psmx3_cq_event *event_in,
+				       fi_addr_t *src_addr,
+				       int *event_saved)
+{
+	return psmx3_cq_any_complete(poll_cq, comp_cq, av, status,
+				     op_context, buf, flags, data,
+				     event_in, event_saved, src_addr, 1);
+}
+
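+/*
+ * Status-copy callback used with psm2_mq_ipeek_dequeue_multi(): translate one
+ * completed PSM2 request into a libfabric completion. Returns a negative
+ * fi_errno on failure; otherwise returns 'event_saved', which is 0 when the
+ * completion was queued on a CQ (or consumed internally) instead of being
+ * written into the caller's event buffer.
+ */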
+int
+psmx3_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry_index)
+{
+	struct fi_context *fi_context;
+	struct psmx3_fid_ep *ep;
+	struct psmx3_fid_mr *mr;
+	struct psmx3_am_request *am_req;
+	struct psmx3_multi_recv *multi_recv_req;
+	struct psmx3_sendv_request *sendv_req;
+	struct psmx3_sendv_reply *sendv_rep;
+	psm2_mq_req_t psm2_req;
+	size_t len_remaining;
+	void *op_context;
+	void *buf;
+	uint64_t flags;
+	uint64_t data;
+	int err;
+	int context_type;
+	int event_saved = 0;
+	void *entry = NULL;
+
+	struct psmx3_status_data *status_data = status_array;
+
+	if (OFI_LIKELY(status_data->event_buffer && status_data->poll_cq))
+		entry = (uint8_t *)status_data->event_buffer +
+				(entry_index * status_data->poll_cq->entry_size);
+
+	fi_context = PSMX3_STATUS_CONTEXT(req);
+
+	if (OFI_UNLIKELY(!fi_context))
+		return 0;
+
+	context_type = (int)PSMX3_CTXT_TYPE(fi_context);
+	flags = psmx3_comp_flags[context_type];
+	ep = PSMX3_CTXT_EP(fi_context);
+
+	switch (context_type) {
+	case PSMX3_SEND_CONTEXT:
+	case PSMX3_TSEND_CONTEXT:
+		if (ep->send_cq) {
+			op_context = fi_context;
+			buf = PSMX3_CTXT_USER(fi_context);
+			err = psmx3_cq_tx_complete(
+					status_data->poll_cq, ep->send_cq, ep->av,
+					req, op_context, buf, flags, 0,
+					entry, &event_saved);
+			if (OFI_UNLIKELY(err))
+				return err;
+		}
+		if (ep->send_cntr)
+			psmx3_cntr_inc(ep->send_cntr, PSMX3_STATUS_ERROR(req));
+
+		/* Bi-directional send/recv performance tweak for KNL */
+		if (event_saved && PSMX3_STATUS_SNDLEN(req) > 16384)
+			event_saved++;
+		break;
+
+	case PSMX3_NOCOMP_SEND_CONTEXT:
+	case PSMX3_NOCOMP_TSEND_CONTEXT:
+		if (OFI_UNLIKELY(ep->send_cq && PSMX3_STATUS_ERROR(req))) {
+			err = psmx3_cq_tx_complete(
+					status_data->poll_cq, ep->send_cq, ep->av,
+					req, NULL, NULL, flags, 0,
+					entry, &event_saved);
+			if (OFI_UNLIKELY(err))
+				return err;
+		}
+		if (ep->send_cntr)
+			psmx3_cntr_inc(ep->send_cntr, PSMX3_STATUS_ERROR(req));
+		break;
+
+	case PSMX3_RECV_CONTEXT:
+		if (OFI_UNLIKELY(PSMX3_IS_IOV_HEADER(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req))) &&
+				  !psmx3_handle_sendv_req(ep, req, 0))) {
+			return 0;
+		}
+		if (ep->recv_cq) {
+			op_context = fi_context;
+			buf = PSMX3_CTXT_USER(fi_context);
+			data = PSMX3_GET_CQDATA(PSMX3_STATUS_TAG(req));
+			if (PSMX3_HAS_IMM(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req))))
+				flags |= FI_REMOTE_CQ_DATA;
+			err = psmx3_cq_rx_complete(
+					status_data->poll_cq, ep->recv_cq, ep->av,
+					req, op_context, buf, flags, data,
+					entry, status_data->src_addr, &event_saved);
+			if (OFI_UNLIKELY(err))
+				return err;
+		}
+		if (ep->recv_cntr)
+			psmx3_cntr_inc(ep->recv_cntr, PSMX3_STATUS_ERROR(req));
+		break;
+
+	case PSMX3_TRECV_CONTEXT:
+		if (OFI_UNLIKELY(PSMX3_IS_IOV_HEADER(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req))) &&
+				 !psmx3_handle_sendv_req(ep, req, 0))) {
+			return 0;
+		}
+		if (ep->recv_cq) {
+			op_context = fi_context;
+			buf = PSMX3_CTXT_USER(fi_context);
+			data = PSMX3_GET_CQDATA(PSMX3_STATUS_TAG(req));
+			if (PSMX3_HAS_IMM(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req))))
+				flags |= FI_REMOTE_CQ_DATA;
+			err = psmx3_cq_rx_complete(
+					status_data->poll_cq, ep->recv_cq, ep->av,
+					req, op_context, buf, flags, data,
+					entry, status_data->src_addr, &event_saved);
+			if (OFI_UNLIKELY(err))
+				return err;
+		}
+		if (ep->recv_cntr)
+			psmx3_cntr_inc(ep->recv_cntr, PSMX3_STATUS_ERROR(req));
+		break;
+
+	case PSMX3_NOCOMP_RECV_CONTEXT:
+		if (OFI_UNLIKELY(PSMX3_IS_IOV_HEADER(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req))) &&
+				 !psmx3_handle_sendv_req(ep, req, 0))) {
+			PSMX3_EP_PUT_OP_CONTEXT(ep, fi_context);
+			return 0;
+		}
+		PSMX3_EP_PUT_OP_CONTEXT(ep, fi_context);
+		if (OFI_UNLIKELY(ep->recv_cq && PSMX3_STATUS_ERROR(req))) {
+			data = PSMX3_GET_CQDATA(PSMX3_STATUS_TAG(req));
+			if (PSMX3_HAS_IMM(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req))))
+				flags |= FI_REMOTE_CQ_DATA;
+			err = psmx3_cq_rx_complete(
+					status_data->poll_cq, ep->recv_cq, ep->av,
+					req, NULL, NULL, flags, data,
+					entry, status_data->src_addr, &event_saved);
+			if (OFI_UNLIKELY(err))
+				return err;
+		}
+		if (ep->recv_cntr)
+			psmx3_cntr_inc(ep->recv_cntr, PSMX3_STATUS_ERROR(req));
+		break;
+
+	case PSMX3_NOCOMP_TRECV_CONTEXT:
+		if (OFI_UNLIKELY(PSMX3_IS_IOV_HEADER(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req))) &&
+				 !psmx3_handle_sendv_req(ep, req, 0))) {
+			PSMX3_EP_PUT_OP_CONTEXT(ep, fi_context);
+			return 0;
+		}
+		PSMX3_EP_PUT_OP_CONTEXT(ep, fi_context);
+		if (OFI_UNLIKELY(ep->recv_cq && PSMX3_STATUS_ERROR(req))) {
+			data = PSMX3_GET_CQDATA(PSMX3_STATUS_TAG(req));
+			if (PSMX3_HAS_IMM(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req))))
+				flags |= FI_REMOTE_CQ_DATA;
+			err = psmx3_cq_rx_complete(
+					status_data->poll_cq, ep->recv_cq, ep->av,
+					req, NULL, NULL, flags, data,
+					entry, status_data->src_addr, &event_saved);
+			if (OFI_UNLIKELY(err))
+				return err;
+		}
+		if (ep->recv_cntr)
+			psmx3_cntr_inc(ep->recv_cntr, PSMX3_STATUS_ERROR(req));
+		break;
+
+	case PSMX3_WRITE_CONTEXT:
+		am_req = container_of(fi_context, struct psmx3_am_request,
+					  fi_context);
+		op_context = PSMX3_CTXT_USER(fi_context);
+		free(am_req->tmpbuf);
+		psmx3_am_request_free(status_data->trx_ctxt, am_req);
+		if (ep->send_cq) {
+			err = psmx3_cq_tx_complete(
+					status_data->poll_cq, ep->send_cq, ep->av,
+					req, op_context, NULL, flags, 0,
+					entry, &event_saved);
+			if (OFI_UNLIKELY(err))
+				return err;
+		}
+		if (ep->write_cntr)
+			psmx3_cntr_inc(ep->write_cntr, PSMX3_STATUS_ERROR(req));
+		break;
+
+	case PSMX3_NOCOMP_WRITE_CONTEXT:
+		am_req = container_of(fi_context, struct psmx3_am_request,
+					  fi_context);
+		op_context = PSMX3_CTXT_USER(fi_context);
+		free(am_req->tmpbuf);
+		psmx3_am_request_free(status_data->trx_ctxt, am_req);
+		if (OFI_UNLIKELY(ep->send_cq && PSMX3_STATUS_ERROR(req))) {
+			err = psmx3_cq_tx_complete(
+					status_data->poll_cq, ep->send_cq, ep->av,
+					req, op_context, NULL, flags, 0,
+					entry, &event_saved);
+			if (OFI_UNLIKELY(err))
+				return err;
+		}
+		if (ep->write_cntr)
+			psmx3_cntr_inc(ep->write_cntr, PSMX3_STATUS_ERROR(req));
+		break;
+
+	case PSMX3_READ_CONTEXT:
+		am_req = container_of(fi_context, struct psmx3_am_request,
+					  fi_context);
+		if (OFI_UNLIKELY(am_req->op == PSMX3_AM_REQ_READV)) {
+			am_req->read.len_read += PSMX3_STATUS_RCVLEN(req);
+			if (am_req->read.len_read < am_req->read.len) {
+				FI_INFO(&psmx3_prov, FI_LOG_EP_DATA,
+					"readv: long protocol finishes early\n");
+				if (PSMX3_STATUS_ERROR(req))
+					am_req->error = psmx3_errno(PSMX3_STATUS_ERROR(req));
+				/* Request to be freed in AM handler */
+				return 0;
+			}
+		}
+		op_context = PSMX3_CTXT_USER(fi_context);
+		free(am_req->tmpbuf);
+		psmx3_am_request_free(status_data->trx_ctxt, am_req);
+		if (ep->send_cq) {
+			err = psmx3_cq_tx_complete(
+					status_data->poll_cq, ep->send_cq, ep->av,
+					req, op_context, NULL, flags, 0,
+					entry, &event_saved);
+			if (OFI_UNLIKELY(err))
+				return err;
+		}
+		if (ep->read_cntr)
+			psmx3_cntr_inc(ep->read_cntr, PSMX3_STATUS_ERROR(req));
+		break;
+
+	case PSMX3_NOCOMP_READ_CONTEXT:
+		am_req = container_of(fi_context, struct psmx3_am_request,
+					  fi_context);
+		if (OFI_UNLIKELY(am_req->op == PSMX3_AM_REQ_READV)) {
+			am_req->read.len_read += PSMX3_STATUS_RCVLEN(req);
+			if (am_req->read.len_read < am_req->read.len) {
+				FI_INFO(&psmx3_prov, FI_LOG_EP_DATA,
+					"readv: long protocol finishes early\n");
+				if (PSMX3_STATUS_ERROR(req))
+					am_req->error = psmx3_errno(PSMX3_STATUS_ERROR(req));
+				/* Request to be freed in AM handler */
+				return 0;
+			}
+		}
+		op_context = PSMX3_CTXT_USER(fi_context);
+		free(am_req->tmpbuf);
+		psmx3_am_request_free(status_data->trx_ctxt, am_req);
+		if (OFI_UNLIKELY(ep->send_cq && PSMX3_STATUS_ERROR(req))) {
+			err = psmx3_cq_tx_complete(
+					status_data->poll_cq, ep->send_cq, ep->av,
+					req, op_context, NULL, flags, 0,
+					entry, &event_saved);
+			if (OFI_UNLIKELY(err))
+				return err;
+		}
+		if (ep->read_cntr)
+			psmx3_cntr_inc(ep->read_cntr, PSMX3_STATUS_ERROR(req));
+		break;
+
+	case PSMX3_MULTI_RECV_CONTEXT:
+		if (OFI_UNLIKELY(PSMX3_IS_IOV_HEADER(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req))) &&
+			!psmx3_handle_sendv_req(ep, req, 1))) {
+			return 0;
+		}
+		multi_recv_req = PSMX3_CTXT_USER(fi_context);
+		if (ep->recv_cq) {
+			op_context = fi_context;
+			buf = multi_recv_req->buf + multi_recv_req->offset;
+			data = PSMX3_GET_CQDATA(PSMX3_STATUS_TAG(req));
+			if (PSMX3_HAS_IMM(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req))))
+				flags |= FI_REMOTE_CQ_DATA;
+			if (multi_recv_req->offset + PSMX3_STATUS_RCVLEN(req) +
+				multi_recv_req->min_buf_size > multi_recv_req->len)
+				flags |= FI_MULTI_RECV;	/* buffer used up */
+			err = psmx3_cq_rx_complete(
+					status_data->poll_cq, ep->recv_cq, ep->av,
+					req, op_context, buf, flags, data,
+					entry, status_data->src_addr, &event_saved);
+			if (OFI_UNLIKELY(err))
+				return err;
+		}
+		if (ep->recv_cntr)
+			psmx3_cntr_inc(ep->recv_cntr, PSMX3_STATUS_ERROR(req));
+
+		/* repost multi-recv buffer */
+		multi_recv_req->offset += PSMX3_STATUS_RCVLEN(req);
+		len_remaining = multi_recv_req->len - multi_recv_req->offset;
+		if (len_remaining >= multi_recv_req->min_buf_size) {
+			if (len_remaining > PSMX3_MAX_MSG_SIZE)
+				len_remaining = PSMX3_MAX_MSG_SIZE;
+			err = psm2_mq_irecv2(ep->rx->psm2_mq,
+						 multi_recv_req->src_addr, &multi_recv_req->tag,
+						 &multi_recv_req->tagsel, multi_recv_req->flag,
+						 multi_recv_req->buf + multi_recv_req->offset,
+						 len_remaining,
+						 (void *)fi_context, &psm2_req);
+			if (OFI_UNLIKELY(err != PSM2_OK))
+				return psmx3_errno(err);
+			PSMX3_CTXT_REQ(fi_context) = psm2_req;
+		} else {
+			free(multi_recv_req);
+		}
+		break;
+
+	case PSMX3_REMOTE_WRITE_CONTEXT:
+		am_req = container_of(fi_context, struct psmx3_am_request, fi_context);
+		if (am_req->op & PSMX3_AM_FORCE_ACK) {
+			am_req->error = psmx3_errno(PSMX3_STATUS_ERROR(req));
+			psmx3_am_ack_rma(am_req);
+		}
+
+		if (am_req->ep->recv_cq && (am_req->cq_flags & FI_REMOTE_CQ_DATA)) {
+			flags |= FI_REMOTE_CQ_DATA;
+			err = psmx3_cq_rx_complete(
+					status_data->poll_cq, am_req->ep->recv_cq, am_req->ep->av,
+					req, NULL, NULL, flags, am_req->write.data,
+					entry, status_data->src_addr, &event_saved);
+			if (OFI_UNLIKELY(err)) {
+				psmx3_am_request_free(status_data->trx_ctxt, am_req);
+				return err;
+			}
+		}
+
+		if (am_req->ep->caps & FI_RMA_EVENT) {
+			if (am_req->ep->remote_write_cntr)
+				psmx3_cntr_inc(am_req->ep->remote_write_cntr, 0);
+
+			mr = PSMX3_CTXT_USER(fi_context);
+			if (mr->cntr && mr->cntr != am_req->ep->remote_write_cntr)
+				psmx3_cntr_inc(mr->cntr, 0);
+		}
+
+		/* NOTE: am_req->tmpbuf is unused here */
+		psmx3_am_request_free(status_data->trx_ctxt, am_req);
+		break;
+
+	case PSMX3_REMOTE_READ_CONTEXT:
+		am_req = container_of(fi_context, struct psmx3_am_request, fi_context);
+		if (am_req->ep->caps & FI_RMA_EVENT) {
+			if (am_req->ep->remote_read_cntr)
+				psmx3_cntr_inc(am_req->ep->remote_read_cntr, 0);
+		}
+
+		/* NOTE: am_req->tmpbuf is unused here */
+		psmx3_am_request_free(status_data->trx_ctxt, am_req);
+		break;
+
+	case PSMX3_SENDV_CONTEXT:
+		sendv_req = PSMX3_CTXT_USER(fi_context);
+		sendv_req->iov_done++;
+		if (sendv_req->iov_protocol == PSMX3_IOV_PROTO_MULTI &&
+			sendv_req->iov_done < sendv_req->iov_info.count + 1) {
+			sendv_req->tag = PSMX3_STATUS_TAG(req);
+			return 0;
+		}
+		if (ep->send_cq && !sendv_req->no_completion) {
+			op_context = sendv_req->user_context;
+			flags |= sendv_req->comp_flag;
+			err = psmx3_cq_tx_complete(
+					status_data->poll_cq, ep->send_cq, ep->av,
+					req, op_context, NULL, flags, 0,
+					entry, &event_saved);
+			if (OFI_UNLIKELY(err)) {
+				free(sendv_req);
+				return err;
+			}
+		}
+		if (ep->send_cntr)
+			psmx3_cntr_inc(ep->send_cntr, PSMX3_STATUS_ERROR(req));
+		free(sendv_req);
+		break;
+
+	case PSMX3_IOV_SEND_CONTEXT:
+		sendv_req = PSMX3_CTXT_USER(fi_context);
+		sendv_req->iov_done++;
+		if (sendv_req->iov_done < sendv_req->iov_info.count + 1)
+			return 0;
+		PSMX3_STATUS_TAG(req) = sendv_req->tag;
+		if (ep->send_cq && !sendv_req->no_completion) {
+			op_context = sendv_req->user_context;
+			flags |= sendv_req->comp_flag;
+			err = psmx3_cq_tx_complete(
+					status_data->poll_cq, ep->send_cq, ep->av,
+					req, op_context, NULL, flags, 0,
+					entry, &event_saved);
+			if (OFI_UNLIKELY(err)) {
+				free(sendv_req);
+				return err;
+			}
+		}
+		if (ep->send_cntr)
+			psmx3_cntr_inc(ep->send_cntr, PSMX3_STATUS_ERROR(req));
+		free(sendv_req);
+		break;
+
+	case PSMX3_IOV_RECV_CONTEXT:
+		sendv_rep = PSMX3_CTXT_USER(fi_context);
+		sendv_rep->iov_done++;
+		sendv_rep->msg_length += PSMX3_STATUS_SNDLEN(req);
+		sendv_rep->bytes_received += PSMX3_STATUS_RCVLEN(req);
+		if (PSMX3_STATUS_ERROR(req) != PSM2_OK)
+			sendv_rep->error_code = PSMX3_STATUS_ERROR(req);
+		if (sendv_rep->iov_done < sendv_rep->iov_info.count)
+			return 0;
+
+		PSMX3_STATUS_TAG(req) = sendv_rep->tag;
+		PSMX3_STATUS_RCVLEN(req) = sendv_rep->bytes_received;
+		PSMX3_STATUS_SNDLEN(req) = sendv_rep->msg_length;
+		PSMX3_STATUS_ERROR(req) = sendv_rep->error_code;
+
+		if (ep->recv_cq && !sendv_rep->no_completion) {
+			op_context = sendv_rep->user_context;
+			buf = sendv_rep->buf;
+			flags |= sendv_rep->comp_flag;
+			err = psmx3_cq_rx_complete(
+					status_data->poll_cq, ep->recv_cq, ep->av,
+					req, op_context, buf, flags, 0,
+					entry, status_data->src_addr, &event_saved);
+			if (OFI_UNLIKELY(err)) {
+				free(sendv_rep);
+				return err;
+			}
+		}
+		if (ep->recv_cntr)
+			psmx3_cntr_inc(ep->recv_cntr, PSMX3_STATUS_ERROR(req));
+
+		if (sendv_rep->multi_recv) {
+			/* repost the multi-recv buffer */
+			fi_context = sendv_rep->user_context;
+			multi_recv_req = PSMX3_CTXT_USER(fi_context);
+			multi_recv_req->offset += PSMX3_STATUS_RCVLEN(req);
+			len_remaining = multi_recv_req->len - multi_recv_req->offset;
+			if (len_remaining >= multi_recv_req->min_buf_size) {
+				if (len_remaining > PSMX3_MAX_MSG_SIZE)
+					len_remaining = PSMX3_MAX_MSG_SIZE;
+				err = psm2_mq_irecv2(ep->rx->psm2_mq,
+							 multi_recv_req->src_addr, &multi_recv_req->tag,
+							 &multi_recv_req->tagsel, multi_recv_req->flag,
+							 multi_recv_req->buf + multi_recv_req->offset,
+							 len_remaining,
+							 (void *)fi_context, &psm2_req);
+				if (OFI_UNLIKELY(err != PSM2_OK)) {
+					free(sendv_rep);
+					return psmx3_errno(err);
+				}
+				PSMX3_CTXT_REQ(fi_context) = psm2_req;
+			} else {
+				free(multi_recv_req);
+			}
+		}
+
+		free(sendv_rep);
+		break;
+	}
+
+	return event_saved;
+}
+
+int psmx3_cq_poll_mq(struct psmx3_fid_cq *cq,
+		     struct psmx3_trx_ctxt *trx_ctxt,
+		     struct psmx3_cq_event *event_in,
+		     int count, fi_addr_t *src_addr)
+{
+	struct psmx3_status_data status_data;
+
+	/* psm2_mq_ipeek_dequeue_multi needs non-zero count to make progress */
+	if (!count) {
+		event_in = NULL;
+		count = 1;
+	}
+
+	status_data.poll_cq = cq;
+	status_data.event_buffer = event_in;
+	status_data.src_addr = src_addr;
+	status_data.trx_ctxt = trx_ctxt;
+
+	psm2_mq_ipeek_dequeue_multi(trx_ctxt->psm2_mq, &status_data,
+			psmx3_mq_status_copy, &count);
+	return count;
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_cq_readfrom(struct fid_cq *cq, void *buf, size_t count,
+				 fi_addr_t *src_addr)
+{
+	struct psmx3_fid_cq *cq_priv;
+	struct psmx3_cq_event *event;
+	struct psmx3_poll_ctxt *poll_ctxt;
+	struct slist_entry *item, *prev;
+	int ret;
+	ssize_t read_count;
+	fi_addr_t source;
+	int i;
+
+	cq_priv = container_of(cq, struct psmx3_fid_cq, cq);
+
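+	/* Poll the underlying PSM2 MQs first; new completions may be written
+	 * directly into 'buf' by the status-copy callback.
+	 */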
+	if (slist_empty(&cq_priv->event_queue) || !buf) {
+		slist_foreach(&cq_priv->poll_list, item, prev) {
+			poll_ctxt = container_of(item, struct psmx3_poll_ctxt,
+						 list_entry);
+
+			if (OFI_UNLIKELY(!poll_ctxt->trx_ctxt->poll_active))
+				continue;
+
+			ret = psmx3_cq_poll_mq(cq_priv, poll_ctxt->trx_ctxt,
+					       (struct psmx3_cq_event *)buf,
+					       count, src_addr);
+			if (ret > 0)
+				return ret;
+
+			if (poll_ctxt->trx_ctxt->am_progress)
+				psmx3_am_progress(poll_ctxt->trx_ctxt);
+
+			(void) prev; /* suppress compiler warning */
+		}
+	}
+
+	if (OFI_UNLIKELY(cq_priv->pending_error != NULL))
+		return -FI_EAVAIL;
+
+	assert(buf || !count);
+
+	read_count = 0;
+	for (i = 0; i < count; i++) {
+		if (slist_empty(&cq_priv->event_queue))
+			break;
+
+		event = psmx3_cq_dequeue_event(cq_priv);
+		if (event) {
+			if (!event->error) {
+				if (src_addr && event->source_is_valid) {
+					source = psmx3_av_translate_source(
+							event->source_av, event->source,
+							event->source_sep_id);
+					if (source == FI_ADDR_NOTAVAIL) {
+						if (cq_priv->domain->addr_format == FI_ADDR_STR) {
+							event->cqe.err.err_data_size = PSMX3_ERR_DATA_SIZE;
+							psmx3_get_source_string_name(
+								event->source, event->source_sep_id,
+								(void *)&cq_priv->error_data,
+								&event->cqe.err.err_data_size);
+						} else {
+							psmx3_get_source_name(
+								event->source,
+								event->source_sep_id,
+								(void *)&cq_priv->error_data);
+							event->cqe.err.err_data_size = sizeof(struct psmx3_ep_name);
+						}
+						event->cqe.err.err_data = &cq_priv->error_data;
+						event->cqe.err.err = FI_EADDRNOTAVAIL;
+						event->error = !!event->cqe.err.err;
+						cq_priv->pending_error = event;
+						if (!read_count)
+							read_count = -FI_EAVAIL;
+						break;
+					}
+
+					*src_addr = source;
+				}
+
+				memcpy(buf, (void *)&event->cqe, cq_priv->entry_size);
+				psmx3_cq_free_event(cq_priv, event);
+
+				read_count++;
+				buf = (uint8_t *)buf + cq_priv->entry_size;
+				if (src_addr)
+					src_addr++;
+				continue;
+			} else {
+				cq_priv->pending_error = event;
+				if (!read_count)
+					read_count = -FI_EAVAIL;
+				break;
+			}
+		} else {
+			break;
+		}
+	}
+
+	/*
+	 * Return 0 if and only if the input count is 0 and the CQ is not empty.
+	 * This is used by the util poll code to check the poll state.
+	 */
+	if (!read_count && (count || slist_empty(&cq_priv->event_queue)))
+		read_count = -FI_EAGAIN;
+
+	return read_count;
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_cq_read(struct fid_cq *cq, void *buf, size_t count)
+{
+	return psmx3_cq_readfrom(cq, buf, count, NULL);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_cq_readerr(struct fid_cq *cq, struct fi_cq_err_entry *buf,
+				uint64_t flags)
+{
+	struct psmx3_fid_cq *cq_priv;
+	uint32_t api_version;
+	size_t size;
+
+	cq_priv = container_of(cq, struct psmx3_fid_cq, cq);
+
+	cq_priv->domain->cq_lock_fn(&cq_priv->lock, 2);
+	if (cq_priv->pending_error) {
+		api_version = cq_priv->domain->fabric->util_fabric.
+			      fabric_fid.api_version;
+		size = FI_VERSION_GE(api_version, FI_VERSION(1, 5)) ?
+			sizeof(*buf) : sizeof(struct fi_cq_err_entry_1_0);
+
+		memcpy(buf, &cq_priv->pending_error->cqe, size);
+		free(cq_priv->pending_error);
+		cq_priv->pending_error = NULL;
+		cq_priv->domain->cq_unlock_fn(&cq_priv->lock, 2);
+		return 1;
+	}
+	cq_priv->domain->cq_unlock_fn(&cq_priv->lock, 2);
+
+	return -FI_EAGAIN;
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_cq_sreadfrom(struct fid_cq *cq, void *buf, size_t count,
+				  fi_addr_t *src_addr, const void *cond,
+				  int timeout)
+{
+	struct psmx3_fid_cq *cq_priv;
+	struct psmx3_poll_ctxt *poll_ctxt;
+	struct slist_entry *item, *prev;
+	struct timespec ts0, ts;
+	size_t threshold, event_count;
+	int msec_passed = 0;
+	int sth_happened = 0;
+
+	cq_priv = container_of(cq, struct psmx3_fid_cq, cq);
+	if (cq_priv->wait_cond == FI_CQ_COND_THRESHOLD)
+		threshold = (size_t) cond;
+	else
+		threshold = 1;
+
+	/* NOTE: "cond" is only a hint, not a mandatory condition. */
+	event_count = cq_priv->event_count;
+	if (event_count < threshold) {
+		if (cq_priv->wait) {
+			if (ofi_atomic_get32(&cq_priv->signaled)) {
+				ofi_atomic_set32(&cq_priv->signaled, 0);
+				return -FI_ECANCELED;
+			}
+			fi_wait((struct fid_wait *)cq_priv->wait, timeout);
+		} else {
+			clock_gettime(CLOCK_REALTIME, &ts0);
+			while (!sth_happened) {
+				slist_foreach(&cq_priv->poll_list, item, prev) {
+					poll_ctxt = container_of(item,
+								 struct psmx3_poll_ctxt,
+								 list_entry);
+
+					if (OFI_UNLIKELY(!poll_ctxt->trx_ctxt->poll_active))
+						continue;
+
+					sth_happened =
+						psmx3_cq_poll_mq(cq_priv,
+								 poll_ctxt->trx_ctxt,
+								 NULL, 0, NULL);
+					if (sth_happened)
+						break;
+
+					(void) prev; /* suppress compiler warning */
+				}
+
+				/* CQ may be updated asynchronously by the AM handlers */
+				if (cq_priv->event_count > event_count)
+					break;
+
+				if (ofi_atomic_get32(&cq_priv->signaled)) {
+					ofi_atomic_set32(&cq_priv->signaled, 0);
+					return -FI_ECANCELED;
+				}
+
+				if (timeout < 0)
+					continue;
+
+				clock_gettime(CLOCK_REALTIME, &ts);
+				msec_passed = (ts.tv_sec - ts0.tv_sec) * 1000 +
+					       (ts.tv_nsec - ts0.tv_nsec) / 1000000;
+
+				if (msec_passed >= timeout)
+					break;
+			}
+		}
+	}
+
+	return psmx3_cq_readfrom(cq, buf, count, src_addr);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_cq_sread(struct fid_cq *cq, void *buf, size_t count,
+			      const void *cond, int timeout)
+{
+	return psmx3_cq_sreadfrom(cq, buf, count, NULL, cond, timeout);
+}
+
+DIRECT_FN
+STATIC int psmx3_cq_signal(struct fid_cq *cq)
+{
+	struct psmx3_fid_cq *cq_priv;
+	cq_priv = container_of(cq, struct psmx3_fid_cq, cq);
+
+	ofi_atomic_set32(&cq_priv->signaled, 1);
+	if (cq_priv->wait)
+		cq_priv->wait->signal(cq_priv->wait);
+
+	return 0;
+}
+
+DIRECT_FN
+STATIC const char *psmx3_cq_strerror(struct fid_cq *cq, int prov_errno, const void *prov_data,
+				     char *buf, size_t len)
+{
+	return psm2_error_get_string(prov_errno);
+}
+
+static int psmx3_cq_close(fid_t fid)
+{
+	struct psmx3_fid_cq *cq;
+	struct slist_entry *entry;
+	struct psmx3_cq_event *item;
+	struct psmx3_poll_ctxt *poll_item;
+
+	cq = container_of(fid, struct psmx3_fid_cq, cq.fid);
+
+	while (!slist_empty(&cq->poll_list)) {
+		entry = slist_remove_head(&cq->poll_list);
+		poll_item = container_of(entry, struct psmx3_poll_ctxt, list_entry);
+		if (!ofi_atomic_dec32(&poll_item->trx_ctxt->poll_refcnt))
+			free(poll_item->trx_ctxt);
+		free(poll_item);
+	}
+
+	while (!slist_empty(&cq->free_list)) {
+		entry = slist_remove_head(&cq->free_list);
+		item = container_of(entry, struct psmx3_cq_event, list_entry);
+		free(item);
+	}
+
+	while (!slist_empty(&cq->event_queue)) {
+		entry = slist_remove_head(&cq->event_queue);
+		item = container_of(entry, struct psmx3_cq_event, list_entry);
+		free(item);
+	}
+
+	fastlock_destroy(&cq->lock);
+
+	if (cq->wait) {
+		fi_poll_del(&cq->wait->pollset->poll_fid, &cq->cq.fid, 0);
+		if (cq->wait_is_local)
+			fi_close(&cq->wait->wait_fid.fid);
+	}
+
+	psmx3_domain_release(cq->domain);
+	free(cq);
+
+	return 0;
+}
+
+static int psmx3_cq_control(struct fid *fid, int command, void *arg)
+{
+	struct psmx3_fid_cq *cq;
+	int ret = 0;
+
+	cq = container_of(fid, struct psmx3_fid_cq, cq.fid);
+
+	switch (command) {
+	case FI_GETWAIT:
+		if (cq->wait)
+			ret = fi_control(&cq->wait->wait_fid.fid, FI_GETWAIT, arg);
+		else
+			return -FI_EINVAL;
+		break;
+
+	default:
+		return -FI_ENOSYS;
+	}
+
+	return ret;
+}
+
+static struct fi_ops psmx3_fi_ops = {
+	.size = sizeof(struct fi_ops),
+	.close = psmx3_cq_close,
+	.bind = fi_no_bind,
+	.control = psmx3_cq_control,
+	.ops_open = fi_no_ops_open,
+};
+
+static struct fi_ops_cq psmx3_cq_ops = {
+	.size = sizeof(struct fi_ops_cq),
+	.read = psmx3_cq_read,
+	.readfrom = psmx3_cq_readfrom,
+	.readerr = psmx3_cq_readerr,
+	.sread = psmx3_cq_sread,
+	.sreadfrom = psmx3_cq_sreadfrom,
+	.signal = psmx3_cq_signal,
+	.strerror = psmx3_cq_strerror,
+};
+
+DIRECT_FN
+int psmx3_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr,
+		 struct fid_cq **cq, void *context)
+{
+	struct psmx3_fid_domain *domain_priv;
+	struct psmx3_fid_cq *cq_priv;
+	struct fid_wait *wait = NULL;
+	struct psmx3_cq_event *event;
+	struct fi_wait_attr wait_attr;
+	int wait_is_local = 0;
+	int entry_size;
+	int err;
+	int i;
+
+	domain_priv = container_of(domain, struct psmx3_fid_domain,
+				   util_domain.domain_fid);
+	switch (attr->format) {
+	case FI_CQ_FORMAT_UNSPEC:
+		attr->format = FI_CQ_FORMAT_TAGGED;
+		entry_size = sizeof(struct fi_cq_tagged_entry);
+		break;
+
+	case FI_CQ_FORMAT_CONTEXT:
+		entry_size = sizeof(struct fi_cq_entry);
+		break;
+
+	case FI_CQ_FORMAT_MSG:
+		entry_size = sizeof(struct fi_cq_msg_entry);
+		break;
+
+	case FI_CQ_FORMAT_DATA:
+		entry_size = sizeof(struct fi_cq_data_entry);
+		break;
+
+	case FI_CQ_FORMAT_TAGGED:
+		entry_size = sizeof(struct fi_cq_tagged_entry);
+		break;
+
+	default:
+		FI_INFO(&psmx3_prov, FI_LOG_CQ,
+			"attr->format=%d, supported=%d...%d\n", attr->format,
+			FI_CQ_FORMAT_UNSPEC, FI_CQ_FORMAT_TAGGED);
+		return -FI_EINVAL;
+	}
+
+	switch (attr->wait_obj) {
+	case FI_WAIT_NONE:
+		break;
+
+	case FI_WAIT_SET:
+		if (!attr->wait_set) {
+			FI_INFO(&psmx3_prov, FI_LOG_CQ,
+				"FI_WAIT_SET is specified but attr->wait_set is NULL\n");
+			return -FI_EINVAL;
+		}
+		wait = attr->wait_set;
+		break;
+
+	case FI_WAIT_UNSPEC:
+	case FI_WAIT_FD:
+	case FI_WAIT_MUTEX_COND:
+		wait_attr.wait_obj = attr->wait_obj;
+		wait_attr.flags = 0;
+		err = fi_wait_open(&domain_priv->fabric->util_fabric.fabric_fid,
+				   &wait_attr, (struct fid_wait **)&wait);
+		if (err)
+			return err;
+		wait_is_local = 1;
+		break;
+
+	default:
+		FI_INFO(&psmx3_prov, FI_LOG_CQ,
+			"attr->wait_obj=%d, supported=%d...%d\n", attr->wait_obj,
+			FI_WAIT_NONE, FI_WAIT_MUTEX_COND);
+		return -FI_EINVAL;
+	}
+
+	if (wait) {
+		switch (attr->wait_cond) {
+		case FI_CQ_COND_NONE:
+		case FI_CQ_COND_THRESHOLD:
+			break;
+
+		default:
+			FI_INFO(&psmx3_prov, FI_LOG_CQ,
+				"attr->wait_cond=%d, supported=%d...%d\n",
+				attr->wait_cond, FI_CQ_COND_NONE, FI_CQ_COND_THRESHOLD);
+			return -FI_EINVAL;
+		}
+	}
+
+	cq_priv = (struct psmx3_fid_cq *) calloc(1, sizeof *cq_priv);
+	if (!cq_priv) {
+		if (wait && wait_is_local)
+			fi_close(&wait->fid);
+		return -FI_ENOMEM;
+	}
+
+	psmx3_domain_acquire(domain_priv);
+
+	cq_priv->domain = domain_priv;
+	cq_priv->format = attr->format;
+	cq_priv->entry_size = entry_size;
+	if (wait) {
+		cq_priv->wait = container_of(wait, struct util_wait, wait_fid);
+		cq_priv->wait_cond = attr->wait_cond;
+	}
+	cq_priv->wait_is_local = wait_is_local;
+	ofi_atomic_initialize32(&cq_priv->signaled, 0);
+
+	cq_priv->cq.fid.fclass = FI_CLASS_CQ;
+	cq_priv->cq.fid.context = context;
+	cq_priv->cq.fid.ops = &psmx3_fi_ops;
+	cq_priv->cq.ops = &psmx3_cq_ops;
+
+	slist_init(&cq_priv->poll_list);
+	slist_init(&cq_priv->event_queue);
+	slist_init(&cq_priv->free_list);
+	fastlock_init(&cq_priv->lock);
+
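+	/* Pre-populate the free list so common completions don't allocate. */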
+#define PSMX3_FREE_LIST_SIZE	64
+	for (i = 0; i < PSMX3_FREE_LIST_SIZE; i++) {
+		event = calloc(1, sizeof(*event));
+		if (!event) {
+			FI_WARN(&psmx3_prov, FI_LOG_CQ, "out of memory.\n");
+			exit(-1);
+		}
+		slist_insert_tail(&event->list_entry, &cq_priv->free_list);
+	}
+
+	if (wait)
+		fi_poll_add(&cq_priv->wait->pollset->poll_fid, &cq_priv->cq.fid, 0);
+
+	*cq = &cq_priv->cq;
+	return 0;
+}
+
diff --git a/deps/libfabric/prov/psm3/src/psmx3_domain.c b/deps/libfabric/prov/psm3/src/psmx3_domain.c
new file mode 100644
index 0000000000000000000000000000000000000000..f0f187e77f8932b2eb5f229550026073ccb6ba5b
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psmx3_domain.c
@@ -0,0 +1,544 @@
+/*
+ * Copyright (c) 2013-2019 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psmx3.h"
+
+static inline int normalize_core_id(int core_id, int num_cores)
+{
+	if (core_id < 0)
+		core_id += num_cores;
+
+	if (core_id < 0)
+		core_id = 0;
+
+	if (core_id >= num_cores)
+		core_id = num_cores - 1;
+
+	return core_id;
+}
+
+static int psmx3_progress_set_affinity(char *affinity)
+{
+	int num_cores = sysconf(_SC_NPROCESSORS_ONLN);
+	int core_id;
+	cpu_set_t cpuset;
+	char *triplet;
+	int n, start, end, stride;
+	int set_count = 0;
+
+	if (!affinity) {
+		FI_INFO(&psmx3_prov, FI_LOG_CORE,
+			"progress thread affinity not set\n");
+		return 0;
+	}
+
+	CPU_ZERO(&cpuset);
+
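+	/* Parse a comma-separated list of <start>[:<end>[:<stride>]] triplets;
+	 * negative core ids count back from the last core, e.g. "0:3,-2" selects
+	 * cores 0-3 plus the second-to-last core.
+	 */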
+	for (triplet = affinity; triplet; triplet = strchr(triplet, ',')) {
+		if (triplet[0] == ',')
+			triplet++;
+
+		stride = 1;
+		n = sscanf(triplet, "%d:%d:%d", &start, &end, &stride);
+		if (n < 1)
+			continue;
+
+		if (n < 2)
+			end = start;
+
+		if (stride < 1)
+			stride = 1;
+
+		start = normalize_core_id(start, num_cores);
+		end = normalize_core_id(end, num_cores);
+
+		for (core_id = start; core_id <= end; core_id += stride) {
+			CPU_SET(core_id, &cpuset);
+			set_count++;
+		}
+
+		FI_INFO(&psmx3_prov, FI_LOG_CORE,
+			"core set [%d:%d:%d] added to progress thread affinity set\n",
+			start, end, stride);
+	}
+
+	if (set_count)
+		pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
+	else
+		FI_INFO(&psmx3_prov, FI_LOG_CORE,
+			"progress thread affinity not set due to invalid format\n");
+
+	return set_count;
+}
+
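+/*
+ * Body of the optional auto-progress thread: repeatedly call
+ * psmx3_progress_all() on the domain, sleeping between sweeps.
+ */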
+static void *psmx3_progress_func(void *args)
+{
+	struct psmx3_fid_domain *domain = args;
+	int affinity_set;
+	int sleep_usec;
+	struct timespec ts;
+
+	FI_INFO(&psmx3_prov, FI_LOG_CORE, "\n");
+
+	affinity_set = psmx3_progress_set_affinity(psmx3_env.prog_affinity);
+
+	/* A negative sleep time means a default is chosen here.
+	 * If affinity is set, sleep a short time to get better latency.
+	 * If affinity is not set, a short sleep time makes no difference.
+	 */
+	sleep_usec = psmx3_env.prog_interval;
+	if (sleep_usec < 0) {
+		if (affinity_set)
+			sleep_usec = 1;
+		else
+			sleep_usec = 1000;
+	}
+
+	ts.tv_sec = sleep_usec / 1000000;
+	ts.tv_nsec = (sleep_usec % 1000000) * 1000;
+
+	while (1) {
+		psmx3_progress_all(domain);
+		nanosleep(&ts, NULL);
+	}
+
+	return NULL;
+}
+
+static void psmx3_domain_start_progress(struct psmx3_fid_domain *domain)
+{
+	int err;
+
+	err = pthread_create(&domain->progress_thread, NULL,
+			     psmx3_progress_func, (void *)domain);
+	if (err) {
+		domain->progress_thread = pthread_self();
+		FI_INFO(&psmx3_prov, FI_LOG_CORE,
+			"pthread_create returns %d\n", err);
+	} else {
+		FI_INFO(&psmx3_prov, FI_LOG_CORE, "progress thread started\n");
+	}
+}
+
+static void psmx3_domain_stop_progress(struct psmx3_fid_domain *domain)
+{
+	int err;
+	void *exit_code;
+
+	if (!pthread_equal(domain->progress_thread, pthread_self())) {
+		err = pthread_cancel(domain->progress_thread);
+		if (err) {
+			FI_INFO(&psmx3_prov, FI_LOG_CORE,
+				"pthread_cancel returns %d\n", err);
+		}
+		err = pthread_join(domain->progress_thread, &exit_code);
+		if (err) {
+			FI_INFO(&psmx3_prov, FI_LOG_CORE,
+				"pthread_join returns %d\n", err);
+		} else {
+			FI_INFO(&psmx3_prov, FI_LOG_CORE,
+				"progress thread exited with code %ld (%s)\n",
+				(uintptr_t)exit_code,
+				(exit_code == PTHREAD_CANCELED) ?
+					"PTHREAD_CANCELED" : "?");
+		}
+	}
+}
+
+static int psmx3_domain_close(fid_t fid)
+{
+	struct psmx3_fid_domain *domain;
+
+	domain = container_of(fid, struct psmx3_fid_domain,
+			      util_domain.domain_fid.fid);
+
+	FI_INFO(&psmx3_prov, FI_LOG_DOMAIN, "refcnt=%d\n",
+		ofi_atomic_get32(&domain->util_domain.ref));
+
+	if (ofi_domain_close(&domain->util_domain))
+		return 0;
+
+	if (domain->progress_thread_enabled)
+		psmx3_domain_stop_progress(domain);
+
+	fastlock_destroy(&domain->sep_lock);
+	fastlock_destroy(&domain->mr_lock);
+	rbtDelete(domain->mr_map);
+
+	psmx3_lock(&domain->fabric->domain_lock, 1);
+	dlist_remove(&domain->entry);
+	psmx3_unlock(&domain->fabric->domain_lock, 1);
+	psmx3_fabric_release(domain->fabric);
+
+	free(domain);
+	return 0;
+}
+
+DIRECT_FN
+STATIC int psmx3_domain_control(fid_t fid, int command, void *arg)
+{
+	struct fi_mr_map_raw *map;
+
+	switch (command) {
+	case FI_MAP_RAW_MR:
+		map = arg;
+		if (!map || !map->key || !map->raw_key)
+			return -FI_EINVAL;
+		*(uint64_t *)map->key = *(uint64_t *)map->raw_key;
+		break;
+
+	case FI_UNMAP_KEY:
+		/* Nothing to do here */
+		break;
+
+	default:
+		return -FI_ENOSYS;
+	}
+
+	return 0;
+}
+
+static struct fi_ops psmx3_fi_ops = {
+	.size = sizeof(struct fi_ops),
+	.close = psmx3_domain_close,
+	.bind = fi_no_bind,
+	.control = psmx3_domain_control,
+	.ops_open = fi_no_ops_open,
+};
+
+static struct fi_ops_domain psmx3_domain_ops = {
+	.size = sizeof(struct fi_ops_domain),
+	.av_open = psmx3_av_open,
+	.cq_open = psmx3_cq_open,
+	.endpoint = psmx3_ep_open,
+	.scalable_ep = psmx3_sep_open,
+	.cntr_open = psmx3_cntr_open,
+	.poll_open = fi_poll_create,
+	.stx_ctx = psmx3_stx_ctx,
+	.srx_ctx = fi_no_srx_context,
+	.query_atomic = psmx3_query_atomic,
+	.query_collective = fi_no_query_collective,
+};
+
+static int psmx3_key_compare(void *key1, void *key2)
+{
+	return (key1 < key2) ? -1 : (key1 > key2);
+}
+
+static int psmx3_domain_init(struct psmx3_fid_domain *domain,
+			     struct psmx3_ep_name *src_addr)
+{
+	int err;
+
+	err = fastlock_init(&domain->mr_lock);
+	if (err) {
+		FI_WARN(&psmx3_prov, FI_LOG_CORE,
+			"fastlock_init(mr_lock) returns %d\n", err);
+		goto err_out;
+	}
+
+	domain->mr_map = rbtNew(&psmx3_key_compare);
+	if (!domain->mr_map) {
+		FI_WARN(&psmx3_prov, FI_LOG_CORE,
+			"rbtNew failed\n");
+		goto err_out_destroy_mr_lock;
+	}
+
+	domain->mr_reserved_key = 1;
+	domain->max_atomic_size = INT_MAX;
+
+	ofi_atomic_initialize32(&domain->sep_cnt, 0);
+	fastlock_init(&domain->sep_lock);
+	dlist_init(&domain->sep_list);
+	dlist_init(&domain->trx_ctxt_list);
+	fastlock_init(&domain->trx_ctxt_lock);
+
+	if (domain->progress_thread_enabled)
+		psmx3_domain_start_progress(domain);
+
+	return 0;
+
+err_out_destroy_mr_lock:
+	fastlock_destroy(&domain->mr_lock);
+
+err_out:
+	return err;
+}
+
+DIRECT_FN
+int psmx3_domain_open(struct fid_fabric *fabric, struct fi_info *info,
+		      struct fid_domain **domain, void *context)
+{
+	struct psmx3_fid_fabric *fabric_priv;
+	struct psmx3_fid_domain *domain_priv;
+	struct psmx3_ep_name *src_addr = info->src_addr;
+	int mr_mode = (info->domain_attr->mr_mode & FI_MR_BASIC) ? FI_MR_BASIC : 0;
+	int err, tmp;
+
+	FI_INFO(&psmx3_prov, FI_LOG_DOMAIN, "\n");
+
+	fabric_priv = container_of(fabric, struct psmx3_fid_fabric,
+				   util_fabric.fabric_fid);
+
+#if 0
+	if (!info->domain_attr->name ||
+	    strncmp(info->domain_attr->name, PSMX3_DOMAIN_NAME, strlen(PSMX3_DOMAIN_NAME))) {
+		err = -FI_EINVAL;
+		goto err_out;
+	}
+#endif /* 0 */
+
+	domain_priv = (struct psmx3_fid_domain *) calloc(1, sizeof *domain_priv);
+	if (!domain_priv) {
+		err = -FI_ENOMEM;
+		goto err_out;
+	}
+
+	err = ofi_domain_init(fabric, info, &domain_priv->util_domain, context);
+	if (err)
+		goto err_out_free_domain;
+
+	/* fclass & context are set in ofi_domain_init */
+	domain_priv->util_domain.domain_fid.fid.ops = &psmx3_fi_ops;
+	domain_priv->util_domain.domain_fid.ops = &psmx3_domain_ops;
+	domain_priv->util_domain.domain_fid.mr = &psmx3_mr_ops;
+	domain_priv->mr_mode = mr_mode;
+	domain_priv->mode = info->mode;
+	domain_priv->caps = info->caps;
+	domain_priv->fabric = fabric_priv;
+	domain_priv->progress_thread_enabled =
+		(info->domain_attr->data_progress == FI_PROGRESS_AUTO);
+	domain_priv->addr_format = info->addr_format;
+
+	if (info->addr_format == FI_ADDR_STR)
+		src_addr = psmx3_string_to_ep_name(info->src_addr);
+
+	/* Use generic lock/unlock functions by default */
+	domain_priv->av_lock_fn = psmx3_lock;
+	domain_priv->am_req_pool_lock_fn = psmx3_lock;
+	domain_priv->trx_ctxt_lock_fn = psmx3_lock;
+	domain_priv->rma_queue_lock_fn = psmx3_lock;
+	domain_priv->trigger_queue_lock_fn = psmx3_lock;
+	domain_priv->peer_lock_fn = psmx3_lock;
+	domain_priv->sep_lock_fn = psmx3_lock;
+	domain_priv->trigger_lock_fn = psmx3_lock;
+	domain_priv->cq_lock_fn = psmx3_lock;
+	domain_priv->mr_lock_fn = psmx3_lock;
+	domain_priv->context_lock_fn = psmx3_lock;
+	domain_priv->poll_trylock_fn = psmx3_trylock;
+
+	domain_priv->av_unlock_fn = psmx3_unlock;
+	domain_priv->am_req_pool_unlock_fn = psmx3_unlock;
+	domain_priv->trx_ctxt_unlock_fn = psmx3_unlock;
+	domain_priv->rma_queue_unlock_fn = psmx3_unlock;
+	domain_priv->trigger_queue_unlock_fn = psmx3_unlock;
+	domain_priv->peer_unlock_fn = psmx3_unlock;
+	domain_priv->sep_unlock_fn = psmx3_unlock;
+	domain_priv->trigger_unlock_fn = psmx3_unlock;
+	domain_priv->cq_unlock_fn = psmx3_unlock;
+	domain_priv->mr_unlock_fn = psmx3_unlock;
+	domain_priv->context_unlock_fn = psmx3_unlock;
+	domain_priv->poll_unlock_fn = psmx3_unlock;
+
+	/* If the lock_level env variable is unset, set locks based on the threading model */
+	err = fi_param_get_bool(&psmx3_prov, "lock_level", &tmp);
+	if (err < 0) {
+		switch (info->domain_attr->threading) {
+		case FI_THREAD_DOMAIN:
+			/* Disable locks not required when serializing access to a domain */
+			domain_priv->av_lock_fn = psmx3_lock_disabled;
+			domain_priv->trx_ctxt_lock_fn = psmx3_lock_disabled;
+			domain_priv->trigger_queue_lock_fn = psmx3_lock_disabled;
+			domain_priv->sep_lock_fn = psmx3_lock_disabled;
+			domain_priv->trigger_lock_fn = psmx3_lock_disabled;
+			domain_priv->cq_lock_fn = psmx3_lock_disabled;
+			domain_priv->mr_lock_fn = psmx3_lock_disabled;
+			domain_priv->context_lock_fn = psmx3_lock_disabled;
+			domain_priv->poll_trylock_fn = psmx3_trylock_disabled;
+
+			domain_priv->av_unlock_fn = psmx3_lock_disabled;
+			domain_priv->trx_ctxt_unlock_fn = psmx3_lock_disabled;
+			domain_priv->trigger_queue_unlock_fn = psmx3_lock_disabled;
+			domain_priv->sep_unlock_fn = psmx3_lock_disabled;
+			domain_priv->trigger_unlock_fn = psmx3_lock_disabled;
+			domain_priv->cq_unlock_fn = psmx3_lock_disabled;
+			domain_priv->mr_unlock_fn = psmx3_lock_disabled;
+			domain_priv->context_unlock_fn = psmx3_lock_disabled;
+			domain_priv->poll_unlock_fn = psmx3_lock_disabled;
+
+			/* Enable lock accessed by the disconnection thread */
+			domain_priv->peer_lock_fn = psmx3_lock_enabled;
+			domain_priv->peer_unlock_fn = psmx3_unlock_enabled;
+
+			/*
+			 * If FI_RMA or FI_ATOMIC caps are enabled, then locks are
+			 * required for the CQ, am_req_pool, & rma_queue
+			 * due to the PSM2 Recv thread.
+			 * NOTE: am_req_pool & rma_queue are only used when FI_RMA
+			 * and FI_ATOMIC capabilities are enabled.
+			 */
+			if ((info->caps & FI_RMA) || (info->caps & FI_ATOMIC)) {
+				domain_priv->cq_lock_fn = psmx3_lock_enabled;
+				domain_priv->am_req_pool_lock_fn = psmx3_lock_enabled;
+				domain_priv->rma_queue_lock_fn = psmx3_lock_enabled;
+				domain_priv->cq_unlock_fn = psmx3_unlock_enabled;
+				domain_priv->am_req_pool_unlock_fn = psmx3_unlock_enabled;
+				domain_priv->rma_queue_unlock_fn = psmx3_unlock_enabled;
+			}
+
+			/*
+			 * Locks accessed by the progress thread are required because
+			 * they are outside the scope of domain access serialization
+			 * implied by FI_THREAD_DOMAIN.
+			 */
+			if (domain_priv->progress_thread_enabled) {
+				domain_priv->trx_ctxt_lock_fn = psmx3_lock_enabled;
+				domain_priv->poll_trylock_fn = psmx3_trylock_enabled;
+				domain_priv->cq_lock_fn = psmx3_lock_enabled;
+				domain_priv->trx_ctxt_unlock_fn = psmx3_unlock_enabled;
+				domain_priv->poll_unlock_fn = psmx3_unlock_enabled;
+				domain_priv->cq_unlock_fn = psmx3_unlock_enabled;
+				if (info->caps & FI_TRIGGER) {
+					domain_priv->trigger_queue_lock_fn = psmx3_lock_enabled;
+					domain_priv->trigger_lock_fn = psmx3_lock_enabled;
+					domain_priv->av_lock_fn = psmx3_lock_enabled;
+					domain_priv->mr_lock_fn = psmx3_lock_enabled;
+					domain_priv->context_lock_fn = psmx3_lock_enabled;
+					domain_priv->trigger_queue_unlock_fn = psmx3_unlock_enabled;
+					domain_priv->trigger_unlock_fn = psmx3_unlock_enabled;
+					domain_priv->av_unlock_fn = psmx3_unlock_enabled;
+					domain_priv->mr_unlock_fn = psmx3_unlock_enabled;
+					domain_priv->context_unlock_fn = psmx3_unlock_enabled;
+				}
+			}
+			break;
+		default:
+			/* Otherwise, enable all locks */
+			domain_priv->av_lock_fn = psmx3_lock_enabled;
+			domain_priv->am_req_pool_lock_fn = psmx3_lock_enabled;
+			domain_priv->trx_ctxt_lock_fn = psmx3_lock_enabled;
+			domain_priv->rma_queue_lock_fn = psmx3_lock_enabled;
+			domain_priv->trigger_queue_lock_fn = psmx3_lock_enabled;
+			domain_priv->peer_lock_fn = psmx3_lock_enabled;
+			domain_priv->sep_lock_fn = psmx3_lock_enabled;
+			domain_priv->trigger_lock_fn = psmx3_lock_enabled;
+			domain_priv->cq_lock_fn = psmx3_lock_enabled;
+			domain_priv->mr_lock_fn = psmx3_lock_enabled;
+			domain_priv->context_lock_fn = psmx3_lock_enabled;
+			domain_priv->poll_trylock_fn = psmx3_trylock_enabled;
+
+			domain_priv->av_unlock_fn = psmx3_unlock_enabled;
+			domain_priv->am_req_pool_unlock_fn = psmx3_unlock_enabled;
+			domain_priv->trx_ctxt_unlock_fn = psmx3_unlock_enabled;
+			domain_priv->rma_queue_unlock_fn = psmx3_unlock_enabled;
+			domain_priv->trigger_queue_unlock_fn = psmx3_unlock_enabled;
+			domain_priv->peer_unlock_fn = psmx3_unlock_enabled;
+			domain_priv->sep_unlock_fn = psmx3_unlock_enabled;
+			domain_priv->trigger_unlock_fn = psmx3_unlock_enabled;
+			domain_priv->cq_unlock_fn = psmx3_unlock_enabled;
+			domain_priv->mr_unlock_fn = psmx3_unlock_enabled;
+			domain_priv->context_unlock_fn = psmx3_unlock_enabled;
+			domain_priv->poll_unlock_fn = psmx3_unlock_enabled;
+			break;
+		}
+	}
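+
+	/*
+	 * Summary (added for clarity): when the lock_level variable is unset
+	 * (exposed by libfabric as FI_PSM3_LOCK_LEVEL), FI_THREAD_DOMAIN
+	 * disables every lock except the ones that other threads (the PSM3
+	 * receive thread, the progress thread, and the disconnection thread)
+	 * may still touch; any other threading model keeps all locks enabled.
+	 */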
+
+	err = psmx3_domain_init(domain_priv, src_addr);
+	if (info->addr_format == FI_ADDR_STR)
+		free(src_addr);
+	if (err)
+		goto err_out_close_domain;
+
+	psmx3_fabric_acquire(fabric_priv);
+	psmx3_lock(&fabric_priv->domain_lock, 1);
+	dlist_insert_before(&domain_priv->entry, &fabric_priv->domain_list);
+	psmx3_unlock(&fabric_priv->domain_lock, 1);
+
+	psmx3_init_tag_layout(info);
+
+	*domain = &domain_priv->util_domain.domain_fid;
+	return 0;
+
+err_out_close_domain:
+	ofi_domain_close(&domain_priv->util_domain);
+
+err_out_free_domain:
+	free(domain_priv);
+
+err_out:
+	return err;
+}
+
+static int psmx3_domain_check_features(struct psmx3_fid_domain *domain,
+				       uint64_t ep_caps)
+{
+	uint64_t domain_caps = domain->caps & ~PSMX3_SUB_CAPS;
+
+	ep_caps &= ~PSMX3_SUB_CAPS;
+
+	if ((domain_caps & ep_caps) != ep_caps) {
+		FI_INFO(&psmx3_prov, FI_LOG_CORE,
+			"caps mismatch: domain_caps=%s;\n",
+			fi_tostr(&domain_caps, FI_TYPE_CAPS));
+
+		FI_INFO(&psmx3_prov, FI_LOG_CORE,
+			"caps mismatch: ep_caps=%s.\n",
+			fi_tostr(&ep_caps, FI_TYPE_CAPS));
+
+		return -FI_EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+int psmx3_domain_enable_ep(struct psmx3_fid_domain *domain,
+			   struct psmx3_fid_ep *ep)
+{
+	int err;
+
+	err = psmx3_domain_check_features(domain, ep->caps);
+	if (err)
+		return err;
+
+	if ((ep->caps & FI_RMA) || (ep->caps & FI_ATOMICS)) {
+		if (ep->tx) {
+			err = psmx3_am_init(ep->tx);
+			if (err)
+				return err;
+		}
+		if (ep->rx && ep->rx != ep->tx)
+			return psmx3_am_init(ep->rx);
+	}
+
+	return 0;
+}
+
diff --git a/deps/libfabric/prov/psm3/src/psmx3_ep.c b/deps/libfabric/prov/psm3/src/psmx3_ep.c
new file mode 100644
index 0000000000000000000000000000000000000000..d196abfd1c6c21af5a737e8674cdd492e5c127a8
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psmx3_ep.c
@@ -0,0 +1,1108 @@
+/*
+ * Copyright (c) 2013-2019 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psmx3.h"
+
+#define PSMX3_EP_SET_TAGGED_OPS(suffix, msg_suffix)				\
+	do {									\
+		if (!send_completion && !recv_completion) {			\
+			ep->ep.tagged = &psmx3_tagged_ops_no_event##suffix;	\
+			FI_INFO(&psmx3_prov, FI_LOG_EP_DATA,			\
+				"tagged ops optimized for op_flags=0 "		\
+				"and event suppression "			\
+				msg_suffix					\
+				"\n");						\
+		} else if (!send_completion) {					\
+			ep->ep.tagged = &psmx3_tagged_ops_no_send_event##suffix;\
+			FI_INFO(&psmx3_prov, FI_LOG_EP_DATA,			\
+				"tagged ops optimized for op_flags=0 "		\
+				"and send event suppression "			\
+				msg_suffix					\
+				"\n");						\
+		} else if (!recv_completion) {					\
+			ep->ep.tagged = &psmx3_tagged_ops_no_recv_event##suffix;\
+			FI_INFO(&psmx3_prov, FI_LOG_EP_DATA,			\
+				"tagged ops optimized for op_flags=0 "		\
+				"and recv event suppression "			\
+				msg_suffix					\
+				"\n");						\
+		} else {							\
+			ep->ep.tagged = &psmx3_tagged_ops_no_flag##suffix;	\
+			FI_INFO(&psmx3_prov, FI_LOG_EP_DATA,			\
+				"tagged ops optimized for op_flags=0 "		\
+				msg_suffix					\
+				"\n");						\
+		}								\
+	} while (0)
+
+static void psmx3_ep_optimize_ops(struct psmx3_fid_ep *ep)
+{
+	int send_completion;
+	int recv_completion;
+	uint64_t mask;
+
+	mask = PSMX3_OP_FLAGS &
+	       ~(FI_INJECT_COMPLETE | FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE);
+
+	if (ep->ep.tagged) {
+		if (ep->tx_flags & mask & ~FI_COMPLETION || ep->rx_flags & mask & ~FI_COMPLETION) {
+			ep->ep.tagged = &psmx3_tagged_ops;
+			FI_INFO(&psmx3_prov, FI_LOG_EP_DATA,
+				"generic tagged ops.\n");
+		} else {
+			send_completion = !ep->send_selective_completion || ep->tx_flags & FI_COMPLETION;
+			recv_completion = !ep->recv_selective_completion || ep->rx_flags & FI_COMPLETION;
+
+			if (ep->av && ep->av->type == FI_AV_MAP) {
+				if (ep->caps & FI_DIRECTED_RECV)
+					PSMX3_EP_SET_TAGGED_OPS(_directed_av_map, "and directed receive and av map");
+				else
+					PSMX3_EP_SET_TAGGED_OPS(_undirected_av_map, "and av map");
+			} else {
+				if (ep->caps & FI_DIRECTED_RECV)
+					PSMX3_EP_SET_TAGGED_OPS(_directed, "and directed receive");
+				else
+					PSMX3_EP_SET_TAGGED_OPS(_undirected, "");
+			}
+		}
+	}
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_ep_cancel(fid_t fid, void *context)
+{
+	struct psmx3_fid_ep *ep;
+	psm2_mq_status2_t status;
+	struct fi_context *fi_context = context;
+	uint64_t flags;
+	struct psmx3_cq_event *event;
+	int err;
+
+	ep = container_of(fid, struct psmx3_fid_ep, ep.fid);
+	assert(ep->domain);
+	assert(fi_context);
+
+	switch (PSMX3_CTXT_TYPE(fi_context)) {
+	case PSMX3_TRECV_CONTEXT:
+		flags = FI_RECV | FI_TAGGED;
+		break;
+	case PSMX3_RECV_CONTEXT:
+	case PSMX3_MULTI_RECV_CONTEXT:
+		flags = FI_RECV | FI_MSG;
+		break;
+	default:
+		return  -FI_EOPNOTSUPP;
+	}
+
+	err = psm2_mq_cancel((psm2_mq_req_t *)&PSMX3_CTXT_REQ(fi_context));
+	if (err == PSM2_OK) {
+		err = psm2_mq_test2((psm2_mq_req_t *)&PSMX3_CTXT_REQ(fi_context), &status);
+		if (err == PSM2_OK && ep->recv_cq) {
+			event = psmx3_cq_create_event(
+					ep->recv_cq,
+					status.context,
+					NULL,	/* buf */
+					flags,
+					0,	/* len */
+					0,	/* data */
+					0,	/* tag */
+					0	/* olen */,
+					-FI_ECANCELED);
+			if (event)
+				psmx3_cq_enqueue_event(ep->recv_cq, event);
+			else
+				return -FI_ENOMEM;
+		}
+	}
+
+	return psmx3_errno(err);
+}
+
+DIRECT_FN
+STATIC int psmx3_ep_getopt(fid_t fid, int level, int optname,
+			   void *optval, size_t *optlen)
+{
+	struct psmx3_fid_ep *ep;
+
+	ep = container_of(fid, struct psmx3_fid_ep, ep.fid);
+
+	if (level != FI_OPT_ENDPOINT)
+		return -FI_ENOPROTOOPT;
+
+	switch (optname) {
+	case FI_OPT_MIN_MULTI_RECV:
+		*(size_t *)optval = ep->min_multi_recv;
+		*optlen = sizeof(size_t);
+		break;
+
+	default:
+		return -FI_ENOPROTOOPT;
+	}
+
+	return 0;
+}
+
+DIRECT_FN
+STATIC int psmx3_ep_setopt(fid_t fid, int level, int optname,
+			   const void *optval, size_t optlen)
+{
+	struct psmx3_fid_ep *ep;
+
+	ep = container_of(fid, struct psmx3_fid_ep, ep.fid);
+
+	if (level != FI_OPT_ENDPOINT)
+		return -FI_ENOPROTOOPT;
+
+	switch (optname) {
+	case FI_OPT_MIN_MULTI_RECV:
+		ep->min_multi_recv = *(size_t *)optval;
+		break;
+
+	default:
+		return -FI_ENOPROTOOPT;
+	}
+
+	return 0;
+}
+
+static void psmx3_ep_close_internal(struct psmx3_fid_ep *ep)
+{
+	psmx3_domain_release(ep->domain);
+	PSMX3_EP_FINI_OP_CONTEXT(ep);
+	free(ep);
+}
+
+static int psmx3_ep_close(fid_t fid)
+{
+	struct psmx3_fid_ep *ep;
+	struct psmx3_ep_name ep_name;
+	int usage_flags = 0;
+
+	ep = container_of(fid, struct psmx3_fid_ep, ep.fid);
+
+	if (ep->base_ep) {
+		ofi_atomic_dec32(&ep->base_ep->ref);
+		return 0;
+	}
+
+	if (ofi_atomic_get32(&ep->ref))
+		return -FI_EBUSY;
+
+	if (ep->stx)
+		ofi_atomic_dec32(&ep->stx->ref);
+
+	if (ep->tx && !ep->stx)
+		usage_flags |= PSMX3_TX;
+
+	if (ep->rx) {
+		usage_flags |= PSMX3_RX;
+		ep_name.epid = ep->rx->psm2_epid;
+
+		ofi_ns_del_local_name(&ep->domain->fabric->name_server,
+				      &ep->service, &ep_name);
+	}
+
+	psmx3_trx_ctxt_free(ep->rx, usage_flags);
+	psmx3_ep_close_internal(ep);
+	return 0;
+}
+
+static int psmx3_poll_ctxt_match(struct slist_entry *entry, const void *arg)
+{
+	struct psmx3_poll_ctxt *poll_ctxt;
+
+	poll_ctxt = container_of(entry, struct psmx3_poll_ctxt, list_entry);
+	return (poll_ctxt->trx_ctxt == arg);
+}
+
+static int psmx3_add_poll_ctxt(struct slist *list, struct psmx3_trx_ctxt *trx_ctxt)
+{
+	struct psmx3_poll_ctxt *item;
+
+	if (!trx_ctxt)
+		return 0;
+
+	if (!slist_empty(list) &&
+	    slist_find_first_match(list, psmx3_poll_ctxt_match, trx_ctxt))
+		return 0;
+
+	item = calloc(1, sizeof(*item));
+	if (!item)
+		return -FI_ENOMEM;
+
+	ofi_atomic_inc32(&trx_ctxt->poll_refcnt);
+	item->trx_ctxt = trx_ctxt;
+	slist_insert_tail(&item->list_entry, list);
+	return 0;
+}
+
+DIRECT_FN
+STATIC int psmx3_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
+{
+	struct psmx3_fid_ep *ep;
+	struct psmx3_fid_av *av;
+	struct psmx3_fid_cq *cq;
+	struct psmx3_fid_cntr *cntr;
+	struct psmx3_fid_stx *stx;
+	int err;
+
+	ep = container_of(fid, struct psmx3_fid_ep, ep.fid);
+	err = ofi_ep_bind_valid(&psmx3_prov, bfid, flags);
+	if (err)
+		return err;
+
+	switch (bfid->fclass) {
+	case FI_CLASS_EQ:
+		return -FI_ENOSYS;
+
+	case FI_CLASS_CQ:
+		cq = container_of(bfid, struct psmx3_fid_cq, cq.fid);
+		if (ep->domain != cq->domain)
+			return -FI_EINVAL;
+		if (flags & FI_SEND) {
+			err = psmx3_add_poll_ctxt(&cq->poll_list, ep->tx);
+			if (err)
+				return err;
+			ep->send_cq = cq;
+			if (flags & FI_SELECTIVE_COMPLETION)
+				ep->send_selective_completion = 1;
+		}
+		if (flags & FI_RECV) {
+			err = psmx3_add_poll_ctxt(&cq->poll_list, ep->rx);
+			if (err)
+				return err;
+			ep->recv_cq = cq;
+			if (flags & FI_SELECTIVE_COMPLETION)
+				ep->recv_selective_completion = 1;
+		}
+		psmx3_ep_optimize_ops(ep);
+		break;
+
+	case FI_CLASS_CNTR:
+		cntr = container_of(bfid, struct psmx3_fid_cntr, cntr.fid);
+		if (ep->domain != cntr->domain)
+			return -FI_EINVAL;
+		if (flags & (FI_SEND | FI_WRITE | FI_READ)) {
+			err = psmx3_add_poll_ctxt(&cntr->poll_list, ep->tx);
+			if (err)
+				return err;
+		}
+		if (flags & (FI_RECV | FI_REMOTE_WRITE | FI_REMOTE_READ)) {
+			err = psmx3_add_poll_ctxt(&cntr->poll_list, ep->rx);
+			if (err)
+				return err;
+		}
+		if (flags & FI_SEND)
+			ep->send_cntr = cntr;
+		if (flags & FI_RECV)
+			ep->recv_cntr = cntr;
+		if (flags & FI_WRITE)
+			ep->write_cntr = cntr;
+		if (flags & FI_READ)
+			ep->read_cntr = cntr;
+		if (flags & FI_REMOTE_WRITE)
+			ep->remote_write_cntr = cntr;
+		if (flags & FI_REMOTE_READ)
+			ep->remote_read_cntr = cntr;
+		break;
+
+	case FI_CLASS_AV:
+		av = container_of(bfid,
+				struct psmx3_fid_av, av.fid);
+		if (ep->domain != av->domain)
+			return -FI_EINVAL;
+		ep->av = av;
+		psmx3_ep_optimize_ops(ep);
+		if (ep->tx)
+			psmx3_av_add_trx_ctxt(av, ep->tx);
+		if (ep->rx && ep->rx != ep->tx)
+			psmx3_av_add_trx_ctxt(av, ep->rx);
+		break;
+
+	case FI_CLASS_MR:
+		if (!bfid->ops || !bfid->ops->bind)
+			return -FI_EINVAL;
+		err = bfid->ops->bind(bfid, fid, flags);
+		if (err)
+			return err;
+		break;
+
+	case FI_CLASS_STX_CTX:
+		stx = container_of(bfid, struct psmx3_fid_stx, stx.fid);
+		if (ep->domain != stx->domain)
+			return -FI_EINVAL;
+		if (ep->tx || ep->stx)
+			return -FI_EINVAL;
+		ep->tx = stx->tx;
+		ep->stx = stx;
+		err = psmx3_domain_enable_ep(ep->domain, ep);
+		if (err)
+			return err;
+		if (ep->caps & FI_TRIGGER)
+			stx->tx->am_progress = 1;
+		ofi_atomic_inc32(&stx->ref);
+		break;
+
+	default:
+		return -FI_ENOSYS;
+	}
+
+	return 0;
+}
+
+static inline int psmx3_ep_set_flags(struct psmx3_fid_ep *ep, uint64_t flags)
+{
+	uint64_t real_flags = flags & ~(FI_TRANSMIT | FI_RECV);
+
+	if ((flags & FI_TRANSMIT) && (flags & FI_RECV))
+		return -EINVAL;
+	else if (flags & FI_TRANSMIT)
+		ep->tx_flags = real_flags;
+	else if (flags & FI_RECV)
+		ep->rx_flags = real_flags;
+
+	/* otherwise ok to leave the flags intact */
+
+	return 0;
+}
+
+static inline int psmx3_ep_get_flags(struct psmx3_fid_ep *ep, uint64_t *flags)
+{
+	uint64_t flags_in = *flags;
+
+	if ((flags_in & FI_TRANSMIT) && (flags_in & FI_RECV))
+		return -EINVAL;
+	else if (flags_in & FI_TRANSMIT)
+		*flags = ep->tx_flags;
+	else if (flags_in & FI_RECV)
+		*flags = ep->rx_flags;
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+DIRECT_FN
+STATIC int psmx3_ep_control(fid_t fid, int command, void *arg)
+{
+	struct fi_alias *alias;
+	struct psmx3_fid_ep *ep, *new_ep;
+	int err;
+
+	ep = container_of(fid, struct psmx3_fid_ep, ep.fid);
+
+	switch (command) {
+	case FI_ALIAS:
+		new_ep = (struct psmx3_fid_ep *) calloc(1, sizeof *ep);
+		if (!new_ep)
+			return -FI_ENOMEM;
+		alias = arg;
+		*new_ep = *ep;
+		err = psmx3_ep_set_flags(new_ep, alias->flags);
+		if (err) {
+			free(new_ep);
+			return err;
+		}
+		new_ep->base_ep = ep;
+		ofi_atomic_inc32(&ep->ref);
+		psmx3_ep_optimize_ops(new_ep);
+		*alias->fid = &new_ep->ep.fid;
+		break;
+
+	case FI_SETOPSFLAG:
+		err = psmx3_ep_set_flags(ep, *(uint64_t *)arg);
+		if (err)
+			return err;
+		psmx3_ep_optimize_ops(ep);
+		break;
+
+	case FI_GETOPSFLAG:
+		if (!arg)
+			return -FI_EINVAL;
+		err = psmx3_ep_get_flags(ep, arg);
+		if (err)
+			return err;
+		break;
+
+	case FI_ENABLE:
+		ep->enabled = 1;
+		return 0;
+
+	default:
+		return -FI_ENOSYS;
+	}
+
+	return 0;
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_rx_size_left(struct fid_ep *ep)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+	if (ep_priv->enabled)
+		return 0x7fffffff;
+	else
+		return -FI_EOPBADSTATE;
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_tx_size_left(struct fid_ep *ep)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+	if (ep_priv->enabled)
+		return 0x7fffffff;
+	else
+		return -FI_EOPBADSTATE;
+}
+
+static struct fi_ops psmx3_fi_ops = {
+	.size = sizeof(struct fi_ops),
+	.close = psmx3_ep_close,
+	.bind = psmx3_ep_bind,
+	.control = psmx3_ep_control,
+	.ops_open = fi_no_ops_open,
+};
+
+static struct fi_ops_ep psmx3_ep_ops = {
+	.size = sizeof(struct fi_ops_ep),
+	.cancel = psmx3_ep_cancel,
+	.getopt = psmx3_ep_getopt,
+	.setopt = psmx3_ep_setopt,
+	.tx_ctx = fi_no_tx_ctx,
+	.rx_ctx = fi_no_rx_ctx,
+	.rx_size_left = psmx3_rx_size_left,
+	.tx_size_left = psmx3_tx_size_left,
+};
+
+int psmx3_ep_open_internal(struct psmx3_fid_domain *domain_priv,
+			   struct fi_info *info,
+			   struct psmx3_fid_ep **ep_out, void *context,
+			   struct psmx3_trx_ctxt *trx_ctxt,
+			   int usage_flags)
+{
+	struct psmx3_fid_ep *ep_priv;
+	uint64_t ep_cap;
+	int err = -FI_EINVAL;
+
+	if (info)
+		ep_cap = info->caps;
+	else
+		ep_cap = FI_TAGGED;
+
+	ep_priv = (struct psmx3_fid_ep *) calloc(1, sizeof *ep_priv);
+	if (!ep_priv) {
+		err = -FI_ENOMEM;
+		goto errout;
+	}
+
+	ep_priv->ep.fid.fclass = FI_CLASS_EP;
+	ep_priv->ep.fid.context = context;
+	ep_priv->ep.fid.ops = &psmx3_fi_ops;
+	ep_priv->ep.ops = &psmx3_ep_ops;
+	ep_priv->ep.cm = &psmx3_cm_ops;
+	ep_priv->domain = domain_priv;
+	if (usage_flags & PSMX3_RX) {
+		ep_priv->rx = trx_ctxt;
+		if (trx_ctxt)
+			trx_ctxt->ep = ep_priv; /* only used by RMA target */
+	}
+	if (usage_flags & PSMX3_TX)
+		ep_priv->tx = trx_ctxt;
+	ofi_atomic_initialize32(&ep_priv->ref, 0);
+
+	PSMX3_CTXT_TYPE(&ep_priv->nocomp_send_context) = PSMX3_NOCOMP_SEND_CONTEXT;
+	PSMX3_CTXT_EP(&ep_priv->nocomp_send_context) = ep_priv;
+	PSMX3_CTXT_TYPE(&ep_priv->nocomp_tsend_context) = PSMX3_NOCOMP_TSEND_CONTEXT;
+	PSMX3_CTXT_EP(&ep_priv->nocomp_tsend_context) = ep_priv;
+
+	if (ep_cap & FI_TAGGED)
+		ep_priv->ep.tagged = &psmx3_tagged_ops;
+	if (ep_cap & FI_MSG)
+		ep_priv->ep.msg = &psmx3_msg_ops;
+	if (ep_cap & FI_RMA)
+		ep_priv->ep.rma = &psmx3_rma_ops;
+	if (ep_cap & FI_ATOMICS)
+		ep_priv->ep.atomic = &psmx3_atomic_ops;
+
+	ep_priv->caps = ep_cap;
+
+	err = psmx3_domain_enable_ep(domain_priv, ep_priv);
+	if (err)
+		goto errout_free_ep;
+
+	psmx3_domain_acquire(domain_priv);
+
+	if (info) {
+		if (info->tx_attr)
+			ep_priv->tx_flags = info->tx_attr->op_flags;
+		if (info->rx_attr)
+			ep_priv->rx_flags = info->rx_attr->op_flags;
+	}
+
+	psmx3_ep_optimize_ops(ep_priv);
+
+	PSMX3_EP_INIT_OP_CONTEXT(ep_priv);
+	if ((ep_cap & FI_TRIGGER) && trx_ctxt)
+		trx_ctxt->am_progress = 1;
+
+	*ep_out = ep_priv;
+	return 0;
+
+errout_free_ep:
+	free(ep_priv);
+
+errout:
+	return err;
+}
+
+DIRECT_FN
+int psmx3_ep_open(struct fid_domain *domain, struct fi_info *info,
+		  struct fid_ep **ep, void *context)
+{
+	struct psmx3_fid_domain *domain_priv;
+	struct psmx3_fid_ep *ep_priv;
+	struct psmx3_ep_name ep_name;
+	struct psmx3_ep_name *src_addr;
+	struct psmx3_trx_ctxt *trx_ctxt = NULL;
+	int err = -FI_EINVAL;
+	int usage_flags = PSMX3_TX_RX;
+	uint8_t *uuid = NULL;
+
+	domain_priv = container_of(domain, struct psmx3_fid_domain,
+				   util_domain.domain_fid.fid);
+	if (!domain_priv)
+		goto errout;
+
+	if (info && info->ep_attr &&
+	    info->ep_attr->rx_ctx_cnt == FI_SHARED_CONTEXT)
+		return  -FI_ENOSYS;
+
+	if (info && info->ep_attr &&
+	    info->ep_attr->tx_ctx_cnt == FI_SHARED_CONTEXT)
+		usage_flags &= ~PSMX3_TX;
+
+	if (info && !ofi_send_allowed(info->caps) &&
+	    !ofi_rma_initiate_allowed(info->caps))
+		usage_flags &= ~PSMX3_TX;
+
+	if (info && !ofi_recv_allowed(info->caps) &&
+	    !ofi_rma_target_allowed(info->caps))
+		usage_flags &= ~PSMX3_RX;
+
+	src_addr = NULL;
+	if (info && info->src_addr) {
+		if (info->addr_format == FI_ADDR_STR)
+			src_addr = psmx3_string_to_ep_name(info->src_addr);
+		else
+			src_addr = info->src_addr;
+	}
+	if (!psmx3_override_uuid() && info && info->domain_attr && info->domain_attr->auth_key) {
+		if (info->domain_attr->auth_key_size != sizeof(psm2_uuid_t)) {
+			FI_WARN(&psmx3_prov, FI_LOG_EP_CTRL,
+				"Invalid domain auth_key_len %"PRIu64
+				", should be %"PRIu64".\n",
+				info->domain_attr->auth_key_size,
+				sizeof(psm2_uuid_t));
+			goto errout;
+		}
+		uuid = info->domain_attr->auth_key;
+	}
+
+	if (!psmx3_override_uuid() && info && info->ep_attr && info->ep_attr->auth_key) {
+		if (info->ep_attr->auth_key_size != sizeof(psm2_uuid_t)) {
+			FI_WARN(&psmx3_prov, FI_LOG_EP_CTRL,
+				"Invalid ep auth_key_len %"PRIu64
+				", should be %"PRIu64".\n",
+				info->ep_attr->auth_key_size,
+				sizeof(psm2_uuid_t));
+			goto errout;
+		}
+		uuid = info->ep_attr->auth_key;
+	}
+
+	/* If override is true, FI_PSM3_UUID was set to override any other uuid */
+	if (psmx3_override_uuid()) {
+		uuid = domain_priv->fabric->uuid;
+	}
+
+	if (usage_flags) {
+		trx_ctxt = psmx3_trx_ctxt_alloc(domain_priv, src_addr, -1,
+						usage_flags, uuid);
+		if (!trx_ctxt)
+			goto errout;
+	} else {
+		FI_INFO(&psmx3_prov, FI_LOG_EP_CTRL,
+			"Tx only endpoint with STX context.\n");
+	}
+
+	err = psmx3_ep_open_internal(domain_priv, info, &ep_priv, context,
+				     trx_ctxt, usage_flags);
+	if (err)
+		goto errout_free_ctxt;
+
+	ep_priv->type = PSMX3_EP_REGULAR;
+	ep_priv->service = PSMX3_ANY_SERVICE;
+	if (src_addr) {
+		ep_priv->service = src_addr->service;
+		if (info->addr_format == FI_ADDR_STR)
+			free(src_addr);
+	}
+
+	if (ep_priv->service == PSMX3_ANY_SERVICE)
+		ep_priv->service = ((getpid() & 0x7FFF) << 16) +
+				   ((uintptr_t)ep_priv & 0xFFFF);
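+	/*
+	 * Example (hypothetical values): pid 0x1234 with endpoint address
+	 * low bits 0xABCD gives service 0x1234ABCD; the pid is masked to
+	 * 15 bits so the computed service number stays positive.
+	 */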
+
+	if (usage_flags) {
+		ep_name.epid = trx_ctxt->psm2_epid;
+		ep_name.type = ep_priv->type;
+
+		ofi_ns_add_local_name(&domain_priv->fabric->name_server,
+				      &ep_priv->service, &ep_name);
+	}
+
+	*ep = &ep_priv->ep;
+	return 0;
+
+errout_free_ctxt:
+	psmx3_trx_ctxt_free(trx_ctxt, usage_flags);
+
+errout:
+	return err;
+}
+
+/*
+ * Shared tx context
+ */
+
+static int psmx3_stx_close(fid_t fid)
+{
+	struct psmx3_fid_stx *stx;
+
+	stx = container_of(fid, struct psmx3_fid_stx, stx.fid);
+
+	if (ofi_atomic_get32(&stx->ref))
+		return -FI_EBUSY;
+
+	psmx3_trx_ctxt_free(stx->tx, PSMX3_TX);
+	psmx3_domain_release(stx->domain);
+	free(stx);
+	return 0;
+}
+
+static struct fi_ops psmx3_fi_ops_stx = {
+	.size = sizeof(struct fi_ops),
+	.close = psmx3_stx_close,
+	.bind = fi_no_bind,
+	.control = fi_no_control,
+	.ops_open = fi_no_ops_open,
+};
+
+static struct fi_ops_ep psmx3_stx_ops = {
+	.size = sizeof(struct fi_ops_ep),
+	.cancel = fi_no_cancel,
+	.getopt = fi_no_getopt,
+	.setopt = fi_no_setopt,
+	.tx_ctx = fi_no_tx_ctx,
+	.rx_ctx = fi_no_rx_ctx,
+	.rx_size_left = fi_no_rx_size_left,
+	.tx_size_left = fi_no_tx_size_left,
+};
+
+DIRECT_FN
+int psmx3_stx_ctx(struct fid_domain *domain, struct fi_tx_attr *attr,
+		  struct fid_stx **stx, void *context)
+{
+	struct psmx3_fid_domain *domain_priv;
+	struct psmx3_trx_ctxt *trx_ctxt;
+	struct psmx3_fid_stx *stx_priv;
+	int err = -FI_EINVAL;
+
+	domain_priv = container_of(domain, struct psmx3_fid_domain,
+				   util_domain.domain_fid.fid);
+	if (!domain_priv)
+		goto errout;
+
+	stx_priv = (struct psmx3_fid_stx *) calloc(1, sizeof *stx_priv);
+	if (!stx_priv) {
+		err = -FI_ENOMEM;
+		goto errout;
+	}
+
+	/* no auth_key is provided, use NULL to pick the default uuid */
+	trx_ctxt = psmx3_trx_ctxt_alloc(domain_priv, NULL, -1, PSMX3_TX,
+					NULL);
+	if (!trx_ctxt) {
+		err = -FI_ENOMEM;
+		goto errout_free_stx;
+	}
+
+	psmx3_domain_acquire(domain_priv);
+	stx_priv->stx.fid.fclass = FI_CLASS_STX_CTX;
+	stx_priv->stx.fid.context = context;
+	stx_priv->stx.fid.ops = &psmx3_fi_ops_stx;
+	stx_priv->stx.ops = &psmx3_stx_ops;
+	stx_priv->domain = domain_priv;
+	stx_priv->tx = trx_ctxt;
+	ofi_atomic_initialize32(&stx_priv->ref, 0);
+
+	*stx = &stx_priv->stx;
+	return 0;
+
+errout_free_stx:
+	free(stx_priv);
+
+errout:
+	return err;
+}
+
+/*
+ * Scalable endpoint
+ */
+
+static int psmx3_sep_close(fid_t fid)
+{
+	struct psmx3_fid_sep *sep;
+	struct psmx3_ep_name ep_name;
+	int i;
+
+	sep = container_of(fid, struct psmx3_fid_sep, ep.fid);
+
+	if (ofi_atomic_get32(&sep->ref))
+		return -FI_EBUSY;
+
+	for (i = 0; i < sep->ctxt_cnt; i++) {
+		if (sep->ctxts[i].ep && ofi_atomic_get32(&sep->ctxts[i].ep->ref))
+			return -FI_EBUSY;
+	}
+
+	ep_name.epid = sep->ctxts[0].trx_ctxt->psm2_epid;
+	ep_name.sep_id = sep->id;
+	ep_name.type = sep->type;
+
+	ofi_ns_del_local_name(&sep->domain->fabric->name_server,
+			      &sep->service, &ep_name);
+
+	for (i = 0; i < sep->ctxt_cnt; i++) {
+		psmx3_trx_ctxt_free(sep->ctxts[i].trx_ctxt, PSMX3_TX_RX);
+
+		if (sep->ctxts[i].ep)
+			psmx3_ep_close_internal(sep->ctxts[i].ep);
+	}
+
+	sep->domain->sep_lock_fn(&sep->domain->sep_lock, 1);
+	dlist_remove(&sep->entry);
+	sep->domain->sep_unlock_fn(&sep->domain->sep_lock, 1);
+
+	psmx3_domain_release(sep->domain);
+	free(sep);
+	return 0;
+}
+
+static int psmx3_sep_control(fid_t fid, int command, void *arg)
+{
+	struct psmx3_fid_sep *sep;
+
+	sep = container_of(fid, struct psmx3_fid_sep, ep.fid);
+
+	switch (command) {
+	case FI_ENABLE:
+		sep->enabled = 1;
+		return 0;
+
+	default:
+		return -FI_ENOSYS;
+	}
+
+	return 0;
+}
+
+DIRECT_FN
+STATIC int psmx3_sep_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
+{
+	struct psmx3_fid_sep *sep;
+	int i, err = 0;
+
+	sep = container_of(fid, struct psmx3_fid_sep, ep.fid);
+
+	for (i = 0; i < sep->ctxt_cnt; i++) {
+		err = psmx3_ep_bind(&sep->ctxts[i].ep->ep.fid, bfid, flags);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+DIRECT_FN
+STATIC int psmx3_tx_context(struct fid_ep *ep, int index, struct fi_tx_attr *attr,
+			    struct fid_ep **tx_ep, void *context)
+{
+	struct psmx3_fid_sep *sep;
+
+	sep = container_of(ep, struct psmx3_fid_sep, ep);
+
+	assert(index >= 0 && index < sep->ctxt_cnt);
+
+	*tx_ep = &sep->ctxts[index].ep->ep;
+	return 0;
+}
+
+DIRECT_FN
+STATIC int psmx3_rx_context(struct fid_ep *ep, int index, struct fi_rx_attr *attr,
+			    struct fid_ep **rx_ep, void *context)
+{
+	struct psmx3_fid_sep *sep;
+
+	sep = container_of(ep, struct psmx3_fid_sep, ep);
+
+	assert(index >= 0 && index < sep->ctxt_cnt);
+
+	*rx_ep = &sep->ctxts[index].ep->ep;
+	return 0;
+}
+
+static int psmx3_sep_ctxt_close(fid_t fid)
+{
+	struct psmx3_fid_ep *ep;
+
+	ep = container_of(fid, struct psmx3_fid_ep, ep.fid);
+
+	if (ep->base_ep)
+		ofi_atomic_dec32(&ep->base_ep->ref);
+
+	return 0;
+}
+
+static struct fi_ops psmx3_fi_ops_sep_ctxt = {
+	.size = sizeof(struct fi_ops),
+	.close = psmx3_sep_ctxt_close,
+	.bind = psmx3_ep_bind,
+	.control = psmx3_ep_control,
+	.ops_open = fi_no_ops_open,
+};
+
+static struct fi_ops psmx3_fi_ops_sep = {
+	.size = sizeof(struct fi_ops),
+	.close = psmx3_sep_close,
+	.bind = psmx3_sep_bind,
+	.control = psmx3_sep_control,
+	.ops_open = fi_no_ops_open,
+};
+
+static struct fi_ops_ep psmx3_sep_ops = {
+	.size = sizeof(struct fi_ops_ep),
+	.cancel = fi_no_cancel,
+	.getopt = fi_no_getopt,
+	.setopt = fi_no_setopt,
+	.tx_ctx = psmx3_tx_context,
+	.rx_ctx = psmx3_rx_context,
+	.rx_size_left = fi_no_rx_size_left,
+	.tx_size_left = fi_no_tx_size_left,
+};
+
+DIRECT_FN
+int psmx3_sep_open(struct fid_domain *domain, struct fi_info *info,
+		   struct fid_ep **sep, void *context)
+{
+	struct psmx3_fid_domain *domain_priv;
+	struct psmx3_fid_ep *ep_priv;
+	struct psmx3_fid_sep *sep_priv;
+	struct psmx3_ep_name ep_name;
+	struct psmx3_ep_name *src_addr;
+	struct psmx3_trx_ctxt *trx_ctxt;
+	size_t ctxt_cnt = 1;
+	size_t ctxt_size;
+	int err = -FI_EINVAL;
+	uint8_t *uuid = NULL;
+	int i;
+
+	domain_priv = container_of(domain, struct psmx3_fid_domain,
+				   util_domain.domain_fid.fid);
+	if (!domain_priv)
+		goto errout;
+
+	if (!psmx3_override_uuid() && info && info->domain_attr && info->domain_attr->auth_key) {
+		if (info->domain_attr->auth_key_size != sizeof(psm2_uuid_t)) {
+			FI_WARN(&psmx3_prov, FI_LOG_EP_CTRL,
+				"Invalid domain auth_key_len %"PRIu64
+				", should be %"PRIu64".\n",
+				info->domain_attr->auth_key_size,
+				sizeof(psm2_uuid_t));
+			goto errout;
+		}
+		uuid = info->domain_attr->auth_key;
+	}
+
+	if (info && info->ep_attr) {
+		if (!psmx3_override_uuid() && info->ep_attr->auth_key) {
+			if (info->ep_attr->auth_key_size != sizeof(psm2_uuid_t)) {
+				FI_WARN(&psmx3_prov, FI_LOG_EP_CTRL,
+					"Invalid ep auth_key_len %"PRIu64
+					", should be %"PRIu64".\n",
+					info->ep_attr->auth_key_size,
+					sizeof(psm2_uuid_t));
+				goto errout;
+			}
+			uuid = info->ep_attr->auth_key;
+		}
+
+		if (info->ep_attr->tx_ctx_cnt > psmx3_hfi_info.max_trx_ctxt) {
+			FI_WARN(&psmx3_prov, FI_LOG_EP_CTRL,
+				"tx_ctx_cnt %"PRIu64" exceed limit %d.\n",
+				info->ep_attr->tx_ctx_cnt,
+				psmx3_hfi_info.max_trx_ctxt);
+			goto errout;
+		}
+		if (info->ep_attr->rx_ctx_cnt > psmx3_hfi_info.max_trx_ctxt) {
+			FI_WARN(&psmx3_prov, FI_LOG_EP_CTRL,
+				"rx_ctx_cnt %"PRIu64" exceed limit %d.\n",
+				info->ep_attr->rx_ctx_cnt,
+				psmx3_hfi_info.max_trx_ctxt);
+			goto errout;
+		}
+		ctxt_cnt = info->ep_attr->tx_ctx_cnt;
+		if (ctxt_cnt < info->ep_attr->rx_ctx_cnt)
+			ctxt_cnt = info->ep_attr->rx_ctx_cnt;
+		if (ctxt_cnt == 0) {
+			FI_INFO(&psmx3_prov, FI_LOG_EP_CTRL,
+				"tx_ctx_cnt and rx_ctx_cnt are 0, use 1.\n");
+			ctxt_cnt = 1;
+		}
+	}
+
+	/* If override is true, FI_PSM3_UUID was set to override any other uuid */
+	if (psmx3_override_uuid()) {
+		uuid = domain_priv->fabric->uuid;
+	}
+
+	ctxt_size = ctxt_cnt * sizeof(struct psmx3_sep_ctxt);
+	sep_priv = (struct psmx3_fid_sep *) calloc(1, sizeof(*sep_priv) + ctxt_size);
+	if (!sep_priv) {
+		err = -FI_ENOMEM;
+		goto errout;
+	}
+
+	sep_priv->ep.fid.fclass = FI_CLASS_SEP;
+	sep_priv->ep.fid.context = context;
+	sep_priv->ep.fid.ops = &psmx3_fi_ops_sep;
+	sep_priv->ep.ops = &psmx3_sep_ops;
+	sep_priv->ep.cm = &psmx3_cm_ops;
+	sep_priv->domain = domain_priv;
+	sep_priv->ctxt_cnt = ctxt_cnt;
+	ofi_atomic_initialize32(&sep_priv->ref, 0);
+
+	src_addr = NULL;
+	if (info && info->src_addr) {
+		if (info->addr_format == FI_ADDR_STR)
+			src_addr = psmx3_string_to_ep_name(info->src_addr);
+		else
+			src_addr = info->src_addr;
+	}
+
+	for (i = 0; i < ctxt_cnt; i++) {
+		trx_ctxt = psmx3_trx_ctxt_alloc(domain_priv, src_addr,
+						(ctxt_cnt > 1) ? i : -1,
+						PSMX3_TX_RX, uuid);
+		if (!trx_ctxt) {
+			err = -FI_ENOMEM;
+			goto errout_free_ctxt;
+		}
+
+		sep_priv->ctxts[i].trx_ctxt = trx_ctxt;
+
+		err = psmx3_ep_open_internal(domain_priv, info, &ep_priv, context,
+					     trx_ctxt, PSMX3_TX_RX);
+		if (err)
+			goto errout_free_ctxt;
+
+		/* override the ops so the fid can't be closed individually */
+		ep_priv->ep.fid.ops = &psmx3_fi_ops_sep_ctxt;
+
+		sep_priv->ctxts[i].ep = ep_priv;
+	}
+
+	sep_priv->type = PSMX3_EP_SCALABLE;
+	sep_priv->service = PSMX3_ANY_SERVICE;
+	if (src_addr) {
+		sep_priv->service = src_addr->service;
+		if (info->addr_format == FI_ADDR_STR)
+			free(src_addr);
+	}
+
+	if (sep_priv->service == PSMX3_ANY_SERVICE)
+		sep_priv->service = ((getpid() & 0x7FFF) << 16) +
+				   ((uintptr_t)sep_priv & 0xFFFF);
+
+	sep_priv->id = ofi_atomic_inc32(&domain_priv->sep_cnt);
+	for (i = 0; i < ctxt_cnt; i++)
+		sep_priv->ctxts[i].ep->sep_id = sep_priv->id;
+
+	domain_priv->sep_lock_fn(&domain_priv->sep_lock, 1);
+	dlist_insert_before(&sep_priv->entry, &domain_priv->sep_list);
+	domain_priv->sep_unlock_fn(&domain_priv->sep_lock, 1);
+
+	ep_name.epid = sep_priv->ctxts[0].trx_ctxt->psm2_epid;
+	ep_name.sep_id = sep_priv->id;
+	ep_name.type = sep_priv->type;
+
+	ofi_ns_add_local_name(&domain_priv->fabric->name_server,
+			      &sep_priv->service, &ep_name);
+
+	psmx3_domain_acquire(domain_priv);
+	*sep = &sep_priv->ep;
+
+	/* Make sure the AM handler is installed to answer SEP query */
+	psmx3_am_init(sep_priv->ctxts[0].trx_ctxt);
+
+	return 0;
+
+errout_free_ctxt:
+	/* unwind everything set up so far, including index 0 */
+	while (i >= 0) {
+		if (sep_priv->ctxts[i].trx_ctxt)
+			psmx3_trx_ctxt_free(sep_priv->ctxts[i].trx_ctxt,
+					    PSMX3_TX_RX);
+
+		if (sep_priv->ctxts[i].ep)
+			psmx3_ep_close_internal(sep_priv->ctxts[i].ep);
+
+		i--;
+	}
+
+	free(sep_priv);
+
+errout:
+	return err;
+}
diff --git a/deps/libfabric/prov/psm3/src/psmx3_fabric.c b/deps/libfabric/prov/psm3/src/psmx3_fabric.c
new file mode 100644
index 0000000000000000000000000000000000000000..77a17abd6609d96901a2feeb361b785053baa450
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psmx3_fabric.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2013-2018 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psmx3.h"
+
+extern int psmx3_trx_ctxt_cnt;
+struct psmx3_fid_fabric *psmx3_active_fabric = NULL;
+
+static int psmx3_fabric_close(fid_t fid)
+{
+	struct psmx3_fid_fabric *fabric;
+
+	fabric = container_of(fid, struct psmx3_fid_fabric,
+			      util_fabric.fabric_fid.fid);
+
+	psmx3_fabric_release(fabric);
+
+	FI_INFO(&psmx3_prov, FI_LOG_CORE, "refcnt=%d\n",
+		ofi_atomic_get32(&fabric->util_fabric.ref));
+
+	if (ofi_fabric_close(&fabric->util_fabric))
+		return 0;
+
+	if (psmx3_env.name_server)
+		ofi_ns_stop_server(&fabric->name_server);
+
+	fastlock_destroy(&fabric->domain_lock);
+	assert(fabric == psmx3_active_fabric);
+	psmx3_active_fabric = NULL;
+	free(fabric);
+
+	psmx3_atomic_global_fini();
+	return 0;
+}
+
+static struct fi_ops psmx3_fabric_fi_ops = {
+	.size = sizeof(struct fi_ops),
+	.close = psmx3_fabric_close,
+	.bind = fi_no_bind,
+	.control = fi_no_control,
+	.ops_open = fi_no_ops_open,
+};
+
+static struct fi_ops_fabric psmx3_fabric_ops = {
+	.size = sizeof(struct fi_ops_fabric),
+	.domain = psmx3_domain_open,
+	.passive_ep = fi_no_passive_ep,
+	.eq_open = ofi_eq_create,
+	.wait_open = psmx3_wait_open,
+	.trywait = psmx3_wait_trywait
+};
+
+static struct fi_fabric_attr psmx3_fabric_attr = {
+	.name = PSMX3_FABRIC_NAME,
+};
+
+int psmx3_fabric(struct fi_fabric_attr *attr,
+		 struct fid_fabric **fabric, void *context)
+{
+	struct psmx3_fid_fabric *fabric_priv;
+	int ret;
+
+	FI_INFO(&psmx3_prov, FI_LOG_CORE, "\n");
+
+	if (strcmp(attr->name, PSMX3_FABRIC_NAME))
+		return -FI_ENODATA;
+
+	if (psmx3_active_fabric) {
+		psmx3_fabric_acquire(psmx3_active_fabric);
+		*fabric = &psmx3_active_fabric->util_fabric.fabric_fid;
+		return 0;
+	}
+
+	fabric_priv = calloc(1, sizeof(*fabric_priv));
+	if (!fabric_priv)
+		return -FI_ENOMEM;
+
+	fastlock_init(&fabric_priv->domain_lock);
+	dlist_init(&fabric_priv->domain_list);
+
+	psmx3_get_uuid(fabric_priv->uuid);
+	if (psmx3_env.name_server) {
+		fabric_priv->name_server.port = psmx3_uuid_to_port(fabric_priv->uuid);
+		fabric_priv->name_server.name_len = sizeof(struct psmx3_ep_name);
+		fabric_priv->name_server.service_len = sizeof(int);
+		fabric_priv->name_server.service_cmp = psmx3_ns_service_cmp;
+		fabric_priv->name_server.is_service_wildcard = psmx3_ns_is_service_wildcard;
+
+		ofi_ns_init(&fabric_priv->name_server);
+		ofi_ns_start_server(&fabric_priv->name_server);
+	}
+
+	psmx3_fabric_attr.prov_version = get_psm3_provider_version();
+	ret = ofi_fabric_init(&psmx3_prov, &psmx3_fabric_attr, attr,
+			     &fabric_priv->util_fabric, context);
+	if (ret) {
+		FI_INFO(&psmx3_prov, FI_LOG_CORE, "ofi_fabric_init returns %d\n", ret);
+		if (psmx3_env.name_server)
+			ofi_ns_stop_server(&fabric_priv->name_server);
+		free(fabric_priv);
+		return ret;
+	}
+
+	/* fclass & context initialized in ofi_fabric_init */
+	fabric_priv->util_fabric.fabric_fid.fid.ops = &psmx3_fabric_fi_ops;
+	fabric_priv->util_fabric.fabric_fid.ops = &psmx3_fabric_ops;
+
+	psmx3_atomic_global_init();
+	psmx3_query_mpi();
+
+	/* take the reference to count for multiple fabric open calls */
+	psmx3_fabric_acquire(fabric_priv);
+
+	*fabric = &fabric_priv->util_fabric.fabric_fid;
+	psmx3_active_fabric = fabric_priv;
+	psmx3_trx_ctxt_cnt = 0;
+
+	return 0;
+}
+
diff --git a/deps/libfabric/prov/psm3/src/psmx3_init.c b/deps/libfabric/prov/psm3/src/psmx3_init.c
new file mode 100644
index 0000000000000000000000000000000000000000..6c31ecab9bca05bc3ec8ef8c0df6b00ff5502a41
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psmx3_init.c
@@ -0,0 +1,740 @@
+/*
+ * Copyright (c) 2013-2020 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ofi_prov.h"
+#include "psmx3.h"
+#include <glob.h>
+#include <dlfcn.h>
+
+static int psmx3_init_count = 0;
+static int psmx3_lib_initialized = 0;
+static pthread_mutex_t psmx3_lib_mutex;
+
+struct psmx3_hfi_info psmx3_hfi_info;
+
+struct psmx3_env psmx3_env = {
+	.name_server	= 1,
+	.tagged_rma	= 1,
+	.uuid		= PSMX3_DEFAULT_UUID,
+	.uuid_override  = 0,
+	.delay		= 0,
+	.timeout	= 10,
+	.conn_timeout	= 10,
+	.prog_interval	= -1,
+	.prog_affinity	= NULL,
+	.multi_ep	= 1,
+	.inject_size	= 64,
+	.lock_level	= 2,
+	.lazy_conn	= 0,
+	.disconnect	= 0,
+#if (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_RUNTIME)
+	.tag_layout	= "auto",
+#endif
+};
+
+#if (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_RUNTIME)
+uint64_t psmx3_tag_mask;
+uint32_t psmx3_tag_upper_mask;
+uint32_t psmx3_data_mask;
+int	 psmx3_flags_idx;
+int	 psmx3_tag_layout_locked = 0;
+#endif
+
+static void psmx3_init_env(void)
+{
+	uint32_t uid = getuid();
+	char *uuid = NULL;
+
+	if (getenv("OMPI_COMM_WORLD_RANK") || getenv("PMI_RANK") || getenv("PMIX_RANK"))
+		psmx3_env.name_server = 0;
+
+	fi_param_get_bool(&psmx3_prov, "name_server", &psmx3_env.name_server);
+	fi_param_get_bool(&psmx3_prov, "tagged_rma", &psmx3_env.tagged_rma);
+
+	if (FI_SUCCESS != fi_param_get_str(&psmx3_prov, "uuid", &psmx3_env.uuid)) {
+		/*
+		 * For OpenMPI 4.x only:
+		 * The job key is passed via the environment variable, but the format
+		 * is different. Perform format conversion and use it as the default
+		 * uuid. This will be overridden if FI_PSM3_UUID is set.
+		 */
+		psm2_uuid_t ompi_uuid = {};
+		unsigned long long int *u = (unsigned long long int *)ompi_uuid;
+		char *ompi_job_key = getenv("OMPI_MCA_orte_precondition_transports");
+		if (ompi_job_key) {
+			FI_INFO(&psmx3_prov, FI_LOG_CORE,
+				"Open MPI job key: %s.\n", ompi_job_key);
+			if (sscanf(ompi_job_key, "%016llx-%016llx", &u[0], &u[1]) == 2)
+				uuid = strdup(psmx3_uuid_to_string(ompi_uuid));
+			else {
+				FI_INFO(&psmx3_prov, FI_LOG_CORE,
+					"Invalid Open MPI job key format.\n");
+			}
+		}
+
+		/* Set Default UUID if none supplied through environment variable */
+		if (!uuid) { /* If ompi_job_key is not set or invalid */
+			uuid = strdup(PSMX3_DEFAULT_UUID);
+			if (uuid) {
+				/* fill in the uid as the third and fourth
+				 * groups (XXXX-XXXX) in the format:
+				 * xxxxxxxx-xxxx-XXXX-XXXX-xxxxxxxxxxxx
+				 */
+				snprintf(&uuid[14], 10, "%02hhX%02hhX-%02hhX%02hhX",
+					(uid >> 24) & 0xff, (uid >> 16) & 0xff,
+					(uid >> 8) & 0xff, uid & 0xff);
+				uuid[23] = '-';	/* restore */
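+				/*
+				 * Example (illustrative): uid 1000 is
+				 * 0x000003E8, so the third and fourth
+				 * groups become "0000-03E8".
+				 */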
+			}
+		}
+		psmx3_env.uuid = uuid;
+	} else {
+		/* FI_PSM3_UUID has highest priority, so it can override auth_key from fi_info */
+		psmx3_env.uuid_override = 1;
+	}
+	fi_param_get_int(&psmx3_prov, "delay", &psmx3_env.delay);
+	fi_param_get_int(&psmx3_prov, "timeout", &psmx3_env.timeout);
+	fi_param_get_int(&psmx3_prov, "prog_interval", &psmx3_env.prog_interval);
+	fi_param_get_str(&psmx3_prov, "prog_affinity", &psmx3_env.prog_affinity);
+	fi_param_get_int(&psmx3_prov, "inject_size", &psmx3_env.inject_size);
+	fi_param_get_bool(&psmx3_prov, "lock_level", &psmx3_env.lock_level);
+	fi_param_get_bool(&psmx3_prov, "lazy_conn", &psmx3_env.lazy_conn);
+	if (psmx3_env.lazy_conn)
+		psmx3_env.conn_timeout = 30;	// more headroom since app may be busy
+	fi_param_get_int(&psmx3_prov, "conn_timeout", &psmx3_env.conn_timeout);
+	fi_param_get_bool(&psmx3_prov, "disconnect", &psmx3_env.disconnect);
+#if (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_RUNTIME)
+	fi_param_get_str(&psmx3_prov, "tag_layout", &psmx3_env.tag_layout);
+#endif
+}
+
+void psmx3_init_tag_layout(struct fi_info *info)
+{
+	int use_tag64;
+
+#if (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_RUNTIME)
+	use_tag64 = (psmx3_tag_mask == PSMX3_TAG_MASK_64);
+
+	if (psmx3_tag_layout_locked) {
+		FI_INFO(&psmx3_prov, FI_LOG_CORE,
+			"tag layout already set opened domain.\n");
+		goto out;
+	}
+
+	if (strcasecmp(psmx3_env.tag_layout, "tag60") == 0) {
+		psmx3_tag_upper_mask = PSMX3_TAG_UPPER_MASK_60;
+		psmx3_tag_mask = PSMX3_TAG_MASK_60;
+		psmx3_data_mask = PSMX3_DATA_MASK_60;
+		psmx3_flags_idx = PSMX3_FLAGS_IDX_60;
+		use_tag64 = 0;
+	} else if (strcasecmp(psmx3_env.tag_layout, "tag64") == 0) {
+		psmx3_tag_upper_mask = PSMX3_TAG_UPPER_MASK_64;
+		psmx3_tag_mask = PSMX3_TAG_MASK_64;
+		psmx3_data_mask = PSMX3_DATA_MASK_64;
+		psmx3_flags_idx = PSMX3_FLAGS_IDX_64;
+		use_tag64 = 1;
+	} else {
+		if (strcasecmp(psmx3_env.tag_layout, "auto") != 0) {
+			FI_INFO(&psmx3_prov, FI_LOG_CORE,
+				"Invalid tag layout '%s', using 'auto'.\n",
+				psmx3_env.tag_layout);
+			psmx3_env.tag_layout = "auto";
+		}
+		if ((info->caps & (FI_TAGGED | FI_MSG)) &&
+		    info->domain_attr->cq_data_size) {
+			psmx3_tag_upper_mask = PSMX3_TAG_UPPER_MASK_60;
+			psmx3_tag_mask = PSMX3_TAG_MASK_60;
+			psmx3_data_mask = PSMX3_DATA_MASK_60;
+			psmx3_flags_idx = PSMX3_FLAGS_IDX_60;
+			use_tag64 = 0;
+		} else {
+			psmx3_tag_upper_mask = PSMX3_TAG_UPPER_MASK_64;
+			psmx3_tag_mask = PSMX3_TAG_MASK_64;
+			psmx3_data_mask = PSMX3_DATA_MASK_64;
+			psmx3_flags_idx = PSMX3_FLAGS_IDX_64;
+			use_tag64 = 1;
+		}
+	}
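+	/*
+	 * Note (added for clarity): under the default "auto" layout, tag60
+	 * is selected only when the caps and cq_data_size checks above show
+	 * that remote CQ data may actually be used; otherwise the full
+	 * 64-bit tag space (tag64) is kept.
+	 */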
+
+	psmx3_tag_layout_locked = 1;
+out:
+#elif (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_TAG64)
+	use_tag64 = 1;
+#else
+	use_tag64 = 0;
+#endif
+	FI_INFO(&psmx3_prov, FI_LOG_CORE,
+		"use %s: tag_mask: %016" PRIX64 ", data_mask: %08" PRIX32 "\n",
+		use_tag64 ? "tag64" : "tag60", (uint64_t)PSMX3_TAG_MASK,
+		PSMX3_DATA_MASK);
+}
+
+static int psmx3_get_yes_no(char *s, int default_value)
+{
+	unsigned long value;
+	char *end_ptr;
+
+	if (!s || s[0] == '\0')
+		return default_value;
+
+	if (s[0] == 'Y' || s[0] == 'y')
+		return 1;
+
+	if (s[0] == 'N' || s[0] == 'n')
+		return 0;
+
+	value = strtoul(s, &end_ptr, 0);
+	if (end_ptr == s)
+		return default_value;
+
+	return value ? 1 : 0;
+}
+
+static int psmx3_check_multi_ep_cap(void)
+{
+	uint64_t caps = PSM2_MULTI_EP_CAP;
+	char *s = getenv("PSM3_MULTI_EP");
+
+	if (psm2_get_capability_mask(caps) == caps && psmx3_get_yes_no(s, 1))
+		psmx3_env.multi_ep = 1;
+	else
+		psmx3_env.multi_ep = 0;
+
+	return psmx3_env.multi_ep;
+}
+
+static int psmx3_init_lib(void)
+{
+	int major, minor;
+	int ret = 0, err;
+
+	if (psmx3_lib_initialized)
+		return 0;
+
+	pthread_mutex_lock(&psmx3_lib_mutex);
+
+	if (psmx3_lib_initialized)
+		goto out;
+
+	/* turn on multi-ep feature, but don't overwrite existing setting */
+	/*setenv("PSM3_MULTI_EP", "1", 0); - not needed, PSM3 default=1*/
+
+	psm2_error_register_handler(NULL, PSM2_ERRHANDLER_NO_HANDLER);
+
+	major = PSM2_VERNO_MAJOR;
+	minor = PSM2_VERNO_MINOR;
+
+	err = psm2_init(&major, &minor);
+	if (err != PSM2_OK) {
+		FI_WARN(&psmx3_prov, FI_LOG_CORE,
+			"psm2_init failed: %s\n", psm2_error_get_string(err));
+		ret = err;
+		goto out;
+	}
+
+	FI_INFO(&psmx3_prov, FI_LOG_CORE,
+		"PSM3 header version = (%d, %d)\n", PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR);
+	FI_INFO(&psmx3_prov, FI_LOG_CORE,
+		"PSM3 library version = (%d, %d)\n", major, minor);
+
+	if (psmx3_check_multi_ep_cap())
+		FI_INFO(&psmx3_prov, FI_LOG_CORE, "PSM3 multi-ep feature enabled.\n");
+	else
+		FI_INFO(&psmx3_prov, FI_LOG_CORE, "PSM3 multi-ep feature not available or disabled.\n");
+
+	psmx3_lib_initialized = 1;
+
+out:
+	pthread_mutex_unlock(&psmx3_lib_mutex);
+	return ret;
+}
+
+static int psmx3_update_hfi_info(void)
+{
+	unsigned short i;
+	int nctxts = 0;
+	int nfreectxts = 0;
+	int hfi_unit = -1;
+	char *hfi_name = NULL;
+	int multirail = 0;
+	char *s;
+	char unit_name[NAME_MAX];
+	uint32_t cnt = 0;
+	int tmp_nctxts, tmp_nfreectxts;
+	int unit_active;
+	int ret;
+	psm2_info_query_arg_t args[2];
+
+	args[1].length = sizeof(unit_name);
+
+	if (psmx3_hfi_info.num_units > 0)
+		return 0;
+
+	if (psm2_info_query(PSM2_INFO_QUERY_NUM_UNITS, &cnt, 0, NULL) || !cnt)
+	{
+		FI_INFO(&psmx3_prov, FI_LOG_CORE,
+			"no PSM3 device is found.\n");
+		return -FI_ENODEV;
+	}
+	psmx3_hfi_info.num_units = cnt;
+
+	assert(psmx3_hfi_info.num_units <= PSMX3_MAX_UNITS);
+
+	s = getenv("PSM3_NIC");
+	if (s && *s) {
+		if (0 == strcasecmp(s, "any")) {
+			hfi_unit = -1;
+		} else {
+			char *p;
+			long l = strtol(s, &p, 10);
+			if (p && *p == '\0')
+				hfi_unit = (int)l;	// consumed all of string as a number
+			else
+				hfi_name = s;	// name specified
+		}
+	}
+
+	s = getenv("PSM3_MULTIRAIL");
+	if (s)
+		multirail = atoi(s);
+
+	psmx3_hfi_info.num_active_units = 0;
+	for (i = 0; i < psmx3_hfi_info.num_units; i++) {
+		args[0].unit = i;
+		ret = psm2_info_query(PSM2_INFO_QUERY_UNIT_STATUS, &unit_active, 1, args);
+		if (ret != PSM2_OK) {
+			FI_WARN(&psmx3_prov, FI_LOG_CORE,
+				"Failed to check active state of HFI unit %d\n",
+				i);
+			continue;
+		}
+
+		if (unit_active <= 0) {
+			FI_WARN(&psmx3_prov, FI_LOG_CORE,
+				"NIC %d STATE = INACTIVE\n",
+				i);
+			continue;
+		}
+
+		if (hfi_unit >= 0 && i != hfi_unit) {
+			FI_INFO(&psmx3_prov, FI_LOG_CORE,
+				"NIC %d skipped: PSM3_NIC=%d\n",
+				i, hfi_unit);
+			continue;
+		}
+
+		if (PSM2_OK != psm2_info_query(PSM2_INFO_QUERY_NUM_FREE_CONTEXTS,
+						&tmp_nfreectxts, 1, args) || (tmp_nfreectxts < 0))
+		{
+			FI_WARN(&psmx3_prov, FI_LOG_CORE,
+				"Failed to read number of free contexts from HFI unit %d\n",
+				i);
+			continue;
+		}
+
+		if (PSM2_OK != psm2_info_query(PSM2_INFO_QUERY_NUM_CONTEXTS,
+						&tmp_nctxts, 1, args) || (tmp_nctxts < 0))
+		{
+			FI_WARN(&psmx3_prov, FI_LOG_CORE,
+				"Failed to read number of contexts from HFI unit %d\n",
+				i);
+			continue;
+		}
+
+		if (PSM2_OK != psm2_info_query(PSM2_INFO_QUERY_UNIT_NAME,
+						unit_name, 2, args))
+		{
+			FI_WARN(&psmx3_prov, FI_LOG_CORE,
+				"Failed to read name of HFI unit %d\n",
+				i);
+			continue;
+		}
+		if (hfi_name && 0 != strcasecmp(hfi_name, unit_name)) {
+			FI_INFO(&psmx3_prov, FI_LOG_CORE,
+				"NIC %d skipped: PSM3_NIC=%s\n",
+				i, hfi_name);
+			continue;
+		}
+
+		nctxts += tmp_nctxts;
+		nfreectxts += tmp_nfreectxts;
+
+		psmx3_hfi_info.unit_is_active[i] = 1;
+		psmx3_hfi_info.unit_nctxts[i] = tmp_nctxts;
+		psmx3_hfi_info.unit_nfreectxts[i] = tmp_nfreectxts;
+		psmx3_hfi_info.active_units[psmx3_hfi_info.num_active_units++] = i;
+
+		if (psmx3_hfi_info.num_active_units > 1)
+			strcat(psmx3_hfi_info.default_domain_name, ";");
+		strcat(psmx3_hfi_info.default_domain_name, unit_name);
+
+		if (multirail)
+			break;
+	}
+
+	FI_INFO(&psmx3_prov, FI_LOG_CORE,
+		"hfi1 units: total %d, active %d; "
+		"hfi1 contexts: total %d, free %d\n",
+		psmx3_hfi_info.num_units, psmx3_hfi_info.num_active_units,
+		nctxts, nfreectxts);
+
+	if (psmx3_env.multi_ep) {
+		psmx3_hfi_info.max_trx_ctxt = nctxts;
+		psmx3_hfi_info.free_trx_ctxt = nfreectxts;
+	} else {
+		psmx3_hfi_info.max_trx_ctxt = 1;
+		psmx3_hfi_info.free_trx_ctxt = (nfreectxts == 0) ? 0 : 1;
+	}
+
+	FI_INFO(&psmx3_prov, FI_LOG_CORE,
+		"Tx/Rx contexts: %d in total, %d available.\n",
+		psmx3_hfi_info.max_trx_ctxt, psmx3_hfi_info.free_trx_ctxt);
+
+	return 0;
+}
+
+static void psmx3_update_hfi_nic_info(struct fi_info *info)
+{
+	char *path;
+	char buffer[PATH_MAX];
+	char *s;
+	ssize_t n;
+	unsigned int a, b, c, d;
+	int unit;
+	char sys_dev_path[PATH_MAX];
+	psm2_info_query_arg_t args[2];
+	args[1].length = sizeof(sys_dev_path);
+
+	for ( ; info; info = info->next) {
+		unit = ((struct psmx3_ep_name *)info->src_addr)->unit;
+
+		if (unit == PSMX3_DEFAULT_UNIT)
+			continue;
+
+		if (!info->nic) {
+			info->nic = ofi_nic_dup(NULL);
+			if (!info->nic) {
+				FI_WARN(&psmx3_prov, FI_LOG_CORE,
+					"Failed to allocate nic info for HFI unit %d\n", unit);
+				continue;
+			}
+		}
+
+		args[0].unit = unit;
+		if ((PSM2_OK != psm2_info_query(PSM2_INFO_QUERY_UNIT_SYS_PATH,
+			sys_dev_path, 2, args)) ||
+			(asprintf(&path, "%s/%s", sys_dev_path, "device") < 0))
+		{
+			FI_WARN(&psmx3_prov, FI_LOG_CORE,
+				"Failed to read nic info for HFI unit %d\n", unit);
+			continue;
+		}
+
+		n = readlink(path, buffer, sizeof(buffer)-1);
+		free(path);
+
+		if (n < 0) {
+			FI_WARN(&psmx3_prov, FI_LOG_CORE,
+				"Failed to read nic info for HFI unit %d\n", unit);
+			continue;
+		}
+
+		buffer[n] = '\0';
+		if ((s = strrchr(buffer, '/')))
+			s++;
+		else
+			s = buffer;
+
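+		/* The link target ends in the PCI address, formatted as
+		 * "domain:bus:device.function". */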
+		n = sscanf(s, "%x:%x:%x.%x", &a, &b, &c, &d);
+		if (n < 4) {
+			FI_WARN(&psmx3_prov, FI_LOG_CORE,
+				"Failed to read nic info for HFI unit %d\n", unit);
+			continue;
+		}
+
+		info->nic->bus_attr->bus_type = FI_BUS_PCI;
+		info->nic->bus_attr->attr.pci.domain_id = (uint16_t) a;
+		info->nic->bus_attr->attr.pci.bus_id =  (uint8_t) b;
+		info->nic->bus_attr->attr.pci.device_id = (uint8_t) c;
+		info->nic->bus_attr->attr.pci.function_id = (uint8_t) d;
+	}
+}
+
+static int psmx3_getinfo(uint32_t api_version, const char *node,
+			 const char *service, uint64_t flags,
+			 const struct fi_info *hints, struct fi_info **info)
+{
+	struct fi_info *prov_info = NULL;
+	struct psmx3_ep_name *dest_addr = NULL;
+	struct psmx3_ep_name *src_addr = NULL;
+	int svc0, svc = PSMX3_ANY_SERVICE;
+	size_t len;
+	void *addr;
+	uint32_t fmt;
+
+	FI_INFO(&psmx3_prov, FI_LOG_CORE, "\n");
+			"no PSM3 device found.\n");
+	if (psmx3_init_prov_info(hints, &prov_info))
+		goto err_out;
+
+	if (psmx3_init_lib())
+		goto err_out;
+
+	if (psmx3_update_hfi_info())
+		goto err_out;
+
+	if (!psmx3_hfi_info.num_active_units) {
+		FI_INFO(&psmx3_prov, FI_LOG_CORE,
+			"no PSM3 device is active.\n");
+		goto err_out;
+	}
+
+	if (hints && hints->domain_attr && hints->domain_attr->name &&
+		NULL == strcasestr(psmx3_hfi_info.default_domain_name, hints->domain_attr->name)) {
+		FI_INFO(&psmx3_prov, FI_LOG_CORE, "Unknown domain name\n");
+		OFI_INFO_STR(&psmx3_prov, psmx3_hfi_info.default_domain_name,
+					   hints->domain_attr->name, "Supported", "Requested");
+		goto err_out;
+	}
+
+	/* Set src or dest to the user-supplied address in native format */
+	if (node &&
+	    !ofi_str_toaddr(node, &fmt, &addr, &len) &&
+	    fmt == FI_ADDR_PSMX3) {
+		if (flags & FI_SOURCE) {
+			src_addr = addr;
+			FI_INFO(&psmx3_prov, FI_LOG_CORE,
+				"'%s' is taken as src_addr: <unit=%d, port=%d, service=%d>\n",
+				node, src_addr->unit, src_addr->port, src_addr->service);
+		} else {
+			dest_addr = addr;
+			FI_INFO(&psmx3_prov, FI_LOG_CORE,
+				"'%s' is taken as dest_addr: <epid=%"PRIu64">\n",
+				node, dest_addr->epid);
+		}
+		node = NULL;
+	}
+
+	/* Initialize src address based on the "host:unit:port" format */
+	if (!src_addr) {
+		src_addr = calloc(1, sizeof(*src_addr));
+		if (!src_addr) {
+			FI_INFO(&psmx3_prov, FI_LOG_CORE,
+				"failed to allocate src addr.\n");
+			goto err_out;
+		}
+		src_addr->type = PSMX3_EP_SRC_ADDR;
+		src_addr->epid = PSMX3_RESERVED_EPID;
+		src_addr->unit = PSMX3_DEFAULT_UNIT;
+		src_addr->port = PSMX3_DEFAULT_PORT;
+		src_addr->service = PSMX3_ANY_SERVICE;
+
+		if (flags & FI_SOURCE) {
+			if (node)
+				sscanf(node, "%*[^:]:%" SCNi8 ":%" SCNu8,
+				       &src_addr->unit, &src_addr->port);
+			if (service)
+				sscanf(service, "%" SCNu32, &src_addr->service);
+			FI_INFO(&psmx3_prov, FI_LOG_CORE,
+				"node '%s' service '%s' converted to <unit=%d, port=%d, service=%d>\n",
+				node, service, src_addr->unit, src_addr->port, src_addr->service);
+		}
+	}
+
+	/* Check that the src address contains valid unit */
+	/* Check that the src address contains a valid unit */
+		if (src_addr->unit < 0 || src_addr->unit >= PSMX3_MAX_UNITS) {
+			FI_INFO(&psmx3_prov, FI_LOG_CORE,
+				"invalid source address: unit %d out of range\n", src_addr->unit);
+			goto err_out;
+		}
+		if (!psmx3_hfi_info.unit_is_active[src_addr->unit]) {
+			FI_INFO(&psmx3_prov, FI_LOG_CORE,
+				"invalid source address: unit %d is inactive\n", src_addr->unit);
+			goto err_out;
+		}
+	}
+
+	/* Resolve the dest address using the "node", "service" pair */
+	if (!dest_addr && node && !(flags & FI_SOURCE)) {
+		psm2_uuid_t uuid;
+
+		psmx3_get_uuid(uuid);
+		struct util_ns ns = {
+			.port = psmx3_uuid_to_port(uuid),
+			.name_len = sizeof(*dest_addr),
+			.service_len = sizeof(svc),
+			.service_cmp = psmx3_ns_service_cmp,
+			.is_service_wildcard = psmx3_ns_is_service_wildcard,
+		};
+		ofi_ns_init(&ns);
+
+		if (service)
+			svc = atoi(service);
+		svc0 = svc;
+		dest_addr = (struct psmx3_ep_name *)
+			ofi_ns_resolve_name(&ns, node, &svc);
+		if (dest_addr) {
+			FI_INFO(&psmx3_prov, FI_LOG_CORE,
+				"'%s:%u' resolved to <epid=%"PRIu64">:%d\n",
+				node, svc0, dest_addr->epid, svc);
+		} else {
+			FI_INFO(&psmx3_prov, FI_LOG_CORE,
+				"failed to resolve '%s:%u'.\n", node, svc);
+			goto err_out;
+		}
+	}
+
+	/* Update prov info with resolved addresses and hfi info */
+	psmx3_update_prov_info(prov_info, src_addr, dest_addr);
+
+	/* Remove prov info entries that don't match the hints */
+	if (psmx3_check_prov_info(api_version, hints, &prov_info))
+		goto err_out;
+
+	/* Apply hints to the prov info */
+	psmx3_alter_prov_info(api_version, hints, prov_info);
+
+	/* Set the fi_nic structure */
+	psmx3_update_hfi_nic_info(prov_info);
+
+	*info = prov_info;
+	free(src_addr);
+	free(dest_addr);
+	return 0;
+
+err_out:
+	free(src_addr);
+	free(dest_addr);
+	fi_freeinfo(prov_info);
+	*info = NULL;
+	return -FI_ENODATA;
+}
+
+static void psmx3_fini(void)
+{
+	FI_INFO(&psmx3_prov, FI_LOG_CORE, "\n");
+
+	if (! --psmx3_init_count && psmx3_lib_initialized) {
+		/* This function is called from a library destructor, which is called
+		 * automatically when exit() is called. The call to psm2_finalize()
+		 * might cause a deadlock if the application is terminated with Ctrl-C
+		 * -- the application could be inside a PSM3 call, holding a lock that
+		 * psm2_finalize() tries to acquire. This can be avoided by only
+		 * calling psm2_finalize() when PSM3 is guaranteed to be unused.
+		 */
+		if (psmx3_active_fabric) {
+			FI_INFO(&psmx3_prov, FI_LOG_CORE,
+				"psmx3_active_fabric != NULL, skip psm2_finalize\n");
+		} else {
+			psm2_finalize();
+			psmx3_lib_initialized = 0;
+		}
+	}
+}
+
+struct fi_provider psmx3_prov = {
+	.name = PSMX3_PROV_NAME,
+	.fi_version = OFI_VERSION_LATEST,
+	.getinfo = psmx3_getinfo,
+	.fabric = psmx3_fabric,
+	.cleanup = psmx3_fini
+};
+
+PROVIDER_INI
+{
+	psmx3_prov.version = get_psm3_provider_version();
+
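+	/*
+	 * The FI_INFO below decodes psmx3_prov.version assuming PSM3 version
+	 * x.y.z.w is packed as ((x*100 + y) << 16) | (z*1000 + w*10).
+	 */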
+	FI_INFO(&psmx3_prov, FI_LOG_CORE, "build options: VERSION=%u.%u=%u.%u.%u.%u, "
+			"HAVE_PSM3_SRC=%d, PSM3_CUDA=%d\n",
+			(psmx3_prov.version >> 16), (psmx3_prov.version & 0xFFFF),
+			(psmx3_prov.version >> 16) / 100, (psmx3_prov.version >> 16) % 100,
+			(psmx3_prov.version & 0xFFFF) / 1000, ((psmx3_prov.version & 0xFFFF) % 1000) / 10,
+			HAVE_PSM3_SRC, PSM3_CUDA);
+
+	fi_param_define(&psmx3_prov, "name_server", FI_PARAM_BOOL,
+			"Whether to turn on the name server or not "
+			"(default: yes)");
+
+	fi_param_define(&psmx3_prov, "tagged_rma", FI_PARAM_BOOL,
+			"Whether to use tagged messages for large size "
+			"RMA or not (default: yes)");
+
+	fi_param_define(&psmx3_prov, "uuid", FI_PARAM_STRING,
+			"Unique Job ID required by the fabric");
+
+	fi_param_define(&psmx3_prov, "delay", FI_PARAM_INT,
+			"Delay (seconds) before finalization (for debugging)");
+
+	fi_param_define(&psmx3_prov, "timeout", FI_PARAM_INT,
+			"Timeout (seconds) for gracefully closing the PSM3 endpoint");
+
+	fi_param_define(&psmx3_prov, "conn_timeout", FI_PARAM_INT,
+			"Timeout (seconds) for establishing connection between two PSM3 endpoints");
+
+	fi_param_define(&psmx3_prov, "prog_interval", FI_PARAM_INT,
+			"Interval (microseconds) between progress calls made in the "
+			"progress thread (default: 1 if affinity is set, 1000 if not)");
+
+	fi_param_define(&psmx3_prov, "prog_affinity", FI_PARAM_STRING,
+			"When set, specify the set of CPU cores to set the progress "
+			"thread affinity to. The format is "
+			"<start>[:<end>[:<stride>]][,<start>[:<end>[:<stride>]]]*, "
+			"where each triplet <start>:<end>:<stride> defines a block "
+			"of core_ids. Both <start> and <end> can be either the core_id "
+			"(when >=0) or core_id - num_cores (when <0). "
+			"(default: affinity not set)");
+
+	fi_param_define(&psmx3_prov, "inject_size", FI_PARAM_INT,
+			"Maximum message size for fi_inject and fi_tinject (default: 64).");
+
+	fi_param_define(&psmx3_prov, "lock_level", FI_PARAM_INT,
+			"How internal locking is used. 0 means no locking. (default: 2).");
+
+	fi_param_define(&psmx3_prov, "lazy_conn", FI_PARAM_BOOL,
+			"Whether to force lazy connection mode. (default: no).");
+
+	fi_param_define(&psmx3_prov, "disconnect", FI_PARAM_BOOL,
+			"Whether to issue disconnect request when process ends (default: no).");
+
+#if (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_RUNTIME)
+	fi_param_define(&psmx3_prov, "tag_layout", FI_PARAM_STRING,
+			"How the 96 bit PSM3 tag is organized: "
+			"tag60 means 32/4/60 for data/flags/tag;"
+			"tag64 means 4/28/64 for flags/data/tag (default: tag60).");
+#endif
+
+	psmx3_init_env();
+
+	pthread_mutex_init(&psmx3_lib_mutex, NULL);
+	psmx3_init_count++;
+	return (&psmx3_prov);
+}
+
diff --git a/deps/libfabric/prov/psm3/src/psmx3_mr.c b/deps/libfabric/prov/psm3/src/psmx3_mr.c
new file mode 100644
index 0000000000000000000000000000000000000000..cc6533062eea837fab90cfc0180698746f37f421
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psmx3_mr.c
@@ -0,0 +1,438 @@
+/*
+ * Copyright (c) 2013-2018 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psmx3.h"
+
+struct psmx3_fid_mr *psmx3_mr_get(struct psmx3_fid_domain *domain,
+				  uint64_t key)
+{
+	RbtIterator it;
+	struct psmx3_fid_mr *mr = NULL;
+
+	domain->mr_lock_fn(&domain->mr_lock, 1);
+	it = rbtFind(domain->mr_map, (void *)key);
+	if (!it)
+		goto exit;
+
+	rbtKeyValue(domain->mr_map, it, (void **)&key, (void **)&mr);
+exit:
+	domain->mr_unlock_fn(&domain->mr_lock, 1);
+	return mr;
+}
+
+static inline void psmx3_mr_release_key(struct psmx3_fid_domain *domain,
+					uint64_t key)
+{
+	RbtIterator it;
+
+	domain->mr_lock_fn(&domain->mr_lock, 1);
+	it = rbtFind(domain->mr_map, (void *)key);
+	if (it)
+		rbtErase(domain->mr_map, it);
+	domain->mr_unlock_fn(&domain->mr_lock, 1);
+}
+
+static int psmx3_mr_reserve_key(struct psmx3_fid_domain *domain,
+				uint64_t requested_key,
+				uint64_t *assigned_key,
+				void *mr)
+{
+	uint64_t key;
+	int i;
+	int try_count;
+	int err = -FI_ENOKEY;
+
+	domain->mr_lock_fn(&domain->mr_lock, 1);
+
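+	/*
+	 * With FI_MR_BASIC the provider chooses the key, scanning forward
+	 * from the last reserved key until a free one is found; otherwise
+	 * the caller's requested key is tried exactly once.
+	 */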
+	if (domain->mr_mode == FI_MR_BASIC) {
+		key = domain->mr_reserved_key;
+		try_count = 10000; /* large enough */
+	} else {
+		key = requested_key;
+		try_count = 1;
+	}
+
+	for (i=0; i<try_count; i++, key++) {
+		if (!rbtFind(domain->mr_map, (void *)key)) {
+			if (!rbtInsert(domain->mr_map, (void *)key, mr)) {
+				if (domain->mr_mode == FI_MR_BASIC)
+					domain->mr_reserved_key = key + 1;
+				*assigned_key = key;
+				err = 0;
+			}
+			break;
+		}
+	}
+
+	domain->mr_unlock_fn(&domain->mr_lock, 1);
+
+	return err;
+}
+
+int psmx3_mr_validate(struct psmx3_fid_mr *mr, uint64_t addr,
+		      size_t len, uint64_t access)
+{
+	int i;
+
+	addr += mr->offset;
+
+	if (!addr)
+		return -FI_EINVAL;
+
+	if ((access & mr->access) != access)
+		return -FI_EACCES;
+
+	for (i = 0; i < mr->iov_count; i++) {
+		if ((uint64_t)mr->iov[i].iov_base <= addr &&
+		    (uint64_t)mr->iov[i].iov_base + mr->iov[i].iov_len >= addr + len)
+			return 0;
+	}
+
+	return -FI_EACCES;
+}
+
+static int psmx3_mr_close(fid_t fid)
+{
+	struct psmx3_fid_mr *mr;
+
+	mr = container_of(fid, struct psmx3_fid_mr, mr.fid);
+	psmx3_mr_release_key(mr->domain, mr->mr.key);
+	psmx3_domain_release(mr->domain);
+	free(mr);
+
+	return 0;
+}
+
+DIRECT_FN
+STATIC int psmx3_mr_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
+{
+	struct psmx3_fid_mr *mr;
+	struct psmx3_fid_ep *ep;
+	struct psmx3_fid_cntr *cntr;
+
+	mr = container_of(fid, struct psmx3_fid_mr, mr.fid);
+
+	assert(bfid);
+
+	switch (bfid->fclass) {
+	case FI_CLASS_EP:
+		ep = container_of(bfid, struct psmx3_fid_ep, ep.fid);
+		if (mr->domain != ep->domain)
+			return -FI_EINVAL;
+		break;
+
+	case FI_CLASS_CNTR:
+		cntr = container_of(bfid, struct psmx3_fid_cntr, cntr.fid);
+		if (mr->cntr && mr->cntr != cntr)
+			return -FI_EBUSY;
+		if (mr->domain != cntr->domain)
+			return -FI_EINVAL;
+		if (flags) {
+			if (flags != FI_REMOTE_WRITE)
+				return -FI_EINVAL;
+			mr->cntr = cntr;
+			cntr->poll_all = 1;
+		}
+		break;
+
+	default:
+		return -FI_ENOSYS;
+	}
+
+	return 0;
+}
+
+DIRECT_FN
+STATIC int psmx3_mr_control(fid_t fid, int command, void *arg)
+{
+	struct psmx3_fid_mr *mr;
+	struct fi_mr_raw_attr *attr;
+
+	mr = container_of(fid, struct psmx3_fid_mr, mr.fid);
+
+	switch (command) {
+	case FI_GET_RAW_MR:
+		attr = arg;
+		if (!attr)
+			return -FI_EINVAL;
+		if (attr->base_addr)
+			*attr->base_addr = (uint64_t)(uintptr_t)mr->iov[0].iov_base;
+		if (attr->raw_key)
+			*(uint64_t *)attr->raw_key = mr->mr.key;
+		if (attr->key_size)
+			*attr->key_size = sizeof(uint64_t);
+		break;
+
+	case FI_REFRESH:
+	case FI_ENABLE:
+		/* Nothing to do here */
+		break;
+
+	default:
+		return -FI_ENOSYS;
+	}
+
+	return 0;
+}
+
+static struct fi_ops psmx3_fi_ops = {
+	.size = sizeof(struct fi_ops),
+	.close = psmx3_mr_close,
+	.bind = psmx3_mr_bind,
+	.control = psmx3_mr_control,
+	.ops_open = fi_no_ops_open,
+};
+
+static void psmx3_mr_normalize_iov(struct iovec *iov, size_t *count)
+{
+	struct iovec tmp_iov;
+	int i, j, n, new_len;
+	uintptr_t iov_end_i, iov_end_j;
+
+	n = *count;
+
+	if (!n)
+		return;
+
+	/* sort segments by base address */
+	for (i = 0; i < n - 1; i++) {
+		for (j = i + 1; j < n; j++) {
+			if (iov[i].iov_base > iov[j].iov_base) {
+				tmp_iov = iov[i];
+				iov[i] = iov[j];
+				iov[j] = tmp_iov;
+			}
+		}
+	}
+
+	/* merge overlapping segments */
+	for (i = 0; i < n - 1; i++) {
+		if (iov[i].iov_len == 0)
+			continue;
+
+		for (j = i + 1; j < n; j++) {
+			if (iov[j].iov_len == 0)
+				continue;
+
+			iov_end_i = (uintptr_t)iov[i].iov_base + iov[i].iov_len;
+			iov_end_j = (uintptr_t)iov[j].iov_base + iov[j].iov_len;
+			if (iov_end_i >= (uintptr_t)iov[j].iov_base) {
+				new_len = iov_end_j - (uintptr_t)iov[i].iov_base;
+				if (new_len > iov[i].iov_len)
+					iov[i].iov_len = new_len;
+				iov[j].iov_len = 0;
+			} else {
+				break;
+			}
+		}
+	}
+
+	/* remove empty segments */
+	for (i = 0, j = 1; i < n; i++, j++) {
+		if (iov[i].iov_len)
+			continue;
+
+		while (j < n && iov[j].iov_len == 0)
+			j++;
+
+		if (j >= n)
+			break;
+
+		iov[i] = iov[j];
+		iov[j].iov_len = 0;
+	}
+
+	*count = i;
+}
+
+DIRECT_FN
+STATIC int psmx3_mr_reg(struct fid *fid, const void *buf, size_t len,
+			uint64_t access, uint64_t offset, uint64_t requested_key,
+			uint64_t flags, struct fid_mr **mr, void *context)
+{
+	struct fid_domain *domain;
+	struct psmx3_fid_domain *domain_priv;
+	struct psmx3_fid_mr *mr_priv;
+	uint64_t key;
+	int err;
+
+	assert(fid->fclass == FI_CLASS_DOMAIN);
+
+	domain = container_of(fid, struct fid_domain, fid);
+	domain_priv = container_of(domain, struct psmx3_fid_domain,
+				   util_domain.domain_fid);
+
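+	/* The iovec array lives inline, immediately after the fixed-size
+	 * struct, hence the extra space in the allocation. */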
+	mr_priv = (struct psmx3_fid_mr *) calloc(1, sizeof(*mr_priv) + sizeof(struct iovec));
+	if (!mr_priv)
+		return -FI_ENOMEM;
+
+	err = psmx3_mr_reserve_key(domain_priv, requested_key, &key, mr_priv);
+	if (err) {
+		free(mr_priv);
+		return err;
+	}
+
+	psmx3_domain_acquire(domain_priv);
+
+	mr_priv->mr.fid.fclass = FI_CLASS_MR;
+	mr_priv->mr.fid.context = context;
+	mr_priv->mr.fid.ops = &psmx3_fi_ops;
+	mr_priv->mr.mem_desc = mr_priv;
+	mr_priv->mr.key = key;
+	mr_priv->domain = domain_priv;
+	mr_priv->access = access;
+	mr_priv->flags = flags;
+	mr_priv->iov_count = 1;
+	mr_priv->iov[0].iov_base = (void *)buf;
+	mr_priv->iov[0].iov_len = len;
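+	/* For non-BASIC MR modes remote addresses are offsets; record the
+	 * delta that translates them back to local virtual addresses. */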
+	mr_priv->offset = (domain_priv->mr_mode == FI_MR_BASIC) ? 0 :
+				((uint64_t)mr_priv->iov[0].iov_base - offset);
+
+	*mr = &mr_priv->mr;
+	return 0;
+}
+
+DIRECT_FN
+STATIC int psmx3_mr_regv(struct fid *fid,
+			 const struct iovec *iov, size_t count,
+			 uint64_t access, uint64_t offset,
+			 uint64_t requested_key, uint64_t flags,
+			 struct fid_mr **mr, void *context)
+{
+	struct fid_domain *domain;
+	struct psmx3_fid_domain *domain_priv;
+	struct psmx3_fid_mr *mr_priv;
+	int i, err;
+	uint64_t key;
+
+	assert(fid->fclass == FI_CLASS_DOMAIN);
+
+	domain = container_of(fid, struct fid_domain, fid);
+	domain_priv = container_of(domain, struct psmx3_fid_domain,
+				   util_domain.domain_fid);
+
+	assert(count);
+	assert(iov);
+
+	mr_priv = (struct psmx3_fid_mr *)
+			calloc(1, sizeof(*mr_priv) +
+				  sizeof(struct iovec) * count);
+	if (!mr_priv)
+		return -FI_ENOMEM;
+
+	err = psmx3_mr_reserve_key(domain_priv, requested_key, &key, mr_priv);
+	if (err) {
+		free(mr_priv);
+		return err;
+	}
+
+	psmx3_domain_acquire(domain_priv);
+
+	mr_priv->mr.fid.fclass = FI_CLASS_MR;
+	mr_priv->mr.fid.context = context;
+	mr_priv->mr.fid.ops = &psmx3_fi_ops;
+	mr_priv->mr.mem_desc = mr_priv;
+	mr_priv->mr.key = key;
+	mr_priv->domain = domain_priv;
+	mr_priv->access = access;
+	mr_priv->flags = flags;
+	mr_priv->iov_count = count;
+	for (i=0; i<count; i++)
+		mr_priv->iov[i] = iov[i];
+	psmx3_mr_normalize_iov(mr_priv->iov, &mr_priv->iov_count);
+	mr_priv->offset = (domain_priv->mr_mode == FI_MR_BASIC) ? 0 :
+				((uint64_t)mr_priv->iov[0].iov_base - offset);
+
+	*mr = &mr_priv->mr;
+	return 0;
+}
+
+DIRECT_FN
+STATIC int psmx3_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr,
+			uint64_t flags, struct fid_mr **mr)
+{
+	struct fid_domain *domain;
+	struct psmx3_fid_domain *domain_priv;
+	struct psmx3_fid_mr *mr_priv;
+	int i, err;
+	uint64_t key;
+
+	assert(fid->fclass == FI_CLASS_DOMAIN);
+
+	domain = container_of(fid, struct fid_domain, fid);
+	domain_priv = container_of(domain, struct psmx3_fid_domain,
+				   util_domain.domain_fid);
+
+	assert(attr);
+	assert(attr->iov_count);
+	assert(attr->mr_iov);
+
+	mr_priv = (struct psmx3_fid_mr *)
+			calloc(1, sizeof(*mr_priv) +
+				  sizeof(struct iovec) * attr->iov_count);
+	if (!mr_priv)
+		return -FI_ENOMEM;
+
+	err = psmx3_mr_reserve_key(domain_priv, attr->requested_key, &key, mr_priv);
+	if (err) {
+		free(mr_priv);
+		return err;
+	}
+
+	psmx3_domain_acquire(domain_priv);
+
+	mr_priv->mr.fid.fclass = FI_CLASS_MR;
+	mr_priv->mr.fid.context = attr->context;
+	mr_priv->mr.fid.ops = &psmx3_fi_ops;
+	mr_priv->mr.mem_desc = mr_priv;
+	mr_priv->mr.key = key;
+	mr_priv->domain = domain_priv;
+	mr_priv->access = attr->access;
+	mr_priv->flags = flags;
+	mr_priv->iov_count = attr->iov_count;
+	for (i=0; i<attr->iov_count; i++)
+		mr_priv->iov[i] = attr->mr_iov[i];
+	psmx3_mr_normalize_iov(mr_priv->iov, &mr_priv->iov_count);
+	mr_priv->offset = (domain_priv->mr_mode == FI_MR_BASIC) ? 0 :
+				((uint64_t)mr_priv->iov[0].iov_base - attr->offset);
+
+	*mr = &mr_priv->mr;
+	return 0;
+}
+
+struct fi_ops_mr psmx3_mr_ops = {
+	.size = sizeof(struct fi_ops_mr),
+	.reg = psmx3_mr_reg,
+	.regv = psmx3_mr_regv,
+	.regattr = psmx3_mr_regattr,
+};
+
diff --git a/deps/libfabric/prov/psm3/src/psmx3_msg.c b/deps/libfabric/prov/psm3/src/psmx3_msg.c
new file mode 100644
index 0000000000000000000000000000000000000000..dd1933c5651da495587d184a2a7299cefe7f1098
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psmx3_msg.c
@@ -0,0 +1,664 @@
+/*
+ * Copyright (c) 2013-2019 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psmx3.h"
+#include "psmx3_trigger.h"
+
+ssize_t psmx3_recv_generic(struct fid_ep *ep, void *buf, size_t len,
+			   void *desc, fi_addr_t src_addr, void *context,
+			   uint64_t flags)
+{
+	struct psmx3_fid_ep *ep_priv;
+	struct psmx3_fid_av *av;
+	psm2_epaddr_t psm2_epaddr;
+	psm2_mq_req_t psm2_req;
+	psm2_mq_tag_t psm2_tag, psm2_tagsel;
+	struct fi_context *fi_context;
+	int recv_flag = 0;
+	int err;
+	int enable_completion;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	if (flags & FI_TRIGGER)
+		return psmx3_trigger_queue_recv(ep, buf, len, desc, src_addr,
+						context, flags);
+
+	if ((ep_priv->caps & FI_DIRECTED_RECV) && src_addr != FI_ADDR_UNSPEC) {
+		av = ep_priv->av;
+		assert(av);
+		psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->rx, src_addr, av->type);
+	} else {
+		psm2_epaddr = 0;
+	}
+
+	PSMX3_SET_TAG(psm2_tag, 0ULL, 0, PSMX3_TYPE_MSG);
+	PSMX3_SET_MASK(psm2_tagsel, PSMX3_MATCH_NONE, PSMX3_TYPE_MASK);
+
+	enable_completion = !ep_priv->recv_selective_completion ||
+			    (flags & FI_COMPLETION);
+	if (enable_completion) {
+		assert(context);
+		fi_context = context;
+		if (flags & FI_MULTI_RECV) {
+			struct psmx3_multi_recv *req;
+
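+			/*
+			 * FI_MULTI_RECV: a single posted buffer is consumed by
+			 * multiple incoming messages; the request tracks the
+			 * current offset and the buffer is released once the
+			 * remaining space drops below min_buf_size.
+			 */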
+			req = calloc(1, sizeof(*req));
+			if (!req)
+				return -FI_ENOMEM;
+
+			req->src_addr = psm2_epaddr;
+			req->tag = psm2_tag;
+			req->tagsel = psm2_tagsel;
+			req->flag = recv_flag;
+			req->buf = buf;
+			req->len = len;
+			req->offset = 0;
+			req->min_buf_size = ep_priv->min_multi_recv;
+			req->context = fi_context; 
+			PSMX3_CTXT_TYPE(fi_context) = PSMX3_MULTI_RECV_CONTEXT;
+			PSMX3_CTXT_USER(fi_context) = req;
+			if (len > PSMX3_MAX_MSG_SIZE)
+				len = PSMX3_MAX_MSG_SIZE;
+		} else {
+			PSMX3_CTXT_TYPE(fi_context) = PSMX3_RECV_CONTEXT;
+			PSMX3_CTXT_USER(fi_context) = buf;
+		}
+		PSMX3_CTXT_EP(fi_context) = ep_priv;
+		PSMX3_CTXT_SIZE(fi_context) = len;
+	} else {
+		PSMX3_EP_GET_OP_CONTEXT(ep_priv, fi_context);
+		#if !PSMX3_USE_REQ_CONTEXT
+		PSMX3_CTXT_TYPE(fi_context) = PSMX3_NOCOMP_RECV_CONTEXT;
+		PSMX3_CTXT_EP(fi_context) = ep_priv;
+		PSMX3_CTXT_USER(fi_context) = buf;
+		PSMX3_CTXT_SIZE(fi_context) = len;
+		#endif
+	}
+
+	err = psm2_mq_irecv2(ep_priv->rx->psm2_mq, psm2_epaddr,
+			     &psm2_tag, &psm2_tagsel, recv_flag, buf, len,
+			     (void *)fi_context, &psm2_req);
+	if (OFI_UNLIKELY(err != PSM2_OK))
+		return psmx3_errno(err);
+
+	if (enable_completion) {
+		PSMX3_CTXT_REQ(fi_context) = psm2_req;
+	} else {
+		#if PSMX3_USE_REQ_CONTEXT
+		PSMX3_REQ_GET_OP_CONTEXT(psm2_req, fi_context);
+		PSMX3_CTXT_TYPE(fi_context) = PSMX3_NOCOMP_RECV_CONTEXT;
+		PSMX3_CTXT_EP(fi_context) = ep_priv;
+		PSMX3_CTXT_USER(fi_context) = buf;
+		PSMX3_CTXT_SIZE(fi_context) = len;
+		#endif
+	}
+
+	return 0;
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_recv(struct fid_ep *ep, void *buf, size_t len,
+			  void *desc, fi_addr_t src_addr, void *context)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	return psmx3_recv_generic(ep, buf, len, desc, src_addr, context,
+				  ep_priv->rx_flags);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_recvmsg(struct fid_ep *ep, const struct fi_msg *msg,
+			     uint64_t flags)
+{
+	void *buf;
+	size_t len;
+
+	assert(msg);
+	assert(!msg->iov_count || msg->msg_iov);
+	assert(msg->iov_count <= 1);
+
+	if (msg->iov_count) {
+		buf = msg->msg_iov[0].iov_base;
+		len = msg->msg_iov[0].iov_len;
+	} else {
+		buf = NULL;
+		len = 0;
+	}
+
+	return psmx3_recv_generic(ep, buf, len,
+				  msg->desc ? msg->desc[0] : NULL,
+				  msg->addr, msg->context, flags);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_recvv(struct fid_ep *ep, const struct iovec *iov,
+			   void **desc, size_t count, fi_addr_t src_addr,
+			   void *context)
+{
+	void *buf;
+	size_t len;
+
+	assert(!count || iov);
+	assert(count <= 1);
+
+	if (count) {
+		buf = iov[0].iov_base;
+		len = iov[0].iov_len;
+	} else {
+		buf = NULL;
+		len = 0;
+	}
+
+	return psmx3_recv(ep, buf, len, desc ? desc[0] : NULL,
+			  src_addr, context);
+}
+
+ssize_t psmx3_send_generic(struct fid_ep *ep, const void *buf, size_t len,
+			   void *desc, fi_addr_t dest_addr, void *context,
+			   uint64_t flags, uint64_t data)
+{
+	struct psmx3_fid_ep *ep_priv;
+	struct psmx3_fid_av *av;
+	psm2_epaddr_t psm2_epaddr;
+	psm2_mq_req_t psm2_req;
+	psm2_mq_tag_t psm2_tag;
+	struct fi_context * fi_context;
+	int send_flag = 0;
+	int err;
+	int no_completion = 0;
+	struct psmx3_cq_event *event;
+	int have_data = (flags & FI_REMOTE_CQ_DATA) > 0;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	if (flags & FI_TRIGGER)
+		return psmx3_trigger_queue_send(ep, buf, len, desc, dest_addr,
+						context, flags, data);
+
+	av = ep_priv->av;
+	assert(av);
+	psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type);
+
+	if (have_data)
+		PSMX3_SET_TAG(psm2_tag, 0, data, PSMX3_TYPE_MSG | PSMX3_IMM_BIT);
+	else
+		PSMX3_SET_TAG(psm2_tag, 0, ep_priv->sep_id, PSMX3_TYPE_MSG);
+
+	if ((flags & PSMX3_NO_COMPLETION) ||
+	    (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)))
+		no_completion = 1;
+
+	if (flags & FI_INJECT) {
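+		/*
+		 * FI_INJECT: use the blocking psm2 send so the source buffer
+		 * is reusable on return; completions are generated inline
+		 * rather than by the progress engine.
+		 */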
+		if (len > psmx3_env.inject_size)
+			return -FI_EMSGSIZE;
+
+		err = psm2_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr,
+				    send_flag, &psm2_tag, buf, len);
+
+		if (err != PSM2_OK)
+			return psmx3_errno(err);
+
+		if (ep_priv->send_cntr)
+			psmx3_cntr_inc(ep_priv->send_cntr, 0);
+
+		if (ep_priv->send_cq && !no_completion) {
+			event = psmx3_cq_create_event(
+					ep_priv->send_cq,
+					context, (void *)buf, flags, len,
+					(uint64_t) data,
+					0 /* tag */,
+					0 /* olen */,
+					0 /* err */);
+
+			if (event)
+				psmx3_cq_enqueue_event(ep_priv->send_cq, event);
+			else
+				return -FI_ENOMEM;
+		}
+
+		return 0;
+	}
+
+	if (no_completion) {
+		fi_context = &ep_priv->nocomp_send_context;
+	} else {
+		assert(context);
+		fi_context = context;
+		PSMX3_CTXT_TYPE(fi_context) = PSMX3_SEND_CONTEXT;
+		PSMX3_CTXT_USER(fi_context) = (void *)buf;
+		PSMX3_CTXT_EP(fi_context) = ep_priv;
+	}
+
+	err = psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr,
+			     send_flag, &psm2_tag, buf, len,
+			     (void *)fi_context, &psm2_req);
+
+	if (err != PSM2_OK)
+		return psmx3_errno(err);
+
+	if (fi_context == context)
+		PSMX3_CTXT_REQ(fi_context) = psm2_req;
+
+	return 0;
+}
+
+ssize_t psmx3_sendv_generic(struct fid_ep *ep, const struct iovec *iov,
+			    void **desc, size_t count, fi_addr_t dest_addr,
+			    void *context, uint64_t flags, uint64_t data)
+{
+	struct psmx3_fid_ep *ep_priv;
+	struct psmx3_fid_av *av;
+	psm2_epaddr_t psm2_epaddr;
+	psm2_mq_req_t psm2_req;
+	psm2_mq_tag_t psm2_tag;
+	uint32_t msg_flags;
+	struct fi_context * fi_context;
+	int send_flag = 0;
+	int err;
+	int no_completion = 0;
+	struct psmx3_cq_event *event;
+	size_t real_count;
+	size_t len, total_len;
+	char *p;
+	uint32_t *q;
+	int i, j=0;
+	struct psmx3_sendv_request *req;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	if (flags & FI_TRIGGER)
+		return psmx3_trigger_queue_sendv(ep, iov, desc, count,
+						 dest_addr, context, flags,
+						 data);
+
+	total_len = 0;
+	real_count = 0;
+	for (i=0; i<count; i++) {
+		if (iov[i].iov_len) {
+			total_len += iov[i].iov_len;
+			real_count++;
+			j = i;
+		}
+	}
+
+	if (real_count == 1)
+		return psmx3_send_generic(ep, iov[j].iov_base, iov[j].iov_len,
+					  desc ? desc[j] : NULL, dest_addr,
+					  context, flags, data);
+
+	req = malloc(sizeof(*req));
+	if (!req)
+		return -FI_ENOMEM;
+
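+	/*
+	 * Two send protocols: small totals are packed into one contiguous
+	 * buffer (PACK); larger ones send a header listing the segment
+	 * lengths, followed by one tagged message per non-empty segment
+	 * (MULTI).
+	 */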
+	if (total_len <= PSMX3_IOV_BUF_SIZE) {
+		req->iov_protocol = PSMX3_IOV_PROTO_PACK;
+		p = req->buf;
+		for (i=0; i<count; i++) {
+			if (iov[i].iov_len) {
+				memcpy(p, iov[i].iov_base, iov[i].iov_len);
+				p += iov[i].iov_len;
+			}
+		}
+
+		msg_flags = PSMX3_TYPE_MSG;
+		len = total_len;
+	} else {
+		req->iov_protocol = PSMX3_IOV_PROTO_MULTI;
+		req->iov_done = 0;
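+		/* seq_num cycles through [1, PSMX3_IOV_MAX_SEQ_NUM]; 0 is
+		 * never used ('%' binds tighter than '+'). */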
+		req->iov_info.seq_num = (++ep_priv->iov_seq_num) %
+					PSMX3_IOV_MAX_SEQ_NUM + 1;
+		req->iov_info.count = (uint32_t)real_count;
+		req->iov_info.total_len = (uint32_t)total_len;
+
+		q = req->iov_info.len;
+		for (i=0; i<count; i++) {
+			if (iov[i].iov_len)
+				*q++ = (uint32_t)iov[i].iov_len;
+		}
+
+		msg_flags = PSMX3_TYPE_MSG | PSMX3_IOV_BIT;
+		len = (3 + real_count) * sizeof(uint32_t);
+	}
+
+	av = ep_priv->av;
+	assert(av);
+	psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type);
+
+	if (flags & FI_REMOTE_CQ_DATA) {
+		msg_flags |= PSMX3_IMM_BIT;
+		PSMX3_SET_TAG(psm2_tag, 0ULL, data, msg_flags);
+	} else {
+		PSMX3_SET_TAG(psm2_tag, 0ULL, ep_priv->sep_id, msg_flags);
+	}
+
+	if ((flags & PSMX3_NO_COMPLETION) ||
+	    (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)))
+		no_completion = 1;
+
+	if (flags & FI_INJECT) {
+		if (len > psmx3_env.inject_size) {
+			free(req);
+			return -FI_EMSGSIZE;
+		}
+
+		err = psm2_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr,
+				    send_flag, &psm2_tag, req->buf, len);
+
+		free(req);
+
+		if (err != PSM2_OK)
+			return psmx3_errno(err);
+
+		if (ep_priv->send_cntr)
+			psmx3_cntr_inc(ep_priv->send_cntr, 0);
+
+		if (ep_priv->send_cq && !no_completion) {
+			event = psmx3_cq_create_event(
+					ep_priv->send_cq,
+					context, NULL, flags, len,
+					(uint64_t) data,
+					0 /* tag */,
+					0 /* olen */,
+					0 /* err */);
+
+			if (event)
+				psmx3_cq_enqueue_event(ep_priv->send_cq, event);
+			else
+				return -FI_ENOMEM;
+		}
+
+		return 0;
+	}
+
+	req->no_completion = no_completion;
+	req->user_context = context;
+	req->comp_flag = FI_MSG;
+
+	fi_context = &req->fi_context;
+	PSMX3_CTXT_TYPE(fi_context) = PSMX3_SENDV_CONTEXT;
+	PSMX3_CTXT_USER(fi_context) = req;
+	PSMX3_CTXT_EP(fi_context) = ep_priv;
+
+	err = psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr,
+			     send_flag, &psm2_tag, req->buf, len,
+			     (void *)fi_context, &psm2_req);
+
+	if (err != PSM2_OK) {
+		free(req);
+		return psmx3_errno(err);
+	}
+
+	PSMX3_CTXT_REQ(fi_context) = psm2_req;
+
+	if (req->iov_protocol == PSMX3_IOV_PROTO_MULTI) {
+		fi_context = &req->fi_context_iov;
+		PSMX3_CTXT_TYPE(fi_context) = PSMX3_IOV_SEND_CONTEXT;
+		PSMX3_CTXT_USER(fi_context) = req;
+		PSMX3_CTXT_EP(fi_context) = ep_priv;
+		PSMX3_SET_TAG(psm2_tag, req->iov_info.seq_num, 0, PSMX3_TYPE_IOV_PAYLOAD);
+		for (i=0; i<count; i++) {
+			if (iov[i].iov_len) {
+				err = psm2_mq_isend2(ep_priv->tx->psm2_mq,
+						     psm2_epaddr, send_flag, &psm2_tag,
+						     iov[i].iov_base, iov[i].iov_len,
+						     (void *)fi_context, &psm2_req);
+				if (err != PSM2_OK)
+					return psmx3_errno(err);
+			}
+		}
+	}
+
+	return 0;
+}
+
+int psmx3_handle_sendv_req(struct psmx3_fid_ep *ep,
+			   PSMX3_STATUS_TYPE *status,
+			   int multi_recv)
+{
+	psm2_mq_req_t psm2_req;
+	psm2_mq_tag_t psm2_tag, psm2_tagsel;
+	struct psmx3_sendv_reply *rep;
+	struct psmx3_multi_recv *recv_req;
+	struct fi_context *fi_context;
+	struct fi_context *recv_context;
+	int i, err;
+	uint8_t *recv_buf;
+	size_t recv_len, len;
+
+	if (PSMX3_STATUS_ERROR(status) != PSM2_OK)
+		return psmx3_errno(PSMX3_STATUS_ERROR(status));
+
+	rep = malloc(sizeof(*rep));
+	if (!rep) {
+		PSMX3_STATUS_ERROR(status) = PSM2_NO_MEMORY;
+		return -FI_ENOMEM;
+	}
+
+	recv_context = PSMX3_STATUS_CONTEXT(status);
+	if (multi_recv) {
+		recv_req = PSMX3_CTXT_USER(recv_context);
+		recv_buf = recv_req->buf + recv_req->offset;
+		recv_len = recv_req->len - recv_req->offset;
+		rep->multi_recv = 1;
+	} else {
+		recv_buf = PSMX3_CTXT_USER(recv_context);
+		recv_len = PSMX3_CTXT_SIZE(recv_context);
+		rep->multi_recv = 0;
+	}
+
+	/* assert(PSMX3_STATUS_RCVLEN(status) <= PSMX3_IOV_BUF_SIZE); */
+
+	memcpy(&rep->iov_info, recv_buf, PSMX3_STATUS_RCVLEN(status));
+
+	rep->user_context = PSMX3_STATUS_CONTEXT(status);
+	rep->tag = PSMX3_STATUS_TAG(status);
+	rep->buf = recv_buf;
+	rep->no_completion = 0;
+	rep->iov_done = 0;
+	rep->bytes_received = 0;
+	rep->msg_length = 0;
+	rep->error_code = PSM2_OK;
+
+	fi_context = &rep->fi_context;
+	PSMX3_CTXT_TYPE(fi_context) = PSMX3_IOV_RECV_CONTEXT;
+	PSMX3_CTXT_USER(fi_context) = rep;
+	PSMX3_CTXT_EP(fi_context) = ep;
+
+	rep->comp_flag = PSMX3_IS_MSG(PSMX3_GET_FLAGS(rep->tag)) ? FI_MSG : FI_TAGGED;
+	if (PSMX3_HAS_IMM(PSMX3_GET_FLAGS(rep->tag)))
+		rep->comp_flag |= FI_REMOTE_CQ_DATA;
+
+	/* IOV payload uses a sequence number in place of a tag. */
+	PSMX3_SET_TAG(psm2_tag, rep->iov_info.seq_num, 0, PSMX3_TYPE_IOV_PAYLOAD);
+	PSMX3_SET_MASK(psm2_tagsel, PSMX3_MATCH_ALL, PSMX3_TYPE_MASK);
+
+	for (i=0; i<rep->iov_info.count; i++) {
+		if (recv_len) {
+			len = MIN(recv_len, rep->iov_info.len[i]);
+			err = psm2_mq_irecv2(ep->rx->psm2_mq,
+					     PSMX3_STATUS_PEER(status),
+					     &psm2_tag, &psm2_tagsel,
+					     0/*flag*/, recv_buf, len,
+					     (void *)fi_context, &psm2_req);
+			if (err) {
+				PSMX3_STATUS_ERROR(status) = err;
+				return psmx3_errno(err);
+			}
+			recv_buf += len;
+			recv_len -= len;
+		} else {
+			/* recv buffer full, post empty recvs */
+			err = psm2_mq_irecv2(ep->rx->psm2_mq,
+					     PSMX3_STATUS_PEER(status),
+					     &psm2_tag, &psm2_tagsel,
+					     0/*flag*/, NULL, 0,
+					     (void *)fi_context, &psm2_req);
+			if (err) {
+				PSMX3_STATUS_ERROR(status) = err;
+				return psmx3_errno(err);
+			}
+		}
+	}
+
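+	/* Remaining space below the threshold: flag the multi-recv buffer
+	 * as released. */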
+	if (multi_recv && recv_len < recv_req->min_buf_size)
+		rep->comp_flag |= FI_MULTI_RECV;
+
+	return 0;
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_send(struct fid_ep *ep, const void *buf, size_t len,
+			  void *desc, fi_addr_t dest_addr, void *context)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	return psmx3_send_generic(ep, buf, len, desc, dest_addr, context,
+				  ep_priv->tx_flags, 0);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_sendmsg(struct fid_ep *ep, const struct fi_msg *msg,
+			     uint64_t flags)
+{
+	void *buf;
+	size_t len;
+
+	assert(msg);
+	assert(!msg->iov_count || msg->msg_iov);
+	assert(msg->iov_count <= PSMX3_IOV_MAX_COUNT);
+
+	if (msg->iov_count > 1) {
+		return psmx3_sendv_generic(ep, msg->msg_iov, msg->desc,
+					   msg->iov_count, msg->addr,
+					   msg->context, flags,
+					   msg->data);
+	} else if (msg->iov_count) {
+		buf = msg->msg_iov[0].iov_base;
+		len = msg->msg_iov[0].iov_len;
+	} else {
+		buf = NULL;
+		len = 0;
+	}
+
+	return psmx3_send_generic(ep, buf, len,
+				  msg->desc ? msg->desc[0] : NULL,
+				  msg->addr, msg->context, flags,
+				  msg->data);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_sendv(struct fid_ep *ep, const struct iovec *iov,
+			   void **desc, size_t count, fi_addr_t dest_addr,
+			   void *context)
+{
+	void *buf;
+	size_t len;
+
+	assert(!count || iov);
+	assert(count <= PSMX3_IOV_MAX_COUNT);
+
+	if (count > 1) {
+		struct psmx3_fid_ep *ep_priv;
+		ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+		return psmx3_sendv_generic(ep, iov, desc, count, dest_addr,
+					   context, ep_priv->tx_flags, 0);
+	} else if (count) {
+		buf = iov[0].iov_base;
+		len = iov[0].iov_len;
+	} else {
+		buf = NULL;
+		len = 0;
+	}
+
+	return psmx3_send(ep, buf, len, desc ? desc[0] : NULL,
+			  dest_addr, context);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_inject(struct fid_ep *ep, const void *buf, size_t len,
+			    fi_addr_t dest_addr)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	return psmx3_send_generic(ep, buf, len, NULL, dest_addr, NULL,
+				  ep_priv->tx_flags | FI_INJECT | PSMX3_NO_COMPLETION, 
+				  0);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_senddata(struct fid_ep *ep, const void *buf, size_t len,
+			      void *desc, uint64_t data, fi_addr_t dest_addr,
+			      void *context)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	return psmx3_send_generic(ep, buf, len, desc, dest_addr, context,
+				  ep_priv->tx_flags | FI_REMOTE_CQ_DATA, data);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_injectdata(struct fid_ep *ep, const void *buf, size_t len,
+				uint64_t data, fi_addr_t dest_addr)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	return psmx3_send_generic(ep, buf, len, NULL, dest_addr, NULL,
+				  ep_priv->tx_flags | FI_INJECT | PSMX3_NO_COMPLETION |
+					FI_REMOTE_CQ_DATA,
+				  data);
+}
+
+struct fi_ops_msg psmx3_msg_ops = {
+	.size = sizeof(struct fi_ops_msg),
+	.recv = psmx3_recv,
+	.recvv = psmx3_recvv,
+	.recvmsg = psmx3_recvmsg,
+	.send = psmx3_send,
+	.sendv = psmx3_sendv,
+	.sendmsg = psmx3_sendmsg,
+	.inject = psmx3_inject,
+	.senddata = psmx3_senddata,
+	.injectdata = psmx3_injectdata,
+};
+
diff --git a/deps/libfabric/prov/psm3/src/psmx3_rma.c b/deps/libfabric/prov/psm3/src/psmx3_rma.c
new file mode 100644
index 0000000000000000000000000000000000000000..d7f2c5d273b2443ff7219d3b2b6e2e817255ff69
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psmx3_rma.c
@@ -0,0 +1,1454 @@
+/*
+ * Copyright (c) 2013-2019 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psmx3.h"
+#include "psmx3_trigger.h"
+
+static inline void psmx3_iov_copy(struct iovec *iov, size_t count,
+				  size_t offset, const void *src,
+				  size_t len)
+{
+	int i;
+	size_t copy_len;
+
+	for (i=0; i<count && len; i++) {
+		if (offset >= iov[i].iov_len) {
+			offset -= iov[i].iov_len;
+			continue;
+		}
+
+		copy_len = iov[i].iov_len - offset;
+		if (copy_len > len)
+			copy_len = len;
+
+		memcpy((uint8_t *)iov[i].iov_base + offset, src, copy_len);
+
+		src = (const uint8_t *)src + copy_len;
+		len -= copy_len;
+
+		if (offset)
+			offset = 0;
+	}
+}
+
+/* RMA protocol:
+ *
+ * Write REQ:
+ *	args[0].u32w0	cmd, flag
+ *	args[0].u32w1	len
+ *	args[1].u64	req
+ *	args[2].u64	addr
+ *	args[3].u64	key
+ *	args[4].u64	data (optional)
+ *
+ * Write REP:
+ *	args[0].u32w0	cmd, flag
+ *	args[0].u32w1	error
+ *	args[1].u64	req
+ *
+ * Read REQ:
+ *	args[0].u32w0	cmd, flag
+ *	args[0].u32w1	len
+ *	args[1].u64	req
+ *	args[2].u64	addr
+ *	args[3].u64	key
+ *	args[4].u64	offset / unused for long protocol
+ *
+ * Read REP:
+ *	args[0].u32w0	cmd, flag
+ *	args[0].u32w1	error
+ *	args[1].u64	req
+ *	args[2].u64	offset
+ */
+
+int psmx3_am_rma_handler(psm2_am_token_t token, psm2_amarg_t *args,
+		int nargs, void *src, uint32_t len,
+		void *hctx)
+{
+	psm2_amarg_t rep_args[8];
+	uint8_t *rma_addr;
+	ssize_t rma_len;
+	uint64_t key;
+	int err = 0;
+	int op_error = 0;
+	int cmd, eom, has_data;
+	struct psmx3_am_request *req;
+	struct psmx3_cq_event *event;
+	uint64_t offset;
+	struct psmx3_fid_mr *mr;
+	psm2_epaddr_t epaddr;
+	struct psmx3_trx_ctxt *rx;
+
+	psm2_mq_req_t psm2_req;
+	psm2_mq_tag_t psm2_tag, psm2_tagsel;
+
+	psm2_am_get_source(token, &epaddr);
+	cmd = PSMX3_AM_GET_OP(args[0].u32w0);
+	eom = args[0].u32w0 & PSMX3_AM_EOM;
+	has_data = args[0].u32w0 & PSMX3_AM_DATA;
+
+	switch (cmd) {
+	case PSMX3_AM_REQ_WRITE:
+		rx = (struct psmx3_trx_ctxt *)hctx;
+		rma_len = args[0].u32w1;
+		rma_addr = (uint8_t *)(uintptr_t)args[2].u64;
+		key = args[3].u64;
+		mr = psmx3_mr_get(rx->domain, key);
+		op_error = mr ?
+			psmx3_mr_validate(mr, (uint64_t)rma_addr, len, FI_REMOTE_WRITE) :
+			-FI_EINVAL;
+		if (!op_error) {
+			rma_addr += mr->offset;
+			memcpy(rma_addr, src, len);
+			if (eom) {
+				if (rx->ep->recv_cq && has_data) {
+					/* TODO: report the addr/len of the whole write */
+					event = psmx3_cq_create_event(
+							rx->ep->recv_cq,
+							0, /* context */
+							rma_addr,
+							FI_REMOTE_WRITE | FI_RMA | FI_REMOTE_CQ_DATA,
+							rma_len,
+							args[4].u64,
+							0, /* tag */
+							0, /* olen */
+							0);
+
+					if (event)
+						psmx3_cq_enqueue_event(rx->ep->recv_cq, event);
+					else
+						err = -FI_ENOMEM;
+				}
+
+				if (rx->ep->caps & FI_RMA_EVENT) {
+					if (rx->ep->remote_write_cntr)
+						psmx3_cntr_inc(rx->ep->remote_write_cntr, 0);
+
+					if (mr->cntr && mr->cntr != rx->ep->remote_write_cntr)
+						psmx3_cntr_inc(mr->cntr, 0);
+				}
+			}
+		}
+		if (eom || op_error) {
+			rep_args[0].u32w0 = PSMX3_AM_REP_WRITE | eom;
+			rep_args[0].u32w1 = op_error;
+			rep_args[1].u64 = args[1].u64;
+			err = psm2_am_reply_short(token, PSMX3_AM_RMA_HANDLER,
+						  rep_args, 2, NULL, 0, 0,
+						  NULL, NULL );
+		}
+		break;
+
+	case PSMX3_AM_REQ_WRITE_LONG:
+		rx = (struct psmx3_trx_ctxt *)hctx;
+		rma_len = args[0].u32w1;
+		rma_addr = (uint8_t *)(uintptr_t)args[2].u64;
+		key = args[3].u64;
+		mr = psmx3_mr_get(rx->domain, key);
+		op_error = mr ?
+			psmx3_mr_validate(mr, (uint64_t)rma_addr, rma_len, FI_REMOTE_WRITE) :
+			-FI_EINVAL;
+		if (op_error) {
+			rep_args[0].u32w0 = PSMX3_AM_REP_WRITE | eom;
+			rep_args[0].u32w1 = op_error;
+			rep_args[1].u64 = args[1].u64;
+			err = psm2_am_reply_short(token, PSMX3_AM_RMA_HANDLER,
+						  rep_args, 2, NULL, 0, 0,
+						  NULL, NULL );
+			break;
+		}
+
+		rma_addr += mr->offset;
+
+		req = psmx3_am_request_alloc(rx);
+		if (!req) {
+			err = -FI_ENOMEM;
+		} else {
+			req->ep = rx->ep;
+			req->op = args[0].u32w0;
+			req->write.addr = (uint64_t)rma_addr;
+			req->write.len = rma_len;
+			req->write.key = key;
+			req->write.context = (void *)args[1].u64;
+			req->write.peer_addr = (void *)epaddr;
+			req->write.data = has_data ? args[4].u64 : 0;
+			req->cq_flags = FI_REMOTE_WRITE | FI_RMA |
+					(has_data ? FI_REMOTE_CQ_DATA : 0);
+			PSMX3_CTXT_TYPE(&req->fi_context) = PSMX3_REMOTE_WRITE_CONTEXT;
+			PSMX3_CTXT_USER(&req->fi_context) = mr;
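+			/*
+			 * Long write: the payload arrives as a tagged message,
+			 * so post a matching receive straight into the target
+			 * buffer, keyed by the request identifier supplied by
+			 * the initiator.
+			 */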
+			PSMX3_SET_TAG(psm2_tag, (uint64_t)req->write.context, 0,
+					PSMX3_RMA_TYPE_WRITE);
+			PSMX3_SET_MASK(psm2_tagsel, PSMX3_MATCH_ALL, PSMX3_RMA_TYPE_MASK);
+			op_error = psm2_mq_fp_msg(rx->psm2_ep, rx->psm2_mq,
+						 (psm2_epaddr_t)epaddr,
+						 &psm2_tag, &psm2_tagsel, 0,
+						 (void *)rma_addr, rma_len,
+						 (void *)&req->fi_context, PSM2_MQ_IRECV_FP, &psm2_req);
+			if (op_error) {
+				rep_args[0].u32w0 = PSMX3_AM_REP_WRITE | eom;
+				rep_args[0].u32w1 = op_error;
+				rep_args[1].u64 = args[1].u64;
+				err = psm2_am_reply_short(token, PSMX3_AM_RMA_HANDLER,
+							  rep_args, 2, NULL, 0, 0,
+							  NULL, NULL );
+				psmx3_am_request_free(rx, req);
+				break;
+			}
+		}
+		break;
+
+	case PSMX3_AM_REQ_READ:
+		rx = (struct psmx3_trx_ctxt *)hctx;
+		rma_len = args[0].u32w1;
+		rma_addr = (uint8_t *)(uintptr_t)args[2].u64;
+		key = args[3].u64;
+		offset = args[4].u64;
+		mr = psmx3_mr_get(rx->domain, key);
+		op_error = mr ?
+			psmx3_mr_validate(mr, (uint64_t)rma_addr, rma_len, FI_REMOTE_READ) :
+			-FI_EINVAL;
+		if (!op_error) {
+			rma_addr += mr->offset;
+		} else {
+			rma_addr = NULL;
+			rma_len = 0;
+		}
+
+		rep_args[0].u32w0 = PSMX3_AM_REP_READ | eom;
+		rep_args[0].u32w1 = op_error;
+		rep_args[1].u64 = args[1].u64;
+		rep_args[2].u64 = offset;
+		err = psm2_am_reply_short(token, PSMX3_AM_RMA_HANDLER,
+				rep_args, 3, rma_addr, rma_len, 0,
+				NULL, NULL );
+
+		if (eom && !op_error) {
+			if (rx->ep->caps & FI_RMA_EVENT) {
+				if (rx->ep->remote_read_cntr)
+					psmx3_cntr_inc(rx->ep->remote_read_cntr, 0);
+			}
+		}
+		break;
+
+	case PSMX3_AM_REQ_READ_LONG:
+		rx = (struct psmx3_trx_ctxt *)hctx;
+		rma_len = args[0].u32w1;
+		rma_addr = (uint8_t *)(uintptr_t)args[2].u64;
+		key = args[3].u64;
+		mr = psmx3_mr_get(rx->domain, key);
+		op_error = mr ?
+			psmx3_mr_validate(mr, (uint64_t)rma_addr, rma_len, FI_REMOTE_READ) :
+			-FI_EINVAL;
+		if (op_error) {
+			rep_args[0].u32w0 = PSMX3_AM_REP_READ | eom;
+			rep_args[0].u32w1 = op_error;
+			rep_args[1].u64 = args[1].u64;
+			rep_args[2].u64 = 0;
+			err = psm2_am_reply_short(token, PSMX3_AM_RMA_HANDLER,
+					rep_args, 3, NULL, 0, 0,
+					NULL, NULL );
+			break;
+		}
+
+		rma_addr += mr->offset;
+
+		req = psmx3_am_request_alloc(rx);
+		if (!req) {
+			err = -FI_ENOMEM;
+		} else {
+			req->ep = rx->ep;
+			req->op = args[0].u32w0;
+			req->read.addr = (uint64_t)rma_addr;
+			req->read.len = rma_len;
+			req->read.key = key;
+			req->read.context = (void *)args[1].u64;
+			req->read.peer_addr = (void *)epaddr;
+			PSMX3_CTXT_TYPE(&req->fi_context) = PSMX3_REMOTE_READ_CONTEXT;
+			PSMX3_CTXT_USER(&req->fi_context) = mr;
+			PSMX3_SET_TAG(psm2_tag, (uint64_t)req->read.context, 0,
+				      PSMX3_RMA_TYPE_READ);
+			op_error = psm2_mq_fp_msg(rx->psm2_ep, rx->psm2_mq,
+						  (psm2_epaddr_t)req->read.peer_addr,
+						  &psm2_tag, 0, 0,
+						  (void *)req->read.addr, req->read.len,
+						  (void *)&req->fi_context,
+						  PSM2_MQ_ISEND_FP, &psm2_req);
+			if (op_error) {
+				rep_args[0].u32w0 = PSMX3_AM_REP_READ | eom;
+				rep_args[0].u32w1 = op_error;
+				rep_args[1].u64 = args[1].u64;
+				rep_args[2].u64 = 0;
+				err = psm2_am_reply_short(token, PSMX3_AM_RMA_HANDLER,
+						rep_args, 3, NULL, 0, 0,
+						NULL, NULL );
+				psmx3_am_request_free(rx, req);
+				break;
+			}
+		}
+		break;
+
+	case PSMX3_AM_REP_WRITE:
+		req = (struct psmx3_am_request *)(uintptr_t)args[1].u64;
+		assert(req->op == PSMX3_AM_REQ_WRITE);
+		op_error = (int)args[0].u32w1;
+		if (!req->error)
+			req->error = op_error;
+		if (eom) {
+			if (req->ep->send_cq && (!req->no_event || req->error)) {
+				event = psmx3_cq_create_event(
+						req->ep->send_cq,
+						req->write.context,
+						req->write.buf,
+						req->cq_flags,
+						req->write.len,
+						0, /* data */
+						0, /* tag */
+						0, /* olen */
+						req->error);
+				if (event)
+					psmx3_cq_enqueue_event(req->ep->send_cq, event);
+				else
+					err = -FI_ENOMEM;
+			}
+
+			if (req->ep->write_cntr)
+				psmx3_cntr_inc(req->ep->write_cntr, req->error);
+
+			free(req->tmpbuf);
+			psmx3_am_request_free(req->ep->tx, req);
+		}
+		break;
+
+	case PSMX3_AM_REP_READ:
+		req = (struct psmx3_am_request *)(uintptr_t)args[1].u64;
+		assert(req->op == PSMX3_AM_REQ_READ || req->op == PSMX3_AM_REQ_READV);
+		op_error = (int)args[0].u32w1;
+		offset = args[2].u64;
+		if (!req->error)
+			req->error = op_error;
+		if (!op_error) {
+			if (req->op == PSMX3_AM_REQ_READ)
+				memcpy(req->read.buf + offset, src, len);
+			else
+				psmx3_iov_copy(req->iov, req->read.iov_count, offset, src, len);
+
+			req->read.len_read += len;
+		}
+		if (eom || req->read.len == req->read.len_read) {
+			if (!eom)
+				FI_INFO(&psmx3_prov, FI_LOG_EP_DATA,
+					"readv: short protocol finishes after long protocol.\n");
+			if (req->ep->send_cq && (!req->no_event || req->error)) {
+				event = psmx3_cq_create_event(
+						req->ep->send_cq,
+						req->read.context,
+						req->read.buf,
+						req->cq_flags,
+						req->read.len_read,
+						0, /* data */
+						0, /* tag */
+						req->read.len - req->read.len_read,
+						req->error);
+				if (event)
+					psmx3_cq_enqueue_event(req->ep->send_cq, event);
+				else
+					err = -FI_ENOMEM;
+			}
+
+			if (req->ep->read_cntr)
+				psmx3_cntr_inc(req->ep->read_cntr, req->error);
+ 
+			free(req->tmpbuf);
+			psmx3_am_request_free(req->ep->tx, req);
+		}
+		break;
+
+	default:
+		err = -FI_EINVAL;
+	}
+	return err;
+}
+
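+/*
+ * Loopback path: when source and target are the same endpoint, complete
+ * the RMA with plain memcpy after MR validation and generate completion
+ * events and counter updates inline.
+ */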
+static ssize_t psmx3_rma_self(int am_cmd,
+			      struct psmx3_fid_ep *ep,
+			      void *buf, size_t len, void *desc,
+			      uint64_t addr, uint64_t key,
+			      void *context, uint64_t flags, uint64_t data)
+{
+	struct psmx3_fid_mr *mr;
+	struct psmx3_cq_event *event;
+	struct psmx3_fid_cntr *cntr = NULL;
+	struct psmx3_fid_cntr *mr_cntr = NULL;
+	struct psmx3_fid_cq *cq = NULL;
+	int no_event;
+	int err = 0;
+	int op_error = 0;
+	int access;
+	uint8_t *dst, *src;
+	uint64_t cq_flags;
+	struct iovec *iov = buf;
+	size_t iov_count = len;
+	int i;
+
+	switch (am_cmd) {
+	case PSMX3_AM_REQ_WRITE:
+		access = FI_REMOTE_WRITE;
+		cq_flags = FI_WRITE | FI_RMA;
+		break;
+	case PSMX3_AM_REQ_WRITEV:
+		access = FI_REMOTE_WRITE;
+		cq_flags = FI_WRITE | FI_RMA;
+		len = 0;
+		for (i=0; i<iov_count; i++)
+			len += iov[i].iov_len;
+		break;
+	case PSMX3_AM_REQ_READ:
+		access = FI_REMOTE_READ;
+		cq_flags = FI_READ | FI_RMA;
+		break;
+	case PSMX3_AM_REQ_READV:
+		access = FI_REMOTE_READ;
+		cq_flags = FI_READ | FI_RMA;
+		len = 0;
+		for (i=0; i<iov_count; i++)
+			len += iov[i].iov_len;
+		break;
+	default:
+		return -FI_EINVAL;
+	}
+
+	mr = psmx3_mr_get(ep->domain, key);
+	op_error = mr ? psmx3_mr_validate(mr, addr, len, access) : -FI_EINVAL;
+
+	if (!op_error) {
+		addr += mr->offset;
+		switch (am_cmd) {
+		case PSMX3_AM_REQ_WRITE:
+			cntr = ep->remote_write_cntr;
+			if (flags & FI_REMOTE_CQ_DATA)
+				cq = ep->recv_cq;
+			if (mr->cntr != cntr)
+				mr_cntr = mr->cntr;
+			memcpy((void *)addr, buf, len);
+			break;
+
+		case PSMX3_AM_REQ_WRITEV:
+			cntr = ep->remote_write_cntr;
+			if (flags & FI_REMOTE_CQ_DATA)
+				cq = ep->recv_cq;
+			if (mr->cntr != cntr)
+				mr_cntr = mr->cntr;
+			dst = (void *)addr;
+			for (i=0; i<iov_count; i++)
+				if (iov[i].iov_len) {
+					memcpy(dst, iov[i].iov_base, iov[i].iov_len);
+					dst += iov[i].iov_len;
+				}
+			break;
+
+		case PSMX3_AM_REQ_READ:
+			cntr = ep->remote_read_cntr;
+			memcpy(buf, (void *)addr, len);
+			break;
+
+		case PSMX3_AM_REQ_READV:
+			cntr = ep->remote_read_cntr;
+			src = (void *)addr;
+			for (i=0; i<iov_count; i++)
+				if (iov[i].iov_len) {
+					memcpy(iov[i].iov_base, src, iov[i].iov_len);
+					src += iov[i].iov_len;
+				}
+			break;
+		}
+
+		if (cq) {
+			event = psmx3_cq_create_event(
+					cq,
+					0, /* context */
+					(void *)addr,
+					FI_REMOTE_WRITE | FI_RMA | FI_REMOTE_CQ_DATA,
+					len,
+					data,
+					0, /* tag */
+					0, /* olen */
+					0 /* err */);
+
+			if (event)
+				psmx3_cq_enqueue_event(cq, event);
+			else
+				err = -FI_ENOMEM;
+		}
+
+		if (ep->caps & FI_RMA_EVENT) {
+			if (cntr)
+				psmx3_cntr_inc(cntr, 0);
+
+			if (mr_cntr)
+				psmx3_cntr_inc(mr_cntr, 0);
+		}
+	}
+
+	no_event = (flags & PSMX3_NO_COMPLETION) ||
+		   (ep->send_selective_completion && !(flags & FI_COMPLETION));
+
+	if (ep->send_cq && (!no_event || op_error)) {
+		event = psmx3_cq_create_event(
+				ep->send_cq,
+				context,
+				(void *)buf,
+				cq_flags,
+				len,
+				0, /* data */
+				0, /* tag */
+				0, /* olen */
+				op_error);
+		if (event)
+			psmx3_cq_enqueue_event(ep->send_cq, event);
+		else
+			err = -FI_ENOMEM;
+	}
+
+	switch (am_cmd) {
+	case PSMX3_AM_REQ_WRITE:
+	case PSMX3_AM_REQ_WRITEV:
+		if (ep->write_cntr)
+			psmx3_cntr_inc(ep->write_cntr, op_error);
+		break;
+
+	case PSMX3_AM_REQ_READ:
+	case PSMX3_AM_REQ_READV:
+		if (ep->read_cntr)
+			psmx3_cntr_inc(ep->read_cntr, op_error);
+		break;
+	}
+
+	return err;
+}
+
+void psmx3_am_ack_rma(struct psmx3_am_request *req)
+{
+	psm2_amarg_t args[8];
+	int err;
+
+	if ((req->op & PSMX3_AM_OP_MASK) != PSMX3_AM_REQ_WRITE_LONG)
+		return;
+
+	args[0].u32w0 = PSMX3_AM_REP_WRITE | PSMX3_AM_EOM;
+	args[0].u32w1 = req->error;
+	args[1].u64 = (uint64_t)(uintptr_t)req->write.context;
+
+	err = psm2_am_request_short(req->write.peer_addr,
+				    PSMX3_AM_RMA_HANDLER, args, 2, NULL, 0,
+				    PSM2_AM_FLAG_NOREPLY, NULL, NULL);
+	if (err)
+		FI_INFO(&psmx3_prov, FI_LOG_EP_DATA,
+			"failed to send am_ack: err %d.\n", err);
+}
+
+ssize_t psmx3_read_generic(struct fid_ep *ep, void *buf, size_t len,
+			   void *desc, fi_addr_t src_addr,
+			   uint64_t addr, uint64_t key, void *context,
+			   uint64_t flags)
+{
+	struct psmx3_fid_ep *ep_priv;
+	struct psmx3_fid_av *av;
+	struct psmx3_am_request *req;
+	psm2_amarg_t args[8];
+	int chunk_size;
+	size_t offset = 0;
+	psm2_epaddr_t psm2_epaddr;
+	psm2_epid_t psm2_epid;
+	psm2_mq_req_t psm2_req;
+	psm2_mq_tag_t psm2_tag, psm2_tagsel;
+	size_t req_refcnt = 0;
+	int err;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	if (flags & FI_TRIGGER)
+		return psmx3_trigger_queue_read(ep, buf, len, desc, src_addr,
+						addr, key, context, flags);
+
+	assert(buf);
+
+	av = ep_priv->av;
+	assert(av);
+
+	psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, src_addr, av->type);
+	psm2_epaddr_to_epid(psm2_epaddr, &psm2_epid);
+
+	if (psm2_epid == ep_priv->tx->psm2_epid)
+		return psmx3_rma_self(PSMX3_AM_REQ_READ, ep_priv,
+				      buf, len, desc, addr, key,
+				      context, flags, 0);
+
+	req = psmx3_am_request_alloc(ep_priv->tx);
+	if (!req)
+		return -FI_ENOMEM;
+
+	req->op = PSMX3_AM_REQ_READ;
+	req->read.buf = buf;
+	req->read.len = len;
+	req->read.addr = addr;	/* needed? */
+	req->read.key = key; 	/* needed? */
+	req->read.context = context;
+	req->ep = ep_priv;
+	req->cq_flags = FI_READ | FI_RMA;
+	PSMX3_CTXT_TYPE(&req->fi_context) = PSMX3_READ_CONTEXT;
+	PSMX3_CTXT_USER(&req->fi_context) = context;
+	PSMX3_CTXT_EP(&req->fi_context) = ep_priv;
+
+	if (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)) {
+		PSMX3_CTXT_TYPE(&req->fi_context) = PSMX3_NOCOMP_READ_CONTEXT;
+		req->no_event = 1;
+	}
+
+	chunk_size = ep_priv->tx->psm2_am_param.max_reply_short;
+
+	args[0].u32w0 = 0;
+
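+	/* Long protocol: pre-post a tagged receive matched by the request
+	 * pointer carried in the tag, then ask the target to send the data
+	 * back as a tagged MQ message.
+	 */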
+	if (psmx3_env.tagged_rma && len > chunk_size) {
+		PSMX3_SET_TAG(psm2_tag, (uint64_t)req, 0, PSMX3_RMA_TYPE_READ);
+		PSMX3_SET_MASK(psm2_tagsel, PSMX3_MATCH_ALL, PSMX3_RMA_TYPE_MASK);
+		err = psm2_mq_irecv2(ep_priv->tx->psm2_mq, psm2_epaddr,
+				     &psm2_tag, &psm2_tagsel, 0, buf, len,
+				     (void *)&req->fi_context, &psm2_req);
+		if (err) {
+			psmx3_am_request_free(ep_priv->tx, req);
+			return psmx3_errno(err);
+		}
+
+		PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_READ_LONG);
+		args[0].u32w1 = len;
+		args[1].u64 = (uint64_t)req;
+		args[2].u64 = addr;
+		args[3].u64 = key;
+		err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER,
+					    args, 4, NULL, 0, 0, NULL, NULL);
+		if (err) {
+			/* req in use, don't free */
+			return psmx3_errno(err);
+		}
+		psmx3_am_poll(ep_priv->tx);
+		return 0;
+	}
+
+	PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_READ);
+	args[1].u64 = (uint64_t)(uintptr_t)req;
+	args[3].u64 = key;
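+	/* Short protocol: fetch the remote region in chunk_size pieces; each
+	 * AM request carries the chunk's remote address and its offset into
+	 * the local destination buffer.
+	 */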
+	while (len > chunk_size) {
+		args[0].u32w1 = chunk_size;
+		args[2].u64 = addr;
+		args[4].u64 = offset;
+		err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER,
+					    args, 5, NULL, 0, 0, NULL, NULL);
+		if (err) {
+			if (!req_refcnt)
+				psmx3_am_request_free(ep_priv->tx, req);
+			return psmx3_errno(err);
+		}
+		psmx3_am_poll(ep_priv->tx);
+		addr += chunk_size;
+		len -= chunk_size;
+		offset += chunk_size;
+		req_refcnt++;
+	}
+
+	PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_EOM);
+	args[0].u32w1 = len;
+	args[2].u64 = addr;
+	args[4].u64 = offset;
+	err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER,
+				    args, 5, NULL, 0, 0, NULL, NULL);
+	if (err) {
+		if (!req_refcnt)
+			psmx3_am_request_free(ep_priv->tx, req);
+		return psmx3_errno(err);
+	}
+	psmx3_am_poll(ep_priv->tx);
+	return 0;
+}
+
+ssize_t psmx3_readv_generic(struct fid_ep *ep, const struct iovec *iov,
+			    void *desc, size_t count, fi_addr_t src_addr,
+			    uint64_t addr, uint64_t key, void *context,
+			    uint64_t flags)
+{
+	struct psmx3_fid_ep *ep_priv;
+	struct psmx3_fid_av *av;
+	struct psmx3_am_request *req;
+	psm2_amarg_t args[8];
+	int chunk_size;
+	size_t offset = 0;
+	psm2_epaddr_t psm2_epaddr;
+	psm2_epid_t psm2_epid;
+	psm2_mq_req_t psm2_req;
+	psm2_mq_tag_t psm2_tag, psm2_tagsel;
+	size_t total_len, long_len = 0, short_len;
+	void *long_buf = NULL;
+	int i;
+	size_t req_refcnt = 0;
+	int err;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	if (flags & FI_TRIGGER)
+		return psmx3_trigger_queue_readv(ep, iov, desc, count, src_addr,
+						 addr, key, context, flags);
+
+	av = ep_priv->av;
+	assert(av);
+
+	psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, src_addr, av->type);
+	psm2_epaddr_to_epid(psm2_epaddr, &psm2_epid);
+
+	if (psm2_epid == ep_priv->tx->psm2_epid)
+		return psmx3_rma_self(PSMX3_AM_REQ_READV, ep_priv,
+				      (void *)iov, count, desc, addr,
+				      key, context, flags, 0);
+
+	total_len = 0;
+	for (i=0; i<count; i++)
+		total_len += iov[i].iov_len;
+
+	req = psmx3_am_request_alloc(ep_priv->tx);
+	if (!req)
+		return -FI_ENOMEM;
+
+	req->tmpbuf = malloc(count * sizeof(struct iovec));
+	if (!req->tmpbuf) {
+		psmx3_am_request_free(ep_priv->tx, req);
+		return -FI_ENOMEM;
+	}
+
+	req->iov = req->tmpbuf;
+	memcpy(req->iov, iov, count * sizeof(struct iovec));
+
+	req->op = PSMX3_AM_REQ_READV;
+	req->read.iov_count = count;
+	req->read.len = total_len;
+	req->read.addr = addr;	/* needed? */
+	req->read.key = key; 	/* needed? */
+	req->read.context = context;
+	req->ep = ep_priv;
+	req->cq_flags = FI_READ | FI_RMA;
+	PSMX3_CTXT_TYPE(&req->fi_context) = PSMX3_READ_CONTEXT;
+	PSMX3_CTXT_USER(&req->fi_context) = context;
+	PSMX3_CTXT_EP(&req->fi_context) = ep_priv;
+
+	if (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)) {
+		PSMX3_CTXT_TYPE(&req->fi_context) = PSMX3_NOCOMP_READ_CONTEXT;
+		req->no_event = 1;
+	}
+
+	chunk_size = ep_priv->tx->psm2_am_param.max_reply_short;
+
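+	/* The last non-empty segment is eligible for the long protocol only
+	 * if it exceeds chunk_size; scan backwards to find it.
+	 */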
+	if (psmx3_env.tagged_rma) {
+		for (i=count-1; i>=0; i--) {
+			if (iov[i].iov_len > chunk_size) {
+				long_buf = iov[i].iov_base;
+				long_len = iov[i].iov_len;
+				break;
+			} else if (iov[i].iov_len) {
+				break;
+			}
+		}
+	}
+
+	short_len = total_len - long_len;
+
+	/* Use short protocol for all but the last segment (long_len) */
+	args[0].u32w0 = 0;
+	PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_READ);
+	args[1].u64 = (uint64_t)(uintptr_t)req;
+	args[3].u64 = key;
+	while (short_len > chunk_size) {
+		args[0].u32w1 = chunk_size;
+		args[2].u64 = addr;
+		args[4].u64 = offset;
+		err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER,
+					    args, 5, NULL, 0, 0, NULL, NULL);
+		if (err) {
+			if (!req_refcnt) {
+				free(req->tmpbuf);
+				psmx3_am_request_free(ep_priv->tx, req);
+			}
+			return psmx3_errno(err);
+		}
+		psmx3_am_poll(ep_priv->tx);
+		addr += chunk_size;
+		short_len -= chunk_size;
+		offset += chunk_size;
+		req_refcnt++;
+	}
+
+	if (short_len) {
+		if (!long_len)
+			PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_EOM);
+		args[0].u32w1 = short_len;
+		args[2].u64 = addr;
+		args[4].u64 = offset;
+		err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER,
+					    args, 5, NULL, 0, 0, NULL, NULL);
+		if (err) {
+			if (!req_refcnt) {
+				free(req->tmpbuf);
+				psmx3_am_request_free(ep_priv->tx, req);
+			}
+			return psmx3_errno(err);
+		}
+		psmx3_am_poll(ep_priv->tx);
+		req_refcnt++;
+	}
+
+	/* Use the long protocol for the last segment */
+	if (long_len) {
+		PSMX3_SET_TAG(psm2_tag, (uint64_t)req, 0, PSMX3_RMA_TYPE_READ);
+		PSMX3_SET_MASK(psm2_tagsel, PSMX3_MATCH_ALL, PSMX3_RMA_TYPE_MASK);
+		err = psm2_mq_irecv2(ep_priv->tx->psm2_mq, psm2_epaddr,
+				     &psm2_tag, &psm2_tagsel, 0,
+				     long_buf, long_len,
+				     (void *)&req->fi_context, &psm2_req);
+		if (err) {
+			if (!req_refcnt) {
+				free(req->tmpbuf);
+				psmx3_am_request_free(ep_priv->tx, req);
+			}
+			return psmx3_errno(err);
+		}
+
+		PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_READ_LONG);
+		args[0].u32w1 = long_len;
+		args[1].u64 = (uint64_t)req;
+		args[2].u64 = addr + short_len;
+		args[3].u64 = key;
+		err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER,
+					    args, 4, NULL, 0, 0, NULL, NULL);
+		if (err) {
+			/* req in use, don't free */
+			return psmx3_errno(err);
+		}
+		psmx3_am_poll(ep_priv->tx);
+	}
+
+	return 0;
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_read(struct fid_ep *ep, void *buf, size_t len,
+			  void *desc, fi_addr_t src_addr,
+			  uint64_t addr, uint64_t key, void *context)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	return psmx3_read_generic(ep, buf, len, desc, src_addr, addr,
+				  key, context, ep_priv->tx_flags);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_readmsg(struct fid_ep *ep,
+			     const struct fi_msg_rma *msg,
+			     uint64_t flags)
+{
+	assert(msg);
+	assert(msg->iov_count);
+	assert(msg->msg_iov);
+	assert(msg->rma_iov);
+	assert(msg->rma_iov_count == 1);
+
+	if (msg->iov_count > 1)
+		return psmx3_readv_generic(ep, msg->msg_iov,
+					   msg->desc ? msg->desc[0] : NULL,
+					   msg->iov_count, msg->addr,
+					   msg->rma_iov[0].addr,
+					   msg->rma_iov[0].key,
+					   msg->context, flags);
+
+	return psmx3_read_generic(ep, msg->msg_iov[0].iov_base,
+				  msg->msg_iov[0].iov_len,
+				  msg->desc ? msg->desc[0] : NULL,
+				  msg->addr, msg->rma_iov[0].addr,
+				  msg->rma_iov[0].key, msg->context,
+				  flags);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_readv(struct fid_ep *ep, const struct iovec *iov,
+			   void **desc, size_t count, fi_addr_t src_addr,
+			   uint64_t addr, uint64_t key, void *context)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	assert(iov);
+	assert(count);
+
+	if (count > 1)
+		return psmx3_readv_generic(ep, iov, desc ? desc[0] : NULL,
+					   count, src_addr, addr, key,
+					   context, ep_priv->tx_flags);
+
+	return psmx3_read(ep, iov->iov_base, iov->iov_len,
+			  desc ? desc[0] : NULL, src_addr, addr, key, context);
+}
+
+ssize_t psmx3_write_generic(struct fid_ep *ep, const void *buf, size_t len,
+			    void *desc, fi_addr_t dest_addr,
+			    uint64_t addr, uint64_t key, void *context,
+			    uint64_t flags, uint64_t data)
+{
+	struct psmx3_fid_ep *ep_priv;
+	struct psmx3_fid_av *av;
+	struct psmx3_am_request *req;
+	psm2_amarg_t args[8];
+	int nargs;
+	int am_flags = PSM2_AM_FLAG_ASYNC;
+	int chunk_size;
+	psm2_epaddr_t psm2_epaddr;
+	psm2_epid_t psm2_epid;
+	psm2_mq_req_t psm2_req;
+	psm2_mq_tag_t psm2_tag;
+	void *psm2_context;
+	int no_event;
+	size_t req_refcnt = 0;
+	int err;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	if (flags & FI_TRIGGER)
+		return psmx3_trigger_queue_write(ep, buf, len, desc, dest_addr,
+						 addr, key, context, flags,
+						 data);
+
+	assert(buf);
+
+	av = ep_priv->av;
+	assert(av);
+
+	psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type);
+	psm2_epaddr_to_epid(psm2_epaddr, &psm2_epid);
+
+	if (psm2_epid == ep_priv->tx->psm2_epid)
+		return psmx3_rma_self(PSMX3_AM_REQ_WRITE, ep_priv,
+				      (void *)buf, len, desc, addr,
+				      key, context, flags, data);
+
+	no_event = (flags & PSMX3_NO_COMPLETION) ||
+		   (ep_priv->send_selective_completion && !(flags & FI_COMPLETION));
+
+	req = psmx3_am_request_alloc(ep_priv->tx);
+	if (!req)
+		return -FI_ENOMEM;
+
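+	/* FI_INJECT: the caller may reuse the buffer as soon as this call
+	 * returns, so stage the data in a temporary buffer owned by the
+	 * request.
+	 */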
+	if (flags & FI_INJECT) {
+		if (len > psmx3_env.inject_size) {
+			psmx3_am_request_free(ep_priv->tx, req);
+			return -FI_EMSGSIZE;
+		}
+
+		req->tmpbuf = malloc(len);
+		if (!req->tmpbuf) {
+			psmx3_am_request_free(ep_priv->tx, req);
+			return -FI_ENOMEM;
+		}
+
+		memcpy(req->tmpbuf, (void *)buf, len);
+		buf = req->tmpbuf;
+	} else {
+		PSMX3_CTXT_TYPE(&req->fi_context) = no_event ?
+						    PSMX3_NOCOMP_WRITE_CONTEXT :
+						    PSMX3_WRITE_CONTEXT;
+	}
+
+	req->no_event = no_event;
+	req->op = PSMX3_AM_REQ_WRITE;
+	req->write.buf = (void *)buf;
+	req->write.len = len;
+	req->write.addr = addr;	/* needed? */
+	req->write.key = key; 	/* needed? */
+	req->write.context = context;
+	req->ep = ep_priv;
+	req->cq_flags = FI_WRITE | FI_RMA;
+	PSMX3_CTXT_USER(&req->fi_context) = context;
+	PSMX3_CTXT_EP(&req->fi_context) = ep_priv;
+
+	chunk_size = ep_priv->tx->psm2_am_param.max_request_short;
+
+	args[0].u32w0 = 0;
+
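+	/* Long protocol: send the AM header first, then push the payload as
+	 * a tagged MQ message matched via the request pointer in the tag.
+	 */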
+	if (psmx3_env.tagged_rma && len > chunk_size) {
+		PSMX3_SET_TAG(psm2_tag, (uint64_t)req, 0, PSMX3_RMA_TYPE_WRITE);
+		PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_WRITE_LONG);
+		args[0].u32w1 = len;
+		args[1].u64 = (uint64_t)req;
+		args[2].u64 = addr;
+		args[3].u64 = key;
+		nargs = 4;
+		if (flags & FI_REMOTE_CQ_DATA) {
+			PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_DATA);
+			args[4].u64 = data;
+			nargs++;
+		}
+
+		if (flags & FI_DELIVERY_COMPLETE) {
+			args[0].u32w0 |= PSMX3_AM_FORCE_ACK;
+			psm2_context = NULL;
+		} else {
+			psm2_context = (void *)&req->fi_context;
+		}
+
+		err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER,
+					    args, nargs, NULL, 0, am_flags,
+					    NULL, NULL);
+		if (err) {
+			free(req->tmpbuf);
+			psmx3_am_request_free(ep_priv->tx, req);
+			return psmx3_errno(err);
+		}
+		psmx3_am_poll(ep_priv->tx);
+
+		err = psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr, 0,
+				     &psm2_tag, buf, len, psm2_context, &psm2_req);
+		if (err) {
+			/* req in use, don't free */
+			return psmx3_errno(err);
+		}
+		return 0;
+	}
+
+	PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_WRITE);
+	nargs = 4;
+	while (len > chunk_size) {
+		args[0].u32w1 = chunk_size;
+		args[1].u64 = (uint64_t)(uintptr_t)req;
+		args[2].u64 = addr;
+		args[3].u64 = key;
+		err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER,
+					    args, nargs, (void *)buf,
+					    chunk_size, am_flags, NULL, NULL);
+		if (err) {
+			if (!req_refcnt) {
+				free(req->tmpbuf);
+				psmx3_am_request_free(ep_priv->tx, req);
+			}
+			return psmx3_errno(err);
+		}
+		psmx3_am_poll(ep_priv->tx);
+		buf = (const uint8_t *)buf + chunk_size;
+		addr += chunk_size;
+		len -= chunk_size;
+		req_refcnt++;
+	}
+
+	args[0].u32w1 = len;
+	args[1].u64 = (uint64_t)(uintptr_t)req;
+	args[2].u64 = addr;
+	args[3].u64 = key;
+	if (flags & FI_REMOTE_CQ_DATA) {
+		PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_DATA | PSMX3_AM_EOM);
+		args[4].u64 = data;
+		nargs++;
+	} else {
+		PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_EOM);
+	}
+	err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER,
+				    args, nargs, (void *)buf, len, am_flags,
+				    NULL, NULL);
+	if (err) {
+		if (!req_refcnt) {
+			free(req->tmpbuf);
+			psmx3_am_request_free(ep_priv->tx, req);
+		}
+		return psmx3_errno(err);
+	}
+	psmx3_am_poll(ep_priv->tx);
+	return 0;
+}
+
+ssize_t psmx3_writev_generic(struct fid_ep *ep, const struct iovec *iov,
+			     void **desc, size_t count, fi_addr_t dest_addr,
+			     uint64_t addr, uint64_t key, void *context,
+			     uint64_t flags, uint64_t data)
+{
+	struct psmx3_fid_ep *ep_priv;
+	struct psmx3_fid_av *av;
+	struct psmx3_am_request *req;
+	psm2_amarg_t args[8];
+	int nargs;
+	int am_flags = PSM2_AM_FLAG_ASYNC;
+	int chunk_size;
+	psm2_epaddr_t psm2_epaddr;
+	psm2_epid_t psm2_epid;
+	psm2_mq_req_t psm2_req;
+	psm2_mq_tag_t psm2_tag;
+	void *psm2_context;
+	int no_event;
+	size_t total_len, len, len_sent;
+	uint8_t *buf, *p;
+	int i;
+	size_t req_refcnt = 0;
+	int err;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	if (flags & FI_TRIGGER)
+		return psmx3_trigger_queue_writev(ep, iov, desc, count,
+						  dest_addr, addr, key,
+						  context, flags, data);
+
+	av = ep_priv->av;
+	assert(av);
+
+	psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type);
+	psm2_epaddr_to_epid(psm2_epaddr, &psm2_epid);
+
+	if (psm2_epid == ep_priv->tx->psm2_epid)
+		return psmx3_rma_self(PSMX3_AM_REQ_WRITEV, ep_priv,
+				      (void *)iov, count, desc, addr,
+				      key, context, flags, data);
+
+	no_event = (flags & PSMX3_NO_COMPLETION) ||
+		   (ep_priv->send_selective_completion && !(flags & FI_COMPLETION));
+
+	total_len = 0;
+	for (i=0; i<count; i++)
+		total_len += iov[i].iov_len;
+
+	chunk_size = ep_priv->tx->psm2_am_param.max_request_short;
+
+	req = psmx3_am_request_alloc(ep_priv->tx);
+	if (!req)
+		return -FI_ENOMEM;
+
+	/* Case 1: fits into an AM message; pack and send */
+	if (total_len <= chunk_size) {
+		req->tmpbuf = malloc(total_len);
+		if (!req->tmpbuf) {
+			psmx3_am_request_free(ep_priv->tx, req);
+			return -FI_ENOMEM;
+		}
+
+		p = req->tmpbuf;
+		for (i=0; i<count; i++) {
+			if (iov[i].iov_len) {
+				memcpy(p, iov[i].iov_base, iov[i].iov_len);
+				p += iov[i].iov_len;
+			}
+		}
+		buf = req->tmpbuf;
+		len = total_len;
+
+		req->no_event = no_event;
+		req->op = PSMX3_AM_REQ_WRITE;
+		req->write.buf = (void *)buf;
+		req->write.len = len;
+		req->write.addr = addr;	/* needed? */
+		req->write.key = key; 	/* needed? */
+		req->write.context = context;
+		req->ep = ep_priv;
+		req->cq_flags = FI_WRITE | FI_RMA;
+		PSMX3_CTXT_USER(&req->fi_context) = context;
+		PSMX3_CTXT_EP(&req->fi_context) = ep_priv;
+
+		args[0].u32w0 = 0;
+		PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_WRITE);
+		args[0].u32w1 = len;
+		args[1].u64 = (uint64_t)(uintptr_t)req;
+		args[2].u64 = addr;
+		args[3].u64 = key;
+		nargs = 4;
+		if (flags & FI_REMOTE_CQ_DATA) {
+			PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_DATA | PSMX3_AM_EOM);
+			args[4].u64 = data;
+			nargs++;
+		} else {
+			PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_EOM);
+		}
+		err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER,
+					    args, nargs, (void *)buf, len,
+					    am_flags, NULL, NULL);
+		if (err) {
+			free(req->tmpbuf);
+			psmx3_am_request_free(ep_priv->tx, req);
+			return psmx3_errno(err);
+		}
+		psmx3_am_poll(ep_priv->tx);
+		return 0;
+	}
+
+	if (flags & FI_INJECT) {
+		psmx3_am_request_free(ep_priv->tx, req);
+		return -FI_EMSGSIZE;
+	}
+
+	PSMX3_CTXT_TYPE(&req->fi_context) = no_event ?
+					    PSMX3_NOCOMP_WRITE_CONTEXT :
+					    PSMX3_WRITE_CONTEXT;
+
+	req->no_event = no_event;
+	req->op = PSMX3_AM_REQ_WRITE;
+	req->write.buf = (void *)iov[0].iov_base;
+	req->write.len = total_len;
+	req->write.addr = addr;	/* needed? */
+	req->write.key = key; 	/* needed? */
+	req->write.context = context;
+	req->ep = ep_priv;
+	req->cq_flags = FI_WRITE | FI_RMA;
+	PSMX3_CTXT_USER(&req->fi_context) = context;
+	PSMX3_CTXT_EP(&req->fi_context) = ep_priv;
+
+	/* Case 2: send iov in sequence */
+	args[0].u32w0 = 0;
+
+	len_sent = 0;
+	for (i=0; i<count; i++) {
+		if (!iov[i].iov_len)
+			continue;
+
+		/* Case 2.1: use long protocol for the last segment if it is large */
+		if (psmx3_env.tagged_rma && iov[i].iov_len > chunk_size &&
+		    len_sent + iov[i].iov_len == total_len) {
+			PSMX3_SET_TAG(psm2_tag, (uint64_t)req, 0, PSMX3_RMA_TYPE_WRITE);
+			PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_WRITE_LONG);
+			args[0].u32w1 = iov[i].iov_len;
+			args[1].u64 = (uint64_t)req;
+			args[2].u64 = addr;
+			args[3].u64 = key;
+			nargs = 4;
+			if (flags & FI_REMOTE_CQ_DATA) {
+				PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_DATA);
+				args[4].u64 = data;
+				nargs++;
+			}
+
+			if (flags & FI_DELIVERY_COMPLETE) {
+				args[0].u32w0 |= PSMX3_AM_FORCE_ACK;
+				psm2_context = NULL;
+			} else {
+				psm2_context = (void *)&req->fi_context;
+			}
+
+			err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER,
+						    args, nargs, NULL, 0, am_flags,
+						    NULL, NULL);
+			if (err) {
+				if (!req_refcnt)
+					psmx3_am_request_free(ep_priv->tx, req);
+				return psmx3_errno(err);
+			}
+			psmx3_am_poll(ep_priv->tx);
+
+			err = psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr,
+					     0, &psm2_tag, iov[i].iov_base,
+					     iov[i].iov_len, psm2_context,
+					     &psm2_req);
+			if (err) {
+				/* req in use, don't free */
+				return psmx3_errno(err);
+			}
+			return 0;
+		}
+
+		/* Case 2.2: use the short protocol for all other segments */
+		PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_WRITE);
+		nargs = 4;
+		buf = iov[i].iov_base;
+		len = iov[i].iov_len;
+		while (len > chunk_size) {
+			args[0].u32w1 = chunk_size;
+			args[1].u64 = (uint64_t)(uintptr_t)req;
+			args[2].u64 = addr;
+			args[3].u64 = key;
+			err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER,
+						    args, nargs, (void *)buf,
+						    chunk_size, am_flags,
+						    NULL, NULL);
+			if (err) {
+				if (!req_refcnt)
+					psmx3_am_request_free(ep_priv->tx, req);
+				return psmx3_errno(err);
+			}
+			psmx3_am_poll(ep_priv->tx);
+			buf += chunk_size;
+			addr += chunk_size;
+			len -= chunk_size;
+			len_sent += chunk_size;
+			req_refcnt++;
+		}
+
+		args[0].u32w1 = len;
+		args[1].u64 = (uint64_t)(uintptr_t)req;
+		args[2].u64 = addr;
+		args[3].u64 = key;
+		if (len_sent + len == total_len) {
+			if (flags & FI_REMOTE_CQ_DATA) {
+				PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_DATA | PSMX3_AM_EOM);
+				args[4].u64 = data;
+				nargs++;
+			} else {
+				PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_EOM);
+			}
+		}
+		err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER,
+					    args, nargs, (void *)buf, len,
+					    am_flags, NULL, NULL);
+		if (err) {
+			if (!req_refcnt)
+				psmx3_am_request_free(ep_priv->tx, req);
+			return psmx3_errno(err);
+		}
+		psmx3_am_poll(ep_priv->tx);
+
+		addr += len;
+		len_sent += len;
+		req_refcnt++;
+	}
+
+	return 0;
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_write(struct fid_ep *ep, const void *buf, size_t len,
+			   void *desc, fi_addr_t dest_addr, uint64_t addr,
+			   uint64_t key, void *context)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	return psmx3_write_generic(ep, buf, len, desc, dest_addr, addr,
+				   key, context, ep_priv->tx_flags, 0);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_writemsg(struct fid_ep *ep,
+			      const struct fi_msg_rma *msg,
+			      uint64_t flags)
+{
+	assert(msg);
+	assert(msg->msg_iov);
+	assert(msg->iov_count);
+	assert(msg->rma_iov);
+	assert(msg->rma_iov_count == 1);
+
+	if (msg->iov_count > 1)
+		return psmx3_writev_generic(ep, msg->msg_iov, msg->desc,
+					    msg->iov_count, msg->addr,
+					    msg->rma_iov[0].addr,
+					    msg->rma_iov[0].key,
+					    msg->context, flags, msg->data);
+
+	return psmx3_write_generic(ep, msg->msg_iov[0].iov_base,
+				   msg->msg_iov[0].iov_len,
+				   msg->desc ? msg->desc[0] : NULL, msg->addr,
+				   msg->rma_iov[0].addr, msg->rma_iov[0].key,
+				   msg->context, flags, msg->data);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_writev(struct fid_ep *ep, const struct iovec *iov,
+			    void **desc, size_t count, fi_addr_t dest_addr,
+			    uint64_t addr, uint64_t key, void *context)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	assert(iov);
+	assert(count);
+
+	if (count > 1)
+		return psmx3_writev_generic(ep, iov, desc, count, dest_addr,
+					    addr, key, context, ep_priv->tx_flags, 0);
+
+	return psmx3_write_generic(ep, iov->iov_base, iov->iov_len,
+				   desc ? desc[0] : NULL, dest_addr, addr, key,
+				   context, ep_priv->tx_flags, 0);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_inject_write(struct fid_ep *ep, const void *buf, size_t len,
+			          fi_addr_t dest_addr, uint64_t addr, uint64_t key)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	return psmx3_write_generic(ep, buf, len, NULL, dest_addr, addr, key, NULL,
+				   ep_priv->tx_flags | FI_INJECT | PSMX3_NO_COMPLETION,
+				   0);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_writedata(struct fid_ep *ep, const void *buf, size_t len,
+			       void *desc, uint64_t data, fi_addr_t dest_addr,
+			       uint64_t addr, uint64_t key, void *context)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	return psmx3_write_generic(ep, buf, len, desc, dest_addr, addr, key,
+				   context, ep_priv->tx_flags | FI_REMOTE_CQ_DATA,
+				   data);
+}
+
+DIRECT_FN
+STATIC ssize_t psmx3_inject_writedata(struct fid_ep *ep, const void *buf, size_t len,
+				      uint64_t data, fi_addr_t dest_addr, uint64_t addr,
+				      uint64_t key)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	return psmx3_write_generic(ep, buf, len, NULL, dest_addr, addr, key, NULL,
+				   ep_priv->tx_flags | FI_INJECT | PSMX3_NO_COMPLETION,
+				   data);
+}
+
+struct fi_ops_rma psmx3_rma_ops = {
+	.size = sizeof(struct fi_ops_rma),
+	.read = psmx3_read,
+	.readv = psmx3_readv,
+	.readmsg = psmx3_readmsg,
+	.write = psmx3_write,
+	.writev = psmx3_writev,
+	.writemsg = psmx3_writemsg,
+	.inject = psmx3_inject_write,
+	.writedata = psmx3_writedata,
+	.injectdata = psmx3_inject_writedata,
+};
+
diff --git a/deps/libfabric/prov/psm3/src/psmx3_tagged.c b/deps/libfabric/prov/psm3/src/psmx3_tagged.c
new file mode 100644
index 0000000000000000000000000000000000000000..47a9a78fe265972c9dc4ccbaca923375c5b13e9f
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psmx3_tagged.c
@@ -0,0 +1,1142 @@
+/*
+ * Copyright (c) 2013-2019 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psmx3.h"
+#include "psmx3_trigger.h"
+
+static ssize_t psmx3_tagged_peek_generic(struct fid_ep *ep,
+					 void *buf, size_t len,
+					 void *desc, fi_addr_t src_addr,
+					 uint64_t tag, uint64_t ignore,
+					 void *context, uint64_t flags)
+{
+	struct psmx3_fid_ep *ep_priv;
+	struct psmx3_fid_av *av;
+	struct psmx3_cq_event *event;
+	psm2_epaddr_t psm2_epaddr;
+	psm2_mq_req_t req = NULL;
+	psm2_mq_status2_t psm2_status;
+	psm2_mq_tag_t psm2_tag, psm2_tagsel;
+	uint64_t data;
+	int err;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	if ((ep_priv->caps & FI_DIRECTED_RECV) && src_addr != FI_ADDR_UNSPEC) {
+		av = ep_priv->av;
+		assert(av);
+		psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->rx, src_addr, av->type);
+	} else {
+		psm2_epaddr = 0;
+	}
+
+	PSMX3_SET_TAG(psm2_tag, tag, 0, PSMX3_TYPE_TAGGED);
+	PSMX3_SET_MASK(psm2_tagsel, ~ignore, PSMX3_TYPE_MASK);
+
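+	/* FI_CLAIM and FI_DISCARD need the matched request handle, so use
+	 * improbe2; a plain peek only needs iprobe2.
+	 */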
+	if (flags & (FI_CLAIM | FI_DISCARD))
+		err = psm2_mq_improbe2(ep_priv->rx->psm2_mq,
+				       psm2_epaddr, &psm2_tag,
+				       &psm2_tagsel, &req, &psm2_status);
+	else
+		err = psm2_mq_iprobe2(ep_priv->rx->psm2_mq,
+				      psm2_epaddr, &psm2_tag, &psm2_tagsel,
+				      &psm2_status);
+	switch (err) {
+	case PSM2_OK:
+		if (ep_priv->recv_cq) {
+			if (flags & FI_CLAIM) {
+				if (context)
+					PSMX3_CTXT_REQ((struct fi_context *)context) = req;
+			} else if (flags & FI_DISCARD) {
+				if (!psm2_mq_imrecv(ep_priv->rx->psm2_mq, 0,
+						    NULL, 0, req, &req))
+					psm2_mq_wait2(&req, NULL);
+			}
+
+			tag = PSMX3_GET_TAG64(psm2_status.msg_tag);
+			if (PSMX3_HAS_IMM(PSMX3_GET_FLAGS(psm2_status.msg_tag))) {
+				data = PSMX3_GET_CQDATA(psm2_status.msg_tag);
+				flags |= FI_REMOTE_CQ_DATA;
+			} else {
+				data = 0;
+			}
+			len = psm2_status.msg_length;
+			event = psmx3_cq_create_event(
+					ep_priv->recv_cq,
+					context,		/* op_context */
+					NULL,			/* buf */
+					flags|FI_RECV|FI_TAGGED,/* flags */
+					len,			/* len */
+					data,			/* data */
+					tag,			/* tag */
+					len,			/* olen */
+					0);			/* err */
+
+			if (!event)
+				return -FI_ENOMEM;
+
+			event->source_is_valid = 1;
+			event->source = psm2_status.msg_peer;
+			event->source_av = ep_priv->av;
+			psmx3_cq_enqueue_event(ep_priv->recv_cq, event);
+		}
+		return 0;
+
+	case PSM2_MQ_NO_COMPLETIONS:
+		if (ep_priv->recv_cq) {
+			event = psmx3_cq_create_event(
+					ep_priv->recv_cq,
+					context,		/* op_context */
+					NULL,			/* buf */
+					flags|FI_RECV|FI_TAGGED,/* flags */
+					len,			/* len */
+					0,			/* data */
+					tag,			/* tag */
+					len,			/* olen */
+					-FI_ENOMSG);		/* err */
+
+			if (!event)
+				return -FI_ENOMEM;
+
+			event->source = 0;
+			psmx3_cq_enqueue_event(ep_priv->recv_cq, event);
+		}
+		return 0;
+
+	default:
+		return psmx3_errno(err);
+	}
+}
+
+ssize_t psmx3_tagged_recv_generic(struct fid_ep *ep, void *buf,
+				  size_t len, void *desc,
+				  fi_addr_t src_addr,
+				  uint64_t tag, uint64_t ignore,
+				  void *context, uint64_t flags)
+{
+	struct psmx3_fid_ep *ep_priv;
+	struct psmx3_fid_av *av;
+	psm2_epaddr_t psm2_epaddr;
+	psm2_mq_req_t psm2_req;
+	psm2_mq_tag_t psm2_tag, psm2_tagsel;
+	struct fi_context *fi_context;
+	int err;
+	int enable_completion;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	if (flags & FI_PEEK)
+		return psmx3_tagged_peek_generic(ep, buf, len, desc,
+						 src_addr, tag, ignore,
+						 context, flags);
+
+	if (flags & FI_TRIGGER)
+		return psmx3_trigger_queue_trecv(ep, buf, len, desc,
+						 src_addr, tag, ignore,
+						 context, flags);
+
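+	/* FI_CLAIM: consume a message previously matched by a peek with
+	 * FI_CLAIM; combined with FI_DISCARD the payload is dropped.
+	 */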
+	if (flags & FI_CLAIM) {
+		assert(context);
+		if (flags & FI_DISCARD) {
+			psm2_mq_status2_t psm2_status;
+			struct psmx3_cq_event *event;
+
+			fi_context = context;
+			psm2_req = PSMX3_CTXT_REQ(fi_context);
+			err = psm2_mq_imrecv(ep_priv->rx->psm2_mq, 0,
+					     NULL, 0, context, &psm2_req);
+			if (err != PSM2_OK)
+				return psmx3_errno(err);
+
+			psm2_mq_wait2(&psm2_req, &psm2_status);
+
+			if (ep_priv->recv_cq &&
+			    (!ep_priv->recv_selective_completion || (flags & FI_COMPLETION))) {
+				tag = PSMX3_GET_TAG64(psm2_status.msg_tag);
+				event = psmx3_cq_create_event(
+						ep_priv->recv_cq,
+						context,		/* op_context */
+						NULL,			/* buf */
+						flags|FI_RECV|FI_TAGGED,/* flags */
+						0,			/* len */
+						0,			/* data */
+						tag,			/* tag */
+						0,			/* olen */
+						0);			/* err */
+
+				if (!event)
+					return -FI_ENOMEM;
+
+				event->source_is_valid = 1;
+				event->source = psm2_status.msg_peer;
+				event->source_av = ep_priv->av;
+				psmx3_cq_enqueue_event(ep_priv->recv_cq, event);
+			}
+
+			if (ep_priv->recv_cntr)
+				psmx3_cntr_inc(ep_priv->recv_cntr, 0);
+
+			return 0;
+		}
+
+		fi_context = context;
+		psm2_req = PSMX3_CTXT_REQ(fi_context);
+		PSMX3_CTXT_TYPE(fi_context) = PSMX3_TRECV_CONTEXT;
+		PSMX3_CTXT_USER(fi_context) = buf;
+		PSMX3_CTXT_EP(fi_context) = ep_priv;
+
+		err = psm2_mq_imrecv(ep_priv->rx->psm2_mq, 0,
+				     buf, len, context, &psm2_req);
+		if (err != PSM2_OK)
+			return psmx3_errno(err);
+
+		PSMX3_CTXT_REQ(fi_context) = psm2_req;
+		return 0;
+	}
+
+	enable_completion = !ep_priv->recv_selective_completion || (flags & FI_COMPLETION);
+
+	if (enable_completion) {
+		assert(context);
+		fi_context = context;
+		PSMX3_CTXT_TYPE(fi_context) = PSMX3_TRECV_CONTEXT;
+		PSMX3_CTXT_EP(fi_context) = ep_priv;
+		PSMX3_CTXT_USER(fi_context) = buf;
+		PSMX3_CTXT_SIZE(fi_context) = len;
+	} else {
+		PSMX3_EP_GET_OP_CONTEXT(ep_priv, fi_context);
+		#if !PSMX3_USE_REQ_CONTEXT
+		PSMX3_CTXT_TYPE(fi_context) = PSMX3_NOCOMP_TRECV_CONTEXT;
+		PSMX3_CTXT_EP(fi_context) = ep_priv;
+		PSMX3_CTXT_USER(fi_context) = buf;
+		PSMX3_CTXT_SIZE(fi_context) = len;
+		#endif
+	}
+
+	if ((ep_priv->caps & FI_DIRECTED_RECV) && src_addr != FI_ADDR_UNSPEC) {
+		av = ep_priv->av;
+		assert(av);
+		psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->rx, src_addr, av->type);
+	} else {
+		psm2_epaddr = 0;
+	}
+
+	PSMX3_SET_TAG(psm2_tag, tag, 0, PSMX3_TYPE_TAGGED);
+	PSMX3_SET_MASK(psm2_tagsel, ~ignore, PSMX3_TYPE_MASK);
+
+	err = psm2_mq_irecv2(ep_priv->rx->psm2_mq, psm2_epaddr,
+			     &psm2_tag, &psm2_tagsel, 0, buf, len,
+			     (void *)fi_context, &psm2_req);
+
+	if (err != PSM2_OK)
+		return psmx3_errno(err);
+
+	if (enable_completion) {
+		PSMX3_CTXT_REQ(fi_context) = psm2_req;
+	} else {
+		#if PSMX3_USE_REQ_CONTEXT
+		PSMX3_REQ_GET_OP_CONTEXT(psm2_req, fi_context);
+		PSMX3_CTXT_TYPE(fi_context) = PSMX3_NOCOMP_TRECV_CONTEXT;
+		PSMX3_CTXT_EP(fi_context) = ep_priv;
+		PSMX3_CTXT_USER(fi_context) = buf;
+		PSMX3_CTXT_SIZE(fi_context) = len;
+		#endif
+	}
+
+	return 0;
+}
+
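+/* Specialized receive path: the completion, directed-receive, and AV-type
+ * choices are compile-time constants, so dead branches drop out after
+ * inlining.
+ */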
+__attribute__((always_inline))
+static inline ssize_t
+psmx3_tagged_recv_specialized(struct fid_ep *ep, void *buf, size_t len,
+			      void *desc, fi_addr_t src_addr,
+			      uint64_t tag, uint64_t ignore,
+			      void *context,
+			      int enable_completion,
+			      int directed_receive,
+			      int av_map)
+{
+	struct psmx3_fid_ep *ep_priv;
+	psm2_epaddr_t psm2_epaddr;
+	psm2_mq_req_t psm2_req;
+	psm2_mq_tag_t psm2_tag, psm2_tagsel;
+	struct fi_context *fi_context;
+	int err;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	if (enable_completion) {
+		fi_context = context;
+		PSMX3_CTXT_TYPE(fi_context) = PSMX3_TRECV_CONTEXT;
+		PSMX3_CTXT_EP(fi_context) = ep_priv;
+		PSMX3_CTXT_USER(fi_context) = buf;
+		PSMX3_CTXT_SIZE(fi_context) = len;
+	} else {
+		PSMX3_EP_GET_OP_CONTEXT(ep_priv, fi_context);
+		#if !PSMX3_USE_REQ_CONTEXT
+		PSMX3_CTXT_TYPE(fi_context) = PSMX3_NOCOMP_TRECV_CONTEXT;
+		PSMX3_CTXT_EP(fi_context) = ep_priv;
+		PSMX3_CTXT_USER(fi_context) = buf;
+		PSMX3_CTXT_SIZE(fi_context) = len;
+		#endif
+	}
+
+	if (directed_receive && src_addr != FI_ADDR_UNSPEC) {
+		if (av_map) {
+			psm2_epaddr = (psm2_epaddr_t)src_addr;
+		} else {
+			assert(ep_priv->av);
+			psm2_epaddr = psmx3_av_translate_addr(ep_priv->av, ep_priv->rx, src_addr, FI_AV_TABLE);
+		}
+	} else {
+		psm2_epaddr = 0;
+	}
+
+	PSMX3_SET_TAG(psm2_tag, tag, 0, PSMX3_TYPE_TAGGED);
+	PSMX3_SET_MASK(psm2_tagsel, ~ignore, PSMX3_TYPE_MASK);
+
+	err = psm2_mq_irecv2(ep_priv->rx->psm2_mq, psm2_epaddr,
+			     &psm2_tag, &psm2_tagsel, 0, buf, len,
+			     (void *)fi_context, &psm2_req);
+
+	if (OFI_UNLIKELY(err != PSM2_OK))
+		return psmx3_errno(err);
+
+	if (enable_completion) {
+		PSMX3_CTXT_REQ(fi_context) = psm2_req;
+	} else {
+		#if PSMX3_USE_REQ_CONTEXT
+		PSMX3_REQ_GET_OP_CONTEXT(psm2_req, fi_context);
+		PSMX3_CTXT_TYPE(fi_context) = PSMX3_NOCOMP_TRECV_CONTEXT;
+		PSMX3_CTXT_EP(fi_context) = ep_priv;
+		PSMX3_CTXT_USER(fi_context) = buf;
+		PSMX3_CTXT_SIZE(fi_context) = len;
+		#endif
+	}
+
+	return 0;
+}
+
+/* op_flags=0, FI_SELECTIVE_COMPLETION not set, FI_DIRECTED_RECV not set, av_table */
+static ssize_t
+psmx3_tagged_recv_no_flag_undirected(struct fid_ep *ep, void *buf, size_t len,
+				     void *desc, fi_addr_t src_addr,
+				     uint64_t tag, uint64_t ignore,
+				     void *context)
+{
+	return psmx3_tagged_recv_specialized(ep, buf, len, desc, src_addr,
+					     tag, ignore, context, 1, 0, 0);
+}
+
+/* op_flags=0, FI_SELECTIVE_COMPLETION set, FI_DIRECTED_RECV not set, av_table */
+static ssize_t
+psmx3_tagged_recv_no_event_undirected(struct fid_ep *ep, void *buf, size_t len,
+				      void *desc, fi_addr_t src_addr,
+				      uint64_t tag, uint64_t ignore,
+				      void *context)
+{
+	return psmx3_tagged_recv_specialized(ep, buf, len, desc, src_addr,
+					     tag, ignore, context, 0, 0, 0);
+}
+
+/* op_flags=0, FI_SELECTIVE_COMPLETION not set, FI_DIRECTED_RECV set, av_table */
+static ssize_t
+psmx3_tagged_recv_no_flag_directed(struct fid_ep *ep, void *buf, size_t len,
+				   void *desc, fi_addr_t src_addr,
+				   uint64_t tag, uint64_t ignore,
+				   void *context)
+{
+	return psmx3_tagged_recv_specialized(ep, buf, len, desc, src_addr,
+					     tag, ignore, context, 1, 1, 0);
+}
+
+/* op_flags=0, FI_SELECTIVE_COMPLETION set, FI_DIRECTED_RECV set, av_table */
+static ssize_t
+psmx3_tagged_recv_no_event_directed(struct fid_ep *ep, void *buf, size_t len,
+				    void *desc, fi_addr_t src_addr,
+				    uint64_t tag, uint64_t ignore,
+				    void *context)
+{
+	return psmx3_tagged_recv_specialized(ep, buf, len, desc, src_addr,
+					     tag, ignore, context, 0, 1, 0);
+}
+
+/* op_flags=0, FI_SELECTIVE_COMPLETION not set, FI_DIRECTED_RECV not set, av_map */
+static ssize_t
+psmx3_tagged_recv_no_flag_undirected_av_map(struct fid_ep *ep, void *buf, size_t len,
+					    void *desc, fi_addr_t src_addr,
+					    uint64_t tag, uint64_t ignore,
+					    void *context)
+{
+	return psmx3_tagged_recv_specialized(ep, buf, len, desc, src_addr,
+					     tag, ignore, context, 1, 0, 1);
+}
+
+/* op_flags=0, FI_SELECTIVE_COMPLETION set, FI_DIRECTED_RECV not set, av_map */
+static ssize_t
+psmx3_tagged_recv_no_event_undirected_av_map(struct fid_ep *ep, void *buf, size_t len,
+					     void *desc, fi_addr_t src_addr,
+					     uint64_t tag, uint64_t ignore,
+					     void *context)
+{
+	return psmx3_tagged_recv_specialized(ep, buf, len, desc, src_addr,
+					     tag, ignore, context, 0, 0, 1);
+}
+
+/* op_flags=0, FI_SELECTIVE_COMPLETION not set, FI_DIRECTED_RECV set, av_map */
+static ssize_t
+psmx3_tagged_recv_no_flag_directed_av_map(struct fid_ep *ep, void *buf, size_t len,
+					  void *desc, fi_addr_t src_addr,
+					  uint64_t tag, uint64_t ignore,
+					  void *context)
+{
+	return psmx3_tagged_recv_specialized(ep, buf, len, desc, src_addr,
+					     tag, ignore, context, 1, 1, 1);
+}
+
+/* op_flags=0, FI_SELECTIVE_COMPLETION set, FI_DIRECTED_RECV set, av_map */
+static ssize_t
+psmx3_tagged_recv_no_event_directed_av_map(struct fid_ep *ep, void *buf, size_t len,
+					   void *desc, fi_addr_t src_addr,
+					   uint64_t tag, uint64_t ignore,
+					   void *context)
+{
+	return psmx3_tagged_recv_specialized(ep, buf, len, desc, src_addr,
+					     tag, ignore, context, 0, 1, 1);
+}
+
+static ssize_t psmx3_tagged_recv(struct fid_ep *ep, void *buf, size_t len,
+				 void *desc, fi_addr_t src_addr, uint64_t tag,
+				 uint64_t ignore, void *context)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	return psmx3_tagged_recv_generic(ep, buf, len, desc, src_addr, tag,
+					 ignore, context, ep_priv->rx_flags);
+}
+
+static ssize_t psmx3_tagged_recvmsg(struct fid_ep *ep,
+				    const struct fi_msg_tagged *msg,
+				    uint64_t flags)
+{
+	void *buf;
+	size_t len;
+
+	assert(msg);
+	assert(!msg->iov_count || msg->msg_iov);
+	assert(msg->iov_count <= 1);
+
+	if (msg->iov_count) {
+		buf = msg->msg_iov[0].iov_base;
+		len = msg->msg_iov[0].iov_len;
+	} else {
+		buf = NULL;
+		len = 0;
+	}
+
+	return psmx3_tagged_recv_generic(ep, buf, len,
+					 msg->desc ? msg->desc[0] : NULL,
+					 msg->addr, msg->tag, msg->ignore,
+					 msg->context, flags);
+}
+
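+/* Generate a recvv wrapper for each specialized recv variant; the tagged
+ * path accepts at most one iov entry.
+ */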
+#define PSMX3_TAGGED_RECVV_FUNC(suffix)					\
+static ssize_t								\
+psmx3_tagged_recvv##suffix(struct fid_ep *ep, const struct iovec *iov,	\
+			   void **desc, size_t count,			\
+			   fi_addr_t src_addr, uint64_t tag,		\
+			   uint64_t ignore, void *context)		\
+{									\
+	void *buf;							\
+	size_t len;							\
+	assert(!count || iov);						\
+	assert(count <= 1);						\
+	if (count) {							\
+		buf = iov[0].iov_base;					\
+		len = iov[0].iov_len;					\
+	} else {							\
+		buf = NULL;						\
+		len = 0;						\
+	}								\
+	return psmx3_tagged_recv##suffix(ep, buf, len,			\
+					 desc ? desc[0] : NULL,		\
+					 src_addr, tag, ignore,	\
+					 context);			\
+}
+
+PSMX3_TAGGED_RECVV_FUNC()
+PSMX3_TAGGED_RECVV_FUNC(_no_flag_directed)
+PSMX3_TAGGED_RECVV_FUNC(_no_event_directed)
+PSMX3_TAGGED_RECVV_FUNC(_no_flag_undirected)
+PSMX3_TAGGED_RECVV_FUNC(_no_event_undirected)
+PSMX3_TAGGED_RECVV_FUNC(_no_flag_directed_av_map)
+PSMX3_TAGGED_RECVV_FUNC(_no_event_directed_av_map)
+PSMX3_TAGGED_RECVV_FUNC(_no_flag_undirected_av_map)
+PSMX3_TAGGED_RECVV_FUNC(_no_event_undirected_av_map)
+
+ssize_t psmx3_tagged_send_generic(struct fid_ep *ep,
+				  const void *buf, size_t len,
+				  void *desc, fi_addr_t dest_addr,
+				  uint64_t tag, void *context,
+				  uint64_t flags, uint64_t data)
+{
+	struct psmx3_fid_ep *ep_priv;
+	struct psmx3_fid_av *av;
+	psm2_epaddr_t psm2_epaddr;
+	psm2_mq_req_t psm2_req;
+	psm2_mq_tag_t psm2_tag;
+	struct fi_context *fi_context;
+	int err;
+	int no_completion = 0;
+	struct psmx3_cq_event *event;
+	int have_data = (flags & FI_REMOTE_CQ_DATA) > 0;
+
+	assert((tag & ~PSMX3_TAG_MASK) == 0);
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	if (flags & FI_TRIGGER)
+		return psmx3_trigger_queue_tsend(ep, buf, len, desc,
+						 dest_addr, tag, context,
+						 flags, data);
+
+	av = ep_priv->av;
+	assert(av);
+	psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type);
+
+	if (have_data)
+		PSMX3_SET_TAG(psm2_tag, tag, (uint32_t)data,
+			      PSMX3_TYPE_TAGGED | PSMX3_IMM_BIT);
+	else
+		PSMX3_SET_TAG(psm2_tag, tag, (uint32_t)ep_priv->sep_id,
+			      PSMX3_TYPE_TAGGED);
+
+	if ((flags & PSMX3_NO_COMPLETION) ||
+	    (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)))
+		no_completion = 1;
+
+	if (flags & FI_INJECT) {
+		if (len > psmx3_env.inject_size)
+			return -FI_EMSGSIZE;
+
+		err = psm2_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr,
+				    0, &psm2_tag, buf, len);
+
+		if (err != PSM2_OK)
+			return psmx3_errno(err);
+
+		if (ep_priv->send_cntr)
+			psmx3_cntr_inc(ep_priv->send_cntr, 0);
+
+		if (ep_priv->send_cq && !no_completion) {
+			event = psmx3_cq_create_event(
+					ep_priv->send_cq,
+					context, (void *)buf, flags, len,
+					(uint64_t) data, tag,
+					0 /* olen */,
+					0 /* err */);
+
+			if (event)
+				psmx3_cq_enqueue_event(ep_priv->send_cq, event);
+			else
+				return -FI_ENOMEM;
+		}
+
+		return 0;
+	}
+
+	if (no_completion) {
+		fi_context = &ep_priv->nocomp_tsend_context;
+	} else {
+		assert(context);
+		fi_context = context;
+		PSMX3_CTXT_TYPE(fi_context) = PSMX3_TSEND_CONTEXT;
+		PSMX3_CTXT_USER(fi_context) = (void *)buf;
+		PSMX3_CTXT_EP(fi_context) = ep_priv;
+	}
+
+	err = psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr, 0,
+			     &psm2_tag, buf, len, (void*)fi_context,
+			     &psm2_req);
+
+	if (err != PSM2_OK)
+		return psmx3_errno(err);
+
+	if (fi_context == context)
+		PSMX3_CTXT_REQ(fi_context) = psm2_req;
+
+	return 0;
+}
+
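+/* Specialized send path: the completion, AV-type, and immediate-data
+ * choices are compile-time constants, mirroring the specialized receive
+ * path above.
+ */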
+__attribute__((always_inline))
+static inline ssize_t
+psmx3_tagged_send_specialized(struct fid_ep *ep, const void *buf,
+			      size_t len, void *desc,
+			      fi_addr_t dest_addr, uint64_t tag,
+			      void *context,
+			      int enable_completion, int av_map,
+			      int have_data, uint64_t data)
+{
+	struct psmx3_fid_ep *ep_priv;
+	psm2_epaddr_t psm2_epaddr;
+	psm2_mq_req_t psm2_req;
+	psm2_mq_tag_t psm2_tag;
+	struct fi_context *fi_context;
+	int err;
+
+	assert((tag & ~PSMX3_TAG_MASK) == 0);
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	if (av_map) {
+		psm2_epaddr = (psm2_epaddr_t)dest_addr;
+	} else {
+		assert(ep_priv->av);
+		psm2_epaddr = psmx3_av_translate_addr(ep_priv->av, ep_priv->tx, dest_addr, FI_AV_TABLE);
+	}
+
+	if (have_data)
+		PSMX3_SET_TAG(psm2_tag, tag, data, PSMX3_TYPE_TAGGED | PSMX3_IMM_BIT);
+	else
+		PSMX3_SET_TAG(psm2_tag, tag, ep_priv->sep_id, PSMX3_TYPE_TAGGED);
+
+	if (enable_completion) {
+		fi_context = context;
+		PSMX3_CTXT_TYPE(fi_context) = PSMX3_TSEND_CONTEXT;
+		PSMX3_CTXT_USER(fi_context) = (void *)buf;
+		PSMX3_CTXT_EP(fi_context) = ep_priv;
+	} else {
+		fi_context = &ep_priv->nocomp_tsend_context;
+	}
+
+	err = psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr, 0,
+			     &psm2_tag, buf, len, (void*)fi_context,
+			     &psm2_req);
+
+	if (err != PSM2_OK)
+		return psmx3_errno(err);
+
+	if (enable_completion)
+		PSMX3_CTXT_REQ(fi_context) = psm2_req;
+
+	return 0;
+}
+
+/* op_flags=0, FI_SELECTIVE_COMPLETION not set, av_table */
+static ssize_t
+psmx3_tagged_send_no_flag(struct fid_ep *ep, const void *buf, size_t len,
+			  void *desc, fi_addr_t dest_addr, uint64_t tag,
+			  void *context)
+{
+	return psmx3_tagged_send_specialized(ep, buf, len, desc, dest_addr, tag,
+					     context, 1, 0, 0, 0);
+}
+
+/* op_flags=0, FI_SELECTIVE_COMPLETION set, av_table */
+static ssize_t
+psmx3_tagged_send_no_event(struct fid_ep *ep, const void *buf, size_t len,
+			   void *desc, fi_addr_t dest_addr, uint64_t tag,
+			   void *context)
+{
+	return psmx3_tagged_send_specialized(ep, buf, len, desc, dest_addr, tag,
+					     context, 0, 0, 0, 0);
+}
+
+/* op_flags=0, FI_SELECTIVE_COMPLETION not set, av_map */
+static ssize_t
+psmx3_tagged_send_no_flag_av_map(struct fid_ep *ep, const void *buf, size_t len,
+				 void *desc, fi_addr_t dest_addr, uint64_t tag,
+				 void *context)
+{
+	return psmx3_tagged_send_specialized(ep, buf, len, desc, dest_addr, tag,
+					     context, 1, 1, 0, 0);
+}
+
+/* op_flags=0, FI_SELECTIVE_COMPLETION set, av_map */
+static ssize_t
+psmx3_tagged_send_no_event_av_map(struct fid_ep *ep, const void *buf, size_t len,
+				  void *desc, fi_addr_t dest_addr, uint64_t tag,
+				  void *context)
+{
+	return psmx3_tagged_send_specialized(ep, buf, len, desc, dest_addr, tag,
+					     context, 0, 1, 0, 0);
+}
+
+/* op_flags=0, FI_SELECTIVE_COMPLETION not set, av_table */
+static ssize_t
+psmx3_tagged_senddata_no_flag(struct fid_ep *ep, const void *buf, size_t len,
+			      void *desc, uint64_t data, fi_addr_t dest_addr,
+			      uint64_t tag, void *context)
+{
+	return psmx3_tagged_send_specialized(ep, buf, len, desc, dest_addr, tag,
+					     context, 1, 0, 1, data);
+}
+
+/* op_flags=0, FI_SELECTIVE_COMPLETION set, av_table */
+static ssize_t
+psmx3_tagged_senddata_no_event(struct fid_ep *ep, const void *buf, size_t len,
+			       void *desc, uint64_t data, fi_addr_t dest_addr,
+			       uint64_t tag, void *context)
+{
+	return psmx3_tagged_send_specialized(ep, buf, len, desc, dest_addr, tag,
+					     context, 0, 0, 1, data);
+}
+
+/* op_flags=0, FI_SELECTIVE_COMPLETION not set, av_map */
+static ssize_t
+psmx3_tagged_senddata_no_flag_av_map(struct fid_ep *ep, const void *buf, size_t len,
+				     void *desc, uint64_t data, fi_addr_t dest_addr,
+				     uint64_t tag, void *context)
+{
+	return psmx3_tagged_send_specialized(ep, buf, len, desc, dest_addr, tag,
+					     context, 1, 1, 1, data);
+}
+
+/* op_flags=0, FI_SELECTIVE_COMPLETION set, av_map */
+static ssize_t
+psmx3_tagged_senddata_no_event_av_map(struct fid_ep *ep, const void *buf, size_t len,
+				      void *desc, uint64_t data, fi_addr_t dest_addr,
+				      uint64_t tag, void *context)
+{
+	return psmx3_tagged_send_specialized(ep, buf, len, desc, dest_addr, tag,
+					     context, 0, 1, 1, data);
+}
+
+__attribute__((always_inline))
+static inline ssize_t
+psmx3_tagged_inject_specialized(struct fid_ep *ep, const void *buf,
+				size_t len, fi_addr_t dest_addr,
+				uint64_t tag, int av_map,
+				int have_data, uint64_t data)
+{
+	struct psmx3_fid_ep *ep_priv;
+	psm2_epaddr_t psm2_epaddr;
+	psm2_mq_tag_t psm2_tag;
+	int err;
+
+	assert((tag & ~PSMX3_TAG_MASK) == 0);
+
+	if (len > psmx3_env.inject_size)
+		return -FI_EMSGSIZE;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	if (av_map) {
+		psm2_epaddr = (psm2_epaddr_t)dest_addr;
+	} else {
+		assert(ep_priv->av);
+		psm2_epaddr = psmx3_av_translate_addr(ep_priv->av, ep_priv->tx, dest_addr, FI_AV_TABLE);
+	}
+
+	if (have_data)
+		PSMX3_SET_TAG(psm2_tag, tag, data, PSMX3_TYPE_TAGGED | PSMX3_IMM_BIT);
+	else
+		PSMX3_SET_TAG(psm2_tag, tag, ep_priv->sep_id, PSMX3_TYPE_TAGGED);
+
+	err = psm2_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, 0,
+			    &psm2_tag, buf, len);
+
+	if (err != PSM2_OK)
+		return psmx3_errno(err);
+
+	if (ep_priv->send_cntr)
+		psmx3_cntr_inc(ep_priv->send_cntr, 0);
+
+	return 0;
+}
+
+/* op_flags=0, av_table */
+static ssize_t
+psmx3_tagged_inject_no_flag(struct fid_ep *ep, const void *buf, size_t len,
+			    fi_addr_t dest_addr, uint64_t tag)
+{
+	return psmx3_tagged_inject_specialized(ep, buf, len, dest_addr, tag,
+					       0, 0, 0);
+}
+
+/* op_flags=0, av_map */
+static ssize_t
+psmx3_tagged_inject_no_flag_av_map(struct fid_ep *ep, const void *buf, size_t len,
+				   fi_addr_t dest_addr, uint64_t tag)
+{
+	return psmx3_tagged_inject_specialized(ep, buf, len, dest_addr, tag,
+					       1, 0, 0);
+}
+
+/* op_flags=0, av_table */
+static ssize_t
+psmx3_tagged_injectdata_no_flag(struct fid_ep *ep, const void *buf, size_t len,
+				uint64_t data, fi_addr_t dest_addr, uint64_t tag)
+{
+	return psmx3_tagged_inject_specialized(ep, buf, len, dest_addr, tag,
+					       0, 1, data);
+}
+
+/* op_flags=0, av_map */
+static ssize_t
+psmx3_tagged_injectdata_no_flag_av_map(struct fid_ep *ep, const void *buf, size_t len,
+				       uint64_t data, fi_addr_t dest_addr, uint64_t tag)
+{
+	return psmx3_tagged_inject_specialized(ep, buf, len, dest_addr, tag,
+					       1, 1, data);
+}
+
+ssize_t psmx3_tagged_sendv_generic(struct fid_ep *ep,
+				   const struct iovec *iov, void **desc,
+				   size_t count, fi_addr_t dest_addr,
+				   uint64_t tag, void *context,
+				   uint64_t flags, uint64_t data)
+{
+	struct psmx3_fid_ep *ep_priv;
+	struct psmx3_fid_av *av;
+	psm2_epaddr_t psm2_epaddr;
+	psm2_mq_req_t psm2_req;
+	psm2_mq_tag_t psm2_tag;
+	struct fi_context *fi_context;
+	int send_flag = 0;
+	int err;
+	int no_completion = 0;
+	struct psmx3_cq_event *event;
+	size_t real_count;
+	size_t len, total_len;
+	char *p;
+	uint32_t *q;
+	int i, j=0;
+	struct psmx3_sendv_request *req;
+	int have_data = (flags & FI_REMOTE_CQ_DATA) > 0;
+	uint32_t msg_flags;
+
+	assert((tag & ~PSMX3_TAG_MASK) == 0);
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	if (flags & FI_TRIGGER)
+		return psmx3_trigger_queue_tsendv(ep, iov, desc, count,
+						  dest_addr, tag, context,
+						  flags, data);
+
+	total_len = 0;
+	real_count = 0;
+	for (i=0; i<count; i++) {
+		if (iov[i].iov_len) {
+			total_len += iov[i].iov_len;
+			real_count++;
+			j = i;
+		}
+	}
+
+	if (real_count == 1)
+		return psmx3_tagged_send_generic(ep, iov[j].iov_base, iov[j].iov_len,
+						 desc ? desc[j] : NULL, dest_addr,
+						 tag, context, flags, data);
+
+	req = malloc(sizeof(*req));
+	if (!req)
+		return -FI_ENOMEM;
+
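+	/* Small iov payloads are packed into the request buffer and sent as
+	 * a single message; larger ones send a header describing the
+	 * segments, followed by one tagged message per non-empty segment.
+	 */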
+	if (total_len <= PSMX3_IOV_BUF_SIZE) {
+		req->iov_protocol = PSMX3_IOV_PROTO_PACK;
+		p = req->buf;
+		for (i=0; i<count; i++) {
+			if (iov[i].iov_len) {
+				memcpy(p, iov[i].iov_base, iov[i].iov_len);
+				p += iov[i].iov_len;
+			}
+		}
+
+		msg_flags = PSMX3_TYPE_TAGGED;
+		len = total_len;
+	} else {
+		req->iov_protocol = PSMX3_IOV_PROTO_MULTI;
+		req->iov_done = 0;
+		req->iov_info.seq_num = (++ep_priv->iov_seq_num) %
+					PSMX3_IOV_MAX_SEQ_NUM + 1;
+		req->iov_info.count = (uint32_t)real_count;
+		req->iov_info.total_len = (uint32_t)total_len;
+
+		q = req->iov_info.len;
+		for (i=0; i<count; i++) {
+			if (iov[i].iov_len)
+				*q++ = (uint32_t)iov[i].iov_len;
+		}
+
+		msg_flags = PSMX3_TYPE_TAGGED | PSMX3_IOV_BIT;
+		len = (3 + real_count) * sizeof(uint32_t);
+	}
+
+	av = ep_priv->av;
+	assert(av);
+	psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type);
+
+	if (have_data)
+		PSMX3_SET_TAG(psm2_tag, tag, (uint32_t)data, msg_flags | PSMX3_IMM_BIT);
+	else
+		PSMX3_SET_TAG(psm2_tag, tag, (uint32_t)ep_priv->sep_id, msg_flags);
+
+	if ((flags & PSMX3_NO_COMPLETION) ||
+	    (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)))
+		no_completion = 1;
+
+	if (flags & FI_INJECT) {
+		if (len > psmx3_env.inject_size) {
+			free(req);
+			return -FI_EMSGSIZE;
+		}
+
+		err = psm2_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr,
+				    send_flag, &psm2_tag, req->buf, len);
+
+		free(req);
+
+		if (err != PSM2_OK)
+			return psmx3_errno(err);
+
+		if (ep_priv->send_cntr)
+			psmx3_cntr_inc(ep_priv->send_cntr, 0);
+
+		if (ep_priv->send_cq && !no_completion) {
+			event = psmx3_cq_create_event(
+					ep_priv->send_cq,
+					context, NULL, flags, len,
+					(uint64_t) data,
+					0 /* tag */,
+					0 /* olen */,
+					0 /* err */);
+
+			if (event)
+				psmx3_cq_enqueue_event(ep_priv->send_cq, event);
+			else
+				return -FI_ENOMEM;
+		}
+
+		return 0;
+	}
+
+	req->no_completion = no_completion;
+	req->user_context = context;
+	req->comp_flag = FI_TAGGED;
+
+	fi_context = &req->fi_context;
+	PSMX3_CTXT_TYPE(fi_context) = PSMX3_SENDV_CONTEXT;
+	PSMX3_CTXT_USER(fi_context) = req;
+	PSMX3_CTXT_EP(fi_context) = ep_priv;
+
+	err = psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr,
+			     send_flag, &psm2_tag, req->buf, len,
+			     (void *)fi_context, &psm2_req);
+
+	if (err != PSM2_OK) {
+		free(req);
+		return psmx3_errno(err);
+	}
+
+	PSMX3_CTXT_REQ(fi_context) = psm2_req;
+
+	if (req->iov_protocol == PSMX3_IOV_PROTO_MULTI) {
+		fi_context = &req->fi_context_iov;
+		PSMX3_CTXT_TYPE(fi_context) = PSMX3_IOV_SEND_CONTEXT;
+		PSMX3_CTXT_USER(fi_context) = req;
+		PSMX3_CTXT_EP(fi_context) = ep_priv;
+		PSMX3_SET_TAG(psm2_tag, req->iov_info.seq_num, 0,
+			      PSMX3_TYPE_IOV_PAYLOAD);
+		for (i=0; i<count; i++) {
+			if (iov[i].iov_len) {
+				err = psm2_mq_isend2(ep_priv->tx->psm2_mq,
+						     psm2_epaddr, send_flag, &psm2_tag,
+						     iov[i].iov_base, iov[i].iov_len,
+						     (void *)fi_context, &psm2_req);
+				if (err != PSM2_OK)
+					return psmx3_errno(err);
+			}
+		}
+	}
+
+	return 0;
+}
+
+static ssize_t psmx3_tagged_send(struct fid_ep *ep,
+				 const void *buf, size_t len,
+				 void *desc, fi_addr_t dest_addr,
+				 uint64_t tag, void *context)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	return psmx3_tagged_send_generic(ep, buf, len, desc, dest_addr,
+					 tag, context, ep_priv->tx_flags, 0);
+}
+
+static ssize_t psmx3_tagged_sendmsg(struct fid_ep *ep,
+				    const struct fi_msg_tagged *msg,
+				    uint64_t flags)
+{
+	void *buf;
+	size_t len;
+
+	assert(msg);
+	assert(!msg->iov_count || msg->msg_iov);
+	assert(msg->iov_count <= PSMX3_IOV_MAX_COUNT);
+
+	if (msg->iov_count > 1) {
+		return psmx3_tagged_sendv_generic(ep, msg->msg_iov,
+						  msg->desc, msg->iov_count,
+						  msg->addr, msg->tag,
+						  msg->context, flags,
+						  msg->data);
+	} else if (msg->iov_count) {
+		buf = msg->msg_iov[0].iov_base;
+		len = msg->msg_iov[0].iov_len;
+	} else {
+		buf = NULL;
+		len = 0;
+	}
+
+	return psmx3_tagged_send_generic(ep, buf, len,
+					 msg->desc ? msg->desc[0] : NULL,
+					 msg->addr, msg->tag, msg->context,
+					 flags, msg->data);
+}
+
+ssize_t psmx3_tagged_senddata(struct fid_ep *ep, const void *buf, size_t len,
+			      void *desc, uint64_t data, fi_addr_t dest_addr,
+			      uint64_t tag, void *context)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	return psmx3_tagged_send_generic(ep, buf, len, desc, dest_addr,
+					 tag, context,
+					 ep_priv->tx_flags | FI_REMOTE_CQ_DATA,
+					 data);
+}
+
+#define PSMX3_TAGGED_SENDV_FUNC(suffix)					\
+static ssize_t								\
+psmx3_tagged_sendv##suffix(struct fid_ep *ep, const struct iovec *iov,	\
+			   void **desc, size_t count,			\
+			   fi_addr_t dest_addr,	uint64_t tag,		\
+			   void *context)				\
+{									\
+	void *buf;							\
+	size_t len;							\
+	assert(!count || iov);						\
+	assert(count <= PSMX3_IOV_MAX_COUNT); 				\
+	if (count > 1) {						\
+		struct psmx3_fid_ep *ep_priv;				\
+		ep_priv = container_of(ep, struct psmx3_fid_ep, ep);	\
+		return psmx3_tagged_sendv_generic(ep, iov, desc, count, \
+						  dest_addr, tag,	\
+						  context,		\
+						  ep_priv->tx_flags, 0);\
+	} else if (count) {						\
+		buf = iov[0].iov_base;					\
+		len = iov[0].iov_len;					\
+	} else {							\
+		buf = NULL;						\
+		len = 0;						\
+	}								\
+	return psmx3_tagged_send##suffix(ep, buf, len,			\
+					 desc ? desc[0] : NULL,		\
+					 dest_addr, tag, context);	\
+}
+
+PSMX3_TAGGED_SENDV_FUNC()
+PSMX3_TAGGED_SENDV_FUNC(_no_flag)
+PSMX3_TAGGED_SENDV_FUNC(_no_event)
+PSMX3_TAGGED_SENDV_FUNC(_no_flag_av_map)
+PSMX3_TAGGED_SENDV_FUNC(_no_event_av_map)
+
+static ssize_t psmx3_tagged_inject(struct fid_ep *ep,
+				   const void *buf, size_t len,
+				   fi_addr_t dest_addr, uint64_t tag)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	return psmx3_tagged_send_generic(ep, buf, len, NULL, dest_addr,
+					 tag, NULL,
+				  	 ep_priv->tx_flags | FI_INJECT | PSMX3_NO_COMPLETION,
+					 0);
+}
+
+static ssize_t psmx3_tagged_injectdata(struct fid_ep *ep,
+				       const void *buf, size_t len, uint64_t data,
+				       fi_addr_t dest_addr, uint64_t tag)
+{
+	struct psmx3_fid_ep *ep_priv;
+
+	ep_priv = container_of(ep, struct psmx3_fid_ep, ep);
+
+	return psmx3_tagged_send_generic(ep, buf, len, NULL, dest_addr,
+					 tag, NULL,
+					 ep_priv->tx_flags | FI_INJECT | FI_REMOTE_CQ_DATA |
+						  PSMX3_NO_COMPLETION,
+					 data);
+}
+
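+/* Instantiate one fi_ops_tagged table for each combination of send
+ * completion mode, receive completion/directed mode, and AV type.
+ */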
+#define PSMX3_TAGGED_OPS(suffix,sendopt,recvopt,injopt)	\
+struct fi_ops_tagged psmx3_tagged_ops##suffix = {	\
+	.size = sizeof(struct fi_ops_tagged),		\
+	.recv = psmx3_tagged_recv##recvopt,		\
+	.recvv = psmx3_tagged_recvv##recvopt,		\
+	.recvmsg = psmx3_tagged_recvmsg,		\
+	.send = psmx3_tagged_send##sendopt,		\
+	.sendv = psmx3_tagged_sendv##sendopt,		\
+	.sendmsg = psmx3_tagged_sendmsg,		\
+	.inject = psmx3_tagged_inject##injopt,		\
+	.senddata = psmx3_tagged_senddata##sendopt,	\
+	.injectdata = psmx3_tagged_injectdata##injopt,	\
+};
+
+PSMX3_TAGGED_OPS(,,,)
+PSMX3_TAGGED_OPS(_no_flag_directed, _no_flag, _no_flag_directed, _no_flag)
+PSMX3_TAGGED_OPS(_no_event_directed, _no_event, _no_event_directed, _no_flag)
+PSMX3_TAGGED_OPS(_no_send_event_directed, _no_event, _no_flag_directed, _no_flag)
+PSMX3_TAGGED_OPS(_no_recv_event_directed, _no_flag, _no_event_directed, _no_flag)
+PSMX3_TAGGED_OPS(_no_flag_undirected, _no_flag, _no_flag_undirected, _no_flag)
+PSMX3_TAGGED_OPS(_no_event_undirected, _no_event, _no_event_undirected, _no_flag)
+PSMX3_TAGGED_OPS(_no_send_event_undirected, _no_event, _no_flag_undirected, _no_flag)
+PSMX3_TAGGED_OPS(_no_recv_event_undirected, _no_flag, _no_event_undirected, _no_flag)
+PSMX3_TAGGED_OPS(_no_flag_directed_av_map, _no_flag_av_map, _no_flag_directed_av_map, _no_flag_av_map)
+PSMX3_TAGGED_OPS(_no_event_directed_av_map, _no_event_av_map, _no_event_directed_av_map, _no_flag_av_map)
+PSMX3_TAGGED_OPS(_no_send_event_directed_av_map, _no_event_av_map, _no_flag_directed_av_map, _no_flag_av_map)
+PSMX3_TAGGED_OPS(_no_recv_event_directed_av_map, _no_flag_av_map, _no_event_directed_av_map, _no_flag_av_map)
+PSMX3_TAGGED_OPS(_no_flag_undirected_av_map, _no_flag_av_map, _no_flag_undirected_av_map, _no_flag_av_map)
+PSMX3_TAGGED_OPS(_no_event_undirected_av_map, _no_event_av_map, _no_event_undirected_av_map, _no_flag_av_map)
+PSMX3_TAGGED_OPS(_no_send_event_undirected_av_map, _no_event_av_map, _no_flag_undirected_av_map, _no_flag_av_map)
+PSMX3_TAGGED_OPS(_no_recv_event_undirected_av_map, _no_flag_av_map, _no_event_undirected_av_map, _no_flag_av_map)
+
diff --git a/deps/libfabric/prov/psm3/src/psmx3_trigger.h b/deps/libfabric/prov/psm3/src/psmx3_trigger.h
new file mode 100644
index 0000000000000000000000000000000000000000..68f152886b01aace81787f776b9e0eac76b1ee2e
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psmx3_trigger.h
@@ -0,0 +1,1113 @@
+/*
+ * Copyright (c) 2013-2018 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _FI_PSM2_TRIGGER_H
+#define _FI_PSM2_TRIGGER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum psmx3_triggered_op {
+	PSMX3_TRIGGERED_SEND,
+	PSMX3_TRIGGERED_SENDV,
+	PSMX3_TRIGGERED_RECV,
+	PSMX3_TRIGGERED_TSEND,
+	PSMX3_TRIGGERED_TSENDV,
+	PSMX3_TRIGGERED_TRECV,
+	PSMX3_TRIGGERED_WRITE,
+	PSMX3_TRIGGERED_WRITEV,
+	PSMX3_TRIGGERED_READ,
+	PSMX3_TRIGGERED_READV,
+	PSMX3_TRIGGERED_ATOMIC_WRITE,
+	PSMX3_TRIGGERED_ATOMIC_WRITEV,
+	PSMX3_TRIGGERED_ATOMIC_READWRITE,
+	PSMX3_TRIGGERED_ATOMIC_READWRITEV,
+	PSMX3_TRIGGERED_ATOMIC_COMPWRITE,
+	PSMX3_TRIGGERED_ATOMIC_COMPWRITEV,
+};
+
+struct psmx3_trigger {
+	enum psmx3_triggered_op	op;
+	struct psmx3_fid_cntr	*cntr;
+	size_t			threshold;
+	union {
+		struct {
+			struct fid_ep	*ep;
+			const void	*buf;
+			size_t		len;
+			void		*desc;
+			fi_addr_t	dest_addr;
+			void		*context;
+			uint64_t	flags;
+			uint64_t	data;
+		} send;
+		struct {
+			struct fid_ep	*ep;
+			const struct iovec *iov;
+			size_t		count;
+			void		**desc;
+			fi_addr_t	dest_addr;
+			void		*context;
+			uint64_t	flags;
+			uint64_t	data;
+		} sendv;
+		struct {
+			struct fid_ep	*ep;
+			void		*buf;
+			size_t		len;
+			void		*desc;
+			fi_addr_t	src_addr;
+			void		*context;
+			uint64_t	flags;
+		} recv;
+		struct {
+			struct fid_ep	*ep;
+			const void	*buf;
+			size_t		len;
+			void		*desc;
+			fi_addr_t	dest_addr;
+			uint64_t	tag;
+			void		*context;
+			uint64_t	flags;
+			uint64_t	data;
+		} tsend;
+		struct {
+			struct fid_ep	*ep;
+			const struct iovec *iov;
+			size_t		count;
+			void		**desc;
+			fi_addr_t	dest_addr;
+			uint64_t	tag;
+			void		*context;
+			uint64_t	flags;
+			uint64_t	data;
+		} tsendv;
+		struct {
+			struct fid_ep	*ep;
+			void		*buf;
+			size_t		len;
+			void		*desc;
+			fi_addr_t	src_addr;
+			uint64_t	tag;
+			uint64_t	ignore;
+			void		*context;
+			uint64_t	flags;
+		} trecv;
+		struct {
+			struct fid_ep	*ep;
+			const void	*buf;
+			size_t		len;
+			void		*desc;
+			fi_addr_t	dest_addr;
+			uint64_t	addr;
+			uint64_t	key;
+			void		*context;
+			uint64_t	flags;
+			uint64_t	data;
+		} write;
+		struct {
+			struct fid_ep	*ep;
+			const struct iovec *iov;
+			size_t		count;
+			void		*desc;
+			fi_addr_t	dest_addr;
+			uint64_t	addr;
+			uint64_t	key;
+			void		*context;
+			uint64_t	flags;
+			uint64_t	data;
+		} writev;
+		struct {
+			struct fid_ep	*ep;
+			void		*buf;
+			size_t		len;
+			void		*desc;
+			fi_addr_t	src_addr;
+			uint64_t	addr;
+			uint64_t	key;
+			void		*context;
+			uint64_t	flags;
+		} read;
+		struct {
+			struct fid_ep	*ep;
+			const struct iovec *iov;
+			size_t		count;
+			void		*desc;
+			fi_addr_t	src_addr;
+			uint64_t	addr;
+			uint64_t	key;
+			void		*context;
+			uint64_t	flags;
+		} readv;
+		struct {
+			struct fid_ep	*ep;
+			const void	*buf;
+			size_t		count;
+			void		*desc;
+			fi_addr_t	dest_addr;
+			uint64_t	addr;
+			uint64_t	key;
+			enum fi_datatype datatype;
+			enum fi_op	atomic_op;
+			void		*context;
+			uint64_t	flags;
+		} atomic_write;
+		struct {
+			struct fid_ep	*ep;
+			const struct fi_ioc *iov;
+			size_t		count;
+			void		*desc;
+			fi_addr_t	dest_addr;
+			uint64_t	addr;
+			uint64_t	key;
+			enum fi_datatype datatype;
+			enum fi_op	atomic_op;
+			void		*context;
+			uint64_t	flags;
+		} atomic_writev;
+		struct {
+			struct fid_ep	*ep;
+			const void	*buf;
+			size_t		count;
+			void		*desc;
+			void		*result;
+			void		*result_desc;
+			fi_addr_t	dest_addr;
+			uint64_t	addr;
+			uint64_t	key;
+			enum fi_datatype datatype;
+			enum fi_op	atomic_op;
+			void		*context;
+			uint64_t	flags;
+		} atomic_readwrite;
+		struct {
+			struct fid_ep	*ep;
+			const struct fi_ioc *iov;
+			size_t		count;
+			void		**desc;
+			struct fi_ioc	*resultv;
+			void		**result_desc;
+			size_t		result_count;
+			fi_addr_t	dest_addr;
+			uint64_t	addr;
+			uint64_t	key;
+			enum fi_datatype datatype;
+			enum fi_op	atomic_op;
+			void		*context;
+			uint64_t	flags;
+		} atomic_readwritev;
+		struct {
+			struct fid_ep	*ep;
+			const void	*buf;
+			size_t		count;
+			void		*desc;
+			const void	*compare;
+			void		*compare_desc;
+			void		*result;
+			void		*result_desc;
+			fi_addr_t	dest_addr;
+			uint64_t	addr;
+			uint64_t	key;
+			enum fi_datatype datatype;
+			enum fi_op	atomic_op;
+			void		*context;
+			uint64_t	flags;
+		} atomic_compwrite;
+		struct {
+			struct fid_ep	*ep;
+			const struct fi_ioc *iov;
+			size_t		count;
+			void		**desc;
+			const struct fi_ioc *comparev;
+			void		**compare_desc;
+			size_t		compare_count;
+			struct fi_ioc	*resultv;
+			void		**result_desc;
+			size_t		result_count;
+			fi_addr_t	dest_addr;
+			uint64_t	addr;
+			uint64_t	key;
+			enum fi_datatype datatype;
+			enum fi_op	atomic_op;
+			void		*context;
+			uint64_t	flags;
+		} atomic_compwritev;
+	};
+	struct psmx3_trigger *next;	/* used for randomly accessed trigger list */
+	struct slist_entry list_entry;	/* used for ready-to-fire trigger queue */
+};
+
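+/*
+ * Illustrative sketch of how a trigger gets queued (application side;
+ * ep, cntr, iov, src_addr and tag are placeholder names): the caller
+ * passes a struct fi_triggered_context as the operation context and sets
+ * FI_TRIGGER, and the provider records the request as a psmx3_trigger on
+ * the counter until the threshold is reached.
+ *
+ *	struct fi_triggered_context tctx = {
+ *		.event_type = FI_TRIGGER_THRESHOLD,
+ *		.trigger.threshold = { .cntr = cntr, .threshold = 1 },
+ *	};
+ *	struct fi_msg_tagged msg = {
+ *		.msg_iov = &iov, .iov_count = 1, .addr = src_addr,
+ *		.tag = tag, .ignore = 0, .context = &tctx,
+ *	};
+ *	fi_trecvmsg(ep, &msg, FI_TRIGGER);
+ */
+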
+ssize_t psmx3_send_generic(
+			struct fid_ep *ep,
+			const void *buf, size_t len,
+			void *desc, fi_addr_t dest_addr,
+			void *context, uint64_t flags,
+			uint64_t data);
+
+ssize_t psmx3_sendv_generic(
+			struct fid_ep *ep,
+			const struct iovec *iov, void **desc,
+			size_t count, fi_addr_t dest_addr,
+			void *context, uint64_t flags,
+			uint64_t data);
+
+ssize_t psmx3_recv_generic(
+			struct fid_ep *ep,
+			void *buf, size_t len, void *desc,
+			fi_addr_t src_addr, void *context,
+			uint64_t flags);
+
+ssize_t psmx3_tagged_send_generic(
+			struct fid_ep *ep,
+			const void *buf, size_t len,
+			void *desc, fi_addr_t dest_addr,
+			uint64_t tag, void *context,
+			uint64_t flags, uint64_t data);
+
+ssize_t psmx3_tagged_sendv_generic(
+			struct fid_ep *ep,
+			const struct iovec *iov, void **desc,
+			size_t count, fi_addr_t dest_addr,
+			uint64_t tag, void *context,
+			uint64_t flags, uint64_t data);
+
+ssize_t psmx3_tagged_recv_generic(
+			struct fid_ep *ep,
+			void *buf, size_t len,
+			void *desc, fi_addr_t src_addr,
+			uint64_t tag, uint64_t ignore,
+			void *context, uint64_t flags);
+
+ssize_t psmx3_write_generic(
+			struct fid_ep *ep,
+			const void *buf, size_t len,
+			void *desc, fi_addr_t dest_addr,
+			uint64_t addr, uint64_t key,
+			void *context, uint64_t flags,
+			uint64_t data);
+
+ssize_t psmx3_writev_generic(
+			struct fid_ep *ep,
+			const struct iovec *iov, void **desc,
+			size_t count, fi_addr_t dest_addr,
+			uint64_t addr, uint64_t key,
+			void *context, uint64_t flags,
+			uint64_t data);
+
+ssize_t psmx3_read_generic(
+			struct fid_ep *ep,
+			void *buf, size_t len,
+			void *desc, fi_addr_t src_addr,
+			uint64_t addr, uint64_t key,
+			void *context, uint64_t flags);
+
+ssize_t psmx3_readv_generic(
+			struct fid_ep *ep,
+			const struct iovec *iov, void *desc,
+			size_t count, fi_addr_t src_addr,
+			uint64_t addr, uint64_t key,
+			void *context, uint64_t flags);
+
+ssize_t psmx3_atomic_write_generic(
+			struct fid_ep *ep,
+			const void *buf,
+			size_t count, void *desc,
+			fi_addr_t dest_addr,
+			uint64_t addr, uint64_t key,
+			enum fi_datatype datatype,
+			enum fi_op op, void *context,
+			uint64_t flags);
+
+ssize_t psmx3_atomic_readwrite_generic(
+			struct fid_ep *ep,
+			const void *buf,
+			size_t count, void *desc,
+			void *result, void *result_desc,
+			fi_addr_t dest_addr,
+			uint64_t addr, uint64_t key,
+			enum fi_datatype datatype,
+			enum fi_op op, void *context,
+			uint64_t flags);
+
+ssize_t psmx3_atomic_compwrite_generic(
+			struct fid_ep *ep,
+			const void *buf,
+			size_t count, void *desc,
+			const void *compare, void *compare_desc,
+			void *result, void *result_desc,
+			fi_addr_t dest_addr,
+			uint64_t addr, uint64_t key,
+			enum fi_datatype datatype,
+			enum fi_op op, void *context,
+			uint64_t flags);
+
+static inline
+int psmx3_process_trigger(struct psmx3_trx_ctxt *trx_ctxt,
+			  struct psmx3_trigger *trigger)
+{
+	switch (trigger->op) {
+	case PSMX3_TRIGGERED_SEND:
+		psmx3_send_generic(trigger->send.ep,
+				   trigger->send.buf,
+				   trigger->send.len,
+				   trigger->send.desc,
+				   trigger->send.dest_addr,
+				   trigger->send.context,
+				   trigger->send.flags,
+				   trigger->send.data);
+		break;
+	case PSMX3_TRIGGERED_SENDV:
+		psmx3_sendv_generic(trigger->sendv.ep,
+				    trigger->sendv.iov,
+				    trigger->sendv.desc,
+				    trigger->sendv.count,
+				    trigger->sendv.dest_addr,
+				    trigger->sendv.context,
+				    trigger->sendv.flags,
+				    trigger->sendv.data);
+		break;
+	case PSMX3_TRIGGERED_RECV:
+		psmx3_recv_generic(trigger->recv.ep,
+				   trigger->recv.buf,
+				   trigger->recv.len,
+				   trigger->recv.desc,
+				   trigger->recv.src_addr,
+				   trigger->recv.context,
+				   trigger->recv.flags);
+		break;
+	case PSMX3_TRIGGERED_TSEND:
+		psmx3_tagged_send_generic(trigger->tsend.ep,
+					  trigger->tsend.buf,
+					  trigger->tsend.len,
+					  trigger->tsend.desc,
+					  trigger->tsend.dest_addr,
+					  trigger->tsend.tag,
+					  trigger->tsend.context,
+					  trigger->tsend.flags,
+					  trigger->tsend.data);
+		break;
+	case PSMX3_TRIGGERED_TSENDV:
+		psmx3_tagged_sendv_generic(trigger->tsendv.ep,
+					   trigger->tsendv.iov,
+					   trigger->tsendv.desc,
+					   trigger->tsendv.count,
+					   trigger->tsendv.dest_addr,
+					   trigger->tsendv.tag,
+					   trigger->tsendv.context,
+					   trigger->tsendv.flags,
+					   trigger->tsendv.data);
+		break;
+	case PSMX3_TRIGGERED_TRECV:
+		psmx3_tagged_recv_generic(trigger->trecv.ep,
+					  trigger->trecv.buf,
+					  trigger->trecv.len,
+					  trigger->trecv.desc,
+					  trigger->trecv.src_addr,
+					  trigger->trecv.tag,
+					  trigger->trecv.ignore,
+					  trigger->trecv.context,
+					  trigger->trecv.flags);
+		break;
+	case PSMX3_TRIGGERED_WRITE:
+		psmx3_write_generic(trigger->write.ep,
+				    trigger->write.buf,
+				    trigger->write.len,
+				    trigger->write.desc,
+				    trigger->write.dest_addr,
+				    trigger->write.addr,
+				    trigger->write.key,
+				    trigger->write.context,
+				    trigger->write.flags,
+				    trigger->write.data);
+		break;
+
+	case PSMX3_TRIGGERED_WRITEV:
+		psmx3_writev_generic(trigger->writev.ep,
+				     trigger->writev.iov,
+				     trigger->writev.desc,
+				     trigger->writev.count,
+				     trigger->writev.dest_addr,
+				     trigger->writev.addr,
+				     trigger->writev.key,
+				     trigger->writev.context,
+				     trigger->writev.flags,
+				     trigger->writev.data);
+		break;
+
+	case PSMX3_TRIGGERED_READ:
+		psmx3_read_generic(trigger->read.ep,
+				   trigger->read.buf,
+				   trigger->read.len,
+				   trigger->read.desc,
+				   trigger->read.src_addr,
+				   trigger->read.addr,
+				   trigger->read.key,
+				   trigger->read.context,
+				   trigger->read.flags);
+		break;
+
+	case PSMX3_TRIGGERED_READV:
+		psmx3_readv_generic(trigger->readv.ep,
+				    trigger->readv.iov,
+				    trigger->readv.desc,
+				    trigger->readv.count,
+				    trigger->readv.src_addr,
+				    trigger->readv.addr,
+				    trigger->readv.key,
+				    trigger->readv.context,
+				    trigger->readv.flags);
+		break;
+
+	case PSMX3_TRIGGERED_ATOMIC_WRITE:
+		psmx3_atomic_write_generic(
+				trigger->atomic_write.ep,
+				trigger->atomic_write.buf,
+				trigger->atomic_write.count,
+				trigger->atomic_write.desc,
+				trigger->atomic_write.dest_addr,
+				trigger->atomic_write.addr,
+				trigger->atomic_write.key,
+				trigger->atomic_write.datatype,
+				trigger->atomic_write.atomic_op,
+				trigger->atomic_write.context,
+				trigger->atomic_write.flags);
+		break;
+
+	case PSMX3_TRIGGERED_ATOMIC_READWRITE:
+		psmx3_atomic_readwrite_generic(
+				trigger->atomic_readwrite.ep,
+				trigger->atomic_readwrite.buf,
+				trigger->atomic_readwrite.count,
+				trigger->atomic_readwrite.desc,
+				trigger->atomic_readwrite.result,
+				trigger->atomic_readwrite.result_desc,
+				trigger->atomic_readwrite.dest_addr,
+				trigger->atomic_readwrite.addr,
+				trigger->atomic_readwrite.key,
+				trigger->atomic_readwrite.datatype,
+				trigger->atomic_readwrite.atomic_op,
+				trigger->atomic_readwrite.context,
+				trigger->atomic_readwrite.flags);
+		break;
+
+	case PSMX3_TRIGGERED_ATOMIC_COMPWRITE:
+		psmx3_atomic_compwrite_generic(
+				trigger->atomic_compwrite.ep,
+				trigger->atomic_compwrite.buf,
+				trigger->atomic_compwrite.count,
+				trigger->atomic_compwrite.desc,
+				trigger->atomic_compwrite.compare,
+				trigger->atomic_compwrite.compare_desc,
+				trigger->atomic_compwrite.result,
+				trigger->atomic_compwrite.result_desc,
+				trigger->atomic_compwrite.dest_addr,
+				trigger->atomic_compwrite.addr,
+				trigger->atomic_compwrite.key,
+				trigger->atomic_compwrite.datatype,
+				trigger->atomic_compwrite.atomic_op,
+				trigger->atomic_compwrite.context,
+				trigger->atomic_compwrite.flags);
+		break;
+	default:
+		FI_INFO(&psmx3_prov, FI_LOG_CQ,
+			"%d unsupported op\n", trigger->op);
+		break;
+	}
+
+	free(trigger);
+	return 0;
+}
+
+static inline
+int psmx3_trigger_queue_trecv(struct fid_ep *ep, void *buf,
+			      size_t len, void *desc,
+			      fi_addr_t src_addr, 
+			      uint64_t tag, uint64_t ignore,
+			      void *context, uint64_t flags)
+{
+	struct psmx3_trigger *trigger;
+	struct fi_triggered_context *ctxt = context;
+
+	trigger = calloc(1, sizeof(*trigger));
+	if (!trigger)
+		return -FI_ENOMEM;
+
+	trigger->op = PSMX3_TRIGGERED_TRECV;
+	trigger->cntr = container_of(ctxt->trigger.threshold.cntr,
+				     struct psmx3_fid_cntr, cntr);
+	trigger->threshold = ctxt->trigger.threshold.threshold;
+	trigger->trecv.ep = ep;
+	trigger->trecv.buf = buf;
+	trigger->trecv.len = len;
+	trigger->trecv.desc = desc;
+	trigger->trecv.src_addr = src_addr;
+	trigger->trecv.tag = tag;
+	trigger->trecv.ignore = ignore;
+	trigger->trecv.context = context;
+	trigger->trecv.flags = flags & ~FI_TRIGGER;
+
+	psmx3_cntr_add_trigger(trigger->cntr, trigger);
+	return 0;
+}
+
+static inline
+int psmx3_trigger_queue_tsend(struct fid_ep *ep,
+			      const void *buf, size_t len,
+			      void *desc, fi_addr_t dest_addr,
+			      uint64_t tag, void *context,
+			      uint64_t flags, uint64_t data)
+{
+	struct psmx3_trigger *trigger;
+	struct fi_triggered_context *ctxt = context;
+
+	trigger = calloc(1, sizeof(*trigger));
+	if (!trigger)
+		return -FI_ENOMEM;
+
+	trigger->op = PSMX3_TRIGGERED_TSEND;
+	trigger->cntr = container_of(ctxt->trigger.threshold.cntr,
+				     struct psmx3_fid_cntr, cntr);
+	trigger->threshold = ctxt->trigger.threshold.threshold;
+	trigger->tsend.ep = ep;
+	trigger->tsend.buf = buf;
+	trigger->tsend.len = len;
+	trigger->tsend.desc = desc;
+	trigger->tsend.dest_addr = dest_addr;
+	trigger->tsend.tag = tag;
+	trigger->tsend.context = context;
+	trigger->tsend.flags = flags & ~FI_TRIGGER;
+	trigger->tsend.data = data;
+
+	psmx3_cntr_add_trigger(trigger->cntr, trigger);
+	return 0;
+}
+
+static inline
+int psmx3_trigger_queue_tsendv(struct fid_ep *ep,
+			       const struct iovec *iov, void *desc,
+			       size_t count, fi_addr_t dest_addr,
+			       uint64_t tag, void *context,
+			       uint64_t flags, uint64_t data)
+{
+	struct psmx3_trigger *trigger;
+	struct fi_triggered_context *ctxt = context;
+
+	trigger = calloc(1, sizeof(*trigger));
+	if (!trigger)
+		return -FI_ENOMEM;
+
+	trigger->op = PSMX3_TRIGGERED_TSENDV;
+	trigger->cntr = container_of(ctxt->trigger.threshold.cntr,
+				     struct psmx3_fid_cntr, cntr);
+	trigger->threshold = ctxt->trigger.threshold.threshold;
+	trigger->tsendv.ep = ep;
+	trigger->tsendv.iov = iov;
+	trigger->tsendv.desc = desc;
+	trigger->tsendv.count = count;
+	trigger->tsendv.dest_addr = dest_addr;
+	trigger->tsendv.tag = tag;
+	trigger->tsendv.context = context;
+	trigger->tsendv.flags = flags & ~FI_TRIGGER;
+	trigger->tsendv.data = data;
+
+	psmx3_cntr_add_trigger(trigger->cntr, trigger);
+	return 0;
+}
+
+static inline
+int psmx3_trigger_queue_recv(struct fid_ep *ep, void *buf, size_t len,
+			     void *desc, fi_addr_t src_addr, void *context,
+			     uint64_t flags)
+{
+	struct psmx3_trigger *trigger;
+	struct fi_triggered_context *ctxt = context;
+
+	trigger = calloc(1, sizeof(*trigger));
+	if (!trigger)
+		return -FI_ENOMEM;
+
+	trigger->op = PSMX3_TRIGGERED_RECV;
+	trigger->cntr = container_of(ctxt->trigger.threshold.cntr,
+				     struct psmx3_fid_cntr, cntr);
+	trigger->threshold = ctxt->trigger.threshold.threshold;
+	trigger->recv.ep = ep;
+	trigger->recv.buf = buf;
+	trigger->recv.len = len;
+	trigger->recv.desc = desc;
+	trigger->recv.src_addr = src_addr;
+	trigger->recv.context = context;
+	trigger->recv.flags = flags & ~FI_TRIGGER;
+
+	psmx3_cntr_add_trigger(trigger->cntr, trigger);
+	return 0;
+}
+
+static inline
+int psmx3_trigger_queue_send(struct fid_ep *ep, const void *buf, size_t len,
+			     void *desc, fi_addr_t dest_addr, void *context,
+			     uint64_t flags, uint64_t data)
+{
+	struct psmx3_trigger *trigger;
+	struct fi_triggered_context *ctxt = context;
+
+	trigger = calloc(1, sizeof(*trigger));
+	if (!trigger)
+		return -FI_ENOMEM;
+
+	trigger->op = PSMX3_TRIGGERED_SEND;
+	trigger->cntr = container_of(ctxt->trigger.threshold.cntr,
+				     struct psmx3_fid_cntr, cntr);
+	trigger->threshold = ctxt->trigger.threshold.threshold;
+	trigger->send.ep = ep;
+	trigger->send.buf = buf;
+	trigger->send.len = len;
+	trigger->send.desc = desc;
+	trigger->send.dest_addr = dest_addr;
+	trigger->send.context = context;
+	trigger->send.flags = flags & ~FI_TRIGGER;
+	trigger->send.data = data;
+
+	psmx3_cntr_add_trigger(trigger->cntr, trigger);
+	return 0;
+}
+
+static inline
+int psmx3_trigger_queue_sendv(struct fid_ep *ep, const struct iovec *iov,
+			      void *desc, size_t count, fi_addr_t dest_addr,
+			      void *context, uint64_t flags, uint64_t data)
+{
+	struct psmx3_trigger *trigger;
+	struct fi_triggered_context *ctxt = context;
+
+	trigger = calloc(1, sizeof(*trigger));
+	if (!trigger)
+		return -FI_ENOMEM;
+
+	trigger->op = PSMX3_TRIGGERED_SENDV;
+	trigger->cntr = container_of(ctxt->trigger.threshold.cntr,
+				     struct psmx3_fid_cntr, cntr);
+	trigger->threshold = ctxt->trigger.threshold.threshold;
+	trigger->sendv.ep = ep;
+	trigger->sendv.iov = iov;
+	trigger->sendv.desc = desc;
+	trigger->sendv.count = count;
+	trigger->sendv.dest_addr = dest_addr;
+	trigger->sendv.context = context;
+	trigger->sendv.flags = flags & ~FI_TRIGGER;
+	trigger->sendv.data = data;
+
+	psmx3_cntr_add_trigger(trigger->cntr, trigger);
+	return 0;
+}
+
+static inline
+int psmx3_trigger_queue_read(struct fid_ep *ep, void *buf, size_t len,
+			     void *desc, fi_addr_t src_addr,
+			     uint64_t addr, uint64_t key, void *context,
+			     uint64_t flags)
+{
+	struct psmx3_trigger *trigger;
+	struct fi_triggered_context *ctxt = context;
+
+	trigger = calloc(1, sizeof(*trigger));
+	if (!trigger)
+		return -FI_ENOMEM;
+
+	trigger->op = PSMX3_TRIGGERED_READ;
+	trigger->cntr = container_of(ctxt->trigger.threshold.cntr,
+				     struct psmx3_fid_cntr, cntr);
+	trigger->threshold = ctxt->trigger.threshold.threshold;
+	trigger->read.ep = ep;
+	trigger->read.buf = buf;
+	trigger->read.len = len;
+	trigger->read.desc = desc;
+	trigger->read.src_addr = src_addr;
+	trigger->read.addr = addr;
+	trigger->read.key = key;
+	trigger->read.context = context;
+	trigger->read.flags = flags & ~FI_TRIGGER;
+
+	psmx3_cntr_add_trigger(trigger->cntr, trigger);
+	return 0;
+}
+
+static inline
+int psmx3_trigger_queue_readv(struct fid_ep *ep, const struct iovec *iov,
+			      void *desc, size_t count, fi_addr_t src_addr,
+			      uint64_t addr, uint64_t key, void *context,
+			      uint64_t flags)
+{
+	struct psmx3_trigger *trigger;
+	struct fi_triggered_context *ctxt = context;
+
+	trigger = calloc(1, sizeof(*trigger));
+	if (!trigger)
+		return -FI_ENOMEM;
+
+	trigger->op = PSMX3_TRIGGERED_READV;
+	trigger->cntr = container_of(ctxt->trigger.threshold.cntr,
+				     struct psmx3_fid_cntr, cntr);
+	trigger->threshold = ctxt->trigger.threshold.threshold;
+	trigger->readv.ep = ep;
+	trigger->readv.iov = iov;
+	trigger->readv.count = count;
+	trigger->readv.desc = desc;
+	trigger->readv.src_addr = src_addr;
+	trigger->readv.addr = addr;
+	trigger->readv.key = key;
+	trigger->readv.context = context;
+	trigger->readv.flags = flags & ~FI_TRIGGER;
+
+	psmx3_cntr_add_trigger(trigger->cntr, trigger);
+	return 0;
+}
+
+static inline
+int psmx3_trigger_queue_write(struct fid_ep *ep, const void *buf, size_t len,
+			      void *desc, fi_addr_t dest_addr,
+			      uint64_t addr, uint64_t key, void *context,
+			      uint64_t flags, uint64_t data)
+{
+	struct psmx3_trigger *trigger;
+	struct fi_triggered_context *ctxt = context;
+
+	trigger = calloc(1, sizeof(*trigger));
+	if (!trigger)
+		return -FI_ENOMEM;
+
+	trigger->op = PSMX3_TRIGGERED_WRITE;
+	trigger->cntr = container_of(ctxt->trigger.threshold.cntr,
+				     struct psmx3_fid_cntr, cntr);
+	trigger->threshold = ctxt->trigger.threshold.threshold;
+	trigger->write.ep = ep;
+	trigger->write.buf = buf;
+	trigger->write.len = len;
+	trigger->write.desc = desc;
+	trigger->write.dest_addr = dest_addr;
+	trigger->write.addr = addr;
+	trigger->write.key = key;
+	trigger->write.context = context;
+	trigger->write.flags = flags & ~FI_TRIGGER;
+	trigger->write.data = data;
+
+	psmx3_cntr_add_trigger(trigger->cntr, trigger);
+	return 0;
+}
+
+static inline
+int psmx3_trigger_queue_writev(struct fid_ep *ep, const struct iovec *iov,
+			       void **desc, size_t count, fi_addr_t dest_addr,
+			       uint64_t addr, uint64_t key, void *context,
+			       uint64_t flags, uint64_t data)
+{
+	struct psmx3_trigger *trigger;
+	struct fi_triggered_context *ctxt = context;
+
+	trigger = calloc(1, sizeof(*trigger));
+	if (!trigger)
+		return -FI_ENOMEM;
+
+	trigger->op = PSMX3_TRIGGERED_WRITEV;
+	trigger->cntr = container_of(ctxt->trigger.threshold.cntr,
+				     struct psmx3_fid_cntr, cntr);
+	trigger->threshold = ctxt->trigger.threshold.threshold;
+	trigger->writev.ep = ep;
+	trigger->writev.iov = iov;
+	trigger->writev.count = count;
+	trigger->writev.desc = desc;
+	trigger->writev.dest_addr = dest_addr;
+	trigger->writev.addr = addr;
+	trigger->writev.key = key;
+	trigger->writev.context = context;
+	trigger->writev.flags = flags & ~FI_TRIGGER;
+	trigger->writev.data = data;
+
+	psmx3_cntr_add_trigger(trigger->cntr, trigger);
+	return 0;
+}
+
+static inline
+int psmx3_trigger_queue_atomic_write(struct fid_ep *ep,
+				     const void *buf,
+				     size_t count, void *desc,
+				     fi_addr_t dest_addr,
+				     uint64_t addr, uint64_t key,
+				     enum fi_datatype datatype,
+				     enum fi_op op, void *context,
+				     uint64_t flags)
+{
+	struct psmx3_trigger *trigger;
+	struct fi_triggered_context *ctxt = context;
+
+	trigger = calloc(1, sizeof(*trigger));
+	if (!trigger)
+		return -FI_ENOMEM;
+
+	trigger->op = PSMX3_TRIGGERED_ATOMIC_WRITE;
+	trigger->cntr = container_of(ctxt->trigger.threshold.cntr,
+				     struct psmx3_fid_cntr, cntr);
+	trigger->threshold = ctxt->trigger.threshold.threshold;
+	trigger->atomic_write.ep = ep;
+	trigger->atomic_write.buf = buf;
+	trigger->atomic_write.count = count;
+	trigger->atomic_write.desc = desc;
+	trigger->atomic_write.dest_addr = dest_addr;
+	trigger->atomic_write.addr = addr;
+	trigger->atomic_write.key = key;
+	trigger->atomic_write.datatype = datatype;
+	trigger->atomic_write.atomic_op = op;
+	trigger->atomic_write.context = context;
+	trigger->atomic_write.flags = flags & ~FI_TRIGGER;
+
+	psmx3_cntr_add_trigger(trigger->cntr, trigger);
+	return 0;
+}
+
+static inline
+int psmx3_trigger_queue_atomic_writev(struct fid_ep *ep,
+				      const struct fi_ioc *iov,
+				      void **desc, size_t count,
+				      fi_addr_t dest_addr,
+				      uint64_t addr, uint64_t key,
+				      enum fi_datatype datatype,
+				      enum fi_op op, void *context,
+				      uint64_t flags)
+{
+	struct psmx3_trigger *trigger;
+	struct fi_triggered_context *ctxt = context;
+
+	trigger = calloc(1, sizeof(*trigger));
+	if (!trigger)
+		return -FI_ENOMEM;
+
+	trigger->op = PSMX3_TRIGGERED_ATOMIC_WRITEV;
+	trigger->cntr = container_of(ctxt->trigger.threshold.cntr,
+				     struct psmx3_fid_cntr, cntr);
+	trigger->threshold = ctxt->trigger.threshold.threshold;
+	trigger->atomic_writev.ep = ep;
+	trigger->atomic_writev.iov = iov;
+	trigger->atomic_writev.count = count;
+	trigger->atomic_writev.desc = desc;
+	trigger->atomic_writev.dest_addr = dest_addr;
+	trigger->atomic_writev.addr = addr;
+	trigger->atomic_writev.key = key;
+	trigger->atomic_writev.datatype = datatype;
+	trigger->atomic_writev.atomic_op = op;
+	trigger->atomic_writev.context = context;
+	trigger->atomic_writev.flags = flags & ~FI_TRIGGER;
+
+	psmx3_cntr_add_trigger(trigger->cntr, trigger);
+	return 0;
+}
+
+static inline
+int psmx3_trigger_queue_atomic_readwrite(struct fid_ep *ep,
+				         const void *buf,
+				         size_t count, void *desc,
+				         void *result, void *result_desc,
+				         fi_addr_t dest_addr,
+				         uint64_t addr, uint64_t key,
+				         enum fi_datatype datatype,
+				         enum fi_op op, void *context,
+				         uint64_t flags)
+{
+	struct psmx3_trigger *trigger;
+	struct fi_triggered_context *ctxt = context;
+
+	trigger = calloc(1, sizeof(*trigger));
+	if (!trigger)
+		return -FI_ENOMEM;
+
+	trigger->op = PSMX3_TRIGGERED_ATOMIC_READWRITE;
+	trigger->cntr = container_of(ctxt->trigger.threshold.cntr,
+				     struct psmx3_fid_cntr, cntr);
+	trigger->threshold = ctxt->trigger.threshold.threshold;
+	trigger->atomic_readwrite.ep = ep;
+	trigger->atomic_readwrite.buf = buf;
+	trigger->atomic_readwrite.count = count;
+	trigger->atomic_readwrite.desc = desc;
+	trigger->atomic_readwrite.result = result;
+	trigger->atomic_readwrite.result_desc = result_desc;
+	trigger->atomic_readwrite.dest_addr = dest_addr;
+	trigger->atomic_readwrite.addr = addr;
+	trigger->atomic_readwrite.key = key;
+	trigger->atomic_readwrite.datatype = datatype;
+	trigger->atomic_readwrite.atomic_op = op;
+	trigger->atomic_readwrite.context = context;
+	trigger->atomic_readwrite.flags = flags & ~FI_TRIGGER;
+
+	psmx3_cntr_add_trigger(trigger->cntr, trigger);
+	return 0;
+}
+
+static inline
+int psmx3_trigger_queue_atomic_readwritev(struct fid_ep *ep,
+					  const struct fi_ioc *iov,
+					  void **desc, size_t count,
+					  struct fi_ioc *resultv,
+					  void **result_desc,
+					  size_t result_count,
+					  fi_addr_t dest_addr,
+					  uint64_t addr, uint64_t key,
+					  enum fi_datatype datatype,
+					  enum fi_op op, void *context,
+					  uint64_t flags)
+{
+	struct psmx3_trigger *trigger;
+	struct fi_triggered_context *ctxt = context;
+
+	trigger = calloc(1, sizeof(*trigger));
+	if (!trigger)
+		return -FI_ENOMEM;
+
+	trigger->op = PSMX3_TRIGGERED_ATOMIC_READWRITEV;
+	trigger->cntr = container_of(ctxt->trigger.threshold.cntr,
+				     struct psmx3_fid_cntr, cntr);
+	trigger->threshold = ctxt->trigger.threshold.threshold;
+	trigger->atomic_readwritev.ep = ep;
+	trigger->atomic_readwritev.iov = iov;
+	trigger->atomic_readwritev.count = count;
+	trigger->atomic_readwritev.desc = desc;
+	trigger->atomic_readwritev.resultv = resultv;
+	trigger->atomic_readwritev.result_desc = result_desc;
+	trigger->atomic_readwritev.result_count = result_count;
+	trigger->atomic_readwritev.dest_addr = dest_addr;
+	trigger->atomic_readwritev.addr = addr;
+	trigger->atomic_readwritev.key = key;
+	trigger->atomic_readwritev.datatype = datatype;
+	trigger->atomic_readwritev.atomic_op = op;
+	trigger->atomic_readwritev.context = context;
+	trigger->atomic_readwritev.flags = flags & ~FI_TRIGGER;
+
+	psmx3_cntr_add_trigger(trigger->cntr, trigger);
+	return 0;
+}
+
+static inline
+int psmx3_trigger_queue_atomic_compwrite(struct fid_ep *ep,
+				         const void *buf,
+				         size_t count, void *desc,
+				         const void *compare, void *compare_desc,
+				         void *result, void *result_desc,
+				         fi_addr_t dest_addr,
+				         uint64_t addr, uint64_t key,
+				         enum fi_datatype datatype,
+				         enum fi_op op, void *context,
+				         uint64_t flags)
+{
+	struct psmx3_trigger *trigger;
+	struct fi_triggered_context *ctxt = context;
+
+	trigger = calloc(1, sizeof(*trigger));
+	if (!trigger)
+		return -FI_ENOMEM;
+
+	trigger->op = PSMX3_TRIGGERED_ATOMIC_COMPWRITE;
+	trigger->cntr = container_of(ctxt->trigger.threshold.cntr,
+				     struct psmx3_fid_cntr, cntr);
+	trigger->threshold = ctxt->trigger.threshold.threshold;
+	trigger->atomic_compwrite.ep = ep;
+	trigger->atomic_compwrite.buf = buf;
+	trigger->atomic_compwrite.count = count;
+	trigger->atomic_compwrite.desc = desc;
+	trigger->atomic_compwrite.compare = compare;
+	trigger->atomic_compwrite.compare_desc = compare_desc;
+	trigger->atomic_compwrite.result = result;
+	trigger->atomic_compwrite.result_desc = result_desc;
+	trigger->atomic_compwrite.dest_addr = dest_addr;
+	trigger->atomic_compwrite.addr = addr;
+	trigger->atomic_compwrite.key = key;
+	trigger->atomic_compwrite.datatype = datatype;
+	trigger->atomic_compwrite.atomic_op = op;
+	trigger->atomic_compwrite.context = context;
+	trigger->atomic_compwrite.flags = flags & ~FI_TRIGGER;
+
+	psmx3_cntr_add_trigger(trigger->cntr, trigger);
+	return 0;
+}
+
+static inline
+int psmx3_trigger_queue_atomic_compwritev(struct fid_ep *ep,
+					  const struct fi_ioc *iov,
+					  void **desc, size_t count,
+					  const struct fi_ioc *comparev,
+					  void **compare_desc,
+					  size_t compare_count,
+					  struct fi_ioc *resultv,
+					  void **result_desc,
+					  size_t result_count,
+					  fi_addr_t dest_addr,
+					  uint64_t addr, uint64_t key,
+					  enum fi_datatype datatype,
+					  enum fi_op op, void *context,
+					  uint64_t flags)
+{
+	struct psmx3_trigger *trigger;
+	struct fi_triggered_context *ctxt = context;
+
+	trigger = calloc(1, sizeof(*trigger));
+	if (!trigger)
+		return -FI_ENOMEM;
+
+	trigger->op = PSMX3_TRIGGERED_ATOMIC_COMPWRITEV;
+	trigger->cntr = container_of(ctxt->trigger.threshold.cntr,
+				     struct psmx3_fid_cntr, cntr);
+	trigger->threshold = ctxt->trigger.threshold.threshold;
+	trigger->atomic_compwritev.ep = ep;
+	trigger->atomic_compwritev.iov = iov;
+	trigger->atomic_compwritev.desc = desc;
+	trigger->atomic_compwritev.count = count;
+	trigger->atomic_compwritev.comparev = comparev;
+	trigger->atomic_compwritev.compare_desc = compare_desc;
+	trigger->atomic_compwritev.compare_count = compare_count;
+	trigger->atomic_compwritev.resultv = resultv;
+	trigger->atomic_compwritev.result_desc = result_desc;
+	trigger->atomic_compwritev.result_count = result_count;
+	trigger->atomic_compwritev.dest_addr = dest_addr;
+	trigger->atomic_compwritev.addr = addr;
+	trigger->atomic_compwritev.key = key;
+	trigger->atomic_compwritev.datatype = datatype;
+	trigger->atomic_compwritev.atomic_op = op;
+	trigger->atomic_compwritev.context = context;
+	trigger->atomic_compwritev.flags = flags & ~FI_TRIGGER;
+
+	psmx3_cntr_add_trigger(trigger->cntr, trigger);
+	return 0;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/deps/libfabric/prov/psm3/src/psmx3_trx_ctxt.c b/deps/libfabric/prov/psm3/src/psmx3_trx_ctxt.c
new file mode 100644
index 0000000000000000000000000000000000000000..83dcee051ebc706d72e43d1d716162d5f779e462
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psmx3_trx_ctxt.c
@@ -0,0 +1,366 @@
+/*
+ * Copyright (c) 2013-2019 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psmx3.h"
+
+int psmx3_trx_ctxt_cnt = 0;
+
+/*
+ * Tx/Rx context disconnect protocol:
+ *
+ * TRX_CTXT disconnect REQ:
+ *	args[0].u32w0	cmd
+ *
+ * Before a PSM2 endpoint is closed, a TRX_CTXT disconnect REQ is sent to
+ * all connected peers. Each peer then calls psm2_ep_disconnect() to clean
+ * up the local connection state. This allows a future endpoint with the
+ * same epid to connect to the same peers.
+ */
+
+struct disconnect_args {
+	struct psmx3_trx_ctxt	*trx_ctxt;
+	psm2_epaddr_t		epaddr;
+};
+
+static void *disconnect_func(void *args)
+{
+	struct disconnect_args *disconn = args;
+	struct psmx3_trx_ctxt *trx_ctxt = disconn->trx_ctxt;
+	struct psmx3_epaddr_context *epaddr_context;
+	psm2_error_t errors;
+
+	FI_INFO(&psmx3_prov, FI_LOG_CORE,
+		"psm2_ep: %p, epaddr: %p\n", trx_ctxt->psm2_ep, disconn->epaddr);
+
+	trx_ctxt->domain->peer_lock_fn(&trx_ctxt->peer_lock, 2);
+	dlist_remove_first_match(&trx_ctxt->peer_list,
+				 psmx3_peer_match, disconn->epaddr);
+	trx_ctxt->domain->peer_unlock_fn(&trx_ctxt->peer_lock, 2);
+	if (trx_ctxt->ep && trx_ctxt->ep->av)
+		psmx3_av_remove_conn(trx_ctxt->ep->av, trx_ctxt, disconn->epaddr);
+
+	epaddr_context = psm2_epaddr_getctxt(disconn->epaddr);
+	psm2_epaddr_setctxt(disconn->epaddr, NULL);
+	free(epaddr_context);
+
+	psm2_ep_disconnect2(trx_ctxt->psm2_ep, 1, &disconn->epaddr, NULL,
+			    &errors, PSM2_EP_DISCONNECT_FORCE, 0);
+
+	free(args);
+	return NULL;
+}
+
+int psmx3_am_trx_ctxt_handler(psm2_am_token_t token, psm2_amarg_t *args,
+			      int nargs, void *src, uint32_t len, void *hctx)
+{
+	psm2_epaddr_t epaddr;
+	int err = 0;
+	int cmd;
+	struct disconnect_args *disconn;
+	pthread_t disconnect_thread;
+	struct psmx3_trx_ctxt *trx_ctxt;
+	trx_ctxt = (struct psmx3_trx_ctxt *)hctx;
+
+	psm2_am_get_source(token, &epaddr);
+	cmd = PSMX3_AM_GET_OP(args[0].u32w0);
+
+	switch(cmd) {
+	case PSMX3_AM_REQ_TRX_CTXT_DISCONNECT:
+		/*
+		 * we can't call psm2_ep_disconnect from the AM
+		 * handler. instead, create a thread to do the work.
+		 * the performance of this operation is not important.
+		 *
+		 * also put the av cleanup operations into the thread
+		 * to avoid deadlock because the AM handler may be
+		 * called with the av lock held.
+		 */
+		disconn = malloc(sizeof(*disconn));
+		if (disconn) {
+			disconn->trx_ctxt = trx_ctxt;
+			disconn->epaddr = epaddr;
+			pthread_create(&disconnect_thread, NULL,
+				       disconnect_func, disconn);
+			pthread_detach(disconnect_thread);
+		}
+		break;
+
+	default:
+		err = -FI_EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+void psmx3_trx_ctxt_disconnect_peers(struct psmx3_trx_ctxt *trx_ctxt)
+{
+	struct dlist_entry *item, *tmp;
+	struct psmx3_epaddr_context *peer;
+	struct dlist_entry peer_list;
+	psm2_amarg_t arg;
+	int err;
+
+	arg.u32w0 = PSMX3_AM_REQ_TRX_CTXT_DISCONNECT;
+
+	/* use local peer_list to avoid entering AM handler while holding the lock */
+	dlist_init(&peer_list);
+	trx_ctxt->domain->peer_lock_fn(&trx_ctxt->peer_lock, 2);
+	dlist_foreach_safe(&trx_ctxt->peer_list, item, tmp) {
+		dlist_remove(item);
+		dlist_insert_before(item, &peer_list);
+	}
+	trx_ctxt->domain->peer_unlock_fn(&trx_ctxt->peer_lock, 2);
+
+	dlist_foreach_safe(&peer_list, item, tmp) {
+		peer = container_of(item, struct psmx3_epaddr_context, entry);
+		if (psmx3_env.disconnect) {
+			FI_INFO(&psmx3_prov, FI_LOG_CORE, "epaddr: %p\n", peer->epaddr);
+			err = psm2_am_request_short(peer->epaddr,
+						    PSMX3_AM_TRX_CTXT_HANDLER,
+						    &arg, 1, NULL, 0, 0, NULL,
+						    NULL);
+			if (err)
+				FI_INFO(&psmx3_prov, FI_LOG_CORE,
+					"failed to send disconnect, err %d\n",
+					err);
+		}
+		psm2_epaddr_setctxt(peer->epaddr, NULL);
+		free(peer);
+	}
+}
+
+static const char *psmx3_usage_flags_to_string(int usage_flags)
+{
+	switch (usage_flags & PSMX3_TX_RX) {
+	case PSMX3_TX: return "tx";
+	case PSMX3_RX: return "rx";
+	default: return "tx+rx";
+	}
+}
+
+void psmx3_trx_ctxt_free(struct psmx3_trx_ctxt *trx_ctxt, int usage_flags)
+{
+	int err;
+	int old_flags;
+
+	if (!trx_ctxt)
+		return;
+
+	old_flags = trx_ctxt->usage_flags;
+	trx_ctxt->usage_flags &= ~usage_flags;
+	if (trx_ctxt->usage_flags) {
+		FI_INFO(&psmx3_prov, FI_LOG_CORE, "epid: %016lx (%s -> %s)\n",
+			trx_ctxt->psm2_epid,
+			psmx3_usage_flags_to_string(old_flags),
+			psmx3_usage_flags_to_string(trx_ctxt->usage_flags));
+		return;
+	}
+
+	FI_INFO(&psmx3_prov, FI_LOG_CORE, "epid: %016lx (%s)\n",
+		trx_ctxt->psm2_epid, psmx3_usage_flags_to_string(old_flags));
+
+	trx_ctxt->am_progress = 0;
+	trx_ctxt->poll_active = 0;
+
+	trx_ctxt->domain->trx_ctxt_lock_fn(&trx_ctxt->domain->trx_ctxt_lock, 1);
+	dlist_remove(&trx_ctxt->entry);
+	trx_ctxt->domain->trx_ctxt_unlock_fn(&trx_ctxt->domain->trx_ctxt_lock, 1);
+
+	psmx3_trx_ctxt_disconnect_peers(trx_ctxt);
+
+	if (trx_ctxt->am_initialized)
+		psmx3_am_fini(trx_ctxt);
+
+#if 0
+	/* AM messages could arrive after MQ is finalized, causing segfault
+	 * when trying to dereference the MQ pointer. There is no mechanism
+	 * to properly shutdown AM. The workaround is to keep MQ valid.
+	 */
+	psm2_mq_finalize(trx_ctxt->psm2_mq);
+#endif
+
+	/* workaround for:
+	 * Assertion failure at psm2_ep.c:1059: ep->mctxt_master == ep
+	 */
+	if (psmx3_env.delay)
+		sleep(psmx3_env.delay);
+
+	if (psmx3_env.timeout)
+		err = psm2_ep_close(trx_ctxt->psm2_ep, PSM2_EP_CLOSE_GRACEFUL,
+				    (int64_t) psmx3_env.timeout * 1000000000LL);
+	else
+		err = PSM2_EP_CLOSE_TIMEOUT;
+
+	if (err != PSM2_OK)
+		psm2_ep_close(trx_ctxt->psm2_ep, PSM2_EP_CLOSE_FORCE, 0);
+
+	ofi_bufpool_destroy(trx_ctxt->am_req_pool);
+	fastlock_destroy(&trx_ctxt->am_req_pool_lock);
+	fastlock_destroy(&trx_ctxt->poll_lock);
+	fastlock_destroy(&trx_ctxt->peer_lock);
+
+	if (!ofi_atomic_dec32(&trx_ctxt->poll_refcnt))
+		free(trx_ctxt);
+}
+
+struct psmx3_trx_ctxt *psmx3_trx_ctxt_alloc(struct psmx3_fid_domain *domain,
+					    struct psmx3_ep_name *src_addr,
+					    int sep_ctxt_idx,
+					    int usage_flags,
+					    uint8_t *uuid)
+{
+	struct psmx3_trx_ctxt *trx_ctxt;
+	struct psm2_ep_open_opts opts;
+	int should_retry = 0;
+	int err;
+	struct dlist_entry *item;
+	int asked_flags = usage_flags & PSMX3_TX_RX;
+	int compatible_flags = ~asked_flags & PSMX3_TX_RX;
+
+	if (!uuid)
+		uuid = domain->fabric->uuid;
+
+	/* Check existing allocations first if only Tx or Rx is needed */
+	if (compatible_flags) {
+		domain->trx_ctxt_lock_fn(&domain->trx_ctxt_lock, 1);
+		dlist_foreach(&domain->trx_ctxt_list, item) {
+			trx_ctxt = container_of(item, struct psmx3_trx_ctxt, entry);
+			if (compatible_flags == trx_ctxt->usage_flags &&
+			    !memcmp(uuid, trx_ctxt->uuid, sizeof(psm2_uuid_t))) {
+				trx_ctxt->usage_flags |= asked_flags;
+				domain->trx_ctxt_unlock_fn(&domain->trx_ctxt_lock, 1);
+				FI_INFO(&psmx3_prov, FI_LOG_CORE,
+					"use existing context. epid: %016lx "
+					"(%s -> tx+rx).\n", trx_ctxt->psm2_epid,
+					psmx3_usage_flags_to_string(compatible_flags));
+				return trx_ctxt;
+			}
+		}
+		domain->trx_ctxt_unlock_fn(&domain->trx_ctxt_lock, 1);
+	}
+
+	if (psmx3_trx_ctxt_cnt >= psmx3_hfi_info.max_trx_ctxt) {
+		FI_WARN(&psmx3_prov, FI_LOG_CORE,
+			"number of Tx/Rx contexts exceeds limit (%d).\n",
+			psmx3_hfi_info.max_trx_ctxt);
+		return NULL;
+	}
+
+	trx_ctxt = calloc(1, sizeof(*trx_ctxt));
+	if (!trx_ctxt) {
+		FI_WARN(&psmx3_prov, FI_LOG_CORE,
+			"failed to allocate trx_ctxt.\n");
+		return NULL;
+	}
+
+	err = ofi_bufpool_create(&trx_ctxt->am_req_pool,
+				 sizeof(struct psmx3_am_request),
+				 sizeof(void *), 0, 64, 0);
+	if (err) {
+		FI_WARN(&psmx3_prov, FI_LOG_CORE,
+			"failed to allocate am_req_pool.\n");
+		goto err_out;
+	}
+
+	psm2_ep_open_opts_get_defaults(&opts);
+	memcpy(trx_ctxt->uuid, uuid, sizeof(psm2_uuid_t));
+	FI_INFO(&psmx3_prov, FI_LOG_CORE,
+		"uuid: %s\n", psmx3_uuid_to_string(uuid));
+
+	opts.unit = src_addr ? src_addr->unit : PSMX3_DEFAULT_UNIT;
+	opts.port = src_addr ? src_addr->port : PSMX3_DEFAULT_PORT;
+	FI_INFO(&psmx3_prov, FI_LOG_CORE,
+		"ep_open_opts: unit=%d port=%u\n", opts.unit, opts.port);
+
+	err = psm2_ep_open(uuid, &opts,
+			   &trx_ctxt->psm2_ep, &trx_ctxt->psm2_epid);
+	if (err != PSM2_OK) {
+		FI_WARN(&psmx3_prov, FI_LOG_CORE,
+			"psm2_ep_open returns %d, errno=%d\n", err, errno);
+		if (!should_retry)
+			goto err_out_destroy_pool;
+
+		/* When round-robin fails, retry w/o explicit assignment */
+		opts.unit = -1;
+		err = psm2_ep_open(uuid, &opts,
+				   &trx_ctxt->psm2_ep, &trx_ctxt->psm2_epid);
+		if (err != PSM2_OK) {
+			FI_WARN(&psmx3_prov, FI_LOG_CORE,
+				"psm2_ep_open retry returns %d, errno=%d\n", err, errno);
+			goto err_out_destroy_pool;
+		}
+	}
+
+	FI_INFO(&psmx3_prov, FI_LOG_CORE,
+		"epid: %016lx (%s)\n", trx_ctxt->psm2_epid,
+		psmx3_usage_flags_to_string(usage_flags));
+
+	err = psm2_mq_init(trx_ctxt->psm2_ep, PSM2_MQ_ORDERMASK_ALL,
+			   NULL, 0, &trx_ctxt->psm2_mq);
+	if (err != PSM2_OK) {
+		FI_WARN(&psmx3_prov, FI_LOG_CORE,
+			"psm2_mq_init returns %d, errno=%d\n", err, errno);
+		goto err_out_close_ep;
+	}
+
+	fastlock_init(&trx_ctxt->peer_lock);
+	fastlock_init(&trx_ctxt->poll_lock);
+	fastlock_init(&trx_ctxt->am_req_pool_lock);
+	fastlock_init(&trx_ctxt->trigger_queue.lock);
+	dlist_init(&trx_ctxt->peer_list);
+	slist_init(&trx_ctxt->trigger_queue.list);
+	trx_ctxt->id = psmx3_trx_ctxt_cnt++;
+	trx_ctxt->domain = domain;
+	trx_ctxt->usage_flags = asked_flags;
+	trx_ctxt->poll_active = 1;
+	ofi_atomic_initialize32(&trx_ctxt->poll_refcnt, 1); /* take one ref for domain->trx_ctxt_list */
+
+	domain->trx_ctxt_lock_fn(&domain->trx_ctxt_lock, 1);
+	dlist_insert_before(&trx_ctxt->entry, &domain->trx_ctxt_list);
+	domain->trx_ctxt_unlock_fn(&domain->trx_ctxt_lock, 1);
+
+	return trx_ctxt;
+
+err_out_close_ep:
+	if (psm2_ep_close(trx_ctxt->psm2_ep, PSM2_EP_CLOSE_GRACEFUL,
+			  (int64_t) psmx3_env.timeout * 1000000000LL) != PSM2_OK)
+		psm2_ep_close(trx_ctxt->psm2_ep, PSM2_EP_CLOSE_FORCE, 0);
+
+err_out_destroy_pool:
+	ofi_bufpool_destroy(trx_ctxt->am_req_pool);
+
+err_out:
+	free(trx_ctxt);
+	return NULL;
+}
+
diff --git a/deps/libfabric/prov/psm3/src/psmx3_util.c b/deps/libfabric/prov/psm3/src/psmx3_util.c
new file mode 100644
index 0000000000000000000000000000000000000000..fb3865124cc5c07a694a47ef634b56a896af3016
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psmx3_util.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2013-2017 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psmx3.h"
+
+static void psmx3_string_to_uuid(const char *s, psm2_uuid_t uuid)
+{
+	int n;
+
+	if (!s) {
+		memset(uuid, 0, sizeof(psm2_uuid_t));
+		return;
+	}
+
+	n = sscanf(s,
+		"%2hhx%2hhx%2hhx%2hhx-"
+		"%2hhx%2hhx-%2hhx%2hhx-%2hhx%2hhx-"
+		"%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx",
+		&uuid[0], &uuid[1], &uuid[2], &uuid[3],
+		&uuid[4], &uuid[5], &uuid[6], &uuid[7], &uuid[8], &uuid[9],
+		&uuid[10], &uuid[11], &uuid[12], &uuid[13], &uuid[14], &uuid[15]);
+
+	if (n != 16) {
+		FI_WARN(&psmx3_prov, FI_LOG_CORE,
+				"wrong uuid format: %s\n", s);
+		FI_WARN(&psmx3_prov, FI_LOG_CORE,
+			"correct uuid format is: "
+			"xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx\n");
+	}
+}
+
+void psmx3_get_uuid(psm2_uuid_t uuid)
+{
+	psmx3_string_to_uuid(psmx3_env.uuid, uuid);
+}
+int psmx3_override_uuid(void)
+{
+	return psmx3_env.uuid_override;
+}
+
+int psmx3_uuid_to_port(psm2_uuid_t uuid)
+{
+	uint16_t port;
+	uint16_t *u = (uint16_t *)uuid;
+
+	port = u[0] + u[1] + u[2] + u[3] + u[4] + u[5] + u[6] + u[7];
+	if (port < 4096)
+		port += 4096;
+
+	return (int)port;
+}
+
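+/*
+ * Worked example (for illustration): the port is the sum of the eight
+ * 16-bit words of the UUID, wrapping modulo 65536; an all-zero UUID sums
+ * to 0, which is below 4096 and gets bumped to port 4096. Any two
+ * processes configured with the same UUID therefore derive the same port.
+ */
+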
+char *psmx3_uuid_to_string(psm2_uuid_t uuid)
+{
+	static char s[40];
+
+	sprintf(s,
+		"%02hhX%02hhX%02hhX%02hhX-"
+		"%02hhX%02hhX-%02hhX%02hhX-%02hhX%02hhX-"
+		"%02hhX%02hhX%02hhX%02hhX%02hhX%02hhX",
+		uuid[0], uuid[1], uuid[2], uuid[3],
+		uuid[4], uuid[5], uuid[6], uuid[7], uuid[8], uuid[9],
+		uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15]);
+
+	return s;
+}
+
+void *psmx3_ep_name_to_string(const struct psmx3_ep_name *name, size_t *len)
+{
+	char *s;
+
+	if (!name)
+		return NULL;
+
+	*len = PSMX3_MAX_STRING_NAME_LEN;
+
+	s = calloc(*len, 1);
+	if (!s)
+		return NULL;
+
+	if (!ofi_straddr((void *)s, len, FI_ADDR_PSMX3, name)) {
+		free(s);
+		return NULL;
+	}
+
+	return s;
+}
+
+struct psmx3_ep_name *psmx3_string_to_ep_name(const void *s)
+{
+	void *name;
+	size_t len;
+	uint32_t fmt;
+
+	if (!s)
+		return NULL;
+
+	if (ofi_str_toaddr(s, &fmt, &name, &len)) {
+		FI_INFO(&psmx3_prov, FI_LOG_CORE,
+			"invalid string address: %s.\n",
+			(const char *)s);
+		return NULL;
+	}
+
+	if (fmt != FI_ADDR_PSMX3) {
+		FI_INFO(&psmx3_prov, FI_LOG_CORE,
+			"invalid string address format: %s.\n",
+			(const char *)s);
+		free(name);
+		return NULL;
+	}
+
+	return name;
+}
+
+static int psmx3_errno_table[PSM2_ERROR_LAST] = {
+	0,		/* PSM2_OK = 0 */
+	0,		/* PSM2_OK_NO_PROGRESS = 1 */
+	-FI_EOTHER,
+	-FI_EINVAL,	/* PSM2_PARAM_ERR = 3 */
+	-FI_ENOMEM, 	/* PSM2_NO_MEMORY = 4 */
+	-FI_EBADF,	/* PSM2_INIT_NOT_INIT = 5 */
+	-FI_EINVAL,	/* PSM2_INIT_BAD_API_VERSION = 6 */
+	-FI_ENOSYS,	/* PSM2_NO_AFFINITY = 7 */
+	-FI_EIO,	/* PSM2_INTERNAL_ERR = 8 */
+	-FI_EINVAL,	/* PSM2_SHMEM_SEGMENT_ERR = 9 */
+	-FI_EACCES,	/* PSM2_OPT_READONLY = 10 */
+	-FI_ETIMEDOUT,	/* PSM2_TIMEOUT = 11 */
+	-FI_EMFILE,	/* PSM2_TOO_MANY_ENDPOINTS = 12 */
+	-FI_ESHUTDOWN,	/* PSM2_IS_FINALIZED = 13 */
+	-FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER,
+	-FI_ESHUTDOWN,	/* PSM2_EP_WAS_CLOSED = 20 */
+	-FI_ENODEV,	/* PSM2_EP_NO_DEVICE = 21 */
+	-FI_ENOENT,	/* PSM2_EP_UNIT_NOT_FOUND = 22 */
+	-FI_EIO,	/* PSM2_EP_DEVICE_FAILURE = 23 */
+	-FI_ETIMEDOUT, 	/* PSM2_EP_CLOSE_TIMEOUT = 24 */
+	-FI_ENOENT,	/* PSM2_EP_NO_PORTS_AVAIL = 25 */
+	-FI_ENETDOWN,	/* PSM2_EP_NO_NETWORK = 26 */
+	-FI_EINVAL,	/* PSM2_EP_INVALID_UUID_KEY = 27 */
+	-FI_ENOSPC,	/* PSM2_EP_NO_RESOURCES = 28 */
+	-FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER,
+	-FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER,
+	-FI_EBADF,	/* PSM2_EPID_UNKNOWN = 40 */
+	-FI_ENETUNREACH,/* PSM2_EPID_UNREACHABLE = 41 */
+	-FI_EOTHER,
+	-FI_EINVAL,	/* PSM2_EPID_INVALID_NODE = 43 */
+	-FI_EINVAL,	/* PSM2_EPID_INVALID_MTU =  44 */
+	-FI_EINVAL,	/* PSM2_EPID_INVALID_UUID_KEY = 45 */
+	-FI_EINVAL,	/* PSM2_EPID_INVALID_VERSION = 46 */
+	-FI_EINVAL,	/* PSM2_EPID_INVALID_CONNECT = 47 */
+	-FI_EISCONN,	/* PSM2_EPID_ALREADY_CONNECTED = 48 */
+	-FI_EIO,	/* PSM2_EPID_NETWORK_ERROR = 49 */
+	-FI_EINVAL,	/* PSM2_EPID_INVALID_PKEY = 50 */
+	-FI_ENETUNREACH,/* PSM2_EPID_PATH_RESOLUTION = 51 */
+	-FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER,
+	-FI_EOTHER, -FI_EOTHER,
+	-FI_EAGAIN,	/* PSM2_MQ_NO_COMPLETIONS = 60 */
+	-FI_ETRUNC,	/* PSM2_MQ_TRUNCATION = 61 */
+	-FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER,
+	-FI_EOTHER, -FI_EOTHER,
+	-FI_EINVAL,	/* PSM2_AM_INVALID_REPLY = 70 */
+	-FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER,
+	-FI_EOTHER, -FI_EOTHER, -FI_EOTHER
+			/* PSM2_ERROR_LAST = 80 */
+};
+
+int psmx3_errno(int err)
+{
+	if (err >= 0 && err < PSM2_ERROR_LAST)
+		return psmx3_errno_table[err];
+	else
+		return -FI_EOTHER;
+}
+
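+/*
+ * Example (illustrative): psmx3_errno(PSM2_TIMEOUT) returns -FI_ETIMEDOUT
+ * per the table above, while any code outside [0, PSM2_ERROR_LAST) falls
+ * back to -FI_EOTHER.
+ */
+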
+/*
+ * PSM context sharing requires some information from the MPI process manager.
+ * Try to get the needed information from the environment.
+ */
+void psmx3_query_mpi(void)
+{
+	char *s;
+	char env[32];
+	int local_size = -1;
+	int local_rank = -1;
+
+	/* Check Open MPI */
+	if ((s = getenv("OMPI_COMM_WORLD_LOCAL_SIZE"))) {
+		local_size = atoi(s);
+		if ((s = getenv("OMPI_COMM_WORLD_LOCAL_RANK")))
+			local_rank = atoi(s);
+		snprintf(env, sizeof(env), "%d", local_size);
+		setenv("MPI_LOCALNRANKS", env, 0);
+		snprintf(env, sizeof(env), "%d", local_rank);
+		setenv("MPI_LOCALRANKID", env, 0);
+		return;
+	}
+
+	/* TODO: check other MPI */
+}
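+
+/*
+ * Worked example (illustrative): under Open MPI with
+ * OMPI_COMM_WORLD_LOCAL_SIZE=4 and OMPI_COMM_WORLD_LOCAL_RANK=2, this sets
+ * MPI_LOCALNRANKS=4 and MPI_LOCALRANKID=2. setenv() is called with
+ * overwrite=0, so values already present in the environment win.
+ */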
+
diff --git a/deps/libfabric/prov/psm3/src/psmx3_wait.c b/deps/libfabric/prov/psm3/src/psmx3_wait.c
new file mode 100644
index 0000000000000000000000000000000000000000..f57ac046fce256f0942eebce356a056a9ab9c3b0
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/psmx3_wait.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2013-2019 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psmx3.h"
+
+/* It is necessary to have a separate thread making progress in order
+ * for the wait functions to succeed. This thread is only created when
+ * wait functions are called and. In order to minimize performance
+ * impact, it only goes active during te time when wait calls are
+ * blocked.
+ */
+static pthread_t	psmx3_wait_thread;
+static pthread_mutex_t	psmx3_wait_mutex;
+static pthread_cond_t	psmx3_wait_cond;
+static volatile int	psmx3_wait_thread_ready = 0;
+static volatile int	psmx3_wait_thread_enabled = 0;
+static volatile int	psmx3_wait_thread_busy = 0;
+
+static void *psmx3_wait_progress(void *args)
+{
+	struct psmx3_fid_fabric *fabric = args;
+	struct psmx3_fid_domain *domain;
+	struct dlist_entry *item;
+
+	psmx3_wait_thread_ready = 1;
+	pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
+	pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
+
+	while (1) {
+		pthread_mutex_lock(&psmx3_wait_mutex);
+		if (!psmx3_wait_thread_enabled)
+			pthread_cond_wait(&psmx3_wait_cond, &psmx3_wait_mutex);
+		pthread_mutex_unlock(&psmx3_wait_mutex);
+		pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
+
+		psmx3_wait_thread_busy = 1;
+		while (psmx3_wait_thread_enabled) {
+			psmx3_lock(&fabric->domain_lock, 1);
+			dlist_foreach(&fabric->domain_list, item) {
+				domain = container_of(item, struct psmx3_fid_domain, entry);
+				if (domain->progress_thread_enabled &&
+				    domain->progress_thread != pthread_self())
+					continue;
+
+				psmx3_progress_all(domain);
+
+				if (!psmx3_wait_thread_enabled)
+					break;
+			}
+			psmx3_unlock(&fabric->domain_lock, 1);
+		}
+
+		psmx3_wait_thread_busy = 0;
+
+		pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
+	}
+
+	return NULL;
+}
+
+static void psmx3_wait_start_progress(struct psmx3_fid_fabric *fabric)
+{
+	struct dlist_entry *item;
+	struct psmx3_fid_domain *domain;
+	int run_wait_thread = 0;
+	pthread_attr_t attr;
+	int err;
+
+	if (!fabric)
+		return;
+
+	psmx3_lock(&fabric->domain_lock, 1);
+	dlist_foreach(&fabric->domain_list, item) {
+		domain = container_of(item, struct psmx3_fid_domain, entry);
+		if (!domain->progress_thread_enabled ||
+		    domain->progress_thread == pthread_self())
+			run_wait_thread = 1;
+	}
+	psmx3_unlock(&fabric->domain_lock, 1);
+
+	if (!run_wait_thread)
+		return;
+
+	if (!psmx3_wait_thread) {
+		pthread_mutex_init(&psmx3_wait_mutex, NULL);
+		pthread_cond_init(&psmx3_wait_cond, NULL);
+		pthread_attr_init(&attr);
+		pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_DETACHED);
+		err = pthread_create(&psmx3_wait_thread, &attr,
+				     psmx3_wait_progress, (void *)fabric);
+		if (err)
+			FI_WARN(&psmx3_prov, FI_LOG_EQ,
+				"cannot create wait progress thread\n");
+		pthread_attr_destroy(&attr);
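+		/* busy-wait until the detached thread signals readiness */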
+		while (!psmx3_wait_thread_ready)
+			;
+	}
+
+	psmx3_wait_thread_enabled = 1;
+	pthread_cond_signal(&psmx3_wait_cond);
+}
+
+static void psmx3_wait_stop_progress(void)
+{
+	psmx3_wait_thread_enabled = 0;
+
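+	/* spin until the progress loop observes the cleared flag and
+	 * parks itself back on the condition variable */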
+	while (psmx3_wait_thread_busy)
+		;
+}
+
+static struct fi_ops_wait *psmx3_wait_ops_save;
+static struct fi_ops_wait psmx3_wait_ops;
+
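+/* Wrapped wait call: drive progress in the background for the duration
+ * of the blocking wait, then return the saved wait op's result. */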
+DIRECT_FN
+STATIC int psmx3_wait_wait(struct fid_wait *wait, int timeout)
+{
+	struct util_wait *wait_priv;
+	struct psmx3_fid_fabric *fabric;
+	int err;
+	
+	wait_priv = container_of(wait, struct util_wait, wait_fid);
+	fabric = container_of(wait_priv->fabric, struct psmx3_fid_fabric, util_fabric);
+
+	psmx3_wait_start_progress(fabric);
+
+	err = psmx3_wait_ops_save->wait(wait, timeout);
+
+	psmx3_wait_stop_progress();
+
+	return err;
+}
+
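+/* Interpose on the util wait object: save the original fi_ops_wait,
+ * copy it, and override only .wait so that progress is driven while
+ * the caller blocks. */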
+DIRECT_FN
+int psmx3_wait_open(struct fid_fabric *fabric, struct fi_wait_attr *attr,
+		   struct fid_wait **waitset)
+{
+	struct fid_wait *wait;
+	int err;
+
+	err = ofi_wait_fd_open(fabric, attr, &wait);
+	if (err)
+		return err;
+
+	psmx3_wait_ops_save = wait->ops;
+	psmx3_wait_ops = *psmx3_wait_ops_save;
+	psmx3_wait_ops.wait = psmx3_wait_wait;
+	wait->ops = &psmx3_wait_ops;
+
+	*waitset = wait;
+	return 0;
+}
+
+DIRECT_FN
+int psmx3_wait_trywait(struct fid_fabric *fabric, struct fid **fids, int count)
+{
+	struct psmx3_fid_cq *cq_priv;
+	struct util_eq *eq;
+	struct util_wait *wait;
+	int i, ret;
+
+	for (i = 0; i < count; i++) {
+		switch (fids[i]->fclass) {
+			case FI_CLASS_CQ:
+				cq_priv = container_of(fids[i], struct psmx3_fid_cq, cq);
+				wait = cq_priv->wait;
+				break;
+			case FI_CLASS_EQ:
+				eq = container_of(fids[i], struct util_eq, eq_fid.fid);
+				wait = eq->wait;
+				break;
+			case FI_CLASS_CNTR:
+				return -FI_ENOSYS;
+			case FI_CLASS_WAIT:
+				wait = container_of(fids[i], struct util_wait, wait_fid.fid);
+				break;
+			default:
+				return -FI_EINVAL;
+		}
+
+		ret = wait->wait_try(wait);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
diff --git a/deps/libfabric/prov/psm3/src/version.h b/deps/libfabric/prov/psm3/src/version.h
new file mode 100644
index 0000000000000000000000000000000000000000..ca094cd60c515e695acef8ee2f1dbed4a3776a81
--- /dev/null
+++ b/deps/libfabric/prov/psm3/src/version.h
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2013-2020 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _FI_PSM_VERSION_H_
+#define _FI_PSM_VERSION_H_
+
+#if HAVE_PSM3_SRC
+#include "psm3/psm2.h"
+#include "psm3/psm2_mq.h"
+#include "psm3/psm2_am.h"
+#ifdef VALGRIND_MAKE_MEM_DEFINED
+#undef VALGRIND_MAKE_MEM_DEFINED
+#endif
+#else
+#include <psm2.h>
+#include <psm2_mq.h>
+#include <psm2_am.h>
+#endif
+
+#define PSMX3_PROV_NAME		"psm3"
+#define PSMX3_DOMAIN_NAME	"any_verbs_ud_device"
+#define PSMX3_FABRIC_NAME	"psm3"
+
+#define PSMX3_DEFAULT_UUID	"00FF00FF-0000-0000-0000-00FF00FF00FF"
+#define PROVIDER_INI		PSM3_INI
+
+#ifndef PSMX3_USE_REQ_CONTEXT
+#define PSMX3_USE_REQ_CONTEXT	1
+#endif
+
+#define PSMX3_STATUS_TYPE	struct psm2_mq_req_user
+#define PSMX3_STATUS_ERROR(s)	((s)->error_code)
+#define PSMX3_STATUS_TAG(s)	((s)->tag)
+#define PSMX3_STATUS_RCVLEN(s)	((s)->recv_msglen)
+#define PSMX3_STATUS_SNDLEN(s)	((s)->send_msglen)
+#define PSMX3_STATUS_PEER(s)	((s)->peer)
+#define PSMX3_STATUS_CONTEXT(s)	((s)->context)
+
+/*
+ * Use reserved space within psm2_mq_req_user for fi_context instead of
+ * allocating from an internal queue.
+ *
+ * This only works with PSM2 versions that define psm2_mq_req_user. It can
+ * be turned off by passing "-DPSMX3_USE_REQ_CONTEXT=0" to the compiler.
+ */
+
+#if PSMX3_USE_REQ_CONTEXT
+
+#define PSMX3_EP_DECL_OP_CONTEXT
+
+#define PSMX3_EP_INIT_OP_CONTEXT(ep) \
+	do { \
+		FI_INFO(&psmx3_prov, FI_LOG_EP_CTRL, \
+			"skip initialization of op context list.\n"); \
+	} while (0)
+
+#define PSMX3_EP_FINI_OP_CONTEXT(ep)
+
+#define PSMX3_EP_GET_OP_CONTEXT(ep, ctx) \
+	do { \
+		(ctx) = NULL; \
+	} while (0)
+
+#define PSMX3_EP_PUT_OP_CONTEXT(ep, ctx)
+
+#define PSMX3_REQ_GET_OP_CONTEXT(req, ctx) \
+	do { \
+		struct psm2_mq_req_user *req_user = (void *)(req); \
+		(ctx) = req_user->context = req_user->user_reserved; \
+	} while (0)
+
+#else /* !PSMX3_USE_REQ_CONTEXT */
+
+struct psmx3_context {
+        struct fi_context	fi_context;
+        struct slist_entry	list_entry;
+};
+
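+/* Fallback path: the macros below keep a per-endpoint free list of
+ * contexts.  INIT preallocates 64 entries, GET falls back to malloc()
+ * when the list runs dry, and PUT returns an entry to the list. */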
+#define PSMX3_EP_DECL_OP_CONTEXT \
+	struct slist	free_context_list; \
+	fastlock_t	context_lock;
+
+#define PSMX3_EP_INIT_OP_CONTEXT(ep) \
+	do { \
+		struct psmx3_context *item; \
+		int i; \
+		slist_init(&(ep)->free_context_list); \
+		fastlock_init(&(ep)->context_lock); \
+		for (i = 0; i < 64; i++) { \
+			item = calloc(1, sizeof(*item)); \
+			if (!item) { \
+				FI_WARN(&psmx3_prov, FI_LOG_EP_CTRL, "out of memory.\n"); \
+				break; \
+			} \
+			slist_insert_tail(&item->list_entry, &(ep)->free_context_list); \
+		} \
+	} while (0)
+
+#define PSMX3_EP_FINI_OP_CONTEXT(ep) \
+	do { \
+		struct slist_entry *entry; \
+		struct psmx3_context *item; \
+		while (!slist_empty(&(ep)->free_context_list)) { \
+			entry = slist_remove_head(&(ep)->free_context_list); \
+			item = container_of(entry, struct psmx3_context, list_entry); \
+			free(item); \
+		} \
+		fastlock_destroy(&(ep)->context_lock); \
+	} while (0)
+
+#define PSMX3_EP_GET_OP_CONTEXT(ep, ctx) \
+	do { \
+		struct psmx3_context *context; \
+		ep->domain->context_lock_fn(&(ep)->context_lock, 2); \
+		if (!slist_empty(&(ep)->free_context_list)) { \
+			context = container_of(slist_remove_head(&(ep)->free_context_list), \
+					       struct psmx3_context, list_entry); \
+			ep->domain->context_unlock_fn(&(ep)->context_lock, 2); \
+			(ctx) = &context->fi_context; \
+			break; \
+		} \
+		ep->domain->context_unlock_fn(&(ep)->context_lock, 2); \
+		context = malloc(sizeof(*context)); \
+		if (!context) { \
+			FI_WARN(&psmx3_prov, FI_LOG_EP_DATA, "out of memory.\n"); \
+			return -FI_ENOMEM; \
+		} \
+		(ctx) = &context->fi_context; \
+	} while (0)
+
+#define PSMX3_EP_PUT_OP_CONTEXT(ep, ctx) \
+	do { \
+		struct psmx3_context *context; \
+		context = container_of((ctx), struct psmx3_context, fi_context); \
+		context->list_entry.next = NULL; \
+		ep->domain->context_lock_fn(&(ep)->context_lock, 2); \
+		slist_insert_tail(&context->list_entry, &(ep)->free_context_list); \
+		ep->domain->context_unlock_fn(&(ep)->context_lock, 2); \
+	} while (0)
+
+#endif /* !PSMX3_USE_REQ_CONTEXT */
+
+uint32_t get_psm3_provider_version(void);
+
+#endif
+
diff --git a/deps/libfabric/prov/psm3/util b/deps/libfabric/prov/psm3/util
new file mode 120000
index 0000000000000000000000000000000000000000..40c3fc5bdf8a05cbcfa702e1f7ca453c5d378b9c
--- /dev/null
+++ b/deps/libfabric/prov/psm3/util
@@ -0,0 +1 @@
+../util
\ No newline at end of file
diff --git a/deps/libfabric/prov/rstream/src/rstream.h b/deps/libfabric/prov/rstream/src/rstream.h
index 70583069e669d1c846d259f91e1d3536f3da5761..0b16c0e0752e66b2a7b74dca8f59e14b16efe9d9 100644
--- a/deps/libfabric/prov/rstream/src/rstream.h
+++ b/deps/libfabric/prov/rstream/src/rstream.h
@@ -142,7 +142,7 @@ struct rstream_ctx_data {
 	size_t len;
 };
 
-DECLARE_FREESTACK(struct rstream_ctx_data, rstream_tx_ctx_fs);
+OFI_DECLARE_FREESTACK(struct rstream_ctx_data, rstream_tx_ctx_fs);
 
 struct rstream_tx_ctx {
 	struct rstream_ctx_data *tx_ctxs;
diff --git a/deps/libfabric/prov/rstream/src/rstream_msg.c b/deps/libfabric/prov/rstream/src/rstream_msg.c
index 9b5451c2ec79f10655bbc449d115e3b1192a4b94..ef33f1c1932f35be5d656bba56300ba7879b6060 100644
--- a/deps/libfabric/prov/rstream/src/rstream_msg.c
+++ b/deps/libfabric/prov/rstream/src/rstream_msg.c
@@ -118,7 +118,7 @@ static struct fi_context *rstream_get_rx_ctx(struct rstream_ep *ep)
 static struct fi_context *rstream_get_tx_ctx(struct rstream_ep *ep, int len)
 {
 	struct rstream_tx_ctx_fs *fs = ep->tx_ctxs;
-	struct rstream_ctx_data *rtn_ctx = freestack_pop(fs);
+	struct rstream_ctx_data *rtn_ctx = ofi_freestack_pop(fs);
 
 	if (!rtn_ctx)
 		return NULL;
@@ -135,7 +135,7 @@ static int rstream_return_tx_ctx(struct fi_context *ctx_ptr,
 
 	struct rstream_ctx_data *ctx_data = (struct rstream_ctx_data *)ctx_ptr;
 	len = ctx_data->len;
-	freestack_push(fs, ctx_data);
+	ofi_freestack_push(fs, ctx_data);
 
 	return len;
 }
diff --git a/deps/libfabric/prov/rxd/src/rxd_atomic.c b/deps/libfabric/prov/rxd/src/rxd_atomic.c
index c6bc2aae0cf69620cb4dafff8f3acb7d559760a4..78ead0d5e942e48aa90cec7133e802060cf7324e 100644
--- a/deps/libfabric/prov/rxd/src/rxd_atomic.c
+++ b/deps/libfabric/prov/rxd/src/rxd_atomic.c
@@ -112,7 +112,7 @@ static ssize_t rxd_generic_atomic(struct rxd_ep *rxd_ep,
 {
 	struct rxd_x_entry *tx_entry;
 	struct iovec iov[RXD_IOV_LIMIT], res_iov[RXD_IOV_LIMIT], comp_iov[RXD_IOV_LIMIT];
-	struct fi_rma_iov rma_iov[RXD_IOV_LIMIT]; 
+	struct fi_rma_iov rma_iov[RXD_IOV_LIMIT];
 	fi_addr_t rxd_addr;
 	ssize_t ret = -FI_EAGAIN;
 
@@ -133,7 +133,7 @@ static ssize_t rxd_generic_atomic(struct rxd_ep *rxd_ep,
 
 	if (ofi_cirque_isfull(rxd_ep->util_ep.tx_cq->cirq))
 		goto out;
-	
+
 	rxd_addr = (intptr_t) ofi_idx_lookup(&(rxd_ep_av(rxd_ep)->fi_addr_idx),
 					     RXD_IDX_OFFSET(addr));
 	if (!rxd_addr)
@@ -221,7 +221,7 @@ static ssize_t rxd_atomic_inject(struct fid_ep *ep_fid, const void *buf,
 	struct rxd_ep *rxd_ep = container_of(ep_fid, struct rxd_ep, util_ep.ep_fid.fid);
 	struct rxd_x_entry *tx_entry;
 	struct iovec iov;
-	struct fi_rma_iov rma_iov; 
+	struct fi_rma_iov rma_iov;
 	fi_addr_t rxd_addr;
 	ssize_t ret = -FI_EAGAIN;
 
@@ -237,7 +237,7 @@ static ssize_t rxd_atomic_inject(struct fid_ep *ep_fid, const void *buf,
 
 	if (ofi_cirque_isfull(rxd_ep->util_ep.tx_cq->cirq))
 		goto out;
-	rxd_addr = (intptr_t) ofi_idx_lookup(&(rxd_ep_av(rxd_ep)->fi_addr_idx), 
+	rxd_addr = (intptr_t) ofi_idx_lookup(&(rxd_ep_av(rxd_ep)->fi_addr_idx),
 					     RXD_IDX_OFFSET(addr));
 	if (!rxd_addr)
 		goto out;
@@ -413,7 +413,13 @@ int rxd_query_atomic(struct fid_domain *domain, enum fi_datatype datatype,
 	if (flags & FI_TAGGED) {
 		FI_WARN(&rxd_prov, FI_LOG_EP_CTRL,
 			"tagged atomic op not supported\n");
-		return -FI_EINVAL;
+		return -FI_EOPNOTSUPP;
+	}
+
+	if ((datatype == FI_INT128) || (datatype == FI_UINT128)) {
+		FI_WARN(&rxd_prov, FI_LOG_EP_CTRL,
+			"128-bit integers not supported\n");
+		return -FI_EOPNOTSUPP;
 	}
 
 	ret = ofi_atomic_valid(&rxd_prov, datatype, op, flags);
diff --git a/deps/libfabric/prov/rxd/src/rxd_av.c b/deps/libfabric/prov/rxd/src/rxd_av.c
index e4c92d03bd67464ee1adcb88db3f010883ad6ea6..8d30a98984f66cf88f043362ac73faf7007f52b4 100644
--- a/deps/libfabric/prov/rxd/src/rxd_av.c
+++ b/deps/libfabric/prov/rxd/src/rxd_av.c
@@ -44,7 +44,7 @@ static int rxd_tree_compare(struct ofi_rbmap *map, void *key, void *data)
 
 	memset(addr, 0, len);
 	av = container_of(map, struct rxd_av, rbmap);
-	dg_addr = (intptr_t)ofi_idx_lookup(&av->rxdaddr_dg_idx, 
+	dg_addr = (intptr_t)ofi_idx_lookup(&av->rxdaddr_dg_idx,
 					   (fi_addr_t) data);
 
 	ret = fi_av_lookup(av->dg_av, dg_addr,addr, &len);
@@ -109,7 +109,7 @@ close:
 static fi_addr_t rxd_av_dg_addr(struct rxd_av *av, fi_addr_t fi_addr)
 {
 	fi_addr_t dg_addr;
-	fi_addr_t rxd_addr = (intptr_t) ofi_idx_lookup(&av->fi_addr_idx, 
+	fi_addr_t rxd_addr = (intptr_t) ofi_idx_lookup(&av->fi_addr_idx,
 					     RXD_IDX_OFFSET(fi_addr));
 	if (!rxd_addr)
 		goto err;
@@ -140,8 +140,8 @@ static fi_addr_t rxd_set_fi_addr(struct rxd_av *av, fi_addr_t rxd_addr)
 	fi_addr = ofi_idx_insert(&(av->fi_addr_idx), (void*)(uintptr_t)rxd_addr);
 	if (fi_addr < 0)
 		goto nomem1;
-	
-	if (ofi_idm_set(&(av->rxdaddr_fi_idm), rxd_addr, 
+
+	if (ofi_idm_set(&(av->rxdaddr_fi_idm), rxd_addr,
 		        (void*)(uintptr_t) fi_addr) < 0)
 		goto nomem2;
 
@@ -150,7 +150,7 @@ static fi_addr_t rxd_set_fi_addr(struct rxd_av *av, fi_addr_t rxd_addr)
 nomem2:
 	ofi_idx_remove_ordered(&(av->fi_addr_idx), fi_addr);
 nomem1:
-	dg_addr = (intptr_t) ofi_idx_remove_ordered(&(av->rxdaddr_dg_idx), 
+	dg_addr = (intptr_t) ofi_idx_remove_ordered(&(av->rxdaddr_dg_idx),
 						    rxd_addr);
 	fi_av_remove(av->dg_av, &dg_addr, 1, 0);
 
@@ -195,10 +195,19 @@ static int rxd_av_insert(struct fid_av *av_fid, const void *addr, size_t count,
 	struct rxd_av *av;
 	int i = 0, ret = 0, success_cnt = 0;
 	fi_addr_t rxd_addr;
-	int util_addr;
+	int util_addr, *sync_err = NULL;
 	struct ofi_rbnode *node;
 
 	av = container_of(av_fid, struct rxd_av, util_av.av_fid);
+	ret = ofi_verify_av_insert(&av->util_av, flags, context);
+	if (ret)
+		return ret;
+
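+	/* with FI_SYNC_ERR the caller's context is a per-entry status
+	 * array; zero it before reporting any per-address errors */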
+	if (flags & FI_SYNC_ERR) {
+		sync_err = context;
+		memset(sync_err, 0, sizeof(*sync_err) * count);
+	}
+
 	fastlock_acquire(&av->util_av.lock);
 	if (!av->dg_addrlen) {
 		ret = rxd_av_set_addrlen(av, addr);
@@ -212,19 +221,20 @@ static int rxd_av_insert(struct fid_av *av_fid, const void *addr, size_t count,
 			rxd_addr = (fi_addr_t) node->data;
 		} else {
 			ret = rxd_av_insert_dg_addr(av, addr, &rxd_addr,
-						    flags, context);
+						    flags, sync_err ?
+						    &sync_err[i] : context);
 			if (ret)
 				break;
 		}
 
-		util_addr = (intptr_t)ofi_idm_lookup(&av->rxdaddr_fi_idm, 
+		util_addr = (intptr_t)ofi_idm_lookup(&av->rxdaddr_fi_idm,
 						     rxd_addr);
 
 		if (!util_addr) {
 			util_addr = rxd_set_fi_addr(av, rxd_addr);
 			if (util_addr < 0) {
 				ret = util_addr;
-				break;	
+				break;
 			}
 		}
 		if (fi_addr)
@@ -237,10 +247,12 @@ static int rxd_av_insert(struct fid_av *av_fid, const void *addr, size_t count,
 		FI_WARN(&rxd_prov, FI_LOG_AV,
 			"failed to insert address %d: %d (%s)\n",
 			i, -ret, fi_strerror(-ret));
-		if (av->util_av.eq)
-			ofi_av_write_event(&av->util_av, i, -ret, context);
 		if (fi_addr)
 			fi_addr[i] = FI_ADDR_NOTAVAIL;
+		if (av->util_av.eq)
+			ofi_av_write_event(&av->util_av, i, -ret, context);
+		else if (sync_err)
+			sync_err[i] = -ret;
 		i++;
 	}
 out:
@@ -248,10 +260,12 @@ out:
 	fastlock_release(&av->util_av.lock);
 
 	for (; i < count; i++) {
-		if (av->util_av.eq)
-			ofi_av_write_event(&av->util_av, i, FI_ECANCELED, context);
 		if (fi_addr)
 			fi_addr[i] = FI_ADDR_NOTAVAIL;
+		if (av->util_av.eq)
+			ofi_av_write_event(&av->util_av, i, FI_ECANCELED, context);
+		else if (sync_err)
+			sync_err[i] = FI_ECANCELED;
 	}
 
 	if (av->util_av.eq) {
@@ -280,42 +294,21 @@ static int rxd_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, size_t count
 			uint64_t flags)
 {
 	int ret = 0;
-	size_t i, addrlen;
+	size_t i;
 	fi_addr_t rxd_addr;
-	fi_addr_t dg_addr;
 	struct rxd_av *av;
-	uint8_t addr[RXD_NAME_LENGTH];
 
 	av = container_of(av_fid, struct rxd_av, util_av.av_fid);
 	fastlock_acquire(&av->util_av.lock);
 	for (i = 0; i < count; i++) {
-
-		addrlen = RXD_NAME_LENGTH;
 		rxd_addr = (intptr_t)ofi_idx_lookup(&av->fi_addr_idx,
 						    RXD_IDX_OFFSET(fi_addr[i]));
 		if (!rxd_addr)
 			goto err;
-		
-		dg_addr = (intptr_t)ofi_idx_lookup(&av->rxdaddr_dg_idx, rxd_addr);
 
-		ret = fi_av_lookup(av->dg_av, dg_addr, addr, &addrlen);
-		if (ret)
-			goto err;
-		
-		ret = ofi_rbmap_find_delete(&av->rbmap, (void *) addr);
-		if (ret)
-			goto err;
-
-		ret = fi_av_remove(av->dg_av, &dg_addr, 1, flags);
-
-		if (ret)
-			goto err;
-		
-		ofi_idx_remove_ordered(&(av->fi_addr_idx), 
+		ofi_idx_remove_ordered(&(av->fi_addr_idx),
 				       RXD_IDX_OFFSET(fi_addr[i]));
-		ofi_idx_remove_ordered(&(av->rxdaddr_dg_idx), rxd_addr);
 		ofi_idm_clear(&(av->rxdaddr_fi_idm), rxd_addr);
-		av->dg_av_used--;
 	}
 
 err:
@@ -361,22 +354,39 @@ static struct fi_ops_av rxd_av_ops = {
 static int rxd_av_close(struct fid *fid)
 {
 	struct rxd_av *av;
+	struct ofi_rbnode *node;
+	fi_addr_t dg_addr, rxd_addr;
 	int ret;
 
-
 	av = container_of(fid, struct rxd_av, util_av.av_fid);
-	ret = fi_close(&av->dg_av->fid);
+
+	ret = ofi_av_close(&av->util_av);
 	if (ret)
 		return ret;
 
+	while ((node = ofi_rbmap_get_root(&av->rbmap))) {
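+		/* each node maps an rxd address; remove its datagram
+		 * translation from the dg AV before deleting the node */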
+		rxd_addr = (fi_addr_t) node->data;
+		dg_addr = (intptr_t)ofi_idx_lookup(&av->rxdaddr_dg_idx,
+						   rxd_addr);
+
+		ret = fi_av_remove(av->dg_av, &dg_addr, 1, 0);
+		if (ret)
+			FI_WARN(&rxd_prov, FI_LOG_AV,
+				"failed to remove dg addr: %d (%s)\n",
+				-ret, fi_strerror(-ret));
+
+		ofi_idx_remove_ordered(&(av->rxdaddr_dg_idx), rxd_addr);
+		ofi_rbmap_delete(&av->rbmap, node);
+	}
 	ofi_rbmap_cleanup(&av->rbmap);
-	ret = ofi_av_close(&av->util_av);
+
+	ret = fi_close(&av->dg_av->fid);
 	if (ret)
 		return ret;
 
-	ofi_idx_reset(&(av->fi_addr_idx)); 
+	ofi_idx_reset(&(av->fi_addr_idx));
 	ofi_idx_reset(&(av->rxdaddr_dg_idx));
-	ofi_idm_reset(&(av->rxdaddr_fi_idm));
+	ofi_idm_reset(&(av->rxdaddr_fi_idm), NULL);
 
 	free(av);
 	return 0;
@@ -433,7 +443,7 @@ int rxd_av_create(struct fid_domain *domain_fid, struct fi_av_attr *attr,
 		goto err1;
 
 	ofi_rbmap_init(&av->rbmap, rxd_tree_compare);
-	
+
 	av_attr = *attr;
 	av_attr.count = 0;
 	av_attr.flags = 0;
@@ -448,7 +458,7 @@ int rxd_av_create(struct fid_domain *domain_fid, struct fi_av_attr *attr,
 
 err2:
 	ofi_av_close(&av->util_av);
-err1:	
+err1:
 	free(av);
 	return ret;
 }
diff --git a/deps/libfabric/prov/rxd/src/rxd_ep.c b/deps/libfabric/prov/rxd/src/rxd_ep.c
index bdbb7c3a52e928806ddbedd7c2f398febf563505..ca06cfa84bee9b71fe39a12af99adfaf9a3ec3a8 100644
--- a/deps/libfabric/prov/rxd/src/rxd_ep.c
+++ b/deps/libfabric/prov/rxd/src/rxd_ep.c
@@ -462,7 +462,7 @@ static ssize_t rxd_ep_send_rts(struct rxd_ep *rxd_ep, fi_addr_t rxd_addr)
 
 	rxd_ep_send_pkt(rxd_ep, pkt_entry);
 	rxd_insert_unacked(rxd_ep, rxd_addr, pkt_entry);
-	dlist_insert_tail(&(rxd_peer(rxd_ep, rxd_addr)->entry), 
+	dlist_insert_tail(&(rxd_peer(rxd_ep, rxd_addr)->entry),
 			  &rxd_ep->rts_sent_list);
 
 	return 0;
@@ -613,19 +613,19 @@ static void rxd_close_peer(struct rxd_ep *ep, struct rxd_peer *peer)
 		peer->unacked_cnt--;
 	}
 
-	while(!dlist_empty(&peer->tx_list)) {
+	while (!dlist_empty(&peer->tx_list)) {
 		dlist_pop_front(&peer->tx_list, struct rxd_x_entry,
 				x_entry, entry);
 		rxd_tx_entry_free(ep, x_entry);
 	}
 
-	while(!dlist_empty(&peer->rx_list)) {
+	while (!dlist_empty(&peer->rx_list)) {
 		dlist_pop_front(&peer->rx_list, struct rxd_x_entry,
 				x_entry, entry);
 		rxd_rx_entry_free(ep, x_entry);
 	}
 
-	while(!dlist_empty(&peer->rma_rx_list)) {
+	while (!dlist_empty(&peer->rma_rx_list)) {
 		dlist_pop_front(&peer->rma_rx_list, struct rxd_x_entry,
 				x_entry, entry);
 		rxd_tx_entry_free(ep, x_entry);
@@ -670,6 +670,9 @@ static int rxd_ep_close(struct fid *fid)
 
 	dlist_foreach_container(&ep->active_peers, struct rxd_peer, peer, entry)
 		rxd_close_peer(ep, peer);
+	dlist_foreach_container(&ep->rts_sent_list, struct rxd_peer, peer, entry)
+		rxd_close_peer(ep, peer);
+	ofi_idm_reset(&(ep->peers_idm), free);
 
 	ret = fi_close(&ep->dg_ep->fid);
 	if (ret)
@@ -695,8 +698,7 @@ static int rxd_ep_close(struct fid *fid)
 				pkt_entry, d_entry);
 		ofi_buf_free(pkt_entry);
 	}
-	
-	ofi_idm_reset(&(ep->peers_idm));	
+
 	rxd_ep_free_res(ep);
 	ofi_endpoint_close(&ep->util_ep);
 	free(ep);
@@ -1147,10 +1149,11 @@ err:
 int rxd_create_peer(struct rxd_ep *ep, uint64_t rxd_addr)
 {
 
-	struct rxd_peer *peer;	
+	struct rxd_peer *peer;
+
 	peer = calloc(1, sizeof(struct rxd_peer));
 	if (!peer)
-		return -FI_ENOMEM;	
+		return -FI_ENOMEM;
 
 	peer->peer_addr = RXD_ADDR_INVALID;
 	peer->tx_seq_no = 0;
@@ -1167,12 +1170,12 @@ int rxd_create_peer(struct rxd_ep *ep, uint64_t rxd_addr)
 	dlist_init(&(peer->rx_list));
 	dlist_init(&(peer->rma_rx_list));
 	dlist_init(&(peer->buf_pkts));
-		
+
 	if (ofi_idm_set(&(ep->peers_idm), rxd_addr, peer) < 0)
 		goto err;
-	
+
 	return 0;
-err:	
+err:
 	free(peer);
 	return -FI_ENOMEM;
 }
@@ -1185,10 +1188,7 @@ int rxd_endpoint(struct fid_domain *domain, struct fi_info *info,
 	struct rxd_ep *rxd_ep;
 	int ret;
 
-
-
 	rxd_ep = calloc(1, sizeof(*rxd_ep));
-
 	if (!rxd_ep)
 		return -FI_ENOMEM;
 
@@ -1229,9 +1229,9 @@ int rxd_endpoint(struct fid_domain *domain, struct fi_info *info,
 	ret = rxd_ep_init_res(rxd_ep, info);
 	if (ret)
 		goto err3;
-	
+
 	memset(&(rxd_ep->peers_idm), 0, sizeof(rxd_ep->peers_idm));
-	
+
 	rxd_ep->util_ep.ep_fid.fid.ops = &rxd_ep_fi_ops;
 	rxd_ep->util_ep.ep_fid.cm = &rxd_ep_cm;
 	rxd_ep->util_ep.ep_fid.ops = &rxd_ops_ep;
diff --git a/deps/libfabric/prov/rxd/src/rxd_rma.c b/deps/libfabric/prov/rxd/src/rxd_rma.c
index ae9649a0067def280dac2e733a5921195b766f3e..3b3cb1172fe5f9860fd4ffdfd4177210fd1e8463 100644
--- a/deps/libfabric/prov/rxd/src/rxd_rma.c
+++ b/deps/libfabric/prov/rxd/src/rxd_rma.c
@@ -100,7 +100,7 @@ static ssize_t rxd_generic_write_inject(struct rxd_ep *rxd_ep,
 
 	if (ofi_cirque_isfull(rxd_ep->util_ep.tx_cq->cirq))
 		goto out;
-	
+
 	rxd_addr = (intptr_t) ofi_idx_lookup(&(rxd_ep_av(rxd_ep)->fi_addr_idx),
 					      RXD_IDX_OFFSET(addr));
 	if (!rxd_addr)
@@ -130,7 +130,8 @@ out:
 	return ret;
 }
 
-ssize_t rxd_generic_rma(struct rxd_ep *rxd_ep, const struct iovec *iov,
+static ssize_t
+rxd_generic_rma(struct rxd_ep *rxd_ep, const struct iovec *iov,
 	size_t iov_count, const struct fi_rma_iov *rma_iov, size_t rma_count,
 	void **desc, fi_addr_t addr, void *context, uint32_t op, uint64_t data,
 	uint32_t rxd_flags)
@@ -180,7 +181,8 @@ out:
 	return ret;
 }
 
-ssize_t rxd_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc,
+static ssize_t
+rxd_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc,
 	fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context)
 {
 	struct rxd_ep *ep;
@@ -195,12 +197,13 @@ ssize_t rxd_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc,
 	rma_iov.len = len;
 	rma_iov.key = key;
 
-	return rxd_generic_rma(ep, &msg_iov, 1, &rma_iov, 1, &desc, 
+	return rxd_generic_rma(ep, &msg_iov, 1, &rma_iov, 1, &desc,
 			       src_addr, context, RXD_READ_REQ, 0,
 			       ep->tx_flags);
 }
 
-ssize_t rxd_readv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc,
+static ssize_t
+rxd_readv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc,
 	size_t count, fi_addr_t src_addr, uint64_t addr, uint64_t key,
 	void *context)
 {
@@ -218,7 +221,8 @@ ssize_t rxd_readv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc,
 			       ep->tx_flags);
 }
 
-ssize_t rxd_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg,
+static ssize_t
+rxd_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg,
 	uint64_t flags)
 {
 	struct rxd_ep *ep;
@@ -232,7 +236,8 @@ ssize_t rxd_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg,
 			       ep->util_ep.tx_msg_flags));
 }
 
-ssize_t rxd_write(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc,
+static ssize_t
+rxd_write(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc,
 	fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context)
 {
 	struct rxd_ep *ep;
@@ -247,12 +252,13 @@ ssize_t rxd_write(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc
 	rma_iov.len = len;
 	rma_iov.key = key;
 
-	return rxd_generic_rma(ep, &msg_iov, 1, &rma_iov, 1, &desc, 
+	return rxd_generic_rma(ep, &msg_iov, 1, &rma_iov, 1, &desc,
 			       dest_addr, context, RXD_WRITE, 0,
 			       ep->tx_flags);
 }
 
-ssize_t rxd_writev(struct fid_ep *ep_fid, const struct iovec *iov, void **desc,
+static ssize_t
+rxd_writev(struct fid_ep *ep_fid, const struct iovec *iov, void **desc,
 		size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key,
 		void *context)
 {
@@ -270,8 +276,8 @@ ssize_t rxd_writev(struct fid_ep *ep_fid, const struct iovec *iov, void **desc,
 			       ep->tx_flags);
 }
 
-
-ssize_t rxd_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg,
+static ssize_t
+rxd_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg,
 	uint64_t flags)
 {
 	struct rxd_ep *ep;
@@ -285,7 +291,8 @@ ssize_t rxd_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg,
 			       ep->util_ep.tx_msg_flags));
 }
 
-ssize_t rxd_writedata(struct fid_ep *ep_fid, const void *buf, size_t len,
+static ssize_t
+rxd_writedata(struct fid_ep *ep_fid, const void *buf, size_t len,
 		      void *desc, uint64_t data, fi_addr_t dest_addr,
 		      uint64_t addr, uint64_t key, void *context)
 {
@@ -306,7 +313,8 @@ ssize_t rxd_writedata(struct fid_ep *ep_fid, const void *buf, size_t len,
 			       ep->tx_flags | RXD_REMOTE_CQ_DATA);
 }
 
-ssize_t rxd_inject_write(struct fid_ep *ep_fid, const void *buf,
+static ssize_t
+rxd_inject_write(struct fid_ep *ep_fid, const void *buf,
 	size_t len, fi_addr_t dest_addr, uint64_t addr, uint64_t key)
 {
 	struct rxd_ep *rxd_ep;
@@ -326,9 +334,10 @@ ssize_t rxd_inject_write(struct fid_ep *ep_fid, const void *buf,
 					RXD_NO_TX_COMP | RXD_INJECT);
 }
 
-ssize_t rxd_inject_writedata(struct fid_ep *ep_fid, const void *buf, size_t len,
-			     uint64_t data, fi_addr_t dest_addr, uint64_t addr,
-			     uint64_t key)
+static ssize_t
+rxd_inject_writedata(struct fid_ep *ep_fid, const void *buf, size_t len,
+		     uint64_t data, fi_addr_t dest_addr, uint64_t addr,
+		     uint64_t key)
 {
 	struct rxd_ep *rxd_ep;
 	struct iovec iov;
diff --git a/deps/libfabric/prov/rxm/src/rxm.h b/deps/libfabric/prov/rxm/src/rxm.h
index 0494ffd911eb0de06a428020619bd56ab5e9b9ae..f91e38ffa233217c68e228f714dd3f60abc30ca4 100644
--- a/deps/libfabric/prov/rxm/src/rxm.h
+++ b/deps/libfabric/prov/rxm/src/rxm.h
@@ -1,7 +1,8 @@
 
 /*
- * Copyright (c) 2016 Intel Corporation, Inc.  All rights reserved.
+ * Copyright (c) 2016-2021 Intel Corporation, Inc.  All rights reserved.
  * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -53,6 +54,7 @@
 #include <ofi_list.h>
 #include <ofi_proto.h>
 #include <ofi_iov.h>
+#include <ofi_hmem.h>
 
 #ifndef _RXM_H_
 #define _RXM_H_
@@ -62,10 +64,40 @@
 #define RXM_OP_VERSION		3
 #define RXM_CTRL_VERSION	4
 
-#define RXM_BUF_SIZE	16384
-extern size_t rxm_eager_limit;
+enum {
+	RXM_REJECT_UNSPEC,
+	RXM_REJECT_ECONNREFUSED,
+	RXM_REJECT_EALREADY,
+};
+
+union rxm_cm_data {
+	struct _connect {
+		uint8_t version;
+		uint8_t endianness;
+		uint8_t ctrl_version;
+		uint8_t op_version;
+		uint16_t port;
+		uint8_t padding[2];
+		uint32_t eager_limit;
+		uint32_t rx_size; /* used? */
+		uint64_t client_conn_id;
+	} connect;
+
+	struct _accept {
+		uint64_t server_conn_id;
+		uint32_t rx_size; /* used? */
+	} accept;
+
+	struct _reject {
+		uint8_t version;
+		uint8_t reason;
+	} reject;
+};
+
+
+extern size_t rxm_buffer_size;
+extern size_t rxm_packet_size;
 
-#define RXM_SAR_LIMIT	131072
 #define RXM_SAR_TX_ERROR	UINT64_MAX
 #define RXM_SAR_RX_INIT		UINT64_MAX
 
@@ -100,6 +132,9 @@ extern size_t rxm_eager_limit;
 	FI_DBG(&rxm_prov, subsystem, log_str 			\
 	       " (fi_addr: 0x%" PRIx64 " tag: 0x%" PRIx64 ")\n",\
 	       addr, tag)
+#define RXM_WARN_ERR(subsystem, log_str, err) \
+	FI_WARN(&rxm_prov, subsystem, log_str "%s (%d)\n", \
+		fi_strerror((int) -(err)), (int) err)
 
 #define RXM_GET_PROTO_STATE(context)					\
 	(*(enum rxm_proto_state *)					\
@@ -124,149 +159,79 @@ extern struct util_prov rxm_util_prov;
 extern struct fi_ops_rma rxm_ops_rma;
 extern struct fi_ops_atomic rxm_ops_atomic;
 
+enum {
+	RXM_MSG_RXTX_SIZE = 128,
+	RXM_MSG_SRX_SIZE = 4096,
+	RXM_RX_SIZE = 65536,
+	RXM_TX_SIZE = 16384,
+};
+
 extern size_t rxm_msg_tx_size;
 extern size_t rxm_msg_rx_size;
 extern size_t rxm_cm_progress_interval;
 extern size_t rxm_cq_eq_fairness;
 extern int force_auto_progress;
+extern int rxm_use_write_rndv;
 extern enum fi_wait_obj def_wait_obj, def_tcp_wait_obj;
 
 struct rxm_ep;
+struct rxm_av;
 
 
-/*
- * Connection Map
- */
-
-#define RXM_CMAP_IDX_BITS OFI_IDX_INDEX_BITS
-
-enum rxm_cmap_signal {
-	RXM_CMAP_UNSPEC,
-	RXM_CMAP_FREE,
-	RXM_CMAP_EXIT,
+enum rxm_cm_state {
+	RXM_CM_IDLE,
+	RXM_CM_CONNECTING,
+	RXM_CM_ACCEPTING,
+	RXM_CM_CONNECTED,
 };
 
-#define RXM_CM_STATES(FUNC)		\
-	FUNC(RXM_CMAP_IDLE),		\
-	FUNC(RXM_CMAP_CONNREQ_SENT),	\
-	FUNC(RXM_CMAP_CONNREQ_RECV),	\
-	FUNC(RXM_CMAP_CONNECTED),	\
-	FUNC(RXM_CMAP_SHUTDOWN),	\
-
-enum rxm_cmap_state {
-	RXM_CM_STATES(OFI_ENUM_VAL)
+enum {
+	RXM_CONN_INDEXED = BIT(0),
 };
 
-extern char *rxm_cm_state_str[];
-
-#define RXM_CM_UPDATE_STATE(handle, new_state)				\
-	do {								\
-		FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "[CM] handle: "	\
-		       "%p %s -> %s\n",	handle,				\
-		       rxm_cm_state_str[handle->state],			\
-		       rxm_cm_state_str[new_state]);			\
-		handle->state = new_state;				\
-	} while (0)
-
-struct rxm_cmap_handle {
-	struct rxm_cmap *cmap;
-	enum rxm_cmap_state state;
-	/* Unique identifier for a connection. Can be exchanged with a peer
-	 * during connection setup and can later be used in a message header
-	 * to identify the source of the message (Used for FI_SOURCE, RNDV
-	 * protocol, etc.) */
-	uint64_t key;
-	uint64_t remote_key;
+/* There will be at most 1 peer address per AV entry.  There
+ * may be addresses that have not been inserted into the local
+ * AV, and have no matching entry.  This can occur if we are
+ * only receiving data from the remote rxm ep.
+ */
+struct rxm_peer_addr {
+	struct rxm_av *av;
 	fi_addr_t fi_addr;
-	struct rxm_cmap_peer *peer;
-};
-
-struct rxm_cmap_peer {
-	struct rxm_cmap_handle *handle;
-	struct dlist_entry entry;
-	uint8_t addr[];
-};
-
-struct rxm_cmap_attr {
-	void 				*name;
+	struct ofi_rbnode *node;
+	int index;
+	int refcnt;
+	union ofi_sock_ip addr;
 };
 
-struct rxm_cmap {
-	struct rxm_ep		*ep;
-	struct util_av		*av;
-
-	/* cmap handles that correspond to addresses in AV */
-	struct rxm_cmap_handle **handles_av;
-	size_t			num_allocated;
-
-	/* Store all cmap handles (inclusive of handles_av) in an indexer.
-	 * This allows reverse lookup of the handle using the index. */
-	struct indexer		handles_idx;
-
-	struct ofi_key_idx	key_idx;
-
-	struct dlist_entry	peer_list;
-	struct rxm_cmap_attr	attr;
-	pthread_t		cm_thread;
-	ofi_fastlock_acquire_t	acquire;
-	ofi_fastlock_release_t	release;
-	fastlock_t		lock;
-};
+struct rxm_peer_addr *rxm_get_peer(struct rxm_av *av, const void *addr);
+void rxm_put_peer(struct rxm_peer_addr *peer);
 
-enum rxm_cmap_reject_reason {
-	RXM_CMAP_REJECT_UNSPEC,
-	RXM_CMAP_REJECT_GENUINE,
-	RXM_CMAP_REJECT_SIMULT_CONN,
-};
-
-union rxm_cm_data {
-	struct _connect {
-		uint8_t version;
-		uint8_t endianness;
-		uint8_t ctrl_version;
-		uint8_t op_version;
-		uint16_t port;
-		uint8_t padding[2];
-		uint32_t eager_size;
-		uint32_t rx_size;
-		uint64_t client_conn_id;
-	} connect;
+/* Each local rxm ep will have at most 1 connection to a single
+ * remote rxm ep.  A local rxm ep may not be connected to all
+ * remote rxm eps.
+ */
+struct rxm_conn {
+	enum rxm_cm_state state;
+	struct rxm_peer_addr *peer;
+	struct fid_ep *msg_ep;
+	struct rxm_ep *ep;
 
-	struct _accept {
-		uint64_t server_conn_id;
-		uint32_t rx_size;
-	} accept;
+	/* Prior versions of libfabric did not guarantee that all connections
+	 * from the same peer would have the same conn_id.  For compatibility
+	 * we need to store the remote_index per connection, rather than with
+	 * the peer_addr.
+	 */
+	int remote_index;
+	uint8_t flags;
 
-	struct _reject {
-		uint8_t version;
-		uint8_t reason;
-	} reject;
+	struct dlist_entry deferred_entry;
+	struct dlist_entry deferred_tx_queue;
+	struct dlist_entry deferred_sar_msgs;
+	struct dlist_entry deferred_sar_segments;
+	struct dlist_entry loopback_entry;
 };
 
-int rxm_cmap_alloc_handle(struct rxm_cmap *cmap, fi_addr_t fi_addr,
-			  enum rxm_cmap_state state,
-			  struct rxm_cmap_handle **handle);
-struct rxm_cmap_handle *rxm_cmap_key2handle(struct rxm_cmap *cmap, uint64_t key);
-int rxm_cmap_update(struct rxm_cmap *cmap, const void *addr, fi_addr_t fi_addr);
-
-void rxm_cmap_process_reject(struct rxm_cmap *cmap,
-			     struct rxm_cmap_handle *handle,
-			     enum rxm_cmap_reject_reason cm_reject_reason);
-void rxm_cmap_process_shutdown(struct rxm_cmap *cmap,
-			       struct rxm_cmap_handle *handle);
-int rxm_cmap_connect(struct rxm_ep *rxm_ep, fi_addr_t fi_addr,
-		     struct rxm_cmap_handle *handle);
-void rxm_cmap_free(struct rxm_cmap *cmap);
-int rxm_cmap_alloc(struct rxm_ep *rxm_ep, struct rxm_cmap_attr *attr);
-int rxm_cmap_remove(struct rxm_cmap *cmap, int index);
-int rxm_msg_eq_progress(struct rxm_ep *rxm_ep);
-
-static inline struct rxm_cmap_handle *
-rxm_cmap_acquire_handle(struct rxm_cmap *cmap, fi_addr_t fi_addr)
-{
-	assert(fi_addr < cmap->num_allocated);
-	return cmap->handles_av[fi_addr];
-}
+void rxm_freeall_conns(struct rxm_ep *ep);
 
 struct rxm_fabric {
 	struct util_fabric util_fabric;
@@ -277,20 +242,63 @@ struct rxm_domain {
 	struct util_domain util_domain;
 	struct fid_domain *msg_domain;
 	size_t max_atomic_size;
+	size_t rx_post_size;
 	uint64_t mr_key;
-	uint8_t mr_local;
+	bool dyn_rbuf;
 	struct ofi_ops_flow_ctrl *flow_ctrl_ops;
+	struct ofi_bufpool *amo_bufpool;
+	fastlock_t amo_bufpool_lock;
+};
+
+/* All peer addresses, whether they've been inserted into the AV
+ * or an endpoint has an active connection to it, are stored in
+ * the addr_map.  Peers are allocated from a buffer pool and
+ * assigned a local index using the pool.  All rxm endpoints
+ * maintain a connection array which is aligned with the peer_pool.
+ *
+ * We technically only need to store the index of each peer in
+ * the AV itself.  The 'util_av' could basically be replaced by
+ * an ofi_index_map.  However, too much of the existing code
+ * relies on the util_av existing and storing the AV addresses.
+ *
+ * A future cleanup would be to remove using the util_av and have the
+ * rxm_av implementation be independent.
+ */
+struct rxm_av {
+	struct util_av util_av;
+	struct ofi_rbmap addr_map;
+	struct ofi_bufpool *peer_pool;
+	struct ofi_bufpool *conn_pool;
 };
 
 int rxm_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
-		struct fid_av **av, void *context);
+		struct fid_av **fid_av, void *context);
+size_t rxm_av_max_peers(struct rxm_av *av);
+void rxm_ref_peer(struct rxm_peer_addr *peer);
+struct rxm_conn *rxm_av_alloc_conn(struct rxm_av *av);
+void rxm_av_free_conn(struct rxm_conn *conn);
 
 struct rxm_mr {
 	struct fid_mr mr_fid;
 	struct fid_mr *msg_mr;
 	struct rxm_domain *domain;
+	enum fi_hmem_iface iface;
+	uint64_t device;
+	fastlock_t amo_lock;
 };
 
+static inline enum fi_hmem_iface
+rxm_mr_desc_to_hmem_iface_dev(void **desc, size_t count, uint64_t *device)
+{
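+	/* no descriptors means we assume host (system) memory;
+	 * otherwise the first rxm_mr supplies the iface and device */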
+	if (!count || !desc || !desc[0]) {
+		*device = 0;
+		return FI_HMEM_SYSTEM;
+	}
+
+	*device = ((struct rxm_mr *) desc[0])->device;
+	return ((struct rxm_mr *) desc[0])->iface;
+}
+
 struct rxm_rndv_hdr {
 	struct ofi_rma_iov iov[RXM_IOV_LIMIT];
 	uint8_t count;
@@ -337,11 +345,18 @@ struct rxm_atomic_resp_hdr {
 	FUNC(RXM_SAR_TX),		\
 	FUNC(RXM_CREDIT_TX),		\
 	FUNC(RXM_RNDV_TX),		\
-	FUNC(RXM_RNDV_ACK_WAIT),	\
+	FUNC(RXM_RNDV_READ_DONE_WAIT),	\
+	FUNC(RXM_RNDV_WRITE_DATA_WAIT),	\
+	FUNC(RXM_RNDV_WRITE_DONE_WAIT),	\
 	FUNC(RXM_RNDV_READ),		\
-	FUNC(RXM_RNDV_ACK_SENT),	\
-	FUNC(RXM_RNDV_ACK_RECVD),	\
-	FUNC(RXM_RNDV_FINISH),		\
+	FUNC(RXM_RNDV_WRITE), /* not used */ \
+	FUNC(RXM_RNDV_READ_DONE_SENT),	\
+	FUNC(RXM_RNDV_READ_DONE_RECVD),	\
+	FUNC(RXM_RNDV_WRITE_DATA_SENT),	\
+	FUNC(RXM_RNDV_WRITE_DATA_RECVD), /* not used */ \
+	FUNC(RXM_RNDV_WRITE_DONE_SENT),	\
+	FUNC(RXM_RNDV_WRITE_DONE_RECVD),\
+	FUNC(RXM_RNDV_FINISH), /* not needed */	\
 	FUNC(RXM_ATOMIC_RESP_WAIT),	\
 	FUNC(RXM_ATOMIC_RESP_SENT)
 
@@ -354,11 +369,13 @@ extern char *rxm_proto_state_str[];
 enum {
 	rxm_ctrl_eager,
 	rxm_ctrl_seg,
-	rxm_ctrl_rndv,
-	rxm_ctrl_rndv_ack,
+	rxm_ctrl_rndv_req,
+	rxm_ctrl_rndv_rd_done,
 	rxm_ctrl_atomic,
 	rxm_ctrl_atomic_resp,
-	rxm_ctrl_credit
+	rxm_ctrl_credit,
+	rxm_ctrl_rndv_wr_data,
+	rxm_ctrl_rndv_wr_done
 };
 
 struct rxm_pkt {
@@ -409,22 +426,6 @@ struct rxm_iov {
 	uint8_t count;
 };
 
-enum rxm_buf_pool_type {
-	RXM_BUF_POOL_RX		= 0,
-	RXM_BUF_POOL_START	= RXM_BUF_POOL_RX,
-	RXM_BUF_POOL_TX,
-	RXM_BUF_POOL_TX_START	= RXM_BUF_POOL_TX,
-	RXM_BUF_POOL_TX_INJECT,
-	RXM_BUF_POOL_TX_ACK,
-	RXM_BUF_POOL_TX_RNDV,
-	RXM_BUF_POOL_TX_ATOMIC,
-	RXM_BUF_POOL_TX_CREDIT,
-	RXM_BUF_POOL_TX_SAR,
-	RXM_BUF_POOL_TX_END	= RXM_BUF_POOL_TX_SAR,
-	RXM_BUF_POOL_RMA,
-	RXM_BUF_POOL_MAX,
-};
-
 struct rxm_buf {
 	/* Must stay at top */
 	struct fi_context fi_context;
@@ -440,100 +441,67 @@ struct rxm_rx_buf {
 
 	struct rxm_ep *ep;
 	/* MSG EP / shared context to which bufs would be posted to */
-	struct fid_ep *msg_ep;
+	struct fid_ep *rx_ep;
 	struct dlist_entry repost_entry;
-	struct rxm_conn *conn;
+	struct rxm_conn *conn;		/* msg ep data was received on */
+	/* if recv_entry is set, then we matched dyn rbuf */
 	struct rxm_recv_entry *recv_entry;
 	struct rxm_unexp_msg unexp_msg;
 	uint64_t comp_flags;
 	struct fi_recv_context recv_context;
-	// TODO remove this and modify unexp msg handling path to not repost
-	// rx_buf
-	uint8_t repost;
+	bool repost;
 
 	/* Used for large messages */
-	struct rxm_rndv_hdr *rndv_hdr;
+	struct dlist_entry rndv_wait_entry;
+	struct rxm_rndv_hdr *remote_rndv_hdr;
 	size_t rndv_rma_index;
 	struct fid_mr *mr[RXM_IOV_LIMIT];
 
+	/* Only differs from pkt.data for unexpected messages */
+	void *data;
 	/* Must stay at bottom */
 	struct rxm_pkt pkt;
 };
 
-struct rxm_tx_base_buf {
-	/* Must stay at top */
-	struct rxm_buf hdr;
-
-	/* Must stay at bottom */
-	struct rxm_pkt pkt;
-};
-
-struct rxm_tx_eager_buf {
+struct rxm_tx_buf {
 	/* Must stay at top */
 	struct rxm_buf hdr;
 
+	OFI_DBG_VAR(bool, user_tx)
 	void *app_context;
 	uint64_t flags;
 
-	/* Must stay at bottom */
-	struct rxm_pkt pkt;
-};
-
-struct rxm_tx_sar_buf {
-	/* Must stay at top */
-	struct rxm_buf hdr;
-
-	void *app_context;
-	uint64_t flags;
-
-	/* Must stay at bottom */
-	struct rxm_pkt pkt;
-};
-
-struct rxm_tx_rndv_buf {
-	/* Must stay at top */
-	struct rxm_buf hdr;
-
-	void *app_context;
-	uint64_t flags;
-	struct fid_mr *mr[RXM_IOV_LIMIT];
-	uint8_t count;
-
-	/* Must stay at bottom */
-	struct rxm_pkt pkt;
-};
-
-struct rxm_rma_buf {
-	/* Must stay at top */
-	struct rxm_buf hdr;
-
-	void *app_context;
-	uint64_t flags;
+	union {
+		struct {
+			struct fid_mr *mr[RXM_IOV_LIMIT];
+			uint8_t count;
+		} rma;
+		struct rxm_iov atomic_result;
+	};
 
 	struct {
-		struct fid_mr *mr[RXM_IOV_LIMIT];
-		uint8_t count;
-	} mr;
-	/* Must stay at bottom */
-	struct rxm_pkt pkt;
-};
-
-struct rxm_tx_atomic_buf {
-	/* Must stay at top */
-	struct rxm_buf hdr;
-
-	void *app_context;
-	uint64_t flags;
-	struct iovec result_iov[RXM_IOV_LIMIT];
-	uint8_t result_iov_count;
+		struct iovec iov[RXM_IOV_LIMIT];
+		void *desc[RXM_IOV_LIMIT];
+		struct rxm_conn *conn;
+		size_t rndv_rma_index;
+		size_t rndv_rma_count;
+		struct rxm_tx_buf *done_buf;
+		struct rxm_rndv_hdr remote_hdr;
+	} write_rndv;
 
 	/* Must stay at bottom */
 	struct rxm_pkt pkt;
 };
 
+/* Used for application transmits, provides credit check */
+struct rxm_tx_buf *rxm_get_tx_buf(struct rxm_ep *ep);
+void rxm_free_rx_buf(struct rxm_ep *ep, struct rxm_tx_buf *buf);
+
 enum rxm_deferred_tx_entry_type {
 	RXM_DEFERRED_TX_RNDV_ACK,
+	RXM_DEFERRED_TX_RNDV_DONE,
 	RXM_DEFERRED_TX_RNDV_READ,
+	RXM_DEFERRED_TX_RNDV_WRITE,
 	RXM_DEFERRED_TX_SAR_SEG,
 	RXM_DEFERRED_TX_ATOMIC_RESP,
 	RXM_DEFERRED_TX_CREDIT_SEND,
@@ -548,14 +516,23 @@ struct rxm_deferred_tx_entry {
 	union {
 		struct {
 			struct rxm_rx_buf *rx_buf;
+			size_t pkt_size;
 		} rndv_ack;
+		struct {
+			struct rxm_tx_buf *tx_buf;
+		} rndv_done;
 		struct {
 			struct rxm_rx_buf *rx_buf;
 			struct fi_rma_iov rma_iov;
 			struct rxm_iov rxm_iov;
 		} rndv_read;
 		struct {
-			struct rxm_tx_sar_buf *cur_seg_tx_buf;
+			struct rxm_tx_buf *tx_buf;
+			struct fi_rma_iov rma_iov;
+			struct rxm_iov rxm_iov;
+		} rndv_write;
+		struct {
+			struct rxm_tx_buf *cur_seg_tx_buf;
 			struct {
 				struct iovec iov[RXM_IOV_LIMIT];
 				uint8_t count;
@@ -571,13 +548,15 @@ struct rxm_deferred_tx_entry {
 			uint64_t msg_id;
 			void *app_context;
 			uint64_t flags;
+			enum fi_hmem_iface iface;
+			uint64_t device;
 		} sar_seg;
 		struct {
-			struct rxm_tx_atomic_buf *tx_buf;
+			struct rxm_tx_buf *tx_buf;
 			ssize_t len;
 		} atomic_resp;
 		struct {
-			struct rxm_tx_base_buf *tx_buf;
+			struct rxm_tx_buf *tx_buf;
 		} credit_msg;
 	};
 };
@@ -604,10 +583,10 @@ struct rxm_recv_entry {
 	/* Used for Rendezvous protocol */
 	struct {
 		/* This is used to send RNDV ACK */
-		struct rxm_tx_base_buf *tx_buf;
+		struct rxm_tx_buf *tx_buf;
 	} rndv;
 };
-DECLARE_FREESTACK(struct rxm_recv_entry, rxm_recv_fs);
+OFI_DECLARE_FREESTACK(struct rxm_recv_entry, rxm_recv_fs);
 
 enum rxm_recv_queue_type {
 	RXM_RECV_QUEUE_UNSPEC,
@@ -616,103 +595,99 @@ enum rxm_recv_queue_type {
 };
 
 struct rxm_recv_queue {
-	struct rxm_ep *rxm_ep;
+	struct rxm_ep		*rxm_ep;
 	enum rxm_recv_queue_type type;
-	struct rxm_recv_fs *fs;
-	struct dlist_entry recv_list;
-	struct dlist_entry unexp_msg_list;
-	dlist_func_t *match_recv;
-	dlist_func_t *match_unexp;
+	struct rxm_recv_fs	*fs;
+	struct dlist_entry	recv_list;
+	struct dlist_entry	unexp_msg_list;
+	size_t			dyn_rbuf_unexp_cnt;
+	dlist_func_t		*match_recv;
+	dlist_func_t		*match_unexp;
 };
 
-struct rxm_buf_pool {
-	enum rxm_buf_pool_type type;
-	struct ofi_bufpool *pool;
-	struct rxm_ep *rxm_ep;
-};
+ssize_t rxm_get_dyn_rbuf(struct ofi_cq_rbuf_entry *entry, struct iovec *iov,
+			 size_t *count);
 
-struct rxm_msg_eq_entry {
-	ssize_t			rd;
-	uint32_t		event;
-	/* Used for connection refusal */
-	void			*context;
-	struct fi_eq_err_entry	err_entry;
-	/* must stay at the bottom */
-	struct fi_eq_cm_entry	cm_entry;
+struct rxm_eager_ops {
+	void (*comp_tx)(struct rxm_ep *rxm_ep,
+			struct rxm_tx_buf *tx_eager_buf);
+	void (*handle_rx)(struct rxm_rx_buf *rx_buf);
 };
 
-#define RXM_MSG_EQ_ENTRY_SZ (sizeof(struct rxm_msg_eq_entry) + \
-			     sizeof(union rxm_cm_data))
-#define RXM_CM_ENTRY_SZ (sizeof(struct fi_eq_cm_entry) + \
-			 sizeof(union rxm_cm_data))
-
-struct rxm_eager_ops {
-	int (*comp_tx)(struct rxm_ep *rxm_ep,
-		       struct rxm_tx_eager_buf *tx_eager_buf);
+struct rxm_rndv_ops {
+	int rx_mr_access;
+	int tx_mr_access;
 	ssize_t (*handle_rx)(struct rxm_rx_buf *rx_buf);
+	ssize_t (*xfer)(struct fid_ep *ep, const struct iovec *iov, void **desc,
+			size_t count, fi_addr_t remote_addr, uint64_t addr,
+			uint64_t key, void *context);
+	ssize_t (*defer_xfer)(struct rxm_deferred_tx_entry **def_tx_entry,
+			      size_t index, struct iovec *iov,
+			      void *desc[RXM_IOV_LIMIT], size_t count,
+			      void *buf);
 };
 
 struct rxm_ep {
 	struct util_ep 		util_ep;
 	struct fi_info 		*rxm_info;
 	struct fi_info 		*msg_info;
-	struct rxm_cmap		*cmap;
+
+	int			connecting_cnt;
+	struct index_map	conn_idx_map;
+	struct dlist_entry	loopback_list;
+	union ofi_sock_ip	addr;
+
+	pthread_t		cm_thread;
 	struct fid_pep 		*msg_pep;
 	struct fid_eq 		*msg_eq;
+	struct fid_ep 		*srx_ctx;
+
 	struct fid_cq 		*msg_cq;
 	uint64_t		msg_cq_last_poll;
-	struct fid_ep 		*srx_ctx;
 	size_t 			comp_per_progress;
-	ofi_atomic32_t		atomic_tx_credits;
 	int			cq_eq_fairness;
 
 	bool			msg_mr_local;
 	bool			rdm_mr_local;
 	bool			do_progress;
+	bool			enable_direct_send;
 
 	size_t			min_multi_recv_size;
 	size_t			buffered_min;
 	size_t			buffered_limit;
 	size_t			inject_limit;
+
 	size_t			eager_limit;
 	size_t			sar_limit;
+	size_t			tx_credit;
 
-	struct rxm_buf_pool	*buf_pools;
+	struct ofi_bufpool	*rx_pool;
+	struct ofi_bufpool	*tx_pool;
+	struct rxm_pkt		*inject_pkt;
 
-	struct dlist_entry	repost_ready_list;
-	struct dlist_entry	deferred_tx_conn_queue;
+	struct dlist_entry	deferred_queue;
+	struct dlist_entry	rndv_wait_list;
 
 	struct rxm_recv_queue	recv_queue;
 	struct rxm_recv_queue	trecv_queue;
+	struct ofi_bufpool	*multi_recv_pool;
 
 	struct rxm_eager_ops	*eager_ops;
+	struct rxm_rndv_ops	*rndv_ops;
 };
 
-struct rxm_conn {
-	/* This should stay at the top */
-	struct rxm_cmap_handle handle;
-
-	struct fid_ep *msg_ep;
+int rxm_start_listen(struct rxm_ep *ep);
+void rxm_stop_listen(struct rxm_ep *ep);
+void rxm_conn_progress(struct rxm_ep *ep);
 
-	/* This is used only in non-FI_THREAD_SAFE case */
-	struct rxm_pkt *inject_pkt;
-	struct rxm_pkt *inject_data_pkt;
-	struct rxm_pkt *tinject_pkt;
-	struct rxm_pkt *tinject_data_pkt;
-
-	struct dlist_entry deferred_conn_entry;
-	struct dlist_entry deferred_tx_queue;
-	struct dlist_entry sar_rx_msg_list;
-	struct dlist_entry sar_deferred_rx_msg_list;
-
-	uint32_t rndv_tx_credits;
-};
 
 extern struct fi_provider rxm_prov;
 extern struct fi_fabric_attr rxm_fabric_attr;
 extern struct fi_domain_attr rxm_domain_attr;
 extern struct fi_tx_attr rxm_tx_attr;
 extern struct fi_rx_attr rxm_rx_attr;
+extern struct rxm_rndv_ops rxm_rndv_ops_read;
+extern struct rxm_rndv_ops rxm_rndv_ops_write;
 
 int rxm_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric,
 			void *context);
@@ -729,7 +704,6 @@ ssize_t rxm_handle_rx_buf(struct rxm_rx_buf *rx_buf);
 int rxm_endpoint(struct fid_domain *domain, struct fi_info *info,
 			  struct fid_ep **ep, void *context);
 
-int rxm_conn_cmap_alloc(struct rxm_ep *rxm_ep);
 void rxm_cq_write_error(struct util_cq *cq, struct util_cntr *cntr,
 			void *op_context, int err);
 void rxm_cq_write_error_all(struct rxm_ep *rxm_ep, int err);
@@ -739,30 +713,34 @@ void rxm_ep_progress(struct util_ep *util_ep);
 void rxm_ep_progress_coll(struct util_ep *util_ep);
 void rxm_ep_do_progress(struct util_ep *util_ep);
 
-ssize_t rxm_handle_eager(struct rxm_rx_buf *rx_buf);
-ssize_t rxm_handle_coll_eager(struct rxm_rx_buf *rx_buf);
-int rxm_finish_eager_send(struct rxm_ep *rxm_ep, struct rxm_tx_eager_buf *tx_eager_buf);
-int rxm_finish_coll_eager_send(struct rxm_ep *rxm_ep, struct rxm_tx_eager_buf *tx_eager_buf);
+void rxm_handle_eager(struct rxm_rx_buf *rx_buf);
+void rxm_handle_coll_eager(struct rxm_rx_buf *rx_buf);
+void rxm_finish_eager_send(struct rxm_ep *rxm_ep,
+			   struct rxm_tx_buf *tx_eager_buf);
+void rxm_finish_coll_eager_send(struct rxm_ep *rxm_ep,
+				struct rxm_tx_buf *tx_eager_buf);
 
-int rxm_msg_ep_prepost_recv(struct rxm_ep *rxm_ep, struct fid_ep *msg_ep);
+int rxm_prepost_recv(struct rxm_ep *rxm_ep, struct fid_ep *rx_ep);
 
 int rxm_ep_query_atomic(struct fid_domain *domain, enum fi_datatype datatype,
 			enum fi_op op, struct fi_atomic_attr *attr,
 			uint64_t flags);
+ssize_t rxm_rndv_read(struct rxm_rx_buf *rx_buf);
+ssize_t rxm_rndv_send_wr_data(struct rxm_rx_buf *rx_buf);
+void rxm_rndv_hdr_init(struct rxm_ep *rxm_ep, void *buf,
+			      const struct iovec *iov, size_t count,
+			      struct fid_mr **mr);
+
 
 static inline size_t rxm_ep_max_atomic_size(struct fi_info *info)
 {
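+	/* the entire atomic command is built in an rxm buffer, so the
+	 * payload limit is rxm_buffer_size minus the atomic header */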
-	size_t overhead = sizeof(struct rxm_atomic_hdr) +
-			  sizeof(struct rxm_pkt);
-
-	/* Must be set to eager size or less */
-	return (info->tx_attr && info->tx_attr->inject_size > overhead) ?
-		info->tx_attr->inject_size - overhead : 0;
+	assert(rxm_buffer_size >= sizeof(struct rxm_atomic_hdr));
+	return rxm_buffer_size - sizeof(struct rxm_atomic_hdr);
 }
 
 static inline ssize_t
 rxm_atomic_send_respmsg(struct rxm_ep *rxm_ep, struct rxm_conn *conn,
-			struct rxm_tx_atomic_buf *resp_buf, ssize_t len)
+			struct rxm_tx_buf *resp_buf, ssize_t len)
 {
 	struct iovec iov = {
 		.iov_base = (void *) &resp_buf->pkt,
@@ -778,17 +756,6 @@ rxm_atomic_send_respmsg(struct rxm_ep *rxm_ep, struct rxm_conn *conn,
 	return fi_sendmsg(conn->msg_ep, &msg, FI_COMPLETION);
 }
 
-static inline int rxm_needs_atomic_progress(const struct fi_info *info)
-{
-	return (info->caps & FI_ATOMIC) && info->domain_attr &&
-			info->domain_attr->data_progress == FI_PROGRESS_AUTO;
-}
-
-static inline struct rxm_conn *rxm_key2conn(struct rxm_ep *rxm_ep, uint64_t key)
-{
-	return (struct rxm_conn *)rxm_cmap_key2handle(rxm_ep->cmap, key);
-}
-
 void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep,
 				    struct rxm_conn *rxm_conn);
 
@@ -797,20 +764,32 @@ rxm_ep_alloc_deferred_tx_entry(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 			       enum rxm_deferred_tx_entry_type type);
 
 static inline void
-rxm_ep_enqueue_deferred_tx_queue(struct rxm_deferred_tx_entry *tx_entry)
+rxm_queue_deferred_tx(struct rxm_deferred_tx_entry *tx_entry,
+		      enum ofi_list_end list_end)
 {
-	if (dlist_empty(&tx_entry->rxm_conn->deferred_tx_queue))
-		dlist_insert_tail(&tx_entry->rxm_conn->deferred_conn_entry,
-				  &tx_entry->rxm_ep->deferred_tx_conn_queue);
-	dlist_insert_tail(&tx_entry->entry, &tx_entry->rxm_conn->deferred_tx_queue);
+	struct rxm_conn *conn = tx_entry->rxm_conn;
+
+	if (dlist_empty(&conn->deferred_tx_queue))
+		dlist_insert_tail(&conn->deferred_entry,
+				  &conn->ep->deferred_queue);
+	if (list_end == OFI_LIST_HEAD) {
+		dlist_insert_head(&tx_entry->entry,
+				  &conn->deferred_tx_queue);
+	} else {
+		dlist_insert_tail(&tx_entry->entry,
+				  &conn->deferred_tx_queue);
+	}
 }
 
 static inline void
-rxm_ep_dequeue_deferred_tx_queue(struct rxm_deferred_tx_entry *tx_entry)
+rxm_dequeue_deferred_tx(struct rxm_deferred_tx_entry *tx_entry)
 {
-	dlist_remove_init(&tx_entry->entry);
-	if (dlist_empty(&tx_entry->rxm_conn->deferred_tx_queue))
-		dlist_remove(&tx_entry->rxm_conn->deferred_conn_entry);
+	struct rxm_conn *conn = tx_entry->rxm_conn;
+
+	assert(!dlist_empty(&conn->deferred_tx_queue));
+	dlist_remove(&tx_entry->entry);
+	if (dlist_empty(&conn->deferred_tx_queue))
+		dlist_remove_init(&conn->deferred_entry);
 }
 
 int rxm_conn_process_eq_events(struct rxm_ep *rxm_ep);
@@ -829,16 +808,38 @@ static inline void rxm_cntr_incerr(struct util_cntr *cntr)
 		cntr->cntr_fid.ops->adderr(&cntr->cntr_fid, 1);
 }
 
+static inline void
+rxm_cq_write(struct util_cq *cq, void *context, uint64_t flags, size_t len,
+	     void *buf, uint64_t data, uint64_t tag)
+{
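+	/* a failed ofi_cq_write() means the completion queue
+	 * overflowed; rxm treats that as unrecoverable (assert) */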
+	int ret;
+
+	FI_DBG(&rxm_prov, FI_LOG_CQ, "Reporting %s completion\n",
+	       fi_tostr((void *) &flags, FI_TYPE_CQ_EVENT_FLAGS));
 
+	ret = ofi_cq_write(cq, context, flags, len, buf, data, tag);
+	if (ret) {
+		FI_WARN(&rxm_prov, FI_LOG_CQ,
+			"Unable to report completion\n");
+		assert(0);
+	}
+}
 
-static inline void rxm_cq_log_comp(uint64_t flags)
+static inline void
+rxm_cq_write_src(struct util_cq *cq, void *context, uint64_t flags, size_t len,
+		 void *buf, uint64_t data, uint64_t tag, fi_addr_t addr)
 {
-#if ENABLE_DEBUG
+	int ret;
+
 	FI_DBG(&rxm_prov, FI_LOG_CQ, "Reporting %s completion\n",
-	       fi_tostr((void *)&flags, FI_TYPE_CQ_EVENT_FLAGS));
-#else
-	/* NOP */
-#endif
+	       fi_tostr((void *) &flags, FI_TYPE_CQ_EVENT_FLAGS));
+
+	ret = ofi_cq_write_src(cq, context, flags, len, buf, data, tag, addr);
+	if (ret) {
+		FI_WARN(&rxm_prov, FI_LOG_CQ,
+			"Unable to report completion\n");
+		assert(0);
+	}
 }
 
 ssize_t rxm_get_conn(struct rxm_ep *rxm_ep, fi_addr_t addr,
@@ -849,7 +850,7 @@ rxm_ep_format_tx_buf_pkt(struct rxm_conn *rxm_conn, size_t len, uint8_t op,
 			 uint64_t data, uint64_t tag, uint64_t flags,
 			 struct rxm_pkt *pkt)
 {
-	pkt->ctrl_hdr.conn_id = rxm_conn->handle.remote_key;
+	pkt->ctrl_hdr.conn_id = rxm_conn->remote_index;
 	pkt->hdr.size = len;
 	pkt->hdr.op = op;
 	pkt->hdr.tag = tag;
@@ -857,68 +858,53 @@ rxm_ep_format_tx_buf_pkt(struct rxm_conn *rxm_conn, size_t len, uint8_t op,
 	pkt->hdr.data = data;
 }
 
-static inline void *
-rxm_tx_buf_alloc(struct rxm_ep *rxm_ep, enum rxm_buf_pool_type type)
-{
-	assert((type == RXM_BUF_POOL_TX) ||
-	       (type == RXM_BUF_POOL_TX_INJECT) ||
-	       (type == RXM_BUF_POOL_TX_ACK) ||
-	       (type == RXM_BUF_POOL_TX_RNDV) ||
-	       (type == RXM_BUF_POOL_TX_ATOMIC) ||
-	       (type == RXM_BUF_POOL_TX_CREDIT) ||
-	       (type == RXM_BUF_POOL_TX_SAR));
-	return ofi_buf_alloc(rxm_ep->buf_pools[type].pool);
-}
-
-static inline struct rxm_rx_buf *
-rxm_rx_buf_alloc(struct rxm_ep *rxm_ep, struct fid_ep *msg_ep, uint8_t repost)
-{
-	struct rxm_rx_buf *rx_buf =
-		ofi_buf_alloc(rxm_ep->buf_pools[RXM_BUF_POOL_RX].pool);
-	if (OFI_LIKELY((long int)rx_buf)) {
-		assert(rx_buf->ep == rxm_ep);
-		rx_buf->hdr.state = RXM_RX;
-		rx_buf->msg_ep = msg_ep;
-		rx_buf->repost = repost;
-
-		if (!rxm_ep->srx_ctx)
-			rx_buf->conn = container_of(msg_ep->fid.context,
-						    struct rxm_conn, handle);
-	}
-	return rx_buf;
-}
+int rxm_post_recv(struct rxm_rx_buf *rx_buf);
 
 static inline void
 rxm_rx_buf_free(struct rxm_rx_buf *rx_buf)
 {
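+	/* rx_buf->data only differs from pkt.data when unexpected
+	 * message data was buffered off to the side; drop that copy */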
-	if (rx_buf->repost) {
-		dlist_insert_tail(&rx_buf->repost_entry,
-				  &rx_buf->ep->repost_ready_list);
+	if (rx_buf->data != rx_buf->pkt.data) {
+		free(rx_buf->data);
+		rx_buf->data = &rx_buf->pkt.data;
+	}
+
+	/* Discard rx buffer if its msg_ep was closed */
+	if (rx_buf->repost && (rx_buf->ep->srx_ctx || rx_buf->conn->msg_ep)) {
+		rxm_post_recv(rx_buf);
 	} else {
 		ofi_buf_free(rx_buf);
 	}
 }
 
 static inline void
-rxm_recv_entry_release(struct rxm_recv_queue *queue, struct rxm_recv_entry *entry)
+rxm_recv_entry_release(struct rxm_recv_entry *entry)
 {
-	entry->total_len = 0;
-	freestack_push(queue->fs, entry);
+	if (entry->recv_queue)
+		ofi_freestack_push(entry->recv_queue->fs, entry);
+	else
+		ofi_buf_free(entry);
 }
 
-static inline int rxm_cq_write_recv_comp(struct rxm_rx_buf *rx_buf,
-					 void *context, uint64_t flags,
-					 size_t len, char *buf)
+static inline void
+rxm_cq_write_recv_comp(struct rxm_rx_buf *rx_buf, void *context, uint64_t flags,
+		       size_t len, char *buf)
 {
 	if (rx_buf->ep->rxm_info->caps & FI_SOURCE)
-		return ofi_cq_write_src(rx_buf->ep->util_ep.rx_cq, context,
-					flags, len, buf, rx_buf->pkt.hdr.data,
-					rx_buf->pkt.hdr.tag,
-					rx_buf->conn->handle.fi_addr);
+		rxm_cq_write_src(rx_buf->ep->util_ep.rx_cq, context,
+				 flags, len, buf, rx_buf->pkt.hdr.data,
+				 rx_buf->pkt.hdr.tag,
+				 rx_buf->conn->peer->fi_addr);
 	else
-		return ofi_cq_write(rx_buf->ep->util_ep.rx_cq, context,
-				    flags, len, buf, rx_buf->pkt.hdr.data,
-				    rx_buf->pkt.hdr.tag);
+		rxm_cq_write(rx_buf->ep->util_ep.rx_cq, context,
+			     flags, len, buf, rx_buf->pkt.hdr.data,
+			     rx_buf->pkt.hdr.tag);
 }
 
+struct rxm_mr *rxm_mr_get_map_entry(struct rxm_domain *domain, uint64_t key);
+
+struct rxm_recv_entry *
+rxm_multi_recv_entry_get(struct rxm_ep *rxm_ep, const struct iovec *iov,
+		   void **desc, size_t count, fi_addr_t src_addr,
+		   uint64_t tag, uint64_t ignore, void *context,
+		   uint64_t flags);
 #endif
diff --git a/deps/libfabric/prov/rxm/src/rxm_atomic.c b/deps/libfabric/prov/rxm/src/rxm_atomic.c
index de7e234e9da0a81134f275fd4688a45162a6b30d..caef29caa2b0708438c5f15348796f89cff69705 100644
--- a/deps/libfabric/prov/rxm/src/rxm_atomic.c
+++ b/deps/libfabric/prov/rxm/src/rxm_atomic.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2018 Cray Inc. All rights reserved.
  * Copyright (c) 2018 System Fabric Works, Inc. All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -37,14 +38,14 @@
 
 static void
 rxm_ep_format_atomic_pkt_hdr(struct rxm_conn *rxm_conn,
-		 struct rxm_tx_atomic_buf *tx_buf, size_t data_len,
+		 struct rxm_tx_buf *tx_buf, size_t data_len,
 		 uint32_t pkt_op, enum fi_datatype datatype,
 		 uint8_t atomic_op, uint64_t flags, uint64_t data,
 		 const struct fi_rma_ioc *rma_ioc, size_t rma_ioc_count)
 {
 	struct rxm_atomic_hdr *atomic_hdr;
 
-	atomic_hdr = (struct rxm_atomic_hdr *)tx_buf->pkt.data;
+	atomic_hdr = (struct rxm_atomic_hdr *) tx_buf->pkt.data;
 	rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, pkt_op, data, 0,
 				 flags, &tx_buf->pkt);
 	tx_buf->pkt.ctrl_hdr.type = rxm_ctrl_atomic;
@@ -60,7 +61,7 @@ rxm_ep_format_atomic_pkt_hdr(struct rxm_conn *rxm_conn,
 
 static inline int
 rxm_ep_send_atomic_req(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
-		       struct rxm_tx_atomic_buf *tx_buf, uint64_t len)
+		       struct rxm_tx_buf *tx_buf, uint64_t len)
 {
 	int ret;
 
@@ -93,15 +94,20 @@ rxm_ep_atomic_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 		struct fi_ioc *resultv, void **result_desc,
 		size_t result_iov_count, uint32_t op, uint64_t flags)
 {
-	struct rxm_tx_atomic_buf *tx_buf;
+	struct rxm_tx_buf *tx_buf;
 	struct rxm_atomic_hdr *atomic_hdr;
 	struct iovec buf_iov[RXM_IOV_LIMIT];
 	struct iovec cmp_iov[RXM_IOV_LIMIT];
+	enum fi_hmem_iface buf_iface = FI_HMEM_SYSTEM;
+	enum fi_hmem_iface cmp_iface;
+	uint64_t buf_device = 0;
+	uint64_t cmp_device;
 	size_t datatype_sz = ofi_datatype_size(msg->datatype);
 	size_t buf_len = 0;
 	size_t cmp_len = 0;
-	size_t tot_len;
+	size_t data_len, tot_len;
 	ssize_t ret;
+	int i;
 
 	assert(msg->iov_count <= RXM_IOV_LIMIT &&
 	       msg->rma_iov_count <= RXM_IOV_LIMIT);
@@ -117,6 +123,10 @@ rxm_ep_atomic_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 		ofi_ioc_to_iov(msg->msg_iov, buf_iov, msg->iov_count,
 			       datatype_sz);
 		buf_len = ofi_total_iov_len(buf_iov, msg->iov_count);
+
+		buf_iface = rxm_mr_desc_to_hmem_iface_dev(msg->desc,
+							  msg->iov_count,
+							  &buf_device);
 	}
 
 	if (op == ofi_op_atomic_compare) {
@@ -125,31 +135,26 @@ rxm_ep_atomic_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 			       datatype_sz);
 		cmp_len = ofi_total_iov_len(cmp_iov, compare_iov_count);
 		assert(buf_len == cmp_len);
+
+		cmp_iface = rxm_mr_desc_to_hmem_iface_dev(compare_desc,
+							  compare_iov_count,
+							  &cmp_device);
 	}
 
-	tot_len = buf_len + cmp_len + sizeof(struct rxm_atomic_hdr) +
-			sizeof(struct rxm_pkt);
+	data_len = buf_len + cmp_len + sizeof(struct rxm_atomic_hdr);
+	tot_len = data_len + sizeof(struct rxm_pkt);
 
-	if (tot_len > rxm_eager_limit) {
+	if (tot_len > rxm_packet_size) {
 		FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
 			"atomic data too large %zu\n", tot_len);
 		return -FI_EINVAL;
 	}
 
-	if (ofi_atomic_dec32(&rxm_ep->atomic_tx_credits) < 0) {
-		ret = -FI_EAGAIN;
-		goto restore_credit;
-	}
-
-	tx_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_ATOMIC);
-	if (OFI_UNLIKELY(!tx_buf)) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
-			"Ran out of buffers from Atomic buffer pool\n");
-		ret = -FI_EAGAIN;
-		goto restore_credit;
-	}
+	tx_buf = rxm_get_tx_buf(rxm_ep);
+	if (!tx_buf)
+		return -FI_EAGAIN;
 
-	rxm_ep_format_atomic_pkt_hdr(rxm_conn, tx_buf, tot_len, op,
+	rxm_ep_format_atomic_pkt_hdr(rxm_conn, tx_buf, data_len, op,
 				msg->datatype, msg->op, flags, msg->data,
 				msg->rma_iov, msg->rma_iov_count);
 	tx_buf->pkt.ctrl_hdr.msg_id = ofi_buf_index(tx_buf);
@@ -157,24 +162,32 @@ rxm_ep_atomic_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 
 	atomic_hdr = (struct rxm_atomic_hdr *) tx_buf->pkt.data;
 
-	ofi_copy_from_iov(atomic_hdr->data, buf_len, buf_iov,
-			  msg->iov_count, 0);
-	if (cmp_len)
-		ofi_copy_from_iov(atomic_hdr->data + buf_len, cmp_len,
-				  cmp_iov, compare_iov_count, 0);
+	ret = ofi_copy_from_hmem_iov(atomic_hdr->data, buf_len, buf_iface,
+				     buf_device, buf_iov, msg->iov_count, 0);
+	assert(ret == buf_len);
 
-	tx_buf->result_iov_count = result_iov_count;
-	if (resultv)
-		ofi_ioc_to_iov(resultv, tx_buf->result_iov, result_iov_count,
-			       datatype_sz);
+	if (cmp_len) {
+		ret = ofi_copy_from_hmem_iov(atomic_hdr->data + buf_len,
+					     cmp_len, cmp_iface, cmp_device,
+					     cmp_iov, compare_iov_count, 0);
+		assert(ret == cmp_len);
+	}
+
+	tx_buf->atomic_result.count = result_iov_count;
+	if (resultv) {
+		ofi_ioc_to_iov(resultv, tx_buf->atomic_result.iov,
+			       result_iov_count, datatype_sz);
+
+		if (result_desc) {
+			for (i = 0; i < result_iov_count; i++)
+				tx_buf->atomic_result.desc[i] = result_desc[i];
+		}
+	}
 
 	ret = rxm_ep_send_atomic_req(rxm_ep, rxm_conn, tx_buf, tot_len);
-	if (OFI_LIKELY(!ret))
-		return ret;
+	if (ret)
+		rxm_free_rx_buf(rxm_ep, tx_buf);
 
-	ofi_buf_free(tx_buf);
-restore_credit:
-	ofi_atomic_inc32(&rxm_ep->atomic_tx_credits);
 	return ret;
 }
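
The atomic path above now distinguishes data_len (atomic header plus
operand data plus optional compare data) from tot_len (data_len plus
the transport packet header) and rejects any request that cannot fit a
single packet. A small sketch of that sizing arithmetic; the constants
are stand-ins, not the real rxm values:

#include <stddef.h>
#include <stdio.h>

#define PKT_HDR_SIZE	64	/* stand-in for sizeof(struct rxm_pkt) */
#define ATOMIC_HDR_SIZE	32	/* stand-in for sizeof(struct rxm_atomic_hdr) */
#define PACKET_SIZE	2048	/* stand-in for rxm_packet_size */

/* data_len is what travels in the packet payload; tot_len adds the
 * transport packet header, and only tot_len is checked against the
 * packet size. */
static int atomic_req_fits(size_t buf_len, size_t cmp_len,
			   size_t *data_len, size_t *tot_len)
{
	*data_len = buf_len + cmp_len + ATOMIC_HDR_SIZE;
	*tot_len = *data_len + PKT_HDR_SIZE;
	return *tot_len <= PACKET_SIZE;
}

int main(void)
{
	size_t data_len, tot_len;

	/* e.g. a 256-byte compare-and-swap: operand plus compare data */
	if (atomic_req_fits(256, 256, &data_len, &tot_len))
		printf("fits: data_len=%zu tot_len=%zu\n", data_len, tot_len);
	else
		printf("too large for a single packet\n");
	return 0;
}
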
 
diff --git a/deps/libfabric/prov/rxm/src/rxm_attr.c b/deps/libfabric/prov/rxm/src/rxm_attr.c
index 63fcf69094550860c6a78316734dd2c5d237d5dc..2078dfac13613c732aaf095381cba687647cfedc 100644
--- a/deps/libfabric/prov/rxm/src/rxm_attr.c
+++ b/deps/libfabric/prov/rxm/src/rxm_attr.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2015-2016 Intel Corporation. All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -32,10 +33,13 @@
 
 #include "rxm.h"
 
-#define RXM_TX_CAPS (OFI_TX_MSG_CAPS | FI_TAGGED | OFI_TX_RMA_CAPS | FI_ATOMICS)
+#define RXM_TX_CAPS (OFI_TX_MSG_CAPS | FI_TAGGED | OFI_TX_RMA_CAPS | \
+		     FI_ATOMICS)
+
 #define RXM_RX_CAPS (FI_SOURCE | OFI_RX_MSG_CAPS | FI_TAGGED | \
 		     OFI_RX_RMA_CAPS | FI_ATOMICS | FI_DIRECTED_RECV | \
 		     FI_MULTI_RECV)
+
 #define RXM_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM)
 
 
@@ -45,21 +49,21 @@
  * requested by the app. */
 
 struct fi_tx_attr rxm_tx_attr = {
-	.caps = RXM_TX_CAPS,
+	.caps = RXM_TX_CAPS | FI_HMEM,
 	.op_flags = RXM_PASSTHRU_TX_OP_FLAGS | RXM_TX_OP_FLAGS,
 	.msg_order = ~0x0ULL,
 	.comp_order = FI_ORDER_NONE,
-	.size = 1024,
+	.size = RXM_TX_SIZE,
 	.iov_limit = RXM_IOV_LIMIT,
 	.rma_iov_limit = RXM_IOV_LIMIT,
 };
 
 struct fi_rx_attr rxm_rx_attr = {
-	.caps = RXM_RX_CAPS,
+	.caps = RXM_RX_CAPS | FI_HMEM,
 	.op_flags = RXM_PASSTHRU_RX_OP_FLAGS | RXM_RX_OP_FLAGS,
 	.msg_order = ~0x0ULL,
 	.comp_order = FI_ORDER_NONE,
-	.size = 1024,
+	.size = RXM_RX_SIZE,
 	.iov_limit= RXM_IOV_LIMIT,
 };
 
@@ -68,7 +72,7 @@ struct fi_tx_attr rxm_tx_attr_coll = {
 	.op_flags = RXM_PASSTHRU_TX_OP_FLAGS | RXM_TX_OP_FLAGS,
 	.msg_order = ~0x0ULL,
 	.comp_order = FI_ORDER_NONE,
-	.size = 1024,
+	.size = RXM_TX_SIZE,
 	.iov_limit = RXM_IOV_LIMIT,
 	.rma_iov_limit = RXM_IOV_LIMIT,
 };
@@ -78,7 +82,7 @@ struct fi_rx_attr rxm_rx_attr_coll = {
 	.op_flags = RXM_PASSTHRU_RX_OP_FLAGS | RXM_RX_OP_FLAGS,
 	.msg_order = ~0x0ULL,
 	.comp_order = FI_ORDER_NONE,
-	.size = 1024,
+	.size = RXM_RX_SIZE,
 	.iov_limit= RXM_IOV_LIMIT,
 };
 
@@ -155,7 +159,7 @@ struct fi_info rxm_coll_info = {
 };
 
 struct fi_info rxm_base_info = {
-	.caps = RXM_TX_CAPS | RXM_RX_CAPS | RXM_DOMAIN_CAPS,
+	.caps = RXM_TX_CAPS | RXM_RX_CAPS | RXM_DOMAIN_CAPS | FI_HMEM,
 	.addr_format = FI_SOCKADDR,
 	.tx_attr = &rxm_tx_attr,
 	.rx_attr = &rxm_rx_attr,
@@ -177,7 +181,7 @@ struct fi_info rxm_tcp_info = {
 };
 
 struct fi_info rxm_verbs_info = {
-	.caps = RXM_TX_CAPS | RXM_RX_CAPS | RXM_DOMAIN_CAPS,
+	.caps = RXM_TX_CAPS | RXM_RX_CAPS | RXM_DOMAIN_CAPS | FI_HMEM,
 	.addr_format = FI_SOCKADDR,
 	.tx_attr = &rxm_tx_attr,
 	.rx_attr = &rxm_rx_attr,
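
rxm_attr.c above advertises FI_HMEM by ORing one more bit into the
capability masks; capability negotiation is plain bitwise arithmetic.
A toy illustration with invented bit values (the real FI_* constants
come from the libfabric headers):

#include <stdint.h>
#include <stdio.h>

#define CAP_MSG		(1ULL << 0)
#define CAP_TAGGED	(1ULL << 1)
#define CAP_RMA		(1ULL << 2)
#define CAP_HMEM	(1ULL << 3)

int main(void)
{
	uint64_t supported = CAP_MSG | CAP_TAGGED | CAP_RMA | CAP_HMEM;
	uint64_t requested = CAP_MSG | CAP_HMEM;

	/* Satisfiable iff the request asks for no bit the provider
	 * lacks -- conceptually what capability matching checks. */
	if (requested & ~supported)
		printf("unsupported caps requested\n");
	else
		printf("caps ok\n");
	return 0;
}
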
diff --git a/deps/libfabric/prov/rxm/src/rxm_av.c b/deps/libfabric/prov/rxm/src/rxm_av.c
index 94957fc67b97f7be113200154cc7cdb0ad6a7600..375b41a313f759fa35207d2d75f82c8be9af2b51 100644
--- a/deps/libfabric/prov/rxm/src/rxm_av.c
+++ b/deps/libfabric/prov/rxm/src/rxm_av.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Intel Corporation. All rights reserved.
+ * Copyright (c) 2018-2021 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -34,141 +34,253 @@
 
 #include "rxm.h"
 
-static int rxm_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr,
-			 size_t count, uint64_t flags)
+
+size_t rxm_av_max_peers(struct rxm_av *av)
 {
-	struct util_av *av = container_of(av_fid, struct util_av, av_fid);
-	struct rxm_ep *rxm_ep;
-	int i, ret = 0;
-
-	fastlock_acquire(&av->ep_list_lock);
-	/* This should be before ofi_ip_av_remove as we need to know
-	 * fi_addr -> addr mapping when moving handle to peer list. */
-	dlist_foreach_container(&av->ep_list, struct rxm_ep,
-				rxm_ep, util_ep.av_entry) {
-		ofi_ep_lock_acquire(&rxm_ep->util_ep);
-		for (i = 0; i < count; i++) {
-			ret = rxm_cmap_remove(rxm_ep->cmap, *fi_addr + i);
-			if (ret)
-				FI_WARN(&rxm_prov, FI_LOG_AV,
-					"cmap remove failed for fi_addr: %"
-					PRIu64 "\n", *fi_addr + i);
-		}
-		ofi_ep_lock_release(&rxm_ep->util_ep);
+	size_t cnt;
+
+	fastlock_acquire(&av->util_av.lock);
+	cnt = av->peer_pool->entry_cnt;
+	fastlock_release(&av->util_av.lock);
+	return cnt;
+}
+
+struct rxm_conn *rxm_av_alloc_conn(struct rxm_av *av)
+{
+	struct rxm_conn *conn;
+	fastlock_acquire(&av->util_av.lock);
+	conn = ofi_buf_alloc(av->conn_pool);
+	fastlock_release(&av->util_av.lock);
+	return conn;
+}
+
+void rxm_av_free_conn(struct rxm_conn *conn)
+{
+	struct rxm_av *av;
+	av = container_of(conn->ep->util_ep.av, struct rxm_av, util_av);
+	fastlock_acquire(&av->util_av.lock);
+	ofi_buf_free(conn);
+	fastlock_release(&av->util_av.lock);
+}
+
+static int rxm_addr_compare(struct ofi_rbmap *map, void *key, void *data)
+{
+	return memcmp(&((struct rxm_peer_addr *) data)->addr, key,
+		container_of(map, struct rxm_av, addr_map)->util_av.addrlen);
+}
+
+static struct rxm_peer_addr *
+rxm_alloc_peer(struct rxm_av *av, const void *addr)
+{
+	struct rxm_peer_addr *peer;
+
+	assert(fastlock_held(&av->util_av.lock));
+	peer = ofi_ibuf_alloc(av->peer_pool);
+	if (!peer)
+		return NULL;
+
+	peer->av = av;
+	peer->index = (int) ofi_buf_index(peer);
+	peer->fi_addr = FI_ADDR_NOTAVAIL;
+	peer->refcnt = 1;
+	memcpy(&peer->addr, addr, av->util_av.addrlen);
+
+	if (ofi_rbmap_insert(&av->addr_map, &peer->addr, peer, &peer->node)) {
+		ofi_ibuf_free(peer);
+		peer = NULL;
 	}
-	fastlock_release(&av->ep_list_lock);
 
-	return ofi_ip_av_remove(av_fid, fi_addr, count, flags);
+	return peer;
+}
+
+static void rxm_free_peer(struct rxm_peer_addr *peer)
+{
+	assert(fastlock_held(&peer->av->util_av.lock));
+	assert(!peer->refcnt);
+	ofi_rbmap_delete(&peer->av->addr_map, peer->node);
+	ofi_ibuf_free(peer);
+}
+
+struct rxm_peer_addr *
+rxm_get_peer(struct rxm_av *av, const void *addr)
+{
+	struct rxm_peer_addr *peer;
+	struct ofi_rbnode *node;
+
+	fastlock_acquire(&av->util_av.lock);
+	node = ofi_rbmap_find(&av->addr_map, (void *) addr);
+	if (node) {
+		peer = node->data;
+		peer->refcnt++;
+	} else {
+		peer = rxm_alloc_peer(av, addr);
+	}
+
+	fastlock_release(&av->util_av.lock);
+	return peer;
+}
+
+void rxm_put_peer(struct rxm_peer_addr *peer)
+{
+	struct rxm_av *av;
+
+	av = peer->av;
+	fastlock_acquire(&av->util_av.lock);
+	if (--peer->refcnt == 0)
+		rxm_free_peer(peer);
+	fastlock_release(&av->util_av.lock);
+}
+
+void rxm_ref_peer(struct rxm_peer_addr *peer)
+{
+	fastlock_acquire(&peer->av->util_av.lock);
+	peer->refcnt++;
+	fastlock_release(&peer->av->util_av.lock);
+}
+
+static void
+rxm_set_av_context(struct rxm_av *av, fi_addr_t fi_addr,
+		   struct rxm_peer_addr *peer)
+{
+	struct rxm_peer_addr **peer_ctx;
+
+	peer_ctx = ofi_av_addr_context(&av->util_av, fi_addr);
+	*peer_ctx = peer;
+}
+
+static void
+rxm_put_peer_addr(struct rxm_av *av, fi_addr_t fi_addr)
+{
+	struct rxm_peer_addr **peer;
+
+	fastlock_acquire(&av->util_av.lock);
+	peer = ofi_av_addr_context(&av->util_av, fi_addr);
+	if (--(*peer)->refcnt == 0)
+		rxm_free_peer(*peer);
+
+	rxm_set_av_context(av, fi_addr, NULL);
+	fastlock_release(&av->util_av.lock);
 }
 
-/* TODO: Determine if it's cleaner to insert an address into the cmap only
- * when we need to send to that address, rather than inserting the address
- * into the cmap when adding it to the AV.
- */
 static int
-rxm_av_insert_cmap(struct fid_av *av_fid, const void *addr, size_t count,
-		   fi_addr_t *fi_addr, uint64_t flags)
+rxm_av_add_peers(struct rxm_av *av, const void *addr, size_t count,
+		 fi_addr_t *fi_addr)
 {
-	struct util_av *av = container_of(av_fid, struct util_av, av_fid);
-	struct rxm_ep *rxm_ep;
-	fi_addr_t fi_addr_tmp;
-	size_t i;
-	int ret = 0;
+	struct rxm_peer_addr *peer;
 	const void *cur_addr;
+	fi_addr_t cur_fi_addr;
+	size_t i;
+
+	for (i = 0; i < count; i++) {
+		cur_addr = ((char *) addr + i * av->util_av.addrlen);
+		peer = rxm_get_peer(av, cur_addr);
+		if (!peer)
+			goto err;
+
+		peer->fi_addr = fi_addr ? fi_addr[i] :
+				ofi_av_lookup_fi_addr(&av->util_av, cur_addr);
+
+		/* lookup can fail if prior AV insertion failed */
+		if (peer->fi_addr != FI_ADDR_NOTAVAIL)
+			rxm_set_av_context(av, peer->fi_addr, peer);
+	}
+	return 0;
 
-	fastlock_acquire(&av->ep_list_lock);
-	dlist_foreach_container(&av->ep_list, struct rxm_ep,
-				rxm_ep, util_ep.av_entry) {
-		ofi_ep_lock_acquire(&rxm_ep->util_ep);
-		for (i = 0; i < count; i++) {
-			if (!rxm_ep->cmap)
-				break;
-
-			cur_addr = (const void *) ((char *) addr + i * av->addrlen);
-			fi_addr_tmp = (fi_addr ? fi_addr[i] :
-				       ofi_av_lookup_fi_addr_unsafe(av, cur_addr));
-			if (fi_addr_tmp == FI_ADDR_NOTAVAIL)
-				continue;
-
-			ret = rxm_cmap_update(rxm_ep->cmap, cur_addr, fi_addr_tmp);
-			if (OFI_UNLIKELY(ret)) {
-				FI_WARN(&rxm_prov, FI_LOG_AV,
-					"cmap update failed for fi_addr: %"
-					PRIu64 "\n", fi_addr_tmp);
-				break;
-			}
+err:
+	while (i--) {
+		if (fi_addr) {
+			cur_fi_addr = fi_addr[i];
+		} else {
+			cur_addr = ((char *) addr + i * av->util_av.addrlen);
+			cur_fi_addr = ofi_av_lookup_fi_addr(&av->util_av,
+							    cur_addr);
 		}
-		ofi_ep_lock_release(&rxm_ep->util_ep);
+		if (cur_fi_addr != FI_ADDR_NOTAVAIL)
+			rxm_put_peer_addr(av, cur_fi_addr);
 	}
-	fastlock_release(&av->ep_list_lock);
-	return ret;
+	return -FI_ENOMEM;
+}
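
rxm_av_add_peers() above is an all-or-nothing bulk insert: on the first
failure it walks back over the entries that already succeeded and
releases them. The `while (i--)` reverse-unwind idiom in isolation,
with hypothetical acquire/release stand-ins:

#include <stddef.h>

/* Hypothetical stand-ins: acquire() fails on id 3 to force the
 * rollback path. */
static int acquire(int id) { return id == 3 ? -1 : 0; }
static void release(int id) { (void) id; }

static int acquire_all(const int *ids, size_t count)
{
	size_t i;

	for (i = 0; i < count; i++) {
		if (acquire(ids[i]))
			goto err;
	}
	return 0;

err:
	while (i--)		/* undo completed entries, in reverse */
		release(ids[i]);
	return -1;
}

int main(void)
{
	int ids[] = { 1, 2, 3, 4 };

	/* Expect failure: entries 1 and 2 are acquired, then released. */
	return acquire_all(ids, 4) ? 0 : 1;
}
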
+
+static int rxm_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr,
+			 size_t count, uint64_t flags)
+{
+	struct rxm_av *av;
+	size_t i;
+
+	av = container_of(av_fid, struct rxm_av, util_av.av_fid);
+	for (i = 0; i < count; i++)
+		rxm_put_peer_addr(av, fi_addr[i]);
+
+	return ofi_ip_av_remove(av_fid, fi_addr, count, flags);
 }
 
 static int rxm_av_insert(struct fid_av *av_fid, const void *addr, size_t count,
 			 fi_addr_t *fi_addr, uint64_t flags, void *context)
 {
-	struct util_av *av = container_of(av_fid, struct util_av, av_fid);
-	int ret, retv;
+	struct rxm_av *av;
+	int ret;
 
+	av = container_of(av_fid, struct rxm_av, util_av.av_fid.fid);
 	ret = ofi_ip_av_insert(av_fid, addr, count, fi_addr, flags, context);
 	if (ret < 0)
 		return ret;
 
-	if (!av->eq && !ret)
-		return ret;
+	if (!av->util_av.eq)
+		count = ret;
 
-	retv = rxm_av_insert_cmap(av_fid, addr, count, fi_addr, flags);
-	if (retv) {
-		ret = rxm_av_remove(av_fid, fi_addr, count, flags);
-		if (ret)
-			FI_WARN(&rxm_prov, FI_LOG_AV, "Failed to remove addr "
-				"from AV during error handling\n");
-		return retv;
+	ret = rxm_av_add_peers(av, addr, count, fi_addr);
+	if (ret) {
+		/* If insert was async, ofi_ip_av_insert() will have written
+		 * an event to the EQ with the number of insertions.  For
+		 * correctness we need to delay writing the event to the EQ
+		 * until all processing has completed.  This should be done
+		 * when separating the rxm av from the util av.  For now,
+		 * assume synchronous operation (most common case) and fail
+		 * the insert.  This could leave a bogus entry on the EQ.
+		 * But the app should detect that insert failed and is likely
+		 * to abort.
+		 */
+		rxm_av_remove(av_fid, fi_addr, count, flags);
+		return ret;
 	}
-	return ret;
+
+	return av->util_av.eq ? 0 : count;
 }
 
 static int rxm_av_insertsym(struct fid_av *av_fid, const char *node,
 			    size_t nodecnt, const char *service, size_t svccnt,
 			    fi_addr_t *fi_addr, uint64_t flags, void *context)
 {
-	struct util_av *av = container_of(av_fid, struct util_av, av_fid);
+	struct rxm_av *av;
 	void *addr;
-	size_t addrlen, count = nodecnt * svccnt;
-	int ret, retv;
+	size_t addrlen, count;
+	int ret;
 
-	ret = ofi_verify_av_insert(av, flags);
+	av = container_of(av_fid, struct rxm_av, util_av.av_fid.fid);
+	ret = ofi_verify_av_insert(&av->util_av, flags, context);
 	if (ret)
 		return ret;
 
-	ret = ofi_ip_av_sym_getaddr(av, node, nodecnt, service,
+	ret = ofi_ip_av_sym_getaddr(&av->util_av, node, nodecnt, service,
 				    svccnt, &addr, &addrlen);
 	if (ret <= 0)
 		return ret;
 
-	assert(ret == count);
-
-	ret = ofi_ip_av_insertv(av, addr, addrlen, count, fi_addr, context);
-	if (!av->eq && ret < count) {
+	count = ret;
+	ret = ofi_ip_av_insertv(&av->util_av, addr, addrlen, count, fi_addr, flags,
+				context);
+	if (ret > 0 && ret < count)
 		count = ret;
-	}
 
-	/* If the AV is bound to an EQ, we can't determine which entries were
-	 * added successfully to the AV until we process the insertion events
-	 * later when reading the EQ.  Add all addresses to the cmap
-	 * optimistically.
-	 */
-	retv = rxm_av_insert_cmap(av_fid, addr, count, fi_addr, flags);
-	if (retv) {
-		ret = rxm_av_remove(av_fid, fi_addr, count, flags);
-		if (ret)
-			FI_WARN(&rxm_prov, FI_LOG_AV, "Failed to remove addr "
-				"from AV during error handling\n");
-		ret = retv;
+	ret = rxm_av_add_peers(av, addr, count, fi_addr);
+	if (ret) {
+		/* See comment in rxm_av_insert. */
+		rxm_av_remove(av_fid, fi_addr, count, flags);
+		return ret;
 	}
 
 	free(addr);
-	return ret;
+	return av->util_av.eq ? 0 : count;
 }
 
 int rxm_av_insertsvc(struct fid_av *av, const char *node, const char *service,
@@ -189,6 +301,30 @@ int rxm_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr,
 	return ofi_ip_av_lookup(av_fid, fi_addr, addr, addrlen);
 }
 
+static int rxm_av_close(struct fid *av_fid)
+{
+	struct rxm_av *av;
+	int ret;
+
+	av = container_of(av_fid, struct rxm_av, util_av.av_fid.fid);
+	ret = ofi_av_close(&av->util_av);
+	if (ret)
+		return ret;
+
+	ofi_rbmap_cleanup(&av->addr_map);
+	ofi_bufpool_destroy(av->conn_pool);
+	ofi_bufpool_destroy(av->peer_pool);
+	free(av);
+	return 0;
+}
+
+static struct fi_ops rxm_av_fi_ops = {
+	.size = sizeof(struct fi_ops),
+	.close = rxm_av_close,
+	.bind = ofi_av_bind,
+	.control = fi_no_control,
+	.ops_open = fi_no_ops_open,
+};
 
 static struct fi_ops_av rxm_av_ops = {
 	.size = sizeof(struct fi_ops_av),
@@ -202,15 +338,54 @@ static struct fi_ops_av rxm_av_ops = {
 };
 
 int rxm_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
-		struct fid_av **av, void *context)
+		struct fid_av **fid_av, void *context)
 {
+	struct rxm_domain *domain;
+	struct util_av_attr util_attr;
+	struct rxm_av *av;
 	int ret;
 
-	ret = ofi_ip_av_create(domain_fid, attr, av, context);
+	av = calloc(1, sizeof(*av));
+	if (!av)
+		return -FI_ENOMEM;
+
+	ret = ofi_bufpool_create(&av->peer_pool, sizeof(struct rxm_peer_addr),
+				 0, 0, 0, OFI_BUFPOOL_INDEXED |
+				 OFI_BUFPOOL_NO_TRACK);
 	if (ret)
-		return ret;
+		goto free;
+
+	ret = ofi_bufpool_create(&av->conn_pool, sizeof(struct rxm_conn),
+				 0, 0, 0, 0);
+	if (ret)
+		goto destroy1;
+
+	ofi_rbmap_init(&av->addr_map, rxm_addr_compare);
+	domain = container_of(domain_fid, struct rxm_domain,
+			      util_domain.domain_fid);
 
-	(*av)->ops = &rxm_av_ops;
+	util_attr.context_len = sizeof(struct rxm_peer_addr *);
+	util_attr.flags = 0;
+	util_attr.addrlen = ofi_sizeof_addr_format(domain->util_domain.
+						   addr_format);
+	if (attr->type == FI_AV_UNSPEC)
+		attr->type = FI_AV_TABLE;
+
+	ret = ofi_av_init(&domain->util_domain, attr, &util_attr,
+			  &av->util_av, context);
+	if (ret)
+		goto destroy2;
+
+	av->util_av.av_fid.fid.ops = &rxm_av_fi_ops;
+	av->util_av.av_fid.ops = &rxm_av_ops;
+	*fid_av = &av->util_av.av_fid;
 	return 0;
-}
 
+destroy2:
+	ofi_bufpool_destroy(av->conn_pool);
+destroy1:
+	ofi_bufpool_destroy(av->peer_pool);
+free:
+	free(av);
+	return ret;
+}
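
The new rxm_av keeps one reference-counted rxm_peer_addr per unique
address, found via an rbtree keyed on the raw address, while each
fi_addr's AV context slot holds a pointer to its peer. The get/put
discipline in isolation, sketched with a linear table instead of the
rbmap (all names hypothetical):

#include <string.h>

#define ADDRLEN		16
#define MAX_PEERS	64

struct peer {
	char addr[ADDRLEN];
	int refcnt;		/* 0 means the slot is free */
};

static struct peer table[MAX_PEERS];

/* Look up an existing peer or create one; the caller owns one
 * reference.  rxm uses an rbmap keyed on the raw address; a linear
 * scan keeps the sketch short. */
static struct peer *peer_get(const void *addr)
{
	struct peer *free_slot = NULL;
	int i;

	for (i = 0; i < MAX_PEERS; i++) {
		if (table[i].refcnt &&
		    !memcmp(table[i].addr, addr, ADDRLEN)) {
			table[i].refcnt++;
			return &table[i];
		}
		if (!table[i].refcnt && !free_slot)
			free_slot = &table[i];
	}

	if (!free_slot)
		return NULL;
	memcpy(free_slot->addr, addr, ADDRLEN);
	free_slot->refcnt = 1;
	return free_slot;
}

/* Drop a reference; the entry vanishes with the last holder, mirroring
 * rxm_put_peer()/rxm_free_peer() above. */
static void peer_put(struct peer *p)
{
	if (--p->refcnt == 0)
		memset(p, 0, sizeof(*p));
}

int main(void)
{
	char addr[ADDRLEN] = "10.0.0.1";
	struct peer *a = peer_get(addr);	/* created, refcnt 1 */
	struct peer *b = peer_get(addr);	/* same entry, refcnt 2 */

	peer_put(b);
	peer_put(a);				/* entry released */
	return (a == b) ? 0 : 1;
}

Holding a reference from both the AV slot and each connection appears
to be what lets rxm_av_remove() and connection teardown proceed in
either order: the entry only disappears when the last holder calls put.
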
diff --git a/deps/libfabric/prov/rxm/src/rxm_conn.c b/deps/libfabric/prov/rxm/src/rxm_conn.c
index dff5d7dcc0e9e5d620b10d0fe8f007e1ea567ed7..473e6bfe9ea30277b36a1fe3c60dd8e785338436 100644
--- a/deps/libfabric/prov/rxm/src/rxm_conn.c
+++ b/deps/libfabric/prov/rxm/src/rxm_conn.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016 Intel Corporation, Inc.  All rights reserved.
+ * Copyright (c) 2016-2021 Intel Corporation, Inc.  All rights reserved.
  * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -39,1350 +39,757 @@
 #include <ofi_util.h>
 #include "rxm.h"
 
-static struct rxm_cmap_handle *rxm_conn_alloc(struct rxm_cmap *cmap);
-static int rxm_conn_connect(struct rxm_ep *ep,
-			    struct rxm_cmap_handle *handle, const void *addr);
-static int rxm_conn_signal(struct rxm_ep *ep, void *context,
-			   enum rxm_cmap_signal signal);
-static void rxm_conn_av_updated_handler(struct rxm_cmap_handle *handle);
-static void *rxm_conn_progress(void *arg);
-static void *rxm_conn_atomic_progress(void *arg);
-static int rxm_conn_handle_event(struct rxm_ep *rxm_ep,
-				 struct rxm_msg_eq_entry *entry);
 
+static void *rxm_cm_progress(void *arg);
+static void *rxm_cm_atomic_progress(void *arg);
+static void rxm_flush_msg_cq(struct rxm_ep *rxm_ep);
 
-/*
- * Connection map
- */
 
-char *rxm_cm_state_str[] = {
-	RXM_CM_STATES(OFI_STR)
+/* castable to fi_eq_cm_entry - we can't use fi_eq_cm_entry directly
+ * here because of a compiler error with a 0-sized array
+ */
+struct rxm_eq_cm_entry {
+	fid_t fid;
+	struct fi_info *info;
+	union rxm_cm_data data;
 };
 
-static inline ssize_t rxm_eq_readerr(struct rxm_ep *rxm_ep,
-				     struct rxm_msg_eq_entry *entry)
+
+static void rxm_close_conn(struct rxm_conn *conn)
 {
-	ssize_t ret;
+	struct rxm_deferred_tx_entry *tx_entry;
+	struct rxm_recv_entry *rx_entry;
+	struct rxm_rx_buf *buf;
 
-	/* reset previous err data info */
-	entry->err_entry.err_data_size = 0;
+	FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "closing conn %p\n", conn);
 
-	ret = fi_eq_readerr(rxm_ep->msg_eq, &entry->err_entry, 0);
-	if (ret != sizeof(entry->err_entry)) {
-		if (ret != -FI_EAGAIN)
-			FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
-				"unable to fi_eq_readerr: %zd\n", ret);
-		return ret < 0 ? ret : -FI_EINVAL;
+	assert(ofi_ep_lock_held(&conn->ep->util_ep));
+	/* All deferred transfers are internally generated */
+	while (!dlist_empty(&conn->deferred_tx_queue)) {
+		tx_entry = container_of(conn->deferred_tx_queue.next,
+				     struct rxm_deferred_tx_entry, entry);
+		rxm_dequeue_deferred_tx(tx_entry);
+		free(tx_entry);
 	}
 
-	if (entry->err_entry.err == ECONNREFUSED) {
-		entry->context = entry->err_entry.fid->context;
-		return -FI_ECONNREFUSED;
+	while (!dlist_empty(&conn->deferred_sar_segments)) {
+		buf = container_of(conn->deferred_sar_segments.next,
+				   struct rxm_rx_buf, unexp_msg.entry);
+		dlist_remove(&buf->unexp_msg.entry);
+		rxm_rx_buf_free(buf);
 	}
 
-	OFI_EQ_STRERROR(&rxm_prov, FI_LOG_WARN, FI_LOG_EP_CTRL,
-			rxm_ep->msg_eq, &entry->err_entry);
-	return -entry->err_entry.err;
-}
-
-static ssize_t rxm_eq_read(struct rxm_ep *ep, size_t len,
-			   struct rxm_msg_eq_entry *entry)
-{
-	ssize_t ret;
-
-	ret = fi_eq_read(ep->msg_eq, &entry->event, &entry->cm_entry, len, 0);
-	if (ret == -FI_EAVAIL)
-		ret = rxm_eq_readerr(ep, entry);
-
-	return ret;
-}
-
-static void rxm_cmap_set_key(struct rxm_cmap_handle *handle)
-{
-	handle->key = ofi_idx2key(&handle->cmap->key_idx,
-		ofi_idx_insert(&handle->cmap->handles_idx, handle));
-}
-
-static void rxm_cmap_clear_key(struct rxm_cmap_handle *handle)
-{
-	int index = ofi_key2idx(&handle->cmap->key_idx, handle->key);
-
-	if (!ofi_idx_is_valid(&handle->cmap->handles_idx, index))
-		FI_WARN(handle->cmap->av->prov, FI_LOG_AV, "Invalid key!\n");
-	else
-		ofi_idx_remove(&handle->cmap->handles_idx, index);
-}
-
-struct rxm_cmap_handle *rxm_cmap_key2handle(struct rxm_cmap *cmap, uint64_t key)
-{
-	struct rxm_cmap_handle *handle;
-
-	if (!(handle = ofi_idx_lookup(&cmap->handles_idx,
-				      ofi_key2idx(&cmap->key_idx, key)))) {
-		FI_WARN(cmap->av->prov, FI_LOG_AV, "Invalid key!\n");
-	} else {
-		if (handle->key != key) {
-			FI_WARN(cmap->av->prov, FI_LOG_AV,
-				"handle->key not matching given key\n");
-			handle = NULL;
-		}
+	while (!dlist_empty(&conn->deferred_sar_msgs)) {
+		rx_entry = container_of(conn->deferred_sar_msgs.next,
+					struct rxm_recv_entry, sar.entry);
+		dlist_remove(&rx_entry->entry);
+		rxm_recv_entry_release(rx_entry);
 	}
-	return handle;
-}
-
-static void rxm_cmap_init_handle(struct rxm_cmap_handle *handle,
-				  struct rxm_cmap *cmap,
-				  enum rxm_cmap_state state,
-				  fi_addr_t fi_addr,
-				  struct rxm_cmap_peer *peer)
-{
-	handle->cmap = cmap;
-	RXM_CM_UPDATE_STATE(handle, state);
-	rxm_cmap_set_key(handle);
-	handle->fi_addr = fi_addr;
-	handle->peer = peer;
-}
-
-static int rxm_cmap_match_peer(struct dlist_entry *entry, const void *addr)
-{
-	struct rxm_cmap_peer *peer;
+	fi_close(&conn->msg_ep->fid);
+	rxm_flush_msg_cq(conn->ep);
+	dlist_remove_init(&conn->loopback_entry);
+	conn->msg_ep = NULL;
 
-	peer = container_of(entry, struct rxm_cmap_peer, entry);
-	return !memcmp(peer->addr, addr, peer->handle->cmap->av->addrlen);
+	if (conn->state == RXM_CM_CONNECTING || conn->state == RXM_CM_ACCEPTING)
+		conn->ep->connecting_cnt--;
+	assert(conn->ep->connecting_cnt >= 0);
+	conn->state = RXM_CM_IDLE;
 }
 
-static int rxm_cmap_del_handle(struct rxm_cmap_handle *handle)
+static int rxm_open_conn(struct rxm_conn *conn, struct fi_info *msg_info)
 {
-	struct rxm_cmap *cmap = handle->cmap;
+	struct rxm_domain *domain;
+	struct rxm_ep *ep;
+	struct fid_ep *msg_ep;
 	int ret;
 
-	FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL,
-	       "marking connection handle: %p for deletion\n", handle);
-	rxm_cmap_clear_key(handle);
-
-	RXM_CM_UPDATE_STATE(handle, RXM_CMAP_SHUTDOWN);
+	FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "open msg ep %p\n", conn);
 
-	/* Signal CM thread to delete the handle. This is required
-	 * so that the CM thread handles any pending events for this
-	 * ep correctly. Handle would be freed finally after processing the
-	 * events */
-	ret = rxm_conn_signal(cmap->ep, handle, RXM_CMAP_FREE);
+	assert(ofi_ep_lock_held(&conn->ep->util_ep));
+	ep = conn->ep;
+	domain = container_of(ep->util_ep.domain, struct rxm_domain,
+			      util_domain);
+	ret = fi_endpoint(domain->msg_domain, msg_info, &msg_ep, conn);
 	if (ret) {
-		FI_WARN(cmap->av->prov, FI_LOG_EP_CTRL,
-			"Unable to signal CM thread\n");
+		RXM_WARN_ERR(FI_LOG_EP_CTRL, "fi_endpoint", ret);
 		return ret;
 	}
-	return 0;
-}
 
-ssize_t rxm_get_conn(struct rxm_ep *rxm_ep, fi_addr_t addr,
-		     struct rxm_conn **rxm_conn)
-{
-	struct rxm_cmap_handle *handle;
-	ssize_t ret;
-
-	assert(rxm_ep->util_ep.tx_cq);
-	handle = rxm_cmap_acquire_handle(rxm_ep->cmap, addr);
-	if (!handle) {
-		ret = rxm_cmap_alloc_handle(rxm_ep->cmap, addr,
-					    RXM_CMAP_IDLE, &handle);
-		if (ret)
-			return ret;
+	ret = fi_ep_bind(msg_ep, &ep->msg_eq->fid, 0);
+	if (ret) {
+		RXM_WARN_ERR(FI_LOG_EP_CTRL, "fi_ep_bind", ret);
+		goto err;
 	}
 
-	*rxm_conn = container_of(handle, struct rxm_conn, handle);
-
-	if (handle->state != RXM_CMAP_CONNECTED) {
-		ret = rxm_cmap_connect(rxm_ep, addr, handle);
-		if (ret)
-			return ret;
+	if (ep->srx_ctx) {
+		ret = fi_ep_bind(msg_ep, &ep->srx_ctx->fid, 0);
+		if (ret) {
+			RXM_WARN_ERR(FI_LOG_EP_CTRL, "fi_ep_bind", ret);
+			goto err;
+		}
 	}
 
-	if (!dlist_empty(&(*rxm_conn)->deferred_tx_queue)) {
-		rxm_ep_do_progress(&rxm_ep->util_ep);
-		if (!dlist_empty(&(*rxm_conn)->deferred_tx_queue))
-			return -FI_EAGAIN;
+	ret = fi_ep_bind(msg_ep, &ep->msg_cq->fid, FI_TRANSMIT | FI_RECV);
+	if (ret) {
+		RXM_WARN_ERR(FI_LOG_EP_CTRL, "fi_ep_bind", ret);
+		goto err;
 	}
-	return 0;
-}
-
-static inline int
-rxm_cmap_check_and_realloc_handles_table(struct rxm_cmap *cmap,
-					 fi_addr_t fi_addr)
-{
-	void *new_handles;
-	size_t grow_size;
 
-	if (OFI_LIKELY(fi_addr < cmap->num_allocated))
-		return 0;
+	ret = fi_enable(msg_ep);
+	if (ret) {
+		RXM_WARN_ERR(FI_LOG_EP_CTRL, "fi_enable", ret);
+		goto err;
+	}
 
-	grow_size = MAX(cmap->av->count, fi_addr - cmap->num_allocated + 1);
+	ret = domain->flow_ctrl_ops->enable(msg_ep);
+	if (!ret) {
+		domain->flow_ctrl_ops->set_threshold(msg_ep,
+					ep->msg_info->rx_attr->size / 2);
+	}
 
-	new_handles = realloc(cmap->handles_av,
-			      (grow_size + cmap->num_allocated) *
-			      sizeof(*cmap->handles_av));
-	if (OFI_LIKELY(!new_handles))
-		return -FI_ENOMEM;
+	if (!ep->srx_ctx) {
+		ret = rxm_prepost_recv(ep, msg_ep);
+		if (ret)
+			goto err;
+	}
 
-	cmap->handles_av = new_handles;
-	memset(&cmap->handles_av[cmap->num_allocated], 0,
-	       sizeof(*cmap->handles_av) * grow_size);
-	cmap->num_allocated += grow_size;
+	conn->msg_ep = msg_ep;
 	return 0;
+err:
+	fi_close(&msg_ep->fid);
+	return ret;
 }
 
-static struct rxm_pkt *
-rxm_conn_inject_pkt_alloc(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
-			  uint8_t op, uint64_t flags)
+/* We send the passive endpoint's port to the server, since the
+ * connection request itself arrives from a different port.
+ */
+static int rxm_init_connect_data(struct rxm_conn *conn,
+				 union rxm_cm_data *cm_data)
 {
-	struct rxm_pkt *inject_pkt;
-	int ret = ofi_memalign((void **) &inject_pkt, 16,
-			       rxm_ep->inject_limit + sizeof(*inject_pkt));
-
-	if (ret)
-		return NULL;
-
-	memset(inject_pkt, 0, rxm_ep->inject_limit + sizeof(*inject_pkt));
-	inject_pkt->ctrl_hdr.version = RXM_CTRL_VERSION;
-	inject_pkt->ctrl_hdr.type = rxm_ctrl_eager;
-	inject_pkt->hdr.version = OFI_OP_VERSION;
-	inject_pkt->hdr.op = op;
-	inject_pkt->hdr.flags = flags;
-
-	return inject_pkt;
-}
+	size_t cm_data_size = 0;
+	size_t opt_size = sizeof(cm_data_size);
+	int ret;
 
-static void rxm_conn_res_free(struct rxm_conn *rxm_conn)
-{
-	ofi_freealign(rxm_conn->inject_pkt);
-	rxm_conn->inject_pkt = NULL;
-	ofi_freealign(rxm_conn->inject_data_pkt);
-	rxm_conn->inject_data_pkt = NULL;
-	ofi_freealign(rxm_conn->tinject_pkt);
-	rxm_conn->tinject_pkt = NULL;
-	ofi_freealign(rxm_conn->tinject_data_pkt);
-	rxm_conn->tinject_data_pkt = NULL;
-}
+	memset(cm_data, 0, sizeof(*cm_data));
+	cm_data->connect.version = RXM_CM_DATA_VERSION;
+	cm_data->connect.ctrl_version = RXM_CTRL_VERSION;
+	cm_data->connect.op_version = RXM_OP_VERSION;
+	cm_data->connect.endianness = ofi_detect_endianness();
+	cm_data->connect.eager_limit = conn->ep->eager_limit;
+	cm_data->connect.rx_size = conn->ep->msg_info->rx_attr->size;
 
-static int rxm_conn_res_alloc(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn)
-{
-	dlist_init(&rxm_conn->deferred_conn_entry);
-	dlist_init(&rxm_conn->deferred_tx_queue);
-	dlist_init(&rxm_conn->sar_rx_msg_list);
-	dlist_init(&rxm_conn->sar_deferred_rx_msg_list);
-
-	if (rxm_ep->util_ep.domain->threading != FI_THREAD_SAFE) {
-		rxm_conn->inject_pkt =
-			rxm_conn_inject_pkt_alloc(rxm_ep, rxm_conn,
-						  ofi_op_msg, 0);
-		rxm_conn->inject_data_pkt =
-			rxm_conn_inject_pkt_alloc(rxm_ep, rxm_conn,
-						  ofi_op_msg, FI_REMOTE_CQ_DATA);
-		rxm_conn->tinject_pkt =
-			rxm_conn_inject_pkt_alloc(rxm_ep, rxm_conn,
-						  ofi_op_tagged, 0);
-		rxm_conn->tinject_data_pkt =
-			rxm_conn_inject_pkt_alloc(rxm_ep, rxm_conn,
-						  ofi_op_tagged, FI_REMOTE_CQ_DATA);
-
-		if (!rxm_conn->inject_pkt || !rxm_conn->inject_data_pkt ||
-		    !rxm_conn->tinject_pkt || !rxm_conn->tinject_data_pkt) {
-			rxm_conn_res_free(rxm_conn);
-			FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "unable to allocate "
-				"inject pkt for connection\n");
-			return -FI_ENOMEM;
-		}
+	ret = fi_getopt(&conn->ep->msg_pep->fid, FI_OPT_ENDPOINT,
+			FI_OPT_CM_DATA_SIZE, &cm_data_size, &opt_size);
+	if (ret) {
+		RXM_WARN_ERR(FI_LOG_EP_CTRL, "fi_getopt", ret);
+		return ret;
 	}
-	return 0;
-}
-
-static void rxm_conn_close(struct rxm_cmap_handle *handle)
-{
-	struct rxm_conn *rxm_conn = container_of(handle, struct rxm_conn, handle);
-	struct rxm_conn *rxm_conn_tmp;
-	struct rxm_deferred_tx_entry *def_tx_entry;
-	struct dlist_entry *conn_entry_tmp;
-
-	dlist_foreach_container_safe(&handle->cmap->ep->deferred_tx_conn_queue,
-				     struct rxm_conn, rxm_conn_tmp,
-				     deferred_conn_entry, conn_entry_tmp)
-	{
-		if (rxm_conn_tmp->handle.key != handle->key)
-			continue;
 
-		while (!dlist_empty(&rxm_conn_tmp->deferred_tx_queue)) {
-			def_tx_entry =
-				container_of(rxm_conn_tmp->deferred_tx_queue.next,
-					     struct rxm_deferred_tx_entry, entry);
-			FI_DBG(&rxm_prov, FI_LOG_EP_CTRL,
-			       "cancelled deferred message\n");
-			rxm_ep_dequeue_deferred_tx_queue(def_tx_entry);
-			free(def_tx_entry);
-		}
+	if (cm_data_size < sizeof(*cm_data)) {
+		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "cm data too small\n");
+		return -FI_EOTHER;
 	}
 
-	FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "closing msg ep\n");
-	if (!rxm_conn->msg_ep)
-		return;
-
-	if (fi_close(&rxm_conn->msg_ep->fid))
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "unable to close msg_ep\n");
-
-	rxm_conn->msg_ep = NULL;
+	cm_data->connect.port = ofi_addr_get_port(&conn->ep->addr.sa);
+	cm_data->connect.client_conn_id = conn->peer->index;
+	return 0;
 }
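
rxm_init_connect_data() above front-loads compatibility data into the
CM connect payload: protocol versions, endianness, the eager limit,
plus the passive endpoint's port and the client's connection id; the
receiving side (rxm_verify_connreq(), further down) rejects any
mismatch. A self-contained sketch of that exchange-and-check idea,
using an invented struct rather than the real union rxm_cm_data:

#include <stdint.h>
#include <string.h>

struct hello {
	uint8_t version;
	uint8_t endianness;	/* 0 = little endian, 1 = big endian */
	uint64_t eager_limit;
};

static uint8_t detect_endianness(void)
{
	uint16_t probe = 1;
	uint8_t first;

	memcpy(&first, &probe, 1);
	return first ? 0 : 1;	/* low byte first means little endian */
}

static int hello_compatible(const struct hello *peer,
			    const struct hello *mine)
{
	return peer->version == mine->version &&
	       peer->endianness == mine->endianness &&
	       peer->eager_limit == mine->eager_limit;
}

int main(void)
{
	struct hello mine = { 1, detect_endianness(), 16384 };
	struct hello peer = mine;	/* pretend the wire echoed it back */

	return hello_compatible(&peer, &mine) ? 0 : 1;
}
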
 
-static void rxm_conn_free(struct rxm_cmap_handle *handle)
+static int rxm_send_connect(struct rxm_conn *conn)
 {
-	struct rxm_conn *rxm_conn = container_of(handle, struct rxm_conn, handle);
+	union rxm_cm_data cm_data;
+	struct fi_info *info;
+	int ret;
 
-	rxm_conn_close(handle);
-	rxm_conn_res_free(rxm_conn);
-	free(rxm_conn);
-}
+	FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "connecting %p\n", conn);
+	assert(ofi_ep_lock_held(&conn->ep->util_ep));
 
-int rxm_cmap_alloc_handle(struct rxm_cmap *cmap, fi_addr_t fi_addr,
-			  enum rxm_cmap_state state,
-			  struct rxm_cmap_handle **handle)
-{
-	int ret;
+	info = conn->ep->msg_info;
+	info->dest_addrlen = conn->ep->msg_info->src_addrlen;
 
-	*handle = rxm_conn_alloc(cmap);
-	if (!*handle)
+	free(info->dest_addr);
+	info->dest_addr = mem_dup(&conn->peer->addr, info->dest_addrlen);
+	if (!info->dest_addr)
 		return -FI_ENOMEM;
 
-	FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL,
-	       "Allocated handle: %p for fi_addr: %" PRIu64 "\n",
-	       *handle, fi_addr);
+	ret = rxm_open_conn(conn, info);
+	if (ret)
+		return ret;
+
+	ret = rxm_init_connect_data(conn, &cm_data);
+	if (ret)
+		goto err;
 
-	ret = rxm_cmap_check_and_realloc_handles_table(cmap, fi_addr);
+	ret = fi_connect(conn->msg_ep, info->dest_addr, &cm_data,
+			 sizeof(cm_data));
 	if (ret) {
-		rxm_conn_free(*handle);
-		return ret;
+		RXM_WARN_ERR(FI_LOG_EP_CTRL, "fi_connect", ret);
+		goto err;
 	}
-
-	rxm_cmap_init_handle(*handle, cmap, state, fi_addr, NULL);
-	cmap->handles_av[fi_addr] = *handle;
+	conn->state = RXM_CM_CONNECTING;
+	conn->ep->connecting_cnt++;
 	return 0;
+
+err:
+	fi_close(&conn->msg_ep->fid);
+	conn->msg_ep = NULL;
+	return ret;
 }
 
-static int rxm_cmap_alloc_handle_peer(struct rxm_cmap *cmap, void *addr,
-				       enum rxm_cmap_state state,
-				       struct rxm_cmap_handle **handle)
+static int rxm_connect(struct rxm_conn *conn)
 {
-	struct rxm_cmap_peer *peer;
+	int ret;
 
-	peer = calloc(1, sizeof(*peer) + cmap->av->addrlen);
-	if (!peer)
-		return -FI_ENOMEM;
+	assert(ofi_ep_lock_held(&conn->ep->util_ep));
 
-	*handle = rxm_conn_alloc(cmap);
-	if (!*handle) {
-		free(peer);
-		return -FI_ENOMEM;
+	switch (conn->state) {
+	case RXM_CM_IDLE:
+		ret = rxm_send_connect(conn);
+		if (ret)
+			return ret;
+		break;
+	case RXM_CM_CONNECTING:
+	case RXM_CM_ACCEPTING:
+		break;
+	case RXM_CM_CONNECTED:
+		return 0;
+	default:
+		assert(0);
+		conn->state = RXM_CM_IDLE;
+		break;
 	}
 
-	ofi_straddr_dbg(cmap->av->prov, FI_LOG_AV,
-			"Allocated handle for addr", addr);
-	FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL, "handle: %p\n", *handle);
-
-	rxm_cmap_init_handle(*handle, cmap, state, FI_ADDR_NOTAVAIL, peer);
-	FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL, "Adding handle to peer list\n");
-	peer->handle = *handle;
-	memcpy(peer->addr, addr, cmap->av->addrlen);
-	dlist_insert_tail(&peer->entry, &cmap->peer_list);
-	return 0;
-}
-
-static struct rxm_cmap_handle *
-rxm_cmap_get_handle_peer(struct rxm_cmap *cmap, const void *addr)
-{
-	struct rxm_cmap_peer *peer;
-	struct dlist_entry *entry;
-
-	entry = dlist_find_first_match(&cmap->peer_list, rxm_cmap_match_peer,
-				       addr);
-	if (!entry)
-		return NULL;
-
-	ofi_straddr_dbg(cmap->av->prov, FI_LOG_AV,
-			"handle found in peer list for addr", addr);
-	peer = container_of(entry, struct rxm_cmap_peer, entry);
-	return peer->handle;
+	return -FI_EAGAIN;
 }
 
-int rxm_cmap_remove(struct rxm_cmap *cmap, int index)
+static void rxm_free_conn(struct rxm_conn *conn)
 {
-	struct rxm_cmap_handle *handle;
-	int ret = -FI_ENOENT;
+	FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "free conn %p\n", conn);
+	assert(ofi_ep_lock_held(&conn->ep->util_ep));
 
-	handle = cmap->handles_av[index];
-	if (!handle) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "cmap entry not found\n");
-		return ret;
-	}
+	if (conn->flags & RXM_CONN_INDEXED)
+		ofi_idm_clear(&conn->ep->conn_idx_map, conn->peer->index);
 
-	handle->peer = calloc(1, sizeof(*handle->peer) + cmap->av->addrlen);
-	if (!handle->peer) {
-		ret = -FI_ENOMEM;
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "unable to allocate memory "
-			"for moving handle to peer list, deleting it instead\n");
-		rxm_cmap_del_handle(handle);
-		return ret;
-	}
-	handle->fi_addr = FI_ADDR_NOTAVAIL;
-	cmap->handles_av[index] = NULL;
-	handle->peer->handle = handle;
-	memcpy(handle->peer->addr, ofi_av_get_addr(cmap->av, index),
-	       cmap->av->addrlen);
-	dlist_insert_tail(&handle->peer->entry, &cmap->peer_list);
-	return 0;
+	rxm_put_peer(conn->peer);
+	rxm_av_free_conn(conn);
 }
 
-static int rxm_cmap_move_handle(struct rxm_cmap_handle *handle,
-				fi_addr_t fi_addr)
+void rxm_freeall_conns(struct rxm_ep *ep)
 {
-	int ret;
+	struct rxm_conn *conn;
+	struct dlist_entry *tmp;
+	struct rxm_av *av;
+	int i, cnt;
 
-	dlist_remove(&handle->peer->entry);
-	free(handle->peer);
-	handle->peer = NULL;
-	handle->fi_addr = fi_addr;
-	ret = rxm_cmap_check_and_realloc_handles_table(handle->cmap, fi_addr);
-	if (OFI_UNLIKELY(ret))
-		return ret;
-	handle->cmap->handles_av[fi_addr] = handle;
-	return 0;
-}
+	av = container_of(ep->util_ep.av, struct rxm_av, util_av);
+	ofi_ep_lock_acquire(&ep->util_ep);
 
-int rxm_cmap_update(struct rxm_cmap *cmap, const void *addr, fi_addr_t fi_addr)
-{
-	struct rxm_cmap_handle *handle;
-	int ret;
+	/* We can't have more connections than the current number of
+	 * possible peers.
+	 */
+	cnt = (int) rxm_av_max_peers(av);
+	for (i = 0; i < cnt; i++) {
+		conn = ofi_idm_lookup(&ep->conn_idx_map, i);
+		if (!conn)
+			continue;
 
-	/* Check whether we have already allocated a handle for this `fi_addr`. */
-	/* We rely on the fact that `ofi_ip_av_insert`/`ofi_av_insert_addr` returns
-	 * the same `fi_addr` for the equal addresses */
-	if (fi_addr < cmap->num_allocated) {
-		handle = rxm_cmap_acquire_handle(cmap, fi_addr);
-		if (handle)
-			return 0;
+		if (conn->state != RXM_CM_IDLE)
+			rxm_close_conn(conn);
+		rxm_free_conn(conn);
 	}
 
-	handle = rxm_cmap_get_handle_peer(cmap, addr);
-	if (!handle) {
-		ret = rxm_cmap_alloc_handle(cmap, fi_addr,
-					    RXM_CMAP_IDLE, &handle);
-		return ret;
+	dlist_foreach_container_safe(&ep->loopback_list, struct rxm_conn,
+				     conn, loopback_entry, tmp) {
+		rxm_close_conn(conn);
+		rxm_free_conn(conn);
 	}
-	ret = rxm_cmap_move_handle(handle, fi_addr);
-	if (ret)
-		return ret;
 
-	rxm_conn_av_updated_handler(handle);
-	return 0;
+	ofi_ep_lock_release(&ep->util_ep);
 }
 
-void rxm_cmap_process_shutdown(struct rxm_cmap *cmap,
-			       struct rxm_cmap_handle *handle)
+static struct rxm_conn *
+rxm_alloc_conn(struct rxm_ep *ep, struct rxm_peer_addr *peer)
 {
-	FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL,
-		"Processing shutdown for handle: %p\n", handle);
-	if (handle->state > RXM_CMAP_SHUTDOWN) {
-		FI_WARN(cmap->av->prov, FI_LOG_EP_CTRL,
-			"Invalid handle on shutdown event\n");
-	} else if (handle->state != RXM_CMAP_SHUTDOWN) {
-		FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL, "Got remote shutdown\n");
-		rxm_cmap_del_handle(handle);
-	} else {
-		FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL, "Got local shutdown\n");
-	}
-}
+	struct rxm_conn *conn;
+	struct rxm_av *av;
 
-void rxm_cmap_process_connect(struct rxm_cmap *cmap,
-			      struct rxm_cmap_handle *handle,
-			      union rxm_cm_data *cm_data)
-{
-	struct rxm_conn *rxm_conn = container_of(handle, struct rxm_conn, handle);
-
-	FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL,
-	       "processing FI_CONNECTED event for handle: %p\n", handle);
-	if (cm_data) {
-		assert(handle->state == RXM_CMAP_CONNREQ_SENT);
-		handle->remote_key = cm_data->accept.server_conn_id;
-		rxm_conn->rndv_tx_credits = cm_data->accept.rx_size;
-	} else {
-		assert(handle->state == RXM_CMAP_CONNREQ_RECV);
-	}
-	RXM_CM_UPDATE_STATE(handle, RXM_CMAP_CONNECTED);
-
-	/* Set the remote key to the inject packets */
-	if (cmap->ep->util_ep.domain->threading != FI_THREAD_SAFE) {
-		rxm_conn->inject_pkt->ctrl_hdr.conn_id = rxm_conn->handle.remote_key;
-		rxm_conn->inject_data_pkt->ctrl_hdr.conn_id = rxm_conn->handle.remote_key;
-		rxm_conn->tinject_pkt->ctrl_hdr.conn_id = rxm_conn->handle.remote_key;
-		rxm_conn->tinject_data_pkt->ctrl_hdr.conn_id = rxm_conn->handle.remote_key;
+	assert(ofi_ep_lock_held(&ep->util_ep));
+	av = container_of(ep->util_ep.av, struct rxm_av, util_av);
+	conn = rxm_av_alloc_conn(av);
+	if (!conn) {
+		RXM_WARN_ERR(FI_LOG_EP_CTRL, "rxm_av_alloc_conn", -FI_ENOMEM);
+		return NULL;
 	}
-}
 
-void rxm_cmap_process_reject(struct rxm_cmap *cmap,
-			     struct rxm_cmap_handle *handle,
-			     enum rxm_cmap_reject_reason reject_reason)
-{
-	FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL,
-		"Processing reject for handle: %p\n", handle);
-	switch (handle->state) {
-	case RXM_CMAP_CONNREQ_RECV:
-	case RXM_CMAP_CONNECTED:
-		/* Handle is being re-used for incoming connection request */
-		break;
-	case RXM_CMAP_CONNREQ_SENT:
-		if (reject_reason == RXM_CMAP_REJECT_GENUINE) {
-			FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL,
-			       "Deleting connection handle\n");
-			rxm_cmap_del_handle(handle);
-		} else {
-			FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL,
-			       "Connection handle is being re-used. Close the connection\n");
-			rxm_conn_close(handle);
-		}
-		break;
-	case RXM_CMAP_SHUTDOWN:
-		FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL,
-			"Connection handle already being deleted\n");
-		break;
-	default:
-		FI_WARN(cmap->av->prov, FI_LOG_EP_CTRL, "Invalid cmap state: "
-			"%d when receiving connection reject\n", handle->state);
-		assert(0);
-	}
+	conn->ep = ep;
+	conn->state = RXM_CM_IDLE;
+	conn->remote_index = -1;
+	conn->flags = 0;
+	dlist_init(&conn->deferred_entry);
+	dlist_init(&conn->deferred_tx_queue);
+	dlist_init(&conn->deferred_sar_msgs);
+	dlist_init(&conn->deferred_sar_segments);
+	dlist_init(&conn->loopback_entry);
+
+	conn->peer = peer;
+	rxm_ref_peer(peer);
+
+	FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "allocated conn %p\n", conn);
+	return conn;
 }
 
-int rxm_cmap_process_connreq(struct rxm_cmap *cmap, void *addr,
-			     struct rxm_cmap_handle **handle_ret,
-			     uint8_t *reject_reason)
+static struct rxm_conn *
+rxm_add_conn(struct rxm_ep *ep, struct rxm_peer_addr *peer)
 {
-	struct rxm_cmap_handle *handle;
-	int ret = 0, cmp;
-	fi_addr_t fi_addr = ofi_ip_av_get_fi_addr(cmap->av, addr);
-
-	ofi_straddr_dbg(cmap->av->prov, FI_LOG_EP_CTRL,
-			"Processing connreq from remote pep", addr);
-
-	if (fi_addr == FI_ADDR_NOTAVAIL)
-		handle = rxm_cmap_get_handle_peer(cmap, addr);
-	else
-		handle = rxm_cmap_acquire_handle(cmap, fi_addr);
-
-	if (!handle) {
-		if (fi_addr == FI_ADDR_NOTAVAIL)
-			ret = rxm_cmap_alloc_handle_peer(cmap, addr,
-							 RXM_CMAP_CONNREQ_RECV,
-							 &handle);
-		else
-			ret = rxm_cmap_alloc_handle(cmap, fi_addr,
-						    RXM_CMAP_CONNREQ_RECV,
-						    &handle);
-		if (ret)
-			goto unlock;
-	}
+	struct rxm_conn *conn;
 
-	switch (handle->state) {
-	case RXM_CMAP_CONNECTED:
-		FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL,
-			"Connection already present.\n");
-		ret = -FI_EALREADY;
-		break;
-	case RXM_CMAP_CONNREQ_SENT:
-		ofi_straddr_dbg(cmap->av->prov, FI_LOG_EP_CTRL, "local_name",
-				cmap->attr.name);
-		ofi_straddr_dbg(cmap->av->prov, FI_LOG_EP_CTRL, "remote_name",
-				addr);
+	assert(ofi_ep_lock_held(&ep->util_ep));
+	conn = ofi_idm_lookup(&ep->conn_idx_map, peer->index);
+	if (conn)
+		return conn;
 
-		cmp = ofi_addr_cmp(cmap->av->prov, addr, cmap->attr.name);
+	conn = rxm_alloc_conn(ep, peer);
+	if (!conn)
+		return NULL;
 
-		if (cmp < 0) {
-			FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL,
-				"Remote name lower than local name.\n");
-			*reject_reason = RXM_CMAP_REJECT_SIMULT_CONN;
-			ret = -FI_EALREADY;
-			break;
-		} else if (cmp > 0) {
-			FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL,
-				"Re-using handle: %p to accept remote "
-				"connection\n", handle);
-			*reject_reason = RXM_CMAP_REJECT_GENUINE;
-			rxm_conn_close(handle);
-		} else {
-			FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL,
-				"Endpoint connects to itself\n");
-			ret = rxm_cmap_alloc_handle_peer(cmap, addr,
-							  RXM_CMAP_CONNREQ_RECV,
-							  &handle);
-			if (ret)
-				goto unlock;
-
-			assert(fi_addr != FI_ADDR_NOTAVAIL);
-			handle->fi_addr = fi_addr;
-		}
-		/* Fall through */
-	case RXM_CMAP_IDLE:
-		RXM_CM_UPDATE_STATE(handle, RXM_CMAP_CONNREQ_RECV);
-		/* Fall through */
-	case RXM_CMAP_CONNREQ_RECV:
-		*handle_ret = handle;
-		break;
-	case RXM_CMAP_SHUTDOWN:
-		FI_WARN(cmap->av->prov, FI_LOG_EP_CTRL, "handle :%p marked for "
-			"deletion / shutdown, reject connection\n", handle);
-		*reject_reason = RXM_CMAP_REJECT_GENUINE;
-		ret = -FI_EOPBADSTATE;
-		break;
-	default:
-		FI_WARN(cmap->av->prov, FI_LOG_EP_CTRL,
-		       "invalid handle state: %d\n", handle->state);
-		assert(0);
-		ret = -FI_EOPBADSTATE;
+	if (ofi_idm_set(&ep->conn_idx_map, peer->index, conn) < 0) {
+		rxm_free_conn(conn);
+		RXM_WARN_ERR(FI_LOG_EP_CTRL, "ofi_idm_set", -FI_ENOMEM);
+		return NULL;
 	}
-unlock:
-	return ret;
+
+	conn->flags |= RXM_CONN_INDEXED;
+	return conn;
 }
 
-int rxm_msg_eq_progress(struct rxm_ep *rxm_ep)
+/* The returned conn is only valid if the function returns success. */
+ssize_t rxm_get_conn(struct rxm_ep *ep, fi_addr_t addr, struct rxm_conn **conn)
 {
-	struct rxm_msg_eq_entry *entry;
-	int ret;
+	struct rxm_peer_addr **peer;
+	ssize_t ret;
 
-	entry = alloca(RXM_MSG_EQ_ENTRY_SZ);
-	if (!entry) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
-			"unable to allocate memory!\n");
+	assert(ofi_ep_lock_held(&ep->util_ep));
+	peer = ofi_av_addr_context(ep->util_ep.av, addr);
+	*conn = rxm_add_conn(ep, *peer);
+	if (!*conn)
 		return -FI_ENOMEM;
-	}
 
-	while (1) {
-		entry->rd = rxm_eq_read(rxm_ep, RXM_MSG_EQ_ENTRY_SZ, entry);
-		if (entry->rd < 0 && entry->rd != -FI_ECONNREFUSED) {
-			ret = (int) entry->rd;
-			break;
-		}
-		ret = rxm_conn_handle_event(rxm_ep, entry);
-		if (ret) {
-			FI_DBG(&rxm_prov, FI_LOG_EP_CTRL,
-			       "invalid connection handle event: %d\n", ret);
-			break;
+	if ((*conn)->state == RXM_CM_CONNECTED) {
+		if (!dlist_empty(&(*conn)->deferred_tx_queue)) {
+			rxm_ep_do_progress(&ep->util_ep);
+			if (!dlist_empty(&(*conn)->deferred_tx_queue))
+				return -FI_EAGAIN;
 		}
+		return 0;
 	}
-	return ret;
-}
 
-int rxm_cmap_connect(struct rxm_ep *rxm_ep, fi_addr_t fi_addr,
-		     struct rxm_cmap_handle *handle)
-{
-	int ret = FI_SUCCESS;
-
-	switch (handle->state) {
-	case RXM_CMAP_IDLE:
-		FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "initiating MSG_EP connect "
-		       "for fi_addr: %" PRIu64 "\n", fi_addr);
-		ret = rxm_conn_connect(rxm_ep, handle,
-				       ofi_av_get_addr(rxm_ep->cmap->av, fi_addr));
-		if (ret) {
-			if (ret == -FI_ECONNREFUSED)
-				return -FI_EAGAIN;
+	ret = rxm_connect(*conn);
 
-			rxm_cmap_del_handle(handle);
-		} else {
-			RXM_CM_UPDATE_STATE(handle, RXM_CMAP_CONNREQ_SENT);
-			ret = -FI_EAGAIN;
-		}
-		break;
-	case RXM_CMAP_CONNREQ_SENT:
-	case RXM_CMAP_CONNREQ_RECV:
-	case RXM_CMAP_SHUTDOWN:
-		ret = -FI_EAGAIN;
-		break;
-	default:
-		FI_WARN(rxm_ep->cmap->av->prov, FI_LOG_EP_CTRL,
-			"Invalid cmap handle state\n");
-		assert(0);
-		ret = -FI_EOPBADSTATE;
-	}
+	/* If the progress function encounters an error trying to establish
+	 * the connection, it may free the connection object.  This resets
+	 * the connection process to restart from the beginning.
+	 */
 	if (ret == -FI_EAGAIN)
-		rxm_msg_eq_progress(rxm_ep);
-
+		rxm_conn_progress(ep);
 	return ret;
 }
 
-static int rxm_cmap_cm_thread_close(struct rxm_cmap *cmap)
+void rxm_process_connect(struct rxm_eq_cm_entry *cm_entry)
 {
-	int ret;
+	struct rxm_conn *conn;
 
-	FI_INFO(&rxm_prov, FI_LOG_EP_CTRL, "stopping CM thread\n");
-	if (!cmap->cm_thread)
-		return 0;
+	conn = cm_entry->fid->context;
+	FI_DBG(&rxm_prov, FI_LOG_EP_CTRL,
+	       "processing connected for handle: %p\n", conn);
 
-	cmap->ep->do_progress = false;
-	ret = rxm_conn_signal(cmap->ep, NULL, RXM_CMAP_EXIT);
-	if (ret) {
-		FI_WARN(cmap->av->prov, FI_LOG_EP_CTRL,
-			"Unable to signal CM thread\n");
-		return ret;
-	}
-	ret = pthread_join(cmap->cm_thread, NULL);
-	if (ret) {
-		FI_WARN(cmap->av->prov, FI_LOG_EP_CTRL,
-			"Unable to join CM thread\n");
-		return ret;
-	}
-	return 0;
+	assert(ofi_ep_lock_held(&conn->ep->util_ep));
+	if (conn->state == RXM_CM_CONNECTING)
+		conn->remote_index = cm_entry->data.accept.server_conn_id;
+
+	conn->ep->connecting_cnt--;
+	assert(conn->ep->connecting_cnt >= 0);
+	conn->state = RXM_CM_CONNECTED;
 }
 
-void rxm_cmap_free(struct rxm_cmap *cmap)
+/* For simultaneous connection requests, if the peer won the coin
+ * flip (reject EALREADY), our connection request is discarded.
+ */
+static void
+rxm_process_reject(struct rxm_conn *conn, struct fi_eq_err_entry *entry)
 {
-	struct rxm_cmap_peer *peer;
-	struct dlist_entry *entry;
-	size_t i;
+	union rxm_cm_data *cm_data;
+	uint8_t reason;
 
-	FI_INFO(cmap->av->prov, FI_LOG_EP_CTRL, "Closing cmap\n");
-	rxm_cmap_cm_thread_close(cmap);
+	FI_INFO(&rxm_prov, FI_LOG_EP_CTRL,
+	       "Processing reject for handle: %p\n", conn);
+	assert(ofi_ep_lock_held(&conn->ep->util_ep));
 
-	for (i = 0; i < cmap->num_allocated; i++) {
-		if (cmap->handles_av[i]) {
-			rxm_cmap_clear_key(cmap->handles_av[i]);
-			rxm_conn_free(cmap->handles_av[i]);
+	if (entry->err_data_size >= sizeof(cm_data->reject)) {
+		cm_data = entry->err_data;
+		if (cm_data->reject.version != RXM_CM_DATA_VERSION) {
+			FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "invalid reject version\n");
+			reason = RXM_REJECT_ECONNREFUSED;
+		} else {
+			reason = cm_data->reject.reason;
 		}
+	} else {
+		reason = RXM_REJECT_ECONNREFUSED;
 	}
 
-	while (!dlist_empty(&cmap->peer_list)) {
-		entry = cmap->peer_list.next;
-		peer = container_of(entry, struct rxm_cmap_peer, entry);
-		dlist_remove(&peer->entry);
-		rxm_cmap_clear_key(peer->handle);
-		rxm_conn_free(peer->handle);
-		free(peer);
+	switch (conn->state) {
+	case RXM_CM_IDLE:
+		/* Unlikely, but can occur if our request was rejected, and
+		 * there was a failure trying to accept the peer's.
+		 */
+		break;
+	case RXM_CM_CONNECTING:
+		rxm_close_conn(conn);
+		if (reason != RXM_REJECT_EALREADY)
+			rxm_free_conn(conn);
+		break;
+	case RXM_CM_ACCEPTING:
+	case RXM_CM_CONNECTED:
+		/* Our request was rejected, but we accepted the peer's. */
+		break;
+	default:
+		assert(0);
+		break;
 	}
-
-	free(cmap->handles_av);
-	free(cmap->attr.name);
-	ofi_idx_reset(&cmap->handles_idx);
-	free(cmap);
 }
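
On simultaneous connects both sides hold an outstanding request to the
other; rxm_process_reject() above handles the losing side, whose
request comes back with RXM_REJECT_EALREADY while its conn object
survives for the accepted direction. Both peers must agree on the
winner, so the tie-break has to be a deterministic function of data
both can see. A toy model of such a tie-break (the rule rxm actually
applies lives in rxm_process_connreq(), beyond this excerpt):

#include <stdio.h>
#include <string.h>

/* Both nodes evaluate the same predicate with 'mine' and 'peer'
 * swapped, so exactly one of them yields. */
static int i_should_yield(const char *my_addr, const char *peer_addr)
{
	return strcmp(my_addr, peer_addr) < 0;
}

int main(void)
{
	const char *a = "10.0.0.1:7000", *b = "10.0.0.2:7000";

	printf("node A yields: %d\n", i_should_yield(a, b));	/* 1 */
	printf("node B yields: %d\n", i_should_yield(b, a));	/* 0 */
	return 0;
}
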
 
 static int
-rxm_cmap_update_addr(struct util_av *av, void *addr,
-		     fi_addr_t fi_addr, void *arg)
-{
-	return rxm_cmap_update((struct rxm_cmap *)arg, addr, fi_addr);
-}
-
-int rxm_cmap_bind_to_av(struct rxm_cmap *cmap, struct util_av *av)
+rxm_verify_connreq(struct rxm_ep *ep, union rxm_cm_data *cm_data)
 {
-	cmap->av = av;
-	return ofi_av_elements_iter(av, rxm_cmap_update_addr, (void *)cmap);
-}
-
-int rxm_cmap_alloc(struct rxm_ep *rxm_ep, struct rxm_cmap_attr *attr)
-{
-	struct rxm_cmap *cmap;
-	struct util_ep *ep = &rxm_ep->util_ep;
-	int ret;
-
-	cmap = calloc(1, sizeof *cmap);
-	if (!cmap)
-		return -FI_ENOMEM;
-
-	cmap->ep = rxm_ep;
-	cmap->av = ep->av;
-
-	cmap->handles_av = calloc(cmap->av->count, sizeof(*cmap->handles_av));
-	if (!cmap->handles_av) {
-		ret = -FI_ENOMEM;
-		goto err1;
+	if (cm_data->connect.version != RXM_CM_DATA_VERSION) {
+		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "cm version mismatch\n");
+		return -FI_EINVAL;
 	}
-	cmap->num_allocated = ep->av->count;
 
-	cmap->attr = *attr;
-	cmap->attr.name = mem_dup(attr->name, ep->av->addrlen);
-	if (!cmap->attr.name) {
-		ret = -FI_ENOMEM;
-		goto err2;
+	if (cm_data->connect.endianness != ofi_detect_endianness()) {
+		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "endianness mismatch\n");
+		return -FI_EINVAL;
 	}
 
-	memset(&cmap->handles_idx, 0, sizeof(cmap->handles_idx));
-	ofi_key_idx_init(&cmap->key_idx, RXM_CMAP_IDX_BITS);
-
-	dlist_init(&cmap->peer_list);
-
-	rxm_ep->cmap = cmap;
-
-	if (ep->domain->data_progress == FI_PROGRESS_AUTO || force_auto_progress) {
+	if (cm_data->connect.ctrl_version != RXM_CTRL_VERSION) {
+		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "cm ctrl_version mismatch\n");
+		return -FI_EINVAL;
+	}
 
-		assert(ep->domain->threading == FI_THREAD_SAFE);
-		rxm_ep->do_progress = true;
-		if (pthread_create(&cmap->cm_thread, 0,
-				   rxm_ep->rxm_info->caps & FI_ATOMIC ?
-				   rxm_conn_atomic_progress :
-				   rxm_conn_progress, ep)) {
-			FI_WARN(ep->av->prov, FI_LOG_EP_CTRL,
-				"unable to create cmap thread\n");
-			ret = -ofi_syserr();
-			goto err3;
-		}
+	if (cm_data->connect.op_version != RXM_OP_VERSION) {
+		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "cm op_version mismatch\n");
+		return -FI_EINVAL;
 	}
 
-	assert(ep->av);
-	ret = rxm_cmap_bind_to_av(cmap, ep->av);
-	if (ret)
-		goto err4;
+	if (cm_data->connect.eager_limit != ep->eager_limit) {
+		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "eager_limit mismatch\n");
+		return -FI_EINVAL;
+	}
 
 	return FI_SUCCESS;
-err4:
-	rxm_cmap_cm_thread_close(cmap);
-err3:
-	rxm_ep->cmap = NULL;
-	free(cmap->attr.name);
-err2:
-	free(cmap->handles_av);
-err1:
-	free(cmap);
-	return ret;
 }
 
-static int rxm_msg_ep_open(struct rxm_ep *rxm_ep, struct fi_info *msg_info,
-			   struct rxm_conn *rxm_conn, void *context)
+static void
+rxm_reject_connreq(struct rxm_ep *ep, struct rxm_eq_cm_entry *cm_entry,
+		   uint8_t reason)
 {
-	struct rxm_domain *rxm_domain;
-	struct fid_ep *msg_ep;
+	union rxm_cm_data cm_data;
 	int ret;
 
-	rxm_domain = container_of(rxm_ep->util_ep.domain, struct rxm_domain,
-			util_domain);
-
-	ret = fi_endpoint(rxm_domain->msg_domain, msg_info, &msg_ep, context);
-	if (ret) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
-			"unable to create msg_ep: %d\n", ret);
-		return ret;
-	}
+	cm_data.reject.version = RXM_CM_DATA_VERSION;
+	cm_data.reject.reason = reason;
 
-	ret = fi_ep_bind(msg_ep, &rxm_ep->msg_eq->fid, 0);
-	if (ret) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
-			"unable to bind msg EP to EQ: %d\n", ret);
-		goto err;
-	}
-
-	if (rxm_ep->srx_ctx) {
-		ret = fi_ep_bind(msg_ep, &rxm_ep->srx_ctx->fid, 0);
-		if (ret) {
-			FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "unable to bind msg "
-				"EP to shared RX ctx: %d\n", ret);
-			goto err;
-		}
-	}
-
-	// TODO add other completion flags
-	ret = fi_ep_bind(msg_ep, &rxm_ep->msg_cq->fid, FI_TRANSMIT | FI_RECV);
-	if (ret) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
-				"unable to bind msg_ep to msg_cq: %d\n", ret);
-		goto err;
-	}
-
-	ret = fi_enable(msg_ep);
-	if (ret) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
-			"unable to enable msg_ep: %d\n", ret);
-		goto err;
-	}
-
-	ret = rxm_domain->flow_ctrl_ops->enable(msg_ep);
-	if (!ret) {
-		rxm_domain->flow_ctrl_ops->set_threshold(
-			msg_ep, rxm_ep->msg_info->rx_attr->size / 2);
-	}
-
-	rxm_conn->msg_ep = msg_ep;
-
-	if (!rxm_ep->srx_ctx) {
-		ret = rxm_msg_ep_prepost_recv(rxm_ep, msg_ep);
-		if (ret)
-			goto err;
-	}
-	return 0;
-err:
-	fi_close(&msg_ep->fid);
-	return ret;
+	ret = fi_reject(ep->msg_pep, cm_entry->info->handle,
+			&cm_data.reject, sizeof(cm_data.reject));
+	if (ret)
+		RXM_WARN_ERR(FI_LOG_EP_CTRL, "fi_reject", ret);
 }
 
-static int rxm_conn_reprocess_directed_recvs(struct rxm_recv_queue *recv_queue)
+static int
+rxm_accept_connreq(struct rxm_conn *conn, struct rxm_eq_cm_entry *cm_entry)
 {
-	struct rxm_rx_buf *rx_buf;
-	struct dlist_entry *entry, *tmp_entry;
-	struct rxm_recv_match_attr match_attr;
-	struct fi_cq_err_entry err_entry = {0};
-	int ret, count = 0;
-
-	dlist_foreach_container_safe(&recv_queue->unexp_msg_list,
-				     struct rxm_rx_buf, rx_buf,
-				     unexp_msg.entry, tmp_entry) {
-		if (rx_buf->unexp_msg.addr == rx_buf->conn->handle.fi_addr)
-			continue;
-
-		assert(rx_buf->unexp_msg.addr == FI_ADDR_NOTAVAIL);
-
-		rx_buf->unexp_msg.addr = rx_buf->conn->handle.fi_addr;
-		match_attr.addr = rx_buf->unexp_msg.addr;
-		match_attr.tag = rx_buf->unexp_msg.tag;
-
-		entry = dlist_remove_first_match(&recv_queue->recv_list,
-						 recv_queue->match_recv,
-						 &match_attr);
-		if (!entry)
-			continue;
+	union rxm_cm_data cm_data;
+	int ret;
 
-		dlist_remove(&rx_buf->unexp_msg.entry);
-		rx_buf->recv_entry = container_of(entry, struct rxm_recv_entry,
-						  entry);
+	cm_data.accept.server_conn_id = conn->peer->index;
+	cm_data.accept.rx_size = cm_entry->info->rx_attr->size;
 
-		ret = rxm_handle_rx_buf(rx_buf);
-		if (ret) {
-			err_entry.op_context = rx_buf;
-			err_entry.flags = rx_buf->recv_entry->comp_flags;
-			err_entry.len = rx_buf->pkt.hdr.size;
-			err_entry.data = rx_buf->pkt.hdr.data;
-			err_entry.tag = rx_buf->pkt.hdr.tag;
-			err_entry.err = ret;
-			err_entry.prov_errno = ret;
-			ofi_cq_write_error(recv_queue->rxm_ep->util_ep.rx_cq,
-					   &err_entry);
-			if (rx_buf->ep->util_ep.flags & OFI_CNTR_ENABLED)
-				rxm_cntr_incerr(rx_buf->ep->util_ep.rx_cntr);
-
-			rxm_rx_buf_free(rx_buf);
-
-			if (!(rx_buf->recv_entry->flags & FI_MULTI_RECV))
-				rxm_recv_entry_release(recv_queue,
-						       rx_buf->recv_entry);
-		}
-		count++;
-	}
-	return count;
+	ret = fi_accept(conn->msg_ep, &cm_data.accept, sizeof(cm_data.accept));
+	if (ret)
+		RXM_WARN_ERR(FI_LOG_EP_CTRL, "fi_accept", ret);
+	return ret;
 }
 
 static void
-rxm_conn_av_updated_handler(struct rxm_cmap_handle *handle)
+rxm_process_connreq(struct rxm_ep *ep, struct rxm_eq_cm_entry *cm_entry)
 {
-	struct rxm_ep *ep = handle->cmap->ep;
-	int count = 0;
-
-	if (ep->rxm_info->caps & FI_DIRECTED_RECV) {
-		count += rxm_conn_reprocess_directed_recvs(&ep->recv_queue);
-		count += rxm_conn_reprocess_directed_recvs(&ep->trecv_queue);
-
-		FI_DBG(&rxm_prov, FI_LOG_EP_CTRL,
-		       "Reprocessed directed recvs - %d\n", count);
-	}
-}
+	union ofi_sock_ip peer_addr;
+	struct rxm_peer_addr *peer;
+	struct rxm_conn *conn;
+	struct rxm_av *av;
+	ssize_t ret;
+	int cmp;
 
-static struct rxm_cmap_handle *rxm_conn_alloc(struct rxm_cmap *cmap)
-{
-	struct rxm_conn *rxm_conn;
+	assert(ofi_ep_lock_held(&ep->util_ep));
+	if (rxm_verify_connreq(ep, &cm_entry->data))
+		goto reject;
 
-	rxm_conn = calloc(1, sizeof(*rxm_conn));
-	if (!rxm_conn)
-		return NULL;
+	memcpy(&peer_addr, cm_entry->info->dest_addr,
+	       cm_entry->info->dest_addrlen);
+	ofi_addr_set_port(&peer_addr.sa, cm_entry->data.connect.port);
 
-	if (rxm_conn_res_alloc(cmap->ep, rxm_conn)) {
-		free(rxm_conn);
-		return NULL;
+	av = container_of(ep->util_ep.av, struct rxm_av, util_av);
+	peer = rxm_get_peer(av, &peer_addr);
+	if (!peer) {
+		RXM_WARN_ERR(FI_LOG_EP_CTRL, "rxm_get_peer", -FI_ENOMEM);
+		goto reject;
 	}
 
-	return &rxm_conn->handle;
-}
+	conn = rxm_add_conn(ep, peer);
+	if (!conn)
+		goto remove;
 
-static inline int
-rxm_conn_verify_cm_data(union rxm_cm_data *remote_cm_data,
-			union rxm_cm_data *local_cm_data)
-{
-	/* This should stay at top as it helps to avoid endian conversion
-	 * for other fields in rxm_cm_data */
-	if (remote_cm_data->connect.version != local_cm_data->connect.version) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "cm data version mismatch "
-			"(local: %" PRIu8 ", remote:  %" PRIu8 ")\n",
-			local_cm_data->connect.version,
-			remote_cm_data->connect.version);
-		goto err;
-	}
-	if (remote_cm_data->connect.endianness != local_cm_data->connect.endianness) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "cm data endianness mismatch "
-			"(local: %" PRIu8 ", remote:  %" PRIu8 ")\n",
-			local_cm_data->connect.endianness,
-			remote_cm_data->connect.endianness);
-		goto err;
-	}
-	if (remote_cm_data->connect.ctrl_version != local_cm_data->connect.ctrl_version) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "cm data ctrl_version mismatch "
-			"(local: %" PRIu8 ", remote:  %" PRIu8 ")\n",
-			local_cm_data->connect.ctrl_version,
-			remote_cm_data->connect.ctrl_version);
-		goto err;
-	}
-	if (remote_cm_data->connect.op_version != local_cm_data->connect.op_version) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "cm data op_version mismatch "
-			"(local: %" PRIu8 ", remote:  %" PRIu8 ")\n",
-			local_cm_data->connect.op_version,
-			remote_cm_data->connect.op_version);
-		goto err;
-	}
-	if (remote_cm_data->connect.eager_size != local_cm_data->connect.eager_size) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "cm data eager_size mismatch "
-			"(local: %" PRIu32 ", remote:  %" PRIu32 ")\n",
-			local_cm_data->connect.eager_size,
-			remote_cm_data->connect.eager_size);
-		goto err;
-	}
-	return FI_SUCCESS;
-err:
-	return -FI_EINVAL;
-}
-
-static size_t rxm_conn_get_rx_size(struct rxm_ep *rxm_ep,
-				   struct fi_info *msg_info)
-{
-	if (msg_info->ep_attr->rx_ctx_cnt == FI_SHARED_CONTEXT)
-		return MAX(MIN(16, msg_info->rx_attr->size),
-			   (msg_info->rx_attr->size /
-			    rxm_ep->util_ep.av->count));
-	else
-		return msg_info->rx_attr->size;
-}
-
-static int
-rxm_msg_process_connreq(struct rxm_ep *rxm_ep, struct fi_info *msg_info,
-			union rxm_cm_data *remote_cm_data)
-{
-	struct rxm_conn *rxm_conn;
-	union rxm_cm_data cm_data = {
-		.connect = {
-			.version = RXM_CM_DATA_VERSION,
-			.endianness = ofi_detect_endianness(),
-			.ctrl_version = RXM_CTRL_VERSION,
-			.op_version = RXM_OP_VERSION,
-			.eager_size = rxm_ep->rxm_info->tx_attr->inject_size,
-		},
-	};
-	union rxm_cm_data reject_cm_data = {
-		.reject = {
-			.version = RXM_CM_DATA_VERSION,
-			.reason = RXM_CMAP_REJECT_GENUINE,
+	FI_INFO(&rxm_prov, FI_LOG_EP_CTRL, "connreq for %p\n", conn);
+	switch (conn->state) {
+	case RXM_CM_IDLE:
+		break;
+	case RXM_CM_CONNECTING:
+		/* simultaneous connections */
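+		/* Deterministic tiebreak: on both peers, the connection
+		 * request initiated by the endpoint whose address sorts
+		 * higher is the one that survives.
+		 */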
+		cmp = ofi_addr_cmp(&rxm_prov, &peer_addr.sa, &ep->addr.sa);
+		if (cmp < 0) {
+			/* let our request finish */
+			FI_INFO(&rxm_prov, FI_LOG_EP_CTRL,
+				"simultaneous, reject peer %p\n", conn);
+			rxm_reject_connreq(ep, cm_entry,
+					   RXM_REJECT_ECONNREFUSED);
+			goto put;
+		} else if (cmp > 0) {
+			/* accept peer's request */
+			FI_INFO(&rxm_prov, FI_LOG_EP_CTRL,
+				"simultaneous, accept peer %p\n", conn);
+			rxm_close_conn(conn);
+		} else {
+			/* connecting to ourselves, create a loopback conn */
+			FI_INFO(&rxm_prov, FI_LOG_EP_CTRL,
+				"loopback conn %p\n", conn);
+			conn = rxm_alloc_conn(ep, peer);
+			if (!conn)
+				goto remove;
+
+			dlist_insert_tail(&conn->loopback_entry, &ep->loopback_list);
+			break;
 		}
-	};
-	struct rxm_cmap_handle *handle;
-	struct sockaddr_storage remote_pep_addr;
-	int ret;
-
-	assert(sizeof(uint32_t) == sizeof(cm_data.accept.rx_size));
-	assert(msg_info->rx_attr->size <= (uint32_t)-1);
-
-	if (rxm_conn_verify_cm_data(remote_cm_data, &cm_data)) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
-			"CM data mismatch was detected\n");
-		ret = -FI_EINVAL;
-		goto err1;
+		break;
+	case RXM_CM_ACCEPTING:
+	case RXM_CM_CONNECTED:
+		FI_INFO(&rxm_prov, FI_LOG_EP_CTRL,
+			"old connection accepting/done, replacing %p\n", conn);
+		rxm_close_conn(conn);
+		break;
+	default:
+		assert(0);
+		break;
 	}
 
-	memcpy(&remote_pep_addr, msg_info->dest_addr, msg_info->dest_addrlen);
-	ofi_addr_set_port((struct sockaddr *)&remote_pep_addr,
-			  remote_cm_data->connect.port);
-
-	ret = rxm_cmap_process_connreq(rxm_ep->cmap, &remote_pep_addr,
-				       &handle, &reject_cm_data.reject.reason);
+	conn->remote_index = cm_entry->data.connect.client_conn_id;
+	ret = rxm_open_conn(conn, cm_entry->info);
 	if (ret)
-		goto err1;
-
-	rxm_conn = container_of(handle, struct rxm_conn, handle);
+		goto free;
 
-	rxm_conn->handle.remote_key = remote_cm_data->connect.client_conn_id;
-	rxm_conn->rndv_tx_credits = remote_cm_data->connect.rx_size;
-	assert(rxm_conn->rndv_tx_credits);
-
-	ret = rxm_msg_ep_open(rxm_ep, msg_info, rxm_conn, handle);
+	ret = rxm_accept_connreq(conn, cm_entry);
 	if (ret)
-		goto err2;
+		goto close;
 
-	cm_data.accept.server_conn_id = rxm_conn->handle.key;
-	cm_data.accept.rx_size = rxm_conn_get_rx_size(rxm_ep, msg_info);
+	conn->state = RXM_CM_ACCEPTING;
+	conn->ep->connecting_cnt++;
+put:
+	rxm_put_peer(peer);
+	fi_freeinfo(cm_entry->info);
+	return;
 
-	ret = fi_accept(rxm_conn->msg_ep, &cm_data.accept.server_conn_id,
-			sizeof(cm_data.accept));
-	if (ret) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
-			"Unable to accept incoming connection\n");
-		goto err2;
-	}
-
-	return ret;
-err2:
-	rxm_cmap_del_handle(&rxm_conn->handle);
-err1:
-	FI_DBG(&rxm_prov, FI_LOG_EP_CTRL,
-	       "rejecting incoming connection request (reject reason: %d)\n",
-	       (enum rxm_cmap_reject_reason)reject_cm_data.reject.reason);
-	fi_reject(rxm_ep->msg_pep, msg_info->handle,
-		  &reject_cm_data.reject, sizeof(reject_cm_data.reject));
-	return ret;
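+	/* Failure labels fall through: close the msg ep, free the
+	 * connection, drop the peer reference, then reject the request.
+	 */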
+close:
+	rxm_close_conn(conn);
+free:
+	rxm_free_conn(conn);
+remove:
+	rxm_put_peer(peer);
+reject:
+	rxm_reject_connreq(ep, cm_entry, RXM_REJECT_ECONNREFUSED);
+	fi_freeinfo(cm_entry->info);
 }
 
-static void rxm_flush_msg_cq(struct rxm_ep *rxm_ep)
+void rxm_process_shutdown(struct rxm_conn *conn)
 {
-	struct fi_cq_data_entry comp;
-	int ret;
-	do {
-		ret = fi_cq_read(rxm_ep->msg_cq, &comp, 1);
-		if (ret > 0) {
-			ret = rxm_handle_comp(rxm_ep, &comp);
-			if (OFI_UNLIKELY(ret)) {
-				rxm_cq_write_error_all(rxm_ep, ret);
-			} else {
-				ret = 1;
-			}
-		} else if (ret == -FI_EAVAIL) {
-			rxm_handle_comp_error(rxm_ep);
-			ret = 1;
-		} else if (ret < 0 && ret != -FI_EAGAIN) {
-			rxm_cq_write_error_all(rxm_ep, ret);
-		}
-	} while (ret > 0);
-}
-
-static int rxm_conn_handle_notify(struct fi_eq_entry *eq_entry)
-{
-	struct rxm_cmap *cmap;
-	struct rxm_cmap_handle *handle;
-
-	FI_INFO(&rxm_prov, FI_LOG_EP_CTRL, "notify event %" PRIu64 "\n",
-		eq_entry->data);
-
-	if ((enum rxm_cmap_signal) eq_entry->data != RXM_CMAP_FREE)
-		return -FI_EOTHER;
-
-	handle = eq_entry->context;
-	assert(handle->state == RXM_CMAP_SHUTDOWN);
-	FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "freeing handle: %p\n", handle);
-	cmap = handle->cmap;
-
-	rxm_conn_close(handle);
+	assert(ofi_ep_lock_held(&conn->ep->util_ep));
 
-	// after closing the connection, we need to flush any dangling references to the
-	// handle from msg_cq entries that have not been cleaned up yet, otherwise we
-	// could run into problems during CQ cleanup.  these entries will be errored so
-	// keep reading through EAVAIL.
-	rxm_flush_msg_cq(cmap->ep);
+	FI_INFO(&rxm_prov, FI_LOG_EP_CTRL, "shutdown conn %p (state %d)\n",
+		conn, conn->state);
 
-	if (handle->peer) {
-		dlist_remove(&handle->peer->entry);
-		free(handle->peer);
-		handle->peer = NULL;
-	} else {
-		cmap->handles_av[handle->fi_addr] = NULL;
+	switch (conn->state) {
+	case RXM_CM_IDLE:
+		break;
+	case RXM_CM_CONNECTING:
+	case RXM_CM_ACCEPTING:
+	case RXM_CM_CONNECTED:
+		rxm_close_conn(conn);
+		rxm_free_conn(conn);
+		break;
+	default:
+		break;
 	}
-	rxm_conn_free(handle);
-	return 0;
 }
 
-static void rxm_conn_wake_up_wait_obj(struct rxm_ep *rxm_ep)
+static void rxm_handle_error(struct rxm_ep *ep)
 {
-	if (rxm_ep->util_ep.tx_cq && rxm_ep->util_ep.tx_cq->wait)
-		util_cq_signal(rxm_ep->util_ep.tx_cq);
-	if (rxm_ep->util_ep.tx_cntr && rxm_ep->util_ep.tx_cntr->wait)
-		util_cntr_signal(rxm_ep->util_ep.tx_cntr);
-}
+	struct fi_eq_err_entry entry = {0};
+	ssize_t ret;
 
-static int
-rxm_conn_handle_reject(struct rxm_ep *rxm_ep, struct rxm_msg_eq_entry *entry)
-{
-	union rxm_cm_data *cm_data = entry->err_entry.err_data;
-
-	if (!cm_data || entry->err_entry.err_data_size != sizeof(cm_data->reject)) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "connection reject: "
-			"no reject error data (cm_data) was found "
-			"(data length expected: %zu found: %zu)\n",
-			sizeof(cm_data->reject),
-			entry->err_entry.err_data_size);
-		return -FI_EOTHER;
+	assert(ofi_ep_lock_held(&ep->util_ep));
+	ret = fi_eq_readerr(ep->msg_eq, &entry, 0);
+	if (ret != sizeof(entry)) {
+		if (ret != -FI_EAGAIN)
+			RXM_WARN_ERR(FI_LOG_EP_CTRL, "fi_eq_readerr", ret);
+		return;
 	}
 
-	if (cm_data->reject.version != RXM_CM_DATA_VERSION) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "connection reject: "
-			"cm data version mismatch (local: %" PRIu8
-			", remote:  %" PRIu8 ")\n",
-			(uint8_t) RXM_CM_DATA_VERSION,
-			cm_data->reject.version);
-		return -FI_EOTHER;
-	}
+	OFI_EQ_STRERROR(&rxm_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, ep->msg_eq,
+			&entry);
+	if (!entry.fid || entry.fid->fclass != FI_CLASS_EP)
+		return;
 
-	if (cm_data->reject.reason == RXM_CMAP_REJECT_GENUINE) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "connection reject: "
-		       "remote peer didn't accept the connection\n");
-		FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "connection reject: "
-		       "(reason: RXM_CMAP_REJECT_GENUINE)\n");
-		OFI_EQ_STRERROR(&rxm_prov, FI_LOG_WARN, FI_LOG_EP_CTRL,
-				rxm_ep->msg_eq, &entry->err_entry);
-	} else if (cm_data->reject.reason == RXM_CMAP_REJECT_SIMULT_CONN) {
-		FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "connection reject: "
-		       "(reason: RXM_CMAP_REJECT_SIMULT_CONN)\n");
+	if (entry.err == ECONNREFUSED) {
+		rxm_process_reject(entry.fid->context, &entry);
 	} else {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "connection reject: "
-		        "received unknown reject reason: %d\n",
-			cm_data->reject.reason);
+		rxm_process_shutdown(entry.fid->context);
 	}
-	rxm_cmap_process_reject(rxm_ep->cmap, entry->context,
-				cm_data->reject.reason);
-	return 0;
 }
 
-static int
-rxm_conn_handle_event(struct rxm_ep *rxm_ep, struct rxm_msg_eq_entry *entry)
+static void
+rxm_handle_event(struct rxm_ep *ep, uint32_t event,
+		 struct rxm_eq_cm_entry *cm_entry, size_t len)
 {
-	if (entry->rd == -FI_ECONNREFUSED)
-		return rxm_conn_handle_reject(rxm_ep, entry);
-
-	switch (entry->event) {
+	assert(ofi_ep_lock_held(&ep->util_ep));
+	switch (event) {
 	case FI_NOTIFY:
-		return rxm_conn_handle_notify((struct fi_eq_entry *)
-					      &entry->cm_entry);
+		break;
 	case FI_CONNREQ:
-		FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "Got new connection\n");
-		if ((size_t)entry->rd != RXM_CM_ENTRY_SZ) {
-			FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
-				"Received a connection request with no CM data. "
-				"Is sender running FI_PROTO_RXM?\n");
-			FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "Received CM entry "
-				"size (%zd) not matching expected (%zu)\n",
-				entry->rd, RXM_CM_ENTRY_SZ);
-			return -FI_EOTHER;
-		}
-		rxm_msg_process_connreq(rxm_ep, entry->cm_entry.info,
-					(union rxm_cm_data *) entry->cm_entry.data);
-		fi_freeinfo(entry->cm_entry.info);
+		rxm_process_connreq(ep, cm_entry);
 		break;
 	case FI_CONNECTED:
-		assert(entry->cm_entry.fid->context);
-		FI_DBG(&rxm_prov, FI_LOG_EP_CTRL,
-		       "connection successful\n");
-		rxm_cmap_process_connect(rxm_ep->cmap,
-			entry->cm_entry.fid->context,
-			entry->rd - sizeof(entry->cm_entry) > 0 ?
-			(union rxm_cm_data *) entry->cm_entry.data : NULL);
-		rxm_conn_wake_up_wait_obj(rxm_ep);
+		rxm_process_connect(cm_entry);
 		break;
 	case FI_SHUTDOWN:
-		FI_DBG(&rxm_prov, FI_LOG_EP_CTRL,
-		       "Received connection shutdown\n");
-		rxm_cmap_process_shutdown(rxm_ep->cmap,
-					  entry->cm_entry.fid->context);
+		rxm_process_shutdown(cm_entry->fid->context);
 		break;
 	default:
 		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
-			"Unknown event: %u\n", entry->event);
-		return -FI_EOTHER;
+			"Unknown event: %u\n", event);
+		break;
 	}
-	return 0;
 }
 
-static ssize_t rxm_eq_sread(struct rxm_ep *rxm_ep, size_t len,
-			    struct rxm_msg_eq_entry *entry)
+void rxm_conn_progress(struct rxm_ep *ep)
 {
-	ssize_t rd;
-	int once = 1;
+	struct rxm_eq_cm_entry cm_entry;
+	uint32_t event;
+	int ret;
 
+	assert(ofi_ep_lock_held(&ep->util_ep));
 	do {
-		/* TODO convert this to poll + fi_eq_read so that we can grab
-		 * rxm_ep lock before reading the EQ. This is needed to avoid
-		 * processing events / error entries from closed MSG EPs. This
-		 * can be done only for non-Windows OSes as Windows doesn't
-		 * have poll for a generic file descriptor. */
-		rd = fi_eq_sread(rxm_ep->msg_eq, &entry->event, &entry->cm_entry,
-				 len, -1, 0);
-		if (rd >= 0)
-			return rd;
-		if (rd == -FI_EINTR && once) {
-			FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "Ignoring EINTR\n");
-			once = 0;
+		ret = fi_eq_read(ep->msg_eq, &event, &cm_entry,
+				 sizeof(cm_entry), 0);
+		if (ret > 0) {
+			rxm_handle_event(ep, event, &cm_entry, ret);
+		} else if (ret == -FI_EAVAIL) {
+			rxm_handle_error(ep);
+			ret = 1;
 		}
-	} while (rd == -FI_EINTR);
+	} while (ret > 0);
+}
 
-	if (rd != -FI_EAVAIL) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
-			"unable to fi_eq_sread: %s (%zd)\n",
-			fi_strerror(-rd), -rd);
-		return rd;
+void rxm_stop_listen(struct rxm_ep *ep)
+{
+	struct fi_eq_entry entry = {0};
+	int ret;
+
+	FI_INFO(&rxm_prov, FI_LOG_EP_CTRL, "stopping CM thread\n");
+	if (!ep->cm_thread)
+		return;
+
+	ofi_ep_lock_acquire(&ep->util_ep);
+	ep->do_progress = false;
+	ofi_ep_lock_release(&ep->util_ep);
+
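+	/* Write a FI_NOTIFY event so the CM thread wakes from its blocking
+	 * wait, observes do_progress == false, and exits.
+	 */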
+	ret = fi_eq_write(ep->msg_eq, FI_NOTIFY, &entry, sizeof(entry), 0);
+	if (ret != sizeof(entry)) {
+		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "Unable to signal\n");
+		return;
 	}
 
-	ofi_ep_lock_acquire(&rxm_ep->util_ep);
-	rd = rxm_eq_readerr(rxm_ep, entry);
-	ofi_ep_lock_release(&rxm_ep->util_ep);
-	return rd;
+	ret = pthread_join(ep->cm_thread, NULL);
+	if (ret) {
+		RXM_WARN_ERR(FI_LOG_EP_CTRL, "pthread_join", -ret);
+	}
 }
 
-static inline int rxm_conn_eq_event(struct rxm_ep *rxm_ep,
-				    struct rxm_msg_eq_entry *entry)
+static void rxm_flush_msg_cq(struct rxm_ep *ep)
 {
+	struct fi_cq_data_entry comp;
 	int ret;
 
-	ofi_ep_lock_acquire(&rxm_ep->util_ep);
-	ret = rxm_conn_handle_event(rxm_ep, entry) ? -1 : 0;
-	ofi_ep_lock_release(&rxm_ep->util_ep);
-
-	return ret;
+	assert(ofi_ep_lock_held(&ep->util_ep));
+	do {
+		ret = fi_cq_read(ep->msg_cq, &comp, 1);
+		if (ret > 0) {
+			ret = rxm_handle_comp(ep, &comp);
+			if (ret) {
+				rxm_cq_write_error_all(ep, ret);
+			} else {
+				ret = 1;
+			}
+		} else if (ret == -FI_EAVAIL) {
+			rxm_handle_comp_error(ep);
+			ret = 1;
+		} else if (ret < 0 && ret != -FI_EAGAIN) {
+			rxm_cq_write_error_all(ep, ret);
+		}
+	} while (ret > 0);
 }
 
-static void *rxm_conn_progress(void *arg)
+static void *rxm_cm_progress(void *arg)
 {
 	struct rxm_ep *ep = container_of(arg, struct rxm_ep, util_ep);
-	struct rxm_msg_eq_entry *entry;
-
-	entry = alloca(RXM_MSG_EQ_ENTRY_SZ);
-	if (!entry)
-		return NULL;
+	struct rxm_eq_cm_entry cm_entry;
+	uint32_t event;
+	ssize_t ret;
 
 	FI_INFO(&rxm_prov, FI_LOG_EP_CTRL, "Starting auto-progress thread\n");
 
+	ofi_ep_lock_acquire(&ep->util_ep);
 	while (ep->do_progress) {
-		memset(entry, 0, RXM_MSG_EQ_ENTRY_SZ);
-		entry->rd = rxm_eq_sread(ep, RXM_CM_ENTRY_SZ, entry);
-		if (entry->rd < 0 && entry->rd != -FI_ECONNREFUSED)
-			continue;
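+		/* Drop the EP lock while blocked in fi_eq_sread() so other
+		 * threads can make progress; reacquire it to handle the event.
+		 */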
+		ofi_ep_lock_release(&ep->util_ep);
 
-		rxm_conn_eq_event(ep, entry);
+		ret = fi_eq_sread(ep->msg_eq, &event, &cm_entry,
+				  sizeof(cm_entry), -1, 0);
+
+		ofi_ep_lock_acquire(&ep->util_ep);
+		if (ret > 0) {
+			rxm_handle_event(ep, event, &cm_entry, ret);
+		} else if (ret == -FI_EAVAIL) {
+			rxm_handle_error(ep);
+		} else {
+			RXM_WARN_ERR(FI_LOG_EP_CTRL, "fi_eq_sread", ret);
+			break;
+		}
 	}
+	ofi_ep_lock_release(&ep->util_ep);
 
 	FI_INFO(&rxm_prov, FI_LOG_EP_CTRL, "Stopping auto-progress thread\n");
 	return NULL;
 }
 
-static inline int
-rxm_conn_auto_progress_eq(struct rxm_ep *rxm_ep, struct rxm_msg_eq_entry *entry)
-{
-	memset(entry, 0, RXM_MSG_EQ_ENTRY_SZ);
-
-	ofi_ep_lock_acquire(&rxm_ep->util_ep);
-	entry->rd = rxm_eq_read(rxm_ep, RXM_CM_ENTRY_SZ, entry);
-	ofi_ep_lock_release(&rxm_ep->util_ep);
-
-	if (!entry->rd || entry->rd == -FI_EAGAIN)
-		return FI_SUCCESS;
-	if (entry->rd < 0 && entry->rd != -FI_ECONNREFUSED)
-		return entry->rd;
-
-	return rxm_conn_eq_event(rxm_ep, entry);
-}
-
-static void *rxm_conn_atomic_progress(void *arg)
+static void *rxm_cm_atomic_progress(void *arg)
 {
 	struct rxm_ep *ep = container_of(arg, struct rxm_ep, util_ep);
-	struct rxm_msg_eq_entry *entry;
 	struct rxm_fabric *fabric;
 	struct fid *fids[2] = {
 		&ep->msg_eq->fid,
@@ -1394,187 +801,89 @@ static void *rxm_conn_atomic_progress(void *arg)
 	};
 	int ret;
 
-	entry = alloca(RXM_MSG_EQ_ENTRY_SZ);
-	if (!entry)
-		return NULL;
-
 	fabric = container_of(ep->util_ep.domain->fabric,
 			      struct rxm_fabric, util_fabric);
-
 	ret = fi_control(&ep->msg_eq->fid, FI_GETWAIT, &fds[0].fd);
 	if (ret) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
-			"unable to get msg EQ fd: %s\n", fi_strerror(ret));
+		RXM_WARN_ERR(FI_LOG_EP_CTRL, "fi_control", ret);
 		return NULL;
 	}
 
 	ret = fi_control(&ep->msg_cq->fid, FI_GETWAIT, &fds[1].fd);
 	if (ret) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
-			"unable to get msg CQ fd: %s\n", fi_strerror(ret));
+		RXM_WARN_ERR(FI_LOG_EP_CTRL, "fi_control", ret);
 		return NULL;
 	}
 
 	FI_INFO(&rxm_prov, FI_LOG_EP_CTRL, "Starting auto-progress thread\n");
+	ofi_ep_lock_acquire(&ep->util_ep);
 	while (ep->do_progress) {
+		ofi_ep_lock_release(&ep->util_ep);
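+		/* fi_trywait() returns success only when it is safe to block;
+		 * poll() then sleeps until the EQ or CQ fd has events.
+		 */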
 		ret = fi_trywait(fabric->msg_fabric, fids, 2);
 
 		if (!ret) {
-			fds[0].revents = 0;
-			fds[1].revents = 0;
-
 			ret = poll(fds, 2, -1);
-			if (ret == -1 && errno != EINTR) {
-				FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
-					"Select error %s, closing CM thread\n",
-					strerror(errno));
-				break;
+			if (ret == -1) {
+				RXM_WARN_ERR(FI_LOG_EP_CTRL, "poll", -errno);
 			}
 		}
-		rxm_conn_auto_progress_eq(ep, entry);
 		ep->util_ep.progress(&ep->util_ep);
+		ofi_ep_lock_acquire(&ep->util_ep);
+		rxm_conn_progress(ep);
 	}
+	ofi_ep_lock_release(&ep->util_ep);
 
 	FI_INFO(&rxm_prov, FI_LOG_EP_CTRL, "Stopping auto progress thread\n");
 	return NULL;
 }
 
-static int rxm_prepare_cm_data(struct fid_pep *pep, struct rxm_cmap_handle *handle,
-		union rxm_cm_data *cm_data)
+int rxm_start_listen(struct rxm_ep *ep)
 {
-	struct sockaddr_storage name;
-	size_t cm_data_size = 0;
-	size_t name_size = sizeof(name);
-	size_t opt_size = sizeof(cm_data_size);
+	size_t addr_len;
 	int ret;
 
-	ret = fi_getopt(&pep->fid, FI_OPT_ENDPOINT, FI_OPT_CM_DATA_SIZE,
-			&cm_data_size, &opt_size);
+	ret = fi_listen(ep->msg_pep);
 	if (ret) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "fi_getopt failed\n");
+		RXM_WARN_ERR(FI_LOG_EP_CTRL, "fi_listen", ret);
 		return ret;
 	}
 
-	if (cm_data_size < sizeof(*cm_data)) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "MSG EP CM data size too small\n");
-		return -FI_EOTHER;
-	}
-
-	ret = fi_getname(&pep->fid, &name, &name_size);
+	addr_len = sizeof(ep->addr);
+	ret = fi_getname(&ep->msg_pep->fid, &ep->addr, &addr_len);
 	if (ret) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "Unable to get msg pep name\n");
+		RXM_WARN_ERR(FI_LOG_EP_CTRL, "fi_getname", ret);
 		return ret;
 	}
 
-	cm_data->connect.port = ofi_addr_get_port((struct sockaddr *)&name);
-	cm_data->connect.client_conn_id = handle->key;
-	return 0;
-}
-
-static int
-rxm_conn_connect(struct rxm_ep *ep, struct rxm_cmap_handle *handle,
-		 const void *addr)
-{
-	int ret;
-	struct rxm_conn *rxm_conn = container_of(handle, struct rxm_conn, handle);
-	union rxm_cm_data cm_data = {
-		.connect = {
-			.version = RXM_CM_DATA_VERSION,
-			.ctrl_version = RXM_CTRL_VERSION,
-			.op_version = RXM_OP_VERSION,
-			.endianness = ofi_detect_endianness(),
-			.eager_size = ep->rxm_info->tx_attr->inject_size,
-		},
-	};
-
-	assert(sizeof(uint32_t) == sizeof(cm_data.connect.eager_size));
-	assert(sizeof(uint32_t) == sizeof(cm_data.connect.rx_size));
-	assert(ep->rxm_info->tx_attr->inject_size <= (uint32_t) -1);
-	assert(ep->msg_info->rx_attr->size <= (uint32_t) -1);
-
-	free(ep->msg_info->dest_addr);
-	ep->msg_info->dest_addrlen = ep->msg_info->src_addrlen;
-
-	ep->msg_info->dest_addr = mem_dup(addr, ep->msg_info->dest_addrlen);
-	if (!ep->msg_info->dest_addr) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "mem_dup failed, len %zu\n",
-			ep->msg_info->dest_addrlen);
-		return -FI_ENOMEM;
+	/* Update src_addr that will be used for active endpoints.
+	 * Zero out the port to avoid address conflicts, as we will
+	 * create multiple msg endpoints for a single RDM endpoint.
+	 */
+	if (ep->msg_info->src_addr) {
+		free(ep->msg_info->src_addr);
+		ep->msg_info->src_addr = NULL;
+		ep->msg_info->src_addrlen = 0;
 	}
 
-	ret = rxm_msg_ep_open(ep, ep->msg_info, rxm_conn, &rxm_conn->handle);
-	if (ret)
-		return ret;
-
-	/* We have to send passive endpoint's address to the server since the
-	 * address from which connection request would be sent would have a
-	 * different port. */
-	ret = rxm_prepare_cm_data(ep->msg_pep, &rxm_conn->handle, &cm_data);
-	if (ret)
-		goto err;
-
-	cm_data.connect.rx_size = rxm_conn_get_rx_size(ep, ep->msg_info);
-
-	ret = fi_connect(rxm_conn->msg_ep, ep->msg_info->dest_addr,
-			 &cm_data, sizeof(cm_data));
-	if (ret) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "unable to connect msg_ep\n");
-		goto err;
-	}
-	return 0;
-
-err:
-	fi_close(&rxm_conn->msg_ep->fid);
-	rxm_conn->msg_ep = NULL;
-	return ret;
-}
+	ep->msg_info->src_addr = mem_dup(&ep->addr, addr_len);
+	if (!ep->msg_info->src_addr)
+		return -FI_ENOMEM;
 
-static int rxm_conn_signal(struct rxm_ep *ep, void *context,
-			   enum rxm_cmap_signal signal)
-{
-	struct fi_eq_entry entry = {0};
-	ssize_t rd;
+	ep->msg_info->src_addrlen = addr_len;
+	ofi_addr_set_port(ep->msg_info->src_addr, 0);
 
-	entry.context = context;
-	entry.data = (uint64_t) signal;
+	if (ep->util_ep.domain->data_progress == FI_PROGRESS_AUTO ||
+	    force_auto_progress) {
 
-	rd = fi_eq_write(ep->msg_eq, FI_NOTIFY, &entry, sizeof(entry), 0);
-	if (rd != sizeof(entry)) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "Unable to signal\n");
-		return (int)rd;
+		assert(ep->util_ep.domain->threading == FI_THREAD_SAFE);
+		ep->do_progress = true;
+		ret = pthread_create(&ep->cm_thread, 0,
+				     ep->rxm_info->caps & FI_ATOMIC ?
+				     rxm_cm_atomic_progress : rxm_cm_progress, ep);
+		if (ret) {
+			RXM_WARN_ERR(FI_LOG_EP_CTRL, "pthread_create", -ret);
+			return -ret;
+		}
 	}
 	return 0;
 }
-
-int rxm_conn_cmap_alloc(struct rxm_ep *rxm_ep)
-{
-	struct rxm_cmap_attr attr;
-	int ret;
-	size_t len = rxm_ep->util_ep.av->addrlen;
-	void *name = calloc(1, len);
-	if (!name) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
-			"Unable to allocate memory for EP name\n");
-		return -FI_ENOMEM;
-	}
-
-	/* Passive endpoint should already have fi_setname or fi_listen
-	 * called on it for this to work */
-	ret = fi_getname(&rxm_ep->msg_pep->fid, name, &len);
-	if (ret) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
-			"Unable to fi_getname on msg_ep\n");
-		goto fn;
-	}
-	ofi_straddr_dbg(&rxm_prov, FI_LOG_EP_CTRL, "local_name", name);
-
-	attr.name		= name;
-
-	ret = rxm_cmap_alloc(rxm_ep, &attr);
-	if (ret)
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
-			"Unable to allocate CMAP\n");
-fn:
-	free(name);
-	return ret;
-}
diff --git a/deps/libfabric/prov/rxm/src/rxm_cq.c b/deps/libfabric/prov/rxm/src/rxm_cq.c
index 32a31e4de07f137d6f17af3bc3599ea5c7077486..687e5d494b9e84751f1f0a6a0810521ecc672b65 100644
--- a/deps/libfabric/prov/rxm/src/rxm_cq.c
+++ b/deps/libfabric/prov/rxm/src/rxm_cq.c
@@ -3,6 +3,7 @@
  * Copyright (c) 2018 Cray Inc. All rights reserved.
  * Copyright (c) 2018 System Fabric Works, Inc. All rights reserved.
  * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -62,22 +63,46 @@ rxm_cq_strerror(struct fid_cq *cq_fid, int prov_errno,
 	return fi_cq_strerror(rxm_ep->msg_cq, prov_errno, err_data, buf, len);
 }
 
-static int rxm_repost_new_rx(struct rxm_rx_buf *rx_buf)
+static struct rxm_rx_buf *
+rxm_rx_buf_alloc(struct rxm_ep *rxm_ep, struct fid_ep *rx_ep)
+{
+	struct rxm_rx_buf *rx_buf;
+
+	rx_buf = ofi_buf_alloc(rxm_ep->rx_pool);
+	if (!rx_buf)
+		return NULL;
+
+	assert(rx_buf->ep == rxm_ep);
+	rx_buf->hdr.state = RXM_RX;
+	rx_buf->rx_ep = rx_ep;
+	rx_buf->repost = true;
+
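+	/* Without a shared RX context, each msg ep belongs to exactly one
+	 * connection, which is recorded in the msg ep's fid context.
+	 */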
+	if (!rxm_ep->srx_ctx)
+		rx_buf->conn = rx_ep->fid.context;
+
+	return rx_buf;
+}
+
+/* Processing on the current rx buffer is expected to be slow.
+ * Post a new buffer to take its place, and mark the current
+ * buffer to return to the free pool when finished.
+ */
+static void rxm_replace_rx_buf(struct rxm_rx_buf *rx_buf)
 {
 	struct rxm_rx_buf *new_rx_buf;
-	if (rx_buf->repost) {
-		rx_buf->repost = 0;
-		new_rx_buf = rxm_rx_buf_alloc(rx_buf->ep, rx_buf->msg_ep, 1);
-		if (!new_rx_buf)
-			return -FI_ENOMEM;
+	int ret;
 
-		dlist_insert_tail(&new_rx_buf->repost_entry,
-				  &new_rx_buf->ep->repost_ready_list);
-	}
-	return FI_SUCCESS;
+	new_rx_buf = rxm_rx_buf_alloc(rx_buf->ep, rx_buf->rx_ep);
+	if (!new_rx_buf)
+		return;
+
+	rx_buf->repost = false;
+	ret = rxm_post_recv(new_rx_buf);
+	if (ret)
+		ofi_buf_free(new_rx_buf);
 }
 
-static int rxm_finish_buf_recv(struct rxm_rx_buf *rx_buf)
+static void rxm_finish_buf_recv(struct rxm_rx_buf *rx_buf)
 {
 	uint64_t flags;
 	char *data;
@@ -85,9 +110,8 @@ static int rxm_finish_buf_recv(struct rxm_rx_buf *rx_buf)
 	if ((rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_seg) &&
 	    rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) != RXM_SAR_SEG_FIRST) {
 		dlist_insert_tail(&rx_buf->unexp_msg.entry,
-				  &rx_buf->conn->sar_deferred_rx_msg_list);
-		// repost a new buffer immediately while SAR takes some time to complete
-		return rxm_repost_new_rx(rx_buf);
+				  &rx_buf->conn->deferred_sar_segments);
+		rxm_replace_rx_buf(rx_buf);
 	}
 
 	flags = (rx_buf->pkt.hdr.flags | ofi_rx_flags[rx_buf->pkt.hdr.op]);
@@ -95,7 +119,7 @@ static int rxm_finish_buf_recv(struct rxm_rx_buf *rx_buf)
 	if (rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_eager)
 		flags |= FI_MORE;
 
-	if (rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_rndv)
+	if (rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_rndv_req)
 		data = rxm_pkt_rndv_data(&rx_buf->pkt);
 	else
 		data = rx_buf->pkt.data;
@@ -104,11 +128,11 @@ static int rxm_finish_buf_recv(struct rxm_rx_buf *rx_buf)
 	       "length: %" PRIu64 "\n", rx_buf->pkt.hdr.size);
 	rx_buf->recv_context.ep = &rx_buf->ep->util_ep.ep_fid;
 
-	return rxm_cq_write_recv_comp(rx_buf, &rx_buf->recv_context, flags,
-				      rx_buf->pkt.hdr.size, data);
+	rxm_cq_write_recv_comp(rx_buf, &rx_buf->recv_context, flags,
+			       rx_buf->pkt.hdr.size, data);
 }
 
-static int rxm_cq_write_error_trunc(struct rxm_rx_buf *rx_buf, size_t done_len)
+static void rxm_cq_write_error_trunc(struct rxm_rx_buf *rx_buf, size_t done_len)
 {
 	int ret;
 
@@ -126,95 +150,57 @@ static int rxm_cq_write_error_trunc(struct rxm_rx_buf *rx_buf, size_t done_len)
 				       rx_buf->recv_entry->rxm_iov.iov[0].iov_base,
 				       rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag,
 				       rx_buf->pkt.hdr.size - done_len);
-	if (ret)
+	if (ret) {
 		FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to write recv error CQ\n");
-	return ret;
+		assert(0);
+	}
 }
 
-static int rxm_finish_recv(struct rxm_rx_buf *rx_buf, size_t done_len)
+static void rxm_finish_recv(struct rxm_rx_buf *rx_buf, size_t done_len)
 {
 	struct rxm_recv_entry *recv_entry = rx_buf->recv_entry;
-	size_t recv_size;
-	int ret = FI_SUCCESS;
 
 	if (done_len < rx_buf->pkt.hdr.size) {
-		ret = rxm_cq_write_error_trunc(rx_buf, done_len);
+		rxm_cq_write_error_trunc(rx_buf, done_len);
 		goto release;
 	}
 
 	if (rx_buf->recv_entry->flags & FI_COMPLETION ||
 	    rx_buf->ep->rxm_info->mode & FI_BUFFERED_RECV) {
-		ret = rxm_cq_write_recv_comp(rx_buf,
-				rx_buf->recv_entry->context,
-				rx_buf->recv_entry->comp_flags |
-				rx_buf->pkt.hdr.flags,
-				rx_buf->pkt.hdr.size,
-				rx_buf->recv_entry->rxm_iov.iov[0].iov_base);
-		if (ret)
-			goto release;
+		rxm_cq_write_recv_comp(rx_buf, rx_buf->recv_entry->context,
+				       rx_buf->recv_entry->comp_flags |
+				       rx_buf->pkt.hdr.flags |
+				       (rx_buf->recv_entry->flags & FI_MULTI_RECV),
+				       rx_buf->pkt.hdr.size,
+				       rx_buf->recv_entry->rxm_iov.
+				       iov[0].iov_base);
 	}
 	ofi_ep_rx_cntr_inc(&rx_buf->ep->util_ep);
 
-	if (rx_buf->recv_entry->flags & FI_MULTI_RECV) {
-		recv_size = rx_buf->pkt.hdr.size;
-
-		recv_entry->total_len -= recv_size;
-
-		if (recv_entry->total_len < rx_buf->ep->min_multi_recv_size) {
-			ret = ofi_cq_write(rx_buf->ep->util_ep.rx_cq,
-					   recv_entry->context,
-					   FI_MULTI_RECV, 0, NULL, 0, 0);
-			goto release;
-		}
-
-		recv_entry->rxm_iov.iov[0].iov_base = (uint8_t *)
-				recv_entry->rxm_iov.iov[0].iov_base + recv_size;
-		recv_entry->rxm_iov.iov[0].iov_len -= recv_size;
-
-		dlist_insert_head(&recv_entry->entry,
-				  &recv_entry->recv_queue->recv_list);
-		goto free_buf;
-	}
-
 release:
-	rxm_recv_entry_release(recv_entry->recv_queue, recv_entry);
-free_buf:
+	rxm_recv_entry_release(recv_entry);
 	rxm_rx_buf_free(rx_buf);
-	if (ret)
-		FI_WARN(&rxm_prov, FI_LOG_CQ, "Error writing CQ entry\n");
-	return ret;
 }
 
-static int
+static void
 rxm_cq_write_tx_comp(struct rxm_ep *rxm_ep, uint64_t comp_flags,
 		     void *app_context,  uint64_t flags)
 {
-	int ret;
-
 	if (flags & FI_COMPLETION) {
-		ret = ofi_cq_write(rxm_ep->util_ep.tx_cq, app_context,
-				   comp_flags, 0, NULL, 0, 0);
-		if (ret) {
-			FI_WARN(&rxm_prov, FI_LOG_CQ,
-				"Unable to report completion\n");
-		} else {
-			rxm_cq_log_comp(comp_flags);
-		}
-	} else {
-		ret = 0;
+		rxm_cq_write(rxm_ep->util_ep.tx_cq, app_context,
+			     comp_flags, 0, NULL, 0, 0);
 	}
-	return ret;
 }
 
-static int rxm_finish_rma(struct rxm_ep *rxm_ep, struct rxm_rma_buf *rma_buf,
+static void rxm_finish_rma(struct rxm_ep *rxm_ep, struct rxm_tx_buf *rma_buf,
 			  uint64_t comp_flags)
 {
-	int ret = rxm_cq_write_tx_comp(rxm_ep, comp_flags,
-				       rma_buf->app_context, rma_buf->flags);
-
 	assert(((comp_flags & FI_WRITE) && !(comp_flags & FI_READ)) ||
 	       ((comp_flags & FI_READ) && !(comp_flags & FI_WRITE)));
 
+	rxm_cq_write_tx_comp(rxm_ep, comp_flags, rma_buf->app_context,
+			     rma_buf->flags);
+
 	if (comp_flags & FI_WRITE)
 		ofi_ep_wr_cntr_inc(&rxm_ep->util_ep);
 	else
@@ -222,57 +208,62 @@ static int rxm_finish_rma(struct rxm_ep *rxm_ep, struct rxm_rma_buf *rma_buf,
 
 	if (!(rma_buf->flags & FI_INJECT) && !rxm_ep->rdm_mr_local &&
 	    rxm_ep->msg_mr_local) {
-		rxm_msg_mr_closev(rma_buf->mr.mr, rma_buf->mr.count);
+		rxm_msg_mr_closev(rma_buf->rma.mr, rma_buf->rma.count);
 	}
 
-	ofi_buf_free(rma_buf);
-	return ret;
+	rxm_free_rx_buf(rxm_ep, rma_buf);
 }
 
-int rxm_finish_eager_send(struct rxm_ep *rxm_ep, struct rxm_tx_eager_buf *tx_buf)
+void rxm_finish_eager_send(struct rxm_ep *rxm_ep, struct rxm_tx_buf *tx_buf)
 {
-	int ret = rxm_cq_write_tx_comp(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op),
-				       tx_buf->app_context, tx_buf->flags);
-
 	assert(ofi_tx_cq_flags(tx_buf->pkt.hdr.op) & FI_SEND);
-	ofi_ep_tx_cntr_inc(&rxm_ep->util_ep);
 
-	return ret;
+	rxm_cq_write_tx_comp(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op),
+			     tx_buf->app_context, tx_buf->flags);
+	ofi_ep_tx_cntr_inc(&rxm_ep->util_ep);
 }
 
-static int rxm_finish_sar_segment_send(struct rxm_ep *rxm_ep,
-				       struct rxm_tx_sar_buf *tx_buf, bool err)
+static bool rxm_complete_sar(struct rxm_ep *rxm_ep,
+			     struct rxm_tx_buf *tx_buf)
 {
-	struct rxm_tx_sar_buf *first_tx_buf;
-	int ret = FI_SUCCESS;
+	struct rxm_tx_buf *first_tx_buf;
 
+	assert(ofi_tx_cq_flags(tx_buf->pkt.hdr.op) & FI_SEND);
 	switch (rxm_sar_get_seg_type(&tx_buf->pkt.ctrl_hdr)) {
 	case RXM_SAR_SEG_FIRST:
 		break;
 	case RXM_SAR_SEG_MIDDLE:
-		ofi_buf_free(tx_buf);
+		rxm_free_rx_buf(rxm_ep, tx_buf);
 		break;
 	case RXM_SAR_SEG_LAST:
-		if (!err) {
-			ret = rxm_cq_write_tx_comp(rxm_ep,
-					ofi_tx_cq_flags(tx_buf->pkt.hdr.op),
-					tx_buf->app_context, tx_buf->flags);
-
-			assert(ofi_tx_cq_flags(tx_buf->pkt.hdr.op) & FI_SEND);
-			ofi_ep_tx_cntr_inc(&rxm_ep->util_ep);
-		}
-		first_tx_buf = ofi_bufpool_get_ibuf(rxm_ep->
-					buf_pools[RXM_BUF_POOL_TX_SAR].pool,
-					tx_buf->pkt.ctrl_hdr.msg_id);
-		ofi_buf_free(first_tx_buf);
-		ofi_buf_free(tx_buf);
-		break;
+		first_tx_buf = ofi_bufpool_get_ibuf(rxm_ep->tx_pool,
+						tx_buf->pkt.ctrl_hdr.msg_id);
+		rxm_free_rx_buf(rxm_ep, first_tx_buf);
+		rxm_free_rx_buf(rxm_ep, tx_buf);
+		return true;
 	}
 
-	return ret;
+	return false;
 }
 
-static int rxm_finish_send_rndv_ack(struct rxm_rx_buf *rx_buf)
+static void rxm_handle_sar_comp(struct rxm_ep *rxm_ep,
+				struct rxm_tx_buf *tx_buf)
+{
+	void *app_context;
+	uint64_t comp_flags, tx_flags;
+
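+	/* Save the completion fields first: rxm_complete_sar() frees tx_buf
+	 * once the last segment completes.
+	 */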
+	app_context = tx_buf->app_context;
+	comp_flags = ofi_tx_cq_flags(tx_buf->pkt.hdr.op);
+	tx_flags = tx_buf->flags;
+
+	if (!rxm_complete_sar(rxm_ep, tx_buf))
+		return;
+
+	rxm_cq_write_tx_comp(rxm_ep, comp_flags, app_context, tx_flags);
+	ofi_ep_tx_cntr_inc(&rxm_ep->util_ep);
+}
+
+static void rxm_rndv_rx_finish(struct rxm_rx_buf *rx_buf)
 {
 	RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_FINISH);
 
@@ -285,52 +276,91 @@ static int rxm_finish_send_rndv_ack(struct rxm_rx_buf *rx_buf)
 		rxm_msg_mr_closev(rx_buf->mr,
 				  rx_buf->recv_entry->rxm_iov.count);
 
-	return rxm_finish_recv(rx_buf, rx_buf->recv_entry->total_len);
+	rxm_finish_recv(rx_buf, rx_buf->recv_entry->total_len);
 }
 
-static int rxm_rndv_tx_finish(struct rxm_ep *rxm_ep,
-			      struct rxm_tx_rndv_buf *tx_buf)
+static void rxm_rndv_tx_finish(struct rxm_ep *rxm_ep,
+			       struct rxm_tx_buf *tx_buf)
 {
-	int ret;
+	assert(ofi_tx_cq_flags(tx_buf->pkt.hdr.op) & FI_SEND);
 
 	RXM_UPDATE_STATE(FI_LOG_CQ, tx_buf, RXM_RNDV_FINISH);
-
 	if (!rxm_ep->rdm_mr_local)
-		rxm_msg_mr_closev(tx_buf->mr, tx_buf->count);
+		rxm_msg_mr_closev(tx_buf->rma.mr, tx_buf->rma.count);
 
-	ret = rxm_cq_write_tx_comp(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op),
-				   tx_buf->app_context, tx_buf->flags);
+	rxm_cq_write_tx_comp(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op),
+			     tx_buf->app_context, tx_buf->flags);
 
-	assert(ofi_tx_cq_flags(tx_buf->pkt.hdr.op) & FI_SEND);
+	if (rxm_ep->rndv_ops == &rxm_rndv_ops_write &&
+	    tx_buf->write_rndv.done_buf) {
+		ofi_buf_free(tx_buf->write_rndv.done_buf);
+		tx_buf->write_rndv.done_buf = NULL;
+	}
 	ofi_ep_tx_cntr_inc(&rxm_ep->util_ep);
-
-	ofi_buf_free(tx_buf);
-
-	return ret;
+	rxm_free_rx_buf(rxm_ep, tx_buf);
 }
 
-static int rxm_rndv_handle_ack(struct rxm_ep *rxm_ep, struct rxm_rx_buf *rx_buf)
+static void rxm_rndv_handle_rd_done(struct rxm_ep *rxm_ep,
+				    struct rxm_rx_buf *rx_buf)
 {
-	struct rxm_tx_rndv_buf *tx_buf;
-	int ret;
-
-	tx_buf = ofi_bufpool_get_ibuf(rxm_ep->buf_pools[RXM_BUF_POOL_TX_RNDV].pool,
-				      rx_buf->pkt.ctrl_hdr.msg_id);
+	struct rxm_tx_buf *tx_buf;
 
 	FI_DBG(&rxm_prov, FI_LOG_CQ, "Got ACK for msg_id: 0x%" PRIx64 "\n",
 	       rx_buf->pkt.ctrl_hdr.msg_id);
 
+	tx_buf = ofi_bufpool_get_ibuf(rxm_ep->tx_pool,
+				      rx_buf->pkt.ctrl_hdr.msg_id);
 	assert(tx_buf->pkt.ctrl_hdr.msg_id == rx_buf->pkt.ctrl_hdr.msg_id);
 
 	rxm_rx_buf_free(rx_buf);
 
-	if (tx_buf->hdr.state == RXM_RNDV_ACK_WAIT) {
-		ret = rxm_rndv_tx_finish(rxm_ep, tx_buf);
+	if (tx_buf->hdr.state == RXM_RNDV_READ_DONE_WAIT) {
+		rxm_rndv_tx_finish(rxm_ep, tx_buf);
 	} else {
 		assert(tx_buf->hdr.state == RXM_RNDV_TX);
-		RXM_UPDATE_STATE(FI_LOG_CQ, tx_buf, RXM_RNDV_ACK_RECVD);
-		ret = 0;
+		RXM_UPDATE_STATE(FI_LOG_CQ, tx_buf, RXM_RNDV_READ_DONE_RECVD);
+	}
+}
+
+static int rxm_rndv_rx_match(struct dlist_entry *item, const void *arg)
+{
+	uint64_t msg_id = *((uint64_t *) arg);
+	struct rxm_rx_buf *rx_buf;
+
+	rx_buf = container_of(item, struct rxm_rx_buf, rndv_wait_entry);
+	return (msg_id == rx_buf->pkt.ctrl_hdr.msg_id);
+}
+
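+/* The peer has finished its RMA writes for a rendezvous transfer; find
+ * the receive waiting on this msg_id and complete it.
+ */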
+static int rxm_rndv_handle_wr_done(struct rxm_ep *rxm_ep, struct rxm_rx_buf *rx_buf)
+{
+	struct dlist_entry *rx_buf_entry;
+	struct rxm_rx_buf *rndv_rx_buf;
+	int ret = 0;
+
+	FI_DBG(&rxm_prov, FI_LOG_CQ, "Got DONE for msg_id: 0x%" PRIx64 "\n",
+	       rx_buf->pkt.ctrl_hdr.msg_id);
+
+	rx_buf_entry = dlist_remove_first_match(&rx_buf->ep->rndv_wait_list,
+						rxm_rndv_rx_match,
+						&rx_buf->pkt.ctrl_hdr.msg_id);
+	if (!rx_buf_entry) {
+		FI_WARN(&rxm_prov, FI_LOG_CQ,
+			"Failed to find rndv wait entry for msg_id: 0x%" PRIx64 "\n",
+			rx_buf->pkt.ctrl_hdr.msg_id);
+		ret = -FI_EINVAL;
+		goto out;
 	}
+	rndv_rx_buf = container_of(rx_buf_entry, struct rxm_rx_buf,
+				   rndv_wait_entry);
+
+	if (rndv_rx_buf->hdr.state == RXM_RNDV_WRITE_DONE_WAIT) {
+		rxm_rndv_rx_finish(rndv_rx_buf);
+	} else {
+		assert(rndv_rx_buf->hdr.state == RXM_RNDV_WRITE_DATA_SENT);
+		RXM_UPDATE_STATE(FI_LOG_CQ, rndv_rx_buf, RXM_RNDV_WRITE_DONE_RECVD);
+	}
+out:
+	rxm_rx_buf_free(rx_buf);
 	return ret;
 }
 
@@ -343,16 +373,24 @@ static int rxm_rx_buf_match_msg_id(struct dlist_entry *item, const void *arg)
 	return (msg_id == rx_buf->pkt.ctrl_hdr.msg_id);
 }
 
-static ssize_t rxm_process_seg_data(struct rxm_rx_buf *rx_buf, int *done)
+static void rxm_process_seg_data(struct rxm_rx_buf *rx_buf, int *done)
 {
-	uint64_t done_len;
-	ssize_t ret;
+	enum fi_hmem_iface iface;
+	uint64_t device;
+	ssize_t done_len;
+
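+	/* Derive the HMEM iface and device from the MR descriptors so the
+	 * copy also lands correctly in device (e.g. GPU) memory.
+	 */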
+	iface = rxm_mr_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.desc,
+					      rx_buf->recv_entry->rxm_iov.count,
+					      &device);
+
+	done_len = ofi_copy_to_hmem_iov(iface, device,
+					rx_buf->recv_entry->rxm_iov.iov,
+					rx_buf->recv_entry->rxm_iov.count,
+					rx_buf->recv_entry->sar.total_recv_len,
+					rx_buf->pkt.data,
+					rx_buf->pkt.ctrl_hdr.seg_size);
+	assert(done_len == rx_buf->pkt.ctrl_hdr.seg_size);
 
-	done_len = ofi_copy_to_iov(rx_buf->recv_entry->rxm_iov.iov,
-				   rx_buf->recv_entry->rxm_iov.count,
-				   rx_buf->recv_entry->sar.total_recv_len,
-				   rx_buf->pkt.data,
-				   rx_buf->pkt.ctrl_hdr.seg_size);
 	rx_buf->recv_entry->sar.total_recv_len += done_len;
 
 	if ((rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) == RXM_SAR_SEG_LAST) ||
@@ -367,19 +405,19 @@ static ssize_t rxm_process_seg_data(struct rxm_rx_buf *rx_buf, int *done)
 		rx_buf->recv_entry->sar.total_recv_len = 0;
 
 		*done = 1;
-		ret = rxm_finish_recv(rx_buf, done_len);
+		rxm_finish_recv(rx_buf, done_len);
 	} else {
 		if (rx_buf->recv_entry->sar.msg_id == RXM_SAR_RX_INIT) {
 			if (!rx_buf->conn) {
-				rx_buf->conn = rxm_key2conn(rx_buf->ep,
-							    rx_buf->pkt.ctrl_hdr.conn_id);
+				rx_buf->conn = ofi_idm_at(&rx_buf->ep->conn_idx_map,
+						(int) rx_buf->pkt.ctrl_hdr.conn_id);
 			}
 
 			rx_buf->recv_entry->sar.conn = rx_buf->conn;
 			rx_buf->recv_entry->sar.msg_id = rx_buf->pkt.ctrl_hdr.msg_id;
 
 			dlist_insert_tail(&rx_buf->recv_entry->sar.entry,
-					  &rx_buf->conn->sar_rx_msg_list);
+					  &rx_buf->conn->deferred_sar_msgs);
 		}
 
 		/* The RX buffer can be reposted for further re-use */
@@ -387,29 +425,26 @@ static ssize_t rxm_process_seg_data(struct rxm_rx_buf *rx_buf, int *done)
 		rxm_rx_buf_free(rx_buf);
 
 		*done = 0;
-		ret = FI_SUCCESS;
 	}
-	return ret;
 }
 
-static ssize_t rxm_handle_seg_data(struct rxm_rx_buf *rx_buf)
+static void rxm_handle_seg_data(struct rxm_rx_buf *rx_buf)
 {
 	struct rxm_recv_entry *recv_entry;
 	struct rxm_conn *conn;
 	uint64_t msg_id;
 	struct dlist_entry *entry;
-	ssize_t ret;
 	int done;
 
-	ret = rxm_process_seg_data(rx_buf, &done);
+	rxm_process_seg_data(rx_buf, &done);
 	if (done || !(rx_buf->ep->rxm_info->mode & FI_BUFFERED_RECV))
-		return ret;
+		return;
 
 	recv_entry = rx_buf->recv_entry;
 	conn = rx_buf->conn;
 	msg_id = rx_buf->pkt.ctrl_hdr.msg_id;
 
-	dlist_foreach_container_safe(&conn->sar_deferred_rx_msg_list,
+	dlist_foreach_container_safe(&conn->deferred_sar_segments,
 				     struct rxm_rx_buf, rx_buf,
 				     unexp_msg.entry, entry) {
 		if (!rxm_rx_buf_match_msg_id(&rx_buf->unexp_msg.entry, &msg_id))
@@ -417,78 +452,153 @@ static ssize_t rxm_handle_seg_data(struct rxm_rx_buf *rx_buf)
 
 		dlist_remove(&rx_buf->unexp_msg.entry);
 		rx_buf->recv_entry = recv_entry;
-		ret = rxm_process_seg_data(rx_buf, &done);
+		rxm_process_seg_data(rx_buf, &done);
 		if (done)
 			break;
 	}
-	return ret;
 }
 
-static ssize_t
-rxm_prepare_deferred_rndv_read(struct rxm_deferred_tx_entry **def_tx_entry,
-			       size_t index, struct iovec *iov,
-			       void *desc[RXM_IOV_LIMIT], size_t count,
-			       struct rxm_rx_buf *rx_buf)
+static ssize_t rxm_rndv_xfer(struct rxm_ep *rxm_ep, struct fid_ep *msg_ep,
+			     struct rxm_rndv_hdr *remote_hdr, struct iovec *local_iov,
+			     void **local_desc, size_t local_count, size_t total_len,
+			     void *context)
 {
-	uint8_t i;
+	size_t i, index = 0, offset = 0, count, copy_len;
+	struct iovec iov[RXM_IOV_LIMIT];
+	void *desc[RXM_IOV_LIMIT];
+	ssize_t ret = FI_SUCCESS;
 
-	*def_tx_entry = rxm_ep_alloc_deferred_tx_entry(rx_buf->ep, rx_buf->conn,
-						       RXM_DEFERRED_TX_RNDV_READ);
-	if (!*def_tx_entry)
-		return -FI_ENOMEM;
+	for (i = 0; i < remote_hdr->count && total_len > 0; i++) {
+		copy_len = MIN(remote_hdr->iov[i].len, total_len);
+
+		ret = ofi_copy_iov_desc(&iov[0], &desc[0], &count,
+					&local_iov[0],
+					&local_desc[0],
+					local_count,
+					&index, &offset, copy_len);
+		if (ret)
+			return ret;
+		total_len -= copy_len;
+		ret = rxm_ep->rndv_ops->xfer(msg_ep, iov, desc, count, 0,
+			       remote_hdr->iov[i].addr, remote_hdr->iov[i].key,
+			       context);
+
+		if (ret) {
+			if (ret == -FI_EAGAIN) {
+				struct rxm_deferred_tx_entry *def_tx_entry;
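+				/* The msg provider's TX queue is full; defer
+				 * this segment and retry it from the deferred
+				 * TX progress path.
+				 */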
 
-	(*def_tx_entry)->rndv_read.rx_buf = rx_buf;
-	(*def_tx_entry)->rndv_read.rma_iov.addr =
-			rx_buf->rndv_hdr->iov[index].addr;
-	(*def_tx_entry)->rndv_read.rma_iov.key =
-			rx_buf->rndv_hdr->iov[index].key;
+				ret = rxm_ep->rndv_ops->defer_xfer(
+					&def_tx_entry, i, iov, desc, count,
+					context);
 
-	for (i = 0; i < count; i++) {
-		(*def_tx_entry)->rndv_read.rxm_iov.iov[i] = iov[i];
-		(*def_tx_entry)->rndv_read.rxm_iov.desc[i] = desc[i];
+				if (ret)
+					break;
+				rxm_queue_deferred_tx(def_tx_entry, OFI_LIST_TAIL);
+				continue;
+			}
+			break;
+		}
 	}
-	(*def_tx_entry)->rndv_read.rxm_iov.count = count;
+	assert(!total_len);
+	return ret;
+}
 
-	return 0;
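+/* Receiver side of a rendezvous transfer: RMA-read the payload directly
+ * from the sender's registered buffers described in the rndv header.
+ */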
+ssize_t rxm_rndv_read(struct rxm_rx_buf *rx_buf)
+{
+	ssize_t ret;
+	size_t total_len;
+
+	total_len = MIN(rx_buf->recv_entry->total_len, rx_buf->pkt.hdr.size);
+	RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_READ);
+
+	ret = rxm_rndv_xfer(rx_buf->ep, rx_buf->conn->msg_ep,
+			    rx_buf->remote_rndv_hdr,
+			    rx_buf->recv_entry->rxm_iov.iov,
+			    rx_buf->recv_entry->rxm_iov.desc,
+			    rx_buf->recv_entry->rxm_iov.count, total_len,
+			    rx_buf);
+	if (ret) {
+		rxm_cq_write_error(rx_buf->ep->util_ep.rx_cq,
+				   rx_buf->ep->util_ep.rx_cntr, rx_buf, ret);
+	}
+	return ret;
 }
 
-static ssize_t rxm_handle_rndv(struct rxm_rx_buf *rx_buf)
+static ssize_t rxm_rndv_handle_wr_data(struct rxm_rx_buf *rx_buf)
 {
-	size_t i, index = 0, offset = 0, count, total_recv_len;
-	struct iovec iov[RXM_IOV_LIMIT];
-	void *desc[RXM_IOV_LIMIT];
-	int ret = 0;
+	int i;
+	ssize_t ret;
+	struct rxm_tx_buf *tx_buf;
+	size_t total_len, rma_len = 0;
+	struct rxm_rndv_hdr *rx_hdr = (struct rxm_rndv_hdr *) rx_buf->pkt.data;
 
+	tx_buf = ofi_bufpool_get_ibuf(rx_buf->ep->tx_pool,
+				      rx_buf->pkt.ctrl_hdr.msg_id);
+	total_len = tx_buf->pkt.hdr.size;
+
+	tx_buf->write_rndv.remote_hdr.count = rx_hdr->count;
+	memcpy(tx_buf->write_rndv.remote_hdr.iov, rx_hdr->iov,
+	       rx_hdr->count * sizeof(rx_hdr->iov[0]));
+	// calculate number of RMA writes required to complete the transfer.
+	// there may be fewer than iov-count RMA writes required,
+	// depending on differences between remote and local IOV sizes.
+	for (i = 0; i < tx_buf->write_rndv.remote_hdr.count; i++) {
+		if (total_len > rma_len) {
+			tx_buf->write_rndv.rndv_rma_count++;
+			rma_len += tx_buf->write_rndv.remote_hdr.iov[i].len;
+		}
+	}
+
+	/* BUG: This is forcing a state change without knowing what state
+	 * we're currently in.  This loses track of whether we processed the
+	 * completion for the original send request.  Valid states here are
+	 * RXM_RNDV_TX or RXM_RNDV_WRITE_DATA_WAIT.
+	 */
+	RXM_UPDATE_STATE(FI_LOG_CQ, tx_buf, RXM_RNDV_WRITE);
+
+	ret = rxm_rndv_xfer(rx_buf->ep, tx_buf->write_rndv.conn->msg_ep, rx_hdr,
+			    tx_buf->write_rndv.iov, tx_buf->write_rndv.desc,
+			    tx_buf->rma.count, total_len, tx_buf);
 
-	/* En-queue new rx buf to be posted ASAP so that we don't block any
-	* incoming messages. RNDV processing can take a while. */
-	ret = rxm_repost_new_rx(rx_buf);
 	if (ret)
-		return ret;
+		rxm_cq_write_error(rx_buf->ep->util_ep.rx_cq,
+				   rx_buf->ep->util_ep.rx_cntr,
+				   tx_buf, ret);
+	rxm_rx_buf_free(rx_buf);
+	return ret;
+}
+
+static ssize_t rxm_handle_rndv(struct rxm_rx_buf *rx_buf)
+{
+	int ret = 0, i;
+	size_t total_recv_len;
+
+	rxm_replace_rx_buf(rx_buf);
 
 	if (!rx_buf->conn) {
 		assert(rx_buf->ep->srx_ctx);
-		rx_buf->conn = rxm_key2conn(rx_buf->ep,
-					    rx_buf->pkt.ctrl_hdr.conn_id);
+		rx_buf->conn = ofi_idm_at(&rx_buf->ep->conn_idx_map,
+					  (int) rx_buf->pkt.ctrl_hdr.conn_id);
 		if (!rx_buf->conn)
 			return -FI_EOTHER;
 	}
 	assert(rx_buf->conn);
 
 	FI_DBG(&rxm_prov, FI_LOG_CQ,
-	       "Got incoming recv with msg_id: 0x%" PRIx64 "\n",
+	       "Got incoming rndv req with msg_id: 0x%" PRIx64 "\n",
 	       rx_buf->pkt.ctrl_hdr.msg_id);
 
-	rx_buf->rndv_hdr = (struct rxm_rndv_hdr *) rx_buf->pkt.data;
+	rx_buf->remote_rndv_hdr = (struct rxm_rndv_hdr *) rx_buf->pkt.data;
 	rx_buf->rndv_rma_index = 0;
 
 	if (!rx_buf->ep->rdm_mr_local) {
 		total_recv_len = MIN(rx_buf->recv_entry->total_len,
 				     rx_buf->pkt.hdr.size);
-		ret = rxm_msg_mr_regv(rx_buf->ep,
-				      rx_buf->recv_entry->rxm_iov.iov,
+		ret = rxm_msg_mr_regv(rx_buf->ep, rx_buf->recv_entry->rxm_iov.iov,
 				      rx_buf->recv_entry->rxm_iov.count,
-				      total_recv_len, FI_READ, rx_buf->mr);
+				      total_recv_len,
+				      rx_buf->ep->rndv_ops->rx_mr_access,
+				      rx_buf->mr);
 		if (ret)
 			return ret;
 
@@ -498,100 +608,77 @@ static ssize_t rxm_handle_rndv(struct rxm_rx_buf *rx_buf)
 						fi_mr_desc(rx_buf->mr[i]);
 		}
 	} else {
+		struct rxm_mr *mr;
+
 		for (i = 0; i < rx_buf->recv_entry->rxm_iov.count; i++) {
+			mr = rx_buf->recv_entry->rxm_iov.desc[i];
 			rx_buf->recv_entry->rxm_iov.desc[i] =
-				fi_mr_desc(rx_buf->recv_entry->rxm_iov.desc[i]);
+				fi_mr_desc(mr->msg_mr);
 		}
-		total_recv_len = MIN(rx_buf->recv_entry->total_len,
-				     rx_buf->pkt.hdr.size);
 	}
 
-	assert(rx_buf->rndv_hdr->count &&
-	       (rx_buf->rndv_hdr->count <= RXM_IOV_LIMIT));
+	assert(rx_buf->remote_rndv_hdr->count &&
+	       (rx_buf->remote_rndv_hdr->count <= RXM_IOV_LIMIT));
 
-	RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_READ);
+	return rx_buf->ep->rndv_ops->handle_rx(rx_buf);
+}
 
-	for (i = 0; i < rx_buf->rndv_hdr->count; i++) {
-		size_t copy_len = MIN(rx_buf->rndv_hdr->iov[i].len,
-				      total_recv_len);
+void rxm_handle_eager(struct rxm_rx_buf *rx_buf)
+{
+	enum fi_hmem_iface iface;
+	uint64_t device;
+	ssize_t done_len;
 
-		ret = ofi_copy_iov_desc(&iov[0], &desc[0], &count,
-					&rx_buf->recv_entry->rxm_iov.iov[0],
-					&rx_buf->recv_entry->rxm_iov.desc[0],
-					rx_buf->recv_entry->rxm_iov.count,
-					&index, &offset, copy_len);
-		if (ret) {
-			assert(ret == -FI_ETOOSMALL);
-			return rxm_cq_write_error_trunc(rx_buf, rx_buf->
-							recv_entry->total_len);
-		}
-		total_recv_len -= copy_len;
-		ret = fi_readv(rx_buf->conn->msg_ep, iov, desc, count, 0,
-			       rx_buf->rndv_hdr->iov[i].addr,
-			       rx_buf->rndv_hdr->iov[i].key, rx_buf);
-		if (ret) {
-			if (ret == -FI_EAGAIN) {
-				struct rxm_deferred_tx_entry *def_tx_entry;
+	iface = rxm_mr_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.desc,
+					      rx_buf->recv_entry->rxm_iov.count,
+					      &device);
 
-				ret = rxm_prepare_deferred_rndv_read(
-						&def_tx_entry, i, iov, desc,
-						count, rx_buf);
-				if (ret)
-					goto readv_err;
-				rxm_ep_enqueue_deferred_tx_queue(def_tx_entry);
-				continue;
-			}
-readv_err:
-			rxm_cq_write_error(rx_buf->ep->util_ep.rx_cq,
-					   rx_buf->ep->util_ep.rx_cntr,
-					   rx_buf->recv_entry->context, ret);
-			break;
-		}
-	}
-	assert(!total_recv_len);
-	return ret;
+	done_len = ofi_copy_to_hmem_iov(iface, device,
+					rx_buf->recv_entry->rxm_iov.iov,
+					rx_buf->recv_entry->rxm_iov.count, 0,
+					rx_buf->data, rx_buf->pkt.hdr.size);
+	assert(done_len == rx_buf->pkt.hdr.size);
+
+	rxm_finish_recv(rx_buf, done_len);
 }
 
-ssize_t rxm_handle_eager(struct rxm_rx_buf *rx_buf)
+void rxm_handle_coll_eager(struct rxm_rx_buf *rx_buf)
 {
-	uint64_t done_len;
+	enum fi_hmem_iface iface;
+	uint64_t device;
+	ssize_t done_len;
 
-	done_len = ofi_copy_to_iov(rx_buf->recv_entry->rxm_iov.iov,
-				   rx_buf->recv_entry->rxm_iov.count,
-				   0, rx_buf->pkt.data, rx_buf->pkt.hdr.size);
-	return rxm_finish_recv(rx_buf, done_len);
-}
+	iface = rxm_mr_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.desc,
+					      rx_buf->recv_entry->rxm_iov.count,
+					      &device);
 
-ssize_t rxm_handle_coll_eager(struct rxm_rx_buf *rx_buf)
-{
-	uint64_t done_len;
-	ssize_t ret;
+	done_len = ofi_copy_to_hmem_iov(iface, device,
+					rx_buf->recv_entry->rxm_iov.iov,
+					rx_buf->recv_entry->rxm_iov.count, 0,
+					rx_buf->data, rx_buf->pkt.hdr.size);
+	assert(done_len == rx_buf->pkt.hdr.size);
 
-	done_len = ofi_copy_to_iov(rx_buf->recv_entry->rxm_iov.iov,
-				   rx_buf->recv_entry->rxm_iov.count,
-				   0, rx_buf->pkt.data, rx_buf->pkt.hdr.size);
 	if (rx_buf->pkt.hdr.tag & OFI_COLL_TAG_FLAG) {
 		ofi_coll_handle_xfer_comp(rx_buf->pkt.hdr.tag,
 				rx_buf->recv_entry->context);
+		rxm_recv_entry_release(rx_buf->recv_entry);
 		rxm_rx_buf_free(rx_buf);
-		rxm_recv_entry_release(rx_buf->recv_entry->recv_queue,
-				rx_buf->recv_entry);
-		ret = FI_SUCCESS;
 	} else {
-		ret = rxm_finish_recv(rx_buf, done_len);
+		rxm_finish_recv(rx_buf, done_len);
 	}
-	return ret;
 }
 
 ssize_t rxm_handle_rx_buf(struct rxm_rx_buf *rx_buf)
 {
 	switch (rx_buf->pkt.ctrl_hdr.type) {
 	case rxm_ctrl_eager:
-		return rx_buf->ep->eager_ops->handle_rx(rx_buf);
-	case rxm_ctrl_rndv:
+		rx_buf->ep->eager_ops->handle_rx(rx_buf);
+		return 0;
+	case rxm_ctrl_rndv_req:
 		return rxm_handle_rndv(rx_buf);
 	case rxm_ctrl_seg:
-		return rxm_handle_seg_data(rx_buf);
+		rxm_handle_seg_data(rx_buf);
+		return 0;
 	default:
 		FI_WARN(&rxm_prov, FI_LOG_CQ, "Unknown message type\n");
 		assert(0);
@@ -599,17 +686,65 @@ ssize_t rxm_handle_rx_buf(struct rxm_rx_buf *rx_buf)
 	}
 }
 
+static void rxm_adjust_multi_recv(struct rxm_rx_buf *rx_buf)
+{
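+	/* Carve the consumed bytes off the front of an FI_MULTI_RECV buffer
+	 * and, if enough space remains, repost the remainder as a new
+	 * receive entry.
+	 */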
+	struct rxm_recv_entry *recv_entry;
+	struct iovec new_iov;
+	size_t recv_size;
+
+	recv_size = rx_buf->pkt.hdr.size;
+
+	if (rx_buf->recv_entry->rxm_iov.iov[0].iov_len < recv_size ||
+	    rx_buf->recv_entry->rxm_iov.iov[0].iov_len - recv_size <
+	    rx_buf->ep->min_multi_recv_size)
+		return;
+
+	new_iov.iov_base = (uint8_t *)
+		rx_buf->recv_entry->rxm_iov.iov[0].iov_base + recv_size;
+	new_iov.iov_len = rx_buf->recv_entry->rxm_iov.iov[0].iov_len - recv_size;
+
+	rx_buf->recv_entry->rxm_iov.iov[0].iov_len = recv_size;
+
+	recv_entry = rxm_multi_recv_entry_get(rx_buf->ep, &new_iov,
+					rx_buf->recv_entry->rxm_iov.desc, 1,
+					rx_buf->recv_entry->addr,
+					rx_buf->recv_entry->tag,
+					rx_buf->recv_entry->ignore,
+					rx_buf->recv_entry->context,
+					rx_buf->recv_entry->flags);
+
+	rx_buf->recv_entry->flags &= ~FI_MULTI_RECV;
+
+	dlist_insert_head(&recv_entry->entry, &rx_buf->ep->recv_queue.recv_list);
+}
+
 static ssize_t
 rxm_match_rx_buf(struct rxm_rx_buf *rx_buf,
 		 struct rxm_recv_queue *recv_queue,
-		    struct rxm_recv_match_attr *match_attr)
+		 struct rxm_recv_match_attr *match_attr)
 {
 	struct dlist_entry *entry;
 
+	/* Dynamic receive buffers may have already matched */
+	if (rx_buf->recv_entry) {
+		if (rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_rndv_req)
+			return rxm_handle_rndv(rx_buf);
+
+		rxm_finish_recv(rx_buf, rx_buf->pkt.hdr.size);
+		return 0;
+	}
+
+	if (recv_queue->dyn_rbuf_unexp_cnt)
+		recv_queue->dyn_rbuf_unexp_cnt--;
+
 	entry = dlist_remove_first_match(&recv_queue->recv_list,
 					 recv_queue->match_recv, match_attr);
 	if (entry) {
 		rx_buf->recv_entry = container_of(entry, struct rxm_recv_entry, entry);
+
+		if (rx_buf->recv_entry->flags & FI_MULTI_RECV)
+			rxm_adjust_multi_recv(rx_buf);
+
 		return rxm_handle_rx_buf(rx_buf);
 	}
 
@@ -621,10 +756,8 @@ rxm_match_rx_buf(struct rxm_rx_buf *rx_buf,
 
 	dlist_insert_tail(&rx_buf->unexp_msg.entry,
 			  &recv_queue->unexp_msg_list);
-
-	// repost a new buffer now since we don't know when the unexpected
-	// buffer will be consumed
-	return rxm_repost_new_rx(rx_buf);
+	rxm_replace_rx_buf(rx_buf);
+	return 0;
 }
 
 static ssize_t rxm_handle_recv_comp(struct rxm_rx_buf *rx_buf)
@@ -635,15 +768,17 @@ static ssize_t rxm_handle_recv_comp(struct rxm_rx_buf *rx_buf)
 
 	if (rx_buf->ep->rxm_info->caps & (FI_SOURCE | FI_DIRECTED_RECV)) {
 		if (rx_buf->ep->srx_ctx)
-			rx_buf->conn = rxm_key2conn(rx_buf->ep, rx_buf->
-						    pkt.ctrl_hdr.conn_id);
+			rx_buf->conn = ofi_idm_at(&rx_buf->ep->conn_idx_map,
+					(int) rx_buf->pkt.ctrl_hdr.conn_id);
 		if (!rx_buf->conn)
 			return -FI_EOTHER;
-		match_attr.addr = rx_buf->conn->handle.fi_addr;
+		match_attr.addr = rx_buf->conn->peer->fi_addr;
 	}
 
-	if (rx_buf->ep->rxm_info->mode & FI_BUFFERED_RECV)
-		return rxm_finish_buf_recv(rx_buf);
+	if (rx_buf->ep->rxm_info->mode & FI_BUFFERED_RECV) {
+		rxm_finish_buf_recv(rx_buf);
+		return 0;
+	}
 
 	switch(rx_buf->pkt.hdr.op) {
 	case ofi_op_msg:
@@ -675,15 +810,15 @@ static ssize_t rxm_sar_handle_segment(struct rxm_rx_buf *rx_buf)
 {
 	struct dlist_entry *sar_entry;
 
-	rx_buf->conn = rxm_key2conn(rx_buf->ep,
-				    rx_buf->pkt.ctrl_hdr.conn_id);
+	rx_buf->conn = ofi_idm_at(&rx_buf->ep->conn_idx_map,
+				  (int) rx_buf->pkt.ctrl_hdr.conn_id);
 	if (!rx_buf->conn)
 		return -FI_EOTHER;
 
 	FI_DBG(&rxm_prov, FI_LOG_CQ,
 	       "Got incoming recv with msg_id: 0x%" PRIx64 " for conn - %p\n",
 	       rx_buf->pkt.ctrl_hdr.msg_id, rx_buf->conn);
-	sar_entry = dlist_find_first_match(&rx_buf->conn->sar_rx_msg_list,
+	sar_entry = dlist_find_first_match(&rx_buf->conn->deferred_sar_msgs,
 					   rxm_sar_match_msg_id,
 					   &rx_buf->pkt.ctrl_hdr.msg_id);
 	if (!sar_entry)
@@ -691,122 +826,187 @@ static ssize_t rxm_sar_handle_segment(struct rxm_rx_buf *rx_buf)
 
 	rx_buf->recv_entry = container_of(sar_entry, struct rxm_recv_entry,
 					  sar.entry);
-	return rxm_handle_seg_data(rx_buf);
+	rxm_handle_seg_data(rx_buf);
+	return 0;
 }
 
-static ssize_t rxm_rndv_send_ack_inject(struct rxm_rx_buf *rx_buf)
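+/* The receiver has finished RMA-reading the rendezvous data.  Notify the
+ * sender so that it can complete the send and release the source buffer. */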
+static void rxm_rndv_send_rd_done(struct rxm_rx_buf *rx_buf)
 {
-	struct rxm_pkt pkt = {
-		.hdr.op = ofi_op_msg,
-		.hdr.version = OFI_OP_VERSION,
-		.ctrl_hdr.version = RXM_CTRL_VERSION,
-		.ctrl_hdr.type = rxm_ctrl_rndv_ack,
-		.ctrl_hdr.conn_id = rx_buf->conn->handle.remote_key,
-		.ctrl_hdr.msg_id = rx_buf->pkt.ctrl_hdr.msg_id
-	};
-	struct iovec iov = {
-		.iov_base = &pkt,
-		.iov_len = sizeof(pkt),
-	};
-	struct fi_msg msg = {
-		.msg_iov = &iov,
-		.iov_count = 1,
-		.context = rx_buf,
-	};
+	struct rxm_deferred_tx_entry *def_entry;
+	struct rxm_tx_buf *buf;
+	ssize_t ret;
+
+	assert(rx_buf->conn);
+	assert(rx_buf->hdr.state == RXM_RNDV_READ);
+	buf = ofi_buf_alloc(rx_buf->ep->tx_pool);
+	if (!buf) {
+		ret = -FI_ENOMEM;
+		goto err;
+	}
+
+	rx_buf->recv_entry->rndv.tx_buf = buf;
+
+	buf->pkt.ctrl_hdr.type = rxm_ctrl_rndv_rd_done;
+	buf->pkt.ctrl_hdr.conn_id = rx_buf->conn->remote_index;
+	buf->pkt.ctrl_hdr.msg_id = rx_buf->pkt.ctrl_hdr.msg_id;
+
+	ret = fi_send(rx_buf->conn->msg_ep, &buf->pkt, sizeof(buf->pkt),
+		      buf->hdr.desc, 0, rx_buf);
+	if (ret) {
+		if (ret == -FI_EAGAIN) {
+			def_entry = rxm_ep_alloc_deferred_tx_entry(rx_buf->ep,
+						rx_buf->conn,
+						RXM_DEFERRED_TX_RNDV_ACK);
+			if (def_entry) {
+				def_entry->rndv_ack.rx_buf = rx_buf;
+				def_entry->rndv_ack.pkt_size = sizeof(rx_buf->pkt);
+				rxm_queue_deferred_tx(def_entry, OFI_LIST_TAIL);
+				return;
+			}
+		}
+		goto free;
+	}
+
+	RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_READ_DONE_SENT);
+	return;
 
-	return fi_sendmsg(rx_buf->conn->msg_ep, &msg, FI_INJECT);
+free:
+	ofi_buf_free(buf);
+	rx_buf->recv_entry->rndv.tx_buf = NULL;
+err:
+	FI_WARN(&rxm_prov, FI_LOG_CQ,
+		"unable to allocate/send rd rndv ack: %s\n",
+		fi_strerror((int) ret));
+	assert(0);
+	/* TODO: Allocate all resources needed when the original message
+	 * receive request arrives, to avoid allocation failures.
+	 * On other failures, we need to fail the receive.
+	 */
 }
 
-static ssize_t rxm_rndv_send_ack(struct rxm_rx_buf *rx_buf)
+static void
+rxm_rndv_send_wr_done(struct rxm_ep *rxm_ep, struct rxm_tx_buf *tx_buf)
 {
-	struct rxm_deferred_tx_entry *def_tx_entry;
+	struct rxm_deferred_tx_entry *def_entry;
+	struct rxm_tx_buf *buf;
 	ssize_t ret;
 
-	assert(rx_buf->conn);
+	assert(tx_buf->hdr.state == RXM_RNDV_WRITE);
+	buf = ofi_buf_alloc(rxm_ep->tx_pool);
+	if (!buf) {
+		ret = -FI_ENOMEM;
+		goto err;
+	}
 
-	if (sizeof(rx_buf->pkt) <= rx_buf->ep->inject_limit) {
-		ret = rxm_rndv_send_ack_inject(rx_buf);
-		if (!ret)
-			goto out;
+	tx_buf->write_rndv.done_buf = buf;
 
-		if (ret != -FI_EAGAIN) {
-			FI_WARN(&rxm_prov, FI_LOG_CQ,
-				"send ack via inject failed for MSG provider\n");
-			return ret;
+	buf->pkt.ctrl_hdr.type = rxm_ctrl_rndv_wr_done;
+	buf->pkt.ctrl_hdr.conn_id = tx_buf->pkt.ctrl_hdr.conn_id;
+	buf->pkt.ctrl_hdr.msg_id = tx_buf->pkt.ctrl_hdr.msg_id;
+
+	ret = fi_send(tx_buf->write_rndv.conn->msg_ep, &buf->pkt,
+		      sizeof(buf->pkt), buf->hdr.desc, 0, tx_buf);
+	if (ret) {
+		if (ret == -FI_EAGAIN) {
+			def_entry = rxm_ep_alloc_deferred_tx_entry(rxm_ep,
+						tx_buf->write_rndv.conn,
+						RXM_DEFERRED_TX_RNDV_DONE);
+			if (def_entry) {
+				def_entry->rndv_done.tx_buf = tx_buf;
+				rxm_queue_deferred_tx(def_entry, OFI_LIST_TAIL);
+				return;
+			}
 		}
+		goto free;
 	}
 
-	rx_buf->recv_entry->rndv.tx_buf = rxm_tx_buf_alloc(rx_buf->ep,
-							   RXM_BUF_POOL_TX_ACK);
-	if (!rx_buf->recv_entry->rndv.tx_buf) {
-		FI_WARN(&rxm_prov, FI_LOG_CQ,
-			"ran out of buffers from ACK buffer pool\n");
-		return -FI_EAGAIN;
+	RXM_UPDATE_STATE(FI_LOG_CQ, tx_buf, RXM_RNDV_WRITE_DONE_SENT);
+	return;
+
+free:
+	ofi_buf_free(buf);
+	tx_buf->write_rndv.done_buf = NULL;
+err:
+	FI_WARN(&rxm_prov, FI_LOG_CQ,
+		"unable to allocate/send wr rndv ack: %s\n",
+		fi_strerror((int) ret));
+	assert(0);
+	/* TODO: Allocate all resources needed prior to initiating the
+	 * original message send request, to avoid allocation failures.
+	 * On other failures, we need to fail the original message.
+	 */
+}
+
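+/* Advertise the receiver's registered buffers to the sender so that the
+ * rendezvous transfer can complete using RMA writes from the send side. */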
+ssize_t rxm_rndv_send_wr_data(struct rxm_rx_buf *rx_buf)
+{
+	struct rxm_deferred_tx_entry *def_entry;
+	struct rxm_tx_buf *buf;
+	ssize_t ret;
+
+	assert(rx_buf->conn);
+
+	buf = ofi_buf_alloc(rx_buf->ep->tx_pool);
+	if (!buf) {
+		ret = -FI_ENOMEM;
+		goto err;
 	}
-	assert(rx_buf->recv_entry->rndv.tx_buf->pkt.ctrl_hdr.type ==
-	       rxm_ctrl_rndv_ack);
 
-	assert(rx_buf->hdr.state == RXM_RNDV_READ);
+	rx_buf->recv_entry->rndv.tx_buf = buf;
 
-	rx_buf->recv_entry->rndv.tx_buf->pkt.ctrl_hdr.conn_id = rx_buf->conn->
-								handle.remote_key;
-	rx_buf->recv_entry->rndv.tx_buf->pkt.ctrl_hdr.msg_id = rx_buf->pkt.
-							       ctrl_hdr.msg_id;
+	buf->pkt.ctrl_hdr.type = rxm_ctrl_rndv_wr_data;
+	buf->pkt.ctrl_hdr.conn_id = rx_buf->conn->remote_index;
+	buf->pkt.ctrl_hdr.msg_id = rx_buf->pkt.ctrl_hdr.msg_id;
+	rxm_rndv_hdr_init(rx_buf->ep, buf->pkt.data,
+			  rx_buf->recv_entry->rxm_iov.iov,
+			  rx_buf->recv_entry->rxm_iov.count, rx_buf->mr);
 
-	ret = fi_send(rx_buf->conn->msg_ep, &rx_buf->recv_entry->rndv.tx_buf->pkt,
-		      sizeof(rx_buf->recv_entry->rndv.tx_buf->pkt),
-		      rx_buf->recv_entry->rndv.tx_buf->hdr.desc, 0, rx_buf);
+	ret = fi_send(rx_buf->conn->msg_ep, &buf->pkt, sizeof(buf->pkt) +
+		      sizeof(struct rxm_rndv_hdr), buf->hdr.desc, 0, rx_buf);
 	if (ret) {
 		if (ret == -FI_EAGAIN) {
-			def_tx_entry = rxm_ep_alloc_deferred_tx_entry(rx_buf->ep,
-					rx_buf->conn, RXM_DEFERRED_TX_RNDV_ACK);
-			if (!def_tx_entry) {
-				FI_WARN(&rxm_prov, FI_LOG_CQ, "unable to "
-					"allocate TX entry for deferred ACK\n");
-				ret = -FI_EAGAIN;
-				goto err;
+			def_entry = rxm_ep_alloc_deferred_tx_entry(rx_buf->ep,
+						rx_buf->conn,
+						RXM_DEFERRED_TX_RNDV_ACK);
+			if (def_entry) {
+				def_entry->rndv_ack.rx_buf = rx_buf;
+				def_entry->rndv_ack.pkt_size =
+						sizeof(buf->pkt) +
+						sizeof(struct rxm_rndv_hdr);
+				rxm_queue_deferred_tx(def_entry, OFI_LIST_TAIL);
+				return 0;
 			}
-
-			def_tx_entry->rndv_ack.rx_buf = rx_buf;
-			rxm_ep_enqueue_deferred_tx_queue(def_tx_entry);
-			return 0;
-		} else {
-			FI_WARN(&rxm_prov, FI_LOG_CQ,
-				"unable to send ACK: %zd\n", ret);
 		}
-		goto err;
+		goto free;
 	}
-out:
-	RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_ACK_SENT);
+	RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_WRITE_DATA_SENT);
 	return 0;
+
+free:
+	ofi_buf_free(buf);
+	rx_buf->recv_entry->rndv.tx_buf = NULL;
 err:
-	ofi_buf_free(rx_buf->recv_entry->rndv.tx_buf);
-	return ret;
+	FI_WARN(&rxm_prov, FI_LOG_CQ,
+		"unable to allocate/send wr rndv ready: %s\n",
+		fi_strerror((int) ret));
+	assert(0);
+	/* TODO: Sender will be blocked forever waiting for a response
+	 * that will not come.  Need to tear down communication.
+	 */
+	return 0;
 }
 
-
-
-static int rxm_handle_remote_write(struct rxm_ep *rxm_ep,
+static void rxm_handle_remote_write(struct rxm_ep *rxm_ep,
 				   struct fi_cq_data_entry *comp)
 {
-	int ret;
-
-	FI_DBG(&rxm_prov, FI_LOG_CQ, "writing remote write completion\n");
-	ret = ofi_cq_write(rxm_ep->util_ep.rx_cq, NULL, comp->flags, 0, NULL,
-			   comp->data, 0);
-	if (ret) {
-		FI_WARN(&rxm_prov, FI_LOG_CQ,
-				"Unable to write remote write completion\n");
-		return ret;
-	}
+	rxm_cq_write(rxm_ep->util_ep.rx_cq, NULL, comp->flags, 0, NULL,
+		     comp->data, 0);
 	ofi_ep_rem_wr_cntr_inc(&rxm_ep->util_ep);
 	if (comp->op_context)
 		rxm_rx_buf_free(comp->op_context);
-	return 0;
 }
 
 static void rxm_format_atomic_resp_pkt_hdr(struct rxm_conn *rxm_conn,
-					   struct rxm_tx_atomic_buf *tx_buf,
+					   struct rxm_tx_buf *tx_buf,
 					   size_t data_len, uint32_t pkt_op,
 					   enum fi_datatype datatype,
 					   uint8_t atomic_op)
@@ -822,36 +1022,36 @@ static void rxm_format_atomic_resp_pkt_hdr(struct rxm_conn *rxm_conn,
 
 static ssize_t rxm_atomic_send_resp(struct rxm_ep *rxm_ep,
 				    struct rxm_rx_buf *rx_buf,
-				    struct rxm_tx_atomic_buf *resp_buf,
+				    struct rxm_tx_buf *resp_buf,
 				    ssize_t result_len, uint32_t status)
 {
 	struct rxm_deferred_tx_entry *def_tx_entry;
 	struct rxm_atomic_resp_hdr *atomic_hdr;
 	ssize_t ret;
-	ssize_t resp_len;
+	size_t data_len, tot_len;
 
-	resp_len = result_len + sizeof(struct rxm_atomic_resp_hdr) +
-		   sizeof(struct rxm_pkt);
+	data_len = result_len + sizeof(struct rxm_atomic_resp_hdr);
+	tot_len = data_len + sizeof(struct rxm_pkt);
 
 	resp_buf->hdr.state = RXM_ATOMIC_RESP_SENT;
-	rxm_format_atomic_resp_pkt_hdr(rx_buf->conn, resp_buf, resp_len,
+	rxm_format_atomic_resp_pkt_hdr(rx_buf->conn, resp_buf, data_len,
 				       rx_buf->pkt.hdr.op,
 				       rx_buf->pkt.hdr.atomic.datatype,
 				       rx_buf->pkt.hdr.atomic.op);
-	resp_buf->pkt.ctrl_hdr.conn_id = rx_buf->conn->handle.remote_key;
+	resp_buf->pkt.ctrl_hdr.conn_id = rx_buf->conn->remote_index;
 	resp_buf->pkt.ctrl_hdr.msg_id = rx_buf->pkt.ctrl_hdr.msg_id;
 	atomic_hdr = (struct rxm_atomic_resp_hdr *) resp_buf->pkt.data;
 	atomic_hdr->status = htonl(status);
 	atomic_hdr->result_len = htonl(result_len);
 
-	if (resp_len < rxm_ep->inject_limit) {
+	if (tot_len < rxm_ep->inject_limit) {
 		ret = fi_inject(rx_buf->conn->msg_ep, &resp_buf->pkt,
-				resp_len, 0);
+				tot_len, 0);
 		if (!ret)
 			ofi_buf_free(resp_buf);
 	} else {
 		ret = rxm_atomic_send_respmsg(rxm_ep, rx_buf->conn, resp_buf,
-					      resp_len);
+					      tot_len);
 	}
 	if (ret) {
 		FI_WARN(&rxm_prov, FI_LOG_CQ,
@@ -868,8 +1068,8 @@ static ssize_t rxm_atomic_send_resp(struct rxm_ep *rxm_ep,
 			}
 
 			def_tx_entry->atomic_resp.tx_buf = resp_buf;
-			def_tx_entry->atomic_resp.len = resp_len;
-			rxm_ep_enqueue_deferred_tx_queue(def_tx_entry);
+			def_tx_entry->atomic_resp.len = tot_len;
+			rxm_queue_deferred_tx(def_tx_entry, OFI_LIST_TAIL);
 			ret = 0;
 		}
 	}
@@ -878,22 +1078,24 @@ static ssize_t rxm_atomic_send_resp(struct rxm_ep *rxm_ep,
 	return ret;
 }
 
-static void rxm_do_atomic(struct rxm_pkt *pkt, void *dst, void *src,
-			  void *cmp, void *res, size_t count,
-			  enum fi_datatype datatype, enum fi_op op)
+static void rxm_do_atomic(uint8_t op, void *dst, void *src, void *cmp,
+			  void *res, size_t count, enum fi_datatype datatype,
+			  enum fi_op amo_op)
 {
-	switch (pkt->hdr.op) {
+	switch (op) {
 	case ofi_op_atomic:
-		assert(ofi_atomic_iswrite_op(op));
-		ofi_atomic_write_handler(op, datatype, dst, src, count);
+		assert(ofi_atomic_iswrite_op(amo_op));
+		ofi_atomic_write_handler(amo_op, datatype, dst, src, count);
 		break;
 	case ofi_op_atomic_fetch:
-		assert(ofi_atomic_isreadwrite_op(op));
-		ofi_atomic_readwrite_handler(op, datatype, dst, src, res, count);
+		assert(ofi_atomic_isreadwrite_op(amo_op));
+		ofi_atomic_readwrite_handler(amo_op, datatype, dst, src, res,
+					     count);
 		break;
 	case ofi_op_atomic_compare:
-		assert(ofi_atomic_isswap_op(op));
-		ofi_atomic_swap_handler(op, datatype, dst, src, cmp, res, count);
+		assert(ofi_atomic_isswap_op(amo_op));
+		ofi_atomic_swap_handler(amo_op, datatype, dst, src, cmp, res,
+					count);
 		break;
 	default:
 		/* Validated prior to calling function */
@@ -901,6 +1103,48 @@ static void rxm_do_atomic(struct rxm_pkt *pkt, void *dst, void *src,
 	}
 }
 
+static int rxm_do_device_mem_atomic(struct rxm_mr *dev_mr, uint8_t op,
+				    void *dev_dst, void *src, void *cmp,
+				    void *res, size_t amo_count,
+				    enum fi_datatype datatype,
+				    enum fi_op amo_op, size_t amo_op_size)
+{
+	struct rxm_domain *dom = dev_mr->domain;
+	void *tx_buf;
+	ssize_t ret __attribute__((unused));
+	struct iovec iov = {
+		.iov_base = dev_dst,
+		.iov_len = amo_op_size,
+	};
+
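+	/* Device memory cannot be operated on directly.  Stage the target
+	 * data through a host bounce buffer: copy it in, perform the atomic
+	 * on the host copy, then copy the result back to the device. */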
+	fastlock_acquire(&dom->amo_bufpool_lock);
+	tx_buf = ofi_buf_alloc(dom->amo_bufpool);
+	fastlock_release(&dom->amo_bufpool_lock);
+
+	if (!tx_buf)
+		return -FI_ENOMEM;
+
+	fastlock_acquire(&dev_mr->amo_lock);
+	ret = ofi_copy_from_hmem_iov(tx_buf, amo_op_size, dev_mr->iface, 0,
+				    &iov, 1, 0);
+	assert(ret == amo_op_size);
+
+	rxm_do_atomic(op, tx_buf, src, cmp, res, amo_count, datatype,
+		      amo_op);
+
+	ret = ofi_copy_to_hmem_iov(dev_mr->iface, 0, &iov, 1, 0, tx_buf,
+				   amo_op_size);
+	assert(ret == amo_op_size);
+
+	fastlock_release(&dev_mr->amo_lock);
+
+	fastlock_acquire(&dom->amo_bufpool_lock);
+	ofi_buf_free(tx_buf);
+	fastlock_release(&dom->amo_bufpool_lock);
+
+	return FI_SUCCESS;
+}
+
 static ssize_t rxm_handle_atomic_req(struct rxm_ep *rxm_ep,
 				     struct rxm_rx_buf *rx_buf)
 {
@@ -913,33 +1157,32 @@ static ssize_t rxm_handle_atomic_req(struct rxm_ep *rxm_ep,
 	ssize_t result_len;
 	uint64_t offset;
 	int i;
-	int ret = 0;
-	struct rxm_tx_atomic_buf *resp_buf;
+	ssize_t ret = 0;
+	struct rxm_tx_buf *resp_buf;
 	struct rxm_atomic_resp_hdr *resp_hdr;
 	struct rxm_domain *domain = container_of(rxm_ep->util_ep.domain,
 					 struct rxm_domain, util_domain);
+	uint8_t op = rx_buf->pkt.hdr.op;
 
 	assert(!(rx_buf->comp_flags &
 		 ~(FI_RECV | FI_REMOTE_CQ_DATA)));
-	assert(rx_buf->pkt.hdr.op == ofi_op_atomic ||
-	       rx_buf->pkt.hdr.op == ofi_op_atomic_fetch ||
-	       rx_buf->pkt.hdr.op == ofi_op_atomic_compare);
+	assert(op == ofi_op_atomic || op == ofi_op_atomic_fetch ||
+	       op == ofi_op_atomic_compare);
 
 	if (rx_buf->ep->srx_ctx)
-		rx_buf->conn = rxm_key2conn(rx_buf->ep,
-					    rx_buf->pkt.ctrl_hdr.conn_id);
+		rx_buf->conn = ofi_idm_at(&rx_buf->ep->conn_idx_map,
+					  (int) rx_buf->pkt.ctrl_hdr.conn_id);
 	if (!rx_buf->conn)
 		return -FI_EOTHER;
 
-	resp_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_ATOMIC);
+	resp_buf = ofi_buf_alloc(rxm_ep->tx_pool);
 	if (!resp_buf) {
 		FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
-			"Unable to allocate from Atomic buffer pool\n");
-		/* TODO: Should this be -FI_ENOMEM - how does it get
-		 * processed again */
-		return -FI_EAGAIN;
+			"Unable to allocate for atomic response\n");
+		return -FI_ENOMEM;
 	}
 
+	resp_buf->pkt.ctrl_hdr.type = rxm_ctrl_atomic;
 	for (i = 0; i < rx_buf->pkt.hdr.atomic.ioc_count; i++) {
 		ret = ofi_mr_verify(&domain->util_domain.mr_map,
 				    req_hdr->rma_ioc[i].count * datatype_sz,
@@ -949,7 +1192,7 @@ static ssize_t rxm_handle_atomic_req(struct rxm_ep *rxm_ep,
 							atomic_op));
 		if (ret) {
 			FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
-				"Atomic RMA MR verify error %d\n", ret);
+				"Atomic RMA MR verify error %ld\n", ret);
 			return rxm_atomic_send_resp(rxm_ep, rx_buf, resp_buf, 0,
 						    -FI_EACCES);
 		}
@@ -960,17 +1203,37 @@ static ssize_t rxm_handle_atomic_req(struct rxm_ep *rxm_ep,
 	resp_hdr = (struct rxm_atomic_resp_hdr *) resp_buf->pkt.data;
 
 	for (i = 0, offset = 0; i < rx_buf->pkt.hdr.atomic.ioc_count; i++) {
-		rxm_do_atomic(&rx_buf->pkt,
-			      (uintptr_t *) req_hdr->rma_ioc[i].addr,
-			      req_hdr->data + offset,
-			      req_hdr->data + len + offset,
-			      resp_hdr->data + offset,
-			      req_hdr->rma_ioc[i].count, datatype, atomic_op);
-		offset += req_hdr->rma_ioc[i].count * datatype_sz;
+		struct rxm_mr *mr =
+			rxm_mr_get_map_entry(domain, req_hdr->rma_ioc[i].key);
+		size_t amo_count = req_hdr->rma_ioc[i].count;
+		size_t amo_op_size = amo_count * datatype_sz;
+		void *src_buf = req_hdr->data + offset;
+		void *cmp_buf = req_hdr->data + len + offset;
+		void *res_buf = resp_hdr->data + offset;
+		void *dst_buf = (void *) req_hdr->rma_ioc[i].addr;
+
+		if (mr->iface != FI_HMEM_SYSTEM) {
+			ret = rxm_do_device_mem_atomic(mr, op, dst_buf, src_buf,
+						       cmp_buf, res_buf,
+						       amo_count, datatype,
+						       atomic_op, amo_op_size);
+			if (ret) {
+				FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
+					"Atomic operation failed %ld\n", ret);
+
+				return rxm_atomic_send_resp(rxm_ep, rx_buf,
+							    resp_buf, 0, ret);
+			}
+		} else {
+			rxm_do_atomic(op, dst_buf, src_buf, cmp_buf, res_buf,
+				      amo_count, datatype, atomic_op);
+		}
+
+		offset += amo_op_size;
 	}
-	result_len = rx_buf->pkt.hdr.op == ofi_op_atomic ? 0 : offset;
+	result_len = op == ofi_op_atomic ? 0 : offset;
 
-	if (rx_buf->pkt.hdr.op == ofi_op_atomic)
+	if (op == ofi_op_atomic)
 		ofi_ep_rem_wr_cntr_inc(&rxm_ep->util_ep);
 	else
 		ofi_ep_rem_rd_cntr_inc(&rxm_ep->util_ep);
@@ -979,53 +1242,59 @@ static ssize_t rxm_handle_atomic_req(struct rxm_ep *rxm_ep,
 				    result_len, FI_SUCCESS);
 }
 
-
 static ssize_t rxm_handle_atomic_resp(struct rxm_ep *rxm_ep,
 				      struct rxm_rx_buf *rx_buf)
 {
-	struct rxm_tx_atomic_buf *tx_buf;
+	struct rxm_tx_buf *tx_buf;
 	struct rxm_atomic_resp_hdr *resp_hdr;
+	struct util_cntr *cntr = NULL;
 	uint64_t len;
-	int ret = 0;
+	ssize_t copy_len;
+	ssize_t ret = 0;
+	enum fi_hmem_iface iface;
+	uint64_t device;
 
 	resp_hdr = (struct rxm_atomic_resp_hdr *) rx_buf->pkt.data;
-	tx_buf = ofi_bufpool_get_ibuf(rxm_ep->buf_pools[RXM_BUF_POOL_TX_ATOMIC].pool,
+	tx_buf = ofi_bufpool_get_ibuf(rxm_ep->tx_pool,
 				      rx_buf->pkt.ctrl_hdr.msg_id);
 	FI_DBG(&rxm_prov, FI_LOG_CQ, "received atomic response: op: %" PRIu8
 	       " msg_id: 0x%" PRIx64 "\n", rx_buf->pkt.hdr.op,
 	       rx_buf->pkt.ctrl_hdr.msg_id);
 
+	iface = rxm_mr_desc_to_hmem_iface_dev(tx_buf->atomic_result.desc,
+					      tx_buf->atomic_result.count,
+					      &device);
+
 	assert(!(rx_buf->comp_flags & ~(FI_RECV | FI_REMOTE_CQ_DATA)));
 
 	if (resp_hdr->status) {
-		struct util_cntr *cntr = NULL;
+		ret = ntohl(resp_hdr->status);
 		FI_WARN(&rxm_prov, FI_LOG_CQ,
-		       "bad atomic response status %d\n", ntohl(resp_hdr->status));
+			"bad atomic response status %d\n",
+			ntohl(resp_hdr->status));
+		goto write_err;
+	}
 
-		if (tx_buf->pkt.hdr.op == ofi_op_atomic) {
-			cntr = rxm_ep->util_ep.wr_cntr;
-		} else if (tx_buf->pkt.hdr.op == ofi_op_atomic_compare ||
-			   tx_buf->pkt.hdr.op == ofi_op_atomic_fetch) {
-			cntr = rxm_ep->util_ep.rd_cntr;
-		} else {
-			FI_WARN(&rxm_prov, FI_LOG_CQ,
-				"unknown atomic request op!\n");
-			assert(0);
-		}
-		rxm_cq_write_error(rxm_ep->util_ep.tx_cq, cntr,
-				   tx_buf->app_context, ntohl(resp_hdr->status));
-		goto free;
+	len = ofi_total_iov_len(tx_buf->atomic_result.iov,
+				tx_buf->atomic_result.count);
+	if (ntohl(resp_hdr->result_len) != len) {
+		ret = -FI_EIO;
+		FI_WARN(&rxm_prov, FI_LOG_CQ, "result size mismatch\n");
+		goto write_err;
 	}
 
-	len = ofi_total_iov_len(tx_buf->result_iov, tx_buf->result_iov_count);
-	assert(ntohl(resp_hdr->result_len) == len);
-	ofi_copy_to_iov(tx_buf->result_iov, tx_buf->result_iov_count, 0,
-			resp_hdr->data, len);
+	copy_len = ofi_copy_to_hmem_iov(iface, device, tx_buf->atomic_result.iov,
+				   tx_buf->atomic_result.count, 0, resp_hdr->data,
+				   len);
+	if (copy_len != len) {
+		ret = -FI_EIO;
+		FI_WARN(&rxm_prov, FI_LOG_CQ, "copy length error\n");
+		goto write_err;
+	}
 
 	if (!(tx_buf->flags & FI_INJECT))
-		ret = rxm_cq_write_tx_comp(rxm_ep,
-					   ofi_tx_cq_flags(tx_buf->pkt.hdr.op),
-					   tx_buf->app_context, tx_buf->flags);
+		rxm_cq_write_tx_comp(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op),
+				     tx_buf->app_context, tx_buf->flags);
 
 	if (tx_buf->pkt.hdr.op == ofi_op_atomic) {
 		ofi_ep_wr_cntr_inc(&rxm_ep->util_ep);
@@ -1033,82 +1302,84 @@ static ssize_t rxm_handle_atomic_resp(struct rxm_ep *rxm_ep,
 		   tx_buf->pkt.hdr.op == ofi_op_atomic_fetch) {
 		ofi_ep_rd_cntr_inc(&rxm_ep->util_ep);
 	} else {
-		FI_WARN(&rxm_prov, FI_LOG_CQ, "unknown atomic request op!\n");
-		rxm_cq_write_error(rxm_ep->util_ep.tx_cq, NULL,
-				   tx_buf->app_context, ntohl(resp_hdr->status));
-		assert(0);
+		ret = -FI_EOPNOTSUPP;
+		goto write_err;
 	}
 free:
 	rxm_rx_buf_free(rx_buf);
-	ofi_buf_free(tx_buf);
-	ofi_atomic_inc32(&rxm_ep->atomic_tx_credits);
-	assert(ofi_atomic_get32(&rxm_ep->atomic_tx_credits) <=
-	       rxm_ep->rxm_info->tx_attr->size);
+	rxm_free_rx_buf(rxm_ep, tx_buf);
 	return ret;
+
+write_err:
+	if (tx_buf->pkt.hdr.op == ofi_op_atomic) {
+		cntr = rxm_ep->util_ep.wr_cntr;
+	} else if (tx_buf->pkt.hdr.op == ofi_op_atomic_compare ||
+		   tx_buf->pkt.hdr.op == ofi_op_atomic_fetch) {
+		cntr = rxm_ep->util_ep.rd_cntr;
+	} else {
+		FI_WARN(&rxm_prov, FI_LOG_CQ,
+			"unknown atomic request op!\n");
+		assert(0);
+	}
+	rxm_cq_write_error(rxm_ep->util_ep.tx_cq, cntr,
+			   tx_buf->app_context, (int) ret);
+	goto free;
 }
 
 static ssize_t rxm_handle_credit(struct rxm_ep *rxm_ep, struct rxm_rx_buf *rx_buf)
 {
-	struct rxm_domain *domain = container_of(rxm_ep->util_ep.domain,
-						 struct rxm_domain, util_domain);
+	struct rxm_domain *domain;
 
-	domain->flow_ctrl_ops->add_credits(rx_buf->msg_ep,
+	assert(rx_buf->rx_ep->fid.fclass == FI_CLASS_EP);
+	domain = container_of(rxm_ep->util_ep.domain, struct rxm_domain,
+			      util_domain);
+	domain->flow_ctrl_ops->add_credits(rx_buf->rx_ep,
 					   rx_buf->pkt.ctrl_hdr.ctrl_data);
 	rxm_rx_buf_free(rx_buf);
 	return FI_SUCCESS;
 }
 
-int rxm_finish_coll_eager_send(struct rxm_ep *rxm_ep,
-			       struct rxm_tx_eager_buf *tx_eager_buf)
+void rxm_finish_coll_eager_send(struct rxm_ep *rxm_ep,
+			        struct rxm_tx_buf *tx_eager_buf)
 {
-	int ret;
-
 	if (tx_eager_buf->pkt.hdr.tag & OFI_COLL_TAG_FLAG) {
 		ofi_coll_handle_xfer_comp(tx_eager_buf->pkt.hdr.tag,
 				tx_eager_buf->app_context);
-		ret = FI_SUCCESS;
 	} else {
-		ret = rxm_finish_eager_send(rxm_ep, tx_eager_buf);
+		rxm_finish_eager_send(rxm_ep, tx_eager_buf);
 	}
-
-	return ret;
-};
+}
 
 ssize_t rxm_handle_comp(struct rxm_ep *rxm_ep, struct fi_cq_data_entry *comp)
 {
 	struct rxm_rx_buf *rx_buf;
-	struct rxm_tx_base_buf *tx_buf;
-	struct rxm_tx_sar_buf *tx_sar_buf;
-	struct rxm_tx_eager_buf *tx_eager_buf;
-	struct rxm_tx_rndv_buf *tx_rndv_buf;
-	struct rxm_tx_atomic_buf *tx_atomic_buf;
-	struct rxm_rma_buf *rma_buf;
-	ssize_t ret;
+	struct rxm_tx_buf *tx_buf;
 
 	/* Remote write events may not consume a posted recv so op context
 	 * and hence state would be NULL */
-	if (comp->flags & FI_REMOTE_WRITE)
-		return rxm_handle_remote_write(rxm_ep, comp);
+	if (comp->flags & FI_REMOTE_WRITE) {
+		rxm_handle_remote_write(rxm_ep, comp);
+		return 0;
+	}
 
 	switch (RXM_GET_PROTO_STATE(comp->op_context)) {
 	case RXM_TX:
-		tx_eager_buf = comp->op_context;
-		ret = rxm_ep->eager_ops->comp_tx(rxm_ep, tx_eager_buf);
-		ofi_buf_free(tx_eager_buf);
-		return ret;
+	case RXM_INJECT_TX:
+		tx_buf = comp->op_context;
+		rxm_ep->eager_ops->comp_tx(rxm_ep, tx_buf);
+		rxm_free_rx_buf(rxm_ep, tx_buf);
+		return 0;
 	case RXM_CREDIT_TX:
 		tx_buf = comp->op_context;
 		assert(comp->flags & FI_SEND);
 		ofi_buf_free(tx_buf);
 		return 0;
-	case RXM_INJECT_TX:
-		assert(0);
-		return -FI_EOPBADSTATE;
 	case RXM_RMA:
-		rma_buf = comp->op_context;
+		tx_buf = comp->op_context;
 		assert((comp->flags & (FI_WRITE | FI_RMA)) ||
 		       (comp->flags & (FI_READ | FI_RMA)));
-		return rxm_finish_rma(rxm_ep, rma_buf, comp->flags);
+		rxm_finish_rma(rxm_ep, tx_buf, comp->flags);
+		return 0;
 	case RXM_RX:
 		rx_buf = comp->op_context;
 		assert(!(comp->flags & FI_REMOTE_READ));
@@ -1117,10 +1388,15 @@ ssize_t rxm_handle_comp(struct rxm_ep *rxm_ep, struct fi_cq_data_entry *comp)
 
 		switch (rx_buf->pkt.ctrl_hdr.type) {
 		case rxm_ctrl_eager:
-		case rxm_ctrl_rndv:
+		case rxm_ctrl_rndv_req:
 			return rxm_handle_recv_comp(rx_buf);
-		case rxm_ctrl_rndv_ack:
-			return rxm_rndv_handle_ack(rxm_ep, rx_buf);
+		case rxm_ctrl_rndv_rd_done:
+			rxm_rndv_handle_rd_done(rxm_ep, rx_buf);
+			return 0;
+		case rxm_ctrl_rndv_wr_done:
+			return rxm_rndv_handle_wr_done(rxm_ep, rx_buf);
+		case rxm_ctrl_rndv_wr_data:
+			return rxm_rndv_handle_wr_data(rx_buf);
 		case rxm_ctrl_seg:
 			return rxm_sar_handle_segment(rx_buf);
 		case rxm_ctrl_atomic:
@@ -1135,50 +1411,239 @@ ssize_t rxm_handle_comp(struct rxm_ep *rxm_ep, struct fi_cq_data_entry *comp)
 			return -FI_EINVAL;
 		}
 	case RXM_SAR_TX:
-		tx_sar_buf = comp->op_context;
+		tx_buf = comp->op_context;
 		assert(comp->flags & FI_SEND);
-		return rxm_finish_sar_segment_send(rxm_ep, tx_sar_buf, false);
+		rxm_handle_sar_comp(rxm_ep, tx_buf);
+		return 0;
 	case RXM_RNDV_TX:
-		tx_rndv_buf = comp->op_context;
+		tx_buf = comp->op_context;
 		assert(comp->flags & FI_SEND);
-		RXM_UPDATE_STATE(FI_LOG_CQ, tx_rndv_buf, RXM_RNDV_ACK_WAIT);
+		if (rxm_ep->rndv_ops == &rxm_rndv_ops_write)
+			RXM_UPDATE_STATE(FI_LOG_CQ, tx_buf,
+					 RXM_RNDV_WRITE_DATA_WAIT);
+		else
+			RXM_UPDATE_STATE(FI_LOG_CQ, tx_buf,
+					 RXM_RNDV_READ_DONE_WAIT);
 		return 0;
-	case RXM_RNDV_ACK_WAIT:
+	case RXM_RNDV_READ_DONE_WAIT:
+	case RXM_RNDV_WRITE_DATA_WAIT:
 		assert(0);
-		return -FI_EOPBADSTATE;
+		return 0;
 	case RXM_RNDV_READ:
 		rx_buf = comp->op_context;
 		assert(comp->flags & FI_READ);
-		if (++rx_buf->rndv_rma_index < rx_buf->rndv_hdr->count)
+		if (++rx_buf->rndv_rma_index < rx_buf->remote_rndv_hdr->count)
 			return 0;
-		else
-			return rxm_rndv_send_ack(rx_buf);
-	case RXM_RNDV_ACK_SENT:
+
+		rxm_rndv_send_rd_done(rx_buf);
+		return 0;
+	case RXM_RNDV_WRITE:
+		tx_buf = comp->op_context;
+		assert(comp->flags & FI_WRITE);
+		if (++tx_buf->write_rndv.rndv_rma_index <
+		    tx_buf->write_rndv.rndv_rma_count)
+			return 0;
+
+		rxm_rndv_send_wr_done(rxm_ep, tx_buf);
+		return 0;
+	case RXM_RNDV_READ_DONE_SENT:
 		assert(comp->flags & FI_SEND);
-		return rxm_finish_send_rndv_ack(comp->op_context);
-	case RXM_RNDV_ACK_RECVD:
-		tx_rndv_buf = comp->op_context;
+		rxm_rndv_rx_finish(comp->op_context);
+		return 0;
+	case RXM_RNDV_WRITE_DATA_SENT:
+		rx_buf = comp->op_context;
 		assert(comp->flags & FI_SEND);
-		return rxm_rndv_tx_finish(rxm_ep, tx_rndv_buf);
+		dlist_insert_tail(&rx_buf->rndv_wait_entry, &rx_buf->ep->rndv_wait_list);
+		RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_WRITE_DONE_WAIT);
+		return 0;
+	case RXM_RNDV_WRITE_DONE_SENT:
+	case RXM_RNDV_READ_DONE_RECVD:
+		assert(comp->flags & FI_SEND || comp->flags & FI_WRITE);
+		rxm_rndv_tx_finish(rxm_ep, comp->op_context);
+		return 0;
+	case RXM_RNDV_WRITE_DONE_RECVD:
+		assert(comp->flags & FI_SEND);
+		rxm_rndv_rx_finish(comp->op_context);
+		return 0;
 	case RXM_RNDV_FINISH:
 		assert(0);
-		return -FI_EOPBADSTATE;
+		return 0;
 	case RXM_ATOMIC_RESP_WAIT:
-		/* Optional atomic request completion; TX completion
-		 * processing is performed when atomic response is received */
+		/* BUG: need to wait for completion, even if a response has
+		 * been received.
+		 */
 		assert(comp->flags & FI_SEND);
 		return 0;
 	case RXM_ATOMIC_RESP_SENT:
-		tx_atomic_buf = comp->op_context;
+		tx_buf = comp->op_context;
 		assert(comp->flags & FI_SEND);
-		ofi_buf_free(tx_atomic_buf);
+		ofi_buf_free(tx_buf);	/* BUG: should have consumed tx credit */
 		return 0;
 	default:
 		assert(0);
-		return -FI_EOPBADSTATE;
+		return 0;
 	}
 }
 
+static void rxm_get_recv_entry(struct rxm_rx_buf *rx_buf,
+			       struct ofi_cq_rbuf_entry *cq_entry)
+{
+	struct rxm_recv_match_attr match_attr;
+	struct rxm_conn *conn;
+	struct rxm_recv_queue *recv_queue;
+	struct dlist_entry *entry;
+
+	assert(!rx_buf->recv_entry);
+	if (rx_buf->ep->rxm_info->caps & (FI_SOURCE | FI_DIRECTED_RECV)) {
+		conn = cq_entry->ep_context;
+		match_attr.addr = conn->peer->fi_addr;
+	} else {
+		match_attr.addr = FI_ADDR_UNSPEC;
+	}
+
+	match_attr.ignore = 0;
+	if (rx_buf->pkt.hdr.op == ofi_op_tagged) {
+		match_attr.tag = rx_buf->pkt.hdr.tag;
+		recv_queue = &rx_buf->ep->trecv_queue;
+	} else {
+		match_attr.tag = 0;
+		recv_queue = &rx_buf->ep->recv_queue;
+	}
+
+	/* See comment with rxm_get_dyn_rbuf */
+	if (recv_queue->dyn_rbuf_unexp_cnt == 0) {
+		entry = dlist_remove_first_match(&recv_queue->recv_list,
+						 recv_queue->match_recv,
+						 &match_attr);
+		if (entry) {
+			rx_buf->recv_entry = container_of(entry,
+						struct rxm_recv_entry, entry);
+			if (rx_buf->recv_entry->flags & FI_MULTI_RECV)
+				rxm_adjust_multi_recv(rx_buf);
+		} else {
+			recv_queue->dyn_rbuf_unexp_cnt++;
+		}
+	} else {
+		recv_queue->dyn_rbuf_unexp_cnt++;
+	}
+}
+
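+/* Messages tagged at the msg provider bypass the rxm protocol header.
+ * Synthesize an eager header from the CQ entry so that the common receive
+ * path can process them. */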
+static void rxm_fake_rx_hdr(struct rxm_rx_buf *rx_buf,
+			    struct ofi_cq_rbuf_entry *entry)
+{
+	struct rxm_conn *conn;
+
+	conn = entry->ep_context;
+
+	OFI_DBG_SET(rx_buf->pkt.hdr.version, OFI_OP_VERSION);
+	OFI_DBG_SET(rx_buf->pkt.ctrl_hdr.version, RXM_CTRL_VERSION);
+	rx_buf->pkt.ctrl_hdr.type = rxm_ctrl_eager;
+	rx_buf->pkt.ctrl_hdr.conn_id = conn->peer->index;
+	rx_buf->pkt.hdr.op = ofi_op_tagged;
+	rx_buf->pkt.hdr.tag = entry->tag;
+	rx_buf->pkt.hdr.size = entry->len;
+	rx_buf->pkt.hdr.flags = 0;
+}
+
+static ssize_t
+rxm_get_dyn_unexp(struct rxm_rx_buf *rx_buf, struct iovec *iov, size_t *count)
+{
+	*count = 1;
+
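+	/* Unexpected data larger than the inline packet buffer is staged in
+	 * a temporary allocation.  If that allocation fails, fall back to
+	 * the inline buffer and report truncation. */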
+	if (rx_buf->pkt.hdr.size > rxm_buffer_size) {
+		rx_buf->data = malloc(rx_buf->pkt.hdr.size);
+		if (!rx_buf->data)
+			goto trunc;
+	}
+
+	iov[0].iov_base = rx_buf->data;
+	iov[0].iov_len = rx_buf->pkt.hdr.size;
+	return 0;
+
+trunc:
+	rx_buf->data = &rx_buf->pkt.data;
+	iov[0].iov_base = rx_buf->data;
+	iov[0].iov_len = rxm_buffer_size;
+	return -FI_ETRUNC;
+}
+
+/*
+ * Dynamic receive buffer callback from fi_cq_read(msg cq).
+ * We're holding the ep lock.
+ *
+ * There's a subtle race condition handling unexpected messages. If we cannot
+ * find a matching receive, the message will be marked as unexpected.
+ * However, we can't queue it on the unexpected list until it has been fully
+ * received and returned through fi_cq_read().  It's possible for the
+ * application to post the matching buffer prior to that occurring.  That is,
+ * the matching buffer is posted after we checked for a match, but before the
+ * message endpoint finishes receiving the unexpected data.
+ *
+ * Once the unexpected message has been received, its completion may be
+ * written to the CQ.  If the message provider continues processing messages
+ * it could invoke a callback for a second message.  If we allow the second
+ * message to match the posted receive buffer, then the second message would
+ * match out of order from the first message.
+ *
+ * To handle this, we need to track the number of unexpected messages queued
+ * within the message provider, so that they can check for matching
+ * receives in order.  If there are any unexpected messages outstanding, we
+ * need to fail all matches until they have been read from the CQ.
+ */
+ssize_t rxm_get_dyn_rbuf(struct ofi_cq_rbuf_entry *entry, struct iovec *iov,
+			 size_t *count)
+{
+	struct rxm_rx_buf *rx_buf;
+
+	rx_buf = entry->op_context;
+	assert(!(rx_buf->ep->rxm_info->mode & FI_BUFFERED_RECV));
+
+	/* Messages tagged at the tcp layer do not carry an rxm header */
+	if (entry->flags & FI_TAGGED)
+		rxm_fake_rx_hdr(rx_buf, entry);
+
+	assert((rx_buf->pkt.hdr.version == OFI_OP_VERSION) &&
+		(rx_buf->pkt.ctrl_hdr.version == RXM_CTRL_VERSION));
+
+	switch (rx_buf->pkt.ctrl_hdr.type) {
+	case rxm_ctrl_eager:
+		rxm_get_recv_entry(rx_buf, entry);
+		if (rx_buf->recv_entry) {
+			*count = rx_buf->recv_entry->rxm_iov.count;
+			memcpy(iov, rx_buf->recv_entry->rxm_iov.iov, *count *
+			       sizeof(*iov));
+		} else {
+			rxm_get_dyn_unexp(rx_buf, iov, count);
+		}
+		break;
+	case rxm_ctrl_rndv_req:
+		/* Find matching receive to maintain message ordering. */
+		rxm_get_recv_entry(rx_buf, entry);
+
+		/* fall through */
+	case rxm_ctrl_atomic:
+	case rxm_ctrl_atomic_resp:
+	case rxm_ctrl_rndv_wr_data:
+	case rxm_ctrl_rndv_wr_done:
+	case rxm_ctrl_rndv_rd_done:
+	case rxm_ctrl_credit:
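+		/* Control and rendezvous handshake messages are small
+		 * enough to land in the packet's inline data area. */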
+		*count = 1;
+		iov[0].iov_base = &rx_buf->pkt.data;
+		iov[0].iov_len = rxm_buffer_size;
+		break;
+	case rxm_ctrl_seg:
+	default:
+		FI_WARN(&rxm_prov, FI_LOG_CQ,
+			"Unexpected request for dynamic rbuf\n");
+		*count = 1;
+		iov[0].iov_base = &rx_buf->pkt.data;
+		iov[0].iov_len = rxm_buffer_size;
+		break;
+	}
+
+	return 0;
+}
+
 void rxm_cq_write_error(struct util_cq *cq, struct util_cntr *cntr,
 			void *op_context, int err)
 {
@@ -1234,12 +1699,8 @@ void rxm_cq_write_error_all(struct rxm_ep *rxm_ep, int err)
 
 void rxm_handle_comp_error(struct rxm_ep *rxm_ep)
 {
-	struct rxm_tx_base_buf *base_buf;
-	struct rxm_tx_eager_buf *eager_buf;
-	struct rxm_tx_sar_buf *sar_buf;
-	struct rxm_tx_rndv_buf *rndv_buf;
+	struct rxm_tx_buf *tx_buf;
 	struct rxm_rx_buf *rx_buf;
-	struct rxm_rma_buf *rma_buf;
 	struct util_cq *cq;
 	struct util_cntr *cntr;
 	struct fi_cq_err_entry err_entry = {0};
@@ -1262,55 +1723,71 @@ void rxm_handle_comp_error(struct rxm_ep *rxm_ep)
 
 	switch (RXM_GET_PROTO_STATE(err_entry.op_context)) {
 	case RXM_TX:
-		eager_buf = err_entry.op_context;
-		err_entry.op_context = eager_buf->app_context;
-		err_entry.flags = ofi_tx_cq_flags(eager_buf->pkt.hdr.op);
-		ofi_buf_free(eager_buf);
+	case RXM_RNDV_TX:
+	case RXM_RNDV_WRITE_DONE_SENT:
+	case RXM_ATOMIC_RESP_WAIT:
+		tx_buf = err_entry.op_context;
+		err_entry.op_context = tx_buf->app_context;
+		err_entry.flags = ofi_tx_cq_flags(tx_buf->pkt.hdr.op);
+		rxm_free_rx_buf(rxm_ep, tx_buf);
 		break;
+	case RXM_RNDV_READ_DONE_RECVD:
+		/* We received the response, so ignore the send error */
+		rxm_rndv_tx_finish(rxm_ep, err_entry.op_context);
+		return;
+	case RXM_RNDV_WRITE_DONE_RECVD:
+		/* We received the response, so ignore the send error */
+		rxm_rndv_rx_finish(err_entry.op_context);
+		return;
 	case RXM_INJECT_TX:
-		assert(0);
+		rxm_free_rx_buf(rxm_ep, err_entry.op_context);
+		if (cntr)
+			rxm_cntr_incerr(cntr);
+		return;
+	case RXM_CREDIT_TX:
+	case RXM_ATOMIC_RESP_SENT: /* BUG: should have consumed tx credit */
+		tx_buf = err_entry.op_context;
+		ofi_buf_free(tx_buf);
 		return;
 	case RXM_RMA:
-		rma_buf = err_entry.op_context;
-		err_entry.op_context = rma_buf->app_context;
+		tx_buf = err_entry.op_context;
+		err_entry.op_context = tx_buf->app_context;
 		/* err_entry.flags pass through from msg ep */
-		if (!(rma_buf->flags & FI_INJECT) && !rxm_ep->rdm_mr_local &&
+		if (!(tx_buf->flags & FI_INJECT) && !rxm_ep->rdm_mr_local &&
 		    rxm_ep->msg_mr_local) {
-			rxm_msg_mr_closev(rma_buf->mr.mr, rma_buf->mr.count);
+			rxm_msg_mr_closev(tx_buf->rma.mr, tx_buf->rma.count);
 		}
-		ofi_buf_free(rma_buf);
+		rxm_free_rx_buf(rxm_ep, tx_buf);
 		break;
 	case RXM_SAR_TX:
-		sar_buf = err_entry.op_context;
-		err_entry.op_context = sar_buf->app_context;
-		err_entry.flags = ofi_tx_cq_flags(sar_buf->pkt.hdr.op);
-		rxm_finish_sar_segment_send(rxm_ep, sar_buf, true);
-		break;
-	case RXM_CREDIT_TX:
-		base_buf = err_entry.op_context;
-		err_entry.op_context = 0;
-		err_entry.flags = ofi_tx_cq_flags(base_buf->pkt.hdr.op);
+		tx_buf = err_entry.op_context;
+		err_entry.op_context = tx_buf->app_context;
+		err_entry.flags = ofi_tx_cq_flags(tx_buf->pkt.hdr.op);
+		if (!rxm_complete_sar(rxm_ep, tx_buf))
+			return;
 		break;
-	case RXM_RNDV_TX:
-		rndv_buf = err_entry.op_context;
-		err_entry.op_context = rndv_buf->app_context;
-		err_entry.flags = ofi_tx_cq_flags(rndv_buf->pkt.hdr.op);
+	case RXM_RNDV_WRITE:
+		tx_buf = err_entry.op_context;
+		err_entry.op_context = tx_buf->app_context;
+		err_entry.flags = ofi_tx_cq_flags(tx_buf->pkt.hdr.op);
 		break;
 
-	/* Application receive related error */
+	/* Incoming application data error */
 	case RXM_RX:
-		/* Silently drop any MSG CQ error entries for canceled receive
-		 * operations as these are internal to RxM. This situation can
-		 * happen when the MSG EP receives a reject / shutdown and CM
-		 * thread hasn't handled the event yet. */
-		if (err_entry.err == FI_ECANCELED) {
-			/* No need to re-post these buffers. Free directly */
+		/* Silently drop MSG CQ error entries for internal receive
+		 * operations not associated with an application posted
+		 * receive. This situation can happen when the MSG EP
+		 * receives a reject / shutdown and CM thread hasn't handled
+		 * the event yet.
+		 */
+		rx_buf = (struct rxm_rx_buf *) err_entry.op_context;
+		if (!rx_buf->recv_entry) {
 			ofi_buf_free((struct rxm_rx_buf *)err_entry.op_context);
 			return;
 		}
 		/* fall through */
-	case RXM_RNDV_ACK_SENT:
-		/* fall through */
+	case RXM_RNDV_READ_DONE_SENT:
+	case RXM_RNDV_WRITE_DATA_SENT: /* BUG: should fail initial send */
 	case RXM_RNDV_READ:
 		rx_buf = (struct rxm_rx_buf *) err_entry.op_context;
 		assert(rx_buf->recv_entry);
@@ -1333,45 +1810,49 @@ void rxm_handle_comp_error(struct rxm_ep *rxm_ep)
 
 	assert(cq);
 	ret = ofi_cq_write_error(cq, &err_entry);
-	if (ret)
+	if (ret) {
 		FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to ofi_cq_write_error\n");
+		assert(0);
+	}
 }
 
-static int rxm_msg_ep_recv(struct rxm_rx_buf *rx_buf)
+int rxm_post_recv(struct rxm_rx_buf *rx_buf)
 {
-	int ret, level;
+	struct rxm_domain *domain;
+	int ret;
 
 	if (rx_buf->ep->srx_ctx)
 		rx_buf->conn = NULL;
 	rx_buf->hdr.state = RXM_RX;
+	rx_buf->recv_entry = NULL;
 
-	ret = (int) fi_recv(rx_buf->msg_ep, &rx_buf->pkt,
-			    rxm_eager_limit + sizeof(struct rxm_pkt),
-			    rx_buf->hdr.desc, FI_ADDR_UNSPEC, rx_buf);
+	domain = container_of(rx_buf->ep->util_ep.domain,
+			      struct rxm_domain, util_domain);
+	ret = (int) fi_recv(rx_buf->rx_ep, &rx_buf->pkt,
+			    domain->rx_post_size, rx_buf->hdr.desc,
+			    FI_ADDR_UNSPEC, rx_buf);
 	if (!ret)
 		return 0;
 
 	if (ret != -FI_EAGAIN) {
-		level = (rx_buf->conn->handle.state == RXM_CMAP_SHUTDOWN) ?
-			FI_LOG_DEBUG : FI_LOG_WARN;
-		FI_LOG(&rxm_prov, level, FI_LOG_EP_CTRL,
+		FI_DBG(&rxm_prov, FI_LOG_EP_CTRL,
 		       "unable to post recv buf: %d\n", ret);
 	}
 	return ret;
 }
 
-int rxm_msg_ep_prepost_recv(struct rxm_ep *rxm_ep, struct fid_ep *msg_ep)
+int rxm_prepost_recv(struct rxm_ep *rxm_ep, struct fid_ep *rx_ep)
 {
 	struct rxm_rx_buf *rx_buf;
 	int ret;
 	size_t i;
 
 	for (i = 0; i < rxm_ep->msg_info->rx_attr->size; i++) {
-		rx_buf = rxm_rx_buf_alloc(rxm_ep, msg_ep, 1);
+		rx_buf = rxm_rx_buf_alloc(rxm_ep, rx_ep);
 		if (!rx_buf)
 			return -FI_ENOMEM;
 
-		ret = rxm_msg_ep_recv(rx_buf);
+		ret = rxm_post_recv(rx_buf);
 		if (ret) {
 			ofi_buf_free(&rx_buf->hdr);
 			return ret;
@@ -1386,27 +1867,9 @@ void rxm_ep_do_progress(struct util_ep *util_ep)
 	struct fi_cq_data_entry comp;
 	struct dlist_entry *conn_entry_tmp;
 	struct rxm_conn *rxm_conn;
-	struct rxm_rx_buf *buf;
-	ssize_t ret;
 	size_t comp_read = 0;
 	uint64_t timestamp;
-
-	while (!dlist_empty(&rxm_ep->repost_ready_list)) {
-		dlist_pop_front(&rxm_ep->repost_ready_list, struct rxm_rx_buf,
-				buf, repost_entry);
-
-		/* Discard rx buffer if its msg_ep was closed */
-		if (!rxm_ep->srx_ctx && !buf->conn->msg_ep) {
-			ofi_buf_free(&buf->hdr);
-			continue;
-		}
-
-		ret = rxm_msg_ep_recv(buf);
-		if (ret) {
-			if (ret == -FI_EAGAIN)
-				ofi_buf_free(&buf->hdr);
-		}
-	}
+	ssize_t ret;
 
 	do {
 		ret = fi_cq_read(rxm_ep->msg_cq, &comp, 1);
@@ -1426,21 +1889,27 @@ void rxm_ep_do_progress(struct util_ep *util_ep)
 				rxm_cq_write_error_all(rxm_ep, ret);
 		}
 
-		if (ret == -FI_EAGAIN || --rxm_ep->cq_eq_fairness <= 0) {
+		if (ret == -FI_EAGAIN || rxm_ep->connecting_cnt ||
+		    --rxm_ep->cq_eq_fairness <= 0) {
 			rxm_ep->cq_eq_fairness = rxm_cq_eq_fairness;
-			timestamp = ofi_gettime_us();
-			if (timestamp - rxm_ep->msg_cq_last_poll >
-				rxm_cm_progress_interval) {
-				rxm_ep->msg_cq_last_poll = timestamp;
-				rxm_msg_eq_progress(rxm_ep);
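+			/* Progress the CM immediately while connections are
+			 * pending (or when no polling interval is set);
+			 * otherwise throttle CM progress to the configured
+			 * interval. */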
+			if (rxm_ep->connecting_cnt == 0 &&
+			    rxm_cm_progress_interval) {
+				timestamp = ofi_gettime_us();
+				if (timestamp - rxm_ep->msg_cq_last_poll >
+				    rxm_cm_progress_interval) {
+					rxm_ep->msg_cq_last_poll = timestamp;
+					rxm_conn_progress(rxm_ep);
+				}
+			} else {
+				rxm_conn_progress(rxm_ep);
 			}
 		}
 	} while ((ret > 0) && (++comp_read < rxm_ep->comp_per_progress));
 
-	if (!dlist_empty(&rxm_ep->deferred_tx_conn_queue)) {
-		dlist_foreach_container_safe(&rxm_ep->deferred_tx_conn_queue,
+	if (!dlist_empty(&rxm_ep->deferred_queue)) {
+		dlist_foreach_container_safe(&rxm_ep->deferred_queue,
 					     struct rxm_conn, rxm_conn,
-					     deferred_conn_entry, conn_entry_tmp) {
+					     deferred_entry, conn_entry_tmp) {
 			rxm_ep_progress_deferred_queue(rxm_ep, rxm_conn);
 		}
 	}
diff --git a/deps/libfabric/prov/rxm/src/rxm_domain.c b/deps/libfabric/prov/rxm/src/rxm_domain.c
index bd805a75eb0978d1c2cb9c4dea1dd8ebdac3c5ec..0a694bbfbccdef315bb1cb0a56f3ef7b90052d7f 100644
--- a/deps/libfabric/prov/rxm/src/rxm_domain.c
+++ b/deps/libfabric/prov/rxm/src/rxm_domain.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2016 Intel Corporation, Inc.  All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -106,6 +107,17 @@ static int rxm_mr_add_map_entry(struct util_domain *domain,
 	return ret;
 }
 
+struct rxm_mr *rxm_mr_get_map_entry(struct rxm_domain *domain, uint64_t key)
+{
+	struct rxm_mr *mr;
+
+	fastlock_acquire(&domain->util_domain.lock);
+	mr = ofi_mr_map_get(&domain->util_domain.mr_map, key);
+	fastlock_release(&domain->util_domain.lock);
+
+	return mr;
+}
+
 static int rxm_domain_close(fid_t fid)
 {
 	struct rxm_domain *rxm_domain;
@@ -113,6 +125,9 @@ static int rxm_domain_close(fid_t fid)
 
 	rxm_domain = container_of(fid, struct rxm_domain, util_domain.domain_fid.fid);
 
+	fastlock_destroy(&rxm_domain->amo_bufpool_lock);
+	ofi_bufpool_destroy(rxm_domain->amo_bufpool);
+
 	ret = fi_close(&rxm_domain->msg_domain->fid);
 	if (ret)
 		return ret;
@@ -216,14 +231,18 @@ err:
 	return ret;
 }
 
+/* Large send/recv transfers use RMA rendezvous protocol */
 static uint64_t
 rxm_mr_get_msg_access(struct rxm_domain *rxm_domain, uint64_t access)
 {
-	/* Additional flags to use RMA read for large message transfers */
-	access |= FI_READ | FI_REMOTE_READ;
+	if (access & FI_SEND) {
+		access |= rxm_use_write_rndv ? FI_WRITE : FI_REMOTE_READ;
+	}
+
+	if (access & FI_RECV) {
+		access |= rxm_use_write_rndv ? FI_REMOTE_WRITE : FI_READ;
+	}
 
-	if (rxm_domain->mr_local)
-		access |= FI_WRITE;
 	return access;
 }
 
@@ -233,10 +252,7 @@ static void rxm_mr_init(struct rxm_mr *rxm_mr, struct rxm_domain *domain,
 	rxm_mr->mr_fid.fid.fclass = FI_CLASS_MR;
 	rxm_mr->mr_fid.fid.context = context;
 	rxm_mr->mr_fid.fid.ops = &rxm_mr_ops;
-	/* Store msg_mr as rxm_mr descriptor so that we can get its key when
-	 * the app passes msg_mr as the descriptor in fi_send and friends.
-	 * The key would be used in large message transfer protocol and RMA. */
-	rxm_mr->mr_fid.mem_desc = rxm_mr->msg_mr;
+	rxm_mr->mr_fid.mem_desc = rxm_mr;
 	rxm_mr->mr_fid.key = fi_mr_key(rxm_mr->msg_mr);
 	rxm_mr->domain = domain;
 	ofi_atomic_inc32(&domain->util_domain.ref);
@@ -257,6 +273,13 @@ static int rxm_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr,
 	if (!rxm_mr)
 		return -FI_ENOMEM;
 
+	ofi_mr_update_attr(rxm_domain->util_domain.fabric->fabric_fid.api_version,
+			   rxm_domain->util_domain.info_domain_caps, attr,
+			   &msg_attr);
+
+	if ((flags & FI_HMEM_HOST_ALLOC) && (attr->iface == FI_HMEM_ZE))
+		msg_attr.device.ze = -1;
+
 	msg_attr.access = rxm_mr_get_msg_access(rxm_domain, attr->access);
 
 	ret = fi_mr_regattr(rxm_domain->msg_domain, &msg_attr,
@@ -266,6 +289,9 @@ static int rxm_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr,
 		goto err;
 	}
 	rxm_mr_init(rxm_mr, rxm_domain, attr->context);
+	fastlock_init(&rxm_mr->amo_lock);
+	rxm_mr->iface = msg_attr.iface;
+	rxm_mr->device = msg_attr.device.reserved;
 	*mr = &rxm_mr->mr_fid;
 
 	if (rxm_domain->util_domain.info_domain_caps & FI_ATOMIC) {
@@ -357,29 +383,29 @@ static struct fi_ops_mr rxm_domain_mr_ops = {
 
 static ssize_t rxm_send_credits(struct fid_ep *ep, size_t credits)
 {
-	struct rxm_conn *rxm_conn =
-		container_of(ep->fid.context, struct rxm_conn, handle);
-	struct rxm_ep *rxm_ep = rxm_conn->handle.cmap->ep;
+	struct rxm_conn *rxm_conn = ep->fid.context;
+	struct rxm_ep *rxm_ep = rxm_conn->ep;
 	struct rxm_deferred_tx_entry *def_tx_entry;
-	struct rxm_tx_base_buf *tx_buf;
+	struct rxm_tx_buf *tx_buf;
 	struct iovec iov;
 	struct fi_msg msg;
 	ssize_t ret;
 
-	tx_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_CREDIT);
+	tx_buf = ofi_buf_alloc(rxm_ep->tx_pool);
 	if (!tx_buf) {
 		FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
 			"Ran out of buffers from TX credit buffer pool.\n");
 		return -FI_ENOMEM;
 	}
 
+	tx_buf->hdr.state = RXM_CREDIT_TX;
 	rxm_ep_format_tx_buf_pkt(rxm_conn, 0, rxm_ctrl_credit, 0, 0, FI_SEND,
 				 &tx_buf->pkt);
 	tx_buf->pkt.ctrl_hdr.type = rxm_ctrl_credit;
 	tx_buf->pkt.ctrl_hdr.msg_id = ofi_buf_index(tx_buf);
 	tx_buf->pkt.ctrl_hdr.ctrl_data = credits;
 
-	if (rxm_conn->handle.state != RXM_CMAP_CONNECTED)
+	if (rxm_conn->state != RXM_CM_CONNECTED)
 		goto defer;
 
 	iov.iov_base = &tx_buf->pkt;
@@ -404,7 +430,7 @@ defer:
 	}
 
 	def_tx_entry->credit_msg.tx_buf = tx_buf;
-	rxm_ep_enqueue_deferred_tx_queue(def_tx_entry);
+	rxm_queue_deferred_tx(def_tx_entry, OFI_LIST_HEAD);
 	return FI_SUCCESS;
 }
 
@@ -431,14 +457,70 @@ struct ofi_ops_flow_ctrl rxm_no_ops_flow_ctrl = {
 	.set_send_handler = rxm_no_credit_handler,
 };
 
+static int rxm_config_flow_ctrl(struct rxm_domain *domain)
+{
+	struct ofi_ops_flow_ctrl *flow_ctrl_ops;
+	int ret;
+
+	ret = fi_open_ops(&domain->msg_domain->fid, OFI_OPS_FLOW_CTRL, 0,
+			  (void **) &flow_ctrl_ops, NULL);
+	if (ret) {
+		if (ret == -FI_ENOSYS) {
+			domain->flow_ctrl_ops = &rxm_no_ops_flow_ctrl;
+			return 0;
+		}
+		return ret;
+	}
+
+	assert(flow_ctrl_ops);
+	domain->flow_ctrl_ops = flow_ctrl_ops;
+	domain->flow_ctrl_ops->set_send_handler(domain->msg_domain,
+						rxm_send_credits);
+	return 0;
+}
+
+struct ofi_ops_dynamic_rbuf rxm_dynamic_rbuf = {
+	.size = sizeof(struct ofi_ops_dynamic_rbuf),
+	.get_rbuf = rxm_get_dyn_rbuf,
+};
+
+static void rxm_config_dyn_rbuf(struct rxm_domain *domain, struct fi_info *info,
+				struct fi_info *msg_info)
+{
+	int ret = 1;
+
+	/* Collective support requires messages that rxm itself generates
+	 * and consumes.  Although we could update the code to handle
+	 * receiving collective messages, collective support is mostly for
+	 * development purposes.  So, fall back to bounce buffers when
+	 * collectives are enabled.  We also can't pass HMEM buffers through,
+	 * unless the lower layer can handle them.
+	 */
+	if ((info->caps & FI_COLLECTIVE) ||
+	    ((info->caps & FI_HMEM) && !(msg_info->caps & FI_HMEM)))
+		return;
+
+	fi_param_get_bool(&rxm_prov, "enable_dyn_rbuf", &ret);
+	domain->dyn_rbuf = (ret != 0);
+	if (!domain->dyn_rbuf)
+		return;
+
+	ret = fi_set_ops(&domain->msg_domain->fid, OFI_OPS_DYNAMIC_RBUF, 0,
+			 (void *) &rxm_dynamic_rbuf, NULL);
+	domain->dyn_rbuf = (ret == FI_SUCCESS);
+
+	if (domain->dyn_rbuf) {
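+		/* With dynamic rbufs only the packet header needs to be
+		 * pre-posted; payload buffers are supplied on demand through
+		 * the get_rbuf callback. */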
+		domain->rx_post_size = sizeof(struct rxm_pkt);
+	}
+}
+
 int rxm_domain_open(struct fid_fabric *fabric, struct fi_info *info,
 		struct fid_domain **domain, void *context)
 {
-	int ret;
 	struct rxm_domain *rxm_domain;
 	struct rxm_fabric *rxm_fabric;
 	struct fi_info *msg_info;
-	struct ofi_ops_flow_ctrl *flow_ctrl_ops;
+	int ret;
 
 	rxm_domain = calloc(1, sizeof(*rxm_domain));
 	if (!rxm_domain)
@@ -468,28 +550,32 @@ int rxm_domain_open(struct fid_fabric *fabric, struct fi_info *info,
 	rxm_domain->util_domain.mr_map.mode &= ~FI_MR_PROV_KEY;
 
 	rxm_domain->max_atomic_size = rxm_ep_max_atomic_size(info);
+	rxm_domain->rx_post_size = rxm_packet_size;
+
 	*domain = &rxm_domain->util_domain.domain_fid;
 	(*domain)->fid.ops = &rxm_domain_fi_ops;
 	/* Replace MR ops set by ofi_domain_init() */
 	(*domain)->mr = &rxm_domain_mr_ops;
 	(*domain)->ops = &rxm_domain_ops;
 
-	rxm_domain->mr_local = ofi_mr_local(msg_info) && !ofi_mr_local(info);
-
-	ret = fi_open_ops(&rxm_domain->msg_domain->fid, OFI_OPS_FLOW_CTRL, 0,
-			  (void **) &flow_ctrl_ops, NULL);
-	if (!ret && flow_ctrl_ops) {
-		rxm_domain->flow_ctrl_ops = flow_ctrl_ops;
-		rxm_domain->flow_ctrl_ops->set_send_handler(
-			rxm_domain->msg_domain, rxm_send_credits);
-	} else if (ret == -FI_ENOSYS) {
-		rxm_domain->flow_ctrl_ops = &rxm_no_ops_flow_ctrl;
-	} else {
+	ret = ofi_bufpool_create(&rxm_domain->amo_bufpool,
+				 rxm_domain->max_atomic_size, 64, 0, 0, 0);
+	if (ret)
 		goto err3;
-	}
+
+	fastlock_init(&rxm_domain->amo_bufpool_lock);
+
+	ret = rxm_config_flow_ctrl(rxm_domain);
+	if (ret)
+		goto err4;
+
+	rxm_config_dyn_rbuf(rxm_domain, info, msg_info);
 
 	fi_freeinfo(msg_info);
 	return 0;
+err4:
+	fastlock_destroy(&rxm_domain->amo_bufpool_lock);
+	ofi_bufpool_destroy(rxm_domain->amo_bufpool);
 err3:
 	fi_close(&rxm_domain->msg_domain->fid);
 err2:
diff --git a/deps/libfabric/prov/rxm/src/rxm_ep.c b/deps/libfabric/prov/rxm/src/rxm_ep.c
index de3e858696de59eaf458fd1cf5f2443f1dbe0211..897c0714be0e98f398d5e5f9d6ded7fb0f1a1f78 100644
--- a/deps/libfabric/prov/rxm/src/rxm_ep.c
+++ b/deps/libfabric/prov/rxm/src/rxm_ep.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2013-2020 Intel Corporation. All rights reserved.
  * Copyright (c) 2020 Cisco Systems, Inc.  All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -108,178 +109,75 @@ static int rxm_match_unexp_msg_tag_addr(struct dlist_entry *item, const void *ar
 
 static int rxm_buf_reg(struct ofi_bufpool_region *region)
 {
-	struct rxm_buf_pool *pool = region->pool->attr.context;
+	struct rxm_ep *rxm_ep = region->pool->attr.context;
 	struct rxm_domain *rxm_domain;
 	int ret;
+	bool hmem_enabled = !!(rxm_ep->util_ep.caps & FI_HMEM);
 
-	if ((pool->type == RXM_BUF_POOL_TX_INJECT) ||
-	    !pool->rxm_ep->msg_mr_local)
+	if (hmem_enabled) {
+		ret = ofi_hmem_host_register(region->mem_region,
+					     region->pool->region_size);
+		if (ret != FI_SUCCESS)
+			return ret;
+	}
+
+	if (!rxm_ep->msg_mr_local)
 		return 0;
 
-	rxm_domain = container_of(pool->rxm_ep->util_ep.domain,
+	rxm_domain = container_of(rxm_ep->util_ep.domain,
 				  struct rxm_domain, util_domain);
+
 	ret = rxm_msg_mr_reg_internal(rxm_domain, region->mem_region,
 				      region->pool->region_size,
 				      FI_SEND | FI_RECV | FI_READ | FI_WRITE,
 				      OFI_MR_NOCACHE,
 				      (struct fid_mr **) &region->context);
 
+	if (ret != FI_SUCCESS) {
+		if (hmem_enabled)
+			ofi_hmem_host_unregister(region->mem_region);
+	}
+
 	return ret;
 }
 
-static void rxm_buf_init(struct ofi_bufpool_region *region, void *buf)
+static void rxm_init_rx_buf(struct ofi_bufpool_region *region, void *buf)
 {
-	struct rxm_buf_pool *pool = region->pool->attr.context;
-	struct rxm_pkt *pkt;
-	struct rxm_rx_buf *rx_buf;
-	struct rxm_tx_base_buf *tx_base_buf;
-	struct rxm_tx_eager_buf *tx_eager_buf;
-	struct rxm_tx_sar_buf *tx_sar_buf;
-	struct rxm_tx_rndv_buf *tx_rndv_buf;
-	struct rxm_tx_atomic_buf *tx_atomic_buf;
-	struct rxm_rma_buf *rma_buf;
-	void *mr_desc;
-	uint8_t type;
-
-	if ((pool->type != RXM_BUF_POOL_TX_INJECT) &&
-	    pool->rxm_ep->msg_mr_local) {
-		mr_desc = fi_mr_desc((struct fid_mr *) region->context);
-	} else {
-		mr_desc = NULL;
-	}
-
-	switch (pool->type) {
-	case RXM_BUF_POOL_RX:
-		rx_buf = buf;
-		rx_buf->ep = pool->rxm_ep;
-
-		rx_buf->hdr.desc = mr_desc;
-		pkt = NULL;
-		type = rxm_ctrl_eager; /* This can be any value */
-		break;
-	case RXM_BUF_POOL_TX:
-		tx_eager_buf = buf;
-		tx_eager_buf->hdr.state = RXM_TX;
-
-		tx_eager_buf->hdr.desc = mr_desc;
-		pkt = &tx_eager_buf->pkt;
-		type = rxm_ctrl_eager;
-		break;
-	case RXM_BUF_POOL_TX_INJECT:
-		tx_base_buf = buf;
-		tx_base_buf->hdr.state = RXM_INJECT_TX;
-
-		pkt = &tx_base_buf->pkt;
-		type = rxm_ctrl_eager;
-		break;
-	case RXM_BUF_POOL_TX_SAR:
-		tx_sar_buf = buf;
-		tx_sar_buf->hdr.state = RXM_SAR_TX;
-
-		tx_sar_buf->hdr.desc = mr_desc;
-		pkt = &tx_sar_buf->pkt;
-		type = rxm_ctrl_seg;
-		break;
-	case RXM_BUF_POOL_TX_CREDIT:
-		tx_base_buf = buf;
-		tx_base_buf->hdr.state = RXM_CREDIT_TX;
-
-		tx_base_buf->hdr.desc = mr_desc;
-		pkt = &tx_base_buf->pkt;
-		type = rxm_ctrl_credit;
-		break;
-	case RXM_BUF_POOL_TX_RNDV:
-		tx_rndv_buf = buf;
+	struct rxm_ep *ep = region->pool->attr.context;
+	struct rxm_rx_buf *rx_buf = buf;
 
-		tx_rndv_buf->hdr.desc = mr_desc;
-		pkt = &tx_rndv_buf->pkt;
-		type = rxm_ctrl_rndv;
-		break;
-	case RXM_BUF_POOL_TX_ATOMIC:
-		tx_atomic_buf = buf;
+	rx_buf->hdr.desc = ep->msg_mr_local ?
+			   fi_mr_desc((struct fid_mr *) region->context) : NULL;
+	rx_buf->ep = ep;
+	rx_buf->data = &rx_buf->pkt.data;
+}
 
-		tx_atomic_buf->hdr.desc = mr_desc;
-		pkt = &tx_atomic_buf->pkt;
-		type = rxm_ctrl_atomic;
-		break;
-	case RXM_BUF_POOL_TX_ACK:
-		tx_base_buf = buf;
-		tx_base_buf->pkt.hdr.op = ofi_op_msg;
+static void rxm_init_tx_buf(struct ofi_bufpool_region *region, void *buf)
+{
+	struct rxm_ep *ep = region->pool->attr.context;
+	struct rxm_tx_buf *tx_buf = buf;
 
-		tx_base_buf->hdr.desc = mr_desc;
-		pkt = &tx_base_buf->pkt;
-		type = rxm_ctrl_rndv_ack;
-		break;
-	case RXM_BUF_POOL_RMA:
-		rma_buf = buf;
-		rma_buf->pkt.hdr.op = ofi_op_msg;
-		rma_buf->hdr.state = RXM_RMA;
-
-		rma_buf->hdr.desc = mr_desc;
-		pkt = &rma_buf->pkt;
-		type = rxm_ctrl_eager;
-		break;
-	default:
-		assert(0);
-		pkt = NULL;
-		break;
-	}
+	tx_buf->hdr.desc = ep->msg_mr_local ?
+			   fi_mr_desc((struct fid_mr *) region->context) : NULL;
 
-	if (pkt) {
-		pkt->ctrl_hdr.version = RXM_CTRL_VERSION;
-		pkt->hdr.version = OFI_OP_VERSION;
-		pkt->ctrl_hdr.type = type;
-	}
+	tx_buf->pkt.ctrl_hdr.version = RXM_CTRL_VERSION;
+	tx_buf->pkt.hdr.version = OFI_OP_VERSION;
 }
 
 static void rxm_buf_close(struct ofi_bufpool_region *region)
 {
-	struct rxm_buf_pool *pool = region->pool->attr.context;
-	struct rxm_ep *rxm_ep = pool->rxm_ep;
+	struct rxm_ep *ep = region->pool->attr.context;
+
+	if (ep->util_ep.caps & FI_HMEM)
+		ofi_hmem_host_unregister(region->mem_region);
 
-	if ((rxm_ep->msg_mr_local) && (pool->type != RXM_BUF_POOL_TX_INJECT)) {
+	if (ep->msg_mr_local) {
 		/* We would get a (fid_mr *) in context but
 		 * it is safe to cast it into (fid *) */
 		fi_close(region->context);
 	}
 }
 
-static void rxm_buf_pool_destroy(struct rxm_buf_pool *pool)
-{
-	/* This indicates whether the pool is allocated or not */
-	if (pool->rxm_ep) {
-		ofi_bufpool_destroy(pool->pool);
-	}
-}
-
-static int rxm_buf_pool_create(struct rxm_ep *rxm_ep, size_t size,
-			       size_t max_cnt, size_t chunk_count,
-			       struct rxm_buf_pool *pool,
-			       enum rxm_buf_pool_type type)
-{
-	int ret;
-	struct ofi_bufpool_attr attr = {
-		.size		= size,
-		.alignment	= 16,
-		.max_cnt	= max_cnt,
-		.chunk_cnt	= chunk_count,
-		.alloc_fn	= rxm_buf_reg,
-		.free_fn	= rxm_buf_close,
-		.init_fn	= rxm_buf_init,
-		.context	= pool,
-		.flags		= OFI_BUFPOOL_NO_TRACK | OFI_BUFPOOL_HUGEPAGES,
-	};
-
-	pool->rxm_ep = rxm_ep;
-	pool->type = type;
-	ret = ofi_bufpool_create_attr(&attr, &pool->pool);
-	if (ret)
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
-			"Unable to create buf pool\n");
-
-	return ret;
-}
-
 static void rxm_recv_entry_init(struct rxm_recv_entry *entry, void *arg)
 {
 	struct rxm_recv_queue *recv_queue = arg;
@@ -338,82 +236,63 @@ static void rxm_recv_queue_close(struct rxm_recv_queue *recv_queue)
 	/* It indicates that the recv_queue was allocated */
 	if (recv_queue->fs) {
 		rxm_recv_fs_free(recv_queue->fs);
+		recv_queue->fs = NULL;
 	}
 	// TODO cleanup recv_list and unexp msg list
 }
 
-static int rxm_ep_txrx_pool_create(struct rxm_ep *rxm_ep)
+static int rxm_ep_create_pools(struct rxm_ep *rxm_ep)
 {
-	int ret, i;
-	size_t queue_sizes[] = {
-		[RXM_BUF_POOL_RX] = rxm_ep->msg_info->rx_attr->size,
-		[RXM_BUF_POOL_TX] = rxm_ep->msg_info->tx_attr->size,
-		[RXM_BUF_POOL_TX_INJECT] = rxm_ep->msg_info->tx_attr->size,
-		[RXM_BUF_POOL_TX_ACK] = rxm_ep->msg_info->tx_attr->size,
-		[RXM_BUF_POOL_TX_RNDV] = rxm_ep->msg_info->tx_attr->size,
-		[RXM_BUF_POOL_TX_ATOMIC] = rxm_ep->msg_info->tx_attr->size,
-		[RXM_BUF_POOL_TX_SAR] = rxm_ep->msg_info->tx_attr->size,
-		[RXM_BUF_POOL_TX_CREDIT] = rxm_ep->msg_info->tx_attr->size,
-		[RXM_BUF_POOL_RMA] = rxm_ep->msg_info->tx_attr->size,
-	};
-	size_t entry_sizes[] = {
-		[RXM_BUF_POOL_RX] = rxm_eager_limit +
-				    sizeof(struct rxm_rx_buf),
-		[RXM_BUF_POOL_TX] = rxm_eager_limit +
-				    sizeof(struct rxm_tx_eager_buf),
-		[RXM_BUF_POOL_TX_INJECT] = rxm_ep->inject_limit +
-					   sizeof(struct rxm_tx_base_buf),
-		[RXM_BUF_POOL_TX_ACK] = sizeof(struct rxm_tx_base_buf),
-		[RXM_BUF_POOL_TX_RNDV] = sizeof(struct rxm_rndv_hdr) +
-					 rxm_ep->buffered_min +
-					 sizeof(struct rxm_tx_rndv_buf),
-		[RXM_BUF_POOL_TX_ATOMIC] = rxm_eager_limit +
-					 sizeof(struct rxm_tx_atomic_buf),
-		[RXM_BUF_POOL_TX_SAR] = rxm_eager_limit +
-					sizeof(struct rxm_tx_sar_buf),
-		[RXM_BUF_POOL_TX_CREDIT] = sizeof(struct rxm_tx_base_buf),
-		[RXM_BUF_POOL_RMA] = rxm_eager_limit +
-				     sizeof(struct rxm_rma_buf),
-	};
-
-	dlist_init(&rxm_ep->repost_ready_list);
+	struct ofi_bufpool_attr attr = {0};
+	int ret;
 
-	rxm_ep->buf_pools = calloc(1, RXM_BUF_POOL_MAX *
-				      sizeof(*rxm_ep->buf_pools));
-	if (!rxm_ep->buf_pools)
-		return -FI_ENOMEM;
+	attr.size = rxm_buffer_size + sizeof(struct rxm_rx_buf);
+	attr.alignment = 16;
+	attr.chunk_cnt = 1024;
+	attr.alloc_fn = rxm_buf_reg;
+	attr.free_fn = rxm_buf_close;
+	attr.init_fn = rxm_init_rx_buf;
+	attr.context = rxm_ep;
+	attr.flags = OFI_BUFPOOL_NO_TRACK;
 
-	for (i = RXM_BUF_POOL_START; i < RXM_BUF_POOL_MAX; i++) {
-		if ((i == RXM_BUF_POOL_TX_INJECT) &&
-		    (rxm_ep->util_ep.domain->threading != FI_THREAD_SAFE))
-			continue;
+	ret = ofi_bufpool_create_attr(&attr, &rxm_ep->rx_pool);
+	if (ret) {
+		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
+			"Unable to create rx buf pool\n");
+		return ret;
+	}
 
-		ret = rxm_buf_pool_create(rxm_ep, entry_sizes[i],
-					  (i == RXM_BUF_POOL_RX ||
-					   i == RXM_BUF_POOL_TX_ATOMIC) ? 0 :
-					  rxm_ep->rxm_info->tx_attr->size,
-					  queue_sizes[i],
-					  &rxm_ep->buf_pools[i], i);
-		if (ret)
-			goto err;
+	attr.size = rxm_buffer_size + sizeof(struct rxm_tx_buf);
+	attr.init_fn = rxm_init_tx_buf;
+	ret = ofi_bufpool_create_attr(&attr, &rxm_ep->tx_pool);
+	if (ret) {
+		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
+			"Unable to create rx buf pool\n");
+		goto free_rx_pool;
 	}
 
-	return FI_SUCCESS;
+	return 0;
 
-err:
-	while (--i >= RXM_BUF_POOL_START)
-		rxm_buf_pool_destroy(&rxm_ep->buf_pools[i]);
-	free(rxm_ep->buf_pools);
+free_rx_pool:
+	ofi_bufpool_destroy(rxm_ep->rx_pool);
+	rxm_ep->rx_pool = NULL;
 	return ret;
 }
 
-static void rxm_ep_txrx_pool_destroy(struct rxm_ep *rxm_ep)
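+/* Multi-recv entries come from their own growable pool (max_cnt = 0)
+ * instead of the fixed-size recv-queue freestack. */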
+static int rxm_multi_recv_pool_init(struct rxm_ep *rxm_ep)
 {
-	size_t i;
+	struct ofi_bufpool_attr attr = {
+		.size		= sizeof(struct rxm_recv_entry),
+		.alignment	= 16,
+		.max_cnt	= 0,
+		.chunk_cnt	= 16,
+		.alloc_fn	= NULL,
+		.init_fn	= NULL,
+		.context	= rxm_ep,
+		.flags		= OFI_BUFPOOL_NO_TRACK,
+	};
 
-	for (i = RXM_BUF_POOL_START; i < RXM_BUF_POOL_MAX; i++)
-		rxm_buf_pool_destroy(&rxm_ep->buf_pools[i]);
-	free(rxm_ep->buf_pools);
+	return ofi_bufpool_create_attr(&attr, &rxm_ep->multi_recv_pool);
 }
 
 static int rxm_ep_rx_queue_init(struct rxm_ep *rxm_ep)
@@ -432,26 +311,38 @@ static int rxm_ep_rx_queue_init(struct rxm_ep *rxm_ep)
 	if (ret)
 		goto err_recv_tag;
 
+	ret = rxm_multi_recv_pool_init(rxm_ep);
+	if (ret)
+		goto err_multi;
+
 	return FI_SUCCESS;
 
+err_multi:
+	rxm_recv_queue_close(&rxm_ep->trecv_queue);
 err_recv_tag:
 	rxm_recv_queue_close(&rxm_ep->recv_queue);
 	return ret;
 }
 
-static void rxm_ep_rx_queue_close(struct rxm_ep *rxm_ep)
-{
-	rxm_recv_queue_close(&rxm_ep->trecv_queue);
-	rxm_recv_queue_close(&rxm_ep->recv_queue);
-}
-
 /* It is safe to call this function, even if `rxm_ep_txrx_res_open`
  * has not yet been called */
-static void rxm_ep_txrx_res_close(struct rxm_ep *rxm_ep)
+static void rxm_ep_txrx_res_close(struct rxm_ep *ep)
 {
-	rxm_ep_rx_queue_close(rxm_ep);
-	if (rxm_ep->buf_pools)
-		rxm_ep_txrx_pool_destroy(rxm_ep);
+	rxm_recv_queue_close(&ep->trecv_queue);
+	rxm_recv_queue_close(&ep->recv_queue);
+
+	if (ep->multi_recv_pool) {
+		ofi_bufpool_destroy(ep->multi_recv_pool);
+		ep->multi_recv_pool = NULL;
+	}
+	if (ep->rx_pool) {
+		ofi_bufpool_destroy(ep->rx_pool);
+		ep->rx_pool = NULL;
+	}
+	if (ep->tx_pool) {
+		ofi_bufpool_destroy(ep->tx_pool);
+		ep->tx_pool = NULL;
+	}
 }
 
 static int rxm_setname(fid_t fid, void *addr, size_t addrlen)
@@ -506,8 +397,8 @@ static struct rxm_eager_ops coll_eager_ops = {
 	.handle_rx = rxm_handle_coll_eager,
 };
 
-static int rxm_ep_cancel_recv(struct rxm_ep *rxm_ep,
-			      struct rxm_recv_queue *recv_queue, void *context)
+static bool rxm_ep_cancel_recv(struct rxm_ep *rxm_ep,
+			       struct rxm_recv_queue *recv_queue, void *context)
 {
 	struct fi_cq_err_entry err_entry;
 	struct rxm_recv_entry *recv_entry;
@@ -518,35 +409,35 @@ static int rxm_ep_cancel_recv(struct rxm_ep *rxm_ep,
 	entry = dlist_remove_first_match(&recv_queue->recv_list,
 					 rxm_match_recv_entry_context,
 					 context);
-	if (entry) {
-		recv_entry = container_of(entry, struct rxm_recv_entry, entry);
-		memset(&err_entry, 0, sizeof(err_entry));
-		err_entry.op_context = recv_entry->context;
-		err_entry.flags |= recv_entry->comp_flags;
-		err_entry.tag = recv_entry->tag;
-		err_entry.err = FI_ECANCELED;
-		err_entry.prov_errno = -FI_ECANCELED;
-		rxm_recv_entry_release(recv_queue, recv_entry);
-		ret = ofi_cq_write_error(rxm_ep->util_ep.rx_cq, &err_entry);
-	} else {
-		ret = 0;
+	if (!entry)
+		goto unlock;
+
+	recv_entry = container_of(entry, struct rxm_recv_entry, entry);
+	memset(&err_entry, 0, sizeof(err_entry));
+	err_entry.op_context = recv_entry->context;
+	err_entry.flags |= recv_entry->comp_flags;
+	err_entry.tag = recv_entry->tag;
+	err_entry.err = FI_ECANCELED;
+	err_entry.prov_errno = -FI_ECANCELED;
+	rxm_recv_entry_release(recv_entry);
+	ret = ofi_cq_write_error(rxm_ep->util_ep.rx_cq, &err_entry);
+	if (ret) {
+		FI_WARN(&rxm_prov, FI_LOG_CQ, "Error writing to CQ\n");
+		assert(0);
 	}
+
+unlock:
 	ofi_ep_lock_release(&rxm_ep->util_ep);
-	return ret;
+	return entry != NULL;
 }
 
 static ssize_t rxm_ep_cancel(fid_t fid_ep, void *context)
 {
-	struct rxm_ep *rxm_ep = container_of(fid_ep, struct rxm_ep, util_ep.ep_fid);
-	int ret;
-
-	ret = rxm_ep_cancel_recv(rxm_ep, &rxm_ep->recv_queue, context);
-	if (ret)
-		return ret;
+	struct rxm_ep *rxm_ep;
 
-	ret = rxm_ep_cancel_recv(rxm_ep, &rxm_ep->trecv_queue, context);
-	if (ret)
-		return ret;
+	rxm_ep = container_of(fid_ep, struct rxm_ep, util_ep.ep_fid);
+	if (!rxm_ep_cancel_recv(rxm_ep, &rxm_ep->recv_queue, context))
+		rxm_ep_cancel_recv(rxm_ep, &rxm_ep->trecv_queue, context);
 
 	return 0;
 }
@@ -600,7 +491,7 @@ static int rxm_ep_setopt(fid_t fid, int level, int optname,
 			rxm_ep->min_multi_recv_size);
 		break;
 	case FI_OPT_BUFFERED_MIN:
-		if (rxm_ep->buf_pools) {
+		if (rxm_ep->rx_pool) {
 			FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
 				"Endpoint already enabled. Can't set opt now!\n");
 			ret = -FI_EOPBADSTATE;
@@ -618,7 +509,7 @@ static int rxm_ep_setopt(fid_t fid, int level, int optname,
 		}
 		break;
 	case FI_OPT_BUFFERED_LIMIT:
-		if (rxm_ep->buf_pools) {
+		if (rxm_ep->rx_pool) {
 			FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
 				"Endpoint already enabled. Can't set opt now!\n");
 			ret = -FI_EOPBADSTATE;
@@ -653,7 +544,7 @@ static struct fi_ops_ep rxm_ops_ep = {
 	.tx_size_left = fi_no_tx_size_left,
 };
 
-/* Caller must hold recv_queue->lock */
+/* Caller must hold recv_queue->lock -- TODO which lock? */
 static struct rxm_rx_buf *
 rxm_get_unexp_msg(struct rxm_recv_queue *recv_queue, fi_addr_t addr,
 		  uint64_t tag, uint64_t ignore)
@@ -709,8 +600,8 @@ static int rxm_handle_unexp_sar(struct rxm_recv_queue *recv_queue,
 			continue;
 
 		if (!rx_buf->conn) {
-			rx_buf->conn = rxm_key2conn(rx_buf->ep,
-							rx_buf->pkt.ctrl_hdr.conn_id);
+			rx_buf->conn = ofi_idm_at(&rx_buf->ep->conn_idx_map,
+					(int) rx_buf->pkt.ctrl_hdr.conn_id);
 		}
 		if (recv_entry->sar.conn != rx_buf->conn)
 			continue;
@@ -726,24 +617,24 @@ static int rxm_handle_unexp_sar(struct rxm_recv_queue *recv_queue,
 
 }
 
-static int rxm_ep_discard_recv(struct rxm_ep *rxm_ep, struct rxm_rx_buf *rx_buf,
+static void rxm_ep_discard_recv(struct rxm_ep *rxm_ep, struct rxm_rx_buf *rx_buf,
 			       void *context)
 {
-	int ret;
 	RXM_DBG_ADDR_TAG(FI_LOG_EP_DATA, "Discarding message",
 			 rx_buf->unexp_msg.addr, rx_buf->unexp_msg.tag);
 
-	ret = ofi_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV,
-			    0, NULL, rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag);
+	rxm_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV,
+		     0, NULL, rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag);
 	rxm_rx_buf_free(rx_buf);
-	return ret;
 }
 
-static int rxm_ep_peek_recv(struct rxm_ep *rxm_ep, fi_addr_t addr, uint64_t tag,
-			    uint64_t ignore, void *context, uint64_t flags,
-			    struct rxm_recv_queue *recv_queue)
+static void
+rxm_ep_peek_recv(struct rxm_ep *rxm_ep, fi_addr_t addr, uint64_t tag,
+		 uint64_t ignore, void *context, uint64_t flags,
+		 struct rxm_recv_queue *recv_queue)
 {
 	struct rxm_rx_buf *rx_buf;
+	int ret;
 
 	RXM_DBG_ADDR_TAG(FI_LOG_EP_DATA, "Peeking message", addr, tag);
 
@@ -752,15 +643,19 @@ static int rxm_ep_peek_recv(struct rxm_ep *rxm_ep, fi_addr_t addr, uint64_t tag,
 	rx_buf = rxm_get_unexp_msg(recv_queue, addr, tag, ignore);
 	if (!rx_buf) {
 		FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Message not found\n");
-		return ofi_cq_write_error_peek(rxm_ep->util_ep.rx_cq, tag,
-					       context);
+		ret = ofi_cq_write_error_peek(rxm_ep->util_ep.rx_cq, tag,
+					      context);
+		if (ret)
+			FI_WARN(&rxm_prov, FI_LOG_CQ, "Error writing to CQ\n");
+		return;
 	}
 
 	FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Message found\n");
 
 	if (flags & FI_DISCARD) {
 		dlist_remove(&rx_buf->unexp_msg.entry);
-		return rxm_ep_discard_recv(rxm_ep, rx_buf, context);
+		rxm_ep_discard_recv(rxm_ep, rx_buf, context);
+		return;
 	}
 
 	if (flags & FI_CLAIM) {
@@ -769,26 +664,20 @@ static int rxm_ep_peek_recv(struct rxm_ep *rxm_ep, fi_addr_t addr, uint64_t tag,
 		dlist_remove(&rx_buf->unexp_msg.entry);
 	}
 
-	return ofi_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV,
-			    rx_buf->pkt.hdr.size, NULL,
-			    rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag);
+	rxm_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV,
+		     rx_buf->pkt.hdr.size, NULL,
+		     rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag);
 }
 
-static struct rxm_recv_entry *
-rxm_recv_entry_get(struct rxm_ep *rxm_ep, const struct iovec *iov,
-		   void **desc, size_t count, fi_addr_t src_addr,
-		   uint64_t tag, uint64_t ignore, void *context,
-		   uint64_t flags, struct rxm_recv_queue *recv_queue)
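+/* Shared initialization for recv entries, whether they come from a recv
+ * queue's freestack or from the multi-recv entry pool. */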
+static void rxm_recv_entry_init_common(struct rxm_recv_entry *recv_entry,
+		const struct iovec *iov, void **desc, size_t count,
+		fi_addr_t src_addr, uint64_t tag, uint64_t ignore,
+		void *context, uint64_t flags,
+		struct rxm_recv_queue *recv_queue)
 {
-	struct rxm_recv_entry *recv_entry;
 	size_t i;
 
-	if (freestack_isempty(recv_queue->fs))
-		return NULL;
-
-	recv_entry = freestack_pop(recv_queue->fs);
 	assert(!recv_entry->rndv.tx_buf);
-
 	recv_entry->rxm_iov.count = (uint8_t) count;
 	recv_entry->addr = src_addr;
 	recv_entry->context = context;
@@ -796,16 +685,81 @@ rxm_recv_entry_get(struct rxm_ep *rxm_ep, const struct iovec *iov,
 	recv_entry->ignore = ignore;
 	recv_entry->tag = tag;
 
+	recv_entry->sar.msg_id = RXM_SAR_RX_INIT;
+	recv_entry->sar.total_recv_len = 0;
+	recv_entry->total_len = 0;
+
 	for (i = 0; i < count; i++) {
 		recv_entry->rxm_iov.iov[i] = iov[i];
 		recv_entry->total_len += iov[i].iov_len;
-		if (desc)
+		if (desc && desc[i])
 			recv_entry->rxm_iov.desc[i] = desc[i];
+		else
+			recv_entry->rxm_iov.desc[i] = NULL;
 	}
+}
 
+static struct rxm_recv_entry *
+rxm_recv_entry_get(struct rxm_ep *rxm_ep, const struct iovec *iov,
+		   void **desc, size_t count, fi_addr_t src_addr,
+		   uint64_t tag, uint64_t ignore, void *context,
+		   uint64_t flags, struct rxm_recv_queue *recv_queue)
+{
+	struct rxm_recv_entry *recv_entry;
+
+	if (ofi_freestack_isempty(recv_queue->fs))
+		return NULL;
+
+	recv_entry = ofi_freestack_pop(recv_queue->fs);
+
+	rxm_recv_entry_init_common(recv_entry, iov, desc, count, src_addr, tag,
+			    ignore, context, flags, recv_queue);
+
+	return recv_entry;
+}
+
+struct rxm_recv_entry *
+rxm_multi_recv_entry_get(struct rxm_ep *rxm_ep, const struct iovec *iov,
+		   void **desc, size_t count, fi_addr_t src_addr,
+		   uint64_t tag, uint64_t ignore, void *context,
+		   uint64_t flags)
+{
+	struct rxm_recv_entry *recv_entry;
+
+	recv_entry = ofi_buf_alloc(rxm_ep->multi_recv_pool);
+
+	rxm_recv_entry_init_common(recv_entry, iov, desc, count, src_addr, tag,
+			    ignore, context, flags, NULL);
+
+	recv_entry->comp_flags = FI_MSG | FI_RECV;
 	return recv_entry;
 }
 
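+/* TX buffers are gated by a per-endpoint credit count: allocation fails
+ * once tx_credit reaches zero and each release returns one credit.  Both
+ * paths assert that the caller holds the ep lock. */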
+struct rxm_tx_buf *rxm_get_tx_buf(struct rxm_ep *ep)
+{
+	struct rxm_tx_buf *buf;
+
+	assert(fastlock_held(&ep->util_ep.lock));
+	if (!ep->tx_credit)
+		return NULL;
+
+	buf = ofi_buf_alloc(ep->tx_pool);
+	if (buf) {
+		OFI_DBG_SET(buf->user_tx, true);
+		ep->tx_credit--;
+	}
+	return buf;
+}
+
+void rxm_free_rx_buf(struct rxm_ep *ep, struct rxm_tx_buf *buf)
+{
+	assert(fastlock_held(&ep->util_ep.lock));
+	assert(buf->user_tx);
+	OFI_DBG_SET(buf->user_tx, false);
+	ep->tx_credit++;
+	ofi_buf_free(buf);
+}
+
 /*
  * We don't expect to have unexpected messages when the app is using
  * multi-recv buffers.  Optimize for that case.
@@ -859,7 +813,7 @@ rxm_ep_post_mrecv(struct rxm_ep *ep, const struct iovec *iov,
 
 	if ((cur_iov.iov_len < ep->min_multi_recv_size) ||
 	    (ret && cur_iov.iov_len != iov->iov_len)) {
-		ofi_cq_write(ep->util_ep.rx_cq, context, FI_MULTI_RECV,
+		rxm_cq_write(ep->util_ep.rx_cq, context, FI_MULTI_RECV,
 			     0, NULL, 0, 0);
 	}
 
@@ -998,14 +952,14 @@ static ssize_t rxm_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov,
 				  context, rxm_ep->util_ep.rx_op_flags);
 }
 
-static void rxm_rndv_hdr_init(struct rxm_ep *rxm_ep, void *buf,
+void rxm_rndv_hdr_init(struct rxm_ep *rxm_ep, void *buf,
 			      const struct iovec *iov, size_t count,
 			      struct fid_mr **mr)
 {
 	struct rxm_rndv_hdr *rndv_hdr = (struct rxm_rndv_hdr *)buf;
 	size_t i;
 
-	for (i = 0; i < count; i++) {
+	for (i = 0; i < count && mr[i]; i++) {
 		rndv_hdr->iov[i].addr = RXM_MR_VIRT_ADDR(rxm_ep->msg_info) ?
 			(uintptr_t)iov[i].iov_base : 0;
 		rndv_hdr->iov[i].len = (uint64_t)iov[i].iov_len;
@@ -1014,23 +968,6 @@ static void rxm_rndv_hdr_init(struct rxm_ep *rxm_ep, void *buf,
 	rndv_hdr->count = (uint8_t)count;
 }
 
-static ssize_t
-rxm_ep_msg_inject_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
-		       struct rxm_pkt *tx_pkt, size_t pkt_size,
-		       ofi_cntr_inc_func cntr_inc_func)
-{
-	FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Posting inject with length: %zu"
-	       " tag: 0x%" PRIx64 "\n", pkt_size, tx_pkt->hdr.tag);
-
-	assert((tx_pkt->hdr.flags & FI_REMOTE_CQ_DATA) || !tx_pkt->hdr.flags);
-	assert(pkt_size <= rxm_ep->inject_limit);
-
-	ssize_t ret = fi_inject(rxm_conn->msg_ep, tx_pkt, pkt_size, 0);
-	if (ret == -FI_EAGAIN)
-		rxm_ep_do_progress(&rxm_ep->util_ep);
-	return ret;
-}
-
 static ssize_t
 rxm_ep_msg_normal_send(struct rxm_conn *rxm_conn, struct rxm_pkt *tx_pkt,
 		       size_t pkt_size, void *desc, void *context)
@@ -1044,110 +981,129 @@ rxm_ep_msg_normal_send(struct rxm_conn *rxm_conn, struct rxm_pkt *tx_pkt,
 }
 
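+/* Rendezvous setup: register (or reuse) MRs for the source iov, record
+ * the write-rendezvous target when RMA writes are in use, and build the
+ * rndv header; returns the wire length of the request packet. */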
 static ssize_t
-rxm_ep_alloc_rndv_tx_res(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
-			 void *context, uint8_t count, const struct iovec *iov,
-			 void **desc, size_t data_len, uint64_t data,
-			 uint64_t flags, uint64_t tag, uint8_t op,
-			 struct rxm_tx_rndv_buf **tx_rndv_buf)
+rxm_alloc_rndv_buf(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
+		   void *context, uint8_t count, const struct iovec *iov,
+		   void **desc, size_t data_len, uint64_t data,
+		   uint64_t flags, uint64_t tag, uint8_t op,
+		   enum fi_hmem_iface iface, uint64_t device,
+		   struct rxm_tx_buf **rndv_buf)
 {
+	struct fid_mr *rxm_mr_msg_mr[RXM_IOV_LIMIT];
 	struct fid_mr **mr_iov;
+	size_t len, i;
 	ssize_t ret;
-	struct rxm_tx_rndv_buf *tx_buf;
 
-	tx_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_RNDV);
-	if (!tx_buf) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
-			"Ran out of buffers from RNDV buffer pool\n");
+	*rndv_buf = rxm_get_tx_buf(rxm_ep);
+	if (!*rndv_buf)
 		return -FI_EAGAIN;
-	}
 
+	(*rndv_buf)->pkt.ctrl_hdr.type = rxm_ctrl_rndv_req;
 	rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, op, data, tag,
-				 flags, &(tx_buf)->pkt);
-	tx_buf->pkt.ctrl_hdr.msg_id = ofi_buf_index(tx_buf);
-	tx_buf->app_context = context;
-	tx_buf->flags = flags;
-	tx_buf->count = count;
+				 flags, &(*rndv_buf)->pkt);
+	(*rndv_buf)->pkt.ctrl_hdr.msg_id = ofi_buf_index(*rndv_buf);
+	(*rndv_buf)->app_context = context;
+	(*rndv_buf)->flags = flags;
+	(*rndv_buf)->rma.count = count;
 
 	if (!rxm_ep->rdm_mr_local) {
-		ret = rxm_msg_mr_regv(rxm_ep, iov, tx_buf->count, data_len,
-				      FI_REMOTE_READ, tx_buf->mr);
+		ret = rxm_msg_mr_regv(rxm_ep, iov, (*rndv_buf)->rma.count, data_len,
+				      rxm_ep->rndv_ops->tx_mr_access,
+				      (*rndv_buf)->rma.mr);
 		if (ret)
 			goto err;
-		mr_iov = tx_buf->mr;
+		mr_iov = (*rndv_buf)->rma.mr;
 	} else {
-		/* desc is msg fid_mr * array */
-		mr_iov = (struct fid_mr **)desc;
+		for (i = 0; i < count; i++)
+			rxm_mr_msg_mr[i] = ((struct rxm_mr *) desc[i])->msg_mr;
+
+		mr_iov = rxm_mr_msg_mr;
 	}
 
-	rxm_rndv_hdr_init(rxm_ep, &tx_buf->pkt.data, iov, tx_buf->count,
-			  mr_iov);
+	if (rxm_ep->rndv_ops == &rxm_rndv_ops_write) {
+		(*rndv_buf)->write_rndv.conn = rxm_conn;
+		for (i = 0; i < count; i++) {
+			(*rndv_buf)->write_rndv.iov[i] = iov[i];
+			(*rndv_buf)->write_rndv.desc[i] = fi_mr_desc(mr_iov[i]);
+		}
+	}
+
+	rxm_rndv_hdr_init(rxm_ep, &(*rndv_buf)->pkt.data, iov,
+			  (*rndv_buf)->rma.count, mr_iov);
 
-	ret = sizeof(struct rxm_pkt) + sizeof(struct rxm_rndv_hdr);
+	len = sizeof(struct rxm_pkt) + sizeof(struct rxm_rndv_hdr);
 
 	if (rxm_ep->rxm_info->mode & FI_BUFFERED_RECV) {
-		ofi_copy_from_iov(rxm_pkt_rndv_data(&tx_buf->pkt),
-				  rxm_ep->buffered_min, iov, count, 0);
-		ret += rxm_ep->buffered_min;
+		ret = ofi_copy_from_hmem_iov(rxm_pkt_rndv_data(&(*rndv_buf)->pkt),
+					     rxm_ep->buffered_min, iface,
+					     device, iov, count, 0);
+		assert(ret == rxm_ep->buffered_min);
+
+		len += rxm_ep->buffered_min;
 	}
 
-	*tx_rndv_buf = tx_buf;
-	return ret;
+	return len;
+
 err:
-	*tx_rndv_buf = NULL;
-	ofi_buf_free(tx_buf);
+	rxm_free_rx_buf(rxm_ep, *rndv_buf);
 	return ret;
 }
 
 static ssize_t
 rxm_ep_rndv_tx_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
-		   struct rxm_tx_rndv_buf *tx_buf, size_t pkt_size)
+		    struct rxm_tx_buf *tx_buf, size_t pkt_size)
 {
 	ssize_t ret;
 
-	RXM_UPDATE_STATE(FI_LOG_EP_DATA, tx_buf, RXM_RNDV_TX);
 	if (pkt_size <= rxm_ep->inject_limit) {
-		RXM_UPDATE_STATE(FI_LOG_EP_DATA, tx_buf, RXM_RNDV_ACK_WAIT);
-		ret = rxm_ep_msg_inject_send(rxm_ep, rxm_conn, &tx_buf->pkt,
-					     pkt_size, ofi_cntr_inc_noop);
-	} else {
-		tx_buf->hdr.state = RXM_RNDV_TX;
+		if (rxm_ep->rndv_ops == &rxm_rndv_ops_write)
+			RXM_UPDATE_STATE(FI_LOG_EP_DATA, tx_buf,
+					 RXM_RNDV_WRITE_DATA_WAIT);
+		else
+			RXM_UPDATE_STATE(FI_LOG_EP_DATA, tx_buf,
+					 RXM_RNDV_READ_DONE_WAIT);
 
+		ret = fi_inject(rxm_conn->msg_ep, &tx_buf->pkt, pkt_size, 0);
+	} else {
+		RXM_UPDATE_STATE(FI_LOG_EP_DATA, tx_buf, RXM_RNDV_TX);
 		ret = rxm_ep_msg_normal_send(rxm_conn, &tx_buf->pkt, pkt_size,
 					     tx_buf->hdr.desc, tx_buf);
 	}
+
 	if (ret)
 		goto err;
+
 	return FI_SUCCESS;
+
 err:
 	FI_DBG(&rxm_prov, FI_LOG_EP_DATA,
 	       "Transmit for MSG provider failed\n");
 	if (!rxm_ep->rdm_mr_local)
-		rxm_msg_mr_closev(tx_buf->mr, tx_buf->count);
-	ofi_buf_free(tx_buf);
+		rxm_msg_mr_closev(tx_buf->rma.mr, tx_buf->rma.count);
+	rxm_free_rx_buf(rxm_ep, tx_buf);
 	return ret;
 }
 
 static size_t
 rxm_ep_sar_calc_segs_cnt(struct rxm_ep *rxm_ep, size_t data_len)
 {
-	return (data_len + rxm_eager_limit - 1) / rxm_eager_limit;
+	return (data_len + rxm_buffer_size - 1) / rxm_buffer_size;
 }
 
-static struct rxm_tx_sar_buf *
+static struct rxm_tx_buf *
 rxm_ep_sar_tx_prepare_segment(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 			      void *app_context, size_t total_len,
 			      size_t seg_len, size_t seg_no, uint64_t data,
 			      uint64_t flags, uint64_t tag, uint8_t op,
 			      enum rxm_sar_seg_type seg_type, uint64_t *msg_id)
 {
-	struct rxm_tx_sar_buf *tx_buf;
+	struct rxm_tx_buf *tx_buf;
 
-	tx_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_SAR);
-	if (!tx_buf) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
-			"Ran out of buffers from SAR buffer pool\n");
+	tx_buf = rxm_get_tx_buf(rxm_ep);
+	if (!tx_buf)
 		return NULL;
-	};
+
+	tx_buf->hdr.state = RXM_SAR_TX;
+	tx_buf->pkt.ctrl_hdr.type = rxm_ctrl_seg;
 
 	rxm_ep_format_tx_buf_pkt(rxm_conn, total_len, op, data, tag, flags,
 				 &tx_buf->pkt);
@@ -1167,15 +1123,14 @@ rxm_ep_sar_tx_prepare_segment(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 
 static void
 rxm_ep_sar_tx_cleanup(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
-		      struct rxm_tx_sar_buf *tx_buf)
+		      struct rxm_tx_buf *tx_buf)
 {
-	struct rxm_tx_sar_buf *first_tx_buf;
+	struct rxm_tx_buf *first_tx_buf;
 
-	first_tx_buf = ofi_bufpool_get_ibuf(rxm_ep->
-					    buf_pools[RXM_BUF_POOL_TX_SAR].pool,
+	first_tx_buf = ofi_bufpool_get_ibuf(rxm_ep->tx_pool,
 					    tx_buf->pkt.ctrl_hdr.msg_id);
-	ofi_buf_free(first_tx_buf);
-	ofi_buf_free(tx_buf);
+	rxm_free_rx_buf(rxm_ep, first_tx_buf);
+	rxm_free_rx_buf(rxm_ep, tx_buf);
 }
 
 static ssize_t
@@ -1185,10 +1140,12 @@ rxm_ep_sar_tx_prepare_and_send_segment(struct rxm_ep *rxm_ep,
 		size_t seg_no, size_t segs_cnt, uint64_t data, uint64_t flags,
 		uint64_t tag, uint8_t op, const struct iovec *iov,
 		uint8_t count, size_t *iov_offset,
-		struct rxm_tx_sar_buf **out_tx_buf)
+		struct rxm_tx_buf **out_tx_buf,
+		enum fi_hmem_iface iface, uint64_t device)
 {
-	struct rxm_tx_sar_buf *tx_buf;
+	struct rxm_tx_buf *tx_buf;
 	enum rxm_sar_seg_type seg_type = RXM_SAR_SEG_MIDDLE;
+	ssize_t ret __attribute__((unused));
 
 	if (seg_no == (segs_cnt - 1)) {
 		seg_type = RXM_SAR_SEG_LAST;
@@ -1204,7 +1161,10 @@ rxm_ep_sar_tx_prepare_and_send_segment(struct rxm_ep *rxm_ep,
 		return -FI_EAGAIN;
 	}
 
-	ofi_copy_from_iov(tx_buf->pkt.data, seg_len, iov, count, *iov_offset);
+	ret = ofi_copy_from_hmem_iov(tx_buf->pkt.data, seg_len, iface, device,
+				     iov, count, *iov_offset);
+	assert(ret == seg_len);
+
 	*iov_offset += seg_len;
 
 	*out_tx_buf = tx_buf;
@@ -1214,29 +1174,34 @@ rxm_ep_sar_tx_prepare_and_send_segment(struct rxm_ep *rxm_ep,
 }
 
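+/* SAR path: the payload is cut into rxm_buffer_size segments; the first
+ * segment is sent inline here, and any segment hitting -FI_EAGAIN parks
+ * the remainder on the deferred TX queue. */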
 static ssize_t
-rxm_ep_sar_tx_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
-		   void *context, uint8_t count, const struct iovec *iov,
-		   size_t data_len, size_t segs_cnt, uint64_t data,
-		   uint64_t flags, uint64_t tag, uint8_t op)
+rxm_send_sar(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
+	     const struct iovec *iov, void **desc, uint8_t count,
+	     void *context, uint64_t data, uint64_t flags, uint64_t tag,
+	     uint8_t op, size_t data_len, size_t segs_cnt)
 {
-	struct rxm_tx_sar_buf *tx_buf, *first_tx_buf;
+	struct rxm_tx_buf *tx_buf, *first_tx_buf;
 	size_t i, iov_offset = 0, remain_len = data_len;
-	ssize_t ret;
 	struct rxm_deferred_tx_entry *def_tx;
+	enum fi_hmem_iface iface;
+	uint64_t device;
 	uint64_t msg_id = 0;
+	ssize_t ret;
 
 	assert(segs_cnt >= 2);
+	iface = rxm_mr_desc_to_hmem_iface_dev(desc, count, &device);
 
 	first_tx_buf = rxm_ep_sar_tx_prepare_segment(rxm_ep, rxm_conn, context,
-						     data_len, rxm_eager_limit,
+						     data_len, rxm_buffer_size,
 						     0, data, flags, tag, op,
 						     RXM_SAR_SEG_FIRST, &msg_id);
 	if (!first_tx_buf)
 		return -FI_EAGAIN;
 
-	ofi_copy_from_iov(first_tx_buf->pkt.data, rxm_eager_limit,
-			  iov, count, iov_offset);
-	iov_offset += rxm_eager_limit;
+	ret = ofi_copy_from_hmem_iov(first_tx_buf->pkt.data, rxm_buffer_size,
+				     iface, device, iov, count, iov_offset);
+	assert(ret == rxm_buffer_size);
+
+	iov_offset += rxm_buffer_size;
 
 	ret = fi_send(rxm_conn->msg_ep, &first_tx_buf->pkt,
 		      sizeof(struct rxm_pkt) + first_tx_buf->pkt.ctrl_hdr.seg_size,
@@ -1244,37 +1209,37 @@ rxm_ep_sar_tx_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 	if (ret) {
 		if (ret == -FI_EAGAIN)
 			rxm_ep_do_progress(&rxm_ep->util_ep);
-		ofi_buf_free(first_tx_buf);
+		rxm_free_rx_buf(rxm_ep, first_tx_buf);
 		return ret;
 	}
 
-	remain_len -= rxm_eager_limit;
+	remain_len -= rxm_buffer_size;
 
 	for (i = 1; i < segs_cnt; i++) {
-		ret = rxm_ep_sar_tx_prepare_and_send_segment(rxm_ep, rxm_conn,
-				context, data_len, remain_len,
-				msg_id, rxm_eager_limit, i, segs_cnt,
-				data, flags, tag, op, iov, count,
-				&iov_offset, &tx_buf);
+		ret = rxm_ep_sar_tx_prepare_and_send_segment(
+				rxm_ep, rxm_conn, context, data_len, remain_len,
+				msg_id, rxm_buffer_size, i, segs_cnt, data,
+				flags, tag, op, iov, count, &iov_offset, &tx_buf,
+				iface, device);
 		if (ret) {
 			if (ret == -FI_EAGAIN)
 				goto defer;
 			goto free;
 		}
-		remain_len -= rxm_eager_limit;
+		remain_len -= rxm_buffer_size;
 	}
 
 	return 0;
 
 free:
-	ofi_buf_free(first_tx_buf);
+	rxm_free_rx_buf(rxm_ep, first_tx_buf);
 	return ret;
 defer:
 	def_tx = rxm_ep_alloc_deferred_tx_entry(rxm_ep,
 			rxm_conn, RXM_DEFERRED_TX_SAR_SEG);
 	if (!def_tx) {
 		if (tx_buf)
-			ofi_buf_free(tx_buf);
+			rxm_free_rx_buf(rxm_ep, tx_buf);
 		return -FI_ENOMEM;
 	}
 	memcpy(def_tx->sar_seg.payload.iov,
@@ -1292,7 +1257,9 @@ defer:
 	def_tx->sar_seg.total_len = data_len;
 	def_tx->sar_seg.remain_len = remain_len;
 	def_tx->sar_seg.msg_id = msg_id;
-	rxm_ep_enqueue_deferred_tx_queue(def_tx);
+	def_tx->sar_seg.iface = iface;
+	def_tx->sar_seg.device = device;
+	rxm_queue_deferred_tx(def_tx, OFI_LIST_TAIL);
 	return 0;
 }
 
@@ -1302,48 +1269,73 @@ rxm_ep_emulate_inject(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 		      uint64_t data, uint64_t flags, uint64_t tag,
 		      uint8_t op)
 {
-	struct rxm_tx_eager_buf *tx_buf;
+	struct rxm_tx_buf *tx_buf;
 	ssize_t ret;
 
-	tx_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX);
-	if (!tx_buf) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
-			"Ran out of buffers from Eager buffer pool\n");
+	tx_buf = rxm_get_tx_buf(rxm_ep);
+	if (!tx_buf)
 		return -FI_EAGAIN;
-	}
-	/* This is needed so that we don't report bogus context in fi_cq_err_entry */
-	tx_buf->app_context = NULL;
 
-	rxm_ep_format_tx_buf_pkt(rxm_conn, len, op, data, tag, flags, &tx_buf->pkt);
-	memcpy(tx_buf->pkt.data, buf, len);
+	tx_buf->hdr.state = RXM_INJECT_TX;
+	tx_buf->pkt.ctrl_hdr.type = rxm_ctrl_eager;
 	tx_buf->flags = flags;
 
+	rxm_ep_format_tx_buf_pkt(rxm_conn, len, op, data, tag, flags,
+				 &tx_buf->pkt);
+	memcpy(tx_buf->pkt.data, buf, len);
+
 	ret = rxm_ep_msg_normal_send(rxm_conn, &tx_buf->pkt, pkt_size,
 				     tx_buf->hdr.desc, tx_buf);
 	if (ret) {
 		if (ret == -FI_EAGAIN)
 			rxm_ep_do_progress(&rxm_ep->util_ep);
-		ofi_buf_free(tx_buf);
+		rxm_free_rx_buf(rxm_ep, tx_buf);
 	}
 	return ret;
 }
 
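+/* With dynamic receive buffering (dyn_rbuf), tagged injects can bypass
+ * the rxm header and go straight to the msg provider's tagged API. */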
+static bool
+rxm_use_msg_tinject(struct rxm_ep *ep, uint8_t op)
+{
+	struct rxm_domain *domain;
+
+	domain = container_of(ep->util_ep.domain, struct rxm_domain,
+			      util_domain);
+	return domain->dyn_rbuf && (op == ofi_op_tagged);
+}
+
+static ssize_t
+rxm_msg_tinject(struct fid_ep *msg_ep, const void *buf, size_t len,
+		bool cq_data, uint64_t data, uint64_t tag)
+{
+	return cq_data ?
+		fi_tinjectdata(msg_ep, buf, len, data, 0, tag) :
+		fi_tinject(msg_ep, buf, len, 0, tag);
+}
+
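+/* Inject now writes through a single per-endpoint prototype packet
+ * (ep->inject_pkt), filled in by each caller under the ep lock, instead
+ * of per-connection inject packets. */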
 static ssize_t
-rxm_ep_inject_send_fast(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
-			const void *buf, size_t len, struct rxm_pkt *inject_pkt)
+rxm_ep_inject_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
+		   const void *buf, size_t len)
 {
-	size_t pkt_size = sizeof(struct rxm_pkt) + len;
+	struct rxm_pkt *inject_pkt = rxm_ep->inject_pkt;
+	size_t pkt_size = sizeof(*inject_pkt) + len;
 	ssize_t ret;
 
 	assert(len <= rxm_ep->rxm_info->tx_attr->inject_size);
 
-	if (pkt_size <= rxm_ep->inject_limit &&
-	    !rxm_ep->util_ep.tx_cntr) {
+	inject_pkt->ctrl_hdr.conn_id = rxm_conn->remote_index;
+	if (pkt_size <= rxm_ep->inject_limit && !rxm_ep->util_ep.tx_cntr) {
+		if (rxm_use_msg_tinject(rxm_ep, inject_pkt->hdr.op)) {
+			return rxm_msg_tinject(rxm_conn->msg_ep, buf, len,
+					       inject_pkt->hdr.flags &
+							FI_REMOTE_CQ_DATA,
+					       inject_pkt->hdr.data,
+					       inject_pkt->hdr.tag);
+		}
+
 		inject_pkt->hdr.size = len;
 		memcpy(inject_pkt->data, buf, len);
-		ret = rxm_ep_msg_inject_send(rxm_ep, rxm_conn, inject_pkt,
-					     pkt_size,
-					     rxm_ep->util_ep.tx_cntr_inc);
+		ret = fi_inject(rxm_conn->msg_ep, inject_pkt, pkt_size, 0);
 	} else {
 		ret = rxm_ep_emulate_inject(rxm_ep, rxm_conn, buf, len,
 					    pkt_size, inject_pkt->hdr.data,
@@ -1354,101 +1346,196 @@ rxm_ep_inject_send_fast(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 	return ret;
 }
 
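+/* Direct send posts the rxm header and the caller's iov in one fi_sendv,
+ * skipping the copy into a bounce buffer.  One iov slot is reserved for
+ * the header, hence the strict iov_limit comparison. */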
+static bool
+rxm_use_direct_send(struct rxm_ep *ep, size_t iov_count, uint64_t flags)
+{
+	return ep->enable_direct_send &&
+		(iov_count < ep->msg_info->tx_attr->iov_limit);
+}
+
 static ssize_t
-rxm_ep_inject_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
-		   const void *buf, size_t len, uint64_t data,
-		   uint64_t flags, uint64_t tag, uint8_t op)
+rxm_direct_send(struct rxm_ep *ep, struct rxm_conn *rxm_conn,
+		struct rxm_tx_buf *tx_buf,
+		const struct iovec *iov, void **desc, size_t count)
 {
-	struct rxm_tx_base_buf *tx_buf;
-	size_t pkt_size = sizeof(struct rxm_pkt) + len;
+	struct iovec send_iov[RXM_IOV_LIMIT];
+	void *send_desc[RXM_IOV_LIMIT];
+	struct rxm_mr *mr;
 	ssize_t ret;
+	int i;
 
-	assert(len <= rxm_ep->rxm_info->tx_attr->inject_size);
+	send_iov[0].iov_base = &tx_buf->pkt;
+	send_iov[0].iov_len = sizeof(tx_buf->pkt);
+	memcpy(&send_iov[1], iov, sizeof(*iov) * count);
 
-	if (pkt_size <= rxm_ep->inject_limit &&
-	    !rxm_ep->util_ep.tx_cntr) {
-		tx_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_INJECT);
-		if (!tx_buf) {
-			FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
-				"Ran out of eager inject buffers\n");
-			ret = -FI_EAGAIN;
-			goto unlock;
+	if (ep->msg_mr_local) {
+		send_desc[0] = tx_buf->hdr.desc;
+
+		for (i = 0; i < count; i++) {
+			assert(desc[i]);
+			mr = desc[i];
+			send_desc[i + 1] = fi_mr_desc(mr->msg_mr);
 		}
-		rxm_ep_format_tx_buf_pkt(rxm_conn, len, op, data, tag,
-					 flags, &tx_buf->pkt);
-		memcpy(tx_buf->pkt.data, buf, len);
-
-		ret = rxm_ep_msg_inject_send(rxm_ep, rxm_conn, &tx_buf->pkt,
-					     pkt_size,
-					     rxm_ep->util_ep.tx_cntr_inc);
-		ofi_buf_free(tx_buf);
+
+		ret = fi_sendv(rxm_conn->msg_ep, send_iov, send_desc,
+			       count + 1, 0, tx_buf);
 	} else {
-		ret = rxm_ep_emulate_inject(rxm_ep, rxm_conn, buf, len,
-					    pkt_size, data, flags, tag, op);
+		ret = fi_sendv(rxm_conn->msg_ep, send_iov, NULL,
+			       count + 1, 0, tx_buf);
 	}
-unlock:
 	return ret;
+}
+
+static bool
+rxm_use_msg_tsend(struct rxm_ep *ep, size_t iov_count, uint8_t op)
+{
+	struct rxm_domain *domain;
+
+	domain = container_of(ep->util_ep.domain, struct rxm_domain,
+			      util_domain);
+
+	return domain->dyn_rbuf && (op == ofi_op_tagged) &&
+	       (iov_count <= ep->msg_info->tx_attr->iov_limit);
+}
+
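+/* Pass-through tagged send: pick the leanest msg-provider call for the
+ * given iov count and whether remote CQ data must be carried. */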
+static ssize_t
+rxm_msg_tsend(struct rxm_ep *ep, struct rxm_conn *conn,
+	      struct rxm_tx_buf *tx_buf,
+	      const struct iovec *iov, size_t count,
+	      uint64_t data, uint64_t tag)
+{
+	struct fi_msg_tagged msg;
+
+	assert(!(ep->msg_info->domain_attr->mr_mode & FI_MR_LOCAL));
+
+	if (count == 0) {
+		return !(tx_buf->flags & FI_REMOTE_CQ_DATA) ?
+			fi_tsend(conn->msg_ep, NULL, 0, NULL, 0, tag, tx_buf) :
+			fi_tsenddata(conn->msg_ep, NULL, 0, NULL, data, 0,
+				     tag, tx_buf);
+	}
+
+	if (count == 1) {
+		return !(tx_buf->flags & FI_REMOTE_CQ_DATA) ?
+			fi_tsend(conn->msg_ep, iov[0].iov_base, iov[0].iov_len,
+				 NULL, 0, tag, tx_buf) :
+			fi_tsenddata(conn->msg_ep, iov[0].iov_base,
+				     iov[0].iov_len, NULL, data, 0, tag,
+				     tx_buf);
+	}
+
+	if (!(tx_buf->flags & FI_REMOTE_CQ_DATA)) {
+		return fi_tsendv(conn->msg_ep, iov, NULL, count, 0, tag,
+				 tx_buf);
+	}
+
+	msg.addr = 0;
+	msg.context = tx_buf;
+	msg.data = data;
+	msg.desc = NULL;
+	msg.ignore = 0;
+	msg.iov_count = count;
+	msg.msg_iov = iov;
+	msg.tag = tag;
+
+	return fi_tsendmsg(conn->msg_ep, &msg, ep->msg_info->tx_attr->op_flags |
+			   FI_REMOTE_CQ_DATA);
+}
+
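+/* Eager dispatch: prefer a msg-provider tagged send, then direct
+ * (zero-copy) send, and finally fall back to an HMEM-aware copy into the
+ * bounce buffer. */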
+static ssize_t
+rxm_send_eager(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
+	       const struct iovec *iov, void **desc, size_t count,
+	       void *context, uint64_t data, uint64_t flags, uint64_t tag,
+	       uint8_t op, size_t data_len, size_t total_len)
+{
+	struct rxm_tx_buf *eager_buf;
+	enum fi_hmem_iface iface;
+	uint64_t device;
+	ssize_t ret;
+
+	eager_buf = rxm_get_tx_buf(rxm_ep);
+	if (!eager_buf)
+		return -FI_EAGAIN;
 
+	eager_buf->hdr.state = RXM_TX;
+	eager_buf->pkt.ctrl_hdr.type = rxm_ctrl_eager;
+	eager_buf->app_context = context;
+	eager_buf->flags = flags;
+
+	if (rxm_use_msg_tsend(rxm_ep, count, op)) {
+		/* hdr isn't sent, but op is accessed handling completion */
+		eager_buf->pkt.hdr.op = op;
+		ret = rxm_msg_tsend(rxm_ep, rxm_conn, eager_buf, iov, count,
+				    data, tag);
+	} else if (rxm_use_direct_send(rxm_ep, count, flags)) {
+		rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, op, data, tag,
+					 flags, &eager_buf->pkt);
+
+		ret = rxm_direct_send(rxm_ep, rxm_conn, eager_buf,
+				      iov, desc, count);
+	} else {
+		rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, op, data, tag,
+					 flags, &eager_buf->pkt);
+
+		iface = rxm_mr_desc_to_hmem_iface_dev(desc, count, &device);
+		ret = ofi_copy_from_hmem_iov(eager_buf->pkt.data,
+					     eager_buf->pkt.hdr.size,
+					     iface, device, iov, count, 0);
+		assert(ret == eager_buf->pkt.hdr.size);
+
+		ret = rxm_ep_msg_normal_send(rxm_conn, &eager_buf->pkt,
+					     total_len, eager_buf->hdr.desc,
+					     eager_buf);
+	}
+
+	if (ret) {
+		if (ret == -FI_EAGAIN)
+			rxm_ep_do_progress(&rxm_ep->util_ep);
+		rxm_free_rx_buf(rxm_ep, eager_buf);
+	}
+	return ret;
 }
 
 static ssize_t
-rxm_ep_send_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
-		   const struct iovec *iov, void **desc, size_t count,
-		   void *context, uint64_t data, uint64_t flags, uint64_t tag,
-		   uint8_t op, struct rxm_pkt *inject_pkt)
-{
-	struct rxm_tx_eager_buf *tx_buf;
-	size_t data_len = ofi_total_iov_len(iov, count);
-	size_t total_len = sizeof(struct rxm_pkt) + data_len;
+rxm_send_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
+		const struct iovec *iov, void **desc, size_t count,
+		void *context, uint64_t data, uint64_t flags, uint64_t tag,
+		uint8_t op)
+{
+	struct rxm_tx_buf *rndv_buf;
+	size_t data_len, total_len;
+	enum fi_hmem_iface iface;
+	uint64_t device;
 	ssize_t ret;
 
+	data_len = ofi_total_iov_len(iov, count);
+	total_len = sizeof(struct rxm_pkt) + data_len;
+
 	assert(count <= rxm_ep->rxm_info->tx_attr->iov_limit);
 	assert((!(flags & FI_INJECT) &&
 		(data_len > rxm_ep->rxm_info->tx_attr->inject_size)) ||
 	       (data_len <= rxm_ep->rxm_info->tx_attr->inject_size));
 
-	if (data_len <= rxm_eager_limit) {
-		tx_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX);
-		if (!tx_buf) {
-			FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
-				"Ran out of buffers from Eager buffer pool\n");
-			ret = -FI_EAGAIN;
-			goto unlock;
-		}
-
-		rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, op, data, tag,
-					 flags, &tx_buf->pkt);
-		ofi_copy_from_iov(tx_buf->pkt.data, tx_buf->pkt.hdr.size,
-				  iov, count, 0);
-		tx_buf->app_context = context;
-		tx_buf->flags = flags;
-
-		ret = rxm_ep_msg_normal_send(rxm_conn, &tx_buf->pkt, total_len,
-					     tx_buf->hdr.desc, tx_buf);
-		if (ret) {
-			if (ret == -FI_EAGAIN)
-				rxm_ep_do_progress(&rxm_ep->util_ep);
-			ofi_buf_free(tx_buf);
-		}
-	} else if (data_len <= rxm_ep->sar_limit &&
-		   /* SAR uses eager_limit as segment size */
-		   (rxm_eager_limit <
-		    (1ULL << (8 * sizeof_field(struct ofi_ctrl_hdr, seg_size))))) {
-		ret = rxm_ep_sar_tx_send(rxm_ep, rxm_conn, context,
-					 count, iov, data_len,
-					 rxm_ep_sar_calc_segs_cnt(rxm_ep, data_len),
-					 data, flags, tag, op);
+	if (data_len <= rxm_ep->eager_limit) {
+		ret = rxm_send_eager(rxm_ep, rxm_conn, iov, desc, count,
+				     context, data, flags, tag, op,
+				     data_len, total_len);
+	} else if (data_len <= rxm_ep->sar_limit) {
+		ret = rxm_send_sar(rxm_ep, rxm_conn, iov, desc, (uint8_t) count,
+				   context, data, flags, tag, op, data_len,
+				   rxm_ep_sar_calc_segs_cnt(rxm_ep, data_len));
 	} else {
-		struct rxm_tx_rndv_buf *tx_buf;
+		iface = rxm_mr_desc_to_hmem_iface_dev(desc, count, &device);
 
-		ret = rxm_ep_alloc_rndv_tx_res(rxm_ep, rxm_conn, context,
-					       (uint8_t) count, iov, desc,
-					       data_len, data, flags, tag, op,
-					       &tx_buf);
+		ret = rxm_alloc_rndv_buf(rxm_ep, rxm_conn, context,
+					 (uint8_t) count, iov, desc,
+					 data_len, data, flags, tag, op,
+					 iface, device, &rndv_buf);
 		if (ret >= 0)
-			ret = rxm_ep_rndv_tx_send(rxm_ep, rxm_conn, tx_buf, ret);
+			ret = rxm_ep_rndv_tx_send(rxm_ep, rxm_conn,
+						  rndv_buf, ret);
 	}
-unlock:
+
 	return ret;
 }
 
@@ -1487,7 +1574,7 @@ static ssize_t
 rxm_ep_progress_sar_deferred_segments(struct rxm_deferred_tx_entry *def_tx_entry)
 {
 	ssize_t ret = 0;
-	struct rxm_tx_sar_buf *tx_buf = def_tx_entry->sar_seg.cur_seg_tx_buf;
+	struct rxm_tx_buf *tx_buf = def_tx_entry->sar_seg.cur_seg_tx_buf;
 
 	if (tx_buf) {
 		ret = fi_send(def_tx_entry->rxm_conn->msg_ep, &tx_buf->pkt,
@@ -1495,20 +1582,20 @@ rxm_ep_progress_sar_deferred_segments(struct rxm_deferred_tx_entry *def_tx_entry
 			      tx_buf->hdr.desc, 0, tx_buf);
 		if (ret) {
 			if (ret != -FI_EAGAIN) {
-				rxm_ep_sar_handle_segment_failure(def_tx_entry, ret);
-				goto sar_finish;
+				rxm_ep_sar_handle_segment_failure(def_tx_entry,
+								  ret);
 			}
 			return ret;
 		}
 
 		def_tx_entry->sar_seg.next_seg_no++;
-		def_tx_entry->sar_seg.remain_len -= rxm_eager_limit;
+		def_tx_entry->sar_seg.remain_len -= rxm_buffer_size;
 
 		if (def_tx_entry->sar_seg.next_seg_no ==
 		    def_tx_entry->sar_seg.segs_cnt) {
 			assert(rxm_sar_get_seg_type(&tx_buf->pkt.ctrl_hdr) ==
 			       RXM_SAR_SEG_LAST);
-			goto sar_finish;
+			return 0;
 		}
 	}
 
@@ -1519,7 +1606,7 @@ rxm_ep_progress_sar_deferred_segments(struct rxm_deferred_tx_entry *def_tx_entry
 				def_tx_entry->sar_seg.app_context,
 				def_tx_entry->sar_seg.total_len,
 				def_tx_entry->sar_seg.remain_len,
-				def_tx_entry->sar_seg.msg_id, rxm_eager_limit,
+				def_tx_entry->sar_seg.msg_id, rxm_buffer_size,
 				def_tx_entry->sar_seg.next_seg_no,
 				def_tx_entry->sar_seg.segs_cnt,
 				def_tx_entry->sar_seg.payload.data,
@@ -1529,24 +1616,22 @@ rxm_ep_progress_sar_deferred_segments(struct rxm_deferred_tx_entry *def_tx_entry
 				def_tx_entry->sar_seg.payload.iov,
 				def_tx_entry->sar_seg.payload.count,
 				&def_tx_entry->sar_seg.payload.cur_iov_offset,
-				&def_tx_entry->sar_seg.cur_seg_tx_buf);
+				&def_tx_entry->sar_seg.cur_seg_tx_buf,
+				def_tx_entry->sar_seg.iface,
+				def_tx_entry->sar_seg.device);
 		if (ret) {
 			if (ret != -FI_EAGAIN) {
-				rxm_ep_sar_handle_segment_failure(def_tx_entry, ret);
-				goto sar_finish;
+				rxm_ep_sar_handle_segment_failure(def_tx_entry,
+								  ret);
 			}
 
 			return ret;
 		}
 		def_tx_entry->sar_seg.next_seg_no++;
-		def_tx_entry->sar_seg.remain_len -= rxm_eager_limit;
+		def_tx_entry->sar_seg.remain_len -= rxm_buffer_size;
 	}
 
-sar_finish:
-	rxm_ep_dequeue_deferred_tx_queue(def_tx_entry);
-	free(def_tx_entry);
-
-	return ret;
+	return 0;
 }
 
 void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep,
@@ -1557,7 +1642,7 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep,
 	struct fi_msg msg;
 	ssize_t ret = 0;
 
-	if (rxm_conn->handle.state != RXM_CMAP_CONNECTED)
+	if (rxm_conn->state != RXM_CM_CONNECTED)
 		return;
 
 	while (!dlist_empty(&rxm_conn->deferred_tx_queue) && !ret) {
@@ -1568,58 +1653,93 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep,
 			ret = fi_send(def_tx_entry->rxm_conn->msg_ep,
 				      &def_tx_entry->rndv_ack.rx_buf->
 					recv_entry->rndv.tx_buf->pkt,
-				      sizeof(def_tx_entry->rndv_ack.rx_buf->
-					recv_entry->rndv.tx_buf->pkt),
+				      def_tx_entry->rndv_ack.pkt_size,
 				      def_tx_entry->rndv_ack.rx_buf->recv_entry->
 					rndv.tx_buf->hdr.desc,
 				      0, def_tx_entry->rndv_ack.rx_buf);
 			if (ret) {
 				if (ret == -FI_EAGAIN)
-					break;
+					return;
 				rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.rx_cq,
 						   def_tx_entry->rxm_ep->util_ep.rx_cntr,
-						   def_tx_entry->rndv_read.rx_buf->
+						   def_tx_entry->rndv_ack.rx_buf->
 						   recv_entry->context, ret);
 			}
+			if (def_tx_entry->rndv_ack.rx_buf->recv_entry->rndv
+				    .tx_buf->pkt.ctrl_hdr
+				    .type == rxm_ctrl_rndv_rd_done)
+				RXM_UPDATE_STATE(FI_LOG_EP_DATA,
+						 def_tx_entry->rndv_ack.rx_buf,
+						 RXM_RNDV_READ_DONE_SENT);
+			else
+				RXM_UPDATE_STATE(FI_LOG_EP_DATA,
+						 def_tx_entry->rndv_ack.rx_buf,
+						 RXM_RNDV_WRITE_DATA_SENT);
+			break;
+		case RXM_DEFERRED_TX_RNDV_DONE:
+			ret = fi_send(def_tx_entry->rxm_conn->msg_ep,
+				      &def_tx_entry->rndv_done.tx_buf->write_rndv.done_buf->pkt,
+				      sizeof(struct rxm_pkt),
+				      def_tx_entry->rndv_done.tx_buf->write_rndv.done_buf->hdr.desc,
+				      0, def_tx_entry->rndv_done.tx_buf);
+			if (ret) {
+				if (ret == -FI_EAGAIN)
+					return;
+				rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.tx_cq,
+						   def_tx_entry->rxm_ep->util_ep.tx_cntr,
+						   def_tx_entry->rndv_done.tx_buf, ret);
+			}
 			RXM_UPDATE_STATE(FI_LOG_EP_DATA,
-					 def_tx_entry->rndv_ack.rx_buf,
-					 RXM_RNDV_ACK_SENT);
-			rxm_ep_dequeue_deferred_tx_queue(def_tx_entry);
-			free(def_tx_entry);
+					 def_tx_entry->rndv_done.tx_buf,
+					 RXM_RNDV_WRITE_DONE_SENT);
 			break;
 		case RXM_DEFERRED_TX_RNDV_READ:
-			ret = fi_readv(def_tx_entry->rxm_conn->msg_ep,
-				       def_tx_entry->rndv_read.rxm_iov.iov,
-				       def_tx_entry->rndv_read.rxm_iov.desc,
-				       def_tx_entry->rndv_read.rxm_iov.count, 0,
-				       def_tx_entry->rndv_read.rma_iov.addr,
-				       def_tx_entry->rndv_read.rma_iov.key,
-				       def_tx_entry->rndv_read.rx_buf);
+			ret = rxm_ep->rndv_ops->xfer(
+				def_tx_entry->rxm_conn->msg_ep,
+				def_tx_entry->rndv_read.rxm_iov.iov,
+				def_tx_entry->rndv_read.rxm_iov.desc,
+				def_tx_entry->rndv_read.rxm_iov.count, 0,
+				def_tx_entry->rndv_read.rma_iov.addr,
+				def_tx_entry->rndv_read.rma_iov.key,
+				def_tx_entry->rndv_read.rx_buf);
 			if (ret) {
 				if (ret == -FI_EAGAIN)
-					break;
+					return;
 				rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.rx_cq,
 						   def_tx_entry->rxm_ep->util_ep.rx_cntr,
 						   def_tx_entry->rndv_read.rx_buf->
 							recv_entry->context, ret);
-				break;
 			}
-			rxm_ep_dequeue_deferred_tx_queue(def_tx_entry);
-			free(def_tx_entry);
+			break;
+		case RXM_DEFERRED_TX_RNDV_WRITE:
+			ret = rxm_ep->rndv_ops->xfer(
+				def_tx_entry->rxm_conn->msg_ep,
+				def_tx_entry->rndv_write.rxm_iov.iov,
+				def_tx_entry->rndv_write.rxm_iov.desc,
+				def_tx_entry->rndv_write.rxm_iov.count, 0,
+				def_tx_entry->rndv_write.rma_iov.addr,
+				def_tx_entry->rndv_write.rma_iov.key,
+				def_tx_entry->rndv_write.tx_buf);
+			if (ret) {
+				if (ret == -FI_EAGAIN)
+					return;
+				rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.rx_cq,
+						   def_tx_entry->rxm_ep->util_ep.rx_cntr,
+						   def_tx_entry->rndv_write.tx_buf, ret);
+			}
 			break;
 		case RXM_DEFERRED_TX_SAR_SEG:
 			ret = rxm_ep_progress_sar_deferred_segments(def_tx_entry);
+			if (ret == -FI_EAGAIN)
+				return;
 			break;
 		case RXM_DEFERRED_TX_ATOMIC_RESP:
 			ret = rxm_atomic_send_respmsg(rxm_ep,
 					def_tx_entry->rxm_conn,
 					def_tx_entry->atomic_resp.tx_buf,
 					def_tx_entry->atomic_resp.len);
-			if (ret)
-				if (ret == -FI_EAGAIN)
-					break;
-			rxm_ep_dequeue_deferred_tx_queue(def_tx_entry);
-			free(def_tx_entry);
+			if (ret == -FI_EAGAIN)
+				return;
 			break;
 		case RXM_DEFERRED_TX_CREDIT_SEND:
 			iov.iov_base = &def_tx_entry->credit_msg.tx_buf->pkt;
@@ -1635,19 +1755,20 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep,
 			ret = fi_sendmsg(def_tx_entry->rxm_conn->msg_ep, &msg,
 					 FI_PRIORITY);
 			if (ret) {
-				if (ret == -FI_EAGAIN)
-					break;
-				rxm_cq_write_error(
-					def_tx_entry->rxm_ep->util_ep.rx_cq,
-					def_tx_entry->rxm_ep->util_ep.rx_cntr,
-					def_tx_entry->rndv_read.rx_buf->
-						recv_entry->context, ret);
-				break;
+				if (ret != -FI_EAGAIN) {
+					rxm_cq_write_error(
+						def_tx_entry->rxm_ep->util_ep.rx_cq,
+						def_tx_entry->rxm_ep->util_ep.rx_cntr,
+						def_tx_entry->rndv_read.rx_buf->
+							recv_entry->context, ret);
+				}
+				return;
 			}
-			rxm_ep_dequeue_deferred_tx_queue(def_tx_entry);
-			free(def_tx_entry);
 			break;
 		}
+
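+		/* Entries that hit -FI_EAGAIN returned early above and stay
+		 * queued; only completed (or failed) entries reach here. */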
+		rxm_dequeue_deferred_tx(def_tx_entry);
+		free(def_tx_entry);
 	}
 }
 
@@ -1664,11 +1785,9 @@ rxm_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags)
 	if (ret)
 		goto unlock;
 
-	ret = rxm_ep_send_common(rxm_ep, rxm_conn, msg->msg_iov, msg->desc,
-				 msg->iov_count, msg->context, msg->data,
-				 flags | rxm_ep->util_ep.tx_msg_flags, 0, ofi_op_msg,
-				 ((flags & FI_REMOTE_CQ_DATA) ?
-				 rxm_conn->inject_data_pkt : rxm_conn->inject_pkt));
+	ret = rxm_send_common(rxm_ep, rxm_conn, msg->msg_iov, msg->desc,
+			      msg->iov_count, msg->context, msg->data,
+			      flags | rxm_ep->util_ep.tx_msg_flags, 0, ofi_op_msg);
 unlock:
 	ofi_ep_lock_release(&rxm_ep->util_ep);
 	return ret;
@@ -1691,9 +1810,8 @@ static ssize_t rxm_ep_send(struct fid_ep *ep_fid, const void *buf, size_t len,
 	if (ret)
 		goto unlock;
 
-	ret = rxm_ep_send_common(rxm_ep, rxm_conn, &iov, &desc, 1, context,
-				  0, rxm_ep->util_ep.tx_op_flags, 0, ofi_op_msg,
-				  rxm_conn->inject_pkt);
+	ret = rxm_send_common(rxm_ep, rxm_conn, &iov, &desc, 1, context,
+			      0, rxm_ep->util_ep.tx_op_flags, 0, ofi_op_msg);
 unlock:
 	ofi_ep_lock_release(&rxm_ep->util_ep);
 	return ret;
@@ -1713,16 +1831,15 @@ static ssize_t rxm_ep_sendv(struct fid_ep *ep_fid, const struct iovec *iov,
 	if (ret)
 		goto unlock;
 
-	ret = rxm_ep_send_common(rxm_ep, rxm_conn, iov, desc, count, context,
-				  0, rxm_ep->util_ep.tx_op_flags, 0, ofi_op_msg,
-				  rxm_conn->inject_pkt);
+	ret = rxm_send_common(rxm_ep, rxm_conn, iov, desc, count, context,
+			      0, rxm_ep->util_ep.tx_op_flags, 0, ofi_op_msg);
 unlock:
 	ofi_ep_lock_release(&rxm_ep->util_ep);
 	return ret;
 }
 
-static ssize_t rxm_ep_inject(struct fid_ep *ep_fid, const void *buf, size_t len,
-			     fi_addr_t dest_addr)
+static ssize_t rxm_ep_inject(struct fid_ep *ep_fid, const void *buf,
+			     size_t len, fi_addr_t dest_addr)
 {
 	struct rxm_conn *rxm_conn;
 	struct rxm_ep *rxm_ep;
@@ -1734,30 +1851,17 @@ static ssize_t rxm_ep_inject(struct fid_ep *ep_fid, const void *buf, size_t len,
 	if (ret)
 		goto unlock;
 
-	ret = rxm_ep_inject_send(rxm_ep, rxm_conn, buf, len, 0,
-				  rxm_ep->util_ep.inject_op_flags,
-				  0, ofi_op_msg);
+	rxm_ep->inject_pkt->hdr.op = ofi_op_msg;
+	rxm_ep->inject_pkt->hdr.flags = 0;
+	rxm_ep->inject_pkt->hdr.tag = 0;
+	rxm_ep->inject_pkt->hdr.data = 0;
+
+	ret = rxm_ep_inject_send(rxm_ep, rxm_conn, buf, len);
 unlock:
 	ofi_ep_lock_release(&rxm_ep->util_ep);
 	return ret;
 }
 
-static ssize_t rxm_ep_inject_fast(struct fid_ep *ep_fid, const void *buf,
-				  size_t len, fi_addr_t dest_addr)
-{
-	struct rxm_conn *rxm_conn;
-	struct rxm_ep *rxm_ep;
-	ssize_t ret;
-
-	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
-	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
-	if (ret)
-		return ret;
-
-	return rxm_ep_inject_send_fast(rxm_ep, rxm_conn, buf, len,
-				       rxm_conn->inject_pkt);
-}
-
 static ssize_t rxm_ep_senddata(struct fid_ep *ep_fid, const void *buf, size_t len,
 			       void *desc, uint64_t data, fi_addr_t dest_addr,
 			       void *context)
@@ -1776,16 +1880,17 @@ static ssize_t rxm_ep_senddata(struct fid_ep *ep_fid, const void *buf, size_t le
 	if (ret)
 		goto unlock;
 
-	ret = rxm_ep_send_common(rxm_ep, rxm_conn, &iov, &desc, 1, context, data,
-				 rxm_ep->util_ep.tx_op_flags | FI_REMOTE_CQ_DATA,
-				 0, ofi_op_msg, rxm_conn->inject_data_pkt);
+	ret = rxm_send_common(rxm_ep, rxm_conn, &iov, &desc, 1, context, data,
+			      rxm_ep->util_ep.tx_op_flags | FI_REMOTE_CQ_DATA,
+			      0, ofi_op_msg);
 unlock:
 	ofi_ep_lock_release(&rxm_ep->util_ep);
 	return ret;
 }
 
-static ssize_t rxm_ep_injectdata(struct fid_ep *ep_fid, const void *buf, size_t len,
-				 uint64_t data, fi_addr_t dest_addr)
+static ssize_t
+rxm_ep_injectdata(struct fid_ep *ep_fid, const void *buf, size_t len,
+		  uint64_t data, fi_addr_t dest_addr)
 {
 	struct rxm_conn *rxm_conn;
 	struct rxm_ep *rxm_ep;
@@ -1797,32 +1902,17 @@ static ssize_t rxm_ep_injectdata(struct fid_ep *ep_fid, const void *buf, size_t
 	if (ret)
 		goto unlock;
 
-	ret = rxm_ep_inject_send(rxm_ep, rxm_conn, buf, len, data,
-				  rxm_ep->util_ep.inject_op_flags |
-				  FI_REMOTE_CQ_DATA, 0, ofi_op_msg);
+	rxm_ep->inject_pkt->hdr.op = ofi_op_msg;
+	rxm_ep->inject_pkt->hdr.flags = FI_REMOTE_CQ_DATA;
+	rxm_ep->inject_pkt->hdr.tag = 0;
+	rxm_ep->inject_pkt->hdr.data = data;
+
+	ret = rxm_ep_inject_send(rxm_ep, rxm_conn, buf, len);
 unlock:
 	ofi_ep_lock_release(&rxm_ep->util_ep);
 	return ret;
 }
 
-static ssize_t rxm_ep_injectdata_fast(struct fid_ep *ep_fid, const void *buf, size_t len,
-				      uint64_t data, fi_addr_t dest_addr)
-{
-	struct rxm_conn *rxm_conn;
-	struct rxm_ep *rxm_ep;
-	ssize_t ret;
-
-	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
-	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
-	if (ret)
-		return ret;
-
-	rxm_conn->inject_data_pkt->hdr.data = data;
-
-	return rxm_ep_inject_send_fast(rxm_ep, rxm_conn, buf, len,
-				       rxm_conn->inject_data_pkt);
-}
-
 static struct fi_ops_msg rxm_ops_msg = {
 	.size = sizeof(struct fi_ops_msg),
 	.recv = rxm_ep_recv,
@@ -1836,18 +1926,6 @@ static struct fi_ops_msg rxm_ops_msg = {
 	.injectdata = rxm_ep_injectdata,
 };
 
-static struct fi_ops_msg rxm_ops_msg_thread_unsafe = {
-	.size = sizeof(struct fi_ops_msg),
-	.recv = rxm_ep_recv,
-	.recvv = rxm_ep_recvv,
-	.recvmsg = rxm_ep_recvmsg,
-	.send = rxm_ep_send,
-	.sendv = rxm_ep_sendv,
-	.sendmsg = rxm_ep_sendmsg,
-	.inject = rxm_ep_inject_fast,
-	.senddata = rxm_ep_senddata,
-	.injectdata = rxm_ep_injectdata_fast,
-};
 
 static ssize_t
 rxm_ep_post_trecv(struct rxm_ep *rxm_ep, const struct iovec *iov,
@@ -1940,8 +2018,8 @@ rxm_ep_trecvmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg,
 	}
 
 	if (flags & FI_PEEK) {
-		ret = rxm_ep_peek_recv(rxm_ep, msg->addr, msg->tag, msg->ignore,
-				       context, flags, &rxm_ep->trecv_queue);
+		rxm_ep_peek_recv(rxm_ep, msg->addr, msg->tag, msg->ignore,
+				 context, flags, &rxm_ep->trecv_queue);
 		goto unlock;
 	}
 
@@ -1950,7 +2028,7 @@ rxm_ep_trecvmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg,
 	FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Claim message\n");
 
 	if (flags & FI_DISCARD) {
-		ret = rxm_ep_discard_recv(rxm_ep, rx_buf, context);
+		rxm_ep_discard_recv(rxm_ep, rx_buf, context);
 		goto unlock;
 	}
 
@@ -2016,11 +2094,10 @@ rxm_ep_tsendmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg,
 	if (ret)
 		goto unlock;
 
-	ret = rxm_ep_send_common(rxm_ep, rxm_conn, msg->msg_iov, msg->desc,
-				  msg->iov_count, msg->context, msg->data,
-				  flags | rxm_ep->util_ep.tx_msg_flags, msg->tag,
-				  ofi_op_tagged, ((flags & FI_REMOTE_CQ_DATA) ?
-				   rxm_conn->tinject_data_pkt : rxm_conn->tinject_pkt));
+	ret = rxm_send_common(rxm_ep, rxm_conn, msg->msg_iov, msg->desc,
+			      msg->iov_count, msg->context, msg->data,
+			      flags | rxm_ep->util_ep.tx_msg_flags, msg->tag,
+			      ofi_op_tagged);
 unlock:
 	ofi_ep_lock_release(&rxm_ep->util_ep);
 	return ret;
@@ -2044,9 +2121,8 @@ static ssize_t rxm_ep_tsend(struct fid_ep *ep_fid, const void *buf, size_t len,
 	if (ret)
 		goto unlock;
 
-	ret = rxm_ep_send_common(rxm_ep, rxm_conn, &iov, &desc, 1, context, 0,
-				 rxm_ep->util_ep.tx_op_flags, tag, ofi_op_tagged,
-				 rxm_conn->tinject_pkt);
+	ret = rxm_send_common(rxm_ep, rxm_conn, &iov, &desc, 1, context, 0,
+			      rxm_ep->util_ep.tx_op_flags, tag, ofi_op_tagged);
 unlock:
 	ofi_ep_lock_release(&rxm_ep->util_ep);
 	return ret;
@@ -2066,16 +2142,16 @@ static ssize_t rxm_ep_tsendv(struct fid_ep *ep_fid, const struct iovec *iov,
 	if (ret)
 		goto unlock;
 
-	ret = rxm_ep_send_common(rxm_ep, rxm_conn, iov, desc, count, context, 0,
-				 rxm_ep->util_ep.tx_op_flags, tag, ofi_op_tagged,
-				 rxm_conn->tinject_pkt);
+	ret = rxm_send_common(rxm_ep, rxm_conn, iov, desc, count, context, 0,
+			      rxm_ep->util_ep.tx_op_flags, tag, ofi_op_tagged);
 unlock:
 	ofi_ep_lock_release(&rxm_ep->util_ep);
 	return ret;
 }
 
-static ssize_t rxm_ep_tinject(struct fid_ep *ep_fid, const void *buf, size_t len,
-			      fi_addr_t dest_addr, uint64_t tag)
+static ssize_t
+rxm_ep_tinject(struct fid_ep *ep_fid, const void *buf, size_t len,
+	       fi_addr_t dest_addr, uint64_t tag)
 {
 	struct rxm_conn *rxm_conn;
 	struct rxm_ep *rxm_ep;
@@ -2087,32 +2163,17 @@ static ssize_t rxm_ep_tinject(struct fid_ep *ep_fid, const void *buf, size_t len
 	if (ret)
 		goto unlock;
 
-	ret = rxm_ep_inject_send(rxm_ep, rxm_conn, buf, len, 0,
-				  rxm_ep->util_ep.inject_op_flags, tag,
-				  ofi_op_tagged);
+	rxm_ep->inject_pkt->hdr.op = ofi_op_tagged;
+	rxm_ep->inject_pkt->hdr.flags = 0;
+	rxm_ep->inject_pkt->hdr.tag = tag;
+	rxm_ep->inject_pkt->hdr.data = 0;
+
+	ret = rxm_ep_inject_send(rxm_ep, rxm_conn, buf, len);
 unlock:
 	ofi_ep_lock_release(&rxm_ep->util_ep);
 	return ret;
 }
 
-static ssize_t rxm_ep_tinject_fast(struct fid_ep *ep_fid, const void *buf, size_t len,
-				   fi_addr_t dest_addr, uint64_t tag)
-{
-	struct rxm_conn *rxm_conn;
-	struct rxm_ep *rxm_ep;
-	ssize_t ret;
-
-	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
-	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
-	if (ret)
-		return ret;
-
-	rxm_conn->tinject_pkt->hdr.tag = tag;
-
-	return rxm_ep_inject_send_fast(rxm_ep, rxm_conn, buf, len,
-				       rxm_conn->tinject_pkt);
-}
-
 static ssize_t rxm_ep_tsenddata(struct fid_ep *ep_fid, const void *buf, size_t len,
 				void *desc, uint64_t data, fi_addr_t dest_addr,
 				uint64_t tag, void *context)
@@ -2131,16 +2192,17 @@ static ssize_t rxm_ep_tsenddata(struct fid_ep *ep_fid, const void *buf, size_t l
 	if (ret)
 		goto unlock;
 
-	ret = rxm_ep_send_common(rxm_ep, rxm_conn, &iov, &desc, 1, context, data,
-				 rxm_ep->util_ep.tx_op_flags | FI_REMOTE_CQ_DATA,
-				 tag, ofi_op_tagged, rxm_conn->tinject_data_pkt);
+	ret = rxm_send_common(rxm_ep, rxm_conn, &iov, &desc, 1, context, data,
+			      rxm_ep->util_ep.tx_op_flags | FI_REMOTE_CQ_DATA,
+			      tag, ofi_op_tagged);
 unlock:
 	ofi_ep_lock_release(&rxm_ep->util_ep);
 	return ret;
 }
 
-static ssize_t rxm_ep_tinjectdata(struct fid_ep *ep_fid, const void *buf, size_t len,
-				  uint64_t data, fi_addr_t dest_addr, uint64_t tag)
+static ssize_t
+rxm_ep_tinjectdata(struct fid_ep *ep_fid, const void *buf, size_t len,
+		   uint64_t data, fi_addr_t dest_addr, uint64_t tag)
 {
 	struct rxm_conn *rxm_conn;
 	struct rxm_ep *rxm_ep;
@@ -2152,33 +2214,17 @@ static ssize_t rxm_ep_tinjectdata(struct fid_ep *ep_fid, const void *buf, size_t
 	if (ret)
 		goto unlock;
 
-	ret = rxm_ep_inject_send(rxm_ep, rxm_conn, buf, len, data,
-				  rxm_ep->util_ep.inject_op_flags |
-				  FI_REMOTE_CQ_DATA, tag, ofi_op_tagged);
+	rxm_ep->inject_pkt->hdr.op = ofi_op_tagged;
+	rxm_ep->inject_pkt->hdr.flags = FI_REMOTE_CQ_DATA;
+	rxm_ep->inject_pkt->hdr.tag = tag;
+	rxm_ep->inject_pkt->hdr.data = data;
+
+	ret = rxm_ep_inject_send(rxm_ep, rxm_conn, buf, len);
 unlock:
 	ofi_ep_lock_release(&rxm_ep->util_ep);
 	return ret;
 }
 
-static ssize_t rxm_ep_tinjectdata_fast(struct fid_ep *ep_fid, const void *buf, size_t len,
-				       uint64_t data, fi_addr_t dest_addr, uint64_t tag)
-{
-	struct rxm_conn *rxm_conn;
-	struct rxm_ep *rxm_ep;
-	ssize_t ret;
-
-	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
-	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
-	if (ret)
-		return ret;
-
-	rxm_conn->tinject_data_pkt->hdr.tag = tag;
-	rxm_conn->tinject_data_pkt->hdr.data = data;
-
-	return rxm_ep_inject_send_fast(rxm_ep, rxm_conn, buf, len,
-				       rxm_conn->tinject_data_pkt);
-}
-
 static struct fi_ops_tagged rxm_ops_tagged = {
 	.size = sizeof(struct fi_ops_tagged),
 	.recv = rxm_ep_trecv,
@@ -2192,19 +2238,6 @@ static struct fi_ops_tagged rxm_ops_tagged = {
 	.injectdata = rxm_ep_tinjectdata,
 };
 
-static struct fi_ops_tagged rxm_ops_tagged_thread_unsafe = {
-	.size = sizeof(struct fi_ops_tagged),
-	.recv = rxm_ep_trecv,
-	.recvv = rxm_ep_trecvv,
-	.recvmsg = rxm_ep_trecvmsg,
-	.send = rxm_ep_tsend,
-	.sendv = rxm_ep_tsendv,
-	.sendmsg = rxm_ep_tsendmsg,
-	.inject = rxm_ep_tinject_fast,
-	.senddata = rxm_ep_tsenddata,
-	.injectdata = rxm_ep_tinjectdata_fast,
-};
-
 static struct fi_ops_collective rxm_ops_collective = {
 	.size = sizeof(struct fi_ops_collective),
 	.barrier = ofi_ep_barrier,
@@ -2233,75 +2266,76 @@ static struct fi_ops_collective rxm_ops_collective_none = {
 	.msg = fi_coll_no_msg,
 };
 
-static int rxm_ep_msg_res_close(struct rxm_ep *rxm_ep)
-{
-	int ret = 0;
-
-	if (rxm_ep->srx_ctx) {
-		ret = fi_close(&rxm_ep->srx_ctx->fid);
-		if (ret) {
-			FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, \
-				"Unable to close msg shared ctx\n");
-		}
-	}
-
-	fi_freeinfo(rxm_ep->msg_info);
-	return ret;
-}
 
-static int rxm_listener_close(struct rxm_ep *rxm_ep)
+static int rxm_listener_close(struct rxm_ep *ep)
 {
-	int ret, retv = 0;
+	int ret;
 
-	if (rxm_ep->msg_pep) {
-		ret = fi_close(&rxm_ep->msg_pep->fid);
+	if (ep->msg_pep) {
+		ret = fi_close(&ep->msg_pep->fid);
 		if (ret) {
 			FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
 				"Unable to close msg pep\n");
-			retv = ret;
+			return ret;
 		}
+		ep->msg_pep = NULL;
 	}
-	if (rxm_ep->msg_eq) {
-		ret = fi_close(&rxm_ep->msg_eq->fid);
+
+	if (ep->msg_eq) {
+		ret = fi_close(&ep->msg_eq->fid);
 		if (ret) {
 			FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
 				"Unable to close msg EQ\n");
-			retv = ret;
+			return ret;
 		}
+		ep->msg_eq = NULL;
 	}
-	return retv;
+	return 0;
 }
 
 static int rxm_ep_close(struct fid *fid)
 {
-	int ret, retv = 0;
-	struct rxm_ep *rxm_ep;
+	struct rxm_ep *ep;
+	int ret;
 
-	rxm_ep = container_of(fid, struct rxm_ep, util_ep.ep_fid.fid);
-	if (rxm_ep->cmap)
-		rxm_cmap_free(rxm_ep->cmap);
+	ep = container_of(fid, struct rxm_ep, util_ep.ep_fid.fid);
 
-	ret = rxm_listener_close(rxm_ep);
+	/* Stop listener thread to halt event processing before closing all
+	 * connections.
+	 */
+	rxm_stop_listen(ep);
+	rxm_freeall_conns(ep);
+	ret = rxm_listener_close(ep);
 	if (ret)
-		retv = ret;
+		return ret;
 
-	rxm_ep_txrx_res_close(rxm_ep);
-	ret = rxm_ep_msg_res_close(rxm_ep);
-	if (ret)
-		retv = ret;
+	rxm_ep_txrx_res_close(ep);
+	if (ep->srx_ctx) {
+		ret = fi_close(&ep->srx_ctx->fid);
+		if (ret) {
+			FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
+				"Unable to close msg shared ctx\n");
+			return ret;
+		}
+		ep->srx_ctx = NULL;
+	}
 
-	if (rxm_ep->msg_cq) {
-		ret = fi_close(&rxm_ep->msg_cq->fid);
+	if (ep->msg_cq) {
+		ret = fi_close(&ep->msg_cq->fid);
 		if (ret) {
-			FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "Unable to close msg CQ\n");
-			retv = ret;
+			FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
+				"Unable to close msg CQ\n");
+			return ret;
 		}
+		ep->msg_cq = NULL;
 	}
 
-	ofi_endpoint_close(&rxm_ep->util_ep);
-	fi_freeinfo(rxm_ep->rxm_info);
-	free(rxm_ep);
-	return retv;
+	free(ep->inject_pkt);
+	ofi_endpoint_close(&ep->util_ep);
+	fi_freeinfo(ep->msg_info);
+	fi_freeinfo(ep->rxm_info);
+	free(ep);
+	return 0;
 }
 
 static int rxm_ep_trywait_cq(void *arg)
@@ -2347,6 +2381,12 @@ static int rxm_ep_wait_fd_add(struct rxm_ep *rxm_ep, struct util_wait *wait)
 				rxm_ep_trywait_eq);
 }
 
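+/* Atomics in rxm are emulated over the msg transport, so servicing them
+ * requires CQ progress on the responder.  Under FI_PROGRESS_AUTO the msg CQ
+ * therefore has to be driven internally (via its wait fd) rather than
+ * relying on the application to call fi_cq_read().
+ */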
+static bool rxm_needs_atomic_progress(const struct fi_info *info)
+{
+	return (info->caps & FI_ATOMIC) && info->domain_attr &&
+		info->domain_attr->data_progress == FI_PROGRESS_AUTO;
+}
+
 static int rxm_msg_cq_fd_needed(struct rxm_ep *rxm_ep)
 {
 	return (rxm_needs_atomic_progress(rxm_ep->rxm_info) ||
@@ -2433,33 +2473,55 @@ err:
 	return ret;
 }
 
-static void rxm_ep_sar_init(struct rxm_ep *rxm_ep)
+static void rxm_ep_init_proto(struct rxm_ep *ep)
 {
+	struct rxm_domain *domain;
 	size_t param;
 
-	if (!fi_param_get_size_t(&rxm_prov, "sar_limit", &param)) {
-		if (param <= rxm_eager_limit) {
-			FI_WARN(&rxm_prov, FI_LOG_CORE,
-				"Requsted SAR limit (%zd) less or equal "
-				"Eager limit (%zd). SAR limit won't be used. "
-				"Messages of size <= SAR limit would be "
-				"transmitted via Inject/Eager protocol. "
-				"Messages of size > SAR limit would be "
-				"transmitted via Rendezvous protocol\n",
-				param, rxm_eager_limit);
-			param = rxm_eager_limit;
-		}
+	domain = container_of(ep->util_ep.domain, struct rxm_domain,
+			      util_domain);
 
-		rxm_ep->sar_limit = param;
-	} else {
-		size_t sar_limit = rxm_ep->msg_info->tx_attr->size *
-				   rxm_eager_limit;
+	if (ep->enable_direct_send && domain->dyn_rbuf) {
+		if (!fi_param_get_size_t(&rxm_prov, "eager_limit", &param))
+			ep->eager_limit = param;
+	}
 
-		rxm_ep->sar_limit = (sar_limit > RXM_SAR_LIMIT) ?
-				    RXM_SAR_LIMIT : sar_limit;
+	if (ep->eager_limit < rxm_buffer_size)
+		ep->eager_limit = rxm_buffer_size;
+
+	/* SAR segment size is capped at 64k. */
+	if (domain->dyn_rbuf || ep->eager_limit > UINT16_MAX) {
+		ep->sar_limit = ep->eager_limit;
+		return;
+	}
+
+	if (!fi_param_get_size_t(&rxm_prov, "sar_limit", &param)) {
+		if (param <= ep->eager_limit)
+			ep->sar_limit = ep->eager_limit;
+		else
+			ep->sar_limit = param;
+	} else {
+		ep->sar_limit = ep->eager_limit * 8;
 	}
 }
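+/* Worked example with the defaults (no dyn_rbuf): eager_limit is raised to
+ * rxm_buffer_size (16 KiB), so sar_limit defaults to 8 * 16 KiB = 128 KiB,
+ * matching the historical FI_OFI_RXM_SAR_LIMIT default.
+ */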
 
+/* Direct send works with verbs, provided that msg_mr_local == rdm_mr_local.
+ * However, it fails consistently on HFI, with the receiving side getting
+ * corrupted data beyond the first iov.  Only enable if MR_LOCAL is not
+ * required (feature of tcp provider).
+ */
+static void rxm_config_direct_send(struct rxm_ep *ep)
+{
+	int ret = 1;
+
+	if (ep->msg_mr_local)
+		return;
+
+	fi_param_get_bool(&rxm_prov, "enable_direct_send", &ret);
+	ep->enable_direct_send = (ret != 0);
+}
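+
+/* Runtime toggle (illustrative): direct send can be disabled with
+ *   FI_OFI_RXM_ENABLE_DIRECT_SEND=0
+ * via the usual FI_OFI_RXM_* mapping of the "enable_direct_send" parameter
+ * defined in rxm_init.c.
+ */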
+
 static void rxm_ep_settings_init(struct rxm_ep *rxm_ep)
 {
 	size_t max_prog_val;
@@ -2470,13 +2532,12 @@ static void rxm_ep_settings_init(struct rxm_ep *rxm_ep)
 			   rxm_ep->msg_info->rx_attr->size) / 2;
 	rxm_ep->comp_per_progress = (rxm_ep->comp_per_progress > max_prog_val) ?
 				    max_prog_val : rxm_ep->comp_per_progress;
-	ofi_atomic_initialize32(&rxm_ep->atomic_tx_credits,
-				rxm_ep->rxm_info->tx_attr->size);
 
 	rxm_ep->msg_mr_local = ofi_mr_local(rxm_ep->msg_info);
 	rxm_ep->rdm_mr_local = ofi_mr_local(rxm_ep->rxm_info);
 
 	rxm_ep->inject_limit = rxm_ep->msg_info->tx_attr->inject_size;
+	rxm_ep->tx_credit = rxm_ep->rxm_info->tx_attr->size;
 
 	/* Favor a default buffered_min size that's small enough to be
 	 * injected by FI_EP_MSG provider */
@@ -2484,17 +2545,18 @@ static void rxm_ep_settings_init(struct rxm_ep *rxm_ep)
 	if (rxm_ep->inject_limit >
 	    (sizeof(struct rxm_pkt) + sizeof(struct rxm_rndv_hdr)))
 		rxm_ep->buffered_min = MIN((rxm_ep->inject_limit -
-					(sizeof(struct rxm_pkt) +
-					 sizeof(struct rxm_rndv_hdr))),
-					   rxm_eager_limit);
+					    (sizeof(struct rxm_pkt) +
+					     sizeof(struct rxm_rndv_hdr))),
+					   rxm_buffer_size);
 
 	assert(!rxm_ep->min_multi_recv_size);
-	rxm_ep->min_multi_recv_size = rxm_eager_limit;
+	rxm_ep->min_multi_recv_size = rxm_buffer_size;
 
 	assert(!rxm_ep->buffered_limit);
-	rxm_ep->buffered_limit = rxm_eager_limit;
+	rxm_ep->buffered_limit = rxm_buffer_size;
 
-	rxm_ep_sar_init(rxm_ep);
+	rxm_config_direct_send(rxm_ep);
+	rxm_ep_init_proto(rxm_ep);
 
  	FI_INFO(&rxm_prov, FI_LOG_CORE,
 		"Settings:\n"
@@ -2502,26 +2564,23 @@ static void rxm_ep_settings_init(struct rxm_ep *rxm_ep)
 		"\t\t Completions per progress: MSG - %zu\n"
 	        "\t\t Buffered min: %zu\n"
 	        "\t\t Min multi recv size: %zu\n"
-	        "\t\t FI_EP_MSG provider inject size: %zu\n"
-	        "\t\t rxm inject size: %zu\n"
-		"\t\t Protocol limits: Eager: %zu, "
-				      "SAR: %zu\n",
+	        "\t\t inject size: %zu\n"
+		"\t\t Protocol limits: Eager: %zu, SAR: %zu\n",
 		rxm_ep->msg_mr_local, rxm_ep->rdm_mr_local,
 		rxm_ep->comp_per_progress, rxm_ep->buffered_min,
 		rxm_ep->min_multi_recv_size, rxm_ep->inject_limit,
-		rxm_ep->rxm_info->tx_attr->inject_size,
-		rxm_eager_limit, rxm_ep->sar_limit);
+		rxm_ep->eager_limit, rxm_ep->sar_limit);
 }
 
 static int rxm_ep_txrx_res_open(struct rxm_ep *rxm_ep)
 {
 	int ret;
 
-	ret = rxm_ep_txrx_pool_create(rxm_ep);
+	ret = rxm_ep_create_pools(rxm_ep);
 	if (ret)
 		return ret;
 
-	dlist_init(&rxm_ep->deferred_tx_conn_queue);
+	dlist_init(&rxm_ep->deferred_queue);
 
 	ret = rxm_ep_rx_queue_init(rxm_ep);
 	if (ret)
@@ -2529,27 +2588,28 @@ static int rxm_ep_txrx_res_open(struct rxm_ep *rxm_ep)
 
 	return FI_SUCCESS;
 err:
-	rxm_ep_txrx_pool_destroy(rxm_ep);
+	ofi_bufpool_destroy(rxm_ep->rx_pool);
+	ofi_bufpool_destroy(rxm_ep->tx_pool);
+	rxm_ep->rx_pool = NULL;
+	rxm_ep->tx_pool = NULL;
 	return ret;
 }
 
-#define RXM_NEED_RX_CQ_PROGRESS(info) 				\
-	((info->rx_attr->caps & (FI_MSG | FI_TAGGED)) ||	\
-	 (info->rx_attr->caps & FI_ATOMIC))
-
 static int rxm_ep_enable_check(struct rxm_ep *rxm_ep)
 {
 	if (!rxm_ep->util_ep.av)
 		return -FI_EOPBADSTATE;
 
+	if (ofi_needs_tx(rxm_ep->rxm_info->caps) && !rxm_ep->util_ep.tx_cq) {
+		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "missing Tx CQ\n");
+		return -FI_ENOCQ;
+	}
+
 	if (rxm_ep->util_ep.rx_cq)
 		return 0;
 
-	if (RXM_NEED_RX_CQ_PROGRESS(rxm_ep->rxm_info)) {
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "endpoint missing recv CQ"
-			"needed for progress of operations enabled by one "
-			"or more of requested capabilities: %s\n",
-			fi_tostr(&rxm_ep->rxm_info->rx_attr->caps, FI_TYPE_CAPS));
+	if (ofi_needs_rx(rxm_ep->rxm_info->caps)) {
+		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "missing Rx CQ\n");
 		return -FI_ENOCQ;
 	}
 
@@ -2558,6 +2618,7 @@ static int rxm_ep_enable_check(struct rxm_ep *rxm_ep)
 			"may be used but endpoint is missing recv CQ\n");
 		return -FI_ENOCQ;
 	}
+
 	return 0;
 }
 
@@ -2579,23 +2640,13 @@ static int rxm_ep_ctrl(struct fid *fid, int command, void *arg)
 		 * and then progressing both MSG EQ and MSG CQ once the latter
 		 * is opened) */
 		assert(!(rxm_ep->rxm_info->caps & FI_ATOMIC) ||
-		       !rxm_ep->cmap || !rxm_ep->cmap->cm_thread);
+		       !rxm_ep->cm_thread);
 
 		ret = rxm_ep_msg_cq_open(rxm_ep);
 		if (ret)
 			return ret;
 
-		/* fi_listen should be called before cmap alloc as cmap alloc
-		 * calls fi_getname on pep which would succeed only if fi_listen
-		 * was called first */
-		ret = fi_listen(rxm_ep->msg_pep);
-		if (ret) {
-			FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
-				"unable to set msg PEP to listen state\n");
-			return ret;
-		}
-
-		ret = rxm_conn_cmap_alloc(rxm_ep);
+		ret = rxm_start_listen(rxm_ep);
 		if (ret)
 			return ret;
 
@@ -2608,20 +2659,18 @@ static int rxm_ep_ctrl(struct fid *fid, int command, void *arg)
 			return ret;
 
 		if (rxm_ep->srx_ctx) {
-			ret = rxm_msg_ep_prepost_recv(rxm_ep, rxm_ep->srx_ctx);
-			if (ret) {
-				rxm_cmap_free(rxm_ep->cmap);
-				FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
-					"unable to prepost recv bufs\n");
+			ret = rxm_prepost_recv(rxm_ep, rxm_ep->srx_ctx);
+			if (ret)
 				goto err;
-			}
 		}
 		break;
 	default:
 		return -FI_ENOSYS;
 	}
 	return 0;
+
 err:
+	/* TODO: cleanup all allocated resources on error */
 	rxm_ep_txrx_res_close(rxm_ep);
 	return ret;
 }
@@ -2672,23 +2721,21 @@ err:
 	return ret;
 }
 
-static int rxm_ep_msg_res_open(struct rxm_ep *rxm_ep)
+static int rxm_open_core_res(struct rxm_ep *ep)
 {
-	struct rxm_domain *rxm_domain;
+	struct rxm_domain *domain;
 	int ret;
 
-	rxm_domain = container_of(rxm_ep->util_ep.domain, struct rxm_domain,
-				  util_domain);
- 	ret = ofi_get_core_info(rxm_ep->util_ep.domain->fabric->fabric_fid.api_version,
-				NULL, NULL, 0, &rxm_util_prov, rxm_ep->rxm_info,
-				NULL, rxm_info_to_core, &rxm_ep->msg_info);
+	domain = container_of(ep->util_ep.domain, struct rxm_domain, util_domain);
+	ret = ofi_get_core_info(domain->util_domain.fabric->fabric_fid.api_version,
+				NULL, NULL, 0, &rxm_util_prov, ep->rxm_info,
+				NULL, rxm_info_to_core, &ep->msg_info);
 	if (ret)
 		return ret;
 
- 	if (rxm_ep->msg_info->ep_attr->rx_ctx_cnt == FI_SHARED_CONTEXT) {
-		ret = fi_srx_context(rxm_domain->msg_domain,
-				     rxm_ep->msg_info->rx_attr,
-				     &rxm_ep->srx_ctx, NULL);
+	if (ep->msg_info->ep_attr->rx_ctx_cnt == FI_SHARED_CONTEXT) {
+		ret = fi_srx_context(domain->msg_domain, ep->msg_info->rx_attr,
+				     &ep->srx_ctx, NULL);
 		if (ret) {
 			FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
 				"Unable to open shared receive context\n");
@@ -2696,30 +2743,97 @@ static int rxm_ep_msg_res_open(struct rxm_ep *rxm_ep)
 		}
 	}
 
- 	ret = rxm_listener_open(rxm_ep);
+	ret = rxm_listener_open(ep);
 	if (ret)
 		goto err2;
 
- 	/* Zero out the port as we would be creating multiple MSG EPs for a single
-	 * RXM EP and we don't want address conflicts. */
-	if (rxm_ep->msg_info->src_addr) {
-		if (((struct sockaddr *) rxm_ep->msg_info->src_addr)->sa_family == AF_INET)
-			((struct sockaddr_in *) (rxm_ep->msg_info->src_addr))->sin_port = 0;
-		else
-			((struct sockaddr_in6 *) (rxm_ep->msg_info->src_addr))->sin6_port = 0;
-	}
-
 	return 0;
 err2:
-	if (rxm_ep->srx_ctx) {
-		fi_close(&rxm_ep->srx_ctx->fid);
-		rxm_ep->srx_ctx = NULL;
+	if (ep->srx_ctx) {
+		fi_close(&ep->srx_ctx->fid);
+		ep->srx_ctx = NULL;
 	}
 err1:
-	fi_freeinfo(rxm_ep->msg_info);
+	fi_freeinfo(ep->msg_info);
+	ep->msg_info = NULL;
 	return ret;
 }
 
+static ssize_t
+rxm_prepare_deferred_rndv_read(struct rxm_deferred_tx_entry **def_tx_entry,
+			       size_t index, struct iovec *iov,
+			       void *desc[RXM_IOV_LIMIT], size_t count,
+			       void *buf)
+{
+	uint8_t i;
+	struct rxm_rx_buf *rx_buf = buf;
+
+	*def_tx_entry = rxm_ep_alloc_deferred_tx_entry(rx_buf->ep, rx_buf->conn,
+						       RXM_DEFERRED_TX_RNDV_READ);
+	if (!*def_tx_entry)
+		return -FI_ENOMEM;
+
+	(*def_tx_entry)->rndv_read.rx_buf = rx_buf;
+	(*def_tx_entry)->rndv_read.rma_iov.addr =
+			rx_buf->remote_rndv_hdr->iov[index].addr;
+	(*def_tx_entry)->rndv_read.rma_iov.key =
+			rx_buf->remote_rndv_hdr->iov[index].key;
+
+	for (i = 0; i < count; i++) {
+		(*def_tx_entry)->rndv_read.rxm_iov.iov[i] = iov[i];
+		(*def_tx_entry)->rndv_read.rxm_iov.desc[i] = desc[i];
+	}
+	(*def_tx_entry)->rndv_read.rxm_iov.count = count;
+
+	return 0;
+}
+
+static ssize_t
+rxm_prepare_deferred_rndv_write(struct rxm_deferred_tx_entry **def_tx_entry,
+			       size_t index, struct iovec *iov,
+			       void *desc[RXM_IOV_LIMIT], size_t count,
+			       void *buf)
+{
+	uint8_t i;
+	struct rxm_tx_buf *tx_buf = buf;
+	struct rxm_ep *rxm_ep = tx_buf->write_rndv.conn->ep;
+
+	*def_tx_entry = rxm_ep_alloc_deferred_tx_entry(rxm_ep, tx_buf->write_rndv.conn,
+						       RXM_DEFERRED_TX_RNDV_WRITE);
+	if (!*def_tx_entry)
+		return -FI_ENOMEM;
+
+	(*def_tx_entry)->rndv_write.tx_buf = tx_buf;
+	(*def_tx_entry)->rndv_write.rma_iov.addr =
+			tx_buf->write_rndv.remote_hdr.iov[index].addr;
+	(*def_tx_entry)->rndv_write.rma_iov.key =
+			tx_buf->write_rndv.remote_hdr.iov[index].key;
+
+	for (i = 0; i < count; i++) {
+		(*def_tx_entry)->rndv_write.rxm_iov.iov[i] = iov[i];
+		(*def_tx_entry)->rndv_write.rxm_iov.desc[i] = desc[i];
+	}
+	(*def_tx_entry)->rndv_write.rxm_iov.count = count;
+
+	return 0;
+}
+
+struct rxm_rndv_ops rxm_rndv_ops_read = {
+	.rx_mr_access = FI_READ,
+	.tx_mr_access = FI_REMOTE_READ,
+	.handle_rx = rxm_rndv_read,
+	.xfer = fi_readv,
+	.defer_xfer = rxm_prepare_deferred_rndv_read
+};
+
+struct rxm_rndv_ops rxm_rndv_ops_write = {
+	.rx_mr_access = FI_REMOTE_WRITE,
+	.tx_mr_access = FI_WRITE,
+	.handle_rx = rxm_rndv_send_wr_data,
+	.xfer = fi_writev,
+	.defer_xfer = rxm_prepare_deferred_rndv_write
+};
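+
+/* The endpoint selects one of these ops tables when it is opened; a sketch
+ * of the selection done in rxm_endpoint() below:
+ *
+ *	ep->rndv_ops = rxm_use_write_rndv ? &rxm_rndv_ops_write
+ *					  : &rxm_rndv_ops_read;
+ */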
+
 int rxm_endpoint(struct fid_domain *domain, struct fi_info *info,
 		 struct fid_ep **ep_fid, void *context)
 {
@@ -2749,16 +2863,25 @@ int rxm_endpoint(struct fid_domain *domain, struct fi_info *info,
 					&rxm_ep->util_ep, context,
 					&rxm_ep_progress);
 	}
-
 	if (ret)
 		goto err1;
 
-	ret = rxm_ep_msg_res_open(rxm_ep);
+	ret = rxm_open_core_res(rxm_ep);
 	if (ret)
 		goto err2;
 
 	rxm_ep_settings_init(rxm_ep);
 
+	rxm_ep->inject_pkt = calloc(1, sizeof(*rxm_ep->inject_pkt) +
+				       rxm_ep->inject_limit);
+	if (!rxm_ep->inject_pkt) {
+		ret = -FI_ENOMEM;
+		goto err2;
+	}
+	rxm_ep->inject_pkt->ctrl_hdr.version = RXM_CTRL_VERSION;
+	rxm_ep->inject_pkt->ctrl_hdr.type = rxm_ctrl_eager;
+	rxm_ep->inject_pkt->hdr.version = OFI_OP_VERSION;
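+	/* This single pre-staged packet is refilled per operation under the
+	 * ep lock; e.g. rxm_ep_tinject() sets hdr.op/flags/tag/data before
+	 * calling rxm_ep_inject_send() (see rxm_ep.c above).
+	 */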
+
 	*ep_fid = &rxm_ep->util_ep.ep_fid;
 	(*ep_fid)->fid.ops = &rxm_ep_fi_ops;
 	(*ep_fid)->ops = &rxm_ops_ep;
@@ -2772,18 +2895,21 @@ int rxm_endpoint(struct fid_domain *domain, struct fi_info *info,
 		rxm_ep->eager_ops = &def_eager_ops;
 	}
 
-	if (rxm_ep->util_ep.domain->threading != FI_THREAD_SAFE) {
-		(*ep_fid)->msg = &rxm_ops_msg_thread_unsafe;
-		(*ep_fid)->tagged = &rxm_ops_tagged_thread_unsafe;
-	} else {
-		(*ep_fid)->msg = &rxm_ops_msg;
-		(*ep_fid)->tagged = &rxm_ops_tagged;
-	}
+	if (rxm_use_write_rndv)
+		rxm_ep->rndv_ops = &rxm_rndv_ops_write;
+	else
+		rxm_ep->rndv_ops = &rxm_rndv_ops_read;
+	dlist_init(&rxm_ep->rndv_wait_list);
+
+	(*ep_fid)->msg = &rxm_ops_msg;
+	(*ep_fid)->tagged = &rxm_ops_tagged;
 	(*ep_fid)->rma = &rxm_ops_rma;
 
 	if (rxm_ep->rxm_info->caps & FI_ATOMIC)
 		(*ep_fid)->atomic = &rxm_ops_atomic;
 
+	dlist_init(&rxm_ep->loopback_list);
+
 	return 0;
 err2:
 	ofi_endpoint_close(&rxm_ep->util_ep);
diff --git a/deps/libfabric/prov/rxm/src/rxm_init.c b/deps/libfabric/prov/rxm/src/rxm_init.c
index b3e3139d335829995a5c2ede40434fd9bc821b79..e458d0f39078b1d37aaa76a662b3c71868ead1ab 100644
--- a/deps/libfabric/prov/rxm/src/rxm_init.c
+++ b/deps/libfabric/prov/rxm/src/rxm_init.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2016 Intel Corporation. All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -45,12 +46,18 @@
 
 #define RXM_PASSTHRU_CAPS (FI_MSG | FI_RMA | FI_SEND | FI_RECV |	\
 			   FI_READ | FI_WRITE | FI_REMOTE_READ |	\
-			   FI_REMOTE_WRITE)
+			   FI_REMOTE_WRITE | FI_HMEM)
+
+size_t rxm_msg_tx_size;
+size_t rxm_msg_rx_size;
+size_t rxm_def_rx_size = 2048;
+size_t rxm_def_tx_size = 2048;
+
+size_t rxm_buffer_size = 16384;
+size_t rxm_packet_size;
 
-size_t rxm_msg_tx_size		= 128;
-size_t rxm_msg_rx_size		= 128;
-size_t rxm_eager_limit		= RXM_BUF_SIZE - sizeof(struct rxm_pkt);
 int force_auto_progress		= 0;
+int rxm_use_write_rndv		= 0;
 enum fi_wait_obj def_wait_obj = FI_WAIT_FD, def_tcp_wait_obj = FI_WAIT_UNSPEC;
 
 char *rxm_proto_state_str[] = {
@@ -84,14 +91,33 @@ void rxm_info_to_core_mr_modes(uint32_t version, const struct fi_info *hints,
 		else
 			core_info->domain_attr->mr_mode |=
 				hints->domain_attr->mr_mode;
+
+		/* RxM is set up to support FI_HMEM, with the core provider requiring
+		 * FI_MR_HMEM. Always set this MR mode bit.
+		 */
+		if (hints && hints->caps & FI_HMEM)
+			core_info->domain_attr->mr_mode |= FI_MR_HMEM;
 	}
 }
+
+static bool rxm_use_srx(const struct fi_info *hints,
+			const struct fi_info *base_info)
+{
+	const struct fi_info *info;
+	int ret, use_srx = 0;
+
+	ret = fi_param_get_bool(&rxm_prov, "use_srx", &use_srx);
+	if (ret != -FI_ENODATA)
+		return use_srx;
+
+	info = base_info ? base_info : hints;
+
+	return info && info->fabric_attr && info->fabric_attr->prov_name &&
+	       strcasestr(info->fabric_attr->prov_name, "tcp");
+}
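+/* e.g. FI_OFI_RXM_USE_SRX=1 forces the shared-rx path regardless of the
+ * core provider; left unset, srx is used only when running over tcp.
+ */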
 
 int rxm_info_to_core(uint32_t version, const struct fi_info *hints,
 		     const struct fi_info *base_info, struct fi_info *core_info)
 {
-	int ret, use_srx = 0;
-
 	rxm_info_to_core_mr_modes(version, hints, core_info);
 
 	core_info->mode |= FI_RX_CQ_DATA | FI_CONTEXT;
@@ -102,12 +128,15 @@ int rxm_info_to_core(uint32_t version, const struct fi_info *hints,
 			core_info->caps |= FI_MSG | FI_SEND | FI_RECV;
 
 		/* FI_RMA cap is needed for large message transfer protocol */
-		if (core_info->caps & FI_MSG)
-			core_info->caps |= FI_RMA | FI_READ | FI_REMOTE_READ;
+		if (core_info->caps & FI_MSG) {
+			core_info->caps |= FI_RMA | FI_READ |
+					   FI_REMOTE_READ | FI_REMOTE_WRITE;
+		}
 
 		if (hints->domain_attr) {
 			core_info->domain_attr->caps |= hints->domain_attr->caps;
-			core_info->domain_attr->threading = hints->domain_attr->threading;
+			core_info->domain_attr->threading =
+				hints->domain_attr->threading;
 		}
 		if (hints->tx_attr) {
 			core_info->tx_attr->op_flags =
@@ -121,24 +150,28 @@ int rxm_info_to_core(uint32_t version, const struct fi_info *hints,
 			core_info->rx_attr->msg_order = hints->rx_attr->msg_order;
 			core_info->rx_attr->comp_order = hints->rx_attr->comp_order;
 		}
+		if ((hints->caps & FI_HMEM) && ofi_hmem_p2p_disabled())
+			return -FI_ENODATA;
 	}
 
 	core_info->ep_attr->type = FI_EP_MSG;
 
-	ret = fi_param_get_bool(&rxm_prov, "use_srx", &use_srx);
-	if (use_srx || ((ret == -FI_ENODATA) && base_info &&
-	    base_info->fabric_attr->prov_name &&
-	    !strcmp(base_info->fabric_attr->prov_name, "tcp"))) {
+	if (rxm_use_srx(hints, base_info)) {
 		FI_DBG(&rxm_prov, FI_LOG_FABRIC,
 		       "Requesting shared receive context from core provider\n");
 		core_info->ep_attr->rx_ctx_cnt = FI_SHARED_CONTEXT;
+		core_info->rx_attr->size = rxm_msg_rx_size ?
+					   rxm_msg_rx_size : RXM_MSG_SRX_SIZE;
+	} else {
+		core_info->rx_attr->size = rxm_msg_rx_size ?
+					   rxm_msg_rx_size : RXM_MSG_RXTX_SIZE;
 	}
 
 	core_info->tx_attr->op_flags &= ~RXM_TX_OP_FLAGS;
-	core_info->tx_attr->size = rxm_msg_tx_size;
+	core_info->tx_attr->size = rxm_msg_tx_size ?
+				   rxm_msg_tx_size : RXM_MSG_RXTX_SIZE;
 
 	core_info->rx_attr->op_flags &= ~FI_MULTI_RECV;
-	core_info->rx_attr->size = rxm_msg_rx_size;
 
 	return 0;
 }
@@ -147,15 +180,38 @@ int rxm_info_to_rxm(uint32_t version, const struct fi_info *core_info,
 		    const struct fi_info *base_info, struct fi_info *info)
 {
 	info->caps = base_info->caps;
-	// TODO find which other modes should be filtered
 	info->mode = (core_info->mode & ~FI_RX_CQ_DATA) | base_info->mode;
 
 	info->tx_attr->caps		= base_info->tx_attr->caps;
 	info->tx_attr->mode		= info->mode;
 	info->tx_attr->msg_order 	= core_info->tx_attr->msg_order;
 	info->tx_attr->comp_order 	= base_info->tx_attr->comp_order;
-	info->tx_attr->inject_size	= base_info->tx_attr->inject_size;
-	info->tx_attr->size 		= base_info->tx_attr->size;
+
+	/* If the core provider requires registering send buffers, it's
+	 * usually faster to copy small transfers through bounce buffers
+	 * than to require the user to register the buffers.  Bump the
+	 * inject size up to the rxm limit (eager buffer size) in this
+	 * case.  If registration is not required, use the core provider's
+	 * limit, which avoids potential extra data copies.
+	 *
+	 * If we report the size of the bounce buffer, apps may call inject
+	 * rather than send, which hampers our ability to use the direct
+	 * send feature that avoids data copies.
+	 */
+	if (ofi_mr_local(info) ||
+	    (core_info->tx_attr->inject_size <= sizeof(struct rxm_pkt))) {
+		info->tx_attr->inject_size = base_info->tx_attr->inject_size;
+	} else {
+		info->tx_attr->inject_size = core_info->tx_attr->inject_size -
+					     sizeof(struct rxm_pkt);
+	}
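+	/* e.g. a tcp core without MR_LOCAL and inject_size 4096 yields an
+	 * rxm inject_size of 4096 - sizeof(struct rxm_pkt); a verbs core
+	 * (MR_LOCAL) keeps the base (bounce buffer) inject size instead.
+	 */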
+
+	/* User hints will override the modified info attributes through
+	 * ofi_alter_info.  Set default sizes lower than supported maximums.
+	 */
+	info->tx_attr->size = MIN(base_info->tx_attr->size, rxm_def_tx_size);
+	info->rx_attr->size = MIN(base_info->rx_attr->size, rxm_def_rx_size);
+
 	info->tx_attr->iov_limit 	= MIN(base_info->tx_attr->iov_limit,
 					      core_info->tx_attr->iov_limit);
 	info->tx_attr->rma_iov_limit	= MIN(base_info->tx_attr->rma_iov_limit,
@@ -165,7 +221,6 @@ int rxm_info_to_rxm(uint32_t version, const struct fi_info *core_info,
 	info->rx_attr->mode		= info->rx_attr->mode & ~FI_RX_CQ_DATA;
 	info->rx_attr->msg_order 	= core_info->rx_attr->msg_order;
 	info->rx_attr->comp_order 	= base_info->rx_attr->comp_order;
-	info->rx_attr->size 		= base_info->rx_attr->size;
 	info->rx_attr->iov_limit 	= MIN(base_info->rx_attr->iov_limit,
 					      core_info->rx_attr->iov_limit);
 
@@ -187,6 +242,13 @@ int rxm_info_to_rxm(uint32_t version, const struct fi_info *core_info,
 			return -FI_ENOMEM;
 	}
 
+	/* FI_HMEM is only supported if core provider supports it. */
+	if (!(core_info->caps & FI_HMEM)) {
+		info->caps &= ~FI_HMEM;
+		info->tx_attr->caps &= ~FI_HMEM;
+		info->rx_attr->caps &= ~FI_HMEM;
+	}
+
 	return 0;
 }
 
@@ -195,23 +257,36 @@ static void rxm_init_infos(void)
 	struct fi_info *cur;
 	size_t buf_size, tx_size = 0, rx_size = 0;
 
+	/* Historically, 'buffer_size' was the name given for the eager message
+	 * size.  Maintain the name for backwards compatibility.
+	 */
 	if (!fi_param_get_size_t(&rxm_prov, "buffer_size", &buf_size)) {
-		if (buf_size <
-		    sizeof(struct rxm_pkt) + sizeof(struct rxm_rndv_hdr)) {
+		/* We need enough space to carry extra headers */
+		if (buf_size < sizeof(struct rxm_rndv_hdr) ||
+		    buf_size < sizeof(struct rxm_atomic_hdr)) {
 			FI_WARN(&rxm_prov, FI_LOG_CORE,
 				"Requested buffer size too small\n");
-			buf_size = sizeof(struct rxm_pkt) +
-				   sizeof(struct rxm_rndv_hdr);
+			buf_size = MAX(sizeof(struct rxm_rndv_hdr),
+				       sizeof(struct rxm_atomic_hdr));
 		}
 
-		rxm_eager_limit = buf_size - sizeof(struct rxm_pkt);
+		if (buf_size > INT32_MAX)
+			buf_size = INT32_MAX;
+
+		rxm_buffer_size = buf_size;
 	}
 
+	rxm_packet_size = sizeof(struct rxm_pkt) + rxm_buffer_size;
+
 	fi_param_get_size_t(&rxm_prov, "tx_size", &tx_size);
 	fi_param_get_size_t(&rxm_prov, "rx_size", &rx_size);
+	if (tx_size)
+		rxm_def_tx_size = tx_size;
+	if (rx_size)
+		rxm_def_rx_size = rx_size;
 
 	for (cur = (struct fi_info *) rxm_util_prov.info; cur; cur = cur->next) {
-		cur->tx_attr->inject_size = rxm_eager_limit;
+		cur->tx_attr->inject_size = rxm_buffer_size;
 		if (tx_size)
 			cur->tx_attr->size = tx_size;
 		if (rx_size)
@@ -224,12 +299,6 @@ static void rxm_alter_info(const struct fi_info *hints, struct fi_info *info)
 	struct fi_info *cur;
 
 	for (cur = info; cur; cur = cur->next) {
-		/* RxM can support higher inject size without any big
-		 * performance penalty even if app had requested lower value
-		 * in hints. App is still free to reduce this when opening an
-		 * endpoint. This overrides setting by ofi_alter_info */
-		cur->tx_attr->inject_size = rxm_eager_limit;
-
 		/* Remove the following caps if they are not requested as they
 		 * may affect performance in fast-path */
 		if (!hints) {
@@ -336,6 +405,7 @@ static int rxm_getinfo(uint32_t version, const char *node, const char *service,
 			port_save = ofi_addr_get_port(ai->ai_addr);
 			freeaddrinfo(ai);
 			service = NULL;
+			flags &= ~FI_SOURCE;
 		} else {
 			port_save = ofi_addr_get_port(hints->src_addr);
 			ofi_addr_set_port(hints->src_addr, 0);
@@ -364,7 +434,10 @@ static int rxm_getinfo(uint32_t version, const char *node, const char *service,
 
 static void rxm_fini(void)
 {
-	/* yawn */
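+	/* When built as a dynamically loaded provider, rxm owns these
+	 * teardowns; the matching ofi_mem_init()/ofi_hmem_init() calls are
+	 * in RXM_INI.
+	 */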
+#if HAVE_RXM_DL
+	ofi_hmem_cleanup();
+	ofi_mem_fini();
+#endif
 }
 
 struct fi_provider rxm_prov = {
@@ -388,7 +461,8 @@ static void rxm_get_def_wait(void)
 	fi_param_define(&rxm_prov, "def_tcp_wait_obj", FI_PARAM_STRING,
 			"See def_wait_obj for description.  If set, this "
 			"overrides the def_wait_obj when running over the "
-			"tcp provider.");
+			"tcp provider.  See def_wait_obj for valid values. "
+			"(default: UNSPEC, tcp provider will select).");
 
 	fi_param_get_str(&rxm_prov, "def_wait_obj", &wait_str);
 	if (wait_str && !strcasecmp(wait_str, "pollfd"))
@@ -405,16 +479,23 @@ static void rxm_get_def_wait(void)
 RXM_INI
 {
 	fi_param_define(&rxm_prov, "buffer_size", FI_PARAM_SIZE_T,
-			"Defines the transmit buffer size / inject size "
-			"(default: 16 KB). Eager protocol would be used to "
-			"transmit messages of size less than eager limit "
-			"(FI_OFI_RXM_BUFFER_SIZE - RxM header size (%zu B)). "
-			"Any message whose size is greater than eager limit would"
-			" be transmitted via rendezvous or SAR "
-			"(Segmentation And Reassembly) protocol depending on "
-			"the value of FI_OFI_RXM_SAR_LIMIT). Also, transmit data "
-			" would be copied up to eager limit.",
-			sizeof(struct rxm_pkt));
+			"Defines the allocated buffer size used for bounce "
+			"buffers, including buffers posted at the receive side "
+			"to handle unexpected messages.  This value "
+			"corresponds to the rxm inject limit, and is also "
+			"typically used as the eager message size. "
+			"(default: %zu)", rxm_buffer_size);
+
+	fi_param_define(&rxm_prov, "eager_limit", FI_PARAM_SIZE_T,
+			"Specifies the maximum size transfer for which the "
+			"eager protocol will be used.  For transfers smaller "
+			"than this limit, data may be copied into a bounce "
+			"buffer on the transmit side and received into a "
+			"bounce buffer at the receiver.  The eager_limit must "
+			"be equal to the buffer_size when using rxm over "
+			"verbs, but may differ in the case of tcp. "
+			"(default: %zu)", rxm_buffer_size);
 
 	fi_param_define(&rxm_prov, "comp_per_progress", FI_PARAM_INT,
 			"Defines the maximum number of MSG provider CQ entries "
@@ -422,13 +503,14 @@ RXM_INI
 			"(RxM CQ read).");
 
 	fi_param_define(&rxm_prov, "sar_limit", FI_PARAM_SIZE_T,
-			"Set this environment variable to enable and control "
-			"RxM SAR (Segmentation And Reassembly) protocol "
-			"(default: 128 KB). This value should be set greater than "
-			" eager limit (FI_OFI_RXM_BUFFER_SIZE - RxM protocol "
-			"header size (%zu B)) for SAR to take effect. Messages "
-			"of size greater than this would be transmitted via "
-			"rendezvous protocol.", sizeof(struct rxm_pkt));
+			"Specifies the maximum size transfer for which the SAR "
+			"(Segmentation And Reassembly) protocol will be used. "
+			"For transfers smaller than this limit, data may be "
+			"copied into multiple bounce buffers on the transmit "
+			"side and received into bounce buffers at the receiver. "
+			"The sar_limit value must be greater than the "
+			"eager_limit to take effect.  (default: %zu).",
+			rxm_buffer_size * 8);
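+	/* e.g. FI_OFI_RXM_SAR_LIMIT=262144 raises the SAR cutoff to 256 KiB;
+	 * values at or below the eager limit leave SAR unused (see
+	 * rxm_ep_init_proto() in rxm_ep.c).
+	 */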
 
 	fi_param_define(&rxm_prov, "use_srx", FI_PARAM_BOOL,
 			"Set this environment variable to control the RxM "
@@ -438,20 +520,18 @@ RXM_INI
 			"latency as a side-effect.");
 
 	fi_param_define(&rxm_prov, "tx_size", FI_PARAM_SIZE_T,
-			"Defines default tx context size (default: 1024).");
+			"Defines default tx context size (default: 2048).");
 
 	fi_param_define(&rxm_prov, "rx_size", FI_PARAM_SIZE_T,
-			"Defines default rx context size (default: 1024).");
+			"Defines default rx context size (default: 2048).");
 
 	fi_param_define(&rxm_prov, "msg_tx_size", FI_PARAM_SIZE_T,
 			"Defines FI_EP_MSG tx size that would be requested "
-			"(default: 128). Setting this to 0 would get default "
-			"value defined by the MSG provider.");
+			"(default: 128).");
 
 	fi_param_define(&rxm_prov, "msg_rx_size", FI_PARAM_SIZE_T,
-			"Defines FI_EP_MSG rx size that would be requested "
-			"(default: 128). Setting this to 0 would get default "
-			"value defined by the MSG provider.");
+			"Defines FI_EP_MSG rx or srx size that would be requested. "
+			"(default: 128, 4096 with srx).");
 
 	fi_param_define(&rxm_prov, "cm_progress_interval", FI_PARAM_INT,
 			"Defines the number of microseconds to wait between "
@@ -461,8 +541,8 @@ RXM_INI
 			"longer connection establishment times. (default: 10000).");
 
 	fi_param_define(&rxm_prov, "cq_eq_fairness", FI_PARAM_INT,
-			"Defines the maximum number of message provider CQ entries"
-			" that can be consecutively read across progress calls "
+			"Defines the maximum number of message provider CQ entries "
+			"that can be consecutively read across progress calls "
 			"without checking to see if the CM progress interval has "
 			"been reached. (default: 128).");
 
@@ -470,6 +550,28 @@ RXM_INI
 			"Force auto-progress for data transfers even if app "
 			"requested manual progress (default: false/no).");
 
+	fi_param_define(&rxm_prov, "use_rndv_write", FI_PARAM_BOOL,
+			"Set this environment variable to control the "
+			"RxM Rendezvous protocol.  If set (1), RxM will use "
+			"RMA writes rather than RMA reads during Rendezvous "
+			"transactions. (default: false/no).");
+
+	fi_param_define(&rxm_prov, "enable_dyn_rbuf", FI_PARAM_BOOL,
+			"Enable support for dynamic receive buffering, if "
+			"supported by the message endpoint provider. "
+			"This allows direct placement of received messages "
+			"into application buffers, bypassing RxM bounce "
+			"buffers.  This feature targets using tcp sockets "
+			"for the message transport.  (default: true)");
+
+	fi_param_define(&rxm_prov, "enable_direct_send", FI_PARAM_BOOL,
+			"Enable support to pass application buffers directly "
+			"to the core provider when possible.  This avoids "
+			"copying application buffers through bounce buffers "
+			"before passing them to the core provider.  This "
+			"feature targets small to medium size message "
+			"transfers over the tcp provider.  (default: true)");
+
 	rxm_init_infos();
 	fi_param_get_size_t(&rxm_prov, "msg_tx_size", &rxm_msg_tx_size);
 	fi_param_get_size_t(&rxm_prov, "msg_rx_size", &rxm_msg_rx_size);
@@ -480,6 +582,8 @@ RXM_INI
 				(int *) &rxm_cq_eq_fairness))
 		rxm_cq_eq_fairness = 128;
 	fi_param_get_bool(&rxm_prov, "data_auto_progress", &force_auto_progress);
+	fi_param_get_bool(&rxm_prov, "use_rndv_write", &rxm_use_write_rndv);
+
 	rxm_get_def_wait();
 
 	if (force_auto_progress)
@@ -487,5 +591,10 @@ RXM_INI
 			"(FI_OFI_RXM_DATA_AUTO_PROGRESS = 1), domain threading "
 			"level would be set to FI_THREAD_SAFE\n");
 
+#if HAVE_RXM_DL
+	ofi_mem_init();
+	ofi_hmem_init();
+#endif
+
 	return &rxm_prov;
 }
diff --git a/deps/libfabric/prov/rxm/src/rxm_rma.c b/deps/libfabric/prov/rxm/src/rxm_rma.c
index c665b7b1b950c9d3bdabaeefb484caa9a049a192..987953ad4ba2b4ee3de7439213ba00b76ec5fc14 100644
--- a/deps/libfabric/prov/rxm/src/rxm_rma.c
+++ b/deps/libfabric/prov/rxm/src/rxm_rma.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2017-2020 Intel Corporation. All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -36,7 +37,7 @@
 static ssize_t
 rxm_ep_rma_reg_iov(struct rxm_ep *rxm_ep, const struct iovec *msg_iov,
 		   void **desc, void **desc_storage, size_t iov_count,
-		   uint64_t access, struct rxm_rma_buf *rma_buf)
+		   uint64_t access, struct rxm_tx_buf *rma_buf)
 {
 	size_t i, ret;
 
@@ -45,16 +46,17 @@ rxm_ep_rma_reg_iov(struct rxm_ep *rxm_ep, const struct iovec *msg_iov,
 
 	if (!rxm_ep->rdm_mr_local) {
 		ret = rxm_msg_mr_regv(rxm_ep, msg_iov, iov_count, SIZE_MAX,
-				      access, rma_buf->mr.mr);
+				      access, rma_buf->rma.mr);
 		if (OFI_UNLIKELY(ret))
 			return ret;
 
 		for (i = 0; i < iov_count; i++)
-			desc_storage[i] = fi_mr_desc(rma_buf->mr.mr[i]);
-		rma_buf->mr.count = iov_count;
+			desc_storage[i] = fi_mr_desc(rma_buf->rma.mr[i]);
+		rma_buf->rma.count = iov_count;
 	} else {
 		for (i = 0; i < iov_count; i++)
-			desc_storage[i] = fi_mr_desc(desc[i]);
+			desc_storage[i] =
+				fi_mr_desc(((struct rxm_mr *) desc[i])->msg_mr);
 	}
 	return FI_SUCCESS;
 }
@@ -65,7 +67,7 @@ rxm_ep_rma_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg,
 		  const struct fi_msg_rma *msg, uint64_t flags),
 		  uint64_t comp_flags)
 {
-	struct rxm_rma_buf *rma_buf;
+	struct rxm_tx_buf *rma_buf;
 	struct fi_msg_rma msg_rma = *msg;
 	struct rxm_conn *rxm_conn;
 	void *mr_desc[RXM_IOV_LIMIT] = { 0 };
@@ -79,12 +81,14 @@ rxm_ep_rma_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg,
 	if (OFI_UNLIKELY(ret))
 		goto unlock;
 
-	rma_buf = ofi_buf_alloc(rxm_ep->buf_pools[RXM_BUF_POOL_RMA].pool);
+	rma_buf = rxm_get_tx_buf(rxm_ep);
 	if (!rma_buf) {
 		ret = -FI_EAGAIN;
 		goto unlock;
 	}
 
+	rma_buf->hdr.state = RXM_RMA;
+	rma_buf->pkt.ctrl_hdr.type = rxm_ctrl_eager;
 	rma_buf->app_context = msg->context;
 	rma_buf->flags = flags;
 
@@ -98,13 +102,13 @@ rxm_ep_rma_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg,
 	msg_rma.context = rma_buf;
 
 	ret = rma_msg(rxm_conn->msg_ep, &msg_rma, flags);
-	if (OFI_LIKELY(!ret))
+	if (!ret)
 		goto unlock;
 
 	if ((rxm_ep->msg_mr_local) && (!rxm_ep->rdm_mr_local))
-		rxm_msg_mr_closev(rma_buf->mr.mr, rma_buf->mr.count);
+		rxm_msg_mr_closev(rma_buf->rma.mr, rma_buf->rma.count);
 release:
-	ofi_buf_free(rma_buf);
+	rxm_free_rx_buf(rxm_ep, rma_buf);
 unlock:
 	ofi_ep_lock_release(&rxm_ep->util_ep);
 	return ret;
@@ -178,15 +182,26 @@ static ssize_t rxm_ep_read(struct fid_ep *ep_fid, void *buf, size_t len,
 }
 
 static void
-rxm_ep_format_rma_msg(struct rxm_rma_buf *rma_buf, const struct fi_msg_rma *orig_msg,
+rxm_ep_format_rma_msg(struct rxm_tx_buf *rma_buf,
+		      const struct fi_msg_rma *orig_msg,
 		      struct iovec *rxm_iov, struct fi_msg_rma *rxm_msg)
 {
+	ssize_t ret __attribute__((unused));
+	enum fi_hmem_iface iface;
+	uint64_t device;
+
+	iface = rxm_mr_desc_to_hmem_iface_dev(orig_msg->desc,
+					      orig_msg->iov_count, &device);
+
 	rxm_msg->context = rma_buf;
 	rxm_msg->addr = orig_msg->addr;
 	rxm_msg->data = orig_msg->data;
 
-	ofi_copy_from_iov(rma_buf->pkt.data, rma_buf->pkt.hdr.size,
-			  orig_msg->msg_iov, orig_msg->iov_count, 0);
+	ret = ofi_copy_from_hmem_iov(rma_buf->pkt.data, rma_buf->pkt.hdr.size,
+				     iface, device, orig_msg->msg_iov,
+				     orig_msg->iov_count, 0);
+	assert(ret == rma_buf->pkt.hdr.size);
+
 	rxm_iov->iov_base = &rma_buf->pkt.data;
 	rxm_iov->iov_len = rma_buf->pkt.hdr.size;
 	rxm_msg->msg_iov = rxm_iov;
@@ -202,17 +217,19 @@ rxm_ep_rma_emulate_inject_msg(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 			      size_t total_size, const struct fi_msg_rma *msg,
 			      uint64_t flags)
 {
-	struct rxm_rma_buf *rma_buf;
+	struct rxm_tx_buf *rma_buf;
 	ssize_t ret;
 	struct iovec rxm_msg_iov = { 0 };
 	struct fi_msg_rma rxm_rma_msg = { 0 };
 
 	assert(msg->rma_iov_count <= rxm_ep->rxm_info->tx_attr->rma_iov_limit);
 
-	rma_buf = ofi_buf_alloc(rxm_ep->buf_pools[RXM_BUF_POOL_RMA].pool);
+	rma_buf = rxm_get_tx_buf(rxm_ep);
 	if (!rma_buf)
 		return -FI_EAGAIN;
 
+	rma_buf->hdr.state = RXM_RMA;
+	rma_buf->pkt.ctrl_hdr.type = rxm_ctrl_eager;
 	rma_buf->pkt.hdr.size = total_size;
 	rma_buf->app_context = msg->context;
 	rma_buf->flags = flags;
@@ -221,10 +238,10 @@ rxm_ep_rma_emulate_inject_msg(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 	flags = (flags & ~FI_INJECT) | FI_COMPLETION;
 
 	ret = fi_writemsg(rxm_conn->msg_ep, &rxm_rma_msg, flags);
-	if (OFI_UNLIKELY(ret)) {
+	if (ret) {
 		if (ret == -FI_EAGAIN)
 			rxm_ep_do_progress(&rxm_ep->util_ep);
-		ofi_buf_free(rma_buf);
+		rxm_free_rx_buf(rxm_ep, rma_buf);
 	}
 	return ret;
 }
@@ -274,7 +291,7 @@ rxm_ep_rma_inject_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg,
 	if (OFI_UNLIKELY(ret))
 		goto unlock;
 
-	if ((total_size > rxm_ep->msg_info->tx_attr->inject_size) ||
+	if ((total_size > rxm_ep->rxm_info->tx_attr->inject_size) ||
 	    rxm_ep->util_ep.wr_cntr ||
 	    (flags & FI_COMPLETION) || (msg->iov_count > 1) ||
 	    (msg->rma_iov_count > 1)) {
@@ -432,8 +449,7 @@ static ssize_t rxm_ep_inject_write(struct fid_ep *ep_fid, const void *buf,
 	if (OFI_UNLIKELY(ret))
 		goto unlock;
 
-	if (len > rxm_ep->msg_info->tx_attr->inject_size ||
-	    rxm_ep->util_ep.wr_cntr) {
+	if (len > rxm_ep->inject_limit || rxm_ep->util_ep.wr_cntr) {
 		ret = rxm_ep_rma_emulate_inject(rxm_ep, rxm_conn, buf, len, 0,
 						dest_addr, addr, key,
 						FI_INJECT);
@@ -467,8 +483,7 @@ static ssize_t rxm_ep_inject_writedata(struct fid_ep *ep_fid, const void *buf,
 	if (OFI_UNLIKELY(ret))
 		goto unlock;
 
-	if (len > rxm_ep->msg_info->tx_attr->inject_size ||
-	    rxm_ep->util_ep.wr_cntr) {
+	if (len > rxm_ep->inject_limit || rxm_ep->util_ep.wr_cntr) {
 		ret = rxm_ep_rma_emulate_inject(
 			rxm_ep, rxm_conn, buf, len, data, dest_addr,
 			addr, key, FI_REMOTE_CQ_DATA | FI_INJECT);
diff --git a/deps/libfabric/prov/shm/src/smr.h b/deps/libfabric/prov/shm/src/smr.h
index f6a494b8fab0660faf723f0b30968637e0af6af5..954dd5c2a0c737f80f2b0bb6992b33df0df471ed 100644
--- a/deps/libfabric/prov/shm/src/smr.h
+++ b/deps/libfabric/prov/shm/src/smr.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2018 Intel Corporation, Inc.  All rights reserved.
+ * Copyright (c) 2015-2021 Intel Corporation, Inc.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -58,6 +58,7 @@
 #include <ofi_rbuf.h>
 #include <ofi_list.h>
 #include <ofi_signal.h>
+#include <ofi_epoll.h>
 #include <ofi_util.h>
 #include <ofi_atomic.h>
 #include <ofi_iov.h>
@@ -67,12 +68,14 @@
 
 struct smr_env {
 	size_t sar_threshold;
+	int disable_cma;
 };
 
 extern struct smr_env smr_env;
 extern struct fi_provider smr_prov;
 extern struct fi_info smr_info;
 extern struct util_prov smr_util_prov;
+extern int smr_global_ep_idx; /* protected by the ep_list_lock */
 
 int smr_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric,
 		void *context);
@@ -128,6 +131,7 @@ struct smr_tx_entry {
 	struct smr_ep_name *map_name;
 	enum fi_hmem_iface	iface;
 	uint64_t		device;
+	int			fd;
 };
 
 struct smr_sar_entry {
@@ -178,15 +182,21 @@ static inline enum fi_hmem_iface smr_get_mr_hmem_iface(struct util_domain *domai
 	return ((struct ofi_mr *) *desc)->iface;
 }
 
+static inline uint64_t smr_get_mr_flags(void **desc)
+{
+	assert(desc && *desc);
+	return ((struct ofi_mr *) *desc)->flags;
+}
+
 struct smr_unexp_msg {
 	struct dlist_entry entry;
 	struct smr_cmd cmd;
 };
 
-DECLARE_FREESTACK(struct smr_rx_entry, smr_recv_fs);
-DECLARE_FREESTACK(struct smr_unexp_msg, smr_unexp_fs);
-DECLARE_FREESTACK(struct smr_tx_entry, smr_pend_fs);
-DECLARE_FREESTACK(struct smr_sar_entry, smr_sar_fs);
+OFI_DECLARE_FREESTACK(struct smr_rx_entry, smr_recv_fs);
+OFI_DECLARE_FREESTACK(struct smr_unexp_msg, smr_unexp_fs);
+OFI_DECLARE_FREESTACK(struct smr_tx_entry, smr_pend_fs);
+OFI_DECLARE_FREESTACK(struct smr_sar_entry, smr_sar_fs);
 
 struct smr_queue {
 	struct dlist_entry list;
@@ -195,25 +205,17 @@ struct smr_queue {
 
 struct smr_fabric {
 	struct util_fabric	util_fabric;
-	int			dom_idx;
 };
 
 struct smr_domain {
 	struct util_domain	util_domain;
-	int			dom_idx;
-	int			ep_idx;
 	int			fast_rma;
 };
 
 #define SMR_PREFIX	"fi_shm://"
 #define SMR_PREFIX_NS	"fi_ns://"
 
-static inline const char *smr_no_prefix(const char *addr)
-{
-	char *start;
-
-	return (start = strstr(addr, "://")) ? start + 3 : addr;
-}
+#define SMR_ZE_SOCK_PATH	"/dev/shm/ze_"
 
 #define SMR_RMA_ORDER (OFI_ORDER_RAR_SET | OFI_ORDER_RAW_SET | FI_ORDER_RAS |	\
 		       OFI_ORDER_WAR_SET | OFI_ORDER_WAW_SET | FI_ORDER_WAS |	\
@@ -231,6 +233,36 @@ static inline void *smr_get_ptr(void *base, uint64_t offset)
 	return (char *) base + (uintptr_t) offset;
 }
 
+extern struct dlist_entry sock_name_list;
+extern pthread_mutex_t sock_list_lock;
+
+struct smr_sock_name {
+	char name[SMR_SOCK_NAME_MAX];
+	struct dlist_entry entry;
+};
+
+enum smr_cmap_state {
+	SMR_CMAP_INIT = 0,
+	SMR_CMAP_SUCCESS,
+	SMR_CMAP_FAILED,
+};
+
+struct smr_cmap_entry {
+	enum smr_cmap_state	state;
+	int			device_fds[ZE_MAX_DEVICES];
+};
+
+struct smr_sock_info {
+	char			name[SMR_SOCK_NAME_MAX];
+	int			listen_sock;
+	ofi_epoll_t		epollfd;
+	struct fd_signal	signal;
+	pthread_t		listener_thread;
+	int			*my_fds;
+	int			nfds;
+	struct smr_cmap_entry	peers[SMR_MAX_PEERS];
+};
+
 struct smr_ep {
 	struct util_ep		util_ep;
 	smr_rx_comp_func	rx_comp;
@@ -250,6 +282,9 @@ struct smr_ep {
 	struct smr_queue	unexp_msg_queue;
 	struct smr_queue	unexp_tagged_queue;
 	struct dlist_entry	sar_list;
+
+	int			ep_idx;
+	struct smr_sock_info	*sock_info;
 };
 
 #define smr_ep_rx_flags(smr_ep) ((smr_ep)->util_ep.rx_op_flags)
@@ -264,6 +299,7 @@ static inline int smr_mmap_name(char *shm_name, const char *ep_name,
 
 int smr_endpoint(struct fid_domain *domain, struct fi_info *info,
 		  struct fid_ep **ep, void *context);
+void smr_ep_exchange_fds(struct smr_ep *ep, int64_t id);
 
 int smr_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr,
 		struct fid_cq **cq_fid, void *context);
@@ -286,14 +322,21 @@ void smr_format_inject(struct smr_cmd *cmd, enum fi_hmem_iface iface, uint64_t d
 void smr_format_iov(struct smr_cmd *cmd, const struct iovec *iov, size_t count,
 		    size_t total_len, struct smr_region *smr,
 		    struct smr_resp *resp);
+int smr_format_ze_ipc(struct smr_ep *ep, int64_t id, struct smr_cmd *cmd,
+		      const struct iovec *iov, uint64_t device,
+		      size_t total_len, struct smr_region *smr,
+		      struct smr_resp *resp, struct smr_tx_entry *pend);
+int smr_format_ipc(struct smr_cmd *cmd, void *ptr,
+		   size_t len, struct smr_region *smr,
+		   struct smr_resp *resp, enum fi_hmem_iface iface);
 int smr_format_mmap(struct smr_ep *ep, struct smr_cmd *cmd,
 		    const struct iovec *iov, size_t count, size_t total_len,
 		    struct smr_tx_entry *pend, struct smr_resp *resp);
-void smr_format_sar(struct smr_cmd *cmd, enum fi_hmem_iface iface, uint64_t deivce,
-		    const struct iovec *iov, size_t count,
-		    size_t total_len, struct smr_region *smr,
-		    struct smr_region *peer_smr, struct smr_sar_msg *sar_msg,
-		    struct smr_tx_entry *pending, struct smr_resp *resp);
+int smr_format_sar(struct smr_cmd *cmd, enum fi_hmem_iface iface, uint64_t device,
+		   const struct iovec *iov, size_t count,
+		   size_t total_len, struct smr_region *smr,
+		   struct smr_region *peer_smr, int64_t id,
+		   struct smr_tx_entry *pending, struct smr_resp *resp);
 size_t smr_copy_to_sar(struct smr_sar_msg *sar_msg, struct smr_resp *resp,
 		       struct smr_cmd *cmd, enum fi_hmem_iface,
 		       uint64_t device, const struct iovec *iov, size_t count,
@@ -332,8 +375,17 @@ void smr_ep_progress(struct util_ep *util_ep);
 static inline bool smr_cma_enabled(struct smr_ep *ep,
 				   struct smr_region *peer_smr)
 {
-	return ep->region->cma_cap == SMR_CMA_CAP_ON ||
-	       ep->region == peer_smr;
+	if (ep->region == peer_smr)
+		return ep->region->cma_cap_self == SMR_CMA_CAP_ON;
+	else
+		return ep->region->cma_cap_peer == SMR_CMA_CAP_ON;
+}
+
+static inline bool smr_ze_ipc_enabled(struct smr_region *smr,
+				      struct smr_region *peer_smr)
+{
+	return (smr->flags & SMR_FLAG_IPC_SOCK) &&
+	       (peer_smr->flags & SMR_FLAG_IPC_SOCK);
 }
 
 static inline int smr_cma_loop(pid_t pid, struct iovec *local,
diff --git a/deps/libfabric/prov/shm/src/smr_atomic.c b/deps/libfabric/prov/shm/src/smr_atomic.c
index aaa305ea28a84edb672b174e43c05dd0da899de7..69aef0635c33237ea09cf20b5dd26fe273f75e70 100644
--- a/deps/libfabric/prov/shm/src/smr_atomic.c
+++ b/deps/libfabric/prov/shm/src/smr_atomic.c
@@ -173,9 +173,9 @@ static ssize_t smr_generic_atomic(struct smr_ep *ep,
 		goto unlock_cq;
 	}
 
-	cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr));
+	cmd = ofi_cirque_next(smr_cmd_queue(peer_smr));
 	total_len = ofi_datatype_size(datatype) * ofi_total_ioc_cnt(ioc, count);
-	
+
 	switch (op) {
 	case ofi_op_atomic_compare:
 		assert(compare_ioc);
@@ -221,8 +221,8 @@ static ssize_t smr_generic_atomic(struct smr_ep *ep,
 				ret = -FI_EAGAIN;
 				goto unlock_cq;
 			}
-			resp = ofi_cirque_tail(smr_resp_queue(ep->region));
-			pend = freestack_pop(ep->pend_fs);
+			resp = ofi_cirque_next(smr_resp_queue(ep->region));
+			pend = ofi_freestack_pop(ep->pend_fs);
 			smr_format_pend_resp(pend, cmd, context, iface, device, result_iov,
 					     result_count, id, resp);
 			cmd->msg.hdr.data = smr_get_offset(ep->region, resp);
@@ -248,10 +248,11 @@ static ssize_t smr_generic_atomic(struct smr_ep *ep,
 		}
 	}
 
-	cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr));
+	cmd = ofi_cirque_next(smr_cmd_queue(peer_smr));
 	smr_format_rma_ioc(cmd, rma_ioc, rma_count);
 	ofi_cirque_commit(smr_cmd_queue(peer_smr));
 	peer_smr->cmd_cnt--;
+	smr_signal(peer_smr);
 unlock_cq:
 	fastlock_release(&ep->util_ep.tx_cq->cq_lock);
 unlock_region:
@@ -346,9 +347,9 @@ static ssize_t smr_atomic_inject(struct fid_ep *ep_fid, const void *buf,
 		goto unlock_region;
 	}
 
-	cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr));
+	cmd = ofi_cirque_next(smr_cmd_queue(peer_smr));
 	total_len = count * ofi_datatype_size(datatype);
-	
+
 	iov.iov_base = (void *) buf;
 	iov.iov_len = total_len;
 
@@ -369,10 +370,11 @@ static ssize_t smr_atomic_inject(struct fid_ep *ep_fid, const void *buf,
 
 	ofi_cirque_commit(smr_cmd_queue(peer_smr));
 	peer_smr->cmd_cnt--;
-	cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr));
+	cmd = ofi_cirque_next(smr_cmd_queue(peer_smr));
 	smr_format_rma_ioc(cmd, &rma_ioc, 1);
 	ofi_cirque_commit(smr_cmd_queue(peer_smr));
 	peer_smr->cmd_cnt--;
+	smr_signal(peer_smr);
 
 	ofi_ep_tx_cntr_inc_func(&ep->util_ep, ofi_op_atomic);
 unlock_region:
diff --git a/deps/libfabric/prov/shm/src/smr_attr.c b/deps/libfabric/prov/shm/src/smr_attr.c
index 34026d5964f403bd01c8e840e8a4c908f6d7cccc..cd71ec424c5f02c4c8d90f30515257a0a53d1534 100644
--- a/deps/libfabric/prov/shm/src/smr_attr.c
+++ b/deps/libfabric/prov/shm/src/smr_attr.c
@@ -36,6 +36,8 @@
 #define SMR_RX_CAPS (FI_SOURCE | FI_RMA_EVENT | OFI_RX_MSG_CAPS | FI_TAGGED | \
 		     OFI_RX_RMA_CAPS | FI_ATOMICS | FI_DIRECTED_RECV | \
 		     FI_MULTI_RECV)
+#define SMR_HMEM_TX_CAPS ((SMR_TX_CAPS | FI_HMEM) & ~FI_ATOMICS)
+#define SMR_HMEM_RX_CAPS ((SMR_RX_CAPS | FI_HMEM) & ~FI_ATOMICS)
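+/* FI_ATOMICS is dropped from the HMEM caps: smr performs atomics with CPU
+ * loads/stores on mapped buffers, which is not generally valid for device
+ * (FI_HMEM) memory.
+ */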
 #define SMR_TX_OP_FLAGS (FI_COMPLETION | FI_INJECT_COMPLETE | \
 			 FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE)
 #define SMR_RX_OP_FLAGS (FI_COMPLETION | FI_MULTI_RECV)
@@ -61,7 +63,7 @@ struct fi_rx_attr smr_rx_attr = {
 };
 
 struct fi_tx_attr smr_hmem_tx_attr = {
-	.caps = SMR_TX_CAPS | FI_HMEM,
+	.caps = SMR_HMEM_TX_CAPS,
 	.op_flags = SMR_TX_OP_FLAGS,
 	.comp_order = FI_ORDER_NONE,
 	.msg_order = SMR_RMA_ORDER | FI_ORDER_SAS,
@@ -72,7 +74,7 @@ struct fi_tx_attr smr_hmem_tx_attr = {
 };
 
 struct fi_rx_attr smr_hmem_rx_attr = {
-	.caps = SMR_RX_CAPS | FI_HMEM,
+	.caps = SMR_HMEM_RX_CAPS,
 	.op_flags = SMR_RX_OP_FLAGS,
 	.comp_order = FI_ORDER_STRICT,
 	.msg_order = SMR_RMA_ORDER | FI_ORDER_SAS,
@@ -119,7 +121,7 @@ struct fi_fabric_attr smr_fabric_attr = {
 };
 
 struct fi_info smr_hmem_info = {
-	.caps = SMR_TX_CAPS | SMR_RX_CAPS | FI_HMEM | FI_MULTI_RECV,
+	.caps = SMR_HMEM_TX_CAPS | SMR_HMEM_RX_CAPS | FI_MULTI_RECV,
 	.addr_format = FI_ADDR_STR,
 	.tx_attr = &smr_hmem_tx_attr,
 	.rx_attr = &smr_hmem_rx_attr,
diff --git a/deps/libfabric/prov/shm/src/smr_av.c b/deps/libfabric/prov/shm/src/smr_av.c
index 7ce79167af1abe0d1fa07117b8c19b583a981675..2961f84efb2ee6b7d9b1d86e939d694f213b364e 100644
--- a/deps/libfabric/prov/shm/src/smr_av.c
+++ b/deps/libfabric/prov/shm/src/smr_av.c
@@ -62,7 +62,6 @@ static int smr_av_insert(struct fid_av *av_fid, const void *addr, size_t count,
 	struct smr_av *smr_av;
 	struct smr_ep *smr_ep;
 	struct dlist_entry *av_entry;
-	const char *ep_name;
 	fi_addr_t util_addr;
 	int64_t shm_id = -1;
 	int i, ret;
@@ -73,17 +72,19 @@ static int smr_av_insert(struct fid_av *av_fid, const void *addr, size_t count,
 
 	for (i = 0; i < count; i++, addr = (char *) addr + strlen(addr) + 1) {
 		if (smr_av->used < SMR_MAX_PEERS) {
-			ep_name = smr_no_prefix(addr);
+			util_addr = FI_ADDR_NOTAVAIL;
 			ret = smr_map_add(&smr_prov, smr_av->smr_map,
-					  ep_name, &shm_id);
-			if (!ret)
+					  addr, &shm_id);
+			if (!ret) {
+				fastlock_acquire(&util_av->lock);
 				ret = ofi_av_insert_addr(util_av, &shm_id,
 							 &util_addr);
+				fastlock_release(&util_av->lock);
+			}
 		} else {
 			FI_WARN(&smr_prov, FI_LOG_AV,
 				"AV insert failed. The maximum number of AV "
 				"entries shm supported has been reached.\n");
-			util_addr = FI_ADDR_NOTAVAIL;
 			ret = -FI_ENOMEM;
 		}
 
@@ -159,21 +160,19 @@ static int smr_av_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr,
 {
 	struct util_av *util_av;
 	struct smr_av *smr_av;
-	struct smr_region *peer_smr;
 	int64_t id;
+	char *name;
 
 	util_av = container_of(av, struct util_av, av_fid);
 	smr_av = container_of(util_av, struct smr_av, util_av);
 
 	id = smr_addr_lookup(util_av, fi_addr);
-	peer_smr = smr_map_get(smr_av->smr_map, id);
+	name = smr_av->smr_map->peers[id].peer.name;
 
-	if (!peer_smr)
-		return -FI_ENODATA;
+	strncpy((char *) addr, name, *addrlen);
 
-	strncpy((char *)addr, smr_name(peer_smr), *addrlen);
-	((char *) addr)[MIN(*addrlen - 1, strlen(smr_name(peer_smr)))] = '\0';
-	*addrlen = strlen(smr_name(peer_smr) + 1);
+	((char *) addr)[MIN(*addrlen - 1, strlen(name))] = '\0';
+	*addrlen = strlen(name) + 1;
 	return 0;
 }
 
diff --git a/deps/libfabric/prov/shm/src/smr_comp.c b/deps/libfabric/prov/shm/src/smr_comp.c
index 02c91823cffb8f82021187b32d546729e48e5853..e883ec0dbf8f1404e6e9b5b9e148fdda9b47a267 100644
--- a/deps/libfabric/prov/shm/src/smr_comp.c
+++ b/deps/libfabric/prov/shm/src/smr_comp.c
@@ -48,36 +48,67 @@ int smr_complete_tx(struct smr_ep *ep, void *context, uint32_t op,
 	return ep->tx_comp(ep, context, op, flags, err);
 }
 
-int smr_tx_comp(struct smr_ep *ep, void *context, uint32_t op,
-		uint16_t flags, uint64_t err)
+static int
+smr_write_err_comp(struct util_cq *cq, void *context,
+		   uint64_t flags, uint64_t tag, uint64_t err)
+{
+	struct fi_cq_err_entry err_entry;
+
+	memset(&err_entry, 0, sizeof err_entry);
+	err_entry.op_context = context;
+	err_entry.flags = flags;
+	err_entry.tag = tag;
+	err_entry.err = err;
+	err_entry.prov_errno = -err;
+	return ofi_cq_insert_error(cq, &err_entry);
+}
+
+static int
+smr_write_comp(struct util_cq *cq, void *context,
+	       uint64_t flags, size_t len, void *buf,
+	       uint64_t tag, uint64_t data, uint64_t err)
 {
-	struct fi_cq_tagged_entry *comp;
-	struct util_cq_oflow_err_entry *entry;
-
-	comp = ofi_cirque_tail(ep->util_ep.tx_cq->cirq);
-	if (err) {
-		if (!(entry = calloc(1, sizeof(*entry))))
-			return -FI_ENOMEM;
-		entry->comp.op_context = context;
-		entry->comp.flags = ofi_tx_cq_flags(op);
-		entry->comp.err = err;
-		entry->comp.prov_errno = -err;
-		slist_insert_tail(&entry->list_entry,
-				  &ep->util_ep.tx_cq->oflow_err_list);
-		comp->flags = UTIL_FLAG_ERROR;
+	if (err)
+		return smr_write_err_comp(cq, context, flags, tag, err);
+
+	if (ofi_cirque_freecnt(cq->cirq) > 1) {
+		ofi_cq_write_entry(cq, context, flags, len,
+				   buf, data, tag);
+		return 0;
 	} else {
-		comp->op_context = context;
-		comp->flags = ofi_tx_cq_flags(op);
-		comp->len = 0;
-		comp->buf = NULL;
-		comp->data = 0;
+		return ofi_cq_write_overflow(cq, context, flags,
+					     len, buf, data, tag,
+					     FI_ADDR_NOTAVAIL);
 	}
-	ofi_cirque_commit(ep->util_ep.tx_cq->cirq);
-	return 0;
 }
 
-int smr_tx_comp_signal(struct smr_ep *ep, void *context, uint32_t op,
+static int
+smr_write_src_comp(struct util_cq *cq, void *context,
+		   uint64_t flags, size_t len, void *buf, fi_addr_t addr,
+		   uint64_t tag, uint64_t data, uint64_t err)
+{
+	if (err)
+		return smr_write_err_comp(cq, context, flags, tag, err);
+
+	if (ofi_cirque_freecnt(cq->cirq) > 1) {
+		ofi_cq_write_src_entry(cq, context, flags, len,
+				       buf, data, tag, addr);
+		return 0;
+	} else {
+		return ofi_cq_write_overflow(cq, context, flags,
+					     len, buf, data, tag, addr);
+	}
+}
+
+int smr_tx_comp(struct smr_ep *ep, void *context, uint32_t op,
 		uint16_t flags, uint64_t err)
+{
+	return smr_write_comp(ep->util_ep.tx_cq, context,
+			      ofi_tx_cq_flags(op), 0, NULL, 0, 0, err);
+}
+
+int smr_tx_comp_signal(struct smr_ep *ep, void *context, uint32_t op,
+		       uint16_t flags, uint64_t err)
 {
 	int ret;
 
@@ -88,9 +119,9 @@ int smr_tx_comp_signal(struct smr_ep *ep, void *context, uint32_t op,
 	return 0;
 }
 
-int smr_complete_rx(struct smr_ep *ep, void *context, uint32_t op, uint16_t flags,
-		    size_t len, void *buf, int64_t id, uint64_t tag, uint64_t data,
-		    uint64_t err)
+int smr_complete_rx(struct smr_ep *ep, void *context, uint32_t op,
+		    uint16_t flags, size_t len, void *buf, int64_t id,
+		    uint64_t tag, uint64_t data, uint64_t err)
 {
 	fi_addr_t fiaddr = FI_ADDR_UNSPEC;
 
@@ -99,7 +130,6 @@ int smr_complete_rx(struct smr_ep *ep, void *context, uint32_t op, uint16_t flag
 	if (!err && !(flags & (SMR_REMOTE_CQ_DATA | SMR_RX_COMPLETION)))
 		return 0;
 
-	//TODO I was here
 	if (ep->util_ep.domain->info_domain_caps & FI_SOURCE)
 		fiaddr = ep->region->map->peers[id].fiaddr;
 
@@ -111,45 +141,18 @@ int smr_rx_comp(struct smr_ep *ep, void *context, uint32_t op,
 		uint16_t flags, size_t len, void *buf, fi_addr_t addr,
 		uint64_t tag, uint64_t data, uint64_t err)
 {
-	struct fi_cq_tagged_entry *comp;
-	struct util_cq_oflow_err_entry *entry;
-
-	if (ofi_cirque_isfull(ep->util_ep.rx_cq->cirq))
-		return ofi_cq_write_overflow(ep->util_ep.rx_cq, context,
-					     smr_rx_cq_flags(op, flags),
-					     len, buf, data, tag, addr);
-
-	comp = ofi_cirque_tail(ep->util_ep.rx_cq->cirq);
-	if (err) {
-		if (!(entry = calloc(1, sizeof(*entry))))
-			return -FI_ENOMEM;
-		entry->comp.op_context = context;
-		entry->comp.flags = smr_rx_cq_flags(op, flags);
-		entry->comp.tag = tag;
-		entry->comp.err = err;
-		entry->comp.prov_errno = -err;
-		slist_insert_tail(&entry->list_entry,
-				  &ep->util_ep.rx_cq->oflow_err_list);
-		comp->flags = UTIL_FLAG_ERROR;
-	} else {
-		comp->op_context = context;
-		comp->flags = smr_rx_cq_flags(op, flags);
-		comp->len = len;
-		comp->buf = buf;
-		comp->data = data;
-		comp->tag = tag;
-	}
-	ofi_cirque_commit(ep->util_ep.rx_cq->cirq);
-	return 0;
+	return smr_write_comp(ep->util_ep.rx_cq, context,
+			      smr_rx_cq_flags(op, flags), len, buf,
+			      tag, data, err);
 }
 
 int smr_rx_src_comp(struct smr_ep *ep, void *context, uint32_t op,
 		    uint16_t flags, size_t len, void *buf, fi_addr_t addr,
 		    uint64_t tag, uint64_t data, uint64_t err)
 {
-	ep->util_ep.rx_cq->src[ofi_cirque_windex(ep->util_ep.rx_cq->cirq)] = addr;
-	return smr_rx_comp(ep, context, op, flags, len, buf, addr, tag,
-			   data, err);
+	return smr_write_src_comp(ep->util_ep.rx_cq, context,
+				  smr_rx_cq_flags(op, flags), len, buf, addr,
+				  tag, data, err);
 }
 
 int smr_rx_comp_signal(struct smr_ep *ep, void *context, uint32_t op,
@@ -158,7 +161,8 @@ int smr_rx_comp_signal(struct smr_ep *ep, void *context, uint32_t op,
 {
 	int ret;
 
-	ret = smr_rx_comp(ep, context, op, flags, len, buf, addr, tag, data, err);
+	ret = smr_rx_comp(ep, context, op, flags, len, buf, addr, tag,
+			  data, err);
 	if (ret)
 		return ret;
 	ep->util_ep.rx_cq->wait->signal(ep->util_ep.rx_cq->wait);
diff --git a/deps/libfabric/prov/shm/src/smr_domain.c b/deps/libfabric/prov/shm/src/smr_domain.c
index 597000242905dc4148af4c422c81f9a8d077ec49..afa17ba87263d10c69789103381165a66d9cac09 100644
--- a/deps/libfabric/prov/shm/src/smr_domain.c
+++ b/deps/libfabric/prov/shm/src/smr_domain.c
@@ -101,7 +101,6 @@ int smr_domain_open(struct fid_fabric *fabric, struct fi_info *info,
 
 	smr_fabric = container_of(fabric, struct smr_fabric, util_fabric.fabric_fid);
 	fastlock_acquire(&smr_fabric->util_fabric.lock);
-	smr_domain->dom_idx = smr_fabric->dom_idx++;
 	smr_domain->fast_rma = smr_fast_rma_enabled(info->domain_attr->mr_mode,
 						    info->tx_attr->msg_order);
 	fastlock_release(&smr_fabric->util_fabric.lock);
diff --git a/deps/libfabric/prov/shm/src/smr_ep.c b/deps/libfabric/prov/shm/src/smr_ep.c
index cf8215b2b46f97bbde86e12bf3be372427cf8967..8d09a0db216e9d128728068df7e29434060f5752 100644
--- a/deps/libfabric/prov/shm/src/smr_ep.c
+++ b/deps/libfabric/prov/shm/src/smr_ep.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2018 Intel Corporation. All rights reserved
+ * Copyright (c) 2013-2021 Intel Corporation. All rights reserved
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -33,6 +33,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/uio.h>
+#include <sys/un.h>
 
 #include "ofi_iov.h"
 #include "ofi_hmem.h"
@@ -42,13 +43,28 @@ extern struct fi_ops_msg smr_msg_ops;
 extern struct fi_ops_tagged smr_tagged_ops;
 extern struct fi_ops_rma smr_rma_ops;
 extern struct fi_ops_atomic smr_atomic_ops;
+DEFINE_LIST(sock_name_list);
+pthread_mutex_t sock_list_lock = PTHREAD_MUTEX_INITIALIZER;
+int smr_global_ep_idx = 0;
 
 int smr_setname(fid_t fid, void *addr, size_t addrlen)
 {
 	struct smr_ep *ep;
 	char *name;
 
+	if (addrlen > SMR_NAME_MAX) {
+		FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
+			"Addrlen exceeds max addrlen (%d)\n", SMR_NAME_MAX);
+		return -FI_EINVAL;
+	}
+
 	ep = container_of(fid, struct smr_ep, util_ep.ep_fid.fid);
+	if (ep->region) {
+		FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
+			"Cannot set name after EP has been enabled\n");
+		return -FI_EBUSY;
+	}
+
 	name = strdup(addr);
 	if (!name)
 		return -FI_ENOMEM;
@@ -145,7 +161,7 @@ static int smr_ep_cancel_recv(struct smr_ep *ep, struct smr_queue *queue,
 				  recv_entry->flags, 0,
 				  NULL, recv_entry->peer_id,
 				  recv_entry->tag, 0, FI_ECANCELED);
-		freestack_push(ep->recv_fs, recv_entry);
+		ofi_freestack_push(ep->recv_fs, recv_entry);
 		ret = ret ? ret : 1;
 	}
 
@@ -192,20 +208,22 @@ static void smr_send_name(struct smr_ep *ep, int64_t id)
 	if (smr_peer_data(ep->region)[id].name_sent || !peer_smr->cmd_cnt)
 		goto out;
 
-	cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr));
+	cmd = ofi_cirque_next(smr_cmd_queue(peer_smr));
 
 	cmd->msg.hdr.op = SMR_OP_MAX + ofi_ctrl_connreq;
 	cmd->msg.hdr.id = id;
+	cmd->msg.hdr.data = ep->region->pid;
 
 	tx_buf = smr_freestack_pop(smr_inject_pool(peer_smr));
 	cmd->msg.hdr.src_data = smr_get_offset(peer_smr, tx_buf);
 
-	cmd->msg.hdr.size = strlen(smr_name(ep->region)) + 1;
-	memcpy(tx_buf->data, smr_name(ep->region), cmd->msg.hdr.size);
+	cmd->msg.hdr.size = strlen(ep->name) + 1;
+	memcpy(tx_buf->data, ep->name, cmd->msg.hdr.size);
 
 	smr_peer_data(ep->region)[id].name_sent = 1;
 	ofi_cirque_commit(smr_cmd_queue(peer_smr));
 	peer_smr->cmd_cnt--;
+	smr_signal(peer_smr);
 
 out:
 	fastlock_release(&peer_smr->lock);
@@ -250,8 +268,8 @@ static int smr_match_tagged(struct dlist_entry *item, const void *args)
 
 	recv_entry = container_of(item, struct smr_rx_entry, entry);
 	return smr_match_id(recv_entry->peer_id, attr->id) &&
-	       smr_match_tag(recv_entry->tag, recv_entry->ignore, attr->tag); 
-} 
+	       smr_match_tag(recv_entry->tag, recv_entry->ignore, attr->tag);
+}
 
 static int smr_match_unexp_msg(struct dlist_entry *item, const void *args)
 {
@@ -350,6 +368,54 @@ void smr_format_iov(struct smr_cmd *cmd, const struct iovec *iov, size_t count,
 	memcpy(cmd->msg.data.iov, iov, sizeof(*iov) * count);
 }
 
+int smr_format_ze_ipc(struct smr_ep *ep, int64_t id, struct smr_cmd *cmd,
+		      const struct iovec *iov, uint64_t device,
+		      size_t total_len, struct smr_region *smr,
+		      struct smr_resp *resp, struct smr_tx_entry *pend)
+{
+	int ret;
+	void *base;
+
+	cmd->msg.hdr.op_src = smr_src_ipc;
+	cmd->msg.hdr.src_data = smr_get_offset(smr, resp);
+	cmd->msg.hdr.size = total_len;
+	cmd->msg.data.ipc_info.iface = FI_HMEM_ZE;
+
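+	/* Device fds are exchanged lazily on first use; until the peer
+	 * connection map reports success, the operation is retried. */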
+	if (ep->sock_info->peers[id].state == SMR_CMAP_INIT)
+		smr_ep_exchange_fds(ep, id);
+	if (ep->sock_info->peers[id].state != SMR_CMAP_SUCCESS)
+		return -FI_EAGAIN;
+
+	ret = ze_hmem_get_base_addr(iov[0].iov_base, &base, NULL);
+	if (ret)
+		return ret;
+
+	ret = ze_hmem_get_shared_handle(ep->sock_info->my_fds[device],
+			base, &pend->fd,
+			(void **) &cmd->msg.data.ipc_info.fd_handle);
+	if (ret)
+		return ret;
+
+	cmd->msg.data.ipc_info.device = device;
+	cmd->msg.data.ipc_info.offset = (char *) iov[0].iov_base -
+					(char *) base;
+
+	return FI_SUCCESS;
+}
+
+int smr_format_ipc(struct smr_cmd *cmd, void *ptr,
+                   size_t len, struct smr_region *smr,
+                   struct smr_resp *resp, enum fi_hmem_iface iface)
+{
+	cmd->msg.hdr.op_src = smr_src_ipc;
+	cmd->msg.hdr.src_data = smr_get_offset(smr, resp);
+	cmd->msg.hdr.size = len;
+	cmd->msg.data.ipc_info.iface = iface;
+
+	return ofi_hmem_get_handle(cmd->msg.data.ipc_info.iface, ptr,
+				   (void **)&cmd->msg.data.ipc_info.ipc_handle);
+}
+
 int smr_format_mmap(struct smr_ep *ep, struct smr_cmd *cmd,
 		    const struct iovec *iov, size_t count, size_t total_len,
 		    struct smr_tx_entry *pend, struct smr_resp *resp)
@@ -489,12 +555,18 @@ size_t smr_copy_from_sar(struct smr_sar_msg *sar_msg, struct smr_resp *resp,
 	return *bytes_done - start;
 }
 
-void smr_format_sar(struct smr_cmd *cmd, enum fi_hmem_iface iface, uint64_t device,
-		    const struct iovec *iov, size_t count,
-		    size_t total_len, struct smr_region *smr,
-		    struct smr_region *peer_smr, struct smr_sar_msg *sar_msg,
-		    struct smr_tx_entry *pending, struct smr_resp *resp)
+int smr_format_sar(struct smr_cmd *cmd, enum fi_hmem_iface iface, uint64_t device,
+		   const struct iovec *iov, size_t count,
+		   size_t total_len, struct smr_region *smr,
+		   struct smr_region *peer_smr, int64_t id,
+		   struct smr_tx_entry *pending, struct smr_resp *resp)
 {
+	struct smr_sar_msg *sar_msg;
+
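+	/* Reserve a SAR buffer from the peer's pool up front; when the
+	 * pool is exhausted the caller retries with -FI_EAGAIN. */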
+	if (!peer_smr->sar_cnt)
+		return -FI_EAGAIN;
+
+	sar_msg = smr_freestack_pop(smr_sar_pool(peer_smr));
 	cmd->msg.hdr.op_src = smr_src_sar;
 	cmd->msg.hdr.src_data = smr_get_offset(smr, resp);
 	cmd->msg.data.sar = smr_get_offset(peer_smr, sar_msg);
@@ -505,8 +577,19 @@ void smr_format_sar(struct smr_cmd *cmd, enum fi_hmem_iface iface, uint64_t devi
 	sar_msg->sar[0].status = SMR_SAR_FREE;
 	sar_msg->sar[1].status = SMR_SAR_FREE;
 	if (cmd->msg.hdr.op != ofi_op_read_req)
-		smr_copy_to_sar(sar_msg, NULL, cmd, iface, device ,iov, count,
+		smr_copy_to_sar(sar_msg, resp, cmd, iface, device, iov, count,
 				&pending->bytes_done, &pending->next);
+
+	peer_smr->sar_cnt--;
+	smr_peer_data(smr)[id].sar_status = SMR_SAR_READY;
+
+	return 0;
+}
+
+static void smr_cleanup_epoll(struct smr_sock_info *sock_info)
+{
+	fd_signal_free(&sock_info->signal);
+	ofi_epoll_close(sock_info->epollfd);
 }
 
 static int smr_ep_close(struct fid *fid)
@@ -515,6 +598,15 @@ static int smr_ep_close(struct fid *fid)
 
 	ep = container_of(fid, struct smr_ep, util_ep.ep_fid.fid);
 
+	if (ep->sock_info) {
+		fd_signal_set(&ep->sock_info->signal);
+		pthread_join(ep->sock_info->listener_thread, NULL);
+		close(ep->sock_info->listen_sock);
+		unlink(ep->sock_info->name);
+		smr_cleanup_epoll(ep->sock_info);
+		free(ep->sock_info);
+	}
+
 	ofi_endpoint_close(&ep->util_ep);
 
 	if (ep->region)
@@ -585,7 +677,7 @@ static int smr_ep_bind_cntr(struct smr_ep *ep, struct util_cntr *cntr, uint64_t
 	if (ret)
 		return ret;
 
-	if (cntr->wait) {	
+	if (cntr->wait) {
 		ret = ofi_wait_add_fid(cntr->wait, &ep->util_ep.ep_fid.fid, 0,
 				       smr_ep_trywait);
 		if (ret)
@@ -631,6 +723,332 @@ static int smr_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags)
 	return ret;
 }
 
+static int smr_sendmsg_fd(int sock, int64_t id, int64_t peer_id,
+			  int *fds, int nfds)
+{
+	struct msghdr msg;
+	struct cmsghdr *cmsg;
+	struct iovec iov;
+	char *ctrl_buf;
+	size_t ctrl_size;
+	int ret;
+
+	ctrl_size = sizeof(*fds) * nfds;
+	ctrl_buf = calloc(CMSG_SPACE(ctrl_size), 1);
+	if (!ctrl_buf)
+		return -FI_ENOMEM;
+
+	iov.iov_base = &peer_id;
+	iov.iov_len = sizeof(peer_id);
+
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_control = ctrl_buf;
+	msg.msg_controllen = CMSG_SPACE(ctrl_size);
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+
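+	/* The peer id travels as the normal payload while the fds go as
+	 * SCM_RIGHTS ancillary data; the kernel installs duplicates of
+	 * the descriptors in the receiving process. */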
+	cmsg = CMSG_FIRSTHDR(&msg);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	cmsg->cmsg_len = CMSG_LEN(ctrl_size);
+	memcpy(CMSG_DATA(cmsg), fds, ctrl_size);
+
+	ret = sendmsg(sock, &msg, 0);
+	if (ret == sizeof(peer_id)) {
+		ret = FI_SUCCESS;
+	} else {
+		FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "sendmsg error\n");
+		ret = -FI_EIO;
+	}
+
+	free(ctrl_buf);
+	return ret;
+}
+
+static int smr_recvmsg_fd(int sock, int64_t *peer_id, int *fds, int nfds)
+{
+	struct msghdr msg;
+	struct cmsghdr *cmsg;
+	struct iovec iov;
+	char *ctrl_buf;
+	size_t ctrl_size;
+	int ret;
+
+	ctrl_size = sizeof(*fds) * nfds;
+	ctrl_buf = calloc(CMSG_SPACE(ctrl_size), 1);
+	if (!ctrl_buf)
+		return -FI_ENOMEM;
+
+	iov.iov_base = peer_id;
+	iov.iov_len = sizeof(*peer_id);
+
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_control = ctrl_buf;
+	msg.msg_controllen = CMSG_SPACE(ctrl_size);
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+
+	ret = recvmsg(sock, &msg, 0);
+	if (ret == sizeof(*peer_id)) {
+		ret = FI_SUCCESS;
+	} else {
+		FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "recvmsg error\n");
+		ret = -FI_EIO;
+		goto out;
+	}
+
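+	/* Verify that the full SCM_RIGHTS payload arrived before copying
+	 * out the peer's device fds. */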
+	assert(!(msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)));
+	cmsg = CMSG_FIRSTHDR(&msg);
+	assert(cmsg && cmsg->cmsg_len == CMSG_LEN(ctrl_size) &&
+	       cmsg->cmsg_level == SOL_SOCKET &&
+	       cmsg->cmsg_type == SCM_RIGHTS && CMSG_DATA(cmsg));
+	memcpy(fds, CMSG_DATA(cmsg), ctrl_size);
+out:
+	free(ctrl_buf);
+	return ret;
+}
+
+static void *smr_start_listener(void *args)
+{
+	struct smr_ep *ep = (struct smr_ep *) args;
+	struct sockaddr_un sockaddr;
+	struct ofi_epollfds_event events[SMR_MAX_PEERS + 1];
+	int i, ret, poll_fds, sock = -1;
+	int peer_fds[ZE_MAX_DEVICES];
+	socklen_t len = sizeof(sockaddr); /* accept() reads len as the buffer size */
+	int64_t id, peer_id;
+
+	ep->region->flags |= SMR_FLAG_IPC_SOCK;
+	while (1) {
+		poll_fds = ofi_epoll_wait(ep->sock_info->epollfd, events,
+					  SMR_MAX_PEERS + 1, -1);
+
+		if (poll_fds < 0) {
+			FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
+				"epoll error\n");
+			continue;
+		}
+
+		for (i = 0; i < poll_fds; i++) {
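+			/* A NULL context identifies the shutdown signal fd
+			 * (added with a NULL ptr in smr_init_epoll); the
+			 * listen socket carries sock_info as its context. */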
+			if (!events[i].data.ptr)
+				goto out;
+
+			sock = accept(ep->sock_info->listen_sock,
+				      (struct sockaddr *) &sockaddr, &len);
+			if (sock < 0) {
+				FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
+					"accept error\n");
+				continue;
+			}
+
+			FI_DBG(&smr_prov, FI_LOG_EP_CTRL,
+			       "EP accepted connection request from %s\n",
+			       sockaddr.sun_path);
+
+			ret = smr_recvmsg_fd(sock, &id, peer_fds,
+					     ep->sock_info->nfds);
+			if (!ret) {
+				memcpy(ep->sock_info->peers[id].device_fds,
+				       peer_fds, sizeof(*peer_fds) *
+				       ep->sock_info->nfds);
+
+				peer_id = smr_peer_data(ep->region)[id].addr.id;
+				ret = smr_sendmsg_fd(sock, id, peer_id,
+						ep->sock_info->my_fds,
+						ep->sock_info->nfds);
+				ep->sock_info->peers[id].state =
+					ret ? SMR_CMAP_FAILED :
+					SMR_CMAP_SUCCESS;
+			}
+
+			close(sock);
+			unlink(sockaddr.sun_path);
+		}
+	}
+out:
+	close(ep->sock_info->listen_sock);
+	unlink(ep->sock_info->name);
+	return NULL;
+}
+
+static int smr_init_epoll(struct smr_sock_info *sock_info)
+{
+	int ret;
+
+	ret = ofi_epoll_create(&sock_info->epollfd);
+	if (ret < 0)
+		return ret;
+
+	ret = fd_signal_init(&sock_info->signal);
+	if (ret < 0)
+		goto err2;
+
+	ret = ofi_epoll_add(sock_info->epollfd,
+	                    sock_info->signal.fd[FI_READ_FD],
+	                    OFI_EPOLL_IN, NULL);
+	if (ret != 0)
+		goto err1;
+
+	ret = ofi_epoll_add(sock_info->epollfd, sock_info->listen_sock,
+			    OFI_EPOLL_IN, sock_info);
+	if (ret != 0)
+		goto err1;
+
+	return FI_SUCCESS;
+err1:
+	fd_signal_free(&sock_info->signal);
+err2:
+	ofi_epoll_close(sock_info->epollfd);
+	return ret;
+}
+
+void smr_ep_exchange_fds(struct smr_ep *ep, int64_t id)
+{
+	struct smr_region *peer_smr = smr_peer_region(ep->region, id);
+	struct sockaddr_un server_sockaddr = {0}, client_sockaddr = {0};
+	char *name1, *name2;
+	int ret = -1, sock = -1;
+	int64_t peer_id;
+	int peer_fds[ZE_MAX_DEVICES];
+
+	if (peer_smr->pid == ep->region->pid ||
+	    !(peer_smr->flags & SMR_FLAG_IPC_SOCK))
+		goto out;
+
+	sock = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (sock < 0)
+		goto out;
+
+	if (strcmp(smr_sock_name(ep->region), smr_sock_name(peer_smr)) < 1) {
+		name1 = smr_sock_name(ep->region);
+		name2 = smr_sock_name(peer_smr);
+	} else {
+		name1 = smr_sock_name(peer_smr);
+		name2 = smr_sock_name(ep->region);
+	}
+	client_sockaddr.sun_family = AF_UNIX;
+	snprintf(client_sockaddr.sun_path, SMR_SOCK_NAME_MAX, "%s%s:%s",
+		 SMR_ZE_SOCK_PATH, name1, name2);
+
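+	/* Both sides derive the same pair name from the sorted endpoint
+	 * names, so this bind doubles as a lock: EADDRINUSE here means
+	 * the peer (or another thread) already started the exchange. */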
+	ret = bind(sock, (struct sockaddr *) &client_sockaddr,
+		  (socklen_t) sizeof(client_sockaddr));
+	if (ret == -1) {
+		if (errno != EADDRINUSE) {
+			FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "bind error\n");
+			ep->sock_info->peers[id].state = SMR_CMAP_FAILED;
+		}
+		close(sock);
+		return;
+	}
+
+	server_sockaddr.sun_family = AF_UNIX;
+	snprintf(server_sockaddr.sun_path, SMR_SOCK_NAME_MAX, "%s%s",
+		 SMR_ZE_SOCK_PATH, smr_sock_name(peer_smr));
+
+	ret = connect(sock, (struct sockaddr *) &server_sockaddr,
+		      sizeof(server_sockaddr));
+	if (ret == -1)
+		goto cleanup;
+
+	FI_DBG(&smr_prov, FI_LOG_EP_CTRL, "EP connected to UNIX socket %s\n",
+	       server_sockaddr.sun_path);
+
+	peer_id = smr_peer_data(ep->region)[id].addr.id;
+	ret = smr_sendmsg_fd(sock, id, peer_id, ep->sock_info->my_fds,
+			     ep->sock_info->nfds);
+	if (ret)
+		goto cleanup;
+
+	ret = smr_recvmsg_fd(sock, &id, peer_fds, ep->sock_info->nfds);
+	if (ret)
+		goto cleanup;
+
+	memcpy(ep->sock_info->peers[id].device_fds, peer_fds,
+	       sizeof(*peer_fds) * ep->sock_info->nfds);
+
+cleanup:
+	close(sock);
+	unlink(client_sockaddr.sun_path);
+out:
+	ep->sock_info->peers[id].state = ret ?
+		SMR_CMAP_FAILED : SMR_CMAP_SUCCESS;
+}
+
+static void smr_init_ipc_socket(struct smr_ep *ep)
+{
+	struct smr_sock_name *sock_name;
+	struct sockaddr_un sockaddr = {0};
+	int ret;
+
+	ep->sock_info = calloc(1, sizeof(*ep->sock_info));
+	if (!ep->sock_info)
+		goto err_out;
+
+	ep->sock_info->listen_sock = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (ep->sock_info->listen_sock < 0)
+		goto free;
+
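+	/* Advertise "<pid>:<ep_idx>" as the socket name; it is unique per
+	 * endpoint on the node and lets peers locate this listener. */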
+	snprintf(smr_sock_name(ep->region), SMR_SOCK_NAME_MAX,
+		 "%ld:%d", (long) ep->region->pid, ep->ep_idx);
+
+	sockaddr.sun_family = AF_UNIX;
+	snprintf(sockaddr.sun_path, SMR_SOCK_NAME_MAX,
+		 "%s%s", SMR_ZE_SOCK_PATH, smr_sock_name(ep->region));
+
+	ret = bind(ep->sock_info->listen_sock, (struct sockaddr *) &sockaddr,
+		   (socklen_t) sizeof(sockaddr));
+	if (ret)
+		goto close;
+
+	ret = listen(ep->sock_info->listen_sock, SMR_MAX_PEERS);
+	if (ret)
+		goto close;
+
+	FI_DBG(&smr_prov, FI_LOG_EP_CTRL, "EP listening on UNIX socket %s\n",
+	       sockaddr.sun_path);
+
+	ret = smr_init_epoll(ep->sock_info);
+	if (ret)
+		goto close;
+
+	sock_name = calloc(1, sizeof(*sock_name));
+	if (!sock_name)
+		goto cleanup;
+
+	memcpy(sock_name->name, sockaddr.sun_path, strlen(sockaddr.sun_path));
+	memcpy(ep->sock_info->name, sockaddr.sun_path,
+	       strlen(sockaddr.sun_path));
+
+	pthread_mutex_lock(&sock_list_lock);
+	dlist_insert_tail(&sock_name->entry, &sock_name_list);
+	pthread_mutex_unlock(&sock_list_lock);
+
+	ep->sock_info->my_fds = ze_hmem_get_dev_fds(&ep->sock_info->nfds);
+	ret = pthread_create(&ep->sock_info->listener_thread, NULL,
+			     &smr_start_listener, ep);
+	if (ret)
+		goto remove;
+
+	return;
+
+remove:
+	pthread_mutex_lock(&sock_list_lock);
+	dlist_remove(&sock_name->entry);
+	pthread_mutex_unlock(&sock_list_lock);
+	free(sock_name);
+cleanup:
+	smr_cleanup_epoll(ep->sock_info);
+close:
+	close(ep->sock_info->listen_sock);
+	unlink(sockaddr.sun_path);
+free:
+	free(ep->sock_info);
+	ep->sock_info = NULL;
+err_out:
+	FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "Unable to initialize IPC socket. "
+		"Defaulting to SAR for device transfers\n");
+}
+
 static int smr_ep_ctrl(struct fid *fid, int command, void *arg)
 {
 	struct smr_attr attr;
@@ -643,17 +1061,28 @@ static int smr_ep_ctrl(struct fid *fid, int command, void *arg)
 
 	switch (command) {
 	case FI_ENABLE:
-		if (!ep->util_ep.rx_cq || !ep->util_ep.tx_cq)
+		if ((ofi_needs_rx(ep->util_ep.caps) && !ep->util_ep.rx_cq) ||
+		    (ofi_needs_tx(ep->util_ep.caps) && !ep->util_ep.tx_cq))
 			return -FI_ENOCQ;
 		if (!ep->util_ep.av)
 			return -FI_ENOAV;
 
-		attr.name = ep->name;
+		attr.name = smr_no_prefix(ep->name);
 		attr.rx_count = ep->rx_size;
 		attr.tx_count = ep->tx_size;
 		ret = smr_create(&smr_prov, av->smr_map, &attr, &ep->region);
 		if (ret)
 			return ret;
+
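+		/* CMA cannot access device memory, so disable it for
+		 * FI_HMEM-enabled endpoints (or on request); ZE transfers
+		 * then use the IPC socket when p2p is enabled. */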
+		if (ep->util_ep.caps & FI_HMEM || smr_env.disable_cma) {
+			ep->region->cma_cap_peer = SMR_CMA_CAP_OFF;
+			ep->region->cma_cap_self = SMR_CMA_CAP_OFF;
+			if (ep->util_ep.caps & FI_HMEM) {
+				if (ze_hmem_p2p_enabled())
+					smr_init_ipc_socket(ep);
+			}
+		}
+
 		smr_exchange_all_peers(ep->region);
 		break;
 	default:
@@ -670,20 +1099,22 @@ static struct fi_ops smr_ep_fi_ops = {
 	.ops_open = fi_no_ops_open,
 };
 
-static int smr_endpoint_name(char *name, char *addr, size_t addrlen,
-			     int dom_idx, int ep_idx)
+static int smr_endpoint_name(struct smr_ep *ep, char *name, char *addr,
+			     size_t addrlen)
 {
-	const char *start;
 	memset(name, 0, SMR_NAME_MAX);
 	if (!addr || addrlen > SMR_NAME_MAX)
 		return -FI_EINVAL;
 
-	start = smr_no_prefix((const char *) addr);
-	if (strstr(addr, SMR_PREFIX) || dom_idx || ep_idx)
-		snprintf(name, SMR_NAME_MAX - 1, "%s:%d:%d:%d", start, getuid(),
-			 dom_idx, ep_idx);
+	pthread_mutex_lock(&ep_list_lock);
+	ep->ep_idx = smr_global_ep_idx++;
+	pthread_mutex_unlock(&ep_list_lock);
+
+	if (strstr(addr, SMR_PREFIX))
+		snprintf(name, SMR_NAME_MAX - 1, "%s:%d:%d", addr, getuid(),
+			 ep->ep_idx);
 	else
-		snprintf(name, SMR_NAME_MAX - 1, "%s", start);
+		snprintf(name, SMR_NAME_MAX - 1, "%s", addr);
 
 	return 0;
 }
@@ -692,24 +1123,16 @@ int smr_endpoint(struct fid_domain *domain, struct fi_info *info,
 		  struct fid_ep **ep_fid, void *context)
 {
 	struct smr_ep *ep;
-	struct smr_domain *smr_domain;
-	int ret, ep_idx;
+	int ret;
 	char name[SMR_NAME_MAX];
 
 	ep = calloc(1, sizeof(*ep));
 	if (!ep)
 		return -FI_ENOMEM;
 
-	smr_domain = container_of(domain, struct smr_domain, util_domain.domain_fid);
-
-	fastlock_acquire(&smr_domain->util_domain.lock);
-	ep_idx = smr_domain->ep_idx++;
-	fastlock_release(&smr_domain->util_domain.lock);
-	ret = smr_endpoint_name(name, info->src_addr, info->src_addrlen,
-			        smr_domain->dom_idx, ep_idx);
+	ret = smr_endpoint_name(ep, name, info->src_addr, info->src_addrlen);
 	if (ret)
 		goto err2;
-
 	ret = smr_setname(&ep->util_ep.ep_fid.fid, name, SMR_NAME_MAX);
 	if (ret)
 		goto err2;
diff --git a/deps/libfabric/prov/shm/src/smr_init.c b/deps/libfabric/prov/shm/src/smr_init.c
index 5f41bece78b427ed15ec67fdf5239ffb2eb83517..e24d6f572187fc76ebf4b0fbe7f5b86e51fca283 100644
--- a/deps/libfabric/prov/shm/src/smr_init.c
+++ b/deps/libfabric/prov/shm/src/smr_init.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2017 Intel Corporation. All rights reserved.
+ * Copyright (c) 2015-2021 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -40,6 +40,7 @@
 extern struct sigaction *old_action;
 struct smr_env smr_env = {
 	.sar_threshold = SIZE_MAX,
+	.disable_cma = false,
 };
 
 static void smr_init_env(void)
@@ -47,6 +48,7 @@ static void smr_init_env(void)
 	fi_param_get_size_t(&smr_prov, "sar_threshold", &smr_env.sar_threshold);
 	fi_param_get_size_t(&smr_prov, "tx_size", &smr_info.tx_attr->size);
 	fi_param_get_size_t(&smr_prov, "rx_size", &smr_info.rx_attr->size);
+	fi_param_get_bool(&smr_prov, "disable_cma", &smr_env.disable_cma);
 }
 
 static void smr_resolve_addr(const char *node, const char *service,
@@ -99,7 +101,8 @@ static int smr_shm_space_check(size_t tx_count, size_t rx_count)
 	shm_size_needed = num_of_core *
 			  smr_calculate_size_offsets(tx_count, rx_count,
 						     NULL, NULL, NULL,
-						     NULL, NULL, NULL);
+						     NULL, NULL, NULL,
+						     NULL);
 	err = statvfs(shm_fs, &stat);
 	if (err) {
 		FI_WARN(&smr_prov, FI_LOG_CORE,
@@ -213,6 +216,8 @@ SHM_INI
 	fi_param_define(&smr_prov, "rx_size", FI_PARAM_SIZE_T,
 			"Max number of outstanding rx operations \
 			 Default: 1024");
+	fi_param_define(&smr_prov, "disable_cma", FI_PARAM_BOOL,
+			"Manually disables CMA. Default: false");
 
 	smr_init_env();
 
diff --git a/deps/libfabric/prov/shm/src/smr_msg.c b/deps/libfabric/prov/shm/src/smr_msg.c
index 1eb86200ba6fb68cf6b340d8b50e47dfece33ec8..5578585c0f0cd1f2ed500b8022d09a47a766a826 100644
--- a/deps/libfabric/prov/shm/src/smr_msg.c
+++ b/deps/libfabric/prov/shm/src/smr_msg.c
@@ -1,5 +1,6 @@
 /*
- * Copyright (c) 2013-2018 Intel Corporation. All rights reserved
+ * Copyright (c) 2013-2021 Intel Corporation. All rights reserved
+ * (C) Copyright 2021 Amazon.com, Inc. or its affiliates.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -57,13 +58,13 @@ static struct smr_rx_entry *smr_get_recv_entry(struct smr_ep *ep,
 	struct smr_rx_entry *entry;
 
 	if (ofi_cirque_isfull(ep->util_ep.rx_cq->cirq) ||
-	    freestack_isempty(ep->recv_fs)) {
+	    ofi_freestack_isempty(ep->recv_fs)) {
 		FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
 			"not enough space to post recv\n");
 		return NULL;
 	}
 
-	entry = freestack_pop(ep->recv_fs);
+	entry = ofi_freestack_pop(ep->recv_fs);
 
 	memcpy(&entry->iov, iov, sizeof(*iov) * count);
 	entry->iov_count = count;
@@ -158,7 +159,6 @@ static ssize_t smr_generic_sendmsg(struct smr_ep *ep, const struct iovec *iov,
 {
 	struct smr_region *peer_smr;
 	struct smr_inject_buf *tx_buf;
-	struct smr_sar_msg *sar;
 	struct smr_resp *resp;
 	struct smr_cmd *cmd;
 	struct smr_tx_entry *pend;
@@ -167,6 +167,7 @@ static ssize_t smr_generic_sendmsg(struct smr_ep *ep, const struct iovec *iov,
 	int64_t id, peer_id;
 	ssize_t ret = 0;
 	size_t total_len;
+	bool use_ipc;
 
 	assert(iov_count <= SMR_IOV_LIMIT);
 
@@ -193,13 +194,20 @@ static ssize_t smr_generic_sendmsg(struct smr_ep *ep, const struct iovec *iov,
 
 	total_len = ofi_total_iov_len(iov, iov_count);
 
-	cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr));
+	cmd = ofi_cirque_next(smr_cmd_queue(peer_smr));
 	smr_generic_format(cmd, peer_id, op, tag, data, op_flags);
 
-	if (total_len <= SMR_MSG_DATA_LEN && !(op_flags & FI_DELIVERY_COMPLETE)) {
+	/* Skip the inline/inject paths when IPC is available so that a
+	 * device-to-device transfer can be used instead. */
+	use_ipc = ofi_hmem_is_ipc_enabled(iface) && (iov_count == 1) &&
+		  desc && (smr_get_mr_flags(desc) & FI_HMEM_DEVICE_ONLY);
+
+	if (total_len <= SMR_MSG_DATA_LEN &&
+	    !(op_flags & FI_DELIVERY_COMPLETE) && !use_ipc) {
 		smr_format_inline(cmd, iface, device, iov, iov_count);
 	} else if (total_len <= SMR_INJECT_SIZE &&
-		   !(op_flags & FI_DELIVERY_COMPLETE)) {
+		   !(op_flags & FI_DELIVERY_COMPLETE) &&
+		   !use_ipc) {
 		tx_buf = smr_freestack_pop(smr_inject_pool(peer_smr));
 		smr_format_inject(cmd, iface, device, iov, iov_count, peer_smr, tx_buf);
 	} else {
@@ -207,31 +215,40 @@ static ssize_t smr_generic_sendmsg(struct smr_ep *ep, const struct iovec *iov,
 			ret = -FI_EAGAIN;
 			goto unlock_cq;
 		}
-		resp = ofi_cirque_tail(smr_resp_queue(ep->region));
-		pend = freestack_pop(ep->pend_fs);
+		resp = ofi_cirque_next(smr_resp_queue(ep->region));
+		pend = ofi_freestack_pop(ep->pend_fs);
 		if (smr_cma_enabled(ep, peer_smr) && iface == FI_HMEM_SYSTEM) {
 			smr_format_iov(cmd, iov, iov_count, total_len, ep->region,
 				       resp);
 		} else {
-			if (total_len <= smr_env.sar_threshold ||
-			    iface != FI_HMEM_SYSTEM) {
-				if (!peer_smr->sar_cnt) {
-					ret = -FI_EAGAIN;
-				} else {
-					sar = smr_freestack_pop(smr_sar_pool(peer_smr));
-					smr_format_sar(cmd, iface, device, iov,
-						       iov_count, total_len,
-						       ep->region, peer_smr, sar,
-						       pend, resp);
-					peer_smr->sar_cnt--;
-					smr_peer_data(ep->region)[id].sar_status = 1;
+			if (use_ipc && iface == FI_HMEM_ZE &&
+			    smr_ze_ipc_enabled(ep->region, peer_smr)) {
+				ret = smr_format_ze_ipc(ep, id, cmd, iov,
+					device, total_len, ep->region,
+					resp, pend);
+			} else if (use_ipc && iface != FI_HMEM_ZE) {
+				ret = smr_format_ipc(cmd, iov[0].iov_base, total_len,
+						     ep->region, resp, iface);
+				if (ret) {
+					FI_WARN_ONCE(&smr_prov, FI_LOG_EP_CTRL,
+						     "unable to use IPC for msg, falling back to SAR\n");
+					ret = smr_format_sar(cmd, iface, device, iov,
+							     iov_count, total_len,
+							     ep->region, peer_smr, id,
+							     pend, resp);
 				}
+			} else if (total_len <= smr_env.sar_threshold ||
+				   iface != FI_HMEM_SYSTEM) {
+				ret = smr_format_sar(cmd, iface, device, iov,
+						     iov_count, total_len,
+						     ep->region, peer_smr, id,
+						     pend, resp);
 			} else {
 				ret = smr_format_mmap(ep, cmd, iov, iov_count,
 						      total_len, pend, resp);
 			}
 			if (ret) {
-				freestack_push(ep->pend_fs, pend);
+				ofi_freestack_push(ep->pend_fs, pend);
 				ret = -FI_EAGAIN;
 				goto unlock_cq;
 			}
@@ -251,6 +268,7 @@ static ssize_t smr_generic_sendmsg(struct smr_ep *ep, const struct iovec *iov,
 commit:
 	ofi_cirque_commit(smr_cmd_queue(peer_smr));
 	peer_smr->cmd_cnt--;
+	smr_signal(peer_smr);
 unlock_cq:
 	fastlock_release(&ep->util_ep.tx_cq->cq_lock);
 unlock_region:
@@ -329,7 +347,7 @@ static ssize_t smr_generic_inject(struct fid_ep *ep_fid, const void *buf,
 		goto unlock;
 	}
 
-	cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr));
+	cmd = ofi_cirque_next(smr_cmd_queue(peer_smr));
 	smr_generic_format(cmd, peer_id, op, tag, data, op_flags);
 
 	if (len <= SMR_MSG_DATA_LEN) {
@@ -342,6 +360,7 @@ static ssize_t smr_generic_inject(struct fid_ep *ep_fid, const void *buf,
 	ofi_ep_tx_cntr_inc_func(&ep->util_ep, op);
 	peer_smr->cmd_cnt--;
 	ofi_cirque_commit(smr_cmd_queue(peer_smr));
+	smr_signal(peer_smr);
 unlock:
 	fastlock_release(&peer_smr->lock);
 
diff --git a/deps/libfabric/prov/shm/src/smr_progress.c b/deps/libfabric/prov/shm/src/smr_progress.c
index ea0f7a6eb9e7f395c685a4252d30f5eff1b8f1c2..8dcee52a975e86db0aff19f4884de6b40e3a2276 100644
--- a/deps/libfabric/prov/shm/src/smr_progress.c
+++ b/deps/libfabric/prov/shm/src/smr_progress.c
@@ -36,11 +36,12 @@
 
 #include "ofi_iov.h"
 #include "ofi_hmem.h"
+#include "ofi_atom.h"
 #include "smr.h"
 
 
-static inline void smr_try_progress_to_sar(struct smr_sar_msg *sar_msg,
-				struct smr_resp *resp,
+static inline void smr_try_progress_to_sar(struct smr_region *smr,
+				struct smr_sar_msg *sar_msg, struct smr_resp *resp,
 				struct smr_cmd *cmd, enum fi_hmem_iface iface,
 				uint64_t device, struct iovec *iov,
 				size_t iov_count, size_t *bytes_done, int *next)
@@ -48,10 +49,11 @@ static inline void smr_try_progress_to_sar(struct smr_sar_msg *sar_msg,
 	while (*bytes_done < cmd->msg.hdr.size &&
 	       smr_copy_to_sar(sar_msg, resp, cmd, iface, device, iov,
 			       iov_count, bytes_done, next));
+	smr_signal(smr);
 }
 
-static inline void smr_try_progress_from_sar(struct smr_sar_msg *sar_msg,
-				struct smr_resp *resp,
+static inline void smr_try_progress_from_sar(struct smr_region *smr,
+				struct smr_sar_msg *sar_msg, struct smr_resp *resp,
 				struct smr_cmd *cmd, enum fi_hmem_iface iface,
 				uint64_t device, struct iovec *iov,
 				size_t iov_count, size_t *bytes_done, int *next)
@@ -59,6 +61,7 @@ static inline void smr_try_progress_from_sar(struct smr_sar_msg *sar_msg,
 	while (*bytes_done < cmd->msg.hdr.size &&
 	       smr_copy_from_sar(sar_msg, resp, cmd, iface, device, iov,
 				 iov_count, bytes_done, next));
+	smr_signal(smr);
 }
 
 static int smr_progress_resp_entry(struct smr_ep *ep, struct smr_resp *resp,
@@ -69,12 +72,17 @@ static int smr_progress_resp_entry(struct smr_ep *ep, struct smr_resp *resp,
 	struct smr_inject_buf *tx_buf = NULL;
 	struct smr_sar_msg *sar_msg = NULL;
 	uint8_t *src;
+	ssize_t hmem_copy_ret;
 
 	peer_smr = smr_peer_region(ep->region, pending->peer_id);
 
 	switch (pending->cmd.msg.hdr.op_src) {
 	case smr_src_iov:
 		break;
+	case smr_src_ipc:
+		if (pending->iface == FI_HMEM_ZE)
+			close(pending->fd);
+		break;
 	case smr_src_sar:
 		sar_msg = smr_get_ptr(peer_smr, pending->cmd.msg.data.sar);
 		if (pending->bytes_done == pending->cmd.msg.hdr.size &&
@@ -82,25 +90,18 @@ static int smr_progress_resp_entry(struct smr_ep *ep, struct smr_resp *resp,
 		    sar_msg->sar[1].status == SMR_SAR_FREE)
 			break;
 
-		if (peer_smr != ep->region) {
-			if (fastlock_tryacquire(&peer_smr->lock))
-				return -FI_EAGAIN;
-		}
 		if (pending->cmd.msg.hdr.op == ofi_op_read_req)
-			smr_try_progress_from_sar(sar_msg, resp,
+			smr_try_progress_from_sar(peer_smr, sar_msg, resp,
 					&pending->cmd, pending->iface,
 					pending->device, pending->iov,
 				        pending->iov_count, &pending->bytes_done,
 					&pending->next);
 		else
-			smr_try_progress_to_sar(sar_msg, resp,
+			smr_try_progress_to_sar(peer_smr, sar_msg, resp,
 					&pending->cmd, pending->iface,
 					pending->device, pending->iov,
 					pending->iov_count, &pending->bytes_done,
 					&pending->next);
-		if (peer_smr != ep->region)
-			fastlock_release(&peer_smr->lock);
-
 		if (pending->bytes_done != pending->cmd.msg.hdr.size ||
 		    sar_msg->sar[0].status != SMR_SAR_FREE ||
 		    sar_msg->sar[1].status != SMR_SAR_FREE)
@@ -111,14 +112,24 @@ static int smr_progress_resp_entry(struct smr_ep *ep, struct smr_resp *resp,
 			break;
 		if (pending->cmd.msg.hdr.op == ofi_op_read_req) {
 			if (!*err) {
-				pending->bytes_done = ofi_copy_to_iov(pending->iov,
-						pending->iov_count, 0,
-						pending->map_ptr,
-						pending->cmd.msg.hdr.size);
-				if (pending->bytes_done != pending->cmd.msg.hdr.size) {
+				hmem_copy_ret =
+					ofi_copy_to_hmem_iov(pending->iface,
+							     pending->device,
+							     pending->iov,
+							     pending->iov_count,
+							     0, pending->map_ptr,
+							     pending->cmd.msg.hdr.size);
+				if (hmem_copy_ret < 0) {
+					FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
+						"Copy from mmapped file failed with code %d\n",
+						(int)(-hmem_copy_ret));
+					*err = hmem_copy_ret;
+				} else if (hmem_copy_ret != pending->cmd.msg.hdr.size) {
 					FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
 						"Incomplete copy from mmapped file\n");
-					*err = -FI_EIO;
+					*err = -FI_ETRUNC;
+				} else {
+					pending->bytes_done = (size_t) hmem_copy_ret;
 				}
 			}
 			munmap(pending->map_ptr, pending->cmd.msg.hdr.size);
@@ -131,18 +142,27 @@ static int smr_progress_resp_entry(struct smr_ep *ep, struct smr_resp *resp,
 	case smr_src_inject:
 		inj_offset = (size_t) pending->cmd.msg.hdr.src_data;
 		tx_buf = smr_get_ptr(peer_smr, inj_offset);
-		if (*err || pending->bytes_done == pending->cmd.msg.hdr.size)
+		if (*err || pending->bytes_done == pending->cmd.msg.hdr.size ||
+		    pending->cmd.msg.hdr.op == ofi_op_atomic)
 			break;
 
 		src = pending->cmd.msg.hdr.op == ofi_op_atomic_compare ?
 		      tx_buf->buf : tx_buf->data;
-		pending->bytes_done = ofi_copy_to_iov(pending->iov, pending->iov_count,
-				       0, src, pending->cmd.msg.hdr.size);
+		hmem_copy_ret = ofi_copy_to_hmem_iov(pending->iface, pending->device,
+						      pending->iov, pending->iov_count,
+						      0, src, pending->cmd.msg.hdr.size);
 
-		if (pending->bytes_done != pending->cmd.msg.hdr.size) {
+		if (hmem_copy_ret < 0) {
+			FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
+				"RMA read/fetch failed with code %d\n",
+				(int)(-hmem_copy_ret));
+			*err = hmem_copy_ret;
+		} else if (hmem_copy_ret != pending->cmd.msg.hdr.size) {
 			FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
 				"Incomplete rma read/fetch buffer copied\n");
-			*err = FI_EIO;
+			*err = -FI_ETRUNC;
+		} else {
+			pending->bytes_done = (size_t) hmem_copy_ret;
 		}
 		break;
 	default:
@@ -153,8 +173,10 @@ static int smr_progress_resp_entry(struct smr_ep *ep, struct smr_resp *resp,
 	//Skip locking on transfers from self since we already have
 	//the ep->region->lock
 	if (peer_smr != ep->region) {
-		if (fastlock_tryacquire(&peer_smr->lock))
+		if (fastlock_tryacquire(&peer_smr->lock)) {
+			smr_signal(ep->region);
 			return -FI_EAGAIN;
+		}
 	}
 
 	peer_smr->cmd_cnt++;
@@ -169,7 +191,7 @@ static int smr_progress_resp_entry(struct smr_ep *ep, struct smr_resp *resp,
 	if (peer_smr != ep->region)
 		fastlock_release(&peer_smr->lock);
 
-	return 0;
+	return FI_SUCCESS;
 }
 
 static void smr_progress_resp(struct smr_ep *ep)
@@ -198,7 +220,7 @@ static void smr_progress_resp(struct smr_ep *ep)
 				"unable to process tx completion\n");
 			break;
 		}
-		freestack_push(ep->pend_fs, pending);
+		ofi_freestack_push(ep->pend_fs, pending);
 		ofi_cirque_discard(smr_resp_queue(ep->region));
 	}
 	fastlock_release(&ep->util_ep.tx_cq->cq_lock);
@@ -209,14 +231,24 @@ static int smr_progress_inline(struct smr_cmd *cmd, enum fi_hmem_iface iface,
 			       uint64_t device, struct iovec *iov,
 			       size_t iov_count, size_t *total_len)
 {
-	*total_len = ofi_copy_to_hmem_iov(iface, device, iov, iov_count, 0,
-					  cmd->msg.data.msg, cmd->msg.hdr.size);
-	if (*total_len != cmd->msg.hdr.size) {
+	ssize_t hmem_copy_ret;
+
+	hmem_copy_ret = ofi_copy_to_hmem_iov(iface, device, iov, iov_count, 0,
+					     cmd->msg.data.msg, cmd->msg.hdr.size);
+	if (hmem_copy_ret < 0) {
 		FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
-			"recv truncated");
-		return -FI_EIO;
+			"inline recv failed with code %d\n",
+			(int)(-hmem_copy_ret));
+		return hmem_copy_ret;
+	} else if (hmem_copy_ret != cmd->msg.hdr.size) {
+		FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
+			"inline recv truncated\n");
+		return -FI_ETRUNC;
 	}
-	return 0;
+
+	*total_len = hmem_copy_ret;
+
+	return FI_SUCCESS;
 }
 
 static int smr_progress_inject(struct smr_cmd *cmd, enum fi_hmem_iface iface,
@@ -226,6 +258,7 @@ static int smr_progress_inject(struct smr_cmd *cmd, enum fi_hmem_iface iface,
 {
 	struct smr_inject_buf *tx_buf;
 	size_t inj_offset;
+	ssize_t hmem_copy_ret;
 
 	inj_offset = (size_t) cmd->msg.hdr.src_data;
 	tx_buf = smr_get_ptr(ep->region, inj_offset);
@@ -236,20 +269,30 @@ static int smr_progress_inject(struct smr_cmd *cmd, enum fi_hmem_iface iface,
 	}
 
 	if (cmd->msg.hdr.op == ofi_op_read_req) {
-		*total_len = ofi_copy_from_hmem_iov(tx_buf->data, cmd->msg.hdr.size,
-						    iface, device, iov, iov_count, 0);
+		hmem_copy_ret = ofi_copy_from_hmem_iov(tx_buf->data,
+						       cmd->msg.hdr.size,
+						       iface, device, iov,
+						       iov_count, 0);
 	} else {
-		*total_len = ofi_copy_to_hmem_iov(iface, device, iov, iov_count, 0,
-						  tx_buf->data, cmd->msg.hdr.size);
+		hmem_copy_ret = ofi_copy_to_hmem_iov(iface, device, iov,
+						     iov_count, 0, tx_buf->data,
+						     cmd->msg.hdr.size);
 		smr_freestack_push(smr_inject_pool(ep->region), tx_buf);
 	}
 
-	if (*total_len != cmd->msg.hdr.size) {
+	if (hmem_copy_ret < 0) {
 		FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
-			"recv truncated");
-		return -FI_EIO;
+			"inject recv failed with code %d\n",
+			(int)(-hmem_copy_ret));
+		return hmem_copy_ret;
+	} else if (hmem_copy_ret != cmd->msg.hdr.size) {
+		FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
+			"inject recv truncated\n");
+		return -FI_ETRUNC;
 	}
 
+	*total_len = hmem_copy_ret;
+
 	return FI_SUCCESS;
 }
 
@@ -278,18 +321,21 @@ static int smr_progress_iov(struct smr_cmd *cmd, struct iovec *iov,
 out:
 	//Status must be set last (signals peer: op done, valid resp entry)
 	resp->status = ret;
+	smr_signal(peer_smr);
 
 	return -ret;
 }
 
 static int smr_mmap_peer_copy(struct smr_ep *ep, struct smr_cmd *cmd,
-				 struct iovec *iov, size_t iov_count,
-				 size_t *total_len)
+			      enum fi_hmem_iface iface, uint64_t device,
+			      struct iovec *iov, size_t iov_count,
+			      size_t *total_len)
 {
 	char shm_name[SMR_NAME_MAX];
 	void *mapped_ptr;
 	int fd, num;
 	int ret = 0;
+	ssize_t hmem_copy_ret;
 
 	num = smr_mmap_name(shm_name,
 			ep->region->map->peers[cmd->msg.hdr.id].peer.name,
@@ -314,26 +360,28 @@ static int smr_mmap_peer_copy(struct smr_ep *ep, struct smr_cmd *cmd,
 	}
 
 	if (cmd->msg.hdr.op == ofi_op_read_req) {
-		*total_len = ofi_total_iov_len(iov, iov_count);
-		if (ofi_copy_from_iov(mapped_ptr, *total_len, iov, iov_count, 0)
-		    != *total_len) {
-			FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
-				"mmap iov copy in error\n");
-			ret = -FI_EIO;
-			goto munmap;
-		}
+		hmem_copy_ret = ofi_copy_from_hmem_iov(mapped_ptr,
+						    cmd->msg.hdr.size, iface,
+						    device, iov, iov_count, 0);
 	} else {
-		*total_len = ofi_copy_to_iov(iov, iov_count, 0, mapped_ptr,
-				      cmd->msg.hdr.size);
-		if (*total_len != cmd->msg.hdr.size) {
-			FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
-				"mmap iov copy out error\n");
-			ret = -FI_EIO;
-			goto munmap;
-		}
+		hmem_copy_ret = ofi_copy_to_hmem_iov(iface, device, iov,
+						  iov_count, 0, mapped_ptr,
+						  cmd->msg.hdr.size);
+	}
+
+	if (hmem_copy_ret < 0) {
+		FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
+			"mmap copy iov failed with code %d\n",
+			(int)(-hmem_copy_ret));
+		ret = hmem_copy_ret;
+	} else if (hmem_copy_ret != cmd->msg.hdr.size) {
+		FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
+			"mmap copy iov truncated\n");
+		ret = -FI_ETRUNC;
 	}
 
-munmap:
+	*total_len = hmem_copy_ret;
+
 	munmap(mapped_ptr, cmd->msg.hdr.size);
 unlink_close:
 	shm_unlink(shm_name);
@@ -341,7 +389,8 @@ unlink_close:
 	return ret;
 }
 
-static int smr_progress_mmap(struct smr_cmd *cmd, struct iovec *iov,
+static int smr_progress_mmap(struct smr_cmd *cmd, enum fi_hmem_iface iface,
+			     uint64_t device, struct iovec *iov,
 			     size_t iov_count, size_t *total_len,
 			     struct smr_ep *ep)
 {
@@ -352,10 +401,12 @@ static int smr_progress_mmap(struct smr_cmd *cmd, struct iovec *iov,
 	peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id);
 	resp = smr_get_ptr(peer_smr, cmd->msg.hdr.src_data);
 
-	ret = smr_mmap_peer_copy(ep, cmd, iov, iov_count, total_len);
+	ret = smr_mmap_peer_copy(ep, cmd, iface, device,
+				 iov, iov_count, total_len);
 
 	//Status must be set last (signals peer: op done, valid resp entry)
 	resp->status = ret;
+	smr_signal(peer_smr);
 
 	return ret;
 }
@@ -380,16 +431,16 @@ static struct smr_sar_entry *smr_progress_sar(struct smr_cmd *cmd,
 	(void) ofi_truncate_iov(sar_iov, &iov_count, cmd->msg.hdr.size);
 
 	if (cmd->msg.hdr.op == ofi_op_read_req)
-		smr_try_progress_to_sar(sar_msg, resp, cmd, iface, device,
+		smr_try_progress_to_sar(peer_smr, sar_msg, resp, cmd, iface, device,
 					sar_iov, iov_count, total_len, &next);
 	else
-		smr_try_progress_from_sar(sar_msg, resp, cmd, iface, device,
+		smr_try_progress_from_sar(peer_smr, sar_msg, resp, cmd, iface, device,
 					  sar_iov, iov_count, total_len, &next);
 
 	if (*total_len == cmd->msg.hdr.size)
 		return NULL;
 
-	sar_entry = freestack_pop(ep->sar_fs);
+	sar_entry = ofi_freestack_pop(ep->sar_fs);
 
 	sar_entry->cmd = *cmd;
 	sar_entry->bytes_done = *total_len;
@@ -412,6 +463,76 @@ static struct smr_sar_entry *smr_progress_sar(struct smr_cmd *cmd,
 	return sar_entry;
 }
 
+static int smr_progress_ipc(struct smr_cmd *cmd, enum fi_hmem_iface iface,
+			    uint64_t device, struct iovec *iov,
+			    size_t iov_count, size_t *total_len,
+			    struct smr_ep *ep, int err)
+{
+	struct smr_region *peer_smr;
+	struct smr_resp *resp;
+	void *base, *ptr;
+	uint64_t ipc_device;
+	int64_t id;
+	int ret, fd, ipc_fd;
+	ssize_t hmem_copy_ret;
+
+	peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id);
+	resp = smr_get_ptr(peer_smr, cmd->msg.hdr.src_data);
+
+	//TODO disable IPC if more than 1 interface is initialized
+	assert(iface == cmd->msg.data.ipc_info.iface || iface == FI_HMEM_SYSTEM);
+
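+	/* ZE handles are backed by fds exchanged over the IPC socket;
+	 * other HMEM interfaces open the serialized IPC handle taken
+	 * directly from the command. */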
+	if (cmd->msg.data.ipc_info.iface == FI_HMEM_ZE) {
+		id = cmd->msg.hdr.id;
+		ipc_device = cmd->msg.data.ipc_info.device;
+		fd = ep->sock_info->peers[id].device_fds[ipc_device];
+		ret = ze_hmem_open_shared_handle(fd,
+				(void **) &cmd->msg.data.ipc_info.fd_handle,
+				&ipc_fd, ipc_device, &base);
+	} else {
+		ret = ofi_hmem_open_handle(cmd->msg.data.ipc_info.iface,
+				(void **) &cmd->msg.data.ipc_info.ipc_handle,
+				device, &base);
+	}
+	if (ret)
+		goto out;
+
+	ptr = base;
+	if (cmd->msg.data.ipc_info.iface == FI_HMEM_ZE)
+		ptr = (char *) ptr + (uintptr_t) cmd->msg.data.ipc_info.offset;
+
+	if (cmd->msg.hdr.op == ofi_op_read_req) {
+		hmem_copy_ret = ofi_copy_from_hmem_iov(ptr, cmd->msg.hdr.size,
+						       cmd->msg.data.ipc_info.iface,
+						       device, iov, iov_count, 0);
+	} else {
+		hmem_copy_ret = ofi_copy_to_hmem_iov(cmd->msg.data.ipc_info.iface,
+						     device, iov, iov_count, 0,
+						     ptr, cmd->msg.hdr.size);
+	}
+
+	if (cmd->msg.data.ipc_info.iface == FI_HMEM_ZE)
+		close(ipc_fd);
+
+	/* Truncation error takes precedence over close_handle error */
+	ret = ofi_hmem_close_handle(cmd->msg.data.ipc_info.iface, base);
+
+	if (hmem_copy_ret < 0) {
+		ret = hmem_copy_ret;
+	} else if (hmem_copy_ret != cmd->msg.hdr.size) {
+		ret = -FI_ETRUNC;
+	}
+
+	*total_len = hmem_copy_ret;
+
+out:
+	//Status must be set last (signals peer: op done, valid resp entry)
+	resp->status = ret;
+	smr_signal(peer_smr);
+
+	return -ret;
+}
+
 static bool smr_progress_multi_recv(struct smr_ep *ep,
 				    struct smr_rx_entry *entry, size_t len)
 {
@@ -479,9 +600,9 @@ static int smr_progress_inline_atomic(struct smr_cmd *cmd, struct fi_ioc *ioc,
 	if (*len != cmd->msg.hdr.size) {
 		FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
 			"recv truncated");
-		return -FI_EIO;
+		return -FI_ETRUNC;
 	}
-	return 0;
+	return FI_SUCCESS;
 }
 
 static int smr_progress_inject_atomic(struct smr_cmd *cmd, struct fi_ioc *ioc,
@@ -519,7 +640,7 @@ static int smr_progress_inject_atomic(struct smr_cmd *cmd, struct fi_ioc *ioc,
 	if (*len != cmd->msg.hdr.size) {
 		FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
 			"recv truncated");
-		err = -FI_EIO;
+		err = -FI_ETRUNC;
 	}
 
 out:
@@ -557,13 +678,19 @@ static int smr_progress_msg_common(struct smr_ep *ep, struct smr_cmd *cmd,
 					      &total_len, ep, 0);
 		break;
 	case smr_src_mmap:
-		entry->err = smr_progress_mmap(cmd, entry->iov, entry->iov_count,
+		entry->err = smr_progress_mmap(cmd, entry->iface, entry->device,
+					       entry->iov, entry->iov_count,
 					       &total_len, ep);
 		break;
 	case smr_src_sar:
 		sar = smr_progress_sar(cmd, entry, entry->iface, entry->device,
 				       entry->iov, entry->iov_count, &total_len, ep);
 		break;
+	case smr_src_ipc:
+		entry->err = smr_progress_ipc(cmd, entry->iface, entry->device,
+					      entry->iov, entry->iov_count,
+					      &total_len, ep, 0);
+		break;
 	default:
 		FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
 			"unidentified operation type\n");
@@ -594,7 +721,7 @@ static int smr_progress_msg_common(struct smr_ep *ep, struct smr_cmd *cmd,
 
 	if (free_entry) {
 		dlist_remove(&entry->entry);
-		freestack_push(ep->recv_fs, entry);
+		ofi_freestack_push(ep->recv_fs, entry);
 		return 1;
 	}
 	return 0;
@@ -619,8 +746,14 @@ static void smr_progress_connreq(struct smr_ep *ep, struct smr_cmd *cmd)
 
 	peer_smr = smr_peer_region(ep->region, idx);
 
+	if (peer_smr->pid != (int) cmd->msg.hdr.data) {
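+		/* A pid mismatch means the peer region name was reused by a
+		 * restarted process; remap the region before using it. */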
+		//TODO track and update/complete in error any transfers
+		//to or from old mapping
+		munmap(peer_smr, peer_smr->total_size);
+		smr_map_to_region(&smr_prov, &ep->region->map->peers[idx]);
+		peer_smr = smr_peer_region(ep->region, idx);
+	}
 	smr_peer_data(peer_smr)[cmd->msg.hdr.id].addr.id = idx;
-
 	smr_peer_data(ep->region)[idx].addr.id = cmd->msg.hdr.id;
 
 	smr_freestack_push(smr_inject_pool(ep->region), tx_buf);
@@ -652,9 +785,9 @@ static int smr_progress_cmd_msg(struct smr_ep *ep, struct smr_cmd *cmd)
 					     recv_queue->match_func,
 					     &match_attr);
 	if (!dlist_entry) {
-		if (freestack_isempty(ep->unexp_fs))
+		if (ofi_freestack_isempty(ep->unexp_fs))
 			return -FI_EAGAIN;
-		unexp = freestack_pop(ep->unexp_fs);
+		unexp = ofi_freestack_pop(ep->unexp_fs);
 		memcpy(&unexp->cmd, cmd, sizeof(*cmd));
 		ofi_cirque_discard(smr_cmd_queue(ep->region));
 		if (cmd->msg.hdr.op == ofi_op_msg) {
@@ -740,6 +873,7 @@ static int smr_progress_cmd_rma(struct smr_ep *ep, struct smr_cmd *cmd)
 			peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id);
 			resp = smr_get_ptr(peer_smr, cmd->msg.hdr.data);
 			resp->status = -err;
+			smr_signal(peer_smr);
 		} else {
 			ep->region->cmd_cnt++;
 		}
@@ -748,13 +882,18 @@ static int smr_progress_cmd_rma(struct smr_ep *ep, struct smr_cmd *cmd)
 		err = smr_progress_iov(cmd, iov, iov_count, &total_len, ep, ret);
 		break;
 	case smr_src_mmap:
-		err = smr_progress_mmap(cmd, iov, iov_count, &total_len, ep);
+		err = smr_progress_mmap(cmd, iface, device, iov,
+					iov_count, &total_len, ep);
 		break;
 	case smr_src_sar:
 		if (smr_progress_sar(cmd, NULL, iface, device, iov, iov_count,
 				     &total_len, ep))
 			return ret;
 		break;
+	case smr_src_ipc:
+		err = smr_progress_ipc(cmd, iface, device, iov, iov_count,
+				       &total_len, ep, ret);
+		break;
 	default:
 		FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
 			"unidentified operation type\n");
@@ -827,6 +966,7 @@ static int smr_progress_cmd_atomic(struct smr_ep *ep, struct smr_cmd *cmd)
 		peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id);
 		resp = smr_get_ptr(peer_smr, cmd->msg.hdr.data);
 		resp->status = -err;
+		smr_signal(peer_smr);
 	} else {
 		ep->region->cmd_cnt++;
 	}
@@ -885,6 +1025,7 @@ static void smr_progress_cmd(struct smr_ep *ep)
 			ret = -FI_EINVAL;
 		}
 		if (ret) {
+			smr_signal(ep->region);
 			if (ret != -FI_EAGAIN) {
 				FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
 					"error processing command\n");
@@ -914,12 +1055,12 @@ static void smr_progress_sar_list(struct smr_ep *ep)
 		peer_smr = smr_peer_region(ep->region, sar_entry->cmd.msg.hdr.id);
 		resp = smr_get_ptr(peer_smr, sar_entry->cmd.msg.hdr.src_data);
 		if (sar_entry->cmd.msg.hdr.op == ofi_op_read_req)
-			smr_try_progress_to_sar(sar_msg, resp, &sar_entry->cmd,
+			smr_try_progress_to_sar(peer_smr, sar_msg, resp, &sar_entry->cmd,
 					sar_entry->iface, sar_entry->device,
 					sar_entry->iov, sar_entry->iov_count,
 					&sar_entry->bytes_done, &sar_entry->next);
 		else
-			smr_try_progress_from_sar(sar_msg, resp, &sar_entry->cmd,
+			smr_try_progress_from_sar(peer_smr, sar_msg, resp, &sar_entry->cmd,
 					sar_entry->iface, sar_entry->device,
 					sar_entry->iov, sar_entry->iov_count,
 					&sar_entry->bytes_done, &sar_entry->next);
@@ -938,7 +1079,7 @@ static void smr_progress_sar_list(struct smr_ep *ep)
 					"unable to process rx completion\n");
 			}
 			dlist_remove(&sar_entry->entry);
-			freestack_push(ep->sar_fs, sar_entry);
+			ofi_freestack_push(ep->sar_fs, sar_entry);
 		}
 	}
 	fastlock_release(&ep->util_ep.rx_cq->cq_lock);
@@ -951,10 +1092,11 @@ void smr_ep_progress(struct util_ep *util_ep)
 
 	ep = container_of(util_ep, struct smr_ep, util_ep);
 
-	smr_progress_resp(ep);
-	smr_progress_cmd(ep);
-
-	smr_progress_sar_list(ep);
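+	/* Progress only when a peer has signaled this region; the CAS
+	 * consumes the signal so repeated calls do not rescan idle queues. */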
+	if (ofi_atomic_cas_bool32(&ep->region->signal, 1, 0)) {
+		smr_progress_resp(ep);
+		smr_progress_cmd(ep);
+		smr_progress_sar_list(ep);
+	}
 }
 
 int smr_progress_unexp_queue(struct smr_ep *ep, struct smr_rx_entry *entry,
@@ -980,7 +1122,7 @@ int smr_progress_unexp_queue(struct smr_ep *ep, struct smr_rx_entry *entry,
 	while (dlist_entry) {
 		unexp_msg = container_of(dlist_entry, struct smr_unexp_msg, entry);
 		ret = smr_progress_msg_common(ep, &unexp_msg->cmd, entry);
-		freestack_push(ep->unexp_fs, unexp_msg);
+		ofi_freestack_push(ep->unexp_fs, unexp_msg);
 		if (!multi_recv || ret)
 			break;
 
diff --git a/deps/libfabric/prov/shm/src/smr_rma.c b/deps/libfabric/prov/shm/src/smr_rma.c
index b41ad8887c21aef5dfaf0af15b1c2808c287c09a..da827a066223fd90d3ee01ec540570323b6acd9f 100644
--- a/deps/libfabric/prov/shm/src/smr_rma.c
+++ b/deps/libfabric/prov/shm/src/smr_rma.c
@@ -1,5 +1,6 @@
 /*
- * Copyright (c) 2013-2018 Intel Corporation. All rights reserved
+ * Copyright (c) 2013-2021 Intel Corporation. All rights reserved
+ * (C) Copyright 2021 Amazon.com, Inc. or its affiliates.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -92,7 +93,6 @@ ssize_t smr_generic_rma(struct smr_ep *ep, const struct iovec *iov,
 	struct smr_domain *domain;
 	struct smr_region *peer_smr;
 	struct smr_inject_buf *tx_buf;
-	struct smr_sar_msg *sar;
 	struct smr_resp *resp;
 	struct smr_cmd *cmd;
 	struct smr_tx_entry *pend;
@@ -103,6 +103,7 @@ ssize_t smr_generic_rma(struct smr_ep *ep, const struct iovec *iov,
 	uint16_t comp_flags;
 	ssize_t ret = 0;
 	size_t total_len;
+	bool use_ipc;
 
 	assert(iov_count <= SMR_IOV_LIMIT);
 	assert(rma_count <= SMR_IOV_LIMIT);
@@ -133,7 +134,7 @@ ssize_t smr_generic_rma(struct smr_ep *ep, const struct iovec *iov,
 		goto unlock_cq;
 	}
 
-	cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr));
+	cmd = ofi_cirque_next(smr_cmd_queue(peer_smr));
 
 	if (cmds == 1) {
 		err = smr_rma_fast(peer_smr, cmd, iov, iov_count, rma_iov,
@@ -147,12 +148,17 @@ ssize_t smr_generic_rma(struct smr_ep *ep, const struct iovec *iov,
 
 	total_len = ofi_total_iov_len(iov, iov_count);
 
+	/* Skip the inline/inject paths when IPC is available so that a
+	 * device-to-device transfer can be used instead. */
+	use_ipc = ofi_hmem_is_ipc_enabled(iface) && (iov_count == 1) &&
+		  desc && (smr_get_mr_flags(desc) & FI_HMEM_DEVICE_ONLY);
+
 	smr_generic_format(cmd, peer_id, op, 0, data, op_flags);
 	if (total_len <= SMR_MSG_DATA_LEN && op == ofi_op_write &&
-	    !(op_flags & FI_DELIVERY_COMPLETE)) {
+	    !(op_flags & FI_DELIVERY_COMPLETE) && !use_ipc) {
 		smr_format_inline(cmd, iface, device, iov, iov_count);
 	} else if (total_len <= SMR_INJECT_SIZE &&
-		   !(op_flags & FI_DELIVERY_COMPLETE)) {
+		   !(op_flags & FI_DELIVERY_COMPLETE) && !use_ipc) {
 		tx_buf = smr_freestack_pop(smr_inject_pool(peer_smr));
 		smr_format_inject(cmd, iface, device, iov, iov_count, peer_smr, tx_buf);
 		if (op == ofi_op_read_req) {
@@ -162,8 +168,8 @@ ssize_t smr_generic_rma(struct smr_ep *ep, const struct iovec *iov,
 				goto unlock_cq;
 			}
 			cmd->msg.hdr.op_flags |= SMR_RMA_REQ;
-			resp = ofi_cirque_tail(smr_resp_queue(ep->region));
-			pend = freestack_pop(ep->pend_fs);
+			resp = ofi_cirque_next(smr_resp_queue(ep->region));
+			pend = ofi_freestack_pop(ep->pend_fs);
 			smr_format_pend_resp(pend, cmd, context, iface, device, iov,
 					     iov_count, id, resp);
 			cmd->msg.hdr.data = smr_get_offset(ep->region, resp);
@@ -175,31 +181,40 @@ ssize_t smr_generic_rma(struct smr_ep *ep, const struct iovec *iov,
 			ret = -FI_EAGAIN;
 			goto unlock_cq;
 		}
-		resp = ofi_cirque_tail(smr_resp_queue(ep->region));
-		pend = freestack_pop(ep->pend_fs);
+		resp = ofi_cirque_next(smr_resp_queue(ep->region));
+		pend = ofi_freestack_pop(ep->pend_fs);
 		if (smr_cma_enabled(ep, peer_smr) && iface == FI_HMEM_SYSTEM) {
 			smr_format_iov(cmd, iov, iov_count, total_len, ep->region,
 				       resp);
 		} else {
-			if (total_len <= smr_env.sar_threshold ||
-			    iface != FI_HMEM_SYSTEM) {
-				if (!peer_smr->sar_cnt) {
-					ret = -FI_EAGAIN;
-				} else {
-					sar = smr_freestack_pop(smr_sar_pool(peer_smr));
-					smr_format_sar(cmd, iface, device, iov,
-						       iov_count, total_len,
-						       ep->region, peer_smr, sar,
-						       pend, resp);
-					peer_smr->sar_cnt--;
-					smr_peer_data(ep->region)[id].sar_status = 1;
+			if (use_ipc && iface == FI_HMEM_ZE &&
+			    smr_ze_ipc_enabled(ep->region, peer_smr)) {
+				ret = smr_format_ze_ipc(ep, id, cmd, iov,
+					device, total_len, ep->region,
+					resp, pend);
+			} else if (use_ipc && iface != FI_HMEM_ZE) {
+				ret = smr_format_ipc(cmd, iov[0].iov_base, total_len,
+						     ep->region, resp, iface);
+				if (ret) {
+					FI_WARN_ONCE(&smr_prov, FI_LOG_EP_CTRL,
+						     "unable to use IPC for RMA, falling back to SAR\n");
+					ret = smr_format_sar(cmd, iface, device, iov,
+							     iov_count, total_len,
+							     ep->region, peer_smr, id,
+							     pend, resp);
 				}
+			} else if (total_len <= smr_env.sar_threshold ||
+				   iface != FI_HMEM_SYSTEM) {
+				ret = smr_format_sar(cmd, iface, device, iov,
+						     iov_count, total_len,
+						     ep->region, peer_smr, id,
+						     pend, resp);
 			} else {
 				ret = smr_format_mmap(ep, cmd, iov, iov_count,
 						      total_len, pend, resp);
 			}
 			if (ret) {
-				freestack_push(ep->pend_fs, pend);
+				ofi_freestack_push(ep->pend_fs, pend);
 				ret = -FI_EAGAIN;
 				goto unlock_cq;
 			}
@@ -213,12 +228,13 @@ ssize_t smr_generic_rma(struct smr_ep *ep, const struct iovec *iov,
 	comp_flags = cmd->msg.hdr.op_flags;
 	ofi_cirque_commit(smr_cmd_queue(peer_smr));
 	peer_smr->cmd_cnt--;
-	cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr));
+	cmd = ofi_cirque_next(smr_cmd_queue(peer_smr));
 	smr_format_rma_iov(cmd, rma_iov, rma_count);
 
 commit_comp:
 	ofi_cirque_commit(smr_cmd_queue(peer_smr));
 	peer_smr->cmd_cnt--;
+	smr_signal(peer_smr);
 
 	if (!comp)
 		goto unlock_cq;
@@ -251,7 +267,7 @@ ssize_t smr_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc,
 	rma_iov.len = len;
 	rma_iov.key = key;
 
-	return smr_generic_rma(ep, &msg_iov, 1, &rma_iov, 1, &desc, 
+	return smr_generic_rma(ep, &msg_iov, 1, &rma_iov, 1, &desc,
 			       src_addr, context, ofi_op_read_req, 0,
 			       smr_ep_tx_flags(ep));
 }
@@ -303,7 +319,7 @@ ssize_t smr_write(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc
 	rma_iov.len = len;
 	rma_iov.key = key;
 
-	return smr_generic_rma(ep, &msg_iov, 1, &rma_iov, 1, &desc, 
+	return smr_generic_rma(ep, &msg_iov, 1, &rma_iov, 1, &desc,
 			       dest_addr, context, ofi_op_write, 0,
 			       smr_ep_tx_flags(ep));
 }
@@ -383,7 +399,7 @@ ssize_t smr_generic_rma_inject(struct fid_ep *ep_fid, const void *buf,
 	rma_iov.len = len;
 	rma_iov.key = key;
 
-	cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr));
+	cmd = ofi_cirque_next(smr_cmd_queue(peer_smr));
 
 	if (cmds == 1) {
 		ret = smr_rma_fast(peer_smr, cmd, &iov, 1, &rma_iov, 1, NULL,
@@ -404,12 +420,13 @@ ssize_t smr_generic_rma_inject(struct fid_ep *ep_fid, const void *buf,
 
 	ofi_cirque_commit(smr_cmd_queue(peer_smr));
 	peer_smr->cmd_cnt--;
-	cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr));
+	cmd = ofi_cirque_next(smr_cmd_queue(peer_smr));
 	smr_format_rma_iov(cmd, &rma_iov, 1);
 
 commit:
 	ofi_cirque_commit(smr_cmd_queue(peer_smr));
 	peer_smr->cmd_cnt--;
+	smr_signal(peer_smr);
 	ofi_ep_tx_cntr_inc_func(&ep->util_ep, ofi_op_write);
 unlock_region:
 	fastlock_release(&peer_smr->lock);
diff --git a/deps/libfabric/prov/shm/src/smr_signal.h b/deps/libfabric/prov/shm/src/smr_signal.h
index 35c46aee098e6ac8a4489ab2d02833e4000c0fae..8c2dde5773bb3551b13307a67ddd8a32bb33731a 100644
--- a/deps/libfabric/prov/shm/src/smr_signal.h
+++ b/deps/libfabric/prov/shm/src/smr_signal.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2019 Amazon.com, Inc. or its affiliates.
+ * Copyright (c) 2020-2021 Intel Corporation.
  * All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -41,12 +42,17 @@ struct sigaction *old_action;
 static void smr_handle_signal(int signum, siginfo_t *info, void *ucontext)
 {
 	struct smr_ep_name *ep_name;
+	struct smr_sock_name *sock_name;
 	int ret;
 
 	dlist_foreach_container(&ep_name_list, struct smr_ep_name,
 				ep_name, entry) {
 		shm_unlink(ep_name->name);
 	}
+	dlist_foreach_container(&sock_name_list, struct smr_sock_name,
+				sock_name, entry) {
+		unlink(sock_name->name);
+	}
 
 	/* Register the original signum handler, SIG_DFL or otherwise */
 	ret = sigaction(signum, &old_action[signum], NULL);
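
Editor's note: the handler change above extends the same cleanup idiom to Unix socket paths: on a fatal signal, walk the registered name lists, remove the files, then restore the original disposition. One standard shape of this idiom is sketched below, using only async-signal-safe calls (unlink, sigaction, raise); the fixed path table stands in for the provider's dlist-based name lists.

#include <signal.h>
#include <unistd.h>

#define MAX_PATHS 8
static const char *cleanup_paths[MAX_PATHS];   /* NULL-terminated set */
static struct sigaction prev_action;

static void cleanup_handler(int signum)
{
	for (int i = 0; i < MAX_PATHS && cleanup_paths[i]; i++)
		unlink(cleanup_paths[i]);          /* drop leftover files */

	/* Restore the previous disposition, then re-raise so the
	 * original handler (or default action) still runs. */
	sigaction(signum, &prev_action, NULL);
	raise(signum);
}
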
diff --git a/deps/libfabric/prov/sockets/include/sock.h b/deps/libfabric/prov/sockets/include/sock.h
index 06273c9572e8bc95c0b93ca37f116b87080718e7..57436abe65c02f4944e441e1e7c41ce9c236935a 100644
--- a/deps/libfabric/prov/sockets/include/sock.h
+++ b/deps/libfabric/prov/sockets/include/sock.h
@@ -197,8 +197,8 @@ struct sock_conn {
 struct sock_conn_map {
 	struct sock_conn *table;
 	ofi_epoll_t epoll_set;
-	void **epoll_ctxs;
-	int epoll_ctxs_sz;
+	struct ofi_epollfds_event *epoll_events;
+	int epoll_size;
 	int used;
 	int size;
 	fastlock_t lock;
@@ -467,7 +467,7 @@ struct sock_eq_entry {
 	size_t len;
 	uint64_t flags;
 	struct dlist_entry entry;
-	char event[0];
+	char event[];
 };
 
 struct sock_eq_err_data_entry {
@@ -876,7 +876,7 @@ struct sock_cq_overflow_entry_t {
 	size_t len;
 	fi_addr_t addr;
 	struct dlist_entry entry;
-	char cq_entry[0];
+	char cq_entry[];
 };
 
 struct sock_cq {
@@ -916,7 +916,7 @@ struct sock_conn_req {
 	struct sock_conn_hdr hdr;
 	union ofi_sock_ip src_addr;
 	uint64_t caps;
-	char cm_data[0];
+	char cm_data[];
 };
 
 enum {
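
Editor's note: the `char x[0]` members replaced above are a GNU zero-length-array extension; `char x[]` is the standard C99 flexible array member with the same layout, so the usual single-allocation pattern is unchanged. An illustrative sketch (struct entry and entry_create are not provider names):

#include <stdlib.h>
#include <string.h>

struct entry {
	size_t len;
	char data[];       /* C99 flexible array member */
};

static struct entry *entry_create(const void *buf, size_t len)
{
	/* One allocation covers the fixed header plus the payload. */
	struct entry *e = malloc(sizeof(*e) + len);

	if (!e)
		return NULL;
	e->len = len;
	memcpy(e->data, buf, len);
	return e;
}
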
diff --git a/deps/libfabric/prov/sockets/src/sock_atomic.c b/deps/libfabric/prov/sockets/src/sock_atomic.c
index 4d93c42ec98f82453027dc37222b720ab392fccb..e4cca45d21390e18c5b6eb1f782c416ef2e77a2a 100644
--- a/deps/libfabric/prov/sockets/src/sock_atomic.c
+++ b/deps/libfabric/prov/sockets/src/sock_atomic.c
@@ -133,7 +133,7 @@ ssize_t sock_ep_tx_atomic(struct fid_ep *ep,
 
 		total_len = src_len + cmp_len;
 	} else {
-		total_len = msg->iov_count * sizeof(union sock_iov);
+		total_len = (msg->iov_count + compare_count) * sizeof(union sock_iov);
 	}
 
 	total_len += (sizeof(struct sock_op_send) +
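
Editor's note: the one-line fix above matters for compare-type atomics (e.g. compare-and-swap), whose messages carry descriptors for both the operand iovs and the compare iovs. For illustration, with msg->iov_count = 2 and compare_count = 2, the old expression reserved space for only 2 descriptors while 4 are actually sent; the corrected (msg->iov_count + compare_count) * sizeof(union sock_iov) sizes the send buffer for both sets.
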
diff --git a/deps/libfabric/prov/sockets/src/sock_av.c b/deps/libfabric/prov/sockets/src/sock_av.c
index 05d3972a1513f5682b22cc8801e625c5c64fa599..e8ed5065bbcdac35ca31a08d68771be23f87324b 100644
--- a/deps/libfabric/prov/sockets/src/sock_av.c
+++ b/deps/libfabric/prov/sockets/src/sock_av.c
@@ -127,19 +127,22 @@ static inline void sock_av_report_success(struct sock_av *av, void *context,
 			     &eq_entry, sizeof(eq_entry), flags);
 }
 
-static inline void sock_av_report_error(struct sock_av *av,
-					void *context, int index, int err)
+static void sock_av_report_error(struct sock_av *av, fi_addr_t *fi_addr,
+				 void *context, int index, int err,
+				 uint64_t flags)
 {
-	if (!av->eq)
-		return;
+	int *sync_err;
 
-	sock_eq_report_error(av->eq, &av->av_fid.fid,
-			     context, index, err, -err, NULL, 0);
-}
+	if (fi_addr) {
+		fi_addr[index] = FI_ADDR_NOTAVAIL;
+	} else if (flags & FI_SYNC_ERR) {
+		sync_err = context;
+		sync_err[index] = err;
+	}
 
-static int sock_av_is_valid_address(const struct sockaddr *addr)
-{
-	return ofi_sizeofaddr(addr);
+	if (av->eq)
+		sock_eq_report_error(av->eq, &av->av_fid.fid,
+				     context, index, err, -err, NULL, 0);
 }
 
 static void sock_update_av_table(struct sock_av *_av, size_t count)
@@ -191,7 +194,7 @@ static int sock_av_get_next_index(struct sock_av *av)
 	return -1;
 }
 
-static int sock_check_table_in(struct sock_av *_av, const struct sockaddr *addr,
+static int sock_check_table_in(struct sock_av *_av, const void *addr,
 			       fi_addr_t *fi_addr, int count, uint64_t flags,
 			       void *context)
 {
@@ -204,21 +207,27 @@ static int sock_check_table_in(struct sock_av *_av, const struct sockaddr *addr,
 	if ((_av->attr.flags & FI_EVENT) && !_av->eq)
 		return -FI_ENOEQ;
 
+	if (flags & FI_SYNC_ERR) {
+		if (fi_addr || !context || _av->eq)
+			return -FI_EBADFLAGS;
+		memset(context, 0, sizeof(int) * count);
+	}
+
 	if (_av->attr.flags & FI_READ) {
 		for (i = 0; i < count; i++) {
+			struct sockaddr *sock_addr = (struct sockaddr *) ((char *)addr + i * _av->addrlen);
 			for (j = 0; j < _av->table_hdr->size; j++) {
 				if (_av->table[j].valid &&
-				     !sock_av_is_valid_address(&addr[i])) {
-					if (fi_addr)
-						fi_addr[i] = FI_ADDR_NOTAVAIL;
-					sock_av_report_error(_av, context, i,
-								FI_EINVAL);
+				    !ofi_valid_dest_ipaddr(sock_addr)) {
+					sock_av_report_error(_av, fi_addr,
+							context, i, FI_EINVAL,
+							flags);
 					continue;
 				}
 
 				av_addr = &_av->table[j];
-				if (memcmp(&av_addr->addr, &addr[i],
-					   ofi_sizeofaddr(&addr[i])) == 0) {
+				if (memcmp(&av_addr->addr, sock_addr,
+					   ofi_sizeofaddr(sock_addr)) == 0) {
 					SOCK_LOG_DBG("Found addr in shared av\n");
 					if (fi_addr)
 						fi_addr[i] = (fi_addr_t)j;
@@ -231,19 +240,19 @@ static int sock_check_table_in(struct sock_av *_av, const struct sockaddr *addr,
 	}
 
 	for (i = 0, ret = 0; i < count; i++) {
-		if (!sock_av_is_valid_address(&addr[i])) {
-			if (fi_addr)
-				fi_addr[i] = FI_ADDR_NOTAVAIL;
-			sock_av_report_error(_av, context, i, FI_EINVAL);
+		struct sockaddr *sock_addr = (struct sockaddr *) ((char *)addr + i * _av->addrlen);
+		if (!ofi_valid_dest_ipaddr(sock_addr)) {
+			sock_av_report_error(_av, fi_addr, context, i, FI_EINVAL,
+					     flags);
 			continue;
 		}
 		if (_av->table_hdr->stored == _av->table_hdr->size) {
 			index = sock_av_get_next_index(_av);
 			if (index < 0) {
 				if (sock_resize_av_table(_av)) {
-					if (fi_addr)
-						fi_addr[i] = FI_ADDR_NOTAVAIL;
-					sock_av_report_error(_av, context, i, FI_ENOMEM);
+					sock_av_report_error(_av, fi_addr,
+							     context, i,
+							     FI_ENOMEM, flags);
 					continue;
 				}
 				index = _av->table_hdr->stored++;
@@ -253,13 +262,13 @@ static int sock_check_table_in(struct sock_av *_av, const struct sockaddr *addr,
 		}
 
 		av_addr = &_av->table[index];
-		inet_ntop(addr[i].sa_family, ofi_get_ipaddr(&addr[i]),
+		inet_ntop(sock_addr->sa_family, ofi_get_ipaddr(sock_addr),
 			  sa_ip, sizeof sa_ip);
 		SOCK_LOG_DBG("AV-INSERT: dst_addr family: %d, IP %s, port: %d\n",
-			      (&addr[i])->sa_family, sa_ip,
-			      ofi_addr_get_port(&addr[i]));
+			      sock_addr->sa_family, sa_ip,
+			      ofi_addr_get_port(sock_addr));
 
-		memcpy(&av_addr->addr, &addr[i], ofi_sizeofaddr(&addr[i]));
+		memcpy(&av_addr->addr, sock_addr, ofi_sizeofaddr(sock_addr));
 		if (fi_addr)
 			fi_addr[i] = (fi_addr_t)index;
 
@@ -279,8 +288,7 @@ static int sock_av_insert(struct fid_av *av, const void *addr, size_t count,
 	_av = container_of(av, struct sock_av, av_fid);
 
 	fastlock_acquire(&_av->table_lock);
-	ret = sock_check_table_in(_av, (const struct sockaddr *) addr,
-				   fi_addr, count, flags, context);
+	ret = sock_check_table_in(_av, addr, fi_addr, count, flags, context);
 	fastlock_release(&_av->table_lock);
 	return ret;
 }
@@ -327,7 +335,8 @@ static int _sock_av_insertsvc(struct fid_av *av, const char *node,
 	ret = getaddrinfo(node, service, &sock_hints, &result);
 	if (ret) {
 		if (_av->eq) {
-			sock_av_report_error(_av, context, 0, FI_EINVAL);
+			sock_av_report_error(_av, fi_addr, context, 0,
+					     FI_EINVAL, flags);
 			sock_av_report_success(_av, context, 0, flags);
 		}
 		return -ret;
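
Editor's note: a usage sketch for the FI_SYNC_ERR path added above. Per the new checks in sock_check_table_in(), the flag requires a NULL fi_addr, no bound EQ, and a context pointing at an array of count ints, which receives one status per address (0 on success). Here av and addrs are assumed to come from the surrounding program.

#include <stdio.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_errno.h>

static int insert_with_sync_err(struct fid_av *av, const void *addrs,
				size_t count)
{
	int sync_err[8] = { 0 };   /* one status slot per address */
	int ret;

	if (count > 8)
		return -FI_ETOOSMALL;

	/* fi_addr must be NULL; per-address errors land in sync_err. */
	ret = fi_av_insert(av, addrs, count, NULL, FI_SYNC_ERR, sync_err);
	for (size_t i = 0; i < count; i++) {
		if (sync_err[i])
			fprintf(stderr, "addr %zu: error %d\n",
				i, sync_err[i]);
	}
	return ret;   /* number of successful insertions, or < 0 */
}
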
diff --git a/deps/libfabric/prov/sockets/src/sock_conn.c b/deps/libfabric/prov/sockets/src/sock_conn.c
index 0d39956a82557d329497c531e633256b434cc11c..fd7168900dfda12d217f3aae976db900dfd93b35 100644
--- a/deps/libfabric/prov/sockets/src/sock_conn.c
+++ b/deps/libfabric/prov/sockets/src/sock_conn.c
@@ -97,12 +97,13 @@ int sock_conn_map_init(struct sock_ep *ep, int init_size)
 {
 	struct sock_conn_map *map = &ep->attr->cmap;
 	int ret;
+
 	map->table = calloc(init_size, sizeof(*map->table));
 	if (!map->table)
 		return -FI_ENOMEM;
 
-	map->epoll_ctxs = calloc(init_size, sizeof(*map->epoll_ctxs));
-	if (!map->epoll_ctxs)
+	map->epoll_events = calloc(init_size, sizeof(*map->epoll_events));
+	if (!map->epoll_events)
 		goto err1;
 
 	ret = ofi_epoll_create(&map->epoll_set);
@@ -116,10 +117,11 @@ int sock_conn_map_init(struct sock_ep *ep, int init_size)
 	fastlock_init(&map->lock);
 	map->used = 0;
 	map->size = init_size;
+	map->epoll_size = init_size;
 	return 0;
 
 err2:
-	free(map->epoll_ctxs);
+	free(map->epoll_events);
 err1:
 	free(map->table);
 	return -FI_ENOMEM;
@@ -153,9 +155,9 @@ void sock_conn_map_destroy(struct sock_ep_attr *ep_attr)
 	}
 	free(cmap->table);
 	cmap->table = NULL;
-	free(cmap->epoll_ctxs);
-	cmap->epoll_ctxs = NULL;
-	cmap->epoll_ctxs_sz = 0;
+	free(cmap->epoll_events);
+	cmap->epoll_events = NULL;
+	cmap->epoll_size = 0;
 	cmap->used = cmap->size = 0;
 	ofi_epoll_close(cmap->epoll_set);
 	fastlock_destroy(&cmap->lock);
@@ -317,14 +319,14 @@ static void *sock_conn_listener_thread(void *arg)
 {
 	struct sock_conn_listener *conn_listener = arg;
 	struct sock_conn_handle *conn_handle;
-	void *ep_contexts[SOCK_EPOLL_WAIT_EVENTS];
+	struct ofi_epollfds_event events[SOCK_EPOLL_WAIT_EVENTS];
 	struct sock_ep_attr *ep_attr;
 	int num_fds, i, conn_fd;
 	union ofi_sock_ip remote;
 	socklen_t addr_size;
 
 	while (conn_listener->do_listen) {
-		num_fds = ofi_epoll_wait(conn_listener->epollfd, ep_contexts,
+		num_fds = ofi_epoll_wait(conn_listener->epollfd, events,
 		                        SOCK_EPOLL_WAIT_EVENTS, -1);
 		if (num_fds < 0) {
 			SOCK_LOG_ERROR("poll failed : %s\n", strerror(errno));
@@ -342,7 +344,7 @@ static void *sock_conn_listener_thread(void *arg)
 		}
 
 		for (i = 0; i < num_fds; i++) {
-			conn_handle = ep_contexts[i];
+			conn_handle = events[i].data.ptr;
 
 			if (conn_handle == NULL) { /* signal event */
 				fd_signal_reset(&conn_listener->signal);
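
Editor's note: the sockets provider changes above swap arrays of bare context pointers for struct ofi_epollfds_event, which mirrors the kernel's struct epoll_event: the poll loop now pulls its per-connection context out of data.ptr and also has the ready-event mask available. The equivalent pattern against the raw Linux API looks like this (struct conn and handle_conn are placeholders):

#include <stdio.h>
#include <stdint.h>
#include <sys/epoll.h>

struct conn { int fd; };

static void handle_conn(struct conn *c, uint32_t ev)
{
	printf("fd %d ready (events 0x%x)\n", c->fd, (unsigned) ev);
}

static void poll_once(int epfd)
{
	struct epoll_event events[16];
	int i, n;

	n = epoll_wait(epfd, events, 16, 0);
	for (i = 0; i < n; i++) {
		struct conn *c = events[i].data.ptr;  /* registered context */

		if (!c)
			continue;            /* NULL marks the wake-up fd */
		handle_conn(c, events[i].events);
	}
}
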
diff --git a/deps/libfabric/prov/sockets/src/sock_ep.c b/deps/libfabric/prov/sockets/src/sock_ep.c
index 9f5145d8423ba2b0e17f7d425f688478ee5cbf68..8257fdeb71421629be183f74356a6675aacf875f 100644
--- a/deps/libfabric/prov/sockets/src/sock_ep.c
+++ b/deps/libfabric/prov/sockets/src/sock_ep.c
@@ -726,7 +726,7 @@ static int sock_ep_close(struct fid *fid)
 		free(sock_ep->attr->dest_addr);
 
 	fastlock_acquire(&sock_ep->attr->domain->pe->lock);
-	ofi_idm_reset(&sock_ep->attr->av_idm);
+	ofi_idm_reset(&sock_ep->attr->av_idm, NULL);
 	sock_conn_map_destroy(sock_ep->attr);
 	fastlock_release(&sock_ep->attr->domain->pe->lock);
 
diff --git a/deps/libfabric/prov/sockets/src/sock_ep_msg.c b/deps/libfabric/prov/sockets/src/sock_ep_msg.c
index 50498685c43aa723959113f65d43a1d946ca5021..bd158cbb634218839092f4c77bb425b10b656ca5 100644
--- a/deps/libfabric/prov/sockets/src/sock_ep_msg.c
+++ b/deps/libfabric/prov/sockets/src/sock_ep_msg.c
@@ -382,10 +382,15 @@ static void sock_ep_cm_connect_handler(struct sock_ep_cm_head *cm_head,
 	struct fi_eq_cm_entry *cm_entry = NULL;
 	int cm_data_sz, response_port;
 
-	assert(hdr->type == SOCK_CONN_ACCEPT
-	       || hdr->type == SOCK_CONN_REJECT);
+	assert(hdr->type == SOCK_CONN_ACCEPT ||
+	       hdr->type == SOCK_CONN_REJECT);
 
 	cm_data_sz = ntohs(hdr->cm_data_sz);
+	if (cm_data_sz > SOCK_EP_MAX_CM_DATA_SZ) {
+		SOCK_LOG_ERROR("CM data size too large\n");
+		goto err;
+	}
+
 	response_port = ntohs(hdr->port);
 	if (cm_data_sz) {
 		param = calloc(1, cm_data_sz);
@@ -846,6 +851,11 @@ static void sock_pep_req_handler(struct sock_ep_cm_head *cm_head,
 	}
 
 	req_cm_data_sz = ntohs(conn_req->hdr.cm_data_sz);
+	if (req_cm_data_sz > SOCK_EP_MAX_CM_DATA_SZ) {
+		SOCK_LOG_ERROR("CM data size is too large\n");
+		goto err;
+	}
+
 	if (req_cm_data_sz) {
 		ret = sock_cm_recv(handle->sock_fd, conn_req->cm_data,
 				   req_cm_data_sz);
@@ -1161,13 +1171,13 @@ static void *sock_ep_cm_thread(void *arg)
 {
 	int num_fds, i;
 	struct sock_ep_cm_head *cm_head = arg;
-	void *ep_contexts[SOCK_EPOLL_WAIT_EVENTS];
+	struct ofi_epollfds_event events[SOCK_EPOLL_WAIT_EVENTS];
 	struct sock_conn_req_handle *handle;
 
 	while (cm_head->do_listen) {
 		sock_ep_cm_check_closing_rejected_list(cm_head);
 
-		num_fds = ofi_epoll_wait(cm_head->epollfd, ep_contexts,
+		num_fds = ofi_epoll_wait(cm_head->epollfd, events,
 		                        SOCK_EPOLL_WAIT_EVENTS, -1);
 		if (num_fds < 0) {
 			SOCK_LOG_ERROR("poll failed : %s\n", strerror(errno));
@@ -1185,7 +1195,7 @@ static void *sock_ep_cm_thread(void *arg)
 			goto skip;
 		}
 		for (i = 0; i < num_fds; i++) {
-			handle = ep_contexts[i];
+			handle = events[i].data.ptr;
 
 			if (handle == NULL) { /* Signal event */
 				fd_signal_reset(&cm_head->signal);
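
Editor's note: both new checks above follow the same rule: never trust a length read off the wire. Validate it against the protocol maximum before allocating or receiving that many bytes. A generic sketch of the check, where MAX_CM_DATA stands in for SOCK_EP_MAX_CM_DATA_SZ:

#include <arpa/inet.h>
#include <stddef.h>
#include <stdint.h>

#define MAX_CM_DATA 256    /* stand-in for SOCK_EP_MAX_CM_DATA_SZ */

/* Convert and bounds-check an on-the-wire length field; returns 0 and
 * stores the host-order size, or -1 for a malformed or hostile peer. */
static int check_cm_data_sz(uint16_t wire_sz_be, size_t *out_sz)
{
	size_t sz = ntohs(wire_sz_be);

	if (sz > MAX_CM_DATA)
		return -1;
	*out_sz = sz;
	return 0;
}
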
diff --git a/deps/libfabric/prov/sockets/src/sock_progress.c b/deps/libfabric/prov/sockets/src/sock_progress.c
index b8f21962fbbf339ca8babd65d64f0e2b2c8dd23a..0d9489704e5c17c8e5a0ea09a4f263e94b40fcf2 100644
--- a/deps/libfabric/prov/sockets/src/sock_progress.c
+++ b/deps/libfabric/prov/sockets/src/sock_progress.c
@@ -1169,6 +1169,8 @@ ssize_t sock_rx_claim_recv(struct sock_rx_ctx *rx_ctx, void *context,
 
 		dlist_remove(&rx_buffered->entry);
 		sock_rx_release_entry(rx_buffered);
+		if (rx_ctx->progress_start == entry)
+			rx_ctx->progress_start = &rx_ctx->rx_buffered_list;
 	} else {
 		ret = -FI_ENOMSG;
 	}
@@ -2374,20 +2376,20 @@ static int sock_pe_progress_rx_ep(struct sock_pe *pe,
 	if (!map->used)
 		return 0;
 
-	if (map->epoll_ctxs_sz < map->used) {
+	if (map->epoll_size < map->used) {
 		uint64_t new_size = map->used * 2;
-		void *ctxs;
+		struct ofi_epollfds_event *events;
 
-		ctxs = realloc(map->epoll_ctxs,
-			       sizeof(*map->epoll_ctxs) * new_size);
-		if (ctxs) {
-			map->epoll_ctxs = ctxs;
-			map->epoll_ctxs_sz = new_size;
+		events = realloc(map->epoll_events,
+				 sizeof(*map->epoll_events) * new_size);
+		if (events) {
+			map->epoll_events = events;
+			map->epoll_size = new_size;
 		}
 	}
 
-	num_fds = ofi_epoll_wait(map->epoll_set, map->epoll_ctxs,
-	                        MIN(map->used, map->epoll_ctxs_sz), 0);
+	num_fds = ofi_epoll_wait(map->epoll_set, map->epoll_events,
+	                        MIN(map->used, map->epoll_size), 0);
 	if (num_fds < 0 || num_fds == 0) {
 		if (num_fds < 0)
 			SOCK_LOG_ERROR("epoll failed: %d\n", num_fds);
@@ -2396,7 +2398,7 @@ static int sock_pe_progress_rx_ep(struct sock_pe *pe,
 
 	fastlock_acquire(&map->lock);
 	for (i = 0; i < num_fds; i++) {
-		conn = map->epoll_ctxs[i];
+		conn = map->epoll_events[i].data.ptr;
 		if (!conn)
 			SOCK_LOG_ERROR("ofi_idm_lookup failed\n");
 
@@ -2595,9 +2597,9 @@ static void sock_pe_wait(struct sock_pe *pe)
 {
 	char tmp;
 	int ret;
-	void *ep_contexts[1];
+	struct ofi_epollfds_event event;
 
-	ret = ofi_epoll_wait(pe->epoll_set, ep_contexts, 1, -1);
+	ret = ofi_epoll_wait(pe->epoll_set, &event, 1, -1);
 	if (ret < 0)
 		SOCK_LOG_ERROR("poll failed : %s\n", strerror(ofi_sockerr()));
 
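
Editor's note: the progress-path hunk above grows the event array on demand and deliberately tolerates a failed realloc: the old, smaller array stays valid, and the MIN() cap in the ofi_epoll_wait call keeps the wait within whatever capacity actually exists. The same pattern in isolation (struct event is a stand-in type):

#include <stdlib.h>

struct event { void *ptr; unsigned int mask; };   /* stand-in type */

static void maybe_grow(struct event **arr, int *cap, int used)
{
	struct event *tmp;
	int new_cap;

	if (*cap >= used)
		return;

	new_cap = used * 2;
	tmp = realloc(*arr, sizeof(**arr) * new_cap);
	if (tmp) {        /* on failure, keep the old (smaller) array */
		*arr = tmp;
		*cap = new_cap;
	}
}
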
diff --git a/deps/libfabric/prov/tcp/Makefile.include b/deps/libfabric/prov/tcp/Makefile.include
index 94d92415da723595060c2015726bbc9638380663..4fb43ca597680aac84fccfb1fdd86920c4f19959 100644
--- a/deps/libfabric/prov/tcp/Makefile.include
+++ b/deps/libfabric/prov/tcp/Makefile.include
@@ -12,7 +12,6 @@ _tcp_files = \
 	prov/tcp/src/tcpx_eq.c		\
 	prov/tcp/src/tcpx_init.c	\
 	prov/tcp/src/tcpx_progress.c	\
-	prov/tcp/src/tcpx_comm.c	\
 	prov/tcp/src/tcpx.h
 
 if HAVE_TCP_DL
diff --git a/deps/libfabric/prov/tcp/src/tcpx.h b/deps/libfabric/prov/tcp/src/tcpx.h
index 7de2a63be5647781ce3eeb8d08ebb64d65a99a75..7b2615f93f70d860f85da618c0eccd6a185f5dca 100644
--- a/deps/libfabric/prov/tcp/src/tcpx.h
+++ b/deps/libfabric/prov/tcp/src/tcpx.h
@@ -58,65 +58,137 @@
 #include <ofi_signal.h>
 #include <ofi_util.h>
 #include <ofi_proto.h>
+#include <ofi_net.h>
 
 #ifndef _TCP_H_
 #define _TCP_H_
 
-#define TCPX_HDR_VERSION	3
-#define TCPX_CTRL_HDR_VERSION	3
-
-#define TCPX_MAX_CM_DATA_SIZE	(1 << 8)
-#define TCPX_IOV_LIMIT		(4)
-#define TCPX_MAX_INJECT_SZ	(64)
 
+#define TCPX_MAX_INJECT		128
 #define MAX_POLL_EVENTS		100
-
 #define TCPX_MIN_MULTI_RECV	16384
-
 #define TCPX_PORT_MAX_RANGE	(USHRT_MAX)
 
 extern struct fi_provider	tcpx_prov;
 extern struct util_prov		tcpx_util_prov;
 extern struct fi_info		tcpx_info;
 extern struct tcpx_port_range	port_range;
+extern int tcpx_nodelay;
+extern int tcpx_staging_sbuf_size;
+extern int tcpx_prefetch_rbuf_size;
+extern size_t tcpx_default_tx_size;
+extern size_t tcpx_default_rx_size;
+extern size_t tcpx_zerocopy_size;
+
 struct tcpx_xfer_entry;
 struct tcpx_ep;
 
-enum tcpx_xfer_op_codes {
-	TCPX_OP_MSG_SEND,
-	TCPX_OP_MSG_RECV,
-	TCPX_OP_MSG_RESP,
-	TCPX_OP_WRITE,
-	TCPX_OP_REMOTE_WRITE,
-	TCPX_OP_READ_REQ,
-	TCPX_OP_READ_RSP,
-	TCPX_OP_REMOTE_READ,
-	TCPX_OP_CODE_MAX,
+
+/*
+ * Wire protocol structures and definitions
+ */
+
+#define TCPX_CTRL_HDR_VERSION	3
+
+enum {
+	TCPX_MAX_CM_DATA_SIZE = (1 << 8)
+};
+
+struct tcpx_cm_msg {
+	struct ofi_ctrl_hdr hdr;
+	char data[TCPX_MAX_CM_DATA_SIZE];
+};
+
+#define TCPX_HDR_VERSION	3
+
+enum {
+	TCPX_IOV_LIMIT = 4
+};
+
+/* base_hdr::op_data */
+enum {
+	/* backward compatible value */
+	TCPX_OP_ACK = 2, /* indicates an ack message - should be a flag */
+};
+
+/* Flags */
+#define TCPX_REMOTE_CQ_DATA	(1 << 0)
+/* not used TCPX_TRANSMIT_COMPLETE	(1 << 1) */
+#define TCPX_DELIVERY_COMPLETE	(1 << 2)
+#define TCPX_COMMIT_COMPLETE	(1 << 3)
+#define TCPX_TAGGED		(1 << 7)
+
+struct tcpx_base_hdr {
+	uint8_t			version;
+	uint8_t			op;
+	uint16_t		flags;
+	uint8_t			op_data;
+	uint8_t			rma_iov_cnt;
+	uint8_t			hdr_size;
+	union {
+		uint8_t		rsvd;
+		uint8_t		id; /* debug */
+	};
+	uint64_t		size;
+};
+
+struct tcpx_tag_hdr {
+	struct tcpx_base_hdr	base_hdr;
+	uint64_t		tag;
+};
+
+struct tcpx_cq_data_hdr {
+	struct tcpx_base_hdr 	base_hdr;
+	uint64_t		cq_data;
+};
+
+struct tcpx_tag_data_hdr {
+	struct tcpx_cq_data_hdr	cq_data_hdr;
+	uint64_t		tag;
+};
+
+/* Maximum header is scatter RMA with CQ data */
+#define TCPX_MAX_HDR (sizeof(struct tcpx_cq_data_hdr) + \
+		     sizeof(struct ofi_rma_iov) * TCPX_IOV_LIMIT)
+
+/*
+ * End wire protocol definitions
+ */
+
+
+enum tcpx_cm_state {
+	TCPX_CM_LISTENING,
+	TCPX_CM_CONNECTING,
+	TCPX_CM_WAIT_REQ,
+	TCPX_CM_REQ_SENT,
+	TCPX_CM_REQ_RVCD,
+	TCPX_CM_RESP_READY,
+	/* CM context is freed once connected */
 };
 
-enum tcpx_cm_event_type {
-	SERVER_SOCK_ACCEPT,
-	CLIENT_SEND_CONNREQ,
-	SERVER_RECV_CONNREQ,
-	SERVER_SEND_CM_ACCEPT,
-	CLIENT_RECV_CONNRESP,
-	CLIENT_SERVER_ERROR,
+#define OFI_PROV_SPECIFIC_TCP (0x7cb << 16)
+enum {
+	TCPX_CLASS_CM = OFI_PROV_SPECIFIC_TCP,
 };
 
 struct tcpx_cm_context {
-	fid_t			fid;
-	enum tcpx_cm_event_type	type;
+	struct fid		fid;
+	struct fid		*hfid;
+	enum tcpx_cm_state	state;
 	size_t			cm_data_sz;
-	char			cm_data[TCPX_MAX_CM_DATA_SIZE];
+	struct tcpx_cm_msg	msg;
 };
 
+struct tcpx_cm_context *tcpx_alloc_cm_ctx(fid_t fid, enum tcpx_cm_state state);
+void tcpx_free_cm_ctx(struct tcpx_cm_context *cm_ctx);
+
 struct tcpx_port_range {
 	int high;
 	int low;
 };
 
 struct tcpx_conn_handle {
-	struct fid		handle;
+	struct fid		fid;
 	struct tcpx_pep		*pep;
 	SOCKET			sock;
 	bool			endian_match;
@@ -138,75 +210,73 @@ enum tcpx_state {
 	TCPX_DISCONNECTED,
 };
 
-struct tcpx_base_hdr {
-	uint8_t			version;
-	uint8_t			op;
-	uint16_t		flags;
-	uint8_t			op_data;
-	uint8_t			rma_iov_cnt;
-	uint8_t			payload_off;
-	uint8_t			rsvd;
-	uint64_t		size;
-};
-
-struct tcpx_cq_data_hdr {
-	struct tcpx_base_hdr 	base_hdr;
-	uint64_t		cq_data;
-};
-
-#define TCPX_MAX_HDR_SZ (sizeof(struct tcpx_base_hdr) + 	\
-			 sizeof(uint64_t) +			\
-			 sizeof(struct ofi_rma_iov) *		\
-			 TCPX_IOV_LIMIT +			\
-			 TCPX_MAX_INJECT_SZ)
-
-struct tcpx_cur_rx_msg {
+struct tcpx_cur_rx {
 	union {
 		struct tcpx_base_hdr	base_hdr;
-		uint8_t		       	max_hdr[TCPX_MAX_HDR_SZ];
+		struct tcpx_cq_data_hdr cq_data_hdr;
+		struct tcpx_tag_data_hdr tag_data_hdr;
+		struct tcpx_tag_hdr	tag_hdr;
+		uint8_t			max_hdr[TCPX_MAX_HDR];
 	} hdr;
 	size_t			hdr_len;
-	size_t			done_len;
+	size_t			hdr_done;
+	size_t			data_left;
+	struct tcpx_xfer_entry	*entry;
+	int			(*handler)(struct tcpx_ep *ep);
+};
+
+struct tcpx_cur_tx {
+	size_t			data_left;
+	struct tcpx_xfer_entry	*entry;
 };
 
 struct tcpx_rx_ctx {
 	struct fid_ep		rx_fid;
 	struct slist		rx_queue;
+	struct slist		tag_queue;
+	struct tcpx_xfer_entry	*(*match_tag_rx)(struct tcpx_rx_ctx *srx,
+						 struct tcpx_ep *ep,
+						 uint64_t tag);
+
 	struct ofi_bufpool	*buf_pool;
 	uint64_t		op_flags;
 	fastlock_t		lock;
 };
 
-typedef int (*tcpx_rx_process_fn_t)(struct tcpx_xfer_entry *rx_entry);
-
-enum {
-	STAGE_BUF_SIZE = 512
-};
+struct tcpx_xfer_entry *
+tcpx_match_tag_addr(struct tcpx_rx_ctx *srx, struct tcpx_ep *ep, uint64_t tag);
+struct tcpx_xfer_entry *
+tcpx_match_tag(struct tcpx_rx_ctx *srx, struct tcpx_ep *ep, uint64_t tag);
 
-struct stage_buf {
-	uint8_t			buf[STAGE_BUF_SIZE];
-	size_t			bytes_avail;
-	size_t			cur_pos;
-};
 
 struct tcpx_ep {
 	struct util_ep		util_ep;
-	SOCKET			sock;
-	struct tcpx_cur_rx_msg	cur_rx_msg;
-	struct tcpx_xfer_entry	*cur_rx_entry;
-	tcpx_rx_process_fn_t 	cur_rx_proc_fn;
+	struct ofi_bsock	bsock;
+	struct tcpx_cur_rx	cur_rx;
+	struct tcpx_cur_tx	cur_tx;
+	OFI_DBG_VAR(uint8_t, tx_id)
+	OFI_DBG_VAR(uint8_t, rx_id)
+
 	struct dlist_entry	ep_entry;
 	struct slist		rx_queue;
 	struct slist		tx_queue;
-	struct slist		tx_rsp_pend_queue;
+	struct slist		priority_queue;
+	struct slist		need_ack_queue;
+	struct slist		async_queue;
 	struct slist		rma_read_queue;
+	int			rx_avail;
 	struct tcpx_rx_ctx	*srx_ctx;
 	enum tcpx_state		state;
+	union {
+		struct fid		*fid;
+		struct tcpx_cm_context	*cm_ctx;
+		struct tcpx_conn_handle *handle;
+	};
+
 	/* lock for protecting tx/rx queues, rma list, state */
 	fastlock_t		lock;
 	int (*start_op[ofi_op_write + 1])(struct tcpx_ep *ep);
 	void (*hdr_bswap)(struct tcpx_base_hdr *hdr);
-	struct stage_buf	stage_buf;
 	size_t			min_multi_recv_size;
 	bool			pollout_set;
 };
@@ -215,35 +285,54 @@ struct tcpx_fabric {
 	struct util_fabric	util_fabric;
 };
 
+
+#define TCPX_NEED_RESP		BIT(1)
+#define TCPX_NEED_ACK		BIT(2)
+#define TCPX_INTERNAL_XFER	BIT(3)
+#define TCPX_NEED_DYN_RBUF 	BIT(4)
+#define TCPX_ASYNC		BIT(5)
+
 struct tcpx_xfer_entry {
 	struct slist_entry	entry;
 	union {
 		struct tcpx_base_hdr	base_hdr;
 		struct tcpx_cq_data_hdr cq_data_hdr;
-		uint8_t		       	max_hdr[TCPX_MAX_HDR_SZ];
+		struct tcpx_tag_data_hdr tag_data_hdr;
+		struct tcpx_tag_hdr	tag_hdr;
+		uint8_t		       	max_hdr[TCPX_MAX_HDR + TCPX_MAX_INJECT];
 	} hdr;
 	size_t			iov_cnt;
 	struct iovec		iov[TCPX_IOV_LIMIT+1];
 	struct tcpx_ep		*ep;
-	uint64_t		flags;
+	uint64_t		tag;
+	uint64_t		ignore;
+	uint64_t		cq_flags;
+	uint32_t		ctrl_flags;
+	uint32_t		async_index;
 	void			*context;
-	uint64_t		rem_len;
 	void			*mrecv_msg_start;
+	/* For RMA read requests, track the request response so that we
+	 * don't propagate multiple completions for the same operation. */
+	struct tcpx_xfer_entry  *resp_entry;
 };
 
 struct tcpx_domain {
-	struct util_domain	util_domain;
+	struct util_domain		util_domain;
+	struct ofi_ops_dynamic_rbuf	*dynamic_rbuf;
 };
 
-struct tcpx_buf_pool {
-	struct ofi_bufpool	*pool;
-	enum tcpx_xfer_op_codes	op_type;
-};
+static inline struct ofi_ops_dynamic_rbuf *tcpx_dynamic_rbuf(struct tcpx_ep *ep)
+{
+	struct tcpx_domain *domain;
+
+	domain = container_of(ep->util_ep.domain, struct tcpx_domain,
+			      util_domain);
+	return domain->dynamic_rbuf;
+}
 
 struct tcpx_cq {
 	struct util_cq		util_cq;
-	/* buf_pools protected by util.cq_lock */
-	struct tcpx_buf_pool	buf_pools[TCPX_OP_CODE_MAX];
+	struct ofi_bufpool	*xfer_pool;
 };
 
 struct tcpx_eq {
@@ -280,35 +369,21 @@ void tcpx_cq_report_success(struct util_cq *cq,
 void tcpx_cq_report_error(struct util_cq *cq,
 			  struct tcpx_xfer_entry *xfer_entry,
 			  int err);
+void tcpx_get_cq_info(struct tcpx_xfer_entry *entry, uint64_t *flags,
+		      uint64_t *data, uint64_t *tag);
 
-
-ssize_t tcpx_recv_hdr(SOCKET sock, struct stage_buf *stage_buf,
-		      struct tcpx_cur_rx_msg *cur_rx_msg);
-int tcpx_recv_msg_data(struct tcpx_xfer_entry *recv_entry);
-int tcpx_send_msg(struct tcpx_xfer_entry *tx_entry);
-int tcpx_read_to_buffer(SOCKET sock, struct stage_buf *stage_buf);
-
-struct tcpx_xfer_entry *tcpx_xfer_entry_alloc(struct tcpx_cq *cq,
-					      enum tcpx_xfer_op_codes type);
-
-void tcpx_xfer_entry_release(struct tcpx_cq *tcpx_cq,
-			     struct tcpx_xfer_entry *xfer_entry);
-void tcpx_srx_xfer_release(struct tcpx_rx_ctx *srx_ctx,
-			   struct tcpx_xfer_entry *xfer_entry);
-
-void tcpx_rx_msg_release(struct tcpx_xfer_entry *rx_entry);
-struct tcpx_xfer_entry *
-tcpx_srx_next_xfer_entry(struct tcpx_rx_ctx *srx_ctx,
-			struct tcpx_ep *ep, size_t entry_size);
+void tcpx_reset_rx(struct tcpx_ep *ep);
 
 void tcpx_progress_tx(struct tcpx_ep *ep);
 void tcpx_progress_rx(struct tcpx_ep *ep);
+void tcpx_progress_async(struct tcpx_ep *ep);
 int tcpx_try_func(void *util_ep);
+int tcpx_update_epoll(struct tcpx_ep *ep);
 
 void tcpx_hdr_none(struct tcpx_base_hdr *hdr);
 void tcpx_hdr_bswap(struct tcpx_base_hdr *hdr);
 
-void tcpx_tx_queue_insert(struct tcpx_ep *tcpx_ep,
+void tcpx_tx_queue_insert(struct tcpx_ep *ep,
 			  struct tcpx_xfer_entry *tx_entry);
 
 void tcpx_conn_mgr_run(struct util_eq *eq);
@@ -316,10 +391,132 @@ int tcpx_eq_wait_try_func(void *arg);
 int tcpx_eq_create(struct fid_fabric *fabric_fid, struct fi_eq_attr *attr,
 		   struct fid_eq **eq_fid, void *context);
 
-int tcpx_op_invalid(struct tcpx_ep *tcpx_ep);
-int tcpx_op_msg(struct tcpx_ep *tcpx_ep);
-int tcpx_op_read_req(struct tcpx_ep *tcpx_ep);
-int tcpx_op_write(struct tcpx_ep *tcpx_ep);
-int tcpx_op_read_rsp(struct tcpx_ep *tcpx_ep);
+int tcpx_op_invalid(struct tcpx_ep *ep);
+int tcpx_op_msg(struct tcpx_ep *ep);
+int tcpx_op_tagged(struct tcpx_ep *ep);
+int tcpx_op_read_req(struct tcpx_ep *ep);
+int tcpx_op_write(struct tcpx_ep *ep);
+int tcpx_op_read_rsp(struct tcpx_ep *ep);
+
+
+static inline void
+tcpx_set_ack_flags(struct tcpx_xfer_entry *xfer, uint64_t flags)
+{
+	if (flags & (FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE)) {
+		xfer->hdr.base_hdr.flags |= TCPX_DELIVERY_COMPLETE;
+		xfer->ctrl_flags |= TCPX_NEED_ACK;
+	}
+}
+
+static inline void
+tcpx_set_commit_flags(struct tcpx_xfer_entry *xfer, uint64_t flags)
+{
+	tcpx_set_ack_flags(xfer, flags);
+	if (flags & FI_COMMIT_COMPLETE) {
+		xfer->hdr.base_hdr.flags |= TCPX_COMMIT_COMPLETE;
+		xfer->ctrl_flags |= TCPX_NEED_ACK;
+	}
+}
+
+static inline uint64_t
+tcpx_tx_completion_flag(struct tcpx_ep *ep, uint64_t op_flags)
+{
+	/* Generate a completion if the op flags request one, or if the
+	 * endpoint generates completions by default.
+	 */
+	return (ep->util_ep.tx_op_flags | op_flags) & FI_COMPLETION;
+}
+
+static inline uint64_t
+tcpx_rx_completion_flag(struct tcpx_ep *ep, uint64_t op_flags)
+{
+	/* Generate a completion if the op flags request one, or if the
+	 * endpoint generates completions by default.
+	 */
+	return (ep->util_ep.rx_op_flags | op_flags) & FI_COMPLETION;
+}
+
+static inline struct tcpx_xfer_entry *
+tcpx_alloc_xfer(struct tcpx_cq *cq)
+{
+	struct tcpx_xfer_entry *xfer;
+
+	cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock);
+	xfer = ofi_buf_alloc(cq->xfer_pool);
+	cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock);
+
+	return xfer;
+}
+
+static inline void
+tcpx_free_xfer(struct tcpx_cq *cq, struct tcpx_xfer_entry *xfer)
+{
+	xfer->hdr.base_hdr.flags = 0;
+	xfer->cq_flags = 0;
+	xfer->ctrl_flags = 0;
+	xfer->context = 0;
+
+	cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock);
+	ofi_buf_free(xfer);
+	cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock);
+}
+
+static inline struct tcpx_xfer_entry *
+tcpx_alloc_rx(struct tcpx_ep *ep)
+{
+	struct tcpx_xfer_entry *xfer;
+	struct tcpx_cq *cq;
+
+	cq = container_of(ep->util_ep.rx_cq, struct tcpx_cq, util_cq);
+	xfer = tcpx_alloc_xfer(cq);
+	if (xfer)
+		xfer->ep = ep;
+
+	return xfer;
+}
+
+static inline void
+tcpx_free_rx(struct tcpx_xfer_entry *xfer)
+{
+	struct tcpx_cq *cq;
+	struct tcpx_rx_ctx *srx;
+
+	if (xfer->ep->srx_ctx) {
+		srx = xfer->ep->srx_ctx;
+		fastlock_acquire(&srx->lock);
+		ofi_buf_free(xfer);
+		fastlock_release(&srx->lock);
+	} else {
+		cq = container_of(xfer->ep->util_ep.rx_cq,
+				  struct tcpx_cq, util_cq);
+		tcpx_free_xfer(cq, xfer);
+	}
+}
+
+static inline struct tcpx_xfer_entry *
+tcpx_alloc_tx(struct tcpx_ep *ep)
+{
+	struct tcpx_xfer_entry *xfer;
+	struct tcpx_cq *cq;
+
+	cq = container_of(ep->util_ep.tx_cq, struct tcpx_cq, util_cq);
+
+	xfer = tcpx_alloc_xfer(cq);
+	if (xfer) {
+		xfer->hdr.base_hdr.version = TCPX_HDR_VERSION;
+		xfer->hdr.base_hdr.op_data = 0;
+		xfer->ep = ep;
+	}
+
+	return xfer;
+}
+
+static inline void
+tcpx_free_tx(struct tcpx_xfer_entry *xfer)
+{
+	struct tcpx_cq *cq;
+	cq = container_of(xfer->ep->util_ep.tx_cq, struct tcpx_cq, util_cq);
+	tcpx_free_xfer(cq, xfer);
+}
 
 #endif //_TCP_H_
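
Editor's note: the reworked wire format above makes headers self-describing: hdr_size records where the payload begins, so optional extensions (CQ data, tag, RMA iovs) can sit between the base header and the data. A sketch of how a receiver can exploit that, assuming (for this sketch only) that size counts the whole message including the header; struct msg_hdr simply mirrors tcpx_base_hdr:

#include <stdint.h>

struct msg_hdr {           /* mirrors struct tcpx_base_hdr above */
	uint8_t  version;
	uint8_t  op;
	uint16_t flags;
	uint8_t  op_data;
	uint8_t  rma_iov_cnt;
	uint8_t  hdr_size;   /* header bytes, including extensions */
	uint8_t  rsvd;
	uint64_t size;       /* assumed: header plus payload bytes */
};

/* Payload starts right after the advertised header, even when the
 * receiver does not understand every optional field in between. */
static const void *msg_payload(const struct msg_hdr *hdr)
{
	return (const uint8_t *) hdr + hdr->hdr_size;
}

static uint64_t msg_payload_len(const struct msg_hdr *hdr)
{
	return hdr->size - hdr->hdr_size;
}
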
diff --git a/deps/libfabric/prov/tcp/src/tcpx_attr.c b/deps/libfabric/prov/tcp/src/tcpx_attr.c
index f0f59d648732e0afcb15d75093f14a4348f49314..a406a2a8ab7cd3726852d0bdf356e5e6aacbf5b7 100644
--- a/deps/libfabric/prov/tcp/src/tcpx_attr.c
+++ b/deps/libfabric/prov/tcp/src/tcpx_attr.c
@@ -35,6 +35,7 @@
 
 #define TCPX_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM)
 #define TCPX_EP_CAPS	 (FI_MSG | FI_RMA | FI_RMA_PMEM)
+#define TCPX_EP_SRX_CAPS (TCPX_EP_CAPS | FI_TAGGED)
 #define TCPX_TX_CAPS	 (FI_SEND | FI_WRITE | FI_READ)
 #define TCPX_RX_CAPS	 (FI_RECV | FI_REMOTE_READ | 			\
 			  FI_REMOTE_WRITE)
@@ -55,7 +56,7 @@ static struct fi_tx_attr tcpx_tx_attr = {
 	.op_flags = TCPX_TX_OP_FLAGS,
 	.comp_order = FI_ORDER_STRICT,
 	.msg_order = TCPX_MSG_ORDER,
-	.inject_size = 64,
+	.inject_size = TCPX_MAX_INJECT,
 	.size = 1024,
 	.iov_limit = TCPX_IOV_LIMIT,
 	.rma_iov_limit = TCPX_IOV_LIMIT,
@@ -67,7 +68,7 @@ static struct fi_rx_attr tcpx_rx_attr = {
 	.comp_order = FI_ORDER_STRICT,
 	.msg_order = TCPX_MSG_ORDER,
 	.total_buffered_recv = 0,
-	.size = 1024,
+	.size = 65536,
 	.iov_limit = TCPX_IOV_LIMIT
 };
 
@@ -82,6 +83,38 @@ static struct fi_ep_attr tcpx_ep_attr = {
 	.max_order_waw_size = SIZE_MAX,
 };
 
+static struct fi_tx_attr tcpx_tx_srx_attr = {
+	.caps = TCPX_EP_SRX_CAPS | TCPX_TX_CAPS,
+	.op_flags = TCPX_TX_OP_FLAGS,
+	.comp_order = FI_ORDER_STRICT,
+	.msg_order = TCPX_MSG_ORDER,
+	.inject_size = TCPX_MAX_INJECT,
+	.size = 1024,
+	.iov_limit = TCPX_IOV_LIMIT,
+	.rma_iov_limit = TCPX_IOV_LIMIT,
+};
+
+static struct fi_rx_attr tcpx_rx_srx_attr = {
+	.caps = TCPX_EP_SRX_CAPS | TCPX_RX_CAPS,
+	.op_flags = TCPX_RX_OP_FLAGS,
+	.comp_order = FI_ORDER_STRICT,
+	.msg_order = TCPX_MSG_ORDER,
+	.total_buffered_recv = 0,
+	.size = 65536,
+	.iov_limit = TCPX_IOV_LIMIT
+};
+
+static struct fi_ep_attr tcpx_ep_srx_attr = {
+	.type = FI_EP_MSG,
+	.protocol = FI_PROTO_SOCK_TCP,
+	.protocol_version = 0,
+	.max_msg_size = SIZE_MAX,
+	.tx_ctx_cnt = 1,
+	.rx_ctx_cnt = FI_SHARED_CONTEXT,
+	.max_order_raw_size = SIZE_MAX,
+	.max_order_waw_size = SIZE_MAX,
+};
+
 static struct fi_domain_attr tcpx_domain_attr = {
 	.name = "tcp",
 	.caps = TCPX_DOMAIN_CAPS,
@@ -97,9 +130,10 @@ static struct fi_domain_attr tcpx_domain_attr = {
 	.ep_cnt = 8192,
 	.tx_ctx_cnt = 8192,
 	.rx_ctx_cnt = 8192,
-	.max_ep_srx_ctx = 128,
+	.max_ep_srx_ctx = 8192,
 	.max_ep_tx_ctx = 1,
-	.max_ep_rx_ctx = 1
+	.max_ep_rx_ctx = 1,
+	.mr_iov_limit = 1,
 };
 
 static struct fi_fabric_attr tcpx_fabric_attr = {
@@ -107,7 +141,18 @@ static struct fi_fabric_attr tcpx_fabric_attr = {
 	.prov_version = OFI_VERSION_DEF_PROV,
 };
 
+struct fi_info tcpx_srx_info = {
+	.caps = TCPX_DOMAIN_CAPS | TCPX_EP_SRX_CAPS | TCPX_TX_CAPS | TCPX_RX_CAPS,
+	.addr_format = FI_SOCKADDR,
+	.tx_attr = &tcpx_tx_srx_attr,
+	.rx_attr = &tcpx_rx_srx_attr,
+	.ep_attr = &tcpx_ep_srx_attr,
+	.domain_attr = &tcpx_domain_attr,
+	.fabric_attr = &tcpx_fabric_attr
+};
+
 struct fi_info tcpx_info = {
+	.next = &tcpx_srx_info,
 	.caps = TCPX_DOMAIN_CAPS | TCPX_EP_CAPS | TCPX_TX_CAPS | TCPX_RX_CAPS,
 	.addr_format = FI_SOCKADDR,
 	.tx_attr = &tcpx_tx_attr,
@@ -117,8 +162,27 @@ struct fi_info tcpx_info = {
 	.fabric_attr = &tcpx_fabric_attr
 };
 
+
+/* User hints will still override the modified dest_info attributes
+ * through ofi_alter_info
+ */
+static int
+tcpx_alter_defaults(uint32_t version, const struct fi_info *hints,
+		    const struct fi_info *base_info,
+		    struct fi_info *dest_info)
+{
+	dest_info->tx_attr->size = tcpx_default_tx_size;
+	if ((base_info->ep_attr->rx_ctx_cnt != FI_SHARED_CONTEXT) &&
+	    hints && hints->ep_attr &&
+	    (hints->ep_attr->rx_ctx_cnt != FI_SHARED_CONTEXT))
+		dest_info->rx_attr->size = tcpx_default_rx_size;
+	return 0;
+}
+
+
 struct util_prov tcpx_util_prov = {
 	.prov = &tcpx_prov,
 	.info = &tcpx_info,
+	.alter_defaults = &tcpx_alter_defaults,
 	.flags = 0,
 };
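
Editor's note: with tcpx_srx_info chained behind tcpx_info via the next pointer, fi_getinfo() reports both variants and a caller selects between them by walking the list. A usage sketch picking the first entry that advertises FI_TAGGED:

#include <stddef.h>
#include <rdma/fabric.h>

static struct fi_info *pick_tagged(void)
{
	struct fi_info *list, *cur, *chosen = NULL;

	if (fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0, NULL, &list))
		return NULL;

	for (cur = list; cur; cur = cur->next) {
		if (cur->caps & FI_TAGGED) {
			chosen = fi_dupinfo(cur);  /* keep a private copy */
			break;
		}
	}
	fi_freeinfo(list);
	return chosen;
}
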
diff --git a/deps/libfabric/prov/tcp/src/tcpx_comm.c b/deps/libfabric/prov/tcp/src/tcpx_comm.c
deleted file mode 100644
index d790b01379fd1cd864b3707368f3d90109211720..0000000000000000000000000000000000000000
--- a/deps/libfabric/prov/tcp/src/tcpx_comm.c
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Copyright (c) 2017 Intel Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * BSD license below:
- *
- *	   Redistribution and use in source and binary forms, with or
- *	   without modification, are permitted provided that the following
- *	   conditions are met:
- *
- *		- Redistributions of source code must retain the above
- *		  copyright notice, this list of conditions and the following
- *		  disclaimer.
- *
- *		- Redistributions in binary form must reproduce the above
- *		  copyright notice, this list of conditions and the following
- *		  disclaimer in the documentation and/or other materials
- *		  provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <rdma/fi_errno.h>
-#include <ofi_prov.h>
-#include <sys/types.h>
-#include <ofi_util.h>
-#include <ofi_iov.h>
-#include "tcpx.h"
-
-int tcpx_send_msg(struct tcpx_xfer_entry *tx_entry)
-{
-	ssize_t bytes_sent;
-	struct msghdr msg = {0};
-
-	msg.msg_iov = tx_entry->iov;
-	msg.msg_iovlen = tx_entry->iov_cnt;
-
-	bytes_sent = ofi_sendmsg_tcp(tx_entry->ep->sock, &msg, MSG_NOSIGNAL);
-	if (bytes_sent < 0)
-		return ofi_sockerr() == EPIPE ? -FI_ENOTCONN : -ofi_sockerr();
-
-	tx_entry->rem_len -= bytes_sent;
-	if (tx_entry->rem_len) {
-		ofi_consume_iov(tx_entry->iov, &tx_entry->iov_cnt, bytes_sent);
-		return -FI_EAGAIN;
-	}
-	return FI_SUCCESS;
-}
-
-static ssize_t tcpx_read_from_buffer(struct stage_buf *stage_buf,
-				     uint8_t *buf, size_t len)
-{
-	size_t rem_size;
-	ssize_t ret;
-
-	assert(stage_buf->cur_pos < stage_buf->bytes_avail);
-	rem_size = stage_buf->bytes_avail - stage_buf->cur_pos;
-	ret = (rem_size >= len) ? len : rem_size;
-	memcpy(buf, &stage_buf->buf[stage_buf->cur_pos], ret);
-	stage_buf->cur_pos += ret;
-	return ret;
-}
-
-ssize_t tcpx_recv_hdr(SOCKET sock, struct stage_buf *stage_buf,
-		      struct tcpx_cur_rx_msg *cur_rx_msg)
-{
-	ssize_t bytes_recvd, bytes_read;
-	size_t rem_len;
-	void *rem_buf;
-
-	rem_buf = (uint8_t *) &cur_rx_msg->hdr + cur_rx_msg->done_len;
-	rem_len = cur_rx_msg->hdr_len - cur_rx_msg->done_len;
-
-	if (stage_buf->cur_pos < stage_buf->bytes_avail) {
-		bytes_read = tcpx_read_from_buffer(stage_buf, rem_buf, rem_len);
-		rem_len -= bytes_read;
-		if (!rem_len)
-			return bytes_read;
-
-		rem_buf = (char *) rem_buf + bytes_read;
-	} else {
-		bytes_read = 0;
-	}
-
-	bytes_recvd = ofi_recv_socket(sock, rem_buf, rem_len, 0);
-	if (bytes_recvd < 0)
-		return bytes_read ? bytes_read : -ofi_sockerr();
-	else if (bytes_recvd == 0)
-		return -FI_ENOTCONN;
-
-	return bytes_read + bytes_recvd;
-}
-
-static ssize_t tcpx_readv_from_buffer(struct stage_buf *stage_buf,
-				      struct iovec *iov,
-				      int iov_cnt)
-{
-	ssize_t ret = 0;
-	size_t bytes_read;
-	int i;
-
-	if (iov_cnt == 1)
-		return tcpx_read_from_buffer(stage_buf, iov[0].iov_base,
-					     iov[0].iov_len);
-
-	for (i = 0; i < iov_cnt; i++) {
-		bytes_read = tcpx_read_from_buffer(stage_buf, iov[i].iov_base,
-						   iov[i].iov_len);
-		ret += bytes_read;
-		if ((bytes_read < iov[i].iov_len) ||
-		    !(stage_buf->bytes_avail - stage_buf->cur_pos))
-			break;
-	}
-	return ret;
-}
-
-int tcpx_recv_msg_data(struct tcpx_xfer_entry *rx_entry)
-{
-	struct stage_buf *stage_buf;
-	ssize_t bytes_recvd, bytes_read;
-
-	if (!rx_entry->iov_cnt || !rx_entry->iov[0].iov_len)
-		return FI_SUCCESS;
-
-	stage_buf = &rx_entry->ep->stage_buf;
-	if (stage_buf->cur_pos < stage_buf->bytes_avail) {
-		bytes_read = tcpx_readv_from_buffer(stage_buf,
-						    rx_entry->iov,
-						    rx_entry->iov_cnt);
-		ofi_consume_iov(rx_entry->iov, &rx_entry->iov_cnt, bytes_read);
-		if (!rx_entry->iov_cnt || !rx_entry->iov[0].iov_len)
-			return FI_SUCCESS;
-	} else {
-		bytes_read = 0;
-	}
-
-	bytes_recvd = ofi_readv_socket(rx_entry->ep->sock, rx_entry->iov,
-				       rx_entry->iov_cnt);
-	if (bytes_recvd < 0)
-		return bytes_read ? -FI_EAGAIN : -ofi_sockerr();
-	else if (bytes_recvd == 0)
-		return -FI_ENOTCONN;
-
-	ofi_consume_iov(rx_entry->iov, &rx_entry->iov_cnt, bytes_recvd);
-	return (!rx_entry->iov_cnt || !rx_entry->iov[0].iov_len) ?
-		FI_SUCCESS : -FI_EAGAIN;
-}
-
-int tcpx_read_to_buffer(SOCKET sock, struct stage_buf *stage_buf)
-{
-	int bytes_recvd;
-
-	bytes_recvd = ofi_recv_socket(sock, stage_buf->buf,
-				      sizeof(stage_buf->buf), 0);
-	if (bytes_recvd <= 0)
-		return (bytes_recvd) ? -ofi_sockerr(): -FI_ENOTCONN;
-
-	stage_buf->bytes_avail = bytes_recvd;
-	stage_buf->cur_pos = 0;
-	return FI_SUCCESS;
-}
diff --git a/deps/libfabric/prov/tcp/src/tcpx_conn_mgr.c b/deps/libfabric/prov/tcp/src/tcpx_conn_mgr.c
index de7f5b2296ccdcce8c1d8793d8aca7d2b57106a0..621e482ea65deb79bcc0bf66062af0d2431ca668 100644
--- a/deps/libfabric/prov/tcp/src/tcpx_conn_mgr.c
+++ b/deps/libfabric/prov/tcp/src/tcpx_conn_mgr.c
@@ -39,39 +39,80 @@
 #include <ofi_util.h>
 
 
-static int rx_cm_data(SOCKET fd, struct ofi_ctrl_hdr *hdr,
-		      int type, struct tcpx_cm_context *cm_ctx)
+struct tcpx_cm_context *tcpx_alloc_cm_ctx(fid_t fid, enum tcpx_cm_state state)
+{
+	struct tcpx_cm_context *cm_ctx;
+	struct tcpx_ep *ep;
+
+	cm_ctx = calloc(1, sizeof(*cm_ctx));
+	if (!cm_ctx)
+		return NULL;
+
+	cm_ctx->fid.fclass = TCPX_CLASS_CM;
+	cm_ctx->hfid = fid;
+	if (fid && fid->fclass == FI_CLASS_EP) {
+		ep = container_of(cm_ctx->hfid, struct tcpx_ep,
+				  util_ep.ep_fid.fid);
+		assert(!ep->fid);
+		ep->cm_ctx = cm_ctx;
+	}
+	cm_ctx->state = state;
+	return cm_ctx;
+}
+
+void tcpx_free_cm_ctx(struct tcpx_cm_context *cm_ctx)
+{
+	struct tcpx_ep *ep;
+
+	assert(cm_ctx->fid.fclass == TCPX_CLASS_CM);
+	if (cm_ctx->hfid && cm_ctx->hfid->fclass == FI_CLASS_EP) {
+		ep = container_of(cm_ctx->hfid, struct tcpx_ep,
+				  util_ep.ep_fid.fid);
+		ep->cm_ctx = NULL;
+	}
+
+	free(cm_ctx);
+}
+
+/* The underlying socket has the POLLIN event set.  The entire
+ * CM message should be readable, as it fits within a single MTU
+ * and is the first data transferred over the socket.
+ */
+static int rx_cm_data(SOCKET fd, int type, struct tcpx_cm_context *cm_ctx)
 {
 	size_t data_size = 0;
 	ssize_t ret;
 
-	ret = ofi_recv_socket(fd, hdr, sizeof(*hdr), MSG_WAITALL);
-	if (ret != sizeof(*hdr)) {
+	ret = ofi_recv_socket(fd, &cm_ctx->msg.hdr, sizeof(cm_ctx->msg.hdr), 0);
+	if (ret != sizeof(cm_ctx->msg.hdr)) {
 		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
 			"Failed to read cm header\n");
 		ret = ofi_sockerr() ? -ofi_sockerr() : -FI_EIO;
 		goto out;
 	}
 
-	if (hdr->version != TCPX_CTRL_HDR_VERSION) {
+	if (cm_ctx->msg.hdr.version != TCPX_CTRL_HDR_VERSION) {
 		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
 			"cm protocol version mismatch\n");
 		ret = -FI_ENOPROTOOPT;
 		goto out;
 	}
 
-	if (hdr->type != type && hdr->type != ofi_ctrl_nack) {
+	if (cm_ctx->msg.hdr.type != type &&
+	    cm_ctx->msg.hdr.type != ofi_ctrl_nack) {
 		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
 			"unexpected cm message type, expected %d or %d got: %d\n",
-			type, ofi_ctrl_nack, hdr->type);
+			type, ofi_ctrl_nack, cm_ctx->msg.hdr.type);
 		ret = -FI_ECONNREFUSED;
 		goto out;
 	}
 
-	data_size = MIN(ntohs(hdr->seg_size), TCPX_MAX_CM_DATA_SIZE);
+	data_size = ntohs(cm_ctx->msg.hdr.seg_size);
 	if (data_size) {
-		ret = ofi_recv_socket(fd, cm_ctx->cm_data, data_size,
-				      MSG_WAITALL);
+		if (data_size > TCPX_MAX_CM_DATA_SIZE)
+			data_size = TCPX_MAX_CM_DATA_SIZE;
+
+		ret = ofi_recv_socket(fd, cm_ctx->msg.data, data_size, 0);
 		if ((size_t) ret != data_size) {
 			FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
 				"Failed to read cm data\n");
@@ -80,15 +121,15 @@ static int rx_cm_data(SOCKET fd, struct ofi_ctrl_hdr *hdr,
 			goto out;
 		}
 
-		if (ntohs(hdr->seg_size) > TCPX_MAX_CM_DATA_SIZE) {
+		if (ntohs(cm_ctx->msg.hdr.seg_size) > TCPX_MAX_CM_DATA_SIZE) {
 			FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
 				"Discarding unexpected cm data\n");
-			ofi_discard_socket(fd, ntohs(hdr->seg_size) -
+			ofi_discard_socket(fd, ntohs(cm_ctx->msg.hdr.seg_size) -
 					   TCPX_MAX_CM_DATA_SIZE);
 		}
 	}
 
-	if (hdr->type == ofi_ctrl_nack) {
+	if (cm_ctx->msg.hdr.type == ofi_ctrl_nack) {
 		FI_INFO(&tcpx_prov, FI_LOG_EP_CTRL,
 			"Connection refused from remote\n");
 		ret = -FI_ECONNREFUSED;
@@ -101,36 +142,41 @@ out:
 	return ret;
 }
 
+/* The underlying socket has the POLLOUT event set.  It is ready
+ * to accept outbound data.  We expect to transfer the entire CM
+ * message as it fits into a single MTU and is the first data
+ * transferred over the socket.
+ */
 static int tx_cm_data(SOCKET fd, uint8_t type, struct tcpx_cm_context *cm_ctx)
 {
-	struct ofi_ctrl_hdr hdr;
 	ssize_t ret;
 
-	memset(&hdr, 0, sizeof(hdr));
-	hdr.version = TCPX_CTRL_HDR_VERSION;
-	hdr.type = type;
-	hdr.seg_size = htons((uint16_t) cm_ctx->cm_data_sz);
-	hdr.conn_data = 1; /* For testing endianess mismatch at peer */
-
-	ret = ofi_send_socket(fd, &hdr, sizeof(hdr), MSG_NOSIGNAL);
-	if (ret != sizeof(hdr))
-		goto err;
-
-	if (cm_ctx->cm_data_sz) {
-		ret = ofi_send_socket(fd, cm_ctx->cm_data,
-				      cm_ctx->cm_data_sz, MSG_NOSIGNAL);
-		if ((size_t) ret != cm_ctx->cm_data_sz)
-			goto err;
-	}
+	memset(&cm_ctx->msg.hdr, 0, sizeof(cm_ctx->msg.hdr));
+	cm_ctx->msg.hdr.version = TCPX_CTRL_HDR_VERSION;
+	cm_ctx->msg.hdr.type = type;
+	cm_ctx->msg.hdr.seg_size = htons((uint16_t) cm_ctx->cm_data_sz);
+	cm_ctx->msg.hdr.conn_data = 1; /* tests endianness mismatch at peer */
+
+	ret = ofi_send_socket(fd, &cm_ctx->msg, sizeof(cm_ctx->msg.hdr) +
+			      cm_ctx->cm_data_sz, MSG_NOSIGNAL);
+	if (ret != sizeof(cm_ctx->msg.hdr) + cm_ctx->cm_data_sz)
+		return ofi_sockerr() ? -ofi_sockerr() : -FI_EIO;
 
 	return FI_SUCCESS;
-err:
-	return ofi_sockerr() ? -ofi_sockerr() : -FI_EIO;
 }
 
-static int tcpx_ep_enable(struct tcpx_ep *ep)
+static int tcpx_ep_enable(struct tcpx_ep *ep,
+			  struct fi_eq_cm_entry *cm_entry,
+			  size_t cm_entry_sz)
+
 {
-	int ret;
+	int ret = 0;
+
+	if (!ep->util_ep.rx_cq && !ep->util_ep.tx_cq) {
+		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
+			"ep must be bound to cq's\n");
+		return -FI_ENOCQ;
+	}
 
 	fastlock_acquire(&ep->lock);
 	if (ep->state != TCPX_CONNECTING && ep->state != TCPX_ACCEPTING) {
@@ -140,18 +186,12 @@ static int tcpx_ep_enable(struct tcpx_ep *ep)
 		goto unlock;
 	}
 
-	ret = fi_fd_nonblock(ep->sock);
-	if (ret) {
-		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
-			"failed to set socket to nonblocking\n");
-		goto unlock;
-	}
 	ep->state = TCPX_CONNECTED;
 	fastlock_release(&ep->lock);
 
 	if (ep->util_ep.rx_cq) {
 		ret = ofi_wait_add_fd(ep->util_ep.rx_cq->wait,
-				      ep->sock, POLLIN, tcpx_try_func,
+				      ep->bsock.sock, POLLIN, tcpx_try_func,
 				      (void *) &ep->util_ep,
 				      &ep->util_ep.ep_fid.fid);
 		if (ret) {
@@ -163,7 +203,7 @@ static int tcpx_ep_enable(struct tcpx_ep *ep)
 
 	if (ep->util_ep.tx_cq) {
 		ret = ofi_wait_add_fd(ep->util_ep.tx_cq->wait,
-				      ep->sock, POLLIN, tcpx_try_func,
+				      ep->bsock.sock, POLLIN, tcpx_try_func,
 				      (void *) &ep->util_ep,
 				      &ep->util_ep.ep_fid.fid);
 		if (ret) {
@@ -173,53 +213,76 @@ static int tcpx_ep_enable(struct tcpx_ep *ep)
 		}
 	}
 
-	/* TODO: Move writing CONNECTED event here */
+	ret = (int) fi_eq_write(&ep->util_ep.eq->eq_fid, FI_CONNECTED, cm_entry,
+				cm_entry_sz, 0);
+	if (ret < 0) {
+		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, "Error writing to EQ\n");
+		return ret;
+	}
 
-	return ret;
+	return 0;
 unlock:
 	fastlock_release(&ep->lock);
 	return ret;
 }
 
-static int proc_conn_resp(struct tcpx_cm_context *cm_ctx,
-			  struct tcpx_ep *ep)
+static void tcpx_cm_recv_resp(struct util_wait *wait,
+			      struct tcpx_cm_context *cm_ctx)
 {
-	struct ofi_ctrl_hdr conn_resp;
 	struct fi_eq_cm_entry *cm_entry;
-	ssize_t len;
-	int ret = FI_SUCCESS;
+	struct tcpx_ep *ep;
+	int ret;
+
+	FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "Handling accept from server\n");
+	assert(cm_ctx->hfid->fclass == FI_CLASS_EP);
+	ep = container_of(cm_ctx->hfid, struct tcpx_ep, util_ep.ep_fid.fid);
 
-	ret = rx_cm_data(ep->sock, &conn_resp, ofi_ctrl_connresp, cm_ctx);
+	ret = rx_cm_data(ep->bsock.sock, ofi_ctrl_connresp, cm_ctx);
 	if (ret) {
+		if (ret == -FI_EAGAIN)
+			return;
+
 		enum fi_log_level level = (ret == -FI_ECONNREFUSED) ?
 				FI_LOG_INFO : FI_LOG_WARN;
 		FI_LOG(&tcpx_prov, level, FI_LOG_EP_CTRL,
 			"Failed to receive connect response\n");
-		return ret;
+		ofi_wait_del_fd(wait, ep->bsock.sock);
+		goto err1;
+	}
+
+	ret = ofi_wait_del_fd(wait, ep->bsock.sock);
+	if (ret) {
+		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
+			"Could not remove fd from wait\n");
+		goto err1;
 	}
 
 	cm_entry = calloc(1, sizeof(*cm_entry) + cm_ctx->cm_data_sz);
 	if (!cm_entry)
-		return -FI_ENOMEM;
+		goto err1;
 
-	cm_entry->fid = cm_ctx->fid;
-	memcpy(cm_entry->data, cm_ctx->cm_data, cm_ctx->cm_data_sz);
+	cm_entry->fid = cm_ctx->hfid;
+	memcpy(cm_entry->data, cm_ctx->msg.data, cm_ctx->cm_data_sz);
 
-	ep->hdr_bswap = (conn_resp.conn_data == 1) ?
+	ep->hdr_bswap = (cm_ctx->msg.hdr.conn_data == 1) ?
 			tcpx_hdr_none : tcpx_hdr_bswap;
 
-	ret = tcpx_ep_enable(ep);
+	ret = tcpx_ep_enable(ep, cm_entry,
+			     sizeof(*cm_entry) + cm_ctx->cm_data_sz);
 	if (ret)
-		goto err;
+		goto err2;
 
-	len = fi_eq_write(&ep->util_ep.eq->eq_fid, FI_CONNECTED, cm_entry,
-			  sizeof(*cm_entry) + cm_ctx->cm_data_sz, 0);
-	if (len < 0)
-		ret = (int) len;
+	free(cm_entry);
+	tcpx_free_cm_ctx(cm_ctx);
+	return;
 
-err:
+err2:
 	free(cm_entry);
-	return ret;
+err1:
+	fastlock_acquire(&ep->lock);
+	tcpx_ep_disable(ep, -ret);
+	fastlock_release(&ep->lock);
+	tcpx_free_cm_ctx(cm_ctx);
 }
 
 int tcpx_eq_wait_try_func(void *arg)
@@ -227,102 +290,77 @@ int tcpx_eq_wait_try_func(void *arg)
 	return FI_SUCCESS;
 }
 
-static void client_recv_connresp(struct util_wait *wait,
-				 struct tcpx_cm_context *cm_ctx)
-{
-	struct tcpx_ep *ep;
-	ssize_t ret;
-
-	FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "Handling accept from server\n");
-	assert(cm_ctx->fid->fclass == FI_CLASS_EP);
-	ep = container_of(cm_ctx->fid, struct tcpx_ep, util_ep.ep_fid.fid);
-
-	ret = ofi_wait_del_fd(wait, ep->sock);
-	if (ret) {
-		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
-			"Could not remove fd from wait\n");
-		goto err;
-	}
-
-	/* TODO: merge proc_conn_resp into here */
-	ret = proc_conn_resp(cm_ctx, ep);
-	if (ret)
-		goto err;
-
-	free(cm_ctx);
-	return;
-err:
-	tcpx_ep_disable(ep, -ret);
-	free(cm_ctx);
-}
-
-static void server_send_cm_accept(struct util_wait *wait,
-				  struct tcpx_cm_context *cm_ctx)
+static void tcpx_cm_send_resp(struct util_wait *wait,
+			      struct tcpx_cm_context *cm_ctx)
 {
 	struct fi_eq_cm_entry cm_entry = {0};
 	struct tcpx_ep *ep;
 	int ret;
 
 	FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "Send connect (accept) response\n");
-	assert(cm_ctx->fid->fclass == FI_CLASS_EP);
-	ep = container_of(cm_ctx->fid, struct tcpx_ep, util_ep.ep_fid.fid);
+	assert(cm_ctx->hfid->fclass == FI_CLASS_EP);
+	ep = container_of(cm_ctx->hfid, struct tcpx_ep, util_ep.ep_fid.fid);
 
-	ret = ofi_wait_del_fd(wait, ep->sock);
+	ret = tx_cm_data(ep->bsock.sock, ofi_ctrl_connresp, cm_ctx);
 	if (ret) {
+		if (ret == -FI_EAGAIN)
+			return;
 		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
-			"Could not remove fd from wait\n");
-		goto err;
+			"Failed to send connect (accept) response\n");
+		goto delfd;
 	}
 
-	ret = tx_cm_data(ep->sock, ofi_ctrl_connresp, cm_ctx);
+	ret = ofi_wait_del_fd(wait, ep->bsock.sock);
 	if (ret) {
 		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
-			"Failed to send connect (accept) response\n");
-		goto err;
+			"Could not remove fd from wait\n");
+		goto disable;
 	}
 
-	cm_entry.fid =  cm_ctx->fid;
-	ret = (int) fi_eq_write(&ep->util_ep.eq->eq_fid, FI_CONNECTED,
-				&cm_entry, sizeof(cm_entry), 0);
-	if (ret < 0)
-		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, "Error writing to EQ\n");
+	cm_entry.fid = cm_ctx->hfid;
 
-	ret = tcpx_ep_enable(ep);
+	ret = tcpx_ep_enable(ep, &cm_entry, sizeof(cm_entry));
 	if (ret)
-		goto err;
+		goto disable;
 
 	FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "Connection Accept Successful\n");
-	free(cm_ctx);
+	tcpx_free_cm_ctx(cm_ctx);
 	return;
 
-err:
+delfd:
+	ofi_wait_del_fd(wait, ep->bsock.sock);
+disable:
+	fastlock_acquire(&ep->lock);
 	tcpx_ep_disable(ep, -ret);
-	free(cm_ctx);
+	fastlock_release(&ep->lock);
+	tcpx_free_cm_ctx(cm_ctx);
 }
 
-static void server_recv_connreq(struct util_wait *wait,
-				struct tcpx_cm_context *cm_ctx)
+static void tcpx_cm_recv_req(struct util_wait *wait,
+			     struct tcpx_cm_context *cm_ctx)
 {
 	struct tcpx_conn_handle *handle;
 	struct fi_eq_cm_entry *cm_entry;
-	struct ofi_ctrl_hdr conn_req;
 	socklen_t len;
 	int ret;
 
 	FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "Server receive connect request\n");
-	handle  = container_of(cm_ctx->fid, struct tcpx_conn_handle, handle);
+	handle  = container_of(cm_ctx->hfid, struct tcpx_conn_handle, fid);
+
+	ret = rx_cm_data(handle->sock, ofi_ctrl_connreq, cm_ctx);
+	if (ret) {
+		if (ret == -FI_EAGAIN)
+			return;
+		ofi_wait_del_fd(wait, handle->sock);
+		goto err1;
+	}
 
 	ret = ofi_wait_del_fd(wait, handle->sock);
 	if (ret) {
 		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
 			"fd deletion from ofi_wait failed\n");
-		cm_ctx->type = CLIENT_SERVER_ERROR;
-		return;
-	}
-
-	ret = rx_cm_data(handle->sock, &conn_req, ofi_ctrl_connreq, cm_ctx);
-	if (ret)
 		goto err1;
+	}
 
 	cm_entry = calloc(1, sizeof(*cm_entry) + cm_ctx->cm_data_sz);
 	if (!cm_entry)
@@ -334,6 +372,7 @@ static void server_recv_connreq(struct util_wait *wait,
 		goto err2;
 
 	len = cm_entry->info->dest_addrlen = handle->pep->info->src_addrlen;
+	free(cm_entry->info->dest_addr);
 	cm_entry->info->dest_addr = malloc(len);
 	if (!cm_entry->info->dest_addr)
 		goto err3;
@@ -342,11 +381,13 @@ static void server_recv_connreq(struct util_wait *wait,
 	if (ret)
 		goto err3;
 
-	handle->endian_match = (conn_req.conn_data == 1);
-	cm_entry->info->handle = &handle->handle;
-	memcpy(cm_entry->data, cm_ctx->cm_data, cm_ctx->cm_data_sz);
+	handle->endian_match = (cm_ctx->msg.hdr.conn_data == 1);
+	cm_entry->info->handle = &handle->fid;
+	memcpy(cm_entry->data, cm_ctx->msg.data, cm_ctx->cm_data_sz);
+	cm_ctx->state = TCPX_CM_REQ_RVCD;
 
-	ret = (int) fi_eq_write(&handle->pep->util_pep.eq->eq_fid, FI_CONNREQ, cm_entry,
+	ret = (int) fi_eq_write(&handle->pep->util_pep.eq->eq_fid,
+				FI_CONNREQ, cm_entry,
 				sizeof(*cm_entry) + cm_ctx->cm_data_sz, 0);
 	if (ret < 0) {
 		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, "Error writing to EQ\n");
@@ -354,7 +395,7 @@ static void server_recv_connreq(struct util_wait *wait,
 	}
 
 	free(cm_entry);
-	free(cm_ctx);
+	tcpx_free_cm_ctx(cm_ctx);
 	return;
 err3:
 	fi_freeinfo(cm_entry->info);
@@ -362,57 +403,61 @@ err2:
 	free(cm_entry);
 err1:
 	ofi_close_socket(handle->sock);
-	free(cm_ctx);
+	tcpx_free_cm_ctx(cm_ctx);
 	free(handle);
 }
 
-static void client_send_connreq(struct util_wait *wait,
-				struct tcpx_cm_context *cm_ctx)
+static void tcpx_cm_send_req(struct util_wait *wait,
+			     struct tcpx_cm_context *cm_ctx)
 {
 	struct tcpx_ep *ep;
 	socklen_t len;
 	int status, ret = FI_SUCCESS;
 
 	FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "client send connreq\n");
-	ep = container_of(cm_ctx->fid, struct tcpx_ep, util_ep.ep_fid.fid);
-
-	ret = ofi_wait_del_fd(wait, ep->sock);
-	if (ret) {
-		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
-			"Could not remove fd from wait: %s\n",
-			fi_strerror(-ret));
-		goto err;
-	}
+	ep = container_of(cm_ctx->hfid, struct tcpx_ep, util_ep.ep_fid.fid);
 
 	len = sizeof(status);
-	ret = getsockopt(ep->sock, SOL_SOCKET, SO_ERROR, (char *) &status, &len);
+	ret = getsockopt(ep->bsock.sock, SOL_SOCKET, SO_ERROR,
+			 (char *) &status, &len);
 	if (ret < 0 || status) {
-		ret = (ret < 0)? -ofi_sockerr() : status;
-		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, "connection failure\n");
-		goto err;
+		ret = (ret < 0)? -ofi_sockerr() : -status;
+		FI_WARN_SPARSE(&tcpx_prov, FI_LOG_EP_CTRL,
+				"connection failure (sockerr %d)\n", ret);
+		goto delfd;
 	}
 
-	ret = tx_cm_data(ep->sock, ofi_ctrl_connreq, cm_ctx);
+	ret = tx_cm_data(ep->bsock.sock, ofi_ctrl_connreq, cm_ctx);
 	if (ret)
-		goto err;
+		goto delfd;
 
-	ret = ofi_wait_del_fd(wait, ep->sock);
+	ret = ofi_wait_del_fd(wait, ep->bsock.sock);
+	if (ret) {
+		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
+			"Could not remove fd from wait: %s\n",
+			fi_strerror(-ret));
+		goto disable;
+	}
 
-	cm_ctx->type = CLIENT_RECV_CONNRESP;
-	ret = ofi_wait_add_fd(wait, ep->sock, POLLIN,
+	cm_ctx->state = TCPX_CM_REQ_SENT;
+	ret = ofi_wait_add_fd(wait, ep->bsock.sock, POLLIN,
 			      tcpx_eq_wait_try_func, NULL, cm_ctx);
 	if (ret)
-		goto err;
+		goto disable;
 
 	return;
 
-err:
+delfd:
+	ofi_wait_del_fd(wait, ep->bsock.sock);
+disable:
+	fastlock_acquire(&ep->lock);
 	tcpx_ep_disable(ep, -ret);
-	free(cm_ctx);
+	fastlock_release(&ep->lock);
+	tcpx_free_cm_ctx(cm_ctx);
 }
 
-static void server_sock_accept(struct util_wait *wait,
-			       struct tcpx_cm_context *cm_ctx)
+static void tcpx_accept(struct util_wait *wait,
+			struct tcpx_cm_context *cm_ctx)
 {
 	struct tcpx_conn_handle *handle;
 	struct tcpx_cm_context *rx_req_cm_ctx;
@@ -420,14 +465,16 @@ static void server_sock_accept(struct util_wait *wait,
 	SOCKET sock;
 	int ret;
 
-	FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "Received Connreq\n");
-	assert(cm_ctx->fid->fclass == FI_CLASS_PEP);
-	pep = container_of(cm_ctx->fid, struct tcpx_pep, util_pep.pep_fid.fid);
+	FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "accepting connection\n");
+	assert(cm_ctx->hfid->fclass == FI_CLASS_PEP);
+	pep = container_of(cm_ctx->hfid, struct tcpx_pep, util_pep.pep_fid.fid);
 
 	sock = accept(pep->sock, NULL, 0);
 	if (sock < 0) {
-		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
-			"accept error: %d\n", ofi_sockerr());
+		if (!OFI_SOCK_TRY_ACCEPT_AGAIN(ofi_sockerr())) {
+			FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
+				"accept error: %d\n", ofi_sockerr());
+		}
 		return;
 	}
 
@@ -438,15 +485,13 @@ static void server_sock_accept(struct util_wait *wait,
 		goto err1;
 	}
 
-	rx_req_cm_ctx = calloc(1, sizeof(*rx_req_cm_ctx));
+	rx_req_cm_ctx = tcpx_alloc_cm_ctx(&handle->fid, TCPX_CM_WAIT_REQ);
 	if (!rx_req_cm_ctx)
 		goto err2;
 
 	handle->sock = sock;
-	handle->handle.fclass = FI_CLASS_CONNREQ;
+	handle->fid.fclass = FI_CLASS_CONNREQ;
 	handle->pep = pep;
-	rx_req_cm_ctx->fid = &handle->handle;
-	rx_req_cm_ctx->type = SERVER_RECV_CONNREQ;
 
 	ret = ofi_wait_add_fd(wait, sock, POLLIN,
 			      tcpx_eq_wait_try_func,
@@ -463,85 +508,72 @@ err1:
 	ofi_close_socket(sock);
 }
 
-/*
- * The cm_context::fid is an endpoint, which contains the EP state.
- * That state duplicates the cm_context::type.  Remove cm_ctx and use
- * the fid fclass and EP state.
- */
 static void process_cm_ctx(struct util_wait *wait,
 			   struct tcpx_cm_context *cm_ctx)
 {
-	switch (cm_ctx->type) {
-	case SERVER_SOCK_ACCEPT:
-		assert(cm_ctx->fid->fclass == FI_CLASS_PEP);
-		server_sock_accept(wait, cm_ctx);
+	switch (cm_ctx->state) {
+	case TCPX_CM_LISTENING:
+		assert(cm_ctx->hfid->fclass == FI_CLASS_PEP);
+		tcpx_accept(wait, cm_ctx);
 		break;
-	case CLIENT_SEND_CONNREQ:
-		assert((cm_ctx->fid->fclass == FI_CLASS_EP) &&
-		       (container_of(cm_ctx->fid, struct tcpx_ep,
+	case TCPX_CM_CONNECTING:
+		assert((cm_ctx->hfid->fclass == FI_CLASS_EP) &&
+		       (container_of(cm_ctx->hfid, struct tcpx_ep,
 				     util_ep.ep_fid.fid)->state ==
 							  TCPX_CONNECTING));
-		client_send_connreq(wait, cm_ctx);
+		tcpx_cm_send_req(wait, cm_ctx);
 		break;
-	case SERVER_RECV_CONNREQ:
-		assert(cm_ctx->fid->fclass == FI_CLASS_CONNREQ);
-		server_recv_connreq(wait, cm_ctx);
+	case TCPX_CM_WAIT_REQ:
+		assert(cm_ctx->hfid->fclass == FI_CLASS_CONNREQ);
+		tcpx_cm_recv_req(wait, cm_ctx);
 		break;
-	case SERVER_SEND_CM_ACCEPT:
-		assert((cm_ctx->fid->fclass == FI_CLASS_EP) &&
-		       (container_of(cm_ctx->fid, struct tcpx_ep,
+	case TCPX_CM_RESP_READY:
+		assert((cm_ctx->hfid->fclass == FI_CLASS_EP) &&
+		       (container_of(cm_ctx->hfid, struct tcpx_ep,
 				     util_ep.ep_fid.fid)->state ==
 							  TCPX_ACCEPTING));
-		server_send_cm_accept(wait, cm_ctx);
+		tcpx_cm_send_resp(wait, cm_ctx);
 		break;
-	case CLIENT_RECV_CONNRESP:
-		assert((cm_ctx->fid->fclass == FI_CLASS_EP) &&
-		       (container_of(cm_ctx->fid, struct tcpx_ep,
+	case TCPX_CM_REQ_SENT:
+		assert((cm_ctx->hfid->fclass == FI_CLASS_EP) &&
+		       (container_of(cm_ctx->hfid, struct tcpx_ep,
 				     util_ep.ep_fid.fid)->state ==
 							  TCPX_CONNECTING));
-		client_recv_connresp(wait, cm_ctx);
+		tcpx_cm_recv_resp(wait, cm_ctx);
 		break;
 	default:
 		break;
 	}
 }
 
-/* The implementation assumes that the EQ does not share a wait set with
- * a CQ.  This is true for internally created wait sets, but not if the
- * application manages the wait set.  To fix, we need to distinguish
- * whether the wait_context references a fid or tcpx_cm_context.
- */
-void tcpx_conn_mgr_run(struct util_eq *eq)
+void tcpx_conn_mgr_run(struct util_eq *util_eq)
 {
 	struct util_wait_fd *wait_fd;
-	struct tcpx_eq *tcpx_eq;
-	void *wait_contexts[MAX_POLL_EVENTS];
-	int num_fds = 0, i;
-
-	assert(eq->wait != NULL);
-
-	wait_fd = container_of(eq->wait, struct util_wait_fd,
-			       util_wait);
-
-	tcpx_eq = container_of(eq, struct tcpx_eq, util_eq);
-	fastlock_acquire(&tcpx_eq->close_lock);
-	num_fds = (wait_fd->util_wait.wait_obj == FI_WAIT_FD) ?
-		  ofi_epoll_wait(wait_fd->epoll_fd, wait_contexts,
-				 MAX_POLL_EVENTS, 0) :
-		  ofi_pollfds_wait(wait_fd->pollfds, wait_contexts,
-				   MAX_POLL_EVENTS, 0);
-	if (num_fds < 0) {
-		fastlock_release(&tcpx_eq->close_lock);
-		return;
-	}
+	struct tcpx_eq *eq;
+	struct fid *fid;
+	struct ofi_epollfds_event events[MAX_POLL_EVENTS];
+	int count, i;
+
+	assert(util_eq->wait != NULL);
+	wait_fd = container_of(util_eq->wait, struct util_wait_fd, util_wait);
+
+	eq = container_of(util_eq, struct tcpx_eq, util_eq);
+	fastlock_acquire(&eq->close_lock);
+	count = (wait_fd->util_wait.wait_obj == FI_WAIT_FD) ?
+		ofi_epoll_wait(wait_fd->epoll_fd, events, MAX_POLL_EVENTS, 0) :
+		ofi_pollfds_wait(wait_fd->pollfds, events, MAX_POLL_EVENTS, 0);
+	if (count < 0)
+		goto unlock;
 
-	for ( i = 0; i < num_fds; i++) {
+	for (i = 0; i < count; i++) {
 		/* skip wake up signals */
-		if (&wait_fd->util_wait.wait_fid.fid == wait_contexts[i])
+		if (&wait_fd->util_wait.wait_fid.fid == events[i].data.ptr)
 			continue;
 
-		process_cm_ctx(eq->wait, (struct tcpx_cm_context *)
-			       wait_contexts[i]);
+		fid = events[i].data.ptr;
+		if (fid->fclass == TCPX_CLASS_CM)
+			process_cm_ctx(util_eq->wait, events[i].data.ptr);
 	}
-	fastlock_release(&tcpx_eq->close_lock);
+unlock:
+	fastlock_release(&eq->close_lock);
 }
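
The run loop above tags every registered fd with a context pointer and dispatches on it: the wait set's own signal fid is skipped, and only objects whose fclass marks them as CM contexts are handed to process_cm_ctx(). A minimal stand-alone sketch of that data.ptr tagging pattern, assuming Linux epoll and hypothetical demo_* names in place of the provider's types:

    /* Sketch only: DEMO_CLASS_CM and struct demo_fid are invented. */
    #include <stdio.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <sys/epoll.h>

    #define DEMO_CLASS_CM 0x1001    /* hypothetical fclass value */

    struct demo_fid { uint64_t fclass; };

    static void demo_dispatch(int epfd, const struct demo_fid *wakeup)
    {
        struct epoll_event events[8];
        int i, n;

        n = epoll_wait(epfd, events, 8, 0);     /* non-blocking poll */
        for (i = 0; i < n; i++) {
            struct demo_fid *fid = events[i].data.ptr;

            if (fid == wakeup)                  /* skip wake-up signals */
                continue;
            if (fid->fclass == DEMO_CLASS_CM)
                printf("CM event on %p\n", (void *) fid);
        }
    }

    int main(void)
    {
        struct demo_fid wakeup = { 0 };
        int epfd = epoll_create1(0);

        if (epfd < 0)
            return 1;
        demo_dispatch(epfd, &wakeup);           /* no fds registered yet */
        close(epfd);
        return 0;
    }

Storing a typed object in data.ptr lets one wait set multiplex unrelated object classes without a side lookup table.
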
diff --git a/deps/libfabric/prov/tcp/src/tcpx_cq.c b/deps/libfabric/prov/tcp/src/tcpx_cq.c
index c4ce02291b9f08c1a7e4e3be2345cf3b3bacae16..5511339266ebb4ab4fdce11dbe88302d88383254 100644
--- a/deps/libfabric/prov/tcp/src/tcpx_cq.c
+++ b/deps/libfabric/prov/tcp/src/tcpx_cq.c
@@ -40,12 +40,13 @@
 
 void tcpx_cq_progress(struct util_cq *cq)
 {
-	void *wait_contexts[MAX_POLL_EVENTS];
+	struct ofi_epollfds_event events[MAX_POLL_EVENTS];
 	struct fid_list_entry *fid_entry;
 	struct util_wait_fd *wait_fd;
 	struct dlist_entry *item;
 	struct tcpx_ep *ep;
 	struct fid *fid;
+	uint32_t inevent, outevent, errevent;
 	int nfds, i;
 
 	wait_fd = container_of(cq->wait, struct util_wait_fd, util_wait);
@@ -55,24 +56,42 @@ void tcpx_cq_progress(struct util_cq *cq)
 		fid_entry = container_of(item, struct fid_list_entry, entry);
 		ep = container_of(fid_entry->fid, struct tcpx_ep,
 				  util_ep.ep_fid.fid);
-		tcpx_try_func(&ep->util_ep);
+
 		fastlock_acquire(&ep->lock);
-		tcpx_progress_tx(ep);
-		if (ep->stage_buf.cur_pos < ep->stage_buf.bytes_avail)
+		/* We need to progress receives in the case where we're waiting
+		 * on the application to post a buffer to consume a receive
+		 * that we've already read from the kernel.  If the message is
+		 * of length 0, there's no additional data to read, so failing
+		 * to progress can result in application hangs.
+		 */
+		if (ofi_bsock_readable(&ep->bsock) ||
+		    (ep->cur_rx.handler && !ep->cur_rx.entry)) {
+			assert(ep->state == TCPX_CONNECTED);
 			tcpx_progress_rx(ep);
+		}
+
+		(void) tcpx_update_epoll(ep);
 		fastlock_release(&ep->lock);
 	}
 
-	nfds = (wait_fd->util_wait.wait_obj == FI_WAIT_FD) ?
-	       ofi_epoll_wait(wait_fd->epoll_fd, wait_contexts,
-			      MAX_POLL_EVENTS, 0) :
-	       ofi_pollfds_wait(wait_fd->pollfds, wait_contexts,
-				MAX_POLL_EVENTS, 0);
+	if (wait_fd->util_wait.wait_obj == FI_WAIT_FD) {
+		nfds = ofi_epoll_wait(wait_fd->epoll_fd, events,
+				      MAX_POLL_EVENTS, 0);
+		inevent = POLLIN;
+		outevent = POLLOUT;
+		errevent = POLLERR;
+	} else {
+		nfds = ofi_pollfds_wait(wait_fd->pollfds, events,
+					MAX_POLL_EVENTS, 0);
+		inevent = OFI_EPOLL_IN;
+		outevent = OFI_EPOLL_OUT;
+		errevent = OFI_EPOLL_ERR;
+	}
 	if (nfds <= 0)
 		goto unlock;
 
 	for (i = 0; i < nfds; i++) {
-		fid = wait_contexts[i];
+		fid = events[i].data.ptr;
 		if (fid->fclass != FI_CLASS_EP) {
 			fd_signal_reset(&wait_fd->signal);
 			continue;
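
The progress path above first normalizes the event masks (POLLIN/POLLOUT/POLLERR for the epoll case, the OFI_EPOLL_* values otherwise) and then fans out per event bit: error events drive asynchronous completion handling, readable fds drive receive progress, writable fds drive transmit progress. A small sketch of that fan-out, assuming plain poll(2) and stand-in progress functions:

    /* Sketch only: the progress functions are placeholders. */
    #include <poll.h>
    #include <stdio.h>

    static void progress_rx(void)    { puts("rx"); }
    static void progress_tx(void)    { puts("tx"); }
    static void progress_async(void) { puts("async err"); }

    static void demo_progress(struct pollfd *fds, int nfds)
    {
        int i;

        if (poll(fds, nfds, 0) <= 0)
            return;

        for (i = 0; i < nfds; i++) {
            if (fds[i].revents & POLLERR)   /* errors first, as above */
                progress_async();
            if (fds[i].revents & POLLIN)
                progress_rx();
            if (fds[i].revents & POLLOUT)
                progress_tx();
        }
    }

Handling the error bit before the data bits mirrors the ordering used in the hunk above.
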
@@ -80,91 +99,87 @@ void tcpx_cq_progress(struct util_cq *cq)
 
 		ep = container_of(fid, struct tcpx_ep, util_ep.ep_fid.fid);
 		fastlock_acquire(&ep->lock);
-		tcpx_progress_rx(ep);
+		if (events[i].events & errevent)
+			tcpx_progress_async(ep);
+		if (events[i].events & inevent)
+			tcpx_progress_rx(ep);
+		if (events[i].events & outevent)
+			tcpx_progress_tx(ep);
 		fastlock_release(&ep->lock);
 	}
 unlock:
 	cq->cq_fastlock_release(&cq->ep_list_lock);
 }
 
-static void tcpx_buf_pools_destroy(struct tcpx_buf_pool *buf_pools)
-{
-	int i;
-
-	for (i = 0; i < TCPX_OP_CODE_MAX; i++)
-		ofi_bufpool_destroy(buf_pools[i].pool);
-}
-
 static int tcpx_cq_close(struct fid *fid)
 {
 	int ret;
-	struct tcpx_cq *tcpx_cq;
+	struct tcpx_cq *cq;
 
-	tcpx_cq = container_of(fid, struct tcpx_cq, util_cq.cq_fid.fid);
-	tcpx_buf_pools_destroy(tcpx_cq->buf_pools);
-	ret = ofi_cq_cleanup(&tcpx_cq->util_cq);
+	cq = container_of(fid, struct tcpx_cq, util_cq.cq_fid.fid);
+	ofi_bufpool_destroy(cq->xfer_pool);
+	ret = ofi_cq_cleanup(&cq->util_cq);
 	if (ret)
 		return ret;
 
-	free(tcpx_cq);
+	free(cq);
 	return 0;
 }
 
-struct tcpx_xfer_entry *tcpx_xfer_entry_alloc(struct tcpx_cq *tcpx_cq,
-					      enum tcpx_xfer_op_codes type)
-{
-	struct tcpx_xfer_entry *xfer_entry;
-
-	tcpx_cq->util_cq.cq_fastlock_acquire(&tcpx_cq->util_cq.cq_lock);
-	if (!ofi_cirque_isfull(tcpx_cq->util_cq.cirq))
-		xfer_entry = ofi_buf_alloc(tcpx_cq->buf_pools[type].pool);
-	else
-		xfer_entry = NULL;
-	tcpx_cq->util_cq.cq_fastlock_release(&tcpx_cq->util_cq.cq_lock);
-
-	return xfer_entry;
-}
-
-void tcpx_xfer_entry_release(struct tcpx_cq *tcpx_cq,
-			     struct tcpx_xfer_entry *xfer_entry)
+void tcpx_get_cq_info(struct tcpx_xfer_entry *entry, uint64_t *flags,
+		      uint64_t *data, uint64_t *tag)
 {
-	if (xfer_entry->ep->cur_rx_entry == xfer_entry)
-		xfer_entry->ep->cur_rx_entry = NULL;
-
-	xfer_entry->hdr.base_hdr.flags = 0;
-
-	xfer_entry->flags = 0;
-	xfer_entry->context = 0;
-	xfer_entry->rem_len = 0;
+	if (entry->hdr.base_hdr.flags & TCPX_REMOTE_CQ_DATA) {
+		*data = entry->hdr.cq_data_hdr.cq_data;
+
+		if ((entry->hdr.base_hdr.op == ofi_op_tagged) ||
+		    (entry->hdr.base_hdr.flags & TCPX_TAGGED)) {
+			*flags |= FI_REMOTE_CQ_DATA | FI_TAGGED;
+			*tag = entry->hdr.tag_data_hdr.tag;
+		} else {
+			*flags |= FI_REMOTE_CQ_DATA;
+			*tag = 0;
+		}
 
-	tcpx_cq->util_cq.cq_fastlock_acquire(&tcpx_cq->util_cq.cq_lock);
-	ofi_buf_free(xfer_entry);
-	tcpx_cq->util_cq.cq_fastlock_release(&tcpx_cq->util_cq.cq_lock);
+	} else if ((entry->hdr.base_hdr.op == ofi_op_tagged) ||
+		   (entry->hdr.base_hdr.flags & TCPX_TAGGED)) {
+		*flags |= FI_TAGGED;
+		*data = 0;
+		*tag = entry->hdr.tag_hdr.tag;
+	} else {
+		*data = 0;
+		*tag = 0;
+	}
 }
 
 void tcpx_cq_report_success(struct util_cq *cq,
 			    struct tcpx_xfer_entry *xfer_entry)
 {
-	uint64_t data = 0;
-	uint64_t flags = 0;
-	void *buf = NULL;
-	size_t len = 0;
-
-	flags = xfer_entry->flags;
+	uint64_t flags, data, tag;
+	size_t len;
 
-	if (!(flags & FI_COMPLETION))
+	if (!(xfer_entry->cq_flags & FI_COMPLETION) ||
+	    (xfer_entry->ctrl_flags & TCPX_INTERNAL_XFER))
 		return;
 
-	len = xfer_entry->hdr.base_hdr.size -
-	      xfer_entry->hdr.base_hdr.payload_off;
-
-	if (xfer_entry->hdr.base_hdr.flags & OFI_REMOTE_CQ_DATA) {
-		flags |= FI_REMOTE_CQ_DATA;
+	flags = xfer_entry->cq_flags & ~FI_COMPLETION;
+	if (flags & FI_RECV) {
+		len = xfer_entry->hdr.base_hdr.size -
+		      xfer_entry->hdr.base_hdr.hdr_size;
+		tcpx_get_cq_info(xfer_entry, &flags, &data, &tag);
+	} else if (flags & FI_REMOTE_CQ_DATA) {
+		assert(flags & FI_REMOTE_WRITE);
+		len = 0;
+		tag = 0;
 		data = xfer_entry->hdr.cq_data_hdr.cq_data;
+	} else {
+		len = 0;
+		data = 0;
+		tag = 0;
 	}
 
 	ofi_cq_write(cq, xfer_entry->context,
-		     flags, len, buf, data, 0);
+		     flags, len, NULL, data, tag);
 	if (cq->wait)
 		ofi_cq_signal(&cq->cq_fid);
 }
@@ -174,19 +189,26 @@ void tcpx_cq_report_error(struct util_cq *cq,
 			  int err)
 {
 	struct fi_cq_err_entry err_entry;
-	uint64_t data = 0;
 
-	if (xfer_entry->hdr.base_hdr.flags & OFI_REMOTE_CQ_DATA) {
-		xfer_entry->flags |= FI_REMOTE_CQ_DATA;
-		data = xfer_entry->hdr.cq_data_hdr.cq_data;
+	if (xfer_entry->ctrl_flags & TCPX_INTERNAL_XFER)
+		return;
+
+	err_entry.flags = xfer_entry->cq_flags & ~FI_COMPLETION;
+	if (err_entry.flags & FI_RECV) {
+		tcpx_get_cq_info(xfer_entry, &err_entry.flags, &err_entry.data,
+				 &err_entry.tag);
+	} else if (err_entry.flags & FI_REMOTE_CQ_DATA) {
+		assert(err_entry.flags & FI_REMOTE_WRITE);
+		err_entry.tag = 0;
+		err_entry.data = xfer_entry->hdr.cq_data_hdr.cq_data;
+	} else {
+		err_entry.data = 0;
+		err_entry.tag = 0;
 	}
 
 	err_entry.op_context = xfer_entry->context;
-	err_entry.flags = xfer_entry->flags;
 	err_entry.len = 0;
 	err_entry.buf = NULL;
-	err_entry.data = data;
-	err_entry.tag = 0;
 	err_entry.olen = 0;
 	err_entry.err = err;
 	err_entry.prov_errno = ofi_sockerr();
@@ -226,84 +248,23 @@ static struct fi_ops tcpx_cq_fi_ops = {
 	.ops_open = fi_no_ops_open,
 };
 
-static void tcpx_buf_pool_init(struct ofi_bufpool_region *region, void *buf)
-{
-	struct tcpx_buf_pool *pool = region->pool->attr.context;
-	struct tcpx_xfer_entry *xfer_entry = buf;
-
-	xfer_entry->hdr.base_hdr.version = TCPX_HDR_VERSION;
-	xfer_entry->hdr.base_hdr.op_data = pool->op_type;
-
-	switch (pool->op_type) {
-	case TCPX_OP_MSG_RECV:
-	case TCPX_OP_MSG_SEND:
-	case TCPX_OP_MSG_RESP:
-		xfer_entry->hdr.base_hdr.op = ofi_op_msg;
-		break;
-	case TCPX_OP_WRITE:
-	case TCPX_OP_REMOTE_WRITE:
-		xfer_entry->hdr.base_hdr.op = ofi_op_write;
-		break;
-	case TCPX_OP_READ_REQ:
-		xfer_entry->hdr.base_hdr.op = ofi_op_read_req;
-		break;
-	case TCPX_OP_READ_RSP:
-		xfer_entry->hdr.base_hdr.op = ofi_op_read_rsp;
-		break;
-	case TCPX_OP_REMOTE_READ:
-		break;
-	default:
-		assert(0);
-		break;
-	}
-}
-
-static int tcpx_buf_pools_create(struct tcpx_buf_pool *buf_pools)
-{
-	int i, ret;
-	struct ofi_bufpool_attr attr = {
-		.size = sizeof(struct tcpx_xfer_entry),
-		.alignment = 16,
-		.chunk_cnt = 1024,
-		.init_fn = tcpx_buf_pool_init,
-		.flags = OFI_BUFPOOL_HUGEPAGES,
-	};
-
-	for (i = 0; i < TCPX_OP_CODE_MAX; i++) {
-		buf_pools[i].op_type = i;
-
-		attr.context = &buf_pools[i];
-		ret = ofi_bufpool_create_attr(&attr, &buf_pools[i].pool);
-		if (ret) {
-			FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
-				"Unable to create buf pool\n");
-			goto err;
-		}
-	}
-	return 0;
-
-err:
-	while (i--)
-		ofi_bufpool_destroy(buf_pools[i].pool);
-
-	return -ret;
-}
-
 int tcpx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr,
 		 struct fid_cq **cq_fid, void *context)
 {
-	struct tcpx_cq *tcpx_cq;
+	struct tcpx_cq *cq;
 	struct fi_cq_attr cq_attr;
 	int ret;
 
-	tcpx_cq = calloc(1, sizeof(*tcpx_cq));
-	if (!tcpx_cq)
+	cq = calloc(1, sizeof(*cq));
+	if (!cq)
 		return -FI_ENOMEM;
 
 	if (!attr->size)
 		attr->size = TCPX_DEF_CQ_SIZE;
 
-	ret = tcpx_buf_pools_create(tcpx_cq->buf_pools);
+	ret = ofi_bufpool_create(&cq->xfer_pool,
+				 sizeof(struct tcpx_xfer_entry), 16, 0,
+				 1024, 0);
 	if (ret)
 		goto free_cq;
 
@@ -314,18 +275,18 @@ int tcpx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr,
 		attr = &cq_attr;
 	}
 
-	ret = ofi_cq_init(&tcpx_prov, domain, attr, &tcpx_cq->util_cq,
+	ret = ofi_cq_init(&tcpx_prov, domain, attr, &cq->util_cq,
 			  &tcpx_cq_progress, context);
 	if (ret)
 		goto destroy_pool;
 
-	*cq_fid = &tcpx_cq->util_cq.cq_fid;
+	*cq_fid = &cq->util_cq.cq_fid;
 	(*cq_fid)->fid.ops = &tcpx_cq_fi_ops;
 	return 0;
 
 destroy_pool:
-	tcpx_buf_pools_destroy(tcpx_cq->buf_pools);
+	ofi_bufpool_destroy(cq->xfer_pool);
 free_cq:
-	free(tcpx_cq);
+	free(cq);
 	return ret;
 }
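
tcpx_get_cq_info() above derives the reported completion flags, remote CQ data, and tag entirely from header bits, distinguishing three cases: remote CQ data present, tagged transfer without CQ data, and plain messages. A simplified decoder showing the same case split; the header layout and flag values here are invented for the example, and the real code additionally selects between two tag locations depending on whether CQ data is carried:

    /* Sketch only: DEMO_* flags and struct demo_hdr are invented. */
    #include <stdint.h>
    #include <stdio.h>

    #define DEMO_REMOTE_CQ_DATA (1u << 0)
    #define DEMO_TAGGED         (1u << 1)

    struct demo_hdr { uint32_t flags; uint64_t cq_data; uint64_t tag; };

    static void demo_cq_info(const struct demo_hdr *hdr,
                             uint64_t *data, uint64_t *tag)
    {
        *data = (hdr->flags & DEMO_REMOTE_CQ_DATA) ? hdr->cq_data : 0;
        *tag  = (hdr->flags & DEMO_TAGGED) ? hdr->tag : 0;
    }

    int main(void)
    {
        struct demo_hdr hdr = { DEMO_REMOTE_CQ_DATA | DEMO_TAGGED, 42, 7 };
        uint64_t data, tag;

        demo_cq_info(&hdr, &data, &tag);
        printf("data=%llu tag=%llu\n",
               (unsigned long long) data, (unsigned long long) tag);
        return 0;
    }
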
diff --git a/deps/libfabric/prov/tcp/src/tcpx_domain.c b/deps/libfabric/prov/tcp/src/tcpx_domain.c
index fbdefa9255fc06a45a823fddaaad8c224e5ec8af..773915afd28d11a2e4c20a3940d0830ec3701e40 100644
--- a/deps/libfabric/prov/tcp/src/tcpx_domain.c
+++ b/deps/libfabric/prov/tcp/src/tcpx_domain.c
@@ -34,7 +34,10 @@
 #include <string.h>
 
 #include "tcpx.h"
+
 extern struct fi_ops_msg tcpx_srx_msg_ops;
+extern struct fi_ops_tagged tcpx_srx_tag_ops;
+
 
 static int tcpx_srx_ctx_close(struct fid *fid)
 {
@@ -51,6 +54,12 @@ static int tcpx_srx_ctx_close(struct fid *fid)
 		ofi_buf_free(xfer_entry);
 	}
 
+	while (!slist_empty(&srx_ctx->tag_queue)) {
+		entry = slist_remove_head(&srx_ctx->tag_queue);
+		xfer_entry = container_of(entry, struct tcpx_xfer_entry, entry);
+		ofi_buf_free(xfer_entry);
+	}
+
 	ofi_bufpool_destroy(srx_ctx->buf_pool);
 	fastlock_destroy(&srx_ctx->lock);
 	free(srx_ctx);
@@ -66,7 +75,7 @@ static struct fi_ops fi_ops_srx_ctx = {
 };
 
 static int tcpx_srx_ctx(struct fid_domain *domain, struct fi_rx_attr *attr,
-		 struct fid_ep **rx_ep, void *context)
+			struct fid_ep **rx_ep, void *context)
 {
 	struct tcpx_rx_ctx *srx_ctx;
 	int ret = FI_SUCCESS;
@@ -80,21 +89,23 @@ static int tcpx_srx_ctx(struct fid_domain *domain, struct fi_rx_attr *attr,
 	srx_ctx->rx_fid.fid.ops = &fi_ops_srx_ctx;
 
 	srx_ctx->rx_fid.msg = &tcpx_srx_msg_ops;
+	srx_ctx->rx_fid.tagged = &tcpx_srx_tag_ops;
 	slist_init(&srx_ctx->rx_queue);
+	slist_init(&srx_ctx->tag_queue);
 
 	ret = fastlock_init(&srx_ctx->lock);
 	if (ret)
 		goto err1;
 
 	ret = ofi_bufpool_create(&srx_ctx->buf_pool,
-				 sizeof(struct tcpx_xfer_entry), 16, 0, 1024,
-				 OFI_BUFPOOL_HUGEPAGES);
+				 sizeof(struct tcpx_xfer_entry),
+				 16, attr->size, 1024, 0);
 	if (ret)
 		goto err2;
 
-	if (attr)
-		srx_ctx->op_flags = attr->op_flags;
-
+	srx_ctx->match_tag_rx = (attr->caps & FI_DIRECTED_RECV) ?
+				tcpx_match_tag_addr : tcpx_match_tag;
+	srx_ctx->op_flags = attr->op_flags;
 	*rx_ep = &srx_ctx->rx_fid;
 	return FI_SUCCESS;
 err2:
@@ -118,28 +129,53 @@ static struct fi_ops_domain tcpx_domain_ops = {
 	.query_collective = fi_no_query_collective,
 };
 
+static int tcpx_set_ops(struct fid *fid, const char *name,
+			uint64_t flags, void *ops, void *context)
+{
+	struct tcpx_domain *domain;
+
+	domain = container_of(fid, struct tcpx_domain,
+			      util_domain.domain_fid.fid);
+	if (flags)
+		return -FI_EBADFLAGS;
+
+	if (!strcasecmp(name, OFI_OPS_DYNAMIC_RBUF)) {
+		domain->dynamic_rbuf = ops;
+		if (domain->dynamic_rbuf->size != sizeof(*domain->dynamic_rbuf)) {
+			domain->dynamic_rbuf = NULL;
+			return -FI_ENOSYS;
+		}
+
+		return 0;
+	}
+
+	return -FI_ENOSYS;
+}
+
 static int tcpx_domain_close(fid_t fid)
 {
-	struct tcpx_domain *tcpx_domain;
+	struct tcpx_domain *domain;
 	int ret;
 
-	tcpx_domain = container_of(fid, struct tcpx_domain,
-				   util_domain.domain_fid.fid);
+	domain = container_of(fid, struct tcpx_domain,
+			      util_domain.domain_fid.fid);
 
-	ret = ofi_domain_close(&tcpx_domain->util_domain);
+	ret = ofi_domain_close(&domain->util_domain);
 	if (ret)
 		return ret;
 
-	free(tcpx_domain);
+	free(domain);
 	return FI_SUCCESS;
 }
 
 static struct fi_ops tcpx_domain_fi_ops = {
 	.size = sizeof(struct fi_ops),
 	.close = tcpx_domain_close,
-	.bind = fi_no_bind,
+	.bind = ofi_domain_bind,
 	.control = fi_no_control,
 	.ops_open = fi_no_ops_open,
+	.tostr = NULL,
+	.ops_set = tcpx_set_ops,
 };
 
 static struct fi_ops_mr tcpx_domain_fi_ops_mr = {
@@ -150,30 +186,30 @@ static struct fi_ops_mr tcpx_domain_fi_ops_mr = {
 };
 
 int tcpx_domain_open(struct fid_fabric *fabric, struct fi_info *info,
-		     struct fid_domain **domain, void *context)
+		     struct fid_domain **domain_fid, void *context)
 {
-	struct tcpx_domain *tcpx_domain;
+	struct tcpx_domain *domain;
 	int ret;
 
 	ret = ofi_prov_check_info(&tcpx_util_prov, fabric->api_version, info);
 	if (ret)
 		return ret;
 
-	tcpx_domain = calloc(1, sizeof(*tcpx_domain));
-	if (!tcpx_domain)
+	domain = calloc(1, sizeof(*domain));
+	if (!domain)
 		return -FI_ENOMEM;
 
-	ret = ofi_domain_init(fabric, info, &tcpx_domain->util_domain, context);
+	ret = ofi_domain_init(fabric, info, &domain->util_domain, context);
 	if (ret)
 		goto err;
 
-	*domain = &tcpx_domain->util_domain.domain_fid;
-	(*domain)->fid.ops = &tcpx_domain_fi_ops;
-	(*domain)->ops = &tcpx_domain_ops;
-	(*domain)->mr = &tcpx_domain_fi_ops_mr;
+	domain->util_domain.domain_fid.fid.ops = &tcpx_domain_fi_ops;
+	domain->util_domain.domain_fid.ops = &tcpx_domain_ops;
+	domain->util_domain.domain_fid.mr = &tcpx_domain_fi_ops_mr;
+	*domain_fid = &domain->util_domain.domain_fid;
 
 	return FI_SUCCESS;
 err:
-	free(tcpx_domain);
+	free(domain);
 	return ret;
 }
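
tcpx_set_ops() above accepts an imported ops vector only when its leading size field matches the size the provider was built against, the usual guard against mixing incompatible library versions. A sketch of that idiom with a hypothetical ops structure:

    /* Sketch only: struct demo_ops and the ops name are invented. */
    #include <stddef.h>
    #include <string.h>

    struct demo_ops {
        size_t size;                 /* must be first: version check */
        int (*get_rbuf)(void *ctx);
    };

    static int demo_set_ops(const char *name, void *ops,
                            struct demo_ops **dst)
    {
        if (strcmp(name, "demo_dynamic_rbuf") != 0)
            return -1;               /* unknown ops set */

        *dst = ops;
        if ((*dst)->size != sizeof(**dst)) {
            *dst = NULL;             /* built against another version */
            return -1;               /* maps to -FI_ENOSYS above */
        }
        return 0;
    }
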
diff --git a/deps/libfabric/prov/tcp/src/tcpx_ep.c b/deps/libfabric/prov/tcp/src/tcpx_ep.c
index 1076993dbb6737b445884667105163330599b82a..1120be2828c4421026d96fa3243065f4e68bdc1b 100644
--- a/deps/libfabric/prov/tcp/src/tcpx_ep.c
+++ b/deps/libfabric/prov/tcp/src/tcpx_ep.c
@@ -41,6 +41,8 @@
 
 extern struct fi_ops_rma tcpx_rma_ops;
 extern struct fi_ops_msg tcpx_msg_ops;
+extern struct fi_ops_tagged tcpx_tagged_ops;
+
 
 void tcpx_hdr_none(struct tcpx_base_hdr *hdr)
 {
@@ -49,27 +51,63 @@ void tcpx_hdr_none(struct tcpx_base_hdr *hdr)
 
 void tcpx_hdr_bswap(struct tcpx_base_hdr *hdr)
 {
-	struct ofi_rma_iov *rma_iov;
-	uint8_t *ptr = (uint8_t *)hdr + sizeof(*hdr);
-	int i;
+	uint64_t *cur;
+	int i, cnt;
 
 	hdr->flags = ntohs(hdr->flags);
 	hdr->size = ntohll(hdr->size);
 
-	if (hdr->flags & OFI_REMOTE_CQ_DATA) {
-		*((uint64_t *)ptr) = ntohll(*((uint64_t *) ptr));
-		ptr += sizeof(uint64_t);
-	}
+	cnt = (hdr->hdr_size - sizeof(*hdr)) >> 3;
+	cur = (uint64_t *) (hdr + 1);
+	for (i = 0; i < cnt; i++)
+		cur[i] = ntohll(cur[i]);
+}
+
+#ifdef MSG_ZEROCOPY
+static void tcpx_set_zerocopy(SOCKET sock)
+{
+	int val = 1;
+
+	if (tcpx_zerocopy_size == SIZE_MAX)
+		return;
 
-	rma_iov = (struct ofi_rma_iov *)ptr;
-	for ( i = 0; i < hdr->rma_iov_cnt; i++) {
-		rma_iov[i].addr = ntohll(rma_iov[i].addr);
-		rma_iov[i].len = ntohll(rma_iov[i].len);
-		rma_iov[i].key = ntohll(rma_iov[i].key);
+	(void) setsockopt(sock, SOL_SOCKET, SO_ZEROCOPY, &val, sizeof(val));
+}
+
+static void tcpx_config_bsock(struct ofi_bsock *bsock)
+{
+	int ret, val = 0;
+	socklen_t len = sizeof(val);
+
+	if (tcpx_zerocopy_size == SIZE_MAX)
+		return;
+
+	ret = getsockopt(bsock->sock, SOL_SOCKET, SO_ZEROCOPY, &val, &len);
+	if (!ret && val) {
+		bsock->zerocopy_size = tcpx_zerocopy_size;
+		FI_INFO(&tcpx_prov, FI_LOG_EP_CTRL,
+			"zero copy enabled for transfers > %zu\n",
+			bsock->zerocopy_size);
 	}
 }
+#else
+#define tcpx_set_zerocopy(sock)
+#define tcpx_config_bsock(bsock)
+#endif
+
+#ifdef IP_BIND_ADDRESS_NO_PORT
+static void tcpx_set_no_port(SOCKET sock)
+{
+	int val = 1;
+
+	(void) setsockopt(sock, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT,
+			  &val, sizeof(val));
+}
+#else
+#define tcpx_set_no_port(sock)
+#endif
 
-static int tcpx_setup_socket(SOCKET sock)
+static int tcpx_setup_socket(SOCKET sock, struct fi_info *info)
 {
 	int ret, optval = 1;
 
@@ -80,154 +118,215 @@ static int tcpx_setup_socket(SOCKET sock)
 		return -ofi_sockerr();
 	}
 
-	ret = setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, (char *) &optval,
-			 sizeof(optval));
+	/* Do not enable nodelay for the bulk-data traffic class, unless
+	 * nodelay has been explicitly requested.
+	 */
+	if (tcpx_nodelay && !((tcpx_nodelay < 0) &&
+	    (info->fabric_attr->api_version >= FI_VERSION(1, 9) &&
+	    info->tx_attr->tclass == FI_TC_BULK_DATA))) {
+
+		ret = setsockopt(sock, IPPROTO_TCP, TCP_NODELAY,
+				 (char *) &optval, sizeof(optval));
+		if (ret) {
+			FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
+				"setsockopt nodelay failed\n");
+			return -ofi_sockerr();
+		}
+	}
+
+	ret = fi_fd_nonblock(sock);
 	if (ret) {
-		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,"setsockopt nodelay failed\n");
-		return -ofi_sockerr();
+		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
+			"failed to set socket to nonblocking\n");
+		return ret;
 	}
 
-	return ret;
+	return 0;
 }
 
-static int tcpx_ep_connect(struct fid_ep *ep, const void *addr,
+static int tcpx_ep_connect(struct fid_ep *ep_fid, const void *addr,
 			   const void *param, size_t paramlen)
 {
-	struct tcpx_ep *tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid);
+	struct tcpx_ep *ep;
 	struct tcpx_cm_context *cm_ctx;
 	int ret;
 
-	if (!addr || !tcpx_ep->sock || paramlen > TCPX_MAX_CM_DATA_SIZE ||
-	    tcpx_ep->state != TCPX_IDLE)
+	ep = container_of(ep_fid, struct tcpx_ep, util_ep.ep_fid);
+	if (!addr || (ep->bsock.sock == INVALID_SOCKET) ||
+	    (paramlen > TCPX_MAX_CM_DATA_SIZE) || (ep->state != TCPX_IDLE))
 		return -FI_EINVAL;
 
-	cm_ctx = calloc(1, sizeof(*cm_ctx));
+	cm_ctx = tcpx_alloc_cm_ctx(&ep_fid->fid, TCPX_CM_CONNECTING);
 	if (!cm_ctx) {
 		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
 			"cannot allocate memory \n");
 		return -FI_ENOMEM;
 	}
 
-	tcpx_ep->state = TCPX_CONNECTING;
-	ret = connect(tcpx_ep->sock, (struct sockaddr *) addr,
+	ep->state = TCPX_CONNECTING;
+	ret = connect(ep->bsock.sock, (struct sockaddr *) addr,
 		      (socklen_t) ofi_sizeofaddr(addr));
-	if (ret && ofi_sockerr() != FI_EINPROGRESS) {
-		tcpx_ep->state = TCPX_IDLE;
+	if (ret && !OFI_SOCK_TRY_CONN_AGAIN(ofi_sockerr())) {
+		ep->state = TCPX_IDLE;
 		ret =  -ofi_sockerr();
 		goto free;
 	}
 
-	cm_ctx->fid = &tcpx_ep->util_ep.ep_fid.fid;
-	cm_ctx->type = CLIENT_SEND_CONNREQ;
-
 	if (paramlen) {
 		cm_ctx->cm_data_sz = paramlen;
-		memcpy(cm_ctx->cm_data, param, paramlen);
+		memcpy(cm_ctx->msg.data, param, paramlen);
 	}
 
-	ret = ofi_wait_add_fd(tcpx_ep->util_ep.eq->wait, tcpx_ep->sock,
-			      POLLOUT, tcpx_eq_wait_try_func, NULL,cm_ctx);
+	ret = ofi_wait_add_fd(ep->util_ep.eq->wait, ep->bsock.sock,
+			      POLLOUT, tcpx_eq_wait_try_func, NULL, cm_ctx);
 	if (ret)
 		goto disable;
 
 	return 0;
 
 disable:
-	tcpx_ep_disable(tcpx_ep, -ret);
+	fastlock_acquire(&ep->lock);
+	tcpx_ep_disable(ep, -ret);
+	fastlock_release(&ep->lock);
 free:
-	free(cm_ctx);
+	tcpx_free_cm_ctx(cm_ctx);
 	return ret;
 }
 
-static int tcpx_ep_accept(struct fid_ep *ep, const void *param, size_t paramlen)
+static int
+tcpx_ep_accept(struct fid_ep *ep_fid, const void *param, size_t paramlen)
 {
-	struct tcpx_ep *tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid);
+	struct tcpx_ep *ep;
 	struct tcpx_cm_context *cm_ctx;
+	struct tcpx_conn_handle *handle;
 	int ret;
 
-	if (tcpx_ep->sock == INVALID_SOCKET || tcpx_ep->state != TCPX_RCVD_REQ)
+	ep = container_of(ep_fid, struct tcpx_ep, util_ep.ep_fid);
+	handle = ep->handle;
+	if (ep->bsock.sock == INVALID_SOCKET || ep->state != TCPX_RCVD_REQ ||
+	    !handle || (handle->fid.fclass != FI_CLASS_CONNREQ))
 		return -FI_EINVAL;
 
-	cm_ctx = calloc(1, sizeof(*cm_ctx));
+	ep->handle = NULL;
+	cm_ctx = tcpx_alloc_cm_ctx(&ep->util_ep.ep_fid.fid, TCPX_CM_RESP_READY);
 	if (!cm_ctx) {
 		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
 			"cannot allocate memory \n");
 		return -FI_ENOMEM;
 	}
 
-	tcpx_ep->state = TCPX_ACCEPTING;
-	cm_ctx->fid = &tcpx_ep->util_ep.ep_fid.fid;
-	cm_ctx->type = SERVER_SEND_CM_ACCEPT;
+	ep->state = TCPX_ACCEPTING;
+
 	if (paramlen) {
 		cm_ctx->cm_data_sz = paramlen;
-		memcpy(cm_ctx->cm_data, param, paramlen);
+		memcpy(cm_ctx->msg.data, param, paramlen);
 	}
 
-	ret = ofi_wait_add_fd(tcpx_ep->util_ep.eq->wait, tcpx_ep->sock,
+	ret = ofi_wait_add_fd(ep->util_ep.eq->wait, ep->bsock.sock,
 			      POLLOUT, tcpx_eq_wait_try_func, NULL, cm_ctx);
 	if (ret)
 		goto free;
 
+	free(handle);
 	return 0;
 
 free:
-	tcpx_ep->state = TCPX_RCVD_REQ;
-	free(cm_ctx);
+	ep->state = TCPX_RCVD_REQ;
+	tcpx_free_cm_ctx(cm_ctx);
 	return ret;
 }
 
-static void tcpx_ep_flush_pending_xfers(struct tcpx_ep *ep)
+/* must hold ep->lock */
+static void tcpx_ep_flush_queue(struct slist *queue,
+				struct tcpx_cq *cq)
+{
+	struct tcpx_xfer_entry *xfer_entry;
+
+	while (!slist_empty(queue)) {
+		xfer_entry = container_of(queue->head, struct tcpx_xfer_entry,
+					  entry);
+		slist_remove_head(queue);
+		tcpx_cq_report_error(&cq->util_cq, xfer_entry, FI_ECANCELED);
+		tcpx_free_xfer(cq, xfer_entry);
+	}
+}
+
+static void tcpx_ep_flush_all_queues(struct tcpx_ep *ep)
 {
-	struct slist_entry *entry;
-	struct tcpx_xfer_entry *tx_entry;
 	struct tcpx_cq *cq;
 
-	while (!slist_empty(&ep->tx_rsp_pend_queue)) {
-		entry = slist_remove_head(&ep->tx_rsp_pend_queue);
-		tx_entry = container_of(entry, struct tcpx_xfer_entry, entry);
-		tcpx_cq_report_error(ep->util_ep.tx_cq, tx_entry, FI_ENOTCONN);
+	assert(fastlock_held(&ep->lock));
+	cq = container_of(ep->util_ep.tx_cq, struct tcpx_cq, util_cq);
+	if (ep->cur_tx.entry) {
+		ep->hdr_bswap(&ep->cur_tx.entry->hdr.base_hdr);
+		tcpx_cq_report_error(&cq->util_cq, ep->cur_tx.entry,
+				     FI_ECANCELED);
+		tcpx_free_xfer(cq, ep->cur_tx.entry);
+		ep->cur_tx.entry = NULL;
+	}
 
-		cq = container_of(ep->util_ep.tx_cq, struct tcpx_cq, util_cq);
-		tcpx_xfer_entry_release(cq, tx_entry);
+	tcpx_ep_flush_queue(&ep->tx_queue, cq);
+	tcpx_ep_flush_queue(&ep->priority_queue, cq);
+	tcpx_ep_flush_queue(&ep->rma_read_queue, cq);
+	tcpx_ep_flush_queue(&ep->need_ack_queue, cq);
+	tcpx_ep_flush_queue(&ep->async_queue, cq);
+
+	cq = container_of(ep->util_ep.rx_cq, struct tcpx_cq, util_cq);
+	if (ep->cur_rx.entry) {
+		tcpx_cq_report_error(&cq->util_cq, ep->cur_rx.entry,
+				     FI_ECANCELED);
+		tcpx_free_xfer(cq, ep->cur_rx.entry);
+		tcpx_reset_rx(ep);
 	}
+	tcpx_ep_flush_queue(&ep->rx_queue, cq);
+	ofi_bsock_discard(&ep->bsock);
 }
 
-/* must hold ep->lock */
 void tcpx_ep_disable(struct tcpx_ep *ep, int cm_err)
 {
 	struct util_wait_fd *wait;
 	struct fi_eq_cm_entry cm_entry = {0};
 	struct fi_eq_err_entry err_entry = {0};
+	int ret;
 
+	assert(fastlock_held(&ep->lock));
 	switch (ep->state) {
 	case TCPX_RCVD_REQ:
 		break;
 	case TCPX_CONNECTED:
+		/* We need to remove the socket from the CQ's fdset,
+		 * or the CQ will be left in a 'signaled' state.  This
+		 * can result in threads spinning on the CQ's fdset.
+		 */
 		if (ep->util_ep.tx_cq) {
 			wait = container_of(ep->util_ep.tx_cq->wait,
 					    struct util_wait_fd, util_wait);
-			ofi_wait_fdset_del(wait, ep->sock);
+			ofi_wait_fdset_del(wait, ep->bsock.sock);
 		}
 
 		if (ep->util_ep.rx_cq) {
 			wait = container_of(ep->util_ep.rx_cq->wait,
 					    struct util_wait_fd, util_wait);
-			ofi_wait_fdset_del(wait, ep->sock);
+			ofi_wait_fdset_del(wait, ep->bsock.sock);
 		}
-
-		tcpx_ep_flush_pending_xfers(ep);
 		/* fall through */
 	case TCPX_ACCEPTING:
 	case TCPX_CONNECTING:
 		wait = container_of(ep->util_ep.eq->wait,
 				    struct util_wait_fd, util_wait);
-		ofi_wait_fdset_del(wait, ep->sock);
+		ofi_wait_fdset_del(wait, ep->bsock.sock);
 		break;
 
 	default:
 		return;
 	}
 
+	ret = ofi_shutdown(ep->bsock.sock, SHUT_RDWR);
+	if (ret && ofi_sockerr() != ENOTCONN)
+		FI_WARN(&tcpx_prov, FI_LOG_EP_DATA, "shutdown failed\n");
+
+	tcpx_ep_flush_all_queues(ep);
+
 	if (cm_err) {
 		err_entry.fid = &ep->util_ep.ep_fid.fid;
 		err_entry.context = ep->util_ep.ep_fid.fid.context;
@@ -243,21 +342,16 @@ void tcpx_ep_disable(struct tcpx_ep *ep, int cm_err)
 	ep->state = TCPX_DISCONNECTED;
 }
 
-static int tcpx_ep_shutdown(struct fid_ep *ep, uint64_t flags)
+static int tcpx_ep_shutdown(struct fid_ep *ep_fid, uint64_t flags)
 {
-	struct tcpx_ep *tcpx_ep;
-	int ret;
-
-	tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid);
+	struct tcpx_ep *ep;
 
-	ret = ofi_shutdown(tcpx_ep->sock, SHUT_RDWR);
-	if (ret && ofi_sockerr() != ENOTCONN) {
-		FI_WARN(&tcpx_prov, FI_LOG_EP_DATA, "ep shutdown unsuccessful\n");
-	}
+	ep = container_of(ep_fid, struct tcpx_ep, util_ep.ep_fid);
+	(void) ofi_bsock_flush(&ep->bsock);
 
-	fastlock_acquire(&tcpx_ep->lock);
-	tcpx_ep_disable(tcpx_ep, 0);
-	fastlock_release(&tcpx_ep->lock);
+	fastlock_acquire(&ep->lock);
+	tcpx_ep_disable(ep, 0);
+	fastlock_release(&ep->lock);
 
 	return FI_SUCCESS;
 }
@@ -315,10 +409,18 @@ static int tcpx_pep_sock_create(struct tcpx_pep *pep)
 			strerror(ofi_sockerr()));
 		return -FI_EIO;
 	}
-	ret = tcpx_setup_socket(pep->sock);
+	ret = tcpx_setup_socket(pep->sock, pep->info);
+	if (ret)
+		goto err;
+
+	tcpx_set_zerocopy(pep->sock);
+	ret = fi_fd_nonblock(pep->sock);
 	if (ret) {
+		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
+			"failed to set listener socket to nonblocking\n");
 		goto err;
 	}
+
 	if (ofi_addr_get_port(pep->info->src_addr) != 0 || port_range.high == 0) {
 		ret = bind(pep->sock, pep->info->src_addr,
 			  (socklen_t) pep->info->src_addrlen);
@@ -344,26 +446,26 @@ err:
 
 static int tcpx_ep_getname(fid_t fid, void *addr, size_t *addrlen)
 {
-	struct tcpx_ep *tcpx_ep;
+	struct tcpx_ep *ep;
 	size_t addrlen_in = *addrlen;
 	int ret;
 
-	tcpx_ep = container_of(fid, struct tcpx_ep, util_ep.ep_fid);
-	ret = ofi_getsockname(tcpx_ep->sock, addr, (socklen_t *)addrlen);
+	ep = container_of(fid, struct tcpx_ep, util_ep.ep_fid);
+	ret = ofi_getsockname(ep->bsock.sock, addr, (socklen_t *) addrlen);
 	if (ret)
 		return -ofi_sockerr();
 
 	return (addrlen_in < *addrlen)? -FI_ETOOSMALL: FI_SUCCESS;
 }
 
-static int tcpx_ep_getpeer(struct fid_ep *ep, void *addr, size_t *addrlen)
+static int tcpx_ep_getpeer(struct fid_ep *ep_fid, void *addr, size_t *addrlen)
 {
-	struct tcpx_ep *tcpx_ep;
+	struct tcpx_ep *ep;
 	size_t addrlen_in = *addrlen;
 	int ret;
 
-	tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid);
-	ret = ofi_getpeername(tcpx_ep->sock, addr, (socklen_t *)addrlen);
+	ep = container_of(ep_fid, struct tcpx_ep, util_ep.ep_fid);
+	ret = ofi_getpeername(ep->bsock.sock, addr, (socklen_t *) addrlen);
 	if (ret)
 		return -ofi_sockerr();
 
@@ -383,48 +485,64 @@ static struct fi_ops_cm tcpx_cm_ops = {
 	.join = fi_no_join,
 };
 
-void tcpx_rx_msg_release(struct tcpx_xfer_entry *rx_entry)
+void tcpx_reset_rx(struct tcpx_ep *ep)
 {
-	struct tcpx_cq *tcpx_cq;
-
-	assert(rx_entry->hdr.base_hdr.op_data == TCPX_OP_MSG_RECV);
-
-	if (rx_entry->ep->srx_ctx) {
-		tcpx_srx_xfer_release(rx_entry->ep->srx_ctx, rx_entry);
-	} else {
-		tcpx_cq = container_of(rx_entry->ep->util_ep.rx_cq,
-				       struct tcpx_cq, util_cq);
-		tcpx_xfer_entry_release(tcpx_cq, rx_entry);
-	}
+	ep->cur_rx.handler = NULL;
+	ep->cur_rx.entry = NULL;
+	ep->cur_rx.hdr_done = 0;
+	ep->cur_rx.hdr_len = sizeof(ep->cur_rx.hdr.base_hdr);
+	OFI_DBG_SET(ep->cur_rx.hdr.base_hdr.version, 0);
 }
 
-static void tcpx_ep_release_queue(struct slist *queue,
-				  struct tcpx_cq *tcpx_cq)
+static void tcpx_ep_cancel_rx(struct tcpx_ep *ep, void *context)
 {
+	struct slist_entry *cur, *prev;
 	struct tcpx_xfer_entry *xfer_entry;
+	struct tcpx_cq *cq;
 
-	while (!slist_empty(queue)) {
-		xfer_entry = container_of(queue->head, struct tcpx_xfer_entry,
-					  entry);
-		slist_remove_head(queue);
-		tcpx_cq_report_error(&tcpx_cq->util_cq, xfer_entry, FI_ECANCELED);
-		tcpx_xfer_entry_release(tcpx_cq, xfer_entry);
+	assert(fastlock_held(&ep->lock));
+
+	/* To cancel an active receive, we would need to flush the socket of
+	 * all data associated with that message.  Since some of that data
+	 * may not have arrived yet, this would require additional state
+	 * tracking and complexity.  Fail the cancel in this case, since
+	 * the receive is already in progress anyway.
+	 */
+	slist_foreach(&ep->rx_queue, cur, prev) {
+		xfer_entry = container_of(cur, struct tcpx_xfer_entry, entry);
+		if (xfer_entry->context == context) {
+			if (ep->cur_rx.entry != xfer_entry)
+				goto found;
+			break;
+		}
 	}
+
+	return;
+
+found:
+	cq = container_of(ep->util_ep.rx_cq, struct tcpx_cq, util_cq);
+
+	slist_remove(&ep->rx_queue, cur, prev);
+	ep->rx_avail++;
+	tcpx_cq_report_error(&cq->util_cq, xfer_entry, FI_ECANCELED);
+	tcpx_free_xfer(cq, xfer_entry);
 }
 
-static void tcpx_ep_tx_rx_queues_release(struct tcpx_ep *ep)
+/* We currently only support canceling receives, which is the common case.
+ * Canceling an operation from the other queues is not trivial,
+ * especially if the operation has already been initiated.
+ */
+static ssize_t tcpx_ep_cancel(fid_t fid, void *context)
 {
-	struct tcpx_cq *tcpx_cq;
+	struct tcpx_ep *ep;
 
-	fastlock_acquire(&ep->lock);
-	tcpx_cq = container_of(ep->util_ep.tx_cq, struct tcpx_cq, util_cq);
-	tcpx_ep_release_queue(&ep->tx_queue, tcpx_cq);
-	tcpx_ep_release_queue(&ep->rma_read_queue, tcpx_cq);
-	tcpx_ep_release_queue(&ep->tx_rsp_pend_queue, tcpx_cq);
+	ep = container_of(fid, struct tcpx_ep, util_ep.ep_fid.fid);
 
-	tcpx_cq = container_of(ep->util_ep.rx_cq, struct tcpx_cq, util_cq);
-	tcpx_ep_release_queue(&ep->rx_queue, tcpx_cq);
+	fastlock_acquire(&ep->lock);
+	tcpx_ep_cancel_rx(ep, context);
 	fastlock_release(&ep->lock);
+
+	return 0;
 }
 
 static int tcpx_ep_close(struct fid *fid)
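
Cancellation, as the comments above note, is limited to queued receives: the rx queue is walked for a matching context and the entry is unlinked and completed with FI_ECANCELED, unless it is the receive currently being filled from the socket. A stand-alone sketch of that walk, with a plain singly linked list in place of the provider's slist:

    /* Sketch only: struct demo_xfer replaces the provider's entry type. */
    #include <stddef.h>

    struct demo_xfer {
        struct demo_xfer *next;
        void *context;
    };

    /* Returns the removed entry, or NULL if absent or actively receiving. */
    static struct demo_xfer *
    demo_cancel_rx(struct demo_xfer **head, void *context,
                   const struct demo_xfer *cur_rx)
    {
        struct demo_xfer **prev, *cur;

        for (prev = head; (cur = *prev) != NULL; prev = &cur->next) {
            if (cur->context != context)
                continue;
            if (cur == cur_rx)      /* in progress: fail the cancel */
                return NULL;
            *prev = cur->next;      /* unlink; caller reports FI_ECANCELED */
            return cur;
        }
        return NULL;
    }
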
@@ -441,24 +559,32 @@ static int tcpx_ep_close(struct fid *fid)
 		fastlock_acquire(&eq->close_lock);
 
 	if (ep->util_ep.rx_cq)
-		ofi_wait_del_fd(ep->util_ep.rx_cq->wait, ep->sock);
+		ofi_wait_del_fd(ep->util_ep.rx_cq->wait, ep->bsock.sock);
 
 	if (ep->util_ep.tx_cq)
-		ofi_wait_del_fd(ep->util_ep.tx_cq->wait, ep->sock);
+		ofi_wait_del_fd(ep->util_ep.tx_cq->wait, ep->bsock.sock);
 
 	if (ep->util_ep.eq && ep->util_ep.eq->wait)
-		ofi_wait_del_fd(ep->util_ep.eq->wait, ep->sock);
+		ofi_wait_del_fd(ep->util_ep.eq->wait, ep->bsock.sock);
 
 	if (eq)
 		fastlock_release(&eq->close_lock);
 
-	tcpx_ep_tx_rx_queues_release(ep);
+	if (ep->fid && ep->fid->fclass == TCPX_CLASS_CM)
+		tcpx_free_cm_ctx(ep->cm_ctx);
+
+	/* Lock not technically needed, since we're freeing the EP.  But it's
+	 * harmless to acquire and silences static code analysis tools.
+	 */
+	fastlock_acquire(&ep->lock);
+	tcpx_ep_flush_all_queues(ep);
+	fastlock_release(&ep->lock);
 
 	if (eq) {
 		ofi_eq_remove_fid_events(ep->util_ep.eq,
 					 &ep->util_ep.ep_fid.fid);
 	}
-	ofi_close_socket(ep->sock);
+	ofi_close_socket(ep->bsock.sock);
 	ofi_endpoint_close(&ep->util_ep);
 	fastlock_destroy(&ep->lock);
 
@@ -473,10 +599,15 @@ static int tcpx_ep_ctrl(struct fid *fid, int command, void *arg)
 	ep = container_of(fid, struct tcpx_ep, util_ep.ep_fid.fid);
 	switch (command) {
 	case FI_ENABLE:
-		if (!ep->util_ep.rx_cq || !ep->util_ep.tx_cq)
+		if ((ofi_needs_rx(ep->util_ep.caps) && !ep->util_ep.rx_cq) ||
+		    (ofi_needs_tx(ep->util_ep.caps) && !ep->util_ep.tx_cq)) {
+			FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
+				"missing needed CQ binding\n");
 			return -FI_ENOCQ;
+		}
 		break;
 	default:
+		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, "unsupported command\n");
 		return -FI_ENOSYS;
 	}
 	return FI_SUCCESS;
@@ -484,18 +615,18 @@ static int tcpx_ep_ctrl(struct fid *fid, int command, void *arg)
 
 static int tcpx_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
 {
-	struct tcpx_ep *tcpx_ep;
+	struct tcpx_ep *ep;
 	struct tcpx_rx_ctx *rx_ctx;
 
-	tcpx_ep = container_of(fid, struct tcpx_ep, util_ep.ep_fid.fid);
+	ep = container_of(fid, struct tcpx_ep, util_ep.ep_fid.fid);
 
 	if (bfid->fclass == FI_CLASS_SRX_CTX) {
 		rx_ctx = container_of(bfid, struct tcpx_rx_ctx, rx_fid.fid);
-		tcpx_ep->srx_ctx = rx_ctx;
+		ep->srx_ctx = rx_ctx;
 		return FI_SUCCESS;
 	}
 
-	return ofi_ep_bind(&tcpx_ep->util_ep, bfid, flags);
+	return ofi_ep_bind(&ep->util_ep, bfid, flags);
 }
 
 static struct fi_ops tcpx_ep_fi_ops = {
@@ -561,7 +692,7 @@ int tcpx_ep_setopt(fid_t fid, int level, int optname,
 
 static struct fi_ops_ep tcpx_ep_ops = {
 	.size = sizeof(struct fi_ops_ep),
-	.cancel = fi_no_cancel,
+	.cancel = tcpx_ep_cancel,
 	.getopt = tcpx_ep_getopt,
 	.setopt = tcpx_ep_setopt,
 	.tx_ctx = fi_no_tx_ctx,
@@ -587,36 +718,60 @@ int tcpx_endpoint(struct fid_domain *domain, struct fi_info *info,
 	if (ret)
 		goto err1;
 
+	ofi_bsock_init(&ep->bsock, tcpx_staging_sbuf_size,
+		       tcpx_prefetch_rbuf_size);
 	if (info->handle) {
 		if (((fid_t) info->handle)->fclass == FI_CLASS_PEP) {
 			pep = container_of(info->handle, struct tcpx_pep,
 					   util_pep.pep_fid.fid);
 
-			ep->sock = pep->sock;
+			ep->bsock.sock = pep->sock;
 			pep->sock = INVALID_SOCKET;
 		} else {
 			ep->state = TCPX_RCVD_REQ;
 			handle = container_of(info->handle,
-					      struct tcpx_conn_handle, handle);
-			ep->sock = handle->sock;
+					      struct tcpx_conn_handle, fid);
+			/* EP now owns socket */
+			ep->bsock.sock = handle->sock;
+			handle->sock = INVALID_SOCKET;
 			ep->hdr_bswap = handle->endian_match ?
 					tcpx_hdr_none : tcpx_hdr_bswap;
-			free(handle);
+			/* Save the handle; we free it only if the user calls
+			 * accept.  Otherwise the user calls reject, which
+			 * frees it.
+			 */
+			ep->handle = handle;
 
-			ret = tcpx_setup_socket(ep->sock);
+			ret = tcpx_setup_socket(ep->bsock.sock, info);
 			if (ret)
 				goto err3;
 		}
 	} else {
-		ep->sock = ofi_socket(ofi_get_sa_family(info), SOCK_STREAM, 0);
-		if (ep->sock == INVALID_SOCKET) {
+		ep->bsock.sock = ofi_socket(ofi_get_sa_family(info), SOCK_STREAM, 0);
+		if (ep->bsock.sock == INVALID_SOCKET) {
 			ret = -ofi_sockerr();
 			goto err2;
 		}
 
-		ret = tcpx_setup_socket(ep->sock);
+		ret = tcpx_setup_socket(ep->bsock.sock, info);
 		if (ret)
 			goto err3;
+
+		tcpx_set_zerocopy(ep->bsock.sock);
+
+		if (info->src_addr && (!ofi_is_any_addr(info->src_addr) ||
+					ofi_addr_get_port(info->src_addr))) {
+
+			if (!ofi_addr_get_port(info->src_addr))
+				tcpx_set_no_port(ep->bsock.sock);
+
+			ret = bind(ep->bsock.sock, info->src_addr,
+				(socklen_t) info->src_addrlen);
+			if (ret) {
+				FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, "bind failed\n");
+				ret = -ofi_sockerr();
+				goto err3;
+			}
+		}
 	}
 
 	ret = fastlock_init(&ep->lock);
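
For actively created endpoints, the hunk above puts the socket into nonblocking mode, optionally enables TCP_NODELAY, and binds any requested source address, setting IP_BIND_ADDRESS_NO_PORT when no explicit port was requested so that ephemeral port selection is deferred until connect(2). A compact sketch of that preparation sequence, with error handling reduced to the minimum:

    /* Sketch only: assumes IPv4 and unconditional TCP_NODELAY. */
    #include <fcntl.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>

    static int demo_prepare_socket(int fd, const struct sockaddr_in *src)
    {
        int one = 1;
        int flags = fcntl(fd, F_GETFL);

        if (flags < 0 || fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0)
            return -1;
        if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)) < 0)
            return -1;

    #ifdef IP_BIND_ADDRESS_NO_PORT
        /* defer port selection to connect(2) if no port was requested */
        if (src && src->sin_port == 0)
            (void) setsockopt(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT,
                              &one, sizeof(one));
    #endif
        return src ? bind(fd, (const struct sockaddr *) src,
                          sizeof(*src)) : 0;
    }
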
@@ -625,12 +780,18 @@ int tcpx_endpoint(struct fid_domain *domain, struct fi_info *info,
 
 	slist_init(&ep->rx_queue);
 	slist_init(&ep->tx_queue);
+	slist_init(&ep->priority_queue);
 	slist_init(&ep->rma_read_queue);
-	slist_init(&ep->tx_rsp_pend_queue);
+	slist_init(&ep->need_ack_queue);
+	slist_init(&ep->async_queue);
 
-	ep->cur_rx_msg.done_len = 0;
-	ep->cur_rx_msg.hdr_len = sizeof(ep->cur_rx_msg.hdr.base_hdr);
+	if (info->ep_attr->rx_ctx_cnt != FI_SHARED_CONTEXT)
+		ep->rx_avail = info->rx_attr->size;
+
+	ep->cur_rx.hdr_done = 0;
+	ep->cur_rx.hdr_len = sizeof(ep->cur_rx.hdr.base_hdr);
 	ep->min_multi_recv_size = TCPX_MIN_MULTI_RECV;
+	tcpx_config_bsock(&ep->bsock);
 
 	*ep_fid = &ep->util_ep.ep_fid;
 	(*ep_fid)->fid.ops = &tcpx_ep_fi_ops;
@@ -638,15 +799,16 @@ int tcpx_endpoint(struct fid_domain *domain, struct fi_info *info,
 	(*ep_fid)->cm = &tcpx_cm_ops;
 	(*ep_fid)->msg = &tcpx_msg_ops;
 	(*ep_fid)->rma = &tcpx_rma_ops;
+	(*ep_fid)->tagged = &tcpx_tagged_ops;
 
 	ep->start_op[ofi_op_msg] = tcpx_op_msg;
-	ep->start_op[ofi_op_tagged] = tcpx_op_invalid;
+	ep->start_op[ofi_op_tagged] = tcpx_op_tagged;
 	ep->start_op[ofi_op_read_req] = tcpx_op_read_req;
 	ep->start_op[ofi_op_read_rsp] = tcpx_op_read_rsp;
 	ep->start_op[ofi_op_write] = tcpx_op_write;
 	return 0;
 err3:
-	ofi_close_socket(ep->sock);
+	ofi_close_socket(ep->bsock.sock);
 err2:
 	ofi_endpoint_close(&ep->util_ep);
 err1:
@@ -671,12 +833,13 @@ static int tcpx_pep_fi_close(struct fid *fid)
 
 static int tcpx_pep_fi_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
 {
-	struct tcpx_pep *tcpx_pep = container_of(fid, struct tcpx_pep,
-						 util_pep.pep_fid.fid);
+	struct tcpx_pep *pep;
+
+	pep = container_of(fid, struct tcpx_pep, util_pep.pep_fid.fid);
 
 	switch (bfid->fclass) {
 	case FI_CLASS_EQ:
-		return ofi_pep_bind_eq(&tcpx_pep->util_pep,
+		return ofi_pep_bind_eq(&pep->util_pep,
 				       container_of(bfid, struct util_eq,
 						    eq_fid.fid), flags);
 	default:
@@ -696,95 +859,100 @@ static struct fi_ops tcpx_pep_fi_ops = {
 
 static int tcpx_pep_setname(fid_t fid, void *addr, size_t addrlen)
 {
-	struct tcpx_pep *tcpx_pep;
+	struct tcpx_pep *pep;
 
 	if ((addrlen != sizeof(struct sockaddr_in)) &&
 	    (addrlen != sizeof(struct sockaddr_in6)))
 		return -FI_EINVAL;
 
-	tcpx_pep = container_of(fid, struct tcpx_pep,
+	pep = container_of(fid, struct tcpx_pep,
 				util_pep.pep_fid);
 
-	if (tcpx_pep->sock != INVALID_SOCKET) {
-		ofi_close_socket(tcpx_pep->sock);
-		tcpx_pep->sock = INVALID_SOCKET;
+	if (pep->sock != INVALID_SOCKET) {
+		ofi_close_socket(pep->sock);
+		pep->sock = INVALID_SOCKET;
 	}
 
-	if (tcpx_pep->info->src_addr) {
-		free(tcpx_pep->info->src_addr);
-		tcpx_pep->info->src_addrlen = 0;
+	if (pep->info->src_addr) {
+		free(pep->info->src_addr);
+		pep->info->src_addrlen = 0;
 	}
 
-	tcpx_pep->info->src_addr = mem_dup(addr, addrlen);
-	if (!tcpx_pep->info->src_addr)
+	pep->info->src_addr = mem_dup(addr, addrlen);
+	if (!pep->info->src_addr)
 		return -FI_ENOMEM;
-	tcpx_pep->info->src_addrlen = addrlen;
+	pep->info->src_addrlen = addrlen;
 
-	return tcpx_pep_sock_create(tcpx_pep);
+	return tcpx_pep_sock_create(pep);
 }
 
 static int tcpx_pep_getname(fid_t fid, void *addr, size_t *addrlen)
 {
-	struct tcpx_pep *tcpx_pep;
+	struct tcpx_pep *pep;
 	size_t addrlen_in = *addrlen;
 	int ret;
 
-	tcpx_pep = container_of(fid, struct tcpx_pep, util_pep.pep_fid);
-	ret = ofi_getsockname(tcpx_pep->sock, addr, (socklen_t *)addrlen);
+	pep = container_of(fid, struct tcpx_pep, util_pep.pep_fid);
+	ret = ofi_getsockname(pep->sock, addr, (socklen_t *) addrlen);
 	if (ret)
 		return -ofi_sockerr();
 
 	return (addrlen_in < *addrlen) ? -FI_ETOOSMALL: FI_SUCCESS;
 }
 
-static int tcpx_pep_listen(struct fid_pep *pep)
+static int tcpx_pep_listen(struct fid_pep *pep_fid)
 {
-	struct tcpx_pep *tcpx_pep;
+	struct tcpx_pep *pep;
 	int ret;
 
-	tcpx_pep = container_of(pep,struct tcpx_pep, util_pep.pep_fid);
+	pep = container_of(pep_fid, struct tcpx_pep, util_pep.pep_fid);
 
 	/* arbitrary backlog value to support larger scale jobs */
-	if (listen(tcpx_pep->sock, 4096)) {
+	if (listen(pep->sock, 4096)) {
 		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
 			"socket listen failed\n");
 		return -ofi_sockerr();
 	}
 
-	ret = ofi_wait_add_fd(tcpx_pep->util_pep.eq->wait, tcpx_pep->sock,
+	ret = ofi_wait_add_fd(pep->util_pep.eq->wait, pep->sock,
 			      POLLIN, tcpx_eq_wait_try_func,
-			      NULL, &tcpx_pep->cm_ctx);
+			      NULL, &pep->cm_ctx);
 
 	return ret;
 }
 
-static int tcpx_pep_reject(struct fid_pep *pep, fid_t handle,
+static int tcpx_pep_reject(struct fid_pep *pep, fid_t fid_handle,
 			   const void *param, size_t paramlen)
 {
-	struct ofi_ctrl_hdr hdr;
-	struct tcpx_conn_handle *tcpx_handle;
+	struct tcpx_cm_msg msg;
+	struct tcpx_conn_handle *handle;
 	int ret;
 
-	tcpx_handle = container_of(handle, struct tcpx_conn_handle, handle);
-
-	memset(&hdr, 0, sizeof(hdr));
-	hdr.version = TCPX_CTRL_HDR_VERSION;
-	hdr.type = ofi_ctrl_nack;
-	hdr.seg_size = htons((uint16_t) paramlen);
+	handle = container_of(fid_handle, struct tcpx_conn_handle, fid);
+	/* If we created an endpoint, it owns the socket */
+	if (handle->sock == INVALID_SOCKET)
+		goto free;
 
-	ret = ofi_send_socket(tcpx_handle->sock, &hdr,
-			      sizeof(hdr), MSG_NOSIGNAL);
+	memset(&msg.hdr, 0, sizeof(msg.hdr));
+	msg.hdr.version = TCPX_CTRL_HDR_VERSION;
+	msg.hdr.type = ofi_ctrl_nack;
+	msg.hdr.seg_size = htons((uint16_t) paramlen);
+	if (paramlen)
+		memcpy(&msg.data, param, paramlen);
 
-	if ((ret == sizeof(hdr)) && paramlen)
-		(void) ofi_send_socket(tcpx_handle->sock, param,
-				       paramlen, MSG_NOSIGNAL);
+	ret = ofi_send_socket(handle->sock, &msg,
+			      sizeof(msg.hdr) + paramlen, MSG_NOSIGNAL);
+	if (ret != sizeof(msg.hdr) + paramlen)
+		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
+			"failed to send reject message\n");
 
-	ofi_shutdown(tcpx_handle->sock, SHUT_RDWR);
-	ret = ofi_close_socket(tcpx_handle->sock);
+	ofi_shutdown(handle->sock, SHUT_RDWR);
+	ret = ofi_close_socket(handle->sock);
 	if (ret)
 		return ret;
 
-	free(tcpx_handle);
+free:
+	free(handle);
 	return FI_SUCCESS;
 }
 
@@ -829,9 +997,9 @@ static struct fi_ops_ep tcpx_pep_ops = {
 };
 
 int tcpx_passive_ep(struct fid_fabric *fabric, struct fi_info *info,
-		    struct fid_pep **pep, void *context)
+		    struct fid_pep **pep_fid, void *context)
 {
-	struct tcpx_pep *_pep;
+	struct tcpx_pep *pep;
 	int ret;
 
 	if (!info) {
@@ -839,47 +1007,47 @@ int tcpx_passive_ep(struct fid_fabric *fabric, struct fi_info *info,
 		return -FI_EINVAL;
 	}
 
-	ret = ofi_check_info(&tcpx_util_prov, tcpx_util_prov.info,
-			     fabric->api_version, info);
+	ret = ofi_prov_check_info(&tcpx_util_prov, fabric->api_version, info);
 	if (ret)
 		return ret;
 
-	_pep = calloc(1, sizeof(*_pep));
-	if (!_pep)
+	pep = calloc(1, sizeof(*pep));
+	if (!pep)
 		return -FI_ENOMEM;
 
-	ret = ofi_pep_init(fabric, info, &_pep->util_pep, context);
+	ret = ofi_pep_init(fabric, info, &pep->util_pep, context);
 	if (ret)
 		goto err1;
 
-	_pep->util_pep.pep_fid.fid.ops = &tcpx_pep_fi_ops;
-	_pep->util_pep.pep_fid.cm = &tcpx_pep_cm_ops;
-	_pep->util_pep.pep_fid.ops = &tcpx_pep_ops;
+	pep->util_pep.pep_fid.fid.ops = &tcpx_pep_fi_ops;
+	pep->util_pep.pep_fid.cm = &tcpx_pep_cm_ops;
+	pep->util_pep.pep_fid.ops = &tcpx_pep_ops;
 
-	_pep->info = fi_dupinfo(info);
-	if (!_pep->info) {
+	pep->info = fi_dupinfo(info);
+	if (!pep->info) {
 		ret = -FI_ENOMEM;
 		goto err2;
 	}
 
-	_pep->cm_ctx.fid = &_pep->util_pep.pep_fid.fid;
-	_pep->cm_ctx.type = SERVER_SOCK_ACCEPT;
-	_pep->cm_ctx.cm_data_sz = 0;
-	_pep->sock = INVALID_SOCKET;
+	pep->cm_ctx.fid.fclass = TCPX_CLASS_CM;
+	pep->cm_ctx.hfid = &pep->util_pep.pep_fid.fid;
+	pep->cm_ctx.state = TCPX_CM_LISTENING;
+	pep->cm_ctx.cm_data_sz = 0;
+	pep->sock = INVALID_SOCKET;
 
 	if (info->src_addr) {
-		ret = tcpx_pep_sock_create(_pep);
+		ret = tcpx_pep_sock_create(pep);
 		if (ret)
 			goto err3;
 	}
 
-	*pep = &_pep->util_pep.pep_fid;
+	*pep_fid = &pep->util_pep.pep_fid;
 	return FI_SUCCESS;
 err3:
-	fi_freeinfo(_pep->info);
+	fi_freeinfo(pep->info);
 err2:
-	ofi_pep_close(&_pep->util_pep);
+	ofi_pep_close(&pep->util_pep);
 err1:
-	free(_pep);
+	free(pep);
 	return ret;
 }
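
tcpx_hdr_bswap() near the top of this file now treats everything after the base header as an array of 64-bit big-endian words, sized by hdr_size, so new header variants need no per-field swap code. A sketch of that generic swap, assuming glibc's be64toh() and an invented 8-byte base header whose total size is a multiple of eight:

    /* Sketch only: the fixed fields of the base header (flags, len)
     * would be swapped individually first, as in the hunk above. */
    #include <endian.h>   /* be64toh(): glibc-specific */
    #include <stdint.h>

    struct demo_base_hdr {
        uint8_t  op;
        uint8_t  hdr_size;          /* total header bytes, multiple of 8 */
        uint16_t flags;
        uint32_t len;
    };

    static void demo_hdr_bswap(struct demo_base_hdr *hdr)
    {
        uint64_t *cur = (uint64_t *) (hdr + 1);
        int i, cnt = (hdr->hdr_size - (int) sizeof(*hdr)) >> 3;

        for (i = 0; i < cnt; i++)
            cur[i] = be64toh(cur[i]);
    }
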
diff --git a/deps/libfabric/prov/tcp/src/tcpx_fabric.c b/deps/libfabric/prov/tcp/src/tcpx_fabric.c
index 3ff1b87f1b400cc6cdbfe9ef8d277231173e9f4c..17d9ca78acf2ad9eb85ad3da78c1c6e8bbfb1aac 100644
--- a/deps/libfabric/prov/tcp/src/tcpx_fabric.c
+++ b/deps/libfabric/prov/tcp/src/tcpx_fabric.c
@@ -53,16 +53,16 @@ struct fi_ops_fabric tcpx_fabric_ops = {
 static int tcpx_fabric_close(fid_t fid)
 {
 	int ret;
-	struct tcpx_fabric *tcpx_fabric;
+	struct tcpx_fabric *fabric;
 
-	tcpx_fabric = container_of(fid, struct tcpx_fabric,
-				   util_fabric.fabric_fid.fid);
+	fabric = container_of(fid, struct tcpx_fabric,
+			      util_fabric.fabric_fid.fid);
 
-	ret = ofi_fabric_close(&tcpx_fabric->util_fabric);
+	ret = ofi_fabric_close(&fabric->util_fabric);
 	if (ret)
 		return ret;
 
-	free(tcpx_fabric);
+	free(fabric);
 	return 0;
 }
 
@@ -74,26 +74,26 @@ struct fi_ops tcpx_fabric_fi_ops = {
 	.ops_open = fi_no_ops_open,
 };
 
-int tcpx_create_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric,
-		       void *context)
+int tcpx_create_fabric(struct fi_fabric_attr *attr,
+		       struct fid_fabric **fabric_fid, void *context)
 {
-	struct tcpx_fabric *tcpx_fabric;
+	struct tcpx_fabric *fabric;
 	int ret;
 
-	tcpx_fabric = calloc(1, sizeof(*tcpx_fabric));
-	if (!tcpx_fabric)
+	fabric = calloc(1, sizeof(*fabric));
+	if (!fabric)
 		return -FI_ENOMEM;
 
 	ret = ofi_fabric_init(&tcpx_prov, tcpx_info.fabric_attr, attr,
-			      &tcpx_fabric->util_fabric, context);
+			      &fabric->util_fabric, context);
 	if (ret) {
-		free(tcpx_fabric);
+		free(fabric);
 		return ret;
 	}
 
-	*fabric = &tcpx_fabric->util_fabric.fabric_fid;
-	(*fabric)->fid.ops = &tcpx_fabric_fi_ops;
-	(*fabric)->ops = &tcpx_fabric_ops;
+	fabric->util_fabric.fabric_fid.fid.ops = &tcpx_fabric_fi_ops;
+	fabric->util_fabric.fabric_fid.ops = &tcpx_fabric_ops;
+	*fabric_fid = &fabric->util_fabric.fabric_fid;
 
 	return 0;
 }
diff --git a/deps/libfabric/prov/tcp/src/tcpx_init.c b/deps/libfabric/prov/tcp/src/tcpx_init.c
index 7b5ed3b5849783a138c0519f64447906e621cda2..fb966f9232dc95a9783bd315b12baa1a3479a558 100644
--- a/deps/libfabric/prov/tcp/src/tcpx_init.c
+++ b/deps/libfabric/prov/tcp/src/tcpx_init.c
@@ -52,8 +52,28 @@ struct tcpx_port_range port_range = {
 	.high = 0,
 };
 
+int tcpx_nodelay = -1;
+
+int tcpx_staging_sbuf_size = 9000;
+int tcpx_prefetch_rbuf_size = 9000;
+size_t tcpx_default_tx_size = 256;
+size_t tcpx_default_rx_size = 256;
+size_t tcpx_zerocopy_size = SIZE_MAX;
+
+
 static void tcpx_init_env(void)
 {
+	size_t tx_size;
+	size_t rx_size;
+
+	/* Checked in util code */
+	fi_param_define(&tcpx_prov, "iface", FI_PARAM_STRING,
+			"Specify interface name");
+
+	fi_param_define(&tcpx_prov,"port_low_range", FI_PARAM_INT,
+			"define port low range");
+	fi_param_define(&tcpx_prov,"port_high_range", FI_PARAM_INT,
+			"define port high range");
 	fi_param_get_int(&tcpx_prov, "port_high_range", &port_range.high);
 	fi_param_get_int(&tcpx_prov, "port_low_range", &port_range.low);
 
@@ -67,6 +87,40 @@ static void tcpx_init_env(void)
 		port_range.low  = 0;
 		port_range.high = 0;
 	}
+
+	fi_param_define(&tcpx_prov,"tx_size", FI_PARAM_SIZE_T,
+			"define default tx context size (default: %zu)",
+			tcpx_default_tx_size);
+	fi_param_define(&tcpx_prov,"rx_size", FI_PARAM_SIZE_T,
+			"define default rx context size (default: %zu)",
+			tcpx_default_rx_size);
+	if (!fi_param_get_size_t(&tcpx_prov, "tx_size", &tx_size)) {
+		tcpx_default_tx_size = tx_size;
+	}
+	if (!fi_param_get_size_t(&tcpx_prov, "rx_size", &rx_size)) {
+		tcpx_default_rx_size = rx_size;
+	}
+
+	fi_param_define(&tcpx_prov, "nodelay", FI_PARAM_BOOL,
+			"overrides default TCP_NODELAY socket setting");
+	fi_param_get_bool(&tcpx_prov, "nodelay", &tcpx_nodelay);
+
+	fi_param_define(&tcpx_prov, "staging_sbuf_size", FI_PARAM_INT,
+			"size of buffer used to coalesce iovecs or "
+			"send requests before posting to the kernel, "
+			"set to 0 to disable");
+	fi_param_define(&tcpx_prov, "prefetch_rbuf_size", FI_PARAM_INT,
+			"size of buffer used to prefetch received data from "
+			"the kernel, set to 0 to disable");
+	fi_param_define(&tcpx_prov, "zerocopy_size", FI_PARAM_SIZE_T,
+			"lower threshold where zero copy transfers will be "
+			"used, if supported by the platform, set to -1 to "
+			"disable (default: %zu)", tcpx_zerocopy_size);
+	fi_param_get_int(&tcpx_prov, "staging_sbuf_size",
+			 &tcpx_staging_sbuf_size);
+	fi_param_get_int(&tcpx_prov, "prefetch_rbuf_size",
+			 &tcpx_prefetch_rbuf_size);
+	fi_param_get_size_t(&tcpx_prov, "zerocopy_size", &tcpx_zerocopy_size);
 }
 
 static void fi_tcp_fini(void)
@@ -88,15 +142,6 @@ TCP_INI
 #if HAVE_TCP_DL
 	ofi_pmem_init();
 #endif
-	fi_param_define(&tcpx_prov, "iface", FI_PARAM_STRING,
-			"Specify interface name");
-
-	fi_param_define(&tcpx_prov,"port_low_range", FI_PARAM_INT,
-			"define port low range");
-
-	fi_param_define(&tcpx_prov,"port_high_range", FI_PARAM_INT,
-			"define port high range");
-
 	tcpx_init_env();
 	return &tcpx_prov;
 }
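
The environment handling above now defines each tunable next to where it is read, keeping the compiled-in default when the variable is unset. A sketch of the same pattern with getenv() standing in for the fi_param_define()/fi_param_get_*() machinery:

    /* Sketch only: the DEMO_* variable names are invented. */
    #include <stdlib.h>

    static size_t demo_zerocopy_size = (size_t) -1;  /* default: disabled */
    static int demo_nodelay = -1;                    /* default: unset */

    static void demo_init_env(void)
    {
        const char *val;

        if ((val = getenv("DEMO_ZEROCOPY_SIZE")) != NULL)
            demo_zerocopy_size = (size_t) strtoull(val, NULL, 10);
        if ((val = getenv("DEMO_NODELAY")) != NULL)
            demo_nodelay = atoi(val);                /* 0/1 override */
    }
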
diff --git a/deps/libfabric/prov/tcp/src/tcpx_msg.c b/deps/libfabric/prov/tcp/src/tcpx_msg.c
index 3e8581b32c5a564dae15a6c6f88fa80ee3d5aa1c..906cfa3ae251ec0a7e225db545ec95da432173d6 100644
--- a/deps/libfabric/prov/tcp/src/tcpx_msg.c
+++ b/deps/libfabric/prov/tcp/src/tcpx_msg.c
@@ -48,55 +48,140 @@
 #include <arpa/inet.h>
 #include <netdb.h>
 
+
 static inline struct tcpx_xfer_entry *
-tcpx_alloc_recv_entry(struct tcpx_ep *tcpx_ep)
+tcpx_alloc_send(struct tcpx_ep *ep)
 {
-	struct tcpx_xfer_entry *recv_entry;
-	struct tcpx_cq *tcpx_cq;
-
-	tcpx_cq = container_of(tcpx_ep->util_ep.rx_cq, struct tcpx_cq, util_cq);
+	struct tcpx_xfer_entry *send_entry;
 
-	recv_entry = tcpx_xfer_entry_alloc(tcpx_cq, TCPX_OP_MSG_RECV);
-	if (recv_entry)
-		recv_entry->ep = tcpx_ep;
+	send_entry = tcpx_alloc_tx(ep);
+	if (send_entry)
+		send_entry->hdr.base_hdr.op = ofi_op_msg;
 
-	return recv_entry;
+	return send_entry;
 }
 
+/* When dynamic receive buffers are enabled, receive buffer matching
+ * is handled by the upper layer (rxm).  The tcp provider only
+ * carries the tag, to reduce header overhead.  The transport
+ * operation is still op_msg at the tcp provider.  This is needed
+ * for backwards compatibility.
+ *
+ * If dynamic receive buffers are disabled, tagged messages are
+ * handled entirely by the tcp provider.  We use the op_tagged
+ * protocol for this, which allows distinguishing between the two
+ * cases at the receiver.
+ *
+ * We assume the peer is configured similarly to the local side,
+ * which is all we can check.
+ */
 static inline struct tcpx_xfer_entry *
-tcpx_alloc_send_entry(struct tcpx_ep *tcpx_ep)
+tcpx_alloc_tsend(struct tcpx_ep *ep)
 {
 	struct tcpx_xfer_entry *send_entry;
-	struct tcpx_cq *tcpx_cq;
 
-	tcpx_cq = container_of(tcpx_ep->util_ep.tx_cq, struct tcpx_cq, util_cq);
-
-	send_entry = tcpx_xfer_entry_alloc(tcpx_cq, TCPX_OP_MSG_SEND);
-	if (send_entry)
-		send_entry->ep = tcpx_ep;
+	send_entry = tcpx_alloc_tx(ep);
+	if (send_entry) {
+		if (tcpx_dynamic_rbuf(ep)) {
+			send_entry->hdr.base_hdr.op = ofi_op_msg;
+			send_entry->hdr.base_hdr.flags = TCPX_TAGGED;
+		} else {
+			assert(ep->srx_ctx);
+			send_entry->hdr.base_hdr.op = ofi_op_tagged;
+		}
+	}
 
 	return send_entry;
 }
 
-static inline void tcpx_queue_recv(struct tcpx_ep *tcpx_ep,
-				   struct tcpx_xfer_entry *recv_entry)
+static inline void
+tcpx_init_tx_sizes(struct tcpx_xfer_entry *tx_entry, size_t hdr_len,
+		   size_t data_len)
 {
-	fastlock_acquire(&tcpx_ep->lock);
-	slist_insert_tail(&recv_entry->entry, &tcpx_ep->rx_queue);
-	fastlock_release(&tcpx_ep->lock);
+	tx_entry->hdr.base_hdr.size = hdr_len + data_len;
+	tx_entry->hdr.base_hdr.hdr_size = (uint8_t) hdr_len;
 }
 
-static ssize_t tcpx_recvmsg(struct fid_ep *ep, const struct fi_msg *msg,
-			    uint64_t flags)
+static inline void
+tcpx_init_tx_inject(struct tcpx_xfer_entry *tx_entry, size_t hdr_len,
+		    const void *buf, size_t data_len)
+{
+	assert(data_len <= TCPX_MAX_INJECT);
+	tcpx_init_tx_sizes(tx_entry, hdr_len, data_len);
+
+	tx_entry->iov[0].iov_base = (void *) &tx_entry->hdr;
+	memcpy((uint8_t *) &tx_entry->hdr + hdr_len, (uint8_t *) buf,
+		data_len);
+	tx_entry->iov[0].iov_len = hdr_len + data_len;
+	tx_entry->iov_cnt = 1;
+}
+
+static inline void
+tcpx_init_tx_buf(struct tcpx_xfer_entry *tx_entry, size_t hdr_len,
+		 const void *buf, size_t data_len)
+{
+	if (data_len <= TCPX_MAX_INJECT) {
+		tcpx_init_tx_inject(tx_entry, hdr_len, buf, data_len);
+		return;
+	}
+
+	tcpx_init_tx_sizes(tx_entry, hdr_len, data_len);
+	tx_entry->iov[0].iov_base = (void *) &tx_entry->hdr;
+	tx_entry->iov[0].iov_len = hdr_len;
+	tx_entry->iov[1].iov_base = (void *) buf;
+	tx_entry->iov[1].iov_len = data_len;
+	tx_entry->iov_cnt = 2;
+}
+
+static inline void
+tcpx_init_tx_iov(struct tcpx_xfer_entry *tx_entry, size_t hdr_len,
+		 const struct iovec *iov, size_t count)
+{
+	size_t data_len;
+
+	assert(count <= TCPX_IOV_LIMIT);
+	data_len = ofi_total_iov_len(iov, count);
+	tcpx_init_tx_sizes(tx_entry, hdr_len, data_len);
+
+	tx_entry->iov[0].iov_base = (void *) &tx_entry->hdr;
+	if (data_len <= TCPX_MAX_INJECT) {
+		ofi_copy_iov_buf(iov, count, 0, (uint8_t *) &tx_entry->hdr +
+				 hdr_len, TCPX_MAX_INJECT, OFI_COPY_IOV_TO_BUF);
+		tx_entry->iov[0].iov_len = hdr_len + data_len;
+		tx_entry->iov_cnt = 1;
+	} else {
+		tx_entry->iov[0].iov_len = hdr_len;
+		tx_entry->iov_cnt = count + 1;
+		memcpy(&tx_entry->iov[1], &iov[0], count * sizeof(struct iovec));
+	}
+}
+
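+/* Queue a receive only if the endpoint still has rx slots available.
+ * Returns false once rx_avail is exhausted; callers then fail the
+ * receive with -FI_EAGAIN rather than overfilling the rx queue.
+ */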
+static inline bool
+tcpx_queue_recv(struct tcpx_ep *ep, struct tcpx_xfer_entry *recv_entry)
+{
+	bool ret;
+
+	fastlock_acquire(&ep->lock);
+	ret = ep->rx_avail;
+	if (ret) {
+		slist_insert_tail(&recv_entry->entry, &ep->rx_queue);
+		ep->rx_avail--;
+	}
+	fastlock_release(&ep->lock);
+	return ret;
+}
+
+static ssize_t
+tcpx_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags)
 {
 	struct tcpx_xfer_entry *recv_entry;
-	struct tcpx_ep *tcpx_ep;
+	struct tcpx_ep *ep;
 
-	tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid);
+	ep = container_of(ep_fid, struct tcpx_ep, util_ep.ep_fid);
 
 	assert(msg->iov_count <= TCPX_IOV_LIMIT);
 
-	recv_entry = tcpx_alloc_recv_entry(tcpx_ep);
+	recv_entry = tcpx_alloc_rx(ep);
 	if (!recv_entry)
 		return -FI_EAGAIN;
 
@@ -104,23 +189,27 @@ static ssize_t tcpx_recvmsg(struct fid_ep *ep, const struct fi_msg *msg,
 	memcpy(&recv_entry->iov[0], &msg->msg_iov[0],
 	       msg->iov_count * sizeof(struct iovec));
 
-	recv_entry->flags = tcpx_ep->util_ep.rx_msg_flags | flags |
-			    FI_MSG | FI_RECV;
+	recv_entry->cq_flags = tcpx_rx_completion_flag(ep, flags) |
+			       FI_MSG | FI_RECV;
 	recv_entry->context = msg->context;
 
-	tcpx_queue_recv(tcpx_ep, recv_entry);
+	if (!tcpx_queue_recv(ep, recv_entry)) {
+		tcpx_free_rx(recv_entry);
+		return -FI_EAGAIN;
+	}
 	return FI_SUCCESS;
 }
 
-static ssize_t tcpx_recv(struct fid_ep *ep, void *buf, size_t len, void *desc,
-			 fi_addr_t src_addr, void *context)
+static ssize_t
+tcpx_recv(struct fid_ep *ep_fid, void *buf, size_t len, void *desc,
+	  fi_addr_t src_addr, void *context)
 {
 	struct tcpx_xfer_entry *recv_entry;
-	struct tcpx_ep *tcpx_ep;
+	struct tcpx_ep *ep;
 
-	tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid);
+	ep = container_of(ep_fid, struct tcpx_ep, util_ep.ep_fid);
 
-	recv_entry = tcpx_alloc_recv_entry(tcpx_ep);
+	recv_entry = tcpx_alloc_rx(ep);
 	if (!recv_entry)
 		return -FI_EAGAIN;
 
@@ -128,298 +217,203 @@ static ssize_t tcpx_recv(struct fid_ep *ep, void *buf, size_t len, void *desc,
 	recv_entry->iov[0].iov_base = buf;
 	recv_entry->iov[0].iov_len = len;
 
-	recv_entry->flags = (tcpx_ep->util_ep.rx_op_flags & FI_COMPLETION) |
-			    FI_MSG | FI_RECV;
+	recv_entry->cq_flags = tcpx_rx_completion_flag(ep, 0) |
+			       FI_MSG | FI_RECV;
 	recv_entry->context = context;
 
-	tcpx_queue_recv(tcpx_ep, recv_entry);
+	if (!tcpx_queue_recv(ep, recv_entry)) {
+		tcpx_free_rx(recv_entry);
+		return -FI_EAGAIN;
+	}
 	return FI_SUCCESS;
 }
 
-static ssize_t tcpx_recvv(struct fid_ep *ep, const struct iovec *iov, void **desc,
-			  size_t count, fi_addr_t src_addr, void *context)
+static ssize_t
+tcpx_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc,
+	   size_t count, fi_addr_t src_addr, void *context)
 {
 	struct tcpx_xfer_entry *recv_entry;
-	struct tcpx_ep *tcpx_ep;
+	struct tcpx_ep *ep;
 
-	tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid);
+	ep = container_of(ep_fid, struct tcpx_ep, util_ep.ep_fid);
 
 	assert(count <= TCPX_IOV_LIMIT);
 
-	recv_entry = tcpx_alloc_recv_entry(tcpx_ep);
+	recv_entry = tcpx_alloc_rx(ep);
 	if (!recv_entry)
 		return -FI_EAGAIN;
 
 	recv_entry->iov_cnt = count;
 	memcpy(recv_entry->iov, iov, count * sizeof(*iov));
 
-	recv_entry->flags = (tcpx_ep->util_ep.rx_op_flags & FI_COMPLETION) |
-			    FI_MSG | FI_RECV;
+	recv_entry->cq_flags = tcpx_rx_completion_flag(ep, 0) |
+			       FI_MSG | FI_RECV;
 	recv_entry->context = context;
 
-	tcpx_queue_recv(tcpx_ep, recv_entry);
+	if (!tcpx_queue_recv(ep, recv_entry)) {
+		tcpx_free_rx(recv_entry);
+		return -FI_EAGAIN;
+	}
 	return FI_SUCCESS;
 }
 
-static ssize_t tcpx_sendmsg(struct fid_ep *ep, const struct fi_msg *msg,
-			    uint64_t flags)
+static inline void
+tcpx_queue_send(struct tcpx_ep *ep, struct tcpx_xfer_entry *tx_entry)
 {
-	struct tcpx_ep *tcpx_ep;
-	struct tcpx_cq *tcpx_cq;
-	struct tcpx_xfer_entry *tx_entry;
-	uint64_t data_len;
-	size_t offset = 0;
-	uint64_t *cq_data;
+	fastlock_acquire(&ep->lock);
+	tcpx_tx_queue_insert(ep, tx_entry);
+	fastlock_release(&ep->lock);
+}
 
-	tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid);
-	tcpx_cq = container_of(tcpx_ep->util_ep.tx_cq, struct tcpx_cq,
-			       util_cq);
+static ssize_t
+tcpx_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags)
+{
+	struct tcpx_ep *ep;
+	struct tcpx_xfer_entry *tx_entry;
+	size_t hdr_len;
 
-	tx_entry = tcpx_xfer_entry_alloc(tcpx_cq, TCPX_OP_MSG_SEND);
+	ep = container_of(ep_fid, struct tcpx_ep, util_ep.ep_fid);
+	tx_entry = tcpx_alloc_send(ep);
 	if (!tx_entry)
 		return -FI_EAGAIN;
 
-	assert(msg->iov_count <= TCPX_IOV_LIMIT);
-	data_len = ofi_total_iov_len(msg->msg_iov, msg->iov_count);
-	assert(!(flags & FI_INJECT) || (data_len <= TCPX_MAX_INJECT_SZ));
-
-	offset = sizeof(tx_entry->hdr.base_hdr);
-
 	if (flags & FI_REMOTE_CQ_DATA) {
-		tx_entry->hdr.base_hdr.flags |= OFI_REMOTE_CQ_DATA;
-		cq_data = (uint64_t *)((uint8_t *)&tx_entry->hdr + offset);
-		*cq_data = msg->data;
-		offset += sizeof(msg->data);
-	}
-
-	tx_entry->hdr.base_hdr.payload_off = (uint8_t)offset;
-	tx_entry->hdr.base_hdr.size = offset + data_len;
-	if (flags & FI_INJECT) {
-		ofi_copy_iov_buf(msg->msg_iov, msg->iov_count, 0,
-				 (uint8_t *)&tx_entry->hdr + offset,
-				 data_len,
-				 OFI_COPY_IOV_TO_BUF);
-		tx_entry->iov_cnt = 1;
-		offset += data_len;
+		tx_entry->hdr.base_hdr.flags = TCPX_REMOTE_CQ_DATA;
+		tx_entry->hdr.cq_data_hdr.cq_data = msg->data;
+		hdr_len = sizeof(tx_entry->hdr.cq_data_hdr);
 	} else {
-		memcpy(&tx_entry->iov[1], &msg->msg_iov[0],
-		       msg->iov_count * sizeof(struct iovec));
-
-		tx_entry->iov_cnt = msg->iov_count + 1;
+		hdr_len = sizeof(tx_entry->hdr.base_hdr);
 	}
-	tx_entry->iov[0].iov_base = (void *) &tx_entry->hdr;
-	tx_entry->iov[0].iov_len = offset;
 
-	tx_entry->flags = ((tcpx_ep->util_ep.tx_op_flags & FI_COMPLETION) |
-			    flags | FI_MSG | FI_SEND);
-
-	if (flags & (FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE))
-		tx_entry->hdr.base_hdr.flags |= OFI_DELIVERY_COMPLETE;
-
-	tx_entry->ep = tcpx_ep;
+	tcpx_init_tx_iov(tx_entry, hdr_len, msg->msg_iov, msg->iov_count);
+	tx_entry->cq_flags = tcpx_tx_completion_flag(ep, flags) |
+			     FI_MSG | FI_SEND;
+	tcpx_set_ack_flags(tx_entry, flags);
 	tx_entry->context = msg->context;
-	tx_entry->rem_len = tx_entry->hdr.base_hdr.size;
 
-	tcpx_ep->hdr_bswap(&tx_entry->hdr.base_hdr);
-	fastlock_acquire(&tcpx_ep->lock);
-	tcpx_tx_queue_insert(tcpx_ep, tx_entry);
-	fastlock_release(&tcpx_ep->lock);
+	tcpx_queue_send(ep, tx_entry);
 	return FI_SUCCESS;
 }
 
-static ssize_t tcpx_send(struct fid_ep *ep, const void *buf, size_t len,
-			 void *desc, fi_addr_t dest_addr, void *context)
+static ssize_t
+tcpx_send(struct fid_ep *ep_fid, const void *buf, size_t len,
+	  void *desc, fi_addr_t dest_addr, void *context)
 {
-	struct tcpx_ep *tcpx_ep;
+	struct tcpx_ep *ep;
 	struct tcpx_xfer_entry *tx_entry;
 
-	tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid);
+	ep = container_of(ep_fid, struct tcpx_ep, util_ep.ep_fid);
 
-	tx_entry = tcpx_alloc_send_entry(tcpx_ep);
+	tx_entry = tcpx_alloc_send(ep);
 	if (!tx_entry)
 		return -FI_EAGAIN;
 
-	tx_entry->hdr.base_hdr.size = len + sizeof(tx_entry->hdr.base_hdr);
-	tx_entry->hdr.base_hdr.payload_off = (uint8_t)
-					     sizeof(tx_entry->hdr.base_hdr);
-
-	tx_entry->iov[0].iov_base = (void *) &tx_entry->hdr;
-	tx_entry->iov[0].iov_len = sizeof(tx_entry->hdr.base_hdr);
-
-	tx_entry->iov[1].iov_base = (void *) buf;
-	tx_entry->iov[1].iov_len = len;
-	tx_entry->iov_cnt = 2;
+	tcpx_init_tx_buf(tx_entry, sizeof(tx_entry->hdr.base_hdr), buf, len);
 	tx_entry->context = context;
-	tx_entry->rem_len = tx_entry->hdr.base_hdr.size;
-	tx_entry->flags = (tcpx_ep->util_ep.tx_op_flags & FI_COMPLETION) |
-			   FI_MSG | FI_SEND;
-
-	if (tcpx_ep->util_ep.tx_op_flags &
-	    (FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE))
-		tx_entry->hdr.base_hdr.flags |= OFI_DELIVERY_COMPLETE;
-
-	tcpx_ep->hdr_bswap(&tx_entry->hdr.base_hdr);
-	fastlock_acquire(&tcpx_ep->lock);
-	tcpx_tx_queue_insert(tcpx_ep, tx_entry);
-	fastlock_release(&tcpx_ep->lock);
+	tx_entry->cq_flags = tcpx_tx_completion_flag(ep, 0) |
+			     FI_MSG | FI_SEND;
+	tcpx_set_ack_flags(tx_entry, ep->util_ep.tx_op_flags);
+
+	tcpx_queue_send(ep, tx_entry);
 	return FI_SUCCESS;
 }
 
-static ssize_t tcpx_sendv(struct fid_ep *ep, const struct iovec *iov,
-			  void **desc, size_t count, fi_addr_t dest_addr,
-			  void *context)
+static ssize_t
+tcpx_sendv(struct fid_ep *ep_fid, const struct iovec *iov,
+	   void **desc, size_t count, fi_addr_t dest_addr, void *context)
 {
-	struct tcpx_ep *tcpx_ep;
+	struct tcpx_ep *ep;
 	struct tcpx_xfer_entry *tx_entry;
-	uint64_t data_len;
 
-	tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid);
+	ep = container_of(ep_fid, struct tcpx_ep, util_ep.ep_fid);
 
-	tx_entry = tcpx_alloc_send_entry(tcpx_ep);
+	tx_entry = tcpx_alloc_send(ep);
 	if (!tx_entry)
 		return -FI_EAGAIN;
 
-	assert(count <= TCPX_IOV_LIMIT);
-	data_len = ofi_total_iov_len(iov, count);
-	tx_entry->hdr.base_hdr.size = data_len + sizeof(tx_entry->hdr.base_hdr);
-	tx_entry->hdr.base_hdr.payload_off = (uint8_t)
-					     sizeof(tx_entry->hdr.base_hdr);
-
-	tx_entry->iov[0].iov_base = (void *) &tx_entry->hdr;
-	tx_entry->iov[0].iov_len = sizeof(tx_entry->hdr.base_hdr);
-	tx_entry->iov_cnt = count + 1;
-	memcpy(&tx_entry->iov[1], &iov[0], count * sizeof(struct iovec));
-
+	tcpx_init_tx_iov(tx_entry, sizeof(tx_entry->hdr.base_hdr), iov, count);
 	tx_entry->context = context;
-	tx_entry->rem_len = tx_entry->hdr.base_hdr.size;
-	tx_entry->flags = (tcpx_ep->util_ep.tx_op_flags & FI_COMPLETION) |
-			   FI_MSG | FI_SEND;
-
-	if (tcpx_ep->util_ep.tx_op_flags &
-	    (FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE))
-		tx_entry->hdr.base_hdr.flags |= OFI_DELIVERY_COMPLETE;
-
-	tcpx_ep->hdr_bswap(&tx_entry->hdr.base_hdr);
-	fastlock_acquire(&tcpx_ep->lock);
-	tcpx_tx_queue_insert(tcpx_ep, tx_entry);
-	fastlock_release(&tcpx_ep->lock);
+	tx_entry->cq_flags = tcpx_tx_completion_flag(ep, 0) |
+			     FI_MSG | FI_SEND;
+	tcpx_set_ack_flags(tx_entry, ep->util_ep.tx_op_flags);
+
+	tcpx_queue_send(ep, tx_entry);
 	return FI_SUCCESS;
 }
 
 
-static ssize_t tcpx_inject(struct fid_ep *ep, const void *buf, size_t len,
-			   fi_addr_t dest_addr)
+static ssize_t
+tcpx_inject(struct fid_ep *ep_fid, const void *buf, size_t len,
+	    fi_addr_t dest_addr)
 {
-	struct tcpx_ep *tcpx_ep;
+	struct tcpx_ep *ep;
 	struct tcpx_xfer_entry *tx_entry;
-	size_t offset;
 
-	tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid);
+	ep = container_of(ep_fid, struct tcpx_ep, util_ep.ep_fid);
 
-	tx_entry = tcpx_alloc_send_entry(tcpx_ep);
+	tx_entry = tcpx_alloc_send(ep);
 	if (!tx_entry)
 		return -FI_EAGAIN;
 
-	assert(len <= TCPX_MAX_INJECT_SZ);
-	tx_entry->hdr.base_hdr.size = len + sizeof(tx_entry->hdr.base_hdr);
+	tcpx_init_tx_inject(tx_entry, sizeof(tx_entry->hdr.base_hdr), buf, len);
 
-	offset = sizeof(tx_entry->hdr.base_hdr);
-	tx_entry->hdr.base_hdr.payload_off = (uint8_t) offset;
-	memcpy((uint8_t *)&tx_entry->hdr + offset, (uint8_t *) buf, len);
+	tx_entry->cq_flags = FI_MSG | FI_SEND; /* set in case of error */
 
-	tx_entry->iov[0].iov_base = (void *) &tx_entry->hdr;
-	tx_entry->iov[0].iov_len = len + sizeof(tx_entry->hdr.base_hdr);
-	tx_entry->iov_cnt = 1;
-	tx_entry->rem_len = tx_entry->hdr.base_hdr.size;
-	tx_entry->flags = FI_MSG | FI_SEND;
-
-	tcpx_ep->hdr_bswap(&tx_entry->hdr.base_hdr);
-	fastlock_acquire(&tcpx_ep->lock);
-	tcpx_tx_queue_insert(tcpx_ep, tx_entry);
-	fastlock_release(&tcpx_ep->lock);
+	tcpx_queue_send(ep, tx_entry);
 	return FI_SUCCESS;
 }
 
-static ssize_t tcpx_senddata(struct fid_ep *ep, const void *buf, size_t len,
-			     void *desc, uint64_t data, fi_addr_t dest_addr,
-			     void *context)
+static ssize_t
+tcpx_senddata(struct fid_ep *ep_fid, const void *buf, size_t len,
+	      void *desc, uint64_t data, fi_addr_t dest_addr, void *context)
 {
-	struct tcpx_ep *tcpx_ep;
+	struct tcpx_ep *ep;
 	struct tcpx_xfer_entry *tx_entry;
 
-	tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid);
+	ep = container_of(ep_fid, struct tcpx_ep, util_ep.ep_fid);
 
-	tx_entry = tcpx_alloc_send_entry(tcpx_ep);
+	tx_entry = tcpx_alloc_send(ep);
 	if (!tx_entry)
 		return -FI_EAGAIN;
 
 	tx_entry->hdr.cq_data_hdr.base_hdr.size =
 		len + sizeof(tx_entry->hdr.cq_data_hdr);
-	tx_entry->hdr.cq_data_hdr.base_hdr.flags = OFI_REMOTE_CQ_DATA;
-
+	tx_entry->hdr.cq_data_hdr.base_hdr.flags = TCPX_REMOTE_CQ_DATA;
 	tx_entry->hdr.cq_data_hdr.cq_data = data;
 
-	tx_entry->hdr.cq_data_hdr.base_hdr.payload_off =
-		(uint8_t) sizeof(tx_entry->hdr.cq_data_hdr);
-
-	tx_entry->iov[0].iov_base = (void *) &tx_entry->hdr;
-	tx_entry->iov[0].iov_len = sizeof(tx_entry->hdr.cq_data_hdr);
-
-
-	tx_entry->iov[1].iov_base = (void *) buf;
-	tx_entry->iov[1].iov_len = len;
-	tx_entry->iov_cnt = 2;
-
+	tcpx_init_tx_buf(tx_entry, sizeof(tx_entry->hdr.cq_data_hdr),
+			 buf, len);
 	tx_entry->context = context;
-	tx_entry->rem_len = tx_entry->hdr.base_hdr.size;
-	tx_entry->flags = (tcpx_ep->util_ep.tx_op_flags & FI_COMPLETION) |
-			   FI_MSG | FI_SEND;
-
-	if (tcpx_ep->util_ep.tx_op_flags &
-	    (FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE))
-		tx_entry->hdr.base_hdr.flags |= OFI_DELIVERY_COMPLETE;
-
-	tcpx_ep->hdr_bswap(&tx_entry->hdr.base_hdr);
-	fastlock_acquire(&tcpx_ep->lock);
-	tcpx_tx_queue_insert(tcpx_ep, tx_entry);
-	fastlock_release(&tcpx_ep->lock);
+	tx_entry->cq_flags = tcpx_tx_completion_flag(ep, 0) |
+			     FI_MSG | FI_SEND;
+	tcpx_set_ack_flags(tx_entry, ep->util_ep.tx_op_flags);
+
+	tcpx_queue_send(ep, tx_entry);
 	return FI_SUCCESS;
 }
 
-static ssize_t tcpx_injectdata(struct fid_ep *ep, const void *buf, size_t len,
-			       uint64_t data, fi_addr_t dest_addr)
+static ssize_t
+tcpx_injectdata(struct fid_ep *ep_fid, const void *buf, size_t len,
+		uint64_t data, fi_addr_t dest_addr)
 {
-	struct tcpx_ep *tcpx_ep;
+	struct tcpx_ep *ep;
 	struct tcpx_xfer_entry *tx_entry;
 
-	tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid);
+	ep = container_of(ep_fid, struct tcpx_ep, util_ep.ep_fid);
 
-	tx_entry = tcpx_alloc_send_entry(tcpx_ep);
+	tx_entry = tcpx_alloc_send(ep);
 	if (!tx_entry)
 		return -FI_EAGAIN;
 
-	assert(len <= TCPX_MAX_INJECT_SZ);
-
-	tx_entry->hdr.cq_data_hdr.base_hdr.flags = OFI_REMOTE_CQ_DATA;
+	tx_entry->hdr.cq_data_hdr.base_hdr.flags = TCPX_REMOTE_CQ_DATA;
 	tx_entry->hdr.cq_data_hdr.cq_data = data;
 
-	tx_entry->hdr.base_hdr.size = len + sizeof(tx_entry->hdr.cq_data_hdr);
-	tx_entry->hdr.base_hdr.payload_off = (uint8_t)
-					     sizeof(tx_entry->hdr.cq_data_hdr);
-
-	memcpy((uint8_t *) &tx_entry->hdr + sizeof(tx_entry->hdr.cq_data_hdr),
-	       (uint8_t *) buf, len);
+	tcpx_init_tx_inject(tx_entry, sizeof(tx_entry->hdr.cq_data_hdr),
+			    buf, len);
 
-	tx_entry->iov[0].iov_base = (void *) &tx_entry->hdr;
-	tx_entry->iov[0].iov_len = len + sizeof(tx_entry->hdr.cq_data_hdr);
-	tx_entry->iov_cnt = 1;
-	tx_entry->rem_len = tx_entry->hdr.base_hdr.size;
-	tx_entry->flags = FI_MSG | FI_SEND;
+	tx_entry->cq_flags = FI_MSG | FI_SEND; /* set in case of error */
 
-	tcpx_ep->hdr_bswap(&tx_entry->hdr.base_hdr);
-	fastlock_acquire(&tcpx_ep->lock);
-	tcpx_tx_queue_insert(tcpx_ep, tx_entry);
-	fastlock_release(&tcpx_ep->lock);
+	tcpx_queue_send(ep, tx_entry);
 	return FI_SUCCESS;
 }
 
@@ -435,3 +429,181 @@ struct fi_ops_msg tcpx_msg_ops = {
 	.senddata = tcpx_senddata,
 	.injectdata = tcpx_injectdata,
 };
+
+
+/* There's no application-driven need for tagged message operations over
+ * connected endpoints.  The tcp provider exposes the ability to send
+ * tagged messages using the tcp header, with the expectation that the
+ * peer side is using dynamic receive buffers to match the tagged messages
+ * with application buffers.  This provides an optimized path for rxm
+ * over tcp that allows rxm to drop its header in certain cases and
+ * only use a minimal tcp header.
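+ *
+ * As a rough illustration (exact savings depend on the header
+ * definitions), a tagged send below prepends only tcpx_tag_hdr (the
+ * base header plus a 64-bit tag) instead of carrying a separate rxm
+ * header as payload.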
+ */
+static ssize_t
+tcpx_tsendmsg(struct fid_ep *fid_ep, const struct fi_msg_tagged *msg,
+	      uint64_t flags)
+{
+	struct tcpx_ep *ep;
+	struct tcpx_xfer_entry *tx_entry;
+	size_t hdr_len;
+
+	ep = container_of(fid_ep, struct tcpx_ep, util_ep.ep_fid);
+	tx_entry = tcpx_alloc_tsend(ep);
+	if (!tx_entry)
+		return -FI_EAGAIN;
+
+	if (flags & FI_REMOTE_CQ_DATA) {
+		tx_entry->hdr.base_hdr.flags |= TCPX_REMOTE_CQ_DATA;
+		tx_entry->hdr.tag_data_hdr.cq_data_hdr.cq_data = msg->data;
+		tx_entry->hdr.tag_data_hdr.tag = msg->tag;
+		hdr_len = sizeof(tx_entry->hdr.tag_data_hdr);
+	} else {
+		tx_entry->hdr.tag_hdr.tag = msg->tag;
+		hdr_len = sizeof(tx_entry->hdr.tag_hdr);
+	}
+
+	tcpx_init_tx_iov(tx_entry, hdr_len, msg->msg_iov, msg->iov_count);
+	tx_entry->cq_flags = tcpx_tx_completion_flag(ep, flags) |
+			     FI_TAGGED | FI_SEND;
+	tcpx_set_ack_flags(tx_entry, flags);
+	tx_entry->context = msg->context;
+
+	tcpx_queue_send(ep, tx_entry);
+	return FI_SUCCESS;
+}
+
+static ssize_t
+tcpx_tsend(struct fid_ep *fid_ep, const void *buf, size_t len,
+	   void *desc, fi_addr_t dest_addr, uint64_t tag, void *context)
+{
+	struct tcpx_ep *ep;
+	struct tcpx_xfer_entry *tx_entry;
+
+	ep = container_of(fid_ep, struct tcpx_ep, util_ep.ep_fid);
+	tx_entry = tcpx_alloc_tsend(ep);
+	if (!tx_entry)
+		return -FI_EAGAIN;
+
+	tx_entry->hdr.tag_hdr.tag = tag;
+
+	tcpx_init_tx_buf(tx_entry, sizeof(tx_entry->hdr.tag_hdr), buf, len);
+	tx_entry->context = context;
+	tx_entry->cq_flags = tcpx_tx_completion_flag(ep, 0) |
+			     FI_TAGGED | FI_SEND;
+	tcpx_set_ack_flags(tx_entry, ep->util_ep.tx_op_flags);
+
+	tcpx_queue_send(ep, tx_entry);
+	return FI_SUCCESS;
+}
+
+static ssize_t
+tcpx_tsendv(struct fid_ep *fid_ep, const struct iovec *iov, void **desc,
+	    size_t count, fi_addr_t dest_addr, uint64_t tag, void *context)
+{
+	struct tcpx_ep *ep;
+	struct tcpx_xfer_entry *tx_entry;
+
+	ep = container_of(fid_ep, struct tcpx_ep, util_ep.ep_fid);
+	tx_entry = tcpx_alloc_tsend(ep);
+	if (!tx_entry)
+		return -FI_EAGAIN;
+
+	tx_entry->hdr.tag_hdr.tag = tag;
+
+	tcpx_init_tx_iov(tx_entry, sizeof(tx_entry->hdr.tag_hdr), iov, count);
+	tx_entry->context = context;
+	tx_entry->cq_flags = tcpx_tx_completion_flag(ep, 0) |
+			     FI_TAGGED | FI_SEND;
+	tcpx_set_ack_flags(tx_entry, ep->util_ep.tx_op_flags);
+
+	tcpx_queue_send(ep, tx_entry);
+	return FI_SUCCESS;
+}
+
+
+static ssize_t
+tcpx_tinject(struct fid_ep *fid_ep, const void *buf, size_t len,
+	     fi_addr_t dest_addr, uint64_t tag)
+{
+	struct tcpx_ep *ep;
+	struct tcpx_xfer_entry *tx_entry;
+
+	ep = container_of(fid_ep, struct tcpx_ep, util_ep.ep_fid);
+	tx_entry = tcpx_alloc_tsend(ep);
+	if (!tx_entry)
+		return -FI_EAGAIN;
+	tx_entry->hdr.tag_hdr.tag = tag;
+
+	tcpx_init_tx_inject(tx_entry, sizeof(tx_entry->hdr.tag_hdr), buf, len);
+
+	tx_entry->cq_flags = FI_TAGGED | FI_SEND; /* set in case of error */
+
+	tcpx_queue_send(ep, tx_entry);
+	return FI_SUCCESS;
+}
+
+static ssize_t
+tcpx_tsenddata(struct fid_ep *fid_ep, const void *buf, size_t len, void *desc,
+	       uint64_t data, fi_addr_t dest_addr, uint64_t tag, void *context)
+{
+	struct tcpx_ep *ep;
+	struct tcpx_xfer_entry *tx_entry;
+
+	ep = container_of(fid_ep, struct tcpx_ep, util_ep.ep_fid);
+	tx_entry = tcpx_alloc_tsend(ep);
+	if (!tx_entry)
+		return -FI_EAGAIN;
+
+	tx_entry->hdr.base_hdr.flags |= TCPX_REMOTE_CQ_DATA;
+	tx_entry->hdr.tag_data_hdr.tag = tag;
+	tx_entry->hdr.tag_data_hdr.cq_data_hdr.cq_data = data;
+
+	tcpx_init_tx_buf(tx_entry, sizeof(tx_entry->hdr.tag_data_hdr),
+			 buf, len);
+	tx_entry->context = context;
+	tx_entry->cq_flags = tcpx_tx_completion_flag(ep, 0) |
+			     FI_TAGGED | FI_SEND;
+	tcpx_set_ack_flags(tx_entry, ep->util_ep.tx_op_flags);
+
+	tcpx_queue_send(ep, tx_entry);
+	return FI_SUCCESS;
+}
+
+static ssize_t
+tcpx_tinjectdata(struct fid_ep *fid_ep, const void *buf, size_t len,
+		 uint64_t data, fi_addr_t dest_addr, uint64_t tag)
+{
+	struct tcpx_ep *ep;
+	struct tcpx_xfer_entry *tx_entry;
+
+	ep = container_of(fid_ep, struct tcpx_ep, util_ep.ep_fid);
+
+	tx_entry = tcpx_alloc_tsend(ep);
+	if (!tx_entry)
+		return -FI_EAGAIN;
+
+	tx_entry->hdr.base_hdr.flags |= TCPX_REMOTE_CQ_DATA;
+	tx_entry->hdr.tag_data_hdr.tag = tag;
+	tx_entry->hdr.tag_data_hdr.cq_data_hdr.cq_data = data;
+
+	tcpx_init_tx_inject(tx_entry, sizeof(tx_entry->hdr.tag_data_hdr),
+			    buf, len);
+
+	tx_entry->cq_flags = FI_TAGGED | FI_SEND; /* set in case of error */
+
+	tcpx_queue_send(ep, tx_entry);
+	return FI_SUCCESS;
+}
+
+struct fi_ops_tagged tcpx_tagged_ops = {
+	.size = sizeof(struct fi_ops_tagged),
+	.recv = fi_no_tagged_recv,
+	.recvv = fi_no_tagged_recvv,
+	.recvmsg = fi_no_tagged_recvmsg,
+	.send = tcpx_tsend,
+	.sendv = tcpx_tsendv,
+	.sendmsg = tcpx_tsendmsg,
+	.inject = tcpx_tinject,
+	.senddata = tcpx_tsenddata,
+	.injectdata = tcpx_tinjectdata,
+};
diff --git a/deps/libfabric/prov/tcp/src/tcpx_progress.c b/deps/libfabric/prov/tcp/src/tcpx_progress.c
index 8d955fb1a8badb57bf054c0f6a6e59f010914ad2..249f71a04e6a65d796276d9459fd7626a7b832ac 100644
--- a/deps/libfabric/prov/tcp/src/tcpx_progress.c
+++ b/deps/libfabric/prov/tcp/src/tcpx_progress.c
@@ -43,131 +43,241 @@
 #include <ofi_iov.h>
 
 
-static void process_tx_entry(struct tcpx_xfer_entry *tx_entry)
+static int tcpx_send_msg(struct tcpx_ep *ep)
 {
-	struct tcpx_cq *tcpx_cq;
+	struct tcpx_xfer_entry *tx_entry;
+	ssize_t ret;
+	size_t len;
+
+	assert(ep->cur_tx.entry);
+	tx_entry = ep->cur_tx.entry;
+	ret = ofi_bsock_sendv(&ep->bsock, tx_entry->iov, tx_entry->iov_cnt,
+			      &len);
+	if (ret < 0 && ret != -FI_EINPROGRESS)
+		return ret;
+
+	if (ret == -FI_EINPROGRESS) {
+		/* If a transfer generated multiple async sends, we only
+		 * need to track the last async index to know when the entire
+		 * transfer has completed.
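+		 * For example (illustrative): if one transfer was split
+		 * into async sends 5, 6 and 7, completion of index 7
+		 * implies 5 and 6 finished as well, since async sends
+		 * complete in order.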
+		 */
+		tx_entry->async_index = ep->bsock.async_index;
+		tx_entry->ctrl_flags |= TCPX_ASYNC;
+	} else {
+		len = ret;
+	}
+
+	ep->cur_tx.data_left -= len;
+	if (ep->cur_tx.data_left) {
+		ofi_consume_iov(tx_entry->iov, &tx_entry->iov_cnt, len);
+		return -FI_EAGAIN;
+	}
+	return FI_SUCCESS;
+}
+
+static int tcpx_recv_msg_data(struct tcpx_ep *ep)
+{
+	struct tcpx_xfer_entry *rx_entry;
+	ssize_t ret;
+
+	if (!ep->cur_rx.data_left)
+		return FI_SUCCESS;
+
+	rx_entry = ep->cur_rx.entry;
+	ret = ofi_bsock_recvv(&ep->bsock, rx_entry->iov, rx_entry->iov_cnt);
+	if (ret < 0)
+		return ret;
+
+	ep->cur_rx.data_left -= ret;
+	if (!ep->cur_rx.data_left)
+		return FI_SUCCESS;
+
+	ofi_consume_iov(rx_entry->iov, &rx_entry->iov_cnt, ret);
+	if (!rx_entry->iov_cnt || !rx_entry->iov[0].iov_len)
+		return -FI_ETRUNC;
+
+	return -FI_EAGAIN;
+}
+
+void tcpx_progress_tx(struct tcpx_ep *ep)
+{
+	struct tcpx_xfer_entry *tx_entry;
+	struct tcpx_cq *cq;
 	int ret;
 
-	ret = tcpx_send_msg(tx_entry);
-	if (OFI_SOCK_TRY_SND_RCV_AGAIN(-ret))
-		return;
+	assert(fastlock_held(&ep->lock));
+	while (ep->cur_tx.entry) {
+		ret = tcpx_send_msg(ep);
+		if (OFI_SOCK_TRY_SND_RCV_AGAIN(-ret))
+			return;
 
-	/* Keep this path below as a single pass path.*/
-	tx_entry->ep->hdr_bswap(&tx_entry->hdr.base_hdr);
-	slist_remove_head(&tx_entry->ep->tx_queue);
+		tx_entry = ep->cur_tx.entry;
+		cq = container_of(ep->util_ep.tx_cq, struct tcpx_cq, util_cq);
 
-	if (ret) {
-		FI_WARN(&tcpx_prov, FI_LOG_DOMAIN, "msg send failed\n");
-		tcpx_ep_disable(tx_entry->ep, 0);
-		tcpx_cq_report_error(tx_entry->ep->util_ep.tx_cq,
-				     tx_entry, -ret);
-	} else {
-		if (tx_entry->hdr.base_hdr.flags &
-		    (OFI_DELIVERY_COMPLETE | OFI_COMMIT_COMPLETE)) {
+		if (ret) {
+			FI_WARN(&tcpx_prov, FI_LOG_DOMAIN, "msg send failed\n");
+			tcpx_cq_report_error(&cq->util_cq, tx_entry, -ret);
+			tcpx_free_xfer(cq, tx_entry);
+		} else if (tx_entry->ctrl_flags & TCPX_NEED_ACK) {
+			/* A SW ack guarantees the peer received the data, so
+			 * we can skip the async completion.
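+			 * The entry waits on need_ack_queue until the
+			 * peer's TCPX_OP_ACK arrives (see tcpx_handle_ack),
+			 * which then reports the completion.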
+			 */
 			slist_insert_tail(&tx_entry->entry,
-					  &tx_entry->ep->tx_rsp_pend_queue);
-			return;
+					  &ep->need_ack_queue);
+		} else if (tx_entry->ctrl_flags & TCPX_NEED_RESP) {
+			/* discard send but enable receive for completion */
+			assert(tx_entry->resp_entry);
+			tx_entry->resp_entry->ctrl_flags &= ~TCPX_INTERNAL_XFER;
+			tcpx_free_xfer(cq, tx_entry);
+		} else if ((tx_entry->ctrl_flags & TCPX_ASYNC) &&
+			   (ofi_val32_gt(tx_entry->async_index,
+					 ep->bsock.done_index))) {
+			slist_insert_tail(&tx_entry->entry,
+					  &ep->async_queue);
+		} else {
+			tcpx_cq_report_success(&cq->util_cq, tx_entry);
+			tcpx_free_xfer(cq, tx_entry);
+		}
+
+		if (!slist_empty(&ep->priority_queue)) {
+			ep->cur_tx.entry = container_of(slist_remove_head(
+							&ep->priority_queue),
+					     struct tcpx_xfer_entry, entry);
+			assert(ep->cur_tx.entry->ctrl_flags & TCPX_INTERNAL_XFER);
+		} else if (!slist_empty(&ep->tx_queue)) {
+			ep->cur_tx.entry = container_of(slist_remove_head(
+							&ep->tx_queue),
+					     struct tcpx_xfer_entry, entry);
+			assert(!(ep->cur_tx.entry->ctrl_flags & TCPX_INTERNAL_XFER));
+		} else {
+			ep->cur_tx.entry = NULL;
+			break;
 		}
-		tcpx_cq_report_success(tx_entry->ep->util_ep.tx_cq, tx_entry);
+
+		ep->cur_tx.data_left = ep->cur_tx.entry->hdr.base_hdr.size;
+		OFI_DBG_SET(ep->cur_tx.entry->hdr.base_hdr.id, ep->tx_id++);
+		ep->hdr_bswap(&ep->cur_tx.entry->hdr.base_hdr);
 	}
 
-	tcpx_cq = container_of(tx_entry->ep->util_ep.tx_cq,
-			       struct tcpx_cq, util_cq);
-	tcpx_xfer_entry_release(tcpx_cq, tx_entry);
+	/* tcpx_send_msg sends any buffered data first, but if we have no
+	 * other data to send, we still need to try to flush whatever
+	 * remains in the bsock send buffer.
+	 */
+	(void) ofi_bsock_flush(&ep->bsock);
 }
 
-static int tcpx_prepare_rx_entry_resp(struct tcpx_xfer_entry *rx_entry)
+static int tcpx_queue_ack(struct tcpx_xfer_entry *rx_entry)
 {
-	struct tcpx_cq *tcpx_tx_cq;
-	struct tcpx_xfer_entry *resp_entry;
-
-	tcpx_tx_cq = container_of(rx_entry->ep->util_ep.tx_cq,
-			       struct tcpx_cq, util_cq);
+	struct tcpx_ep *ep;
+	struct tcpx_cq *cq;
+	struct tcpx_xfer_entry *resp;
 
-	resp_entry = tcpx_xfer_entry_alloc(tcpx_tx_cq, TCPX_OP_MSG_RESP);
-	if (!resp_entry)
-		return -FI_EAGAIN;
+	ep = rx_entry->ep;
+	cq = container_of(ep->util_ep.tx_cq, struct tcpx_cq, util_cq);
 
-	resp_entry->iov[0].iov_base = (void *) &resp_entry->hdr;
-	resp_entry->iov[0].iov_len = sizeof(resp_entry->hdr.base_hdr);
-	resp_entry->iov_cnt = 1;
+	resp = tcpx_alloc_xfer(cq);
+	if (!resp)
+		return -FI_ENOMEM;
 
-	resp_entry->hdr.base_hdr.op = ofi_op_msg;
-	resp_entry->hdr.base_hdr.size = sizeof(resp_entry->hdr.base_hdr);
-	resp_entry->hdr.base_hdr.payload_off =
-		(uint8_t)sizeof(resp_entry->hdr.base_hdr);
+	resp->iov[0].iov_base = (void *) &resp->hdr;
+	resp->iov[0].iov_len = sizeof(resp->hdr.base_hdr);
+	resp->iov_cnt = 1;
 
-	resp_entry->flags = 0;
-	resp_entry->context = NULL;
-	resp_entry->rem_len = sizeof(resp_entry->hdr.base_hdr);
-	resp_entry->ep = rx_entry->ep;
+	resp->hdr.base_hdr.version = TCPX_HDR_VERSION;
+	resp->hdr.base_hdr.op_data = TCPX_OP_ACK;
+	resp->hdr.base_hdr.op = ofi_op_msg;
+	resp->hdr.base_hdr.size = sizeof(resp->hdr.base_hdr);
+	resp->hdr.base_hdr.hdr_size = (uint8_t) sizeof(resp->hdr.base_hdr);
 
-	resp_entry->ep->hdr_bswap(&resp_entry->hdr.base_hdr);
-	tcpx_tx_queue_insert(resp_entry->ep, resp_entry);
-	tcpx_cq_report_success(rx_entry->ep->util_ep.rx_cq, rx_entry);
+	resp->ctrl_flags = TCPX_INTERNAL_XFER;
+	resp->context = NULL;
+	resp->ep = ep;
 
-	tcpx_rx_msg_release(rx_entry);
+	tcpx_tx_queue_insert(ep, resp);
 	return FI_SUCCESS;
 }
 
-static int process_rx_entry(struct tcpx_xfer_entry *rx_entry)
+static int tcpx_update_rx_iov(struct tcpx_xfer_entry *rx_entry)
 {
-	int ret = FI_SUCCESS;
+	struct ofi_cq_rbuf_entry cq_entry;
+	int ret;
 
-	ret = tcpx_recv_msg_data(rx_entry);
-	if (OFI_SOCK_TRY_SND_RCV_AGAIN(-ret))
-		return ret;
+	assert(tcpx_dynamic_rbuf(rx_entry->ep));
+
+	cq_entry.ep_context = rx_entry->ep->util_ep.ep_fid.fid.context;
+	cq_entry.op_context = rx_entry->context;
+	cq_entry.flags = 0;
+	cq_entry.len = rx_entry->hdr.base_hdr.size -
+		       rx_entry->hdr.base_hdr.hdr_size;
+	cq_entry.buf = rx_entry->mrecv_msg_start;
+	tcpx_get_cq_info(rx_entry, &cq_entry.flags, &cq_entry.data,
+			 &cq_entry.tag);
 
+	rx_entry->iov_cnt = TCPX_IOV_LIMIT;
+	ret = (int) tcpx_dynamic_rbuf(rx_entry->ep)->
+		    get_rbuf(&cq_entry, &rx_entry->iov[0], &rx_entry->iov_cnt);
 	if (ret) {
 		FI_WARN(&tcpx_prov, FI_LOG_EP_DATA,
-			"msg recv Failed ret = %d\n", ret);
+			"get_rbuf callback failed %s\n",
+			fi_strerror(-ret));
+		return ret;
+	}
 
-		tcpx_ep_disable(rx_entry->ep, 0);
-		tcpx_cq_report_error(rx_entry->ep->util_ep.rx_cq, rx_entry, -ret);
-		tcpx_rx_msg_release(rx_entry);
-	} else if (rx_entry->hdr.base_hdr.flags & OFI_DELIVERY_COMPLETE) {
-		if (tcpx_prepare_rx_entry_resp(rx_entry))
-			rx_entry->ep->cur_rx_proc_fn = tcpx_prepare_rx_entry_resp;
-	} else {
-		tcpx_cq_report_success(rx_entry->ep->util_ep.rx_cq, rx_entry);
-		tcpx_rx_msg_release(rx_entry);
+	assert(rx_entry->iov_cnt <= TCPX_IOV_LIMIT);
+	ret = ofi_truncate_iov(rx_entry->iov, &rx_entry->iov_cnt,
+			       rx_entry->ep->cur_rx.data_left);
+	if (ret) {
+		FI_WARN(&tcpx_prov, FI_LOG_EP_DATA,
+			"dynamically provided rbuf is too small\n");
+		return ret;
 	}
-	return ret;
+
+	return 0;
 }
 
-static int tcpx_prepare_rx_write_resp(struct tcpx_xfer_entry *rx_entry)
+static int tcpx_process_recv(struct tcpx_ep *ep)
 {
-	struct tcpx_cq *tcpx_rx_cq, *tcpx_tx_cq;
-	struct tcpx_xfer_entry *resp_entry;
+	struct tcpx_xfer_entry *rx_entry;
+	int ret;
 
-	tcpx_tx_cq = container_of(rx_entry->ep->util_ep.tx_cq,
-				  struct tcpx_cq, util_cq);
+	rx_entry = ep->cur_rx.entry;
+retry:
+	ret = tcpx_recv_msg_data(ep);
+	if (ret) {
+		if (OFI_SOCK_TRY_SND_RCV_AGAIN(-ret))
+			return ret;
 
-	resp_entry = tcpx_xfer_entry_alloc(tcpx_tx_cq, TCPX_OP_MSG_RESP);
-	if (!resp_entry)
-		return -FI_EAGAIN;
+		if (ret != -FI_ETRUNC)
+			goto err;
+		assert(rx_entry->ctrl_flags & TCPX_NEED_DYN_RBUF);
+	}
 
-	resp_entry->iov[0].iov_base = (void *) &resp_entry->hdr;
-	resp_entry->iov[0].iov_len = sizeof(resp_entry->hdr.base_hdr);
-	resp_entry->iov_cnt = 1;
-
-	resp_entry->hdr.base_hdr.op = ofi_op_msg;
-	resp_entry->hdr.base_hdr.size = sizeof(resp_entry->hdr.base_hdr);
-	resp_entry->hdr.base_hdr.payload_off = (uint8_t)
-						sizeof(resp_entry->hdr.base_hdr);
-
-	resp_entry->flags &= ~FI_COMPLETION;
-	resp_entry->context = NULL;
-	resp_entry->rem_len = resp_entry->hdr.base_hdr.size;
-	resp_entry->ep = rx_entry->ep;
-	resp_entry->ep->hdr_bswap(&resp_entry->hdr.base_hdr);
-	tcpx_tx_queue_insert(resp_entry->ep, resp_entry);
-
-	tcpx_cq_report_success(rx_entry->ep->util_ep.rx_cq, rx_entry);
-	tcpx_rx_cq = container_of(rx_entry->ep->util_ep.rx_cq,
-				  struct tcpx_cq, util_cq);
-	tcpx_xfer_entry_release(tcpx_rx_cq, rx_entry);
-	return FI_SUCCESS;
+	if (rx_entry->ctrl_flags & TCPX_NEED_DYN_RBUF) {
+		ret = tcpx_update_rx_iov(rx_entry);
+		if (ret)
+			goto err;
+
+		rx_entry->ctrl_flags &= ~TCPX_NEED_DYN_RBUF;
+		goto retry;
+	}
+
+	if (rx_entry->hdr.base_hdr.flags & TCPX_DELIVERY_COMPLETE) {
+		ret = tcpx_queue_ack(rx_entry);
+		if (ret)
+			goto err;
+	}
+
+	tcpx_cq_report_success(ep->util_ep.rx_cq, rx_entry);
+	tcpx_free_rx(rx_entry);
+	tcpx_reset_rx(ep);
+	return 0;
+
+err:
+	FI_WARN(&tcpx_prov, FI_LOG_EP_DATA,
+		"msg recv failed ret = %d (%s)\n", ret, fi_strerror(-ret));
+	tcpx_cq_report_error(rx_entry->ep->util_ep.rx_cq, rx_entry, -ret);
+	tcpx_free_rx(rx_entry);
+	tcpx_reset_rx(ep);
+	return ret;
 }
 
 static void tcpx_pmem_commit(struct tcpx_xfer_entry *rx_entry)
@@ -179,8 +289,7 @@ static void tcpx_pmem_commit(struct tcpx_xfer_entry *rx_entry)
 	if (!ofi_pmem_commit)
 		return ;
 
-	if (rx_entry->hdr.base_hdr.flags &
-	    OFI_REMOTE_CQ_DATA)
+	if (rx_entry->hdr.base_hdr.flags & TCPX_REMOTE_CQ_DATA)
 		offset = sizeof(rx_entry->hdr.base_hdr) + sizeof(uint64_t);
 	else
 		offset = sizeof(rx_entry->hdr.base_hdr);
@@ -194,477 +303,526 @@ static void tcpx_pmem_commit(struct tcpx_xfer_entry *rx_entry)
 	}
 }
 
-static int process_remote_write(struct tcpx_xfer_entry *rx_entry)
+static int tcpx_process_remote_write(struct tcpx_ep *ep)
 {
-	struct tcpx_cq *tcpx_cq;
-	int ret = FI_SUCCESS;
+	struct tcpx_xfer_entry *rx_entry;
+	struct tcpx_cq *cq;
+	int ret;
 
-	ret = tcpx_recv_msg_data(rx_entry);
+	rx_entry = ep->cur_rx.entry;
+	ret = tcpx_recv_msg_data(ep);
 	if (OFI_SOCK_TRY_SND_RCV_AGAIN(-ret))
 		return ret;
 
-	if (ret) {
-		FI_WARN(&tcpx_prov, FI_LOG_DOMAIN,
-			"remote write Failed ret = %d\n",
-			ret);
-
-		tcpx_ep_disable(rx_entry->ep, 0);
-		tcpx_cq_report_error(rx_entry->ep->util_ep.rx_cq, rx_entry, -ret);
-		tcpx_cq = container_of(rx_entry->ep->util_ep.rx_cq,
-				       struct tcpx_cq, util_cq);
-		tcpx_xfer_entry_release(tcpx_cq, rx_entry);
+	cq = container_of(ep->util_ep.rx_cq, struct tcpx_cq, util_cq);
+	if (ret)
+		goto err;
 
-	} else if (rx_entry->hdr.base_hdr.flags &
-		  (OFI_DELIVERY_COMPLETE | OFI_COMMIT_COMPLETE)) {
+	if (rx_entry->hdr.base_hdr.flags &
+	    (TCPX_DELIVERY_COMPLETE | TCPX_COMMIT_COMPLETE)) {
 
-		if (rx_entry->hdr.base_hdr.flags & OFI_COMMIT_COMPLETE)
+		if (rx_entry->hdr.base_hdr.flags & TCPX_COMMIT_COMPLETE)
 			tcpx_pmem_commit(rx_entry);
 
-		if (tcpx_prepare_rx_write_resp(rx_entry))
-			rx_entry->ep->cur_rx_proc_fn = tcpx_prepare_rx_write_resp;
-	} else {
-		tcpx_cq_report_success(rx_entry->ep->util_ep.rx_cq, rx_entry);
-		tcpx_cq = container_of(rx_entry->ep->util_ep.rx_cq,
-				       struct tcpx_cq, util_cq);
-		tcpx_xfer_entry_release(tcpx_cq, rx_entry);
+		ret = tcpx_queue_ack(rx_entry);
+		if (ret)
+			goto err;
 	}
+
+	tcpx_cq_report_success(ep->util_ep.rx_cq, rx_entry);
+	tcpx_free_xfer(cq, rx_entry);
+	tcpx_reset_rx(ep);
+	return FI_SUCCESS;
+
+err:
+	FI_WARN(&tcpx_prov, FI_LOG_DOMAIN, "remote write failed %d\n", ret);
+	tcpx_free_xfer(cq, rx_entry);
+	tcpx_reset_rx(ep);
 	return ret;
 }
 
-static int process_remote_read(struct tcpx_xfer_entry *rx_entry)
+static int tcpx_process_remote_read(struct tcpx_ep *ep)
 {
-	struct tcpx_cq *tcpx_cq;
-	int ret = FI_SUCCESS;
+	struct tcpx_xfer_entry *rx_entry;
+	struct tcpx_cq *cq;
+	int ret;
+
+	rx_entry = ep->cur_rx.entry;
+	cq = container_of(ep->util_ep.tx_cq, struct tcpx_cq, util_cq);
 
-	ret = tcpx_recv_msg_data(rx_entry);
+	ret = tcpx_recv_msg_data(ep);
 	if (OFI_SOCK_TRY_SND_RCV_AGAIN(-ret))
 		return ret;
 
 	if (ret) {
 		FI_WARN(&tcpx_prov, FI_LOG_DOMAIN,
 			"msg recv Failed ret = %d\n", ret);
-		tcpx_ep_disable(rx_entry->ep, 0);
-		tcpx_cq_report_error(rx_entry->ep->util_ep.tx_cq, rx_entry, -ret);
+		tcpx_cq_report_error(&cq->util_cq, rx_entry, -ret);
 	} else {
-		tcpx_cq_report_success(rx_entry->ep->util_ep.tx_cq, rx_entry);
+		tcpx_cq_report_success(&cq->util_cq, rx_entry);
 	}
 
 	slist_remove_head(&rx_entry->ep->rma_read_queue);
-	tcpx_cq = container_of(rx_entry->ep->util_ep.tx_cq,
-			       struct tcpx_cq, util_cq);
-	tcpx_xfer_entry_release(tcpx_cq, rx_entry);
+	tcpx_free_xfer(cq, rx_entry);
+	tcpx_reset_rx(ep);
 	return ret;
 }
 
-static void tcpx_copy_rma_iov_to_msg_iov(struct tcpx_xfer_entry *xfer_entry)
+int tcpx_op_invalid(struct tcpx_ep *ep)
 {
-	struct ofi_rma_iov *rma_iov;
-	size_t offset;
-	int i;
-
-	if (xfer_entry->hdr.base_hdr.flags &
-	    OFI_REMOTE_CQ_DATA)
-		offset = sizeof(xfer_entry->hdr.base_hdr) + sizeof(uint64_t);
-	else
-		offset = sizeof(xfer_entry->hdr.base_hdr);
-
-	rma_iov = (struct ofi_rma_iov *) ((uint8_t *) &xfer_entry->hdr + offset);
-
-	xfer_entry->iov_cnt = xfer_entry->hdr.base_hdr.rma_iov_cnt;
-	for ( i = 0 ; i < xfer_entry->hdr.base_hdr.rma_iov_cnt; i++ ) {
-		xfer_entry->iov[i].iov_base = (void *) rma_iov[i].addr;
-		xfer_entry->iov[i].iov_len = rma_iov[i].len;
-	}
+	return -FI_EINVAL;
 }
 
-static int tcpx_prepare_rx_remote_read_resp(struct tcpx_xfer_entry *resp_entry)
+static struct tcpx_xfer_entry *tcpx_get_rx_entry(struct tcpx_ep *ep)
 {
-	struct ofi_rma_iov *rma_iov;
-	int i;
+	struct tcpx_xfer_entry *xfer;
+	struct tcpx_rx_ctx *srx;
+
+	if (ep->srx_ctx) {
+		srx = ep->srx_ctx;
+		fastlock_acquire(&srx->lock);
+		if (!slist_empty(&srx->rx_queue)) {
+			xfer = container_of(slist_remove_head(&srx->rx_queue),
+					    struct tcpx_xfer_entry, entry);
+			xfer->cq_flags |= tcpx_rx_completion_flag(ep, 0);
+		} else {
+			xfer = NULL;
+		}
+		fastlock_release(&srx->lock);
+	} else {
+		assert(fastlock_held(&ep->lock));
+		if (!slist_empty(&ep->rx_queue)) {
+			xfer = container_of(slist_remove_head(&ep->rx_queue),
+					    struct tcpx_xfer_entry, entry);
+			ep->rx_avail++;
+		} else {
+			xfer = NULL;
+		}
+	}
 
-	resp_entry->iov[0].iov_base = (void *) &resp_entry->hdr;
-	resp_entry->iov[0].iov_len = sizeof(resp_entry->hdr.base_hdr);
+	return xfer;
+}
 
-	rma_iov = (struct ofi_rma_iov *) ((uint8_t *)
-		  &resp_entry->hdr + sizeof(resp_entry->hdr.base_hdr));
-
-	resp_entry->iov_cnt = 1 + resp_entry->hdr.base_hdr.rma_iov_cnt;
-	resp_entry->hdr.base_hdr.size = resp_entry->iov[0].iov_len;
-	for ( i = 0 ; i < resp_entry->hdr.base_hdr.rma_iov_cnt ; i++ ) {
-		resp_entry->iov[i+1].iov_base =	(void *) (uintptr_t)rma_iov[i].addr;
-		resp_entry->iov[i+1].iov_len = rma_iov[i].len;
-		resp_entry->hdr.base_hdr.size += resp_entry->iov[i+1].iov_len;
-	}
+static int tcpx_handle_ack(struct tcpx_ep *ep)
+{
+	struct tcpx_xfer_entry *tx_entry;
+	struct tcpx_cq *cq;
 
-	resp_entry->hdr.base_hdr.op = ofi_op_read_rsp;
-	resp_entry->hdr.base_hdr.payload_off = (uint8_t)
-						sizeof(resp_entry->hdr.base_hdr);
+	if (ep->cur_rx.hdr.base_hdr.size !=
+	    sizeof(ep->cur_rx.hdr.base_hdr))
+		return -FI_EIO;
 
-	resp_entry->flags &= ~FI_COMPLETION;
-	resp_entry->context = NULL;
-	resp_entry->rem_len = resp_entry->hdr.base_hdr.size;
+	assert(!slist_empty(&ep->need_ack_queue));
+	tx_entry = container_of(slist_remove_head(&ep->need_ack_queue),
+				struct tcpx_xfer_entry, entry);
 
-	resp_entry->ep->hdr_bswap(&resp_entry->hdr.base_hdr);
-	tcpx_tx_queue_insert(resp_entry->ep, resp_entry);
-	resp_entry->ep->cur_rx_entry = NULL;
+	cq = container_of(ep->util_ep.tx_cq, struct tcpx_cq, util_cq);
+	tcpx_cq_report_success(ep->util_ep.tx_cq, tx_entry);
+	tcpx_free_xfer(cq, tx_entry);
+	tcpx_reset_rx(ep);
 	return FI_SUCCESS;
 }
 
-static int tcpx_validate_rx_rma_data(struct tcpx_xfer_entry *rx_entry,
-				     uint64_t access)
+int tcpx_op_msg(struct tcpx_ep *ep)
 {
-	struct ofi_mr_map *map = &rx_entry->ep->util_ep.domain->mr_map;
-	struct ofi_rma_iov *rma_iov;
-	size_t offset;
-	int i, ret;
+	struct tcpx_xfer_entry *rx_entry;
+	struct tcpx_cur_rx *msg = &ep->cur_rx;
+	size_t msg_len;
+	int ret;
 
-	if (rx_entry->hdr.base_hdr.flags & OFI_REMOTE_CQ_DATA)
-		offset = sizeof(rx_entry->hdr.base_hdr) + sizeof(uint64_t);
-	else
-		offset = sizeof(rx_entry->hdr.base_hdr);
+	if (msg->hdr.base_hdr.op_data == TCPX_OP_ACK)
+		return tcpx_handle_ack(ep);
 
-	rma_iov = (struct ofi_rma_iov *) ((uint8_t *) &rx_entry->hdr + offset);
+	msg_len = (msg->hdr.base_hdr.size - msg->hdr.base_hdr.hdr_size);
 
-	for ( i = 0 ; i < rx_entry->hdr.base_hdr.rma_iov_cnt ; i++) {
-		ret = ofi_mr_verify(map, rma_iov[i].len,
-				    (uintptr_t *)&rma_iov[i].addr,
-				    rma_iov[i].key, access);
-		if (ret) {
-			FI_WARN(&tcpx_prov, FI_LOG_EP_DATA,
-			       "invalid rma iov received\n");
-			return -FI_EINVAL;
+	rx_entry = tcpx_get_rx_entry(ep);
+	if (!rx_entry)
+		return -FI_EAGAIN;
+
+	memcpy(&rx_entry->hdr, &msg->hdr,
+	       (size_t) msg->hdr.base_hdr.hdr_size);
+	rx_entry->ep = ep;
+	rx_entry->mrecv_msg_start = rx_entry->iov[0].iov_base;
+
+	if (tcpx_dynamic_rbuf(ep)) {
+		rx_entry->ctrl_flags = TCPX_NEED_DYN_RBUF;
+
+		if (msg->hdr.base_hdr.flags & TCPX_TAGGED) {
+			/* Raw message, no rxm header */
+			rx_entry->iov_cnt = 0;
+		} else {
+			/* Receiving only rxm header */
+			assert(msg_len >= ofi_total_iov_len(rx_entry->iov,
+							    rx_entry->iov_cnt));
 		}
+	} else {
+		ret = ofi_truncate_iov(rx_entry->iov, &rx_entry->iov_cnt,
+				       msg_len);
+		if (ret)
+			goto truncate_err;
 	}
-	return FI_SUCCESS;
-}
-
-int tcpx_op_invalid(struct tcpx_ep *tcpx_ep)
-{
-	return -FI_EINVAL;
-}
 
-static void tcpx_rx_setup(struct tcpx_ep *ep, struct tcpx_xfer_entry *rx_entry,
-			  tcpx_rx_process_fn_t process_fn)
-{
-	ep->cur_rx_entry = rx_entry;
-	ep->cur_rx_proc_fn = process_fn;
+	ep->cur_rx.entry = rx_entry;
+	ep->cur_rx.handler = tcpx_process_recv;
+	return tcpx_process_recv(ep);
 
-	/* Reset to receive next message */
-	ep->cur_rx_msg.hdr_len = sizeof(ep->cur_rx_msg.hdr.base_hdr);
-	ep->cur_rx_msg.done_len = 0;
+truncate_err:
+	FI_WARN(&tcpx_prov, FI_LOG_EP_DATA,
+		"posted rx buffer size is not big enough\n");
+	tcpx_cq_report_error(rx_entry->ep->util_ep.rx_cq, rx_entry, -ret);
+	tcpx_free_rx(rx_entry);
+	return ret;
 }
 
-int tcpx_op_msg(struct tcpx_ep *tcpx_ep)
+int tcpx_op_tagged(struct tcpx_ep *ep)
 {
 	struct tcpx_xfer_entry *rx_entry;
-	struct tcpx_xfer_entry *tx_entry;
-	struct tcpx_cq *tcpx_cq;
-	struct tcpx_cur_rx_msg *cur_rx_msg = &tcpx_ep->cur_rx_msg;
+	struct tcpx_cur_rx *msg = &ep->cur_rx;
 	size_t msg_len;
+	uint64_t tag;
 	int ret;
 
-	if (cur_rx_msg->hdr.base_hdr.op_data == TCPX_OP_MSG_RESP) {
-		assert(!slist_empty(&tcpx_ep->tx_rsp_pend_queue));
-		tx_entry = container_of(tcpx_ep->tx_rsp_pend_queue.head,
-					struct tcpx_xfer_entry, entry);
+	assert(ep->srx_ctx && !tcpx_dynamic_rbuf(ep));
+	msg_len = (msg->hdr.base_hdr.size - msg->hdr.base_hdr.hdr_size);
 
-		tcpx_cq = container_of(tcpx_ep->util_ep.tx_cq, struct tcpx_cq,
-				       util_cq);
-		tcpx_cq_report_success(tx_entry->ep->util_ep.tx_cq, tx_entry);
+	tag = (msg->hdr.base_hdr.flags & FI_REMOTE_CQ_DATA) ?
+	      msg->hdr.tag_data_hdr.tag : msg->hdr.tag_hdr.tag;
 
-		slist_remove_head(&tx_entry->ep->tx_rsp_pend_queue);
-		tcpx_xfer_entry_release(tcpx_cq, tx_entry);
-		tcpx_rx_setup(tcpx_ep, NULL, NULL);
+	rx_entry = ep->srx_ctx->match_tag_rx(ep->srx_ctx, ep, tag);
+	if (!rx_entry)
 		return -FI_EAGAIN;
-	}
 
-	msg_len = (tcpx_ep->cur_rx_msg.hdr.base_hdr.size -
-		   tcpx_ep->cur_rx_msg.hdr.base_hdr.payload_off);
-
-	if (tcpx_ep->srx_ctx){
-		rx_entry = tcpx_srx_next_xfer_entry(tcpx_ep->srx_ctx,
-						    tcpx_ep, msg_len);
-		if (!rx_entry)
-			return -FI_EAGAIN;
-
-		rx_entry->flags |= tcpx_ep->util_ep.rx_op_flags & FI_COMPLETION;
-	} else {
-		if (slist_empty(&tcpx_ep->rx_queue))
-			return -FI_EAGAIN;
-
-		rx_entry = container_of(tcpx_ep->rx_queue.head,
-					struct tcpx_xfer_entry, entry);
-
-		rx_entry->rem_len = ofi_total_iov_len(rx_entry->iov,
-						      rx_entry->iov_cnt) - msg_len;
-		slist_remove_head(&tcpx_ep->rx_queue);
-	}
-
-	memcpy(&rx_entry->hdr, &tcpx_ep->cur_rx_msg.hdr,
-	       (size_t) tcpx_ep->cur_rx_msg.hdr.base_hdr.payload_off);
-	rx_entry->ep = tcpx_ep;
-	rx_entry->hdr.base_hdr.op_data = TCPX_OP_MSG_RECV;
-	rx_entry->mrecv_msg_start = rx_entry->iov[0].iov_base;
+	rx_entry->cq_flags |= tcpx_rx_completion_flag(ep, 0);
+	memcpy(&rx_entry->hdr, &msg->hdr,
+	       (size_t) msg->hdr.base_hdr.hdr_size);
+	rx_entry->ep = ep;
 
 	ret = ofi_truncate_iov(rx_entry->iov, &rx_entry->iov_cnt, msg_len);
-	if (ret) {
-		FI_WARN(&tcpx_prov, FI_LOG_DOMAIN,
-			"posted rx buffer size is not big enough\n");
-		tcpx_cq_report_error(rx_entry->ep->util_ep.rx_cq,
-				     rx_entry, -ret);
-		tcpx_rx_msg_release(rx_entry);
-		return ret;
-	}
+	if (ret)
+		goto truncate_err;
 
-	if (cur_rx_msg->hdr.base_hdr.flags & OFI_REMOTE_CQ_DATA)
-		rx_entry->flags |= FI_REMOTE_CQ_DATA;
+	ep->cur_rx.entry = rx_entry;
+	ep->cur_rx.handler = tcpx_process_recv;
+	return tcpx_process_recv(ep);
 
-	tcpx_rx_setup(tcpx_ep, rx_entry, process_rx_entry);
-	return FI_SUCCESS;
+truncate_err:
+	FI_WARN(&tcpx_prov, FI_LOG_EP_DATA,
+		"posted rx buffer size is not big enough\n");
+	tcpx_cq_report_error(rx_entry->ep->util_ep.rx_cq, rx_entry, -ret);
+	tcpx_free_rx(rx_entry);
+	return ret;
 }
 
-int tcpx_op_read_req(struct tcpx_ep *tcpx_ep)
+int tcpx_op_read_req(struct tcpx_ep *ep)
 {
-	struct tcpx_xfer_entry *rx_entry;
-	struct tcpx_cq *tcpx_cq;
-	int ret;
+	struct tcpx_xfer_entry *resp;
+	struct tcpx_cq *cq;
+	struct ofi_rma_iov *rma_iov;
+	int i, ret;
 
-	/* The read request will generate a response once done,
-	 * so the xfer_entry will become a transmit and returned
-	 * to the tx cq buffer pool.
-	 */
-	tcpx_cq = container_of(tcpx_ep->util_ep.tx_cq,
-			       struct tcpx_cq, util_cq);
+	cq = container_of(ep->util_ep.tx_cq, struct tcpx_cq, util_cq);
+	resp = tcpx_alloc_xfer(cq);
+	if (!resp)
+		return -FI_ENOMEM;
 
-	rx_entry = tcpx_xfer_entry_alloc(tcpx_cq, TCPX_OP_REMOTE_READ);
-	if (!rx_entry)
-		return -FI_EAGAIN;
+	memcpy(&resp->hdr, &ep->cur_rx.hdr,
+	       (size_t) ep->cur_rx.hdr.base_hdr.hdr_size);
+	resp->hdr.base_hdr.op_data = 0;
+	resp->ep = ep;
 
-	memcpy(&rx_entry->hdr, &tcpx_ep->cur_rx_msg.hdr,
-	       (size_t) tcpx_ep->cur_rx_msg.hdr.base_hdr.payload_off);
-	rx_entry->hdr.base_hdr.op_data = TCPX_OP_REMOTE_READ;
-	rx_entry->ep = tcpx_ep;
-	rx_entry->rem_len = (rx_entry->hdr.base_hdr.size -
-			      tcpx_ep->cur_rx_msg.done_len);
+	resp->iov[0].iov_base = (void *) &resp->hdr;
+	resp->iov[0].iov_len = sizeof(resp->hdr.base_hdr);
 
-	ret = tcpx_validate_rx_rma_data(rx_entry, FI_REMOTE_READ);
-	if (ret) {
-		FI_WARN(&tcpx_prov, FI_LOG_DOMAIN,
-			"invalid rma data\n");
-		tcpx_xfer_entry_release(tcpx_cq, rx_entry);
-		return ret;
+	rma_iov = (struct ofi_rma_iov *) ((uint8_t *)
+		  &resp->hdr + sizeof(resp->hdr.base_hdr));
+
+	resp->iov_cnt = 1 + resp->hdr.base_hdr.rma_iov_cnt;
+	resp->hdr.base_hdr.size = resp->iov[0].iov_len;
+	for (i = 0; i < resp->hdr.base_hdr.rma_iov_cnt; i++) {
+		ret = ofi_mr_verify(&ep->util_ep.domain->mr_map, rma_iov[i].len,
+				    (uintptr_t *) &rma_iov[i].addr,
+				    rma_iov[i].key, FI_REMOTE_READ);
+		if (ret) {
+			FI_WARN(&tcpx_prov, FI_LOG_EP_DATA,
+			       "invalid rma iov received\n");
+			tcpx_free_xfer(cq, resp);
+			return ret;
+		}
+
+		resp->iov[i + 1].iov_base = (void *) (uintptr_t)
+					    rma_iov[i].addr;
+		resp->iov[i + 1].iov_len = rma_iov[i].len;
+		resp->hdr.base_hdr.size += resp->iov[i + 1].iov_len;
 	}
 
-	tcpx_rx_setup(tcpx_ep, rx_entry, tcpx_prepare_rx_remote_read_resp);
+	resp->hdr.base_hdr.op = ofi_op_read_rsp;
+	resp->hdr.base_hdr.hdr_size = (uint8_t) sizeof(resp->hdr.base_hdr);
+
+	resp->ctrl_flags = TCPX_INTERNAL_XFER;
+	resp->context = NULL;
+
+	tcpx_tx_queue_insert(ep, resp);
+	tcpx_reset_rx(ep);
 	return FI_SUCCESS;
 }
 
-int tcpx_op_write(struct tcpx_ep *tcpx_ep)
+int tcpx_op_write(struct tcpx_ep *ep)
 {
 	struct tcpx_xfer_entry *rx_entry;
-	struct tcpx_cq *tcpx_cq;
-	int ret;
-
-	tcpx_cq = container_of(tcpx_ep->util_ep.rx_cq,
-			       struct tcpx_cq, util_cq);
+	struct tcpx_cq *cq;
+	struct ofi_rma_iov *rma_iov;
+	int ret, i;
 
-	rx_entry = tcpx_xfer_entry_alloc(tcpx_cq, TCPX_OP_REMOTE_WRITE);
+	cq = container_of(ep->util_ep.rx_cq, struct tcpx_cq, util_cq);
+	rx_entry = tcpx_alloc_xfer(cq);
 	if (!rx_entry)
-		return -FI_EAGAIN;
+		return -FI_ENOMEM;
 
-	rx_entry->flags = 0;
-	if (tcpx_ep->cur_rx_msg.hdr.base_hdr.flags & OFI_REMOTE_CQ_DATA)
-		rx_entry->flags = (FI_COMPLETION |
-				   FI_REMOTE_CQ_DATA | FI_REMOTE_WRITE);
+	if (ep->cur_rx.hdr.base_hdr.flags & TCPX_REMOTE_CQ_DATA) {
+		rx_entry->cq_flags = (FI_COMPLETION | FI_REMOTE_WRITE |
+				      FI_REMOTE_CQ_DATA);
+		rma_iov = (struct ofi_rma_iov *) ((uint8_t *) &rx_entry->hdr +
+			   sizeof(rx_entry->hdr.cq_data_hdr));
+	} else {
+		rx_entry->ctrl_flags = TCPX_INTERNAL_XFER;
+		rma_iov = (struct ofi_rma_iov *) ((uint8_t *) &rx_entry->hdr +
+			  sizeof(rx_entry->hdr.base_hdr));
+	}
 
-	memcpy(&rx_entry->hdr, &tcpx_ep->cur_rx_msg.hdr,
-	       (size_t) tcpx_ep->cur_rx_msg.hdr.base_hdr.payload_off);
-	rx_entry->hdr.base_hdr.op_data = TCPX_OP_REMOTE_WRITE;
-	rx_entry->ep = tcpx_ep;
-	rx_entry->rem_len = (rx_entry->hdr.base_hdr.size -
-			      tcpx_ep->cur_rx_msg.done_len);
+	memcpy(&rx_entry->hdr, &ep->cur_rx.hdr,
+	       (size_t) ep->cur_rx.hdr.base_hdr.hdr_size);
+	rx_entry->hdr.base_hdr.op_data = 0;
+	rx_entry->ep = ep;
 
-	ret = tcpx_validate_rx_rma_data(rx_entry, FI_REMOTE_WRITE);
-	if (ret) {
-		FI_WARN(&tcpx_prov, FI_LOG_DOMAIN,
-			"invalid rma data\n");
-		tcpx_xfer_entry_release(tcpx_cq, rx_entry);
-		return ret;
+	rx_entry->iov_cnt = rx_entry->hdr.base_hdr.rma_iov_cnt;
+	for (i = 0; i < rx_entry->hdr.base_hdr.rma_iov_cnt; i++) {
+		ret = ofi_mr_verify(&ep->util_ep.domain->mr_map, rma_iov[i].len,
+				    (uintptr_t *) &rma_iov[i].addr,
+				    rma_iov[i].key, FI_REMOTE_WRITE);
+		if (ret) {
+			FI_WARN(&tcpx_prov, FI_LOG_EP_DATA,
+			       "invalid rma iov received\n");
+			tcpx_free_xfer(cq, rx_entry);
+			return ret;
+		}
+		rx_entry->iov[i].iov_base = (void *) (uintptr_t)
+					      rma_iov[i].addr;
+		rx_entry->iov[i].iov_len = rma_iov[i].len;
 	}
 
-	tcpx_copy_rma_iov_to_msg_iov(rx_entry);
-	tcpx_rx_setup(tcpx_ep, rx_entry, process_remote_write);
-	return FI_SUCCESS;
-
+	ep->cur_rx.entry = rx_entry;
+	ep->cur_rx.handler = tcpx_process_remote_write;
+	return tcpx_process_remote_write(ep);
 }
 
-int tcpx_op_read_rsp(struct tcpx_ep *tcpx_ep)
+int tcpx_op_read_rsp(struct tcpx_ep *ep)
 {
 	struct tcpx_xfer_entry *rx_entry;
 	struct slist_entry *entry;
 
-	if (slist_empty(&tcpx_ep->rma_read_queue))
+	if (slist_empty(&ep->rma_read_queue))
 		return -FI_EINVAL;
 
-	entry = tcpx_ep->rma_read_queue.head;
-	rx_entry = container_of(entry, struct tcpx_xfer_entry,
-				entry);
+	entry = ep->rma_read_queue.head;
+	rx_entry = container_of(entry, struct tcpx_xfer_entry, entry);
 
-	memcpy(&rx_entry->hdr, &tcpx_ep->cur_rx_msg.hdr,
-	       (size_t) tcpx_ep->cur_rx_msg.hdr.base_hdr.payload_off);
-	rx_entry->hdr.base_hdr.op_data = TCPX_OP_READ_RSP;
-	rx_entry->rem_len = (rx_entry->hdr.base_hdr.size -
-			     tcpx_ep->cur_rx_msg.done_len);
+	memcpy(&rx_entry->hdr, &ep->cur_rx.hdr,
+	       (size_t) ep->cur_rx.hdr.base_hdr.hdr_size);
+	rx_entry->hdr.base_hdr.op_data = 0;
 
-	tcpx_rx_setup(tcpx_ep, rx_entry, process_remote_read);
-	return FI_SUCCESS;
+	ep->cur_rx.entry = rx_entry;
+	ep->cur_rx.handler = tcpx_process_remote_read;
+	return tcpx_process_remote_read(ep);
 }
 
-static int tcpx_get_next_rx_hdr(struct tcpx_ep *ep)
+static int tcpx_recv_hdr(struct tcpx_ep *ep)
 {
+	size_t len;
+	void *buf;
 	ssize_t ret;
 
-	ret = tcpx_recv_hdr(ep->sock, &ep->stage_buf, &ep->cur_rx_msg);
+	assert(ep->cur_rx.hdr_done < ep->cur_rx.hdr_len);
+
+next_hdr:
+	buf = (uint8_t *) &ep->cur_rx.hdr + ep->cur_rx.hdr_done;
+	len = ep->cur_rx.hdr_len - ep->cur_rx.hdr_done;
+	ret = ofi_bsock_recv(&ep->bsock, buf, len);
 	if (ret < 0)
 		return (int) ret;
 
-	ep->cur_rx_msg.done_len += ret;
-	if (ep->cur_rx_msg.done_len >= sizeof(ep->cur_rx_msg.hdr.base_hdr)) {
-		if (ep->cur_rx_msg.hdr.base_hdr.payload_off > TCPX_MAX_HDR_SZ) {
+	ep->cur_rx.hdr_done += ret;
+	if (ep->cur_rx.hdr_done == sizeof(ep->cur_rx.hdr.base_hdr)) {
+		assert(ep->cur_rx.hdr_len == sizeof(ep->cur_rx.hdr.base_hdr));
+
+		if (ep->cur_rx.hdr.base_hdr.hdr_size > TCPX_MAX_HDR) {
 			FI_WARN(&tcpx_prov, FI_LOG_EP_DATA,
 				"Payload offset is too large\n");
 			return -FI_EIO;
 		}
-		ep->cur_rx_msg.hdr_len = (size_t) ep->cur_rx_msg.hdr.
-						  base_hdr.payload_off;
-
-		if (ep->cur_rx_msg.hdr_len > ep->cur_rx_msg.done_len) {
-			ret = tcpx_recv_hdr(ep->sock, &ep->stage_buf,
-					    &ep->cur_rx_msg);
-			if (ret < 0)
-				return (int) ret;
+		ep->cur_rx.hdr_len = (size_t) ep->cur_rx.hdr.base_hdr.hdr_size;
+		if (ep->cur_rx.hdr_done < ep->cur_rx.hdr_len)
+			goto next_hdr;
 
-			ep->cur_rx_msg.done_len += ret;
-		}
+	} else if (ep->cur_rx.hdr_done < ep->cur_rx.hdr_len) {
+		return -FI_EAGAIN;
 	}
 
-	if (ep->cur_rx_msg.done_len < ep->cur_rx_msg.hdr_len)
+	if (ep->cur_rx.hdr_done < ep->cur_rx.hdr_len)
 		return -FI_EAGAIN;
 
-	ep->hdr_bswap(&ep->cur_rx_msg.hdr.base_hdr);
-	return FI_SUCCESS;
+	ep->hdr_bswap(&ep->cur_rx.hdr.base_hdr);
+	assert(ep->cur_rx.hdr.base_hdr.id == ep->rx_id++);
+	if (ep->cur_rx.hdr.base_hdr.op >= ARRAY_SIZE(ep->start_op)) {
+		FI_WARN(&tcpx_prov, FI_LOG_EP_DATA,
+			"Received invalid opcode\n");
+		return -FI_EIO;
+	}
+
+	ep->cur_rx.data_left = ep->cur_rx.hdr.base_hdr.size -
+			       ep->cur_rx.hdr.base_hdr.hdr_size;
+	ep->cur_rx.handler = ep->start_op[ep->cur_rx.hdr.base_hdr.op];
+
+	return ep->cur_rx.handler(ep);
 }
 
-/* Must hold ep lock */
 void tcpx_progress_rx(struct tcpx_ep *ep)
 {
 	int ret;
 
-	if (!ep->cur_rx_entry &&
-	    (ep->stage_buf.cur_pos == ep->stage_buf.bytes_avail)) {
-		ret = tcpx_read_to_buffer(ep->sock, &ep->stage_buf);
-		if (ret)
-			goto err;
-	}
-
+	assert(fastlock_held(&ep->lock));
 	do {
-		if (!ep->cur_rx_entry) {
-			if (ep->cur_rx_msg.done_len < ep->cur_rx_msg.hdr_len) {
-				ret = tcpx_get_next_rx_hdr(ep);
-				if (ret)
-					goto err;
-			}
-
-			if (ep->cur_rx_msg.hdr.base_hdr.op >=
-			    ARRAY_SIZE(ep->start_op)) {
-				FI_WARN(&tcpx_prov, FI_LOG_EP_DATA,
-					"Received invalid opcode\n");
-				ret = -FI_ENOTCONN; /* force shutdown */
-				goto err;
-			}
-			ret = ep->start_op[ep->cur_rx_msg.hdr.base_hdr.op](ep);
-			if (ret)
-				goto err;
+		if (ep->cur_rx.hdr_done < ep->cur_rx.hdr_len) {
+			ret = tcpx_recv_hdr(ep);
+		} else {
+			ret = ep->cur_rx.handler(ep);
 		}
-		assert(ep->cur_rx_proc_fn);
-		ep->cur_rx_proc_fn(ep->cur_rx_entry);
-
-	} while (ep->stage_buf.cur_pos < ep->stage_buf.bytes_avail);
 
-	return;
-err:
-	if (OFI_SOCK_TRY_SND_RCV_AGAIN(-ret))
-		return;
+	} while (!ret && ofi_bsock_readable(&ep->bsock));
 
-	if (ret == -FI_ENOTCONN)
+	if (ret && !OFI_SOCK_TRY_SND_RCV_AGAIN(-ret))
 		tcpx_ep_disable(ep, 0);
 }
 
-/* Must hold ep lock */
-void tcpx_progress_tx(struct tcpx_ep *ep)
+void tcpx_progress_async(struct tcpx_ep *ep)
 {
-	struct tcpx_xfer_entry *tx_entry;
-	struct slist_entry *entry;
-
-	if (!slist_empty(&ep->tx_queue)) {
-		entry = ep->tx_queue.head;
-		tx_entry = container_of(entry, struct tcpx_xfer_entry, entry);
-		process_tx_entry(tx_entry);
+	struct tcpx_xfer_entry *xfer;
+	uint32_t done;
+
+	done = ofi_bsock_async_done(&tcpx_prov, &ep->bsock);
+	while (!slist_empty(&ep->async_queue)) {
+		xfer = container_of(ep->async_queue.head,
+				    struct tcpx_xfer_entry, entry);
+		if (ofi_val32_gt(xfer->async_index, done))
+			break;
+
+		slist_remove_head(&ep->async_queue);
+		tcpx_cq_report_success(ep->util_ep.tx_cq, xfer);
+		tcpx_free_tx(xfer);
 	}
 }
 
-int tcpx_try_func(void *util_ep)
+static bool tcpx_tx_pending(struct tcpx_ep *ep)
+{
+	return ep->cur_tx.entry || ofi_bsock_tosend(&ep->bsock);
+}
+
+static int tcpx_mod_epoll(struct tcpx_ep *ep, struct util_wait_fd *wait_fd)
 {
 	uint32_t events;
-	struct util_wait_fd *wait_fd;
-	struct tcpx_ep *ep;
 	int ret;
 
-	ep = container_of(util_ep, struct tcpx_ep, util_ep);
-	wait_fd = container_of(((struct util_ep *) util_ep)->tx_cq->wait,
-			       struct util_wait_fd, util_wait);
-
-	fastlock_acquire(&ep->lock);
-	if (!slist_empty(&ep->tx_queue) && !ep->pollout_set) {
-		ep->pollout_set = true;
+	assert(fastlock_held(&ep->lock));
+	if (ep->pollout_set) {
 		events = (wait_fd->util_wait.wait_obj == FI_WAIT_FD) ?
 			 (OFI_EPOLL_IN | OFI_EPOLL_OUT) : (POLLIN | POLLOUT);
-		goto epoll_mod;
-	} else if (slist_empty(&ep->tx_queue) && ep->pollout_set) {
-		ep->pollout_set = false;
+	} else {
 		events = (wait_fd->util_wait.wait_obj == FI_WAIT_FD) ?
 			 OFI_EPOLL_IN : POLLIN;
-		goto epoll_mod;
 	}
-	fastlock_release(&ep->lock);
-	return FI_SUCCESS;
 
-epoll_mod:
 	ret = (wait_fd->util_wait.wait_obj == FI_WAIT_FD) ?
-	      ofi_epoll_mod(wait_fd->epoll_fd, ep->sock, events,
+	      ofi_epoll_mod(wait_fd->epoll_fd, ep->bsock.sock, events,
 			    &ep->util_ep.ep_fid.fid) :
-	      ofi_pollfds_mod(wait_fd->pollfds, ep->sock, events,
+	      ofi_pollfds_mod(wait_fd->pollfds, ep->bsock.sock, events,
 			      &ep->util_ep.ep_fid.fid);
 	if (ret)
 		FI_WARN(&tcpx_prov, FI_LOG_EP_DATA,
 			"epoll modify failed\n");
+
+	return ret;
+}
+
+/* We may need to send data in response to received requests,
+ * such as delivery complete acks or RMA read responses.  So,
+ * even if this is the Rx CQ, we need to progress transmits.
+ * We also need to keep the rx and tx epoll wait fds in sync,
+ * such that we ask for POLLOUT on both or neither.  This is
+ * required in case they share the same wait set and underlying
+ * epoll fd.  So we only maintain a single pollout_set state
+ * variable rather than trying to track them independently.
+ * The latter does not work if the epoll fd behind the tx
+ * and rx CQs is the same fd.
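+ *
+ * For example (illustrative): if the tx and rx CQs share one epoll
+ * fd, clearing POLLOUT on behalf of an idle rx path while the tx
+ * path still has data pending would lose the tx wakeup, because the
+ * socket is registered with that epoll fd only once.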
+ */
+int tcpx_update_epoll(struct tcpx_ep *ep)
+{
+	struct util_wait_fd *rx_wait, *tx_wait;
+	int ret;
+
+	assert(fastlock_held(&ep->lock));
+	if ((tcpx_tx_pending(ep) && ep->pollout_set) ||
+	    (!tcpx_tx_pending(ep) && !ep->pollout_set))
+		return FI_SUCCESS;
+
+	rx_wait = ep->util_ep.rx_cq ?
+		  container_of(ep->util_ep.rx_cq->wait,
+			       struct util_wait_fd, util_wait) : NULL;
+	tx_wait = ep->util_ep.tx_cq ?
+		  container_of(ep->util_ep.tx_cq->wait,
+		  	       struct util_wait_fd, util_wait) : NULL;
+
+	ep->pollout_set = tcpx_tx_pending(ep);
+	ret = tcpx_mod_epoll(ep, rx_wait);
+	if (!ret && rx_wait != tx_wait)
+		ret = tcpx_mod_epoll(ep, tx_wait);
+
+	if (ret)
+		ep->pollout_set = false;
+	return ret;
+}
+
+int tcpx_try_func(void *util_ep)
+{
+	struct tcpx_ep *ep;
+	int ret;
+
+	ep = container_of(util_ep, struct tcpx_ep, util_ep);
+	fastlock_acquire(&ep->lock);
+	if (ofi_bsock_readable(&ep->bsock)) {
+		ret = -FI_EAGAIN;
+	} else {
+		ret = tcpx_update_epoll(ep);
+	}
 	fastlock_release(&ep->lock);
 	return ret;
 }
 
-void tcpx_tx_queue_insert(struct tcpx_ep *tcpx_ep,
+void tcpx_tx_queue_insert(struct tcpx_ep *ep,
 			  struct tcpx_xfer_entry *tx_entry)
 {
-	int empty;
-	struct util_wait *wait = tcpx_ep->util_ep.tx_cq->wait;
-
-	empty = slist_empty(&tcpx_ep->tx_queue);
-	slist_insert_tail(&tx_entry->entry, &tcpx_ep->tx_queue);
+	struct util_wait *wait = ep->util_ep.tx_cq->wait;
 
-	if (empty) {
-		process_tx_entry(tx_entry);
+	if (!ep->cur_tx.entry) {
+		ep->cur_tx.entry = tx_entry;
+		ep->cur_tx.data_left = tx_entry->hdr.base_hdr.size;
+		OFI_DBG_SET(tx_entry->hdr.base_hdr.id, ep->tx_id++);
+		ep->hdr_bswap(&tx_entry->hdr.base_hdr);
+		tcpx_progress_tx(ep);
 
-		if (!slist_empty(&tcpx_ep->tx_queue) && wait)
+		if (!ep->cur_tx.entry && wait)
 			wait->signal(wait);
+	} else if (tx_entry->ctrl_flags & TCPX_INTERNAL_XFER) {
+		slist_insert_tail(&tx_entry->entry, &ep->priority_queue);
+	} else {
+		slist_insert_tail(&tx_entry->entry, &ep->tx_queue);
 	}
 }
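
Aside on the tx queuing discipline introduced above: at most one transfer is active at a time (cur_tx); while the endpoint is busy, internal transfers such as acks and read responses are parked on priority_queue ahead of application sends. A minimal sketch of that discipline follows, using hypothetical simplified stand-in types rather than the real tcpx structures:

/* Sketch only: stand-in types, not the tcpx implementation. */
#include <stdbool.h>
#include <stddef.h>

struct xfer { struct xfer *next; bool internal; };
struct queue { struct xfer *head, *tail; };

struct ep_state {
	struct xfer *cur_tx;          /* at most one active transfer */
	struct queue priority_queue;  /* internal acks/responses first */
	struct queue tx_queue;        /* application sends */
};

static void enqueue(struct queue *q, struct xfer *x)
{
	x->next = NULL;
	if (q->tail)
		q->tail->next = x;
	else
		q->head = x;
	q->tail = x;
}

static void submit(struct ep_state *ep, struct xfer *x)
{
	if (!ep->cur_tx)
		ep->cur_tx = x;                   /* idle: start now */
	else if (x->internal)
		enqueue(&ep->priority_queue, x);  /* jumps the line */
	else
		enqueue(&ep->tx_queue, x);
}
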
diff --git a/deps/libfabric/prov/tcp/src/tcpx_rma.c b/deps/libfabric/prov/tcp/src/tcpx_rma.c
index c046e61770e6017266d64288f711946c39e2ef89..3cfaee59e8358a23780eb44f2aa34be848452365 100644
--- a/deps/libfabric/prov/tcp/src/tcpx_rma.c
+++ b/deps/libfabric/prov/tcp/src/tcpx_rma.c
@@ -50,7 +50,8 @@
 #include <netdb.h>
 
 static void tcpx_rma_read_send_entry_fill(struct tcpx_xfer_entry *send_entry,
-					  struct tcpx_ep *tcpx_ep,
+					  struct tcpx_xfer_entry *recv_entry,
+					  struct tcpx_ep *ep,
 					  const struct fi_msg_rma *msg)
 {
 	struct ofi_rma_iov *rma_iov;
@@ -59,6 +60,7 @@ static void tcpx_rma_read_send_entry_fill(struct tcpx_xfer_entry *send_entry,
 	offset = sizeof(send_entry->hdr.base_hdr);
 	rma_iov = (struct ofi_rma_iov *) ((uint8_t *) &send_entry->hdr + offset);
 
+	send_entry->hdr.base_hdr.op = ofi_op_read_req;
 	send_entry->hdr.base_hdr.rma_iov_cnt = msg->rma_iov_count;
 	memcpy(rma_iov, msg->rma_iov,
 	       msg->rma_iov_count * sizeof(msg->rma_iov[0]));
@@ -66,17 +68,18 @@ static void tcpx_rma_read_send_entry_fill(struct tcpx_xfer_entry *send_entry,
 	offset += (msg->rma_iov_count * sizeof(*rma_iov));
 
 	send_entry->hdr.base_hdr.size = offset;
-	send_entry->hdr.base_hdr.payload_off = (uint8_t)offset;
+	send_entry->hdr.base_hdr.hdr_size = (uint8_t) offset;
 
 	send_entry->iov[0].iov_base = (void *) &send_entry->hdr;
 	send_entry->iov[0].iov_len = offset;
 	send_entry->iov_cnt = 1;
-	send_entry->ep = tcpx_ep;
-	send_entry->rem_len = send_entry->hdr.base_hdr.size;
+	send_entry->context = msg->context;
+	send_entry->ctrl_flags = TCPX_NEED_RESP;
+	send_entry->resp_entry = recv_entry;
 }
 
 static void tcpx_rma_read_recv_entry_fill(struct tcpx_xfer_entry *recv_entry,
-					  struct tcpx_ep *tcpx_ep,
+					  struct tcpx_ep *ep,
 					  const struct fi_msg_rma *msg,
 					  uint64_t flags)
 {
@@ -84,49 +87,51 @@ static void tcpx_rma_read_recv_entry_fill(struct tcpx_xfer_entry *recv_entry,
 	       msg->iov_count * sizeof(struct iovec));
 
 	recv_entry->iov_cnt = msg->iov_count;
-	recv_entry->ep = tcpx_ep;
+	recv_entry->ep = ep;
 	recv_entry->context = msg->context;
-	recv_entry->flags = ((tcpx_ep->util_ep.tx_op_flags & FI_COMPLETION) |
-			     flags | FI_RMA | FI_READ);
+	recv_entry->cq_flags = tcpx_tx_completion_flag(ep, flags) |
+			       FI_RMA | FI_READ;
+	recv_entry->ctrl_flags = TCPX_INTERNAL_XFER;
 }
 
-static ssize_t tcpx_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg,
-				uint64_t flags)
+static ssize_t
+tcpx_rma_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg,
+		 uint64_t flags)
 {
-	struct tcpx_ep *tcpx_ep;
-	struct tcpx_cq *tcpx_cq;
+	struct tcpx_ep *ep;
+	struct tcpx_cq *cq;
 	struct tcpx_xfer_entry *send_entry;
 	struct tcpx_xfer_entry *recv_entry;
 
-	tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid);
-	tcpx_cq = container_of(tcpx_ep->util_ep.tx_cq, struct tcpx_cq,
+	ep = container_of(ep_fid, struct tcpx_ep, util_ep.ep_fid);
+	cq = container_of(ep->util_ep.tx_cq, struct tcpx_cq,
 			       util_cq);
 
 	assert(msg->iov_count <= TCPX_IOV_LIMIT);
 	assert(msg->rma_iov_count <= TCPX_IOV_LIMIT);
 
-	send_entry = tcpx_xfer_entry_alloc(tcpx_cq, TCPX_OP_READ_REQ);
+	send_entry = tcpx_alloc_tx(ep);
 	if (!send_entry)
 		return -FI_EAGAIN;
 
-	recv_entry = tcpx_xfer_entry_alloc(tcpx_cq, TCPX_OP_READ_RSP);
+	recv_entry = tcpx_alloc_xfer(cq);
 	if (!recv_entry) {
-		tcpx_xfer_entry_release(tcpx_cq, send_entry);
+		tcpx_free_xfer(cq, send_entry);
 		return -FI_EAGAIN;
 	}
-	tcpx_rma_read_send_entry_fill(send_entry, tcpx_ep, msg);
-	tcpx_rma_read_recv_entry_fill(recv_entry, tcpx_ep, msg, flags);
-
-	tcpx_ep->hdr_bswap(&send_entry->hdr.base_hdr);
-	fastlock_acquire(&tcpx_ep->lock);
-	slist_insert_tail(&recv_entry->entry, &tcpx_ep->rma_read_queue);
-	tcpx_tx_queue_insert(tcpx_ep, send_entry);
-	fastlock_release(&tcpx_ep->lock);
+	tcpx_rma_read_send_entry_fill(send_entry, recv_entry, ep, msg);
+	tcpx_rma_read_recv_entry_fill(recv_entry, ep, msg, flags);
+
+	fastlock_acquire(&ep->lock);
+	slist_insert_tail(&recv_entry->entry, &ep->rma_read_queue);
+	tcpx_tx_queue_insert(ep, send_entry);
+	fastlock_release(&ep->lock);
 	return FI_SUCCESS;
 }
 
-static ssize_t tcpx_rma_read(struct fid_ep *ep, void *buf, size_t len, void *desc,
-			     fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context)
+static ssize_t
+tcpx_rma_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc,
+	      fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context)
 {
 	struct iovec msg_iov = {
 		.iov_base = (void *)buf,
@@ -148,12 +153,13 @@ static ssize_t tcpx_rma_read(struct fid_ep *ep, void *buf, size_t len, void *des
 		.data = 0,
 	};
 
-	return tcpx_rma_readmsg(ep, &msg, 0);
+	return tcpx_rma_readmsg(ep_fid, &msg, 0);
 }
 
-static ssize_t tcpx_rma_readv(struct fid_ep *ep, const struct iovec *iov, void **desc,
-			      size_t count, fi_addr_t src_addr, uint64_t addr, uint64_t key,
-			      void *context)
+static ssize_t
+tcpx_rma_readv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc,
+	       size_t count, fi_addr_t src_addr, uint64_t addr, uint64_t key,
+	       void *context)
 {
 	struct fi_rma_iov rma_iov = {
 		.addr = addr,
@@ -171,25 +177,21 @@ static ssize_t tcpx_rma_readv(struct fid_ep *ep, const struct iovec *iov, void *
 		.data = 0,
 	};
 
-	return tcpx_rma_readmsg(ep, &msg, 0);
+	return tcpx_rma_readmsg(ep_fid, &msg, 0);
 }
 
-static ssize_t tcpx_rma_writemsg(struct fid_ep *ep, const struct fi_msg_rma *msg,
-				 uint64_t flags)
+static ssize_t
+tcpx_rma_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg,
+		 uint64_t flags)
 {
-	struct tcpx_ep *tcpx_ep;
-	struct tcpx_cq *tcpx_cq;
+	struct tcpx_ep *ep;
 	struct tcpx_xfer_entry *send_entry;
 	struct ofi_rma_iov *rma_iov;
 	uint64_t data_len;
-	uint64_t *cq_data;
 	size_t offset;
 
-	tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid);
-	tcpx_cq = container_of(tcpx_ep->util_ep.tx_cq, struct tcpx_cq,
-			       util_cq);
-
-	send_entry = tcpx_xfer_entry_alloc(tcpx_cq, TCPX_OP_WRITE);
+	ep = container_of(ep_fid, struct tcpx_ep, util_ep.ep_fid);
+	send_entry = tcpx_alloc_tx(ep);
 	if (!send_entry)
 		return -FI_EAGAIN;
 
@@ -198,14 +200,16 @@ static ssize_t tcpx_rma_writemsg(struct fid_ep *ep, const struct fi_msg_rma *msg
 
 	data_len = ofi_total_iov_len(msg->msg_iov, msg->iov_count);
 
-	assert(!(flags & FI_INJECT) || (data_len <= TCPX_MAX_INJECT_SZ));
-	offset = sizeof(send_entry->hdr.base_hdr);
+	assert(!(flags & FI_INJECT) || (data_len <= TCPX_MAX_INJECT));
+
+	send_entry->hdr.base_hdr.op = ofi_op_write;
 
 	if (flags & FI_REMOTE_CQ_DATA) {
-		send_entry->hdr.base_hdr.flags |= OFI_REMOTE_CQ_DATA;
-		cq_data = (uint64_t *)((uint8_t *)&send_entry->hdr + offset);
-		*cq_data = msg->data;
-		offset += sizeof(msg->data);
+		send_entry->hdr.base_hdr.flags = TCPX_REMOTE_CQ_DATA;
+		send_entry->hdr.cq_data_hdr.cq_data = msg->data;
+		offset = sizeof(send_entry->hdr.cq_data_hdr);
+	} else {
+		offset = sizeof(send_entry->hdr.base_hdr);
 	}
 
 	rma_iov = (struct ofi_rma_iov *)((uint8_t *)&send_entry->hdr + offset);
@@ -215,7 +219,7 @@ static ssize_t tcpx_rma_writemsg(struct fid_ep *ep, const struct fi_msg_rma *msg
 
 	offset += (send_entry->hdr.base_hdr.rma_iov_cnt * sizeof(*rma_iov));
 
-	send_entry->hdr.base_hdr.payload_off = (uint8_t)offset;
+	send_entry->hdr.base_hdr.hdr_size = (uint8_t) offset;
 	send_entry->hdr.base_hdr.size = data_len + offset;
 	if (flags & FI_INJECT) {
 		ofi_copy_iov_buf(msg->msg_iov, msg->iov_count, 0,
@@ -233,28 +237,20 @@ static ssize_t tcpx_rma_writemsg(struct fid_ep *ep, const struct fi_msg_rma *msg
 	send_entry->iov[0].iov_base = (void *) &send_entry->hdr;
 	send_entry->iov[0].iov_len = offset;
 
-	send_entry->flags = (tcpx_ep->util_ep.tx_op_flags & FI_COMPLETION) |
-			     flags | FI_RMA | FI_WRITE;
-
-	if (flags & (FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE))
-		send_entry->hdr.base_hdr.flags |= OFI_DELIVERY_COMPLETE;
-
-	if (flags & FI_COMMIT_COMPLETE)
-		send_entry->hdr.base_hdr.flags |= OFI_COMMIT_COMPLETE;
-
-	send_entry->ep = tcpx_ep;
+	send_entry->cq_flags = tcpx_tx_completion_flag(ep, flags) |
+			       FI_RMA | FI_WRITE;
+	tcpx_set_commit_flags(send_entry, flags);
 	send_entry->context = msg->context;
-	send_entry->rem_len = send_entry->hdr.base_hdr.size;
 
-	tcpx_ep->hdr_bswap(&send_entry->hdr.base_hdr);
-	fastlock_acquire(&tcpx_ep->lock);
-	tcpx_tx_queue_insert(tcpx_ep, send_entry);
-	fastlock_release(&tcpx_ep->lock);
+	fastlock_acquire(&ep->lock);
+	tcpx_tx_queue_insert(ep, send_entry);
+	fastlock_release(&ep->lock);
 	return FI_SUCCESS;
 }
 
-static ssize_t tcpx_rma_write(struct fid_ep *ep, const void *buf, size_t len, void *desc,
-			      fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context)
+static ssize_t
+tcpx_rma_write(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc,
+	       fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context)
 {
 	struct iovec msg_iov = {
 		.iov_base = (void *)buf,
@@ -276,12 +272,13 @@ static ssize_t tcpx_rma_write(struct fid_ep *ep, const void *buf, size_t len, vo
 		.data = 0,
 	};
 
-	return tcpx_rma_writemsg(ep, &msg, 0);
+	return tcpx_rma_writemsg(ep_fid, &msg, 0);
 }
 
-static ssize_t tcpx_rma_writev(struct fid_ep *ep, const struct iovec *iov, void **desc,
-			       size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key,
-			       void *context)
+static ssize_t
+tcpx_rma_writev(struct fid_ep *ep_fid, const struct iovec *iov, void **desc,
+		size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key,
+		void *context)
 {
 	struct fi_rma_iov rma_iov = {
 		.addr = addr,
@@ -299,13 +296,14 @@ static ssize_t tcpx_rma_writev(struct fid_ep *ep, const struct iovec *iov, void
 		.data = 0,
 	};
 
-	return tcpx_rma_writemsg(ep, &msg, 0);
+	return tcpx_rma_writemsg(ep_fid, &msg, 0);
 }
 
 
-static ssize_t tcpx_rma_writedata(struct fid_ep *ep, const void *buf, size_t len, void *desc,
-				  uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key,
-				  void *context)
+static ssize_t
+tcpx_rma_writedata(struct fid_ep *ep_fid, const void *buf, size_t len,
+		   void *desc, uint64_t data, fi_addr_t dest_addr,
+		   uint64_t addr, uint64_t key, void *context)
 {
 	struct iovec msg_iov = {
 		.iov_base = (void *)buf,
@@ -327,34 +325,32 @@ static ssize_t tcpx_rma_writedata(struct fid_ep *ep, const void *buf, size_t len
 		.data = data,
 	};
 
-	return tcpx_rma_writemsg(ep, &msg, FI_REMOTE_CQ_DATA);
+	return tcpx_rma_writemsg(ep_fid, &msg, FI_REMOTE_CQ_DATA);
 }
 
-static ssize_t tcpx_rma_inject_common(struct fid_ep *ep, const void *buf,
-				      size_t len, uint64_t data,
-				      fi_addr_t dest_addr, uint64_t addr,
-				      uint64_t key, uint64_t flags)
+static ssize_t
+tcpx_rma_inject_common(struct fid_ep *ep_fid, const void *buf, size_t len,
+		       uint64_t data, fi_addr_t dest_addr, uint64_t addr,
+		       uint64_t key, uint64_t flags)
 {
-	struct tcpx_ep *tcpx_ep;
-	struct tcpx_cq *tcpx_cq;
+	struct tcpx_ep *ep;
 	struct tcpx_xfer_entry *send_entry;
 	struct ofi_rma_iov *rma_iov;
 	uint64_t *cq_data;
 	size_t offset;
 
-	tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid);
-	tcpx_cq = container_of(tcpx_ep->util_ep.tx_cq, struct tcpx_cq,
-			       util_cq);
-
-	send_entry = tcpx_xfer_entry_alloc(tcpx_cq, TCPX_OP_WRITE);
+	ep = container_of(ep_fid, struct tcpx_ep, util_ep.ep_fid);
+	send_entry = tcpx_alloc_tx(ep);
 	if (!send_entry)
 		return -FI_EAGAIN;
 
-	assert(len <= TCPX_MAX_INJECT_SZ);
+	assert(len <= TCPX_MAX_INJECT);
 	offset = sizeof(send_entry->hdr.base_hdr);
 
+	send_entry->hdr.base_hdr.op = ofi_op_write;
+
 	if (flags & FI_REMOTE_CQ_DATA) {
-		send_entry->hdr.base_hdr.flags |= OFI_REMOTE_CQ_DATA;
+		send_entry->hdr.base_hdr.flags = TCPX_REMOTE_CQ_DATA;
 		cq_data = (uint64_t *)((uint8_t *)&send_entry->hdr + offset);
 		*cq_data = data;
 		offset += sizeof(data);
@@ -367,7 +363,7 @@ static ssize_t tcpx_rma_inject_common(struct fid_ep *ep, const void *buf,
 	send_entry->hdr.base_hdr.rma_iov_cnt = 1;
 	offset += sizeof(*rma_iov);
 
-	send_entry->hdr.base_hdr.payload_off = (uint8_t)offset;
+	send_entry->hdr.base_hdr.hdr_size = (uint8_t) offset;
 	memcpy((uint8_t *)&send_entry->hdr + offset, (uint8_t *)buf, len);
 	offset += len;
 
@@ -376,30 +372,28 @@ static ssize_t tcpx_rma_inject_common(struct fid_ep *ep, const void *buf,
 	send_entry->iov_cnt = 1;
 
 	send_entry->hdr.base_hdr.size = offset;
-	send_entry->ep = tcpx_ep;
-	send_entry->rem_len = send_entry->hdr.base_hdr.size;
 
-	tcpx_ep->hdr_bswap(&send_entry->hdr.base_hdr);
-	fastlock_acquire(&tcpx_ep->lock);
-	tcpx_tx_queue_insert(tcpx_ep, send_entry);
-	fastlock_release(&tcpx_ep->lock);
+	fastlock_acquire(&ep->lock);
+	tcpx_tx_queue_insert(ep, send_entry);
+	fastlock_release(&ep->lock);
 	return FI_SUCCESS;
 }
 
-static ssize_t tcpx_rma_inject(struct fid_ep *ep, const void *buf, size_t len,
-			       fi_addr_t dest_addr, uint64_t addr, uint64_t key)
+static ssize_t
+tcpx_rma_inject(struct fid_ep *ep_fid, const void *buf, size_t len,
+		fi_addr_t dest_addr, uint64_t addr, uint64_t key)
 {
-	return tcpx_rma_inject_common(ep, buf, len, dest_addr,
-				      0, addr, key, FI_INJECT);
+	return tcpx_rma_inject_common(ep_fid, buf, len, 0, dest_addr,
+				      addr, key, FI_INJECT);
 }
 
 static ssize_t
-tcpx_rma_injectdata(struct fid_ep *ep, const void *buf, size_t len,
+tcpx_rma_injectdata(struct fid_ep *ep_fid, const void *buf, size_t len,
 		    uint64_t data, fi_addr_t dest_addr, uint64_t addr,
 		    uint64_t key)
 {
-	return tcpx_rma_inject_common(ep, buf, len, dest_addr, data, addr, key,
-				      FI_INJECT | FI_REMOTE_CQ_DATA);
+	return tcpx_rma_inject_common(ep_fid, buf, len, data, dest_addr, addr,
+				      key, FI_INJECT | FI_REMOTE_CQ_DATA);
 }
 
 struct fi_ops_rma tcpx_rma_ops = {
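
A note on the read-request layout used throughout this file: the request message is header-only, with the remote IOV list packed directly after base_hdr, and hdr_size recording where the header region ends (the start of the, here empty, payload). The arithmetic in isolation, as a sketch with hypothetical simplified structs rather than the tcpx wire types:

/* Sketch of the offset arithmetic; types are illustrative stand-ins. */
#include <stdint.h>
#include <stddef.h>

struct base_hdr { uint8_t version, op, flags, hdr_size; uint64_t size; };
struct rma_iov  { uint64_t addr, len, key; };

static size_t read_req_size(struct base_hdr *hdr, size_t rma_iov_cnt)
{
	size_t offset = sizeof(*hdr);                   /* fixed header */

	offset += rma_iov_cnt * sizeof(struct rma_iov); /* remote IOVs */
	hdr->hdr_size = (uint8_t) offset;               /* payload start */
	hdr->size = offset;                             /* reads carry no payload */
	return offset;
}
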
diff --git a/deps/libfabric/prov/tcp/src/tcpx_shared_ctx.c b/deps/libfabric/prov/tcp/src/tcpx_shared_ctx.c
index 3672c883c3f028a9926fc998d73536050eaa8d45..43d94d48bf4a649faa6830a3af31be78372d98d5 100644
--- a/deps/libfabric/prov/tcp/src/tcpx_shared_ctx.c
+++ b/deps/libfabric/prov/tcp/src/tcpx_shared_ctx.c
@@ -38,45 +38,16 @@
 #include <unistd.h>
 #include <ofi_iov.h>
 
-void tcpx_srx_xfer_release(struct tcpx_rx_ctx *srx_ctx,
-			   struct tcpx_xfer_entry *xfer_entry)
-{
-	if (xfer_entry->ep->cur_rx_entry == xfer_entry)
-		xfer_entry->ep->cur_rx_entry = NULL;
 
-	fastlock_acquire(&srx_ctx->lock);
-	ofi_buf_free(xfer_entry);
-	fastlock_release(&srx_ctx->lock);
-}
-
-struct tcpx_xfer_entry *
-tcpx_srx_next_xfer_entry(struct tcpx_rx_ctx *srx_ctx,
-			struct tcpx_ep *ep, size_t entry_size)
-{
-	struct tcpx_xfer_entry *xfer_entry = NULL;
-
-	fastlock_acquire(&srx_ctx->lock);
-	if (slist_empty(&srx_ctx->rx_queue))
-		goto out;
-
-	xfer_entry = container_of(srx_ctx->rx_queue.head,
-				  struct tcpx_xfer_entry, entry);
-	xfer_entry->rem_len = ofi_total_iov_len(xfer_entry->iov,
-						xfer_entry->iov_cnt) - entry_size;
-	slist_remove_head(&srx_ctx->rx_queue);
-out:
-	fastlock_release(&srx_ctx->lock);
-	return xfer_entry;
-}
-
-static ssize_t tcpx_srx_recvmsg(struct fid_ep *ep, const struct fi_msg *msg,
-				uint64_t flags)
+static ssize_t
+tcpx_srx_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg,
+		 uint64_t flags)
 {
 	struct tcpx_xfer_entry *recv_entry;
 	struct tcpx_rx_ctx *srx_ctx;
 	ssize_t ret = FI_SUCCESS;
 
-	srx_ctx = container_of(ep, struct tcpx_rx_ctx, rx_fid);
+	srx_ctx = container_of(ep_fid, struct tcpx_rx_ctx, rx_fid);
 	assert(msg->iov_count <= TCPX_IOV_LIMIT);
 
 	fastlock_acquire(&srx_ctx->lock);
@@ -86,7 +57,7 @@ static ssize_t tcpx_srx_recvmsg(struct fid_ep *ep, const struct fi_msg *msg,
 		goto unlock;
 	}
 
-	recv_entry->flags = flags | FI_MSG | FI_RECV;
+	recv_entry->cq_flags = FI_MSG | FI_RECV;
 	recv_entry->context = msg->context;
 	recv_entry->iov_cnt = msg->iov_count;
 	memcpy(&recv_entry->iov[0], msg->msg_iov,
@@ -98,14 +69,15 @@ unlock:
 	return ret;
 }
 
-static ssize_t tcpx_srx_recv(struct fid_ep *ep, void *buf, size_t len, void *desc,
-			     fi_addr_t src_addr, void *context)
+static ssize_t
+tcpx_srx_recv(struct fid_ep *ep_fid, void *buf, size_t len, void *desc,
+	      fi_addr_t src_addr, void *context)
 {
 	struct tcpx_xfer_entry *recv_entry;
 	struct tcpx_rx_ctx *srx_ctx;
 	ssize_t ret = FI_SUCCESS;
 
-	srx_ctx = container_of(ep, struct tcpx_rx_ctx, rx_fid);
+	srx_ctx = container_of(ep_fid, struct tcpx_rx_ctx, rx_fid);
 
 	fastlock_acquire(&srx_ctx->lock);
 	recv_entry = ofi_buf_alloc(srx_ctx->buf_pool);
@@ -114,12 +86,11 @@ static ssize_t tcpx_srx_recv(struct fid_ep *ep, void *buf, size_t len, void *des
 		goto unlock;
 	}
 
-	recv_entry->flags = FI_MSG | FI_RECV;
+	recv_entry->cq_flags = FI_MSG | FI_RECV;
 	recv_entry->context = context;
 	recv_entry->iov_cnt = 1;
 	recv_entry->iov[0].iov_base = buf;
 	recv_entry->iov[0].iov_len = len;
-	recv_entry->rem_len = len;
 
 	slist_insert_tail(&recv_entry->entry, &srx_ctx->rx_queue);
 unlock:
@@ -127,14 +98,15 @@ unlock:
 	return ret;
 }
 
-static ssize_t tcpx_srx_recvv(struct fid_ep *ep, const struct iovec *iov, void **desc,
-			      size_t count, fi_addr_t src_addr, void *context)
+static ssize_t
+tcpx_srx_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc,
+	       size_t count, fi_addr_t src_addr, void *context)
 {
 	struct tcpx_xfer_entry *recv_entry;
 	struct tcpx_rx_ctx *srx_ctx;
 	ssize_t ret = FI_SUCCESS;
 
-	srx_ctx = container_of(ep, struct tcpx_rx_ctx, rx_fid);
+	srx_ctx = container_of(ep_fid, struct tcpx_rx_ctx, rx_fid);
 	assert(count <= TCPX_IOV_LIMIT);
 
 	fastlock_acquire(&srx_ctx->lock);
@@ -144,7 +116,7 @@ static ssize_t tcpx_srx_recvv(struct fid_ep *ep, const struct iovec *iov, void *
 		goto unlock;
 	}
 
-	recv_entry->flags = FI_MSG | FI_RECV;
+	recv_entry->cq_flags = FI_MSG | FI_RECV;
 	recv_entry->context = context;
 	recv_entry->iov_cnt = count;
 	memcpy(&recv_entry->iov[0], iov, count * sizeof(*iov));
@@ -167,3 +139,156 @@ struct fi_ops_msg tcpx_srx_msg_ops = {
 	.senddata = fi_no_msg_senddata,
 	.injectdata = fi_no_msg_injectdata,
 };
+
+
+static ssize_t
+tcpx_srx_trecvmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg,
+		  uint64_t flags)
+{
+	struct tcpx_xfer_entry *recv_entry;
+	struct tcpx_rx_ctx *srx_ctx;
+	ssize_t ret = FI_SUCCESS;
+
+	srx_ctx = container_of(ep_fid, struct tcpx_rx_ctx, rx_fid);
+	assert(msg->iov_count <= TCPX_IOV_LIMIT);
+
+	fastlock_acquire(&srx_ctx->lock);
+	recv_entry = ofi_buf_alloc(srx_ctx->buf_pool);
+	if (!recv_entry) {
+		ret = -FI_EAGAIN;
+		goto unlock;
+	}
+
+	recv_entry->tag = msg->tag;
+	recv_entry->ignore = msg->ignore;
+	recv_entry->ep = (void *) (uintptr_t) msg->addr;
+	recv_entry->cq_flags = FI_TAGGED | FI_RECV;
+	recv_entry->context = msg->context;
+	recv_entry->iov_cnt = msg->iov_count;
+	memcpy(&recv_entry->iov[0], msg->msg_iov,
+	       msg->iov_count * sizeof(*msg->msg_iov));
+
+	slist_insert_tail(&recv_entry->entry, &srx_ctx->tag_queue);
+unlock:
+	fastlock_release(&srx_ctx->lock);
+	return ret;
+}
+
+static ssize_t
+tcpx_srx_trecv(struct fid_ep *ep_fid, void *buf, size_t len, void *desc,
+	       fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context)
+{
+	struct tcpx_xfer_entry *recv_entry;
+	struct tcpx_rx_ctx *srx_ctx;
+	ssize_t ret = FI_SUCCESS;
+
+	srx_ctx = container_of(ep_fid, struct tcpx_rx_ctx, rx_fid);
+
+	fastlock_acquire(&srx_ctx->lock);
+	recv_entry = ofi_buf_alloc(srx_ctx->buf_pool);
+	if (!recv_entry) {
+		ret = -FI_EAGAIN;
+		goto unlock;
+	}
+
+	recv_entry->tag = tag;
+	recv_entry->ignore = ignore;
+	recv_entry->ep = (void *) (uintptr_t) src_addr;
+	recv_entry->cq_flags = FI_TAGGED | FI_RECV;
+	recv_entry->context = context;
+	recv_entry->iov_cnt = 1;
+	recv_entry->iov[0].iov_base = buf;
+	recv_entry->iov[0].iov_len = len;
+
+	slist_insert_tail(&recv_entry->entry, &srx_ctx->tag_queue);
+unlock:
+	fastlock_release(&srx_ctx->lock);
+	return ret;
+}
+
+static ssize_t
+tcpx_srx_trecvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc,
+		size_t count, fi_addr_t src_addr, uint64_t tag,
+		uint64_t ignore, void *context)
+{
+	struct tcpx_xfer_entry *recv_entry;
+	struct tcpx_rx_ctx *srx_ctx;
+	ssize_t ret = FI_SUCCESS;
+
+	srx_ctx = container_of(ep_fid, struct tcpx_rx_ctx, rx_fid);
+	assert(count <= TCPX_IOV_LIMIT);
+
+	fastlock_acquire(&srx_ctx->lock);
+	recv_entry = ofi_buf_alloc(srx_ctx->buf_pool);
+	if (!recv_entry) {
+		ret = -FI_EAGAIN;
+		goto unlock;
+	}
+
+	recv_entry->tag = tag;
+	recv_entry->ignore = ignore;
+	recv_entry->ep = (void *) (uintptr_t) src_addr;
+	recv_entry->cq_flags = FI_TAGGED | FI_RECV;
+	recv_entry->context = context;
+	recv_entry->iov_cnt = count;
+	memcpy(&recv_entry->iov[0], iov, count * sizeof(*iov));
+
+	slist_insert_tail(&recv_entry->entry, &srx_ctx->tag_queue);
+unlock:
+	fastlock_release(&srx_ctx->lock);
+	return ret;
+}
+
+struct fi_ops_tagged tcpx_srx_tag_ops = {
+	.size = sizeof(struct fi_ops_tagged),
+	.recv = tcpx_srx_trecv,
+	.recvv = tcpx_srx_trecvv,
+	.recvmsg = tcpx_srx_trecvmsg,
+	.send = fi_no_tagged_send,
+	.sendv = fi_no_tagged_sendv,
+	.sendmsg = fi_no_tagged_sendmsg,
+	.inject = fi_no_tagged_inject,
+	.senddata = fi_no_tagged_senddata,
+	.injectdata = fi_no_tagged_injectdata,
+};
+
+struct tcpx_xfer_entry *
+tcpx_match_tag(struct tcpx_rx_ctx *srx, struct tcpx_ep *ep, uint64_t tag)
+{
+	struct tcpx_xfer_entry *rx_entry;
+	struct slist_entry *item, *prev;
+
+	fastlock_acquire(&srx->lock);
+	slist_foreach(&srx->tag_queue, item, prev) {
+		rx_entry = container_of(item, struct tcpx_xfer_entry, entry);
+		if (ofi_match_tag(rx_entry->tag, rx_entry->ignore, tag)) {
+			slist_remove(&srx->tag_queue, item, prev);
+			fastlock_release(&srx->lock);
+			return rx_entry;
+		}
+	}
+	fastlock_release(&srx->lock);
+
+	return NULL;
+}
+
+struct tcpx_xfer_entry *
+tcpx_match_tag_addr(struct tcpx_rx_ctx *srx, struct tcpx_ep *ep, uint64_t tag)
+{
+	struct tcpx_xfer_entry *rx_entry;
+	struct slist_entry *item, *prev;
+
+	fastlock_acquire(&srx->lock);
+	slist_foreach(&srx->tag_queue, item, prev) {
+		rx_entry = container_of(item, struct tcpx_xfer_entry, entry);
+		if (ofi_match_tag(rx_entry->tag, rx_entry->ignore, tag) &&
+		    ofi_match_addr((uintptr_t) rx_entry->ep, (uintptr_t) ep)) {
+			slist_remove(&srx->tag_queue, item, prev);
+			fastlock_release(&srx->lock);
+			return rx_entry;
+		}
+	}
+	fastlock_release(&srx->lock);
+
+	return NULL;
+}
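
The two matchers above differ only in whether the source endpoint must also match; the tag comparison itself treats ignore as a mask of don't-care bits. Assuming the usual libfabric semantics for ofi_match_tag, an equivalent self-contained check looks like this:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Bits set in 'ignore' are don't-cares: forcing them on in both
 * values before comparing removes them from the match. */
static bool match_tag(uint64_t recv_tag, uint64_t ignore, uint64_t tag)
{
	return (recv_tag | ignore) == (tag | ignore);
}

int main(void)
{
	assert(match_tag(0x1234, 0x00FF, 0x12FF));  /* low byte ignored */
	assert(!match_tag(0x1234, 0x00FF, 0x22FF)); /* high byte differs */
	return 0;
}
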
diff --git a/deps/libfabric/prov/udp/src/udpx_ep.c b/deps/libfabric/prov/udp/src/udpx_ep.c
index 08e234a0b51a3ccbb26dd3eb58e9b4d70eba27e9..274f7a86f0e0daf74c47e6b4aa3848907de19c24 100644
--- a/deps/libfabric/prov/udp/src/udpx_ep.c
+++ b/deps/libfabric/prov/udp/src/udpx_ep.c
@@ -219,7 +219,7 @@ static void udpx_tx_comp(struct udpx_ep *ep, void *context)
 {
 	struct fi_cq_tagged_entry *comp;
 
-	comp = ofi_cirque_tail(ep->util_ep.tx_cq->cirq);
+	comp = ofi_cirque_next(ep->util_ep.tx_cq->cirq);
 	comp->op_context = context;
 	comp->flags = FI_SEND;
 	comp->len = 0;
@@ -239,7 +239,7 @@ static void udpx_rx_comp(struct udpx_ep *ep, void *context, uint64_t flags,
 {
 	struct fi_cq_tagged_entry *comp;
 
-	comp = ofi_cirque_tail(ep->util_ep.rx_cq->cirq);
+	comp = ofi_cirque_next(ep->util_ep.rx_cq->cirq);
 	comp->op_context = context;
 	comp->flags = FI_RECV | flags;
 	comp->len = len;
@@ -316,7 +316,7 @@ static ssize_t udpx_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg,
 		goto out;
 	}
 
-	entry = ofi_cirque_tail(ep->rxq);
+	entry = ofi_cirque_next(ep->rxq);
 	entry->context = msg->context;
 	for (entry->iov_count = 0; entry->iov_count < msg->iov_count;
 	     entry->iov_count++) {
@@ -357,7 +357,7 @@ static ssize_t udpx_recv(struct fid_ep *ep_fid, void *buf, size_t len,
 		goto out;
 	}
 
-	entry = ofi_cirque_tail(ep->rxq);
+	entry = ofi_cirque_next(ep->rxq);
 	entry->context = context;
 	entry->iov_count = 1;
 	entry->iov[0].iov_base = buf;
@@ -698,7 +698,8 @@ static int udpx_ep_ctrl(struct fid *fid, int command, void *arg)
 	ep = container_of(fid, struct udpx_ep, util_ep.ep_fid.fid);
 	switch (command) {
 	case FI_ENABLE:
-		if (!ep->util_ep.rx_cq || !ep->util_ep.tx_cq)
+		if ((ofi_needs_rx(ep->util_ep.caps) && !ep->util_ep.rx_cq) ||
+		    (ofi_needs_tx(ep->util_ep.caps) && !ep->util_ep.tx_cq))
 			return -FI_ENOCQ;
 		if (!ep->util_ep.av)
 			return -FI_ENOAV;
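
On the ofi_cirque_tail -> ofi_cirque_next renames above: both names denote the next free write slot of a circular completion queue, which the caller fills in place and then commits. A minimal ring of the same shape, as a sketch rather than the actual ofi_cirque macros:

#include <stddef.h>
#include <stdint.h>

#define RING_SIZE 8  /* power of two so masking wraps the index */

struct ring {
	uint32_t head, tail;  /* head: next read, tail: next write */
	int entries[RING_SIZE];
};

/* Return the next free write slot; the caller fills it in place
 * and then commits, making it visible to the consumer. */
static int *ring_next(struct ring *r)
{
	return &r->entries[r->tail & (RING_SIZE - 1)];
}

static void ring_commit(struct ring *r) { r->tail++; }

static size_t ring_used(const struct ring *r) { return r->tail - r->head; }
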
diff --git a/deps/libfabric/prov/udp/src/udpx_init.c b/deps/libfabric/prov/udp/src/udpx_init.c
index bf5e023fa00effb3fe7d33d09054dcfa1288d5ba..85ad70f749b6224c29fee48232483416a0706018 100644
--- a/deps/libfabric/prov/udp/src/udpx_init.c
+++ b/deps/libfabric/prov/udp/src/udpx_init.c
@@ -52,7 +52,7 @@ static void udpx_fini(void)
 }
 
 struct fi_provider udpx_prov = {
-	.name = "UDP",
+	.name = "udp",
 	.version = OFI_VERSION_DEF_PROV,
 	.fi_version = OFI_VERSION_LATEST,
 	.getinfo = udpx_getinfo,
diff --git a/deps/libfabric/prov/usnic/src/usdf.h b/deps/libfabric/prov/usnic/src/usdf.h
index 64ade0a9f0e5bd8e4dd20f092aa941a0a00b369d..991eb57a615a1c5347e90bdb2f7f574e7404577c 100644
--- a/deps/libfabric/prov/usnic/src/usdf.h
+++ b/deps/libfabric/prov/usnic/src/usdf.h
@@ -425,7 +425,7 @@ enum {
 struct usdf_err_data_entry {
 	struct slist_entry entry;
 	uint8_t seen;
-	uint8_t err_data[0];
+	uint8_t err_data[];
 };
 
 struct usdf_event {
diff --git a/deps/libfabric/prov/usnic/src/usdf_cm.h b/deps/libfabric/prov/usnic/src/usdf_cm.h
index fe5154f54c4c12c466dae3c3b7c9c7ea927ab01e..d361818055b83ca3f07809e9cf003ef1366f85db 100644
--- a/deps/libfabric/prov/usnic/src/usdf_cm.h
+++ b/deps/libfabric/prov/usnic/src/usdf_cm.h
@@ -51,7 +51,7 @@ struct usdf_connreq_msg {
 	uint32_t creq_result;
 	uint32_t creq_reason;
 	uint32_t creq_datalen;
-	uint8_t creq_data[0];
+	uint8_t creq_data[];
 } __attribute__((packed));
 
 struct usdf_connreq {
@@ -67,7 +67,7 @@ struct usdf_connreq {
 	size_t cr_resid;
 
 	size_t cr_datalen;
-	uint8_t cr_data[0];
+	uint8_t cr_data[];
 };
 
 void usdf_cm_report_failure(struct usdf_connreq *crp, int error,
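
The [0] -> [] changes in this header (and in usdf.h above) move from the zero-length-array GNU extension to a standard C99 flexible array member; the allocation pattern is unchanged. For illustration, with a hypothetical struct:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct blob {
	uint32_t len;
	uint8_t data[];  /* C99 flexible array member, formerly data[0] */
};

/* One allocation covers the fixed header plus the variable payload;
 * sizeof(struct blob) does not include the flexible member. */
static struct blob *blob_alloc(const void *payload, uint32_t len)
{
	struct blob *b = malloc(sizeof(*b) + len);

	if (!b)
		return NULL;
	b->len = len;
	memcpy(b->data, payload, len);
	return b;
}
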
diff --git a/deps/libfabric/prov/usnic/src/usdf_ext.c b/deps/libfabric/prov/usnic/src/usdf_ext.c
index a171d2a447f9f6d28906dd105d1396852c6327b9..eefdec67908d8b744104302807754047623ca1f8 100644
--- a/deps/libfabric/prov/usnic/src/usdf_ext.c
+++ b/deps/libfabric/prov/usnic/src/usdf_ext.c
@@ -57,7 +57,8 @@ usdf_usnic_getinfo_v1(uint32_t version, struct fid_fabric *fabric,
 
 	uip->ui.v1.ui_link_speed = dap->uda_bandwidth;
 	uip->ui.v1.ui_netmask_be = dap->uda_netmask_be;
-	strcpy(uip->ui.v1.ui_ifname, dap->uda_ifname);
+	snprintf(uip->ui.v1.ui_ifname, sizeof(uip->ui.v1.ui_ifname), "%s",
+		 dap->uda_ifname);
 	uip->ui.v1.ui_num_vf = dap->uda_num_vf;
 	uip->ui.v1.ui_qp_per_vf = dap->uda_qp_per_vf;
 	uip->ui.v1.ui_cq_per_vf = dap->uda_cq_per_vf;
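
The strcpy -> snprintf change above bounds the copy to the destination buffer and guarantees NUL termination, trading a potential overflow for silent truncation. The pattern in isolation:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char ifname[8];
	const char *src = "a-very-long-interface-name";

	/* snprintf never writes past sizeof(ifname) and always
	 * NUL-terminates; strcpy(ifname, src) would overflow here. */
	snprintf(ifname, sizeof(ifname), "%s", src);
	return strlen(ifname) == sizeof(ifname) - 1 ? 0 : 1;
}
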
diff --git a/deps/libfabric/prov/usnic/src/usdf_progress.c b/deps/libfabric/prov/usnic/src/usdf_progress.c
index 47204a175dab84a3c637d10aefea8ffbadf0ceaa..b5db07e14f7396891d7fa96fd17247055487ffa3 100644
--- a/deps/libfabric/prov/usnic/src/usdf_progress.c
+++ b/deps/libfabric/prov/usnic/src/usdf_progress.c
@@ -93,7 +93,7 @@ usdf_fabric_progression_thread(void *v)
 	int num_blocked_waiting;
 	int sleep_time;
 	ofi_epoll_t epfd;
-	void *context;
+	struct ofi_epollfds_event event;
 	int ret;
 	int n;
 
@@ -111,14 +111,14 @@ usdf_fabric_progression_thread(void *v)
 			sleep_time = -1;
 		}
 
-		n = ofi_epoll_wait(epfd, &context, 1, sleep_time);
+		n = ofi_epoll_wait(epfd, &event, 1, sleep_time);
 		if (fp->fab_exit || (n < 0 && n != EINTR)) {
 			pthread_exit(NULL);
 		}
 
 		/* consume event if there was one */
 		if (n == 1) {
-			pip = context;
+			pip = event.data.ptr;
 			ret = pip->pi_rtn(pip->pi_context);
 			if (ret != 0) {
 				pthread_exit(NULL);
diff --git a/deps/libfabric/prov/usnic/src/usdf_wait.c b/deps/libfabric/prov/usnic/src/usdf_wait.c
index ea575e3f7057fdc217106e185e6b86a457fdd97b..f94b1dc35330fa41fd5ceaf89b2db86fe6fdd4c4 100644
--- a/deps/libfabric/prov/usnic/src/usdf_wait.c
+++ b/deps/libfabric/prov/usnic/src/usdf_wait.c
@@ -274,7 +274,7 @@ static int usdf_wait_close(struct fid *waitset)
 static int usdf_wait_wait(struct fid_wait *fwait, int timeout)
 {
 	struct usdf_wait *wait;
-	void *context;
+	struct ofi_epollfds_event event;
 	int ret = FI_SUCCESS;
 	int nevents;
 
@@ -289,7 +289,7 @@ static int usdf_wait_wait(struct fid_wait *fwait, int timeout)
 		return ret;
 	}
 
-	nevents = ofi_epoll_wait(wait->object.epfd, &context, 1, timeout);
+	nevents = ofi_epoll_wait(wait->object.epfd, &event, 1, timeout);
 	if (nevents == 0) {
 		ret = -FI_ETIMEDOUT;
 	} else if (nevents < 0) {
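
Both usnic wait paths now receive a full event structure rather than a bare context pointer, mirroring how the underlying Linux epoll API reports readiness. A sketch against the raw kernel interface (not the ofi_epoll wrappers):

#include <stddef.h>
#include <sys/epoll.h>

/* Wait for one event and recover the pointer stored at registration
 * time (epoll_ctl) from event.data.ptr. */
static void *wait_one(int epfd, int timeout_ms)
{
	struct epoll_event event;

	if (epoll_wait(epfd, &event, 1, timeout_ms) != 1)
		return NULL;  /* timeout or error */
	return event.data.ptr;
}
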
diff --git a/deps/libfabric/prov/usnic/src/usnic_direct/vnic_devcmd.h b/deps/libfabric/prov/usnic/src/usnic_direct/vnic_devcmd.h
index 47e06095babce7016a07dbebff11f14c8623ea29..90872381c1ceb6ff68955f6a9c1789555f26b4d0 100644
--- a/deps/libfabric/prov/usnic/src/usnic_direct/vnic_devcmd.h
+++ b/deps/libfabric/prov/usnic/src/usnic_direct/vnic_devcmd.h
@@ -820,7 +820,7 @@ struct vnic_devcmd_notify {
 struct vnic_devcmd_provinfo {
 	u8 oui[3];
 	u8 type;
-	u8 data[0];
+	u8 data[];
 };
 
 /*
@@ -1038,7 +1038,7 @@ enum {
 struct filter_tlv {
 	u_int32_t type;
 	u_int32_t length;
-	u_int32_t val[0];
+	u_int32_t val[];
 };
 
 /* Data for CMD_ADD_FILTER is 2 TLV and filter + action structs */
@@ -1379,9 +1379,9 @@ typedef enum {
  *
  * in:  (u32) a0 = RDMA_SUBCMD_GET_STATS
  *
- * out: (u64) a0 = IG packet count 
+ * out: (u64) a0 = IG packet count
  *      (u64) a1 = IG byte count
- *      (u64) a2 = EG packet count 
+ *      (u64) a2 = EG packet count
  *      (u64) a3 = EG byte count
  */
 #define RDMA_SUBCMD_GET_STATS             7
diff --git a/deps/libfabric/prov/util/src/util_atomic.c b/deps/libfabric/prov/util/src/util_atomic.c
index a95057011ca68fba0ecef105f5e9eff1ecc4ac87..0b985cbe42b71f95bc5495d219e65659a7e0b3ea 100644
--- a/deps/libfabric/prov/util/src/util_atomic.c
+++ b/deps/libfabric/prov/util/src/util_atomic.c
@@ -49,11 +49,14 @@ static const size_t ofi_datatype_size_table[] = {
 	[FI_DOUBLE_COMPLEX] = sizeof(ofi_complex_double),
 	[FI_LONG_DOUBLE]    = sizeof(long double),
 	[FI_LONG_DOUBLE_COMPLEX] = sizeof(ofi_complex_long_double),
+	/* Compute 128-bit integer size, since compiler may not support type. */
+	[FI_INT128]  = sizeof(int64_t) * 2,
+	[FI_UINT128] = sizeof(uint64_t) * 2,
 };
 
 size_t ofi_datatype_size(enum fi_datatype datatype)
 {
-	if (datatype >= FI_DATATYPE_LAST) {
+	if (datatype >= ARRAY_SIZE(ofi_datatype_size_table)) {
 		errno = FI_EINVAL;
 		return 0;
 	}
@@ -152,6 +155,72 @@ size_t ofi_datatype_size(enum fi_datatype datatype)
 #define OFI_OP_READ_COMPLEX		OFI_OP_READ
 #define OFI_OP_WRITE_COMPLEX		OFI_OP_WRITE
 
+#if defined(HAVE_BUILTIN_MM_INT128_ATOMICS)
+
+/*
+ * If the compiler supports 128-bit integer types and atomics,
+ *  then existing macros will just work.
+ */
+#define OFI_DEF_WRITE_INT128_NAME(op, type)	OFI_DEF_WRITE_NAME(op, type)
+#define OFI_DEF_WRITE_INT128_FUNC(op, type)	OFI_DEF_WRITE_FUNC(op, type)
+#define OFI_DEF_WRITEEXT_INT128_NAME(op, type)	OFI_DEF_WRITEEXT_NAME(op, type)
+#define OFI_DEF_WRITEEXT_INT128_FUNC(op, type)	OFI_DEF_WRITEEXT_FUNC(op, type)
+#define OFI_DEF_WRITEEXT_CMP_INT128_NAME(op, type)	\
+	OFI_DEF_WRITEEXT_CMP_NAME(op, type)
+#define OFI_DEF_WRITEEXT_CMP_INT128_FUNC(op, type)	\
+	OFI_DEF_WRITEEXT_CMP_FUNC(op, type)
+#define OFI_DEF_READ_INT128_NAME(op, type)	OFI_DEF_READ_NAME(op, type)
+#define OFI_DEF_READ_INT128_FUNC(op, type)	OFI_DEF_READ_FUNC(op, type)
+#define OFI_DEF_READWRITE_INT128_NAME(op, type)	OFI_DEF_READWRITE_NAME(op, type)
+#define OFI_DEF_READWRITE_INT128_FUNC(op, type)	OFI_DEF_READWRITE_FUNC(op, type)
+#define OFI_DEF_READWRITEEXT_INT128_NAME(op, type)	\
+	OFI_DEF_READWRITEEXT_NAME(op, type)
+#define OFI_DEF_READWRITEEXT_INT128_FUNC(op, type)	\
+	OFI_DEF_READWRITEEXT_FUNC(op, type)
+#define OFI_DEF_READWRITEEXT_CMP_INT128_NAME(op, type)	\
+	OFI_DEF_READWRITEEXT_CMP_NAME(op, type)
+#define OFI_DEF_READWRITEEXT_CMP_INT128_FUNC(op, type)	\
+	OFI_DEF_READWRITEEXT_CMP_FUNC(op, type)
+#define OFI_DEF_EXCHANGE_INT128_NAME(op, type)	OFI_DEF_EXCHANGE_NAME(op, type)
+#define OFI_DEF_EXCHANGE_INT128_FUNC(op, type)	OFI_DEF_EXCHANGE_FUNC(op, type)
+#define OFI_DEF_CSWAP_INT128_NAME(op, type)	OFI_DEF_CSWAP_NAME(op, type)
+#define OFI_DEF_CSWAP_INT128_FUNC(op, type)	OFI_DEF_CSWAP_FUNC(op, type)
+#define OFI_DEF_CSWAPEXT_INT128_NAME(op, type)	OFI_DEF_CSWAPEXT_NAME(op, type)
+#define OFI_DEF_CSWAPEXT_INT128_FUNC(op, type)	OFI_DEF_CSWAPEXT_FUNC(op, type)
+#define OFI_DEF_CSWAPEXT_CMP_INT128_NAME(op, type)	\
+	OFI_DEF_CSWAPEXT_CMP_NAME(op, type)
+#define OFI_DEF_CSWAPEXT_CMP_INT128_FUNC(op, type)	\
+	OFI_DEF_CSWAPEXT_CMP_FUNC(op, type)
+
+#else /* HAVE_BUILTIN_MM_INT128_ATOMICS */
+
+/* Otherwise, 128-bit atomic types are not supported */
+
+#define OFI_DEF_WRITE_INT128_NAME(op, type)	NULL,
+#define OFI_DEF_WRITE_INT128_FUNC(op, type)
+#define OFI_DEF_WRITEEXT_INT128_NAME(op, type)	NULL,
+#define OFI_DEF_WRITEEXT_INT128_FUNC(op, type)
+#define OFI_DEF_WRITEEXT_CMP_INT128_NAME(op, type) NULL,
+#define OFI_DEF_WRITEEXT_CMP_INT128_FUNC(op, type)
+#define OFI_DEF_READ_INT128_NAME(op, type)	NULL,
+#define OFI_DEF_READ_INT128_FUNC(op, type)
+#define OFI_DEF_READWRITE_INT128_NAME(op, type)	NULL,
+#define OFI_DEF_READWRITE_INT128_FUNC(op, type)
+#define OFI_DEF_READWRITEEXT_INT128_NAME(op, type) NULL,
+#define OFI_DEF_READWRITEEXT_INT128_FUNC(op, type)
+#define OFI_DEF_READWRITEEXT_CMP_INT128_NAME(op, type) NULL,
+#define OFI_DEF_READWRITEEXT_CMP_INT128_FUNC(op, type)
+#define OFI_DEF_EXCHANGE_INT128_NAME(op, type)	NULL,
+#define OFI_DEF_EXCHANGE_INT128_FUNC(op, type)
+#define OFI_DEF_CSWAP_INT128_NAME(op, type)	NULL,
+#define OFI_DEF_CSWAP_INT128_FUNC(op, type)
+#define OFI_DEF_CSWAPEXT_INT128_NAME(op, type)	NULL,
+#define OFI_DEF_CSWAPEXT_INT128_FUNC(op, type)
+#define OFI_DEF_CSWAPEXT_CMP_INT128_NAME(op, type) NULL,
+#define OFI_DEF_CSWAPEXT_CMP_INT128_FUNC(op, type)
+
+#endif /* HAVE_BUILTIN_MM_INT128_ATOMICS */
+
 /********************************
  * ATOMIC TYPE function templates
  ********************************/
@@ -727,11 +796,13 @@ size_t ofi_datatype_size(enum fi_datatype datatype)
 	OFI_DEF_NOOP_##FUNCNAME						\
 	OFI_DEF_NOOP_##FUNCNAME						\
 	OFI_DEF_NOOP_##FUNCNAME						\
-	OFI_DEF_NOOP_##FUNCNAME
+	OFI_DEF_NOOP_##FUNCNAME                                         \
+	OFI_DEF_##ATOMICTYPE##_INT128_##FUNCNAME(op, ofi_int128_t)	\
+	OFI_DEF_##ATOMICTYPE##_INT128_##FUNCNAME(op, ofi_uint128_t)
 
 #ifdef HAVE_BUILTIN_MM_ATOMICS
 
-/* Only support 8 byte and under datatypes */
+/* Support only datatypes of 8 bytes or less, plus non-complex 16-byte types */
 #define OFI_DEFINE_ALL_HANDLERS(ATOMICTYPE, FUNCNAME, op)		\
 	OFI_DEF_##ATOMICTYPE##_##FUNCNAME(op, int8_t)			\
 	OFI_DEF_##ATOMICTYPE##_##FUNCNAME(op, uint8_t)			\
@@ -746,7 +817,9 @@ size_t ofi_datatype_size(enum fi_datatype datatype)
 	OFI_DEF_##ATOMICTYPE##_COMPLEX_##FUNCNAME(op ##_COMPLEX, float)	\
 	OFI_DEF_NOOP_##FUNCNAME						\
 	OFI_DEF_NOOP_##FUNCNAME						\
-	OFI_DEF_NOOP_##FUNCNAME
+	OFI_DEF_NOOP_##FUNCNAME						\
+	OFI_DEF_##ATOMICTYPE##_INT128_##FUNCNAME(op, ofi_int128_t)	\
+	OFI_DEF_##ATOMICTYPE##_INT128_##FUNCNAME(op, ofi_uint128_t)
 
 #define OFI_DEFINE_REALNO_HANDLERS(ATOMICTYPE, FUNCNAME, op)		\
 	OFI_DEF_##ATOMICTYPE##_##FUNCNAME(op, int8_t)			\
@@ -762,7 +835,9 @@ size_t ofi_datatype_size(enum fi_datatype datatype)
 	OFI_DEF_NOOP_##FUNCNAME						\
 	OFI_DEF_NOOP_##FUNCNAME						\
 	OFI_DEF_NOOP_##FUNCNAME						\
-	OFI_DEF_NOOP_##FUNCNAME
+	OFI_DEF_NOOP_##FUNCNAME						\
+	OFI_DEF_##ATOMICTYPE##_INT128_##FUNCNAME(op, ofi_int128_t)	\
+	OFI_DEF_##ATOMICTYPE##_INT128_##FUNCNAME(op, ofi_uint128_t)
 
 #else /* HAVE_BUILTIN_MM_ATOMICS */
 
@@ -780,7 +855,9 @@ size_t ofi_datatype_size(enum fi_datatype datatype)
 	OFI_DEF_##ATOMICTYPE##_COMPLEX_##FUNCNAME(op ##_COMPLEX, float)	\
 	OFI_DEF_##ATOMICTYPE##_COMPLEX_##FUNCNAME(op ##_COMPLEX, double)\
 	OFI_DEF_##ATOMICTYPE##_##FUNCNAME(op, long_double)		\
-	OFI_DEF_##ATOMICTYPE##_COMPLEX_##FUNCNAME(op ##_COMPLEX, long_double)
+	OFI_DEF_##ATOMICTYPE##_COMPLEX_##FUNCNAME(op ##_COMPLEX, long_double) \
+	OFI_DEF_##ATOMICTYPE##_INT128_##FUNCNAME(op, ofi_int128_t)	\
+	OFI_DEF_##ATOMICTYPE##_INT128_##FUNCNAME(op, ofi_uint128_t)
 
 #define OFI_DEFINE_REALNO_HANDLERS(ATOMICTYPE, FUNCNAME, op)		\
 	OFI_DEF_##ATOMICTYPE##_##FUNCNAME(op, int8_t)			\
@@ -796,12 +873,18 @@ size_t ofi_datatype_size(enum fi_datatype datatype)
 	OFI_DEF_NOOP_##FUNCNAME						\
 	OFI_DEF_NOOP_##FUNCNAME						\
 	OFI_DEF_##ATOMICTYPE##_##FUNCNAME(op, long_double)		\
-	OFI_DEF_NOOP_##FUNCNAME
+	OFI_DEF_NOOP_##FUNCNAME						\
+	OFI_DEF_##ATOMICTYPE##_INT128_##FUNCNAME(op, ofi_int128_t)	\
+	OFI_DEF_##ATOMICTYPE##_INT128_##FUNCNAME(op, ofi_uint128_t)
 
 #endif /* HAVE_BUILTIN_MM_ATOMICS */
 
-#define OFI_OP_NOT_SUPPORTED(op)	NULL, NULL, NULL, NULL, NULL,	\
-			NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
+/* 5 per line to be easily counted by inspection. */
+#define OFI_OP_NOT_SUPPORTED(op)		\
+	NULL, NULL, NULL, NULL, NULL,		\
+	NULL, NULL, NULL, NULL, NULL,		\
+	NULL, NULL, NULL, NULL, NULL,		\
+	NULL
 
 #ifdef HAVE_BUILTIN_MM_ATOMICS
 
@@ -821,7 +904,7 @@ OFI_DEFINE_ALL_HANDLERS(WRITEEXT, FUNC, OFI_OP_LXOR)
 OFI_DEFINE_INT_HANDLERS(WRITE, FUNC, OFI_OP_BXOR)
 OFI_DEFINE_ALL_HANDLERS(WRITE, FUNC, OFI_OP_WRITE)
 
-void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][FI_DATATYPE_LAST])
+void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][OFI_DATATYPE_CNT])
 	(void *dst, const void *src, size_t cnt) =
 {
 	{ OFI_DEFINE_REALNO_HANDLERS(WRITEEXT_CMP, NAME, OFI_OP_MIN) },
@@ -855,7 +938,7 @@ OFI_DEFINE_INT_HANDLERS(READWRITE, FUNC, OFI_OP_BXOR)
 OFI_DEFINE_ALL_HANDLERS(READ, FUNC, OFI_OP_READ)
 OFI_DEFINE_ALL_HANDLERS(EXCHANGE, FUNC, OFI_OP_READWRITE)
 
-void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_CNT][FI_DATATYPE_LAST])
+void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_CNT][OFI_DATATYPE_CNT])
 	(void *dst, const void *src, void *res, size_t cnt) =
 {
 	{ OFI_DEFINE_REALNO_HANDLERS(READWRITEEXT_CMP, NAME, OFI_OP_MIN) },
@@ -884,7 +967,7 @@ OFI_DEFINE_REALNO_HANDLERS(CSWAPEXT_CMP, FUNC, OFI_OP_CSWAP_GE)
 OFI_DEFINE_REALNO_HANDLERS(CSWAPEXT_CMP, FUNC, OFI_OP_CSWAP_GT)
 OFI_DEFINE_INT_HANDLERS(CSWAPEXT, FUNC, OFI_OP_MSWAP)
 
-void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][FI_DATATYPE_LAST])
+void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][OFI_DATATYPE_CNT])
 	(void *dst, const void *src, const void *cmp, void *res, size_t cnt) =
 {
 	{ OFI_DEFINE_ALL_HANDLERS(CSWAP, NAME, OFI_OP_CSWAP_EQ) },
@@ -919,7 +1002,7 @@ OFI_DEFINE_ALL_HANDLERS(WRITE, FUNC, OFI_OP_LXOR)
 OFI_DEFINE_INT_HANDLERS(WRITE, FUNC, OFI_OP_BXOR)
 OFI_DEFINE_ALL_HANDLERS(WRITE, FUNC, OFI_OP_WRITE)
 
-void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][FI_DATATYPE_LAST])
+void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][OFI_DATATYPE_CNT])
 	(void *dst, const void *src, size_t cnt) =
 {
 	{ OFI_DEFINE_REALNO_HANDLERS(WRITE, NAME, OFI_OP_MIN) },
@@ -953,7 +1036,7 @@ OFI_DEFINE_INT_HANDLERS(READWRITE, FUNC, OFI_OP_BXOR)
 OFI_DEFINE_ALL_HANDLERS(READ, FUNC, OFI_OP_READ)
 OFI_DEFINE_ALL_HANDLERS(READWRITE, FUNC, OFI_OP_WRITE)
 
-void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_CNT][FI_DATATYPE_LAST])
+void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_CNT][OFI_DATATYPE_CNT])
 	(void *dst, const void *src, void *res, size_t cnt) =
 {
 	{ OFI_DEFINE_REALNO_HANDLERS(READWRITE, NAME, OFI_OP_MIN) },
@@ -982,7 +1065,7 @@ OFI_DEFINE_REALNO_HANDLERS(CSWAP, FUNC, OFI_OP_CSWAP_GE)
 OFI_DEFINE_REALNO_HANDLERS(CSWAP, FUNC, OFI_OP_CSWAP_GT)
 OFI_DEFINE_INT_HANDLERS(CSWAP, FUNC, OFI_OP_MSWAP)
 
-void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][FI_DATATYPE_LAST])
+void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][OFI_DATATYPE_CNT])
 	(void *dst, const void *src, const void *cmp, void *res, size_t cnt) =
 {
 	{ OFI_DEFINE_ALL_HANDLERS(CSWAP, NAME, OFI_OP_CSWAP_EQ) },
@@ -1023,7 +1106,7 @@ int ofi_atomic_valid(const struct fi_provider *prov,
 		return -FI_EBADFLAGS;
 	}
 
-	if (datatype >= FI_DATATYPE_LAST) {
+	if (datatype >= OFI_DATATYPE_CNT) {
 		FI_INFO(prov, FI_LOG_DOMAIN, "Invalid datatype\n");
 		return -FI_EOPNOTSUPP;
 	}
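
Context for the handler tables above: each is a dense 2D array indexed by operation and datatype, and NULL marks an unsupported pair, which is exactly what the INT128 macros expand to when the compiler lacks 128-bit atomics (hence the strict bounds check on datatype before any table lookup). A cut-down dispatch of the same shape, as a sketch:

#include <stddef.h>

enum { OP_SUM, OP_PROD, OP_CNT };
enum { DT_I32, DT_I64, DT_I128, DT_CNT };

static void sum_i32(void *d, const void *s, size_t n) { (void) d; (void) s; (void) n; }
static void sum_i64(void *d, const void *s, size_t n) { (void) d; (void) s; (void) n; }

/* NULL marks an unsupported (op, datatype) pair, e.g. 128-bit types
 * on a compiler without the needed atomics. */
static void (*handlers[OP_CNT][DT_CNT])(void *, const void *, size_t) = {
	[OP_SUM]  = { sum_i32, sum_i64, NULL },
	[OP_PROD] = { NULL,    NULL,    NULL },
};

static int do_atomic(int op, int dt, void *dst, const void *src, size_t cnt)
{
	if (op >= OP_CNT || dt >= DT_CNT || !handlers[op][dt])
		return -1;  /* not supported */
	handlers[op][dt](dst, src, cnt);
	return 0;
}
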
diff --git a/deps/libfabric/prov/util/src/util_attr.c b/deps/libfabric/prov/util/src/util_attr.c
index 84fb143ed1b20e88f89789a0ad3900c85c171b8c..7acb96f8da7ea3a2344d3ad3a18fafb6f9edb658 100644
--- a/deps/libfabric/prov/util/src/util_attr.c
+++ b/deps/libfabric/prov/util/src/util_attr.c
@@ -94,13 +94,14 @@ char *ofi_strdup_append(const char *head, const char *tail)
 int ofi_exclude_prov_name(char **prov_name_list, const char *util_prov_name)
 {
 	char *exclude, *name, *temp;
+	int length;
 
-	exclude = malloc(strlen(util_prov_name) + 2);
+	length = strlen(util_prov_name) + 2;
+	exclude = malloc(length);
 	if (!exclude)
 		return -FI_ENOMEM;
 
-	exclude[0] = '^';
-	strcpy(&exclude[1], util_prov_name);
+	snprintf(exclude, length, "^%s", util_prov_name);
 
 	if (!*prov_name_list)
 		goto out;
@@ -397,7 +398,7 @@ int ofi_check_fabric_attr(const struct fi_provider *prov,
 	 * user's hints, if one is specified.
 	 */
 	if (prov_attr->prov_name && user_attr->prov_name &&
-	    !strcasestr(user_attr->prov_name, prov_attr->prov_name)) {
+	    strcasestr(user_attr->prov_name, prov_attr->prov_name)) {
 		FI_INFO(prov, FI_LOG_CORE,
 			"Requesting provider %s, skipping %s\n",
 			prov_attr->prov_name, user_attr->prov_name);
@@ -543,7 +544,7 @@ int ofi_check_mr_mode(const struct fi_provider *prov, uint32_t api_version,
 out:
 	if (ret) {
 		FI_INFO(prov, FI_LOG_CORE, "Invalid memory registration mode\n");
-		FI_INFO_MR_MODE(prov, prov_mode, user_mode);
+		OFI_INFO_MR_MODE(prov, prov_mode, user_mode);
 	}
 
 	return ret;
@@ -588,7 +589,7 @@ int ofi_check_domain_attr(const struct fi_provider *prov, uint32_t api_version,
 
 	if (user_attr->cq_data_size > prov_attr->cq_data_size) {
 		FI_INFO(prov, FI_LOG_CORE, "CQ data size too large\n");
-		FI_INFO_CHECK_VAL(prov, prov_attr, user_attr, cq_data_size);
+		OFI_INFO_CHECK_SIZE(prov, prov_attr, user_attr, cq_data_size);
 		return -FI_ENODATA;
 	}
 
@@ -597,12 +598,12 @@ int ofi_check_domain_attr(const struct fi_provider *prov, uint32_t api_version,
 
 	if (user_attr->max_ep_stx_ctx > prov_attr->max_ep_stx_ctx) {
 		FI_INFO(prov, FI_LOG_CORE, "max_ep_stx_ctx greater than supported\n");
-		FI_INFO_CHECK_VAL(prov, prov_attr, user_attr, max_ep_stx_ctx);
+		OFI_INFO_CHECK_SIZE(prov, prov_attr, user_attr, max_ep_stx_ctx);
 	}
 
 	if (user_attr->max_ep_srx_ctx > prov_attr->max_ep_srx_ctx) {
 		FI_INFO(prov, FI_LOG_CORE, "max_ep_srx_ctx greater than supported\n");
-		FI_INFO_CHECK_VAL(prov, prov_attr, user_attr, max_ep_srx_ctx);
+		OFI_INFO_CHECK_SIZE(prov, prov_attr, user_attr, max_ep_srx_ctx);
 	}
 
 	/* following checks only apply to api 1.5 and beyond */
@@ -616,31 +617,31 @@ int ofi_check_domain_attr(const struct fi_provider *prov, uint32_t api_version,
 
 	if (user_attr->mr_iov_limit > prov_attr->mr_iov_limit) {
 		FI_INFO(prov, FI_LOG_CORE, "MR iov limit too large\n");
-		FI_INFO_CHECK_VAL(prov, prov_attr, user_attr, mr_iov_limit);
+		OFI_INFO_CHECK_SIZE(prov, prov_attr, user_attr, mr_iov_limit);
 		return -FI_ENODATA;
 	}
 
 	if (user_attr->caps & ~(prov_attr->caps)) {
 		FI_INFO(prov, FI_LOG_CORE, "Requested domain caps not supported\n");
-		FI_INFO_CHECK(prov, prov_attr, user_attr, caps, FI_TYPE_CAPS);
+		OFI_INFO_CHECK(prov, prov_attr, user_attr, caps, FI_TYPE_CAPS);
 		return -FI_ENODATA;
 	}
 
 	if ((user_attr->mode & prov_attr->mode) != prov_attr->mode) {
 		FI_INFO(prov, FI_LOG_CORE, "Required domain mode missing\n");
-		FI_INFO_MODE(prov, prov_attr->mode, user_attr->mode);
+		OFI_INFO_MODE(prov, prov_attr->mode, user_attr->mode);
 		return -FI_ENODATA;
 	}
 
 	if (user_attr->max_err_data > prov_attr->max_err_data) {
 		FI_INFO(prov, FI_LOG_CORE, "Max err data too large\n");
-		FI_INFO_CHECK_VAL(prov, prov_attr, user_attr, max_err_data);
+		OFI_INFO_CHECK_SIZE(prov, prov_attr, user_attr, max_err_data);
 		return -FI_ENODATA;
 	}
 
 	if (user_attr->mr_cnt > prov_attr->mr_cnt) {
 		FI_INFO(prov, FI_LOG_CORE, "MR count too large\n");
-		FI_INFO_CHECK_VAL(prov, prov_attr, user_attr, mr_cnt);
+		OFI_INFO_CHECK_SIZE(prov, prov_attr, user_attr, mr_cnt);
 		return -FI_ENODATA;
 	}
 
@@ -655,7 +656,7 @@ int ofi_check_ep_type(const struct fi_provider *prov,
 	    (prov_attr->type != FI_EP_UNSPEC) &&
 	    (user_attr->type != prov_attr->type)) {
 		FI_INFO(prov, FI_LOG_CORE, "unsupported endpoint type\n");
-		FI_INFO_CHECK(prov, prov_attr, user_attr, type, FI_TYPE_EP_TYPE);
+		OFI_INFO_CHECK(prov, prov_attr, user_attr, type, FI_TYPE_EP_TYPE);
 		return -FI_ENODATA;
 	}
 	return 0;
@@ -677,7 +678,7 @@ int ofi_check_ep_attr(const struct util_prov *util_prov, uint32_t api_version,
 	if ((user_attr->protocol != FI_PROTO_UNSPEC) &&
 	    (user_attr->protocol != prov_attr->protocol)) {
 		FI_INFO(prov, FI_LOG_CORE, "Unsupported protocol\n");
-		FI_INFO_CHECK(prov, prov_attr, user_attr, protocol, FI_TYPE_PROTOCOL);
+		OFI_INFO_CHECK(prov, prov_attr, user_attr, protocol, FI_TYPE_PROTOCOL);
 		return -FI_ENODATA;
 	}
 
@@ -689,7 +690,7 @@ int ofi_check_ep_attr(const struct util_prov *util_prov, uint32_t api_version,
 
 	if (user_attr->max_msg_size > prov_attr->max_msg_size) {
 		FI_INFO(prov, FI_LOG_CORE, "Max message size too large\n");
-		FI_INFO_CHECK_VAL(prov, prov_attr, user_attr, max_msg_size);
+		OFI_INFO_CHECK_SIZE(prov, prov_attr, user_attr, max_msg_size);
 		return -FI_ENODATA;
 	}
 
@@ -708,6 +709,11 @@ int ofi_check_ep_attr(const struct util_prov *util_prov, uint32_t api_version,
 				user_attr->tx_ctx_cnt);
 			return -FI_ENODATA;
 		}
+	} else if (!user_attr->tx_ctx_cnt &&
+		   prov_attr->tx_ctx_cnt == FI_SHARED_CONTEXT) {
+		FI_INFO(prov, FI_LOG_CORE,
+			"Provider requires use of shared tx context\n");
+		return -FI_ENODATA;
 	}
 
 	if (user_attr->rx_ctx_cnt > prov_info->domain_attr->max_ep_rx_ctx) {
@@ -725,6 +731,11 @@ int ofi_check_ep_attr(const struct util_prov *util_prov, uint32_t api_version,
 				user_attr->rx_ctx_cnt);
 			return -FI_ENODATA;
 		}
+	} else if (!user_attr->rx_ctx_cnt &&
+		   prov_attr->rx_ctx_cnt == FI_SHARED_CONTEXT) {
+		FI_INFO(prov, FI_LOG_CORE,
+			"Provider requires use of shared rx context\n");
+		return -FI_ENODATA;
 	}
 
 	if (user_info->caps & (FI_RMA | FI_ATOMIC)) {
@@ -732,8 +743,8 @@ int ofi_check_ep_attr(const struct util_prov *util_prov, uint32_t api_version,
 		    prov_attr->max_order_raw_size) {
 			FI_INFO(prov, FI_LOG_CORE,
 				"Max order RAW size exceeds supported size\n");
-			FI_INFO_CHECK_VAL(prov, prov_attr, user_attr,
-					  max_order_raw_size);
+			OFI_INFO_CHECK_SIZE(prov, prov_attr, user_attr,
+					    max_order_raw_size);
 			return -FI_ENODATA;
 		}
 
@@ -741,8 +752,8 @@ int ofi_check_ep_attr(const struct util_prov *util_prov, uint32_t api_version,
 		    prov_attr->max_order_war_size) {
 			FI_INFO(prov, FI_LOG_CORE,
 				"Max order WAR size exceeds supported size\n");
-			FI_INFO_CHECK_VAL(prov, prov_attr, user_attr,
-					  max_order_war_size);
+			OFI_INFO_CHECK_SIZE(prov, prov_attr, user_attr,
+					    max_order_war_size);
 			return -FI_ENODATA;
 		}
 
@@ -750,8 +761,8 @@ int ofi_check_ep_attr(const struct util_prov *util_prov, uint32_t api_version,
 		    prov_attr->max_order_waw_size) {
 			FI_INFO(prov, FI_LOG_CORE,
 				"Max order WAW size exceeds supported size\n");
-			FI_INFO_CHECK_VAL(prov, prov_attr, user_attr,
-					  max_order_waw_size);
+			OFI_INFO_CHECK_SIZE(prov, prov_attr, user_attr,
+					    max_order_waw_size);
 			return -FI_ENODATA;
 		}
 	}
@@ -759,7 +770,7 @@ int ofi_check_ep_attr(const struct util_prov *util_prov, uint32_t api_version,
 	if (user_attr->auth_key_size &&
 	    (user_attr->auth_key_size != prov_attr->auth_key_size)) {
 		FI_INFO(prov, FI_LOG_CORE, "Unsupported authentication size.");
-		FI_INFO_CHECK_VAL(prov, prov_attr, user_attr, auth_key_size);
+		OFI_INFO_CHECK_SIZE(prov, prov_attr, user_attr, auth_key_size);
 		return -FI_ENODATA;
 	}
 
@@ -767,7 +778,7 @@ int ofi_check_ep_attr(const struct util_prov *util_prov, uint32_t api_version,
 	    ofi_max_tag(user_attr->mem_tag_format) >
 		    ofi_max_tag(prov_attr->mem_tag_format)) {
 		FI_INFO(prov, FI_LOG_CORE, "Tag size exceeds supported size\n");
-		FI_INFO_CHECK_VAL(prov, prov_attr, user_attr, mem_tag_format);
+		OFI_INFO_CHECK_U64(prov, prov_attr, user_attr, mem_tag_format);
 		return -FI_ENODATA;
 	}
 
@@ -786,54 +797,54 @@ int ofi_check_rx_attr(const struct fi_provider *prov,
 
 	if ((user_attr->caps & ~OFI_IGNORED_RX_CAPS) & ~(prov_attr->caps)) {
 		FI_INFO(prov, FI_LOG_CORE, "caps not supported\n");
-		FI_INFO_CHECK(prov, prov_attr, user_attr, caps, FI_TYPE_CAPS);
+		OFI_INFO_CHECK(prov, prov_attr, user_attr, caps, FI_TYPE_CAPS);
 		return -FI_ENODATA;
 	}
 
 	info_mode = user_attr->mode ? user_attr->mode : info_mode;
 	if ((info_mode & prov_attr->mode) != prov_attr->mode) {
 		FI_INFO(prov, FI_LOG_CORE, "needed mode not set\n");
-		FI_INFO_MODE(prov, prov_attr->mode, user_attr->mode);
+		OFI_INFO_MODE(prov, prov_attr->mode, user_attr->mode);
 		return -FI_ENODATA;
 	}
 
 	if (user_attr->op_flags & ~(prov_attr->op_flags)) {
 		FI_INFO(prov, FI_LOG_CORE, "op_flags not supported\n");
-		FI_INFO_CHECK(prov, prov_attr, user_attr, op_flags,
+		OFI_INFO_CHECK(prov, prov_attr, user_attr, op_flags,
 			     FI_TYPE_OP_FLAGS);
 		return -FI_ENODATA;
 	}
 
 	if (user_attr->msg_order & ~(prov_attr->msg_order)) {
 		FI_INFO(prov, FI_LOG_CORE, "msg_order not supported\n");
-		FI_INFO_CHECK(prov, prov_attr, user_attr, msg_order,
+		OFI_INFO_CHECK(prov, prov_attr, user_attr, msg_order,
 			     FI_TYPE_MSG_ORDER);
 		return -FI_ENODATA;
 	}
 
 	if (user_attr->comp_order & ~(prov_attr->comp_order)) {
 		FI_INFO(prov, FI_LOG_CORE, "comp_order not supported\n");
-		FI_INFO_CHECK(prov, prov_attr, user_attr, comp_order,
+		OFI_INFO_CHECK(prov, prov_attr, user_attr, comp_order,
 			     FI_TYPE_MSG_ORDER);
 		return -FI_ENODATA;
 	}
 
 	if (user_attr->total_buffered_recv > prov_attr->total_buffered_recv) {
 		FI_INFO(prov, FI_LOG_CORE, "total_buffered_recv too large\n");
-		FI_INFO_CHECK_VAL(prov, prov_attr, user_attr,
-				  total_buffered_recv);
+		OFI_INFO_CHECK_SIZE(prov, prov_attr, user_attr,
+				    total_buffered_recv);
 		return -FI_ENODATA;
 	}
 
 	if (user_attr->size > prov_attr->size) {
 		FI_INFO(prov, FI_LOG_CORE, "size is greater than supported\n");
-		FI_INFO_CHECK_VAL(prov, prov_attr, user_attr, size);
+		OFI_INFO_CHECK_SIZE(prov, prov_attr, user_attr, size);
 		return -FI_ENODATA;
 	}
 
 	if (user_attr->iov_limit > prov_attr->iov_limit) {
 		FI_INFO(prov, FI_LOG_CORE, "iov_limit too large\n");
-		FI_INFO_CHECK_VAL(prov, prov_attr, user_attr, iov_limit);
+		OFI_INFO_CHECK_SIZE(prov, prov_attr, user_attr, iov_limit);
 		return -FI_ENODATA;
 	}
 
@@ -842,8 +853,8 @@ int ofi_check_rx_attr(const struct fi_provider *prov,
 		/* Just log a notification, but ignore the value */
 		FI_INFO(prov, FI_LOG_CORE,
 			"Total buffered recv size exceeds supported size\n");
-		FI_INFO_CHECK_VAL(prov, prov_attr, user_attr,
-				  total_buffered_recv);
+		OFI_INFO_CHECK_SIZE(prov, prov_attr, user_attr,
+				    total_buffered_recv);
 	}
 
 	return 0;
@@ -867,7 +878,7 @@ int ofi_check_attr_subset(const struct fi_provider *prov,
 	if (~expanded_caps & requested_caps) {
 		FI_INFO(prov, FI_LOG_CORE,
 			"requested caps not subset of base endpoint caps\n");
-		FI_INFO_FIELD(prov, expanded_caps, requested_caps,
+		OFI_INFO_FIELD(prov, expanded_caps, requested_caps,
 			"Supported", "Requested", FI_TYPE_CAPS);
 		return -FI_ENODATA;
 	}
@@ -884,59 +895,59 @@ int ofi_check_tx_attr(const struct fi_provider *prov,
 
 	if ((user_attr->caps & ~OFI_IGNORED_TX_CAPS) & ~(prov_attr->caps)) {
 		FI_INFO(prov, FI_LOG_CORE, "caps not supported\n");
-		FI_INFO_CHECK(prov, prov_attr, user_attr, caps, FI_TYPE_CAPS);
+		OFI_INFO_CHECK(prov, prov_attr, user_attr, caps, FI_TYPE_CAPS);
 		return -FI_ENODATA;
 	}
 
 	info_mode = user_attr->mode ? user_attr->mode : info_mode;
 	if ((info_mode & prov_attr->mode) != prov_attr->mode) {
 		FI_INFO(prov, FI_LOG_CORE, "needed mode not set\n");
-		FI_INFO_MODE(prov, prov_attr->mode, user_attr->mode);
+		OFI_INFO_MODE(prov, prov_attr->mode, user_attr->mode);
 		return -FI_ENODATA;
 	}
 
 	if (user_attr->op_flags & ~(prov_attr->op_flags)) {
 		FI_INFO(prov, FI_LOG_CORE, "op_flags not supported\n");
-		FI_INFO_CHECK(prov, prov_attr, user_attr, op_flags,
+		OFI_INFO_CHECK(prov, prov_attr, user_attr, op_flags,
 			     FI_TYPE_OP_FLAGS);
 		return -FI_ENODATA;
 	}
 
 	if (user_attr->msg_order & ~(prov_attr->msg_order)) {
 		FI_INFO(prov, FI_LOG_CORE, "msg_order not supported\n");
-		FI_INFO_CHECK(prov, prov_attr, user_attr, msg_order,
+		OFI_INFO_CHECK(prov, prov_attr, user_attr, msg_order,
 			     FI_TYPE_MSG_ORDER);
 		return -FI_ENODATA;
 	}
 
 	if (user_attr->comp_order & ~(prov_attr->comp_order)) {
 		FI_INFO(prov, FI_LOG_CORE, "comp_order not supported\n");
-		FI_INFO_CHECK(prov, prov_attr, user_attr, comp_order,
+		OFI_INFO_CHECK(prov, prov_attr, user_attr, comp_order,
 			     FI_TYPE_MSG_ORDER);
 		return -FI_ENODATA;
 	}
 
 	if (user_attr->inject_size > prov_attr->inject_size) {
 		FI_INFO(prov, FI_LOG_CORE, "inject_size too large\n");
-		FI_INFO_CHECK_VAL(prov, prov_attr, user_attr, inject_size);
+		OFI_INFO_CHECK_SIZE(prov, prov_attr, user_attr, inject_size);
 		return -FI_ENODATA;
 	}
 
 	if (user_attr->size > prov_attr->size) {
 		FI_INFO(prov, FI_LOG_CORE, "size is greater than supported\n");
-		FI_INFO_CHECK_VAL(prov, prov_attr, user_attr, size);
+		OFI_INFO_CHECK_SIZE(prov, prov_attr, user_attr, size);
 		return -FI_ENODATA;
 	}
 
 	if (user_attr->iov_limit > prov_attr->iov_limit) {
 		FI_INFO(prov, FI_LOG_CORE, "iov_limit too large\n");
-		FI_INFO_CHECK_VAL(prov, prov_attr, user_attr, iov_limit);
+		OFI_INFO_CHECK_SIZE(prov, prov_attr, user_attr, iov_limit);
 		return -FI_ENODATA;
 	}
 
 	if (user_attr->rma_iov_limit > prov_attr->rma_iov_limit) {
 		FI_INFO(prov, FI_LOG_CORE, "rma_iov_limit too large\n");
-		FI_INFO_CHECK_VAL(prov, prov_attr, user_attr, rma_iov_limit);
+		OFI_INFO_CHECK_SIZE(prov, prov_attr, user_attr, rma_iov_limit);
 		return -FI_ENODATA;
 	}
 
@@ -988,10 +999,18 @@ int ofi_prov_check_dup_info(const struct util_prov *util_prov,
 				     api_version, user_info);
 	    	if (ret)
 			continue;
+
 		if (!(fi = fi_dupinfo(prov_info))) {
 			ret = -FI_ENOMEM;
 			goto err;
 		}
+
+		if (util_prov->alter_defaults) {
+			ret = util_prov->alter_defaults(api_version, user_info,
+							prov_info, fi);
+			assert(ret == FI_SUCCESS);
+		}
+
 		if (!*info)
 			*info = fi;
 		else
@@ -1030,7 +1049,7 @@ int ofi_check_info(const struct util_prov *util_prov,
 
 	if (user_info->caps & ~(prov_info->caps)) {
 		FI_INFO(prov, FI_LOG_CORE, "Unsupported capabilities\n");
-		FI_INFO_CHECK(prov, prov_info, user_info, caps, FI_TYPE_CAPS);
+		OFI_INFO_CHECK(prov, prov_info, user_info, caps, FI_TYPE_CAPS);
 		return -FI_ENODATA;
 	}
 
@@ -1038,14 +1057,14 @@ int ofi_check_info(const struct util_prov *util_prov,
 
 	if ((user_info->mode & prov_mode) != prov_mode) {
 		FI_INFO(prov, FI_LOG_CORE, "needed mode not set\n");
-		FI_INFO_MODE(prov, prov_mode, user_info->mode);
+		OFI_INFO_MODE(prov, prov_mode, user_info->mode);
 		return -FI_ENODATA;
 	}
 
 	if (!fi_valid_addr_format(prov_info->addr_format,
 				  user_info->addr_format)) {
 		FI_INFO(prov, FI_LOG_CORE, "address format not supported\n");
-		FI_INFO_CHECK(prov, prov_info, user_info, addr_format,
+		OFI_INFO_CHECK(prov, prov_info, user_info, addr_format,
 			      FI_TYPE_ADDR_FORMAT);
 		return -FI_ENODATA;
 	}
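
The mode test above is a plain bitwise subset check: every bit the provider requires must be present in the user's `mode`, while extra user bits are harmless. A minimal standalone sketch of that test (the `EX_MODE_*` bits and `mode_ok()` are made-up stand-ins, not the real `FI_*` definitions):

```
#include <stdint.h>
#include <stdio.h>

/* Made-up stand-ins for mode bits; the real FI_* definitions live in
 * rdma/fabric.h. */
#define EX_MODE_CONTEXT    (1ULL << 0)
#define EX_MODE_MSG_PREFIX (1ULL << 1)

/* The (user & prov) != prov test used above, inverted: all bits the
 * provider requires must be set by the user. */
static int mode_ok(uint64_t user_mode, uint64_t prov_mode)
{
	return (user_mode & prov_mode) == prov_mode;
}

int main(void)
{
	uint64_t prov = EX_MODE_CONTEXT | EX_MODE_MSG_PREFIX;

	printf("%d\n", mode_ok(EX_MODE_CONTEXT, prov));    /* 0: prefix bit missing */
	printf("%d\n", mode_ok(prov | (1ULL << 7), prov)); /* 1: extra bits are fine */
	return 0;
}
```
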
diff --git a/deps/libfabric/prov/util/src/util_av.c b/deps/libfabric/prov/util/src/util_av.c
index 2ec86d7d2cb02ad6f7b9dc19ca60d9558fcb7760..9401189dbd572317f3b96573f70ecb364ac904d3 100644
--- a/deps/libfabric/prov/util/src/util_av.c
+++ b/deps/libfabric/prov/util/src/util_av.c
@@ -247,28 +247,46 @@ void *ofi_av_get_addr(struct util_av *av, fi_addr_t fi_addr)
 	return entry->data;
 }
 
-int ofi_verify_av_insert(struct util_av *av, uint64_t flags)
+void *ofi_av_addr_context(struct util_av *av, fi_addr_t fi_addr)
 {
-	if ((av->flags & FI_EVENT) && !av->eq) {
-		FI_WARN(av->prov, FI_LOG_AV, "no EQ bound to AV\n");
-		return -FI_ENOEQ;
+	void *addr;
+
+	addr = ofi_av_get_addr(av, fi_addr);
+	return (char *) addr + av->context_offset;
+}
+
+int ofi_verify_av_insert(struct util_av *av, uint64_t flags, void *context)
+{
+	if (av->flags & FI_EVENT) {
+		if (!av->eq) {
+			FI_WARN(av->prov, FI_LOG_AV, "no EQ bound to AV\n");
+			return -FI_ENOEQ;
+		}
+
+		if (flags & FI_SYNC_ERR) {
+			FI_WARN(av->prov, FI_LOG_AV, "invalid flag\n");
+			return -FI_EBADFLAGS;
+		}
 	}
 
-	if (flags & ~(FI_MORE)) {
+	if (flags & ~(FI_MORE | FI_SYNC_ERR)) {
 		FI_WARN(av->prov, FI_LOG_AV, "unsupported flags\n");
-		return -FI_ENOEQ;
+		return -FI_EBADFLAGS;
+	}
+
+	if ((flags & FI_SYNC_ERR) && !context) {
+		FI_WARN(av->prov, FI_LOG_AV, "null context with FI_SYNC_ERR\n");
+		return -FI_EINVAL;
 	}
 
 	return 0;
 }
 
-/*
- * Must hold AV lock
- */
 int ofi_av_insert_addr(struct util_av *av, const void *addr, fi_addr_t *fi_addr)
 {
 	struct util_av_entry *entry = NULL;
 
+	assert(fastlock_held(&av->lock));
 	HASH_FIND(hh, av->hash, addr, av->addrlen, entry);
 	if (entry) {
 		if (fi_addr)
@@ -306,13 +324,11 @@ int ofi_av_elements_iter(struct util_av *av, ofi_av_apply_func apply, void *arg)
 	return 0;
 }
 
-/*
- * Must hold AV lock
- */
 int ofi_av_remove_addr(struct util_av *av, fi_addr_t fi_addr)
 {
 	struct util_av_entry *av_entry;
 
+	assert(fastlock_held(&av->lock));
 	av_entry = ofi_bufpool_get_ibuf(av->av_entry_pool, fi_addr);
 	if (!av_entry)
 		return -FI_ENOENT;
@@ -411,6 +427,13 @@ int ofi_av_close(struct util_av *av)
 	return 0;
 }
 
+size_t ofi_av_size(struct util_av *av)
+{
+	return av->av_entry_pool->entry_cnt ?
+	       av->av_entry_pool->entry_cnt :
+	       av->av_entry_pool->attr.chunk_cnt;
+}
+
 static int util_verify_av_util_attr(struct util_domain *domain,
 				    const struct util_av_attr *util_attr)
 {
@@ -426,7 +449,7 @@ static int util_av_init(struct util_av *av, const struct fi_av_attr *attr,
 			const struct util_av_attr *util_attr)
 {
 	int ret = 0;
-	size_t max_count;
+	size_t orig_size;
 	size_t offset;
 
 	/* offset calculated on a 8-byte boundary */
@@ -441,8 +464,7 @@ static int util_av_init(struct util_av *av, const struct fi_av_attr *attr,
 		.max_cnt	= 0,
 		/* Don't track buffer usage, because the user can close
 		 * the AV without first deleting its addresses */
-		.flags		= OFI_BUFPOOL_NO_TRACK | OFI_BUFPOOL_INDEXED |
-				  OFI_BUFPOOL_HUGEPAGES,
+		.flags		= OFI_BUFPOOL_NO_TRACK | OFI_BUFPOOL_INDEXED,
 	};
 
 	/* TODO: Handle FI_READ */
@@ -452,16 +474,16 @@ static int util_av_init(struct util_av *av, const struct fi_av_attr *attr,
 	if (ret)
 		return ret;
 
-	max_count = attr->count ? attr->count : ofi_universe_size;
-	av->count = roundup_power_of_two(max_count);
-	FI_INFO(av->prov, FI_LOG_AV, "AV size %zu\n", av->count);
+	orig_size = attr->count ? attr->count : ofi_universe_size;
+	orig_size = roundup_power_of_two(orig_size);
+	FI_INFO(av->prov, FI_LOG_AV, "AV size %zu\n", orig_size);
 
 	av->addrlen = util_attr->addrlen;
 	av->context_offset = offset + av->addrlen;
 	av->flags = util_attr->flags | attr->flags;
 	av->hash = NULL;
 
-	pool_attr.chunk_cnt = av->count;
+	pool_attr.chunk_cnt = orig_size;
 	return ofi_bufpool_create_attr(&pool_attr, &av->av_entry_pool);
 }
 
@@ -575,28 +597,12 @@ fi_addr_t ofi_ip_av_get_fi_addr(struct util_av *av, const void *addr)
 	return ofi_av_lookup_fi_addr(av, addr);
 }
 
-static int ip_av_valid_addr(struct util_av *av, const void *addr)
-{
-	const struct sockaddr_in *sin = addr;
-	const struct sockaddr_in6 *sin6 = addr;
-
-	switch (sin->sin_family) {
-	case AF_INET:
-		return sin->sin_port && sin->sin_addr.s_addr;
-	case AF_INET6:
-		return sin6->sin6_port &&
-		      memcmp(&in6addr_any, &sin6->sin6_addr, sizeof(in6addr_any));
-	default:
-		return 0;
-	}
-}
-
 static int ip_av_insert_addr(struct util_av *av, const void *addr,
 			     fi_addr_t *fi_addr, void *context)
 {
 	int ret;
 
-	if (ip_av_valid_addr(av, addr)) {
+	if (ofi_valid_dest_ipaddr(addr)) {
 		fastlock_acquire(&av->lock);
 		ret = ofi_av_insert_addr(av, addr, fi_addr);
 		fastlock_release(&av->lock);
@@ -616,12 +622,19 @@ static int ip_av_insert_addr(struct util_av *av, const void *addr,
 }
 
 int ofi_ip_av_insertv(struct util_av *av, const void *addr, size_t addrlen,
-		      size_t count, fi_addr_t *fi_addr, void *context)
+		      size_t count, fi_addr_t *fi_addr, uint64_t flags,
+		      void *context)
 {
 	int ret, success_cnt = 0;
+	int *sync_err = NULL;
 	size_t i;
 
 	FI_DBG(av->prov, FI_LOG_AV, "inserting %zu addresses\n", count);
+	if (flags & FI_SYNC_ERR) {
+		sync_err = context;
+		memset(sync_err, 0, sizeof(*sync_err) * count);
+	}
+
 	for (i = 0; i < count; i++) {
 		ret = ip_av_insert_addr(av, (const char *) addr + i * addrlen,
 					fi_addr ? &fi_addr[i] : NULL, context);
@@ -629,6 +642,8 @@ int ofi_ip_av_insertv(struct util_av *av, const void *addr, size_t addrlen,
 			success_cnt++;
 		else if (av->eq)
 			ofi_av_write_event(av, i, -ret, context);
+		else if (sync_err)
+			sync_err[i] = -ret;
 	}
 
 	FI_DBG(av->prov, FI_LOG_AV, "%d addresses successful\n", success_cnt);
@@ -648,12 +663,12 @@ int ofi_ip_av_insert(struct fid_av *av_fid, const void *addr, size_t count,
 	int ret;
 
 	av = container_of(av_fid, struct util_av, av_fid);
-	ret = ofi_verify_av_insert(av, flags);
+	ret = ofi_verify_av_insert(av, flags, context);
 	if (ret)
 		return ret;
 
 	return ofi_ip_av_insertv(av, addr, ofi_sizeofaddr(addr),
-				 count, fi_addr, context);
+				 count, fi_addr, flags, context);
 }
 
 static int ip_av_insertsvc(struct fid_av *av, const char *node,
@@ -844,7 +859,7 @@ static int ip_av_insertsym(struct fid_av *av_fid, const char *node,
 	int ret, count;
 
 	av = container_of(av_fid, struct util_av, av_fid);
-	ret = ofi_verify_av_insert(av, flags);
+	ret = ofi_verify_av_insert(av, flags, context);
 	if (ret)
 		return ret;
 
@@ -854,7 +869,7 @@ static int ip_av_insertsym(struct fid_av *av_fid, const char *node,
 		return count;
 
 	ret = ofi_ip_av_insertv(av, addr, addrlen, count,
-				fi_addr, context);
+				fi_addr, flags, context);
 	free(addr);
 	return ret;
 }
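
With the new `FI_SYNC_ERR` path, `context` doubles as an `int` array receiving one status per address: zeroed up front, and filled with `-ret` for each failed insertion. A sketch of how a caller might use it (assumes an AV opened without `FI_EVENT`; `insert_with_sync_err` and `EX_MAX_ADDRS` are illustrative names):

```
#include <stdio.h>
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>

#define EX_MAX_ADDRS 64

/* Insert `count` pre-resolved addresses and collect one errno per
 * address through FI_SYNC_ERR, matching ofi_ip_av_insertv() above. */
static int insert_with_sync_err(struct fid_av *av, const void *addrs,
				size_t count, fi_addr_t *fi_addrs)
{
	int sync_err[EX_MAX_ADDRS]; /* one int per address, see memset above */
	int ret;
	size_t i;

	if (count > EX_MAX_ADDRS)
		return -FI_EINVAL;

	/* With FI_SYNC_ERR, context must point at the error array. */
	ret = fi_av_insert(av, addrs, count, fi_addrs, FI_SYNC_ERR, sync_err);
	if (ret < 0)
		return ret;

	for (i = 0; i < count; i++) {
		if (sync_err[i])
			fprintf(stderr, "addr %zu failed: %s\n", i,
				fi_strerror(sync_err[i]));
	}
	return ret; /* number of successful insertions */
}
```
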
diff --git a/deps/libfabric/prov/util/src/util_buf.c b/deps/libfabric/prov/util/src/util_buf.c
index 74f10a987b93d89d1fce2dea4fa2f9ae88af49c9..783426562e19195585e58059c8e6308ec2e08c04 100644
--- a/deps/libfabric/prov/util/src/util_buf.c
+++ b/deps/libfabric/prov/util/src/util_buf.c
@@ -114,26 +114,18 @@ retry:
 		buf_hdr = ofi_buf_hdr(buf);
 		buf_hdr->region = buf_region;
 		buf_hdr->index = pool->entry_cnt + i;
-		if (pool->attr.init_fn) {
-#if ENABLE_DEBUG
-			if (pool->attr.flags & OFI_BUFPOOL_INDEXED) {
-				buf_hdr->entry.dlist.next = (void *) OFI_MAGIC_64;
-				buf_hdr->entry.dlist.prev = (void *) OFI_MAGIC_64;
-
-				pool->attr.init_fn(buf_region, buf);
-
-				assert((buf_hdr->entry.dlist.next == (void *) OFI_MAGIC_64) &&
-				       (buf_hdr->entry.dlist.prev == (void *) OFI_MAGIC_64));
-			} else {
-				buf_hdr->entry.slist.next = (void *) OFI_MAGIC_64;
+		OFI_DBG_SET(buf_hdr->magic, OFI_MAGIC_SIZE_T);
+		OFI_DBG_SET(buf_hdr->ftr,
+			    (struct ofi_bufpool_ftr *) ((char *) buf +
+			    pool->entry_size - sizeof(struct ofi_bufpool_ftr)));
+		OFI_DBG_SET(buf_hdr->ftr->magic, OFI_MAGIC_SIZE_T);
 
-				pool->attr.init_fn(buf_region, buf);
-
-				assert(buf_hdr->entry.slist.next == (void *) OFI_MAGIC_64);
-			}
-#else
+		if (pool->attr.init_fn) {
+			OFI_DBG_SET(buf_hdr->entry.dlist.next, OFI_MAGIC_PTR);
+			OFI_DBG_SET(buf_hdr->entry.dlist.prev, OFI_MAGIC_PTR);
 			pool->attr.init_fn(buf_region, buf);
-#endif
+			assert((buf_hdr->entry.dlist.next == OFI_MAGIC_PTR) &&
+			       (buf_hdr->entry.dlist.prev == OFI_MAGIC_PTR));
 		}
 		if (pool->attr.flags & OFI_BUFPOOL_INDEXED) {
 			dlist_insert_tail(&buf_hdr->entry.dlist,
@@ -177,7 +169,10 @@ int ofi_bufpool_create_attr(struct ofi_bufpool_attr *attr,
 	pool->attr = *attr;
 
 	entry_sz = (attr->size + sizeof(struct ofi_bufpool_hdr));
-	pool->entry_size = ofi_get_aligned_size(entry_sz, attr->alignment);
+	OFI_DBG_ADD(entry_sz, sizeof(struct ofi_bufpool_ftr));
+	if (!attr->alignment)
+		pool->attr.alignment = entry_sz;
+	pool->entry_size = ofi_get_aligned_size(entry_sz, pool->attr.alignment);
 
 	if (!attr->chunk_cnt) {
 		pool->attr.chunk_cnt =
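
The sizing change above appends a guard footer in debug builds and, when no alignment is requested, aligns each entry to its own size so entries tile a chunk without inter-entry padding. A self-contained sketch of that arithmetic, assuming `ofi_get_aligned_size()` rounds up to a multiple (the sizes below are illustrative):

```
#include <stddef.h>
#include <stdio.h>

/* Assumed behavior of ofi_get_aligned_size(): round sz up to a
 * multiple of align. */
static size_t aligned_size(size_t sz, size_t align)
{
	return ((sz + align - 1) / align) * align;
}

int main(void)
{
	size_t user_size = 56, hdr = 40, ftr = 8; /* illustrative sizes */
	size_t requested_align = 0;               /* attr->alignment == 0 */
	size_t entry_sz = user_size + hdr;

#ifndef NDEBUG
	entry_sz += ftr; /* debug builds add the footer with its magic */
#endif
	/* No requested alignment: align each entry to its own size. */
	size_t align = requested_align ? requested_align : entry_sz;

	printf("entry_size = %zu\n", aligned_size(entry_sz, align));
	return 0;
}
```
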
diff --git a/deps/libfabric/prov/util/src/util_coll.c b/deps/libfabric/prov/util/src/util_coll.c
index 2322f319b8cfdf8d81bacf1072d8272f15e1e1cf..d6ac7e6063a778290af671c988ad9aa0e40c9b5c 100644
--- a/deps/libfabric/prov/util/src/util_coll.c
+++ b/deps/libfabric/prov/util/src/util_coll.c
@@ -1095,6 +1095,7 @@ static int util_av_set_close(struct fid *fid)
 	if (ofi_atomic_get32(&av_set->ref) > 0)
 		return -FI_EBUSY;
 
+	free(av_set->fi_addr_array);
 	free(av_set);
 
 	return FI_SUCCESS;
@@ -1145,8 +1146,8 @@ static int util_coll_av_init(struct util_av *av)
 	if (ret)
 		goto err3;
 
-	coll_mc->av_set->fi_addr_array =
-		calloc(av->count, sizeof(*coll_mc->av_set->fi_addr_array));
+	coll_mc->av_set->fi_addr_array = calloc(ofi_av_size(av),
+					 sizeof(*coll_mc->av_set->fi_addr_array));
 	if (!coll_mc->av_set->fi_addr_array) {
 		ret = -FI_ENOMEM;
 		goto err2;
@@ -1178,7 +1179,9 @@ int ofi_av_set(struct fid_av *av, struct fi_av_set_attr *attr,
 {
 	struct util_av *util_av = container_of(av, struct util_av, av_fid);
 	struct util_av_set *av_set;
-	int ret, iter;
+	size_t max_size;
+	uint64_t i;
+	int ret;
 
 	if (!util_av->coll_mc) {
 		ret = util_coll_av_init(util_av);
@@ -1186,7 +1189,7 @@ int ofi_av_set(struct fid_av *av, struct fi_av_set_attr *attr,
 			return ret;
 	}
 
-	av_set = calloc(1,sizeof(*av_set));
+	av_set = calloc(1, sizeof(*av_set));
 	if (!av_set)
 		return -FI_ENOMEM;
 
@@ -1194,15 +1197,17 @@ int ofi_av_set(struct fid_av *av, struct fi_av_set_attr *attr,
 	if (ret)
 		goto err1;
 
-	av_set->fi_addr_array = calloc(util_av->count, sizeof(*av_set->fi_addr_array));
+	max_size = attr->count ? attr->count : ofi_av_size(util_av);
+	av_set->fi_addr_array = calloc(max_size,
+				       sizeof(*av_set->fi_addr_array));
 	if (!av_set->fi_addr_array)
 		goto err2;
 
-	for (iter = 0; iter < attr->count; iter++) {
-		av_set->fi_addr_array[iter] =
-			util_av->coll_mc->av_set->fi_addr_array[iter * attr->stride];
-		av_set->fi_addr_count++;
+	for (i = attr->start_addr; i <= attr->end_addr; i += attr->stride) {
+		av_set->fi_addr_array[av_set->fi_addr_count++] =
+			util_av->coll_mc->av_set->fi_addr_array[i];
 	}
+	assert(av_set->fi_addr_count <= max_size);
 
 	util_coll_mc_init(&av_set->coll_mc, av_set, NULL, context);
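
The rewritten loop selects members by walking the parent AV's address table from `start_addr` to `end_addr` inclusive in steps of `stride`, rather than taking the first `count` strided entries. A standalone sketch with illustrative values:

```
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t start_addr = 2, end_addr = 10, stride = 2;
	uint64_t parent[16]; /* stands in for the parent fi_addr_array */
	size_t count = 0;
	uint64_t i;

	for (i = 0; i < 16; i++)
		parent[i] = 100 + i; /* fake fi_addr_t values */

	/* Same selection rule as the loop above: inclusive range, fixed
	 * stride; yields (end - start) / stride + 1 members here. */
	for (i = start_addr; i <= end_addr; i += stride)
		printf("member %zu -> fi_addr %llu\n", count++,
		       (unsigned long long) parent[i]);

	return 0;
}
```
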
 
diff --git a/deps/libfabric/prov/util/src/util_cq.c b/deps/libfabric/prov/util/src/util_cq.c
index fe230f525028f1e0267e11ba9582507f0600473e..9ee3024ebc01d04200e7e3056f773faf2a5e2535 100644
--- a/deps/libfabric/prov/util/src/util_cq.c
+++ b/deps/libfabric/prov/util/src/util_cq.c
@@ -38,58 +38,69 @@
 
 #define UTIL_DEF_CQ_SIZE (1024)
 
-/* Caller must hold `cq_lock` */
-int ofi_cq_write_overflow(struct util_cq *cq, void *context, uint64_t flags, size_t len,
-			  void *buf, uint64_t data, uint64_t tag, fi_addr_t src)
+
+/* While the CQ is full, we continue to add new entries to the auxiliary
+ * queue.
+ */
+static void ofi_cq_insert_aux(struct util_cq *cq,
+			      struct util_cq_aux_entry *entry)
+{
+	if (!ofi_cirque_isfull(cq->cirq))
+		ofi_cirque_commit(cq->cirq);
+
+	entry->cq_slot = ofi_cirque_tail(cq->cirq);
+	entry->cq_slot->flags = UTIL_FLAG_AUX;
+	slist_insert_tail(&entry->list_entry, &cq->aux_queue);
+}
+
+int ofi_cq_write_overflow(struct util_cq *cq, void *context, uint64_t flags,
+			  size_t len, void *buf, uint64_t data, uint64_t tag,
+			  fi_addr_t src)
 {
-	struct util_cq_oflow_err_entry *entry;
+	struct util_cq_aux_entry *entry;
 
-	assert(ofi_cirque_isfull(cq->cirq));
+	assert(fastlock_held(&cq->cq_lock));
+	FI_DBG(cq->domain->prov, FI_LOG_CQ, "writing to CQ overflow list\n");
+	assert(ofi_cirque_freecnt(cq->cirq) <= 1);
 
 	if (!(entry = calloc(1, sizeof(*entry))))
 		return -FI_ENOMEM;
 
-	entry->parent_comp = ofi_cirque_tail(cq->cirq);
-	entry->parent_comp->flags |= UTIL_FLAG_OVERFLOW;
-
 	entry->comp.op_context = context;
 	entry->comp.flags = flags;
 	entry->comp.len = len;
 	entry->comp.buf = buf;
 	entry->comp.data = data;
 	entry->comp.tag = tag;
-
+	entry->comp.err = 0;
 	entry->src = src;
-	slist_insert_tail(&entry->list_entry, &cq->oflow_err_list);
 
+	ofi_cq_insert_aux(cq, entry);
 	return 0;
 }
 
-int ofi_cq_write_error(struct util_cq *cq,
-		       const struct fi_cq_err_entry *err_entry)
+int ofi_cq_insert_error(struct util_cq *cq,
+			const struct fi_cq_err_entry *err_entry)
 {
-	struct util_cq_oflow_err_entry *entry;
-	struct fi_cq_tagged_entry *comp;
+	struct util_cq_aux_entry *entry;
 
+	assert(fastlock_held(&cq->cq_lock));
 	assert(err_entry->err);
-
 	if (!(entry = calloc(1, sizeof(*entry))))
 		return -FI_ENOMEM;
 
 	entry->comp = *err_entry;
-	cq->cq_fastlock_acquire(&cq->cq_lock);
-	slist_insert_tail(&entry->list_entry, &cq->oflow_err_list);
+	ofi_cq_insert_aux(cq, entry);
+	return 0;
+}
 
-	if (OFI_UNLIKELY(ofi_cirque_isfull(cq->cirq))) {
-		comp = ofi_cirque_tail(cq->cirq);
-		comp->flags |= (UTIL_FLAG_ERROR | UTIL_FLAG_OVERFLOW);
-		entry->parent_comp = ofi_cirque_tail(cq->cirq);
-	} else {
-		comp = ofi_cirque_tail(cq->cirq);
-		comp->flags = UTIL_FLAG_ERROR;
-		ofi_cirque_commit(cq->cirq);
-	}
+int ofi_cq_write_error(struct util_cq *cq,
+		       const struct fi_cq_err_entry *err_entry)
+{
+	cq->cq_fastlock_acquire(&cq->cq_lock);
+	ofi_cq_insert_error(cq, err_entry);
 	cq->cq_fastlock_release(&cq->cq_lock);
+
 	if (cq->wait)
 		cq->wait->signal(cq->wait);
 	return 0;
@@ -203,40 +214,12 @@ static void util_cq_read_tagged(void **dst, void *src)
 	*(char **)dst += sizeof(struct fi_cq_tagged_entry);
 }
 
-static inline
-void util_cq_read_oflow_entry(struct util_cq *cq,
-			      struct util_cq_oflow_err_entry *oflow_entry,
-			      struct fi_cq_tagged_entry *cirq_entry,
-			      void **buf, fi_addr_t *src_addr, ssize_t i)
-{
-	if (src_addr && cq->src) {
-		src_addr[i] = cq->src[ofi_cirque_rindex(cq->cirq)];
-		cq->src[ofi_cirque_rindex(cq->cirq)] = oflow_entry->src;
-	}
-	cq->read_entry(buf, cirq_entry);
-	cirq_entry->op_context = oflow_entry->comp.op_context;
-	cirq_entry->flags = oflow_entry->comp.flags;
-	cirq_entry->len = oflow_entry->comp.len;
-	cirq_entry->buf = oflow_entry->comp.buf;
-	cirq_entry->data = oflow_entry->comp.data;
-	cirq_entry->tag = oflow_entry->comp.tag;
-}
-
-static inline
-void util_cq_read_entry(struct util_cq *cq, struct fi_cq_tagged_entry *entry,
-			void **buf, fi_addr_t *src_addr, ssize_t i)
-{
-	if (src_addr && cq->src)
-		src_addr[i] = cq->src[ofi_cirque_rindex(cq->cirq)];
-	cq->read_entry(buf, entry);
-	ofi_cirque_discard(cq->cirq);
-}
-
 ssize_t ofi_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t count,
 			fi_addr_t *src_addr)
 {
-	struct util_cq *cq;
 	struct fi_cq_tagged_entry *entry;
+	struct util_cq_aux_entry *aux_entry;
+	struct util_cq *cq;
 	ssize_t i;
 
 	cq = container_of(cq_fid, struct util_cq, cq_fid);
@@ -255,54 +238,40 @@ ssize_t ofi_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t count,
 	if (count > ofi_cirque_usedcnt(cq->cirq))
 		count = ofi_cirque_usedcnt(cq->cirq);
 
-	for (i = 0; i < (ssize_t)count; i++) {
+	for (i = 0; i < (ssize_t) count; i++) {
 		entry = ofi_cirque_head(cq->cirq);
-		if (OFI_UNLIKELY(entry->flags & (UTIL_FLAG_ERROR |
-						 UTIL_FLAG_OVERFLOW))) {
-			if (entry->flags & UTIL_FLAG_ERROR) {
-				struct util_cq_oflow_err_entry *oflow_err_entry =
-						container_of(cq->oflow_err_list.head,
-							     struct util_cq_oflow_err_entry,
-							     list_entry);
-				if (oflow_err_entry->comp.err) {
-					/* This handles case when the head of oflow_err_list is
-					 * an error entry.
-					 *
-					 * NOTE: if this isn't an error entry, we have to handle
-					 * overflow entries and then the error entries to ensure
-					 * ordering. */
-					if (!i)
-						i = -FI_EAVAIL;
-					break;
-				}
+		if (!(entry->flags & UTIL_FLAG_AUX)) {
+			if (src_addr && cq->src)
+				src_addr[i] = cq->src[ofi_cirque_rindex(cq->cirq)];
+			cq->read_entry(&buf, entry);
+			ofi_cirque_discard(cq->cirq);
+		} else {
+			assert(!slist_empty(&cq->aux_queue));
+			aux_entry = container_of(cq->aux_queue.head,
+						 struct util_cq_aux_entry,
+						 list_entry);
+			assert(aux_entry->cq_slot == entry);
+			if (aux_entry->comp.err) {
+				if (!i)
+					i = -FI_EAVAIL;
+				break;
 			}
-			if (entry->flags & UTIL_FLAG_OVERFLOW) {
-				assert(!slist_empty(&cq->oflow_err_list));
-				struct util_cq_oflow_err_entry *oflow_entry =
-					container_of(cq->oflow_err_list.head,
-						     struct util_cq_oflow_err_entry,
-						     list_entry);
-				if (oflow_entry->parent_comp != entry) {
-					/* Handle case when all overflow/error CQ entries were read
-					 * for particular CIRQ entry */
-					entry->flags &= ~(UTIL_FLAG_OVERFLOW | UTIL_FLAG_ERROR);
-				} else {
-					uint64_t service_flags =
-						(entry->flags & (UTIL_FLAG_OVERFLOW | UTIL_FLAG_ERROR));
-					slist_remove_head(&cq->oflow_err_list);
-
-					entry->flags &= ~(service_flags);
-					util_cq_read_oflow_entry(cq, oflow_entry, entry,
-								 &buf, src_addr, i);
-					/* To ensure checking of overflow CQ entries once again */
-					if (!slist_empty(&cq->oflow_err_list))
-						entry->flags |= service_flags;
-					free(oflow_entry);
-					continue;
-				}
+
+			if (src_addr && cq->src)
+				src_addr[i] = aux_entry->src;
+			cq->read_entry(&buf, &aux_entry->comp);
+			slist_remove_head(&cq->aux_queue);
+
+			if (slist_empty(&cq->aux_queue)) {
+				ofi_cirque_discard(cq->cirq);
+			} else {
+				aux_entry = container_of(cq->aux_queue.head,
+							struct util_cq_aux_entry,
+							list_entry);
+				if (aux_entry->cq_slot != ofi_cirque_head(cq->cirq))
+					ofi_cirque_discard(cq->cirq);
 			}
 		}
-		util_cq_read_entry(cq, entry, &buf, src_addr, i);
 	}
 out:
 	cq->cq_fastlock_release(&cq->cq_lock);
@@ -317,10 +286,8 @@ ssize_t ofi_cq_read(struct fid_cq *cq_fid, void *buf, size_t count)
 ssize_t ofi_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *buf,
 		       uint64_t flags)
 {
+	struct util_cq_aux_entry *aux_entry;
 	struct util_cq *cq;
-	struct util_cq_oflow_err_entry *err;
-	struct slist_entry *entry;
-	struct fi_cq_tagged_entry *cirq_entry;
 	char *err_buf_save;
 	size_t err_data_size;
 	uint32_t api_version;
@@ -331,47 +298,48 @@ ssize_t ofi_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *buf,
 
 	cq->cq_fastlock_acquire(&cq->cq_lock);
 	if (ofi_cirque_isempty(cq->cirq) ||
-	    !(ofi_cirque_head(cq->cirq)->flags & UTIL_FLAG_ERROR)) {
+	    !(ofi_cirque_head(cq->cirq)->flags & UTIL_FLAG_AUX)) {
 		ret = -FI_EAGAIN;
 		goto unlock;
 	}
 
-	entry = slist_remove_head(&cq->oflow_err_list);
-	err = container_of(entry, struct util_cq_oflow_err_entry, list_entry);
-	if ((FI_VERSION_GE(api_version, FI_VERSION(1, 5))) && buf->err_data_size) {
-		err_data_size = MIN(buf->err_data_size, err->comp.err_data_size);
-		memcpy(buf->err_data, err->comp.err_data, err_data_size);
+	assert(!slist_empty(&cq->aux_queue));
+	aux_entry = container_of(cq->aux_queue.head,
+				 struct util_cq_aux_entry, list_entry);
+	assert(aux_entry->cq_slot == ofi_cirque_head(cq->cirq));
+
+	if (!aux_entry->comp.err) {
+		ret = -FI_EAGAIN;
+		goto unlock;
+	}
+
+	if ((FI_VERSION_GE(api_version, FI_VERSION(1, 5))) &&
+	    buf->err_data_size) {
 		err_buf_save = buf->err_data;
-		*buf = err->comp;
+		err_data_size = MIN(buf->err_data_size,
+				    aux_entry->comp.err_data_size);
+
+		*buf = aux_entry->comp;
+		memcpy(err_buf_save, aux_entry->comp.err_data, err_data_size);
 		buf->err_data = err_buf_save;
 		buf->err_data_size = err_data_size;
 	} else {
-		memcpy(buf, &err->comp, sizeof(struct fi_cq_err_entry_1_0));
+		memcpy(buf, &aux_entry->comp,
+		       sizeof(struct fi_cq_err_entry_1_0));
 	}
 
-	cirq_entry = ofi_cirque_head(cq->cirq);
-	if (!(cirq_entry->flags & UTIL_FLAG_OVERFLOW)) {
+	slist_remove_head(&cq->aux_queue);
+	free(aux_entry);
+	if (slist_empty(&cq->aux_queue)) {
 		ofi_cirque_discard(cq->cirq);
-	} else if (!slist_empty(&cq->oflow_err_list)) {
-		struct util_cq_oflow_err_entry *oflow_entry =
-			container_of(cq->oflow_err_list.head,
-				     struct util_cq_oflow_err_entry,
-				     list_entry);
-		if (oflow_entry->parent_comp != cirq_entry) {
-			/* The normal CQ entry were used to report error due to
-			 * out of space in the circular queue. We have to unset
-			 * UTIL_FLAG_ERROR and UTIL_FLAG_OVERFLOW flags */
-			cirq_entry->flags &= ~(UTIL_FLAG_ERROR | UTIL_FLAG_OVERFLOW);
-		}
-		/* If the next entry in the oflow_err_list use the same entry from CIRQ to
-		 * report error/overflow, don't unset UTIL_FLAG_ERRO and UTIL_FLAG_OVERFLOW
-		 * flags to ensure the next round of handling overflow/error entries */
 	} else {
-		cirq_entry->flags &= ~(UTIL_FLAG_ERROR | UTIL_FLAG_OVERFLOW);
+		aux_entry = container_of(cq->aux_queue.head,
+					 struct util_cq_aux_entry, list_entry);
+		if (aux_entry->cq_slot != ofi_cirque_head(cq->cirq))
+			ofi_cirque_discard(cq->cirq);
 	}
 
 	ret = 1;
-	free(err);
 unlock:
 	cq->cq_fastlock_release(&cq->cq_lock);
 	return ret;
@@ -440,15 +408,15 @@ static struct fi_ops_cq util_cq_ops = {
 
 int ofi_cq_cleanup(struct util_cq *cq)
 {
-	struct util_cq_oflow_err_entry *err;
+	struct util_cq_aux_entry *err;
 	struct slist_entry *entry;
 
 	if (ofi_atomic_get32(&cq->ref))
 		return -FI_EBUSY;
 
-	while (!slist_empty(&cq->oflow_err_list)) {
-		entry = slist_remove_head(&cq->oflow_err_list);
-		err = container_of(entry, struct util_cq_oflow_err_entry, list_entry);
+	while (!slist_empty(&cq->aux_queue)) {
+		entry = slist_remove_head(&cq->aux_queue);
+		err = container_of(entry, struct util_cq_aux_entry, list_entry);
 		free(err);
 	}
 
@@ -527,7 +495,7 @@ static int fi_cq_init(struct fid_domain *domain, struct fi_cq_attr *attr,
 		cq->cq_fastlock_acquire = ofi_fastlock_acquire;
 		cq->cq_fastlock_release = ofi_fastlock_release;
 	}
-	slist_init(&cq->oflow_err_list);
+	slist_init(&cq->aux_queue);
 	cq->read_entry = read_entry;
 
 	cq->cq_fid.fid.fclass = FI_CLASS_CQ;
@@ -645,36 +613,32 @@ int ofi_cq_init(const struct fi_provider *prov, struct fid_domain *domain,
 	if (cq->wait) {
 		ret = fi_poll_add(&cq->wait->pollset->poll_fid,
 				  &cq->cq_fid.fid, 0);
-		if (ret) {
-			ofi_cq_cleanup(cq);
-			return ret;
-		}
+		if (ret)
+			goto cleanup;
 	}
 
 	cq->cirq = util_comp_cirq_create(attr->size == 0 ? UTIL_DEF_CQ_SIZE : attr->size);
 	if (!cq->cirq) {
 		ret = -FI_ENOMEM;
-		goto err1;
+		goto cleanup;
 	}
 
 	if (cq->domain->info_domain_caps & FI_SOURCE) {
 		cq->src = calloc(cq->cirq->size, sizeof *cq->src);
 		if (!cq->src) {
 			ret = -FI_ENOMEM;
-			goto err2;
+			goto cleanup;
 		}
 	}
 	return 0;
 
-err2:
-	util_comp_cirq_free(cq->cirq);
-err1:
-	ofi_cq_cleanup(cq);
+cleanup:
+	(void) ofi_cq_cleanup(cq);
 	return ret;
 }
 
 uint64_t ofi_rx_flags[] = {
-	[ofi_op_msg] = FI_RECV,
+	[ofi_op_msg] = FI_MSG | FI_RECV,
 	[ofi_op_tagged] = FI_RECV | FI_TAGGED,
 	[ofi_op_read_req] = FI_RMA | FI_REMOTE_READ,
 	[ofi_op_read_rsp] = FI_RMA | FI_REMOTE_READ,
@@ -683,7 +647,7 @@ uint64_t ofi_rx_flags[] = {
 	[ofi_op_atomic] = FI_ATOMIC | FI_REMOTE_WRITE,
 	[ofi_op_atomic_fetch] = FI_ATOMIC | FI_REMOTE_READ,
 	[ofi_op_atomic_compare] = FI_ATOMIC | FI_REMOTE_READ,
-	[ofi_op_read_async] = FI_RMA | FI_READ,
+	[ofi_op_read_async] = FI_RMA | FI_REMOTE_READ,
 };
 
 uint64_t ofi_tx_flags[] = {
@@ -696,6 +660,6 @@ uint64_t ofi_tx_flags[] = {
 	[ofi_op_atomic] = FI_ATOMIC | FI_WRITE,
 	[ofi_op_atomic_fetch] = FI_ATOMIC | FI_READ,
 	[ofi_op_atomic_compare] = FI_ATOMIC | FI_READ,
-	[ofi_op_read_async] = FI_RMA | FI_RMA,
+	[ofi_op_read_async] = FI_RMA | FI_READ,
 };
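
The rule both read paths now share: several auxiliary entries may hang off one placeholder slot in the circular queue, so the slot is discarded only once no remaining aux entry points at it. A minimal sketch of that consume rule with stand-in types (not the real `util_cq` structures):

```
#include <stddef.h>
#include <stdbool.h>

/* Stand-in types, not the real util_cq structures. */
struct ex_slot { int flags; };
struct ex_aux  { struct ex_slot *cq_slot; struct ex_aux *next; };

/* Consume the aux entry at the head of the queue; report whether the
 * placeholder slot it used is now fully drained and may be discarded,
 * as ofi_cq_readfrom() and ofi_cq_readerr() decide above. */
static struct ex_aux *consume_aux_head(struct ex_aux *head,
				       bool *discard_slot)
{
	struct ex_aux *next = head->next;

	*discard_slot = (!next || next->cq_slot != head->cq_slot);
	return next;
}
```
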
 
diff --git a/deps/libfabric/prov/util/src/util_domain.c b/deps/libfabric/prov/util/src/util_domain.c
index 2a7d8de11bb790140ad8ccce903ad1c8ccfb607e..334b48c645e93822043fe0b2563ad185fc2b2b4d 100644
--- a/deps/libfabric/prov/util/src/util_domain.c
+++ b/deps/libfabric/prov/util/src/util_domain.c
@@ -37,7 +37,7 @@
 #include <ofi_util.h>
 
 
-int ofi_domain_bind_eq(struct util_domain *domain, struct util_eq *eq)
+static int ofi_domain_bind_eq(struct util_domain *domain, struct util_eq *eq)
 {
 	if (domain->eq) {
 		FI_WARN(domain->prov, FI_LOG_DOMAIN,
@@ -50,11 +50,34 @@ int ofi_domain_bind_eq(struct util_domain *domain, struct util_eq *eq)
 	return 0;
 }
 
+int ofi_domain_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
+{
+	struct util_domain *domain;
+	struct util_eq *eq;
+
+	domain = container_of(fid, struct util_domain, domain_fid.fid);
+	if (flags) {
+		FI_WARN(domain->prov, FI_LOG_DOMAIN,
+			"unsupported bind flags\n");
+		return -FI_EBADFLAGS;
+	}
+
+	switch (bfid->fclass) {
+	case FI_CLASS_EQ:
+		eq = container_of(bfid, struct util_eq, eq_fid.fid);
+		return ofi_domain_bind_eq(domain, eq);
+	default:
+		return -FI_EINVAL;
+	}
+}
+
 int ofi_domain_close(struct util_domain *domain)
 {
 	if (ofi_atomic_get32(&domain->ref))
 		return -FI_EBUSY;
 
+	if (domain->eq)
+		ofi_atomic_dec32(&domain->eq->ref);
 	if (domain->mr_map.rbtree)
 		ofi_mr_map_close(&domain->mr_map);
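
With `ofi_domain_bind()` now exposed through the domain's fid ops, an application binds an EQ via the standard `fi_domain_bind()` call; nonzero flags fail with `-FI_EBADFLAGS`. A hedged sketch of that caller-side flow (setup of `fabric` and `info`, and the `bind_domain_eq` name, are assumptions):

```
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_eq.h>

static int bind_domain_eq(struct fid_fabric *fabric, struct fi_info *info)
{
	struct fi_eq_attr eq_attr = { .wait_obj = FI_WAIT_UNSPEC };
	struct fid_domain *domain;
	struct fid_eq *eq;
	int ret;

	ret = fi_domain(fabric, info, &domain, NULL);
	if (ret)
		return ret;

	ret = fi_eq_open(fabric, &eq_attr, &eq, NULL);
	if (ret)
		goto close_domain;

	/* flags must be 0; anything else fails with -FI_EBADFLAGS */
	ret = fi_domain_bind(domain, &eq->fid, 0);
	if (!ret)
		return 0; /* domain and eq remain open for use */

	fi_close(&eq->fid);
close_domain:
	fi_close(&domain->fid);
	return ret;
}
```
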
 
diff --git a/deps/libfabric/prov/util/src/util_eq.c b/deps/libfabric/prov/util/src/util_eq.c
index 915084642b67e99bdf027b649ceec0ce8f41afb5..44dd0d8e2b83bc8f586c990254a26572958a1afa 100644
--- a/deps/libfabric/prov/util/src/util_eq.c
+++ b/deps/libfabric/prov/util/src/util_eq.c
@@ -68,6 +68,11 @@ void ofi_eq_handle_err_entry(uint32_t api_version, uint64_t flags,
 	}
 }
 
+/*
+ * fi_eq_read and fi_eq_readerr share this common code path.
+ * If flags contains UTIL_FLAG_ERROR, then we are processing
+ * fi_eq_readerr.
+ */
 ssize_t ofi_eq_read(struct fid_eq *eq_fid, uint32_t *event,
 		    void *buf, size_t len, uint64_t flags)
 {
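
For reference, the caller-side pattern this shared path serves: `fi_eq_read()` returning `-FI_EAVAIL` signals a pending error entry, which must be drained with `fi_eq_readerr()`. A sketch (an open `eq` is assumed; `drain_one_eq_event` is an illustrative name):

```
#include <stdint.h>
#include <stdio.h>
#include <rdma/fabric.h>
#include <rdma/fi_eq.h>

static void drain_one_eq_event(struct fid_eq *eq)
{
	struct fi_eq_cm_entry entry;
	struct fi_eq_err_entry err = {0};
	uint32_t event;
	ssize_t rd;

	rd = fi_eq_read(eq, &event, &entry, sizeof(entry), 0);
	if (rd == -FI_EAVAIL) {
		/* An error entry is pending; read it out. */
		rd = fi_eq_readerr(eq, &err, 0);
		if (rd > 0)
			fprintf(stderr, "eq error: %s\n",
				fi_eq_strerror(eq, err.prov_errno,
					       err.err_data, NULL, 0));
	}
}
```
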
diff --git a/deps/libfabric/prov/util/src/util_mem_hooks.c b/deps/libfabric/prov/util/src/util_mem_hooks.c
index 7a61948ad33cae09325723421f1fc7aeddc4816c..a3ec659ca617bf0c30567dc91af288cd5ddc9159 100644
--- a/deps/libfabric/prov/util/src/util_mem_hooks.c
+++ b/deps/libfabric/prov/util/src/util_mem_hooks.c
@@ -276,6 +276,25 @@ static int ofi_patch_function(struct ofi_intercept *intercept)
 
 	return ofi_apply_patch(intercept);
 }
+
+/*
+ * Check whether the specified intercept location already contains the
+ * pattern of bytes that ofi_patch_function() uses to patch in our own
+ * monitoring function. These bytes (roughly) represent opcodes to load
+ * a supplied address into a register and execute a jump to that location.
+ * If they do already exist, then we've almost certainly already patched.
+ * Note that we are explicitly ignoring the target address in these checks,
+ * as it is a transient value and not invariant like the other bytes.
+ */
+static bool ofi_is_function_patched(struct ofi_intercept *intercept)
+{
+	return (
+		(*(unsigned short*)((uintptr_t)intercept->orig_func + 0) == 0xbb49) &&
+		(*(unsigned char* )((uintptr_t)intercept->orig_func +10) == 0x41  ) &&
+		(*(unsigned char* )((uintptr_t)intercept->orig_func +11) == 0xff  ) &&
+		(*(unsigned char* )((uintptr_t)intercept->orig_func +12) == 0xe3  )
+	);
+}
 #elif defined(__aarch64__)
 /**
  * @brief Generate a mov immediate instruction
@@ -333,6 +352,27 @@ static int ofi_patch_function(struct ofi_intercept *intercept)
 
 	return ofi_apply_patch(intercept);
 }
+
+/*
+ * Please see comments at other ofi_is_function_patched() function
+ */
+static bool ofi_is_function_patched(struct ofi_intercept *intercept)
+{
+	uint32_t mov_mask = ~((0xFFFF << 5) | 0x1F);
+	uint32_t br_mask = ~(0x1F << 5);
+	uintptr_t addr = (uintptr_t) intercept->orig_func;
+	/*
+	 * Register 15 is used in our patching code, but for checking here let's
+	 * ignore the register value and instead focus on the surrounding bytes.
+	 */
+	return (
+		((*(uint32_t *) (addr +  0)) & mov_mask) == mov(0, 3, 0) &&
+		((*(uint32_t *) (addr +  4)) & mov_mask) == movk(0, 2, 0) &&
+		((*(uint32_t *) (addr +  8)) & mov_mask) == movk(0, 1, 0) &&
+		((*(uint32_t *) (addr + 12)) & mov_mask) == movk(0, 0, 0) &&
+		((*(uint32_t *) (addr + 16)) & br_mask) == br(0)
+	);
+}
 #endif
 
 /*
@@ -362,7 +402,14 @@ static int ofi_intercept_symbol(struct ofi_intercept *intercept)
 
 	intercept->orig_func = func_addr;
 
-	ret = ofi_patch_function(intercept);
+	if (ofi_is_function_patched(intercept)) {
+		FI_DBG(&core_prov, FI_LOG_MR,
+				"function %s is already patched; stopping further patching\n",
+				intercept->symbol);
+		ret = -FI_EALREADY;
+	} else {
+		ret = ofi_patch_function(intercept);
+	}
 
 	if (!ret)
 		dlist_insert_tail(&intercept->entry, &memhooks.intercept_list);
@@ -553,56 +600,25 @@ static int ofi_memhooks_start(struct ofi_mem_monitor *monitor)
 	for (i = 0; i < OFI_INTERCEPT_MAX; ++i)
 		dlist_init(&intercepts[i].dl_intercept_list);
 
-	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_MMAP]);
-	if (ret) {
-		FI_WARN(&core_prov, FI_LOG_MR,
-		       "intercept mmap failed %d %s\n", ret, fi_strerror(ret));
-		return ret;
-	}
-
-	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_MUNMAP]);
-	if (ret) {
-		FI_WARN(&core_prov, FI_LOG_MR,
-		       "intercept munmap failed %d %s\n", ret, fi_strerror(ret));
-		return ret;
-	}
-
-	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_MREMAP]);
-	if (ret) {
-		FI_WARN(&core_prov, FI_LOG_MR,
-		       "intercept mremap failed %d %s\n", ret, fi_strerror(ret));
-		return ret;
-	}
-
-	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_MADVISE]);
-	if (ret) {
-		FI_WARN(&core_prov, FI_LOG_MR,
-		       "intercept madvise failed %d %s\n", ret, fi_strerror(ret));
-		return ret;
-	}
-
-	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_SHMAT]);
-	if (ret) {
-		FI_WARN(&core_prov, FI_LOG_MR,
-		       "intercept shmat failed %d %s\n", ret, fi_strerror(ret));
-		return ret;
+	for (i = 0; i < OFI_INTERCEPT_MAX; ++i) {
+		ret = ofi_intercept_symbol(&intercepts[i]);
+		if (ret != 0) {
+			FI_DBG(&core_prov, FI_LOG_MR,
+				"intercept %s failed %d %s\n", intercepts[i].symbol,
+					ret, fi_strerror(ret));
+			goto err_intercept_failed;
+		}
 	}
 
-	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_SHMDT]);
-	if (ret) {
-		FI_WARN(&core_prov, FI_LOG_MR,
-		       "intercept shmdt failed %d %s\n", ret, fi_strerror(ret));
-		return ret;
-	}
+	return 0;
 
-	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_BRK]);
-	if (ret) {
-		FI_WARN(&core_prov, FI_LOG_MR,
-		       "intercept brk failed %d %s\n", ret, fi_strerror(ret));
-		return ret;
-	}
+err_intercept_failed:
+	while (--i >= 0)
+		ofi_remove_patch(&intercepts[i]);
+	memhooks_monitor->subscribe = NULL;
+	memhooks_monitor->unsubscribe = NULL;
 
-	return 0;
+	return ret;
 }
 
 static void ofi_memhooks_stop(struct ofi_mem_monitor *monitor)
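
On x86-64 the patch these checks look for is, per the byte tests above, `mov r11, imm64` (`49 BB` plus 8 address bytes) followed by `jmp r11` (`41 FF E3`); the detector skips the 8 transient address bytes. A standalone sketch of that 13-byte pattern (assumes a 64-bit little-endian target, and uses `memcpy` instead of the unaligned loads in the real check):

```
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void build_patch(uint8_t buf[13], uint64_t target)
{
	buf[0] = 0x49; buf[1] = 0xbb;                   /* mov r11, imm64 */
	memcpy(&buf[2], &target, 8);                    /* transient address */
	buf[10] = 0x41; buf[11] = 0xff; buf[12] = 0xe3; /* jmp r11 */
}

static bool looks_patched(const uint8_t *p)
{
	uint16_t head;

	memcpy(&head, p, 2); /* 0xbb49 little-endian, as checked above */
	return head == 0xbb49 &&
	       p[10] == 0x41 && p[11] == 0xff && p[12] == 0xe3;
}

int main(void)
{
	uint8_t buf[13];

	build_patch(buf, 0x00007f0012345678ULL);
	printf("patched: %d\n", looks_patched(buf)); /* prints 1 */
	return 0;
}
```
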
diff --git a/deps/libfabric/prov/util/src/util_mem_monitor.c b/deps/libfabric/prov/util/src/util_mem_monitor.c
index 8acafe5722cc168a5e8cda9551587cd95cb7699d..c6928ecfc6142ea880881a6a12263394b2d9a639 100644
--- a/deps/libfabric/prov/util/src/util_mem_monitor.c
+++ b/deps/libfabric/prov/util/src/util_mem_monitor.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2017 Cray Inc. All rights reserved.
- * Copyright (c) 2017-2019 Intel Inc. All rights reserved.
- * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates.
+ * Copyright (c) 2017-2021 Intel Inc. All rights reserved.
+ * Copyright (c) 2019-2021 Amazon.com, Inc. or its affiliates.
  *                         All rights reserved.
  * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
@@ -34,10 +34,16 @@
  * SOFTWARE.
  */
 
-#include <ofi_mr.h>
 #include <unistd.h>
 
+#include <ofi_mr.h>
+#include <ofi_hmem.h>
+#include <ofi_enosys.h>
+#include <rdma/fi_ext.h>
+
+
 pthread_mutex_t mm_lock = PTHREAD_MUTEX_INITIALIZER;
+pthread_mutex_t mm_state_lock = PTHREAD_MUTEX_INITIALIZER;
 pthread_rwlock_t mm_list_rwlock = PTHREAD_RWLOCK_INITIALIZER;
 
 static int ofi_uffd_start(struct ofi_mem_monitor *monitor);
@@ -55,6 +61,7 @@ struct ofi_mem_monitor *uffd_monitor = &uffd.monitor;
 struct ofi_mem_monitor *default_monitor;
 struct ofi_mem_monitor *default_cuda_monitor;
 struct ofi_mem_monitor *default_rocr_monitor;
+struct ofi_mem_monitor *default_ze_monitor;
 
 static size_t ofi_default_cache_size(void)
 {
@@ -72,15 +79,86 @@ static size_t ofi_default_cache_size(void)
 	return cache_size;
 }
 
+/**
+ * Additional locking is needed to run the start/stop functions
+ * without holding the rwlock, so that start/stop can perform
+ * memory-related work without deadlocking. Callers queue up a
+ * list of monitors, and this function then handles all of their
+ * start/stop transitions. Because the rwlock must be released
+ * before entering this function, thread safety is further
+ * ensured by a state system.
+ *
+ * The state system has 4 expected states, IDLE, STARTING, RUNNING,
+ * and STOPPING.
+ *
+ * We expect states to move without any races from:
+ * IDLE -> STARTING
+ * STARTING -> RUNNING
+ * RUNNING -> STOPPING
+ * STOPPING -> IDLE
+ *
+ * In the case of races, we can also expect:
+ * STOPPING -> RUNNING
+ * STARTING -> RUNNING
+ *
+ * We only execute any behavior in this update function when the
+ * state is either STARTING or STOPPING.
+ *
+ * Discussion on this can be found at #7003 and #7063
+ */
+static int ofi_monitors_update(struct ofi_mem_monitor **monitors)
+{
+	int ret = 0;
+	enum fi_hmem_iface iface;
+	struct ofi_mem_monitor *monitor;
+
+	assert(monitors);
+
+	pthread_mutex_lock(&mm_state_lock);
+	for (iface = 0; iface < OFI_HMEM_MAX; iface++) {
+		monitor = monitors[iface];
+		if (monitor == NULL)
+			continue;
+
+		assert(monitor->state != FI_MM_STATE_UNSPEC);
+		switch (monitor->state) {
+		case FI_MM_STATE_STARTING:
+			ret = monitor->start(monitor);
+			if (ret) {
+				monitor->state = FI_MM_STATE_IDLE;
+				FI_WARN(&core_prov, FI_LOG_MR,
+					"Failed to start %s memory monitor: %s\n",
+					fi_tostr(&iface, FI_TYPE_HMEM_IFACE), fi_strerror(-ret));
+
+				goto out;
+			}
+			monitor->state = FI_MM_STATE_RUNNING;
+			break;
+		case FI_MM_STATE_STOPPING:
+			monitor->stop(monitor);
+			monitor->state = FI_MM_STATE_IDLE;
+			break;
+		default:
+			break;
+		}
+	}
+out:
+	pthread_mutex_unlock(&mm_state_lock);
+	return ret;
+}
+
 
 void ofi_monitor_init(struct ofi_mem_monitor *monitor)
 {
 	dlist_init(&monitor->list);
+	monitor->state = FI_MM_STATE_IDLE;
 }
 
 void ofi_monitor_cleanup(struct ofi_mem_monitor *monitor)
 {
 	assert(dlist_empty(&monitor->list));
+	assert(monitor->state == FI_MM_STATE_IDLE);
 }
 
 /*
@@ -92,14 +170,8 @@ void ofi_monitors_init(void)
 	memhooks_monitor->init(memhooks_monitor);
 	cuda_monitor->init(cuda_monitor);
 	rocr_monitor->init(rocr_monitor);
-
-#if HAVE_MEMHOOKS_MONITOR
-        default_monitor = memhooks_monitor;
-#elif HAVE_UFFD_MONITOR
-        default_monitor = uffd_monitor;
-#else
-        default_monitor = NULL;
-#endif
+	ze_monitor->init(ze_monitor);
+	import_monitor->init(import_monitor);
 
 	fi_param_define(NULL, "mr_cache_max_size", FI_PARAM_SIZE_T,
 			"Defines the total number of bytes for all memory"
@@ -128,6 +200,9 @@ void ofi_monitors_init(void)
 	fi_param_define(NULL, "mr_rocr_cache_monitor_enabled", FI_PARAM_BOOL,
 			"Enable or disable the ROCR cache memory monitor. "
 			"Monitor is enabled by default.");
+	fi_param_define(NULL, "mr_ze_cache_monitor_enabled", FI_PARAM_BOOL,
+			"Enable or disable the ZE cache memory monitor. "
+			"Monitor is enabled by default.");
 
 	fi_param_get_size_t(NULL, "mr_cache_max_size", &cache_params.max_size);
 	fi_param_get_size_t(NULL, "mr_cache_max_count", &cache_params.max_cnt);
@@ -136,10 +211,26 @@ void ofi_monitors_init(void)
 			  &cache_params.cuda_monitor_enabled);
 	fi_param_get_bool(NULL, "mr_rocr_cache_monitor_enabled",
 			  &cache_params.rocr_monitor_enabled);
+	fi_param_get_bool(NULL, "mr_ze_cache_monitor_enabled",
+			  &cache_params.ze_monitor_enabled);
 
 	if (!cache_params.max_size)
 		cache_params.max_size = ofi_default_cache_size();
 
+	/*
+	 * At this time, the import monitor could have set the default monitor,
+	 * do not override
+	 */
+	if (!default_monitor) {
+#if HAVE_MEMHOOKS_MONITOR
+		default_monitor = memhooks_monitor;
+#elif HAVE_UFFD_MONITOR
+		default_monitor = uffd_monitor;
+#else
+		default_monitor = NULL;
+#endif
+	}
+
 	if (cache_params.monitor != NULL) {
 		if (!strcmp(cache_params.monitor, "userfaultfd")) {
 #if HAVE_UFFD_MONITOR
@@ -169,6 +260,11 @@ void ofi_monitors_init(void)
 		default_rocr_monitor = rocr_monitor;
 	else
 		default_rocr_monitor = NULL;
+
+	if (cache_params.ze_monitor_enabled)
+		default_ze_monitor = ze_monitor;
+	else
+		default_ze_monitor = NULL;
 }
 
 void ofi_monitors_cleanup(void)
@@ -177,12 +273,15 @@ void ofi_monitors_cleanup(void)
 	memhooks_monitor->cleanup(memhooks_monitor);
 	cuda_monitor->cleanup(cuda_monitor);
 	rocr_monitor->cleanup(rocr_monitor);
+	ze_monitor->cleanup(ze_monitor);
+	import_monitor->cleanup(import_monitor);
 }
 
 /* Monitors array must be of size OFI_HMEM_MAX. */
 int ofi_monitors_add_cache(struct ofi_mem_monitor **monitors,
 			   struct ofi_mr_cache *cache)
 {
+	struct ofi_mem_monitor *start_list[OFI_HMEM_MAX];
 	int ret = 0;
 	enum fi_hmem_iface iface;
 	struct ofi_mem_monitor *monitor;
@@ -207,6 +306,9 @@ int ofi_monitors_add_cache(struct ofi_mem_monitor **monitors,
 
 	for (iface = FI_HMEM_SYSTEM; iface < OFI_HMEM_MAX; iface++) {
 		cache->monitors[iface] = NULL;
+		start_list[iface] = NULL;
+		if (!ofi_hmem_is_initialized(iface))
+			continue;
 
 		monitor = monitors[iface];
 		if (!monitor) {
@@ -217,11 +319,14 @@ int ofi_monitors_add_cache(struct ofi_mem_monitor **monitors,
 		}
 
 		if (dlist_empty(&monitor->list)) {
-			ret = monitor->start(monitor);
-			if (ret == -FI_ENOSYS)
-				continue;
-			else if (ret)
-				goto err;
+			pthread_mutex_lock(&mm_state_lock);
+			start_list[iface] = monitor;
+			/* See comment above ofi_monitors_update for details */
+			if (monitor->state == FI_MM_STATE_IDLE)
+				monitor->state = FI_MM_STATE_STARTING;
+			else if (monitor->state == FI_MM_STATE_STOPPING)
+				monitor->state = FI_MM_STATE_RUNNING;
+			pthread_mutex_unlock(&mm_state_lock);
 		}
 
 		success_count++;
@@ -230,21 +335,21 @@ int ofi_monitors_add_cache(struct ofi_mem_monitor **monitors,
 				  &monitor->list);
 	}
 	pthread_rwlock_unlock(&mm_list_rwlock);
+
+	ret = ofi_monitors_update(start_list);
+	if (ret)
+		goto err;
+
 	return success_count ? FI_SUCCESS : -FI_ENOSYS;
 
 err:
-	pthread_rwlock_unlock(&mm_list_rwlock);
-
-	FI_WARN(&core_prov, FI_LOG_MR,
-		"Failed to start %s memory monitor: %s\n",
-		fi_tostr(&iface, FI_TYPE_HMEM_IFACE), fi_strerror(-ret));
 	ofi_monitors_del_cache(cache);
-
 	return ret;
 }
 
 void ofi_monitors_del_cache(struct ofi_mr_cache *cache)
 {
+	struct ofi_mem_monitor *stop_list[OFI_HMEM_MAX];
 	struct ofi_mem_monitor *monitor;
 	enum fi_hmem_iface iface;
 	int ret;
@@ -261,19 +366,32 @@ void ofi_monitors_del_cache(struct ofi_mr_cache *cache)
 	} while (ret);
 
 	for (iface = 0; iface < OFI_HMEM_MAX; iface++) {
+		stop_list[iface] = NULL;
 		monitor = cache->monitors[iface];
 		if (!monitor)
 			continue;
 
 		dlist_remove(&cache->notify_entries[iface]);
 
-		if (dlist_empty(&monitor->list))
-			monitor->stop(monitor);
+		if (dlist_empty(&monitor->list)) {
+			pthread_mutex_lock(&mm_state_lock);
+			stop_list[iface] = monitor;
+			/* See comment above ofi_monitors_update for details */
+			if (monitor->state == FI_MM_STATE_RUNNING)
+				monitor->state = FI_MM_STATE_STOPPING;
+			else if (monitor->state == FI_MM_STATE_STARTING)
+				monitor->state = FI_MM_STATE_RUNNING;
+			pthread_mutex_unlock(&mm_state_lock);
+		}
 
 		cache->monitors[iface] = NULL;
 	}
 
 	pthread_rwlock_unlock(&mm_list_rwlock);
+
+	ofi_monitors_update(stop_list);
 }
 
 /* Must be called with locks in place like following
@@ -347,7 +465,15 @@ void ofi_monitor_unsubscribe(struct ofi_mem_monitor *monitor,
 #include <sys/ioctl.h>
 #include <linux/userfaultfd.h>
 
-
+/* The userfault fd monitor requires that any event that could
+ * trigger it be handled outside of the monitor functions
+ * themselves. When a fault occurs on a monitored region, the
+ * faulting thread is put to sleep until the event is read
+ * via the userfault file descriptor. If this fault occurs
+ * within the userfault handling thread, no thread will
+ * in a hang.
+ */
 static void *ofi_uffd_handler(void *arg)
 {
 	struct uffd_msg msg;
@@ -544,3 +670,151 @@ static void ofi_uffd_stop(struct ofi_mem_monitor *monitor)
 }
 
 #endif /* HAVE_UFFD_MONITOR */
+
+
+static void ofi_import_monitor_init(struct ofi_mem_monitor *monitor);
+static void ofi_import_monitor_cleanup(struct ofi_mem_monitor *monitor);
+static int ofi_import_monitor_start(struct ofi_mem_monitor *monitor);
+static void ofi_import_monitor_stop(struct ofi_mem_monitor *monitor);
+static int ofi_import_monitor_subscribe(struct ofi_mem_monitor *notifier,
+					const void *addr, size_t len,
+					union ofi_mr_hmem_info *hmem_info);
+static void ofi_import_monitor_unsubscribe(struct ofi_mem_monitor *notifier,
+					   const void *addr, size_t len,
+					   union ofi_mr_hmem_info *hmem_info);
+static bool ofi_import_monitor_valid(struct ofi_mem_monitor *notifier,
+				     const void *addr, size_t len,
+				     union ofi_mr_hmem_info *hmem_info);
+
+struct ofi_import_monitor {
+	struct ofi_mem_monitor monitor;
+	struct fid_mem_monitor *impfid;
+};
+
+static struct ofi_import_monitor impmon = {
+	.monitor.iface = FI_HMEM_SYSTEM,
+	.monitor.init = ofi_import_monitor_init,
+	.monitor.cleanup = ofi_import_monitor_cleanup,
+	.monitor.start = ofi_import_monitor_start,
+	.monitor.stop = ofi_import_monitor_stop,
+	.monitor.subscribe = ofi_import_monitor_subscribe,
+	.monitor.unsubscribe = ofi_import_monitor_unsubscribe,
+	.monitor.valid = ofi_import_monitor_valid,
+};
+
+struct ofi_mem_monitor *import_monitor = &impmon.monitor;
+
+static void ofi_import_monitor_init(struct ofi_mem_monitor *monitor)
+{
+	ofi_monitor_init(monitor);
+}
+
+static void ofi_import_monitor_cleanup(struct ofi_mem_monitor *monitor)
+{
+	assert(!impmon.impfid);
+	ofi_monitor_cleanup(monitor);
+}
+
+static int ofi_import_monitor_start(struct ofi_mem_monitor *monitor)
+{
+	if (!impmon.impfid)
+		return -FI_ENOSYS;
+
+	return impmon.impfid->export_ops->start(impmon.impfid);
+}
+
+static void ofi_import_monitor_stop(struct ofi_mem_monitor *monitor)
+{
+	assert(impmon.impfid);
+	impmon.impfid->export_ops->stop(impmon.impfid);
+}
+
+static int ofi_import_monitor_subscribe(struct ofi_mem_monitor *notifier,
+					const void *addr, size_t len,
+					union ofi_mr_hmem_info *hmem_info)
+{
+	assert(impmon.impfid);
+	return impmon.impfid->export_ops->subscribe(impmon.impfid, addr, len);
+}
+
+static void ofi_import_monitor_unsubscribe(struct ofi_mem_monitor *notifier,
+					   const void *addr, size_t len,
+					   union ofi_mr_hmem_info *hmem_info)
+{
+	assert(impmon.impfid);
+	return impmon.impfid->export_ops->unsubscribe(impmon.impfid, addr, len);
+}
+
+static bool ofi_import_monitor_valid(struct ofi_mem_monitor *notifier,
+				     const void *addr, size_t len,
+				     union ofi_mr_hmem_info *hmem_info)
+{
+	assert(impmon.impfid);
+	return impmon.impfid->export_ops->valid(impmon.impfid, addr, len);
+}
+
+static void ofi_import_monitor_notify(struct fid_mem_monitor *monitor,
+				      const void *addr, size_t len)
+{
+	assert(monitor->fid.context == &impmon);
+	pthread_rwlock_rdlock(&mm_list_rwlock);
+	pthread_mutex_lock(&mm_lock);
+	ofi_monitor_notify(&impmon.monitor, addr, len);
+	pthread_mutex_unlock(&mm_lock);
+	pthread_rwlock_unlock(&mm_list_rwlock);
+}
+
+static int ofi_close_import(struct fid *fid)
+{
+	impmon.impfid = NULL;
+	return 0;
+}
+
+static struct fi_ops_mem_notify import_ops = {
+	.size = sizeof(struct fi_ops_mem_notify),
+	.notify = ofi_import_monitor_notify,
+};
+
+static struct fi_ops impfid_ops = {
+	.size = sizeof(struct fi_ops),
+	.close = ofi_close_import,
+	.bind = fi_no_bind,
+	.control = fi_no_control,
+	.ops_open = fi_no_ops_open,
+	.tostr = fi_no_tostr,
+	.ops_set = fi_no_ops_set,
+};
+
+int ofi_monitor_import(struct fid *fid)
+{
+	struct fid_mem_monitor *impfid;
+
+	if (fid->fclass != FI_CLASS_MEM_MONITOR)
+		return -FI_ENOSYS;
+
+	if (impmon.impfid) {
+		FI_WARN(&core_prov, FI_LOG_MR,
+			"imported monitor already exists\n");
+		return -FI_EBUSY;
+	}
+
+	if (default_monitor && !dlist_empty(&default_monitor->list)) {
+		FI_WARN(&core_prov, FI_LOG_MR,
+			"cannot replace active monitor\n");
+		return -FI_EBUSY;
+	}
+
+	impfid = container_of(fid, struct fid_mem_monitor, fid);
+	if (impfid->export_ops->size < sizeof(struct fi_ops_mem_monitor))
+		return -FI_EINVAL;
+
+	impmon.impfid = impfid;
+	impfid->fid.context = &impmon;
+	impfid->fid.ops = &impfid_ops;
+	impfid->import_ops = &import_ops;
+
+	FI_INFO(&core_prov, FI_LOG_MR,
+		"setting imported memory monitor as default\n");
+	default_monitor = &impmon.monitor;
+	return 0;
+}
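
An application-side sketch of installing an external monitor through this import path. The `"mr_cache"` name and the bind-through-the-fid step mirror `ofi_open_mr_cache()`/`ofi_bind_cache_fid()` later in this diff; `ex_ops` is a hypothetical, fully populated `fi_ops_mem_monitor` the application would supply:

```
#include <rdma/fabric.h>
#include <rdma/fi_ext.h>

/* Hypothetical application-provided callbacks; size and every member
 * of fi_ops_mem_monitor must be filled in for the import to pass the
 * size check in ofi_monitor_import(). */
extern struct fi_ops_mem_monitor ex_ops;

static struct fid_mem_monitor ex_monitor = {
	.fid.fclass = FI_CLASS_MEM_MONITOR,
	.export_ops = &ex_ops,
};

static int install_monitor(void)
{
	struct fid *cache_fid;
	int ret;

	ret = fi_open(FI_VERSION(1, 13), "mr_cache", NULL, 0, 0,
		      &cache_fid, NULL);
	if (ret)
		return ret;

	/* Routed through ofi_bind_cache_fid() to ofi_monitor_import(). */
	ret = cache_fid->ops->bind(cache_fid, &ex_monitor.fid, 0);
	if (ret)
		fi_close(cache_fid);
	return ret;
}
```
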
diff --git a/deps/libfabric/prov/util/src/util_mr_cache.c b/deps/libfabric/prov/util/src/util_mr_cache.c
index eca00802762307364210dd35ebf002924fc7d841..0334753c59bd0ceb0b48a6043de574a8b1c7e171 100644
--- a/deps/libfabric/prov/util/src/util_mr_cache.c
+++ b/deps/libfabric/prov/util/src/util_mr_cache.c
@@ -41,12 +41,14 @@
 #include <ofi_mr.h>
 #include <ofi_list.h>
 #include <ofi_tree.h>
+#include <ofi_enosys.h>
 
 
 struct ofi_mr_cache_params cache_params = {
 	.max_cnt = 1024,
 	.cuda_monitor_enabled = true,
 	.rocr_monitor_enabled = true,
+	.ze_monitor_enabled = true,
 };
 
 static int util_mr_find_within(struct ofi_rbmap *map, void *key, void *data)
@@ -105,7 +107,7 @@ static void util_mr_free_entry(struct ofi_mr_cache *cache,
 	FI_DBG(cache->domain->prov, FI_LOG_MR, "free %p (len: %zu)\n",
 	       entry->info.iov.iov_base, entry->info.iov.iov_len);
 
-	assert(!entry->storage_context);
+	assert(!entry->node);
 	cache->delete_region(cache, entry);
 	util_mr_entry_free(cache, entry);
 }
@@ -119,7 +121,9 @@ static void util_mr_uncache_entry_storage(struct ofi_mr_cache *cache,
 	 * notification events, but is harmless to correct operation.
 	 */
 
-	cache->storage.erase(&cache->storage, entry);
+	ofi_rbmap_delete(&cache->tree, entry->node);
+	entry->node = NULL;
+
 	cache->cached_cnt--;
 	cache->cached_size -= entry->info.iov.iov_len;
 }
@@ -131,13 +135,38 @@ static void util_mr_uncache_entry(struct ofi_mr_cache *cache,
 
 	if (entry->use_cnt == 0) {
 		dlist_remove(&entry->list_entry);
-		dlist_insert_tail(&entry->list_entry, &cache->flush_list);
+		dlist_insert_tail(&entry->list_entry, &cache->dead_region_list);
 	} else {
 		cache->uncached_cnt++;
 		cache->uncached_size += entry->info.iov.iov_len;
 	}
 }
 
+static struct ofi_mr_entry *ofi_mr_rbt_find(struct ofi_rbmap *tree,
+					    const struct ofi_mr_info *key)
+{
+	struct ofi_rbnode *node;
+
+	node = ofi_rbmap_find(tree, (void *) key);
+	if (!node)
+		return NULL;
+
+	return node->data;
+}
+
+static struct ofi_mr_entry *ofi_mr_rbt_overlap(struct ofi_rbmap *tree,
+					       const struct iovec *key)
+{
+	struct ofi_rbnode *node;
+
+	node = ofi_rbmap_search(tree, (void *) key,
+				util_mr_find_overlap);
+	if (!node)
+		return NULL;
+
+	return node->data;
+}
+
 /* Caller must hold ofi_mem_monitor lock as well as unsubscribe from the region */
 void ofi_mr_cache_notify(struct ofi_mr_cache *cache, const void *addr, size_t len)
 {
@@ -148,51 +177,49 @@ void ofi_mr_cache_notify(struct ofi_mr_cache *cache, const void *addr, size_t le
 	iov.iov_base = (void *) addr;
 	iov.iov_len = len;
 
-	for (entry = cache->storage.overlap(&cache->storage, &iov); entry;
-	     entry = cache->storage.overlap(&cache->storage, &iov))
+	for (entry = ofi_mr_rbt_overlap(&cache->tree, &iov); entry;
+	     entry = ofi_mr_rbt_overlap(&cache->tree, &iov))
 		util_mr_uncache_entry(cache, entry);
 }
 
+/* Function to remove dead regions and prune MR cache size.
+ * Returns true if any entries were flushed from the cache.
+ */
 bool ofi_mr_cache_flush(struct ofi_mr_cache *cache, bool flush_lru)
 {
+	struct dlist_entry free_list;
 	struct ofi_mr_entry *entry;
+	bool entries_freed;
 
-	pthread_mutex_lock(&mm_lock);
-	while (!dlist_empty(&cache->flush_list)) {
-		dlist_pop_front(&cache->flush_list, struct ofi_mr_entry,
-				entry, list_entry);
-		FI_DBG(cache->domain->prov, FI_LOG_MR, "flush %p (len: %zu)\n",
-		       entry->info.iov.iov_base, entry->info.iov.iov_len);
-		pthread_mutex_unlock(&mm_lock);
+	dlist_init(&free_list);
 
-		util_mr_free_entry(cache, entry);
-		pthread_mutex_lock(&mm_lock);
-	}
+	pthread_mutex_lock(&mm_lock);
 
-	if (!flush_lru || dlist_empty(&cache->lru_list)) {
-		pthread_mutex_unlock(&mm_lock);
-		return false;
-	}
+	dlist_splice_tail(&free_list, &cache->dead_region_list);
 
-	do {
+	while (flush_lru && !dlist_empty(&cache->lru_list)) {
 		dlist_pop_front(&cache->lru_list, struct ofi_mr_entry,
 				entry, list_entry);
 		dlist_init(&entry->list_entry);
-		FI_DBG(cache->domain->prov, FI_LOG_MR, "flush %p (len: %zu)\n",
-		       entry->info.iov.iov_base, entry->info.iov.iov_len);
-
 		util_mr_uncache_entry_storage(cache, entry);
-		pthread_mutex_unlock(&mm_lock);
+		dlist_insert_tail(&entry->list_entry, &free_list);
 
-		util_mr_free_entry(cache, entry);
-		pthread_mutex_lock(&mm_lock);
+		flush_lru = ofi_mr_cache_full(cache);
+	}
 
-	} while (!dlist_empty(&cache->lru_list) &&
-		 ((cache->cached_cnt >= cache_params.max_cnt) ||
-		  (cache->cached_size >= cache_params.max_size)));
 	pthread_mutex_unlock(&mm_lock);
 
-	return true;
+	entries_freed = !dlist_empty(&free_list);
+
+	while (!dlist_empty(&free_list)) {
+		dlist_pop_front(&free_list, struct ofi_mr_entry,
+				entry, list_entry);
+		FI_DBG(cache->domain->prov, FI_LOG_MR, "flush %p (len: %zu)\n",
+			entry->info.iov.iov_base, entry->info.iov.iov_len);
+		util_mr_free_entry(cache, entry);
+	}
+
+	return entries_freed;
 }
 
 void ofi_mr_cache_delete(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry)
@@ -204,7 +231,7 @@ void ofi_mr_cache_delete(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry)
 	cache->delete_cnt++;
 
 	if (--entry->use_cnt == 0) {
-		if (!entry->storage_context) {
+		if (!entry->node) {
 			cache->uncached_cnt--;
 			cache->uncached_size -= entry->info.iov.iov_len;
 			pthread_mutex_unlock(&mm_lock);
@@ -243,7 +270,7 @@ util_mr_cache_create(struct ofi_mr_cache *cache, const struct ofi_mr_info *info,
 	if (!*entry)
 		return -FI_ENOMEM;
 
-	(*entry)->storage_context = NULL;
+	(*entry)->node = NULL;
 	(*entry)->info = *info;
 	(*entry)->use_cnt = 1;
 
@@ -252,19 +279,18 @@ util_mr_cache_create(struct ofi_mr_cache *cache, const struct ofi_mr_info *info,
 		goto free;
 
 	pthread_mutex_lock(&mm_lock);
-	cur = cache->storage.find(&cache->storage, info);
+	cur = ofi_mr_rbt_find(&cache->tree, info);
 	if (cur) {
 		ret = -FI_EAGAIN;
 		goto unlock;
 	}
 
-	if ((cache->cached_cnt >= cache_params.max_cnt) ||
-	    (cache->cached_size >= cache_params.max_size)) {
+	if (ofi_mr_cache_full(cache)) {
 		cache->uncached_cnt++;
 		cache->uncached_size += info->iov.iov_len;
 	} else {
-		if (cache->storage.insert(&cache->storage,
-					  &(*entry)->info, *entry)) {
+		if (ofi_rbmap_insert(&cache->tree, (void *) &(*entry)->info,
+				     (void *) *entry, &(*entry)->node)) {
 			ret = -FI_ENOMEM;
 			goto unlock;
 		}
@@ -294,9 +320,11 @@ int ofi_mr_cache_search(struct ofi_mr_cache *cache, const struct fi_mr_attr *att
 			struct ofi_mr_entry **entry)
 {
 	struct ofi_mr_info info;
+	struct ofi_mem_monitor *monitor;
+	bool flush_lru;
 	int ret;
-	struct ofi_mem_monitor *monitor = cache->monitors[attr->iface];
 
+	monitor = cache->monitors[attr->iface];
 	if (!monitor) {
 		FI_WARN(&core_prov, FI_LOG_MR,
 			"MR cache disabled for %s memory\n",
@@ -314,16 +342,15 @@ int ofi_mr_cache_search(struct ofi_mr_cache *cache, const struct fi_mr_attr *att
 
 	do {
 		pthread_mutex_lock(&mm_lock);
-
-		if ((cache->cached_cnt >= cache_params.max_cnt) ||
-		    (cache->cached_size >= cache_params.max_size)) {
+		flush_lru = ofi_mr_cache_full(cache);
+		if (flush_lru || !dlist_empty(&cache->dead_region_list)) {
 			pthread_mutex_unlock(&mm_lock);
-			ofi_mr_cache_flush(cache, true);
+			ofi_mr_cache_flush(cache, flush_lru);
 			pthread_mutex_lock(&mm_lock);
 		}
 
 		cache->search_cnt++;
-		*entry = cache->storage.find(&cache->storage, &info);
+		*entry = ofi_mr_rbt_find(&cache->tree, &info);
 
 		if (*entry &&
 		    ofi_iov_within(attr->mr_iov, &(*entry)->info.iov) &&
@@ -336,7 +363,7 @@ int ofi_mr_cache_search(struct ofi_mr_cache *cache, const struct fi_mr_attr *att
 		/* Purge regions that overlap with new region */
 		while (*entry) {
 			util_mr_uncache_entry(cache, *entry);
-			*entry = cache->storage.find(&cache->storage, &info);
+			*entry = ofi_mr_rbt_find(&cache->tree, &info);
 		}
 		pthread_mutex_unlock(&mm_lock);
 
@@ -371,7 +398,7 @@ struct ofi_mr_entry *ofi_mr_cache_find(struct ofi_mr_cache *cache,
 	cache->search_cnt++;
 
 	info.iov = *attr->mr_iov;
-	entry = cache->storage.find(&cache->storage, &info);
+	entry = ofi_mr_rbt_find(&cache->tree, &info);
 	if (!entry) {
 		goto unlock;
 	}
@@ -410,7 +437,7 @@ int ofi_mr_cache_reg(struct ofi_mr_cache *cache, const struct fi_mr_attr *attr,
 
 	(*entry)->info.iov = *attr->mr_iov;
 	(*entry)->use_cnt = 1;
-	(*entry)->storage_context = NULL;
+	(*entry)->node = NULL;
 
 	ret = cache->add_region(cache, *entry);
 	if (ret)
@@ -443,7 +470,7 @@ void ofi_mr_cache_cleanup(struct ofi_mr_cache *cache)
 
 	pthread_mutex_destroy(&cache->lock);
 	ofi_monitors_del_cache(cache);
-	cache->storage.destroy(&cache->storage);
+	ofi_rbmap_cleanup(&cache->tree);
 	ofi_atomic_dec32(&cache->domain->ref);
 	ofi_bufpool_destroy(cache->entry_pool);
 	assert(cache->cached_cnt == 0);
@@ -452,92 +479,6 @@ void ofi_mr_cache_cleanup(struct ofi_mr_cache *cache)
 	assert(cache->uncached_size == 0);
 }
 
-static void ofi_mr_rbt_destroy(struct ofi_mr_storage *storage)
-{
-	ofi_rbmap_destroy(storage->storage);
-}
-
-static struct ofi_mr_entry *ofi_mr_rbt_find(struct ofi_mr_storage *storage,
-					    const struct ofi_mr_info *key)
-{
-	struct ofi_rbnode *node;
-
-	node = ofi_rbmap_find(storage->storage, (void *) key);
-	if (!node)
-		return NULL;
-
-	return node->data;
-}
-
-static struct ofi_mr_entry *ofi_mr_rbt_overlap(struct ofi_mr_storage *storage,
-					       const struct iovec *key)
-{
-	struct ofi_rbnode *node;
-
-	node = ofi_rbmap_search(storage->storage, (void *) key,
-				util_mr_find_overlap);
-	if (!node)
-		return NULL;
-
-	return node->data;
-}
-
-static int ofi_mr_rbt_insert(struct ofi_mr_storage *storage,
-			     struct ofi_mr_info *key,
-			     struct ofi_mr_entry *entry)
-{
-	assert(!entry->storage_context);
-	return ofi_rbmap_insert(storage->storage, (void *) key, (void *) entry,
-				(struct ofi_rbnode **) &entry->storage_context);
-}
-
-static int ofi_mr_rbt_erase(struct ofi_mr_storage *storage,
-			    struct ofi_mr_entry *entry)
-{
-	assert(entry->storage_context);
-	ofi_rbmap_delete(storage->storage,
-			 (struct ofi_rbnode *) entry->storage_context);
-	entry->storage_context = NULL;
-	return 0;
-}
-
-static int ofi_mr_cache_init_rbt(struct ofi_mr_cache *cache)
-{
-	cache->storage.storage = ofi_rbmap_create(util_mr_find_within);
-	if (!cache->storage.storage)
-		return -FI_ENOMEM;
-
-	cache->storage.overlap = ofi_mr_rbt_overlap;
-	cache->storage.destroy = ofi_mr_rbt_destroy;
-	cache->storage.find = ofi_mr_rbt_find;
-	cache->storage.insert = ofi_mr_rbt_insert;
-	cache->storage.erase = ofi_mr_rbt_erase;
-	return 0;
-}
-
-static int ofi_mr_cache_init_storage(struct ofi_mr_cache *cache)
-{
-	int ret;
-
-	switch (cache->storage.type) {
-	case OFI_MR_STORAGE_DEFAULT:
-	case OFI_MR_STORAGE_RBT:
-		ret = ofi_mr_cache_init_rbt(cache);
-		break;
-	case OFI_MR_STORAGE_USER:
-		ret = (cache->storage.storage && cache->storage.overlap &&
-		      cache->storage.destroy && cache->storage.find &&
-		      cache->storage.insert && cache->storage.erase) ?
-			0 : -FI_EINVAL;
-		break;
-	default:
-		ret = -FI_EINVAL;
-		break;
-	}
-
-	return ret;
-}
-
 /* Monitors array must be of size OFI_HMEM_MAX. */
 int ofi_mr_cache_init(struct util_domain *domain,
 		      struct ofi_mem_monitor **monitors,
@@ -551,7 +492,7 @@ int ofi_mr_cache_init(struct util_domain *domain,
 
 	pthread_mutex_init(&cache->lock, NULL);
 	dlist_init(&cache->lru_list);
-	dlist_init(&cache->flush_list);
+	dlist_init(&cache->dead_region_list);
 	cache->cached_cnt = 0;
 	cache->cached_size = 0;
 	cache->uncached_cnt = 0;
@@ -563,13 +504,10 @@ int ofi_mr_cache_init(struct util_domain *domain,
 	cache->domain = domain;
 	ofi_atomic_inc32(&domain->ref);
 
-	ret = ofi_mr_cache_init_storage(cache);
-	if (ret)
-		goto dec;
-
+	ofi_rbmap_init(&cache->tree, util_mr_find_within);
 	ret = ofi_monitors_add_cache(monitors, cache);
 	if (ret)
-		goto del;
+		goto destroy;
 
 	ret = ofi_bufpool_create(&cache->entry_pool,
 				 sizeof(struct ofi_mr_entry) +
@@ -581,10 +519,59 @@ int ofi_mr_cache_init(struct util_domain *domain,
 	return 0;
 del:
 	ofi_monitors_del_cache(cache);
-	cache->storage.destroy(&cache->storage);
-dec:
+destroy:
+	ofi_rbmap_cleanup(&cache->tree);
 	ofi_atomic_dec32(&cache->domain->ref);
 	pthread_mutex_destroy(&cache->lock);
 	cache->domain = NULL;
 	return ret;
 }
+
+static int ofi_close_cache_fid(struct fid *fid)
+{
+	free(fid);
+	return 0;
+}
+
+static int ofi_bind_cache_fid(struct fid *fid, struct fid *bfid,
+			      uint64_t flags)
+{
+	if (flags || bfid->fclass != FI_CLASS_MEM_MONITOR)
+		return -FI_EINVAL;
+
+	return ofi_monitor_import(bfid);
+}
+
+static struct fi_ops ofi_mr_cache_ops = {
+	.size = sizeof(struct fi_ops),
+	.close = ofi_close_cache_fid,
+	.bind = ofi_bind_cache_fid,
+	.control = fi_no_control,
+	.ops_open = fi_no_ops_open,
+	.tostr = fi_no_tostr,
+	.ops_set = fi_no_ops_set,
+};
+
+int ofi_open_mr_cache(uint32_t version, void *attr, size_t attr_len,
+		      uint64_t flags, struct fid **fid, void *context)
+{
+	struct fid *cache_fid;
+
+	if (FI_VERSION_LT(version, FI_VERSION(1, 13)) || attr_len)
+		return -FI_EINVAL;
+
+	if (flags)
+		return -FI_EBADFLAGS;
+
+	cache_fid = calloc(1, sizeof(*cache_fid));
+	if (!cache_fid)
+		return -FI_ENOMEM;
+
+	cache_fid->fclass = FI_CLASS_MR_CACHE;
+	cache_fid->context = context;
+	cache_fid->ops = &ofi_mr_cache_ops;
+	*fid = cache_fid;
+	return 0;
+}
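
The fid plumbing above is what lets an application hand the util MR cache an external memory monitor: fi_open() returns a FI_CLASS_MR_CACHE fid whose bind() accepts exactly one FI_CLASS_MEM_MONITOR fid. A minimal application-side sketch, assuming the "mr_cache" object name and the fid_mem_monitor/fi_ops_mem_monitor definitions from <rdma/fi_ext.h> (neither is shown in this patch):

```
#include <stdbool.h>
#include <stddef.h>
#include <rdma/fabric.h>
#include <rdma/fi_ext.h>	/* assumed home of struct fid_mem_monitor */

static int app_mm_start(struct fid_mem_monitor *mm) { return 0; }
static void app_mm_stop(struct fid_mem_monitor *mm) { }
static int app_mm_subscribe(struct fid_mem_monitor *mm,
			    const void *addr, size_t len) { return 0; }
static void app_mm_unsubscribe(struct fid_mem_monitor *mm,
			       const void *addr, size_t len) { }
static bool app_mm_valid(struct fid_mem_monitor *mm,
			 const void *addr, size_t len) { return true; }

static struct fi_ops_mem_monitor app_mm_ops = {
	.size = sizeof(struct fi_ops_mem_monitor),
	.start = app_mm_start,
	.stop = app_mm_stop,
	.subscribe = app_mm_subscribe,
	.unsubscribe = app_mm_unsubscribe,
	.valid = app_mm_valid,
};

int app_import_monitor(void)
{
	/* static: the monitor must outlive the bind call */
	static struct fid_mem_monitor app_mm = {
		.fid.fclass = FI_CLASS_MEM_MONITOR,
		.export_ops = &app_mm_ops,
	};
	struct fid *cache_fid;
	int ret;

	/* ofi_open_mr_cache() above rejects versions older than 1.13 */
	ret = fi_open(FI_VERSION(1, 13), "mr_cache", NULL, 0, 0,
		      &cache_fid, NULL);
	if (ret)
		return ret;

	ret = cache_fid->ops->bind(cache_fid, &app_mm.fid, 0);
	fi_close(cache_fid);	/* frees only the wrapper fid (see close op) */
	return ret;
}
```
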
diff --git a/deps/libfabric/prov/util/src/util_mr_map.c b/deps/libfabric/prov/util/src/util_mr_map.c
index 78e6459f5c8763f873852f969e3d626c52d2b8dd..80a4e48d79041e7eabae280043d2a6239bc3ef5c 100644
--- a/deps/libfabric/prov/util/src/util_mr_map.c
+++ b/deps/libfabric/prov/util/src/util_mr_map.c
@@ -266,6 +266,17 @@ int ofi_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr,
 
 	ofi_mr_update_attr(domain->fabric->fabric_fid.api_version,
 			   domain->info_domain_caps, attr, &cur_abi_attr);
+
+	if ((flags & FI_HMEM_HOST_ALLOC) && (attr->iface == FI_HMEM_ZE))
+		cur_abi_attr.device.ze = -1;
+
+	if (!hmem_ops[cur_abi_attr.iface].initialized) {
+		FI_WARN(domain->mr_map.prov, FI_LOG_MR,
+			"MR registration failed - hmem iface not initialized\n");
+		free(mr);
+		return -FI_ENOSYS;
+	}
+
 	fastlock_acquire(&domain->lock);
 
 	mr->mr_fid.fid.fclass = FI_CLASS_MR;
diff --git a/deps/libfabric/prov/util/src/util_shm.c b/deps/libfabric/prov/util/src/util_shm.c
index 72301b62dbb607da7562c8bc3713823e1dcb060c..824eba7c9f1ecc9a48a47ee38593a211be6ec264 100644
--- a/deps/libfabric/prov/util/src/util_shm.c
+++ b/deps/libfabric/prov/util/src/util_shm.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2017 Intel Corporation. All rights reserved.
+ * Copyright (c) 2016-2021 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -68,36 +68,46 @@ static void smr_peer_addr_init(struct smr_addr *peer)
 void smr_cma_check(struct smr_region *smr, struct smr_region *peer_smr)
 {
 	struct iovec local_iov, remote_iov;
+	int remote_pid;
 	int ret;
 
-	if (peer_smr->cma_cap != SMR_CMA_CAP_NA) {
-		smr->cma_cap = peer_smr->cma_cap;
+	if (smr != peer_smr && peer_smr->cma_cap_peer != SMR_CMA_CAP_NA) {
+		smr->cma_cap_peer = peer_smr->cma_cap_peer;
 		return;
 	}
-	local_iov.iov_base = &smr->cma_cap;
-	local_iov.iov_len = sizeof(smr->cma_cap);
+	remote_pid = peer_smr->pid;
+	local_iov.iov_base = &remote_pid;
+	local_iov.iov_len = sizeof(remote_pid);
 	remote_iov.iov_base = (char *)peer_smr->base_addr +
-			      ((char *)&peer_smr->cma_cap - (char *)peer_smr);
-	remote_iov.iov_len = sizeof(peer_smr->cma_cap);
+			      ((char *)&peer_smr->pid - (char *)peer_smr);
+	remote_iov.iov_len = sizeof(peer_smr->pid);
 	ret = ofi_process_vm_writev(peer_smr->pid, &local_iov, 1,
 				    &remote_iov, 1, 0);
-	smr->cma_cap = (ret == -1) ? SMR_CMA_CAP_OFF : SMR_CMA_CAP_ON;
-	peer_smr->cma_cap = smr->cma_cap;
+	assert(remote_pid == peer_smr->pid);
+
+	if (smr == peer_smr) {
+		smr->cma_cap_self = (ret == -1) ? SMR_CMA_CAP_OFF : SMR_CMA_CAP_ON;
+	} else {
+		smr->cma_cap_peer = (ret == -1) ? SMR_CMA_CAP_OFF : SMR_CMA_CAP_ON;
+		peer_smr->cma_cap_peer = smr->cma_cap_peer;
+	}
 }
 
 size_t smr_calculate_size_offsets(size_t tx_count, size_t rx_count,
 				  size_t *cmd_offset, size_t *resp_offset,
 				  size_t *inject_offset, size_t *sar_offset,
-				  size_t *peer_offset, size_t *name_offset)
+				  size_t *peer_offset, size_t *name_offset,
+				  size_t *sock_offset)
 {
 	size_t cmd_queue_offset, resp_queue_offset, inject_pool_offset;
 	size_t sar_pool_offset, peer_data_offset, ep_name_offset;
-	size_t tx_size, rx_size, total_size;
+	size_t tx_size, rx_size, total_size, sock_name_offset;
 
 	tx_size = roundup_power_of_two(tx_count);
 	rx_size = roundup_power_of_two(rx_count);
 
-	cmd_queue_offset = sizeof(struct smr_region);
+	/* Align cmd_queue offset to 128-bit boundary. */
+	cmd_queue_offset = ofi_get_aligned_size(sizeof(struct smr_region), 16);
 	resp_queue_offset = cmd_queue_offset + sizeof(struct smr_cmd_queue) +
 			    sizeof(struct smr_cmd) * rx_size;
 	inject_pool_offset = resp_queue_offset + sizeof(struct smr_resp_queue) +
@@ -108,6 +118,8 @@ size_t smr_calculate_size_offsets(size_t tx_count, size_t rx_count,
 			   sizeof(struct smr_sar_pool_entry) * SMR_MAX_PEERS;
 	ep_name_offset = peer_data_offset + sizeof(struct smr_peer_data) * SMR_MAX_PEERS;
 
+	sock_name_offset = ep_name_offset + SMR_NAME_MAX;
+
 	if (cmd_offset)
 		*cmd_offset = cmd_queue_offset;
 	if (resp_offset)
@@ -120,8 +132,10 @@ size_t smr_calculate_size_offsets(size_t tx_count, size_t rx_count,
 		*peer_offset = peer_data_offset;
 	if (name_offset)
 		*name_offset = ep_name_offset;
+	if (sock_offset)
+		*sock_offset = sock_name_offset;
 
-	total_size = ep_name_offset + SMR_NAME_MAX;
+	total_size = sock_name_offset + SMR_SOCK_NAME_MAX;
 
 	/*
  	 * Revisit later to see if we really need the size adjustment, or
@@ -132,6 +146,45 @@ size_t smr_calculate_size_offsets(size_t tx_count, size_t rx_count,
 	return total_size;
 }
 
+static int smr_retry_map(const char *name, int *fd)
+{
+	char tmp[NAME_MAX];
+	struct smr_region *old_shm;
+	struct stat sts;
+	int shm_pid;
+
+	*fd = shm_open(name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+	if (*fd < 0)
+		return -errno;
+
+	old_shm = mmap(NULL, sizeof(*old_shm), PROT_READ | PROT_WRITE,
+		       MAP_SHARED, *fd, 0);
+	if (old_shm == MAP_FAILED)
+		goto err;
+
+	/* No backwards compatibility for now. */
+	if (old_shm->version != SMR_VERSION) {
+		munmap(old_shm, sizeof(*old_shm));
+		goto err;
+	}
+	shm_pid = old_shm->pid;
+	munmap(old_shm, sizeof(*old_shm));
+
+	if (!shm_pid)
+		return FI_SUCCESS;
+
+	memset(tmp, 0, sizeof(tmp));
+	snprintf(tmp, sizeof(tmp), "/proc/%d", shm_pid);
+
+	if (stat(tmp, &sts) == -1 && errno == ENOENT)
+		return FI_SUCCESS;
+
+err:
+	close(*fd);
+	shm_unlink(name);
+	return -FI_EBUSY;
+}
+
 /* TODO: Determine if aligning SMR data helps performance */
 int smr_create(const struct fi_provider *prov, struct smr_map *map,
 	       const struct smr_attr *attr, struct smr_region *volatile *smr)
@@ -139,7 +192,7 @@ int smr_create(const struct fi_provider *prov, struct smr_map *map,
 	struct smr_ep_name *ep_name;
 	size_t total_size, cmd_queue_offset, peer_data_offset;
 	size_t resp_queue_offset, inject_pool_offset, name_offset;
-	size_t sar_pool_offset;
+	size_t sar_pool_offset, sock_name_offset;
 	int fd, ret, i;
 	void *mapped_addr;
 	size_t tx_size, rx_size;
@@ -149,12 +202,25 @@ int smr_create(const struct fi_provider *prov, struct smr_map *map,
 	total_size = smr_calculate_size_offsets(tx_size, rx_size, &cmd_queue_offset,
 					&resp_queue_offset, &inject_pool_offset,
 					&sar_pool_offset, &peer_data_offset,
-					&name_offset);
+					&name_offset, &sock_name_offset);
 
-	fd = shm_open(attr->name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+	fd = shm_open(attr->name, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
 	if (fd < 0) {
-		FI_WARN(prov, FI_LOG_EP_CTRL, "shm_open error\n");
-		return -errno;
+		if (errno != EEXIST) {
+			FI_WARN(prov, FI_LOG_EP_CTRL,
+				"shm_open error (%s): %s\n",
+				attr->name, strerror(errno));
+			return -errno;
+		}
+
+		ret = smr_retry_map(attr->name, &fd);
+		if (ret) {
+			FI_WARN(prov, FI_LOG_EP_CTRL, "shm file in use (%s)\n",
+				attr->name);
+			return ret;
+		}
+		FI_WARN(prov, FI_LOG_EP_CTRL,
+			"Overwriting shm from dead process (%s)\n", attr->name);
 	}
 
 	ep_name = calloc(1, sizeof(*ep_name));
@@ -191,11 +257,13 @@ int smr_create(const struct fi_provider *prov, struct smr_map *map,
 
 	*smr = mapped_addr;
 	fastlock_init(&(*smr)->lock);
+	ofi_atomic_initialize32(&(*smr)->signal, 0);
 
 	(*smr)->map = map;
 	(*smr)->version = SMR_VERSION;
 	(*smr)->flags = SMR_FLAG_ATOMIC | SMR_FLAG_DEBUG;
-	(*smr)->cma_cap = SMR_CMA_CAP_NA;
+	(*smr)->cma_cap_peer = SMR_CMA_CAP_NA;
+	(*smr)->cma_cap_self = SMR_CMA_CAP_NA;
 	(*smr)->base_addr = *smr;
 
 	(*smr)->total_size = total_size;
@@ -205,6 +273,7 @@ int smr_create(const struct fi_provider *prov, struct smr_map *map,
 	(*smr)->sar_pool_offset = sar_pool_offset;
 	(*smr)->peer_data_offset = peer_data_offset;
 	(*smr)->name_offset = name_offset;
+	(*smr)->sock_name_offset = sock_name_offset;
 	(*smr)->cmd_cnt = rx_size;
 	/* Limit of 1 outstanding SAR message per peer */
 	(*smr)->sar_cnt = SMR_MAX_PEERS;
@@ -247,7 +316,7 @@ static int smr_name_compare(struct ofi_rbmap *map, void *key, void *data)
 
 	smr_map = container_of(map, struct smr_map, rbmap);
 
-	return strncmp(smr_map->peers[(int64_t) data].peer.name,
+	return strncmp(smr_map->peers[(uintptr_t) data].peer.name,
 		       (char *) key, SMR_NAME_MAX);
 }
 
@@ -284,11 +353,13 @@ int smr_map_to_region(const struct fi_provider *prov, struct smr_peer *peer_buf)
 	struct smr_region *peer;
 	size_t size;
 	int fd, ret = 0;
+	struct stat sts;
 	struct dlist_entry *entry;
+	const char *name = smr_no_prefix(peer_buf->peer.name);
+	char tmp[SMR_PATH_MAX];
 
 	pthread_mutex_lock(&ep_list_lock);
-	entry = dlist_find_first_match(&ep_name_list, smr_match_name,
-				       peer_buf->peer.name);
+	entry = dlist_find_first_match(&ep_name_list, smr_match_name, name);
 	if (entry) {
 		peer_buf->region = container_of(entry, struct smr_ep_name,
 						entry)->region;
@@ -297,12 +368,24 @@ int smr_map_to_region(const struct fi_provider *prov, struct smr_peer *peer_buf)
 	}
 	pthread_mutex_unlock(&ep_list_lock);
 
-	fd = shm_open(peer_buf->peer.name, O_RDWR, S_IRUSR | S_IWUSR);
+	fd = shm_open(name, O_RDWR, S_IRUSR | S_IWUSR);
 	if (fd < 0) {
 		FI_WARN_ONCE(prov, FI_LOG_AV, "shm_open error\n");
 		return -errno;
 	}
 
+	memset(tmp, 0, sizeof(tmp));
+	snprintf(tmp, sizeof(tmp), "%s%s", SMR_DIR, name);
+	if (stat(tmp, &sts) == -1) {
+		ret = -errno;
+		goto out;
+	}
+
+	if (sts.st_size < sizeof(*peer)) {
+		ret = -FI_ENOENT;
+		goto out;
+	}
+
 	peer = mmap(NULL, sizeof(*peer), PROT_READ | PROT_WRITE,
 		    MAP_SHARED, fd, 0);
 	if (peer == MAP_FAILED) {
@@ -314,7 +397,7 @@ int smr_map_to_region(const struct fi_provider *prov, struct smr_peer *peer_buf)
 	if (!peer->pid) {
 		FI_WARN(prov, FI_LOG_AV, "peer not initialized\n");
 		munmap(peer, sizeof(*peer));
-		ret = -FI_EAGAIN;
+		ret = -FI_ENOENT;
 		goto out;
 	}
 
@@ -345,7 +428,8 @@ void smr_map_to_endpoint(struct smr_region *region, int64_t id)
 
 	peer_smr = smr_peer_region(region, id);
 
-	if (region->cma_cap == SMR_CMA_CAP_NA && region != peer_smr)
+	if ((region != peer_smr && region->cma_cap_peer == SMR_CMA_CAP_NA) ||
+	    (region == peer_smr && region->cma_cap_self == SMR_CMA_CAP_NA))
 		smr_cma_check(region, peer_smr);
 }
 
@@ -383,10 +467,11 @@ int smr_map_add(const struct fi_provider *prov, struct smr_map *map,
 	int tries = 0, ret = 0;
 
 	fastlock_acquire(&map->lock);
-	ret = ofi_rbmap_insert(&map->rbmap, (void *) name, (void *) *id, &node);
+	ret = ofi_rbmap_insert(&map->rbmap, (void *) name,
+			       (void *) (intptr_t) *id, &node);
 	if (ret) {
 		assert(ret == -FI_EALREADY);
-		*id = (int64_t) node->data;
+		*id = (intptr_t) node->data;
 		fastlock_release(&map->lock);
 		return 0;
 	}
@@ -400,7 +485,7 @@ int smr_map_add(const struct fi_provider *prov, struct smr_map *map,
 
 	assert(map->cur_id < SMR_MAX_PEERS && tries < SMR_MAX_PEERS);
 	*id = map->cur_id;
-	node->data = (void *) *id;
+	node->data = (void *) (intptr_t) *id;
 	strncpy(map->peers[*id].peer.name, name, SMR_NAME_MAX);
 	map->peers[*id].peer.name[SMR_NAME_MAX - 1] = '\0';
 
@@ -421,7 +506,7 @@ void smr_map_del(struct smr_map *map, int64_t id)
 
 	pthread_mutex_lock(&ep_list_lock);
 	entry = dlist_find_first_match(&ep_name_list, smr_match_name,
-				       map->peers[id].peer.name);
+				       smr_no_prefix(map->peers[id].peer.name));
 	pthread_mutex_unlock(&ep_list_lock);
 
 	fastlock_acquire(&map->lock);
@@ -431,7 +516,7 @@ void smr_map_del(struct smr_map *map, int64_t id)
 	(void) ofi_rbmap_find_delete(&map->rbmap,
 				     (void *) map->peers[id].peer.name);
 
-	map->peers[id].fiaddr = FI_ADDR_UNSPEC;	
+	map->peers[id].fiaddr = FI_ADDR_UNSPEC;
 	map->peers[id].peer.id = -1;
 
 	fastlock_release(&map->lock);
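
The stale-shm recovery above hinges on one Linux-ism: the creating process records its PID in the region header, and a later opener treats the file as abandoned once that PID no longer has a /proc entry. A minimal standalone sketch of that liveness test (the helper name is illustrative, not part of the patch):

```
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <sys/stat.h>

static bool shm_owner_alive(int pid)
{
	char path[64];
	struct stat sts;

	if (!pid)	/* creator never finished initializing the region */
		return false;

	snprintf(path, sizeof(path), "/proc/%d", pid);
	/* Only a definite ENOENT counts as "dead"; any other stat error is
	 * treated conservatively as "alive", matching the -FI_EBUSY path
	 * in smr_retry_map() above. */
	return !(stat(path, &sts) == -1 && errno == ENOENT);
}
```
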
diff --git a/deps/libfabric/prov/util/src/util_wait.c b/deps/libfabric/prov/util/src/util_wait.c
index 07d6a7a63735ee9012c9e374267ba807ab38c464..42a7fb737ae5aa59fbabb5dfd2e4e0d32c197fc2 100644
--- a/deps/libfabric/prov/util/src/util_wait.c
+++ b/deps/libfabric/prov/util/src/util_wait.c
@@ -33,6 +33,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
+#include <sched.h>
 
 #include <ofi_enosys.h>
 #include <ofi_util.h>
@@ -383,9 +384,9 @@ release:
 
 static int util_wait_fd_run(struct fid_wait *wait_fid, int timeout)
 {
+	struct ofi_epollfds_event event;
 	struct util_wait_fd *wait;
 	uint64_t endtime;
-	void *ep_context[1];
 	int ret;
 
 	wait = container_of(wait_fid, struct util_wait_fd, util_wait.wait_fid);
@@ -400,12 +401,17 @@ static int util_wait_fd_run(struct fid_wait *wait_fid, int timeout)
 			return -FI_ETIMEDOUT;
 
 		ret = (wait->util_wait.wait_obj == FI_WAIT_FD) ?
-		      ofi_epoll_wait(wait->epoll_fd, ep_context, 1, timeout) :
-		      ofi_pollfds_wait(wait->pollfds, ep_context, 1, timeout);
+		      ofi_epoll_wait(wait->epoll_fd, &event, 1, timeout) :
+		      ofi_pollfds_wait(wait->pollfds, &event, 1, timeout);
 		if (ret > 0)
 			return FI_SUCCESS;
 
 		if (ret < 0) {
+#if ENABLE_DEBUG
+			/* ignore interrupts in order to enable debugging */
+			if (ret == -FI_EINTR)
+				continue;
+#endif
 			FI_WARN(wait->util_wait.prov, FI_LOG_FABRIC,
 				"poll failed\n");
 			return ret;
@@ -613,7 +619,7 @@ static int util_wait_yield_run(struct fid_wait *wait_fid, int timeout)
 			}
 		}
 		fastlock_release(&wait->util_wait.lock);
-		pthread_yield();
+		sched_yield();
 	}
 
 	fastlock_acquire(&wait->signal_lock);
diff --git a/deps/libfabric/prov/util/src/ze_mem_monitor.c b/deps/libfabric/prov/util/src/ze_mem_monitor.c
new file mode 100644
index 0000000000000000000000000000000000000000..866885bdd3484399056937df6b53a67b44e2358e
--- /dev/null
+++ b/deps/libfabric/prov/util/src/ze_mem_monitor.c
@@ -0,0 +1,117 @@
+/*
+ * (C) Copyright 2021 Intel Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ofi_mr.h"
+
+#if HAVE_LIBZE
+
+#include "ofi_hmem.h"
+
+static int ze_mm_subscribe(struct ofi_mem_monitor *monitor, const void *addr,
+			   size_t len, union ofi_mr_hmem_info *hmem_info)
+{
+	return ze_hmem_get_id(addr, &hmem_info->ze_id);
+}
+
+static void ze_mm_unsubscribe(struct ofi_mem_monitor *monitor,
+			      const void *addr, size_t len,
+			      union ofi_mr_hmem_info *hmem_info)
+{
+	/* no-op */
+}
+
+static bool ze_mm_valid(struct ofi_mem_monitor *monitor,
+			const void *addr, size_t len,
+			union ofi_mr_hmem_info *hmem_info)
+{
+	uint64_t id;
+	int ret;
+
+	ret = ze_hmem_get_id(addr, &id);
+	if (ret)
+		return false;
+
+	return id == hmem_info->ze_id;
+}
+
+static int ze_monitor_start(struct ofi_mem_monitor *monitor)
+{
+	/* no-op */
+	return FI_SUCCESS;
+}
+
+#else
+
+static int ze_mm_subscribe(struct ofi_mem_monitor *monitor, const void *addr,
+			   size_t len, union ofi_mr_hmem_info *hmem_info)
+{
+	return -FI_ENOSYS;
+}
+
+static void ze_mm_unsubscribe(struct ofi_mem_monitor *monitor,
+			      const void *addr, size_t len,
+			      union ofi_mr_hmem_info *hmem_info)
+{
+}
+
+static bool ze_mm_valid(struct ofi_mem_monitor *monitor,
+			const void *addr, size_t len,
+			union ofi_mr_hmem_info *hmem_info)
+{
+	return false;
+}
+
+static int ze_monitor_start(struct ofi_mem_monitor *monitor)
+{
+	return -FI_ENOSYS;
+}
+
+#endif /* HAVE_LIBZE */
+
+static void ze_monitor_stop(struct ofi_mem_monitor *monitor)
+{
+	/* no-op */
+}
+
+static struct ofi_mem_monitor ze_mm = {
+	.iface = FI_HMEM_ZE,
+	.init = ofi_monitor_init,
+	.cleanup = ofi_monitor_cleanup,
+	.start = ze_monitor_start,
+	.stop = ze_monitor_stop,
+	.subscribe = ze_mm_subscribe,
+	.unsubscribe = ze_mm_unsubscribe,
+	.valid = ze_mm_valid,
+};
+
+struct ofi_mem_monitor *ze_monitor = &ze_mm;
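
Unlike the event-driven monitors, this one has nothing to deliver: subscribe merely records the ZE buffer id, and valid re-queries it on each lookup, so stale cache entries surface lazily. A hedged consumer-side sketch (the helper name is illustrative; the real call sites live in util_mr_cache.c):

```
#include "ofi_mr.h"

static bool ze_region_still_valid(const void *addr, size_t len,
				  union ofi_mr_hmem_info *saved)
{
	/* saved->ze_id was filled by ze_mm_subscribe() at registration
	 * time; a mismatch means the device buffer backing this VA range
	 * was freed and reallocated, so the cached MR must be evicted. */
	return ze_monitor->valid(ze_monitor, addr, len, saved);
}
```
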
diff --git a/deps/libfabric/prov/verbs/configure.m4 b/deps/libfabric/prov/verbs/configure.m4
index 2d51072c6f12ad8dcb815b42263d2be8fda6fee8..aa793e0180bae2435b228a93edb9ba98edb3168e 100644
--- a/deps/libfabric/prov/verbs/configure.m4
+++ b/deps/libfabric/prov/verbs/configure.m4
@@ -47,6 +47,10 @@ AC_DEFUN([FI_VERBS_CONFIGURE],[
 	AS_IF([test $verbs_ibverbs_happy -eq 1 && \
 	       test $verbs_rdmacm_happy -eq 1], [$1], [$2])
 
+	#Set CPPFLAGS to allow correct include path to be used by AC_CHECK_DECL()
+	fi_verbs_configure_save_CPPFLAGS=$CPPFLAGS
+	CPPFLAGS=$verbs_ibverbs_CPPFLAGS
+
 	#See if we have extended verbs calls
 	VERBS_HAVE_QUERY_EX=0
 	AS_IF([test $verbs_ibverbs_happy -eq 1],[
@@ -79,6 +83,18 @@ AC_DEFUN([FI_VERBS_CONFIGURE],[
 	AC_DEFINE_UNQUOTED([VERBS_HAVE_RDMA_ESTABLISH],[$VERBS_HAVE_RDMA_ESTABLISH],
 		[Whether rdma/rdma_cma.h has rdma_establish() support or not])
 
+	#See if we have rdma-core dmabuf mr support
+	VERBS_HAVE_DMABUF_MR=0
+	AS_IF([test $verbs_ibverbs_happy -eq 1],[
+		AC_CHECK_DECL([ibv_reg_dmabuf_mr],
+			[VERBS_HAVE_DMABUF_MR=1],[],
+			[#include <infiniband/verbs.h>])
+		])
+	AC_DEFINE_UNQUOTED([VERBS_HAVE_DMABUF_MR],[$VERBS_HAVE_DMABUF_MR],
+		[Whether infiniband/verbs.h has ibv_reg_dmabuf_mr() support or not])
+
+	CPPFLAGS=$fi_verbs_configure_save_CPPFLAGS
+
 	# Technically, verbs_ibverbs_CPPFLAGS and
 	# verbs_rdmacm_CPPFLAGS could be different, but it is highly
 	# unlikely that they ever will be.  So only list
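
The new VERBS_HAVE_DMABUF_MR define gates compile-time use of the rdma-core dmabuf registration call probed above. A hedged sketch of how provider code would typically consume it (illustrative only; the actual registration path is not part of this hunk):

```
#include <errno.h>
#include <infiniband/verbs.h>

static struct ibv_mr *reg_dmabuf(struct ibv_pd *pd, uint64_t offset,
				 size_t len, uint64_t iova, int fd, int access)
{
#if VERBS_HAVE_DMABUF_MR
	return ibv_reg_dmabuf_mr(pd, offset, len, iova, fd, access);
#else
	(void) pd; (void) offset; (void) len; (void) iova;
	(void) fd; (void) access;
	errno = EOPNOTSUPP;	/* rdma-core too old: no dmabuf MR support */
	return NULL;
#endif
}
```
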
diff --git a/deps/libfabric/prov/verbs/src/fi_verbs.c b/deps/libfabric/prov/verbs/src/fi_verbs.c
index 1bdbe8a64c758251fa0ecd7665beb00e65082c8c..c57c2502e94d6c68cbb394c2d01c9fef9d16c6a1 100644
--- a/deps/libfabric/prov/verbs/src/fi_verbs.c
+++ b/deps/libfabric/prov/verbs/src/fi_verbs.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2015 Intel Corporation, Inc.  All rights reserved.
+ * Copyright (c) 2013-2021 Intel Corporation, Inc.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -124,7 +124,7 @@ vrb_get_rdmacm_rai(const char *node, const char *service, uint64_t flags,
 
 	ret = rdma_getaddrinfo(node, service, &rai_hints, &_rai);
 	if (ret) {
-		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_getaddrinfo", errno);
+		VRB_WARN_ERRNO(FI_LOG_FABRIC, "rdma_getaddrinfo");
 		if (errno)
 			ret = -errno;
 		goto out;
@@ -173,7 +173,7 @@ vrb_get_sib_rai(const char *node, const char *service, uint64_t flags,
 	if (*rai == NULL)
 		return -FI_ENOMEM;
 
-	ret = vrb_set_rai(addr_format, src_addr, src_addrlen, dest_addr, 
+	ret = vrb_set_rai(addr_format, src_addr, src_addrlen, dest_addr,
 						 dest_addrlen, flags, *rai);
 	if (ret)
 		return ret;
@@ -274,7 +274,7 @@ int vrb_get_rai_id(const char *node, const char *service, uint64_t flags,
 	ret = rdma_create_id(NULL, id, NULL, vrb_get_port_space(hints ? hints->addr_format:
 					FI_FORMAT_UNSPEC));
 	if (ret) {
-		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_create_id", errno);
+		VRB_WARN_ERRNO(FI_LOG_FABRIC, "rdma_create_id");
 		ret = -errno;
 		goto err1;
 	}
@@ -282,7 +282,7 @@ int vrb_get_rai_id(const char *node, const char *service, uint64_t flags,
 	if ((*rai)->ai_flags & RAI_PASSIVE) {
 		ret = rdma_bind_addr(*id, (*rai)->ai_src_addr);
 		if (ret) {
-			VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_bind_addr", errno);
+			VRB_WARN_ERRNO(FI_LOG_FABRIC, "rdma_bind_addr");
 			ofi_straddr_log(&vrb_prov, FI_LOG_INFO, FI_LOG_FABRIC,
 					"bind addr", (*rai)->ai_src_addr);
 			ret = -errno;
@@ -294,7 +294,7 @@ int vrb_get_rai_id(const char *node, const char *service, uint64_t flags,
 	ret = rdma_resolve_addr(*id, (*rai)->ai_src_addr,
 				(*rai)->ai_dst_addr, VERBS_RESOLVE_TIMEOUT);
 	if (ret) {
-		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_resolve_addr", errno);
+		VRB_WARN_ERRNO(FI_LOG_FABRIC, "rdma_resolve_addr");
 		ofi_straddr_log(&vrb_prov, FI_LOG_INFO, FI_LOG_FABRIC,
 				"src addr", (*rai)->ai_src_addr);
 		ofi_straddr_log(&vrb_prov, FI_LOG_INFO, FI_LOG_FABRIC,
@@ -305,7 +305,7 @@ int vrb_get_rai_id(const char *node, const char *service, uint64_t flags,
 	return 0;
 err2:
 	if (rdma_destroy_id(*id))
-		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_destroy_id", errno);
+		VRB_WARN_ERRNO(FI_LOG_FABRIC, "rdma_destroy_id");
 err1:
 	rdma_freeaddrinfo(*rai);
 	return ret;
@@ -327,8 +327,7 @@ int vrb_create_ep(struct vrb_ep *ep, enum rdma_port_space ps,
 
 	if (rdma_create_id(NULL, id, NULL, ps)) {
 		ret = -errno;
-		FI_WARN(&vrb_prov, FI_LOG_FABRIC, "rdma_create_id failed: "
-			"%s (%d)\n", strerror(-ret), -ret);
+		VRB_WARN_ERRNO(FI_LOG_FABRIC, "rdma_create_id");
 		goto err1;
 	}
 
@@ -343,8 +342,7 @@ int vrb_create_ep(struct vrb_ep *ep, enum rdma_port_space ps,
 	if (rdma_resolve_addr(*id, rai->ai_src_addr, rai->ai_dst_addr,
 			      VERBS_RESOLVE_TIMEOUT)) {
 		ret = -errno;
-		FI_WARN(&vrb_prov, FI_LOG_EP_CTRL, "rdma_resolve_addr failed: "
-			"%s (%d)\n", strerror(-ret), -ret);
+		VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_resolve_addr");
 		ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL,
 				"src addr", rai->ai_src_addr);
 		ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL,
@@ -424,7 +422,7 @@ fn:
 }
 
 #if ENABLE_DEBUG
-static int vrb_dbg_query_qp_attr(struct ibv_qp *qp)
+static void vrb_dbg_query_qp_attr(struct ibv_qp *qp)
 {
 	struct ibv_qp_init_attr attr = { 0 };
 	struct ibv_qp_attr qp_attr = { 0 };
@@ -433,9 +431,10 @@ static int vrb_dbg_query_qp_attr(struct ibv_qp *qp)
 	ret = ibv_query_qp(qp, &qp_attr, IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT |
 			   IBV_QP_RNR_RETRY | IBV_QP_MIN_RNR_TIMER, &attr);
 	if (ret) {
-		VERBS_WARN(FI_LOG_EP_CTRL, "Unable to query QP\n");
-		return ret;
+		VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "ibv_query_qp");
+		return;
 	}
+
 	FI_DBG(&vrb_prov, FI_LOG_EP_CTRL, "QP attributes: "
 	       "min_rnr_timer"	": %" PRIu8 ", "
 	       "timeout"	": %" PRIu8 ", "
@@ -443,22 +442,20 @@ static int vrb_dbg_query_qp_attr(struct ibv_qp *qp)
 	       "rnr_retry"	": %" PRIu8 "\n",
 	       qp_attr.min_rnr_timer, qp_attr.timeout, qp_attr.retry_cnt,
 	       qp_attr.rnr_retry);
-	return 0;
 }
 #else
-static int vrb_dbg_query_qp_attr(struct ibv_qp *qp)
+static void vrb_dbg_query_qp_attr(struct ibv_qp *qp)
 {
-	return 0;
 }
 #endif
 
-int vrb_set_rnr_timer(struct ibv_qp *qp)
+void vrb_set_rnr_timer(struct ibv_qp *qp)
 {
 	struct ibv_qp_attr attr = { 0 };
 	int ret;
 
 	if (vrb_gl_data.min_rnr_timer > 31) {
-		VERBS_WARN(FI_LOG_EQ, "min_rnr_timer value out of valid range; "
+		VRB_WARN(FI_LOG_EQ, "min_rnr_timer value out of valid range; "
 			   "using default value of %d\n",
 			   VERBS_DEFAULT_MIN_RNR_TIMER);
 		attr.min_rnr_timer = VERBS_DEFAULT_MIN_RNR_TIMER;
@@ -468,17 +465,13 @@ int vrb_set_rnr_timer(struct ibv_qp *qp)
 
 	/* XRC initiator QP do not have responder logic */
 	if (qp->qp_type == IBV_QPT_XRC_SEND)
-		return 0;
+		return;
 
 	ret = ibv_modify_qp(qp, &attr, IBV_QP_MIN_RNR_TIMER);
-	if (ret) {
-		VERBS_WARN(FI_LOG_EQ, "Unable to modify QP attribute\n");
-		return ret;
-	}
-	ret = vrb_dbg_query_qp_attr(qp);
 	if (ret)
-		return ret;
-	return 0;
+		VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "ibv_modify_qp");
+
+	vrb_dbg_query_qp_attr(qp);
 }
 
 int vrb_find_max_inline(struct ibv_pd *pd, struct ibv_context *context,
@@ -629,106 +622,93 @@ static int vrb_read_params(void)
 {
 	/* Common parameters */
 	if (vrb_get_param_int("tx_size", "Default maximum tx context size",
-				 &vrb_gl_data.def_tx_size) ||
+			      &vrb_gl_data.def_tx_size) ||
 	    (vrb_gl_data.def_tx_size < 0)) {
-		VERBS_WARN(FI_LOG_CORE,
-			   "Invalid value of tx_size\n");
+		VRB_WARN(FI_LOG_CORE, "Invalid value of tx_size\n");
 		return -FI_EINVAL;
 	}
 	if (vrb_get_param_int("rx_size", "Default maximum rx context size",
-				 &vrb_gl_data.def_rx_size) ||
+			      &vrb_gl_data.def_rx_size) ||
 	    (vrb_gl_data.def_rx_size < 0)) {
-		VERBS_WARN(FI_LOG_CORE,
-			   "Invalid value of rx_size\n");
+		VRB_WARN(FI_LOG_CORE, "Invalid value of rx_size\n");
 		return -FI_EINVAL;
 	}
 	if (vrb_get_param_int("tx_iov_limit", "Default maximum tx iov_limit",
-				 &vrb_gl_data.def_tx_iov_limit) ||
+			      &vrb_gl_data.def_tx_iov_limit) ||
 	    (vrb_gl_data.def_tx_iov_limit < 0)) {
-		VERBS_WARN(FI_LOG_CORE,
-			   "Invalid value of tx_iov_limit\n");
+		VRB_WARN(FI_LOG_CORE, "Invalid value of tx_iov_limit\n");
 		return -FI_EINVAL;
 	}
 	if (vrb_get_param_int("rx_iov_limit", "Default maximum rx iov_limit",
-				 &vrb_gl_data.def_rx_iov_limit) ||
+			      &vrb_gl_data.def_rx_iov_limit) ||
 	    (vrb_gl_data.def_rx_iov_limit < 0)) {
-		VERBS_WARN(FI_LOG_CORE,
-			   "Invalid value of rx_iov_limit\n");
+		VRB_WARN(FI_LOG_CORE, "Invalid value of rx_iov_limit\n");
 		return -FI_EINVAL;
 	}
 	if (vrb_get_param_int("inline_size", "Default maximum inline size. "
-				 "Actual inject size returned in fi_info may be "
-				 "greater", &vrb_gl_data.def_inline_size) ||
+			      "Actual inject size returned in fi_info may be "
+			      "greater", &vrb_gl_data.def_inline_size) ||
 	    (vrb_gl_data.def_inline_size < 0)) {
-		VERBS_WARN(FI_LOG_CORE,
-			   "Invalid value of inline_size\n");
+		VRB_WARN(FI_LOG_CORE, "Invalid value of inline_size\n");
 		return -FI_EINVAL;
 	}
 	if (vrb_get_param_int("min_rnr_timer", "Set min_rnr_timer QP "
-				 "attribute (0 - 31)",
-				 &vrb_gl_data.min_rnr_timer) ||
+			      "attribute (0 - 31)",
+			      &vrb_gl_data.min_rnr_timer) ||
 	    ((vrb_gl_data.min_rnr_timer < 0) ||
 	     (vrb_gl_data.min_rnr_timer > 31))) {
-		VERBS_WARN(FI_LOG_CORE,
-			   "Invalid value of min_rnr_timer\n");
+		VRB_WARN(FI_LOG_CORE, "Invalid value of min_rnr_timer\n");
 		return -FI_EINVAL;
 	}
 
 	if (vrb_get_param_bool("use_odp", "Enable on-demand paging memory "
-	    "registrations, if supported.  This is currently required to "
-	    "register DAX file system mmapped memory.", &vrb_gl_data.use_odp)) {
-		VERBS_WARN(FI_LOG_CORE,
-			   "Invalid value of use_odp\n");
+			       "registrations, if supported.  This is "
+			       "currently required to register DAX file system "
+			       "mmapped memory.", &vrb_gl_data.use_odp)) {
+		VRB_WARN(FI_LOG_CORE, "Invalid value of use_odp\n");
 		return -FI_EINVAL;
 	}
 
-	if (vrb_get_param_bool("prefer_xrc", "Order XRC transport fi_infos"
-				  "ahead of RC. Default orders RC first.",
-				  &vrb_gl_data.msg.prefer_xrc)) {
-		VERBS_WARN(FI_LOG_CORE,
-			   "Invalid value of prefer_xrc\n");
+	if (vrb_get_param_bool("prefer_xrc", "Order XRC transport fi_infos "
+			       "ahead of RC.  Default orders RC first.  This "
+			       "setting must usually be combined with setting "
+			       "FI_OFI_RXM_USE_SRX.  See fi_verbs.7 man page.",
+				&vrb_gl_data.msg.prefer_xrc)) {
+		VRB_WARN(FI_LOG_CORE, "Invalid value of prefer_xrc\n");
 		return -FI_EINVAL;
 	}
 
 	if (vrb_get_param_str("xrcd_filename", "A file to "
-				 "associate with the XRC domain.",
-				 &vrb_gl_data.msg.xrcd_filename)) {
-		VERBS_WARN(FI_LOG_CORE,
-			   "Invalid value of xrcd_filename\n");
+			      "associate with the XRC domain.",
+			      &vrb_gl_data.msg.xrcd_filename)) {
+		VRB_WARN(FI_LOG_CORE, "Invalid value of xrcd_filename\n");
 		return -FI_EINVAL;
 	}
 	if (vrb_get_param_int("cqread_bunch_size", "The number of entries to "
-				 "be read from the verbs completion queue at a time",
-				 &vrb_gl_data.cqread_bunch_size) ||
+			      "be read from the verbs completion queue at a time",
+			      &vrb_gl_data.cqread_bunch_size) ||
 	    (vrb_gl_data.cqread_bunch_size <= 0)) {
-		VERBS_WARN(FI_LOG_CORE,
-			   "Invalid value of cqread_bunch_size\n");
+		VRB_WARN(FI_LOG_CORE, "Invalid value of cqread_bunch_size\n");
 		return -FI_EINVAL;
 	}
 	if (vrb_get_param_int("gid_idx", "Set which gid index to use "
-				 "attribute (0 - 255)",
-				 &vrb_gl_data.gid_idx) ||
-	    (vrb_gl_data.gid_idx < 0 ||
-	     vrb_gl_data.gid_idx > 255)) {
-		VERBS_WARN(FI_LOG_CORE,
-			   "Invalid value of gid index\n");
+			      "attribute (0 - 255)", &vrb_gl_data.gid_idx) ||
+	    (vrb_gl_data.gid_idx < 0 || vrb_gl_data.gid_idx > 255)) {
+		VRB_WARN(FI_LOG_CORE, "Invalid value of gid index\n");
 		return -FI_EINVAL;
 	}
 
 	if (vrb_get_param_str("device_name", "The prefix or the full name of the "
-			      "verbs device to use",
-			      &vrb_gl_data.device_name)) {
-		VERBS_WARN(FI_LOG_CORE,
-			   "Invalid value of device_name\n");
+			      "verbs device to use", &vrb_gl_data.device_name)) {
+		VRB_WARN(FI_LOG_CORE, "Invalid value of device_name\n");
 		return -FI_EINVAL;
 	}
 
 	/* MSG-specific parameter */
 	if (vrb_get_param_str("iface", "The prefix or the full name of the "
-				 "network interface associated with the verbs device",
-				 &vrb_gl_data.iface)) {
-		VERBS_WARN(FI_LOG_CORE,
-			   "Invalid value of iface\n");
+			      "network interface associated with the verbs "
+			      "device", &vrb_gl_data.iface)) {
+		VRB_WARN(FI_LOG_CORE, "Invalid value of iface\n");
 		return -FI_EINVAL;
 	}
 
@@ -736,21 +716,19 @@ static int vrb_read_params(void)
 	if (getenv("OMPI_COMM_WORLD_RANK") || getenv("PMI_RANK"))
 		vrb_gl_data.dgram.use_name_server = 0;
 	if (vrb_get_param_bool("dgram_use_name_server", "The option that "
-				  "enables/disables OFI Name Server thread that is used "
-				  "to resolve IP-addresses to provider specific "
-				  "addresses. If MPI is used, the NS is disabled "
-				  "by default.", &vrb_gl_data.dgram.use_name_server)) {
-		VERBS_WARN(FI_LOG_CORE,
-			   "Invalid value of dgram_use_name_server\n");
+			       "enables/disables OFI Name Server thread used "
+			       "to resolve IP-addresses to provider specific "
+			       "addresses. If MPI is used, the NS is disabled "
+			       "by default.", &vrb_gl_data.dgram.use_name_server)) {
+		VRB_WARN(FI_LOG_CORE, "Invalid dgram_use_name_server\n");
 		return -FI_EINVAL;
 	}
-	if (vrb_get_param_int("dgram_name_server_port", "The port on which Name Server "
-				 "thread listens incoming connections and requestes.",
-				 &vrb_gl_data.dgram.name_server_port) ||
+	if (vrb_get_param_int("dgram_name_server_port", "The port on which "
+			      "the name server thread listens incoming "
+			      "requests.", &vrb_gl_data.dgram.name_server_port) ||
 	    (vrb_gl_data.dgram.name_server_port < 0 ||
 	     vrb_gl_data.dgram.name_server_port > 65535)) {
-		VERBS_WARN(FI_LOG_CORE,
-			   "Invalid value of dgram_name_server_port\n");
+		VRB_WARN(FI_LOG_CORE, "Invalid dgram_name_server_port\n");
 		return -FI_EINVAL;
 	}
 
@@ -774,6 +752,50 @@ static void verbs_devs_free(void)
 	}
 }
 
+static void vrb_set_peer_mem_support(void)
+{
+	char *line = NULL;
+	size_t line_size = 0;
+	ssize_t bytes;
+	FILE *kallsyms_fd;
+
+	kallsyms_fd = fopen("/proc/kallsyms", "r");
+	if (!kallsyms_fd)
+		return;
+
+	while ((bytes = getline(&line, &line_size, kallsyms_fd)) != -1) {
+		if (strstr(line, "ib_register_peer_memory_client")) {
+			vrb_gl_data.peer_mem_support = true;
+			break;
+		}
+	}
+
+	free(line);
+	fclose(kallsyms_fd);
+}
+
+static void vrb_set_dmabuf_support(void)
+{
+	char *line = NULL;
+	size_t line_size = 0;
+	ssize_t bytes;
+	FILE *kallsyms_fd;
+
+	kallsyms_fd = fopen("/proc/kallsyms", "r");
+	if (!kallsyms_fd)
+		return;
+
+	while ((bytes = getline(&line, &line_size, kallsyms_fd)) != -1) {
+		if (strstr(line, "ib_umem_dmabuf_get")) {
+			vrb_gl_data.dmabuf_support = true;
+			break;
+		}
+	}
+
+	free(line);
+	fclose(kallsyms_fd);
+}
+
 static void vrb_fini(void)
 {
 #if HAVE_VERBS_DL
@@ -793,6 +815,9 @@ VERBS_INI
 	ofi_hmem_init();
 	ofi_monitors_init();
 #endif
+	vrb_set_peer_mem_support();
+	vrb_set_dmabuf_support();
+
 	if (vrb_read_params()|| vrb_init_info(&vrb_util_prov.info))
 		return NULL;
 	return &vrb_prov;
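
The two capability probes added above differ only in the kernel symbol they grep out of /proc/kallsyms at provider init. A hedged refactor sketch of the shared idiom (the helper name is illustrative, not part of the patch):

```
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static bool vrb_kernel_has_symbol(const char *symbol)
{
	char *line = NULL;
	size_t size = 0;
	bool found = false;
	FILE *fp = fopen("/proc/kallsyms", "r");

	if (!fp)	/* not Linux, or /proc restricted: assume absent */
		return false;

	while (getline(&line, &size, fp) != -1) {
		if (strstr(line, symbol)) {
			found = true;
			break;
		}
	}

	free(line);
	fclose(fp);
	return found;
}

/* VERBS_INI would then reduce to:
 *   vrb_gl_data.peer_mem_support =
 *           vrb_kernel_has_symbol("ib_register_peer_memory_client");
 *   vrb_gl_data.dmabuf_support =
 *           vrb_kernel_has_symbol("ib_umem_dmabuf_get");
 */
```
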
diff --git a/deps/libfabric/prov/verbs/src/fi_verbs.h b/deps/libfabric/prov/verbs/src/fi_verbs.h
index 7df34020f64731351716b9d2bf79801bb242de75..29ff483fb4c1086aef3122b132e2b9fc29f54167 100644
--- a/deps/libfabric/prov/verbs/src/fi_verbs.h
+++ b/deps/libfabric/prov/verbs/src/fi_verbs.h
@@ -3,6 +3,7 @@
  * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2018-2019 Cray Inc. All rights reserved.
  * Copyright (c) 2018-2019 System Fabric Works, Inc. All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -73,6 +74,8 @@
 #include "ofi_util.h"
 #include "ofi_tree.h"
 #include "ofi_indexer.h"
+#include "ofi_iov.h"
+#include "ofi_hmem.h"
 
 #include "ofi_verbs_priv.h"
 
@@ -89,16 +92,22 @@
 
 #define VERBS_PROV_NAME "verbs"
 
-#define VERBS_DBG(subsys, ...) FI_DBG(&vrb_prov, subsys, __VA_ARGS__)
-#define VERBS_INFO(subsys, ...) FI_INFO(&vrb_prov, subsys, __VA_ARGS__)
-#define VERBS_INFO_ERRNO(subsys, fn, errno) VERBS_INFO(subsys, fn ": %s(%d)\n",	\
-		strerror(errno), errno)
-#define VERBS_WARN(subsys, ...) FI_WARN(&vrb_prov, subsys, __VA_ARGS__)
+#define VRB_DBG(subsys, ...) FI_DBG(&vrb_prov, subsys, __VA_ARGS__)
+#define VRB_INFO(subsys, ...) FI_INFO(&vrb_prov, subsys, __VA_ARGS__)
+#define VRB_WARN(subsys, ...) FI_WARN(&vrb_prov, subsys, __VA_ARGS__)
 
+#define VRB_WARN_ERRNO(subsys, fn) \
+	VRB_WARN(subsys, fn ": %s (%d)\n", strerror(errno), errno)
+#define VRB_WARN_ERR(subsys, fn, err) \
+	VRB_WARN(subsys, fn ": %s (%d)\n", fi_strerror((int) -(err)), (int) err)
 
-#define VERBS_INJECT_FLAGS(ep, len, flags) ((((flags) & FI_INJECT) || \
-		len <= (ep)->info_attr.inject_size) ? IBV_SEND_INLINE : 0)
-#define VERBS_INJECT(ep, len) VERBS_INJECT_FLAGS(ep, len, (ep)->util_ep.tx_op_flags)
+
+#define VERBS_INJECT_FLAGS(ep, len, flags, desc) \
+	(((flags) & FI_INJECT) || !(desc) || \
+	 ((((struct vrb_mem_desc *) (desc))->info.iface == FI_HMEM_SYSTEM) && \
+	  ((len) <= (ep)->info_attr.inject_size))) ? IBV_SEND_INLINE : 0
+#define VERBS_INJECT(ep, len, desc) \
+	VERBS_INJECT_FLAGS(ep, len, (ep)->util_ep.tx_op_flags, desc)
 
 #define VERBS_COMP_FLAGS(ep, flags, context)		\
 	(((ep)->util_ep.tx_op_flags | (flags)) &		\
@@ -171,6 +180,9 @@ extern struct vrb_gl_data {
 		int	prefer_xrc;
 		char	*xrcd_filename;
 	} msg;
+
+	bool	peer_mem_support;
+	bool	dmabuf_support;
 } vrb_gl_data;
 
 struct verbs_addr {
@@ -357,9 +369,8 @@ struct vrb_domain {
 
 	ssize_t		(*send_credits)(struct fid_ep *ep, uint64_t credits);
 
-	/* Indicates that MSG endpoints should use the XRC transport.
-	 * TODO: Move selection of XRC/RC to endpoint info from domain */
-	int				flags;
+	/* Domain use of specific extended H/W capabilities, e.g. XRC, ODP */
+	uint64_t			ext_flags;
 	struct {
 		int			xrcd_fd;
 		struct ibv_xrcd		*xrcd;
@@ -425,13 +436,13 @@ struct vrb_mem_desc {
 	struct fid_mr		mr_fid;
 	struct ibv_mr		*mr;
 	struct vrb_domain	*domain;
-	size_t			len;
 	/* this field is used only by MR cache operations */
 	struct ofi_mr_entry	*entry;
+	struct ofi_mr_info	info;
+	uint32_t		lkey;
 };
 
 extern struct fi_ops_mr vrb_mr_ops;
-extern struct fi_ops_mr vrb_mr_cache_ops;
 
 int vrb_mr_cache_add_region(struct ofi_mr_cache *cache,
 			       struct ofi_mr_entry *entry);
@@ -604,6 +615,7 @@ struct vrb_ep {
 	struct rdma_conn_param		conn_param;
 	struct vrb_cm_data_hdr		*cm_hdr;
 	void				*cm_priv_data;
+	bool				hmem_enabled;
 };
 
 
@@ -775,7 +787,6 @@ void vrb_sched_ini_conn(struct vrb_ini_shared_conn *ini_conn);
 int vrb_get_shared_ini_conn(struct vrb_xrc_ep *ep,
 			       struct vrb_ini_shared_conn **ini_conn);
 void vrb_put_shared_ini_conn(struct vrb_xrc_ep *ep);
-int vrb_reserve_qpn(struct vrb_xrc_ep *ep, struct ibv_qp **qp);
 
 void vrb_save_priv_data(struct vrb_xrc_ep *ep, const void *data,
 			   size_t len);
@@ -839,7 +850,7 @@ ssize_t vrb_eq_write_event(struct vrb_eq *eq, uint32_t event,
 int vrb_query_atomic(struct fid_domain *domain_fid, enum fi_datatype datatype,
 			enum fi_op op, struct fi_atomic_attr *attr,
 			uint64_t flags);
-int vrb_set_rnr_timer(struct ibv_qp *qp);
+void vrb_set_rnr_timer(struct ibv_qp *qp);
 void vrb_cleanup_cq(struct vrb_ep *cur_ep);
 int vrb_find_max_inline(struct ibv_pd *pd, struct ibv_context *context,
 			   enum ibv_qp_type qp_type);
@@ -885,54 +896,17 @@ int vrb_save_wc(struct vrb_cq *cq, struct ibv_wc *wc);
 #define vrb_init_sge(buf, len, desc) (struct ibv_sge)	\
 	{ .addr = (uintptr_t) buf,			\
 	  .length = (uint32_t) len,			\
-	  .lkey = (uint32_t) (uintptr_t) desc }
-
-#define vrb_set_sge_iov(sg_list, iov, count, desc)	\
-({							\
-	size_t i;					\
-	sg_list = alloca(sizeof(*sg_list) * count);	\
-	for (i = 0; i < count; i++) {			\
-		sg_list[i] = vrb_init_sge(		\
-				iov[i].iov_base,	\
-				iov[i].iov_len,		\
-				desc[i]);		\
-	}						\
-})
-
-#define vrb_set_sge_iov_count_len(sg_list, iov, count, desc, len)	\
-({									\
-	size_t i;							\
-	sg_list = alloca(sizeof(*sg_list) * count);			\
-	for (i = 0; i < count; i++) {					\
-		sg_list[i] = vrb_init_sge(				\
-				iov[i].iov_base,			\
-				iov[i].iov_len,				\
-				desc[i]);				\
-		len += iov[i].iov_len;					\
-	}								\
-})
-
-#define vrb_init_sge_inline(buf, len) vrb_init_sge(buf, len, NULL)
-
-#define vrb_set_sge_iov_inline(sg_list, iov, count, len)	\
-({								\
+	  .lkey = (desc) ? ((struct vrb_mem_desc *) (desc))->lkey : 0 }
+
+#define vrb_iov_dupa(dst, iov, desc, count)			\
+do {								\
 	size_t i;						\
-	sg_list = alloca(sizeof(*sg_list) * count);		\
+	dst = alloca(sizeof(*dst) * count);			\
 	for (i = 0; i < count; i++) {				\
-		sg_list[i] = vrb_init_sge_inline(		\
-					iov[i].iov_base,	\
-					iov[i].iov_len);	\
-		len += iov[i].iov_len;				\
+		dst[i] = vrb_init_sge(iov[i].iov_base,		\
+				      iov[i].iov_len, desc[i]);	\
 	}							\
-})
-
-#define vrb_send_iov(ep, wr, iov, desc, count)		\
-	vrb_send_iov_flags(ep, wr, iov, desc, count,		\
-			      (ep)->util_ep.tx_op_flags)
-
-#define vrb_send_msg(ep, wr, msg, flags)				\
-	vrb_send_iov_flags(ep, wr, (msg)->msg_iov, (msg)->desc,	\
-			      (msg)->iov_count, flags)
+} while (0)
 
 #define vrb_wr_consumes_recv(wr)						\
 	( wr->opcode == IBV_WR_SEND || wr->opcode == IBV_WR_SEND_WITH_IMM	\
@@ -947,49 +921,15 @@ vrb_send_buf(struct vrb_ep *ep, struct ibv_send_wr *wr,
 {
 	struct ibv_sge sge = vrb_init_sge(buf, len, desc);
 
-	assert(wr->wr_id != VERBS_NO_COMP_FLAG);
-
 	wr->sg_list = &sge;
 	wr->num_sge = 1;
 
 	return vrb_post_send(ep, wr, 0);
 }
 
-static inline ssize_t
-vrb_send_buf_inline(struct vrb_ep *ep, struct ibv_send_wr *wr,
-		       const void *buf, size_t len)
-{
-	struct ibv_sge sge = vrb_init_sge_inline(buf, len);
-
-	assert(wr->wr_id == VERBS_NO_COMP_FLAG);
-
-	wr->sg_list = &sge;
-	wr->num_sge = 1;
-
-	return vrb_post_send(ep, wr, 0);
-}
-
-static inline ssize_t
-vrb_send_iov_flags(struct vrb_ep *ep, struct ibv_send_wr *wr,
-		      const struct iovec *iov, void **desc, int count,
-		      uint64_t flags)
-{
-	size_t len = 0;
-
-	if (!desc)
-		vrb_set_sge_iov_inline(wr->sg_list, iov, count, len);
-	else
-		vrb_set_sge_iov_count_len(wr->sg_list, iov, count, desc, len);
-
-	wr->num_sge = count;
-	wr->send_flags = VERBS_INJECT_FLAGS(ep, len, flags);
-	wr->wr_id = VERBS_COMP_FLAGS(ep, flags, wr->wr_id);
-
-	if (flags & FI_FENCE)
-		wr->send_flags |= IBV_SEND_FENCE;
-
-	return vrb_post_send(ep, wr, flags);
-}
+ssize_t vrb_send_iov(struct vrb_ep *ep, struct ibv_send_wr *wr,
+		     const struct iovec *iov, void **desc, int count,
+		     uint64_t flags);
 
 void vrb_add_credits(struct fid_ep *ep, size_t credits);
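
The reworked macros above change two things: the lkey now comes from the new vrb_mem_desc rather than a raw cast of desc, and the inline decision consults the HMEM iface, so only FI_HMEM_SYSTEM buffers within the inject limit go out IBV_SEND_INLINE. A condensed sketch of how a caller such as the now out-of-line vrb_send_iov() plausibly strings them together, paraphrased from the removed vrb_send_iov_flags() (desc assumed non-NULL; VERBS_COMP_FLAGS handling omitted):

```
static ssize_t vrb_send_iov_sketch(struct vrb_ep *ep, struct ibv_send_wr *wr,
				   const struct iovec *iov, void **desc,
				   int count, uint64_t flags)
{
	struct ibv_sge *sge;
	size_t len = ofi_total_iov_len(iov, count);

	vrb_iov_dupa(sge, iov, desc, count);	/* alloca(): valid until return */
	wr->sg_list = sge;
	wr->num_sge = count;
	/* inline only for system memory within the inject limit */
	wr->send_flags = VERBS_INJECT_FLAGS(ep, len, flags, desc[0]);
	if (flags & FI_FENCE)
		wr->send_flags |= IBV_SEND_FENCE;

	/* must post before returning: sg_list points into this stack frame */
	return vrb_post_send(ep, wr, flags);
}
```
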
 
diff --git a/deps/libfabric/prov/verbs/src/verbs_cm.c b/deps/libfabric/prov/verbs/src/verbs_cm.c
index f341c8738275693a070288bd8d909c52ef85b4d8..093247ccaf92a053fd9339800ae1701d5d4da883 100644
--- a/deps/libfabric/prov/verbs/src/verbs_cm.c
+++ b/deps/libfabric/prov/verbs/src/verbs_cm.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2015 Intel Corporation, Inc.  All rights reserved.
+ * Copyright (c) 2013-2021 Intel Corporation, Inc.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -62,7 +62,7 @@ static int vrb_msg_ep_setname(fid_t ep_fid, void *addr, size_t addrlen)
 		container_of(ep_fid, struct vrb_ep, util_ep.ep_fid);
 
 	if (addrlen != ep->info_attr.src_addrlen) {
-		VERBS_INFO(FI_LOG_EP_CTRL,"addrlen expected: %zu, got: %zu.\n",
+		VRB_INFO(FI_LOG_EP_CTRL,"addrlen expected: %zu, got: %zu.\n",
 			   ep->info_attr.src_addrlen, addrlen);
 		return -FI_EINVAL;
 	}
@@ -71,7 +71,7 @@ static int vrb_msg_ep_setname(fid_t ep_fid, void *addr, size_t addrlen)
 
 	ep->info_attr.src_addr = malloc(ep->info_attr.src_addrlen);
 	if (!ep->info_attr.src_addr) {
-		VERBS_WARN(FI_LOG_EP_CTRL, "memory allocation failure\n");
+		VRB_WARN(FI_LOG_EP_CTRL, "memory allocation failure\n");
 		ret = -FI_ENOMEM;
 		goto err1;
 	}
@@ -139,7 +139,10 @@ vrb_msg_ep_prepare_rdma_cm_hdr(void *priv_data,
 {
 	struct vrb_rdma_cm_hdr *rdma_cm_hdr = priv_data;
 
-	rdma_cm_hdr->ip_version = 6 << 4; /* IPv6 */
+	/* ip_version=6 would require IPoIB to be installed and the IP link
+	 * to be UP, which we don't want. As a work-around, we set ip_version
+	 * to 0, which lets the CMA kernel code skip any IPoIB requirement. */
+	rdma_cm_hdr->ip_version = 0;
 	rdma_cm_hdr->port = htons(ofi_addr_get_port(&id->route.addr.src_addr));
 
 	/* Record the GIDs */
@@ -165,8 +168,10 @@ vrb_msg_ep_connect(struct fid_ep *ep_fid, const void *addr,
 
 	if (!ep->id->qp) {
 		ret = fi_control(&ep_fid->fid, FI_ENABLE, NULL);
-		if (ret)
+		if (ret) {
+			VRB_WARN_ERR(FI_LOG_EP_CTRL, "fi_control", ret);
 			return ret;
+		}
 	}
 
 	if (ep->id->route.addr.src_addr.sa_family == AF_IB)
@@ -191,9 +196,7 @@ vrb_msg_ep_connect(struct fid_ep *ep_fid, const void *addr,
 
 	if (rdma_resolve_route(ep->id, VERBS_RESOLVE_TIMEOUT)) {
 		ret = -errno;
-		FI_WARN(&vrb_prov, FI_LOG_EP_CTRL,
-			"rdma_resolve_route failed: %s (%d)\n",
-			strerror(-ret), -ret);
+		VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_resolve_route");
 		free(ep->cm_priv_data);
 		ep->cm_priv_data = NULL;
 		return ret;
@@ -216,8 +219,10 @@ vrb_msg_ep_accept(struct fid_ep *ep, const void *param, size_t paramlen)
 
 	if (!_ep->id->qp) {
 		ret = fi_control(&ep->fid, FI_ENABLE, NULL);
-		if (ret)
+		if (ret) {
+			VRB_WARN_ERR(FI_LOG_EP_CTRL, "fi_control", ret);
 			return ret;
+		}
 	}
 
 	cm_hdr = alloca(sizeof(*cm_hdr) + paramlen);
@@ -229,8 +234,10 @@ vrb_msg_ep_accept(struct fid_ep *ep, const void *param, size_t paramlen)
 		conn_param.srq = 1;
 
 	ret = rdma_accept(_ep->id, &conn_param);
-	if (ret)
+	if (ret) {
+		VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_accept");
 		return -errno;
+	}
 
 	connreq = container_of(_ep->info_attr.handle, struct vrb_connreq, handle);
 	free(connreq);
@@ -247,14 +254,14 @@ static int vrb_msg_alloc_xrc_params(void **adjusted_param,
 	*adjusted_param = NULL;
 
 	if (cm_datalen > VRB_CM_DATA_SIZE) {
-		VERBS_WARN(FI_LOG_EP_CTRL, "XRC CM data overflow %zu\n",
+		VRB_WARN(FI_LOG_EP_CTRL, "XRC CM data overflow %zu\n",
 			   cm_datalen);
 		return -FI_EINVAL;
 	}
 
 	cm_data = malloc(cm_datalen);
 	if (!cm_data) {
-		VERBS_WARN(FI_LOG_EP_CTRL, "Unable to allocate XRC CM data\n");
+		VRB_WARN(FI_LOG_EP_CTRL, "Unable to allocate XRC CM data\n");
 		return -FI_ENOMEM;
 	}
 
@@ -303,14 +310,20 @@ vrb_msg_ep_reject(struct fid_pep *pep, fid_t handle,
 	vrb_msg_ep_prepare_cm_data(param, paramlen, cm_hdr);
 
 	fastlock_acquire(&_pep->eq->lock);
-	if (connreq->is_xrc)
+	if (connreq->is_xrc) {
 		ret = vrb_msg_xrc_ep_reject(connreq, cm_hdr,
 				(uint8_t)(sizeof(*cm_hdr) + paramlen));
-	else
+	} else if (connreq->id) {
 		ret = rdma_reject(connreq->id, cm_hdr,
 			(uint8_t)(sizeof(*cm_hdr) + paramlen)) ? -errno : 0;
+	} else {
+		ret = -FI_EBUSY;
+	}
 	fastlock_release(&_pep->eq->lock);
 
+	if (ret)
+		VRB_WARN_ERR(FI_LOG_EP_CTRL, "rdma_reject", ret);
+
 	free(connreq);
 	return ret;
 }
@@ -343,7 +356,7 @@ vrb_msg_xrc_cm_common_verify(struct vrb_xrc_ep *ep, size_t paramlen)
 	int ret;
 
 	if (!vrb_is_xrc_ep(&ep->base_ep)) {
-		VERBS_WARN(FI_LOG_EP_CTRL, "EP is not using XRC\n");
+		VRB_WARN(FI_LOG_EP_CTRL, "EP is not using XRC\n");
 		return -FI_EINVAL;
 	}
 
@@ -392,7 +405,7 @@ vrb_msg_xrc_ep_connect(struct fid_ep *ep, const void *addr,
 
 	xrc_ep->conn_setup = calloc(1, sizeof(*xrc_ep->conn_setup));
 	if (!xrc_ep->conn_setup) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "Unable to allocate connection setup memory\n");
 		free(adjusted_param);
 		free(cm_hdr);
@@ -461,7 +474,7 @@ static int vrb_pep_setname(fid_t pep_fid, void *addr, size_t addrlen)
 	pep = container_of(pep_fid, struct vrb_pep, pep_fid);
 
 	if (pep->src_addrlen && (addrlen != pep->src_addrlen)) {
-		VERBS_INFO(FI_LOG_FABRIC, "addrlen expected: %zu, got: %zu.\n",
+		VRB_INFO(FI_LOG_FABRIC, "addrlen expected: %zu, got: %zu.\n",
 			   pep->src_addrlen, addrlen);
 		return -FI_EINVAL;
 	}
@@ -470,22 +483,19 @@ static int vrb_pep_setname(fid_t pep_fid, void *addr, size_t addrlen)
 	if (pep->bound) {
 		ret = rdma_destroy_id(pep->id);
 		if (ret) {
-			VERBS_INFO(FI_LOG_FABRIC,
-				   "Unable to destroy previous rdma_cm_id\n");
+			VRB_WARN_ERRNO(FI_LOG_FABRIC, "rdma_destroy_id");
 			return -errno;
 		}
 		ret = rdma_create_id(NULL, &pep->id, &pep->pep_fid.fid, RDMA_PS_TCP);
 		if (ret) {
-			VERBS_INFO(FI_LOG_FABRIC,
-				   "Unable to create rdma_cm_id\n");
+			VRB_WARN_ERRNO(FI_LOG_FABRIC, "rdma_cm_id\n");
 			return -errno;
 		}
 	}
 
 	ret = rdma_bind_addr(pep->id, (struct sockaddr *)addr);
 	if (ret) {
-		VERBS_INFO(FI_LOG_FABRIC,
-			   "Unable to bind address to rdma_cm_id\n");
+		VRB_WARN_ERRNO(FI_LOG_FABRIC, "rdma_bind_addr");
 		return -errno;
 	}
 
diff --git a/deps/libfabric/prov/verbs/src/verbs_cm_xrc.c b/deps/libfabric/prov/verbs/src/verbs_cm_xrc.c
index c88bd8fbce62e5f706a09e7b05fe62fc0461573a..8bda28d25d7331c8f3ce3d8b5cda5e9863f9fb71 100644
--- a/deps/libfabric/prov/verbs/src/verbs_cm_xrc.c
+++ b/deps/libfabric/prov/verbs/src/verbs_cm_xrc.c
@@ -54,7 +54,7 @@ void vrb_next_xrc_conn_state(struct vrb_xrc_ep *ep)
 		break;
 	default:
 		assert(0);
-		VERBS_WARN(FI_LOG_EP_CTRL, "Unkown XRC connection state %d\n",
+		VRB_WARN(FI_LOG_EP_CTRL, "Unkown XRC connection state %d\n",
 			   ep->conn_state);
 	}
 }
@@ -80,7 +80,7 @@ void vrb_prev_xrc_conn_state(struct vrb_xrc_ep *ep)
 		break;
 	default:
 		assert(0);
-		VERBS_WARN(FI_LOG_EP_CTRL, "Unkown XRC connection state %d\n",
+		VRB_WARN(FI_LOG_EP_CTRL, "Unkown XRC connection state %d\n",
 			   ep->conn_state);
 	}
 }
@@ -109,13 +109,13 @@ int vrb_verify_xrc_cm_data(struct vrb_xrc_cm_data *remote,
 			      int private_data_len)
 {
 	if (sizeof(*remote) > private_data_len) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "XRC MSG EP CM data length mismatch\n");
 		return -FI_EINVAL;
 	}
 
 	if (remote->version != VRB_XRC_VERSION) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "XRC MSG EP connection protocol mismatch "
 			   "(local %"PRIu8", remote %"PRIu8")\n",
 			   VRB_XRC_VERSION, remote->version);
@@ -124,7 +124,7 @@ int vrb_verify_xrc_cm_data(struct vrb_xrc_cm_data *remote,
 	return FI_SUCCESS;
 }
 
-void vrb_log_ep_conn(struct vrb_xrc_ep *ep, char *desc)
+static void vrb_log_ep_conn(struct vrb_xrc_ep *ep, char *desc)
 {
 	struct sockaddr *addr;
 	char buf[OFI_ADDRSTRLEN];
@@ -133,38 +133,41 @@ void vrb_log_ep_conn(struct vrb_xrc_ep *ep, char *desc)
 	if (!fi_log_enabled(&vrb_prov, FI_LOG_INFO, FI_LOG_EP_CTRL))
 		return;
 
-	VERBS_INFO(FI_LOG_EP_CTRL, "EP %p, %s\n", ep, desc);
-	VERBS_INFO(FI_LOG_EP_CTRL,
+	VRB_INFO(FI_LOG_EP_CTRL, "EP %p, %s\n", (void *) ep, desc);
+	VRB_INFO(FI_LOG_EP_CTRL,
 		  "EP %p, CM ID %p, TGT CM ID %p, SRQN %d Peer SRQN %d\n",
-		  ep, ep->base_ep.id, ep->tgt_id, ep->srqn, ep->peer_srqn);
+		  (void *) ep, (void *) ep->base_ep.id, (void *) ep->tgt_id,
+		  ep->srqn, ep->peer_srqn);
 
 
 	if (ep->base_ep.id) {
 		addr = rdma_get_local_addr(ep->base_ep.id);
 		len = sizeof(buf);
 		ofi_straddr(buf, &len, ep->base_ep.info_attr.addr_format, addr);
-		VERBS_INFO(FI_LOG_EP_CTRL, "EP %p src_addr: %s\n", ep, buf);
+		VRB_INFO(FI_LOG_EP_CTRL, "EP %p src_addr: %s\n",
+			   (void *) ep, buf);
 
 		addr = rdma_get_peer_addr(ep->base_ep.id);
 		len = sizeof(buf);
 		ofi_straddr(buf, &len, ep->base_ep.info_attr.addr_format, addr);
-		VERBS_INFO(FI_LOG_EP_CTRL, "EP %p dst_addr: %s\n", ep, buf);
+		VRB_INFO(FI_LOG_EP_CTRL, "EP %p dst_addr: %s\n",
+			   (void *) ep, buf);
 	}
 
 	if (ep->base_ep.ibv_qp) {
-		VERBS_INFO(FI_LOG_EP_CTRL, "EP %p, INI QP Num %d\n",
-			  ep, ep->base_ep.ibv_qp->qp_num);
-		VERBS_INFO(FI_LOG_EP_CTRL, "EP %p, Remote TGT QP Num %d\n", ep,
-			  ep->ini_conn->tgt_qpn);
+		VRB_INFO(FI_LOG_EP_CTRL, "EP %p, INI QP Num %d\n",
+			   (void *) ep, ep->base_ep.ibv_qp->qp_num);
+		VRB_INFO(FI_LOG_EP_CTRL, "EP %p, Remote TGT QP Num %d\n",
+			   (void *) ep, ep->ini_conn->tgt_qpn);
 	}
 	if (ep->tgt_ibv_qp)
-		VERBS_INFO(FI_LOG_EP_CTRL, "EP %p, TGT QP Num %d\n",
-			  ep, ep->tgt_ibv_qp->qp_num);
+		VRB_INFO(FI_LOG_EP_CTRL, "EP %p, TGT QP Num %d\n",
+			   (void *) ep, ep->tgt_ibv_qp->qp_num);
 }
 
-/* Caller must hold eq:lock */
 void vrb_free_xrc_conn_setup(struct vrb_xrc_ep *ep, int disconnect)
 {
+	assert(fastlock_held(&ep->base_ep.eq->lock));
 	assert(ep->conn_setup);
 
 	/* If a disconnect is requested then the XRC bidirectional connection
@@ -198,19 +201,19 @@ void vrb_free_xrc_conn_setup(struct vrb_xrc_ep *ep, int disconnect)
 	}
 }
 
-/* Caller must hold the eq:lock */
 int vrb_connect_xrc(struct vrb_xrc_ep *ep, struct sockaddr *addr,
 		       int reciprocal, void *param, size_t paramlen)
 {
 	struct vrb_domain *domain = vrb_ep_to_domain(&ep->base_ep);
 	int ret;
 
+	assert(fastlock_held(&ep->base_ep.eq->lock));
 	assert(!ep->base_ep.id && !ep->base_ep.ibv_qp && !ep->ini_conn);
 
 	domain->xrc.lock_acquire(&domain->xrc.ini_lock);
 	ret = vrb_get_shared_ini_conn(ep, &ep->ini_conn);
 	if (ret) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "Get of shared XRC INI connection failed %d\n", ret);
 		if (!reciprocal) {
 			free(ep->conn_setup);
@@ -228,11 +231,11 @@ int vrb_connect_xrc(struct vrb_xrc_ep *ep, struct sockaddr *addr,
 	return FI_SUCCESS;
 }
 
-/* Caller must hold the eq:lock */
 void vrb_ep_ini_conn_done(struct vrb_xrc_ep *ep, uint32_t tgt_qpn)
 {
 	struct vrb_domain *domain = vrb_ep_to_domain(&ep->base_ep);
 
+	assert(fastlock_held(&ep->base_ep.eq->lock));
 	assert(ep->base_ep.id && ep->ini_conn);
 
 	domain->xrc.lock_acquire(&domain->xrc.ini_lock);
@@ -247,7 +250,7 @@ void vrb_ep_ini_conn_done(struct vrb_xrc_ep *ep, uint32_t tgt_qpn)
 		ep->ini_conn->state = VRB_INI_QP_CONNECTED;
 		ep->ini_conn->tgt_qpn = tgt_qpn;
 		ep->base_ep.id->qp = NULL;
-		VERBS_DBG(FI_LOG_EP_CTRL,
+		VRB_DBG(FI_LOG_EP_CTRL,
 			  "Set INI Conn QP %d remote TGT QP %d\n",
 			  ep->ini_conn->ini_qp->qp_num,
 			  ep->ini_conn->tgt_qpn);
@@ -258,9 +261,9 @@ void vrb_ep_ini_conn_done(struct vrb_xrc_ep *ep, uint32_t tgt_qpn)
 	domain->xrc.lock_release(&domain->xrc.ini_lock);
 }
 
-/* Caller must hold the eq:lock */
 void vrb_ep_ini_conn_rejected(struct vrb_xrc_ep *ep)
 {
+	assert(fastlock_held(&ep->base_ep.eq->lock));
 	assert(ep->base_ep.id && ep->ini_conn);
 
 	vrb_log_ep_conn(ep, "INI Connection Rejected");
@@ -278,14 +281,14 @@ void vrb_ep_tgt_conn_done(struct vrb_xrc_ep *ep)
 	}
 }
 
-/* Caller must hold the eq:lock */
 int vrb_resend_shared_accept_xrc(struct vrb_xrc_ep *ep,
-				    struct vrb_connreq *connreq,
-				    struct rdma_cm_id *id)
+				 struct vrb_connreq *connreq,
+				 struct rdma_cm_id *id)
 {
 	struct rdma_conn_param conn_param = { 0 };
 	struct vrb_xrc_cm_data *cm_data = ep->accept_param_data;
 
+	assert(fastlock_held(&ep->base_ep.eq->lock));
 	assert(cm_data && ep->tgt_ibv_qp);
 	assert(ep->tgt_ibv_qp->qp_num == connreq->xrc.tgt_qpn);
 	assert(ep->peer_srqn == connreq->xrc.peer_srqn);
@@ -307,9 +310,8 @@ int vrb_resend_shared_accept_xrc(struct vrb_xrc_ep *ep,
 	return rdma_accept(id, &conn_param);
 }
 
-/* Caller must hold the eq:lock */
 int vrb_accept_xrc(struct vrb_xrc_ep *ep, int reciprocal,
-		      void *param, size_t paramlen)
+		   void *param, size_t paramlen)
 {
 	struct sockaddr *addr;
 	struct vrb_connreq *connreq;
@@ -318,6 +320,7 @@ int vrb_accept_xrc(struct vrb_xrc_ep *ep, int reciprocal,
 	struct vrb_xrc_cm_data connect_cm_data;
 	int ret;
 
+	assert(fastlock_held(&ep->base_ep.eq->lock));
 	addr = rdma_get_local_addr(ep->tgt_id);
 	if (addr)
 		ofi_straddr_dbg(&vrb_prov, FI_LOG_CORE, "src_addr", addr);
@@ -359,7 +362,7 @@ int vrb_accept_xrc(struct vrb_xrc_ep *ep, int reciprocal,
 	ret = rdma_accept(ep->tgt_id, &conn_param);
 	if (OFI_UNLIKELY(ret)) {
 		ret = -errno;
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "XRC TGT, rdma_accept error %d\n", ret);
 		vrb_prev_xrc_conn_state(ep);
 		return ret;
@@ -368,7 +371,7 @@ int vrb_accept_xrc(struct vrb_xrc_ep *ep, int reciprocal,
 
 	if (ep->tgt_id->ps == RDMA_PS_UDP &&
 	    vrb_eq_add_sidr_conn(ep, cm_data, paramlen))
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "SIDR connection accept not added to map\n");
 
 	/* The passive side of the initial shared connection using
@@ -380,7 +383,7 @@ int vrb_accept_xrc(struct vrb_xrc_ep *ep, int reciprocal,
 					 &connect_cm_data,
 					 sizeof(connect_cm_data));
 		if (ret) {
-			VERBS_WARN(FI_LOG_EP_CTRL,
+			VRB_WARN(FI_LOG_EP_CTRL,
 				   "XRC reciprocal connect error %d\n", ret);
 			vrb_prev_xrc_conn_state(ep);
 			ep->tgt_id->qp = NULL;
@@ -401,7 +404,7 @@ int vrb_process_xrc_connreq(struct vrb_ep *ep,
 
 	xrc_ep->conn_setup = calloc(1, sizeof(*xrc_ep->conn_setup));
 	if (!xrc_ep->conn_setup) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			  "Unable to allocate connection setup memory\n");
 		return -FI_ENOMEM;
 	}
diff --git a/deps/libfabric/prov/verbs/src/verbs_cq.c b/deps/libfabric/prov/verbs/src/verbs_cq.c
index 14d91bab908eed2eb1f5e26449b194ad6e255f9b..541982417c33372ed7059d5a499346b93ae6902d 100644
--- a/deps/libfabric/prov/verbs/src/verbs_cq.c
+++ b/deps/libfabric/prov/verbs/src/verbs_cq.c
@@ -180,7 +180,7 @@ vrb_poll_events(struct vrb_cq *_cq, int timeout)
 		rc--;
 	}
 	if (rc) {
-		VERBS_WARN(FI_LOG_CQ, "Unknown poll error: check revents\n");
+		VRB_WARN(FI_LOG_CQ, "Unknown poll error: check revents\n");
 		return -FI_EOTHER;
 	}
 
@@ -226,12 +226,12 @@ vrb_cq_sread(struct fid_cq *cq, void *buf, size_t count, const void *cond,
 	return cur ? cur : ret;
 }
 
-/* Must be called with CQ lock held. */
 int vrb_poll_cq(struct vrb_cq *cq, struct ibv_wc *wc)
 {
 	struct vrb_context *ctx;
 	int ret;
 
+	assert(fastlock_held(&cq->util_cq.cq_lock));
 	do {
 		ret = ibv_poll_cq(cq->cq, 1, wc);
 		if (ret <= 0)
@@ -263,11 +263,11 @@ int vrb_poll_cq(struct vrb_cq *cq, struct ibv_wc *wc)
 	return ret;
 }
 
-/* Must be called with CQ lock held. */
 int vrb_save_wc(struct vrb_cq *cq, struct ibv_wc *wc)
 {
 	struct vrb_wc_entry *wce;
 
+	assert(fastlock_held(&cq->util_cq.cq_lock));
 	wce = ofi_buf_alloc(cq->wce_pool);
 	if (!wce) {
 		FI_WARN(&vrb_prov, FI_LOG_CQ,
@@ -377,7 +377,7 @@ int vrb_cq_signal(struct fid_cq *cq)
 	_cq = container_of(cq, struct vrb_cq, util_cq.cq_fid);
 
 	if (write(_cq->signal_fd[1], &data, 1) != 1) {
-		VERBS_WARN(FI_LOG_CQ, "Error signalling CQ\n");
+		VRB_WARN(FI_LOG_CQ, "Error signalling CQ\n");
 		return -errno;
 	}
 
@@ -391,7 +391,7 @@ int vrb_cq_trywait(struct vrb_cq *cq)
 	int ret = -FI_EAGAIN, rc;
 
 	if (!cq->channel) {
-		VERBS_WARN(FI_LOG_CQ, "No wait object object associated with CQ\n");
+		VRB_WARN(FI_LOG_CQ, "No wait object object associated with CQ\n");
 		return -FI_EINVAL;
 	}
 
@@ -411,7 +411,7 @@ int vrb_cq_trywait(struct vrb_cq *cq)
 
 	rc = ibv_req_notify_cq(cq->cq, 0);
 	if (rc) {
-		VERBS_WARN(FI_LOG_CQ, "ibv_req_notify_cq error: %d\n", ret);
+		VRB_WARN(FI_LOG_CQ, "ibv_req_notify_cq error: %d\n", ret);
 		ret = -errno;
 		goto out;
 	}
@@ -568,6 +568,7 @@ int vrb_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr,
 	size_t size;
 	int ret;
 	struct fi_cq_attr tmp_attr = *attr;
+	int comp_vector = 0;
 
 	cq = calloc(1, sizeof(*cq));
 	if (!cq)
@@ -594,11 +595,24 @@ int vrb_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr,
 		goto err4;
 	}
 
+	if (attr->flags & FI_AFFINITY) {
+		if (attr->signaling_vector < 0 ||
+		    attr->signaling_vector >= domain->verbs->num_comp_vectors) {
+
+			VRB_WARN(FI_LOG_CQ,
+				   "Invalid value for the CQ attribute signaling_vector: %d\n",
+				   attr->signaling_vector);
+			ret = -FI_EINVAL;
+			goto err4;
+		}
+		comp_vector = attr->signaling_vector;
+	}
+
 	if (cq->wait_obj != FI_WAIT_NONE) {
 		cq->channel = ibv_create_comp_channel(domain->verbs);
 		if (!cq->channel) {
 			ret = -errno;
-			VERBS_WARN(FI_LOG_CQ,
+			VRB_WARN(FI_LOG_CQ,
 				   "Unable to create completion channel\n");
 			goto err2;
 		}
@@ -629,17 +643,17 @@ int vrb_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr,
 	 * num_qp_per_cq = ibv_device_attr->max_cqe / (qp_send_wr + qp_recv_wr)
 	 */
 	cq->cq = ibv_create_cq(domain->verbs, size, cq, cq->channel,
-			       attr->signaling_vector);
+			       comp_vector);
 	if (!cq->cq) {
 		ret = -errno;
-		VERBS_WARN(FI_LOG_CQ, "Unable to create verbs CQ\n");
+		VRB_WARN(FI_LOG_CQ, "Unable to create verbs CQ\n");
 		goto err4;
 	}
 
 	if (cq->channel) {
 		ret = ibv_req_notify_cq(cq->cq, 0);
 		if (ret) {
-			VERBS_WARN(FI_LOG_CQ,
+			VRB_WARN(FI_LOG_CQ,
 				   "ibv_req_notify_cq failed\n");
 			goto err5;
 		}
@@ -648,7 +662,7 @@ int vrb_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr,
 	ret = ofi_bufpool_create(&cq->wce_pool, sizeof(struct vrb_wc_entry),
 				16, 0, VERBS_WCE_CNT, 0);
 	if (ret) {
-		VERBS_WARN(FI_LOG_CQ, "Failed to create wce_pool\n");
+		VRB_WARN(FI_LOG_CQ, "Failed to create wce_pool\n");
 		goto err5;
 	}
 
diff --git a/deps/libfabric/prov/verbs/src/verbs_dgram_av.c b/deps/libfabric/prov/verbs/src/verbs_dgram_av.c
index ce0f710f33d87bb65b9291912efb3694af516c8f..c6c5998b35008614c447a8b812a2a1ed2e979348 100644
--- a/deps/libfabric/prov/verbs/src/verbs_dgram_av.c
+++ b/deps/libfabric/prov/verbs/src/verbs_dgram_av.c
@@ -43,12 +43,12 @@ static inline int
 vrb_dgram_verify_av_flags(struct util_av *av, uint64_t flags)
 {
 	if ((av->flags & FI_EVENT) && !av->eq) {
-		VERBS_WARN(FI_LOG_AV, "No EQ bound to AV\n");
+		VRB_WARN(FI_LOG_AV, "No EQ bound to AV\n");
 		return -FI_ENOEQ;
 	}
 
 	if (flags & ~(FI_MORE)) {
-		VERBS_WARN(FI_LOG_AV, "Unsupported flags\n");
+		VRB_WARN(FI_LOG_AV, "Unsupported flags\n");
 		return -FI_ENOEQ;
 	}
 
@@ -79,21 +79,21 @@ vrb_dgram_av_insert_addr(struct vrb_dgram_av *av, const void *addr,
 		ah_attr.grh.sgid_index = vrb_gl_data.gid_idx;
 	} else if (OFI_UNLIKELY(!vrb_dgram_av_is_addr_valid(av, addr))) {
 		ret = -FI_EADDRNOTAVAIL;
-		VERBS_WARN(FI_LOG_AV, "Invalid address\n");
+		VRB_WARN(FI_LOG_AV, "Invalid address\n");
 		goto fn1;
 	}
 
 	av_entry = calloc(1, sizeof(*av_entry));
 	if (OFI_UNLIKELY(!av_entry)) {
 		ret = -FI_ENOMEM;
-		VERBS_WARN(FI_LOG_AV, "Unable to allocate memory for AV entry\n");
+		VRB_WARN(FI_LOG_AV, "Unable to allocate memory for AV entry\n");
 		goto fn1;
 	}
 
 	av_entry->ah = ibv_create_ah(domain->pd, &ah_attr);
 	if (OFI_UNLIKELY(!av_entry->ah)) {
 		ret = -errno;
-		VERBS_WARN(FI_LOG_AV,
+		VRB_WARN(FI_LOG_AV,
 			   "Unable to create Address Handle, errno - %d\n", errno);
 		goto fn2;
 	}
@@ -124,7 +124,7 @@ static int vrb_dgram_av_insert(struct fid_av *av_fid, const void *addr,
 	if (ret)
 		return ret;
 
-	VERBS_DBG(FI_LOG_AV, "Inserting %"PRIu64" addresses\n", count);
+	VRB_DBG(FI_LOG_AV, "Inserting %"PRIu64" addresses\n", count);
 	for (i = 0; i < count; i++) {
 		ret = vrb_dgram_av_insert_addr(
 				av, (struct ofi_ib_ud_ep_name *)addr + i,
@@ -133,7 +133,7 @@ static int vrb_dgram_av_insert(struct fid_av *av_fid, const void *addr,
 			success_cnt++;
 	}
 
-	VERBS_DBG(FI_LOG_AV,
+	VRB_DBG(FI_LOG_AV,
 		  "%d addresses were inserted successfully\n", success_cnt);
 	return success_cnt;
 }
@@ -143,7 +143,7 @@ vrb_dgram_av_remove_addr(struct vrb_dgram_av_entry *av_entry)
 {
 	int ret = ibv_destroy_ah(av_entry->ah);
 	if (ret)
-		VERBS_WARN(FI_LOG_AV,
+		VRB_WARN(FI_LOG_AV,
 			   "AH Destroying failed with status - %d\n",
 			   ret);
 	dlist_remove(&av_entry->list_entry);
diff --git a/deps/libfabric/prov/verbs/src/verbs_dgram_ep_msg.c b/deps/libfabric/prov/verbs/src/verbs_dgram_ep_msg.c
index d71b575109098b0d75175dc695f7268af908da9c..e018d93ddda0b51ac1533b50e57cdc2c4ed59a2e 100644
--- a/deps/libfabric/prov/verbs/src/verbs_dgram_ep_msg.c
+++ b/deps/libfabric/prov/verbs/src/verbs_dgram_ep_msg.c
@@ -59,7 +59,7 @@ vrb_dgram_ep_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg,
 		.next = NULL,
 	};
 
-	vrb_set_sge_iov(wr.sg_list, msg->msg_iov, msg->iov_count, msg->desc);
+	vrb_iov_dupa(wr.sg_list, msg->msg_iov, msg->desc, msg->iov_count);
 	return vrb_post_recv(ep, &wr);
 }
 
@@ -111,7 +111,8 @@ vrb_dgram_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg,
 	if (vrb_dgram_ep_set_addr(ep, msg->addr, &wr))
 		return -FI_ENOENT;
 
-	return vrb_send_msg(ep, &wr, msg, flags);
+	return vrb_send_iov(ep, &wr, msg->msg_iov, msg->desc,
+			    msg->iov_count, flags);
 }
 
 static inline ssize_t
@@ -129,7 +130,8 @@ vrb_dgram_ep_sendv(struct fid_ep *ep_fid, const struct iovec *iov,
 	if (vrb_dgram_ep_set_addr(ep, dest_addr, &wr))
 		return -FI_ENOENT;
 
-	return vrb_send_iov(ep, &wr, iov, desc, count);
+	return vrb_send_iov(ep, &wr, iov, desc, count,
+			    ep->util_ep.tx_op_flags);
 }
 
 static ssize_t
@@ -141,7 +143,7 @@ vrb_dgram_ep_send(struct fid_ep *ep_fid, const void *buf, size_t len,
 	struct ibv_send_wr wr = {
 		.wr_id = VERBS_COMP(ep, (uintptr_t)context),
 		.opcode = IBV_WR_SEND,
-		.send_flags = VERBS_INJECT(ep, len),
+		.send_flags = VERBS_INJECT(ep, len, desc),
 	};
 
 	if (vrb_dgram_ep_set_addr(ep, dest_addr, &wr))
@@ -160,10 +162,11 @@ vrb_dgram_ep_senddata(struct fid_ep *ep_fid, const void *buf,
 	struct ibv_send_wr wr = {
 		.wr_id = VERBS_COMP(ep, (uintptr_t)context),
 		.opcode = IBV_WR_SEND_WITH_IMM,
-		.imm_data = htonl((uint32_t)data),
-		.send_flags = VERBS_INJECT(ep, len),
+		.send_flags = VERBS_INJECT(ep, len, desc),
 	};
 
+	wr.imm_data = htonl((uint32_t)data);
+
 	if (vrb_dgram_ep_set_addr(ep, dest_addr, &wr))
 		return -FI_ENOENT;
 
@@ -179,14 +182,15 @@ vrb_dgram_ep_injectdata(struct fid_ep *ep_fid, const void *buf, size_t len,
 	struct ibv_send_wr wr = {
 		.wr_id = VERBS_NO_COMP_FLAG,
 		.opcode = IBV_WR_SEND_WITH_IMM,
-		.imm_data = htonl((uint32_t)data),
 		.send_flags = IBV_SEND_INLINE,
 	};
 
+	wr.imm_data = htonl((uint32_t)data);
+
 	if (vrb_dgram_ep_set_addr(ep, dest_addr, &wr))
 		return -FI_ENOENT;
 
-	return vrb_send_buf_inline(ep, &wr, buf, len);
+	return vrb_send_buf(ep, &wr, buf, len, NULL);
 }
 
 static ssize_t
@@ -226,7 +230,7 @@ vrb_dgram_ep_inject(struct fid_ep *ep_fid, const void *buf, size_t len,
 	if (vrb_dgram_ep_set_addr(ep, dest_addr, &wr))
 		return -FI_ENOENT;
 
-	return vrb_send_buf_inline(ep, &wr, buf, len);
+	return vrb_send_buf(ep, &wr, buf, len, NULL);
 }
 
 static ssize_t
diff --git a/deps/libfabric/prov/verbs/src/verbs_domain.c b/deps/libfabric/prov/verbs/src/verbs_domain.c
index b64f96d096b951116c9b9ba01390a25cb2614f75..c105c84cc8c2783faf7c2bb4f99fd4ab06b11b62 100644
--- a/deps/libfabric/prov/verbs/src/verbs_domain.c
+++ b/deps/libfabric/prov/verbs/src/verbs_domain.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2013-2015 Intel Corporation, Inc.  All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -168,7 +169,7 @@ static int vrb_domain_close(fid_t fid)
 			ofi_ns_stop_server(&fab->name_server);
 		break;
 	case FI_EP_MSG:
-		if (domain->flags & VRB_USE_XRC) {
+		if (domain->ext_flags & VRB_USE_XRC) {
 			ret = vrb_domain_xrc_cleanup(domain);
 			if (ret)
 				return ret;
@@ -214,7 +215,7 @@ static int vrb_open_device_by_name(struct vrb_domain *domain, const char *name)
 		const char *rdma_name = ibv_get_device_name(dev_list[i]->device);
 		switch (domain->ep_type) {
 		case FI_EP_MSG:
-			ret = domain->flags & VRB_USE_XRC ?
+			ret = domain->ext_flags & VRB_USE_XRC ?
 				vrb_cmp_xrc_domain_name(name, rdma_name) :
 				strcmp(name, rdma_name);
 			break;
@@ -223,7 +224,7 @@ static int vrb_open_device_by_name(struct vrb_domain *domain, const char *name)
 				      strlen(name) - strlen(verbs_dgram_domain.suffix));
 			break;
 		default:
-			VERBS_WARN(FI_LOG_DOMAIN,
+			VRB_WARN(FI_LOG_DOMAIN,
 				   "Unsupported EP type - %d\n", domain->ep_type);
 			/* Never should go here */
 			assert(0);
@@ -280,7 +281,11 @@ vrb_domain(struct fid_fabric *fabric, struct fi_info *info,
 {
 	struct ofi_mem_monitor *memory_monitors[OFI_HMEM_MAX] = {
 		[FI_HMEM_SYSTEM] = default_monitor,
+		[FI_HMEM_CUDA] = default_cuda_monitor,
+		[FI_HMEM_ROCR] = default_rocr_monitor,
+		[FI_HMEM_ZE] = default_ze_monitor,
 	};
+	enum fi_hmem_iface iface;
 	struct vrb_domain *_domain;
 	int ret;
 	struct vrb_fabric *fab =
@@ -309,7 +314,7 @@ vrb_domain(struct fid_fabric *fabric, struct fi_info *info,
 		goto err2;
 
 	_domain->ep_type = VRB_EP_TYPE(info);
-	_domain->flags |= vrb_is_xrc_info(info) ? VRB_USE_XRC : 0;
+	_domain->ext_flags |= vrb_is_xrc_info(info) ? VRB_USE_XRC : 0;
 
 	ret = vrb_open_device_by_name(_domain, info->domain_attr->name);
 	if (ret)
@@ -321,20 +326,29 @@ vrb_domain(struct fid_fabric *fabric, struct fi_info *info,
 		goto err3;
 	}
 
-	_domain->flags |= vrb_odp_flag(_domain->verbs);
+	_domain->ext_flags |= vrb_odp_flag(_domain->verbs);
 	_domain->util_domain.domain_fid.fid.fclass = FI_CLASS_DOMAIN;
 	_domain->util_domain.domain_fid.fid.context = context;
 	_domain->util_domain.domain_fid.fid.ops = &vrb_fid_ops;
+	_domain->util_domain.domain_fid.mr = &vrb_mr_ops;
 
 	_domain->cache.entry_data_size = sizeof(struct vrb_mem_desc);
 	_domain->cache.add_region = vrb_mr_cache_add_region;
 	_domain->cache.delete_region = vrb_mr_cache_delete_region;
 	ret = ofi_mr_cache_init(&_domain->util_domain, memory_monitors,
 				&_domain->cache);
-	if (!ret)
-		_domain->util_domain.domain_fid.mr = &vrb_mr_cache_ops;
-	else
-		_domain->util_domain.domain_fid.mr = &vrb_mr_ops;
+	if (ret) {
+		VRB_INFO(FI_LOG_MR,
+			   "MR cache init failed: %s. MR caching disabled.\n",
+			   fi_strerror(-ret));
+	} else {
+		for (iface = 0; iface < OFI_HMEM_MAX; iface++) {
+			if (_domain->cache.monitors[iface])
+				VRB_INFO(FI_LOG_MR,
+					   "MR cache enabled for %s memory\n",
+					   fi_tostr(&iface, FI_TYPE_HMEM_IFACE));
+		}
+	}
 
 	switch (_domain->ep_type) {
 	case FI_EP_DGRAM:
@@ -356,7 +370,7 @@ vrb_domain(struct fid_fabric *fabric, struct fi_info *info,
 		_domain->util_domain.domain_fid.ops = &vrb_dgram_domain_ops;
 		break;
 	case FI_EP_MSG:
-		if (_domain->flags & VRB_USE_XRC) {
+		if (_domain->ext_flags & VRB_USE_XRC) {
 			ret = vrb_domain_xrc_init(_domain);
 			if (ret)
 				goto err4;
@@ -364,7 +378,7 @@ vrb_domain(struct fid_fabric *fabric, struct fi_info *info,
 		_domain->util_domain.domain_fid.ops = &vrb_msg_domain_ops;
 		break;
 	default:
-		VERBS_INFO(FI_LOG_DOMAIN, "Ivalid EP type is provided, "
+		VRB_INFO(FI_LOG_DOMAIN, "Ivalid EP type is provided, "
 			   "EP type :%d\n", _domain->ep_type);
 		ret = -FI_EINVAL;
 		goto err4;
@@ -375,14 +389,12 @@ vrb_domain(struct fid_fabric *fabric, struct fi_info *info,
 err4:
 	ofi_mr_cache_cleanup(&_domain->cache);
 	if (ibv_dealloc_pd(_domain->pd))
-		VERBS_INFO_ERRNO(FI_LOG_DOMAIN,
-				 "ibv_dealloc_pd", errno);
+		VRB_WARN_ERRNO(FI_LOG_DOMAIN, "ibv_dealloc_pd");
 err3:
 	fi_freeinfo(_domain->info);
 err2:
 	if (ofi_domain_close(&_domain->util_domain))
-		VERBS_INFO(FI_LOG_DOMAIN,
-			   "ofi_domain_close fails");
+		VRB_WARN(FI_LOG_DOMAIN, "ofi_domain_close fails");
 err1:
 	free(_domain);
 	return ret;
diff --git a/deps/libfabric/prov/verbs/src/verbs_domain_xrc.c b/deps/libfabric/prov/verbs/src/verbs_domain_xrc.c
index 70f39a6de66c7cd66214a8b33eac19c6047befd2..8e366a2d5b47e352bae022c5426dbfaa900d2765 100644
--- a/deps/libfabric/prov/verbs/src/verbs_domain_xrc.c
+++ b/deps/libfabric/prov/verbs/src/verbs_domain_xrc.c
@@ -45,38 +45,6 @@ struct vrb_ini_conn_key {
 static int vrb_process_ini_conn(struct vrb_xrc_ep *ep,int reciprocal,
 				   void *param, size_t paramlen);
 
-/*
- * This routine is a work around that creates a QP for the only purpose of
- * reserving the QP number. The QP is not transitioned out of the RESET state.
- */
-int vrb_reserve_qpn(struct vrb_xrc_ep *ep, struct ibv_qp **qp)
-{
-	struct vrb_domain *domain = vrb_ep_to_domain(&ep->base_ep);
-	struct vrb_cq *cq = container_of(ep->base_ep.util_ep.tx_cq,
-					    struct vrb_cq, util_cq);
-	struct ibv_qp_init_attr attr = { 0 };
-	int ret;
-
-	/* Limit library allocated resources and do not INIT QP */
-	attr.cap.max_send_wr = 1;
-	attr.cap.max_send_sge = 1;
-	attr.cap.max_recv_wr = 0;
-	attr.cap.max_recv_sge = 0;
-	attr.cap.max_inline_data = 0;
-	attr.send_cq = cq->cq;
-	attr.recv_cq = cq->cq;
-	attr.qp_type = IBV_QPT_RC;
-
-	*qp = ibv_create_qp(domain->pd, &attr);
-	if (OFI_UNLIKELY(!*qp)) {
-		ret = -errno;
-		VERBS_WARN(FI_LOG_EP_CTRL,
-			   "Reservation QP create failed %d\n", -ret);
-		return ret;
-	}
-	return FI_SUCCESS;
-}
-
 static int vrb_create_ini_qp(struct vrb_xrc_ep *ep)
 {
 #if VERBS_HAVE_XRC
@@ -90,11 +58,12 @@ static int vrb_create_ini_qp(struct vrb_xrc_ep *ep)
 	attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD;
 	attr_ex.pd = domain->pd;
 	attr_ex.qp_context = domain;
+	attr_ex.srq = NULL;
 
 	ret = rdma_create_qp_ex(ep->base_ep.id, &attr_ex);
 	if (ret) {
 		ret = -errno;
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "XRC INI QP rdma_create_qp_ex failed %d\n", -ret);
 		return ret;
 	}
@@ -112,15 +81,15 @@ static inline void vrb_set_ini_conn_key(struct vrb_xrc_ep *ep,
 				  struct vrb_cq, util_cq);
 }
 
-/* Caller must hold domain:xrc.ini_lock */
 int vrb_get_shared_ini_conn(struct vrb_xrc_ep *ep,
-			       struct vrb_ini_shared_conn **ini_conn) {
+			    struct vrb_ini_shared_conn **ini_conn) {
 	struct vrb_domain *domain = vrb_ep_to_domain(&ep->base_ep);
 	struct vrb_ini_conn_key key;
 	struct vrb_ini_shared_conn *conn;
 	struct ofi_rbnode *node;
 	int ret;
 
+	assert(fastlock_held(&domain->xrc.ini_lock));
 	vrb_set_ini_conn_key(ep, &key);
 	node = ofi_rbmap_find(domain->xrc.ini_conn_rbmap, &key);
 	if (node) {
@@ -132,7 +101,7 @@ int vrb_get_shared_ini_conn(struct vrb_xrc_ep *ep,
 	*ini_conn = NULL;
 	conn = calloc(1, sizeof(*conn));
 	if (!conn) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "Unable to allocate INI connection memory\n");
 		return -FI_ENOMEM;
 	}
@@ -140,7 +109,7 @@ int vrb_get_shared_ini_conn(struct vrb_xrc_ep *ep,
 	conn->tgt_qpn = VRB_NO_INI_TGT_QPNUM;
 	conn->peer_addr = mem_dup(key.addr, ofi_sizeofaddr(key.addr));
 	if (!conn->peer_addr) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "mem_dup of peer address failed\n");
 		free(conn);
 		return -FI_ENOMEM;
@@ -155,7 +124,7 @@ int vrb_get_shared_ini_conn(struct vrb_xrc_ep *ep,
 			       (void *) &key, (void *) conn, NULL);
 	assert(ret != -FI_EALREADY);
 	if (ret) {
-		VERBS_WARN(FI_LOG_EP_CTRL, "INI QP RBTree insert failed %d\n",
+		VRB_WARN(FI_LOG_EP_CTRL, "INI QP RBTree insert failed %d\n",
 			   ret);
 		goto insert_err;
 	}
@@ -168,13 +137,13 @@ insert_err:
 	return ret;
 }
 
-/* Caller must hold domain:xrc.ini_lock */
 void _vrb_put_shared_ini_conn(struct vrb_xrc_ep *ep)
 {
 	struct vrb_domain *domain = vrb_ep_to_domain(&ep->base_ep);
 	struct vrb_ini_shared_conn *ini_conn;
 	struct vrb_ini_conn_key key;
 
+	assert(fastlock_held(&domain->xrc.ini_lock));
 	if (!ep->ini_conn)
 		return;
 
@@ -199,7 +168,7 @@ void _vrb_put_shared_ini_conn(struct vrb_xrc_ep *ep)
 	/* Tear down physical INI/TGT when no longer being used */
 	if (!ofi_atomic_dec32(&ini_conn->ref_cnt)) {
 		if (ini_conn->ini_qp && ibv_destroy_qp(ini_conn->ini_qp))
-			VERBS_WARN(FI_LOG_EP_CTRL,
+			VRB_WARN(FI_LOG_EP_CTRL,
 				   "Destroy of XRC physical INI QP failed %d\n",
 				   errno);
 
@@ -222,10 +191,11 @@ void vrb_put_shared_ini_conn(struct vrb_xrc_ep *ep)
 	domain->xrc.lock_release(&domain->xrc.ini_lock);
 }
 
-/* Caller must hold domain:xrc.ini_lock */
 void vrb_add_pending_ini_conn(struct vrb_xrc_ep *ep, int reciprocal,
-				 void *conn_param, size_t conn_paramlen)
+			      void *conn_param, size_t conn_paramlen)
 {
+	assert(fastlock_held(&vrb_ep_to_domain(&ep->base_ep)->xrc.ini_lock));
+
 	ep->conn_setup->pending_recip = reciprocal;
 	ep->conn_setup->pending_paramlen = MIN(conn_paramlen,
 				sizeof(ep->conn_setup->pending_param));
@@ -275,7 +245,7 @@ void vrb_sched_ini_conn(struct vrb_ini_shared_conn *ini_conn)
 				       RDMA_PS_TCP : RDMA_PS_UDP,
 				       &ep->base_ep.id);
 		if (ret) {
-			VERBS_WARN(FI_LOG_EP_CTRL,
+			VRB_WARN(FI_LOG_EP_CTRL,
 				   "Failed to create active CM ID %d\n",
 				   ret);
 			goto err;
@@ -286,12 +256,12 @@ void vrb_sched_ini_conn(struct vrb_ini_shared_conn *ini_conn)
 
 			if (ep->ini_conn->ini_qp &&
 			    ibv_destroy_qp(ep->ini_conn->ini_qp)) {
-				VERBS_WARN(FI_LOG_EP_CTRL, "Failed to destroy "
+				VRB_WARN(FI_LOG_EP_CTRL, "Failed to destroy "
 					   "physical INI QP %d\n", errno);
 			}
 			ret = vrb_create_ini_qp(ep);
 			if (ret) {
-				VERBS_WARN(FI_LOG_EP_CTRL, "Failed to create "
+				VRB_WARN(FI_LOG_EP_CTRL, "Failed to create "
 					   "physical INI QP %d\n", ret);
 				goto err;
 			}
@@ -300,7 +270,7 @@ void vrb_sched_ini_conn(struct vrb_ini_shared_conn *ini_conn)
 			ep->ini_conn->phys_conn_id = ep->base_ep.id;
 		} else {
 			assert(!ep->base_ep.id->qp);
-			VERBS_DBG(FI_LOG_EP_CTRL, "Sharing XRC INI QPN %d\n",
+			VRB_DBG(FI_LOG_EP_CTRL, "Sharing XRC INI QPN %d\n",
 				  ep->ini_conn->ini_qp->qp_num);
 		}
 
@@ -309,7 +279,7 @@ void vrb_sched_ini_conn(struct vrb_ini_shared_conn *ini_conn)
 		ret = rdma_migrate_id(ep->base_ep.id,
 				      ep->base_ep.eq->channel);
 		if (ret) {
-			VERBS_WARN(FI_LOG_EP_CTRL,
+			VRB_WARN(FI_LOG_EP_CTRL,
 				   "Failed to migrate active CM ID %d\n", ret);
 			goto err;
 		}
@@ -371,7 +341,7 @@ int vrb_process_ini_conn(struct vrb_xrc_ep *ep,int reciprocal,
 	ret = rdma_resolve_route(ep->base_ep.id, VERBS_RESOLVE_TIMEOUT);
 	if (ret) {
 		ret = -errno;
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "rdma_resolve_route failed %s (%d)\n",
 			   strerror(-ret), -ret);
 		vrb_prev_xrc_conn_state(ep);
@@ -405,7 +375,7 @@ int vrb_ep_create_tgt_qp(struct vrb_xrc_ep *ep, uint32_t tgt_qpn)
 		ep->tgt_ibv_qp = ibv_open_qp(domain->verbs, &open_attr);
 		if (!ep->tgt_ibv_qp) {
 			ret = -errno;
-			VERBS_WARN(FI_LOG_EP_CTRL,
+			VRB_WARN(FI_LOG_EP_CTRL,
 				   "XRC TGT QP ibv_open_qp failed %d\n", -ret);
 			return ret;
 		}
@@ -423,7 +393,7 @@ int vrb_ep_create_tgt_qp(struct vrb_xrc_ep *ep, uint32_t tgt_qpn)
 	attr_ex.xrcd = domain->xrc.xrcd;
 	if (rdma_create_qp_ex(ep->tgt_id, &attr_ex)) {
 		ret = -errno;
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "Physical XRC TGT QP rdma_create_qp_ex failed %d\n",
 			   -ret);
 		return ret;
@@ -448,7 +418,7 @@ static int vrb_put_tgt_qp(struct vrb_xrc_ep *ep)
 	ret = ibv_destroy_qp(ep->tgt_ibv_qp);
 	if (ret) {
 		ret = -errno;
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "Close XRC TGT QP ibv_destroy_qp failed %d\n",
 			   -ret);
 		return ret;
@@ -460,9 +430,9 @@ static int vrb_put_tgt_qp(struct vrb_xrc_ep *ep)
 	return FI_SUCCESS;
 }
 
-/* Caller must hold eq:lock */
 int vrb_ep_destroy_xrc_qp(struct vrb_xrc_ep *ep)
 {
+	assert(fastlock_held(&ep->base_ep.eq->lock));
 	vrb_put_shared_ini_conn(ep);
 
 	if (ep->base_ep.id) {
@@ -501,7 +471,7 @@ static int vrb_ini_conn_compare(struct ofi_rbmap *map, void *key, void *data)
 			     sizeof(ofi_sin6_addr(_key->addr)));
 		break;
 	default:
-		VERBS_WARN(FI_LOG_FABRIC, "Unsupported address format\n");
+		VRB_WARN(FI_LOG_FABRIC, "Unsupported address format\n");
 		assert(0);
 		return -FI_EINVAL;
 	}
@@ -520,7 +490,7 @@ static int vrb_domain_xrc_validate_hw(struct vrb_domain *domain)
 
 	ret = ibv_query_device(domain->verbs, &attr);
 	if (ret || !(attr.device_cap_flags & IBV_DEVICE_XRC)) {
-		VERBS_INFO(FI_LOG_DOMAIN, "XRC is not supported");
+		VRB_INFO(FI_LOG_DOMAIN, "XRC is not supported");
 		return -FI_EINVAL;
 	}
 	return FI_SUCCESS;
@@ -541,7 +511,7 @@ int vrb_domain_xrc_init(struct vrb_domain *domain)
 		domain->xrc.xrcd_fd = open(vrb_gl_data.msg.xrcd_filename,
 				       O_CREAT, S_IWUSR | S_IRUSR);
 		if (domain->xrc.xrcd_fd < 0) {
-			VERBS_WARN(FI_LOG_DOMAIN,
+			VRB_WARN(FI_LOG_DOMAIN,
 				   "XRCD file open failed %d\n", errno);
 			return -errno;
 		}
@@ -553,14 +523,13 @@ int vrb_domain_xrc_init(struct vrb_domain *domain)
 	domain->xrc.xrcd = ibv_open_xrcd(domain->verbs, &attr);
 	if (!domain->xrc.xrcd) {
 		ret = -errno;
-		VERBS_INFO_ERRNO(FI_LOG_DOMAIN, "ibv_open_xrcd", errno);
+		VRB_WARN_ERRNO(FI_LOG_DOMAIN, "ibv_open_xrcd");
 		goto xrcd_err;
 	}
 
 	domain->xrc.ini_conn_rbmap = ofi_rbmap_create(vrb_ini_conn_compare);
 	if (!domain->xrc.ini_conn_rbmap) {
 		ret = -ENOMEM;
-		VERBS_INFO_ERRNO(FI_LOG_DOMAIN, "XRC INI QP RB Tree", -ret);
 		goto rbmap_err;
 	}
 
@@ -572,7 +541,7 @@ int vrb_domain_xrc_init(struct vrb_domain *domain)
 		domain->xrc.lock_acquire = ofi_fastlock_acquire;
 		domain->xrc.lock_release = ofi_fastlock_release;
 	}
-	domain->flags |= VRB_USE_XRC;
+	domain->ext_flags |= VRB_USE_XRC;
 	return FI_SUCCESS;
 
 rbmap_err:
@@ -596,13 +565,13 @@ int vrb_domain_xrc_cleanup(struct vrb_domain *domain)
 	assert(domain->xrc.xrcd);
 	/* All endpoint and hence XRC INI QP should be closed */
 	if (!ofi_rbmap_empty(domain->xrc.ini_conn_rbmap)) {
-		VERBS_WARN(FI_LOG_DOMAIN, "XRC domain busy\n");
+		VRB_WARN(FI_LOG_DOMAIN, "XRC domain busy\n");
 		return -FI_EBUSY;
 	}
 
 	ret = ibv_close_xrcd(domain->xrc.xrcd);
 	if (ret) {
-		VERBS_WARN(FI_LOG_DOMAIN, "ibv_close_xrcd failed %d\n", ret);
+		VRB_WARN(FI_LOG_DOMAIN, "ibv_close_xrcd failed %d\n", ret);
 		return -ret;
 	}
 	if (domain->xrc.xrcd_fd >= 0) {
diff --git a/deps/libfabric/prov/verbs/src/verbs_ep.c b/deps/libfabric/prov/verbs/src/verbs_ep.c
index a892677cf976cee6988fbc254826260df0aa54f2..bae5b153b71f2d900b02149fa95ba44444ebdc3c 100644
--- a/deps/libfabric/prov/verbs/src/verbs_ep.c
+++ b/deps/libfabric/prov/verbs/src/verbs_ep.c
@@ -149,8 +149,8 @@ ssize_t vrb_post_send(struct vrb_ep *ep, struct ibv_send_wr *wr, uint64_t flags)
 	ret = ibv_post_send(ep->ibv_qp, wr, &bad_wr);
 	wr->wr_id = (uintptr_t) ctx->user_ctx;
 	if (ret) {
-		VERBS_WARN(FI_LOG_EP_DATA, "Post send failed - %zd\n",
-			   vrb_convert_ret(ret));
+		VRB_WARN(FI_LOG_EP_DATA, "Post send failed - %zd\n",
+			 vrb_convert_ret(ret));
 		goto credits;
 	}
 	cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock);
@@ -182,6 +182,63 @@ unlock:
 	return -FI_EAGAIN;
 }
 
+ssize_t vrb_send_iov(struct vrb_ep *ep, struct ibv_send_wr *wr,
+		     const struct iovec *iov, void **desc, int count,
+		     uint64_t flags)
+{
+	enum fi_hmem_iface iface;
+	uint64_t device;
+	void *bounce_buf;
+	void *send_desc;
+	size_t i, len = 0;
+	ssize_t ret;
+
+	wr->sg_list = alloca(sizeof(*wr->sg_list) * count);
+	for (i = 0; i < count; i++) {
+		wr->sg_list[i].addr = (uintptr_t) iov[i].iov_base;
+		wr->sg_list[i].length = iov[i].iov_len;
+		wr->sg_list[i].lkey =
+			desc ? ((struct vrb_mem_desc *) desc[i])->lkey : 0;
+		len += iov[i].iov_len;
+	}
+
+	if (desc) {
+		iface = ((struct vrb_mem_desc *) desc[0])->info.iface;
+		device = ((struct vrb_mem_desc *) desc[0])->info.device;
+		send_desc = desc[0];
+
+		wr->send_flags = VERBS_INJECT_FLAGS(ep, len, flags, send_desc);
+	} else {
+		iface = FI_HMEM_SYSTEM;
+		device = 0;
+		send_desc = NULL;
+
+		wr->send_flags = IBV_SEND_INLINE;
+	}
+
+	if (wr->send_flags & IBV_SEND_INLINE) {
+		bounce_buf = alloca(len);
+		ret = ofi_copy_from_hmem_iov(bounce_buf, len, iface, device,
+					     iov, count, 0);
+		if (ret != len) {
+			VRB_WARN(FI_LOG_EP_DATA, "hmem copy error");
+			return -FI_EIO;
+		}
+
+		wr->sg_list[0] = vrb_init_sge(bounce_buf, len, NULL);
+		wr->num_sge = 1;
+	} else {
+		wr->num_sge = count;
+	}
+
+	wr->wr_id = VERBS_COMP_FLAGS(ep, flags, wr->wr_id);
+	if (flags & FI_FENCE)
+		wr->send_flags |= IBV_SEND_FENCE;
+
+	ret = vrb_post_send(ep, wr, flags);
+	return ret;
+}
+
 static inline int vrb_msg_ep_cmdata_size(fid_t fid)
 {
 	struct vrb_pep *pep;
@@ -315,16 +372,22 @@ vrb_alloc_init_ep(struct fi_info *info, struct vrb_domain *domain,
 			return NULL;
 	}
 
+	// When we are enabling flow control, we artificially inject
+	// a credit so that the credit messaging itself is not blocked
+	// by a lack of credits.  To counter this, we will adjust the number
+	// of credits we send the first time by initializing to -1.
+	ep->rq_credits_avail = -1;
+
 	if (domain->util_domain.threading != FI_THREAD_SAFE) {
 		if (vrb_alloc_wrs(ep))
 			goto err1;
 	}
 
-	ret = ofi_endpoint_init(&domain->util_domain.domain_fid, &vrb_util_prov, info,
-				&ep->util_ep, context, vrb_util_ep_progress_noop);
+	ret = ofi_endpoint_init(&domain->util_domain.domain_fid,
+				&vrb_util_prov, info, &ep->util_ep, context,
+				vrb_util_ep_progress_noop);
 	if (ret) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
-			   "Unable to initialize EP, error - %d\n", ret);
+		VRB_WARN_ERR(FI_LOG_EP_CTRL, "ofi_endpoint_init", ret);
 		goto err2;
 	}
 
@@ -369,12 +432,12 @@ static int vrb_close_free_ep(struct vrb_ep *ep)
 	return 0;
 }
 
-/* Caller must hold eq:lock */
-static inline void vrb_ep_xrc_close(struct vrb_ep *ep)
+static void vrb_ep_xrc_close(struct vrb_ep *ep)
 {
 	struct vrb_xrc_ep *xrc_ep = container_of(ep, struct vrb_xrc_ep,
-						    base_ep);
+						 base_ep);
 
+	assert(fastlock_held(&ep->eq->lock));
 	if (xrc_ep->conn_setup)
 		vrb_free_xrc_conn_setup(xrc_ep, 0);
 
@@ -423,24 +486,22 @@ static int vrb_ep_close(fid_t fid)
 				      &ep->service, &ep->ep_name);
 		ret = ibv_destroy_qp(ep->ibv_qp);
 		if (ret) {
-			VERBS_WARN(FI_LOG_EP_CTRL,
-				   "Unable to destroy QP (errno = %d)\n", errno);
+			VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "ibv_destroy_qp");
 			return -errno;
 		}
 		vrb_cleanup_cq(ep);
 		break;
 	default:
-		VERBS_INFO(FI_LOG_DOMAIN, "Unknown EP type\n");
+		VRB_WARN(FI_LOG_DOMAIN, "Unknown EP type\n");
 		assert(0);
 		return -FI_EINVAL;
 	}
 
-	VERBS_INFO(FI_LOG_DOMAIN, "EP %p is being closed\n", ep);
+	VRB_INFO(FI_LOG_DOMAIN, "EP %p is being closed\n", ep);
 
 	ret = vrb_close_free_ep(ep);
 	if (ret) {
-		VERBS_WARN(FI_LOG_DOMAIN,
-			   "Unable to close EP (%p), error - %d\n", ep, ret);
+		VRB_WARN_ERR(FI_LOG_DOMAIN, "vrb_close_free_ep", ret);
 		return ret;
 	}
 
@@ -476,7 +537,7 @@ static int vrb_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
 		if (flags & FI_RECV) {
 			cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock);
 			if (cq->credits < ep->rx_cq_size) {
-				VERBS_WARN(FI_LOG_DOMAIN,
+				VRB_WARN(FI_LOG_EP_CTRL,
 					   "Rx CQ is fully reserved\n");
 				ep->rx_cq_size = 0;
 			}
@@ -505,9 +566,10 @@ static int vrb_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
 		else
 			ret = rdma_migrate_id(ep->id, ep->eq->channel);
 		fastlock_release(&ep->eq->lock);
-		if (ret)
+		if (ret) {
+			VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_migrate_id");
 			return -errno;
-
+		}
 		break;
 	case FI_CLASS_SRX_CTX:
 		if (ep->util_ep.type != FI_EP_MSG)
@@ -548,8 +610,7 @@ static int vrb_create_dgram_ep(struct vrb_domain *domain, struct vrb_ep *ep,
 
 	ep->ibv_qp = ibv_create_qp(domain->pd, init_attr);
 	if (!ep->ibv_qp) {
-		VERBS_WARN(FI_LOG_EP_CTRL, "Unable to create IBV "
-			   "Queue Pair\n");
+		VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "ibv_create_qp");
 		return -errno;
 	}
 
@@ -559,8 +620,7 @@ static int vrb_create_dgram_ep(struct vrb_domain *domain, struct vrb_ep *ep,
 			    IBV_QP_PORT |
 			    IBV_QP_QKEY);
 	if (ret) {
-		VERBS_WARN(FI_LOG_EP_CTRL, "Unable to modify QP state "
-			   "to INIT\n");
+		VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "ibv_modify_qp");
 		return -errno;
 	}
 
@@ -569,8 +629,7 @@ static int vrb_create_dgram_ep(struct vrb_domain *domain, struct vrb_ep *ep,
 	ret = ibv_modify_qp(ep->ibv_qp, &attr,
 			    IBV_QP_STATE);
 	if (ret) {
-		VERBS_WARN(FI_LOG_EP_CTRL, "Unable to modify QP state "
-			   "to RTR\n");
+		VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "ibv_modify_qp");
 		return -errno;
 	}
 
@@ -582,30 +641,25 @@ static int vrb_create_dgram_ep(struct vrb_domain *domain, struct vrb_ep *ep,
 				    IBV_QP_STATE |
 				    IBV_QP_SQ_PSN);
 		if (ret) {
-			VERBS_WARN(FI_LOG_EP_CTRL, "Unable to modify QP state "
-				   "to RTS\n");
+			VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "ibv_modify_qp");
 			return -errno;
 		}
 	}
 
 	if (ibv_query_gid(domain->verbs, 1, vrb_gl_data.gid_idx, &gid)) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
-			   "Unable to query GID, errno = %d",
-			   errno);
+		VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "ibv_query_gid");
 		return -errno;
 	}
 
 	if (ibv_query_pkey(domain->verbs, 1, 0, &p_key)) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "Unable to query P_Key, errno = %d",
 			   errno);
 		return -errno;
 	}
 
 	if (ibv_query_port(domain->verbs, 1, &port_attr)) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
-			   "Unable to query port attributes, errno = %d",
-			   errno);
+		VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "ibv_query_port");
 		return -errno;
 	}
 
@@ -624,7 +678,6 @@ static int vrb_create_dgram_ep(struct vrb_domain *domain, struct vrb_ep *ep,
 	return 0;
 }
 
-/* vrb_srq_ep::xrc.prepost_lock must be held */
 FI_VERBS_XRC_ONLY
 static int vrb_process_xrc_preposted(struct vrb_srq_ep *srq_ep)
 {
@@ -632,6 +685,7 @@ static int vrb_process_xrc_preposted(struct vrb_srq_ep *srq_ep)
 	struct slist_entry *entry;
 	int ret;
 
+	assert(fastlock_held(&srq_ep->xrc.prepost_lock));
 	/* The pre-post SRQ function ops have been replaced so the
 	 * posting here results in adding the RX entries to the SRQ */
 	while (!slist_empty(&srq_ep->xrc.prepost_list)) {
@@ -642,8 +696,8 @@ static int vrb_process_xrc_preposted(struct vrb_srq_ep *srq_ep)
 			      recv->desc, recv->src_addr, recv->context);
 		free(recv);
 		if (ret) {
-			VERBS_INFO_ERRNO(FI_LOG_DOMAIN, "fi_recv", errno);
-			return -errno;
+			VRB_WARN_ERR(FI_LOG_EP_DATA, "fi_recv", ret);
+			return ret;
 		}
 	}
 	return FI_SUCCESS;
@@ -674,7 +728,7 @@ static int vrb_ep_enable_xrc(struct vrb_ep *ep)
 		 */
 		if (!srq_ep->xrc.cq || srq_ep->xrc.cq != cq) {
 			fastlock_release(&srq_ep->xrc.prepost_lock);
-			VERBS_WARN(FI_LOG_EP_CTRL, "SRX_CTX/CQ mismatch\n");
+			VRB_WARN(FI_LOG_EP_CTRL, "SRX_CTX/CQ mismatch\n");
 			return -FI_EINVAL;
 		}
 		ibv_get_srq_num(srq_ep->srq, &xrc_ep->srqn);
@@ -683,9 +737,8 @@ static int vrb_ep_enable_xrc(struct vrb_ep *ep)
 	}
 
 	if (cq->credits < srq_ep->xrc.max_recv_wr) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
-			   "CQ credits %" PRId64 " insufficient\n",
-			   cq->credits);
+		VRB_WARN(FI_LOG_EP_CTRL,
+			   "CQ credits %zd insufficient\n", cq->credits);
 		ret = -FI_EINVAL;
 		goto done;
 	}
@@ -702,7 +755,7 @@ static int vrb_ep_enable_xrc(struct vrb_ep *ep)
 
 	srq_ep->srq = ibv_create_srq_ex(domain->verbs, &attr);
 	if (!srq_ep->srq) {
-		VERBS_INFO_ERRNO(FI_LOG_DOMAIN, "ibv_create_srq_ex", errno);
+		VRB_WARN_ERRNO(FI_LOG_DOMAIN, "ibv_create_srq_ex");
 		ret = -errno;
 		goto done;
 	}
@@ -781,29 +834,28 @@ static int vrb_ep_enable(struct fid_ep *ep_fid)
 	int ret;
 
 	if (!ep->eq && (ep->util_ep.type == FI_EP_MSG)) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
-			   "Endpoint is not bound to an event queue\n");
+		VRB_WARN(FI_LOG_EP_CTRL,
+			 "Endpoint is not bound to an event queue\n");
 		return -FI_ENOEQ;
 	}
 
 	if (!ep->util_ep.tx_cq && !ep->util_ep.rx_cq) {
-		VERBS_WARN(FI_LOG_EP_CTRL, "Endpoint is not bound to "
-			   "a send or receive completion queue\n");
+		VRB_WARN(FI_LOG_EP_CTRL, "Endpoint is not bound to "
+			 "a send or receive completion queue\n");
 		return -FI_ENOCQ;
 	}
 
-	if (!ep->util_ep.tx_cq && (ofi_send_allowed(ep->util_ep.caps) ||
-				ofi_rma_initiate_allowed(ep->util_ep.caps))) {
-		VERBS_WARN(FI_LOG_EP_CTRL, "Endpoint is not bound to "
-			   "a send completion queue when it has transmit "
-			   "capabilities enabled (FI_SEND | FI_RMA).\n");
+	if (!ep->util_ep.tx_cq && (ofi_needs_tx(ep->util_ep.caps))) {
+		VRB_WARN(FI_LOG_EP_CTRL, "Endpoint is not bound to "
+			 "a send completion queue when it has transmit "
+			 "capabilities enabled (FI_SEND | FI_RMA).\n");
 		return -FI_ENOCQ;
 	}
 
-	if (!ep->util_ep.rx_cq && ofi_recv_allowed(ep->util_ep.caps)) {
-		VERBS_WARN(FI_LOG_EP_CTRL, "Endpoint is not bound to "
-			   "a receive completion queue when it has receive "
-			   "capabilities enabled. (FI_RECV)\n");
+	if (!ep->util_ep.rx_cq && ofi_needs_rx(ep->util_ep.caps)) {
+		VRB_WARN(FI_LOG_EP_CTRL, "Endpoint is not bound to "
+			 "a receive completion queue when it has receive "
+			 "capabilities enabled. (FI_RECV)\n");
 		return -FI_ENOCQ;
 	}
 	vrb_msg_ep_get_qp_attr(ep, &attr);
@@ -813,7 +865,7 @@ static int vrb_ep_enable(struct fid_ep *ep_fid)
 		if (ep->srq_ep) {
 			/* Override receive function pointers to prevent the user from
 			 * posting Receive WRs to a QP where a SRQ is attached to it */
-			if (domain->flags & VRB_USE_XRC) {
+			if (domain->ext_flags & VRB_USE_XRC) {
 				*ep->util_ep.ep_fid.msg = vrb_msg_srq_xrc_ep_msg_ops;
 				return vrb_ep_enable_xrc(ep);
 			} else {
@@ -821,19 +873,16 @@ static int vrb_ep_enable(struct fid_ep *ep_fid)
 				ep->util_ep.ep_fid.msg->recvv = fi_no_msg_recvv;
 				ep->util_ep.ep_fid.msg->recvmsg = fi_no_msg_recvmsg;
 			}
-		} else if (domain->flags & VRB_USE_XRC) {
-			VERBS_WARN(FI_LOG_EP_CTRL, "XRC EP_MSG not bound "
-				   "to srx_context\n");
+		} else if (domain->ext_flags & VRB_USE_XRC) {
+			VRB_WARN(FI_LOG_EP_CTRL, "XRC EP_MSG not bound "
+				 "to srx_context\n");
 			return -FI_EINVAL;
 		}
 
 		ret = rdma_create_qp(ep->id, domain->pd, &attr);
 		if (ret) {
-			ret = -errno;
-			VERBS_WARN(FI_LOG_EP_CTRL,
-				   "Unable to create rdma qp: %s (%d)\n",
-				   fi_strerror(-ret), -ret);
-			return ret;
+			VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_create_qp");
+			return -errno;
 		}
 
 		/* Allow shared XRC INI QP not controlled by RDMA CM
@@ -845,13 +894,12 @@ static int vrb_ep_enable(struct fid_ep *ep_fid)
 		attr.sq_sig_all = 1;
 		ret = vrb_create_dgram_ep(domain, ep, &attr);
 		if (ret) {
-			VERBS_WARN(FI_LOG_EP_CTRL, "Unable to create dgram EP: %s (%d)\n",
-				   fi_strerror(-ret), -ret);
+			VRB_WARN_ERR(FI_LOG_EP_CTRL, "vrb_create_dgram_ep", ret);
 			return ret;
 		}
 		break;
 	default:
-		VERBS_INFO(FI_LOG_DOMAIN, "Unknown EP type\n");
+		VRB_WARN(FI_LOG_EP_CTRL, "Unknown EP type\n");
 		assert(0);
 		return -FI_EINVAL;
 	}
@@ -868,7 +916,6 @@ static int vrb_ep_control(struct fid *fid, int command, void *arg)
 		switch (command) {
 		case FI_ENABLE:
 			return vrb_ep_enable(ep);
-			break;
 		default:
 			return -FI_ENOSYS;
 		}
@@ -886,9 +933,9 @@ static int vrb_dgram_ep_setname(fid_t ep_fid, void *addr, size_t addrlen)
 
 	ep = container_of(ep_fid, struct vrb_ep, util_ep.ep_fid.fid);
 	if (addrlen < ep->info_attr.src_addrlen) {
-		VERBS_INFO(FI_LOG_EP_CTRL,
-			   "addrlen expected: %zu, got: %zu\n",
-			   ep->info_attr.src_addrlen, addrlen);
+		VRB_INFO(FI_LOG_EP_CTRL,
+			 "addrlen expected: %zu, got: %zu\n",
+			 ep->info_attr.src_addrlen, addrlen);
 		return -FI_ETOOSMALL;
 	}
 	/*
@@ -919,9 +966,9 @@ static int vrb_dgram_ep_getname(fid_t ep_fid, void *addr, size_t *addrlen)
 	ep = container_of(ep_fid, struct vrb_ep, util_ep.ep_fid.fid);
 	if (*addrlen < sizeof(ep->ep_name)) {
 		*addrlen = sizeof(ep->ep_name);
-		VERBS_INFO(FI_LOG_EP_CTRL,
-			   "addrlen expected: %zu, got: %zu\n",
-			   sizeof(ep->ep_name), *addrlen);
+		VRB_INFO(FI_LOG_EP_CTRL,
+			 "addrlen expected: %zu, got: %zu\n",
+			 sizeof(ep->ep_name), *addrlen);
 		return -FI_ETOOSMALL;
 	}
 
@@ -968,7 +1015,7 @@ static int vrb_ep_save_info_attr(struct vrb_ep *ep, struct fi_info *info)
 	if (info->src_addr) {
 		ep->info_attr.src_addr = mem_dup(info->src_addr, info->src_addrlen);
 		if (ep->info_attr.src_addr == NULL) {
-			VERBS_WARN(FI_LOG_EP_CTRL, "Memory error save src addr\n");
+			VRB_WARN(FI_LOG_EP_CTRL, "Memory error save src addr\n");
 			return -FI_ENOMEM;
 		}
 		ep->info_attr.src_addrlen = info->src_addrlen;
@@ -976,7 +1023,7 @@ static int vrb_ep_save_info_attr(struct vrb_ep *ep, struct fi_info *info)
 	if (info->dest_addr) {
 		ep->info_attr.dest_addr = mem_dup(info->dest_addr, info->dest_addrlen);
 		if (ep->info_attr.dest_addr == NULL) {
-			VERBS_WARN(FI_LOG_EP_CTRL, "Memory error save dest addr\n");
+			VRB_WARN(FI_LOG_EP_CTRL, "Memory error save dest addr\n");
 			free(ep->info_attr.src_addr);
 			ep->info_attr.src_addr = NULL;
 			return -FI_ENOMEM;
@@ -1009,7 +1056,7 @@ int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
 	 * to allocate DGRAM (has prefix <dev_name>-dgram) and MSG EPs */
 	if (strncmp(dom->verbs->device->name, info->domain_attr->name,
 		    strlen(dom->verbs->device->name))) {
-		VERBS_INFO(FI_LOG_DOMAIN,
+		VRB_WARN(FI_LOG_DOMAIN,
 			   "Invalid info->domain_attr->name: %s and %s\n",
 			   dom->verbs->device->name, info->domain_attr->name);
 		return -FI_EINVAL;
@@ -1038,13 +1085,13 @@ int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
 
 	ep = vrb_alloc_init_ep(info, dom, context);
 	if (!ep) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
-			   "Unable to allocate/init EP memory\n");
+		VRB_WARN_ERR(FI_LOG_EP_CTRL, "vrb_alloc_init_ep", -ENOMEM);
 		return -FI_ENOMEM;
 	}
 
 	ep->peer_rq_credits = UINT64_MAX;
 	ep->threshold = INT64_MAX; /* disables RQ flow control */
+	ep->hmem_enabled = !!(ep->util_ep.caps & FI_HMEM);
 
 	ret = vrb_ep_save_info_attr(ep, info);
 	if (ret)
@@ -1052,7 +1099,7 @@ int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
 
 	switch (info->ep_attr->type) {
 	case FI_EP_MSG:
-		if (dom->flags & VRB_USE_XRC) {
+		if (dom->ext_flags & VRB_USE_XRC) {
 			if (dom->util_domain.threading == FI_THREAD_SAFE) {
 				*ep->util_ep.ep_fid.msg = vrb_msg_xrc_ep_msg_ops_ts;
 				ep->util_ep.ep_fid.rma = &vrb_msg_xrc_ep_rma_ops_ts;
@@ -1076,7 +1123,7 @@ int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
 
 		if (!info->handle) {
 			/* Only RC, XRC active RDMA CM ID is created at connect */
-			if (!(dom->flags & VRB_USE_XRC)) {
+			if (!(dom->ext_flags & VRB_USE_XRC)) {
 				ret = vrb_create_ep(ep,
 					vrb_get_port_space(info->addr_format), &ep->id);
 				if (ret)
@@ -1086,7 +1133,7 @@ int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
 		} else if (info->handle->fclass == FI_CLASS_CONNREQ) {
 			connreq = container_of(info->handle,
 					       struct vrb_connreq, handle);
-			if (dom->flags & VRB_USE_XRC) {
+			if (dom->ext_flags & VRB_USE_XRC) {
 				assert(connreq->is_xrc);
 
 				if (!connreq->xrc.is_reciprocal) {
@@ -1096,7 +1143,12 @@ int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
 						goto err1;
 				}
 			} else {
+				/* ep now owns this rdma cm id, prevent trying to access
+				 * it outside of ep operations to avoid possible use-after-
+				 * free bugs in case the ep is closed
+				 */
 				ep->id = connreq->id;
+				connreq->id = NULL;
 				ep->ibv_qp = ep->id->qp;
 				ep->id->context = &ep->util_ep.ep_fid.fid;
 			}
@@ -1109,7 +1161,7 @@ int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
 			if (rdma_resolve_addr(ep->id, info->src_addr, info->dest_addr,
 					      VERBS_RESOLVE_TIMEOUT)) {
 				ret = -errno;
-				VERBS_INFO(FI_LOG_DOMAIN, "Unable to rdma_resolve_addr\n");
+				VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_resolve_addr");
 				goto err2;
 			}
 			ep->id->context = &ep->util_ep.ep_fid.fid;
@@ -1132,7 +1184,7 @@ int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
 		ep->util_ep.ep_fid.cm = &vrb_dgram_cm_ops;
 		break;
 	default:
-		VERBS_INFO(FI_LOG_DOMAIN, "Unknown EP type\n");
+		VRB_WARN(FI_LOG_EP_CTRL, "Unknown EP type\n");
 		ret = -FI_EINVAL;
 		assert(0);
 		goto err1;
@@ -1183,7 +1235,7 @@ static int vrb_pep_bind(fid_t fid, struct fid *bfid, uint64_t flags)
 	 */
 	if (vrb_is_xrc_info(pep->info)) {
 		if (pep->eq->xrc.pep_port) {
-			VERBS_WARN(FI_LOG_EP_CTRL,
+			VRB_WARN(FI_LOG_EP_CTRL,
 				   "XRC limits EQ binding to a single PEP\n");
 			return -FI_EINVAL;
 		}
@@ -1191,8 +1243,10 @@ static int vrb_pep_bind(fid_t fid, struct fid *bfid, uint64_t flags)
 	}
 
 	ret = rdma_migrate_id(pep->id, pep->eq->channel);
-	if (ret)
+	if (ret) {
+		VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_migrate_id");
 		return -errno;
+	}
 
 	if (vrb_is_xrc_info(pep->info)) {
 		ret = rdma_migrate_id(pep->xrc_ps_udp_id, pep->eq->channel);
@@ -1286,14 +1340,15 @@ int vrb_passive_ep(struct fid_fabric *fabric, struct fi_info *info,
 	ret = rdma_create_id(NULL, &_pep->id, &_pep->pep_fid.fid,
 			     vrb_get_port_space(_pep->info->addr_format));
 	if (ret) {
-		VERBS_INFO(FI_LOG_DOMAIN, "Unable to create PEP rdma_cm_id\n");
+		VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_create_id");
 		goto err2;
 	}
 
 	if (info->src_addr) {
-		ret = rdma_bind_addr(_pep->id, (struct sockaddr *)info->src_addr);
+		ret = rdma_bind_addr(_pep->id, (struct sockaddr *) info->src_addr);
 		if (ret) {
-			VERBS_INFO(FI_LOG_DOMAIN, "Unable to bind address to rdma_cm_id\n");
+			VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_bind_addr");
+			ret = -errno;
 			goto err3;
 		}
 		_pep->bound = 1;
@@ -1304,8 +1359,7 @@ int vrb_passive_ep(struct fid_fabric *fabric, struct fi_info *info,
 		ret = rdma_create_id(NULL, &_pep->xrc_ps_udp_id,
 				     &_pep->pep_fid.fid, RDMA_PS_UDP);
 		if (ret) {
-			VERBS_INFO(FI_LOG_DOMAIN,
-				   "Unable to create PEP PS_UDP rdma_cm_id\n");
+			VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_create_id");
 			goto err3;
 		}
 		/* Currently both listens must be bound to same port number */
@@ -1314,8 +1368,7 @@ int vrb_passive_ep(struct fid_fabric *fabric, struct fi_info *info,
 		ret = rdma_bind_addr(_pep->xrc_ps_udp_id,
 				     (struct sockaddr *)_pep->info->src_addr);
 		if (ret) {
-			VERBS_INFO(FI_LOG_DOMAIN,
-				   "Unable to bind address to PS_UDP rdma_cm_id\n");
+			VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_bind_addr");
 			goto err4;
 		}
 	}
@@ -1438,7 +1491,7 @@ vrb_srq_ep_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t fla
 		.next = NULL,
 	};
 
-	vrb_set_sge_iov(wr.sg_list, msg->msg_iov, msg->iov_count, msg->desc);
+	vrb_iov_dupa(wr.sg_list, msg->msg_iov, msg->desc, msg->iov_count);
 	return vrb_post_srq(ep, &wr);
 }
 
@@ -1563,18 +1616,18 @@ static void vrb_cleanup_prepost_bufs(struct vrb_srq_ep *srq_ep)
 	}
 }
 
-/* Must hold the associated CQ lock cq::xrc.srq_list_lock */
 int vrb_xrc_close_srq(struct vrb_srq_ep *srq_ep)
 {
 	int ret;
 
-	assert(srq_ep->domain->flags & VRB_USE_XRC);
+	assert(fastlock_held(&srq_ep->xrc.cq->xrc.srq_list_lock));
+	assert(srq_ep->domain->ext_flags & VRB_USE_XRC);
 	if (!srq_ep->xrc.cq || !srq_ep->srq)
 		return FI_SUCCESS;
 
 	ret = ibv_destroy_srq(srq_ep->srq);
 	if (ret) {
-		VERBS_WARN(FI_LOG_EP_CTRL, "Cannot destroy SRQ rc=%d\n", ret);
+		VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "ibv_destroy_srq");
 		return -ret;
 	}
 	srq_ep->xrc.cq->credits += srq_ep->xrc.max_recv_wr;
@@ -1593,7 +1646,7 @@ static int vrb_srq_close(fid_t fid)
 	struct vrb_cq *cq = srq_ep->xrc.cq;
 	int ret;
 
-	if (srq_ep->domain->flags & VRB_USE_XRC) {
+	if (srq_ep->domain->ext_flags & VRB_USE_XRC) {
 		if (cq) {
 			fastlock_acquire(&cq->xrc.srq_list_lock);
 			ret = vrb_xrc_close_srq(srq_ep);
@@ -1614,7 +1667,7 @@ static int vrb_srq_close(fid_t fid)
 	return FI_SUCCESS;
 
 err:
-	VERBS_WARN(FI_LOG_EP_CTRL, "Cannot destroy SRQ rc=%d\n", ret);
+	VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "ibv_destroy_srq");
 	return ret;
 }
 
@@ -1661,7 +1714,7 @@ int vrb_srq_context(struct fid_domain *domain, struct fi_rx_attr *attr,
 
 	/* XRC SRQ creation is delayed until the first endpoint it is bound
 	 * to is enabled.*/
-	if (dom->flags & VRB_USE_XRC) {
+	if (dom->ext_flags & VRB_USE_XRC) {
 		fastlock_init(&srq_ep->xrc.prepost_lock);
 		slist_init(&srq_ep->xrc.prepost_list);
 		dlist_init(&srq_ep->xrc.srq_entry);
@@ -1677,7 +1730,7 @@ int vrb_srq_context(struct fid_domain *domain, struct fi_rx_attr *attr,
 
 	srq_ep->srq = ibv_create_srq(dom->pd, &srq_init_attr);
 	if (!srq_ep->srq) {
-		VERBS_INFO_ERRNO(FI_LOG_DOMAIN, "ibv_create_srq", errno);
+		VRB_WARN_ERRNO(FI_LOG_DOMAIN, "ibv_create_srq");
 		ret = -errno;
 		goto free_bufs;
 	}
@@ -1695,26 +1748,26 @@ free_ep:
 }
 
 
-#define vrb_atomicvalid(name, flags)					\
+#define VRB_DEF_ATOMICVALID(name, flags)				\
 static int vrb_msg_ep_atomic_ ## name(struct fid_ep *ep_fid,		\
 					 enum fi_datatype datatype,	\
 					 enum fi_op op, size_t *count)	\
 {									\
-	struct vrb_ep *ep = container_of(ep_fid, struct vrb_ep,	\
+	struct vrb_ep *ep = container_of(ep_fid, struct vrb_ep,		\
 					    util_ep.ep_fid);		\
 	struct fi_atomic_attr attr;					\
 	int ret;							\
 									\
-	ret = vrb_query_atomic(&ep->util_ep.domain->domain_fid,	\
+	ret = vrb_query_atomic(&ep->util_ep.domain->domain_fid,		\
 				  datatype, op, &attr, flags);		\
 	if (!ret)							\
 		*count = attr.count;					\
 	return ret;							\
 }
 
-vrb_atomicvalid(writevalid, 0);
-vrb_atomicvalid(readwritevalid, FI_FETCH_ATOMIC);
-vrb_atomicvalid(compwritevalid, FI_COMPARE_ATOMIC);
+VRB_DEF_ATOMICVALID(writevalid, 0)
+VRB_DEF_ATOMICVALID(readwritevalid, FI_FETCH_ATOMIC)
+VRB_DEF_ATOMICVALID(compwritevalid, FI_COMPARE_ATOMIC)
 
 int vrb_query_atomic(struct fid_domain *domain_fid, enum fi_datatype datatype,
 			enum fi_op op, struct fi_atomic_attr *attr,
@@ -1759,8 +1812,8 @@ int vrb_query_atomic(struct fid_domain *domain_fid, enum fi_datatype datatype,
 			return  -FI_EBADFLAGS;
 		}
 		if (domain->info->tx_attr->op_flags & FI_INJECT) {
-			VERBS_INFO(FI_LOG_EP_DATA,
-				   "FI_INJECT not supported for %s\n", log_str);
+			VRB_INFO(FI_LOG_EP_DATA,
+				 "FI_INJECT not supported for %s\n", log_str);
 			return -FI_EINVAL;
 		}
 	}
@@ -1797,7 +1850,7 @@ vrb_msg_ep_atomic_write(struct fid_ep *ep_fid, const void *buf, size_t count,
 		.opcode = IBV_WR_RDMA_WRITE,
 		.wr.rdma.remote_addr = addr,
 		.wr.rdma.rkey = (uint32_t)(uintptr_t)key,
-		.send_flags = VERBS_INJECT(ep, sizeof(uint64_t)) |
+		.send_flags = VERBS_INJECT(ep, sizeof(uint64_t), desc) |
 			      IBV_SEND_FENCE,
 	};
 	size_t count_copy;
@@ -1841,7 +1894,7 @@ vrb_msg_ep_atomic_writemsg(struct fid_ep *ep_fid,
 		.wr_id = VERBS_COMP_FLAGS(ep, flags, (uintptr_t)msg->context),
 		.wr.rdma.remote_addr = msg->rma_iov->addr,
 		.wr.rdma.rkey = (uint32_t)(uintptr_t)msg->rma_iov->key,
-		.send_flags = VERBS_INJECT_FLAGS(ep, sizeof(uint64_t), flags) |
+		.send_flags = VERBS_INJECT_FLAGS(ep, sizeof(uint64_t), flags, msg->desc[0]) |
 			      IBV_SEND_FENCE,
 	};
 	size_t count_copy;
@@ -2107,7 +2160,7 @@ vrb_msg_xrc_ep_atomic_write(struct fid_ep *ep_fid, const void *buf,
 		.opcode = IBV_WR_RDMA_WRITE,
 		.wr.rdma.remote_addr = addr,
 		.wr.rdma.rkey = (uint32_t)(uintptr_t)key,
-		.send_flags = VERBS_INJECT(&ep->base_ep, sizeof(uint64_t)) |
+		.send_flags = VERBS_INJECT(&ep->base_ep, sizeof(uint64_t), desc) |
 			      IBV_SEND_FENCE,
 	};
 	size_t count_copy;
@@ -2142,7 +2195,7 @@ vrb_msg_xrc_ep_atomic_writemsg(struct fid_ep *ep_fid,
 		.wr.rdma.remote_addr = msg->rma_iov->addr,
 		.wr.rdma.rkey = (uint32_t)(uintptr_t)msg->rma_iov->key,
 		.send_flags = VERBS_INJECT_FLAGS(&ep->base_ep,
-				sizeof(uint64_t), flags) | IBV_SEND_FENCE,
+				sizeof(uint64_t), flags, msg->desc[0]) | IBV_SEND_FENCE,
 	};
 	size_t count_copy;
 	int ret;
diff --git a/deps/libfabric/prov/verbs/src/verbs_eq.c b/deps/libfabric/prov/verbs/src/verbs_eq.c
index 33473b63124ac5adfbfcdb6b64453f74b2401b1d..fa9d07d1b4eafdd71e9151da4003e796c02d7548 100644
--- a/deps/libfabric/prov/verbs/src/verbs_eq.c
+++ b/deps/libfabric/prov/verbs/src/verbs_eq.c
@@ -76,11 +76,11 @@ unlock:
 	return rd;
 }
 
-/* Caller must hold eq:lock */
 void vrb_eq_set_xrc_conn_tag(struct vrb_xrc_ep *ep)
 {
 	struct vrb_eq *eq = ep->base_ep.eq;
 
+	assert(fastlock_held(&eq->lock));
 	assert(ep->conn_setup);
 	assert(ep->conn_setup->conn_tag == VERBS_CONN_TAG_INVALID);
 	ep->conn_setup->conn_tag =
@@ -88,12 +88,12 @@ void vrb_eq_set_xrc_conn_tag(struct vrb_xrc_ep *ep)
 				ofi_idx_insert(eq->xrc.conn_key_map, ep));
 }
 
-/* Caller must hold eq:lock */
 void vrb_eq_clear_xrc_conn_tag(struct vrb_xrc_ep *ep)
 {
 	struct vrb_eq *eq = ep->base_ep.eq;
 	int index;
 
+	assert(fastlock_held(&eq->lock));
 	assert(ep->conn_setup);
 	if (ep->conn_setup->conn_tag == VERBS_CONN_TAG_INVALID)
 		return;
@@ -101,33 +101,33 @@ void vrb_eq_clear_xrc_conn_tag(struct vrb_xrc_ep *ep)
 	index = ofi_key2idx(&eq->xrc.conn_key_idx,
 			    (uint64_t)ep->conn_setup->conn_tag);
 	if (!ofi_idx_is_valid(eq->xrc.conn_key_map, index))
-	    VERBS_WARN(FI_LOG_EP_CTRL,
+	    VRB_WARN(FI_LOG_EP_CTRL,
 		       "Invalid XRC connection connection tag\n");
 	else
 		ofi_idx_remove(eq->xrc.conn_key_map, index);
 	ep->conn_setup->conn_tag = VERBS_CONN_TAG_INVALID;
 }
 
-/* Caller must hold eq:lock */
 struct vrb_xrc_ep *vrb_eq_xrc_conn_tag2ep(struct vrb_eq *eq,
-						uint32_t conn_tag)
+					  uint32_t conn_tag)
 {
 	struct vrb_xrc_ep *ep;
 	int index;
 
+	assert(fastlock_held(&eq->lock));
 	index = ofi_key2idx(&eq->xrc.conn_key_idx, (uint64_t)conn_tag);
 	ep = ofi_idx_lookup(eq->xrc.conn_key_map, index);
 	if (!ep || ep->magic != VERBS_XRC_EP_MAGIC) {
-		VERBS_WARN(FI_LOG_EP_CTRL, "XRC EP is not valid\n");
+		VRB_WARN(FI_LOG_EP_CTRL, "XRC EP is not valid\n");
 		return NULL;
 	}
 	if (!ep->conn_setup) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "Bad state, no connection data\n");
 		return NULL;
 	}
 	if (ep->conn_setup->conn_tag != conn_tag) {
-		VERBS_WARN(FI_LOG_EP_CTRL, "Connection tag mismatch\n");
+		VRB_WARN(FI_LOG_EP_CTRL, "Connection tag mismatch\n");
 		return NULL;
 	}
 
@@ -185,7 +185,7 @@ vrb_eq_cm_getinfo(struct rdma_cm_event *event, struct fi_info *pep_info,
 	int ret = -FI_ENOMEM;
 
 	if (!(hints = fi_dupinfo(pep_info))) {
-		VERBS_WARN(FI_LOG_EP_CTRL, "dupinfo failure\n");
+		VRB_WARN(FI_LOG_EP_CTRL, "dupinfo failure\n");
 		return -FI_ENOMEM;
 	}
 
@@ -200,7 +200,7 @@ vrb_eq_cm_getinfo(struct rdma_cm_event *event, struct fi_info *pep_info,
 			goto err1;
 	} else {
 		if (vrb_pep_dev_domain_match(hints, devname)) {
-			VERBS_WARN(FI_LOG_EQ, "passive endpoint domain: %s does"
+			VRB_WARN(FI_LOG_EQ, "passive endpoint domain: %s does"
 				   " not match device: %s where we got a "
 				   "connection request\n",
 				   hints->domain_attr->name, devname);
@@ -243,7 +243,7 @@ vrb_eq_cm_getinfo(struct rdma_cm_event *event, struct fi_info *pep_info,
 
 	connreq = calloc(1, sizeof *connreq);
 	if (!connreq) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "Unable to allocate connreq memory\n");
 		goto err2;
 	}
@@ -339,7 +339,7 @@ static int vrb_sidr_conn_compare(struct ofi_rbmap *map,
 			     sizeof(ofi_sin6_addr(_key->addr)));
 		break;
 	default:
-		VERBS_WARN(FI_LOG_EP_CTRL, "Unsuuported address format\n");
+		VRB_WARN(FI_LOG_EP_CTRL, "Unsuuported address format\n");
 		assert(0);
 		ret = -FI_EINVAL;
 	}
@@ -354,14 +354,14 @@ static int vrb_sidr_conn_compare(struct ofi_rbmap *map,
 		-1 : _key->recip > ep->recip_accept;
 }
 
-/* Caller must hold eq:lock */
 struct vrb_xrc_ep *vrb_eq_get_sidr_conn(struct vrb_eq *eq,
-					      struct sockaddr *peer,
-					      uint16_t pep_port, bool recip)
+					struct sockaddr *peer,
+					uint16_t pep_port, bool recip)
 {
 	struct ofi_rbnode *node;
 	struct vrb_sidr_conn_key key;
 
+	assert(fastlock_held(&eq->lock));
 	vrb_set_sidr_conn_key(peer, pep_port, recip, &key);
 	node = ofi_rbmap_find(&eq->xrc.sidr_conn_rbmap, &key);
 	if (OFI_LIKELY(!node))
@@ -370,13 +370,13 @@ struct vrb_xrc_ep *vrb_eq_get_sidr_conn(struct vrb_eq *eq,
 	return (struct vrb_xrc_ep *) node->data;
 }
 
-/* Caller must hold eq:lock */
 int vrb_eq_add_sidr_conn(struct vrb_xrc_ep *ep,
-			    void *param_data, size_t param_len)
+			 void *param_data, size_t param_len)
 {
 	int ret;
 	struct vrb_sidr_conn_key key;
 
+	assert(fastlock_held(&ep->base_ep.eq->lock));
 	assert(!ep->accept_param_data);
 	assert(param_len);
 	assert(ep->tgt_id && ep->tgt_id->ps == RDMA_PS_UDP);
@@ -385,7 +385,7 @@ int vrb_eq_add_sidr_conn(struct vrb_xrc_ep *ep,
 				 ep->remote_pep_port, ep->recip_accept, &key);
 	ep->accept_param_data = calloc(1, param_len);
 	if (!ep->accept_param_data) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "SIDR alloc conn param memory failure\n");
 		return -FI_ENOMEM;
 	}
@@ -396,7 +396,7 @@ int vrb_eq_add_sidr_conn(struct vrb_xrc_ep *ep,
 			       &key, (void *) ep, &ep->conn_map_node);
 	assert(ret != -FI_EALREADY);
 	if (OFI_UNLIKELY(ret)) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "SIDR conn map entry insert error %d\n", ret);
 		free(ep->accept_param_data);
 		ep->accept_param_data = NULL;
@@ -406,9 +406,9 @@ int vrb_eq_add_sidr_conn(struct vrb_xrc_ep *ep,
 	return FI_SUCCESS;
 }
 
-/* Caller must hold eq:lock */
 void vrb_eq_remove_sidr_conn(struct vrb_xrc_ep *ep)
 {
+	assert(fastlock_held(&ep->base_ep.eq->lock));
 	assert(ep->conn_map_node);
 
 	ofi_rbmap_delete(&ep->base_ep.eq->xrc.sidr_conn_rbmap,
@@ -432,7 +432,7 @@ vrb_eq_accept_recip_conn(struct vrb_xrc_ep *ep,
 	ret = vrb_accept_xrc(ep, VRB_RECIP_CONN, &cm_data,
 				sizeof(cm_data));
 	if (ret) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "Reciprocal XRC Accept failed %d\n", ret);
 		return ret;
 	}
@@ -481,14 +481,14 @@ vrb_eq_xrc_connreq_event(struct vrb_eq *eq, struct fi_eq_cm_entry *entry,
 					     connreq->xrc.port,
 					     connreq->xrc.is_reciprocal);
 		if (ep) {
-			VERBS_DBG(FI_LOG_EP_CTRL,
+			VRB_DBG(FI_LOG_EP_CTRL,
 				  "SIDR %s request retry received\n",
 				  connreq->xrc.is_reciprocal ?
 				  "reciprocal" : "original");
 			ret = vrb_resend_shared_accept_xrc(ep, connreq,
 							      cma_event->id);
 			if (ret)
-				VERBS_WARN(FI_LOG_EP_CTRL,
+				VRB_WARN(FI_LOG_EP_CTRL,
 					   "SIDR accept resend failure %d\n",
 					   -errno);
 			rdma_destroy_id(cma_event->id);
@@ -508,7 +508,7 @@ vrb_eq_xrc_connreq_event(struct vrb_eq *eq, struct fi_eq_cm_entry *entry,
 	 */
 	ep = vrb_eq_xrc_conn_tag2ep(eq, connreq->xrc.conn_tag);
 	if (!ep) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "Reciprocal XRC connection tag 0x%x not found\n",
 			   connreq->xrc.conn_tag);
 		return -FI_EAGAIN;
@@ -524,7 +524,7 @@ vrb_eq_xrc_connreq_event(struct vrb_eq *eq, struct fi_eq_cm_entry *entry,
 
 	ret = rdma_migrate_id(ep->tgt_id, ep->base_ep.eq->channel);
 	if (ret) {
-		VERBS_WARN(FI_LOG_EP_CTRL, "Could not migrate CM ID\n");
+		VRB_WARN(FI_LOG_EP_CTRL, "Could not migrate CM ID\n");
 		goto send_reject;
 	}
 
@@ -538,7 +538,7 @@ vrb_eq_xrc_connreq_event(struct vrb_eq *eq, struct fi_eq_cm_entry *entry,
 
 send_reject:
 	if (rdma_reject(connreq->id, *priv_data, *priv_datalen))
-		VERBS_WARN(FI_LOG_EP_CTRL, "rdma_reject %d\n", -errno);
+		VRB_WARN(FI_LOG_EP_CTRL, "rdma_reject %d\n", -errno);
 
 	return -FI_EAGAIN;
 }
@@ -565,7 +565,7 @@ vrb_eq_xrc_conn_event(struct vrb_xrc_ep *ep,
 	size_t priv_datalen = cma_event->param.conn.private_data_len;
 	int ret;
 
-	VERBS_DBG(FI_LOG_EP_CTRL, "EP %p INITIAL CONNECTION DONE state %d, ps %d\n",
+	VRB_DBG(FI_LOG_EP_CTRL, "EP %p INITIAL CONNECTION DONE state %d, ps %d\n",
 		  ep, ep->conn_state, cma_event->id->ps);
 	vrb_next_xrc_conn_state(ep);
 
@@ -621,14 +621,14 @@ vrb_eq_xrc_recip_conn_event(struct vrb_eq *eq,
 	int ret;
 
 	vrb_next_xrc_conn_state(ep);
-	VERBS_DBG(FI_LOG_EP_CTRL, "EP %p RECIPROCAL CONNECTION DONE state %d\n",
+	VRB_DBG(FI_LOG_EP_CTRL, "EP %p RECIPROCAL CONNECTION DONE state %d\n",
 		  ep, ep->conn_state);
 
 	/* If this is the reciprocal active side notification */
 	if (cma_event->param.conn.private_data) {
 		ret = vrb_eq_set_xrc_info(cma_event, &xrc_info);
 		if (ret) {
-			VERBS_WARN(FI_LOG_EP_CTRL,
+			VRB_WARN(FI_LOG_EP_CTRL,
 				   "Reciprocal connection protocol mismatch\n");
 			eq->err.err = -ret;
 			eq->err.prov_errno = ret;
@@ -654,7 +654,6 @@ vrb_eq_xrc_recip_conn_event(struct vrb_eq *eq,
 	return sizeof(*entry) + len;
 }
 
-/* Caller must hold eq:lock */
 static int
 vrb_eq_xrc_rej_event(struct vrb_eq *eq, struct rdma_cm_event *cma_event)
 {
@@ -663,9 +662,10 @@ vrb_eq_xrc_rej_event(struct vrb_eq *eq, struct rdma_cm_event *cma_event)
 	struct vrb_xrc_conn_info xrc_info;
 	enum vrb_xrc_ep_conn_state state;
 
+	assert(fastlock_held(&eq->lock));
 	ep = container_of(fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid);
 	if (ep->magic != VERBS_XRC_EP_MAGIC) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "CM ID context not valid\n");
 		return -FI_EAGAIN;
 	}
@@ -674,7 +674,7 @@ vrb_eq_xrc_rej_event(struct vrb_eq *eq, struct rdma_cm_event *cma_event)
 	if (ep->base_ep.id != cma_event->id ||
 	    (state != VRB_XRC_ORIG_CONNECTING &&
 	     state != VRB_XRC_RECIP_CONNECTING)) {
-		VERBS_WARN(FI_LOG_EP_CTRL,
+		VRB_WARN(FI_LOG_EP_CTRL,
 			   "Stale/invalid CM reject %d received\n", cma_event->status);
 		return -FI_EAGAIN;
 	}
@@ -684,7 +684,7 @@ vrb_eq_xrc_rej_event(struct vrb_eq *eq, struct rdma_cm_event *cma_event)
 	    cma_event->status == VRB_CM_REJ_SIDR_CONSUMER_DEFINED) {
 		if (cma_event->param.conn.private_data_len &&
 		    vrb_eq_set_xrc_info(cma_event, &xrc_info)) {
-			VERBS_WARN(FI_LOG_EP_CTRL,
+			VRB_WARN(FI_LOG_EP_CTRL,
 				   "CM REJ private data not valid\n");
 			return -FI_EAGAIN;
 		}
@@ -693,21 +693,22 @@ vrb_eq_xrc_rej_event(struct vrb_eq *eq, struct rdma_cm_event *cma_event)
 		return FI_SUCCESS;
 	}
 
-	VERBS_WARN(FI_LOG_EP_CTRL, "Non-application generated CM Reject %d\n",
+	VRB_WARN(FI_LOG_EP_CTRL, "Non-application generated CM Reject %d\n",
 		   cma_event->status);
 	if (cma_event->param.conn.private_data_len)
-		VERBS_WARN(FI_LOG_EP_CTRL, "Unexpected CM Reject priv_data\n");
+		VRB_WARN(FI_LOG_EP_CTRL, "Unexpected CM Reject priv_data\n");
 
 	vrb_ep_ini_conn_rejected(ep);
 
 	return state == VRB_XRC_ORIG_CONNECTING ? FI_SUCCESS : -FI_EAGAIN;
 }
 
-/* Caller must hold eq:lock */
-static inline int
+static int
 vrb_eq_xrc_connect_retry(struct vrb_xrc_ep *ep,
 			 struct rdma_cm_event *cma_event, int *acked)
 {
+	assert(fastlock_held(&ep->base_ep.eq->lock));
+
 	if (ep->base_ep.info_attr.src_addr)
 		ofi_straddr_dbg(&vrb_prov, FI_LOG_EP_CTRL,
 				"Connect retry src ",
@@ -728,18 +729,18 @@ vrb_eq_xrc_connect_retry(struct vrb_xrc_ep *ep,
 			       ep->conn_setup->pending_paramlen);
 }
 
-/* Caller must hold eq:lock */
-static inline int
+static int
 vrb_eq_xrc_cm_err_event(struct vrb_eq *eq,
-                           struct rdma_cm_event *cma_event, int *acked)
+                        struct rdma_cm_event *cma_event, int *acked)
 {
 	struct vrb_xrc_ep *ep;
 	fid_t fid = cma_event->id->context;
 	int ret;
 
+	assert(fastlock_held(&eq->lock));
 	ep = container_of(fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid);
 	if (ep->magic != VERBS_XRC_EP_MAGIC) {
-		VERBS_WARN(FI_LOG_EP_CTRL, "CM ID context invalid\n");
+		VRB_WARN(FI_LOG_EP_CTRL, "CM ID context invalid\n");
 		return -FI_EAGAIN;
 	}
 
@@ -748,7 +749,7 @@ vrb_eq_xrc_cm_err_event(struct vrb_eq *eq,
 	if ((ep->base_ep.id != cma_event->id) &&
 	    (cma_event->event == RDMA_CM_EVENT_CONNECT_ERROR &&
 	     ep->tgt_id != cma_event->id)) {
-		VERBS_WARN(FI_LOG_EP_CTRL, "CM error not valid for EP\n");
+		VRB_WARN(FI_LOG_EP_CTRL, "CM error not valid for EP\n");
 		return -FI_EAGAIN;
 	}
 
@@ -764,7 +765,7 @@ vrb_eq_xrc_cm_err_event(struct vrb_eq *eq,
 		}
 	}
 
-	VERBS_WARN(FI_LOG_EP_CTRL, "CM error event %s, status %d\n",
+	VRB_WARN(FI_LOG_EP_CTRL, "CM error event %s, status %d\n",
 		   rdma_event_str(cma_event->event), cma_event->status);
 	if (ep->base_ep.info_attr.src_addr)
 		ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL,
@@ -776,12 +777,11 @@ vrb_eq_xrc_cm_err_event(struct vrb_eq *eq,
         return FI_SUCCESS;
 }
 
-/* Caller must hold eq:lock */
-static inline int
+static int
 vrb_eq_xrc_connected_event(struct vrb_eq *eq,
-			      struct rdma_cm_event *cma_event, int *acked,
-			      struct fi_eq_cm_entry *entry, size_t len,
-			      uint32_t *event)
+			   struct rdma_cm_event *cma_event, int *acked,
+			   struct fi_eq_cm_entry *entry, size_t len,
+			   uint32_t *event)
 {
 	struct vrb_xrc_ep *ep;
 	fid_t fid = cma_event->id->context;
@@ -789,6 +789,7 @@ vrb_eq_xrc_connected_event(struct vrb_eq *eq,
 
 	ep = container_of(fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid);
 
+	assert(fastlock_held(&eq->lock));
 	assert(ep->conn_state == VRB_XRC_ORIG_CONNECTING ||
 	       ep->conn_state == VRB_XRC_RECIP_CONNECTING);
 
@@ -807,14 +808,15 @@ vrb_eq_xrc_connected_event(struct vrb_eq *eq,
 	return ret;
 }
 
-/* Caller must hold eq:lock */
-static inline void
+static void
 vrb_eq_xrc_timewait_event(struct vrb_eq *eq,
-			     struct rdma_cm_event *cma_event, int *acked)
+			  struct rdma_cm_event *cma_event, int *acked)
 {
 	fid_t fid = cma_event->id->context;
 	struct vrb_xrc_ep *ep = container_of(fid, struct vrb_xrc_ep,
 						base_ep.util_ep.ep_fid);
+
+	assert(fastlock_held(&eq->lock));
 	assert(ep->magic == VERBS_XRC_EP_MAGIC);
 	assert(ep->conn_setup);
 
@@ -833,14 +835,15 @@ vrb_eq_xrc_timewait_event(struct vrb_eq *eq,
 		vrb_free_xrc_conn_setup(ep, 0);
 }
 
-/* Caller must hold eq:lock */
 static inline void
 vrb_eq_xrc_disconnect_event(struct vrb_eq *eq,
 			       struct rdma_cm_event *cma_event, int *acked)
 {
 	fid_t fid = cma_event->id->context;
 	struct vrb_xrc_ep *ep = container_of(fid, struct vrb_xrc_ep,
-						base_ep.util_ep.ep_fid);
+					     base_ep.util_ep.ep_fid);
+
+	assert(fastlock_held(&eq->lock));
 	assert(ep->magic == VERBS_XRC_EP_MAGIC);
 
 	if (ep->conn_setup && cma_event->id == ep->base_ep.id) {
@@ -888,7 +891,7 @@ vrb_eq_cm_process_event(struct vrb_eq *eq,
 
 		ret = vrb_eq_cm_getinfo(cma_event, pep->info, &entry->info);
 		if (ret) {
-			VERBS_WARN(FI_LOG_EP_CTRL,
+			VRB_WARN(FI_LOG_EP_CTRL,
 				   "CM getinfo error %d\n", ret);
 			rdma_destroy_id(cma_event->id);
 			eq->err.err = -ret;
@@ -918,9 +921,7 @@ vrb_eq_cm_process_event(struct vrb_eq *eq,
 		if (cma_event->id->qp &&
 		    cma_event->id->qp->context->device->transport_type !=
 		    IBV_TRANSPORT_IWARP) {
-			ret = vrb_set_rnr_timer(cma_event->id->qp);
-			if (ret)
-				goto ack;
+			vrb_set_rnr_timer(cma_event->id->qp);
 		}
 		ep = container_of(fid, struct vrb_ep, util_ep.ep_fid);
 		if (vrb_is_xrc_ep(ep)) {
@@ -1008,7 +1009,7 @@ xrc_shared_reject:
 		eq->err.err = EADDRNOTAVAIL;
 		goto err;
 	default:
-		VERBS_WARN(FI_LOG_EP_CTRL, "unknown rdmacm event received: %d\n",
+		VRB_WARN(FI_LOG_EP_CTRL, "unknown rdmacm event received: %d\n",
 			   cma_event->event);
 		ret = -FI_EAGAIN;
 		goto ack;
@@ -1061,12 +1062,12 @@ int vrb_eq_match_event(struct dlist_entry *item, const void *arg)
 	}
 }
 
-/* Caller must hold eq->lock */
 void vrb_eq_remove_events(struct vrb_eq *eq, struct fid *fid)
 {
 	struct dlist_entry *item;
 	struct vrb_eq_entry *entry;
 
+	assert(fastlock_held(&eq->lock));
 	while ((item =
 		dlistfd_remove_first_match(&eq->list_head,
 					   vrb_eq_match_event, fid))) {
@@ -1205,7 +1206,7 @@ vrb_eq_sread(struct fid_eq *eq_fid, uint32_t *event,
 		void *buf, size_t len, int timeout, uint64_t flags)
 {
 	struct vrb_eq *eq;
-	void *contexts;
+	struct ofi_epollfds_event fdevent;
 	ssize_t ret;
 
 	eq = container_of(eq_fid, struct vrb_eq, eq_fid.fid);
@@ -1215,12 +1216,12 @@ vrb_eq_sread(struct fid_eq *eq_fid, uint32_t *event,
 		if (ret && (ret != -FI_EAGAIN))
 			return ret;
 
-		ret = ofi_epoll_wait(eq->epollfd, &contexts, 1, timeout);
+		ret = ofi_epoll_wait(eq->epollfd, &fdevent, 1, timeout);
 		if (ret == 0)
 			return -FI_EAGAIN;
 		else if (ret < 0)
 			return -errno;
-	};
+	}
 }
 
 static const char *
@@ -1292,7 +1293,7 @@ static int vrb_eq_close(fid_t fid)
 	/* TODO: use util code, if possible, and add ref counting */
 
 	if (!ofi_rbmap_empty(&eq->xrc.sidr_conn_rbmap))
-		VERBS_WARN(FI_LOG_EP_CTRL, "SIDR connection RBmap not empty\n");
+		VRB_WARN(FI_LOG_EP_CTRL, "SIDR connection RBmap not empty\n");
 
 	free(eq->err.err_data);
 
@@ -1351,7 +1352,7 @@ int vrb_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr,
 	fastlock_init(&_eq->lock);
 	ret = dlistfd_head_init(&_eq->list_head);
 	if (ret) {
-		VERBS_INFO(FI_LOG_EQ, "Unable to initialize dlistfd\n");
+		VRB_INFO(FI_LOG_EQ, "Unable to initialize dlistfd\n");
 		goto err1;
 	}
 
diff --git a/deps/libfabric/prov/verbs/src/verbs_info.c b/deps/libfabric/prov/verbs/src/verbs_info.c
index acdd96da8575b86ec0a8d1abe42a6c0044d7d414..1bc70070f23756695c29818829f757d81bb8d30c 100644
--- a/deps/libfabric/prov/verbs/src/verbs_info.c
+++ b/deps/libfabric/prov/verbs/src/verbs_info.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2013-2015 Intel Corporation, Inc.  All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -64,7 +65,7 @@
 		OFI_ORDER_WAW_SET | FI_ORDER_WAS | FI_ORDER_SAW | FI_ORDER_SAS )
 
 #define VERBS_INFO_NODE_2_UD_ADDR(subsys, node, svc, ib_ud_addr)			\
-	VERBS_INFO(subsys, "'%s:%u' resolved to <gid <interface_id=%"PRIu64		\
+	VRB_INFO(subsys, "'%s:%u' resolved to <gid <interface_id=%"PRIu64		\
 			   ", subnet_prefix=%"PRIu64">, lid=%d, service = %u>\n",	\
 		   node, svc, be64toh((ib_ud_addr)->gid.global.interface_id),		\
 		   be64toh((ib_ud_addr)->gid.global.subnet_prefix),			\
@@ -182,7 +183,7 @@ int vrb_check_ep_attr(const struct fi_info *hints,
 	case FI_PROTO_IB_UD:
 		break;
 	default:
-		VERBS_INFO(FI_LOG_CORE,
+		VRB_INFO(FI_LOG_CORE,
 			   "Unsupported protocol\n");
 		return -FI_ENODATA;
 	}
@@ -236,16 +237,16 @@ static int vrb_check_hints(uint32_t version, const struct fi_info *hints,
 	uint64_t prov_mode;
 
 	if (hints->caps & ~(info->caps)) {
-		VERBS_INFO(FI_LOG_CORE, "Unsupported capabilities\n");
-		FI_INFO_CHECK(&vrb_prov, info, hints, caps, FI_TYPE_CAPS);
+		VRB_INFO(FI_LOG_CORE, "Unsupported capabilities\n");
+		OFI_INFO_CHECK(&vrb_prov, info, hints, caps, FI_TYPE_CAPS);
 		return -FI_ENODATA;
 	}
 
 	prov_mode = ofi_mr_get_prov_mode(version, hints, info);
 
 	if ((hints->mode & prov_mode) != prov_mode) {
-		VERBS_INFO(FI_LOG_CORE, "needed mode not set\n");
-		FI_INFO_MODE(&vrb_prov, prov_mode, hints->mode);
+		VRB_INFO(FI_LOG_CORE, "needed mode not set\n");
+		OFI_INFO_MODE(&vrb_prov, prov_mode, hints->mode);
 		return -FI_ENODATA;
 	}
 
@@ -259,7 +260,7 @@ static int vrb_check_hints(uint32_t version, const struct fi_info *hints,
 	if (hints->domain_attr) {
 		if (hints->domain_attr->name &&
 		    strcasecmp(hints->domain_attr->name, info->domain_attr->name)) {
-			VERBS_INFO(FI_LOG_CORE, "skipping device %s (want %s)\n",
+			VRB_INFO(FI_LOG_CORE, "skipping device %s (want %s)\n",
 				   info->domain_attr->name, hints->domain_attr->name);
 			return -FI_ENODATA;
 		}
@@ -333,7 +334,7 @@ int vrb_set_rai(uint32_t addr_format, void *src_addr, size_t src_addrlen,
 		}
 		break;
 	default:
-		VERBS_INFO(FI_LOG_FABRIC, "Unknown addr_format\n");
+		VRB_INFO(FI_LOG_FABRIC, "Unknown addr_format\n");
 	}
 
 	if (src_addrlen) {
@@ -399,7 +400,7 @@ static int vrb_rai_to_fi(struct rdma_addrinfo *rai, struct fi_info *fi)
 
 	fi->addr_format = ofi_translate_addr_format(rai->ai_family);
 	if (fi->addr_format == FI_FORMAT_UNSPEC) {
-		VERBS_WARN(FI_LOG_FABRIC, "Unknown address format\n");
+		VRB_WARN(FI_LOG_FABRIC, "Unknown address format\n");
 		return -FI_EINVAL;
 	}
 
@@ -433,13 +434,13 @@ static inline int vrb_get_qp_cap(struct ibv_context *ctx,
 
 	pd = ibv_alloc_pd(ctx);
 	if (!pd) {
-		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_alloc_pd", errno);
+		VRB_WARN_ERRNO(FI_LOG_FABRIC, "ibv_alloc_pd");
 		return -errno;
 	}
 
 	cq = ibv_create_cq(ctx, 1, NULL, NULL, 0);
 	if (!cq) {
-		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_create_cq", errno);
+		VRB_WARN_ERRNO(FI_LOG_FABRIC, "ibv_create_cq");
 		ret = -errno;
 		goto err1;
 	}
@@ -475,7 +476,7 @@ static inline int vrb_get_qp_cap(struct ibv_context *ctx,
 
 	qp = ibv_create_qp(pd, &init_attr);
 	if (!qp) {
-		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_create_qp", errno);
+		VRB_WARN_ERRNO(FI_LOG_FABRIC, "ibv_create_qp");
 		ret = -errno;
 		goto err2;
 	}
@@ -550,8 +551,7 @@ static int vrb_get_device_attrs(struct ibv_context *ctx,
 
 	ret = ibv_query_device(ctx, &device_attr);
 	if (ret) {
-		VERBS_INFO_ERRNO(FI_LOG_FABRIC,
-				 "ibv_query_device", errno);
+		VRB_WARN_ERRNO(FI_LOG_FABRIC, "ibv_query_device");
 		return -errno;
 	}
 
@@ -600,8 +600,7 @@ static int vrb_get_device_attrs(struct ibv_context *ctx,
 	for (port_num = 1; port_num < device_attr.phys_port_cnt + 1; port_num++) {
 		ret = ibv_query_port(ctx, port_num, &port_attr);
 		if (ret) {
-			VERBS_INFO_ERRNO(FI_LOG_FABRIC,
-					 "ibv_query_port", errno);
+			VRB_WARN_ERRNO(FI_LOG_FABRIC, "ibv_query_port");
 			return -errno;
 		}
 		if (port_attr.state == IBV_PORT_ACTIVE)
@@ -613,14 +612,14 @@ static int vrb_get_device_attrs(struct ibv_context *ctx,
 			"active ports\n", dev_name);
 		return -FI_ENODATA;
 	} else {
-		VERBS_INFO(FI_LOG_FABRIC, "device %s: first found active port "
+		VRB_INFO(FI_LOG_FABRIC, "device %s: first found active port "
 			   "is %"PRIu8"\n", dev_name, port_num);
 	}
 
 	if (info->ep_attr->type == FI_EP_DGRAM) {
 		ret = vrb_mtu_type_to_len(port_attr.active_mtu);
 		if (ret < 0) {
-			VERBS_WARN(FI_LOG_FABRIC, "device %s (port: %d) reports"
+			VRB_WARN(FI_LOG_FABRIC, "device %s (port: %d) reports"
 				   " an unrecognized MTU (%d) \n",
 				   dev_name, port_num, port_attr.active_mtu);
 			return ret;
@@ -638,7 +637,7 @@ static int vrb_get_device_attrs(struct ibv_context *ctx,
 		       device_attr.vendor_part_id);
 	if (ret < 0) {
 		info->nic->device_attr->device_id = NULL;
-		VERBS_WARN(FI_LOG_FABRIC,
+		VRB_WARN(FI_LOG_FABRIC,
 			   "Unable to allocate memory for device_attr::device_id\n");
 		return -FI_ENOMEM;
 	}
@@ -647,7 +646,7 @@ static int vrb_get_device_attrs(struct ibv_context *ctx,
 		       device_attr.vendor_id);
 	if (ret < 0) {
 		info->nic->device_attr->vendor_id = NULL;
-		VERBS_WARN(FI_LOG_FABRIC,
+		VRB_WARN(FI_LOG_FABRIC,
 			   "Unable to allocate memory for device_attr::vendor_id\n");
 		return -FI_ENOMEM;
 	}
@@ -656,14 +655,14 @@ static int vrb_get_device_attrs(struct ibv_context *ctx,
 		       device_attr.hw_ver);
 	if (ret < 0) {
 		info->nic->device_attr->device_version = NULL;
-		VERBS_WARN(FI_LOG_FABRIC,
+		VRB_WARN(FI_LOG_FABRIC,
 			   "Unable to allocate memory for device_attr::device_version\n");
 		return -FI_ENOMEM;
 	}
 
         info->nic->device_attr->firmware = strdup(device_attr.fw_ver);
 	if (!info->nic->device_attr->firmware) {
-		VERBS_WARN(FI_LOG_FABRIC,
+		VRB_WARN(FI_LOG_FABRIC,
 			   "Unable to allocate memory for device_attr::firmware\n");
 		return -FI_ENOMEM;
 	}
@@ -677,7 +676,7 @@ static int vrb_get_device_attrs(struct ibv_context *ctx,
 	info->nic->link_attr->network_type =
 		strdup(vrb_link_layer_str(port_attr.link_layer));
 	if (!info->nic->link_attr->network_type) {
-		VERBS_WARN(FI_LOG_FABRIC,
+		VRB_WARN(FI_LOG_FABRIC,
 			   "Unable to allocate memory for link_attr::network_type\n");
 		return -FI_ENOMEM;
 	}
@@ -728,6 +727,20 @@ static int vrb_have_device(void)
 	return ret;
 }
 
+static bool vrb_hmem_supported(const char *dev_name)
+{
+	if (ofi_hmem_p2p_disabled())
+		return false;
+
+	if (vrb_gl_data.peer_mem_support && strstr(dev_name, "mlx"))
+		return true;
+
+	if (vrb_gl_data.dmabuf_support && strstr(dev_name, "mlx5"))
+		return true;
+
+	return false;
+}
+
 static int vrb_alloc_info(struct ibv_context *ctx, struct fi_info **info,
 			     const struct verbs_ep_domain *ep_dom)
 {
@@ -791,9 +804,8 @@ static int vrb_alloc_info(struct ibv_context *ctx, struct fi_info **info,
 
 	switch (ctx->device->transport_type) {
 	case IBV_TRANSPORT_IB:
-		if (ibv_query_gid(ctx, 1, 0, &gid)) {
-			VERBS_INFO_ERRNO(FI_LOG_FABRIC,
-					 "ibv_query_gid", errno);
+		if (ibv_query_gid(ctx, 1, vrb_gl_data.gid_idx, &gid)) {
+			VRB_WARN_ERRNO(FI_LOG_FABRIC, "ibv_query_gid");
 			ret = -errno;
 			goto err;
 		}
@@ -837,7 +849,7 @@ static int vrb_alloc_info(struct ibv_context *ctx, struct fi_info **info,
 		fi->domain_attr->cq_data_size = 0;
 		break;
 	default:
-		VERBS_INFO(FI_LOG_CORE, "Unknown transport type\n");
+		VRB_INFO(FI_LOG_CORE, "Unknown transport type\n");
 		ret = -FI_ENODATA;
 		goto err;
 	}
@@ -1043,7 +1055,12 @@ static int vrb_get_sib(struct dlist_entry *verbs_devs)
 		return -errno;
 
 	for (int dev = 0; dev < num_devices; dev++) {
+		if (!devices[dev])
+			continue;
+
 		context = ibv_open_device(devices[dev]);
+		if (!context)
+			continue;
 
 		ret = ibv_query_device(context, &device_attr);
 		if (ret)
@@ -1104,7 +1121,7 @@ static int vrb_getifaddrs(struct dlist_entry *verbs_devs)
 
 	ret = ofi_getifaddrs(&ifaddr);
 	if (ret) {
-		VERBS_WARN(FI_LOG_FABRIC,
+		VRB_WARN(FI_LOG_FABRIC,
 			   "unable to get interface addresses\n");
 		return ret;
 	}
@@ -1113,7 +1130,7 @@ static int vrb_getifaddrs(struct dlist_entry *verbs_devs)
 	if (iface) {
 		iface_len = strlen(iface);
 		if (iface_len > IFNAMSIZ) {
-			VERBS_INFO(FI_LOG_FABRIC, "iface name: %s, too long "
+			VRB_INFO(FI_LOG_FABRIC, "iface name: %s, too long "
 				   "max: %d\n", iface, IFNAMSIZ);
 
 		}
@@ -1316,7 +1333,7 @@ int vrb_init_info(const struct fi_info **all_infos)
 	*all_infos = NULL;
 
 	if (!vrb_have_device()) {
-		VERBS_INFO(FI_LOG_FABRIC, "no RDMA devices found\n");
+		VRB_INFO(FI_LOG_FABRIC, "no RDMA devices found\n");
 		ret = -FI_ENODATA;
 		goto done;
 	}
@@ -1349,12 +1366,20 @@ int vrb_init_info(const struct fi_info **all_infos)
 
 	ctx_list = rdma_get_devices(&num_devices);
 	if (!num_devices) {
-		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_get_devices", errno);
+		VRB_WARN_ERRNO(FI_LOG_FABRIC, "rdma_get_devices");
 		ret = -errno;
 		goto done;
 	}
 
 	for (i = 0; i < num_devices; i++) {
+		if (!ctx_list[i]) {
+			FI_INFO(&vrb_prov, FI_LOG_FABRIC,
+				"skipping device: %d, "
+				"the interface may be down, faulty or disabled\n",
+				i);
+			continue;
+		}
+
 		for (j = 0; j < dom_count; j++) {
 			if (ep_type[j]->type == FI_EP_MSG &&
 			    !vrb_device_has_ipoib_addr(ctx_list[i]->device->name)) {
@@ -1373,11 +1398,29 @@ int vrb_init_info(const struct fi_info **all_infos)
 				continue;
 
 			ret = vrb_alloc_info(ctx_list[i], &fi, ep_type[j]);
-			if (!ret) {
-				if (!*all_infos)
-					*all_infos = fi;
-				else
-					tail->next = fi;
+			if (ret)
+				continue;
+
+			if (!*all_infos)
+				*all_infos = fi;
+			else
+				tail->next = fi;
+			tail = fi;
+
+			/* If verbs HMEM is supported, duplicate previously
+			 * allocated fi_info and apply HMEM flags.
+			 */
+			if (vrb_hmem_supported(ctx_list[i]->device->name)) {
+				fi = fi_dupinfo(fi);
+				if (!fi)
+					continue;
+
+				fi->caps |= FI_HMEM;
+				fi->tx_attr->caps |= FI_HMEM;
+				fi->rx_attr->caps |= FI_HMEM;
+				fi->domain_attr->mr_mode |= FI_MR_HMEM;
+
+				tail->next = fi;
 				tail = fi;
 			}
 		}
@@ -1584,7 +1627,7 @@ static int vrb_resolve_ib_ud_dest_addr(const char *node, const char *service,
 	if (*dest_addr) {
 		VERBS_INFO_NODE_2_UD_ADDR(FI_LOG_CORE, node, svc, *dest_addr);
 	} else {
-		VERBS_INFO(FI_LOG_CORE,
+		VRB_INFO(FI_LOG_CORE,
 			   "failed to resolve '%s:%u'.\n", node, svc);
 		return -FI_ENODATA;
 	}
@@ -1644,7 +1687,7 @@ static int vrb_handle_ib_ud_addr(const char *node, const char *service,
 	if (!src_addr) {
 		src_addr = calloc(1, sizeof(*src_addr));
 		if (!src_addr) {
-			VERBS_INFO(FI_LOG_CORE,
+			VRB_INFO(FI_LOG_CORE,
 			           "failed to allocate src addr.\n");
 			ret = -FI_ENODATA;
 			goto err;
@@ -1660,7 +1703,7 @@ static int vrb_handle_ib_ud_addr(const char *node, const char *service,
 				}
 			}
 
-			VERBS_INFO(FI_LOG_CORE, "node '%s' service '%s' "
+			VRB_INFO(FI_LOG_CORE, "node '%s' service '%s' "
 				                "converted to <service=%d>\n",
 				   node, service, src_addr->service);
 		}
@@ -1710,7 +1753,7 @@ static int vrb_handle_sock_addr(const char *node, const char *service,
 out:
 	rdma_freeaddrinfo(rai);
 	if (rdma_destroy_id(id))
-		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_destroy_id", errno);
+		VRB_WARN_ERRNO(FI_LOG_FABRIC, "rdma_destroy_id");
 	return ret;
 }
 
@@ -1733,7 +1776,7 @@ static int vrb_get_match_infos(uint32_t version, const char *node,
 	    hints->ep_attr->type == FI_EP_UNSPEC) {
 		ret_sock_addr = vrb_handle_sock_addr(node, service, flags, hints, info);
 		if (ret_sock_addr) {
-			VERBS_INFO(FI_LOG_FABRIC,
+			VRB_INFO(FI_LOG_FABRIC,
 				   "handling of the socket address fails - %d\n",
 				   ret_sock_addr);
 		} else {
@@ -1746,7 +1789,7 @@ static int vrb_get_match_infos(uint32_t version, const char *node,
 	    hints->ep_attr->type == FI_EP_UNSPEC) {
 		ret_ib_ud_addr = vrb_handle_ib_ud_addr(node, service, flags, info);
 		if (ret_ib_ud_addr)
-			VERBS_INFO(FI_LOG_FABRIC,
+			VRB_INFO(FI_LOG_FABRIC,
 				   "handling of the IB ID address fails - %d\n",
 				   ret_ib_ud_addr);
 	}
@@ -1754,7 +1797,7 @@ static int vrb_get_match_infos(uint32_t version, const char *node,
 	if (ret_sock_addr && ret_ib_ud_addr) {
 		/* neither the sockaddr nor the ib_ud address was
 		 * handled to satisfy the selection procedure */
-		VERBS_INFO(FI_LOG_CORE, "Handling of the addresses fails, "
+		VRB_INFO(FI_LOG_CORE, "Handling of the addresses failed, "
 			   "unable to get matching infos\n");
 		fi_freeinfo(*info);
 		return -FI_ENODATA;
diff --git a/deps/libfabric/prov/verbs/src/verbs_mr.c b/deps/libfabric/prov/verbs/src/verbs_mr.c
index 204d17985970f50e3b62c91b99ae2f85b3e07fa6..90442a3b8b1a41fcf2d4cac236723a310a0e47a5 100644
--- a/deps/libfabric/prov/verbs/src/verbs_mr.c
+++ b/deps/libfabric/prov/verbs/src/verbs_mr.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2017-2019 Intel Corporation, Inc.  All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -33,33 +34,6 @@
 #include <ofi_util.h>
 #include "fi_verbs.h"
 
-
-static int
-vrb_mr_regv(struct fid *fid, const struct iovec *iov,
-	       size_t count, uint64_t access, uint64_t offset,
-	       uint64_t requested_key, uint64_t flags,
-	       struct fid_mr **mr, void *context)
-{
-	struct fid_domain *domain = container_of(fid, struct fid_domain, fid);
-
-	if (OFI_UNLIKELY(count > 1))
-		return -FI_EINVAL;
-
-	return count ? fi_mr_reg(domain, (const void *) iov->iov_base,
-				 iov->iov_len, access, offset, requested_key,
-				 flags, mr, context) :
-		       fi_mr_reg(domain, NULL, 0, access, offset, requested_key,
-				 flags, mr, context);
-}
-
-static int vrb_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr,
-			     uint64_t flags, struct fid_mr **mr)
-{
-	return vrb_mr_regv(fid, attr->mr_iov, attr->iov_count, attr->access,
-			      attr->offset, attr->requested_key, flags, mr,
-			      attr->context);
-}
-
 static int vrb_mr_close(fid_t fid)
 {
 	struct vrb_mem_desc *mr;
@@ -83,18 +57,55 @@ static struct fi_ops vrb_mr_fi_ops = {
 	.ops_open = fi_no_ops_open,
 };
 
+#if VERBS_HAVE_DMABUF_MR
 static inline
-int vrb_mr_reg_common(struct vrb_mem_desc *md, int vrb_access,
-			 const void *buf, size_t len, void *context)
+struct ibv_mr *vrb_mr_ibv_reg_dmabuf_mr(struct ibv_pd *pd, const void *buf,
+				        size_t len, int vrb_access)
+{
+	void *handle;
+	void *base;
+	uint64_t offset;
+	int err;
+
+	err = ze_hmem_get_handle((void *)buf, &handle);
+	if (err)
+		return NULL;
+
+	err = ze_hmem_get_base_addr((void *)buf, &base, &len);
+	if (err)
+		return NULL;
+
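+	/* The MR spans the whole ZE allocation (len was replaced by the
+	 * base allocation size); buf becomes an offset into the dmabuf,
+	 * while the iova keeps the application's virtual address.
+	 */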
+	offset = (uintptr_t)buf - (uintptr_t)base;
+	return ibv_reg_dmabuf_mr(pd, offset, len, (uint64_t)buf/* iova */,
+				 (int)(uintptr_t)handle/* dmabuf fd */,
+				 vrb_access);
+}
+#endif
+
+static inline
+int vrb_mr_reg_common(struct vrb_mem_desc *md, int vrb_access, const void *buf,
+		      size_t len, void *context, enum fi_hmem_iface iface,
+		      uint64_t device)
 {
 	/* ops should be set in special functions */
 	md->mr_fid.fid.fclass = FI_CLASS_MR;
 	md->mr_fid.fid.context = context;
+	md->info.iface = iface;
+	md->info.device = device;
+	md->info.iov.iov_base = (void *) buf;
+	md->info.iov.iov_len = len;
 
-	if (md->domain->flags & VRB_USE_ODP)
+	if (md->domain->ext_flags & VRB_USE_ODP && iface == FI_HMEM_SYSTEM)
 		vrb_access |= VRB_ACCESS_ON_DEMAND;
 
-	md->mr = ibv_reg_mr(md->domain->pd, (void *) buf, len, vrb_access);
+#if VERBS_HAVE_DMABUF_MR
+	if (iface == FI_HMEM_ZE)
+		md->mr = vrb_mr_ibv_reg_dmabuf_mr(md->domain->pd, buf, len,
+					          vrb_access);
+	else
+#endif
+		md->mr = ibv_reg_mr(md->domain->pd, (void *) buf, len,
+				    vrb_access);
 	if (!md->mr) {
 		if (len)
 			return -errno;
@@ -102,8 +113,9 @@ int vrb_mr_reg_common(struct vrb_mem_desc *md, int vrb_access,
 			/* Ignore failure for zero length memory registration */
 			assert(errno == FI_EINVAL);
 	} else {
-		md->mr_fid.mem_desc = (void *)(uintptr_t)md->mr->lkey;
+		md->mr_fid.mem_desc = md;
 		md->mr_fid.key = md->mr->rkey;
+		md->lkey = md->mr->lkey;
 	}
 
 	if (md->domain->eq_flags & FI_REG_MR) {
@@ -113,7 +125,7 @@ int vrb_mr_reg_common(struct vrb_mem_desc *md, int vrb_access,
 		};
 		if (md->domain->eq)
 			vrb_eq_write_event(md->domain->eq, FI_MR_COMPLETE,
-				 	      &entry, sizeof(entry));
+					   &entry, sizeof(entry));
 		else if (md->domain->util_domain.eq)
 			 /* This branch is taken for the verbs/DGRAM */
 			fi_eq_write(&md->domain->util_domain.eq->eq_fid,
@@ -139,9 +151,6 @@ vrb_mr_ofi2ibv_access(uint64_t ofi_access, struct vrb_domain *domain)
 			ibv_access |= IBV_ACCESS_REMOTE_WRITE;
 	}
 
-	if (ofi_access & FI_WRITE)
-		ibv_access |= IBV_ACCESS_LOCAL_WRITE;
-
 	if (ofi_access & FI_REMOTE_READ)
 		ibv_access |= IBV_ACCESS_REMOTE_READ;
 
@@ -155,26 +164,23 @@ vrb_mr_ofi2ibv_access(uint64_t ofi_access, struct vrb_domain *domain)
 }
 
 static int
-vrb_mr_reg(struct fid *fid, const void *buf, size_t len,
-	      uint64_t access, uint64_t offset, uint64_t requested_key,
-	      uint64_t flags, struct fid_mr **mr, void *context)
+vrb_mr_nocache_reg(struct vrb_domain *domain, const void *buf, size_t len,
+		   uint64_t access, uint64_t offset, uint64_t requested_key,
+		   uint64_t flags, struct fid_mr **mr, void *context,
+		   enum fi_hmem_iface iface, uint64_t device)
 {
 	struct vrb_mem_desc *md;
 	int ret;
 
-	if (OFI_UNLIKELY(flags & ~OFI_MR_NOCACHE))
-		return -FI_EBADFLAGS;
-
 	md = calloc(1, sizeof(*md));
 	if (OFI_UNLIKELY(!md))
 		return -FI_ENOMEM;
 
-	md->domain = container_of(fid, struct vrb_domain,
-				  util_domain.domain_fid.fid);
+	md->domain = domain;
 	md->mr_fid.fid.ops = &vrb_mr_fi_ops;
 
 	ret = vrb_mr_reg_common(md, vrb_mr_ofi2ibv_access(access, md->domain),
-				   buf, len, context);
+				buf, len, context, iface, device);
 	if (OFI_UNLIKELY(ret))
 		goto err;
 
@@ -189,18 +195,10 @@ static int vrb_mr_cache_close(fid_t fid)
 {
 	struct vrb_mem_desc *md =
 		container_of(fid, struct vrb_mem_desc, mr_fid.fid);
-
 	ofi_mr_cache_delete(&md->domain->cache, md->entry);
 	return FI_SUCCESS;
 }
 
-struct fi_ops_mr vrb_mr_ops = {
-	.size = sizeof(struct fi_ops_mr),
-	.reg = vrb_mr_reg,
-	.regv = vrb_mr_regv,
-	.regattr = vrb_mr_regattr,
-};
-
 static struct fi_ops vrb_mr_cache_fi_ops = {
 	.size = sizeof(struct fi_ops),
 	.close = vrb_mr_cache_close,
@@ -221,7 +219,8 @@ int vrb_mr_cache_add_region(struct ofi_mr_cache *cache,
 	return vrb_mr_reg_common(md, IBV_ACCESS_LOCAL_WRITE |
 			IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC |
 			IBV_ACCESS_REMOTE_READ, entry->info.iov.iov_base,
-			entry->info.iov.iov_len, NULL);
+			entry->info.iov.iov_len, NULL, entry->info.iface,
+			entry->info.device);
 }
 
 void vrb_mr_cache_delete_region(struct ofi_mr_cache *cache,
@@ -233,23 +232,17 @@ void vrb_mr_cache_delete_region(struct ofi_mr_cache *cache,
 }
 
 static int
-vrb_mr_cache_reg(struct fid *fid, const void *buf, size_t len,
-		    uint64_t access, uint64_t offset, uint64_t requested_key,
-		    uint64_t flags, struct fid_mr **mr, void *context)
+vrb_mr_cache_reg(struct vrb_domain *domain, const void *buf, size_t len,
+		 uint64_t access, uint64_t offset, uint64_t requested_key,
+		 uint64_t flags, struct fid_mr **mr, void *context,
+		 enum fi_hmem_iface iface, uint64_t device)
 {
-	struct vrb_domain *domain;
 	struct vrb_mem_desc *md;
 	struct ofi_mr_entry *entry;
 	struct fi_mr_attr attr;
 	struct iovec iov;
 	int ret;
 
-	if (flags & ~OFI_MR_NOCACHE)
-		return -FI_EBADFLAGS;
-
-	domain = container_of(fid, struct vrb_domain,
-			      util_domain.domain_fid.fid);
-
 	attr.access = access;
 	attr.context = context;
 	attr.iov_count = 1;
@@ -259,7 +252,8 @@ vrb_mr_cache_reg(struct fid *fid, const void *buf, size_t len,
 	attr.offset = offset;
 	attr.requested_key = requested_key;
 	attr.auth_key_size = 0;
-	attr.iface = FI_HMEM_SYSTEM;
+	attr.iface = iface;
+	attr.device.reserved = device;
 
 	ret = (flags & OFI_MR_NOCACHE) ?
 	      ofi_mr_cache_reg(&domain->cache, &attr, &entry) :
@@ -272,9 +266,88 @@ vrb_mr_cache_reg(struct fid *fid, const void *buf, size_t len,
 	return FI_SUCCESS;
 }
 
-struct fi_ops_mr vrb_mr_cache_ops = {
+static int
+vrb_mr_reg_iface(struct fid *fid, const void *buf, size_t len, uint64_t access,
+		 uint64_t offset, uint64_t requested_key, uint64_t flags,
+		 struct fid_mr **mr, void *context, enum fi_hmem_iface iface,
+		 uint64_t device)
+{
+	struct vrb_domain *domain;
+
+	domain = container_of(fid, struct vrb_domain,
+			      util_domain.domain_fid.fid);
+
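+	/* Use the MR cache only when a memory monitor is registered for
+	 * this iface; otherwise register the region directly.
+	 */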
+	if (domain->cache.monitors[iface])
+		return vrb_mr_cache_reg(domain, buf, len, access, offset,
+					requested_key, flags, mr, context,
+					iface, device);
+	else
+		return vrb_mr_nocache_reg(domain, buf, len, access, offset,
+					  requested_key, flags, mr, context,
+					  iface, device);
+}
+
+static int
+vrb_mr_regv_iface(struct fid *fid, const struct iovec *iov, size_t count,
+		  uint64_t access, uint64_t offset, uint64_t requested_key,
+		  uint64_t flags, struct fid_mr **mr, void *context,
+		  enum fi_hmem_iface iface, uint64_t device)
+{
+	const void *addr = count ? iov->iov_base : NULL;
+	size_t len = count ? iov->iov_len : 0;
+
+	if (OFI_UNLIKELY(count > 1))
+		return -FI_EINVAL;
+
+	return vrb_mr_reg_iface(fid, addr, len, access, offset, requested_key,
+				flags, mr, context, iface, device);
+}
+
+static int
+vrb_mr_reg(struct fid *fid, const void *buf, size_t len, uint64_t access,
+	   uint64_t offset, uint64_t requested_key, uint64_t flags,
+	   struct fid_mr **mr, void *context)
+{
+	return vrb_mr_reg_iface(fid, buf, len, access, offset, requested_key,
+				flags, mr, context, FI_HMEM_SYSTEM, 0);
+}
+
+static int
+vrb_mr_regv(struct fid *fid, const struct iovec *iov, size_t count,
+	    uint64_t access, uint64_t offset, uint64_t requested_key,
+	    uint64_t flags, struct fid_mr **mr, void *context)
+{
+	return vrb_mr_regv_iface(fid, iov, count, access, offset, requested_key,
+				 flags, mr, context, FI_HMEM_SYSTEM, 0);
+}
+
+static int vrb_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr,
+			  uint64_t flags, struct fid_mr **mr)
+{
+	struct vrb_domain *domain;
+	struct fi_mr_attr cur_abi_attr;
+
+	domain = container_of(fid, struct vrb_domain,
+			      util_domain.domain_fid.fid);
+
+	ofi_mr_update_attr(domain->util_domain.fabric->fabric_fid.api_version,
+			   domain->util_domain.info_domain_caps, attr,
+			   &cur_abi_attr);
+
+	if ((flags & FI_HMEM_HOST_ALLOC) && (cur_abi_attr.iface == FI_HMEM_ZE))
+		cur_abi_attr.device.ze = -1;
+
+	return vrb_mr_regv_iface(fid, cur_abi_attr.mr_iov,
+				 cur_abi_attr.iov_count, cur_abi_attr.access,
+				 cur_abi_attr.offset,
+				 cur_abi_attr.requested_key, flags, mr,
+				 cur_abi_attr.context, cur_abi_attr.iface,
+				 cur_abi_attr.device.reserved);
+}
+
+struct fi_ops_mr vrb_mr_ops = {
 	.size = sizeof(struct fi_ops_mr),
-	.reg = vrb_mr_cache_reg,
+	.reg = vrb_mr_reg,
 	.regv = vrb_mr_regv,
 	.regattr = vrb_mr_regattr,
 };
diff --git a/deps/libfabric/prov/verbs/src/verbs_msg.c b/deps/libfabric/prov/verbs/src/verbs_msg.c
index 13c0d9681d4066cfc5d720d719fca1f09bd6b96e..31948c8e3023f5993a5877d9c86c8d50be087627 100644
--- a/deps/libfabric/prov/verbs/src/verbs_msg.c
+++ b/deps/libfabric/prov/verbs/src/verbs_msg.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2013-2018 Intel Corporation, Inc.  All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -46,7 +47,7 @@ vrb_msg_ep_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t fla
 		.next = NULL,
 	};
 
-	vrb_set_sge_iov(wr.sg_list, msg->msg_iov, msg->iov_count, msg->desc);
+	vrb_iov_dupa(wr.sg_list, msg->msg_iov, msg->desc, msg->iov_count);
 	return vrb_post_recv(ep, &wr);
 }
 
@@ -98,7 +99,8 @@ vrb_msg_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t fla
 		wr.opcode = IBV_WR_SEND;
 	}
 
-	return vrb_send_msg(ep, &wr, msg, flags);
+	return vrb_send_iov(ep, &wr, msg->msg_iov, msg->desc,
+			    msg->iov_count, flags);
 }
 
 static ssize_t
@@ -110,7 +112,7 @@ vrb_msg_ep_send(struct fid_ep *ep_fid, const void *buf, size_t len,
 	struct ibv_send_wr wr = {
 		.wr_id = VERBS_COMP(ep, (uintptr_t)context),
 		.opcode = IBV_WR_SEND,
-		.send_flags = VERBS_INJECT(ep, len),
+		.send_flags = VERBS_INJECT(ep, len, desc),
 	};
 
 	return vrb_send_buf(ep, &wr, buf, len, desc);
@@ -125,10 +127,11 @@ vrb_msg_ep_senddata(struct fid_ep *ep_fid, const void *buf, size_t len,
 	struct ibv_send_wr wr = {
 		.wr_id = VERBS_COMP(ep, (uintptr_t)context),
 		.opcode = IBV_WR_SEND_WITH_IMM,
-		.imm_data = htonl((uint32_t)data),
-		.send_flags = VERBS_INJECT(ep, len),
+		.send_flags = VERBS_INJECT(ep, len, desc),
 	};
 
+	wr.imm_data = htonl((uint32_t)data);
+
 	return vrb_send_buf(ep, &wr, buf, len, desc);
 }
 
@@ -143,7 +146,8 @@ vrb_msg_ep_sendv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc,
 		.opcode = IBV_WR_SEND,
 	};
 
-	return vrb_send_iov(ep, &wr, iov, desc, count);
+	return vrb_send_iov(ep, &wr, iov, desc, count,
+			    ep->util_ep.tx_op_flags);
 }
 
 static ssize_t vrb_msg_ep_inject(struct fid_ep *ep_fid, const void *buf, size_t len,
@@ -157,7 +161,7 @@ static ssize_t vrb_msg_ep_inject(struct fid_ep *ep_fid, const void *buf, size_t
 		.send_flags = IBV_SEND_INLINE,
 	};
 
-	return vrb_send_buf_inline(ep, &wr, buf, len);
+	return vrb_send_buf(ep, &wr, buf, len, NULL);
 }
 
 static ssize_t vrb_msg_ep_injectdata(struct fid_ep *ep_fid, const void *buf, size_t len,
@@ -168,11 +172,12 @@ static ssize_t vrb_msg_ep_injectdata(struct fid_ep *ep_fid, const void *buf, siz
 	struct ibv_send_wr wr = {
 		.wr_id = VERBS_NO_COMP_FLAG,
 		.opcode = IBV_WR_SEND_WITH_IMM,
-		.imm_data = htonl((uint32_t)data),
 		.send_flags = IBV_SEND_INLINE,
 	};
 
-	return vrb_send_buf_inline(ep, &wr, buf, len);
+	wr.imm_data = htonl((uint32_t)data);
+
+	return vrb_send_buf(ep, &wr, buf, len, NULL);
 }
 
 static ssize_t
@@ -250,7 +255,8 @@ vrb_msg_xrc_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t
 		wr.opcode = IBV_WR_SEND;
 	}
 
-	return vrb_send_msg(&ep->base_ep, &wr, msg, flags);
+	return vrb_send_iov(&ep->base_ep, &wr, msg->msg_iov, msg->desc,
+			    msg->iov_count, flags);
 }
 
 static ssize_t
@@ -262,7 +268,7 @@ vrb_msg_xrc_ep_send(struct fid_ep *ep_fid, const void *buf, size_t len,
 	struct ibv_send_wr wr = {
 		.wr_id = VERBS_COMP(&ep->base_ep, (uintptr_t)context),
 		.opcode = IBV_WR_SEND,
-		.send_flags = VERBS_INJECT(&ep->base_ep, len),
+		.send_flags = VERBS_INJECT(&ep->base_ep, len, desc),
 	};
 
 	VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn);
@@ -279,10 +285,11 @@ vrb_msg_xrc_ep_senddata(struct fid_ep *ep_fid, const void *buf, size_t len,
 	struct ibv_send_wr wr = {
 		.wr_id = VERBS_COMP(&ep->base_ep, (uintptr_t)context),
 		.opcode = IBV_WR_SEND_WITH_IMM,
-		.imm_data = htonl((uint32_t)data),
-		.send_flags = VERBS_INJECT(&ep->base_ep, len),
+		.send_flags = VERBS_INJECT(&ep->base_ep, len, desc),
 	};
 
+	wr.imm_data = htonl((uint32_t)data);
+
 	VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn);
 
 	return vrb_send_buf(&ep->base_ep, &wr, buf, len, desc);
@@ -301,7 +308,8 @@ vrb_msg_xrc_ep_sendv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc
 
 	VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn);
 
-	return vrb_send_iov(&ep->base_ep, &wr, iov, desc, count);
+	return vrb_send_iov(&ep->base_ep, &wr, iov, desc, count,
+			    ep->base_ep.util_ep.tx_op_flags);
 }
 
 static ssize_t vrb_msg_xrc_ep_inject(struct fid_ep *ep_fid, const void *buf, size_t len,
@@ -317,7 +325,7 @@ static ssize_t vrb_msg_xrc_ep_inject(struct fid_ep *ep_fid, const void *buf, siz
 
 	VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn);
 
-	return vrb_send_buf_inline(&ep->base_ep, &wr, buf, len);
+	return vrb_send_buf(&ep->base_ep, &wr, buf, len, NULL);
 }
 
 static ssize_t vrb_msg_xrc_ep_injectdata(struct fid_ep *ep_fid, const void *buf, size_t len,
@@ -328,13 +336,14 @@ static ssize_t vrb_msg_xrc_ep_injectdata(struct fid_ep *ep_fid, const void *buf,
 	struct ibv_send_wr wr = {
 		.wr_id = VERBS_NO_COMP_FLAG,
 		.opcode = IBV_WR_SEND_WITH_IMM,
-		.imm_data = htonl((uint32_t)data),
 		.send_flags = IBV_SEND_INLINE,
 	};
 
+	wr.imm_data = htonl((uint32_t)data);
+
 	VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn);
 
-	return vrb_send_buf_inline(&ep->base_ep, &wr, buf, len);
+	return vrb_send_buf(&ep->base_ep, &wr, buf, len, NULL);
 }
 
 /* NOTE: Initially the XRC endpoint must be used with a SRQ. */
diff --git a/deps/libfabric/prov/verbs/src/verbs_rma.c b/deps/libfabric/prov/verbs/src/verbs_rma.c
index d52cd3a178d316f509df12dc50f00dad84e9c028..188dcc4a0f558ccf248e864b970275be8f578b7d 100644
--- a/deps/libfabric/prov/verbs/src/verbs_rma.c
+++ b/deps/libfabric/prov/verbs/src/verbs_rma.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2013-2018 Intel Corporation, Inc.  All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -55,7 +56,7 @@ vrb_msg_ep_rma_write(struct fid_ep *ep_fid, const void *buf, size_t len,
 		.opcode = IBV_WR_RDMA_WRITE,
 		.wr.rdma.remote_addr = addr,
 		.wr.rdma.rkey = (uint32_t)key,
-		.send_flags = VERBS_INJECT(ep, len),
+		.send_flags = VERBS_INJECT(ep, len, desc),
 	};
 
 	return vrb_send_buf(ep, &wr, buf, len, desc);
@@ -75,7 +76,8 @@ vrb_msg_ep_rma_writev(struct fid_ep *ep_fid, const struct iovec *iov, void **des
 		.wr.rdma.rkey = (uint32_t)key,
 	};
 
-	return vrb_send_iov(ep, &wr, iov, desc, count);
+	return vrb_send_iov(ep, &wr, iov, desc, count,
+			    ep->util_ep.tx_op_flags);
 }
 
 static ssize_t
@@ -97,7 +99,8 @@ vrb_msg_ep_rma_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg,
 		wr.opcode = IBV_WR_RDMA_WRITE;
 	}
 
-	return vrb_send_msg(ep, &wr, msg, flags);
+	return vrb_send_iov(ep, &wr, msg->msg_iov, msg->desc,
+			    msg->iov_count, flags);
 }
 
 static ssize_t
@@ -132,8 +135,7 @@ vrb_msg_ep_rma_readv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc
 		.num_sge = count,
 	};
 
-	vrb_set_sge_iov(wr.sg_list, iov, count, desc);
-
+	vrb_iov_dupa(wr.sg_list, iov, desc, count);
 	return vrb_post_send(ep, &wr, 0);
 }
 
@@ -151,8 +153,7 @@ vrb_msg_ep_rma_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg,
 		.num_sge = msg->iov_count,
 	};
 
-	vrb_set_sge_iov(wr.sg_list, msg->msg_iov, msg->iov_count, msg->desc);
-
+	vrb_iov_dupa(wr.sg_list, msg->msg_iov, msg->desc, msg->iov_count);
 	return vrb_post_send(ep, &wr, 0);
 }
 
@@ -166,12 +167,13 @@ vrb_msg_ep_rma_writedata(struct fid_ep *ep_fid, const void *buf, size_t len,
 	struct ibv_send_wr wr = {
 		.wr_id = VERBS_COMP(ep, (uintptr_t)context),
 		.opcode = IBV_WR_RDMA_WRITE_WITH_IMM,
-		.imm_data = htonl((uint32_t)data),
 		.wr.rdma.remote_addr = addr,
 		.wr.rdma.rkey = (uint32_t)key,
-		.send_flags = VERBS_INJECT(ep, len),
+		.send_flags = VERBS_INJECT(ep, len, desc),
 	};
 
+	wr.imm_data = htonl((uint32_t)data);
+
 	return vrb_send_buf(ep, &wr, buf, len, desc);
 }
 
@@ -189,7 +191,7 @@ vrb_msg_ep_rma_inject_write(struct fid_ep *ep_fid, const void *buf, size_t len,
 		.send_flags = IBV_SEND_INLINE,
 	};
 
-	return vrb_send_buf_inline(ep, &wr, buf, len);
+	return vrb_send_buf(ep, &wr, buf, len, NULL);
 }
 
 static ssize_t
@@ -219,13 +221,14 @@ vrb_msg_ep_rma_inject_writedata(struct fid_ep *ep_fid, const void *buf, size_t l
 	struct ibv_send_wr wr = {
 		.wr_id = VERBS_NO_COMP_FLAG,
 		.opcode = IBV_WR_RDMA_WRITE_WITH_IMM,
-		.imm_data = htonl((uint32_t)data),
 		.wr.rdma.remote_addr = addr,
 		.wr.rdma.rkey = (uint32_t)key,
 		.send_flags = IBV_SEND_INLINE,
 	};
 
-	return vrb_send_buf_inline(ep, &wr, buf, len);
+	wr.imm_data = htonl((uint32_t)data);
+
+	return vrb_send_buf(ep, &wr, buf, len, NULL);
 }
 
 static ssize_t
@@ -288,7 +291,7 @@ vrb_msg_xrc_ep_rma_write(struct fid_ep *ep_fid, const void *buf,
 		.opcode = IBV_WR_RDMA_WRITE,
 		.wr.rdma.remote_addr = addr,
 		.wr.rdma.rkey = (uint32_t)key,
-		.send_flags = VERBS_INJECT(&ep->base_ep, len),
+		.send_flags = VERBS_INJECT(&ep->base_ep, len, desc),
 	};
 
 	VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn);
@@ -312,7 +315,8 @@ vrb_msg_xrc_ep_rma_writev(struct fid_ep *ep_fid, const struct iovec *iov,
 
 	VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn);
 
-	return vrb_send_iov(&ep->base_ep, &wr, iov, desc, count);
+	return vrb_send_iov(&ep->base_ep, &wr, iov, desc, count,
+			    ep->base_ep.util_ep.tx_op_flags);
 }
 
 static ssize_t
@@ -336,7 +340,8 @@ vrb_msg_xrc_ep_rma_writemsg(struct fid_ep *ep_fid,
 		wr.opcode = IBV_WR_RDMA_WRITE;
 	}
 
-	return vrb_send_msg(&ep->base_ep, &wr, msg, flags);
+	return vrb_send_iov(&ep->base_ep, &wr, msg->msg_iov, msg->desc,
+			    msg->iov_count, flags);
 }
 
 static ssize_t
@@ -375,8 +380,7 @@ vrb_msg_xrc_ep_rma_readv(struct fid_ep *ep_fid, const struct iovec *iov,
 
 	VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn);
 
-	vrb_set_sge_iov(wr.sg_list, iov, count, desc);
-
+	vrb_iov_dupa(wr.sg_list, iov, desc, count);
 	return vrb_post_send(&ep->base_ep, &wr, 0);
 }
 
@@ -397,8 +401,7 @@ vrb_msg_xrc_ep_rma_readmsg(struct fid_ep *ep_fid,
 
 	VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn);
 
-	vrb_set_sge_iov(wr.sg_list, msg->msg_iov, msg->iov_count, msg->desc);
-
+	vrb_iov_dupa(wr.sg_list, msg->msg_iov, msg->desc, msg->iov_count);
 	return vrb_post_send(&ep->base_ep, &wr, flags);
 }
 
@@ -412,12 +415,13 @@ vrb_msg_xrc_ep_rma_writedata(struct fid_ep *ep_fid, const void *buf,
 	struct ibv_send_wr wr = {
 		.wr_id = VERBS_COMP(&ep->base_ep, (uintptr_t)context),
 		.opcode = IBV_WR_RDMA_WRITE_WITH_IMM,
-		.imm_data = htonl((uint32_t)data),
 		.wr.rdma.remote_addr = addr,
 		.wr.rdma.rkey = (uint32_t)key,
-		.send_flags = VERBS_INJECT(&ep->base_ep, len),
+		.send_flags = VERBS_INJECT(&ep->base_ep, len, desc),
 	};
 
+	wr.imm_data = htonl((uint32_t)data);
+
 	VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn);
 
 	return vrb_send_buf(&ep->base_ep, &wr, buf, len, desc);
@@ -440,7 +444,7 @@ vrb_msg_xrc_ep_rma_inject_write(struct fid_ep *ep_fid, const void *buf,
 
 	VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn);
 
-	return vrb_send_buf_inline(&ep->base_ep, &wr, buf, len);
+	return vrb_send_buf(&ep->base_ep, &wr, buf, len, NULL);
 }
 
 static ssize_t
@@ -470,15 +474,16 @@ vrb_msg_xrc_ep_rma_inject_writedata(struct fid_ep *ep_fid,
 	struct ibv_send_wr wr = {
 		.wr_id = VERBS_NO_COMP_FLAG,
 		.opcode = IBV_WR_RDMA_WRITE_WITH_IMM,
-		.imm_data = htonl((uint32_t)data),
 		.wr.rdma.remote_addr = addr,
 		.wr.rdma.rkey = (uint32_t)key,
 		.send_flags = IBV_SEND_INLINE,
 	};
 
+	wr.imm_data = htonl((uint32_t)data);
+
 	VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn);
 
-	return vrb_send_buf_inline(&ep->base_ep, &wr, buf, len);
+	return vrb_send_buf(&ep->base_ep, &wr, buf, len, NULL);
 }
 
 static ssize_t
diff --git a/deps/libfabric/src/common.c b/deps/libfabric/src/common.c
index 4c54dc2dec680136a9114236ac727ef6c1af7492..6206b8825e381b38e13e3a9859142f38f21d9404 100644
--- a/deps/libfabric/src/common.c
+++ b/deps/libfabric/src/common.c
@@ -49,6 +49,7 @@
 
 #include <inttypes.h>
 #include <netinet/in.h>
+#include <netinet/ip.h>
 #include <arpa/inet.h>
 #include <sys/types.h>
 #include <sys/socket.h>
@@ -67,6 +68,7 @@
 #include <ofi_epoll.h>
 #include <ofi_list.h>
 #include <ofi_osd.h>
+#include <ofi_iov.h>
 #include <shared/ofi_str.h>
 
 struct fi_provider core_prov = {
@@ -119,64 +121,66 @@ uint8_t ofi_lsb(uint64_t num)
 	return ofi_msb(num & (~(num - 1)));
 }
 
-int ofi_send_allowed(uint64_t caps)
+bool ofi_send_allowed(uint64_t caps)
 {
-	if (caps & FI_MSG ||
-		caps & FI_TAGGED) {
+	if ((caps & FI_MSG) || (caps & FI_TAGGED)) {
 		if (caps & FI_SEND)
-			return 1;
+			return true;
 		if (caps & FI_RECV)
-			return 0;
-		return 1;
+			return false;
+		return true;
 	}
 
-	return 0;
+	return false;
 }
 
-int ofi_recv_allowed(uint64_t caps)
+bool ofi_recv_allowed(uint64_t caps)
 {
-	if (caps & FI_MSG ||
-		caps & FI_TAGGED) {
+	if ((caps & FI_MSG) || (caps & FI_TAGGED)) {
 		if (caps & FI_RECV)
-			return 1;
+			return true;
 		if (caps & FI_SEND)
-			return 0;
-		return 1;
+			return false;
+		return true;
 	}
 
-	return 0;
+	return false;
 }
 
-int ofi_rma_initiate_allowed(uint64_t caps)
+bool ofi_rma_initiate_allowed(uint64_t caps)
 {
-	if (caps & FI_RMA ||
-		caps & FI_ATOMICS) {
-		if (caps & FI_WRITE ||
-			caps & FI_READ)
-			return 1;
-		if (caps & FI_REMOTE_WRITE ||
-			caps & FI_REMOTE_READ)
-			return 0;
-		return 1;
+	if ((caps & FI_RMA) || (caps & FI_ATOMICS)) {
+		if ((caps & FI_WRITE) || (caps & FI_READ))
+			return true;
+		if ((caps & FI_REMOTE_WRITE) || (caps & FI_REMOTE_READ))
+			return false;
+		return true;
 	}
 
-	return 0;
+	return false;
 }
 
-int ofi_rma_target_allowed(uint64_t caps)
+bool ofi_rma_target_allowed(uint64_t caps)
 {
-	if (caps & FI_RMA ||
-		caps & FI_ATOMICS) {
-		if (caps & FI_REMOTE_WRITE ||
-			caps & FI_REMOTE_READ)
-			return 1;
-		if (caps & FI_WRITE ||
-			caps & FI_READ)
-			return 0;
-		return 1;
+	if ((caps & FI_RMA) || (caps & FI_ATOMICS)) {
+		if ((caps & FI_REMOTE_WRITE) || (caps & FI_REMOTE_READ))
+			return true;
+		if ((caps & FI_WRITE) || (caps & FI_READ))
+			return false;
+		return true;
 	}
 
-	return 0;
+	return false;
+}
+
+bool ofi_needs_tx(uint64_t caps)
+{
+	return ofi_send_allowed(caps) || ofi_rma_initiate_allowed(caps);
+}
+
+bool ofi_needs_rx(uint64_t caps)
+{
+	return ofi_recv_allowed(caps);
 }
 
 int ofi_ep_bind_valid(const struct fi_provider *prov, struct fid *bfid, uint64_t flags)
@@ -356,6 +360,11 @@ sa_sin6:
 		    snprintf(buf, *len, "fi_addr_psmx2://%" PRIx64 ":%" PRIx64,
 			     *(uint64_t *)addr, *((uint64_t *)addr + 1));
 		break;
+	case FI_ADDR_PSMX3:
+		size =
+		    snprintf(buf, *len, "fi_addr_psmx3://%" PRIx64 ":%" PRIx64,
+			     *(uint64_t *)addr, *((uint64_t *)addr + 1));
+		break;
 	case FI_ADDR_GNI:
 		size = snprintf(buf, *len, "fi_addr_gni://%" PRIx64,
 				*(uint64_t *)addr);
@@ -395,14 +404,14 @@ sa_sin6:
 
 uint32_t ofi_addr_format(const char *str)
 {
-	char fmt[16];
+	char fmt[17];
 	int ret;
 
+	memset(fmt, 0, sizeof(fmt));
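+	/* "%16[^:]" stores up to 16 characters plus a NUL terminator,
+	 * hence the 17-byte buffer above.
+	 */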
 	ret = sscanf(str, "%16[^:]://", fmt);
 	if (ret != 1)
 		return FI_FORMAT_UNSPEC;
 
-	fmt[sizeof(fmt) - 1] = '\0';
 	if (!strcasecmp(fmt, "fi_sockaddr_in"))
 		return FI_SOCKADDR_IN;
 	else if (!strcasecmp(fmt, "fi_sockaddr_in6"))
@@ -413,6 +422,8 @@ uint32_t ofi_addr_format(const char *str)
 		return FI_ADDR_PSMX;
 	else if (!strcasecmp(fmt, "fi_addr_psmx2"))
 		return FI_ADDR_PSMX2;
+	else if (!strcasecmp(fmt, "fi_addr_psmx3"))
+		return FI_ADDR_PSMX3;
 	else if (!strcasecmp(fmt, "fi_addr_gni"))
 		return FI_ADDR_GNI;
 	else if (!strcasecmp(fmt, "fi_addr_bgq"))
@@ -462,6 +473,24 @@ static int ofi_str_to_psmx2(const char *str, void **addr, size_t *len)
 	return -FI_EINVAL;
 }
 
+static int ofi_str_to_psmx3(const char *str, void **addr, size_t *len)
+{
+	int ret;
+
+	*len = 2 * sizeof(uint64_t);
+	*addr = calloc(1, *len);
+	if (!(*addr))
+		return -FI_ENOMEM;
+
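+	/* Expected string form: "fi_addr_psmx3://<hex64>:<hex64>". */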
+	ret = sscanf(str, "%*[^:]://%" SCNx64 ":%" SCNx64,
+		     (uint64_t *) *addr, (uint64_t *) *addr + 1);
+	if (ret == 2)
+		return 0;
+
+	free(*addr);
+	return -FI_EINVAL;
+}
+
 static int ofi_str_to_ib_ud(const char *str, void **addr, size_t *len)
 {
 	int ret;
@@ -794,6 +823,8 @@ int ofi_str_toaddr(const char *str, uint32_t *addr_format,
 		return ofi_str_to_psmx(str, addr, len);
 	case FI_ADDR_PSMX2:
 		return ofi_str_to_psmx2(str, addr, len);
+	case FI_ADDR_PSMX3:
+		return ofi_str_to_psmx3(str, addr, len);
 	case FI_ADDR_IB_UD:
 		return ofi_str_to_ib_ud(str, addr, len);
 	case FI_ADDR_EFA:
@@ -870,8 +901,8 @@ static int ofi_is_any_addr_port(struct sockaddr *addr)
 	}
 }
 
-int ofi_is_wildcard_listen_addr(const char *node, const char *service,
-				uint64_t flags, const struct fi_info *hints)
+bool ofi_is_wildcard_listen_addr(const char *node, const char *service,
+				 uint64_t flags, const struct fi_info *hints)
 {
 	struct addrinfo *res = NULL;
 	int ret;
@@ -880,30 +911,30 @@ int ofi_is_wildcard_listen_addr(const char *node, const char *service,
 	    hints->addr_format != FI_SOCKADDR &&
 	    hints->addr_format != FI_SOCKADDR_IN &&
 	    hints->addr_format != FI_SOCKADDR_IN6)
-		return 0;
+		return false;
 
 	/* else it's okay to call getaddrinfo, proceed with processing */
 
 	if (node) {
 		if (!(flags & FI_SOURCE))
-			return 0;
+			return false;
 		ret = getaddrinfo(node, service, NULL, &res);
 		if (ret) {
 			FI_WARN(&core_prov, FI_LOG_CORE,
 				"getaddrinfo failed!\n");
-			return 0;
+			return false;
 		}
 		if (ofi_is_any_addr_port(res->ai_addr)) {
 			freeaddrinfo(res);
 			goto out;
 		}
 		freeaddrinfo(res);
-		return 0;
+		return false;
 	}
 
 	if (hints) {
 		if (hints->dest_addr)
-			return 0;
+			return false;
 
 		if (!hints->src_addr)
 			goto out;
@@ -911,7 +942,7 @@ int ofi_is_wildcard_listen_addr(const char *node, const char *service,
 		return ofi_is_any_addr_port(hints->src_addr);
 	}
 out:
-	return ((flags & FI_SOURCE) && service) ? 1 : 0;
+	return ((flags & FI_SOURCE) && service);
 }
 
 size_t ofi_mask_addr(struct sockaddr *maskaddr, const struct sockaddr *srcaddr,
@@ -974,6 +1005,338 @@ int ofi_discard_socket(SOCKET sock, size_t len)
 	return ret;
 }
 
+size_t ofi_byteq_readv(struct ofi_byteq *byteq, struct iovec *iov,
+		       size_t cnt, size_t offset)
+{
+	size_t avail, len;
+
+	if (cnt == 1 && !offset)
+		return ofi_byteq_read(byteq, iov[0].iov_base, iov[0].iov_len);
+
+	avail = ofi_byteq_readable(byteq);
+	if (!avail)
+		return 0;
+
+	len = ofi_copy_iov_buf(iov, cnt, offset, &byteq->data[byteq->head],
+			       avail, OFI_COPY_BUF_TO_IOV);
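+	/* Advance the head on a partial read; once everything has been
+	 * consumed, reset head and tail so the next write starts at 0.
+	 */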
+	if (len < avail) {
+		byteq->head += len;
+	} else {
+		byteq->head = 0;
+		byteq->tail = 0;
+	}
+	return len;
+}
+
+void ofi_byteq_writev(struct ofi_byteq *byteq, const struct iovec *iov,
+		      size_t cnt)
+{
+	size_t i;
+
+	assert(ofi_total_iov_len(iov, cnt) <= ofi_byteq_writeable(byteq));
+
+	if (cnt == 1) {
+		ofi_byteq_write(byteq, iov[0].iov_base, iov[0].iov_len);
+		return;
+	}
+
+	for (i = 0; i < cnt; i++) {
+		memcpy(&byteq->data[byteq->tail], iov[i].iov_base,
+		       iov[i].iov_len);
+		byteq->tail += iov[i].iov_len;
+	}
+}
+
+
+ssize_t ofi_bsock_flush(struct ofi_bsock *bsock)
+{
+	ssize_t ret;
+
+	if (!ofi_bsock_tosend(bsock))
+		return 0;
+
+	ret = ofi_byteq_send(&bsock->sq, bsock->sock);
+	if (ret < 0) {
+		return ofi_sockerr() == EPIPE ?
+			-FI_ENOTCONN : -ofi_sockerr();
+	}
+
+	return ofi_bsock_tosend(bsock) ? -FI_EAGAIN : 0;
+}
+
+ssize_t ofi_bsock_send(struct ofi_bsock *bsock, const void *buf, size_t *len)
+{
+	size_t avail;
+	ssize_t ret;
+
+	avail = ofi_bsock_tosend(bsock);
+	if (avail) {
+		if (*len < ofi_byteq_writeable(&bsock->sq)) {
+			ofi_byteq_write(&bsock->sq, buf, *len);
+			ret = ofi_bsock_flush(bsock);
+			return !ret || ret == -FI_EAGAIN ? *len : ret;
+		}
+
+		ret = ofi_bsock_flush(bsock);
+		if (ret)
+			return ret;
+	}
+
+	assert(!ofi_bsock_tosend(bsock));
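+	/* Sends above zerocopy_size use OFI_ZEROCOPY (MSG_ZEROCOPY where
+	 * available); async_index numbers them so completions can later be
+	 * matched in ofi_bsock_async_done().
+	 */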
+	if (*len > bsock->zerocopy_size) {
+		ret = ofi_send_socket(bsock->sock, buf, *len,
+				      MSG_NOSIGNAL | OFI_ZEROCOPY);
+		if (ret >= 0) {
+			bsock->async_index++;
+			*len = ret;
+			return -FI_EINPROGRESS;
+		}
+	} else {
+		ret = ofi_send_socket(bsock->sock, buf, *len, MSG_NOSIGNAL);
+	}
+	if (ret < 0) {
+		if (OFI_SOCK_TRY_SND_RCV_AGAIN(ofi_sockerr()) &&
+		    *len < ofi_byteq_writeable(&bsock->sq)) {
+			ofi_byteq_write(&bsock->sq, buf, *len);
+			return *len;
+		}
+		return ofi_sockerr() == EPIPE ? -FI_ENOTCONN : -ofi_sockerr();
+	}
+	*len = ret;
+	return ret;
+}
+
+ssize_t ofi_bsock_sendv(struct ofi_bsock *bsock, const struct iovec *iov,
+			size_t cnt, size_t *len)
+{
+	struct msghdr msg;
+	size_t avail;
+	ssize_t ret;
+
+	if (cnt == 1) {
+		*len = iov[0].iov_len;
+		return ofi_bsock_send(bsock, iov[0].iov_base, len);
+	}
+
+	*len = ofi_total_iov_len(iov, cnt);
+	avail = ofi_bsock_tosend(bsock);
+	if (avail) {
+		if (*len < ofi_byteq_writeable(&bsock->sq)) {
+			ofi_byteq_writev(&bsock->sq, iov, cnt);
+			ret = ofi_bsock_flush(bsock);
+			return !ret || ret == -FI_EAGAIN ? *len : ret;
+		}
+
+		ret = ofi_bsock_flush(bsock);
+		if (ret)
+			return ret;
+	}
+
+	assert(!ofi_bsock_tosend(bsock));
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags = 0;
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_iov = (struct iovec *) iov;
+	msg.msg_iovlen = cnt;
+
+	if (*len > bsock->zerocopy_size) {
+		ret = ofi_sendmsg_tcp(bsock->sock, &msg,
+				      MSG_NOSIGNAL | OFI_ZEROCOPY);
+		if (ret >= 0) {
+			bsock->async_index++;
+			*len = ret;
+			return -FI_EINPROGRESS;
+		}
+	} else {
+		ret = ofi_sendmsg_tcp(bsock->sock, &msg, MSG_NOSIGNAL);
+	}
+	if (ret < 0) {
+		if (OFI_SOCK_TRY_SND_RCV_AGAIN(ofi_sockerr()) &&
+		    *len < ofi_byteq_writeable(&bsock->sq)) {
+			ofi_byteq_writev(&bsock->sq, iov, cnt);
+			return *len;
+		}
+		return ofi_sockerr() == EPIPE ? -FI_ENOTCONN : -ofi_sockerr();
+	}
+	*len = ret;
+	return ret;
+}
+
+ssize_t ofi_bsock_recv(struct ofi_bsock *bsock, void *buf, size_t len)
+{
+	size_t bytes;
+	ssize_t ret;
+
+	bytes = ofi_byteq_read(&bsock->rq, buf, len);
+	if (bytes) {
+		if (bytes == len)
+			return len;
+		buf = (char *) buf + bytes;
+		len -= bytes;
+	}
+
+	assert(!ofi_bsock_readable(bsock));
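+	/* Requests smaller than half the queue refill the buffered receive
+	 * queue and copy out of it; larger ones read the socket directly.
+	 */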
+	if (len < (bsock->rq.size >> 1)) {
+		ret = ofi_byteq_recv(&bsock->rq, bsock->sock);
+		if (ret <= 0)
+			goto out;
+
+		assert(ofi_bsock_readable(bsock));
+		bytes += ofi_byteq_read(&bsock->rq, buf, len);
+		return bytes;
+	}
+
+	ret = ofi_recv_socket(bsock->sock, buf, len, MSG_NOSIGNAL);
+	if (ret > 0)
+		return bytes + ret;
+
+out:
+	if (bytes)
+		return bytes;
+	return ret ? -ofi_sockerr() : -FI_ENOTCONN;
+}
+
+ssize_t ofi_bsock_recvv(struct ofi_bsock *bsock, struct iovec *iov, size_t cnt)
+{
+	struct msghdr msg;
+	size_t len, bytes;
+	ssize_t ret;
+
+	if (cnt == 1)
+		return ofi_bsock_recv(bsock, iov[0].iov_base, iov[0].iov_len);
+
+	len = ofi_total_iov_len(iov, cnt);
+	if (ofi_byteq_readable(&bsock->rq)) {
+		bytes = ofi_byteq_readv(&bsock->rq, iov, cnt, 0);
+		if (bytes == len)
+			return len;
+
+		len -= bytes;
+	} else {
+		bytes = 0;
+	}
+
+	assert(!ofi_bsock_readable(bsock));
+	if (len < (bsock->rq.size >> 1)) {
+		ret = ofi_byteq_recv(&bsock->rq, bsock->sock);
+		if (ret <= 0)
+			goto out;
+
+		assert(ofi_bsock_readable(bsock));
+		bytes += ofi_byteq_readv(&bsock->rq, iov, cnt, bytes);
+		return bytes;
+	}
+
+	/* It's too difficult to adjust the iov without copying it, so return
+	 * what data we have.  The caller will consume the iov and retry.
+	 */
+	if (bytes)
+		return bytes;
+
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags = 0;
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_iov = iov;
+	msg.msg_iovlen = cnt;
+
+	ret = ofi_recvmsg_tcp(bsock->sock, &msg, MSG_NOSIGNAL);
+	if (ret > 0)
+		return ret;
+out:
+	if (bytes)
+		return bytes;
+	return ret ? -ofi_sockerr() : -FI_ENOTCONN;
+}
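
Note the design choice in the vectored path: rather than rewriting the iov to skip already-delivered bytes, the function returns the partial count and lets the caller advance and retry. The total that these paths compare against buffered data is simply the sum of the segment lengths, as in this sketch of what ofi_total_iov_len evidently computes:

```
#include <stddef.h>
#include <sys/uio.h>

/* Sum of all iov_len fields across the vector. */
static size_t total_iov_len(const struct iovec *iov, size_t cnt)
{
        size_t i, total = 0;

        for (i = 0; i < cnt; i++)
                total += iov[i].iov_len;
        return total;
}
```
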
+
+#ifdef MSG_ZEROCOPY
+uint32_t ofi_bsock_async_done(const struct fi_provider *prov,
+			      struct ofi_bsock *bsock)
+{
+	struct msghdr msg = {};
+	struct sock_extended_err *serr;
+	struct cmsghdr *cmsg;
+	/* x2 is arbitrary but avoids truncation */
+	uint8_t ctrl[CMSG_SPACE(sizeof(*serr) * 2)];
+	int ret;
+
+	msg.msg_control = &ctrl;
+	msg.msg_controllen = sizeof(ctrl);
+	ret = recvmsg(bsock->sock, &msg, MSG_ERRQUEUE);
+	if (ret < 0) {
+		FI_WARN(prov, FI_LOG_EP_DATA,
+			"Error reading MSG_ERRQUEUE (%s)\n", strerror(errno));
+		goto disable;
+	}
+
+	assert(!(msg.msg_flags & MSG_CTRUNC));
+	cmsg = CMSG_FIRSTHDR(&msg);
+	if ((cmsg->cmsg_level != SOL_IP && cmsg->cmsg_type != IP_RECVERR) &&
+	    (cmsg->cmsg_level != SOL_IPV6 && cmsg->cmsg_type != IPV6_RECVERR)) {
+		FI_WARN(prov, FI_LOG_EP_DATA,
+			"Unexpected cmsg level (!IP) or type (!RECVERR)\n");
+		goto disable;
+	}
+
+	serr = (void *) CMSG_DATA(cmsg);
+	if ((serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) || serr->ee_errno) {
+		FI_WARN(prov, FI_LOG_EP_DATA,
+			"Unexpected sock err origin or errno\n");
+		goto disable;
+	}
+
+	bsock->done_index = serr->ee_data;
+	if (serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED) {
+		FI_WARN(prov, FI_LOG_EP_DATA,
+			"Zerocopy data was copied\n");
+disable:
+		FI_WARN(prov, FI_LOG_EP_DATA, "disabling zerocopy\n");
+		bsock->zerocopy_size = SIZE_MAX;
+	}
+	return bsock->done_index;
+}
+#else
+uint32_t ofi_bsock_async_done(const struct fi_provider *prov,
+			      struct ofi_bsock *bsock)
+{
+	return 0;
+}
+#endif
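
For background: Linux MSG_ZEROCOPY, enabled per socket with setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, ...) and requested per send with the MSG_ZEROCOPY flag, reports completions on the socket error queue; that queue is what ofi_bsock_async_done drains above. A self-contained sketch of harvesting one completion; only the kernel API call shapes are assumed, and the helper name is illustrative:

```
#include <errno.h>
#include <stdint.h>
#include <sys/socket.h>
#include <linux/errqueue.h>

static int harvest_zerocopy(int fd, uint32_t *done_hi)
{
        struct msghdr msg = {0};
        char ctrl[CMSG_SPACE(sizeof(struct sock_extended_err))];
        struct cmsghdr *cmsg;
        struct sock_extended_err *serr;

        msg.msg_control = ctrl;
        msg.msg_controllen = sizeof(ctrl);
        if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
                return -errno;          /* -EAGAIN: nothing completed yet */

        cmsg = CMSG_FIRSTHDR(&msg);
        if (!cmsg)
                return -EINVAL;
        serr = (struct sock_extended_err *) CMSG_DATA(cmsg);
        if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY || serr->ee_errno)
                return -EINVAL;

        *done_hi = serr->ee_data;       /* high end of completed send range */
        /* SO_EE_CODE_ZEROCOPY_COPIED means the kernel fell back to copying;
         * the function above responds by disabling zerocopy outright. */
        return (serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED) ? 1 : 0;
}
```
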
+
+int ofi_pollfds_grow(struct ofi_pollfds *pfds, int max_size)
+{
+	struct pollfd *fds;
+	void *contexts;
+	size_t size;
+
+	if (max_size < pfds->size)
+		return FI_SUCCESS;
+
+	size = max_size + 1;
+	if (size < pfds->size + 64)
+		size = pfds->size + 64;
+
+	fds = calloc(size, sizeof(*pfds->fds) + sizeof(*pfds->context));
+	if (!fds)
+		return -FI_ENOMEM;
+
+	contexts = fds + size;
+	if (pfds->size) {
+		memcpy(fds, pfds->fds, pfds->size * sizeof(*pfds->fds));
+		memcpy(contexts, pfds->context, pfds->size * sizeof(*pfds->context));
+		free(pfds->fds);
+	}
+
+	while (pfds->size < size)
+		fds[pfds->size++].fd = INVALID_SOCKET;
+
+	pfds->fds = fds;
+	pfds->context = contexts;
+	return FI_SUCCESS;
+}
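
ofi_pollfds_grow keeps the pollfd array and the parallel context-pointer array in one allocation: the contexts start immediately after the last pollfd, so growth is two memcpy calls and a single free() releases both. A sketch of that layout with illustrative names:

```
#include <poll.h>
#include <stdlib.h>
#include <string.h>

struct fdset { struct pollfd *fds; void **ctx; size_t size; };

static int fdset_grow(struct fdset *s, size_t newsize)
{
        struct pollfd *fds;
        void **ctx;

        fds = calloc(newsize, sizeof(*fds) + sizeof(void *));
        if (!fds)
                return -1;

        ctx = (void **) (fds + newsize);  /* contexts start after the fds */
        if (s->size) {
                memcpy(fds, s->fds, s->size * sizeof(*fds));
                memcpy(ctx, s->ctx, s->size * sizeof(void *));
                free(s->fds);             /* one free releases both arrays */
        }
        s->fds = fds;
        s->ctx = ctx;
        s->size = newsize;
        return 0;
}
```
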
 
 int ofi_pollfds_create(struct ofi_pollfds **pfds)
 {
@@ -983,14 +1346,9 @@ int ofi_pollfds_create(struct ofi_pollfds **pfds)
 	if (!*pfds)
 		return -FI_ENOMEM;
 
-	(*pfds)->size = 64;
-	(*pfds)->fds = calloc((*pfds)->size, sizeof(*(*pfds)->fds) +
-			    sizeof(*(*pfds)->context));
-	if (!(*pfds)->fds) {
-		ret = -FI_ENOMEM;
+	ret = ofi_pollfds_grow(*pfds, 63);
+	if (ret)
 		goto err1;
-	}
-	(*pfds)->context = (void *)((*pfds)->fds + (*pfds)->size);
 
 	ret = fd_signal_init(&(*pfds)->signal);
 	if (ret)
@@ -1035,111 +1393,92 @@ int ofi_pollfds_add(struct ofi_pollfds *pfds, int fd, uint32_t events,
 	return ofi_pollfds_ctl(pfds, POLLFDS_CTL_ADD, fd, events, context);
 }
 
-int ofi_pollfds_mod(struct ofi_pollfds *pfds, int fd, uint32_t events,
-		    void *context)
+static int ofi_pollfds_find(struct slist_entry *entry, const void *arg)
 {
-	return ofi_pollfds_ctl(pfds, POLLFDS_CTL_MOD, fd, events, context);
-}
+	struct ofi_pollfds_work_item *item;
+	int fd = (int) (uintptr_t) arg;
 
-int ofi_pollfds_del(struct ofi_pollfds *pfds, int fd)
-{
-	return ofi_pollfds_ctl(pfds, POLLFDS_CTL_DEL, fd, 0, NULL);
+	item = container_of(entry, struct ofi_pollfds_work_item, entry);
+	return item->fd == fd;
 }
 
-static int ofi_pollfds_array(struct ofi_pollfds *pfds)
+/* We're not changing the fds, just fields.  This is always 'racy' if
+ * the app modifies the events being monitored for an fd while another
+ * thread waits on the fds.  The other thread can always return before
+ * the modifications have been made.  The caller must be prepared to
+ * handle this, same as if epoll were used directly.
+ *
+ * Updating the events is a common case, so handle this immediately
+ * without the overhead of queuing a work item.
+ */
+int ofi_pollfds_mod(struct ofi_pollfds *pfds, int fd, uint32_t events,
+		    void *context)
 {
-	struct pollfd *fds;
-	void *contexts;
-
-	fds = calloc(pfds->size + 64,
-		     sizeof(*pfds->fds) + sizeof(*pfds->context));
-	if (!fds)
-		return -FI_ENOMEM;
+	struct slist_entry *entry;
+	struct ofi_pollfds_work_item *item;
+	int ret;
 
-	pfds->size += 64;
-	contexts = fds + pfds->size;
+	fastlock_acquire(&pfds->lock);
+	ret = ofi_pollfds_do_mod(pfds, fd, events, context);
+	if (!ret)
+		goto signal;
+
+	/* fd may be queued for insertion */
+	entry = slist_find_first_match(&pfds->work_item_list, ofi_pollfds_find,
+				       (void *) (uintptr_t) fd);
+	if (entry) {
+		item = container_of(entry, struct ofi_pollfds_work_item, entry);
+		item->events = events;
+		item->context = context;
+	}
 
-	memcpy(fds, pfds->fds, pfds->nfds * sizeof(*pfds->fds));
-	memcpy(contexts, pfds->context, pfds->nfds * sizeof(*pfds->context));
-	free(pfds->fds);
-	pfds->fds = fds;
-	pfds->context = contexts;
-	return FI_SUCCESS;
+signal:
+	fd_signal_set(&pfds->signal);
+	fastlock_release(&pfds->lock);
+	return 0;
 }
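
The rewritten ofi_pollfds_mod updates entries in place under the lock, falls back to patching a still-queued ADD work item, and in either case sets the signal fd so a thread blocked in poll() re-reads the set. The wakeup mechanism is the classic self-pipe trick; a minimal sketch, assuming the read end is registered as fds[0] and is non-blocking:

```
#include <poll.h>
#include <unistd.h>

static void wake_poller(int pipe_wr)
{
        char c = 0;

        (void) write(pipe_wr, &c, 1);   /* makes fds[0] readable */
}

static void drain_wakeup(int pipe_rd)
{
        char c;

        while (read(pipe_rd, &c, 1) == 1)
                ;                       /* pipe_rd must be O_NONBLOCK */
}
```
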
 
-static void ofi_pollfds_cleanup(struct ofi_pollfds *pfds)
+int ofi_pollfds_del(struct ofi_pollfds *pfds, int fd)
 {
-	int i;
-
-	for (i = 0; i < pfds->nfds; i++) {
-		while (pfds->fds[i].fd == INVALID_SOCKET) {
-			pfds->fds[i].fd = pfds->fds[pfds->nfds-1].fd;
-			pfds->fds[i].events = pfds->fds[pfds->nfds-1].events;
-			pfds->fds[i].revents = pfds->fds[pfds->nfds-1].revents;
-			pfds->context[i] = pfds->context[pfds->nfds-1];
-			pfds->nfds--;
-			if (i == pfds->nfds)
-				break;
-		}
-	}
+	return ofi_pollfds_ctl(pfds, POLLFDS_CTL_DEL, fd, 0, NULL);
 }
 
 static void ofi_pollfds_process_work(struct ofi_pollfds *pfds)
 {
 	struct slist_entry *entry;
 	struct ofi_pollfds_work_item *item;
-	int i;
 
 	while (!slist_empty(&pfds->work_item_list)) {
-		if ((pfds->nfds == pfds->size) &&
-		    ofi_pollfds_array(pfds))
-			continue;
-
 		entry = slist_remove_head(&pfds->work_item_list);
 		item = container_of(entry, struct ofi_pollfds_work_item, entry);
 
 		switch (item->type) {
 		case POLLFDS_CTL_ADD:
-			pfds->fds[pfds->nfds].fd = item->fd;
-			pfds->fds[pfds->nfds].events = item->events;
-			pfds->fds[pfds->nfds].revents = 0;
-			pfds->context[pfds->nfds] = item->context;
-			pfds->nfds++;
+			ofi_pollfds_do_add(pfds, item);
 			break;
 		case POLLFDS_CTL_DEL:
-			for (i = 0; i < pfds->nfds; i++) {
-				if (pfds->fds[i].fd == item->fd) {
-					pfds->fds[i].fd = INVALID_SOCKET;
-					break;
-				}
-			}
-			break;
-		case POLLFDS_CTL_MOD:
-			for (i = 0; i < pfds->nfds; i++) {
-				if (pfds->fds[i].fd == item->fd) {
-					pfds->fds[i].events = item->events;
-					pfds->fds[i].revents &= item->events;
-					pfds->context[i] = item->context;
-					break;
-				}
-			}
+			ofi_pollfds_do_del(pfds, item);
 			break;
 		default:
 			assert(0);
-			goto out;
+			break;
 		}
 		free(item);
 	}
-out:
-	ofi_pollfds_cleanup(pfds);
 }
 
-int ofi_pollfds_wait(struct ofi_pollfds *pfds, void **contexts,
-		     int max_contexts, int timeout)
+int ofi_pollfds_wait(struct ofi_pollfds *pfds,
+		     struct ofi_epollfds_event *events,
+		     int maxevents, int timeout)
 {
 	int i, ret;
 	int found = 0;
-	uint64_t start = (timeout >= 0) ? ofi_gettime_ms() : 0;
+	uint64_t start = (timeout > 0) ? ofi_gettime_ms() : 0;
+
+	fastlock_acquire(&pfds->lock);
+	if (!slist_empty(&pfds->work_item_list))
+		ofi_pollfds_process_work(pfds);
+	fastlock_release(&pfds->lock);
 
 	do {
 		ret = poll(pfds->fds, pfds->nfds, timeout);
@@ -1148,30 +1487,27 @@ int ofi_pollfds_wait(struct ofi_pollfds *pfds, void **contexts,
 		else if (ret == 0)
 			return 0;
 
-		if (pfds->fds[0].revents)
-			fd_signal_reset(&pfds->signal);
-
 		fastlock_acquire(&pfds->lock);
 		if (!slist_empty(&pfds->work_item_list))
 			ofi_pollfds_process_work(pfds);
-
 		fastlock_release(&pfds->lock);
 
-		/* Index 0 is the internal signaling fd, skip it */
-		for (i = pfds->index; i < pfds->nfds && found < max_contexts; i++) {
-			if (pfds->fds[i].revents && i) {
-				contexts[found++] = pfds->context[i];
-				pfds->index = i;
-			}
+		if (pfds->fds[0].revents) {
+			fd_signal_reset(&pfds->signal);
+			ret--;
 		}
-		for (i = 0; i < pfds->index && found < max_contexts; i++) {
-			if (pfds->fds[i].revents && i) {
-				contexts[found++] = pfds->context[i];
-				pfds->index = i;
+
+		ret = MIN(maxevents, ret);
+
+		/* Index 0 is the internal signaling fd, skip it */
+		for (i = 1; i < pfds->nfds && found < ret; i++) {
+			if (pfds->fds[i].revents) {
+				events[found].events = pfds->fds[i].revents;
+				events[found++].data.ptr = pfds->context[i];
 			}
 		}
 
-		if (timeout > 0)
+		if (!found && timeout > 0)
 			timeout -= (int) (ofi_gettime_ms() - start);
 
 	} while (timeout > 0 && !found);
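
A likely caller loop for the new epoll-style interface, presuming ofi_pollfds_wait() returns the number of events it filled in; the declarations come from libfabric's internal ofi_epoll.h, and handle_conn is a hypothetical consumer:

```
#include <stdint.h>
#include "ofi_epoll.h"          /* internal header declaring ofi_pollfds */

static void handle_conn(void *ctx, uint32_t revents)
{
        (void) ctx; (void) revents;     /* application-specific */
}

static void progress_loop(struct ofi_pollfds *pfds)
{
        struct ofi_epollfds_event events[32];
        int i, n;

        for (;;) {
                n = ofi_pollfds_wait(pfds, events, 32, 100 /* ms */);
                if (n <= 0)
                        continue;       /* timeout or transient error */
                for (i = 0; i < n; i++)
                        handle_conn(events[i].data.ptr, events[i].events);
        }
}
```
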
@@ -1579,7 +1915,7 @@ static void ofi_tostr_bus_attr(char *buf, size_t len,
 	ofi_strncatf(buf, len, "%sfi_bus_attr:\n", prefix);
 
 	prefix = TAB TAB TAB;
-	ofi_strncatf(buf, len, "%sfi_bus_type: ", prefix);
+	ofi_strncatf(buf, len, "%sbus_type: ", prefix);
 	ofi_tostr_bus_type(buf, len, attr->bus_type);
 	ofi_strncatf(buf, len, "\n");
 
@@ -1625,7 +1961,7 @@ int ofi_nic_tostr(const struct fid *fid_nic, char *buf, size_t len)
 	const struct fid_nic *nic = (const struct fid_nic*) fid_nic;
 
 	assert(fid_nic->fclass == FI_CLASS_NIC);
-	ofi_strncatf(buf, len, "%sfid_nic:\n", TAB);
+	ofi_strncatf(buf, len, "%snic:\n", TAB);
 
 	ofi_tostr_device_attr(buf, len, nic->device_attr);
 	ofi_tostr_bus_attr(buf, len, nic->bus_attr);
diff --git a/deps/libfabric/src/enosys.c b/deps/libfabric/src/enosys.c
index 32f1bcca84ff2504afe9d60fa26dc7aa376f0025..13ace0c6f71d29cfd5edac13f2269841a4e3c8b5 100644
--- a/deps/libfabric/src/enosys.c
+++ b/deps/libfabric/src/enosys.c
@@ -53,6 +53,15 @@ int fi_no_ops_open(struct fid *fid, const char *name,
 {
 	return -FI_ENOSYS;
 }
+int fi_no_tostr(const struct fid *fid, char *buf, size_t len)
+{
+	return -FI_ENOSYS;
+}
+int fi_no_ops_set(struct fid *fid, const char *name, uint64_t flags,
+		  void *ops, void *context)
+{
+	return -FI_ENOSYS;
+}
 
 /*
  * struct fi_ops_fabric
diff --git a/deps/libfabric/src/fabric.c b/deps/libfabric/src/fabric.c
index 73de7336f269f6c1f8fb2c1c7928c0f71f889bc7..eefaa292ff73308316a60a9606e0168e9d433a86 100644
--- a/deps/libfabric/src/fabric.c
+++ b/deps/libfabric/src/fabric.c
@@ -49,11 +49,13 @@
 #include "ofi_prov.h"
 #include "ofi_perf.h"
 #include "ofi_hmem.h"
+#include "rdma/fi_ext.h"
 
 #ifdef HAVE_LIBDL
 #include <dlfcn.h>
 #endif
 
+
 struct ofi_prov {
 	struct ofi_prov		*next;
 	char			*prov_name;
@@ -62,12 +64,102 @@ struct ofi_prov {
 	bool			hidden;
 };
 
+enum ofi_prov_order {
+	OFI_PROV_ORDER_VERSION,
+	OFI_PROV_ORDER_REGISTER,
+};
+
 static struct ofi_prov *prov_head, *prov_tail;
+static enum ofi_prov_order prov_order = OFI_PROV_ORDER_VERSION;
 int ofi_init = 0;
 extern struct ofi_common_locks common_locks;
 
 static struct fi_filter prov_filter;
 
+
+static struct ofi_prov *
+ofi_alloc_prov(const char *prov_name)
+{
+	struct ofi_prov *prov;
+
+	prov = calloc(sizeof *prov, 1);
+	if (!prov)
+		return NULL;
+
+	prov->prov_name = strdup(prov_name);
+	if (!prov->prov_name) {
+		free(prov);
+		return NULL;
+	}
+
+	return prov;
+}
+
+static void
+ofi_init_prov(struct ofi_prov *prov, struct fi_provider *provider,
+	      void *dlhandle)
+{
+	prov->provider = provider;
+	prov->dlhandle = dlhandle;
+}
+
+static void ofi_cleanup_prov(struct fi_provider *provider, void *dlhandle)
+{
+	if (provider) {
+		fi_param_undefine(provider);
+		if (provider->cleanup)
+			provider->cleanup();
+	}
+
+#ifdef HAVE_LIBDL
+	if (dlhandle)
+		dlclose(dlhandle);
+#else
+	OFI_UNUSED(dlhandle);
+#endif
+}
+
+static void ofi_free_prov(struct ofi_prov *prov)
+{
+	ofi_cleanup_prov(prov->provider, prov->dlhandle);
+	free(prov->prov_name);
+	free(prov);
+}
+
+static void ofi_insert_prov(struct ofi_prov *prov)
+{
+	struct ofi_prov *cur, *prev;
+
+	for (prev = NULL, cur = prov_head; cur; prev = cur, cur = cur->next) {
+		if ((strlen(prov->prov_name) == strlen(cur->prov_name)) &&
+		    !strcasecmp(prov->prov_name, cur->prov_name)) {
+			if ((prov_order == OFI_PROV_ORDER_VERSION) &&
+			    FI_VERSION_LT(cur->provider->version,
+					  prov->provider->version)) {
+				cur->hidden = true;
+				prov->next = cur;
+				if (prev)
+					prev->next = prov;
+				else
+					prov_head = prov;
+			} else {
+				prov->hidden = true;
+				prov->next = cur->next;
+				cur->next = prov;
+				if (prov_tail == cur)
+					prov_tail = prov;
+			}
+			return;
+		}
+	}
+
+	if (prov_tail)
+		prov_tail->next = prov;
+	else
+		prov_head = prov;
+	prov_tail = prov;
+}
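
ofi_insert_prov keeps duplicate provider names adjacent and marks the loser hidden; which one wins depends on prov_order. Under the default OFI_PROV_ORDER_VERSION the numerically larger packed version is preferred, as this small illustration of the comparison shows (MY_VERSION mirrors libfabric's FI_VERSION encoding):

```
#include <assert.h>
#include <stdint.h>

#define MY_VERSION(major, minor)  (((major) << 16) | (minor))
#define MY_VERSION_LT(v1, v2)     ((v1) < (v2))

int main(void)
{
        uint32_t v_old = MY_VERSION(1, 9);
        uint32_t v_new = MY_VERSION(1, 14);

        assert(MY_VERSION_LT(v_old, v_new)); /* the newer provider wins... */
        /* ...unless prov_order == OFI_PROV_ORDER_REGISTER, in which case
         * the first registration wins and later duplicates are hidden. */
        return 0;
}
```
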
+
 static int ofi_find_name(char **names, const char *name)
 {
 	int i;
@@ -310,51 +402,6 @@ struct fi_provider *ofi_get_hook(const char *name)
 	return provider;
 }
 
-static void cleanup_provider(struct fi_provider *provider, void *dlhandle)
-{
-	OFI_UNUSED(dlhandle);
-
-	if (provider) {
-		fi_param_undefine(provider);
-
-		if (provider->cleanup)
-			provider->cleanup();
-	}
-
-#ifdef HAVE_LIBDL
-	if (dlhandle)
-		dlclose(dlhandle);
-#endif
-}
-
-static struct ofi_prov *ofi_create_prov_entry(const char *prov_name)
-{
-	struct ofi_prov *prov = NULL;
-	prov = calloc(sizeof *prov, 1);
-	if (!prov) {
-		FI_WARN(&core_prov, FI_LOG_CORE,
-			"Not enough memory to allocate provider registry\n");
-		return NULL;
-	}
-
-	prov->prov_name = strdup(prov_name);
-	if (!prov->prov_name) {
-		FI_WARN(&core_prov, FI_LOG_CORE,
-			"Failed to init pre-registered provider name\n");
-		free(prov);
-		return NULL;
-	}
-	if (prov_tail)
-		prov_tail->next = prov;
-	else
-		prov_head = prov;
-	prov_tail = prov;
-
-	prov->hidden = false;
-
-	return prov;
-}
-
 /* This is the default order that providers will be reported when a provider
  * is available.  Initialize the socket(s) provider last.  This will result in
  * it being the least preferred provider.
@@ -362,8 +409,8 @@ static struct ofi_prov *ofi_create_prov_entry(const char *prov_name)
 static void ofi_ordered_provs_init(void)
 {
 	char *ordered_prov_names[] = {
-		"psm2", "psm", "efa", "usnic", "gni", "bgq", "verbs",
-		"netdir", "ofi_rxm", "ofi_rxd", "shm",
+		"efa", "psm2", "psm", "usnic", "gni", "bgq", "verbs",
+		"netdir", "psm3", "ofi_rxm", "ofi_rxd", "shm",
 		/* Initialize the socket based providers last of the
 		 * standard providers.  This will result in them being
 		 * the least preferred providers.
@@ -378,10 +425,16 @@ static void ofi_ordered_provs_init(void)
 		 */
 		"ofi_hook_perf", "ofi_hook_debug", "ofi_hook_noop",
 	};
-	int num_provs = sizeof(ordered_prov_names)/sizeof(ordered_prov_names[0]), i;
+	struct ofi_prov *prov;
+	int num_provs, i;
+
+	num_provs = sizeof(ordered_prov_names) / sizeof(ordered_prov_names[0]);
 
-	for (i = 0; i < num_provs; i++)
-		ofi_create_prov_entry(ordered_prov_names[i]);
+	for (i = 0; i < num_provs; i++) {
+		prov = ofi_alloc_prov(ordered_prov_names[i]);
+		if (prov)
+			ofi_insert_prov(prov);
+	}
 }
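
The element count above is the usual static-array-size idiom; a reusable form (ARRAY_COUNT is illustrative, the code above spells it inline):

```
#include <stdio.h>

#define ARRAY_COUNT(a) (sizeof(a) / sizeof((a)[0]))

int main(void)
{
        const char *names[] = { "efa", "psm2", "psm", "sockets" };

        printf("%zu providers\n", ARRAY_COUNT(names));  /* prints: 4 */
        return 0;
}
```
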
 
 static void ofi_set_prov_type(struct fi_prov_context *ctx,
@@ -450,52 +503,28 @@ static void ofi_register_provider(struct fi_provider *provider, void *dlhandle)
 	 */
 	if (!strcasecmp(provider->name, "sockets") ||
 	    !strcasecmp(provider->name, "shm") ||
-	    !strcasecmp(provider->name, "efa") || ofi_is_util_prov(provider))
+	    !strcasecmp(provider->name, "efa") ||
+	    !strcasecmp(provider->name, "psm3") || ofi_is_util_prov(provider))
 		ctx->disable_layering = 1;
 
 	prov = ofi_getprov(provider->name, strlen(provider->name));
-	if (prov) {
-		/* If this provider has not been init yet, then we add the
-		 * provider and dlhandle to the struct and exit.
-		 */
-		if (prov->provider == NULL)
-			goto update_prov_registry;
-
-		/* If this provider is older than an already-loaded
-		 * provider of the same name, then discard this one.
-		 */
-		if (FI_VERSION_GE(prov->provider->version, provider->version)) {
-			FI_INFO(&core_prov, FI_LOG_CORE,
-				"a newer %s provider was already loaded; "
-				"ignoring this one\n", provider->name);
-			goto cleanup;
-		}
-
-		/* This provider is newer than an already-loaded
-		 * provider of the same name, so discard the
-		 * already-loaded one.
-		 */
-		FI_INFO(&core_prov, FI_LOG_CORE,
-			"an older %s provider was already loaded; "
-			"keeping this one and ignoring the older one\n",
-			provider->name);
-		cleanup_provider(prov->provider, prov->dlhandle);
+	if (prov && !prov->provider) {
+		ofi_init_prov(prov, provider, dlhandle);
 	} else {
-		prov = ofi_create_prov_entry(provider->name);
+		prov = ofi_alloc_prov(provider->name);
 		if (!prov)
 			goto cleanup;
+
+		ofi_init_prov(prov, provider, dlhandle);
+		ofi_insert_prov(prov);
 	}
 
 	if (hidden)
 		prov->hidden = true;
-
-update_prov_registry:
-	prov->dlhandle = dlhandle;
-	prov->provider = provider;
 	return;
 
 cleanup:
-	cleanup_provider(provider, dlhandle);
+	ofi_cleanup_prov(provider, dlhandle);
 }
 
 #ifdef HAVE_LIBDL
@@ -591,11 +620,11 @@ static void ofi_reg_dl_prov(const char *lib)
 
 static void ofi_ini_dir(const char *dir)
 {
-	int n = 0;
+	int n;
 	char *lib;
 	struct dirent **liblist = NULL;
 
-	n = scandir(dir, &liblist, lib_filter, NULL);
+	n = scandir(dir, &liblist, lib_filter, alphasort);
 	if (n < 0)
 		goto libdl_done;
 
@@ -617,8 +646,8 @@ libdl_done:
 	free(liblist);
 }
 
-/* Search standard system library paths (i.e. LD_LIBRARY_PATH) for known DL provider
- * libraries.
+/* Search standard system library paths (i.e. LD_LIBRARY_PATH) for DLLs for
+ * known providers.
  */
 static void ofi_find_prov_libs(void)
 {
@@ -628,7 +657,6 @@ static void ofi_find_prov_libs(void)
 	char* short_prov_name;
 
 	for (prov = prov_head; prov; prov = prov->next) {
-
 		if (!prov->prov_name)
 			continue;
 
@@ -649,6 +677,55 @@ static void ofi_find_prov_libs(void)
 		free(lib);
 	}
 }
+
+static void ofi_load_dl_prov(void)
+{
+	char **dirs;
+	char *provdir = NULL;
+	void *dlhandle;
+	int i;
+
+	/* If dlopen fails, assume static linking and return */
+	dlhandle = dlopen(NULL, RTLD_NOW);
+	if (!dlhandle)
+		return;
+	dlclose(dlhandle);
+
+	fi_param_define(NULL, "provider_path", FI_PARAM_STRING,
+			"Search for providers in specific path.  Path is "
+			"specified similar to dir1:dir2:dir3.  If the path "
+			"starts with @, loaded providers are given preference "
+			"based on discovery order, rather than version. "
+			"(default: " PROVDLDIR ")");
+
+	fi_param_get_str(NULL, "provider_path", &provdir);
+	if (!provdir || !strlen(provdir)) {
+		ofi_find_prov_libs();
+		dirs = ofi_split_and_alloc(PROVDLDIR, ":", NULL);
+	} else if (provdir[0] == '@') {
+		prov_order = OFI_PROV_ORDER_REGISTER;
+		if (strlen(provdir) == 1)
+			dirs = ofi_split_and_alloc(PROVDLDIR, ":", NULL);
+		else
+			dirs = ofi_split_and_alloc(&provdir[1], ":", NULL);
+	} else {
+		dirs = ofi_split_and_alloc(provdir, ":", NULL);
+	}
+
+	if (dirs) {
+		for (i = 0; dirs[i]; i++)
+			ofi_ini_dir(dirs[i]);
+
+		ofi_free_string_array(dirs);
+	}
+}
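
ofi_load_dl_prov consolidates the dlopen path and extends the provider_path parameter (the FI_PROVIDER_PATH environment variable): a leading '@' switches duplicate resolution to discovery order, optionally followed by the directory list. A sketch of that parsing rule with illustrative names:

```
#include <stdbool.h>

struct dl_cfg { bool prefer_registration_order; const char *dirs; };

static struct dl_cfg parse_provider_path(const char *val,
                                         const char *default_dirs)
{
        struct dl_cfg cfg = { false, default_dirs };

        if (!val || !*val)
                return cfg;             /* unset: built-in PROVDLDIR */
        if (val[0] == '@') {
                cfg.prefer_registration_order = true; /* discovery order wins */
                if (val[1])
                        cfg.dirs = val + 1;     /* "@dir1:dir2" */
        } else {
                cfg.dirs = val;                 /* plain "dir1:dir2:dir3" */
        }
        return cfg;
}
```
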
+
+#else /* HAVE_LIBDL */
+
+static void ofi_load_dl_prov(void)
+{
+}
+
 #endif
 
 void fi_ini(void)
@@ -687,38 +764,9 @@ void fi_ini(void)
 	fi_param_get_str(NULL, "provider", &param_val);
 	ofi_create_filter(&prov_filter, param_val);
 
-#ifdef HAVE_LIBDL
-	int n = 0;
-	char **dirs;
-	char *provdir = NULL;
-	void *dlhandle;
-
-	/* If dlopen fails, assume static linking and just return
-	   without error */
-	dlhandle = dlopen(NULL, RTLD_NOW);
-	if (dlhandle == NULL) {
-		goto libdl_done;
-	}
-	dlclose(dlhandle);
-
-	fi_param_define(NULL, "provider_path", FI_PARAM_STRING,
-			"Search for providers in specific path (default: "
-			PROVDLDIR ")");
-	fi_param_get_str(NULL, "provider_path", &provdir);
-	if (!provdir) {
-		provdir = PROVDLDIR;
-		ofi_find_prov_libs();
-	}
-	dirs = ofi_split_and_alloc(provdir, ":", NULL);
-	if (dirs) {
-		for (n = 0; dirs[n]; ++n) {
-			ofi_ini_dir(dirs[n]);
-		}
-		ofi_free_string_array(dirs);
-	}
-libdl_done:
-#endif
+	ofi_load_dl_prov();
 
+	ofi_register_provider(PSM3_INIT, NULL);
 	ofi_register_provider(PSM2_INIT, NULL);
 	ofi_register_provider(PSM_INIT, NULL);
 	ofi_register_provider(USNIC_INIT, NULL);
@@ -750,15 +798,15 @@ FI_DESTRUCTOR(fi_fini(void))
 {
 	struct ofi_prov *prov;
 
+	pthread_mutex_lock(&common_locks.ini_lock);
+
 	if (!ofi_init)
-		return;
+		goto unlock;
 
 	while (prov_head) {
 		prov = prov_head;
 		prov_head = prov->next;
-		cleanup_provider(prov->provider, prov->dlhandle);
-		free(prov->prov_name);
-		free(prov);
+		ofi_free_prov(prov);
 	}
 
 	ofi_free_filter(&prov_filter);
@@ -768,6 +816,11 @@ FI_DESTRUCTOR(fi_fini(void))
 	fi_log_fini();
 	fi_param_fini();
 	ofi_osd_fini();
+
+	ofi_init = 0;
+
+unlock:
+	pthread_mutex_unlock(&common_locks.ini_lock);
 }
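
The destructor now takes the same ini_lock as initialization and clears ofi_init, making teardown idempotent and allowing a later fi_ini() to rebuild state; correspondingly, fi_getinfo() and fi_fabric() below now call fi_ini() unconditionally. The guarded pair looks like this in miniature:

```
#include <pthread.h>

static pthread_mutex_t ini_lock = PTHREAD_MUTEX_INITIALIZER;
static int initialized;

static void my_ini(void)
{
        pthread_mutex_lock(&ini_lock);
        if (!initialized) {
                /* ... build global state ... */
                initialized = 1;
        }
        pthread_mutex_unlock(&ini_lock);
}

static void my_fini(void)
{
        pthread_mutex_lock(&ini_lock);
        if (initialized) {
                /* ... tear down global state ... */
                initialized = 0;        /* a later my_ini() can rebuild */
        }
        pthread_mutex_unlock(&ini_lock);
}
```
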
 
 __attribute__((visibility ("default"),EXTERNALLY_VISIBLE))
@@ -803,7 +856,7 @@ void DEFAULT_SYMVER_PRE(fi_freeinfo)(struct fi_info *info)
 		free(info);
 	}
 }
-CURRENT_SYMVER(fi_freeinfo_, fi_freeinfo);
+DEFAULT_SYMVER(fi_freeinfo_, fi_freeinfo, FABRIC_1.3);
 
 /*
  * Make a dummy info object for each provider, and copy in the
@@ -963,8 +1016,7 @@ int DEFAULT_SYMVER_PRE(fi_getinfo)(uint32_t version, const char *node,
 	enum fi_log_level level;
 	int ret;
 
-	if (!ofi_init)
-		fi_ini();
+	fi_ini();
 
 	if (FI_VERSION_LT(fi_version(), version)) {
 		FI_WARN(&core_prov, FI_LOG_CORE,
@@ -1050,7 +1102,7 @@ int DEFAULT_SYMVER_PRE(fi_getinfo)(uint32_t version, const char *node,
 
 	return *info ? 0 : -FI_ENODATA;
 }
-CURRENT_SYMVER(fi_getinfo_, fi_getinfo);
+DEFAULT_SYMVER(fi_getinfo_, fi_getinfo, FABRIC_1.3);
 
 struct fi_info *ofi_allocinfo_internal(void)
 {
@@ -1181,7 +1233,7 @@ fail:
 	fi_freeinfo(dup);
 	return NULL;
 }
-CURRENT_SYMVER(fi_dupinfo_, fi_dupinfo);
+DEFAULT_SYMVER(fi_dupinfo_, fi_dupinfo, FABRIC_1.3);
 
 __attribute__((visibility ("default"),EXTERNALLY_VISIBLE))
 int DEFAULT_SYMVER_PRE(fi_fabric)(struct fi_fabric_attr *attr,
@@ -1194,8 +1246,7 @@ int DEFAULT_SYMVER_PRE(fi_fabric)(struct fi_fabric_attr *attr,
 	if (!attr || !attr->prov_name || !attr->name)
 		return -FI_EINVAL;
 
-	if (!ofi_init)
-		fi_ini();
+	fi_ini();
 
 	top_name = strrchr(attr->prov_name, OFI_NAME_DELIM);
 	if (top_name)
@@ -1231,6 +1282,19 @@ uint32_t DEFAULT_SYMVER_PRE(fi_version)(void)
 }
 DEFAULT_SYMVER(fi_version_, fi_version, FABRIC_1.0);
 
+__attribute__((visibility ("default"),EXTERNALLY_VISIBLE))
+int DEFAULT_SYMVER_PRE(fi_open)(uint32_t version, const char *name,
+		void *attr, size_t attr_len, uint64_t flags,
+		struct fid **fid, void *context)
+{
+	if (!strcasecmp("mr_cache", name))
+		return ofi_open_mr_cache(version, attr, attr_len,
+					 flags, fid, context);
+
+	return -FI_ENOSYS;
+}
+DEFAULT_SYMVER(fi_open_, fi_open, FABRIC_1.5);
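
fi_open(), new in FABRIC_1.5, opens named library-level objects and currently dispatches only the "mr_cache" name. A hedged sketch of the call shape; the attribute structure lives in rdma/fi_ext.h and is not part of the stable surface, so it is elided here and a NULL attr may well be rejected:

```
#include <rdma/fabric.h>

static int open_mr_cache(struct fid **cache_fid)
{
        /* attr/attr_len elided: the expected structure is declared in
         * rdma/fi_ext.h and is not part of the stable API surface. */
        return fi_open(FI_VERSION(1, 5), "mr_cache", NULL, 0, 0,
                       cache_fid, NULL);
}
```
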
+
 static const char *const errstr[] = {
 	[FI_EOTHER - FI_ERRNO_OFFSET] = "Unspecified error",
 	[FI_ETOOSMALL - FI_ERRNO_OFFSET] = "Provided buffer is too small",
@@ -1245,6 +1309,7 @@ static const char *const errstr[] = {
 	[FI_ENOKEY - FI_ERRNO_OFFSET] = "Required key not available",
 	[FI_ENOAV - FI_ERRNO_OFFSET] = "Missing or unavailable address vector",
 	[FI_EOVERRUN - FI_ERRNO_OFFSET] = "Queue has been overrun",
+	[FI_ENORX - FI_ERRNO_OFFSET] = "Receiver not ready, no receive buffers available",
 };
 
 __attribute__((visibility ("default"),EXTERNALLY_VISIBLE))
diff --git a/deps/libfabric/src/fi_tostr.c b/deps/libfabric/src/fi_tostr.c
index 4a0370e965cb824c8509154cca5bf2cf418431e4..0b8646d354d75f616b69ecd60d7c60f684156007 100644
--- a/deps/libfabric/src/fi_tostr.c
+++ b/deps/libfabric/src/fi_tostr.c
@@ -70,741 +70,821 @@
  * fi_info->caps  : ofi_tostr_caps(..., typeof(caps), ...)
  */
 
-#define OFI_BUFSIZ 8192
 
-static void ofi_tostr_fid(const char *label, char *buf, const struct fid *fid)
+static void
+ofi_tostr_fid(const char *label, char *buf, size_t len, const struct fid *fid)
 {
 	if (!fid || !FI_CHECK_OP(fid->ops, struct fi_ops, tostr))
-		ofi_strcatf(buf, "%s%p\n", label, fid);
+		ofi_strncatf(buf, len, "%s%p\n", label, fid);
 	else
-		fid->ops->tostr(fid, buf, OFI_BUFSIZ - strnlen(buf, OFI_BUFSIZ));
+		fid->ops->tostr(fid, buf, len - strnlen(buf, len));
 }
 
-static void ofi_tostr_opflags(char *buf, uint64_t flags)
+static void ofi_tostr_opflags(char *buf, size_t len, uint64_t flags)
 {
-	IFFLAGSTR(flags, FI_MULTICAST);
-
-	IFFLAGSTR(flags, FI_MULTI_RECV);
-	IFFLAGSTR(flags, FI_REMOTE_CQ_DATA);
-	IFFLAGSTR(flags, FI_MORE);
-	IFFLAGSTR(flags, FI_PEEK);
-	IFFLAGSTR(flags, FI_TRIGGER);
-	IFFLAGSTR(flags, FI_FENCE);
-
-	IFFLAGSTR(flags, FI_COMPLETION);
-	IFFLAGSTR(flags, FI_INJECT);
-	IFFLAGSTR(flags, FI_INJECT_COMPLETE);
-	IFFLAGSTR(flags, FI_TRANSMIT_COMPLETE);
-	IFFLAGSTR(flags, FI_DELIVERY_COMPLETE);
-	IFFLAGSTR(flags, FI_MATCH_COMPLETE);
-	IFFLAGSTR(flags, FI_AFFINITY);
-
-	IFFLAGSTR(flags, FI_CLAIM);
-	IFFLAGSTR(flags, FI_DISCARD);
+	IFFLAGSTRN(flags, FI_MULTICAST, len);
+
+	IFFLAGSTRN(flags, FI_MULTI_RECV, len);
+	IFFLAGSTRN(flags, FI_REMOTE_CQ_DATA, len);
+	IFFLAGSTRN(flags, FI_MORE, len);
+	IFFLAGSTRN(flags, FI_PEEK, len);
+	IFFLAGSTRN(flags, FI_TRIGGER, len);
+	IFFLAGSTRN(flags, FI_FENCE, len);
+
+	IFFLAGSTRN(flags, FI_COMPLETION, len);
+	IFFLAGSTRN(flags, FI_INJECT, len);
+	IFFLAGSTRN(flags, FI_INJECT_COMPLETE, len);
+	IFFLAGSTRN(flags, FI_TRANSMIT_COMPLETE, len);
+	IFFLAGSTRN(flags, FI_DELIVERY_COMPLETE, len);
+	IFFLAGSTRN(flags, FI_MATCH_COMPLETE, len);
+	IFFLAGSTRN(flags, FI_AFFINITY, len);
+
+	IFFLAGSTRN(flags, FI_CLAIM, len);
+	IFFLAGSTRN(flags, FI_DISCARD, len);
 
 	ofi_remove_comma(buf);
 }
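
The mechanical IFFLAGSTR to IFFLAGSTRN conversion running through the rest of this file threads the destination size into every append, so a full buffer truncates instead of overflowing. The helper behind it is in the spirit of this bounded strcat-printf sketch (strncatf here is illustrative, not libfabric's ofi_strncatf):

```
#include <stdarg.h>
#include <stdio.h>
#include <string.h>

static void strncatf(char *buf, size_t len, const char *fmt, ...)
{
        size_t used = strnlen(buf, len);
        va_list ap;

        if (used >= len)
                return;         /* buffer already full: silently truncate */

        va_start(ap, fmt);
        vsnprintf(buf + used, len - used, fmt, ap); /* always NUL-terminates */
        va_end(ap);
}
```
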
 
-static void ofi_tostr_addr_format(char *buf, uint32_t addr_format)
+static void ofi_tostr_addr_format(char *buf, size_t len, uint32_t addr_format)
 {
 	switch (addr_format) {
-	CASEENUMSTR(FI_FORMAT_UNSPEC);
-	CASEENUMSTR(FI_SOCKADDR);
-	CASEENUMSTR(FI_SOCKADDR_IN);
-	CASEENUMSTR(FI_SOCKADDR_IN6);
-	CASEENUMSTR(FI_SOCKADDR_IB);
-	CASEENUMSTR(FI_ADDR_PSMX);
-	CASEENUMSTR(FI_ADDR_PSMX2);
-	CASEENUMSTR(FI_ADDR_GNI);
-	CASEENUMSTR(FI_ADDR_BGQ);
-	CASEENUMSTR(FI_ADDR_MLX);
-	CASEENUMSTR(FI_ADDR_STR);
-	CASEENUMSTR(FI_ADDR_IB_UD);
-	CASEENUMSTR(FI_ADDR_EFA);
+	CASEENUMSTRN(FI_FORMAT_UNSPEC, len);
+	CASEENUMSTRN(FI_SOCKADDR, len);
+	CASEENUMSTRN(FI_SOCKADDR_IN, len);
+	CASEENUMSTRN(FI_SOCKADDR_IN6, len);
+	CASEENUMSTRN(FI_SOCKADDR_IB, len);
+	CASEENUMSTRN(FI_ADDR_PSMX, len);
+	CASEENUMSTRN(FI_ADDR_PSMX2, len);
+	CASEENUMSTRN(FI_ADDR_GNI, len);
+	CASEENUMSTRN(FI_ADDR_BGQ, len);
+	CASEENUMSTRN(FI_ADDR_MLX, len);
+	CASEENUMSTRN(FI_ADDR_STR, len);
+	CASEENUMSTRN(FI_ADDR_IB_UD, len);
+	CASEENUMSTRN(FI_ADDR_EFA, len);
+	CASEENUMSTRN(FI_ADDR_PSMX3, len);
 	default:
 		if (addr_format & FI_PROV_SPECIFIC)
-			ofi_strcatf(buf, "Provider specific");
+			ofi_strncatf(buf, len, "Provider specific");
 		else
-			ofi_strcatf(buf, "Unknown");
+			ofi_strncatf(buf, len, "Unknown");
 		break;
 	}
 }
 
-static void ofi_tostr_progress(char *buf, enum fi_progress progress)
+static void ofi_tostr_progress(char *buf, size_t len, enum fi_progress progress)
 {
 	switch (progress) {
-	CASEENUMSTR(FI_PROGRESS_UNSPEC);
-	CASEENUMSTR(FI_PROGRESS_AUTO);
-	CASEENUMSTR(FI_PROGRESS_MANUAL);
+	CASEENUMSTRN(FI_PROGRESS_UNSPEC, len);
+	CASEENUMSTRN(FI_PROGRESS_AUTO, len);
+	CASEENUMSTRN(FI_PROGRESS_MANUAL, len);
 	default:
-		ofi_strcatf(buf, "Unknown");
+		ofi_strncatf(buf, len, "Unknown");
 		break;
 	}
 }
 
-static void ofi_tostr_threading(char *buf, enum fi_threading threading)
+static void
+ofi_tostr_threading(char *buf, size_t len, enum fi_threading threading)
 {
 	switch (threading) {
-	CASEENUMSTR(FI_THREAD_UNSPEC);
-	CASEENUMSTR(FI_THREAD_SAFE);
-	CASEENUMSTR(FI_THREAD_FID);
-	CASEENUMSTR(FI_THREAD_DOMAIN);
-	CASEENUMSTR(FI_THREAD_COMPLETION);
-	CASEENUMSTR(FI_THREAD_ENDPOINT);
+	CASEENUMSTRN(FI_THREAD_UNSPEC, len);
+	CASEENUMSTRN(FI_THREAD_SAFE, len);
+	CASEENUMSTRN(FI_THREAD_FID, len);
+	CASEENUMSTRN(FI_THREAD_DOMAIN, len);
+	CASEENUMSTRN(FI_THREAD_COMPLETION, len);
+	CASEENUMSTRN(FI_THREAD_ENDPOINT, len);
 	default:
-		ofi_strcatf(buf, "Unknown");
+		ofi_strncatf(buf, len, "Unknown");
 		break;
 	}
 }
 
-static void ofi_tostr_msgorder(char *buf, uint64_t flags)
-{
-	IFFLAGSTR(flags, FI_ORDER_RAR);
-	IFFLAGSTR(flags, FI_ORDER_RAW);
-	IFFLAGSTR(flags, FI_ORDER_RAS);
-	IFFLAGSTR(flags, FI_ORDER_WAR);
-	IFFLAGSTR(flags, FI_ORDER_WAW);
-	IFFLAGSTR(flags, FI_ORDER_WAS);
-	IFFLAGSTR(flags, FI_ORDER_SAR);
-	IFFLAGSTR(flags, FI_ORDER_SAW);
-	IFFLAGSTR(flags, FI_ORDER_SAS);
-	IFFLAGSTR(flags, FI_ORDER_RMA_RAR);
-	IFFLAGSTR(flags, FI_ORDER_RMA_RAW);
-	IFFLAGSTR(flags, FI_ORDER_RMA_WAR);
-	IFFLAGSTR(flags, FI_ORDER_RMA_WAW);
-	IFFLAGSTR(flags, FI_ORDER_ATOMIC_RAR);
-	IFFLAGSTR(flags, FI_ORDER_ATOMIC_RAW);
-	IFFLAGSTR(flags, FI_ORDER_ATOMIC_WAR);
-	IFFLAGSTR(flags, FI_ORDER_ATOMIC_WAW);
+static void ofi_tostr_msgorder(char *buf, size_t len, uint64_t flags)
+{
+	IFFLAGSTRN(flags, FI_ORDER_RAR, len);
+	IFFLAGSTRN(flags, FI_ORDER_RAW, len);
+	IFFLAGSTRN(flags, FI_ORDER_RAS, len);
+	IFFLAGSTRN(flags, FI_ORDER_WAR, len);
+	IFFLAGSTRN(flags, FI_ORDER_WAW, len);
+	IFFLAGSTRN(flags, FI_ORDER_WAS, len);
+	IFFLAGSTRN(flags, FI_ORDER_SAR, len);
+	IFFLAGSTRN(flags, FI_ORDER_SAW, len);
+	IFFLAGSTRN(flags, FI_ORDER_SAS, len);
+	IFFLAGSTRN(flags, FI_ORDER_RMA_RAR, len);
+	IFFLAGSTRN(flags, FI_ORDER_RMA_RAW, len);
+	IFFLAGSTRN(flags, FI_ORDER_RMA_WAR, len);
+	IFFLAGSTRN(flags, FI_ORDER_RMA_WAW, len);
+	IFFLAGSTRN(flags, FI_ORDER_ATOMIC_RAR, len);
+	IFFLAGSTRN(flags, FI_ORDER_ATOMIC_RAW, len);
+	IFFLAGSTRN(flags, FI_ORDER_ATOMIC_WAR, len);
+	IFFLAGSTRN(flags, FI_ORDER_ATOMIC_WAW, len);
 
 	ofi_remove_comma(buf);
 }
 
-static void ofi_tostr_comporder(char *buf, uint64_t flags)
+static void ofi_tostr_comporder(char *buf, size_t len, uint64_t flags)
 {
 	if ((flags & FI_ORDER_STRICT) == FI_ORDER_NONE) {
-		ofi_strcatf(buf, "FI_ORDER_NONE, ");
+		ofi_strncatf(buf, len, "FI_ORDER_NONE, ");
 	} else if ((flags & FI_ORDER_STRICT) == FI_ORDER_STRICT) {
-		ofi_strcatf(buf, "FI_ORDER_STRICT, ");
+		ofi_strncatf(buf, len, "FI_ORDER_STRICT, ");
 	}
 
-	IFFLAGSTR(flags, FI_ORDER_DATA);
+	IFFLAGSTRN(flags, FI_ORDER_DATA, len);
 
 	ofi_remove_comma(buf);
 }
 
-static void ofi_tostr_caps(char *buf, uint64_t caps)
-{
-	IFFLAGSTR(caps, FI_MSG);
-	IFFLAGSTR(caps, FI_RMA);
-	IFFLAGSTR(caps, FI_TAGGED);
-	IFFLAGSTR(caps, FI_ATOMIC);
-	IFFLAGSTR(caps, FI_MULTICAST);
-	IFFLAGSTR(caps, FI_COLLECTIVE);
-
-	IFFLAGSTR(caps, FI_READ);
-	IFFLAGSTR(caps, FI_WRITE);
-	IFFLAGSTR(caps, FI_RECV);
-	IFFLAGSTR(caps, FI_SEND);
-	IFFLAGSTR(caps, FI_REMOTE_READ);
-	IFFLAGSTR(caps, FI_REMOTE_WRITE);
-
-	IFFLAGSTR(caps, FI_MULTI_RECV);
-	IFFLAGSTR(caps, FI_REMOTE_CQ_DATA);
-	IFFLAGSTR(caps, FI_TRIGGER);
-	IFFLAGSTR(caps, FI_FENCE);
-
-	IFFLAGSTR(caps, FI_VARIABLE_MSG);
-	IFFLAGSTR(caps, FI_RMA_PMEM);
-	IFFLAGSTR(caps, FI_SOURCE_ERR);
-	IFFLAGSTR(caps, FI_LOCAL_COMM);
-	IFFLAGSTR(caps, FI_REMOTE_COMM);
-	IFFLAGSTR(caps, FI_SHARED_AV);
-	IFFLAGSTR(caps, FI_RMA_EVENT);
-	IFFLAGSTR(caps, FI_SOURCE);
-	IFFLAGSTR(caps, FI_NAMED_RX_CTX);
-	IFFLAGSTR(caps, FI_DIRECTED_RECV);
-	IFFLAGSTR(caps, FI_HMEM);
+static void ofi_tostr_caps(char *buf, size_t len, uint64_t caps)
+{
+	IFFLAGSTRN(caps, FI_MSG, len);
+	IFFLAGSTRN(caps, FI_RMA, len);
+	IFFLAGSTRN(caps, FI_TAGGED, len);
+	IFFLAGSTRN(caps, FI_ATOMIC, len);
+	IFFLAGSTRN(caps, FI_MULTICAST, len);
+	IFFLAGSTRN(caps, FI_COLLECTIVE, len);
+
+	IFFLAGSTRN(caps, FI_READ, len);
+	IFFLAGSTRN(caps, FI_WRITE, len);
+	IFFLAGSTRN(caps, FI_RECV, len);
+	IFFLAGSTRN(caps, FI_SEND, len);
+	IFFLAGSTRN(caps, FI_REMOTE_READ, len);
+	IFFLAGSTRN(caps, FI_REMOTE_WRITE, len);
+
+	IFFLAGSTRN(caps, FI_MULTI_RECV, len);
+	IFFLAGSTRN(caps, FI_REMOTE_CQ_DATA, len);
+	IFFLAGSTRN(caps, FI_TRIGGER, len);
+	IFFLAGSTRN(caps, FI_FENCE, len);
+
+	IFFLAGSTRN(caps, FI_VARIABLE_MSG, len);
+	IFFLAGSTRN(caps, FI_RMA_PMEM, len);
+	IFFLAGSTRN(caps, FI_SOURCE_ERR, len);
+	IFFLAGSTRN(caps, FI_LOCAL_COMM, len);
+	IFFLAGSTRN(caps, FI_REMOTE_COMM, len);
+	IFFLAGSTRN(caps, FI_SHARED_AV, len);
+	IFFLAGSTRN(caps, FI_RMA_EVENT, len);
+	IFFLAGSTRN(caps, FI_SOURCE, len);
+	IFFLAGSTRN(caps, FI_NAMED_RX_CTX, len);
+	IFFLAGSTRN(caps, FI_DIRECTED_RECV, len);
+	IFFLAGSTRN(caps, FI_HMEM, len);
 
 	ofi_remove_comma(buf);
 }
 
-static void ofi_tostr_ep_type(char *buf, enum fi_ep_type ep_type)
+static void ofi_tostr_ep_type(char *buf, size_t len, enum fi_ep_type ep_type)
 {
 	switch (ep_type) {
-	CASEENUMSTR(FI_EP_UNSPEC);
-	CASEENUMSTR(FI_EP_MSG);
-	CASEENUMSTR(FI_EP_DGRAM);
-	CASEENUMSTR(FI_EP_RDM);
-	CASEENUMSTR(FI_EP_SOCK_STREAM);
-	CASEENUMSTR(FI_EP_SOCK_DGRAM);
+	CASEENUMSTRN(FI_EP_UNSPEC, len);
+	CASEENUMSTRN(FI_EP_MSG, len);
+	CASEENUMSTRN(FI_EP_DGRAM, len);
+	CASEENUMSTRN(FI_EP_RDM, len);
+	CASEENUMSTRN(FI_EP_SOCK_STREAM, len);
+	CASEENUMSTRN(FI_EP_SOCK_DGRAM, len);
 	default:
-		ofi_strcatf(buf, "Unknown");
+		ofi_strncatf(buf, len, "Unknown");
 		break;
 	}
 }
 
-static void ofi_tostr_protocol(char *buf, uint32_t protocol)
+static void ofi_tostr_protocol(char *buf, size_t len, uint32_t protocol)
 {
 	switch (protocol) {
-	CASEENUMSTR(FI_PROTO_UNSPEC);
-	CASEENUMSTR(FI_PROTO_RDMA_CM_IB_RC);
-	CASEENUMSTR(FI_PROTO_IWARP);
-	CASEENUMSTR(FI_PROTO_IB_UD);
-	CASEENUMSTR(FI_PROTO_PSMX);
-	CASEENUMSTR(FI_PROTO_PSMX2);
-	CASEENUMSTR(FI_PROTO_UDP);
-	CASEENUMSTR(FI_PROTO_SOCK_TCP);
-	CASEENUMSTR(FI_PROTO_IB_RDM);
-	CASEENUMSTR(FI_PROTO_IWARP_RDM);
-	CASEENUMSTR(FI_PROTO_GNI);
-	CASEENUMSTR(FI_PROTO_RXM);
-	CASEENUMSTR(FI_PROTO_RXD);
-	CASEENUMSTR(FI_PROTO_MLX);
-	CASEENUMSTR(FI_PROTO_NETWORKDIRECT);
-	CASEENUMSTR(FI_PROTO_SHM);
-	CASEENUMSTR(FI_PROTO_RSTREAM);
-	CASEENUMSTR(FI_PROTO_RDMA_CM_IB_XRC);
-	CASEENUMSTR(FI_PROTO_EFA);
+	CASEENUMSTRN(FI_PROTO_UNSPEC, len);
+	CASEENUMSTRN(FI_PROTO_RDMA_CM_IB_RC, len);
+	CASEENUMSTRN(FI_PROTO_IWARP, len);
+	CASEENUMSTRN(FI_PROTO_IB_UD, len);
+	CASEENUMSTRN(FI_PROTO_PSMX, len);
+	CASEENUMSTRN(FI_PROTO_PSMX2, len);
+	CASEENUMSTRN(FI_PROTO_UDP, len);
+	CASEENUMSTRN(FI_PROTO_SOCK_TCP, len);
+	CASEENUMSTRN(FI_PROTO_IB_RDM, len);
+	CASEENUMSTRN(FI_PROTO_IWARP_RDM, len);
+	CASEENUMSTRN(FI_PROTO_GNI, len);
+	CASEENUMSTRN(FI_PROTO_RXM, len);
+	CASEENUMSTRN(FI_PROTO_RXD, len);
+	CASEENUMSTRN(FI_PROTO_MLX, len);
+	CASEENUMSTRN(FI_PROTO_NETWORKDIRECT, len);
+	CASEENUMSTRN(FI_PROTO_SHM, len);
+	CASEENUMSTRN(FI_PROTO_RSTREAM, len);
+	CASEENUMSTRN(FI_PROTO_RDMA_CM_IB_XRC, len);
+	CASEENUMSTRN(FI_PROTO_EFA, len);
+	CASEENUMSTRN(FI_PROTO_PSMX3, len);
 	default:
 		if (protocol & FI_PROV_SPECIFIC)
-			ofi_strcatf(buf, "Provider specific");
+			ofi_strncatf(buf, len, "Provider specific");
 		else
-			ofi_strcatf(buf, "Unknown");
+			ofi_strncatf(buf, len, "Unknown");
 		break;
 	}
 }
 
-static void ofi_tostr_mode(char *buf, uint64_t mode)
+static void ofi_tostr_mode(char *buf, size_t len, uint64_t mode)
 {
-	IFFLAGSTR(mode, FI_CONTEXT);
-	IFFLAGSTR(mode, FI_MSG_PREFIX);
-	IFFLAGSTR(mode, FI_ASYNC_IOV);
-	IFFLAGSTR(mode, FI_RX_CQ_DATA);
-	IFFLAGSTR(mode, FI_LOCAL_MR);
-	IFFLAGSTR(mode, FI_NOTIFY_FLAGS_ONLY);
-	IFFLAGSTR(mode, FI_RESTRICTED_COMP);
-	IFFLAGSTR(mode, FI_CONTEXT2);
-	IFFLAGSTR(mode, FI_BUFFERED_RECV);
+	IFFLAGSTRN(mode, FI_CONTEXT, len);
+	IFFLAGSTRN(mode, FI_MSG_PREFIX, len);
+	IFFLAGSTRN(mode, FI_ASYNC_IOV, len);
+	IFFLAGSTRN(mode, FI_RX_CQ_DATA, len);
+	IFFLAGSTRN(mode, FI_LOCAL_MR, len);
+	IFFLAGSTRN(mode, FI_NOTIFY_FLAGS_ONLY, len);
+	IFFLAGSTRN(mode, FI_RESTRICTED_COMP, len);
+	IFFLAGSTRN(mode, FI_CONTEXT2, len);
+	IFFLAGSTRN(mode, FI_BUFFERED_RECV, len);
 
 	ofi_remove_comma(buf);
 }
 
-static void ofi_tostr_addr(char *buf, uint32_t addr_format, void *addr)
+static void
+ofi_tostr_addr(char *buf, size_t len, uint32_t addr_format, void *addr)
 {
 	char *p;
-	size_t len;
+	size_t addrlen;
 
 	p = buf + strlen(buf);
+	addrlen = len - strlen(buf);
 
 	if (addr == NULL) {
-		ofi_strcatf(p, "(null)");
+		ofi_strncatf(p, addrlen, "(null)");
 		return;
 	}
 
-	len = 64;
-	ofi_straddr(p, &len, addr_format, addr);
+	ofi_straddr(p, &addrlen, addr_format, addr);
 }
 
-static void ofi_tostr_tx_attr(char *buf, const struct fi_tx_attr *attr,
-			     const char *prefix)
+static void
+ofi_tostr_tx_attr(char *buf, size_t len, const struct fi_tx_attr *attr,
+		  const char *prefix)
 {
 	if (!attr) {
-		ofi_strcatf(buf, "%sfi_tx_attr: (null)\n", prefix);
+		ofi_strncatf(buf, len, "%sfi_tx_attr: (null)\n", prefix);
 		return;
 	}
 
-	ofi_strcatf(buf, "%sfi_tx_attr:\n", prefix);
-	ofi_strcatf(buf, "%s%scaps: [ ", prefix, TAB);
-	ofi_tostr_caps(buf, attr->caps);
-	ofi_strcatf(buf, " ]\n");
-
-	ofi_strcatf(buf, "%s%smode: [ ", prefix, TAB);
-	ofi_tostr_mode(buf, attr->mode);
-	ofi_strcatf(buf, " ]\n");
-
-	ofi_strcatf(buf, "%s%sop_flags: [ ", prefix, TAB);
-	ofi_tostr_opflags(buf, attr->op_flags);
-	ofi_strcatf(buf, " ]\n");
-
-	ofi_strcatf(buf, "%s%smsg_order: [ ", prefix, TAB);
-	ofi_tostr_msgorder(buf, attr->msg_order);
-	ofi_strcatf(buf, " ]\n");
-
-	ofi_strcatf(buf, "%s%scomp_order: [ ", prefix, TAB);
-	ofi_tostr_comporder(buf, attr->comp_order);
-	ofi_strcatf(buf, " ]\n");
-
-	ofi_strcatf(buf, "%s%sinject_size: %zu\n", prefix, TAB, attr->inject_size);
-	ofi_strcatf(buf, "%s%ssize: %zu\n", prefix, TAB, attr->size);
-	ofi_strcatf(buf, "%s%siov_limit: %zu\n", prefix, TAB, attr->iov_limit);
-	ofi_strcatf(buf, "%s%srma_iov_limit: %zu\n", prefix, TAB, attr->rma_iov_limit);
+	ofi_strncatf(buf, len, "%sfi_tx_attr:\n", prefix);
+	ofi_strncatf(buf, len, "%s%scaps: [ ", prefix, TAB);
+	ofi_tostr_caps(buf, len, attr->caps);
+	ofi_strncatf(buf, len, " ]\n");
+
+	ofi_strncatf(buf, len, "%s%smode: [ ", prefix, TAB);
+	ofi_tostr_mode(buf, len, attr->mode);
+	ofi_strncatf(buf, len, " ]\n");
+
+	ofi_strncatf(buf, len, "%s%sop_flags: [ ", prefix, TAB);
+	ofi_tostr_opflags(buf, len, attr->op_flags);
+	ofi_strncatf(buf, len, " ]\n");
+
+	ofi_strncatf(buf, len, "%s%smsg_order: [ ", prefix, TAB);
+	ofi_tostr_msgorder(buf, len, attr->msg_order);
+	ofi_strncatf(buf, len, " ]\n");
+
+	ofi_strncatf(buf, len, "%s%scomp_order: [ ", prefix, TAB);
+	ofi_tostr_comporder(buf, len, attr->comp_order);
+	ofi_strncatf(buf, len, " ]\n");
+
+	ofi_strncatf(buf, len, "%s%sinject_size: %zu\n", prefix, TAB,
+		     attr->inject_size);
+	ofi_strncatf(buf, len, "%s%ssize: %zu\n", prefix, TAB, attr->size);
+	ofi_strncatf(buf, len, "%s%siov_limit: %zu\n", prefix, TAB,
+		     attr->iov_limit);
+	ofi_strncatf(buf, len, "%s%srma_iov_limit: %zu\n", prefix, TAB,
+		     attr->rma_iov_limit);
+	ofi_strncatf(buf, len, "%s%stclass: 0x%x\n", prefix, TAB, attr->tclass);
 }
 
-static void ofi_tostr_rx_attr(char *buf, const struct fi_rx_attr *attr,
-			     const char *prefix)
+static void
+ofi_tostr_rx_attr(char *buf, size_t len, const struct fi_rx_attr *attr,
+		  const char *prefix)
 {
 	if (!attr) {
-		ofi_strcatf(buf, "%sfi_rx_attr: (null)\n", prefix);
+		ofi_strncatf(buf, len, "%sfi_rx_attr: (null)\n", prefix);
 		return;
 	}
 
-	ofi_strcatf(buf, "%sfi_rx_attr:\n", prefix);
-	ofi_strcatf(buf, "%s%scaps: [ ", prefix, TAB);
-	ofi_tostr_caps(buf, attr->caps);
-	ofi_strcatf(buf, " ]\n");
+	ofi_strncatf(buf, len, "%sfi_rx_attr:\n", prefix);
+	ofi_strncatf(buf, len, "%s%scaps: [ ", prefix, TAB);
+	ofi_tostr_caps(buf, len, attr->caps);
+	ofi_strncatf(buf, len, " ]\n");
 
-	ofi_strcatf(buf, "%s%smode: [ ", prefix, TAB);
-	ofi_tostr_mode(buf, attr->mode);
-	ofi_strcatf(buf, " ]\n");
+	ofi_strncatf(buf, len, "%s%smode: [ ", prefix, TAB);
+	ofi_tostr_mode(buf, len, attr->mode);
+	ofi_strncatf(buf, len, " ]\n");
 
-	ofi_strcatf(buf, "%s%sop_flags: [ ", prefix, TAB);
-	ofi_tostr_opflags(buf, attr->op_flags);
-	ofi_strcatf(buf, " ]\n");
+	ofi_strncatf(buf, len, "%s%sop_flags: [ ", prefix, TAB);
+	ofi_tostr_opflags(buf, len, attr->op_flags);
+	ofi_strncatf(buf, len, " ]\n");
 
-	ofi_strcatf(buf, "%s%smsg_order: [ ", prefix, TAB);
-	ofi_tostr_msgorder(buf, attr->msg_order);
-	ofi_strcatf(buf, " ]\n");
+	ofi_strncatf(buf, len, "%s%smsg_order: [ ", prefix, TAB);
+	ofi_tostr_msgorder(buf, len, attr->msg_order);
+	ofi_strncatf(buf, len, " ]\n");
 
-	ofi_strcatf(buf, "%s%scomp_order: [ ", prefix, TAB);
-	ofi_tostr_comporder(buf, attr->comp_order);
-	ofi_strcatf(buf, " ]\n");
+	ofi_strncatf(buf, len, "%s%scomp_order: [ ", prefix, TAB);
+	ofi_tostr_comporder(buf, len, attr->comp_order);
+	ofi_strncatf(buf, len, " ]\n");
 
-	ofi_strcatf(buf, "%s%stotal_buffered_recv: %zu\n", prefix, TAB, attr->total_buffered_recv);
-	ofi_strcatf(buf, "%s%ssize: %zu\n", prefix, TAB, attr->size);
-	ofi_strcatf(buf, "%s%siov_limit: %zu\n", prefix, TAB, attr->iov_limit);
+	ofi_strncatf(buf, len, "%s%stotal_buffered_recv: %zu\n", prefix, TAB,
+		     attr->total_buffered_recv);
+	ofi_strncatf(buf, len, "%s%ssize: %zu\n", prefix, TAB, attr->size);
+	ofi_strncatf(buf, len, "%s%siov_limit: %zu\n", prefix, TAB,
+		     attr->iov_limit);
 }
 
-static void ofi_tostr_ep_attr(char *buf, const struct fi_ep_attr *attr, const char *prefix)
+static void
+ofi_tostr_ep_attr(char *buf, size_t len, const struct fi_ep_attr *attr,
+		  const char *prefix)
 {
 	if (!attr) {
-		ofi_strcatf(buf, "%sfi_ep_attr: (null)\n", prefix);
+		ofi_strncatf(buf, len, "%sfi_ep_attr: (null)\n", prefix);
 		return;
 	}
 
-	ofi_strcatf(buf, "%sfi_ep_attr:\n", prefix);
-	ofi_strcatf(buf, "%s%stype: ", prefix, TAB);
-	ofi_tostr_ep_type(buf, attr->type);
-	ofi_strcatf(buf, "\n");
-	ofi_strcatf(buf, "%s%sprotocol: ", prefix, TAB);
-	ofi_tostr_protocol(buf, attr->protocol);
-	ofi_strcatf(buf, "\n");
-	ofi_strcatf(buf, "%s%sprotocol_version: %d\n", prefix, TAB, attr->protocol_version);
-	ofi_strcatf(buf, "%s%smax_msg_size: %zu\n", prefix, TAB, attr->max_msg_size);
-	ofi_strcatf(buf, "%s%smsg_prefix_size: %zu\n", prefix, TAB, attr->msg_prefix_size);
-	ofi_strcatf(buf, "%s%smax_order_raw_size: %zu\n", prefix, TAB, attr->max_order_raw_size);
-	ofi_strcatf(buf, "%s%smax_order_war_size: %zu\n", prefix, TAB, attr->max_order_war_size);
-	ofi_strcatf(buf, "%s%smax_order_waw_size: %zu\n", prefix, TAB, attr->max_order_waw_size);
-	ofi_strcatf(buf, "%s%smem_tag_format: 0x%016llx\n", prefix, TAB, attr->mem_tag_format);
-
-	ofi_strcatf(buf, "%s%stx_ctx_cnt: %zu\n", prefix, TAB, attr->tx_ctx_cnt);
-	ofi_strcatf(buf, "%s%srx_ctx_cnt: %zu\n", prefix, TAB, attr->rx_ctx_cnt);
+	ofi_strncatf(buf, len, "%sfi_ep_attr:\n", prefix);
+	ofi_strncatf(buf, len, "%s%stype: ", prefix, TAB);
+	ofi_tostr_ep_type(buf, len, attr->type);
+	ofi_strncatf(buf, len, "\n");
+	ofi_strncatf(buf, len, "%s%sprotocol: ", prefix, TAB);
+	ofi_tostr_protocol(buf, len, attr->protocol);
+	ofi_strncatf(buf, len, "\n");
+	ofi_strncatf(buf, len, "%s%sprotocol_version: %d\n", prefix, TAB,
+		     attr->protocol_version);
+	ofi_strncatf(buf, len, "%s%smax_msg_size: %zu\n", prefix, TAB,
+		     attr->max_msg_size);
+	ofi_strncatf(buf, len, "%s%smsg_prefix_size: %zu\n", prefix, TAB,
+		     attr->msg_prefix_size);
+	ofi_strncatf(buf, len, "%s%smax_order_raw_size: %zu\n", prefix, TAB,
+		     attr->max_order_raw_size);
+	ofi_strncatf(buf, len, "%s%smax_order_war_size: %zu\n", prefix, TAB,
+		     attr->max_order_war_size);
+	ofi_strncatf(buf, len, "%s%smax_order_waw_size: %zu\n", prefix, TAB,
+		     attr->max_order_waw_size);
+	ofi_strncatf(buf, len, "%s%smem_tag_format: 0x%016llx\n", prefix, TAB,
+		     attr->mem_tag_format);
+
+	ofi_strncatf(buf, len, "%s%stx_ctx_cnt: ", prefix, TAB);
+	if (attr->tx_ctx_cnt == FI_SHARED_CONTEXT)
+		ofi_strncatf(buf, len, "FI_SHARED_CONTEXT\n");
+	else
+		ofi_strncatf(buf, len, "%zu\n", attr->tx_ctx_cnt);
+	ofi_strncatf(buf, len, "%s%srx_ctx_cnt: ", prefix, TAB);
+	if (attr->rx_ctx_cnt == FI_SHARED_CONTEXT)
+		ofi_strncatf(buf, len, "FI_SHARED_CONTEXT\n");
+	else
+		ofi_strncatf(buf, len, "%zu\n", attr->rx_ctx_cnt);
 
-	ofi_strcatf(buf, "%s%sauth_key_size: %zu\n", prefix, TAB, attr->auth_key_size);
+	ofi_strncatf(buf, len, "%s%sauth_key_size: %zu\n", prefix, TAB,
+		     attr->auth_key_size);
 }
 
-static void ofi_tostr_resource_mgmt(char *buf, enum fi_resource_mgmt rm)
+static void
+ofi_tostr_resource_mgmt(char *buf, size_t len, enum fi_resource_mgmt rm)
 {
 	switch (rm) {
-	CASEENUMSTR(FI_RM_UNSPEC);
-	CASEENUMSTR(FI_RM_DISABLED);
-	CASEENUMSTR(FI_RM_ENABLED);
+	CASEENUMSTRN(FI_RM_UNSPEC, len);
+	CASEENUMSTRN(FI_RM_DISABLED, len);
+	CASEENUMSTRN(FI_RM_ENABLED, len);
 	default:
-		ofi_strcatf(buf, "Unknown");
+		ofi_strncatf(buf, len, "Unknown");
 		break;
 	}
 }
 
-static void ofi_tostr_av_type(char *buf, enum fi_av_type type)
+static void ofi_tostr_av_type(char *buf, size_t len, enum fi_av_type type)
 {
 	switch (type) {
-	CASEENUMSTR(FI_AV_UNSPEC);
-	CASEENUMSTR(FI_AV_MAP);
-	CASEENUMSTR(FI_AV_TABLE);
+	CASEENUMSTRN(FI_AV_UNSPEC, len);
+	CASEENUMSTRN(FI_AV_MAP, len);
+	CASEENUMSTRN(FI_AV_TABLE, len);
 	default:
-		ofi_strcatf(buf, "Unknown");
+		ofi_strncatf(buf, len, "Unknown");
 		break;
 	}
 }
 
-static void ofi_tostr_mr_mode(char *buf, int mr_mode)
+static void ofi_tostr_mr_mode(char *buf, size_t len, int mr_mode)
 {
-	IFFLAGSTR(mr_mode, FI_MR_BASIC);
-	IFFLAGSTR(mr_mode, FI_MR_SCALABLE);
-	IFFLAGSTR(mr_mode, FI_MR_LOCAL);
-	IFFLAGSTR(mr_mode, FI_MR_RAW);
-	IFFLAGSTR(mr_mode, FI_MR_VIRT_ADDR);
-	IFFLAGSTR(mr_mode, FI_MR_ALLOCATED);
-	IFFLAGSTR(mr_mode, FI_MR_PROV_KEY);
-	IFFLAGSTR(mr_mode, FI_MR_MMU_NOTIFY);
-	IFFLAGSTR(mr_mode, FI_MR_RMA_EVENT);
-	IFFLAGSTR(mr_mode, FI_MR_ENDPOINT);
-	IFFLAGSTR(mr_mode, FI_MR_HMEM);
+	IFFLAGSTRN(mr_mode, FI_MR_BASIC, len);
+	IFFLAGSTRN(mr_mode, FI_MR_SCALABLE, len);
+	IFFLAGSTRN(mr_mode, FI_MR_LOCAL, len);
+	IFFLAGSTRN(mr_mode, FI_MR_RAW, len);
+	IFFLAGSTRN(mr_mode, FI_MR_VIRT_ADDR, len);
+	IFFLAGSTRN(mr_mode, FI_MR_ALLOCATED, len);
+	IFFLAGSTRN(mr_mode, FI_MR_PROV_KEY, len);
+	IFFLAGSTRN(mr_mode, FI_MR_MMU_NOTIFY, len);
+	IFFLAGSTRN(mr_mode, FI_MR_RMA_EVENT, len);
+	IFFLAGSTRN(mr_mode, FI_MR_ENDPOINT, len);
+	IFFLAGSTRN(mr_mode, FI_MR_HMEM, len);
+	IFFLAGSTRN(mr_mode, FI_MR_COLLECTIVE, len);
 
 	ofi_remove_comma(buf);
 }
 
-static void ofi_tostr_op_type(char *buf, int op_type)
+static void ofi_tostr_op_type(char *buf, size_t len, int op_type)
 {
 	switch (op_type) {
-	CASEENUMSTR(FI_OP_RECV);
-	CASEENUMSTR(FI_OP_SEND);
-	CASEENUMSTR(FI_OP_TRECV);
-	CASEENUMSTR(FI_OP_TSEND);
-	CASEENUMSTR(FI_OP_READ);
-	CASEENUMSTR(FI_OP_WRITE);
-	CASEENUMSTR(FI_OP_ATOMIC);
-	CASEENUMSTR(FI_OP_FETCH_ATOMIC);
-	CASEENUMSTR(FI_OP_COMPARE_ATOMIC);
-	CASEENUMSTR(FI_OP_CNTR_SET);
-	CASEENUMSTR(FI_OP_CNTR_ADD);
+	CASEENUMSTRN(FI_OP_RECV, len);
+	CASEENUMSTRN(FI_OP_SEND, len);
+	CASEENUMSTRN(FI_OP_TRECV, len);
+	CASEENUMSTRN(FI_OP_TSEND, len);
+	CASEENUMSTRN(FI_OP_READ, len);
+	CASEENUMSTRN(FI_OP_WRITE, len);
+	CASEENUMSTRN(FI_OP_ATOMIC, len);
+	CASEENUMSTRN(FI_OP_FETCH_ATOMIC, len);
+	CASEENUMSTRN(FI_OP_COMPARE_ATOMIC, len);
+	CASEENUMSTRN(FI_OP_CNTR_SET, len);
+	CASEENUMSTRN(FI_OP_CNTR_ADD, len);
 	default:
-		ofi_strcatf(buf, "Unknown");
+		ofi_strncatf(buf, len, "Unknown");
 		break;
 	}
 }
 
-static void ofi_tostr_domain_attr(char *buf, const struct fi_domain_attr *attr,
-				 const char *prefix)
+static void
+ofi_tostr_domain_attr(char *buf, size_t len, const struct fi_domain_attr *attr,
+		      const char *prefix)
 {
 	if (!attr) {
-		ofi_strcatf(buf, "%sfi_domain_attr: (null)\n", prefix);
+		ofi_strncatf(buf, len, "%sfi_domain_attr: (null)\n", prefix);
 		return;
 	}
 
-	ofi_strcatf(buf, "%sfi_domain_attr:\n", prefix);
-
-	ofi_strcatf(buf, "%s%sdomain: 0x%x\n", prefix, TAB, attr->domain);
-
-	ofi_strcatf(buf, "%s%sname: %s\n", prefix, TAB, attr->name);
-	ofi_strcatf(buf, "%s%sthreading: ", prefix, TAB);
-	ofi_tostr_threading(buf, attr->threading);
-	ofi_strcatf(buf, "\n");
-
-	ofi_strcatf(buf, "%s%scontrol_progress: ", prefix,TAB);
-	ofi_tostr_progress(buf, attr->control_progress);
-	ofi_strcatf(buf, "\n");
-	ofi_strcatf(buf, "%s%sdata_progress: ", prefix, TAB);
-	ofi_tostr_progress(buf, attr->data_progress);
-	ofi_strcatf(buf, "\n");
-	ofi_strcatf(buf, "%s%sresource_mgmt: ", prefix, TAB);
-	ofi_tostr_resource_mgmt(buf, attr->resource_mgmt);
-	ofi_strcatf(buf, "\n");
-	ofi_strcatf(buf, "%s%sav_type: ", prefix, TAB);
-	ofi_tostr_av_type(buf, attr->av_type);
-	ofi_strcatf(buf, "\n");
-	ofi_strcatf(buf, "%s%smr_mode: [ ", prefix, TAB);
-	ofi_tostr_mr_mode(buf, attr->mr_mode);
-	ofi_strcatf(buf, " ]\n");
-
-	ofi_strcatf(buf, "%s%smr_key_size: %zu\n", prefix, TAB, attr->mr_key_size);
-	ofi_strcatf(buf, "%s%scq_data_size: %zu\n", prefix, TAB, attr->cq_data_size);
-	ofi_strcatf(buf, "%s%scq_cnt: %zu\n", prefix, TAB, attr->cq_cnt);
-	ofi_strcatf(buf, "%s%sep_cnt: %zu\n", prefix, TAB, attr->ep_cnt);
-	ofi_strcatf(buf, "%s%stx_ctx_cnt: %zu\n", prefix, TAB, attr->tx_ctx_cnt);
-	ofi_strcatf(buf, "%s%srx_ctx_cnt: %zu\n", prefix, TAB, attr->rx_ctx_cnt);
-	ofi_strcatf(buf, "%s%smax_ep_tx_ctx: %zu\n", prefix, TAB, attr->max_ep_tx_ctx);
-	ofi_strcatf(buf, "%s%smax_ep_rx_ctx: %zu\n", prefix, TAB, attr->max_ep_rx_ctx);
-	ofi_strcatf(buf, "%s%smax_ep_stx_ctx: %zu\n", prefix, TAB, attr->max_ep_stx_ctx);
-	ofi_strcatf(buf, "%s%smax_ep_srx_ctx: %zu\n", prefix, TAB, attr->max_ep_srx_ctx);
-	ofi_strcatf(buf, "%s%scntr_cnt: %zu\n", prefix, TAB, attr->cntr_cnt);
-	ofi_strcatf(buf, "%s%smr_iov_limit: %zu\n", prefix, TAB, attr->mr_iov_limit);
-
-	ofi_strcatf(buf, "%scaps: [ ", TAB);
-	ofi_tostr_caps(buf, attr->caps);
-	ofi_strcatf(buf, " ]\n");
-
-	ofi_strcatf(buf, "%smode: [ ", TAB);
-	ofi_tostr_mode(buf, attr->mode);
-	ofi_strcatf(buf, " ]\n");
-
-	ofi_strcatf(buf, "%s%sauth_key_size: %zu\n", prefix, TAB, attr->auth_key_size);
-	ofi_strcatf(buf, "%s%smax_err_data: %zu\n", prefix, TAB, attr->max_err_data);
-	ofi_strcatf(buf, "%s%smr_cnt: %zu\n", prefix, TAB, attr->mr_cnt);
-}
-
-static void ofi_tostr_fabric_attr(char *buf, const struct fi_fabric_attr *attr,
-				 const char *prefix)
+	ofi_strncatf(buf, len, "%sfi_domain_attr:\n", prefix);
+
+	ofi_strncatf(buf, len, "%s%sdomain: 0x%x\n", prefix, TAB, attr->domain);
+
+	ofi_strncatf(buf, len, "%s%sname: %s\n", prefix, TAB, attr->name);
+	ofi_strncatf(buf, len, "%s%sthreading: ", prefix, TAB);
+	ofi_tostr_threading(buf, len, attr->threading);
+	ofi_strncatf(buf, len, "\n");
+
+	ofi_strncatf(buf, len, "%s%scontrol_progress: ", prefix, TAB);
+	ofi_tostr_progress(buf, len, attr->control_progress);
+	ofi_strncatf(buf, len, "\n");
+	ofi_strncatf(buf, len, "%s%sdata_progress: ", prefix, TAB);
+	ofi_tostr_progress(buf, len, attr->data_progress);
+	ofi_strncatf(buf, len, "\n");
+	ofi_strncatf(buf, len, "%s%sresource_mgmt: ", prefix, TAB);
+	ofi_tostr_resource_mgmt(buf, len, attr->resource_mgmt);
+	ofi_strncatf(buf, len, "\n");
+	ofi_strncatf(buf, len, "%s%sav_type: ", prefix, TAB);
+	ofi_tostr_av_type(buf, len, attr->av_type);
+	ofi_strncatf(buf, len, "\n");
+	ofi_strncatf(buf, len, "%s%smr_mode: [ ", prefix, TAB);
+	ofi_tostr_mr_mode(buf, len, attr->mr_mode);
+	ofi_strncatf(buf, len, " ]\n");
+
+	ofi_strncatf(buf, len, "%s%smr_key_size: %zu\n", prefix, TAB,
+		     attr->mr_key_size);
+	ofi_strncatf(buf, len, "%s%scq_data_size: %zu\n", prefix, TAB,
+		     attr->cq_data_size);
+	ofi_strncatf(buf, len, "%s%scq_cnt: %zu\n", prefix, TAB,
+		     attr->cq_cnt);
+	ofi_strncatf(buf, len, "%s%sep_cnt: %zu\n", prefix, TAB, attr->ep_cnt);
+	ofi_strncatf(buf, len, "%s%stx_ctx_cnt: %zu\n", prefix, TAB,
+		     attr->tx_ctx_cnt);
+	ofi_strncatf(buf, len, "%s%srx_ctx_cnt: %zu\n", prefix, TAB,
+		     attr->rx_ctx_cnt);
+	ofi_strncatf(buf, len, "%s%smax_ep_tx_ctx: %zu\n", prefix, TAB,
+		     attr->max_ep_tx_ctx);
+	ofi_strncatf(buf, len, "%s%smax_ep_rx_ctx: %zu\n", prefix, TAB,
+		     attr->max_ep_rx_ctx);
+	ofi_strncatf(buf, len, "%s%smax_ep_stx_ctx: %zu\n", prefix, TAB,
+		     attr->max_ep_stx_ctx);
+	ofi_strncatf(buf, len, "%s%smax_ep_srx_ctx: %zu\n", prefix, TAB,
+		     attr->max_ep_srx_ctx);
+	ofi_strncatf(buf, len, "%s%scntr_cnt: %zu\n", prefix, TAB,
+		     attr->cntr_cnt);
+	ofi_strncatf(buf, len, "%s%smr_iov_limit: %zu\n", prefix, TAB,
+		     attr->mr_iov_limit);
+
+	ofi_strncatf(buf, len, "%scaps: [ ", TAB);
+	ofi_tostr_caps(buf, len, attr->caps);
+	ofi_strncatf(buf, len, " ]\n");
+
+	ofi_strncatf(buf, len, "%smode: [ ", TAB);
+	ofi_tostr_mode(buf, len, attr->mode);
+	ofi_strncatf(buf, len, " ]\n");
+
+	ofi_strncatf(buf, len, "%s%sauth_key_size: %zu\n", prefix, TAB,
+		     attr->auth_key_size);
+	ofi_strncatf(buf, len, "%s%smax_err_data: %zu\n", prefix, TAB,
+		     attr->max_err_data);
+	ofi_strncatf(buf, len, "%s%smr_cnt: %zu\n", prefix, TAB, attr->mr_cnt);
+	ofi_strncatf(buf, len, "%s%stclass: 0x%x\n", prefix, TAB, attr->tclass);
+}
+
+static void
+ofi_tostr_fabric_attr(char *buf, size_t len, const struct fi_fabric_attr *attr,
+		      const char *prefix)
 {
 	if (!attr) {
-		ofi_strcatf(buf, "%sfi_fabric_attr: (null)\n", prefix);
+		ofi_strncatf(buf, len, "%sfi_fabric_attr: (null)\n", prefix);
 		return;
 	}
 
-	ofi_strcatf(buf, "%sfi_fabric_attr:\n", prefix);
-	ofi_strcatf(buf, "%s%sname: %s\n", prefix, TAB, attr->name);
-	ofi_strcatf(buf, "%s%sprov_name: %s\n", prefix, TAB, attr->prov_name);
-	ofi_strcatf(buf, "%s%sprov_version: %d.%d\n", prefix, TAB,
+	ofi_strncatf(buf, len, "%sfi_fabric_attr:\n", prefix);
+	ofi_strncatf(buf, len, "%s%sname: %s\n", prefix, TAB, attr->name);
+	ofi_strncatf(buf, len, "%s%sprov_name: %s\n", prefix, TAB,
+		     attr->prov_name);
+	ofi_strncatf(buf, len, "%s%sprov_version: %d.%d\n", prefix, TAB,
 		FI_MAJOR(attr->prov_version), FI_MINOR(attr->prov_version));
-	ofi_strcatf(buf, "%s%sapi_version: %d.%d\n", prefix, TAB,
+	ofi_strncatf(buf, len, "%s%sapi_version: %d.%d\n", prefix, TAB,
 		FI_MAJOR(attr->api_version), FI_MINOR(attr->api_version));
 }
 
-static void ofi_tostr_info(char *buf, const struct fi_info *info)
+static void ofi_tostr_info(char *buf, size_t len, const struct fi_info *info)
 {
-	ofi_strcatf(buf, "fi_info:\n");
-	ofi_strcatf(buf, "%scaps: [ ", TAB);
-	ofi_tostr_caps(buf, info->caps);
-	ofi_strcatf(buf, " ]\n");
-
-	ofi_strcatf(buf, "%smode: [ ", TAB);
-	ofi_tostr_mode(buf, info->mode);
-	ofi_strcatf(buf, " ]\n");
-
-	ofi_strcatf(buf, "%saddr_format: ", TAB);
-	ofi_tostr_addr_format(buf, info->addr_format);
-	ofi_strcatf(buf, "\n");
-
-	ofi_strcatf(buf, "%ssrc_addrlen: %zu\n", TAB, info->src_addrlen);
-	ofi_strcatf(buf, "%sdest_addrlen: %zu\n", TAB, info->dest_addrlen);
-	ofi_strcatf(buf, "%ssrc_addr: ", TAB);
-	ofi_tostr_addr(buf, info->addr_format, info->src_addr);
-	ofi_strcatf(buf, "\n");
-	ofi_strcatf(buf, "%sdest_addr: ", TAB);
-	ofi_tostr_addr(buf, info->addr_format, info->dest_addr);
-	ofi_strcatf(buf, "\n");
-	ofi_tostr_fid(TAB "handle: ", buf, info->handle);
-
-	ofi_tostr_tx_attr(buf, info->tx_attr, TAB);
-	ofi_tostr_rx_attr(buf, info->rx_attr, TAB);
-	ofi_tostr_ep_attr(buf, info->ep_attr, TAB);
-	ofi_tostr_domain_attr(buf, info->domain_attr, TAB);
-	ofi_tostr_fabric_attr(buf, info->fabric_attr, TAB);
-	ofi_tostr_fid(TAB "nic_fid: ", buf, &info->nic->fid);
+	ofi_strncatf(buf, len, "fi_info:\n");
+	ofi_strncatf(buf, len, "%scaps: [ ", TAB);
+	ofi_tostr_caps(buf, len, info->caps);
+	ofi_strncatf(buf, len, " ]\n");
+
+	ofi_strncatf(buf, len, "%smode: [ ", TAB);
+	ofi_tostr_mode(buf, len, info->mode);
+	ofi_strncatf(buf, len, " ]\n");
+
+	ofi_strncatf(buf, len, "%saddr_format: ", TAB);
+	ofi_tostr_addr_format(buf, len, info->addr_format);
+	ofi_strncatf(buf, len, "\n");
+
+	ofi_strncatf(buf, len, "%ssrc_addrlen: %zu\n", TAB, info->src_addrlen);
+	ofi_strncatf(buf, len, "%sdest_addrlen: %zu\n", TAB,
+		     info->dest_addrlen);
+	ofi_strncatf(buf, len, "%ssrc_addr: ", TAB);
+	ofi_tostr_addr(buf, len, info->addr_format, info->src_addr);
+	ofi_strncatf(buf, len, "\n");
+	ofi_strncatf(buf, len, "%sdest_addr: ", TAB);
+	ofi_tostr_addr(buf, len, info->addr_format, info->dest_addr);
+	ofi_strncatf(buf, len, "\n");
+	ofi_tostr_fid(TAB "handle: ", buf, len, info->handle);
+
+	ofi_tostr_tx_attr(buf, len, info->tx_attr, TAB);
+	ofi_tostr_rx_attr(buf, len, info->rx_attr, TAB);
+	ofi_tostr_ep_attr(buf, len, info->ep_attr, TAB);
+	ofi_tostr_domain_attr(buf, len, info->domain_attr, TAB);
+	ofi_tostr_fabric_attr(buf, len, info->fabric_attr, TAB);
+	ofi_tostr_fid(TAB "nic: ", buf, len, &info->nic->fid);
 }
 
-static void ofi_tostr_atomic_type(char *buf, enum fi_datatype type)
+static void ofi_tostr_atomic_type(char *buf, size_t len, enum fi_datatype type)
 {
 	switch (type) {
-	CASEENUMSTR(FI_INT8);
-	CASEENUMSTR(FI_UINT8);
-	CASEENUMSTR(FI_INT16);
-	CASEENUMSTR(FI_UINT16);
-	CASEENUMSTR(FI_INT32);
-	CASEENUMSTR(FI_UINT32);
-	CASEENUMSTR(FI_INT64);
-	CASEENUMSTR(FI_UINT64);
-	CASEENUMSTR(FI_FLOAT);
-	CASEENUMSTR(FI_DOUBLE);
-	CASEENUMSTR(FI_FLOAT_COMPLEX);
-	CASEENUMSTR(FI_DOUBLE_COMPLEX);
-	CASEENUMSTR(FI_LONG_DOUBLE);
-	CASEENUMSTR(FI_LONG_DOUBLE_COMPLEX);
+	CASEENUMSTRN(FI_INT8, len);
+	CASEENUMSTRN(FI_UINT8, len);
+	CASEENUMSTRN(FI_INT16, len);
+	CASEENUMSTRN(FI_UINT16, len);
+	CASEENUMSTRN(FI_INT32, len);
+	CASEENUMSTRN(FI_UINT32, len);
+	CASEENUMSTRN(FI_INT64, len);
+	CASEENUMSTRN(FI_UINT64, len);
+	CASEENUMSTRN(FI_INT128, len);
+	CASEENUMSTRN(FI_UINT128, len);
+	CASEENUMSTRN(FI_FLOAT, len);
+	CASEENUMSTRN(FI_DOUBLE, len);
+	CASEENUMSTRN(FI_FLOAT_COMPLEX, len);
+	CASEENUMSTRN(FI_DOUBLE_COMPLEX, len);
+	CASEENUMSTRN(FI_LONG_DOUBLE, len);
+	CASEENUMSTRN(FI_LONG_DOUBLE_COMPLEX, len);
 	default:
-		ofi_strcatf(buf, "Unknown");
+		ofi_strncatf(buf, len, "Unknown");
 		break;
 	}
 }
 
-static void ofi_tostr_atomic_op(char *buf, enum fi_op op)
+static void ofi_tostr_atomic_op(char *buf, size_t len, enum fi_op op)
 {
 	switch (op) {
-	CASEENUMSTR(FI_MIN);
-	CASEENUMSTR(FI_MAX);
-	CASEENUMSTR(FI_SUM);
-	CASEENUMSTR(FI_PROD);
-	CASEENUMSTR(FI_LOR);
-	CASEENUMSTR(FI_LAND);
-	CASEENUMSTR(FI_BOR);
-	CASEENUMSTR(FI_BAND);
-	CASEENUMSTR(FI_LXOR);
-	CASEENUMSTR(FI_BXOR);
-	CASEENUMSTR(FI_ATOMIC_READ);
-	CASEENUMSTR(FI_ATOMIC_WRITE);
-	CASEENUMSTR(FI_CSWAP);
-	CASEENUMSTR(FI_CSWAP_NE);
-	CASEENUMSTR(FI_CSWAP_LE);
-	CASEENUMSTR(FI_CSWAP_LT);
-	CASEENUMSTR(FI_CSWAP_GE);
-	CASEENUMSTR(FI_CSWAP_GT);
-	CASEENUMSTR(FI_MSWAP);
+	CASEENUMSTRN(FI_MIN, len);
+	CASEENUMSTRN(FI_MAX, len);
+	CASEENUMSTRN(FI_SUM, len);
+	CASEENUMSTRN(FI_PROD, len);
+	CASEENUMSTRN(FI_LOR, len);
+	CASEENUMSTRN(FI_LAND, len);
+	CASEENUMSTRN(FI_BOR, len);
+	CASEENUMSTRN(FI_BAND, len);
+	CASEENUMSTRN(FI_LXOR, len);
+	CASEENUMSTRN(FI_BXOR, len);
+	CASEENUMSTRN(FI_ATOMIC_READ, len);
+	CASEENUMSTRN(FI_ATOMIC_WRITE, len);
+	CASEENUMSTRN(FI_CSWAP, len);
+	CASEENUMSTRN(FI_CSWAP_NE, len);
+	CASEENUMSTRN(FI_CSWAP_LE, len);
+	CASEENUMSTRN(FI_CSWAP_LT, len);
+	CASEENUMSTRN(FI_CSWAP_GE, len);
+	CASEENUMSTRN(FI_CSWAP_GT, len);
+	CASEENUMSTRN(FI_MSWAP, len);
 	default:
-		ofi_strcatf(buf, "Unknown");
+		ofi_strncatf(buf, len, "Unknown");
 		break;
 	}
 }
 
-static void ofi_tostr_collective_op(char *buf, enum fi_collective_op op)
+static void
+ofi_tostr_collective_op(char *buf, size_t len, enum fi_collective_op op)
 {
 	switch (op) {
-	CASEENUMSTR(FI_BARRIER);
-	CASEENUMSTR(FI_BROADCAST);
-	CASEENUMSTR(FI_ALLTOALL);
-	CASEENUMSTR(FI_ALLREDUCE);
-	CASEENUMSTR(FI_ALLGATHER);
-	CASEENUMSTR(FI_REDUCE_SCATTER);
-	CASEENUMSTR(FI_REDUCE);
-	CASEENUMSTR(FI_SCATTER);
-	CASEENUMSTR(FI_GATHER);
+	CASEENUMSTRN(FI_BARRIER, len);
+	CASEENUMSTRN(FI_BROADCAST, len);
+	CASEENUMSTRN(FI_ALLTOALL, len);
+	CASEENUMSTRN(FI_ALLREDUCE, len);
+	CASEENUMSTRN(FI_ALLGATHER, len);
+	CASEENUMSTRN(FI_REDUCE_SCATTER, len);
+	CASEENUMSTRN(FI_REDUCE, len);
+	CASEENUMSTRN(FI_SCATTER, len);
+	CASEENUMSTRN(FI_GATHER, len);
 	default:
-		ofi_strcatf(buf, "Unknown");
+		ofi_strncatf(buf, len, "Unknown");
 		break;
 	}
 }
 
-static void ofi_tostr_version(char *buf)
+static void ofi_tostr_version(char *buf, size_t len)
 {
-	ofi_strcatf(buf, VERSION);
-	ofi_strcatf(buf, BUILD_ID);
+	ofi_strncatf(buf, len, VERSION);
+	ofi_strncatf(buf, len, BUILD_ID);
 }
 
-static void ofi_tostr_eq_event(char *buf, int type)
+static void ofi_tostr_eq_event(char *buf, size_t len, int type)
 {
 	switch (type) {
-	CASEENUMSTR(FI_NOTIFY);
-	CASEENUMSTR(FI_CONNREQ);
-	CASEENUMSTR(FI_CONNECTED);
-	CASEENUMSTR(FI_SHUTDOWN);
-	CASEENUMSTR(FI_MR_COMPLETE);
-	CASEENUMSTR(FI_AV_COMPLETE);
-	CASEENUMSTR(FI_JOIN_COMPLETE);
+	CASEENUMSTRN(FI_NOTIFY, len);
+	CASEENUMSTRN(FI_CONNREQ, len);
+	CASEENUMSTRN(FI_CONNECTED, len);
+	CASEENUMSTRN(FI_SHUTDOWN, len);
+	CASEENUMSTRN(FI_MR_COMPLETE, len);
+	CASEENUMSTRN(FI_AV_COMPLETE, len);
+	CASEENUMSTRN(FI_JOIN_COMPLETE, len);
 	default:
-		ofi_strcatf(buf, "Unknown");
+		ofi_strncatf(buf, len, "Unknown");
 		break;
 	}
 }
 
-static void ofi_tostr_cq_event_flags(char *buf, uint64_t flags)
-{
-	IFFLAGSTR(flags, FI_SEND);
-	IFFLAGSTR(flags, FI_RECV);
-	IFFLAGSTR(flags, FI_RMA);
-	IFFLAGSTR(flags, FI_ATOMIC);
-	IFFLAGSTR(flags, FI_MSG);
-	IFFLAGSTR(flags, FI_TAGGED);
-	IFFLAGSTR(flags, FI_READ);
-	IFFLAGSTR(flags, FI_WRITE);
-	IFFLAGSTR(flags, FI_REMOTE_READ);
-	IFFLAGSTR(flags, FI_REMOTE_WRITE);
-	IFFLAGSTR(flags, FI_REMOTE_CQ_DATA);
-	IFFLAGSTR(flags, FI_MULTI_RECV);
-	IFFLAGSTR(flags, FI_MORE);
-	IFFLAGSTR(flags, FI_CLAIM);
+static void ofi_tostr_cq_event_flags(char *buf, size_t len, uint64_t flags)
+{
+	IFFLAGSTRN(flags, FI_SEND, len);
+	IFFLAGSTRN(flags, FI_RECV, len);
+	IFFLAGSTRN(flags, FI_RMA, len);
+	IFFLAGSTRN(flags, FI_ATOMIC, len);
+	IFFLAGSTRN(flags, FI_MSG, len);
+	IFFLAGSTRN(flags, FI_TAGGED, len);
+	IFFLAGSTRN(flags, FI_READ, len);
+	IFFLAGSTRN(flags, FI_WRITE, len);
+	IFFLAGSTRN(flags, FI_REMOTE_READ, len);
+	IFFLAGSTRN(flags, FI_REMOTE_WRITE, len);
+	IFFLAGSTRN(flags, FI_REMOTE_CQ_DATA, len);
+	IFFLAGSTRN(flags, FI_MULTI_RECV, len);
+	IFFLAGSTRN(flags, FI_MORE, len);
+	IFFLAGSTRN(flags, FI_CLAIM, len);
 	ofi_remove_comma(buf);
 }
 
-static void ofi_tostr_hmem_iface(char *buf, enum fi_hmem_iface iface)
+static void
+ofi_tostr_hmem_iface(char *buf, size_t len, enum fi_hmem_iface iface)
 {
 	switch (iface) {
-	CASEENUMSTR(FI_HMEM_SYSTEM);
-	CASEENUMSTR(FI_HMEM_CUDA);
-	CASEENUMSTR(FI_HMEM_ROCR);
-	CASEENUMSTR(FI_HMEM_ZE);
+	CASEENUMSTRN(FI_HMEM_SYSTEM, len);
+	CASEENUMSTRN(FI_HMEM_CUDA, len);
+	CASEENUMSTRN(FI_HMEM_ROCR, len);
+	CASEENUMSTRN(FI_HMEM_ZE, len);
+	default:
+		ofi_strncatf(buf, len, "Unknown");
+		break;
+	}
+}
+
+static void
+ofi_tostr_cq_format(char *buf, size_t len, enum fi_cq_format cq_format)
+{
+	switch (cq_format) {
+	CASEENUMSTRN(FI_CQ_FORMAT_UNSPEC, len);
+	CASEENUMSTRN(FI_CQ_FORMAT_CONTEXT, len);
+	CASEENUMSTRN(FI_CQ_FORMAT_MSG, len);
+	CASEENUMSTRN(FI_CQ_FORMAT_DATA, len);
+	CASEENUMSTRN(FI_CQ_FORMAT_TAGGED, len);
 	default:
-		ofi_strcatf(buf, "Unknown");
+		ofi_strncatf(buf, len, "Unknown");
 		break;
 	}
 }
 
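+/* Reentrant variant: the caller supplies the output buffer. */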
 __attribute__((visibility ("default"),EXTERNALLY_VISIBLE))
-char *DEFAULT_SYMVER_PRE(fi_tostr)(const void *data, enum fi_type datatype)
+char *DEFAULT_SYMVER_PRE(fi_tostr_r)(char *buf, size_t len,
+				     const void *data, enum fi_type datatype)
 {
-	static char *buf = NULL;
 	const uint64_t *val64;
 	const uint32_t *val32;
 	const int *enumval;
 
-	if (!data)
+	if (!data || !buf || !len)
 		return NULL;
 
 	val64 = (const uint64_t *) data;
 	val32 = (const uint32_t *) data;
 	enumval = (const int *) data;
 
-	if (!buf) {
-		buf = calloc(OFI_BUFSIZ, 1);
-		if (!buf)
-			return NULL;
-	}
 	buf[0] = '\0';
 
 	switch (datatype) {
 	case FI_TYPE_INFO:
-		ofi_tostr_info(buf, data);
+		ofi_tostr_info(buf, len, data);
 		break;
 	case FI_TYPE_EP_TYPE:
-		ofi_tostr_ep_type(buf, *enumval);
+		ofi_tostr_ep_type(buf, len, *enumval);
 		break;
 	case FI_TYPE_CAPS:
-		ofi_tostr_caps(buf, *val64);
+		ofi_tostr_caps(buf, len, *val64);
 		break;
 	case FI_TYPE_OP_FLAGS:
-		ofi_tostr_opflags(buf, *val64);
+		ofi_tostr_opflags(buf, len, *val64);
 		break;
 	case FI_TYPE_ADDR_FORMAT:
-		ofi_tostr_addr_format(buf, *val32);
+		ofi_tostr_addr_format(buf, len, *val32);
 		break;
 	case FI_TYPE_TX_ATTR:
-		ofi_tostr_tx_attr(buf, data, "");
+		ofi_tostr_tx_attr(buf, len, data, "");
 		break;
 	case FI_TYPE_RX_ATTR:
-		ofi_tostr_rx_attr(buf, data, "");
+		ofi_tostr_rx_attr(buf, len, data, "");
 		break;
 	case FI_TYPE_EP_ATTR:
-		ofi_tostr_ep_attr(buf, data, "");
+		ofi_tostr_ep_attr(buf, len, data, "");
 		break;
 	case FI_TYPE_DOMAIN_ATTR:
-		ofi_tostr_domain_attr(buf, data, "");
+		ofi_tostr_domain_attr(buf, len, data, "");
 		break;
 	case FI_TYPE_FABRIC_ATTR:
-		ofi_tostr_fabric_attr(buf, data, "");
+		ofi_tostr_fabric_attr(buf, len, data, "");
 		break;
 	case FI_TYPE_THREADING:
-		ofi_tostr_threading(buf, *enumval);
+		ofi_tostr_threading(buf, len, *enumval);
 		break;
 	case FI_TYPE_PROGRESS:
-		ofi_tostr_progress(buf, *enumval);
+		ofi_tostr_progress(buf, len, *enumval);
 		break;
 	case FI_TYPE_PROTOCOL:
-		ofi_tostr_protocol(buf, *val32);
+		ofi_tostr_protocol(buf, len, *val32);
 		break;
 	case FI_TYPE_MSG_ORDER:
-		ofi_tostr_msgorder(buf, *val64);
+		ofi_tostr_msgorder(buf, len, *val64);
 		break;
 	case FI_TYPE_MODE:
-		ofi_tostr_mode(buf, *val64);
+		ofi_tostr_mode(buf, len, *val64);
 		break;
 	case FI_TYPE_AV_TYPE:
-		ofi_tostr_av_type(buf, *enumval);
+		ofi_tostr_av_type(buf, len, *enumval);
 		break;
 	case FI_TYPE_ATOMIC_TYPE:
-		ofi_tostr_atomic_type(buf, *enumval);
+		ofi_tostr_atomic_type(buf, len, *enumval);
 		break;
 	case FI_TYPE_ATOMIC_OP:
-		ofi_tostr_atomic_op(buf, *enumval);
+		ofi_tostr_atomic_op(buf, len, *enumval);
 		break;
 	case FI_TYPE_VERSION:
-		ofi_tostr_version(buf);
+		ofi_tostr_version(buf, len);
 		break;
 	case FI_TYPE_EQ_EVENT:
-		ofi_tostr_eq_event(buf, *enumval);
+		ofi_tostr_eq_event(buf, len, *enumval);
 		break;
 	case FI_TYPE_CQ_EVENT_FLAGS:
-		ofi_tostr_cq_event_flags(buf, *val64);
+		ofi_tostr_cq_event_flags(buf, len, *val64);
 		break;
 	case FI_TYPE_MR_MODE:
 		/* mr_mode was an enum converted to int flags */
-		ofi_tostr_mr_mode(buf, *enumval);
+		ofi_tostr_mr_mode(buf, len, *enumval);
 		break;
 	case FI_TYPE_OP_TYPE:
-		ofi_tostr_op_type(buf, *enumval);
+		ofi_tostr_op_type(buf, len, *enumval);
 		break;
 	case FI_TYPE_FID:
-		ofi_tostr_fid("fid: ", buf, data);
+		ofi_tostr_fid("fid: ", buf, len, data);
 		break;
 	case FI_TYPE_COLLECTIVE_OP:
-		ofi_tostr_collective_op(buf, *enumval);
+		ofi_tostr_collective_op(buf, len, *enumval);
 		break;
 	case FI_TYPE_HMEM_IFACE:
-		ofi_tostr_hmem_iface(buf, *enumval);
+		ofi_tostr_hmem_iface(buf, len, *enumval);
+		break;
+	case FI_TYPE_CQ_FORMAT:
+		ofi_tostr_cq_format(buf, len, *enumval);
 		break;
 	default:
-		ofi_strcatf(buf, "Unknown type");
+		ofi_strncatf(buf, len, "Unknown type");
 		break;
 	}
 	return buf;
 }
-DEFAULT_SYMVER(fi_tostr_, fi_tostr, FABRIC_1.0);
+DEFAULT_SYMVER(fi_tostr_r_, fi_tostr_r, FABRIC_1.4);
 
-#undef CASEENUMSTR
-#undef IFFLAGSTR
+__attribute__((visibility ("default"),EXTERNALLY_VISIBLE))
+char *DEFAULT_SYMVER_PRE(fi_tostr)(const void *data, enum fi_type datatype)
+{
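+	/* Legacy convenience wrapper: not thread safe, formats into a
+	 * process-global buffer that is allocated once and reused across
+	 * calls. Use fi_tostr_r() for reentrant formatting. */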
+	static char *buf = NULL;
+	size_t len = 8192;
+
+	if (!buf) {
+		buf = calloc(len, 1);
+		if (!buf)
+			return NULL;
+	}
+
+	return fi_tostr_r(buf, len, data, datatype);
+}
+DEFAULT_SYMVER(fi_tostr_, fi_tostr, FABRIC_1.0);
diff --git a/deps/libfabric/src/hmem.c b/deps/libfabric/src/hmem.c
index 4e70a7c3cdc94e4bbcc6eebe58c595b32f0e99dc..aab59e64f7f48bbca800641b4fc409de5afe3546 100644
--- a/deps/libfabric/src/hmem.c
+++ b/deps/libfabric/src/hmem.c
@@ -1,5 +1,7 @@
 /*
  * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ * (C) Copyright 2020-2021 Intel Corporation. All rights reserved.
+ * (C) Copyright 2021 Amazon.com, Inc. or its affiliates.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -38,23 +40,11 @@
 #include "ofi.h"
 #include "ofi_iov.h"
 
-struct ofi_hmem_ops {
-	bool initialized;
-	int (*init)(void);
-	int (*cleanup)(void);
-	int (*copy_to_hmem)(uint64_t device, void *dest, const void *src,
-			    size_t size);
-	int (*copy_from_hmem)(uint64_t device, void *dest, const void *src,
-			      size_t size);
-	bool (*is_addr_valid)(const void *addr);
-	int (*get_handle)(void *dev_buf, void **handle);
-	int (*open_handle)(void **handle, uint64_t device, void **ipc_ptr);
-	int (*close_handle)(void *ipc_ptr);
-};
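+/* Toggled via the FI_HMEM_DISABLE_P2P environment variable
+ * (see ofi_hmem_init() below). */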
+bool ofi_hmem_disable_p2p = false;
 
-static struct ofi_hmem_ops hmem_ops[] = {
+struct ofi_hmem_ops hmem_ops[] = {
 	[FI_HMEM_SYSTEM] = {
-		.initialized = false,
+		.initialized = true,
 		.init = ofi_hmem_init_noop,
 		.cleanup = ofi_hmem_cleanup_noop,
 		.copy_to_hmem = ofi_memcpy,
@@ -62,6 +52,10 @@ static struct ofi_hmem_ops hmem_ops[] = {
 		.get_handle = ofi_hmem_no_get_handle,
 		.open_handle = ofi_hmem_no_open_handle,
 		.close_handle = ofi_hmem_no_close_handle,
+		.host_register = ofi_hmem_register_noop,
+		.host_unregister = ofi_hmem_host_unregister_noop,
+		.get_base_addr = ofi_hmem_no_base_addr,
+		.is_ipc_enabled = ofi_hmem_no_is_ipc_enabled,
 	},
 	[FI_HMEM_CUDA] = {
 		.initialized = false,
@@ -70,20 +64,28 @@ static struct ofi_hmem_ops hmem_ops[] = {
 		.copy_to_hmem = cuda_copy_to_dev,
 		.copy_from_hmem = cuda_copy_from_dev,
 		.is_addr_valid = cuda_is_addr_valid,
-		.get_handle = ofi_hmem_no_get_handle,
-		.open_handle = ofi_hmem_no_open_handle,
-		.close_handle = ofi_hmem_no_close_handle,
+		.get_handle = cuda_get_handle,
+		.open_handle = cuda_open_handle,
+		.close_handle = cuda_close_handle,
+		.host_register = cuda_host_register,
+		.host_unregister = cuda_host_unregister,
+		.get_base_addr = ofi_hmem_no_base_addr,
+		.is_ipc_enabled = cuda_is_ipc_enabled,
 	},
 	[FI_HMEM_ROCR] = {
 		.initialized = false,
 		.init = rocr_hmem_init,
 		.cleanup = rocr_hmem_cleanup,
-		.copy_to_hmem = rocr_memcpy,
-		.copy_from_hmem = rocr_memcpy,
+		.copy_to_hmem = rocr_copy_to_dev,
+		.copy_from_hmem = rocr_copy_from_dev,
 		.is_addr_valid = rocr_is_addr_valid,
 		.get_handle = ofi_hmem_no_get_handle,
 		.open_handle = ofi_hmem_no_open_handle,
 		.close_handle = ofi_hmem_no_close_handle,
+		.host_register = rocr_host_register,
+		.host_unregister = rocr_host_unregister,
+		.get_base_addr = ofi_hmem_no_base_addr,
+		.is_ipc_enabled = ofi_hmem_no_is_ipc_enabled,
 	},
 	[FI_HMEM_ZE] = {
 		.initialized = false,
@@ -95,6 +97,10 @@ static struct ofi_hmem_ops hmem_ops[] = {
 		.get_handle = ze_hmem_get_handle,
 		.open_handle = ze_hmem_open_handle,
 		.close_handle = ze_hmem_close_handle,
+		.host_register = ofi_hmem_register_noop,
+		.host_unregister = ofi_hmem_host_unregister_noop,
+		.get_base_addr = ze_hmem_get_base_addr,
+		.is_ipc_enabled = ze_hmem_p2p_enabled,
 	},
 };
 
@@ -131,8 +137,12 @@ static ssize_t ofi_copy_hmem_iov_buf(enum fi_hmem_iface hmem_iface, uint64_t dev
 
 		hmem_buf = (char *)hmem_iov[i].iov_base + hmem_iov_offset;
 		len -= hmem_iov_offset;
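+		/* Any remaining offset applies only to this first entry;
+		 * clear it so later iov entries start at their base, and
+		 * skip entries that contribute no bytes. */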
+		hmem_iov_offset = 0;
 
 		len = MIN(len, size);
+		if (!len)
+			continue;
+
 		if (dir == OFI_COPY_BUF_TO_IOV)
 			ret = ofi_copy_to_hmem(hmem_iface, device, hmem_buf,
 					       (char *)buf + done, len);
@@ -144,7 +154,6 @@ static ssize_t ofi_copy_hmem_iov_buf(enum fi_hmem_iface hmem_iface, uint64_t dev
 		if (ret)
 			return ret;
 
-		hmem_iov_offset = 0;
 		size -= len;
 		done += len;
 	}
@@ -165,11 +174,11 @@ ssize_t ofi_copy_from_hmem_iov(void *dest, size_t size,
 ssize_t ofi_copy_to_hmem_iov(enum fi_hmem_iface hmem_iface, uint64_t device,
 			     const struct iovec *hmem_iov,
 			     size_t hmem_iov_count, uint64_t hmem_iov_offset,
-			     void *src, size_t size)
+			     const void *src, size_t size)
 {
 	return ofi_copy_hmem_iov_buf(hmem_iface, device, hmem_iov,
 				     hmem_iov_count, hmem_iov_offset,
-				     src, size, OFI_COPY_BUF_TO_IOV);
+				     (void *) src, size, OFI_COPY_BUF_TO_IOV);
 }
 
 int ofi_hmem_get_handle(enum fi_hmem_iface iface, void *dev_buf, void **handle)
@@ -188,10 +197,21 @@ int ofi_hmem_close_handle(enum fi_hmem_iface iface, void *ipc_ptr)
 	return hmem_ops[iface].close_handle(ipc_ptr);
 }
 
+int ofi_hmem_get_base_addr(enum fi_hmem_iface iface, const void *ptr,
+			   void **base, size_t *size)
+{
+	return hmem_ops[iface].get_base_addr(ptr, base, size);
+}
+
+bool ofi_hmem_is_initialized(enum fi_hmem_iface iface)
+{
+	return hmem_ops[iface].initialized;
+}
+
 void ofi_hmem_init(void)
 {
-	enum fi_hmem_iface iface;
-	int ret;
+	int iface, ret;
+	int disable_p2p = 0;
 
 	for (iface = 0; iface < ARRAY_SIZE(hmem_ops); iface++) {
 		ret = hmem_ops[iface].init();
@@ -209,6 +229,15 @@ void ofi_hmem_init(void)
 			hmem_ops[iface].initialized = true;
 		}
 	}
+
+	fi_param_define(NULL, "hmem_disable_p2p", FI_PARAM_BOOL,
+			"Disable peer to peer support between device memory and"
+			" network devices. (default: no).");
+
+	if (!fi_param_get_bool(NULL, "hmem_disable_p2p", &disable_p2p)) {
+		if (disable_p2p == 1)
+			ofi_hmem_disable_p2p = true;
+	}
 }
 
 void ofi_hmem_cleanup(void)
@@ -216,14 +245,14 @@ void ofi_hmem_cleanup(void)
 	enum fi_hmem_iface iface;
 
 	for (iface = 0; iface < ARRAY_SIZE(hmem_ops); iface++) {
-		if (hmem_ops[iface].initialized)
+		if (ofi_hmem_is_initialized(iface))
 			hmem_ops[iface].cleanup();
 	}
 }
 
 enum fi_hmem_iface ofi_get_hmem_iface(const void *addr)
 {
-	enum fi_hmem_iface iface;
+	int iface;
 
 	/* Since a is_addr_valid function is not implemented for FI_HMEM_SYSTEM,
 	 * HMEM iface is skipped. In addition, if no other HMEM ifaces claim the
@@ -231,10 +260,70 @@ enum fi_hmem_iface ofi_get_hmem_iface(const void *addr)
 	 */
 	for (iface = ARRAY_SIZE(hmem_ops) - 1; iface > FI_HMEM_SYSTEM;
 	     iface--) {
-		if (hmem_ops[iface].initialized &&
-		    hmem_ops[iface].is_addr_valid(addr))
+		if (ofi_hmem_is_initialized(iface) &&
+		    hmem_ops[iface].is_addr_valid(addr, NULL, NULL))
 			return iface;
 	}
 
 	return FI_HMEM_SYSTEM;
 }
+
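+/*
+ * Fan host-memory registration out to every initialized iface,
+ * rolling back the registrations done so far if one of them fails.
+ */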
+int ofi_hmem_host_register(void *ptr, size_t size)
+{
+	int iface, ret;
+
+	for (iface = 0; iface < ARRAY_SIZE(hmem_ops); iface++) {
+		if (!ofi_hmem_is_initialized(iface))
+			continue;
+
+		ret = hmem_ops[iface].host_register(ptr, size);
+		if (ret != FI_SUCCESS)
+			goto err;
+	}
+
+	return FI_SUCCESS;
+
+err:
+	FI_WARN(&core_prov, FI_LOG_CORE,
+		"Failed to register host memory with hmem iface %s: %s\n",
+		fi_tostr(&iface, FI_TYPE_HMEM_IFACE),
+		fi_strerror(-ret));
+
+	for (iface--; iface >= 0; iface--) {
+		if (!ofi_hmem_is_initialized(iface))
+			continue;
+
+		hmem_ops[iface].host_unregister(ptr);
+	}
+
+	return ret;
+}
+
+int ofi_hmem_host_unregister(void *ptr)
+{
+	int iface, ret;
+
+	for (iface = 0; iface < ARRAY_SIZE(hmem_ops); iface++) {
+		if (!ofi_hmem_is_initialized(iface))
+			continue;
+
+		ret = hmem_ops[iface].host_unregister(ptr);
+		if (ret != FI_SUCCESS)
+			goto err;
+	}
+
+	return FI_SUCCESS;
+
+err:
+	FI_WARN(&core_prov, FI_LOG_CORE,
+		"Failed to unregister host memory with hmem iface %s: %s\n",
+		fi_tostr(&iface, FI_TYPE_HMEM_IFACE),
+		fi_strerror(-ret));
+
+	return ret;
+}
+
+bool ofi_hmem_is_ipc_enabled(enum fi_hmem_iface iface)
+{
+	return hmem_ops[iface].is_ipc_enabled();
+}
diff --git a/deps/libfabric/src/hmem_cuda.c b/deps/libfabric/src/hmem_cuda.c
index a7d1e73a1c528ad2e6ea2192a884900efb525894..5b23113765df15c2e4a540f7245b74cf0f91fdbc 100644
--- a/deps/libfabric/src/hmem_cuda.c
+++ b/deps/libfabric/src/hmem_cuda.c
@@ -1,5 +1,6 @@
 /*
  * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ * (C) Copyright 2021 Amazon.com, Inc. or its affiliates.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -43,15 +44,35 @@
 #include <cuda_runtime.h>
 
 struct cuda_ops {
-	cudaError_t (*cudaMemcpy)(void *dst, const void *src, size_t count,
+	cudaError_t (*cudaMemcpy)(void *dst, const void *src, size_t size,
 				  enum cudaMemcpyKind kind);
+	cudaError_t (*cudaFree)(void* ptr);
+	cudaError_t (*cudaMalloc)(void** ptr, size_t size);
 	const char *(*cudaGetErrorName)(cudaError_t error);
 	const char *(*cudaGetErrorString)(cudaError_t error);
 	CUresult (*cuPointerGetAttribute)(void *data,
 					  CUpointer_attribute attribute,
 					  CUdeviceptr ptr);
+	cudaError_t (*cudaHostRegister)(void *ptr, size_t size,
+					unsigned int flags);
+	cudaError_t (*cudaHostUnregister)(void *ptr);
+	cudaError_t (*cudaGetDeviceCount)(int *count);
+	cudaError_t (*cudaGetDevice)(int *device);
+	cudaError_t (*cudaSetDevice)(int device);
+	cudaError_t (*cudaIpcOpenMemHandle)(void **devptr,
+					    cudaIpcMemHandle_t handle,
+					    unsigned int flags);
+	cudaError_t (*cudaIpcGetMemHandle)(cudaIpcMemHandle_t *handle,
+					   void *devptr);
+	cudaError_t (*cudaIpcCloseMemHandle)(void *devptr);
 };
 
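+/*
+ * Chosen during cuda_hmem_init(): copies go through gdrcopy when it is
+ * available, otherwise through cudaMemcpy; CUDA IPC is only enabled
+ * when gdrcopy is off and cudaMemcpy transfers are permitted.
+ */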
+static bool hmem_cuda_use_gdrcopy;
+static bool cuda_ipc_enabled;
+
+static cudaError_t cuda_disabled_cudaMemcpy(void *dst, const void *src,
+					    size_t size, enum cudaMemcpyKind kind);
+
 #ifdef ENABLE_CUDA_DLOPEN
 
 #include <dlfcn.h>
@@ -64,17 +85,27 @@ static struct cuda_ops cuda_ops;
 
 static struct cuda_ops cuda_ops = {
 	.cudaMemcpy = cudaMemcpy,
+	.cudaFree = cudaFree,
+	.cudaMalloc = cudaMalloc,
 	.cudaGetErrorName = cudaGetErrorName,
 	.cudaGetErrorString = cudaGetErrorString,
 	.cuPointerGetAttribute = cuPointerGetAttribute,
+	.cudaHostRegister = cudaHostRegister,
+	.cudaHostUnregister = cudaHostUnregister,
+	.cudaGetDeviceCount = cudaGetDeviceCount,
+	.cudaGetDevice = cudaGetDevice,
+	.cudaSetDevice = cudaSetDevice,
+	.cudaIpcOpenMemHandle = cudaIpcOpenMemHandle,
+	.cudaIpcGetMemHandle = cudaIpcGetMemHandle,
+	.cudaIpcCloseMemHandle = cudaIpcCloseMemHandle
 };
 
 #endif /* ENABLE_CUDA_DLOPEN */
 
-cudaError_t ofi_cudaMemcpy(void *dst, const void *src, size_t count,
+cudaError_t ofi_cudaMemcpy(void *dst, const void *src, size_t size,
 			   enum cudaMemcpyKind kind)
 {
-	return cuda_ops.cudaMemcpy(dst, src, count, kind);
+	return cuda_ops.cudaMemcpy(dst, src, size, kind);
 }
 
 const char *ofi_cudaGetErrorName(cudaError_t error)
@@ -93,11 +124,31 @@ CUresult ofi_cuPointerGetAttribute(void *data, CUpointer_attribute attribute,
 	return cuda_ops.cuPointerGetAttribute(data, attribute, ptr);
 }
 
-int cuda_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size)
+cudaError_t ofi_cudaHostRegister(void *ptr, size_t size, unsigned int flags)
 {
+	return cuda_ops.cudaHostRegister(ptr, size, flags);
+}
+
+cudaError_t ofi_cudaHostUnregister(void *ptr)
+{
+	return cuda_ops.cudaHostUnregister(ptr);
+}
+
+static cudaError_t ofi_cudaGetDeviceCount(int *count)
+{
+	return cuda_ops.cudaGetDeviceCount(count);
+}
+
+int cuda_copy_to_dev(uint64_t device, void *dst, const void *src, size_t size)
+{
+	if (hmem_cuda_use_gdrcopy) {
+		cuda_gdrcopy_to_dev(device, dst, src, size);
+		return FI_SUCCESS;
+	}
+
 	cudaError_t cuda_ret;
 
-	cuda_ret = ofi_cudaMemcpy(dev, host, size, cudaMemcpyHostToDevice);
+	cuda_ret = ofi_cudaMemcpy(dst, src, size, cudaMemcpyDefault);
 	if (cuda_ret == cudaSuccess)
 		return 0;
 
@@ -109,11 +160,16 @@ int cuda_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size)
 	return -FI_EIO;
 }
 
-int cuda_copy_from_dev(uint64_t device, void *host, const void *dev, size_t size)
+int cuda_copy_from_dev(uint64_t device, void *dst, const void *src, size_t size)
 {
+	if (hmem_cuda_use_gdrcopy) {
+		cuda_gdrcopy_from_dev(device, dst, src, size);
+		return FI_SUCCESS;
+	}
+
 	cudaError_t cuda_ret;
 
-	cuda_ret = ofi_cudaMemcpy(host, dev, size, cudaMemcpyDeviceToHost);
+	cuda_ret = ofi_cudaMemcpy(dst, src, size, cudaMemcpyDefault);
 	if (cuda_ret == cudaSuccess)
 		return 0;
 
@@ -125,6 +181,87 @@ int cuda_copy_from_dev(uint64_t device, void *host, const void *dev, size_t size
 	return -FI_EIO;
 }
 
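+/*
+ * With gdrcopy, registration pins and maps the GPU pages up front;
+ * without it, the handle simply records the CUDA device index.
+ */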
+int cuda_dev_register(struct fi_mr_attr *mr_attr, uint64_t *handle)
+{
+	if (hmem_cuda_use_gdrcopy)
+		return cuda_gdrcopy_dev_register(mr_attr, handle);
+
+	*handle = mr_attr->device.cuda;
+	return FI_SUCCESS;
+}
+
+int cuda_dev_unregister(uint64_t handle)
+{
+	if (hmem_cuda_use_gdrcopy)
+		return cuda_gdrcopy_dev_unregister(handle);
+
+	return FI_SUCCESS;
+}
+
+int cuda_get_handle(void *dev_buf, void **handle)
+{
+	cudaError_t cuda_ret;
+
+	cuda_ret = cuda_ops.cudaIpcGetMemHandle((cudaIpcMemHandle_t *)handle,
+						dev_buf);
+
+	if (cuda_ret == cudaSuccess)
+		return FI_SUCCESS;
+
+	FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to perform cudaIpcGetMemHandle: %s:%s\n",
+			ofi_cudaGetErrorName(cuda_ret),
+			ofi_cudaGetErrorString(cuda_ret));
+
+	return -FI_EINVAL;
+}
+
+int cuda_open_handle(void **handle, uint64_t device, void **ipc_ptr)
+{
+	cudaError_t cuda_ret;
+
+	cuda_ret = cuda_ops.cudaIpcOpenMemHandle(ipc_ptr,
+						 *(cudaIpcMemHandle_t *)handle,
+						 cudaIpcMemLazyEnablePeerAccess);
+
+	if (cuda_ret == cudaSuccess)
+		return FI_SUCCESS;
+
+	FI_WARN(&core_prov, FI_LOG_CORE,
+		"Failed to perform cudaIpcOpenMemHandle: %s:%s\n",
+		ofi_cudaGetErrorName(cuda_ret),
+		ofi_cudaGetErrorString(cuda_ret));
+
+	return -FI_EINVAL;
+}
+
+int cuda_close_handle(void *ipc_ptr)
+{
+	cudaError_t cuda_ret;
+
+	cuda_ret = cuda_ops.cudaIpcCloseMemHandle(ipc_ptr);
+
+	if (cuda_ret == cudaSuccess)
+		return FI_SUCCESS;
+
+	FI_WARN(&core_prov, FI_LOG_CORE,
+		"Failed to perform cudaIpcCloseMemHandle: %s:%s\n",
+		ofi_cudaGetErrorName(cuda_ret),
+		ofi_cudaGetErrorString(cuda_ret));
+
+	return -FI_EINVAL;
+}
+
+static cudaError_t cuda_disabled_cudaMemcpy(void *dst, const void *src,
+					    size_t size, enum cudaMemcpyKind kind)
+{
+	FI_WARN(&core_prov, FI_LOG_CORE,
+		"cudaMemcpy was called but FI_HMEM_CUDA_ENABLE_XFER = 0, "
+		"no copy will occur to prevent deadlock.\n");
+
+	return cudaErrorInvalidValue;
+}
+
 static int cuda_hmem_dl_init(void)
 {
 #ifdef ENABLE_CUDA_DLOPEN
@@ -151,6 +288,18 @@ static int cuda_hmem_dl_init(void)
 		goto err_dlclose_cuda;
 	}
 
+	cuda_ops.cudaFree = dlsym(cudart_handle, "cudaFree");
+	if (!cuda_ops.cudaFree) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find cudaFree\n");
+		goto err_dlclose_cuda;
+	}
+
+	cuda_ops.cudaMalloc = dlsym(cudart_handle, "cudaMalloc");
+	if (!cuda_ops.cudaMalloc) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find cudaMalloc\n");
+		goto err_dlclose_cuda;
+	}
+
 	cuda_ops.cudaGetErrorName = dlsym(cudart_handle, "cudaGetErrorName");
 	if (!cuda_ops.cudaGetErrorName) {
 		FI_WARN(&core_prov, FI_LOG_CORE,
@@ -174,6 +323,69 @@ static int cuda_hmem_dl_init(void)
 		goto err_dlclose_cuda;
 	}
 
+	cuda_ops.cudaHostRegister = dlsym(cudart_handle, "cudaHostRegister");
+	if (!cuda_ops.cudaHostRegister) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find cudaHostRegister\n");
+		goto err_dlclose_cuda;
+	}
+
+	cuda_ops.cudaHostUnregister = dlsym(cudart_handle,
+					    "cudaHostUnregister");
+	if (!cuda_ops.cudaHostUnregister) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find cudaHostUnregister\n");
+		goto err_dlclose_cuda;
+	}
+
+	cuda_ops.cudaGetDeviceCount = dlsym(cudart_handle,
+					    "cudaGetDeviceCount");
+	if (!cuda_ops.cudaGetDeviceCount) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find cudaGetDeviceCount\n");
+		goto err_dlclose_cuda;
+	}
+
+	cuda_ops.cudaGetDevice = dlsym(cudart_handle, "cudaGetDevice");
+	if (!cuda_ops.cudaGetDevice) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find cudaGetDevice\n");
+		goto err_dlclose_cuda;
+	}
+
+	cuda_ops.cudaSetDevice = dlsym(cudart_handle, "cudaSetDevice");
+	if (!cuda_ops.cudaSetDevice) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find cudaSetDevice\n");
+		goto err_dlclose_cuda;
+	}
+
+	cuda_ops.cudaIpcOpenMemHandle = dlsym(cudart_handle,
+					    "cudaIpcOpenMemHandle");
+	if (!cuda_ops.cudaIpcOpenMemHandle) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find cudaIpcOpenMemHandle\n");
+		goto err_dlclose_cuda;
+	}
+
+	cuda_ops.cudaIpcGetMemHandle = dlsym(cudart_handle,
+					    "cudaIpcGetMemHandle");
+	if (!cuda_ops.cudaIpcGetMemHandle) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find cudaIpcGetMemHandle\n");
+		goto err_dlclose_cuda;
+	}
+
+	cuda_ops.cudaIpcCloseMemHandle = dlsym(cudart_handle,
+					    "cudaIpcCloseMemHandle");
+	if (!cuda_ops.cudaIpcCloseMemHandle) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find cudaIpcCloseMemHandle\n");
+		goto err_dlclose_cuda;
+	}
+
 	return FI_SUCCESS;
 
 err_dlclose_cuda:
@@ -187,22 +399,110 @@ err_dlclose_cudart:
 #endif /* ENABLE_CUDA_DLOPEN */
 }
 
-int cuda_hmem_init(void)
-{
-	return cuda_hmem_dl_init();
-}
-
-int cuda_hmem_cleanup(void)
+static void cuda_hmem_dl_cleanup(void)
 {
 #ifdef ENABLE_CUDA_DLOPEN
 	dlclose(cuda_handle);
 	dlclose(cudart_handle);
 #endif
+}
+
+static int cuda_hmem_verify_devices(void)
+{
+	int device_count;
+	cudaError_t cuda_ret;
+
+	/* Verify CUDA compute-capable devices are present on the host. */
+	cuda_ret = ofi_cudaGetDeviceCount(&device_count);
+	switch (cuda_ret) {
+	case cudaSuccess:
+		break;
+
+	case cudaErrorNoDevice:
+		return -FI_ENOSYS;
+
+	default:
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to perform cudaGetDeviceCount: %s:%s\n",
+			ofi_cudaGetErrorName(cuda_ret),
+			ofi_cudaGetErrorString(cuda_ret));
+		return -FI_EIO;
+	}
+
+	if (device_count == 0)
+		return -FI_ENOSYS;
+
+	return FI_SUCCESS;
+}
+
+int cuda_hmem_init(void)
+{
+	int ret;
+	int gdrcopy_ret;
+	bool cuda_enable_xfer;
+
+	fi_param_define(NULL, "hmem_cuda_use_gdrcopy", FI_PARAM_BOOL,
+			"Use gdrcopy to copy data to/from CUDA GPU memory. "
+			"If libfabric is not compiled with gdrcopy support, "
+			"this variable is not checked. (default: true)");
+	fi_param_define(NULL, "hmem_cuda_enable_xfer", FI_PARAM_BOOL,
+			"Enable use of CUDA APIs for copying data to/from CUDA "
+			"GPU memory. This should be disabled if CUDA "
+			"operations on the default stream would result in a "
+			"deadlock in the application. (default: true)");
+
+	ret = cuda_hmem_dl_init();
+	if (ret != FI_SUCCESS)
+		return ret;
+
+	ret = cuda_hmem_verify_devices();
+	if (ret != FI_SUCCESS)
+		goto dl_cleanup;
+
+	/* Default must be true; fi_param_get_bool() leaves ret untouched
+	 * when the variable is unset. */
+	ret = 1;
+	fi_param_get_bool(NULL, "hmem_cuda_use_gdrcopy", &ret);
+	hmem_cuda_use_gdrcopy = (ret != 0);
+	if (hmem_cuda_use_gdrcopy) {
+		gdrcopy_ret = cuda_gdrcopy_hmem_init();
+		if (gdrcopy_ret != FI_SUCCESS) {
+			hmem_cuda_use_gdrcopy = false;
+			if (gdrcopy_ret != -FI_ENOSYS)
+				FI_WARN(&core_prov, FI_LOG_CORE,
+					"gdrcopy initialization failed! "
+					"gdrcopy will not be used.\n");
+		}
+	}
+
+	ret = 1;
+	fi_param_get_bool(NULL, "hmem_cuda_enable_xfer", &ret);
+	cuda_enable_xfer = (ret != 0);
+
+	if (!cuda_enable_xfer)
+		cuda_ops.cudaMemcpy = cuda_disabled_cudaMemcpy;
+
+	/*
+	 * CUDA IPC is only enabled if gdrcopy is not in use and
+	 * cudaMemcpy can be used.
+	 */
+	cuda_ipc_enabled = !hmem_cuda_use_gdrcopy && cuda_enable_xfer;
 
 	return FI_SUCCESS;
+
+dl_cleanup:
+	cuda_hmem_dl_cleanup();
+
+	return ret;
 }
 
-bool cuda_is_addr_valid(const void *addr)
+int cuda_hmem_cleanup(void)
+{
+	cuda_hmem_dl_cleanup();
+	if (hmem_cuda_use_gdrcopy)
+		cuda_gdrcopy_hmem_cleanup();
+	return FI_SUCCESS;
+}
+
+bool cuda_is_addr_valid(const void *addr, uint64_t *device, uint64_t *flags)
 {
 	CUresult cuda_ret;
 	unsigned int data;
@@ -212,8 +512,21 @@ bool cuda_is_addr_valid(const void *addr)
 					     (CUdeviceptr)addr);
 	switch (cuda_ret) {
 	case CUDA_SUCCESS:
-		if (data == CU_MEMORYTYPE_DEVICE)
+		if (data == CU_MEMORYTYPE_DEVICE) {
+			if (flags)
+				*flags = FI_HMEM_DEVICE_ONLY;
+
+			if (device) {
+				*device = 0;
+				cuda_ret = ofi_cuPointerGetAttribute(
+						(int *) device,
+						CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
+						(CUdeviceptr) addr);
+				if (cuda_ret)
+					break;
+			}
 			return true;
+		}
 		break;
 
 	/* Returned if the buffer is not associated with the CUcontext support
@@ -248,6 +561,48 @@ bool cuda_is_addr_valid(const void *addr)
 	return false;
 }
 
+int cuda_host_register(void *ptr, size_t size)
+{
+	cudaError_t cuda_ret;
+
+	cuda_ret = ofi_cudaHostRegister(ptr, size, cudaHostRegisterDefault);
+	if (cuda_ret == cudaSuccess)
+		return FI_SUCCESS;
+
+	FI_WARN(&core_prov, FI_LOG_CORE,
+		"Failed to perform cudaHostRegister: %s:%s\n",
+		ofi_cudaGetErrorName(cuda_ret),
+		ofi_cudaGetErrorString(cuda_ret));
+
+	return -FI_EIO;
+}
+
+int cuda_host_unregister(void *ptr)
+{
+	cudaError_t cuda_ret;
+
+	cuda_ret = ofi_cudaHostUnregister(ptr);
+	if (cuda_ret == cudaSuccess)
+		return FI_SUCCESS;
+
+	FI_WARN(&core_prov, FI_LOG_CORE,
+		"Failed to perform cudaHostUnregister: %s:%s\n",
+		ofi_cudaGetErrorName(cuda_ret),
+		ofi_cudaGetErrorString(cuda_ret));
+
+	return -FI_EIO;
+}
+
+bool cuda_is_ipc_enabled(void)
+{
+	return !ofi_hmem_p2p_disabled() && cuda_ipc_enabled;
+}
+
+bool cuda_is_gdrcopy_enabled(void)
+{
+	return hmem_cuda_use_gdrcopy;
+}
+
 #else
 
 int cuda_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size)
@@ -270,7 +625,52 @@ int cuda_hmem_cleanup(void)
 	return -FI_ENOSYS;
 }
 
-bool cuda_is_addr_valid(const void *addr)
+bool cuda_is_addr_valid(const void *addr, uint64_t *device, uint64_t *flags)
+{
+	return false;
+}
+
+int cuda_host_register(void *ptr, size_t size)
+{
+	return -FI_ENOSYS;
+}
+
+int cuda_host_unregister(void *ptr)
+{
+	return -FI_ENOSYS;
+}
+
+int cuda_dev_register(struct fi_mr_attr *mr_attr, uint64_t *handle)
+{
+	return FI_SUCCESS;
+}
+
+int cuda_dev_unregister(uint64_t handle)
+{
+	return FI_SUCCESS;
+}
+
+int cuda_get_handle(void *dev_buf, void **handle)
+{
+	return -FI_ENOSYS;
+}
+
+int cuda_open_handle(void **handle, uint64_t device, void **ipc_ptr)
+{
+	return -FI_ENOSYS;
+}
+
+int cuda_close_handle(void *ipc_ptr)
+{
+	return -FI_ENOSYS;
+}
+
+bool cuda_is_ipc_enabled(void)
+{
+	return false;
+}
+
+bool cuda_is_gdrcopy_enabled(void)
 {
 	return false;
 }
diff --git a/deps/libfabric/src/hmem_cuda_gdrcopy.c b/deps/libfabric/src/hmem_cuda_gdrcopy.c
new file mode 100644
index 0000000000000000000000000000000000000000..0a92922ff4d5d95750f21f84054d5af61f132136
--- /dev/null
+++ b/deps/libfabric/src/hmem_cuda_gdrcopy.c
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2020 Amazon.com, Inc. or its affiliates.
+ * All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "ofi_hmem.h"
+#include "ofi.h"
+
+#if HAVE_GDRCOPY
+
+#include <pthread.h>
+#include <gdrapi.h>
+
+struct gdrcopy_handle {
+	gdr_mh_t mh; /* memory handler */
+	void *cuda_ptr; /* page aligned gpu pointer */
+	void *user_ptr; /* user space ptr mapped to GPU memory */
+	size_t length; /* page aligned length */
+};
+
+struct gdrcopy_ops {
+	gdr_t (*gdr_open)(void);
+	int (*gdr_close)(gdr_t g);
+	int (*gdr_pin_buffer)(gdr_t g, unsigned long addr, size_t size,
+			      uint64_t p2p_token, uint32_t va_space,
+			      gdr_mh_t *handle);
+	int (*gdr_unpin_buffer)(gdr_t g, gdr_mh_t handle);
+	int (*gdr_map)(gdr_t g, gdr_mh_t handle, void **va, size_t size);
+	int (*gdr_unmap)(gdr_t g, gdr_mh_t handle, void *va, size_t size);
+	int (*gdr_copy_to_mapping)(gdr_mh_t handle, void *map_d_ptr,
+				   const void *h_ptr, size_t size);
+	int (*gdr_copy_from_mapping)(gdr_mh_t handle, void *map_d_ptr,
+				     const void *h_ptr, size_t size);
+};
+
+enum gdrcopy_dir {
+	GDRCOPY_TO_DEVICE,
+	GDRCOPY_FROM_DEVICE,
+};
+
+static gdr_t global_gdr;
+static pthread_spinlock_t global_gdr_lock;
+
+#ifdef ENABLE_GDRCOPY_DLOPEN
+
+#include <dlfcn.h>
+
+static void *gdrapi_handle;
+static struct gdrcopy_ops global_gdrcopy_ops;
+
+static int cuda_gdrcopy_dl_hmem_init(void)
+{
+	gdrapi_handle = dlopen("libgdrapi.so", RTLD_NOW);
+	if (!gdrapi_handle) {
+		FI_INFO(&core_prov, FI_LOG_CORE,
+			"Failed to dlopen libgdrapi.so\n");
+		return -FI_ENOSYS;
+	}
+
+	global_gdrcopy_ops.gdr_open = dlsym(gdrapi_handle, "gdr_open");
+	if (!global_gdrcopy_ops.gdr_open) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find gdr_open\n");
+		goto err_dlclose_gdrapi;
+	}
+
+	global_gdrcopy_ops.gdr_close = dlsym(gdrapi_handle, "gdr_close");
+	if (!global_gdrcopy_ops.gdr_close) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find gdr_close\n");
+		goto err_dlclose_gdrapi;
+	}
+
+	global_gdrcopy_ops.gdr_pin_buffer = dlsym(gdrapi_handle, "gdr_pin_buffer");
+	if (!global_gdrcopy_ops.gdr_pin_buffer) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find gdr_pin_buffer\n");
+		goto err_dlclose_gdrapi;
+	}
+
+	global_gdrcopy_ops.gdr_unpin_buffer = dlsym(gdrapi_handle, "gdr_unpin_buffer");
+	if (!global_gdrcopy_ops.gdr_unpin_buffer) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find gdr_unpin_buffer\n");
+		goto err_dlclose_gdrapi;
+	}
+
+	global_gdrcopy_ops.gdr_map = dlsym(gdrapi_handle, "gdr_map");
+	if (!global_gdrcopy_ops.gdr_map) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find gdr_map\n");
+		goto err_dlclose_gdrapi;
+	}
+
+	global_gdrcopy_ops.gdr_unmap = dlsym(gdrapi_handle, "gdr_unmap");
+	if (!global_gdrcopy_ops.gdr_unmap) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find gdr_unmap\n");
+		goto err_dlclose_gdrapi;
+	}
+
+	global_gdrcopy_ops.gdr_copy_to_mapping = dlsym(gdrapi_handle, "gdr_copy_to_mapping");
+	if (!global_gdrcopy_ops.gdr_copy_to_mapping) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find gdr_copy_to_mapping\n");
+		goto err_dlclose_gdrapi;
+	}
+
+	global_gdrcopy_ops.gdr_copy_from_mapping = dlsym(gdrapi_handle, "gdr_copy_from_mapping");
+	if (!global_gdrcopy_ops.gdr_copy_from_mapping) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find gdr_copy_from_mapping\n");
+		goto err_dlclose_gdrapi;
+	}
+
+	return FI_SUCCESS;
+
+err_dlclose_gdrapi:
+	memset(&global_gdrcopy_ops, 0, sizeof(global_gdrcopy_ops));
+	dlclose(gdrapi_handle);
+	return -FI_ENODATA;
+}
+
+static int cuda_gdrcopy_dl_hmem_cleanup(void)
+{
+	dlclose(gdrapi_handle);
+	return FI_SUCCESS;
+}
+
+#else
+
+static struct gdrcopy_ops global_gdrcopy_ops = {
+	.gdr_open = gdr_open,
+	.gdr_close = gdr_close,
+	.gdr_pin_buffer = gdr_pin_buffer,
+	.gdr_unpin_buffer = gdr_unpin_buffer,
+	.gdr_map = gdr_map,
+	.gdr_unmap = gdr_unmap,
+	.gdr_copy_to_mapping = gdr_copy_to_mapping,
+	.gdr_copy_from_mapping = gdr_copy_from_mapping
+};
+
+static int cuda_gdrcopy_dl_hmem_init(void)
+{
+	return FI_SUCCESS;
+}
+
+static int cuda_gdrcopy_dl_hmem_cleanup(void)
+{
+	return FI_SUCCESS;
+}
+
+#endif /* ENABLE_CUDA_DLOPEN */
+
+int cuda_gdrcopy_hmem_init(void)
+{
+	int err, ret = 0;
+
+	err = cuda_gdrcopy_dl_hmem_init();
+	if (err) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"gdrcopy_dl_hmem_init failed!\n");
+		return -FI_ENOSYS;
+	}
+
+	assert(global_gdrcopy_ops.gdr_open);
+
+	global_gdr = global_gdrcopy_ops.gdr_open();
+	if (!global_gdr) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"gdr_open failed!\n");
+		ret = -FI_ENOMEM;
+		goto exit;
+	}
+
+	err = pthread_spin_init(&global_gdr_lock, 0);
+	if (err) {
+		assert(global_gdrcopy_ops.gdr_close);
+		global_gdrcopy_ops.gdr_close(global_gdr);
+		ret = -err;
+		goto exit;
+	}
+
+	return FI_SUCCESS;
+
+exit:
+	/* Unload libgdrapi only on failure: the resolved symbols must stay
+	 * valid while gdrcopy is in use, and cuda_gdrcopy_hmem_cleanup()
+	 * already performs the dl cleanup on the success path. */
+	cuda_gdrcopy_dl_hmem_cleanup();
+	return ret;
+}
+
+int cuda_gdrcopy_hmem_cleanup(void)
+{
+	int err, ret = 0;
+
+	err = pthread_spin_destroy(&global_gdr_lock);
+	if (err) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"destroy global_gdr_lock failed! err: %s\n",
+			strerror(err));
+		ret = err;
+	}
+
+	assert(global_gdrcopy_ops.gdr_close);
+	err = global_gdrcopy_ops.gdr_close(global_gdr);
+	if (err) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"close global_gdr failed! err: %s\n",
+			strerror(err));
+		ret = err;
+	}
+
+	err = cuda_gdrcopy_dl_hmem_cleanup();
+	if (err) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"cuda_gdrcopy_dl_hmem_cleanup() failed! err: %s\n",
+			strerror(err));
+		ret = err;
+	}
+
+	return ret;
+}
+
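+/*
+ * Copy through the persistent gdr_map() mapping set up at registration
+ * time: translate devptr to the matching offset in the host-visible
+ * mapping, then let gdrcopy move the bytes in the requested direction.
+ */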
+void cuda_gdrcopy_impl(uint64_t handle, void *devptr,
+		       void *hostptr, size_t len,
+		       enum gdrcopy_dir dir)
+{
+	ssize_t off;
+	struct gdrcopy_handle *gdrcopy;
+	void *gdrcopy_user_ptr;
+
+	assert(global_gdrcopy_ops.gdr_copy_to_mapping);
+	assert(handle);
+
+	gdrcopy = (struct gdrcopy_handle *)handle;
+	off = (char *)devptr - (char *)gdrcopy->cuda_ptr;
+	assert(off >= 0 && off + len <= gdrcopy->length);
+	gdrcopy_user_ptr = (char *)gdrcopy->user_ptr + off;
+	if (dir == GDRCOPY_TO_DEVICE) {
+		global_gdrcopy_ops.gdr_copy_to_mapping(gdrcopy->mh,
+						       gdrcopy_user_ptr,
+						       hostptr, len);
+	} else {
+		assert(dir == GDRCOPY_FROM_DEVICE);
+		global_gdrcopy_ops.gdr_copy_from_mapping(gdrcopy->mh,
+							 gdrcopy_user_ptr,
+							 hostptr, len);
+	}
+}
+
+void cuda_gdrcopy_to_dev(uint64_t handle, void *devptr,
+			 const void *hostptr, size_t len)
+{
+	cuda_gdrcopy_impl(handle, devptr, (void *)hostptr, len,
+			  GDRCOPY_TO_DEVICE);
+}
+
+void cuda_gdrcopy_from_dev(uint64_t handle, void *hostptr,
+			   const void *devptr, size_t len)
+{
+	cuda_gdrcopy_impl(handle, (void *)devptr, hostptr, len,
+			  GDRCOPY_FROM_DEVICE);
+}
+
+int cuda_gdrcopy_dev_register(struct fi_mr_attr *mr_attr, uint64_t *handle)
+{
+	int err;
+	uintptr_t regbgn, regend;
+	size_t reglen;
+	struct gdrcopy_handle *gdrcopy;
+
+	assert(global_gdr);
+	assert(global_gdrcopy_ops.gdr_pin_buffer);
+	assert(global_gdrcopy_ops.gdr_map);
+
+	regbgn = (uintptr_t)ofi_get_page_start(mr_attr->mr_iov->iov_base, GPU_PAGE_SIZE);
+	regend = (uintptr_t)mr_attr->mr_iov->iov_base + mr_attr->mr_iov->iov_len;
+	reglen = ofi_get_aligned_size(regend - regbgn, GPU_PAGE_SIZE);
+
+	gdrcopy = malloc(sizeof(struct gdrcopy_handle));
+	if (!gdrcopy)
+		return -FI_ENOMEM;
+
+	assert(global_gdr);
+	pthread_spin_lock(&global_gdr_lock);
+	err = global_gdrcopy_ops.gdr_pin_buffer(global_gdr, regbgn,
+					 reglen, 0, 0, &gdrcopy->mh);
+	if (err) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"gdr_pin_buffer failed! error: %s ptr: %p len: %ld\n",
+			strerror(err), mr_attr->mr_iov->iov_base, mr_attr->mr_iov->iov_len);
+		free(gdrcopy);
+		goto exit;
+	}
+
+	gdrcopy->cuda_ptr = (void *)regbgn;
+	gdrcopy->length = reglen;
+
+	err = global_gdrcopy_ops.gdr_map(global_gdr, gdrcopy->mh,
+					 &gdrcopy->user_ptr, gdrcopy->length);
+	if (err) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "gdr_map failed! error: %s\n",
+			strerror(err));
+		global_gdrcopy_ops.gdr_unpin_buffer(global_gdr, gdrcopy->mh);
+		free(gdrcopy);
+		goto exit;
+	}
+
+	*handle = (uint64_t)gdrcopy;
+exit:
+	pthread_spin_unlock(&global_gdr_lock);
+	return err;
+}
+
+int cuda_gdrcopy_dev_unregister(uint64_t handle)
+{
+	int err;
+	struct gdrcopy_handle *gdrcopy;
+
+	assert(global_gdr);
+	assert(global_gdrcopy_ops.gdr_unmap);
+	assert(global_gdrcopy_ops.gdr_unpin_buffer);
+
+	gdrcopy = (struct gdrcopy_handle *)handle;
+	assert(gdrcopy);
+
+	pthread_spin_lock(&global_gdr_lock);
+	err = global_gdrcopy_ops.gdr_unmap(global_gdr, gdrcopy->mh,
+					   gdrcopy->user_ptr, gdrcopy->length);
+	if (err) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"gdr_unmap failed! error: %s\n",
+			strerror(err));
+		goto exit;
+	}
+
+	err = global_gdrcopy_ops.gdr_unpin_buffer(global_gdr, gdrcopy->mh);
+	if (err) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"gdr_unpin_buffer failed! error: %s\n",
+			strerror(err));
+		goto exit;
+	}
+
+exit:
+	pthread_spin_unlock(&global_gdr_lock);
+	free(gdrcopy);
+	return err;
+}
+
+#else
+
+int cuda_gdrcopy_hmem_init(void)
+{
+	return -FI_ENOSYS;
+}
+
+int cuda_gdrcopy_hmem_cleanup(void)
+{
+	return FI_SUCCESS;
+}
+
+void cuda_gdrcopy_to_dev(uint64_t devhandle, void *devptr,
+			 const void *hostptr, size_t len)
+{
+}
+
+void cuda_gdrcopy_from_dev(uint64_t devhandle, void *hostptr,
+			   const void *devptr, size_t len)
+{
+}
+
+int cuda_gdrcopy_dev_register(struct fi_mr_attr *mr_attr, uint64_t *handle)
+{
+	return FI_SUCCESS;
+}
+
+int cuda_gdrcopy_dev_unregister(uint64_t handle)
+{
+	return FI_SUCCESS;
+}
+
+#endif /* HAVE_GDRCOPY */
diff --git a/deps/libfabric/src/hmem_rocr.c b/deps/libfabric/src/hmem_rocr.c
index 99eca4210ecc78c7f347b3ed4f8ca2691c6f0dc7..518b457a5fc5f9467ecccb62dfb8d53953c208d5 100644
--- a/deps/libfabric/src/hmem_rocr.c
+++ b/deps/libfabric/src/hmem_rocr.c
@@ -58,6 +58,13 @@ struct rocr_ops {
 	hsa_status_t (*hsa_amd_reg_dealloc_cb)(void *ptr,
 					       hsa_amd_deallocation_callback_t cb,
 					       void *user_data);
+	hsa_status_t (*hsa_amd_memory_lock)(void *host_ptr, size_t size,
+					    hsa_agent_t *agents, int num_agents,
+					    void **agent_ptr);
+	hsa_status_t (*hsa_amd_memory_unlock)(void *host_ptr);
+	hsa_status_t (*hsa_agent_get_info)(hsa_agent_t agent,
+					   hsa_agent_info_t attribute,
+					   void *value);
 };
 
 #ifdef ENABLE_ROCR_DLOPEN
@@ -79,10 +86,26 @@ static struct rocr_ops rocr_ops = {
 		hsa_amd_deregister_deallocation_callback,
 	.hsa_amd_reg_dealloc_cb =
 		hsa_amd_register_deallocation_callback,
+	.hsa_amd_memory_lock = hsa_amd_memory_lock,
+	.hsa_amd_memory_unlock = hsa_amd_memory_unlock,
+	.hsa_agent_get_info = hsa_agent_get_info,
 };
 
 #endif /* ENABLE_ROCR_DLOPEN */
 
+hsa_status_t ofi_hsa_amd_memory_lock(void *host_ptr, size_t size,
+				     hsa_agent_t *agents, int num_agents,
+				     void **agent_ptr)
+{
+	return rocr_ops.hsa_amd_memory_lock(host_ptr, size, agents, num_agents,
+					    agent_ptr);
+}
+
+hsa_status_t ofi_hsa_amd_memory_unlock(void *host_ptr)
+{
+	return rocr_ops.hsa_amd_memory_unlock(host_ptr);
+}
+
 hsa_status_t ofi_hsa_memory_copy(void *dst, const void *src, size_t size)
 {
 	return rocr_ops.hsa_memory_copy(dst, src, size);
@@ -138,7 +161,14 @@ hsa_status_t ofi_hsa_amd_reg_dealloc_cb(void *ptr,
 	return rocr_ops.hsa_amd_reg_dealloc_cb(ptr, cb, user_data);
 }
 
-int rocr_memcpy(uint64_t device, void *dest, const void *src, size_t size)
+static hsa_status_t ofi_hsa_agent_get_info(hsa_agent_t agent,
+					   hsa_agent_info_t attribute,
+					   void *value)
+{
+	return rocr_ops.hsa_agent_get_info(agent, attribute, value);
+}
+
+static int rocr_memcpy(void *dest, const void *src, size_t size)
 {
 	hsa_status_t hsa_ret;
 
@@ -153,18 +183,89 @@ int rocr_memcpy(uint64_t device, void *dest, const void *src, size_t size)
 	return -FI_EIO;
 }
 
-bool rocr_is_addr_valid(const void *addr)
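+/*
+ * If host_ptr was locked via hsa_amd_memory_lock(), hsa_memory_copy()
+ * needs the agent-visible address, so translate it; otherwise use the
+ * pointer as-is.
+ */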
+static int rocr_host_memory_ptr(void *host_ptr, void **ptr)
+{
+	hsa_amd_pointer_info_t info = {
+		.size = sizeof(info),
+	};
+	hsa_status_t hsa_ret;
+
+	hsa_ret = ofi_hsa_amd_pointer_info((void *)host_ptr, &info, NULL, NULL,
+					   NULL);
+	if (hsa_ret != HSA_STATUS_SUCCESS) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to perform hsa_amd_pointer_info: %s\n",
+			ofi_hsa_status_to_string(hsa_ret));
+
+		return -FI_EIO;
+	}
+
+	if (info.type != HSA_EXT_POINTER_TYPE_LOCKED)
+		*ptr = host_ptr;
+	else
+		*ptr = (void *) ((uintptr_t) info.agentBaseAddress +
+				 (uintptr_t) host_ptr -
+				 (uintptr_t) info.hostBaseAddress);
+
+	return FI_SUCCESS;
+}
+
+int rocr_copy_from_dev(uint64_t device, void *dest, const void *src,
+		       size_t size)
+{
+	int ret;
+	void *dest_memcpy_ptr;
+
+	ret = rocr_host_memory_ptr(dest, &dest_memcpy_ptr);
+	if (ret != FI_SUCCESS)
+		return ret;
+
+	ret = rocr_memcpy(dest_memcpy_ptr, src, size);
+
+	return ret;
+}
+
+int rocr_copy_to_dev(uint64_t device, void *dest, const void *src,
+		     size_t size)
+{
+	int ret;
+	void *src_memcpy_ptr;
+
+	ret = rocr_host_memory_ptr((void *) src, &src_memcpy_ptr);
+	if (ret != FI_SUCCESS)
+		return ret;
+
+	ret = rocr_memcpy(dest, src_memcpy_ptr, size);
+
+	return ret;
+}
+
+bool rocr_is_addr_valid(const void *addr, uint64_t *device, uint64_t *flags)
 {
 	hsa_amd_pointer_info_t hsa_info = {
 		.size = sizeof(hsa_info),
 	};
+	hsa_device_type_t hsa_dev_type;
 	hsa_status_t hsa_ret;
 
 	hsa_ret = ofi_hsa_amd_pointer_info((void *)addr, &hsa_info, NULL, NULL,
 					   NULL);
 	if (hsa_ret == HSA_STATUS_SUCCESS) {
-		if (hsa_info.type == HSA_EXT_POINTER_TYPE_HSA)
-			return true;
+		hsa_ret = ofi_hsa_agent_get_info(hsa_info.agentOwner,
+						 HSA_AGENT_INFO_DEVICE,
+						 (void *) &hsa_dev_type);
+		if (hsa_ret == HSA_STATUS_SUCCESS) {
+			if (hsa_dev_type == HSA_DEVICE_TYPE_GPU) {
+				/* TODO: get device pointer/id */
+				if (flags)
+					*flags = FI_HMEM_DEVICE_ONLY;
+				return true;
+			}
+		} else {
+			FI_WARN(&core_prov, FI_LOG_CORE,
+				"Failed to perform hsa_agent_get_info: %s\n",
+				ofi_hsa_status_to_string(hsa_ret));
+		}
 	} else {
 		FI_WARN(&core_prov, FI_LOG_CORE,
 			"Failed to perform hsa_amd_pointer_info: %s\n",
@@ -238,6 +339,29 @@ static int rocr_hmem_dl_init(void)
 		goto err;
 	}
 
+	rocr_ops.hsa_amd_memory_lock = dlsym(rocr_handle,
+					     "hsa_amd_memory_lock");
+	if (!rocr_ops.hsa_amd_memory_lock) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find hsa_amd_memory_lock\n");
+		goto err;
+	}
+
+	rocr_ops.hsa_amd_memory_unlock = dlsym(rocr_handle,
+					       "hsa_amd_memory_unlock");
+	if (!rocr_ops.hsa_amd_memory_unlock) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find hsa_amd_memory_unlock\n");
+		goto err;
+	}
+
+	rocr_ops.hsa_agent_get_info = dlsym(rocr_handle, "hsa_agent_get_info");
+	if (!rocr_ops.hsa_agent_get_info) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find hsa_agent_get_info\n");
+		goto err;
+	}
+
 	return FI_SUCCESS;
 
 err:
@@ -308,9 +432,47 @@ int rocr_hmem_cleanup(void)
 	return FI_SUCCESS;
 }
 
+int rocr_host_register(void *ptr, size_t size)
+{
+	hsa_status_t hsa_ret;
+	void *tmp;
+
+	hsa_ret = ofi_hsa_amd_memory_lock(ptr, size, NULL, 0, &tmp);
+	if (hsa_ret == HSA_STATUS_SUCCESS)
+		return FI_SUCCESS;
+
+	FI_WARN(&core_prov, FI_LOG_CORE,
+		"Failed to perform hsa_amd_memory_lock: %s\n",
+		ofi_hsa_status_to_string(hsa_ret));
+
+	return -FI_EIO;
+}
+
+int rocr_host_unregister(void *ptr)
+{
+	hsa_status_t hsa_ret;
+
+	hsa_ret = ofi_hsa_amd_memory_unlock(ptr);
+	if (hsa_ret == HSA_STATUS_SUCCESS)
+		return FI_SUCCESS;
+
+	FI_WARN(&core_prov, FI_LOG_CORE,
+		"Failed to perform hsa_amd_memory_unlock: %s\n",
+		ofi_hsa_status_to_string(hsa_ret));
+
+	return -FI_EIO;
+}
+
 #else
 
-int rocr_memcpy(uint64_t device, void *dest, const void *src, size_t size)
+int rocr_copy_from_dev(uint64_t device, void *dest, const void *src,
+		       size_t size)
+{
+	return -FI_ENOSYS;
+}
+
+int rocr_copy_to_dev(uint64_t device, void *dest, const void *src,
+		     size_t size)
 {
 	return -FI_ENOSYS;
 }
@@ -325,9 +487,19 @@ int rocr_hmem_cleanup(void)
 	return -FI_ENOSYS;
 }
 
-bool rocr_is_addr_valid(const void *addr)
+bool rocr_is_addr_valid(const void *addr, uint64_t *device, uint64_t *flags)
 {
 	return false;
 }
 
+int rocr_host_register(void *ptr, size_t size)
+{
+	return -FI_ENOSYS;
+}
+
+int rocr_host_unregister(void *ptr)
+{
+	return -FI_ENOSYS;
+}
+
 #endif /* HAVE_ROCR */
diff --git a/deps/libfabric/src/hmem_ze.c b/deps/libfabric/src/hmem_ze.c
index 892699a68b3ce023ba103cfb50845be8c9ea59bb..504f018ca377bb390faba5574b84b444157e59bb 100644
--- a/deps/libfabric/src/hmem_ze.c
+++ b/deps/libfabric/src/hmem_ze.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Intel Corporation. All rights reserved.
+ * Copyright (c) 2020-2021 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -37,18 +37,21 @@
 #include "ofi_hmem.h"
 #include "ofi.h"
 
-#ifdef HAVE_LIBZE
+#if HAVE_LIBZE
 
+#include <dirent.h>
 #include <level_zero/ze_api.h>
 
-#define ZE_MAX_DEVICES 4
-
 static ze_context_handle_t context;
 static ze_device_handle_t devices[ZE_MAX_DEVICES];
 static ze_command_queue_handle_t cmd_queue[ZE_MAX_DEVICES];
 static int num_devices = 0;
+static int ordinals[ZE_MAX_DEVICES];
+static int dev_fds[ZE_MAX_DEVICES];
+static ze_device_uuid_t dev_uuids[ZE_MAX_DEVICES];
+static bool p2p_enabled = false;
 
-static const ze_command_queue_desc_t cq_desc = {
+static ze_command_queue_desc_t cq_desc = {
 	.stype		= ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
 	.pNext		= NULL,
 	.ordinal	= 0,
@@ -58,100 +61,710 @@ static const ze_command_queue_desc_t cq_desc = {
 	.priority	= ZE_COMMAND_QUEUE_PRIORITY_NORMAL,
 };
 
-static const ze_command_list_desc_t cl_desc = {
+static ze_command_list_desc_t cl_desc = {
 	.stype				= ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC,
 	.pNext				= NULL,
 	.commandQueueGroupOrdinal	= 0,
 	.flags				= 0,
 };
 
+struct libze_ops {
+	ze_result_t (*zeInit)(ze_init_flags_t flags);
+	ze_result_t (*zeDriverGet)(uint32_t *pCount,
+				   ze_driver_handle_t *phDrivers);
+	ze_result_t (*zeDeviceGet)(ze_driver_handle_t hDriver,
+				   uint32_t *pCount,
+				   ze_device_handle_t *phDevices);
+	ze_result_t (*zeDeviceGetCommandQueueGroupProperties)(ze_device_handle_t hDevice,
+			uint32_t *pCount,
+			ze_command_queue_group_properties_t *pCommandQueueGroupProperties);
+	ze_result_t (*zeDeviceCanAccessPeer)(ze_device_handle_t hDevice,
+					     ze_device_handle_t hPeerDevice,
+					     ze_bool_t *value);
+	ze_result_t (*zeContextCreate)(ze_driver_handle_t hDriver,
+				       const ze_context_desc_t *desc,
+				       ze_context_handle_t *phContext);
+	ze_result_t (*zeContextDestroy)(ze_context_handle_t hContext);
+	ze_result_t (*zeCommandQueueCreate)(ze_context_handle_t hContext,
+					    ze_device_handle_t hDevice,
+					    const ze_command_queue_desc_t *desc,
+					    ze_command_queue_handle_t *phCommandQueue);
+	ze_result_t (*zeCommandQueueDestroy)(ze_command_queue_handle_t hCommandQueue);
+	ze_result_t (*zeCommandQueueExecuteCommandLists)(
+					ze_command_queue_handle_t hCommandQueue,
+					uint32_t numCommandLists,
+					ze_command_list_handle_t *phCommandLists,
+					ze_fence_handle_t hFence);
+	ze_result_t (*zeCommandListCreate)(ze_context_handle_t hContext,
+					   ze_device_handle_t hDevice,
+					   const ze_command_list_desc_t *desc,
+					   ze_command_list_handle_t *phCommandList);
+	ze_result_t (*zeCommandListDestroy)(ze_command_list_handle_t hCommandList);
+	ze_result_t (*zeCommandListClose)(ze_command_list_handle_t hCommandList);
+	ze_result_t (*zeCommandListAppendMemoryCopy)(
+				ze_command_list_handle_t hCommandList,
+				void *dstptr, const void *srcptr, size_t size,
+				ze_event_handle_t hSignalEvent,
+				uint32_t numWaitEvents,
+				ze_event_handle_t *phWaitEvents);
+	ze_result_t (*zeMemGetAllocProperties)(
+				ze_context_handle_t hContext, const void *ptr,
+				ze_memory_allocation_properties_t *pMemAllocProperties,
+				ze_device_handle_t *phDevice);
+	ze_result_t (*zeMemGetAddressRange)(
+				ze_context_handle_t hContext, const void *ptr,
+				void **pBase, size_t *pSize);
+	ze_result_t (*zeMemGetIpcHandle)(ze_context_handle_t hContext,
+					 const void *ptr,
+					 ze_ipc_mem_handle_t *pIpcHandle);
+	ze_result_t (*zeMemOpenIpcHandle)(ze_context_handle_t hContext,
+					  ze_device_handle_t hDevice,
+					  ze_ipc_mem_handle_t handle,
+					  ze_ipc_memory_flags_t flags,
+					  void **pptr);
+	ze_result_t (*zeMemCloseIpcHandle)(ze_context_handle_t hContext,
+					   const void *ptr);
+	ze_result_t (*zeDeviceGetProperties)(ze_device_handle_t hDevice,
+					     ze_device_properties_t *pDeviceProperties);
+};
+
+#ifdef ENABLE_ZE_DLOPEN
+
+#include <dlfcn.h>
+
+static void *libze_handle;
+static struct libze_ops libze_ops;
+
+#else
+
+static struct libze_ops libze_ops = {
+	.zeInit = zeInit,
+	.zeDriverGet = zeDriverGet,
+	.zeDeviceGet = zeDeviceGet,
+	.zeDeviceGetCommandQueueGroupProperties = zeDeviceGetCommandQueueGroupProperties,
+	.zeDeviceCanAccessPeer = zeDeviceCanAccessPeer,
+	.zeContextCreate = zeContextCreate,
+	.zeContextDestroy = zeContextDestroy,
+	.zeCommandQueueCreate = zeCommandQueueCreate,
+	.zeCommandQueueDestroy = zeCommandQueueDestroy,
+	.zeCommandQueueExecuteCommandLists = zeCommandQueueExecuteCommandLists,
+	.zeCommandListCreate = zeCommandListCreate,
+	.zeCommandListDestroy = zeCommandListDestroy,
+	.zeCommandListClose = zeCommandListClose,
+	.zeCommandListAppendMemoryCopy = zeCommandListAppendMemoryCopy,
+	.zeMemGetAllocProperties = zeMemGetAllocProperties,
+	.zeMemGetAddressRange = zeMemGetAddressRange,
+	.zeMemGetIpcHandle = zeMemGetIpcHandle,
+	.zeMemOpenIpcHandle = zeMemOpenIpcHandle,
+	.zeMemCloseIpcHandle = zeMemCloseIpcHandle,
+	.zeDeviceGetProperties = zeDeviceGetProperties,
+};
+
+#endif /* ENABLE_ZE_DLOPEN */
+
+ze_result_t ofi_zeInit(ze_init_flags_t flags)
+{
+	return (*libze_ops.zeInit)(flags);
+}
+
+ze_result_t ofi_zeDriverGet(uint32_t *pCount, ze_driver_handle_t *phDrivers)
+{
+	return (*libze_ops.zeDriverGet)(pCount, phDrivers);
+}
+
+ze_result_t ofi_zeDeviceGet(ze_driver_handle_t hDriver, uint32_t *pCount,
+			    ze_device_handle_t *phDevices)
+{
+	return (*libze_ops.zeDeviceGet)(hDriver, pCount, phDevices);
+}
+
+ze_result_t ofi_zeDeviceGetCommandQueueGroupProperties(ze_device_handle_t hDevice,
+	       uint32_t *pCount,
+	       ze_command_queue_group_properties_t *pCommandQueueGroupProperties)
+{
+	return (*libze_ops.zeDeviceGetCommandQueueGroupProperties)(hDevice,
+					pCount, pCommandQueueGroupProperties);
+}
+
+ze_result_t ofi_zeDeviceCanAccessPeer(ze_device_handle_t hDevice,
+				      ze_device_handle_t hPeerDevice,
+				      ze_bool_t *value)
+{
+	return (*libze_ops.zeDeviceCanAccessPeer)(hDevice, hPeerDevice, value);
+}
+
+ze_result_t ofi_zeContextCreate(ze_driver_handle_t hDriver,
+				const ze_context_desc_t *desc,
+				ze_context_handle_t *phContext)
+{
+	return (*libze_ops.zeContextCreate)(hDriver, desc, phContext);
+}
+
+ze_result_t ofi_zeContextDestroy(ze_context_handle_t hContext)
+{
+	return (*libze_ops.zeContextDestroy)(hContext);
+}
+
+ze_result_t ofi_zeCommandQueueCreate(ze_context_handle_t hContext,
+				     ze_device_handle_t hDevice,
+				     const ze_command_queue_desc_t *desc,
+				     ze_command_queue_handle_t *phCommandQueue)
+{
+	return (*libze_ops.zeCommandQueueCreate)(hContext, hDevice, desc,
+						 phCommandQueue);
+}
+
+ze_result_t ofi_zeCommandQueueDestroy(ze_command_queue_handle_t hCommandQueue)
+{
+	return (*libze_ops.zeCommandQueueDestroy)(hCommandQueue);
+}
+
+ze_result_t ofi_zeCommandQueueExecuteCommandLists(
+				ze_command_queue_handle_t hCommandQueue,
+				uint32_t numCommandLists,
+				ze_command_list_handle_t *phCommandLists,
+				ze_fence_handle_t hFence)
+{
+	return (*libze_ops.zeCommandQueueExecuteCommandLists)(
+				hCommandQueue, numCommandLists, phCommandLists,
+				hFence);
+}
+
+ze_result_t ofi_zeCommandListCreate(ze_context_handle_t hContext,
+				    ze_device_handle_t hDevice,
+				    const ze_command_list_desc_t *desc,
+				    ze_command_list_handle_t *phCommandList)
+{
+	return (*libze_ops.zeCommandListCreate)(hContext, hDevice, desc,
+						phCommandList);
+}
+
+ze_result_t ofi_zeCommandListDestroy(ze_command_list_handle_t hCommandList)
+{
+	return (*libze_ops.zeCommandListDestroy)(hCommandList);
+}
+
+ze_result_t ofi_zeCommandListClose(ze_command_list_handle_t hCommandList)
+{
+	return (*libze_ops.zeCommandListClose)(hCommandList);
+}
+
+ze_result_t ofi_zeCommandListAppendMemoryCopy(
+				ze_command_list_handle_t hCommandList,
+				void *dstptr, const void *srcptr, size_t size,
+				ze_event_handle_t hSignalEvent,
+				uint32_t numWaitEvents,
+				ze_event_handle_t *phWaitEvents)
+{
+	return (*libze_ops.zeCommandListAppendMemoryCopy)(
+				hCommandList, dstptr, srcptr, size, hSignalEvent,
+				numWaitEvents, phWaitEvents);
+}
+
+ze_result_t ofi_zeMemGetAllocProperties(ze_context_handle_t hContext,
+					const void *ptr,
+					ze_memory_allocation_properties_t
+						*pMemAllocProperties,
+					ze_device_handle_t *phDevice)
+{
+	return (*libze_ops.zeMemGetAllocProperties)(
+					hContext, ptr, pMemAllocProperties,
+					phDevice);
+}
+
+ze_result_t ofi_zeMemGetAddressRange(ze_context_handle_t hContext,
+				     const void *ptr, void **pBase,
+				     size_t *pSize)
+{
+	return (*libze_ops.zeMemGetAddressRange)(hContext, ptr, pBase, pSize);
+}
+
+ze_result_t ofi_zeMemGetIpcHandle(ze_context_handle_t hContext, const void *ptr,
+				  ze_ipc_mem_handle_t *pIpcHandle)
+{
+	return (*libze_ops.zeMemGetIpcHandle)(hContext, ptr, pIpcHandle);
+}
+
+ze_result_t ofi_zeMemOpenIpcHandle(ze_context_handle_t hContext,
+				   ze_device_handle_t hDevice,
+				   ze_ipc_mem_handle_t handle,
+				   ze_ipc_memory_flags_t flags,
+				   void **pptr)
+{
+	return (*libze_ops.zeMemOpenIpcHandle)(hContext, hDevice, handle, flags,
+					       pptr);
+}
+
+ze_result_t ofi_zeMemCloseIpcHandle(ze_context_handle_t hContext,
+				    const void *ptr)
+{
+	return (*libze_ops.zeMemCloseIpcHandle)(hContext, ptr);
+}
+
+ze_result_t ofi_zeDeviceGetProperties(ze_device_handle_t hDevice,
+				      ze_device_properties_t *pDeviceProperties)
+{
+	return (*libze_ops.zeDeviceGetProperties)(hDevice, pDeviceProperties);
+}
+
+#if HAVE_DRM
+#include <drm/i915_drm.h>
+#include <sys/ioctl.h>
+
+static int ze_hmem_init_fds(void)
+{
+	const char *dev_dir = "/dev/dri/by-path";
+	const char *suffix = "-render";
+	DIR *dir;
+	struct dirent *ent = NULL;
+	char dev_name[128];
+	int i = 0;
+
+	dir = opendir(dev_dir);
+	if (dir == NULL)
+		return -FI_EIO;
+
+	/* Open the render node of every device, in directory order */
+	while ((ent = readdir(dir)) != NULL && i < ZE_MAX_DEVICES) {
+		if (ent->d_name[0] == '.' ||
+		    strstr(ent->d_name, suffix) == NULL)
+			continue;
+
+		snprintf(dev_name, sizeof(dev_name), "%s/%s",
+			 dev_dir, ent->d_name);
+		dev_fds[i] = open(dev_name, O_RDWR);
+		if (dev_fds[i] == -1) {
+			FI_WARN(&core_prov, FI_LOG_CORE,
+				"Failed to open device %d\n", i);
+			(void) closedir(dir);
+			return -FI_EIO;
+		}
+		i++;
+	}
+	(void) closedir(dir);
+	return FI_SUCCESS;
+}
+
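+/*
+ * Convert the ZE IPC handle for dev_buf (a dmabuf fd on Linux) into a DRM
+ * GEM handle via the PRIME ioctl so that it can be exchanged with a peer.
+ */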
+int ze_hmem_get_shared_handle(int dev_fd, void *dev_buf, int *ze_fd,
+			      void **handle)
+{
+	struct drm_prime_handle open_fd = {0, 0, 0};
+	ze_ipc_mem_handle_t ze_handle;
+	int ret;
+
+	ret = ze_hmem_get_handle(dev_buf, (void **) &ze_handle);
+	if (ret)
+		return ret;
+
+	memcpy(ze_fd, &ze_handle, sizeof(*ze_fd));
+	memcpy(&open_fd.fd, &ze_handle, sizeof(open_fd.fd));
+	ret = ioctl(dev_fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &open_fd);
+	if (ret) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"ioctl call failed on get, err %d\n", errno);
+		return -FI_EINVAL;
+	}
+
+	*(int *) handle = open_fd.handle;
+	return FI_SUCCESS;
+}
+
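+/*
+ * Inverse of ze_hmem_get_shared_handle(): re-export the peer's GEM handle
+ * as a dmabuf fd via PRIME, then open it as a ZE IPC handle.
+ */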
+int ze_hmem_open_shared_handle(int dev_fd, void **handle, int *ze_fd,
+			       uint64_t device, void **ipc_ptr)
+{
+	struct drm_prime_handle open_fd = {0, 0, 0};
+	ze_ipc_mem_handle_t ze_handle;
+	int ret;
+
+	open_fd.flags = DRM_CLOEXEC | DRM_RDWR;
+	open_fd.handle = *(int *) handle;
+
+	ret = ioctl(dev_fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &open_fd);
+	if (ret) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"ioctl call failed on open, err %d\n", errno);
+		return -FI_EINVAL;
+	}
+
+	*ze_fd = open_fd.fd;
+	memset(&ze_handle, 0, sizeof(ze_handle));
+	memcpy(&ze_handle, &open_fd.fd, sizeof(open_fd.fd));
+	return ze_hmem_open_handle((void **) &ze_handle, device, ipc_ptr);
+}
+
+bool ze_hmem_p2p_enabled(void)
+{
+	return !ofi_hmem_p2p_disabled() && p2p_enabled;
+}
+
+#else
+
+static int ze_hmem_init_fds(void)
+{
+	return FI_SUCCESS;
+}
+
+int ze_hmem_get_shared_handle(int dev_fd, void *dev_buf, int *ze_fd,
+			      void **handle)
+{
+	return -FI_ENOSYS;
+}
+int ze_hmem_open_shared_handle(int dev_fd, void **handle, int *ze_fd,
+			       uint64_t device, void **ipc_ptr)
+{
+	return -FI_ENOSYS;
+}
+
+bool ze_hmem_p2p_enabled(void)
+{
+	return false;
+}
+
+#endif //HAVE_DRM
+
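+/*
+ * Resolve the ops table. With ENABLE_ZE_DLOPEN every symbol must be found
+ * in libze_loader.so; if anything is missing, ZE support is reported as
+ * unavailable (-FI_ENODATA).
+ */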
+static int ze_hmem_dl_init(void)
+{
+#ifdef ENABLE_ZE_DLOPEN
+	libze_handle = dlopen("libze_loader.so", RTLD_NOW);
+	if (!libze_handle) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to dlopen libze_loader.so\n");
+		goto err_out;
+	}
+
+	libze_ops.zeInit = dlsym(libze_handle, "zeInit");
+	if (!libze_ops.zeInit) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find zeInit\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeDriverGet = dlsym(libze_handle, "zeDriverGet");
+	if (!libze_ops.zeDriverGet) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find zeDriverGet\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeDeviceGet = dlsym(libze_handle, "zeDeviceGet");
+	if (!libze_ops.zeDeviceGet) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find zeDeviceGet\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeDeviceGetCommandQueueGroupProperties = dlsym(libze_handle,
+				"zeDeviceGetCommandQueueGroupProperties");
+	if (!libze_ops.zeDeviceGetCommandQueueGroupProperties) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find zeDeviceGetCommandQueueGroupProperties\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeDeviceCanAccessPeer = dlsym(libze_handle, "zeDeviceCanAccessPeer");
+	if (!libze_ops.zeDeviceCanAccessPeer) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find zeDeviceCanAccessPeer\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeContextCreate = dlsym(libze_handle, "zeContextCreate");
+	if (!libze_ops.zeContextCreate) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find zeContextCreate\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeContextDestroy = dlsym(libze_handle, "zeContextDestroy");
+	if (!libze_ops.zeContextDestroy) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find zeContextDestroy\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeCommandQueueCreate = dlsym(libze_handle, "zeCommandQueueCreate");
+	if (!libze_ops.zeCommandQueueCreate) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find zeCommandQueueCreate\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeCommandQueueDestroy = dlsym(libze_handle, "zeCommandQueueDestroy");
+	if (!libze_ops.zeCommandQueueDestroy) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find zeCommandQueueDestroy\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeCommandQueueExecuteCommandLists = dlsym(libze_handle, "zeCommandQueueExecuteCommandLists");
+	if (!libze_ops.zeCommandQueueExecuteCommandLists) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find zeCommandQueueExecuteCommandLists\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeCommandListCreate = dlsym(libze_handle, "zeCommandListCreate");
+	if (!libze_ops.zeCommandListCreate) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find zeCommandListCreate\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeCommandListDestroy = dlsym(libze_handle, "zeCommandListDestroy");
+	if (!libze_ops.zeCommandListDestroy) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find zeCommandListDestroy\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeCommandListClose = dlsym(libze_handle, "zeCommandListClose");
+	if (!libze_ops.zeCommandListClose) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find zeCommandListClose\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeCommandListAppendMemoryCopy = dlsym(libze_handle, "zeCommandListAppendMemoryCopy");
+	if (!libze_ops.zeCommandListAppendMemoryCopy) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find zeCommandListAppendMemoryCopy\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeMemGetAllocProperties = dlsym(libze_handle, "zeMemGetAllocProperties");
+	if (!libze_ops.zeMemGetAllocProperties) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find zeMemGetAllocProperties\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeMemGetAddressRange = dlsym(libze_handle, "zeMemGetAddressRange");
+	if (!libze_ops.zeMemGetAddressRange) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find zeMemGetAddressRange\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeMemGetIpcHandle = dlsym(libze_handle, "zeMemGetIpcHandle");
+	if (!libze_ops.zeMemGetIpcHandle) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find zeMemGetIpcHandle\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeMemOpenIpcHandle = dlsym(libze_handle, "zeMemOpenIpcHandle");
+	if (!libze_ops.zeMemOpenIpcHandle) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find zeMemOpenIpcHandle\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeMemCloseIpcHandle = dlsym(libze_handle, "zeMemCloseIpcHandle");
+	if (!libze_ops.zeMemCloseIpcHandle) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find zeMemCloseIpcHandle\n");
+		goto err_dlclose;
+	}
+
+	libze_ops.zeDeviceGetProperties = dlsym(libze_handle, "zeDeviceGetProperties");
+	if (!libze_ops.zeDeviceGetProperties) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find zeDeviceGetProperties\n");
+		goto err_dlclose;
+	}
+
+	return FI_SUCCESS;
+
+err_dlclose:
+	dlclose(libze_handle);
+
+err_out:
+	return -FI_ENODATA;
+
+#else
+	return FI_SUCCESS;
+#endif /* ENABLE_ZE_DLOPEN */
+}
+
+static void ze_hmem_dl_cleanup(void)
+{
+#ifdef ENABLE_ZE_DLOPEN
+	dlclose(libze_handle);
+#endif
+}
+
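+/*
+ * Prefer a command queue group that supports copy but not compute, i.e. a
+ * dedicated copy engine; fall back to ordinal 0 if none exists.
+ */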
+static int ze_hmem_find_copy_only_engine(int device_num, int *ordinal)
+{
+	ze_result_t ze_ret;
+	uint32_t cq_grp_count = 0;
+	ze_command_queue_group_properties_t *cq_grp_props = NULL;
+	int i = 0;
+
+	ze_ret = ofi_zeDeviceGetCommandQueueGroupProperties(devices[device_num],
+							    &cq_grp_count, NULL);
+	if (ze_ret)
+		goto out;
+
+	cq_grp_props = calloc(cq_grp_count, sizeof(*cq_grp_props));
+	if (!cq_grp_props) {
+		ze_ret = ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+		goto out;
+	}
+
+	ze_ret = ofi_zeDeviceGetCommandQueueGroupProperties(devices[device_num],
+							    &cq_grp_count,
+							    cq_grp_props);
+	if (ze_ret)
+		goto out;
+
+	for (i = 0; i < cq_grp_count; i++) {
+		if (cq_grp_props[i].flags &
+		    ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY &&
+		    !(cq_grp_props[i].flags &
+		      ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)) {
+			break;
+		}
+	}
+
+out:
+	free(cq_grp_props);
+	*ordinal = i == cq_grp_count ? 0 : i;
+	return ze_ret;
+}
+
+/*
+ * Some L0 calls may segfault when called from a "destructor" if any
+ * HMEM-capable DL provider is enabled, due to premature unloading of the
+ * GPU-specific L0 library. Until a permanent fix is available, don't make
+ * L0 calls inside a "destructor".
+ */
+static int ze_hmem_cleanup_internal(int fini_workaround)
+{
+	int i, ret = FI_SUCCESS;
+
+	for (i = 0; i < num_devices; i++) {
+		if (!fini_workaround) {
+			if (cmd_queue[i] &&
+			    ofi_zeCommandQueueDestroy(cmd_queue[i])) {
+				FI_WARN(&core_prov, FI_LOG_CORE,
+					"Failed to destroy ZE cmd_queue\n");
+				ret = -FI_EINVAL;
+			}
+		}
+		if (dev_fds[i] != -1) {
+			close(dev_fds[i]);
+			dev_fds[i] = -1;
+		}
+	}
+
+	if (!fini_workaround) {
+		if (ofi_zeContextDestroy(context))
+			ret = -FI_EINVAL;
+	}
+
+	ze_hmem_dl_cleanup();
+	return ret;
+}
+
+int ze_hmem_cleanup(void)
+{
+	return ze_hmem_cleanup_internal(1);
+}
+
 int ze_hmem_init(void)
 {
 	ze_driver_handle_t driver;
 	ze_context_desc_t context_desc = {0};
+	ze_device_properties_t dev_prop = {0};
 	ze_result_t ze_ret;
-	uint32_t count;
+	ze_bool_t access;
+	uint32_t count, i;
+	bool p2p = true;
+	int ret;
+
+	ret = ze_hmem_dl_init();
+	if (ret)
+		return ret;
 
-	ze_ret = zeInit(ZE_INIT_FLAG_GPU_ONLY);
+	ze_ret = ofi_zeInit(ZE_INIT_FLAG_GPU_ONLY);
 	if (ze_ret)
 		return -FI_EIO;
 
 	count = 1;
-	ze_ret = zeDriverGet(&count, &driver);
+	ze_ret = ofi_zeDriverGet(&count, &driver);
 	if (ze_ret)
 		return -FI_EIO;
 
-	ze_ret = zeContextCreate(driver, &context_desc, &context);
+	ze_ret = ofi_zeContextCreate(driver, &context_desc, &context);
 	if (ze_ret)
 		return -FI_EIO;
 
+	for (i = 0; i < ZE_MAX_DEVICES; dev_fds[i++] = -1)
+		;
+
 	count = 0;
-	ze_ret = zeDeviceGet(driver, &count, NULL);
+	ze_ret = ofi_zeDeviceGet(driver, &count, NULL);
 	if (ze_ret || count > ZE_MAX_DEVICES)
 		goto err;
 
-	ze_ret = zeDeviceGet(driver, &count, devices);
+	ze_ret = ofi_zeDeviceGet(driver, &count, devices);
 	if (ze_ret)
 		goto err;
 
+	ret = ze_hmem_init_fds();
+	if (ret)
+		goto err;
+
 	for (num_devices = 0; num_devices < count; num_devices++) {
-		ze_ret = zeCommandQueueCreate(context, devices[num_devices], &cq_desc,
-					      &cmd_queue[num_devices]);
+		ze_ret = ofi_zeDeviceGetProperties(devices[num_devices],
+						   &dev_prop);
+		if (ze_ret)
+			goto err;
+
+		memcpy(&dev_uuids[num_devices], &dev_prop.uuid,
+		       sizeof(*dev_uuids));
+
+		ze_ret = ze_hmem_find_copy_only_engine(num_devices,
+						       &ordinals[num_devices]);
 		if (ze_ret)
 			goto err;
+
+		cq_desc.ordinal = ordinals[num_devices];
+		ze_ret = ofi_zeCommandQueueCreate(context,
+						  devices[num_devices],
+						  &cq_desc,
+						  &cmd_queue[num_devices]);
+		if (ze_ret)
+			goto err;
+
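+		/* P2P stays enabled only if every device pair is mutually accessible */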
+		for (i = 0; i < count; i++) {
+			if (ofi_zeDeviceCanAccessPeer(devices[num_devices],
+					devices[i], &access) || !access)
+				p2p = false;
+		}
 	}
 
+	p2p_enabled = p2p;
 	return FI_SUCCESS;
 
 err:
-	(void) ze_hmem_cleanup();
+	(void) ze_hmem_cleanup_internal(0);
 	FI_WARN(&core_prov, FI_LOG_CORE,
 		"Failed to initialize ZE driver resources\n");
 
 	return -FI_EIO;
 }
 
-int ze_hmem_cleanup(void)
-{
-	int i, ret = FI_SUCCESS;
-
-	for (i = 0; i < num_devices; i++) {
-		if (cmd_queue[i] && zeCommandQueueDestroy(cmd_queue[i])) {
-			FI_WARN(&core_prov, FI_LOG_CORE,
-				"Failed to destroy ZE cmd_queue\n");
-			ret = -FI_EINVAL;
-		}
-	}
-
-	if (zeContextDestroy(context))
-		return -FI_EINVAL;
-
-	return ret;
-}
-
 int ze_hmem_copy(uint64_t device, void *dst, const void *src, size_t size)
 {
 	ze_command_list_handle_t cmd_list;
 	ze_result_t ze_ret;
 	int dev_id = (int) device;
 
-	ze_ret = zeCommandListCreate(context, devices[dev_id], &cl_desc, &cmd_list);
+	/* Host memory allocated via ZE */
+	if (dev_id < 0) {
+		memcpy(dst, src, size);
+		return 0;
+	}
+
+	cl_desc.commandQueueGroupOrdinal = ordinals[dev_id];
+	ze_ret = ofi_zeCommandListCreate(context, devices[dev_id], &cl_desc,
+					 &cmd_list);
 	if (ze_ret)
 		goto err;
 
-	ze_ret = zeCommandListAppendMemoryCopy(cmd_list, dst, src, size, NULL, 0, NULL);
+	ze_ret = ofi_zeCommandListAppendMemoryCopy(cmd_list, dst, src, size,
+						   NULL, 0, NULL);
 	if (ze_ret)
 		goto free;
 
-	ze_ret = zeCommandListClose(cmd_list);
+	ze_ret = ofi_zeCommandListClose(cmd_list);
 	if (ze_ret)
 		goto free;
 
-	ze_ret = zeCommandQueueExecuteCommandLists(cmd_queue[dev_id], 1,
-						   &cmd_list, NULL);
+	ze_ret = ofi_zeCommandQueueExecuteCommandLists(cmd_queue[dev_id], 1,
+						       &cmd_list, NULL);
 
 free:
-	if (!zeCommandListDestroy(cmd_list) && !ze_ret)
+	if (!ofi_zeCommandListDestroy(cmd_list) && !ze_ret)
 		return FI_SUCCESS;
 err:
 	FI_WARN(&core_prov, FI_LOG_CORE,
@@ -160,27 +773,48 @@ err:
 	return -FI_EIO;
 }
 
-bool ze_is_addr_valid(const void *addr)
+bool ze_is_addr_valid(const void *addr, uint64_t *device, uint64_t *flags)
 {
 	ze_result_t ze_ret;
-	ze_memory_allocation_properties_t mem_prop;
+	ze_memory_allocation_properties_t mem_props = {0};
+	ze_device_properties_t dev_prop = {0};
+	ze_device_handle_t device_ptr;
 	int i;
 
-	for (i = 0; i < num_devices; i++) {
-		ze_ret = zeMemGetAllocProperties(context, addr, &mem_prop,
-						 &devices[i]);
-		if (!ze_ret && mem_prop.type == ZE_MEMORY_TYPE_DEVICE)
+	ze_ret = ofi_zeMemGetAllocProperties(context, addr, &mem_props,
+					     &device_ptr);
+	if (ze_ret)
+		return false;
+
+	if (flags)
+		*flags = mem_props.type == ZE_MEMORY_TYPE_DEVICE ?
+			 FI_HMEM_DEVICE_ONLY : 0;
+
+	if (!device)
+		return true;
+
+	ze_ret = ofi_zeDeviceGetProperties(device_ptr, &dev_prop);
+	if (ze_ret)
+		return false;
+
+	for (i = 0, *device = 0; i < num_devices; i++) {
+		if (!memcmp(&dev_prop.uuid, &dev_uuids[i],
+			    sizeof(*dev_uuids))) {
+			*device = i;
 			return true;
+		}
 	}
-	return false;
+
+	/* Unreachable: the allocation's device UUID matched none of ours */
+	assert(0);
+	return true;
 }
 
 int ze_hmem_get_handle(void *dev_buf, void **handle)
 {
 	ze_result_t ze_ret;
 
-	ze_ret = zeMemGetIpcHandle(context, dev_buf,
-				   (ze_ipc_mem_handle_t *) handle);
+	ze_ret = ofi_zeMemGetIpcHandle(context, dev_buf,
+				       (ze_ipc_mem_handle_t *) handle);
 	if (ze_ret) {
 		FI_WARN(&core_prov, FI_LOG_CORE, "Unable to get handle\n");
 		return -FI_EINVAL;
@@ -192,10 +826,14 @@ int ze_hmem_get_handle(void *dev_buf, void **handle)
 int ze_hmem_open_handle(void **handle, uint64_t device, void **ipc_ptr)
 {
 	ze_result_t ze_ret;
+	int dev_id = (int) device;
+
+	/* only device memory is supported */
+	assert(dev_id >= 0);
 
-	ze_ret = zeMemOpenIpcHandle(context, devices[device],
-				    *((ze_ipc_mem_handle_t *) handle),
-				    0, ipc_ptr);
+	ze_ret = ofi_zeMemOpenIpcHandle(context, devices[dev_id],
+					*((ze_ipc_mem_handle_t *) handle),
+					0, ipc_ptr);
 	if (ze_ret) {
 		FI_WARN(&core_prov, FI_LOG_CORE,
 			"Unable to open memory handle\n");
@@ -209,7 +847,7 @@ int ze_hmem_close_handle(void *ipc_ptr)
 {
 	ze_result_t ze_ret;
 
-	ze_ret = zeMemCloseIpcHandle(context, ipc_ptr);
+	ze_ret = ofi_zeMemCloseIpcHandle(context, ipc_ptr);
 	if (ze_ret) {
 		FI_WARN(&core_prov, FI_LOG_CORE,
 			"Unable to close memory handle\n");
@@ -219,6 +857,44 @@ int ze_hmem_close_handle(void *ipc_ptr)
 	return FI_SUCCESS;
 }
 
+int ze_hmem_get_base_addr(const void *ptr, void **base, size_t *size)
+{
+	ze_result_t ze_ret;
+
+	ze_ret = ofi_zeMemGetAddressRange(context, ptr, base, size);
+	if (ze_ret) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Could not get base addr\n");
+		return -FI_EINVAL;
+	}
+	return FI_SUCCESS;
+}
+
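+/*
+ * Report the allocation id; it changes when a buffer is freed and
+ * reallocated, which lets callers detect stale cached handles.
+ */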
+int ze_hmem_get_id(const void *ptr, uint64_t *id)
+{
+	ze_result_t ze_ret;
+	ze_memory_allocation_properties_t mem_props;
+	ze_device_handle_t device;
+
+	mem_props.stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES;
+	mem_props.pNext = NULL;
+	ze_ret = ofi_zeMemGetAllocProperties(context, ptr, &mem_props, &device);
+	if (ze_ret || mem_props.type == ZE_MEMORY_TYPE_UNKNOWN) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Could not get memory id\n");
+		return -FI_EINVAL;
+	}
+
+	*id = mem_props.id;
+	return FI_SUCCESS;
+}
+
+int *ze_hmem_get_dev_fds(int *nfds)
+{
+	*nfds = num_devices;
+	return dev_fds;
+}
+
 #else
 
 int ze_hmem_init(void)
@@ -236,7 +912,7 @@ int ze_hmem_copy(uint64_t device, void *dst, const void *src, size_t size)
 	return -FI_ENOSYS;
 }
 
-bool ze_is_addr_valid(const void *addr)
+bool ze_is_addr_valid(const void *addr, uint64_t *device, uint64_t *flags)
 {
 	return false;
 }
@@ -251,9 +927,42 @@ int ze_hmem_open_handle(void **handle, uint64_t device, void **ipc_ptr)
 	return -FI_ENOSYS;
 }
 
+int ze_hmem_get_shared_handle(int dev_fd, void *dev_buf, int *ze_fd,
+			      void **handle)
+{
+	return -FI_ENOSYS;
+}
+
+int ze_hmem_open_shared_handle(int dev_fd, void **handle, int *ze_fd,
+			       uint64_t device, void **ipc_ptr)
+{
+	return -FI_ENOSYS;
+}
+
 int ze_hmem_close_handle(void *ipc_ptr)
 {
 	return -FI_ENOSYS;
 }
 
+bool ze_hmem_p2p_enabled(void)
+{
+	return false;
+}
+
+int ze_hmem_get_base_addr(const void *ptr, void **base, size_t *size)
+{
+	return -FI_ENOSYS;
+}
+
+int ze_hmem_get_id(const void *ptr, uint64_t *id)
+{
+	return -FI_ENOSYS;
+}
+
+int *ze_hmem_get_dev_fds(int *nfds)
+{
+	*nfds = 0;
+	return NULL;
+}
+
 #endif /* HAVE_LIBZE */
diff --git a/deps/libfabric/src/indexer.c b/deps/libfabric/src/indexer.c
index 51ced09c8e2885168c3d24ae89021f04954bcc31..0abd7fb5eea8b3a276f1a31ab6c3c2a4bc13c436 100644
--- a/deps/libfabric/src/indexer.c
+++ b/deps/libfabric/src/indexer.c
@@ -122,7 +122,7 @@ void *ofi_idx_remove_ordered(struct indexer *idx, int index)
 
 	entry = idx->array[ofi_idx_array_index(index)];
 	item = entry[entry_index].item;
-	entry[entry_index].item = NULL;	
+	entry[entry_index].item = NULL;
 	if (ofi_idx_free_list_empty(idx) || index < idx->free_list) {
 		entry[entry_index].next = idx->free_list;
 		idx->free_list = index;
@@ -204,16 +204,29 @@ void *ofi_idm_clear(struct index_map *idm, int index)
 	return item;
 }
 
-void ofi_idm_reset(struct index_map *idm)
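+/*
+ * Reset the map, invoking the optional callback on every remaining item
+ * so the caller can release per-item resources before the arrays are freed.
+ */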
+void ofi_idm_reset(struct index_map *idm, void (*callback)(void *item))
 {
-	int i;
+	void **entry;
+	void *item;
+	int a, i;
+
+	for (a = 0; a < OFI_IDX_ARRAY_SIZE; a++) {
+		if (!idm->array[a]) {
+			assert(idm->count[a] == 0);
+			continue;
+		}
 
-	for (i=0; i<OFI_IDX_ARRAY_SIZE; i++) {
-		if (idm->array[i]) {
-			free(idm->array[i]);
-			idm->array[i] = NULL;
-			idm->count[i] = 0;
+		for (i = 0; idm->count[a] && i < OFI_IDX_ARRAY_SIZE; i++) {
+			entry = idm->array[a];
+			item = entry[i];
+			if (item) {
+				if (callback)
+					callback(item);
+				idm->count[a]--;
+			}
 		}
+		free(idm->array[a]);
+		idm->array[a] = NULL;
 	}
 }
 
diff --git a/deps/libfabric/src/iov.c b/deps/libfabric/src/iov.c
index cc6b674ffdf68c45b7c03b131dd61fcd50503c33..7a9a5e61d4fb13985322590fbc4e2d7edc9b0ac6 100644
--- a/deps/libfabric/src/iov.c
+++ b/deps/libfabric/src/iov.c
@@ -129,7 +129,7 @@ int ofi_truncate_iov(struct iovec *iov, size_t *iov_count, size_t new_size)
 		}
 		new_size -= iov[i].iov_len;
 	}
-	return -FI_ETRUNC;
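+	/* An exact fit (new_size == 0) is not a truncation error */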
+	return new_size ? -FI_ETRUNC : FI_SUCCESS;
 }
 
 /* Copy 'len' bytes worth of src iovec to dst */
diff --git a/deps/libfabric/src/log.c b/deps/libfabric/src/log.c
index 868f035849b2c26e9f7228dfa9268b272c4d18f8..129b647dbdb470111f96cc6221bd46c4b5f152d5 100644
--- a/deps/libfabric/src/log.c
+++ b/deps/libfabric/src/log.c
@@ -78,6 +78,7 @@ enum {
 	 ((uint64_t) (1 << (subsys + FI_LOG_SUBSYS_OFFSET))) | \
 	 ((uint64_t) (1 << level)))
 
+static int log_interval = 2000;
 uint64_t log_mask;
 struct fi_filter prov_log_filter;
 
@@ -103,6 +104,11 @@ void fi_log_init(void)
 	int level, i;
 	char *levelstr = NULL, *provstr = NULL, *subsysstr = NULL;
 
+	fi_param_define(NULL, "log_interval", FI_PARAM_INT,
+			"Delay in ms between rate limited log messages "
+			"(default 2000)");
+	fi_param_get_int(NULL, "log_interval", &log_interval);
+
 	fi_param_define(NULL, "log_level", FI_PARAM_STRING,
 			"Specify logging level: warn, trace, info, debug (default: warn)");
 	fi_param_get_str(NULL, "log_level", &levelstr);
@@ -145,6 +151,24 @@ int DEFAULT_SYMVER_PRE(fi_log_enabled)(const struct fi_provider *prov,
 }
 DEFAULT_SYMVER(fi_log_enabled_, fi_log_enabled, FABRIC_1.0);
 
+__attribute__((visibility ("default"),EXTERNALLY_VISIBLE))
+int DEFAULT_SYMVER_PRE(fi_log_ready)(const struct fi_provider *prov,
+		enum fi_log_level level, enum fi_log_subsys subsys,
+		uint64_t *showtime)
+{
+	uint64_t cur;
+
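+	/*
+	 * Rate limit: emit at most one message per log_interval ms.
+	 * *showtime is per-call-site state owned by the caller.
+	 */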
+	if (fi_log_enabled(prov, level, subsys)) {
+		cur = ofi_gettime_ms();
+		if (cur >= *showtime) {
+			*showtime = cur + (uint64_t) log_interval;
+			return true;
+		}
+	}
+	return false;
+}
+CURRENT_SYMVER(fi_log_ready_, fi_log_ready);
+
 __attribute__((visibility ("default"),EXTERNALLY_VISIBLE))
 void DEFAULT_SYMVER_PRE(fi_log)(const struct fi_provider *prov, enum fi_log_level level,
 		enum fi_log_subsys subsys, const char *func, int line,
@@ -155,9 +179,9 @@ void DEFAULT_SYMVER_PRE(fi_log)(const struct fi_provider *prov, enum fi_log_leve
 
 	va_list vargs;
 
-	size = snprintf(buf, sizeof(buf), "%s:%d:%s:%s:%s():%d<%s> ", PACKAGE,
-			pid, prov->name, log_subsys[subsys], func, line,
-			log_levels[level]);
+	size = snprintf(buf, sizeof(buf), "%s:%d:%ld:%s:%s:%s():%d<%s> ",
+			PACKAGE, pid, (unsigned long) time(NULL), prov->name,
+			log_subsys[subsys], func, line, log_levels[level]);
 
 	va_start(vargs, fmt);
 	vsnprintf(buf + size, sizeof(buf) - size, fmt, vargs);
diff --git a/deps/libfabric/src/osx/osd.c b/deps/libfabric/src/osx/osd.c
new file mode 100644
index 0000000000000000000000000000000000000000..bcf671e97154aedf9e31fbee0964c1432657b22a
--- /dev/null
+++ b/deps/libfabric/src/osx/osd.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2020 by Argonne National Laboratory.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ofi.h"
+#include "ofi_osd.h"
+
+static ssize_t
+ofi_sendv_socket(SOCKET fd, const struct iovec *iovec, size_t iov_cnt, int flags)
+{
+	ssize_t size = 0;
+	ssize_t ret;
+	int i;
+
+	if (iov_cnt == 1) {
+		return ofi_send_socket(fd, iovec[0].iov_base,
+				       iovec[0].iov_len, flags);
+	}
+
+	for (i = 0; i < iov_cnt; i++) {
+		ret = ofi_send_socket(fd, iovec[i].iov_base,
+				      iovec[i].iov_len, flags);
+		if (ret >= 0) {
+			size += ret;
+			if (ret != iovec[i].iov_len)
+				return size;
+		} else {
+			return size ? size : ret;
+		}
+	}
+	return size;
+}
+
+static ssize_t
+ofi_recvv_socket(SOCKET fd, const struct iovec *iovec, size_t iov_cnt, int flags)
+{
+	ssize_t size = 0;
+	ssize_t ret;
+	int i;
+
+	if (iov_cnt == 1) {
+		return ofi_recv_socket(fd, iovec[0].iov_base,
+				       iovec[0].iov_len, flags);
+	}
+
+	for (i = 0; i < iov_cnt; i++) {
+		ret = ofi_recv_socket(fd, iovec[i].iov_base,
+				      iovec[i].iov_len, flags);
+		if (ret >= 0) {
+			size += ret;
+			if (ret != iovec[i].iov_len)
+				return size;
+		} else {
+			return size ? size : ret;
+		}
+	}
+	return size;
+}
+
+ssize_t ofi_writev_socket(SOCKET fd, const struct iovec *iovec, size_t iov_cnt)
+{
+	return ofi_sendv_socket(fd, iovec, iov_cnt, 0);
+}
+
+ssize_t ofi_readv_socket(SOCKET fd, const struct iovec *iovec, size_t iov_cnt)
+{
+	return ofi_recvv_socket(fd, iovec, iov_cnt, 0);
+}
+
+ssize_t ofi_sendmsg_tcp(SOCKET fd, const struct msghdr *msg, int flags)
+{
+	return ofi_sendv_socket(fd, msg->msg_iov, msg->msg_iovlen, flags);
+}
+
+ssize_t ofi_recvmsg_tcp(SOCKET fd, struct msghdr *msg, int flags)
+{
+	return ofi_recvv_socket(fd, msg->msg_iov, msg->msg_iovlen, flags);
+}
diff --git a/deps/libfabric/src/unix/osd.c b/deps/libfabric/src/unix/osd.c
index 41faee5932f72d25eb3e1e403f1e5acfd6a86e5e..e32b05ae3de38f353549a923dd46489b4412f4fe 100644
--- a/deps/libfabric/src/unix/osd.c
+++ b/deps/libfabric/src/unix/osd.c
@@ -91,6 +91,21 @@ int fi_fd_nonblock(int fd)
 	return 0;
 }
 
+int fi_fd_block(int fd)
+{
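+	/* Clear O_NONBLOCK while preserving the other file status flags */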
+	long flags = 0;
+
+	flags = fcntl(fd, F_GETFL);
+	if (flags < 0) {
+		return -errno;
+	}
+
+	if (fcntl(fd, F_SETFL, flags & ~O_NONBLOCK))
+		return -errno;
+
+	return 0;
+}
+
 int fi_wait_cond(pthread_cond_t *cond, pthread_mutex_t *mut, int timeout_ms)
 {
 	uint64_t t;
@@ -112,16 +127,17 @@ int ofi_shm_map(struct util_shm *shm, const char *name, size_t size,
 	int i, ret = FI_SUCCESS;
 	int flags = O_RDWR | (readonly ? 0 : O_CREAT);
 	struct stat mapstat;
+	int fname_size = 0;
 
 	*mapped = MAP_FAILED;
 	memset(shm, 0, sizeof(*shm));
 
-	fname = calloc(1, strlen(name) + 2); /* '/' + %s + trailing 0 */
+	fname_size = strlen(name) + 2; /* '/' + %s + trailing 0 */
+	fname = calloc(1, fname_size);
 	if (!fname)
 		return -FI_ENOMEM;
 
-	strcpy(fname, "/");
-	strcat(fname, name);
+	snprintf(fname, fname_size, "/%s", name);
 	shm->name = fname;
 
 	for (i = 0; i < strlen(fname); i++) {
@@ -280,3 +296,45 @@ int ofi_set_thread_affinity(const char *s)
 	return -FI_ENOSYS;
 #endif
 }
+
+
+void ofi_pollfds_do_add(struct ofi_pollfds *pfds,
+			struct ofi_pollfds_work_item *item)
+{
+	if (item->fd >= pfds->size) {
+		if (ofi_pollfds_grow(pfds, item->fd))
+			return;
+	}
+
+	pfds->fds[item->fd].fd = item->fd;
+	pfds->fds[item->fd].events = item->events;
+	pfds->fds[item->fd].revents = 0;
+	pfds->context[item->fd] = item->context;
+	if (item->fd >= pfds->nfds)
+		pfds->nfds = item->fd + 1;
+}
+
+int ofi_pollfds_do_mod(struct ofi_pollfds *pfds, int fd, uint32_t events,
+		       void *context)
+{
+	if ((fd < pfds->nfds) && (pfds->fds[fd].fd == fd)) {
+		pfds->fds[fd].events = events;
+		pfds->context[fd] = context;
+		return FI_SUCCESS;
+	}
+
+	return -FI_ENOENT;
+}
+
+void ofi_pollfds_do_del(struct ofi_pollfds *pfds,
+			struct ofi_pollfds_work_item *item)
+{
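+	/* Slots are indexed by fd: invalidate in place, then trim trailing holes */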
+	if (item->fd >= pfds->nfds)
+		return;
+
+	pfds->fds[item->fd].fd = INVALID_SOCKET;
+	pfds->fds[item->fd].events = 0;
+	pfds->fds[item->fd].revents = 0;
+	while (pfds->nfds && pfds->fds[pfds->nfds - 1].fd == INVALID_SOCKET)
+		pfds->nfds--;
+}
diff --git a/deps/libfabric/src/var.c b/deps/libfabric/src/var.c
index 6103db1afb9cb4f160cf489279c1274d8403b95d..6f4a9db788a2a4d2a9023980a9c369fa4939acb4 100644
--- a/deps/libfabric/src/var.c
+++ b/deps/libfabric/src/var.c
@@ -43,7 +43,6 @@
 #include "ofi_list.h"
 
 
-extern int ofi_init;
 extern void fi_ini(void);
 
 struct fi_param_entry {
@@ -87,8 +86,7 @@ int DEFAULT_SYMVER_PRE(fi_getparams)(struct fi_param **params, int *count)
 	int cnt, i;
 	char *tmp;
 
-	if (!ofi_init)
-		fi_ini();
+	fi_ini();
 
 	for (entry = param_list.next, cnt = 0; entry != &param_list;
 	     entry = entry->next)
@@ -258,6 +256,7 @@ int DEFAULT_SYMVER_PRE(fi_param_get)(struct fi_provider *provider,
 {
 	struct fi_param_entry *param;
 	char *str_value;
+	int parsed_boolean;
 	int ret = FI_SUCCESS;
 
 	if (!provider)
@@ -294,11 +293,17 @@ int DEFAULT_SYMVER_PRE(fi_param_get)(struct fi_provider *provider,
 			"read int var %s=%d\n", param_name, *(int *) value);
 		break;
 	case FI_PARAM_BOOL:
-		* ((int *) value) = fi_parse_bool(str_value);
+		parsed_boolean = fi_parse_bool(str_value);
+		if (parsed_boolean == -1) {
+			ret = -FI_EINVAL;
+			FI_WARN(provider, FI_LOG_CORE,
+					"failed to parse bool var %s=%s\n", param_name, str_value);
+			break;
+		}
+
+		* ((int *) value) = parsed_boolean;
 		FI_INFO(provider, FI_LOG_CORE,
 			"read bool var %s=%d\n", param_name, *(int *) value);
-		if (*(int *) value == -1)
-			ret = -FI_EINVAL;
 		break;
 	case FI_PARAM_SIZE_T:
 		* ((size_t *) value) = strtol(str_value, NULL, 0);
diff --git a/deps/libfabric/src/windows/osd.c b/deps/libfabric/src/windows/osd.c
index 7c0005d4ecc75e8e9b6b52a701a8ff58431f73fd..83ce12acbded8033fa70aa81c9856f99c3bf28aa 100644
--- a/deps/libfabric/src/windows/osd.c
+++ b/deps/libfabric/src/windows/osd.c
@@ -600,3 +600,55 @@ ssize_t ofi_recvmsg_udp(SOCKET fd, struct msghdr *msg, int flags)
 	ret = WSARecvMsg(fd, msg, &bytes, NULL, NULL);
 	return ret ? ret : bytes;
 }
+
+
+void ofi_pollfds_do_add(struct ofi_pollfds *pfds,
+			struct ofi_pollfds_work_item *item)
+{
+	if (pfds->nfds == pfds->size) {
+		if (ofi_pollfds_grow(pfds, pfds->size + 1))
+			return;
+	}
+
+	pfds->fds[pfds->nfds].fd = item->fd;
+	pfds->fds[pfds->nfds].events = item->events;
+	pfds->fds[pfds->nfds].revents = 0;
+	pfds->context[pfds->nfds] = item->context;
+	pfds->nfds++;
+}
+
+int ofi_pollfds_do_mod(struct ofi_pollfds *pfds, int fd, uint32_t events,
+		       void *context)
+{
+	int i;
+
+	/* 0 is signaling fd */
+	for (i = 1; i < pfds->nfds; i++) {
+		if (pfds->fds[i].fd == fd) {
+			pfds->fds[i].events = events;
+			pfds->context[i] = context;
+			return FI_SUCCESS;
+		}
+	}
+
+	return -FI_ENOENT;
+}
+
+void ofi_pollfds_do_del(struct ofi_pollfds *pfds,
+			struct ofi_pollfds_work_item *item)
+{
+	int i;
+
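+	/* Keep the array dense: the last entry moves into the vacated slot */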
+	for (i = 0; i < pfds->nfds; i++) {
+		if (pfds->fds[i].fd == item->fd) {
+			pfds->fds[i].fd = INVALID_SOCKET;
+
+			pfds->nfds--;
+			pfds->fds[i].fd = pfds->fds[pfds->nfds].fd;
+			pfds->fds[i].events = pfds->fds[pfds->nfds].events;
+			pfds->fds[i].revents = pfds->fds[pfds->nfds].revents;
+			pfds->context[i] = pfds->context[pfds->nfds];
+			break;
+		}
+	}
+}
diff --git a/deps/libfabric/strerror.vcxproj b/deps/libfabric/strerror.vcxproj
index ed0f356ca2d25ccb58545069c72967ababa408c5..506535c64b64e636b97f3794c381cd196994ea63 100644
--- a/deps/libfabric/strerror.vcxproj
+++ b/deps/libfabric/strerror.vcxproj
@@ -13,6 +13,10 @@
       <Configuration>Debug-v140</Configuration>
       <Platform>x64</Platform>
     </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug-v142|x64">
+      <Configuration>Debug-v142</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
     <ProjectConfiguration Include="Release-ICC|x64">
       <Configuration>Release-ICC</Configuration>
       <Platform>x64</Platform>
@@ -25,6 +29,10 @@
       <Configuration>Release-v140</Configuration>
       <Platform>x64</Platform>
     </ProjectConfiguration>
+    <ProjectConfiguration Include="Release-v142|x64">
+      <Configuration>Release-v142</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
   </ItemGroup>
   <PropertyGroup Label="Globals">
     <ProjectGuid>{C835FB00-8E80-4D4A-9791-4B7D6D37168A}</ProjectGuid>
@@ -45,6 +53,12 @@
     <PlatformToolset>v141</PlatformToolset>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v142</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
@@ -65,6 +79,13 @@
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v142</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
@@ -83,6 +104,9 @@
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
@@ -92,6 +116,9 @@
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
@@ -106,6 +133,11 @@
     <IntDir>$(Platform)\$(Configuration)\strerror\</IntDir>
     <TargetName>fi_$(ProjectName)</TargetName>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <IntDir>$(Platform)\$(Configuration)\strerror\</IntDir>
+    <TargetName>fi_$(ProjectName)</TargetName>
+  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">
     <LinkIncremental>true</LinkIncremental>
     <IntDir>$(Platform)\$(Configuration)\strerror\</IntDir>
@@ -121,6 +153,11 @@
     <IntDir>$(Platform)\$(Configuration)\strerror\</IntDir>
     <TargetName>fi_$(ProjectName)</TargetName>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <IntDir>$(Platform)\$(Configuration)\strerror\</IntDir>
+    <TargetName>fi_$(ProjectName)</TargetName>
+  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">
     <LinkIncremental>false</LinkIncremental>
     <IntDir>$(Platform)\$(Configuration)\strerror\</IntDir>
@@ -158,6 +195,22 @@
       <AdditionalDependencies>Synchronization.lib;Ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_WINSOCKAPI_=;_CRT_SECURE_NO_WARNINGS;_WINDOWS;_USRDLL;LIBFABRIC_EXPORTS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(SolutionDir)util\windows\getopt;$(SolutionDir)include;$(SolutionDir)include\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>Synchronization.lib;Ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">
     <ClCompile>
       <PrecompiledHeader>
@@ -212,6 +265,25 @@
       <GenerateDebugInformation>true</GenerateDebugInformation>
     </Link>
   </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;_WINSOCKAPI_=;_CRT_SECURE_NO_WARNINGS;_WINDOWS;_USRDLL;LIBFABRIC_EXPORTS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(SolutionDir)util\windows\getopt;$(SolutionDir)include;$(SolutionDir)include\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">
     <ClCompile>
       <WarningLevel>Level3</WarningLevel>
@@ -235,9 +307,11 @@
     <ClCompile Include="util\strerror.c">
       <C99Support Condition="'$(Configuration)|$(Platform)'=='Debug-v140|x64'">true</C99Support>
       <C99Support Condition="'$(Configuration)|$(Platform)'=='Debug-v141|x64'">true</C99Support>
+      <C99Support Condition="'$(Configuration)|$(Platform)'=='Debug-v142|x64'">true</C99Support>
       <C99Support Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">true</C99Support>
       <C99Support Condition="'$(Configuration)|$(Platform)'=='Release-v140|x64'">true</C99Support>
       <C99Support Condition="'$(Configuration)|$(Platform)'=='Release-v141|x64'">true</C99Support>
+      <C99Support Condition="'$(Configuration)|$(Platform)'=='Release-v142|x64'">true</C99Support>
       <C99Support Condition="'$(Configuration)|$(Platform)'=='Release-ICC|x64'">true</C99Support>
     </ClCompile>
   </ItemGroup>
diff --git a/deps/libfabric/util/info.c b/deps/libfabric/util/info.c
index ed924a528bf3131934e1d268b84f9c6b43b11ac6..511e115436f1d440e428f54bc832b6b16d8516d0 100644
--- a/deps/libfabric/util/info.c
+++ b/deps/libfabric/util/info.c
@@ -51,6 +51,7 @@ static char *envstr;
 
 static const struct option longopts[] = {
 	{"help", no_argument, NULL, 'h'},
+	{"src_addr", required_argument, NULL, 's'},
 	{"node", required_argument, NULL, 'n'},
 	{"port", required_argument, NULL, 'P'},
 	{"caps", required_argument, NULL, 'c'},
@@ -70,7 +71,8 @@ static const struct option longopts[] = {
 
 static const char *help_strings[][2] = {
 	{"", "\t\tdisplay this help and exit"},
-	{"NAME", "\t\tnode name or address"},
+	{"ADDR", "\t\tsource name or address"},
+	{"NAME", "\t\tdest node name or address"},
 	{"PNUM", "\t\tport number"},
 	{"CAP1|CAP2..", "\tone or more capabilities: FI_MSG|FI_RMA..."},
 	{"MOD1|MOD2..", "\tone or more modes, default all modes"},
@@ -304,15 +306,13 @@ static int print_long_info(struct fi_info *info)
 	return EXIT_SUCCESS;
 }
 
-static int run(struct fi_info *hints, char *node, char *port)
+static int run(struct fi_info *hints, char *node, char *port, uint64_t flags)
 {
 	struct fi_info *info;
 	int ret;
-	uint64_t flags;
 
-	flags = list_providers ? FI_PROV_ATTR_ONLY : 0;
 	ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION),
-			node, port, flags, hints, &info);
+			 node, port, flags, hints, &info);
 	if (ret) {
 		fprintf(stderr, "fi_getinfo: %d\n", ret);
 		return ret;
@@ -333,6 +333,7 @@ static int run(struct fi_info *hints, char *node, char *port)
 
 int main(int argc, char **argv)
 {
+	uint64_t flags = 0;
 	int op, ret, option_index;
 	int use_hints = 0;
 
@@ -344,7 +345,7 @@ int main(int argc, char **argv)
 	hints->domain_attr->mode = ~0;
 	hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE);
 
-	while ((op = getopt_long(argc, argv, "n:P:c:m:t:a:p:d:f:eg:lhv", longopts,
+	while ((op = getopt_long(argc, argv, "s:n:P:c:m:t:a:p:d:f:eg:lhv", longopts,
 				 &option_index)) != -1) {
 		switch (op) {
 		case 0:
@@ -357,6 +358,10 @@ int main(int argc, char **argv)
 				return EXIT_SUCCESS;
 			}
 			goto print_help;
+		case 's':
+			node = optarg;
+			flags |= FI_SOURCE;
+			break;
 		case 'n':
 			node = optarg;
 			break;
@@ -413,6 +418,7 @@ int main(int argc, char **argv)
 			break;
 		case 'l':
 			list_providers = 1;
+			flags |= FI_PROV_ATTR_ONLY;
 			break;
 		case 'v':
 			verbose = 1;
@@ -426,7 +432,7 @@ print_help:
 		}
 	}
 
-	ret = run(use_hints ? hints : NULL, node, port);
+	ret = run(use_hints ? hints : NULL, node, port, flags);
 
 out:
 	fi_freeinfo(hints);
diff --git a/deps/libfabric/util/pingpong.c b/deps/libfabric/util/pingpong.c
index ca419be26a67549640e7c56e8946b2c9cf007b6d..8b3355f2c5fd1de23694bdf597354b831b2fd1df 100644
--- a/deps/libfabric/util/pingpong.c
+++ b/deps/libfabric/util/pingpong.c
@@ -98,6 +98,7 @@ struct pp_opts {
 #define PP_MAX_CTRL_MSG 64
 #define PP_CTRL_BUF_LEN 64
 #define PP_MR_KEY 0xC0DE
+#define PP_MAX_ADDRLEN 1024
 
 #define INTEG_SEED 7
 #define PP_ENABLE_ALL (~0)
@@ -119,6 +120,7 @@ struct pp_opts {
 		__LINE__, ##__VA_ARGS__)
 
 int pp_debug;
+int pp_ipv6;
 
 #define PP_DEBUG(fmt, ...)                                                     \
 	do {                                                                   \
@@ -298,7 +300,7 @@ static int pp_getaddrinfo(char *name, uint16_t port, struct addrinfo **results)
 	char port_s[6];
 
 	struct addrinfo hints = {
-	    .ai_family = AF_INET,       /* IPv4 */
+	    .ai_family = pp_ipv6 ? AF_INET6 : AF_INET,
 	    .ai_socktype = SOCK_STREAM, /* TCP socket */
 	    .ai_protocol = IPPROTO_TCP, /* Any protocol */
 	    .ai_flags = AI_NUMERICSERV /* numeric port is used */
@@ -319,9 +321,22 @@ out:
 	return ret;
 }
 
+static void pp_print_addrinfo(struct addrinfo *ai, char *msg)
+{
+	char s[80] = {0};
+	void *addr;
+
+	if (ai->ai_family == AF_INET6)
+		addr = &((struct sockaddr_in6 *)ai->ai_addr)->sin6_addr;
+	else
+		addr = &((struct sockaddr_in *)ai->ai_addr)->sin_addr;
+
+	inet_ntop(ai->ai_family, addr, s, 80);
+	PP_DEBUG("%s %s\n", msg, s);
+}
+
 static int pp_ctrl_init_client(struct ct_pingpong *ct)
 {
-	struct sockaddr_in in_addr = {0};
 	struct addrinfo *results;
 	struct addrinfo *rp;
 	int errno_save = 0;
@@ -345,13 +360,27 @@ static int pp_ctrl_init_client(struct ct_pingpong *ct)
 		}
 
 		if (ct->opts.src_port != 0) {
-			in_addr.sin_family = AF_INET;
-			in_addr.sin_port = htons(ct->opts.src_port);
-			in_addr.sin_addr.s_addr = htonl(INADDR_ANY);
+			if (pp_ipv6) {
+				struct sockaddr_in6 in6_addr = {0};
 
-			ret =
-			    bind(ct->ctrl_connfd, (struct sockaddr *)&in_addr,
-				 sizeof(in_addr));
+				in6_addr.sin6_family = AF_INET6;
+				in6_addr.sin6_port = htons(ct->opts.src_port);
+				in6_addr.sin6_addr = in6addr_any;
+
+				ret =
+				    bind(ct->ctrl_connfd, (struct sockaddr *)&in6_addr,
+					 sizeof(in6_addr));
+			} else {
+				struct sockaddr_in in_addr = {0};
+
+				in_addr.sin_family = AF_INET;
+				in_addr.sin_port = htons(ct->opts.src_port);
+				in_addr.sin_addr.s_addr = htonl(INADDR_ANY);
+
+				ret =
+				    bind(ct->ctrl_connfd, (struct sockaddr *)&in_addr,
+					 sizeof(in_addr));
+			}
 			if (ret == -1) {
 				errno_save = ofi_sockerr();
 				ofi_close_socket(ct->ctrl_connfd);
@@ -359,6 +388,8 @@ static int pp_ctrl_init_client(struct ct_pingpong *ct)
 			}
 		}
 
+		pp_print_addrinfo(rp, "CLIENT: connecting to");
+
 		ret = connect(ct->ctrl_connfd, rp->ai_addr, rp->ai_addrlen);
 		if (ret != -1)
 			break;
@@ -382,12 +413,11 @@ static int pp_ctrl_init_client(struct ct_pingpong *ct)
 
 static int pp_ctrl_init_server(struct ct_pingpong *ct)
 {
-	struct sockaddr_in ctrl_addr = {0};
 	int optval = 1;
 	SOCKET listenfd;
 	int ret;
 
-	listenfd = ofi_socket(AF_INET, SOCK_STREAM, 0);
+	listenfd = ofi_socket(pp_ipv6 ? AF_INET6 : AF_INET, SOCK_STREAM, 0);
 	if (listenfd == INVALID_SOCKET) {
 		ret = -ofi_sockerr();
 		PP_PRINTERR("socket", ret);
@@ -402,12 +432,25 @@ static int pp_ctrl_init_server(struct ct_pingpong *ct)
 		goto fail_close_socket;
 	}
 
-	ctrl_addr.sin_family = AF_INET;
-	ctrl_addr.sin_port = htons(ct->opts.src_port);
-	ctrl_addr.sin_addr.s_addr = htonl(INADDR_ANY);
+	if (pp_ipv6) {
+		struct sockaddr_in6 ctrl6_addr = {0};
+
+		ctrl6_addr.sin6_family = AF_INET6;
+		ctrl6_addr.sin6_port = htons(ct->opts.src_port);
+		ctrl6_addr.sin6_addr = in6addr_any;
+
+		ret = bind(listenfd, (struct sockaddr *)&ctrl6_addr,
+			   sizeof(ctrl6_addr));
+	} else {
+		struct sockaddr_in ctrl_addr = {0};
+
+		ctrl_addr.sin_family = AF_INET;
+		ctrl_addr.sin_port = htons(ct->opts.src_port);
+		ctrl_addr.sin_addr.s_addr = htonl(INADDR_ANY);
 
-	ret = bind(listenfd, (struct sockaddr *)&ctrl_addr,
-		   sizeof(ctrl_addr));
+		ret = bind(listenfd, (struct sockaddr *)&ctrl_addr,
+			   sizeof(ctrl_addr));
+	}
 	if (ret == -1) {
 		ret = -ofi_sockerr();
 		PP_PRINTERR("bind", ret);
@@ -599,6 +642,8 @@ static int pp_recv_name(struct ct_pingpong *ct)
 		return ret;
 
 	len = ntohl(len);
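+	/* Sanity-check the announced name length before allocating */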
+	if (len > PP_MAX_ADDRLEN)
+		return -EINVAL;
 
 	ct->rem_name = calloc(1, len);
 	if (!ct->rem_name) {
@@ -1376,6 +1421,11 @@ static int pp_alloc_active_res(struct ct_pingpong *ct, struct fi_info *fi)
 {
 	int ret;
 
+	if (fi->tx_attr->mode & FI_MSG_PREFIX)
+		ct->tx_prefix_size = fi->ep_attr->msg_prefix_size;
+	if (fi->rx_attr->mode & FI_MSG_PREFIX)
+		ct->rx_prefix_size = fi->ep_attr->msg_prefix_size;
+
 	ret = pp_alloc_msgs(ct);
 	if (ret)
 		return ret;
@@ -1411,11 +1461,6 @@ static int pp_alloc_active_res(struct ct_pingpong *ct, struct fi_info *fi)
 		}
 	}
 
-	if (fi->tx_attr->mode & FI_MSG_PREFIX)
-		ct->tx_prefix_size = fi->ep_attr->msg_prefix_size;
-	if (fi->rx_attr->mode & FI_MSG_PREFIX)
-		ct->rx_prefix_size = fi->ep_attr->msg_prefix_size;
-
 	ret = fi_endpoint(ct->domain, fi, &(ct->ep), NULL);
 	if (ret) {
 		PP_PRINTERR("fi_endpoint", ret);
@@ -1869,8 +1914,7 @@ static int pp_finalize(struct ct_pingpong *ct)
 
 	PP_DEBUG("Terminating test\n");
 
-	strcpy(ct->tx_buf, fin_buf);
-	((char *)ct->tx_buf)[fin_buf_size - 1] = '\0';
+	snprintf(ct->tx_buf, fin_buf_size, "%s", fin_buf);
 
 	iov.iov_base = ct->tx_buf;
 	iov.iov_len = fin_buf_size + ct->tx_prefix_size;
@@ -1968,6 +2012,7 @@ static void pp_pingpong_usage(struct ct_pingpong *ct, char *name, char *desc)
 
 	fprintf(stderr, " %-20s %s\n", "-h", "display this help output");
 	fprintf(stderr, " %-20s %s\n", "-v", "enable debugging output");
+	fprintf(stderr, " %-20s %s\n", "-6", "use IPv6 address");
 }
 
 static void pp_parse_opts(struct ct_pingpong *ct, int op, char *optarg)
@@ -2048,6 +2093,12 @@ static void pp_parse_opts(struct ct_pingpong *ct, int op, char *optarg)
 	case 'v':
 		pp_debug = 1;
 		break;
+
+	/* IPV6 */
+	case '6':
+		pp_ipv6 = 1;
+		break;
+
 	default:
 		/* let getopt handle unknown opts*/
 		break;
@@ -2151,7 +2202,9 @@ static int run_pingpong_dgram(struct ct_pingpong *ct)
 	/* Post an extra receive to avoid lacking a posted receive in the
 	 * finalize.
 	 */
-	ret = fi_recv(ct->ep, ct->rx_buf, ct->rx_size, fi_mr_desc(ct->mr), 0,
+	ret = fi_recv(ct->ep, ct->rx_buf,
+		      MAX(ct->rx_size, PP_MAX_CTRL_MSG) + ct->rx_prefix_size,
+		      fi_mr_desc(ct->mr), 0,
 		      ct->rx_ctx_ptr);
 	if (ret)
 		return ret;
@@ -2241,7 +2294,7 @@ int main(int argc, char **argv)
 
 	ofi_osd_init();
 
-	while ((op = getopt(argc, argv, "hvd:p:e:I:S:B:P:cm:")) != -1) {
+	while ((op = getopt(argc, argv, "hvd:p:e:I:S:B:P:cm:6")) != -1) {
 		switch (op) {
 		default:
 			pp_parse_opts(&ct, op, optarg);
diff --git a/deps/mamba/.gitignore b/deps/mamba/.gitignore
index 8b13ed91580c68081c3ea04c3e97805cd151e5c9..4e278951db6b3c3ef65219df31bb52452dba7ae0 100644
--- a/deps/mamba/.gitignore
+++ b/deps/mamba/.gitignore
@@ -163,6 +163,9 @@ instance/
 
 # Sphinx documentation
 docs/_build/
+docs/sphinx/_build/
+docs/_static/
+docs/_templates/
 
 # PyBuilder
 target/
diff --git a/deps/mamba/.readthedocs.yaml b/deps/mamba/.readthedocs.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..61ed0fb3cdfa33473ee9598a3e7ee5b1e7d99fe0
--- /dev/null
+++ b/deps/mamba/.readthedocs.yaml
@@ -0,0 +1,27 @@
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Set the version of Python and other tools you might need
+build:
+  image: latest
+  apt_packages:
+    - make
+
+# Build documentation in the docs/ directory with Sphinx
+sphinx:
+  configuration: docs/sphinx/conf.py
+
+# If using Sphinx, optionally build your docs in additional formats such as PDF
+# formats:
+#    - pdf
+
+# Optionally declare the Python requirements required to build your docs
+python:
+  version: "3.7"
+  install:
+    - requirements: docs/sphinx/requirements.txt
+  system_packages: true
diff --git a/deps/mamba/common/mmb_layout.c b/deps/mamba/common/mmb_layout.c
index 211439ff84d427d5a4a55d84362f4183a7b35392..ed29843649f5e0ab16213bff6e8354ce9037d278 100644
--- a/deps/mamba/common/mmb_layout.c
+++ b/deps/mamba/common/mmb_layout.c
@@ -35,6 +35,8 @@
 #include "mmb_layout.h"
 #include "mmb_logging.h"
 
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
+
 static mmbLayoutPadding g_mmb_zero_padding = {0};
 /** Layout type strings */
 static const char *const
@@ -81,6 +83,20 @@ const char* mmb_layout_order_to_str(int order) {
   return mmb_layout_order_types[order];
 }
 
+static const char *const
+mmb_layout_equivalence[MMB_LAYOUT_DIFF_MAX] = {
+  [MMB_LAYOUT_EQUAL] = "Identical layouts",
+  [MMB_LAYOUT_DIFF_INDEX] = "Same layouts, different index",
+  [MMB_LAYOUT_DIFF_FIELDS] = "Same layout type, different offsets/lengths etc.",
+  [MMB_LAYOUT_DIFF_TYPES] = "Different layout types"};
+
+const char* mmb_layout_equivalence_to_str(int result) {
+  if (result >= MMB_LAYOUT_DIFF_MAX) {
+    MMB_ERR("Mamba layout compare result is out of bounds\n");
+    return NULL;
+  }
+  return mmb_layout_equivalence[result];
+}
+
 mmbLayoutPadding *mmb_layout_padding_create_zero(void) {
   return &g_mmb_zero_padding;
 }
@@ -155,6 +171,88 @@ mmbError mmb_layout_create_irregular_1d(size_t element_size_bytes,
   return MMB_UNIMPLEMENTED;
 }
 
+/* Size in bytes of the caller's own piece (dist_layout->index) of the
+ * distribution */
+mmbError mmb_layout_dist_find_piece_size(mmbLayout *dist_layout, int64_t *size) {
+
+  mmbError status = MMB_UNIMPLEMENTED;
+  *size = -1;
+  /* FIXME assuming irregular 1D mmb_layout only */
+  if ((dist_layout->type == MMB_IRREGULAR) && (dist_layout->n_dims == 1)) {
+    /* size = length of my piece in the distribution * size of element */
+    *size = dist_layout->irregular.lengths[dist_layout->index] * dist_layout->element.size_bytes;
+    status = MMB_OK;
+  }
+  else {
+    /* FIXME support other mamba layout types */
+    MMB_ERR("mamba layout type is not yet supported\n");
+    status = MMB_UNIMPLEMENTED;
+  }
+  return status;
+}
+
+
+mmbError mmb_layout_dist_find_entire_size(mmbLayout *dist_layout, int64_t *size) {
+  mmbError status = MMB_UNIMPLEMENTED;
+  *size = -1;
+  /* FIXME assuming irregular 1D mmb_layout only */
+  if ((dist_layout->type == MMB_IRREGULAR) && (dist_layout->n_dims == 1)) {
+    /* Sum the lengths of all pieces, then convert to bytes */
+    *size = 0;
+    for (size_t i = 0; i < dist_layout->irregular.n_blocks.d[0]; i++) {
+      *size += dist_layout->irregular.lengths[i];
+    }
+    *size = *size * dist_layout->element.size_bytes; /* size in bytes */
+    status = MMB_OK;
+  }
+  else {
+    /* FIXME support other mamba layout types */
+    MMB_ERR("mamba layout type is not yet supported\n");
+    status = MMB_UNIMPLEMENTED;
+  }
+
+  return status;
+}
+
+/**
+  * Cook up a default distributed layout, which consists of one piece holding
+  * all the data from all pieces ... not really distributed */
+mmbError mmb_layout_dist_create_default_layout(mmbLayout *src_layout, mmbLayout **default_layout){
+  mmbError status = MMB_UNIMPLEMENTED;
+
+  /*FIXME only support irregular 1D layouts */
+  if ((src_layout->type == MMB_IRREGULAR) && (src_layout->n_dims == 1)) {
+    
+    size_t *offsets, *sizes;
+    offsets = (size_t *) malloc(sizeof(size_t) * 1);
+    sizes = (size_t *) malloc(sizeof(size_t) * 1);
+
+    if ((!offsets) || (!sizes)) {
+      MMB_ERR("Cannot allocate memory for default layout offsets and sizes\n");
+      free(offsets);
+      free(sizes);
+      return MMB_OUT_OF_MEMORY;
+    }
+
+    offsets[0] = 0; /* data starts from the beginning */
+    /* the single piece covers the whole size, i.e. the sum of all piece lengths */
+    sizes[0] = 0;
+    for (size_t i = 0; i < src_layout->irregular.n_blocks.d[0]; i++) {
+      sizes[0] += src_layout->irregular.lengths[i];
+    }
+
+    status = mmb_layout_create_dist_irregular_1d(src_layout->element.size_bytes,
+                                       0, /*index = 0*/
+                                       1, /* nblocks = 1*/
+                                       offsets,
+                                       sizes,
+                                       default_layout);
+  }
+  else {
+    /* FIXME support other mamba layout types */
+    MMB_ERR("Mamba layout type is not yet supported\n");
+    status = MMB_UNIMPLEMENTED;
+  }
+
+  return status;
+}
+
 mmbError mmb_layout_create_dist_irregular_1d(size_t element_size_bytes,
                                      size_t index,
                                      size_t n_blocks,
@@ -233,28 +331,29 @@ mmbError mmb_layout_create_copy(mmbLayout *in_layout, mmbLayout **out_layout) {
 
   l->type = in_layout->type;
   l->n_dims = in_layout->n_dims;
+  l->index = in_layout->index;
   stat = mmb_layout_element_copy(&l->element, &in_layout->element);
   if(stat != MMB_OK) {
-    MMB_ERR("Unable to copy mmbLayoutElement");
+    MMB_ERR("Unable to copy mmbLayoutElement \n");
     goto BAILOUT;
   }
 
   stat = mmb_layout_padding_copy(&l->pad, &in_layout->pad, in_layout->n_dims);
   if(stat != MMB_OK) {
-    MMB_ERR("Unable to copy mmbLayoutPadding");
+    MMB_ERR("Unable to copy mmbLayoutPadding \n");
     goto BAILOUT;
   }
 
   if(in_layout->type == MMB_REGULAR_BLOCK) {
     stat = mmb_layout_block_copy(&l->block, &in_layout->block);
     if(stat != MMB_OK) {
-      MMB_ERR("Unable to copy mmbLayoutBlock");
+      MMB_ERR("Unable to copy mmbLayoutBlock \n");
       goto BAILOUT;
     }
   } else if (in_layout->type == MMB_IRREGULAR) {
     stat = mmb_layout_irregular_copy(&l->irregular, &in_layout->irregular);
     if(stat != MMB_OK) {
-      MMB_ERR("Unable to copy mmbLayoutIrregular");
+      MMB_ERR("Unable to copy mmbLayoutIrregular \n");
       goto BAILOUT;
     }
   }
@@ -265,46 +364,154 @@ BAILOUT:
   return stat;
 }
 
+mmbError mmb_layout_compute_intersection(mmbLayout *src_layout, mmbLayout *dst_layout, mmbLayoutIntersection **out_li) {
+  mmbError stat = MMB_OK;
 
-mmbError mmb_layout_cmp(mmbLayout *in_layout0, mmbLayout *in_layout1, int *diff) {
+  /*FIXME only irregular 1d is currently supported */
+  if ((src_layout->type == MMB_IRREGULAR) && (dst_layout->type == MMB_IRREGULAR)) {
+    if ((src_layout->irregular.n_blocks.size == 1) && (dst_layout->irregular.n_blocks.size == 1)) {
+      /* Only for irregular 1D layouts; handles:
+       * - same layout, same index
+       * - same layout, different index
+       * - same layout type, different layout components */
+       stat = mmb_layout_compute_overlap(src_layout, dst_layout, out_li);
+    }
+    else {
+      /* FIXME support multi-dimensional irregular layouts */
+      MMB_ERR("Only 1-D irregular layouts are currently supported\n");
+      return MMB_UNIMPLEMENTED;
+    }
+  }
+  else if (src_layout->type != dst_layout->type) {
+    /* FIXME support finding a mapping between two different mamba layout types */
+    MMB_ERR("Finding a mapping between two different mmbLayout types is unimplemented\n");
+    stat = MMB_UNIMPLEMENTED;
+  }
+  else {
+    /* FIXME support other mamba layout types */
+    MMB_ERR("Mamba layout type is not yet supported\n");
+    stat = MMB_UNIMPLEMENTED;
+  }
+
+  return stat;
+}
+
+mmbError mmb_layout_destroy_mmbLayoutIntersection(mmbLayoutIntersection *in_li) {
+  mmbError stat = MMB_OK;
+  if(!in_li)
+    return MMB_INVALID_ARG;
+
+  if (in_li->overlap) {
+    free(in_li->overlap);
+  }
+
+  free(in_li);
+  return stat;
+}
+
+mmbError mmb_layout_compute_overlap(mmbLayout *src_layout, mmbLayout *dst_layout, mmbLayoutIntersection **out_li) {
+  mmbError stat = MMB_OK;
+  size_t dst_pieces = dst_layout->irregular.n_blocks.d[0]; /* number of pieces for dst */
+  size_t src_pieces = src_layout->irregular.n_blocks.d[0]; /* number of pieces for src */
+  size_t index;
+
+  MMB_DEBUG("Number of src pieces %zu and dst pieces %zu \n", src_pieces, dst_pieces);
+  *out_li = (mmbLayoutIntersection *) malloc(sizeof(mmbLayoutIntersection));
+  if (*out_li == NULL) {
+    MMB_ERR("Unable to allocate memory for layout intersections \n");
+    return MMB_OUT_OF_MEMORY;
+  }
+  (*out_li)->n_dst_pieces =  dst_pieces;
+  (*out_li)->n_src_pieces =  src_pieces;
+
+  mmbLayoutOverlap *out;
+  size_t src_start, src_end, dst_start, dst_end;
+  out = (mmbLayoutOverlap *) malloc(sizeof(mmbLayoutOverlap)*dst_pieces*src_pieces);
+  if (out == NULL) {
+    MMB_ERR("Unable to allocate memory (%d bytes) for layout overlap \n", sizeof(mmbLayoutOverlap)*dst_pieces*src_pieces);
+    mmb_layout_destroy_mmbLayoutIntersection(*out_li);
+    return MMB_OUT_OF_MEMORY;
+  }
+  /* for every dst index */
+  for (size_t i = 0; i < dst_pieces; i++) {
+    /* for every src index */
+    for (size_t j = 0; j < src_pieces; j++) {
+      /*calculate the overlap between the two pieces*/
+      index = i*src_pieces + j;
+      src_start = src_layout->irregular.offsets[j];
+      src_end = src_start + src_layout->irregular.lengths[j];
+      dst_start = dst_layout->irregular.offsets[i];
+      dst_end = dst_start + dst_layout->irregular.lengths[i];
+
+      /* pieces are half-open ranges [start, end), so they are disjoint
+       * when one ends at or before the start of the other */
+      if ((dst_end <= src_start) || (dst_start >= src_end)) { // no overlap
+          out[index].src_offset = -1;
+          out[index].dst_offset = -1;
+          out[index].length = 0;
+      }
+      else {
+        if (dst_start < src_start) {
+          out[index].src_offset = 0;
+          out[index].dst_offset = src_start - dst_start;
+          /* how much data is left in the piece:
+           * length of piece - offset in piece */
+          out[index].length = MIN(src_layout->irregular.lengths[j],
+                                  (dst_layout->irregular.lengths[i] - out[index].dst_offset));
+        }
+        else {
+          out[index].src_offset = dst_start - src_start;
+          out[index].dst_offset = 0;
+          out[index].length = MIN(dst_layout->irregular.lengths[i],
+                                  (src_layout->irregular.lengths[j] - out[index].src_offset));
+        }
+      }
+    }
+  }
+
+  (*out_li)->overlap = out;
+  return stat;
+}
+
+mmbError mmb_layout_cmp(mmbLayout *in_layout0, mmbLayout *in_layout1, mmbLayoutEquivalence *diff) {
   mmbError stat = MMB_OK;
+  int result;
 
   /* Compare layout 0 and 1 */
   if(in_layout0->type != in_layout1->type) {
-    *diff = 1;
+    *diff = MMB_LAYOUT_DIFF_TYPES;
     return MMB_OK;
   }
 
   if(in_layout0->n_dims != in_layout1->n_dims) {
-    *diff = 1;
+    *diff = MMB_LAYOUT_DIFF_FIELDS;
     return MMB_OK;
   }
 
-  stat = mmb_layout_element_cmp(&in_layout0->element, &in_layout1->element, diff);
+  stat = mmb_layout_element_cmp(&in_layout0->element, &in_layout1->element, &result);
   if(stat != MMB_OK) {
     MMB_ERR("Unable to compare layout elements");
     goto BAILOUT;
   }
-  if(*diff) {
+  if(result) {
+    *diff = MMB_LAYOUT_DIFF_FIELDS;
     return MMB_OK;
   }
 
-  stat = mmb_layout_padding_cmp(&in_layout0->pad, &in_layout1->pad, in_layout0->n_dims, diff);
+  stat = mmb_layout_padding_cmp(&in_layout0->pad, &in_layout1->pad, in_layout0->n_dims, &result);
   if(stat != MMB_OK) {
     MMB_ERR("Unable to compare layout elements");
     goto BAILOUT;
   }
-  if(*diff) {
+  if(result) {
+    *diff = MMB_LAYOUT_DIFF_FIELDS;
     return MMB_OK;
   }
 
   if (in_layout0->type == MMB_IRREGULAR) {
-    stat = mmb_layout_irregular_cmp(&in_layout0->irregular, &in_layout1->irregular, diff);
+    stat = mmb_layout_irregular_cmp(&in_layout0->irregular, &in_layout1->irregular, &result);
     if(stat != MMB_OK) {
       MMB_ERR("Unable to compare irregular layouts");
       goto BAILOUT;
     }
-    if(*diff) {
+    if(result) {
+      *diff = MMB_LAYOUT_DIFF_FIELDS;
       goto BAILOUT;
     }
   }
@@ -314,7 +521,12 @@ mmbError mmb_layout_cmp(mmbLayout *in_layout0, mmbLayout *in_layout1, int *diff)
     goto BAILOUT;
   }
 
-  *diff = 0;
+  if(in_layout0->index != in_layout1->index){
+    *diff = MMB_LAYOUT_DIFF_INDEX;
+    return MMB_OK;
+  }
+
+  *diff = MMB_LAYOUT_EQUAL;
 BAILOUT:
   return stat;
 }
diff --git a/deps/mamba/common/mmb_layout.h b/deps/mamba/common/mmb_layout.h
index 6f447835d3374941cf97d87301552e4e680d9209..2e86cd5f6b3bd73eb24c1aa2bfdc7064114299c9 100644
--- a/deps/mamba/common/mmb_layout.h
+++ b/deps/mamba/common/mmb_layout.h
@@ -58,6 +58,14 @@ enum e_mmbLayoutElementType {
   MMB_LAYOUT_ELEMENTTYPE_MAX
 };
 
+enum e_mmbLayoutEquivalence {
+  MMB_LAYOUT_EQUAL = 0,
+  MMB_LAYOUT_DIFF_INDEX,
+  MMB_LAYOUT_DIFF_FIELDS,
+  MMB_LAYOUT_DIFF_TYPES,
+  MMB_LAYOUT_DIFF_MAX
+};
+
 enum e_mmbLayoutOrder {
   MMB_LAYOUT_ORDER_NONE = 0,
   MMB_ROWMAJOR,
@@ -79,6 +87,7 @@ typedef enum e_mmbAccessType mmbAccessType;
 typedef enum e_mmbLayoutElementType mmbLayoutElementType;
 typedef enum e_mmbLayoutOrder mmbLayoutOrder;
 typedef enum e_mmbLayoutType mmbLayoutType;
+typedef enum e_mmbLayoutEquivalence mmbLayoutEquivalence;
 
 typedef struct mmbLayoutElement {
   mmbLayoutElementType type;
@@ -124,6 +133,27 @@ typedef struct mmbLayout {
   };
 } mmbLayout;
 
+/**
+  * Describes the overlap between src-dst mmbLayout pieces
+  */
+typedef struct mmbLayoutOverlap {
+  size_t src_offset;
+  size_t dst_offset;
+  size_t length;
+} mmbLayoutOverlap;
+
+/**
+  * Describes how the pieces of a src layout intersect with the pieces of a
+  * dst layout; the overlap array holds n_src_pieces * n_dst_pieces entries,
+  * indexed as overlap[dst_index * n_src_pieces + src_index]
+  */
+typedef struct mmbLayoutIntersection {
+  size_t n_dst_pieces;
+  size_t n_src_pieces;
+  mmbLayoutOverlap *overlap;
+} mmbLayoutIntersection;
+
+
+
 /**
  * mmbLayout constructors accept a default 0 padding argument
  */
@@ -134,6 +164,7 @@ typedef struct mmbLayout {
 const char* mmb_layout_type_to_str(int type);
 const char* mmb_element_type_to_str(int type);
 const char* mmb_layout_order_to_str(int order);
+const char* mmb_layout_equivalence_to_str(int result);
 
 /* Construct default initialisation option */
 mmbLayoutPadding *mmb_layout_padding_create_zero(void);
@@ -168,8 +199,50 @@ mmbError mmb_layout_create_dist_irregular_1d(size_t element_size_bytes,
                                       size_t *lengths,
                                       mmbLayout **out_layout);
 
+
+/**
+ * @brief Cook up a default distributed layout, which consists of one piece
+ *    holding all the data from all pieces ... not really distributed.
+ *
+ * @param src_layout a layout describing one piece of the data,
+ *                   used as a blueprint to generate the default layout
+ * @param default_layout output default layout
+ * @return MMB_OK on success
+ */
+mmbError mmb_layout_dist_create_default_layout(mmbLayout *src_layout, mmbLayout **default_layout);
+
+/**
+ * @brief Find the size in bytes of all the data that is distributed and
+ *      described by the input layout
+ *
+ * @param dist_layout input distributed layout
+ * @param size  output size in bytes
+ * @return MMB_OK on success
+ */
+mmbError mmb_layout_dist_find_entire_size(mmbLayout *dist_layout, int64_t *size);
+
+/**
+ * @brief Find the size in bytes of the calling piece in a distributed layout
+ *
+ * @param dist_layout input distributed layout
+ * @param size  output size in bytes of the piece at dist_layout->index
+ * @return MMB_OK on success
+ */
+mmbError mmb_layout_dist_find_piece_size(mmbLayout *dist_layout, int64_t *size);
+
+
 mmbError mmb_layout_create_copy(mmbLayout *in_layout, mmbLayout **out_layout);
 
+
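+/**
+ * @brief Compute the pairwise overlaps between the pieces of a src and a
+ *      dst layout (currently 1-D irregular layouts only)
+ *
+ * @param src_layout input source distributed layout
+ * @param dst_layout input destination distributed layout
+ * @param out_li output intersection object, to be freed with
+ *      mmb_layout_destroy_mmbLayoutIntersection()
+ * @return MMB_OK on success, MMB_UNIMPLEMENTED for unsupported layout types
+ */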
+mmbError mmb_layout_compute_intersection(mmbLayout *src_layout,
+                                         mmbLayout *dst_layout,
+                                         mmbLayoutIntersection **out_li);
+
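+/**
+ * @brief Free an intersection object created by
+ *      mmb_layout_compute_intersection() or mmb_layout_compute_overlap()
+ *
+ * @param in_li intersection object to destroy
+ * @return MMB_OK on success, MMB_INVALID_ARG if in_li is NULL
+ */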
+mmbError mmb_layout_destroy_mmbLayoutIntersection(mmbLayoutIntersection *in_li);
+
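+/**
+ * @brief Compute the overlap table between two 1-D irregular layouts;
+ *      entry [i * n_src_pieces + j] describes the overlap of dst piece i
+ *      with src piece j (used by mmb_layout_compute_intersection())
+ *
+ * @param src_layout input source distributed layout
+ * @param dst_layout input destination distributed layout
+ * @param out_li output intersection object
+ * @return MMB_OK on success
+ */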
+mmbError mmb_layout_compute_overlap(mmbLayout *src_layout,
+                                    mmbLayout *dst_layout,
+                                    mmbLayoutIntersection **out_li);
+
 /**
  * @brief      Free layout object
  *
@@ -190,7 +263,7 @@ mmbError mmb_layout_destroy(mmbLayout *in_layout);
  */
 mmbError mmb_layout_copy(mmbLayout *dst, mmbLayout *src);
 
-mmbError mmb_layout_cmp(mmbLayout *in_layout0, mmbLayout *in_layout1, int* diff);
+mmbError mmb_layout_cmp(mmbLayout *in_layout0, mmbLayout *in_layout1, mmbLayoutEquivalence* diff);
 mmbError mmb_layout_buffer_size(mmbLayout *in_layout, mmbDimensions *dim,
                                 size_t * total_bytes);
 mmbError mmb_layout_buffer_size_nd(mmbLayout *in_layout, mmbDimensions *dim,
diff --git a/deps/mamba/common/mmb_tile.c b/deps/mamba/common/mmb_tile.c
index cb1912b014a46b8d8a3c31161ba2e3a53fa96606..924bab808eaf91ccc3f1bb8fc98e350affed79d7 100644
--- a/deps/mamba/common/mmb_tile.c
+++ b/deps/mamba/common/mmb_tile.c
@@ -58,7 +58,7 @@ static inline void mmb_tiling_unlock(mmbArray *mba) {
   (void) stat;
 }
 
-static inline mmbError nd_to_1d_idx(mmbIndex *in_idx, mmbDimensions * in_dims, 
+static inline mmbError nd_to_1d_idx(mmbIndex *in_idx, mmbDimensions * in_dims,
                                     size_t *out_idx){
   /* Validity and range checks */
   if(!in_idx || !in_dims || !out_idx)
@@ -66,7 +66,7 @@ static inline mmbError nd_to_1d_idx(mmbIndex *in_idx, mmbDimensions * in_dims,
 
   if(in_idx->size != in_dims->size){
     MMB_ERR("Index size %zu does not match dimensions size %zu\n", in_idx->size, in_dims->size);
-    return MMB_ERROR; 
+    return MMB_ERROR;
   }
 
   for (size_t i = 0; i < in_idx->size; i++)
@@ -96,28 +96,28 @@ static inline void mmb_tile_unlock(mmbArrayTile *t) {
   (void) stat;
 }
 
-mmbError mmb_tile_create_1d(mmbArray *in_mba, size_t in_idx, 
+mmbError mmb_tile_create_1d(mmbArray *in_mba, size_t in_idx,
                             mmbTileOptions *in_opts, mmbArrayTile **out_tile) {
   mmbIndex idx = {1, &in_idx};
-  return mmb_tile_create(in_mba, &idx, in_opts, out_tile); 
-}    
+  return mmb_tile_create(in_mba, &idx, in_opts, out_tile);
+}
 
 mmbError mmb_tile_create_2d(mmbArray *in_mba, size_t in_idx0, size_t in_idx1,
                             mmbTileOptions *in_opts, mmbArrayTile **out_tile) {
   size_t idx_arr[2] = {in_idx0, in_idx1};
   mmbIndex idx = {2, &idx_arr[0]};
-  return mmb_tile_create(in_mba, &idx, in_opts, out_tile); 
-}    
+  return mmb_tile_create(in_mba, &idx, in_opts, out_tile);
+}
 
 mmbError mmb_tile_create_3d(mmbArray *in_mba, size_t in_idx0, size_t in_idx1,
-                            size_t in_idx2, mmbTileOptions *in_opts, 
+                            size_t in_idx2, mmbTileOptions *in_opts,
                             mmbArrayTile **out_tile) {
   size_t idx_arr[3] = {in_idx0, in_idx1, in_idx2};
   mmbIndex idx = {3, &idx_arr[0]};
-  return mmb_tile_create(in_mba, &idx, in_opts, out_tile); 
-}    
+  return mmb_tile_create(in_mba, &idx, in_opts, out_tile);
+}
 
-mmbError mmb_tile_create(mmbArray *in_mba, mmbIndex* in_idx, 
+mmbError mmb_tile_create(mmbArray *in_mba, mmbIndex* in_idx,
                          mmbTileOptions *in_opts, mmbArrayTile **out_tile) {
   /* Validate args */
   if(in_mba == NULL)
@@ -128,7 +128,7 @@ mmbError mmb_tile_create(mmbArray *in_mba, mmbIndex* in_idx,
 
   if(out_tile == NULL)
     return MMB_INVALID_ARG;
-  
+
   mmbError stat = MMB_OK;
 
   mmbTileOptions default_opts = {0};
@@ -159,10 +159,10 @@ mmbError mmb_tile_create(mmbArray *in_mba, mmbIndex* in_idx,
       }
       stat = MMB_OK;
       tile_exists = true;
-    }    
+    }
     mmb_tiling_unlock(in_mba);
   }
-  
+
   /* Create tile if it wasnt found */
   if(!tile_exists) {
     // Create tile
@@ -195,7 +195,7 @@ mmbError mmb_tile_create(mmbArray *in_mba, mmbIndex* in_idx,
       free(t);
       free(dims);
       goto BAILOUT;
-    }  
+    }
 
     for (unsigned i = 0; i < t->layout->n_dims; i++) {
       t->abs_dim[i] = in_mba->dims.d[i];
@@ -209,7 +209,7 @@ mmbError mmb_tile_create(mmbArray *in_mba, mmbIndex* in_idx,
     }
 
     /* Check contiguity */
-    /* TODO: This only checks if tile is 1d, or if it covers whole array, 
+    /* TODO: This only checks if tile is 1d, or if it covers whole array,
     * not if subtiles are contiguous */
     int contiguous = 1;
     if(t->layout->n_dims > 1) {
@@ -231,7 +231,7 @@ mmbError mmb_tile_create(mmbArray *in_mba, mmbIndex* in_idx,
     t->tidx = tile_idx;
 
     /* Initialise the tile lock */
-    pthread_mutex_init(&t->lock, NULL); 
+    pthread_mutex_init(&t->lock, NULL);
 
     if(in_opts->cache_policy == MMB_TILECACHE_READ_WRITE) {
 
@@ -248,7 +248,7 @@ mmbError mmb_tile_create(mmbArray *in_mba, mmbIndex* in_idx,
         if(stat != MMB_OK)
           MMB_WARN("Failed to destroy temporary layout object\n");
         free(dims);
-        pthread_mutex_destroy(&t->lock); 
+        pthread_mutex_destroy(&t->lock);
         free(t);
 
         t = t_temp;
@@ -272,7 +272,7 @@ mmbError mmb_tile_create(mmbArray *in_mba, mmbIndex* in_idx,
       mmb_tiling_unlock(in_mba);
     }
   }
-  
+
   if(stat == MMB_OK)
     *out_tile = t;
 
@@ -280,27 +280,27 @@ BAILOUT:
   return stat;
 }
 
-mmbError mmb_tile_update_1d(mmbArray *in_mba, size_t in_idx, 
+mmbError mmb_tile_update_1d(mmbArray *in_mba, size_t in_idx,
                          mmbArrayTile **inout_tile) {
   mmbIndex idx = {1, &in_idx};
-  return mmb_tile_update(in_mba, &idx, inout_tile); 
-}    
+  return mmb_tile_update(in_mba, &idx, inout_tile);
+}
 
-mmbError mmb_tile_update_2d(mmbArray *in_mba, size_t in_idx0, size_t in_idx1, 
+mmbError mmb_tile_update_2d(mmbArray *in_mba, size_t in_idx0, size_t in_idx1,
                          mmbArrayTile **inout_tile){
   size_t idx_arr[2] = {in_idx0, in_idx1};
   mmbIndex idx = {2, &idx_arr[0]};
-  return mmb_tile_update(in_mba, &idx, inout_tile); 
-}    
+  return mmb_tile_update(in_mba, &idx, inout_tile);
+}
 
 mmbError mmb_tile_update_3d(mmbArray *in_mba, size_t in_idx0, size_t in_idx1,
                          size_t in_idx2, mmbArrayTile **inout_tile){
   size_t idx_arr[3] =  {in_idx0, in_idx1, in_idx2};
   mmbIndex idx = {3, &idx_arr[0]};
-  return mmb_tile_update(in_mba, &idx, inout_tile); 
-}    
+  return mmb_tile_update(in_mba, &idx, inout_tile);
+}
 
-mmbError mmb_tile_update(mmbArray *in_mba, mmbIndex *in_idx, 
+mmbError mmb_tile_update(mmbArray *in_mba, mmbIndex *in_idx,
                          mmbArrayTile **inout_tile){
 
   /* Validate args */
@@ -342,7 +342,7 @@ mmbError mmb_tile_update(mmbArray *in_mba, mmbIndex *in_idx,
     *inout_tile = cached_tile;
   } else {
     /* Update the tile indices manually if no cache is being used */
-  /* We just update indices, we not modify the tile struct, 
+  /* We just update indices, we not modify the tile struct,
    * layout, or tile cache policy */
     for (unsigned i = 0; i < t->layout->n_dims; i++) {
       t->abs_dim[i] = in_mba->dims.d[i];
@@ -356,7 +356,7 @@ mmbError mmb_tile_update(mmbArray *in_mba, mmbIndex *in_idx,
     }
     /* TODO - Do we need to update this? */
     /* Check contiguity */
-    /* TODO: This only checks if tile is 1d, or if it covers whole array, 
+    /* TODO: This only checks if tile is 1d, or if it covers whole array,
     * not if subtiles are contiguous */
     int contiguous = 1;
     if(t->layout->n_dims > 1) {
@@ -413,7 +413,7 @@ mmbError mmb_tiling_dimensions(mmbArray *in_mba, mmbDimensions **out_dims) {
   if(stat != MMB_OK) {
     MMB_ERR("Failed to create mmb dimensions object\n");
     goto BAILOUT;
-  } 
+  }
 
   stat = mmb_dimensions_copy(dims, &in_mba->tiling->tile_count);
   if(stat != MMB_OK) {
@@ -465,14 +465,14 @@ mmbError mmb_tile_at(mmbArray *in_mba, mmbIndex *in_idx,
     stat = MMB_OK;
     goto BAILOUT_UNLOCK;
   }
-  
+
   mmb_tiling_unlock(in_mba);
 
-  /* Otherwise, create tile and return using default memory space */ 
+  /* Otherwise, create tile and return using default memory space */
 
   /* Typical threaded usage would be multiple threads asking for different
     tiles, so we let each thread make its own tile before insertion
-    This results in some wasted cycles if many threads ask for the same 
+    This results in some wasted cycles if many threads ask for the same
     not-yet-constructed tile, but minimises blocking in the more-common scenario
     where they ask for different not-yet-constructed tiles. */
   t = (mmbArrayTile *)calloc(1, sizeof(mmbArrayTile));
@@ -503,7 +503,7 @@ mmbError mmb_tile_at(mmbArray *in_mba, mmbIndex *in_idx,
     free(t);
     free(dims);
     goto BAILOUT;
-  }  
+  }
 
   for (unsigned i = 0; i < t->layout->n_dims; i++) {
     t->abs_dim[i] = in_mba->dims.d[i];
@@ -517,7 +517,7 @@ mmbError mmb_tile_at(mmbArray *in_mba, mmbIndex *in_idx,
   }
 
   /* Check contiguity */
-  /* TODO: This only checks if tile is 1d, or if it covers whole array, 
+  /* TODO: This only checks if tile is 1d, or if it covers whole array,
    * not if subtiles are contiguous */
   int contiguous = 1;
   if(t->layout->n_dims > 1) {
@@ -539,8 +539,8 @@ mmbError mmb_tile_at(mmbArray *in_mba, mmbIndex *in_idx,
   t->tidx = tile_idx;
 
   /* Initialise the tile lock */
-  pthread_mutex_init(&t->lock, NULL); 
-  
+  pthread_mutex_init(&t->lock, NULL);
+
   mmb_tiling_wrlock(in_mba);
   /* Check if someone else already created this tile in the meantime
    * if so, free local data and return pre-existing tile */
@@ -554,7 +554,7 @@ mmbError mmb_tile_at(mmbArray *in_mba, mmbIndex *in_idx,
     if(stat != MMB_OK)
       MMB_WARN("Failed to destroy temporary layout object\n");
     free(dims);
-    pthread_mutex_destroy(&t->lock); 
+    pthread_mutex_destroy(&t->lock);
     free(t);
 
     t = t_temp;
@@ -842,7 +842,7 @@ mmbError mmb_tile_copy(mmbArrayTile *dst_tile, mmbArrayTile *src_tile) {
   }
 
   /* Only matching layouts are supported currently */
-  int diff;
+  mmbLayoutEquivalence diff;
   mmbError stat = mmb_layout_cmp(src_tile->layout, dst_tile->layout, &diff);
   if(stat != MMB_OK) {
     MMB_ERR("Failed to compare layouts\n");
@@ -866,11 +866,11 @@ mmbError mmb_tile_copy(mmbArrayTile *dst_tile, mmbArrayTile *src_tile) {
 }
 
 mmbError mmb_tile_duplicate(mmbArrayTile *in_tile, mmbMemInterface *in_interface,
-                            mmbAccessType in_access, mmbLayout *in_layout,  
+                            mmbAccessType in_access, mmbLayout *in_layout,
                             mmbArrayTile **out_tile) {
 
   mmbError stat = MMB_OK;
-  /* 
+  /*
     Validate input arguments
   */
   if(!in_tile || !in_interface || !in_layout || !out_tile) {
@@ -885,8 +885,8 @@ mmbError mmb_tile_duplicate(mmbArrayTile *in_tile, mmbMemInterface *in_interface
     return MMB_ERROR;
   }
 
-  /* Lock tile for access. Currently you can only duplicate the top tile in the 
-   * stack, so this locking can be simple. Eventually we will support duplicating 
+  /* Lock tile for access. Currently you can only duplicate the top tile in the
+   * stack, so this locking can be simple. Eventually we will support duplicating
    * many read-only tiles from a single source tile simultaneously, in which case this
    * will require updating */
   mmb_tile_lock(in_tile);
@@ -943,7 +943,7 @@ mmbError mmb_tile_duplicate(mmbArrayTile *in_tile, mmbMemInterface *in_interface
     free(t);
     free(dims);
     goto BAILOUT;
-  } 
+  }
 
   /* Duplicated tile is now contiguous */
   t->access |= MMB_CONTIGUOUS;
@@ -954,23 +954,23 @@ mmbError mmb_tile_duplicate(mmbArrayTile *in_tile, mmbMemInterface *in_interface
     free(t);
     free(dims);
     goto BAILOUT;
-  } 
+  }
 
   /* Initialise the tile lock */
-  pthread_mutex_init(&t->lock, NULL); 
+  pthread_mutex_init(&t->lock, NULL);
 
   /* Add to the stack of tiles */
   in_tile->next = t;
   t->prev = in_tile;
 
-  /* If access type is writeable, then lock the tile below from being written */ 
+  /* If access type is writeable, then lock the tile below from being written */
   if(in_access & MMB_WRITE){
     in_tile->access |= MMB_WRITE_LOCKED;
   }
 
   /* Return tile */
   *out_tile = t;
-  
+
 BAILOUT:
   mmb_tile_unlock(in_tile);
   return stat;
@@ -1047,14 +1047,14 @@ mmbError mmb_tile_migrate(mmbArrayTile *in_tile, mmbMemInterface *in_interface,
 
 mmbError mmb_tile_merge(mmbArrayTile *in_tile, mmbMergeType in_merge) {
 
-  /* 
+  /*
       Validate input arguments
 
       switch on in_merge
         case OVERWRITE:
           get reference to tile_below
           copy in_tile to tile_below
-      
+
       Other cases we will consider:
         pull data from tile_below to in_tile
         fancy merges (e.g modified entries only, halos only, user defined function)
@@ -1068,7 +1068,7 @@ mmbError mmb_tile_merge(mmbArrayTile *in_tile, mmbMergeType in_merge) {
   }
   mmb_tile_lock(in_tile);
 
-  // TODO : This should either not destroy the bottom tile of the stack, or also 
+  // TODO : This should either not destroy the bottom tile of the stack, or also
   //        remove it from the tile cache
 
   /* If tile is bottom of stack, or if it was read only, free the tile. */
@@ -1119,7 +1119,7 @@ BAILOUT_UNLOCK_FREE:
     if(stat != MMB_OK) {
       MMB_ERR("Failed to destroy merged tile\n");
       goto BAILOUT;
-    }    
+    }
   }
 
 BAILOUT:
@@ -1128,12 +1128,12 @@ BAILOUT:
 
 mmbError mmb_tile_destroy(mmbArrayTile *in_tile) {
 
-  /* 
+  /*
     Validate input argument
 
     If tile has other tiles above it (in_tile->next != null)
       return error
-    
+
     If tile has other tiles below it (in_tile->prev != null)
       Clear next ptr from lower tile and if it was write-locked, unlock it.
 
@@ -1157,7 +1157,7 @@ mmbError mmb_tile_destroy(mmbArrayTile *in_tile) {
     mmbArrayTile *prev = in_tile->prev;
     prev->next = NULL;
     if(prev->access & MMB_WRITE_LOCKED) {
-      prev->access &= !MMB_WRITE_LOCKED;
+      prev->access &= ~MMB_WRITE_LOCKED;
     }
   }
 
@@ -1184,7 +1184,7 @@ mmbError mmb_tile_destroy(mmbArrayTile *in_tile) {
   }
 
   /* Destroy the tile lock */
-  pthread_mutex_destroy(&in_tile->lock); 
+  pthread_mutex_destroy(&in_tile->lock);
 
   free(in_tile->lower);
   free(in_tile);
diff --git a/deps/mamba/configure.ac b/deps/mamba/configure.ac
index 6a9b701ae7cd5d6a714608b69ca7f51c3a31ccd3..39ac322865d93dd604040d83163608d4c4a16209 100644
--- a/deps/mamba/configure.ac
+++ b/deps/mamba/configure.ac
@@ -29,7 +29,7 @@
 #  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #  */
 
-AC_INIT([mamba], [0.1.8], [emearesearchlab@hpe.com])
+AC_INIT([mamba], [0.1.9], [emearesearchlab@hpe.com])
 AC_CONFIG_MACRO_DIR([m4])
 AC_CONFIG_AUX_DIR([build-aux])
 AC_PROG_CC
diff --git a/deps/mamba/docs/sphinx/_static/images/Mamba-resource-manager-structure.png b/deps/mamba/docs/sphinx/_static/images/Mamba-resource-manager-structure.png
new file mode 100644
index 0000000000000000000000000000000000000000..73999f2785a2a7cd9022aea7f33c43a654537a76
Binary files /dev/null and b/deps/mamba/docs/sphinx/_static/images/Mamba-resource-manager-structure.png differ
diff --git a/deps/mamba/docs/sphinx/_static/images/Mamba-structure.png b/deps/mamba/docs/sphinx/_static/images/Mamba-structure.png
new file mode 100644
index 0000000000000000000000000000000000000000..1bfc85f9b4279169b255c4469b2e37345493e861
Binary files /dev/null and b/deps/mamba/docs/sphinx/_static/images/Mamba-structure.png differ
diff --git a/deps/mamba/docs/sphinx/_static/images/Memory_abstraction_device_software_stack.png b/deps/mamba/docs/sphinx/_static/images/Memory_abstraction_device_software_stack.png
new file mode 100644
index 0000000000000000000000000000000000000000..a6e5b5e0d057a3140d4860432e524c9141e889b6
Binary files /dev/null and b/deps/mamba/docs/sphinx/_static/images/Memory_abstraction_device_software_stack.png differ
diff --git a/deps/mamba/docs/sphinx/_static/images/array_distribution.png b/deps/mamba/docs/sphinx/_static/images/array_distribution.png
new file mode 100644
index 0000000000000000000000000000000000000000..1b6a5016ebd7b95f72790d26f9db88099bde57fe
Binary files /dev/null and b/deps/mamba/docs/sphinx/_static/images/array_distribution.png differ
diff --git a/deps/mamba/docs/sphinx/_static/images/array_tiling.png b/deps/mamba/docs/sphinx/_static/images/array_tiling.png
new file mode 100644
index 0000000000000000000000000000000000000000..ca9d27be0349becc02204610571c869f853da73b
Binary files /dev/null and b/deps/mamba/docs/sphinx/_static/images/array_tiling.png differ
diff --git a/deps/mamba/docs/sphinx/_static/images/duplicate_merge.png b/deps/mamba/docs/sphinx/_static/images/duplicate_merge.png
new file mode 100644
index 0000000000000000000000000000000000000000..4af5f3c59a18dab87592148a4fbcecff507e8235
Binary files /dev/null and b/deps/mamba/docs/sphinx/_static/images/duplicate_merge.png differ
diff --git a/deps/mamba/docs/sphinx/_static/images/key_data_structures.png b/deps/mamba/docs/sphinx/_static/images/key_data_structures.png
new file mode 100644
index 0000000000000000000000000000000000000000..640b7b0fe0f55901b8ac4712f0d08e369939e06b
Binary files /dev/null and b/deps/mamba/docs/sphinx/_static/images/key_data_structures.png differ
diff --git a/deps/mamba/docs/sphinx/_static/images/memory_abstraction.png b/deps/mamba/docs/sphinx/_static/images/memory_abstraction.png
new file mode 100644
index 0000000000000000000000000000000000000000..b635cfc162fa3b956c9c7c85a3c77c6497bc8a9a
Binary files /dev/null and b/deps/mamba/docs/sphinx/_static/images/memory_abstraction.png differ
diff --git a/deps/mamba/docs/sphinx/conf.py b/deps/mamba/docs/sphinx/conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd1737a7275ec79edab006127f463dd3a375d95a
--- /dev/null
+++ b/deps/mamba/docs/sphinx/conf.py
@@ -0,0 +1,87 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- Project information -----------------------------------------------------
+
+#master_doc entry required to handle sidebar properly 
+master_doc = 'index'
+project = 'Mamba'
+copyright = '2021, Tim Dykes, Harvey Richardson'
+author = 'Tim Dykes, Harvey Richardson'
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+        "sphinxcontrib.httpdomain",
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+#import guzzle_sphinx_theme
+#
+#html_theme_path = guzzle_sphinx_theme.html_theme_path()
+#html_theme = 'guzzle_sphinx_theme'
+#
+#extensions.append("guzzle_sphinx_theme")
+#
+#html_theme_options = {
+#        "project_nav_name":"Mamba",
+#}
+
+on_rtd = os.environ.get("READTHEDOCS", None) == "True"
+
+if not on_rtd:
+    import sphinx_rtd_theme
+    extensions.append("sphinx_rtd_theme")
+    html_theme = "sphinx_rtd_theme"
+    html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+
+
+
+#following two lines are required for sidebar to not deform during browsing (e.g. 
+#change caption from manually entered toctree caption to sphinx global "content"
+#caption).
+html_css_files = ['css/custom.css']
+html_js_files = ['js/expand_tabs.js']
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_theme_options = {
+    'logo_only': True,
+    'includehidden': False,
+}
+
+html_static_path = ['_static']
+#html_css_files = ['css/custom.css']
+
diff --git a/deps/mamba/docs/sphinx/examples.rst b/deps/mamba/docs/sphinx/examples.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6439f1ef31179602ee1c598316e331fdfab9def8
--- /dev/null
+++ b/deps/mamba/docs/sphinx/examples.rst
@@ -0,0 +1,271 @@
+====================
+Programming Examples
+====================
+
+
+Code examples
+=============
+
+We give examples of some of the core functionalities of Mamba, including:
+
+* memory space allocation
+* Mamba array construction
+* array tiling
+* tile access
+
+
+Allocations
+-----------
+
+An Allocation object provides an abstract container for a memory allocation in a specific memory space. To create allocation objects for CPU and GPU, we must first request memory spaces. Note that at this step the size, limits, default behaviours, etc. can be set.
+
+.. code:: C
+
+   mmbMemSpaceConfig *dram_space_config;
+   mmb_memspace_config_create_default (&dram_space_config);
+   mmbMemSpace *dram_space, *gpu_space;
+   mmb_request_space(MMB_DRAM, MMB_EXECUTION_CONTEXT_DEFAULT,
+                      dram_space_config, &dram_space);
+   mmb_request_space(MMB_GDRAM, MMB_GPU_CUDA, NULL, &gpu_space);
+
+Next, request a memory interface for the allocation:
+
+.. code:: C
+
+   mmbMemInterfaceConfig dram_interface_config =
+    {.provider = MMB_PROVIDER_DEFAULT, .strategy = MMB_POOLED};
+   mmbMemInterface *dram_interface, *gpu_interface;
+   mmb_request_interface(dram_space, &dram_interface_config,
+                          &dram_interface);
+   mmb_request_interface(gpu_space, NULL, &gpu_interface);
+
+Finally, allocate a buffer on the CPU, and another buffer on the GPU:    
+
+.. code:: C
+
+   mmbAllocation *host_buffer, *gpu_buffer;
+   mmb_allocate (n_bytes, dram_interface, &host_buffer);
+   mmb_allocate (n_bytes, gpu_interface, &gpu_buffer);
+   fill_host_buffer (host_buffer);
+   
+A generic copy between the buffers can be performed as follows:
+
+.. code:: C
+
+   mmb_copy (gpu_buffer, host_buffer);
+
+
+Arrays
+------
+
+A Mamba array is an array-like data structure that forms the core abstraction of the Mamba library. Subsets of the array may be duplicated or moved between memories. During construction we can specify different array distributions such as block cyclic across tiles, pre-tiled across different spaces, etc. We construct a regular 2D array layout:
+
+.. code:: C
+   
+   mmbLayout *layout;
+   mmb_layout_create_regular_nd(sizeof(float), 2, MMB_ROWMAJOR,
+                                MMB_PADDING_NONE, &layout);
+
+To construct a Mamba array of size MxN:
+
+.. code:: C
+
+   size_t adims[2] = {array_size_M, array_size_N};
+   mmbDimensions array_size = {2, adims};
+   mmbArray *array;
+   mmb_array_create(&array_size, layout, dram_interface,
+                    MMB_READ_WRITE, &array);
+
+Fill the newly created array with data:
+
+.. code:: C
+   
+   fill_array(array);
+
+And move the entire array to the GPU:
+
+.. code:: C
+
+   mmb_array_migrate(array, gpu_interface);
+
+Note that the ``gpu_interface`` could be an interface to a single GPU, or additional options can be provided, e.g. a device index.
+
+
+Array Tiles
+-----------
+
+Mamba arrays may be decomposed into subsets, so-called array tiles, for iteration or movement between memory spaces. This decomposition process is called tiling an array. To tile an array, we specify the size of a single tile:
+
+.. code:: C
+   
+   mmb_array_tile_2d(array, tile_size_M, tile_size_N);
+
+To loop over the tiling, request a tile at each index (iteration over tile sets using iterator objects such as schedules, prefetching, automatic sizing, etc. is also possible):
+
+.. code:: C
+
+   mmbArrayTile *tile;
+   mmbDimensions *tiling_dims;
+   mmb_tiling_dimensions(array, &tiling_dims);
+   for (size_t ti = 0; ti < tiling_dims->d[0]; ++ti) {
+    for (size_t tj = 0; tj < tiling_dims->d[1]; ++tj) {
+      mmb_tile_at_2d(array, ti, tj, &tile);
+
+Within the loop, we can now duplicate tile data on the GPU. As we created a 2D array tile, the duplicate uses e.g. ``cudaMemcpy2D`` implicitly:
+
+.. code:: C
+
+      mmbArrayTile *duplicate_tile;
+      mmb_tile_duplicate(tile, gpu_interface,
+                        MMB_READ_WRITE, &duplicate_tile);
+
+Run the GPU kernel:
+
+.. code:: C
+
+      run_cuda_kernel(duplicate_tile);
+
+Merge the duplicate tile back to the original via overwrite and close the for loops:
+
+.. code:: C
+
+      mmb_tile_merge(duplicate_tile, MMB_OVERWRITE);
+    }
+   }
+
+
+Tile Access
+-----------
+Tile access can be performed using multiple approaches: with or without direct pointer access, as well as with user indexing. While macros are not necessary, they can provide convenient indexing for non-standard layouts such as block-cyclic tiles.
+
+.. code:: C
+
+   void zero_tile(mmbArrayTile *t) {
+    float *ptr = mmb_tile_get_ptr(t);
+    for(size_t i = t->lower[0]; i < t->upper[0]; i++)
+      for(size_t j = t->lower[1]; j < t->upper[1]; j++){
+        // Without direct pointer access
+        MMB_IDX_2D(t, i, j, float) = 0;
+        // OR: With direct pointer access
+        ptr[MMB_IDX_EXPR_2D(i,j)] = 0;
+        // OR: With user indexing
+        ptr[i * t->dim[1] + j] = 0;
+      }
+   }
+
+In Fortran, tile data can be accessed through an appropriately dimensioned pointer for regular indexing:
+
+.. code:: fortran 
+
+   block
+    real, pointer, dimension(:,:) :: tp
+    type(mmbTileData) tile_mdata
+    call mmb_tile_get_mdata(tile_c,tile_mdata,tp)
+    do j=tile_mdata%lower(1),tile_mdata%upper(1)
+      do i=tile_mdata%lower(2),tile_mdata%upper(2)
+        tp(i,j) = 0.0
+      end do
+    end do
+   end block
+
+
+Tile metadata is by default located in the CPU local space. The API to request a space-local handle, e.g. for GPU-local tile metadata to pass into a kernel, is used as follows:
+
+.. code:: c
+
+   extern "C" void run_cuda_kernel(mmbArrayTile *tile){
+    size_t block_size = 16;
+    dim3 block_dim = dim3(block_size, block_size)
+    dim3 grid_dim = (tile->dim[0] / block_width,
+                    tile->dim[1] / block_height);
+    mmbArrayTile *dev_tile;
+    mmb_tile_get_space_local_handle(tile, &dev_tile);
+    cuda_compute_kernel<<<grid_dim, block_dim>>>(dev_tile);
+   }
+
+
+Examples Overview
+==================
+
+Examples are found in ``mamba/build/examples/``, or ``/path/to/install/dir/examples``. Each example is shown in C and Fortran (unless marked C only), and is briefly described here with instructions for use.
+
+1d_array_copy
+-------------
+
+This shows the construction, tiled initialisation, and copy of a 1d mamba array to another 1d mamba array with matching layout and size, with full error checking.
+
+Source file: `examples/c/1d_array_copy.c <https://gitlab.com/cerl/mamba/-/blob/docs/examples/c/1d_array_copy.c>`_ | `examples/fortran/1d_array_copy.f90 <https://gitlab.com/cerl/mamba/-/blob/docs/examples/fortran/1d_array_copy.f90>`_
+
+Usage: ``./1d_array_copy`` | ``./1d_array_copy_f``
+
+1d_array_copy_wrapped
+---------------------
+
+The same as 1d_array_copy but using arrays constructed from existing user pointers.
+
+Source file: `examples/c/1d_array_copy_wrapped.c <https://gitlab.com/cerl/mamba/-/blob/docs/examples/c/1d_array_copy_wrapped.c>`_ | `examples/fortran/1d_array_copy_wrapped.f90 <https://gitlab.com/cerl/mamba/-/blob/docs/examples/fortran/1d_array_copy_wrapped.f90>`_
+
+Usage: ``./1d_array_copy_wrapped`` | ``./1d_array_copy_wrapped_f``
+
+
+tile_duplicate
+------------------
+
+This shows construction of a 1d array, tiling, duplication and merging of tiles.
+
+Source file: `examples/c/tile_duplicate.c <https://gitlab.com/cerl/mamba/-/blob/docs/examples/c/tile_duplicate.c>`_
+
+Usage: ``./tile_duplicate``
+
+.. _matrix_multiply:
+
+matrix_multiply
+---------------
+
+This demonstrates a tiled matrix multiply using 3 mamba arrays constructed on top of pre-initialised (with random or identity values) matrix buffers.
+
+Source file: `examples/c/matrix_multiply.c <https://gitlab.com/cerl/mamba/-/blob/docs/examples/c/matrix_multiply.c>`_
+
+Usage: 
+
+.. code:: console
+
+   (all args optional): ./matrix_multiply -v (for verbose mode) -t N (for tile size NxN) -m N (for matrix size NxN) -i (use identity for matrix B)
+
+matrix_multiply_cuda (C only)
+-----------------------------
+
+This demonstrates a tiled matrix multiply using multiple mamba arrays constructed on top of pre-initialised (with random or identity values) matrix buffers.
+This example also presents how to allocate and use memory on different memory devices (DRAM, GPU, HBM, ...), how to copy from one memory tier to another, and how to use different strategies and/or different memory providers.
+
+This example works the same as the :ref:`matrix_multiply` example, except that it requires extra steps to pass the data to the actual kernel (in addition to allocating the data in GPU memory, the tiling information needs to be forwarded as well).
+The `CUDA file <https://gitlab.com/cerl/mamba/-/blob/docs/examples/c/matrix_multiply_cuda_ker.cu>`_ only deals with this forwarding (the packing is done in `examples/c/matrix_multiply_cuda.c <https://gitlab.com/cerl/mamba/-/blob/docs/examples/c/matrix_multiply_cuda.c>`_).
+For now the tiles are not processed in parallel; parallel execution is work in progress.
+
+Source files: `examples/c/matrix_multiply_cuda.c <https://gitlab.com/cerl/mamba/-/blob/docs/examples/c/matrix_multiply_cuda.c>`_, `examples/c/matrix_multiply_cuda_ker.cu <https://gitlab.com/cerl/mamba/-/blob/docs/examples/c/matrix_multiply_cuda_ker.cu>`_, `examples/c/matrix_multiply_cuda.h <https://gitlab.com/cerl/mamba/-/blob/docs/examples/c/matrix_multiply_cuda.h>`_
+
+Usage: 
+
+.. code:: console
+
+   (all args optional): ./matrix_multiply_cuda -v (for verbose mode) -t N (for tile size NxN) -m N (for matrix size NxN) -i (use identity for matrix B)
+
+loop description (C only)
+-------------------------
+
+This example demonstrates the description of a loop using the loop description, followed by PET/ISL based polyhedral analysis of the loop with dependence computation. The loop description, auxiliary analysis information and calculated loop dependencies are output to the terminal.
+
+Source files: `examples/c/loop_description.c <https://gitlab.com/cerl/mamba/-/blob/docs/examples/c/loop_description.c>`_
+
+Usage: ``./loop_description``
+
+report_mem_state (C only)
+-------------------------
+
+This example shows the output of the function ``mmb_dump_memory_state``, which dumps the current state of the memory system, as retained by the MAMBA Memory Manager, to the ``FILE *`` given as parameter.
+
+Source file: `examples/c/report_mem_state.c <https://gitlab.com/cerl/mamba/-/blob/docs/examples/c/report_mem_state.c>`_
+
+Usage: ``./report_mem_state``
+
diff --git a/deps/mamba/docs/sphinx/index.rst b/deps/mamba/docs/sphinx/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..150c079a82389baff757f21edf4e8aaf15b2b90a
--- /dev/null
+++ b/deps/mamba/docs/sphinx/index.rst
@@ -0,0 +1,28 @@
+.. Mamba documentation master file, created by
+   sphinx-quickstart on Wed Nov  3 14:45:02 2021.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Mamba: Managed Abstracted Memory Array documentation
+====================================================
+
+A library-based programming model for C, C++ and Fortran based on Managed Abstract Memory Arrays, aiming to deliver simplified and efficient usage of diverse memory systems to application developers in a performance-portable way. Mamba arrays exploit a unified memory interface to abstract memory from traditional memory devices, accelerators, and storage alike. The library aims to achieve this with an easy-to-use approach that requires minimal code intrusion.
+
+
+.. toctree::
+   :maxdepth: 2
+   :caption: First steps
+
+   quickstart
+   examples
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Architecture
+
+   overview
+   mamba_abstract_memory_model
+   mamba_array
+   mamba_memory_management
+   mamba_resource_manager
+   mamba_library_implementation
diff --git a/deps/mamba/docs/sphinx/make.bat b/deps/mamba/docs/sphinx/make.bat
new file mode 100644
index 0000000000000000000000000000000000000000..153be5e2f6f996741b53558d8bc7825ec091699c
--- /dev/null
+++ b/deps/mamba/docs/sphinx/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.https://www.sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/deps/mamba/docs/sphinx/mamba_abstract_memory_model.rst b/deps/mamba/docs/sphinx/mamba_abstract_memory_model.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ae49dbeb5684c72d419d8d274c40ac2f02cc3347
--- /dev/null
+++ b/deps/mamba/docs/sphinx/mamba_abstract_memory_model.rst
@@ -0,0 +1,110 @@
+.. _mamba abstract memory model:
+
+Mamba Abstract Memory Model
+===========================
+
+Mamba contains a memory management component, the Mamba Resource Manager (MRM), which defines an abstract memory model to represent the available system memory and serves as an abstraction layer and uniform interface to interact with heterogeneous memories. The MRM provides a public API to construct and manage a set of memory spaces in a user application, along with a low-level interface to allocate, access, transport, and free simple memory blocks in a heterogeneous memory hierarchy.
+
+.. _memory model design:
+
+Memory Model Design
+--------------------
+
+The Mamba library is built on the assumption that a single node contains a variety of types of memory with different characteristics. The memory model of Mamba allows heterogeneous memory systems to be described in a uniform manner, to simplify the process of allocating and moving data across such systems. Conceptually, as illustrated in the following figure, Mamba considers all the available memory as a single abstract memory space, consisting of a group of memory sub-spaces each with a set of defined characteristics, from which data may be allocated.
+
+.. figure:: /_static/images/memory_abstraction.png
+   :width: 80%
+   :align: center
+   :alt: A conceptual illustration of memory abstraction in the Mamba library.
+
+          
+This concept is implemented in the Mamba library by five key interacting data structures, illustrated below. Two memory layers exist, each with a size-limited memory space. Each space has a corresponding memory interface, from which an allocation object may be obtained and used within a specific execution context.
+
+.. figure:: /_static/images/key_data_structures.png
+   :width: 80%
+   :align: center
+   :alt: An illustration of the five key data structures that form the memory model in the Mamba library. Two memory layers exist, each with a size-limited memory space. Each space has a corresponding memory interface, from which an allocation object may be obtained and used within a specific execution context.
+
+
+
+Memory Layer
+^^^^^^^^^^^^
+
+A Memory Layer, defined in the Mamba API as ``mmbMemLayer``, represents a particular type of memory with a defined set of characteristics. The memory layers available to a user are defined by the available hardware in the system, and may have characteristics such as high bandwidth or low latency. Such layers may be defined by the user, or discovered automatically where possible during library initialisation.
+
+Memory Space
+^^^^^^^^^^^^
+A Memory Space, defined in the Mamba API as ``mmbMemSpace``, represents a size-limited, addressable, instantiation of space corresponding to a specific memory layer, from which blocks of memory may be allocated. The size may be artificially limited by the user, or hardware-limited. A memory space has an associated execution context, ``mmbExecutionContext``, that determines how memory will be allocated and made available in that space. The execution context provides the user a means to choose the programming model through which memory must be made available. For example, NVIDIA GPU device memory could be provided within a CUDA, HIP, or OpenACC execution context.
+
+Memory Interface
+^^^^^^^^^^^^^^^^
+A Memory Interface, defined in the Mamba API as ``mmbMemInterface``, acts as an interface to a specific memory space, providing a mechanism to allocate, move, copy, and free memory. A memory interface may have a specific strategy, that defines the behaviour of the interface. For example, this may enforce thread safety during allocation, or define the type of memory allocator used (e.g. pooled vs. slab).
+
+Allocation
+^^^^^^^^^^
+An Allocation object, defined in the Mamba API as ``mmbAllocation``, provides an abstract container for a memory allocation in a specific memory space. Allocation objects are provided by memory interfaces, and passed into generic allocation, copy, and free routines. Allocation objects contain, if feasible, a real data pointer, along with provenance information such as the interface through which it was allocated, and whether the underlying data is owned by the allocation object or not. The concept of ownership allows, for example, sub-allocations to be created as slices of existing allocations.
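+
+As an illustration, allocation objects obtained from two different interfaces can be passed to the generic copy routine. A minimal sketch, reusing the ``dram_interface`` and ``gpu_interface`` handles from the programming examples and assuming ``n_bytes`` is defined:
+
+.. code:: C
+
+   mmbAllocation *host_buffer, *gpu_buffer;
+   mmb_allocate(n_bytes, dram_interface, &host_buffer);
+   mmb_allocate(n_bytes, gpu_interface, &gpu_buffer);
+   /* generic copy: source and destination spaces are resolved from the
+      provenance information carried by each allocation object */
+   mmb_copy(gpu_buffer, host_buffer);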
+
+Execution Context
+^^^^^^^^^^^^^^^^^
+An Execution Context, defined in the Mamba API as ``mmbExecutionContext``, provides a mechanism to define the context within which an allocation object will be used during program execution. The execution context captures a variety of hardware execution targets and accompanying programming models to ensure the MRM can create allocation objects that are compatible with the intended use case for the allocation. For example, a variety of memory allocators are available for GPU; the correct choice depends on the type of GPU and the programming model used by the application developer. The execution context allows the developer to specify, for example, to use an OpenCL based memory allocator rather than a CUDA-based allocator for a GPU allocation in order to be compatible with application code written using the OpenCL programming model.
+
+.. _memory model API:
+
+Memory Model API
+----------------
+The Mamba library provides two mechanisms for constructing a topology view of the underlying memory system: topology discovery, where the available memories are discovered during Mamba initialisation via integration with the hwloc library for hardware locality; or topology construction, where the application developer constructs their own view of the memory system via the Mamba API.
+
+Topology Discovery or Construction
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+The discovery feature enables Mamba to automatically discover the available memories on a node via the ``hwloc`` library for hardware locality. This mode is enabled via build configuration argument ``--enable-discovery``; when enabled, discovery will occur during initialisation of the Mamba library.
+
+During discovery, Mamba will store internally a memory topology that includes a list of available memory layers and a default memory space for each layer.
+If discovery is not enabled, the Mamba library relies on the user to provide a view of the available memory system. The user may do this via a series of API calls to register available memory layers. Once a memory layer is registered, a default space is created and the user may further configure or construct spaces. As a convenience, the user may combine these steps to construct a new memory space corresponding to the layer upon registration:
+
+.. code:: C
+
+   mmbError mmb_register_memory(const mmbMemLayer layer,
+                      const mmbExecutionContext ex_context,
+                      const mmbMemSpaceConfig *config_opts, 
+                      mmbMemSpace **new_space);
+
+
+Space Construction
+^^^^^^^^^^^^^^^^^^
+Once the system memory topology has been discovered or constructed, the user may then request a pre-constructed space and further configure it if necessary, or create their own space with custom configuration:
+
+.. code:: c
+
+   mmbError mmb_create_space(const mmbMemLayer layer,
+                            const mmbExecutionContext ex_con,
+                            const mmbMemSpaceConfig *config_opts, 
+                            mmbMemSpace **out_space);
+
+   mmbError mmb_request_space(const mmbMemLayer layer,
+                            const mmbExecutionContext ex_con,
+                            const mmbMemSpaceConfig *config_opts, 
+                            mmbMemSpace **out_space);
+
+   mmbError mmb_memspace_configure(const mmbMemSpaceConfig *config_opts, 
+                            mmbMemSpace *space);
+
+The custom configuration support includes providing default options for any memory interfaces constructed in this space, and configuring a set of sub-domains within the memory space, for example to support a space spanning multiple NUMA regions or multiple devices.
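+
+For instance, a default configuration object can be created and passed when requesting a space. A minimal sketch using the DRAM layer and the default execution context (both shown in the programming examples):
+
+.. code:: C
+
+   mmbMemSpaceConfig *space_config;
+   mmb_memspace_config_create_default(&space_config);
+   mmbMemSpace *dram_space;
+   mmb_request_space(MMB_DRAM, MMB_EXECUTION_CONTEXT_DEFAULT,
+                     space_config, &dram_space);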
+
+Interface Construction
+^^^^^^^^^^^^^^^^^^^^^^
+Once a suitable set of memory spaces has been discovered or constructed, the user may acquire one or more memory interfaces. As shown in the code snippet below, upon provision of a valid memory space and a set of configuration options, the user may construct a new, or request an existing, memory interface.
+
+.. code:: C
+
+   mmbError mmb_create_interface(mmbMemSpace *space,
+                                const mmbMemInterfaceConfig *config_opts,
+                                mmbMemInterface **out_interface);
+
+   mmbError mmb_request_interface(mmbMemSpace *space,
+                                const mmbMemInterfaceConfig *config_opts,
+                                mmbMemInterface **out_interface); 
+
+   mmbError mmb_destroy_interface(mmbMemInterface *interface);
+
+Once a valid memory interface is acquired, it may be used to either construct a Mamba Array or allocate blocks of memory using the resource management API.
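+
+Putting these steps together, a sketch of the request-and-allocate path might look as follows (``MMB_PROVIDER_DEFAULT`` and ``MMB_POOLED`` are the options used in the programming examples; ``dram_space`` and ``n_bytes`` are assumed to be defined):
+
+.. code:: C
+
+   mmbMemInterfaceConfig interface_config =
+     {.provider = MMB_PROVIDER_DEFAULT, .strategy = MMB_POOLED};
+   mmbMemInterface *interface;
+   mmb_request_interface(dram_space, &interface_config, &interface);
+
+   mmbAllocation *buffer;
+   mmb_allocate(n_bytes, interface, &buffer);
+   /* ... use the buffer, e.g. as backing store for a Mamba array ... */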
+
diff --git a/deps/mamba/docs/sphinx/mamba_array.rst b/deps/mamba/docs/sphinx/mamba_array.rst
new file mode 100644
index 0000000000000000000000000000000000000000..05de4091c13e8fe77c5c8dba859fa8d503307127
--- /dev/null
+++ b/deps/mamba/docs/sphinx/mamba_array.rst
@@ -0,0 +1,87 @@
+Mamba Array
+===========
+
+A Mamba array is an array-like data structure that forms the core abstraction of the Mamba library. It is built on top of the abstract memory model, and the underlying data in a Mamba array may be transparently distributed across multiple types of memory. Subsets of the array may be duplicated or moved between memories, either explicitly by the user or implicitly by the Mamba runtime. The remainder of this section describes the process to construct, access, and move Mamba arrays.
+
+.. figure:: /_static/images/array_distribution.png
+   :width: 60%
+   :align: center
+   :alt: A conceptual illustration of Mamba arrays distributed across the different types of memory in an abstract memory space representing a heterogeneous memory system. Array subsets may reside in one or more memory spaces.
+
+   A conceptual illustration of Mamba arrays distributed across the different types of memory in an abstract memory space representing a heterogeneous memory system. Array subsets may reside in one or more memory spaces.
+
+
+Array Construction
+-------------------
+A Mamba array may be constructed with internal allocation, or by wrapping an existing user pointer. The constructor for a Mamba array requires four input arguments.
+
+.. code:: C
+
+   mmbError mmb_array_create(mmbDimensions *in_dims,
+                             mmbLayout *in_layout,
+                             mmbMemInterface *in_interface,
+                             mmbAccessType in_access,
+                             mmbArray **out_mba);
+
+   mmbError mmb_array_create_wrapped(void *in_data,
+                                     mmbDimensions *in_dims,
+                                     mmbLayout *in_layout,
+                                     mmbMemInterface *in_interface,
+                                     mmbAccessType in_access,
+                                     mmbArray **out_mba);
+
+   mmbError mmb_array_destroy(mmbArray *in_mba);
+
+The ``mmbDimensions`` argument, ``in_dims``, specifies the number of dimensions, and the size of each, for the array.
+
+The second argument, ``in_layout``, defines the layout of the array in physical memory, represented in the Mamba API as an ``mmbLayout`` data structure. This object defines the mapping of array indices to physical memory layout, and encapsulates array characteristics such as:   
+
+* Object type (Element/AOS/SOA) and size in bytes
+* Layout order (e.g. row major vs column major ordering)
+* Layout type (e.g. regular, block cyclic, irregular)
+* Array padding (in bytes)
+
+The third argument for Mamba array construction, ``in_interface``, provides the memory interface through which the array should be allocated. This may be obtained through memory registration or discovery.
+
+The fourth argument, ``in_access``, indicates the access type for the array, represented by ``mmbAccessType`` in the Mamba API. This describes how the array is intended to be used and may be:
+
+* MMB_READ
+* MMB_WRITE
+* MMB_READ_WRITE
+
+The access type may influence where the array is placed in a particular memory space, such as using fast read-only memory where applicable, and is further exploited for array tiles.
+
+Finally, a Mamba array may also be constructed wrapped around existing user data, using the constructor ``mmb_array_create_wrapped``. This allows a user to pass in pre-allocated data for management by the Mamba library, using the additional argument ``in_data``.
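+
+As an illustrative sketch, the snippet below wraps an existing buffer in a one-dimensional Mamba array. The layout constructor and padding helper are assumed to mirror the Fortran bindings shown later (``mmb_layout_create_regular_1d``, ``mmb_layout_padding_create_zero``), and ``interface`` is assumed to have been acquired as described in the memory model section.
+
+.. code:: C
+
+   float buffer[128];
+   size_t n = 128;
+   mmbDimensions *dims = NULL;
+   mmbLayout *layout = NULL;
+   mmbArray *mba = NULL;
+
+   mmb_dimensions_create_fill(1, &n, &dims);
+
+   /* Assumed C analogue of the Fortran layout constructor:
+      element size in bytes, plus a zero-padding descriptor. */
+   mmb_layout_create_regular_1d(sizeof(float),
+                                mmb_layout_padding_create_zero(),
+                                &layout);
+
+   mmb_array_create_wrapped(buffer, dims, layout, interface,
+                            MMB_READ_WRITE, &mba);
+
+   /* ... use the array ... */
+   mmb_array_destroy(mba);
+   mmb_dimensions_destroy(dims);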
+
+Array Tiling
+------------
+
+In the Mamba library, arrays may be decomposed into subsets for iteration or movement between memory spaces. These subsets are known as array tiles, and represented by the ``mmbArrayTile`` data structure. The process of decomposing an array into these subsets is known as tiling an array, and the set of array tiles that constitute the full array is known as an array tiling.
+
+The following code snippet shows the API provided to tile, and untile, a Mamba array. The only argument required to tile an array, beyond the array itself, is a dimensions object, ``in_dims``, which defines the size of a single tile.
+
+.. code:: C
+
+   mmbError mmb_array_tile(mmbArray *in_mba, mmbDimensions *in_dims); 
+   mmbError mmb_array_untile(mmbArray *in_mba);
+
+The full array will then be decomposed into tiles with these dimensions by the Mamba runtime. As indicated by the API, a tiling is an action applied to an array object, and only a single tiling may be active on an array at any one time. Repeated calls to ``mmb_array_tile`` with new dimensions will result in removal of any previous tiling applied to the provided array. 
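+
+As a short sketch, the following decomposes an array ``mba`` into 256x256 tiles, assuming the dimensions object may be released once the tiling has been applied:
+
+.. code:: C
+
+   size_t td[2] = {256, 256};
+   mmbDimensions *tile_dims = NULL;
+
+   mmb_dimensions_create_fill(2, td, &tile_dims);
+   mmb_array_tile(mba, tile_dims);        /* decompose 'mba' into 256x256 tiles */
+   mmb_dimensions_destroy(tile_dims);     /* assumed safe once the tiling is applied */
+
+   /* ... operate on the tiles ... */
+   mmb_array_untile(mba);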
+
+.. figure:: /_static/images/array_tiling.png
+   :width: 80%
+   :align: center
+   :alt: An illustration of the concept of array tiling in one and two dimensions; each tile may reside in one or more memory spaces.
+
+   An illustration of the concept of array tiling in one and two dimensions; each tile may reside in one or more memory spaces.
+
+Array tiles may be accessed either directly, by requesting a tile by index in the array tiling, or indirectly, by requesting an iterator over the set of array tiles. The code snippet below shows the API for direct access by index. N-dimensional tiles may be accessed via the generic ``mmb_tile_at``, whilst tiles of a specific dimension may be accessed more conveniently with per-dimension index arguments.
+
+.. code:: C
+
+   mmbError mmb_tile_at(mmbArray *in_mba, mmbIndex *in_idx, 
+                      mmbArrayTile **out_tile);
+   // ...
+   mmbError mmb_tile_at_2d(mmbArray *in_mba, size_t in_idx0, size_t in_idx1, 
+                      mmbArrayTile **out_tile);
+   // ...
+
+
+Once a tile is acquired, it may be used to access the underlying array data. There are two mechanisms to do this, direct access via pointer or assisted indexing via convenience macros defined by the Mamba library. Direct access allows the user to retrieve a raw pointer from the array tile, and index it directly using indexing bounds stored in the tile structure. Alternatively, one may use the indexing macros provided by Mamba to simplify either just the index expression, or the full pointer access expression for common cases and/or more complicated layouts such as block cyclic.
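+
+A sketch of the direct-access mechanism is shown below; the tile fields used (``ptr``, ``lower``, ``upper``, ``dim``) are assumptions based on the Fortran mirror of ``mmbArrayTile`` shown later in this documentation, and the bound semantics are illustrative:
+
+.. code:: C
+
+   mmbArrayTile *tile = NULL;
+   mmb_tile_at_2d(mba, 0, 0, &tile);
+
+   /* Index the raw pointer using the bounds stored in the tile;
+      field names are assumptions, not the verbatim C structure. */
+   float *p = (float *)tile->ptr;
+   for (size_t i = tile->lower[0]; i < tile->upper[0]; ++i)
+      for (size_t j = tile->lower[1]; j < tile->upper[1]; ++j)
+         p[i * tile->dim[1] + j] = 0.0f;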
diff --git a/deps/mamba/docs/sphinx/mamba_library_implementation.rst b/deps/mamba/docs/sphinx/mamba_library_implementation.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f7b9b9e6d74ef597c8041007fbff60d934abaf8b
--- /dev/null
+++ b/deps/mamba/docs/sphinx/mamba_library_implementation.rst
@@ -0,0 +1,374 @@
+Library Implementation
+======================
+
+Here we briefly describe the structure and build process of the library prototype, the common utilities, along with existing and proposed language support.
+
+Structure and Build Process
+---------------------------
+
+The figure below illustrates the directory structure of the Mamba library. At the top level, build files and a ``README`` file are provided, whilst most of the Mamba implementation is in the ``common`` and ``memory`` directories. Language-specific files for Fortran and C++ are stored in the ``fortran`` and ``cpp`` directories. Dependent libraries for loop analysis, stored in the ``loopanalyzer`` subdirectory, are imported as git submodules via ``autogen.sh`` if required.
+
+
+.. figure:: /_static/images/Mamba-structure.png
+   :width: 80%
+   :align: center
+   :alt: An illustration of the directory structure of the Mamba library.
+
+   An illustration of the directory structure of the Mamba library.
+
+The Mamba library exploits the ``autotools`` build system for compilation; each of the code subdirectories is built as a convenience library, and these are combined to form ``libmamba``. 
+
+.. code:: bash
+
+   mkdir build;
+   cd build;
+   # Only required if you want to use the --with-loop-analysis configure arg.
+   # If not, you can use 'autoreconf -i' instead.
+   ../autogen.sh
+   ../configure [--prefix=/p/a/t/h]          \
+                [--with-sicm=/p/a/t/h]       \
+                [--with-umpire=/p/a/t/h]     \
+                [--with-jemalloc=/p/a/t/h]   \
+                [--with-jemalloc-prefix=<prefix>] \
+                [--with-memkind=/p/a/t/h]    \
+                [--with-fortran]             \
+                [--with-cpp]                 \
+                [--with-loop-analysis]       \
+                [--enable-embedded]          \
+                [--enable-cuda[=yes|no|<arch>]] \
+                [--enable-hip-rocm[=yes|no]] \
+                [--enable-opencl[=yes|no]]   \
+                [--with-opencl=/p/a/t/h]     \
+                [--enable-discovery[=yes|no|default]];
+
+   make;
+   make check-tests;
+   make check-examples;
+   make install;   # optional
+
+Common Utilities
+----------------
+
+Library Initialisation
+^^^^^^^^^^^^^^^^^^^^^^
+
+The Mamba library must be initialised before first use, and de-initialised after final use. To this end, the library includes functions for initialisation, finalisation, and checking the state of the library, in a similar way to the MPI library.
+
+.. code:: C
+
+   mmbError mmb_init(mmbOptions *in_opts);
+   mmbError mmb_is_initialized(int *initialized); 
+   const char * mmb_version();
+   mmbError mmb_finalize();
+
+
+The ``mmbOptions`` structure allows the user to provide initialisation options for the Mamba library, and may be constructed and modified as shown in the code snippet below. The user may also provide ``MMB_INIT_DEFAULT`` as the argument to ``mmb_init`` for default initialisation.
+
+.. code:: C
+
+   mmbError mmb_options_create_default(mmbOptions **opts);
+   mmbError mmb_options_destroy(mmbOptions *opts);
+   mmbError mmb_options_set_debug_level(mmbOptions *opts, int level);
+   mmbError mmb_options_set_user_log_func(mmbOptions *opts, mmbUserLogFunc user_log_func); 
+   // ...
+
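+A typical initialisation sequence might then look as follows; this is a sketch in which the debug level value is illustrative, and the options object is assumed to be safe to destroy once ``mmb_init`` has returned:
+
+.. code:: C
+
+   mmbOptions *opts = NULL;
+
+   mmb_options_create_default(&opts);
+   mmb_options_set_debug_level(opts, 2);   /* illustrative level */
+
+   mmb_init(opts);
+   mmb_options_destroy(opts);              /* assumed safe after init */
+
+   /* ... use the library ... */
+
+   mmb_finalize();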
+
+Convenience Data Structures
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Mamba includes various convenience data structures for interacting with the library. Commonly used in the public C API are two wrapper structures for a simple resizeable array, used to store either a list of dimensions or a list of indices:
+
+* mmbDimensions
+* mmbIndex
+
+An excerpt of the API available for this type of data structure is as follows:
+
+.. code:: C
+
+   mmbError mmb_dimensions_create(const size_t ndim, mmbDimensions **out_dims);
+   mmbError mmb_dimensions_set(mmbDimensions *in_dims, size_t dimN, ...);
+   mmbError mmb_dimensions_create_fill(const size_t ndim, const size_t* values, mmbDimensions **out_dims); 
+   mmbError mmb_dimensions_copy(mmbDimensions *dst, const mmbDimensions *src);
+   mmbError mmb_dimensions_copy_buffer(size_t *dst, const mmbDimensions *src);
+   mmbError mmb_dimensions_fill(mmbDimensions *dst, const size_t *src, const size_t nelt);
+   mmbError mmb_dimensions_resize(mmbDimensions *in_dims, const size_t ndim);
+   mmbError mmb_dimensions_destroy(mmbDimensions *in_dims);
+   int mmb_dimensions_cmp(const mmbDimensions *d1, const mmbDimensions *d2);
+   mmbError mmb_dimensions_prod(const mmbDimensions *in_dims, size_t *out_prod);
+   mmbError mmb_dimensions_get_size(const mmbDimensions *in_dims, size_t *size);
+   mmbError mmb_dimensions_get_sizeof(const mmbDimensions *in_dims, size_t *size);
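+
+For instance, a sketch building a two-dimensional dimensions object and querying the element count; the variadic arguments of ``mmb_dimensions_set`` are assumed to be the per-dimension sizes:
+
+.. code:: C
+
+   mmbDimensions *dims = NULL;
+   size_t nelems = 0;
+
+   mmb_dimensions_create(2, &dims);
+   mmb_dimensions_set(dims, 2, (size_t)1024, (size_t)512);
+   mmb_dimensions_prod(dims, &nelems);   /* nelems == 1024 * 512 */
+   mmb_dimensions_destroy(dims);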
+
+Logging
+^^^^^^^
+
+Mamba includes a logging infrastructure (``./common/mmb_logging.h/c``), used internally and available for users to exploit. The maximum logging verbosity is defined with a compile-time switch, and reduced verbosity may be requested by setting the environment variable ``MMB_LOG_LEVEL`` to one of the following values:
+
+.. code:: C
+
+   /** log level for errors */
+   #define MMB_LOG_ERR 0
+   /** log level for warnings */
+   #define MMB_LOG_WARN 1
+   /** log level for informational messages */ 
+   #define MMB_LOG_INFO 2
+   /** log level for debugging messages */ 
+   #define MMB_LOG_DEBUG 3
+   /** log level for really chatty logging */ 
+   #define MMB_LOG_NOISE 4
+
+   /** Usage in Mamba, also available for use in user codes */ 
+   /** Error messages */
+   MMB_ERR(format, ...)
+   /** warning messages */
+   MMB_WARN(format, ...)
+   /** informational messages */
+   MMB_INFO(format, ...)
+   /** debug messages */
+   MMB_DEBUG(format, ...)
+   /** chatty messages */
+   MMB_NOISE(format, ...)
+   
+   /** example usage */
+   ERR("Error with integer value: %d\n", errno)
+   /** example output in format:
+       [error type] (process id): function(filename:line) error message */
+   [E] (12083): main(example.c:25) Error with integer value: 1
+
+The Mamba logging infrastructure outputs to ``stderr`` by default; however, it may also be initialised with a user-provided logging function to override this behaviour, as seen in e.g. the user-defined logging example code.
+
+Error Handling 
+^^^^^^^^^^^^^^
+
+Error handling is managed via the datatype ``mmbError``, used as the return type for almost all Mamba API functions, and demonstrated in the code snippet below. As demonstrated, ``mmb_error_description`` may be used to acquire a ``const char*`` string description of the error, which should not be ``free``'d by the user.
+
+.. code:: C
+
+   mmbError mmb_example_function_wrapper(void) {
+
+      mmbError err = mmb_example_function(arg1, arg2);
+      if (err != MMB_OK) {
+          MMB_ERR("Failed to run the example function: %s\n", mmb_error_description(err));
+          goto BAILOUT;
+      }
+      /** ... */
+
+   BAILOUT:
+      return err;
+   }
+
+
+Language Support
+----------------
+
+The majority of the library is written in C, based on the C11 language standard. As evidenced throughout the code listings in this document, we adhere to a programming style where:
+
+* Public API functions are prefixed ``mmb_``
+* Functions return an ``mmbError`` type for error handling.
+* Function names are written in underscore case.
+* Typenames are written in camel case.
+* Constructors are formatted ``mmb_(object_name)_create``, with an optional further configuration postfix (e.g. ``_2d``) for overloading constructors.
+* Destructors are formatted ``mmb_(object_name)_destroy``
+* Object memory allocation/free routines are similarly formatted ``mmb_(object_name)_alloc``, ``mmb_(object_name)_free``.
+* Commenting in doxygen format is encouraged
+
+In this section we detail the Fortran and C++ specific interfaces to Mamba.
+
+Fortran 
+^^^^^^^
+
+The Fortran interface is designed to mirror the C API and in most cases uses identically-named types to the C API for declaration of opaque objects that are passed to Mamba API procedures. The Fortran interface makes use of the interoperability features introduced in the Fortran 2008 revision of the Fortran standard. In line with the expectation for Fortran, the Fortran API uses 1-based indexing for array and tile index spaces.
+
+The Fortran interface requires the use of the ``mamba`` module to provide the definitions of the API subprograms and support types. The module is used as follows:
+
+.. code:: fortran
+
+   use mamba
+
+In addition to required types, a set of kind values are defined for integer declarations: ``mmbErrorKind`` for the return error value, ``mmbSizeKind`` for memory counts and ``mmbIndexKind`` for quantities that relate to array indices.
+
+The Fortran interface uses subroutines instead of functions, with the last argument used to return an error code; this code will be ``MMB_OK``, ``MMB_ERROR``, or one of the many more specific error values. Returning the error code is optional, as shown in the finalize call below.
+
+.. code:: fortran
+
+   use mamba
+   integer(mmbErrorKind) err
+   call mmb_init(err)
+   call mmb_finalize()
+
+Some subroutines accept a missing optional argument where the equivalent C API function would accept a NULL. This means that keyword arguments may be needed to disambiguate arguments. For example, in the code fragment below the error argument is optional in the first call, but the chunks argument is optional in the second call (in which case the whole array is tiled).
+
+.. code:: fortran
+
+   integer(mmbErrorKind) :: err
+   call mmb_array_tile(mamba_array_a, chunks)
+   call mmb_array_tile(mamba_array_b, err=err)
+
+In cases where the Mamba C API provides a mechanism to allocate and deallocate objects which are passed around the API but whose components are not accessed directly, the Fortran interface uses an opaque type. The opaque types are ``mmbMemSpace``, ``mmbMemInterface``, ``mmbLayout``, ``mmbLayoutPadding``, ``mmbArray``, ``mmbTiling``, ``mmbTileIterator`` and ``mmbIndex``. Variables of these types can be declared as shown in the example code below:
+
+.. code:: fortran
+
+   use mamba
+   type(mmbArray) :: mamba_array
+   type(mmbLayout) layout
+
+New derived types are provided for objects where direct access to the components is needed; this is the case, for example, for the ``mmbArrayTile`` type, which provides (in part) corresponding elements to the ``mmbArrayTile`` structure in the C API. The ``mmbArrayTile`` type is defined as follows:
+
+.. code:: fortran
+
+   TYPE :: mmbArrayTile
+      TYPE(C_mmbArrayTile) :: c_tile ! The C tile structure
+      INTEGER :: rank=0
+      TYPE(C_PTR) :: ptr
+      INTEGER(mmbIndexKind), allocatable :: lower(:)
+      INTEGER(mmbIndexKind), allocatable :: upper(:)
+      INTEGER(mmbIndexKind), allocatable :: alower(:)
+      INTEGER(mmbIndexKind), allocatable :: aupper(:)
+      INTEGER(mmbIndexKind), allocatable :: dim(:)
+      INTEGER(mmbIndexKind), allocatable :: abs_dim(:)
+      LOGICAL :: is_contiguous
+   END TYPE mmbArrayTile
+
+The components ``lower``, ``upper``, ``alower``, ``aupper``, ``dim`` and ``abs_dim`` are directly equivalent to those of the ArrayTile object in the C API, but the lower and upper bounds are 1-based rather than 0-based. The ``rank`` component holds the rank (number of dimensions) of the tile, and the ``is_contiguous`` component is true if the tile is contiguous in memory. The other components are used internally by the API.
+
+Various structures and preprocessor macros are used in the C API to pass configuration options into the API functions, notably for memory registration and requests for spaces and interfaces. The Fortran API defines types with the same names, for example ``mmbMemSpaceConfig``, ``mmbSizeConfig``, ``mmbMemInterfaceConfig``, ``mmbProviderOptions``, ``MMB_MEMINTERFACE_CONFIG_DEFAULT``, and ``MMB_MEMSPACE_CONFIG_DEFAULT``. These types can be used to construct the relevant constants for initialization.
+
+.. code:: fortran
+
+   use mamba
+   integer(mmbErrorKind) err 
+   type(mmbMemSpace) dram_space 
+   type(mmbMemSpaceConfig) dram_config 
+   type(mmbMemInterface) dram_interface 
+   type(mmbArray) :: mba0,mba1
+
+   call mmb_init(err=err)
+   dram_config = mmbMemSpaceConfig(mmbSizeConfig(MMB_SIZE_SET,.false.,8000),&
+                                MMB_MEMINTERFACE_CONFIG_DEFAULT) 
+   call mmb_request_space(MMB_DRAM, MMB_EXECUTION_CONTEXT_DEFAULT, &
+                        new_space=dram_space, err=err )
+   call mmb_request_interface(dram_space, new_interface=dram_interface, err=err)
+
+
+The ``mmbSizeConfig`` arguments specify the action, a logical indicating whether this is a NUMA configuration, and the size requested for the space.
+
+An ``mmbDimensions`` object is not used in the Fortran API as a Fortran array can be used directly. All API functions that accept a Dimensions object will accept an array instead.
+
+.. code:: fortran
+
+   use mamba
+   type(mmbArray) :: mamba_array
+   integer(mmbIndexKind), dimension(2) :: dims,chunks 
+   integer(mmbSizeKind), allocatable, dimension(:) :: tiling_dims 
+   ! ...
+   dims = [array_size_M, array_size_N]
+   call mmb_array_tile(mamba_array, dims, err)
+   ! enquire about tiling dimensions
+   allocate( tiling_dims(2) )
+   call mmb_tiling_dimensions(mamba_array, tiling_dims, err)
+
+The iterator routines ``mmb_tile_iterator_first`` and ``mmb_tile_iterator_next`` have an additional argument that returns the relevant tile of type ``mmbArrayTile``. (The C API allows direct access to the tile from the iterator.) The tile accessor routines ``mmb_tile_at``, ``mmb_tile_at_2d``, etc. also return an ``mmbArrayTile`` object.
+
+In contrast to the C API the Fortran interface associates a Fortran pointer with the tile data using the subroutine ``mmb_tile_get_pointer``. The following code snippet illustrates obtaining the pointer from the tile and then using it to set the data covered by the tile:
+
+.. code:: fortran
+
+   real, pointer, dimension(:,:) :: tp
+   call mmb_tile_get_pointer(tile, tp) 
+   tp(tile%lower(1):tile%upper(1),tile%lower(2):tile%upper(2))=1.0
+
+Note that the returned pointer may actually point to a larger space than that which should be accessed via the lower/upper bounds.
+
+The Fortran interface also supports access to tile elements via preprocessor macros. This is not a standard paradigm for Fortran, but it is implemented in case there are distributions that cannot be dealt with using a Fortran pointer. To use the index macros, the program must include the file ``mambaf.h`` in addition to using the ``mamba`` module. This makes the indexing macros available, and they can be used as follows:
+
+.. code:: fortran
+
+   use mamba
+   #include "mambaf.h"
+
+   ! tiles tile_a, tile_b, tile_c defined above
+   do ei = tile_c%lower(1),tile_c%upper(1) 
+    do ej = tile_c%lower(2),tile_c%upper(2)
+     do ek = tile_a%lower(2),tile_a%upper(2)
+       MMB_IDX_2D(tile_c,tp_c,ei,ej) = &
+       MMB_IDX_2D(tile_c,tp_c,ei,ej) + &
+       MMB_IDX_2D(tile_a,tp_a,ei,ek) * MMB_IDX_2D(tile_b,tp_b,ek,ej)
+     end do 
+    end do
+   end do
+
+Note that the macro accepts as arguments the array tile, the tile pointer, and then the list of indices. The additional macros ``MMB_IDX_1D`` and ``MMB_IDX_2D_NORM`` have the same meaning as in the C interface.
+
+The following code shows part of one of the Mamba Fortran example programs, with comments removed. The example allocates a buffer, registers it with Mamba, and then creates a layout and a Mamba array.
+
+.. code:: fortran
+
+   program array_copy_wrapped_1d
+     use mamba
+     implicit none
+     integer, parameter :: M=128
+     integer(mmbErrorKind) err
+     integer(mmbSizeKind) ntiles
+     integer(mmbSizeKind), allocatable, dimension(:) :: tile_dims 
+     type(mmbMemSpace) dram_space
+     type(mmbMemSpaceConfig) dram_config
+     type(mmbMemInterface) dram_interface
+     type(mmbArray) :: mba0
+     integer(mmbIndexKind) :: dims(1)
+     type(mmbLayout) layout
+     type(mmbTileIterator) it
+     type(mmbArrayTile) tile
+     real, dimension(:), allocatable, target :: buffer0
+     real, dimension(:), pointer :: tp
+     integer :: i,itile,chunksize
+
+     allocate( buffer0(M) )
+     dims(1) = M
+
+     call mmb_init(err)
+     dram_config = mmbMemSpaceConfig(mmbSizeConfig(MMB_SIZE_SET,.false.,8000),&
+                               MMB_MEMINTERFACE_CONFIG_DEFAULT)
+     call mmb_register_memory(MMB_DRAM, MMB_EXECUTION_CONTEXT_DEFAULT, &
+                            dram_config,err=err)
+     call mmb_request_space(MMB_DRAM, MMB_EXECUTION_CONTEXT_DEFAULT, &
+                          new_space=dram_space, err=err )
+     call mmb_request_interface(dram_space, new_interface=dram_interface, err=err)
+     call mmb_layout_create_regular_1d(int(storage_size(buffer0)/8,mmbSizeKind),&
+                        mmb_layout_padding_create_zero(), &
+                        layout,err)
+     call mmb_array_create_wrapped(buffer0, dims, layout, &
+                                  dram_interface, MMB_READ_WRITE, mba0, err)
+     call mmb_array_tile(mba0, dims, err)
+     call mmb_tile_iterator_create(mba0, it, err)
+     call mmb_tile_iterator_first(it, tile, err)
+     ! associate the Fortran pointer with the tile data before use
+     call mmb_tile_get_pointer(tile, tp)
+
+     block
+       asynchronous :: buffer0
+       do i=1,M
+         tp(i) = i*0.001 + 3.0
+       end do
+     end block
+
+     call mmb_finalize(err)
+
+   end program array_copy_wrapped_1d
+
+A Mamba iterator is used to get the first tile in the array; its location in the larger array can be determined from elements of the ``mmbArrayTile`` type. This type also provides a pointer to the data (in this example, we illustrate the initial prototype, where this is the full array).
+
+Fortran examples are provided with the Mamba distribution and are a useful resource illustrating the features of the API in a practical context.
+
+C++ Interface
+^^^^^^^^^^^^^
+
+The current C++ interface is a C++ wrapper of the C interface; due to the similarity of the languages this is relatively simple to provide, and as such the C interface design is sufficient to describe current interactions in C++.
+
+Work is on-going to extend this interface to use advanced features of C++ to provide a cleaner interface more suited to C++ programming. This includes:
+
+* Replacing function variants with overloaded functions (e.g. ``_nd`` function suffixes).
+* Wrapping the C Mamba array in a C++ Mamba array template class, allowing users to specify the underlying type of the data on construction.
+* Wrapping the C Mamba array tile object in a C++ Mamba array tile template class, allowing users to avoid specifying type during tile access.
+* General usage of C++ syntactic sugar to ensure the Mamba API is natural to use for C++ programmers.
+
+In the final stages of the project, we will also explore other potential improvements to Mamba where possible, such as understanding the effect of overloaded array operators for tile indexing (while maintaining the possibility of compiler vectorisation of the array accesses), and investigate the applicability of more modern C++ language features, for example storing loop kernels as lambdas in tile iterators. This is not a major focus of the project, as many such features are being explored in C++-centric libraries such as Kokkos.
+
+Examples and Tests
+------------------
+
+A set of examples is included with the library, demonstrating various features of the Mamba library for users. They are stored in the ``examples/`` subfolder, separated by language, and automatically built and installed during ``make install``. Each example is documented in the library ``README``, and can either be run individually from the installed directory, or all examples may be run sequentially at build time as a secondary set of tests via ``make check-examples``.
+
+A set of unit tests is included in the ``tests/`` subfolder, targeting the C-based library implementation code. These may be run at build time via ``make check``.
diff --git a/deps/mamba/docs/sphinx/mamba_memory_management.rst b/deps/mamba/docs/sphinx/mamba_memory_management.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2f87b2be2bd8b7d45ad9c64fee30d2027d4bde64
--- /dev/null
+++ b/deps/mamba/docs/sphinx/mamba_memory_management.rst
@@ -0,0 +1,166 @@
+Managing Memory With Mamba
+==========================
+
+There are three levels of granularity at which memory may be managed in the Mamba library: arrays, array tiles, and allocations. Each is described in the following subsections, followed by descriptions of the asynchronous APIs and tile iterators.
+
+Array-based Memory Management
+-----------------------------
+The Mamba API allows users to construct an array in any of the memory spaces available to the library, by passing an appropriate memory interface during construction.
+
+Once constructed, a full array may be transported between memory spaces using one of four methods:
+
+* Array copy
+* Array duplication 
+* Array merging
+* Array migration
+
+Full array transport is intended as a convenience for use with non-tiled arrays, and exploits the tiling infrastructure for transport. The semantics of copy, duplication, merging, and migration match those explained in full detail in the following :ref:`section <tile-based memory management>`. For a tiled array, a user should use the tile-granularity management.
+
+.. code:: C
+
+   mmbError mmb_array_copy(mmbArray *dst, mmbArray *src);
+   
+   mmbError mmb_array_migrate(mmbArray *in_array, mmbMemInterface *in_interface,
+                            mmbAccessType in_access);
+   
+   mmbError mmb_array_duplicate(mmbArray *in_array, mmbMemInterface *in_interface,
+                            mmbAccessType in_access, mmbArray **out_array); 
+            
+   mmbError mmb_array_merge(mmbArray *in_array, mmbMergeType in_merge);
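+
+For instance, a sketch duplicating an array into a GPU-backed interface and merging the result back; ``gpu_interface`` is assumed to have been acquired as in the memory model section, and the merge-type constant is illustrative:
+
+.. code:: C
+
+   mmbArray *gpu_mba = NULL;
+
+   mmb_array_duplicate(mba, gpu_interface, MMB_READ_WRITE, &gpu_mba);
+
+   /* ... compute on the duplicate ... */
+
+   mmb_array_merge(gpu_mba, MMB_MERGE_OVERWRITE);   /* illustrative constant */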
+
+.. _tile-based memory management:
+
+Tile-based Memory Management
+----------------------------
+
+Array tiles are the mechanism by which arrays may be segmented and transported between physical memory tiers. The Mamba library provides features for four types of transport for tile-based movement:
+
+* Array tile copy
+* Array tile duplication
+* Array tile merging
+* Array tile migration
+
+Array tile copy provides a means of copying the contents of a source tile to a destination tile in another (or the same) memory space. It is assumed the source and destination tiles both exist and are of the same dimensions and layout, and that the source data will overwrite any existing data in the destination tile.
+
+Array tile duplication provides a means of creating a duplicate of an array tile in another (or even the same) memory space. A new block of memory is allocated (or taken from an internal cache) to store the new tile, and data is copied from the original tile. A reference to the original tile is maintained; however, this tile is not typically accessible to the user. The user may maintain their own reference to the original tile if required; however, Mamba will not be aware if the user writes to the original tile, so the user must explicitly maintain coherence between multiple tile references.
+
+Array tile merging provides a means of merging a duplicated tile back to the original tile from which it was duplicated. A strategy, ``mmbMergeStrategy``, defines how this merge will take place. In the prototype library implementation, the only strategy supported is to overwrite the original data. Other strategies are envisioned such as typical reduction operators or the provision of a custom user function to merge array indices.
+
+.. figure:: /_static/images/duplicate_merge.png
+   :width: 80%
+   :align: center
+   :alt: An illustration of the difference between duplicating and migrating array tiles. In the case of duplication, a copy of the original tile still exists, whereas in the case of migration the original tile is discarded.
+
+   An illustration of the difference between duplicating and migrating array tiles. In the case of duplication, a copy of the original tile still exists, whereas in the case of migration the original tile is discarded.
+
+Array tile migration provides a means of migrating an array tile to a different memory space. A new block of memory is allocated (or taken from an internal cache) to store the new tile, and data is copied from the original tile. The new tile replaces the original tile, and the original tile reference is dropped, releasing associated memory where possible.
+
+For duplication and migration, a target memory space is required, along with an access type and layout for the new tile. The access type may be used for optimised memory placement of the new tile, or to optimise the data movement requirements.
+
+.. code:: C
+
+   mmbError mmb_tile_copy(mmbArrayTile *dst_tile, mmbArrayTile *src_tile);
+
+   mmbError mmb_tile_duplicate(mmbArrayTile *in_tile, mmbMemSpace *in_space, 
+                            mmbAccessType in_access, mmbLayout *in_layout,
+                            mmbArrayTile **out_tile);
+   
+   mmbError mmb_tile_migrate(mmbArrayTile *in_tile, mmbMemSpace *in_space,
+                           mmbAccessType in_access, mmbLayout *in_layout); 
+                           
+   mmbError mmb_tile_merge(mmbArrayTile *in_tile, mmbMergeType in_merge);
+
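+The tile-level calls compose in the same way; the sketch below duplicates a tile into a GPU memory space and merges it back, where ``gpu_space``, ``layout`` and the merge-type constant are illustrative assumptions:
+
+.. code:: C
+
+   mmbArrayTile *gpu_tile = NULL;
+
+   mmb_tile_duplicate(tile, gpu_space, MMB_READ_WRITE, layout, &gpu_tile);
+
+   /* ... write to the duplicate ... */
+
+   mmb_tile_merge(gpu_tile, MMB_MERGE_OVERWRITE);   /* illustrative constant */
+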
+By default, tile metadata is stored in the ``MMB_DRAM`` memory layer with the ``MMB_CPU`` execution context. However, once a tile has been moved to another memory space with an alternate execution context, the metadata is also made available in the native execution context of the memory space in which the tile is located. The API to acquire an execution context local handle is as follows:
+
+.. code:: C
+
+   mmbError mmb_tile_get_space_local_handle(mmbArrayTile *in_tile,
+                                         mmbArrayTile **out_tile);
+
+
+Allocation-based Memory Management
+----------------------------------
+
+Mamba provides an API for low-level memory management at the granularity of an allocation object, which represents a single block of contiguous memory in a specific memory space. This API is based on the data structures detailed in :ref:`Memory Model Design <memory model design>`. Once an appropriate memory interface has been acquired, via the API detailed in :ref:`Memory Model API <memory model API>`, blocks of memory may be allocated via the API shown in the code snippet below.
+
+The allocation API allows users to:
+
+* Allocate and free 1D contiguous blocks of memory.
+* Optionally, allocate with additional options for the provided interface.
+* Copy data from a source allocation to a destination allocation.
+* Copy N-dimensional sub-buffers between allocations, where supported by the underlying memory provider.
+
+.. code:: C
+
+   mmbError mmb_allocate(const size_t n_bytes, mmbMemInterface *interface, 
+                      mmbAllocation **out_allocation);
+
+   mmbError mmb_allocate_opts(const size_t n_bytes, 
+                            mmbMemInterface *interface,
+                            const mmbAllocateOptions *opts,
+                            mmbAllocation **out_allocation);
+
+   mmbError mmb_free(mmbAllocation *allocation);
+
+   mmbError mmb_copy(mmbAllocation *dest, mmbAllocation *src);
+
+   mmbError mmb_copy_1d(mmbAllocation *dst, const size_t doffset, 
+                      mmbAllocation *src, const size_t soffset,
+                      const size_t width);
+
+   mmbError mmb_copy_2d(mmbAllocation *dst, const size_t dxoffset, const size_t dyoffset, 
+                      const size_t dxpitch,
+                      mmbAllocation *src, const size_t sxoffset, const size_t syoffset, 
+                      const size_t spitch,
+                      const size_t width, const size_t height);
+
+   mmbError mmb_copy_nd(mmbAllocation *dst, const size_t *doffset, const size_t *dpitch, 
+                      mmbAllocation *src, const size_t *soffset, const size_t *spitch,
+                      const size_t ndims, const size_t *dims);
+
+Additional API is provided to extract a native pointer from the allocation object for the appropriate execution context, and options may be provided to specify non-default allocation behaviour from specific interfaces.
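+
+A sketch of the basic allocate/copy/free cycle follows; ``cpu_interface`` and ``gpu_interface`` are assumed to have been acquired as in earlier sections:
+
+.. code:: C
+
+   mmbAllocation *src = NULL, *dst = NULL;
+
+   mmb_allocate(1024, cpu_interface, &src);
+   mmb_allocate(1024, gpu_interface, &dst);
+
+   /* Copy the full 1 KiB block between the two allocations. */
+   mmb_copy(dst, src);
+
+   mmb_free(src);
+   mmb_free(dst);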
+
+Asynchronous API
+----------------
+The Mamba design includes asynchronous versions of many of the available memory management functions, denoted by the postfix ``_async``. Two additional arguments are required for asynchronous functions: a Queue object and a Request object.
+
+A Mamba Queue, ``mmbQueue``, represents a FIFO execution queue for Mamba data management function calls. Depending on the source and destination memory spaces, the queue may be implemented via threading, or vendor queue implementations (for example, CUDA streams for NVIDIA GPUs). In the default case, Mamba will construct an appropriate queue, however provision of options during construction allow the user to submit existing provider-specific queue structures for use by the underlying implementation, and additional API allows the user to further extract such structures if required.
+
+A Mamba Request, ``mmbRequest``, provides a handle to an asynchronous function call, and may be used to test the status or wait for the completion of such a call. The asynchronous versions of a function rely on support from the underlying memory provider, and so are not available for every combination of memory space, execution context, and memory provider. Where asynchronicity is not supported, the Mamba library will revert to the supported blocking implementation.
+
+.. code:: C
+
+   // Array-granularity
+   mmbError mmb_array_copy_async(mmbArray *dst, mmbArray *src, mmbQueue *q, mmbRequest *req);
+   
+   // Tile-granularity
+   mmbError mmb_tile_duplicate_async(mmbArrayTile *in_tile, mmbMemSpace *in_space, 
+                              mmbAccessType in_access, mmbLayout *in_layout,
+                              mmbQueue *q, mmbRequest *req, mmbArrayTile **out_tile);
+   
+   // Allocation-granularity
+   mmbError mmb_copy_async(mmbAllocation *dest, mmbAllocation *src, mmbQueue *q, mmbRequest *req);
+   
+   // Synchronisation
+   mmbError mmb_request_is_complete(mmbRequest *req, bool *complete);
+   mmbError mmb_wait(mmbRequest *req);
+   mmbError mmb_wait_all(size_t num_requests, mmbRequest *reqs, mmbError *errors);
+   
+   // Queue construction/destruction
+   mmbError mmb_queue_create(mmbQueueOptions *opts, mmbQueue **q); 
+   mmbError mmb_queue_destroy(mmbQueueOptions *opts, mmbQueue **q);
+
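+A sketch of an asynchronous copy follows, assuming a caller-owned request handle and that a ``NULL`` options argument selects a default queue:
+
+.. code:: C
+
+   mmbQueue *queue = NULL;
+   mmbRequest req;                     /* caller-owned handle is assumed */
+
+   mmb_queue_create(NULL, &queue);     /* NULL assumed to select defaults */
+   mmb_copy_async(dst, src, queue, &req);
+
+   /* ... overlap independent work here ... */
+
+   mmb_wait(&req);
+   mmb_queue_destroy(NULL, &queue);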
+
+Tile Iterators
+--------------
+
+Tile iterators, represented by the ``mmbTileIterator`` data structure, are a more convenient way of iterating over a full array tiling. An iterator object contains an internal schedule over tiles, and provides typical iterator operations (first, next, etc.) to traverse the array tiling. The initial API for array tile iterators is shown in the following code snippet; whilst the current library implementation supports iterators, the API for customising behaviour (for example, custom iterator schedules) is currently limited.
+
+.. code:: C
+   
+   mmbError mmb_tile_iterator_create(mmbArray *in_mba, mmbTileIteratorOptions *in_opt,
+                                    mmbTileIterator **out_it);
+   mmbError mmb_tile_iterator_first(mmbTileIterator *in_it);
+   mmbError mmb_tile_iterator_next(mmbTileIterator *in_it);
+   mmbError mmb_tile_iterator_count(mmbTileIterator *in_it, size_t *count);
+   mmbError mmb_tile_iterator_destroy(mmbTileIterator *in_it);
+
diff --git a/deps/mamba/docs/sphinx/mamba_resource_manager.rst b/deps/mamba/docs/sphinx/mamba_resource_manager.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a18fe8cf9c8b83e8e037182f1f69ddfaa8f1f34d
--- /dev/null
+++ b/deps/mamba/docs/sphinx/mamba_resource_manager.rst
@@ -0,0 +1,114 @@
+Mamba Resource Manager
+======================
+
+The Mamba Resource Manager (MRM) implements the memory model, providing the required data structures and API to build the higher level Mamba data containers. The broad structure of the MRM is illustrated in the figure, showing the key software layers through which a user request to allocate memory will pass. In the following we detail the Interface and Provider layers, along with the two primary supported providers. 
+
+.. figure:: /_static/images/Mamba-resource-manager-structure.png
+   :width: 80%
+   :align: center
+   :alt: The structure of the Mamba Resource Manager, consisting of an allocation API supported by an interface layer and a provider layer that provide stackable allocation strategies and per-layer, per-execution-context allocator implementations.
+
+   The structure of the Mamba Resource Manager, consisting of an allocation API supported by an interface layer and a provider layer that provide stackable allocation strategies and per-layer, per-execution-context allocator implementations.
+
+Interface Layer
+---------------
+
+The interface layer of the MRM provides an abstraction layer between the user allocation API, and the underlying memory providers. One or more interfaces may be created per memory space, separating user access to different spaces, whilst sharing access to the underlying provider layer that may serve multiple different spaces.
+
+General interface construction as part of the memory API utilisation is demonstrated in :ref:`Mamba Abstract Memory Model <mamba abstract memory model>`; in this section we provide further details on the configuration of interfaces. During construction, an interface may be provided a set of options for configuration, for example which underlying provider to use.
+
+As Mamba abstracts data management and not compute, it can be necessary for users to pass in provider-specific data structures for effective cooperation between user-managed compute and Mamba-managed data when a shared compute/data programming model is utilised. The following code snippet shows an example of such data structures being passed in during interface construction. A provider-specific options structure contains opaque pointers for each underlying memory implementation, which may be used via explicit include of an interoperability header file.
+
+.. code:: C
+
+   #include "mmb_provider_opencl.h"
+
+   // Omitted user-managed OpenCL setup, creating a cl_context context, a cl_command_queue queue, and a cl_device_id list devices[].
+
+   // Omitted Mamba memory space setup, as shown in previous sections, creating an mmbMemSpace gpu_space.
+
+   cl_device_id      d = devices[device_index]; 
+   cl_context        c = context;
+   cl_command_queue  q = queue;
+
+   struct mmb_opencl_t oclopt = { .device = d, .context = c, .queue = q};
+   
+   mmbMemInterfaceConfig gpu_opencl_conf = (mmbMemInterfaceConfig){
+      .provider = MMB_PROVIDER_ANY,
+      .strategy = MMB_STRATEGY_ANY,
+      .name = "",
+      .provider_opts = {.ocl = &oclopt}
+   };
+
+   mmbMemInterface *gpu_interface;
+   mmb_create_interface(gpu_space, &gpu_opencl_conf, &gpu_interface);
+
+Each interface may apply one of a set of available strategies. A strategy provides a set of common functionality on top of individual providers, for example adding thread-safety or memory pooling support to a specific interface without requiring such support to be implemented in every hardware-specific provider implementation. The chosen strategy may be applied at interface configuration time, as in the code above; strategies may also be stacked in the implementation, for example to provide thread-safe and pooled support in a single strategy. The currently available strategies are:
+
+* ``MMB_STRATEGY_ANY``
+* ``MMB_STRATEGY_NONE``
+* ``MMB_POOLED``
+* ``MMB_THREAD_SAFE``
+* ``MMB_STATISTICS``
+* ``MMB_POOLED_STATISTICS``
+
+The choice of ``MMB_STRATEGY_ANY`` results in the interface utilising a default strategy for the memory space, which if not specified during space construction is ``MMB_STRATEGY_NONE``. 
+
+Each configuration option that offers a default setting is configurable via compile-time or environment variables.
+
+Compile-time Variables
+^^^^^^^^^^^^^^^^^^^^^^
+
+The following variables can be set at compile time (or during the call to ``configure`` when provided to ``CPPFLAGS``). In order to set their value, use the format ``-D<name>=<value>``.
+
+* ``MMB_CONFIG_PROVIDER_DEFAULT``: Default memory provider to use to allocate memory when none is requested. Default: ``MMB_NATIVE``.
+* ``MMB_CONFIG_STRATEGY_DEFAULT``: Default memory allocation strategy to use when none is requested. Default: ``MMB_STRATEGY_NONE``.
+* ``MMB_CONFIG_EXECUTION_CONTEXT_GPU_DEFAULT``: Default execution context to use when allocating and copying memory to/from GPUs. Default: ``MMB_GPU_CUDA``.
+* ``MMB_CONFIG_PROVIDER_DEFAULT_ENV_NAME``: Environment variable’s name to look for when setting default provider. Default: ``MMB_CONFIG_PROVIDER_DEFAULT``.
+* ``MMB_CONFIG_STRATEGY_DEFAULT_ENV_NAME``: Environment variable’s name to look for when setting default strategy. Default: ``MMB_CONFIG_STRATEGY_DEFAULT``.
+* ``MMB_CONFIG_INTERFACE_NAME_DEFAULT_ENV_NAME``: Environment variable’s name to look for when setting the default interface name. Default: ``MMB_CONFIG_INTERFACE_NAME_DEFAULT``.
+* ``MMB_CONFIG_EXECUTION_CONTEXT_GPU_DEFAULT_ENV_NAME``: Environment variable’s name to look for when setting default execution context for the GPU. Default: ``MMB_CONFIG_EXECUTION_CONTEXT_GPU_DEFAULT``.
+
+Run-time variables
+^^^^^^^^^^^^^^^^^^
+
+The following variables can be set in the environment at run-time in order to modify some of the compile-time defined behaviors. These variables are read only once, during library initialization.
+
+* ``MMB_CONFIG_PROVIDER_DEFAULT``
+* ``MMB_CONFIG_STRATEGY_DEFAULT``
+* ``MMB_CONFIG_INTERFACE_NAME_DEFAULT``
+* ``MMB_CONFIG_EXECUTION_CONTEXT_GPU_DEFAULT``
+
+These variables default to the compile-time values. Their names can be changed at compile time using the settings listed above. For simplicity, ``MMB_CONFIG_EXECUTION_CONTEXT_GPU_DEFAULT`` also accepts ``NONE`` as a valid choice.
+
+Statistics
+^^^^^^^^^^
+
+The statistics strategy will record telemetry data during utilisation of the interface; this currently includes the number of allocations and the total allocated size. Work is ongoing to extend this to include telemetry related to data transport, and to extend the user interface to interact with the telemetry data.
+
+Provider Layer
+--------------
+
+The provider layer provides the lowest level abstraction of the underlying memory hardware. Multiple providers are supported, and each provider implements the hardware and/or programming-model specific allocation API for each memory layer and execution context pairing supported by that provider.
+
+Primary support in Mamba is for the System and uMMAP-IO providers; experimental support is also included for the SICM, Umpire, and jemalloc memory management libraries.
+
+
+Provider: System
+----------------
+
+The System provider is the default provider in Mamba, and is built into the library. This provider has support for the following memory layers:
+
+* ``MMB_DRAM``
+* ``MMB_GDRAM`` 
+* ``MMB_HBM``
+* ``MMB_NVDIMM``
+
+As well as the following execution contexts:
+
+* ``MMB_CPU``
+* ``MMB_GPU_CUDA``
+* ``MMB_GPU_HIP``
+* ``MMB_GPU_OPENCL``
+
+For each valid pairing of memory layer and execution context, the system provider implements a set of memory management functions to be utilised by the Mamba resource manager.
diff --git a/deps/mamba/docs/sphinx/overview.rst b/deps/mamba/docs/sphinx/overview.rst
new file mode 100644
index 0000000000000000000000000000000000000000..bbc428ba3ad1b3a6bf12f69ef5826b36ee52e859
--- /dev/null
+++ b/deps/mamba/docs/sphinx/overview.rst
@@ -0,0 +1,13 @@
+Overview
+========
+
+The Memory Abstraction Device software stack, shown in the figure below, is positioned between a user application code and the underlying memory hardware/software to provide a generic and portable approach for accessing the diverse memories of a heterogeneous high performance computing system. The primary component of the stack is the Mamba library, consisting of a set of data containers that provide a high-level array-like interface for the user to manage application data, supported by a resource manager that defines an abstract memory model to map to system hardware and provides low-level support for managing memory allocation and movement in heterogeneous memory hierarchies. The resource manager may interface to one of a variety of underlying memory providers, the default of which is the built-in Mamba system provider.
+
+A number of supporting libraries make up the remainder of the software stack: the cost model library is intended to support the runtime in making decisions regarding tile sizing and data movement; the loop analysis library is intended to support analysis of data access within loop kernels; and the uMMap-IO library, which serves as an alternate memory provider.
+
+.. figure:: /_static/images/Memory_abstraction_device_software_stack.png
+   :width: 80%
+   :align: center
+   :alt: The software stack of the Memory Abstraction Device. User applications may exploit both the high level Mamba Data Containers and the lower level memory abstractions provided via the Resource Manager, to interact with the underlying memory system hardware/software stack.
+
+   The software stack of the Memory Abstraction Device. User applications may exploit both the high level Mamba Data Containers and the lower level memory abstractions provided via the Resource Manager, to interact with the underlying memory system hardware/software stack.
diff --git a/deps/mamba/docs/sphinx/quickstart.rst b/deps/mamba/docs/sphinx/quickstart.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fecad4923863aeda9848a07a43d87d0ed81fc4f1
--- /dev/null
+++ b/deps/mamba/docs/sphinx/quickstart.rst
@@ -0,0 +1,258 @@
+==========
+Quickstart
+==========
+
+
+How to build and run
+====================
+
+.. code:: bash
+
+  mkdir build;
+  cd build;
+  # if the loop analysis module is required, run ../autogen.sh instead of autoreconf -i
+  autoreconf -i
+  ../configure [--prefix=/path/to/install/dir]                   \
+             [--enable-discovery[=yes|no|default]]             \
+             [--with-fortran]                                  \
+             [--with-fortran-ISO-bindings-includedir=/p/a/t/h] \
+             [--enable-embedded]                               \
+             [--enable-cuda[=yes|no|<arch>]]                   \
+             [--enable-hip-rocm[=yes|no]]                      \
+             [--enable-opencl[=yes|no]]                        \
+             [--with-opencl=/path/to/opencl/install]           \
+             [--enable-pmem[=yes|no]]                          \
+             [--with-memkind=/path/to/libmemkind/install]      \
+             [--with-numa[=/path/to/libnuma/install]]          \
+             [--with-loop-analysis]                            \
+             [--with-cost-model[=/path/to/costmodel/install]]  \
+             [--with-sicm=/path/to/sicm/install]               \
+             [--with-umpire=/path/to/umpire/install]           \
+             [--with-jemalloc=/path/to/jemalloc/install]       \
+             [--with-jemalloc-prefix=<prefix>]                 
+  make;
+  make check-tests;
+  make check-examples;
+  make install;   # optional
+
+
+Configure
+------------
+
+autogen.sh
+^^^^^^^^^^
+
+Only required to use ``--with-loop-analysis``.
+This will fetch and update the Mamba loop analysis dependencies as git submodules, and is an optional step if you have already recursively cloned the repository using ``git clone --recursive``. In that case, you may use ``autoreconf -i`` instead.
+
+--prefix
+^^^^^^^^^^^^
+
+Set the directory prefix for ``make install``.
+
+--enable-discovery
+^^^^^^^^^^^^^^^^^^
+
+Enable discovery mode, where Mamba will use hwloc to analyse the memory topology and construct a set of appropriate memory spaces during initialisation. This requires hwloc>=2.0 to be installed. ``default`` behaviour is to look for a suitable version of hwloc, and enable discovery if found, otherwise disable and issue a warning message at configure time.
+
+--with-fortran
+^^^^^^^^^^^^^^
+
+Build the Fortran Mamba library.
+
+--with-fortran-ISO-bindings-includedir
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Specify a non-standard path to the location of ``ISO_Fortran_binding.h`` to use the C/Fortran ISO bindings (required for the Fortran build).
+
+--enable-embedded
+^^^^^^^^^^^^^^^^^
+
+Enable embedded support, generating the libtool convenience libraries to easily import the library and its dependencies into your own project.
+
+--enable-cuda
+^^^^^^^^^^^^^^^^^
+
+Enable CUDA support in the memory manager.
+The configure script lists all the pkg-config module files containing the sub-string 'cuda' and tests each until one provides the requested support.
+
+--enable-hip-rocm
+^^^^^^^^^^^^^^^^^
+
+Enable HIP support for AMD devices (via ROCm) in the memory manager.
+We use ``hipconfig`` to determine appropriate CFLAGS; see the Common Issues section for information on passing additional ``hipcc`` flags.
+
+--enable-opencl 
+^^^^^^^^^^^^^^^
+
+Enable OpenCL support, currently tested on AMD and NVIDIA GPU devices and Xilinx FPGA devices.
+
+--with-opencl
+^^^^^^^^^^^^^
+
+Provide a non-standard path to your OpenCL installation.
+
+--enable-pmem 
+^^^^^^^^^^^^^
+
+Enable persistent memory support, such as Intel Optane non-volatile DIMMs. Requires the memkind library.
+
+--with-memkind
+^^^^^^^^^^^^^^
+
+Build with libmemkind support, which allows HBM (e.g. Intel KNL MCDRAM) and persistent memory allocation (e.g. Intel Optane NV-DIMMs). Disabled by default.
+
+--with-numa
+^^^^^^^^^^^
+
+Build with libnuma support for numa-aware memory spaces.
+
+--with-loop-analysis
+^^^^^^^^^^^^^^^^^^^^
+
+Build with loop analysis features. The loop analysis module depends on external loop analysis libraries; during autogen, the appropriate libraries will be downloaded as git submodules. A dependency on LLVM is also introduced; if you have trouble building the loopanalyzer library, refer to the build instructions in the loopanalyzer repository. If you have previously built without this option, you will also need to ``make clean``. To test the support libraries, ``make check`` will run tests for all dependencies integrated into the Mamba build system.
+
+--with-cost-model
+^^^^^^^^^^^^^^^^^
+
+Build with cost model library support for automatic tile sizing features.
+
+--with-sicm
+^^^^^^^^^^^
+
+Experimental external library support. Allows underlying memory allocation using the `LANL/SICM memory manager <https://github.com/lanl/SICM>`_.
+
+--with-umpire
+^^^^^^^^^^^^^
+
+Experimental external library support. Allows underlying memory allocation using the `LLNL/Umpire memory manager <https://github.com/LLNL/umpire>`_.
+
+--with-jemalloc and --with-jemalloc-prefix
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Allows underlying memory allocation using the `jemalloc <https://github.com/jemalloc/jemalloc>`_ malloc implementation.
+The default prefix of the jemalloc functions namespace is ``je_``.
+
+Additional options
+------------------
+
+To change the compiler used, set ``CC=...``, ``CXX=...`` and/or ``FTN=...`` during configure. 
+
+Cray (CCE)
+^^^^^^^^^^
+
+On a Cray system, it is typical to use the compiler wrappers to manage the compilation environment correctly:
+
+.. code:: console
+
+  ./configure CC=cc CXX=CC FTN=ftn ...
+
+
+GNU
+^^^
+
+Add ``-std=gnu11`` to get C11 std with gnu extensions, required for posix pthread lock structures.
+
+.. code:: console
+
+  ./configure CFLAGS="-std=gnu11" ...
+
+
+Configuration variables
+-----------------------
+
+To make the default behavior better fit your usage, additional compile-time and run-time variables can be set.
+
+Compile-time
+^^^^^^^^^^^^^
+
+The following variables can be set at compile time (or during the call to ``configure`` when provided to ``CPPFLAGS``).
+In order to set their value, use the format ``-D<name>=<value>``.
+
+  * ``MMB_LOG_LEVEL``: Compile-time max log level cut-off, default ``MMB_LOG_DEBUG``
+  * ``MMB_CONFIG_PROVIDER_DEFAULT``: Default memory provider to use to allocate memory when none is requested. Default: ``MMB_NATIVE``.
+  * ``MMB_CONFIG_STRATEGY_DEFAULT``: Default memory allocation strategy to use when none is requested. Default: ``MMB_STRATEGY_NONE``.
+  * ``MMB_CONFIG_EXECUTION_CONTEXT_GPU_DEFAULT``: Default execution context to use when allocating and copying memory to/from GPUs. Default: ``MMB_GPU_CUDA``.
+  * ``MMB_CONFIG_PROVIDER_DEFAULT_ENV_NAME``: Environment variable's name to look for when setting default provider. Default: ``MMB_CONFIG_PROVIDER_DEFAULT``.
+  * ``MMB_CONFIG_STRATEGY_DEFAULT_ENV_NAME``: Environment variable's name to look for when setting default strategy. Default: ``MMB_CONFIG_STRATEGY_DEFAULT``.
+  * ``MMB_CONFIG_INTERFACE_NAME_DEFAULT_ENV_NAME``: Environment variable's name to look for when setting the default interface name. Default: ``MMB_CONFIG_INTERFACE_NAME_DEFAULT``.
+  * ``MMB_CONFIG_EXECUTION_CONTEXT_GPU_DEFAULT_ENV_NAME``: Environment variable's name to look for when setting default execution context for the GPU. Default: ``MMB_CONFIG_EXECUTION_CONTEXT_GPU_DEFAULT``.
+
+The following variables can be set in the environment at compile time to modify compilation behaviour; use the format ``export <name>=<value>`` or ``./configure <name>=<value>``.
+
+  * ``MMB_CONFIG_HIPCC_EXTRA_CPPFLAGS``: Extra flags to pass to hipcc compiler during compilation of .hip files.
+
+
+Run-time
+^^^^^^^^
+
+The following variables can be set in the environment at run-time in order to modify some of the compile-time defined behaviors.
+These variables are read only once, during library initialization.
+
+  * ``MMB_CONFIG_PROVIDER_DEFAULT``
+  * ``MMB_CONFIG_STRATEGY_DEFAULT``
+  * ``MMB_CONFIG_INTERFACE_NAME_DEFAULT``
+  * ``MMB_CONFIG_EXECUTION_CONTEXT_GPU_DEFAULT``
+
+These variables default to the compile-time values.
+Their names can be changed at compile time by setting ``MMB_CONFIG_PROVIDER_DEFAULT_ENV_NAME``, ``MMB_CONFIG_STRATEGY_DEFAULT_ENV_NAME``, ``MMB_CONFIG_INTERFACE_NAME_DEFAULT_ENV_NAME`` and ``MMB_CONFIG_EXECUTION_CONTEXT_GPU_DEFAULT_ENV_NAME`` respectively.
+For simplicity, ``MMB_CONFIG_EXECUTION_CONTEXT_GPU_DEFAULT`` also accepts ``NONE`` as a valid choice.
+
+The following variable modifies the log level at run time, up to the maximum compile-time cutoff, and overrides the API log level setting.
+
+  * ``MMB_LOG_LEVEL``: Run-time log level setting; cannot override the maximum cutoff defined at compile time.
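+
+For example, a job script might reduce logging to warnings and disable the default GPU execution context before launching:
+
+.. code:: console
+
+  # warnings and errors only (MMB_LOG_WARN)
+  export MMB_LOG_LEVEL=1
+
+  # disable the default GPU execution context
+  export MMB_CONFIG_EXECUTION_CONTEXT_GPU_DEFAULT=NONE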
+
+
+Common Issues
+=============
+
+C standard
+--------------
+
+If you force standard conformance with e.g. ``-std=c11``, you may also need to pass something like ``-D_XOPEN_SOURCE=500`` to get the required POSIX features.
+Alternatively, use ``-std=gnu11``.
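+For example (a sketch; the exact feature-test macro you need may differ):
+
+.. code:: console
+
+  ./configure CFLAGS="-std=c11 -D_XOPEN_SOURCE=500"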
+
+HIP ROCM Support
+----------------
+
+If you see the following error:
+
+.. code:: console
+
+  .../hip_code_object.cpp:120: guarantee(false && "hipErrorNoBinaryForGpu: Coudn't find binary for current devices!")
+
+
+You may not have had the appropriate HIP architecture definitions during compilation. This can, for example, occur when compiling on a login node without GPUs attached.
+If you have appropriate environment/module resolution for this, use that. Otherwise, you can forward extra preprocessor arguments to the hipcc compiler during the Mamba build via the following environment variable, which you need to export prior to configuration:
+
+.. code:: console
+
+  # Valid for AMD MI60, export before configure
+  export MMB_CONFIG_HIPCC_EXTRA_CPPFLAGS="-D__HIP_ARCH_GFX906__=1 --cuda-gpu-arch=gfx906"
+
+
+To check, you can run ``hipcc --cxxflags`` and look for something like the above. Setting ``HIPCC_VERBOSE=7`` will additionally print verbose information from the hipcc compiler, for example:
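+
+.. code:: console
+
+  # print the flags hipcc will use (assumes hipcc is on your PATH)
+  HIPCC_VERBOSE=7 hipcc --cxxflags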
+
+Furthermore, discovery of AMD GPUs via hwloc is currently not able to determine the available memory size, so memory spaces created automatically during discovery will be of unlimited size (i.e. limited by the HIP runtime rather than by Mamba).
+
+CUDA 
+----
+
+If you see the following error: 
+
+.. code:: console
+
+  no kernel image is available for execution on the device.
+
+
+You may be using the wrong CUDA architecture for the GPU device available on your node.
+You can change the architecture used by setting it on your configure line with ``./configure --enable-cuda=<arch>``.
+The default architecture we are using is ``sm_60``.
+If this value is too high you may want to try ``sm_30``.
+
+OpenCL/FPGA
+-----------
+
+The buffer_copy_opencl example (run automatically during ``make check`` or ``make check-examples``) will try to build a kernel at run-time. On most FPGA platforms OpenCL does not have access to a compiler, so this will likely fail. To have this example run, you must build a bitstream for your specific FPGA that matches the example kernel in examples/c/buffer_copy_opencl.c, and export the path to this bitstream via the environment variable ``MMB_CONFIG_BUFFER_COPY_OPENCL_BINARY`` before running the example.
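+
+For example (hypothetical bitstream path and file name):
+
+.. code:: console
+
+  export MMB_CONFIG_BUFFER_COPY_OPENCL_BINARY=/path/to/buffer_copy_bitstream.aocx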
+
diff --git a/deps/mamba/docs/sphinx/requirements.txt b/deps/mamba/docs/sphinx/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bae998b9cb3bb8dee63790f210fb6cf1a3eab2d4
--- /dev/null
+++ b/deps/mamba/docs/sphinx/requirements.txt
@@ -0,0 +1,3 @@
+Sphinx==4.2.0
+sphinx_rtd_theme==1.0.0
+sphinxcontrib-httpdomain==1.8.0
diff --git a/deps/mamba/memory/strategies/i_statistics.h b/deps/mamba/memory/strategies/i_statistics.h
index 8993e296c5c980ee3639668a5ee45ecc4aabc674..b5f594784c9e3836ccba87c07b445219634866b7 100644
--- a/deps/mamba/memory/strategies/i_statistics.h
+++ b/deps/mamba/memory/strategies/i_statistics.h
@@ -95,7 +95,7 @@ mmbError mmb_strat_stats_check_interface(mmbMemInterface *interface);
 /**
  * @brief Size of the data structure example.
  */
-#define mmb_strat_data_size__stats sizeof(struct mmb_strat_stats_data)
+#define mmb_strat_data_size__stats ((mmb_strat_data_size)sizeof(struct mmb_strat_stats_data))
 
 /**
  * @brief Initialize the parameters used to gather statistics.
diff --git a/deps/mamba/memory/strategies/i_thread_safe.h b/deps/mamba/memory/strategies/i_thread_safe.h
index 920633aa63a986e30b2d61963db994d19a2ab4cb..65ee689ea2e98113ca12a2e8399a40db48e8dc70 100644
--- a/deps/mamba/memory/strategies/i_thread_safe.h
+++ b/deps/mamba/memory/strategies/i_thread_safe.h
@@ -70,7 +70,7 @@ struct mmb_strat_thread_safe_data {
  * This value is used to compute the required size for an struct
  * mmbMemInterface.
  */
-#define mmb_strat_data_size__thread_safe sizeof(struct mmb_strat_thread_safe_data)
+#define mmb_strat_data_size__thread_safe ((mmb_strat_data_size)sizeof(struct mmb_strat_thread_safe_data))
 
 /**
  * @brief Initialize the parameters used to apply the thread safe strategy.
diff --git a/deps/mamba/tests/Makefile.am b/deps/mamba/tests/Makefile.am
index c6cdfff5d40921ac2ce7c9451e45cc4399610f81..50eb36a596bb8740fe297fd2ec06a6ef52c05223 100644
--- a/deps/mamba/tests/Makefile.am
+++ b/deps/mamba/tests/Makefile.am
@@ -46,6 +46,7 @@ check_bin = version             \
             interface_creation  \
             allocating          \
             check_array_tiling  \
+            check_layout        \
             check_array_reshape \
             introspection_api   \
             indexing            \
diff --git a/deps/mamba/tests/check_layout.c b/deps/mamba/tests/check_layout.c
new file mode 100644
index 0000000000000000000000000000000000000000..b0ed8003c42d1669b34be1ba2ef9d3ecb7604fc3
--- /dev/null
+++ b/deps/mamba/tests/check_layout.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (C) 2018-2021 Cray UK
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* needed before inclusion of cheat.h: */
+#ifndef __BASE_FILE__
+#define __BASE_FILE__ __FILE__
+#endif
+
+
+#include "cheat.h"
+#include "cheats.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <getopt.h>
+#include <string.h>
+#include <ctype.h>
+
+#include <mamba.h>
+#include <mmb_logging.h>
+
+
+#ifndef MAX_LOG_LEVEL
+#  ifdef MMB_LOG_DEBUG
+#    define MAX_LOG_LEVEL MMB_LOG_DEBUG
+#  else
+#    define MAX_LOG_LEVEL 3
+#  endif
+#endif
+
+
+
+CHEAT_TEST(check_array_tiling_1d,
+
+  // Initialise mamba
+  cheat_assert (MMB_OK == mmb_init(MMB_INIT_DEFAULT));
+
+  cheat_assert (MMB_OK == mmb_logging_set_debug_level(MMB_LOG_DEBUG));
+
+  mmbLayout *src_layout, *src_layout2, *dst_layout, *regular_layout;
+  int src_blocks = 2;
+  size_t *src_offsets = malloc(sizeof(size_t) *src_blocks);
+  size_t *src_lengths = malloc(sizeof(size_t) *src_blocks);
+
+  int dst_blocks = 4;
+  size_t *dst_offsets = malloc(sizeof(size_t) *dst_blocks);
+  size_t *dst_lengths = malloc(sizeof(size_t) *dst_blocks);
+
+
+  src_offsets[0] = 0;
+  src_offsets[1] = 100;
+  src_lengths[0] = 100;
+  src_lengths[1] = 50;
+
+
+  dst_offsets[0] = 0;
+  dst_offsets[1] = 25;
+  dst_offsets[2] = 75;
+  dst_offsets[3] = 100;
+  dst_lengths[0] = 25;
+  dst_lengths[1] = 50;
+  dst_lengths[2] = 25;
+  dst_lengths[3] = 50;
+
+  cheat_assert (MMB_OK == mmb_layout_create_dist_irregular_1d(sizeof(double),
+                                       0,
+                                       src_blocks,
+                                       src_offsets,
+                                       src_lengths,
+                                       &src_layout));
+
+  cheat_assert (MMB_OK == mmb_layout_create_dist_irregular_1d(sizeof(double),
+                                       1,
+                                       src_blocks,
+                                       src_offsets,
+                                       src_lengths,
+                                       &src_layout2));
+
+
+  cheat_assert (MMB_OK == mmb_layout_create_dist_irregular_1d(sizeof(double),
+                                       0,
+                                       dst_blocks,
+                                       dst_offsets,
+                                       dst_lengths,
+                                       &dst_layout));
+
+  cheat_assert (MMB_OK ==  mmb_layout_create_regular_1d(sizeof(float), MMB_PADDING_NONE, &regular_layout));
+
+
+  /*check mmb_layout_cmp*/
+  mmbLayoutEquivalence diff;
+
+  cheat_assert (MMB_OK == mmb_layout_cmp(src_layout, src_layout, &diff));
+
+  cheat_assert(MMB_LAYOUT_EQUAL == diff);
+
+  cheat_assert (MMB_OK == mmb_layout_cmp(src_layout, src_layout2, &diff));
+
+  cheat_assert(MMB_LAYOUT_DIFF_INDEX == diff);
+
+  cheat_assert (MMB_OK == mmb_layout_cmp(src_layout, dst_layout, &diff));
+
+  cheat_assert(MMB_LAYOUT_DIFF_FIELDS == diff);
+
+  cheat_assert (MMB_OK == mmb_layout_cmp(src_layout, regular_layout, &diff));
+
+  cheat_assert(MMB_LAYOUT_DIFF_TYPES == diff);
+
+  /*check mmb_layout_compute_intersection */
+  mmbLayoutIntersection *intersection;
+
+  /* Two different types of layouts ... not implemented FIXME  */
+  cheat_assert (MMB_OK != mmb_layout_compute_intersection(regular_layout, dst_layout, &intersection));
+
+  cheat_assert (MMB_OK == mmb_layout_compute_intersection(src_layout, src_layout, &intersection));
+  /** Intersection with myself ... same layout
+    * dst piece 0 intersects src piece 0 at:
+    * src_offset: = 0
+    * dst_offset: = 0
+    * length: = 100
+
+    * dst piece 1 intersects src piece 1 at:
+    * src_offset: = 0
+    * dst_offset: = 0
+    * length: = 50
+    */
+
+  /** check dst piece 0 intersects src piece 0 */
+  int index = 0*intersection->n_src_pieces + 0;
+  cheat_assert(intersection->overlap[index].src_offset == 0 );
+  cheat_assert(intersection->overlap[index].dst_offset == 0 );
+  cheat_assert(intersection->overlap[index].length == 100 );
+
+  cheat_assert (MMB_OK == mmb_layout_destroy_mmbLayoutIntersection(intersection));
+
+  cheat_assert (MMB_OK == mmb_layout_compute_intersection(src_layout, dst_layout, &intersection));
+
+  /** intersection should be
+    * dst piece 0 intersects src piece 0 at:
+    * src_piece_offset: = 0
+    * dst_piece_offset: = 0
+    * length: = 25
+
+    * dst piece 1 intersects src piece 0 at:
+    * src_piece_offset: = 25
+    * dst_piece_offset: = 0
+    * length: = 50
+
+    * dst piece 2 intersects src piece 0 at:
+    * src_piece_offset: = 75
+    * dst_piece_offset: = 0
+    * length: = 25
+
+    * dst piece 3 intersects src piece 1 at:
+    * src_piece_offset: = 0
+    * dst_piece_offset: = 0
+    * length: = 50
+  */
+
+  /** check dst piece 1 intersects src piece 0 */
+  index = 1*intersection->n_src_pieces + 0;
+  cheat_assert(intersection->overlap[index].src_offset == 25 );
+  cheat_assert(intersection->overlap[index].dst_offset == 0 );
+  cheat_assert(intersection->overlap[index].length == 50 );
+
+  cheat_assert (MMB_OK == mmb_layout_destroy_mmbLayoutIntersection(intersection));
+
+  cheat_assert(MMB_OK == mmb_finalize());
+)
diff --git a/deps/mamba/tests/version.c b/deps/mamba/tests/version.c
index 2f96a36644dfcb8827d13d27dff3416740258b7a..052e09c90d00cb6f0f874b4f44a0e9da889dc00a 100644
--- a/deps/mamba/tests/version.c
+++ b/deps/mamba/tests/version.c
@@ -50,5 +50,5 @@ CHEAT_TEST(version_numbers,
     cheat_assert(3 == sscanf(version, "mamba %u.%u.%u", &major, &minor, &patch));
     cheat_assert(0 == major);
     cheat_assert(1 == minor);
-    cheat_assert(8 == patch);
+    cheat_assert(9 == patch);
 )
diff --git a/docs/sphinx/Makefile.am b/docs/sphinx/Makefile.am
index e9a3c4424cf25254e9fccd77c5e6661b8061ad9b..24f57844ded9f8a9aee85067af18464ac0380b3f 100644
--- a/docs/sphinx/Makefile.am
+++ b/docs/sphinx/Makefile.am
@@ -29,8 +29,8 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 
-# TODO need to check beforehand: sphinx myst-parser breathe
 if HAVE_SPHINX
+if HAVE_BREATHE
 
 clean-local:
 	-${RM} -r _build/*
@@ -56,4 +56,4 @@ all-local:
 #	-${RM} -r _build _sources _templates
 
 endif
-
+endif
diff --git a/docs/sphinx/dev.rst b/docs/sphinx/dev.rst
index c804707e045b954a4771796d6c987b722760473b..db097634d3fc6cf356b02339e3feed0f76b787b2 100644
--- a/docs/sphinx/dev.rst
+++ b/docs/sphinx/dev.rst
@@ -1,23 +1,22 @@
 Developers' Corner
 ==================
 
-TODO
+"How do we take Maestro Core forward?"
 
 Contributing
 ------------
 
-TODO
+Maestro Core will soon be open to contributions!
+For now, please file an issue on `Gitlab <https://gitlab.jsc.fz-juelich.de/maestro/maestro-core/-/issues/new?issue%5Bmilestone_id%5D=>`__ to start a discussion.
 
 
-..  
-  doxygenfunction:: mstro_cdo_demand
-
+API
+---
 
-Full documentation
-------------------
-
-TODO 
+A link to the online Doxygen documentation will appear here. For the moment, Maestro Core only builds local Doxygen documentation.
 
+..  
+  doxygenfunction:: mstro_cdo_demand
 
 ..
   doxygengroup:: MSTRO_CONFIG
diff --git a/docs/sphinx/essentials.rst b/docs/sphinx/essentials.rst
index 1eab704e8755a9bbce8b013cbe6e41450d9e0ca7..643ddaf325d0b2261d67c4c9500e112c3726a219 100644
--- a/docs/sphinx/essentials.rst
+++ b/docs/sphinx/essentials.rst
@@ -3,6 +3,8 @@ Essentials
 
 Play with Maestro Core basic concepts.
 
+.. _cdo:
+
 Core Data Objects (CDOs)
 ------------------------
 
@@ -43,6 +45,9 @@ actual data to be transferred, we are going to need a pointer and a size:
 At this point of the CDO definition the upcoming transfer will indeed consist
 of data and metadata. 
 
+To go further: :ref:`user metadata<story_user_metadata>` and :ref:`layout
+metadata<story_layout_metadata>`.
+
 .. _cdo_management:
 
 CDO Management
@@ -75,7 +80,7 @@ Maestro Pool. When ``withdrawn``, the CDO data is garanteed to be left
 unscathed by Maestro.
 
 
-.. image:: management.jpeg
+.. image:: img/management.jpeg
    :alt: CDO management diagram
 
 On the consumer side, after ``seal`` the CDO may be ``required``, which will
@@ -173,6 +178,8 @@ system transport or "MIO" for object store transport. MIO is Maestro Core
 interface to object stores, which has one backend that is the `cortx-motr
 <https://github.com/Seagate/cortx-mio>`__ object store.
 
+.. _mamba:
+
 Memory Management
 -----------------
 
@@ -194,7 +201,7 @@ to keep track of the proceedings. Such events may be filtered using a selector
    mstro_cdo_selector_create( ..., "(has .maestro.core.cdo.name)",
                                       &selector);
 
-Subscribing to events if done with:
+Subscribing to events is done with
 
 .. code-block:: c
 
@@ -213,6 +220,9 @@ Then events may be polled
    mstro_pool_event event;
    mstro_subscription_poll(cdo_subscription, &event);
 
+.. NOTE::
+   ``mstro_subscription_poll()`` returns in *event* a list of events: *event* is the head, and the next element is ``event->next``.
+
 and inspected for CDO properties
 
 .. code-block:: c
@@ -223,6 +233,9 @@ and inspected for CDO properties
        cdo_name = event->offer.cdo_name;
        ...
 
+
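+A minimal sketch draining such a polled event list via ``event->next`` (names
+as in the snippets above; event disposal omitted):
+
+.. code-block:: c
+
+   for(mstro_pool_event e = event; e != NULL; e = e->next) {
+     /* inspect each event in turn, e.g. e->offer.cdo_name */
+   }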
+
+
 Groups
 ------
 
@@ -252,7 +265,7 @@ elements may be inspected
 
 before being potentially individually ``required``.
 
-.. WARNING::
+.. NOTE::
  * All group participant must know all CDOs, at least their names
  * Consumer must know all CDO names at least
 
diff --git a/docs/sphinx/img/maestro_core_components.png b/docs/sphinx/img/maestro_core_components.png
new file mode 100644
index 0000000000000000000000000000000000000000..4db25010d3af8c9c3a72e49c83c6d67bcba0f646
Binary files /dev/null and b/docs/sphinx/img/maestro_core_components.png differ
diff --git a/docs/sphinx/management.jpeg b/docs/sphinx/img/management.jpeg
similarity index 100%
rename from docs/sphinx/management.jpeg
rename to docs/sphinx/img/management.jpeg
diff --git a/docs/sphinx/pm_event_ack.png b/docs/sphinx/img/pm_event_ack.png
similarity index 100%
rename from docs/sphinx/pm_event_ack.png
rename to docs/sphinx/img/pm_event_ack.png
diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst
index 2ee3bc232edff9dfc4aa46639422214306996bd0..19239847dd9f5b1d5c924e42817a3b7c64a5bd95 100644
--- a/docs/sphinx/index.rst
+++ b/docs/sphinx/index.rst
@@ -6,11 +6,11 @@
 Welcome to Maestro Core's documentation!
 ========================================
 
-Maestro Core is a C library that does multi-threaded cross-application data
-transfer over high-speed interconnect using lightweight data wrappers named
-CDOs that include user metadata.
+Maestro Core is a C library that allows multi-threaded cross-application data
+transfer and inspection over high-speed interconnect, using lightweight data
+wrappers named CDOs that include user metadata and data semantics.
 
-A producer application using Maestro Core may offer data and metadata:
+A producer application using Maestro Core may offer data and metadata
 
 .. code-block:: c
 
@@ -28,7 +28,7 @@ A producer application using Maestro Core may offer data and metadata:
                            &value, ...);
    mstro_cdo_offer(src_handle);
 
-Which a consumer application may demand:
+Which a consumer application may demand
 
 .. code-block:: c
 
@@ -38,11 +38,11 @@ Which a consumer application may demand:
    mstro_cdo_require(dst_handle);
    mstro_cdo_demand(dst_handle);
 
-When ``mstro_cdo_demand()`` returns, the data and metadata has
+When ``mstro_cdo_demand()`` returns, the data and metadata have
 been transferred by Maestro Core from the producer and is locally available for
 the consumer.
 
-CDOs are central to Maestro Core design.
+CDOs are central to the Maestro Core design.
 
 Maestro Core is part of the `Maestro <https://www.maestro-data.eu/>`__
 framework, which aims at data-driven workflow orchestration.
@@ -51,7 +51,8 @@ That's it for the introduction.
 
 .. toctree::
    :maxdepth: 2
-   
+  
+   showroom
    started
    essentials
    stories
diff --git a/docs/sphinx/showroom.rst b/docs/sphinx/showroom.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a80bcac5c29cdf50b2771d492308b556ea5a4123
--- /dev/null
+++ b/docs/sphinx/showroom.rst
@@ -0,0 +1,95 @@
+Showroom
+========
+
+"What can I use Maestro Core for?"
+
+
+Overview
+--------
+
+Workflow applications using Maestro Core offer and demand CDOs to the
+:ref:`Maestro Pool<cdo_management>`, a conceptual entity that represents the
+set of resources contributed to Maestro, in particular the set of offered CDOs.
+One typical use of Maestro Core is bypassing persistent storage for application
+coupling, by way of the Maestro Pool.
+
+:ref:`CDOs<cdo>` are essentially the Maestro Core currency. They contain data
+and metadata, including user-defined metadata plus data semantics such as data
+layout information and other information on data usage that the transfer
+scheduler may take advantage of.
+
+A minimal Maestro-enabled multi-app setup consists of one producer
+application, one consumer application and one Pool Manager application, the
+latter being shipped with Maestro Core and alternatively invokable via the
+Maestro Core API. The :ref:`Pool Manager<pm>` is a Maestro Core-provided
+application that handles networking and transport scheduling, and propagates
+pool events for inspection.
+
+.. image:: img/maestro_core_components.png
+   :alt: Maestro core components overview
+
+:ref:`Events<events>` allow for higher-level control of the Maestro-enabled
+workflow, and for interfacing with many useful components. In particular, they
+allow the implementation of data-driven Workflow Managers or, more generally,
+Execution Frameworks: such frameworks can schedule jobs based on CDO
+availability and location, both of which are notified through events.
+:ref:`Data management<cdo_management>` (and :ref:`memory management<mamba>` as
+well) is then effectively delegated by Execution Frameworks and applications
+to Maestro Core.
+
+As an example of useful workflow components, staging and saving CDOs of choice
+may be implemented by users via Librarian and Archiver components, the latter
+relying especially on events to be notified of the availability of relevant
+CDOs for archiving purposes.
+A keep-alive proxy may prevent CDOs from being withdrawn from the pool before
+they have been properly handled by all the components that need them.
+Maestro-enabled components natively produce logs as well as a telemetry report
+at finalize time; nonetheless, events allow the implementation of more
+specific telemetry/logger components, with :ref:`selectors<events>` allowing
+:ref:`cherry-picking<story_cherry>` of events and CDOs.
+
+CDOs convey a great deal of data and memory semantics.
+Data usage information, such as CDO dependences or an eager transfer policy
+*(both yet to be implemented)*, helps the scheduler and thus general
+performance.
+Data layout information lets Maestro perform the transformation between an
+offered CDO in a given data layout L1 (say row-major) and the demanded CDO on
+the consumer side that may have a layout L2 (say column-major). This allows
+users to abstract away the data layout, especially if tools / source-to-source
+editing fill the data layout attribute (semi-)automatically. Data layouts may
+be distributed *(WIP)* and Maestro Core *will soon* handle the redistribution,
+complying with producer and consumer layout and distribution scheme
+requirements.
+
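+As an illustration, the pieces of a distributed 1-D layout may be described
+via the mamba layout API (a minimal sketch following tests/check_layout.c):
+
+.. code-block:: c
+
+   /* this component owns two pieces of a 1-D array (offsets/lengths in elements) */
+   size_t offsets[2] = {0, 100};
+   size_t lengths[2] = {100, 50};
+   mmbLayout *layout;
+   mmb_layout_create_dist_irregular_1d(sizeof(double),
+                                       0, /* index of this piece set */
+                                       2, /* number of blocks */
+                                       offsets, lengths, &layout);
+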
+.. image:: img/core_data_object.png
+   :alt: Anatomy of the Core Data Object
+
+CDO location is expressed as a so-called Memory Object, which can refer to a number of layers (DRAM, GDRAM, parallel filesystem, object store, etc.) and associated access methods (pointer, path, object ID, etc.); this is then used by Maestro Core to handle the transport between layers and nodes.
+
+Collections:
+
+- Group
+- Selector (get/set, select, subscribe), including on user metadata
+
+
+Demos
+-----
+
+A few demonstrators are shipped with Maestro Core; they are compiled and run
+through
+
+.. code-block:: shell
+
+  make check 
+
+- Local multi-threaded setup ``demo_mvp_d3_2``: configurable via a few parameters such as the number of producer, consumer and archiver threads.
+
+- Multi-application setup ``check_pm_interlock``: The interlock demo is a minimal workflow with a Pool Manager and two applications exchanging a CDO under the Pool Manager's supervision, using filesystem and object store transports; RDMA is used if either transport is unavailable.
+
+- Logger sample application ``simple_telemetry_listener``
+
+- Archiver sample application ``simple_archiver``
+
+
+
diff --git a/docs/sphinx/started.rst b/docs/sphinx/started.rst
index 475ba5430c973058d0818e36f197cf0bfc0ab28f..57c412290226e562c7cb19c845a53deb82e7dade 100644
--- a/docs/sphinx/started.rst
+++ b/docs/sphinx/started.rst
@@ -85,8 +85,8 @@ and the first program is ready to run.
 .. TIP::
    You can inspect what Maestro does by setting the following environment variable ``MSTRO_LOG_LEVEL=info``. By default Maestro Core outputs to ``stderr``, you can also choose ``stdout`` or ``syslog`` via  ``MSTRO_LOG_DST=syslog``
  
-Demos
------
+Examples
+--------
    
 TODO 
 
diff --git a/docs/sphinx/stories.rst b/docs/sphinx/stories.rst
index 5dfa3c1c344bc01e96ac785385a8d5e769229137..af5544c31cc7b22259b21b908274f6238ac1fe25 100644
--- a/docs/sphinx/stories.rst
+++ b/docs/sphinx/stories.rst
@@ -3,6 +3,8 @@ Users' Stories
 
 "How can I do [...] with Maestro Core?"
 
+.. _story_user_metadata:
+
 User metadata
 -------------
 
@@ -16,7 +18,7 @@ Users may define their own metadata, using a YAML schema:
    
    maestro-attributes:
    
-     - key: "my_key"
+     - key: "my-key"
        type: int()
        required: True
        documentation: This attribute is an integer that is required,
@@ -36,7 +38,7 @@ Then the schema may be imported to Maestro Core via environment variable
 
 .. code-block:: shell
 
-   MSTRO_SCHEMA_LIST="my_schema.yaml"
+   MSTRO_SCHEMA_LIST="my_schema.yaml, ..."
 
 making it available for attributes to be set to CDOs, similarly to :ref:`core
 attributes<metadata>`.
@@ -48,6 +50,7 @@ attributes<metadata>`.
                            ".maestro.my-namespace.my-key",
                            &value, ...);
 
+.. _story_cherry:
 
 Cherry-pick CDOs of interest based on metadata
 ----------------------------------------------
@@ -63,6 +66,7 @@ Selector by key-value comparison or regexp
 
 and then use Maestro Core :ref:`events subscriptions<events>`.
 
+.. _story_layout_metadata:
 
 Seamless C\<-\>Fortran applications transfers
 ---------------------------------------------
@@ -115,6 +119,8 @@ consumer CDO thanks to their identical name, and transparently performs the
 transformation on the DEMANDer, and ensures the CDO is in the requested layout
 when the DEMAND returns.
 
+.. _story_withdraw:
+
 Not let producers withdraw early
 --------------------------------
 
@@ -148,7 +154,7 @@ with request for acknowledgment
       Producer <-- PoolManager [label=offer_ack];
     }
 
-.. image:: pm_event_ack.png
+.. image:: img/pm_event_ack.png
    :alt: PM event ack diagram
 
 which means the consumer here, upon receiving the ``offer`` event, may
diff --git a/examples/local_pool_op.c b/examples/local_pool_op.c
index 72a099eaf368ae08e551486cd1d44819a55736ad..bbb3eab4fdf329964aa70ccdb38d42aec846cd30 100644
--- a/examples/local_pool_op.c
+++ b/examples/local_pool_op.c
@@ -123,7 +123,7 @@
 
 #define __STDC_FORMAT_MACROS
 #include <inttypes.h>
-
+#include <string.h>
 
 /* simplify logging */
 #define DEBUG(...) LOG_DEBUG(MSTRO_LOG_MODULE_USER,__VA_ARGS__)
@@ -238,7 +238,7 @@ wait_for_announcement(struct cdo_announcement *announcement)
   }
   s = mstro_cdo_attribute_set(announcement_cdo,
                               MSTRO_ATTR_CORE_CDO_RAW_PTR,
-                              announcement);
+                              announcement, false);
   if(s!=MSTRO_OK) {
     ERR("Failed to set raw-ptr attribute on announcement CDO\n");
     abort();
@@ -287,7 +287,7 @@ do_announce(struct cdo_announcement *announcement, mstro_cdo *result)
   }
   s = mstro_cdo_attribute_set(announcement_cdo,
                               MSTRO_ATTR_CORE_CDO_RAW_PTR,
-                              announcement);
+                              announcement, false);
 
   if(s!=MSTRO_OK) {
     ERR("Failed to set raw-ptr attribute on  announcement CDO\n");
@@ -451,24 +451,22 @@ archiver_flush_to_disk(const char *name, mmbArray *a)
     abort();
   }
 
-  size_t nt;
-  stat = mmb_tile_iterator_count(it, &nt);
-  if(stat != MMB_OK) {
-    ERR("Failed to get tile iterator count\n");
-    abort();
-  }
-  stat = mmb_tile_iterator_first(it);
-  if(stat != MMB_OK) {
-    ERR("Failed to move tile iterator to first\n");
-    abort();
-  }
+  mmbDimensions *tiling_dims = NULL;
+  stat = mmb_tiling_dimensions(a, &tiling_dims);
+  if(stat != MMB_OK) {
+    ERR("Failed to get tiling dimensions\n");
+    abort();
+  }
+
   FILE *dst = fopen((const char*)name, "w");
   if(dst==NULL) {
     ERR("Failed to open %s for writing\n", name);
     abort();
   }
-  for(size_t i = 0; i < nt; i++){
-    mmbArrayTile* tile = it->tile;
+  for(size_t i = 0; i < tiling_dims->d[0]; i++){
+    mmbArrayTile* tile = NULL;
+    stat = mmb_tile_at_1d(a, i, &tile); 
+    if(stat != MMB_OK) {
+       ERR("Failed to obtain tile %zu: %d\n", i, stat);
+       abort();
+    }
     size_t count = fwrite(&MMB_IDX_1D(tile, tile->lower[0], char),
                           1,
                           tile->upper[0]-tile->lower[0],
@@ -477,19 +475,16 @@ archiver_flush_to_disk(const char *name, mmbArray *a)
       ERR("Incomplete write on tile %d of %s\n", i, name);
       abort();
     }
-    stat = mmb_tile_iterator_next(it);
-    if(stat != MMB_OK) {
-      ERR("Failed to increment tile iterator\n");
-      abort();
-    }
   }
   if(0!=fclose(dst)) {
     ERR("Failed to close %s after writing\n", name);
     abort();
   }
-  stat = mmb_tile_iterator_destroy(it);
+
+  // destroy tiling dim info:
+  stat = mmb_dimensions_destroy(tiling_dims);
   if(stat != MMB_OK) {
-    ERR("Failed to free tile iterator\n");
+    ERR("Failed to destroy tiling dimension info\\n");
     abort();
   }
 
@@ -551,10 +546,10 @@ archiver_thread_fun(void *closure)
       }
       s = mstro_cdo_attribute_set(incoming[i],
                                   MSTRO_ATTR_CORE_CDO_RAW_PTR,
-                                  incoming_buffers[i]);
+                                  incoming_buffers[i], false);
       s |= mstro_cdo_attribute_set(incoming[i],
                                    MSTRO_ATTR_CORE_CDO_SCOPE_LOCAL_SIZE,
-                                   &announcement->cdo_size);
+                                   &announcement->cdo_size, true);
       //      INFO("archiver cdo %d incoming buffer %p\n", i, incoming_buffers[i]);
 
       if(s!=MSTRO_OK) {
@@ -672,10 +667,10 @@ producer_thread_fun(void *closure)
 
       s = mstro_cdo_attribute_set(outgoing[i],
                                   MSTRO_ATTR_CORE_CDO_RAW_PTR,
-                                  outgoing_buffers[i]);
+                                  outgoing_buffers[i], false);
       s |= mstro_cdo_attribute_set(outgoing[i],
                                    MSTRO_ATTR_CORE_CDO_SCOPE_LOCAL_SIZE,
-                                   &announcement->cdo_size);
+                                   &announcement->cdo_size, true);
 
       if(s!=MSTRO_OK) {
         ERR("Failed to add outgoing buffer to CDO %s\n",
@@ -804,10 +799,10 @@ consumer_thread_fun(void *closure)
       }
       s = mstro_cdo_attribute_set(incoming[i],
                                   MSTRO_ATTR_CORE_CDO_RAW_PTR,
-                                  incoming_buffers[i]);
+                                  incoming_buffers[i], false);
       s |= mstro_cdo_attribute_set(incoming[i],
                                    MSTRO_ATTR_CORE_CDO_SCOPE_LOCAL_SIZE,
-                                   &announcement->cdo_size);
+                                   &announcement->cdo_size, true);
       //      INFO("consumer cdo %d incoming buffer %p\n", i, incoming_buffers[i]);
 
       if(s!=MSTRO_OK) {
diff --git a/include/Makefile.am b/include/Makefile.am
index c42a4d21de1f335fb9d4cccea28febaf1373da0b..1eb5531c2e718b23d16232b9e0c8d037f12db54d 100644
--- a/include/Makefile.am
+++ b/include/Makefile.am
@@ -72,7 +72,6 @@ noinst_HEADERS = \
         maestro/i_uthash.h     \
 	maestro/i_utlist.h     \
 	maestro/i_utstack.h    \
-	maestro/i_tpl.h        \
 	maestro/i_pool_manager_protocol.h \
 	maestro/i_pool_manager_registry.h \
 	maestro/i_pool_manager.h \
diff --git a/include/maestro/attributes.h b/include/maestro/attributes.h
index 73bf061dbf876ac7460977ce44370d5a9357e370..c1d5cdff31d7aae08738277f4a81698205b57c58 100644
--- a/include/maestro/attributes.h
+++ b/include/maestro/attributes.h
@@ -43,9 +43,9 @@ extern "C" {
 #include <stdlib.h>
 #include <time.h>
 #include <stdbool.h>
-  
-#include "maestro/status.h"
 
+#include "maestro/status.h"
+#include "mamba.h"
 
 /**@addtogroup MSTRO_Core
  **@{
@@ -99,14 +99,14 @@ extern const char *MSTRO_ATTR_CORE_CDO_MAESTRO_PROVIDED_STORAGE;
 /**@brief maestro.core.cdo.raw-ptr
  **
  ** The raw pointer value in user-accessible address space for the
- ** local data of the CDO, contiguous. 
+ ** local data of the CDO, contiguous.
  **
  ** Interacts with maestro.core.cdo.scope.local-size.
- ** 
+ **
  ** Get only be set. To query use mstro_cdo_access_ptr().
  **
  ** If unavailable (non-contiguous CDO etc.) the value returned will be NULL.
- ** 
+ **
  ** If set, the maestro.core.cdo.scope.local-size value must also be set.
  **
  ** C-side data type: `void*'
@@ -119,7 +119,7 @@ extern const char *MSTRO_ATTR_CORE_CDO_RAW_PTR;
  ** Local size of the data pointed by raw-ptr.
  **
  ** Interacts with maestro.core.cdo.scope.raw-ptr.
- ** 
+ **
  ** If set, the maestro.core.cdo.scope.raw-ptr value must also be set.
  **
  ** C-side data type: `uint64_t'
@@ -145,6 +145,27 @@ extern const char *MSTRO_ATTR_CORE_CDO_ISGROUP;
  **/
 extern const char *MSTRO_ATTR_CORE_CDO_GROUP_MEMBERS;
 
+/**@brief mstro.core.cdo.isdistributed
+ **
+ ** Indicates that the CDO data are distributed among various components.  The
+ ** current CDO only has part of the whole CDO data.
+ ** Users are not expected to utilize or set this attribute.
+ **
+ ** C-side data type: `bool`
+ ** default value: False
+ **/
+extern const char *MSTRO_ATTR_CORE_CDO_ISDISTRIBUTED;
+
+/**@brief mstro.core.cdo.dist-layout
+ **
+ ** Layout of the distributed CDO, i.e. which piece of the whole CDO data the
+ ** current CDO represents.
+ **
+ ** C-side data type: `mmbLayout`
+ ** default value: NIL
+ **/
+extern const char *MSTRO_ATTR_CORE_CDO_DIST_LAYOUT;
+
 
 /* TODO DOC */
 extern const char* MSTRO_ATTR_CORE_CDO_MAMBA_ARRAY;
@@ -156,7 +177,7 @@ extern const char* MSTRO_ATTR_CORE_CDO_LAYOUT_NDIMS;
   /* FIXME: these need to be replaced by schema names/lookups */
 #define MSTRO_ATTR_CORE_CDO_LAYOUT_ORDER_ROWMAJOR 0
 #define MSTRO_ATTR_CORE_CDO_LAYOUT_ORDER_COLMAJOR 1
-  
+
 extern const char* MSTRO_ATTR_CORE_CDO_LAYOUT_ORDER;
 extern const char* MSTRO_ATTR_CORE_CDO_LAYOUT_DIMS_SIZE;
 
@@ -238,24 +259,29 @@ mstro_status
 mstro_timestamp_to_tm_local(const mstro_timestamp *tsp, struct tm *tmp);
 
 
+/** Parse mmbLayout from string */
+mstro_status
+mstro_mmbLayout_parse(const char *str, mmbLayout **dist_layout);
+
+
 /** Built-in Maestro attribute data types */
 enum mstro_cdo_attr_value_type {
   /** Invalid value */
   MSTRO_CDO_ATTR_VALUE_INVALID = 0,
 
   /** Not applicable, since the key has not been found */
-  MSTRO_CDO_ATTR_VALUE_NA, 
+  MSTRO_CDO_ATTR_VALUE_NA,
 
   /** Not available (no value for key has been set) */
-  MSTRO_CDO_ATTR_VALUE_NONE, 
+  MSTRO_CDO_ATTR_VALUE_NONE,
 
   /* types castable to the respective C types */
   MSTRO_CDO_ATTR_VALUE_bool,     /**< castable to bool */
   MSTRO_CDO_ATTR_VALUE_int32,    /**< castable to int32_t */
   MSTRO_CDO_ATTR_VALUE_uint32,   /**< castable to uint32_t */
-  MSTRO_CDO_ATTR_VALUE_int64,    /**< castable to int64_t */ 
+  MSTRO_CDO_ATTR_VALUE_int64,    /**< castable to int64_t */
   MSTRO_CDO_ATTR_VALUE_uint64,   /**< castable to uint64_t */
-  MSTRO_CDO_ATTR_VALUE_float,    /**< castable to float */ 
+  MSTRO_CDO_ATTR_VALUE_float,    /**< castable to float */
   MSTRO_CDO_ATTR_VALUE_double,   /**< castable to double */
 
   /* these are castable to const char* */
@@ -271,20 +297,14 @@ enum mstro_cdo_attr_value_type {
 
   MSTRO_CDO_ATTR_VALUE_pointer,   /**< castable to void*. Not transmitted to other localities. */
 
-  /* WIP: */
-  /* Mamba layout spec */
-  MSTRO_CDO_ATTR_VALUE_layout,
-
-  /* Distribution spec */
-  MSTRO_CDO_ATTR_VALUE_distribution,
-
+  MSTRO_CDO_ATTR_VALUE_mmblayout,  /**< mmbLayout value, mainly for distributed CDO support */
 
   /* this is the number of different value types supported */
   MSTRO_CDO_ATTR_VALUE__MAX
 };
 
 
-  // All data types have an implicit length, so we need a wrapper 
+  // All data types have an implicit length, so we need a wrapper
 /** A data type to hold blobs.
  **
  ** This provides a wrapper around the user memory region so that we can know the size of it.
@@ -307,9 +327,9 @@ mstro_blob_create(size_t len, void *data, mstro_blob **result_p);
  */
 mstro_status
 mstro_blob_dispose(mstro_blob *b);
-  
+
 /** Opaque CDO handle. */
-typedef struct mstro_cdo_ *mstro_cdo; 
+typedef struct mstro_cdo_ *mstro_cdo;
 
 /**
  ** @brief Add (*key*, *val*) pair to attribute set of *cdo*
@@ -318,7 +338,7 @@ typedef struct mstro_cdo_ *mstro_cdo;
  ** @param[in]	key			Attribute key string
  ** @param[in]  val			Pointer to the value to be set
  ** @param[in]	copy_value  Create an internal allocation for the value and
- **							copy @arg val into it 
+ **							copy @arg val into it
  **
  ** BEWARE: If copy_value is set to false, the memory pointed to by val must
  ** remain valid for the entire lifetime of the CDO. Stack-allocated variables
@@ -354,7 +374,7 @@ mstro_cdo_attribute_get(mstro_cdo cdo, const char*key,
 
 /**
  ** @brief Add *keyval_in_yaml* pair of size *len* to attribute set of *cdo*
- ** 
+ **
  ** Consider using predefined constants for the key (see attributes.h) as they will be looked up more efficiently.
  **
  ** @param[in]	cdo		A CDO handle
@@ -367,8 +387,8 @@ mstro_cdo_attribute_set_yaml(mstro_cdo cdo, const char *keyval_in_yaml);
 
 /**
  ** @brief Set default attributes.
- ** 
- ** Initialize attribute table of *cdo* using default values. 
+ **
+ ** Initialize attribute table of *cdo* using default values.
  **
  ** @param[in]	cdo		A CDO handle
  **
diff --git a/include/maestro/cdo.h b/include/maestro/cdo.h
index dcc3e682cb419f4964f31f5430da6b706a3ca8d9..c96c81a208b607c9f6bf85142bb834510ba5bcb8 100644
--- a/include/maestro/cdo.h
+++ b/include/maestro/cdo.h
@@ -243,6 +243,18 @@ mstro_cdo_declaration_seal(mstro_cdo cdo);
 mstro_status
 mstro_cdo_dispose(mstro_cdo cdo);
 
+/**
+ ** @brief Reuse disposed CDO.
+ **
+ ** Disposes the *cdo* and declares it again, keeping the same handle, for
+ ** speed and convenience. User attributes are also preserved.
+ **
+ ** @param[in] cdo 	A CDO handle.
+ ** @returns A status code, ::MSTRO_OK on success.
+ **/
+mstro_status
+mstro_cdo_dispose_and_reuse(mstro_cdo cdo);
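+
+/* Usage sketch (hypothetical surrounding calls; error handling omitted):
+ *
+ *   mstro_cdo_withdraw(cdo);             // return the CDO from the pool
+ *   mstro_cdo_dispose_and_reuse(cdo);    // same handle, re-DECLARED
+ *   // ... adjust attributes, seal and offer again ...
+ */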
+
 
 
 /**@} (end of group MSTRO_CDO) */
diff --git a/include/maestro/core.h b/include/maestro/core.h
index 38918899689034ee4270156b7e67949884e35fe2..444f41786dd60a86459de3cbe81865d0c298ff5b 100644
--- a/include/maestro/core.h
+++ b/include/maestro/core.h
@@ -111,6 +111,19 @@ extern "C" {
   /** the number of characters needed to print an @ref mstro_app_id value using @ref PRIappid */
   #define MSTRO_APP_ID_STR_LEN (21) // decimal, no sign, no NUL
 
+  /** type to express nanosecond time points (since an arbitrary point in time) */
+  typedef uint64_t mstro_nanosec_t;
+
+  /** Printing support */
+  #define PRInanosec PRIu64
+  
+  /** Return the current time */
+  mstro_nanosec_t
+  mstro_clock(void);
+  
+  /** nanoseconds-per-second multiplier */
+  #define NSEC_PER_SEC ((mstro_nanosec_t)1000*1000*1000)
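+
+  /* Usage sketch: timing an interval with mstro_clock()
+   *
+   *   mstro_nanosec_t t0 = mstro_clock();
+   *   // ... work ...
+   *   mstro_nanosec_t dt = mstro_clock() - t0;
+   *   printf("took %" PRInanosec " ns (%.3f s)\n", dt, (double)dt/NSEC_PER_SEC);
+   */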
+  
 
   /**@} (end of group MSTRO_Core) */
 
diff --git a/include/maestro/env.h b/include/maestro/env.h
index 4a21202d2338dffd16f79651ae171146197b1790..542236799ce06a4f8626c6dfbb78409ceb57132e 100644
--- a/include/maestro/env.h
+++ b/include/maestro/env.h
@@ -239,6 +239,27 @@
  **/
 #define MSTRO_ENV_MIO_CONFIG "MSTRO_MIO_CONFIG"
 
+
+/**
+ ** @brief Flag to enable higher network security on Cray GNI interfaces
+ **
+ ** By default, Cray GNI (Aries) networks allow only jobs of the same
+ ** job allocation to use the HSN between each other. Despite using
+ ** user-id based DRC credentials (which allows cross-talk for jobs on
+ ** different nodes if the user's UID matches), jobs running on the
+ ** same node of an allocation can not talk to each other unless we
+ ** use DRC_FLAGS_FLEX_CREDENTIAL.
+ **
+ ** By default we do set DRC_FLAGS_FLEX_CREDENTIAL, as that allows
+ ** users to schedule jobs of the same workflow on the same or
+ ** different nodes without worrying about this. If you are sure you
+ ** will only run one job per compute node, consider enabling @ref
+ ** MSTRO_ENV_DRC_NON_FLEX to disable flex-credential usage.
+ **
+ **/
+#define MSTRO_ENV_DRC_NON_FLEX "MSTRO_DRC_NON_FLEX"
+
+
 /**@} (end of group MSTRO_ENV) */
 
 #endif
diff --git a/include/maestro/i_attributes.h b/include/maestro/i_attributes.h
index 36d5af9b96fa175429700f9ba829be0a059a01e0..830205415010526dadb3688f083d5dbeec1c4a4b 100644
--- a/include/maestro/i_attributes.h
+++ b/include/maestro/i_attributes.h
@@ -93,26 +93,26 @@ typedef struct mstro_cdo_attr_scope_* mstro_cdo_attr_scope;
  ** This is the Internal Attributes API, as developed for D3.2
  **/
 
-/**@brief Create a fresh attribute table 
- ** 
+/**@brief Create a fresh attribute table
+ **
  ** @param[out]	result	Freshly allocated table
  */
 mstro_status
 mstro_cdo_attr_table__alloc(mstro_cdo_attr_table* result);
 
-/**@brief Destroy an attribute table 
+/**@brief Destroy an attribute table
  **
- ** @param[in] 	tab	Attribute table to destroy	
+ ** @param[in] 	tab	Attribute table to destroy
  */
 mstro_status
 mstro_cdo_attr_table__destroy(mstro_cdo_attr_table tab);
 
-  
+
 /**@brief Query an attribute, based on string key.
  **
  ** An unsuccessful query will see *valtype* set to
  ** MSTRO_CDO_ATTR_VALUE_NA and *value_dst* set to NULL
- ** 
+ **
  ** Consider using the predefined constant string keys from
  ** attributes.h, they will make lookups faster.
  **
@@ -140,18 +140,34 @@ mstro_cdo_attr_table__lookup(const mstro_cdo cdo,
 
 /**@brief Insert an attribute, based on YAML string (key,value) pair
  **
- ** Update attributes of *cdo* using a YAML string *keyval_as_yaml* 
+ ** Update attributes of *cdo* using a YAML string *keyval_as_yaml*
  ** representing (key,value) pairs
  **
  ** @param[in]	cdo		A CDO handle
  ** @param[in]	keyval_in_yaml	(key,value) pair(s) in a YAML string format
  **
  ** @returns A status code, ::MSTRO_OK on success.
- **/  
+ **/
 mstro_status
 mstro_cdo_attr__insert_yaml(mstro_cdo cdo,
                                   const char *keyval_as_yaml);
 
+/** Parse irregular_1D mmbLayout from string for distributed CDOs */
+mstro_status
+mstro_parse_mmbLayout_irregular_1D(char *str, mmbLayout **dist_layout);
+
+/* Parse a size_t array from a string, contained in brackets, e.g. "[100, 200, 300]" */
+static inline
+mstro_status
+mstro_parse_number_array(char *str, size_t *num);
+
+
+/* Automatically set the cdo.isdistributed attribute if the user sets any of the
+   distributed layout attributes */
+static inline
+mstro_status
+mstro_cdo_attribute_set_isdistributed(mstro_cdo cdo, const char* key);
+
 
 /**@} (end of group MSTRO_I_Attributes) */
 /**@} (end of addtogroup MSTRO_Internal) */
diff --git a/include/maestro/i_cdo.h b/include/maestro/i_cdo.h
index 5e2aca3e5cccd460e31a874964ccf2ab63c3c7da..748d43540d1be6531e333bab8ae533bb79620756 100644
--- a/include/maestro/i_cdo.h
+++ b/include/maestro/i_cdo.h
@@ -114,6 +114,12 @@ typedef uint32_t mstro_cdo_state;
 /** CDO returned from global pool by RETRACT */
 #define MSTRO_CDO_STATE_RETRACTED_GLOBALLY (1U<<10)
 //#define MSTRO_CDO_STATE_DISPOSED_GLOBALLY (1U<<11)
+/** CDO REQUIRED to local pool. This is a temporary state when the CDO
+ * has made it to the local pool, but the global REQUIRE is still
+ * outstanding. If there is no pool manager it will quickly move to
+ * ::MSTRO_CDO_STATE_REQUIRED too. */
+#define MSTRO_CDO_STATE_REQUIRED_LOCALLY (1U<<11)
+
 /** CDO returned from pool */
 #define MSTRO_CDO_STATE_RETURNED  (  MSTRO_CDO_STATE_WITHDRAWN  \
                                    | MSTRO_CDO_STATE_DEMANDED   \
@@ -142,6 +148,8 @@ typedef uint32_t mstro_cdo_state;
  * equivalent to OFFERED */
 #define MSTRO_CDO_STATE_SATISFIED      (1U<<31)
 
+
+
 /** CDO state flags that are to be preserved across state changes */
 #define MSTRO_CDO_STATE_FLAGS                   \
     (MSTRO_CDO_STATE_INJECTED                   \
@@ -418,7 +426,8 @@ struct mstro_cdo_ {
   struct mstro_cdo_id gid;       /**< the globally unique ID */
 
   char *name;                    /**< user-provided name */
-
+  _Atomic int64_t n_segments;    /**< number of outstanding transmissions 
+                                  required to fill all the required data for a distributed CDO */
 
   /* Cached copies of the data attributes. Caching occirs at SEAL
    * time, after which the attribute setters will refuse chaning the
@@ -471,6 +480,8 @@ mstro_increment_async_blocked_counter(mstro_request request);
 /** set state. Notify all waiters on state change queue. */
 void
 mstro_cdo_state_set(mstro_cdo cdo, mstro_cdo_state s);
+void
+mstro_cdo_state_set_safe_flags(mstro_cdo cdo, mstro_cdo_state s);
 
 /** get state. May be stale by the time you read it */
 mstro_cdo_state
diff --git a/include/maestro/i_drc.h b/include/maestro/i_drc.h
index ba89b96d3cc4d9f44580648af28da34a1cc3c54c..6f7802b32d0bf5057a14af7bc9d99d1bc59e321b 100644
--- a/include/maestro/i_drc.h
+++ b/include/maestro/i_drc.h
@@ -66,11 +66,21 @@ mstro_status
 mstro_drc_get_oob_string(char **result_p,
                          const mstro_drc_info info);
 
+/** same, but return numeric DRC credential ID */
+mstro_status
+mstro_drc_get_credential(uint32_t *result_p,
+                         const mstro_drc_info info);
+
 /** Create drc object from OOB string info */
 mstro_status
 mstro_drc_init_from_oob_string(mstro_drc_info *result_p,
                                const char *info_string);
 
+/** same, but from DRC credential ID */
+mstro_status
+mstro_drc_init_from_credential(mstro_drc_info *result_p,
+                               uint32_t credential);
+
 
 /**@} (InternalDRC) */
 /**@} (Internal) */ 
diff --git a/include/maestro/i_ofi.h b/include/maestro/i_ofi.h
index 80063292482b576a4a0050ef9154e989507b7d93..29c9ec4b4f2ce18e63d2a904fd0bac8d07ddf91d 100644
--- a/include/maestro/i_ofi.h
+++ b/include/maestro/i_ofi.h
@@ -47,6 +47,7 @@
 
 
 #include "maestro/status.h"
+#include "protocols/mstro_ep.pb-c.h"
 
 #include <stdatomic.h>
 
@@ -113,73 +114,6 @@ mstro_status
 mstro_ofi__msg_context_destroy(mstro_ofi_msg_context ctx);
 
 
-enum mstro_endpoint_type {
-  MSTRO_EP_INVALID = 0, /**< an illegal value */
-  /** libfabric native endpoints. FIXME: not all supported yet. */
-  MSTRO_EP_OFI_IN4 = 1, /**< open fabric FI_SOCKADDR_IN */
-  MSTRO_EP_OFI_IN6,     /**< open fabric FI_SOCKADDR_IN6 */
-  MSTRO_EP_OFI_IB,      /**< open fabric FI_SOCKADDR_IB */
-  MSTRO_EP_OFI_PSMX,    /**< open fabric FI_ADDR_PSMX */
-  MSTRO_EP_OFI_PSMX2,   /**< open fabric FI_ADDR_PSMX2 */
-  MSTRO_EP_OFI_GNI,     /**< open fabric FI_ADDR_GNI */
-  MSTRO_EP_OFI_BGQ,     /**< open fabric FI_ADDR_BGQ */
-  MSTRO_EP_OFI_MLX,     /**< open fabric FI_ADDR_MLX */
-  MSTRO_EP_OFI_STR,     /**< open fabric FI_ADDR_STR */
-  /** other transport endpoints */
-  MSTRO_EP__MAX
-};
-
-static const
-char *mstro_ep_descriptor_names[MSTRO_EP__MAX] = {
-  [MSTRO_EP_INVALID]  = "MSTRO_EP_INVALID",
-  /** libfabric native endpoints. FIXME: not all supported yet. */
-  [MSTRO_EP_OFI_IN4]  = "MSTRO_EP_OFI_IN4",
-  [MSTRO_EP_OFI_IN6]  = "MSTRO_EP_OFI_IN6",
-  [MSTRO_EP_OFI_IB]   = "MSTRO_EP_OFI_IB",
-  [MSTRO_EP_OFI_PSMX] = "MSTRO_EP_OFI_PSMX",
-  [MSTRO_EP_OFI_PSMX2]= "MSTRO_EP_OFI_PSMX2",
-  [MSTRO_EP_OFI_GNI]  = "MSTRO_EP_OFI_GNI",
-  [MSTRO_EP_OFI_BGQ]  = "MSTRO_EP_OFI_BGQ",
-  [MSTRO_EP_OFI_MLX]  = "MSTRO_EP_OFI_MLX",
-  [MSTRO_EP_OFI_STR]  = "MSTRO_EP_OFI_STR",
-    /** other transport endpoints */
-};
-
-/** maximum length of an OFI key. FIXME: this should not exist, and we
- * should support arbitrary length keys. Will be fixed in #81 when
- * chaning the serialization of endpoints to protobuf */
-#define MSTRO_OFI_KEY_LEN_MAX 20
-
-/** for each endpoint type there needs to be a serialized descriptor */
-struct mstro_endpoint_descriptor_ {
-  enum mstro_endpoint_type type; /**< type of endpoint described */
-  union {
-    struct sockaddr_in  in4;      /**< for MSTRO_EP_OFI_IN4 */
-    struct sockaddr_in6 in6;      /**< for MSTRO_EP_OFI_IN6 */
-#ifdef HAVE_IB
-    struct sockaddr_ib  ib;       /**< for MSTRO_EP_OFI_IB */
-#endif
-    uint64_t            psmx;     /**< for MSTRO_EP_OFI_PSMX */
-    uint64_t            psmx2[2]; /**< for MSTRO_EP_OFI_PSMX2 */
-    uint64_t            gni[6];   /**< for MSTRO_EP_OFI_GNI */
-    uint64_t            bgq;      /**< for MSTRO_EP_OFI_BGQ */
-    uint64_t            mlx;      /**< for MSTRO_EP_OFI_MLX */
-    char *              str;      /**< for MSTRO_EP_OFI_STR */
-    void *              addr;     /**< generic 'whatever it is' reader-slot */
-  };
-  char *oob_cookie;               /**< a cookie (currently used for DRC, and only on GNI only) */
-  char *name;                     /**< a printable name */
-  /* temporary (until protobuf-based endpoint handling comes along) */
-  uint64_t info_addr;    /** well-known app descriptor: block memory address */
-  uint64_t info_keysize; /** well-known app descriptor: block memory key size */
-  char     info_key[MSTRO_OFI_KEY_LEN_MAX];     /** well-known app descriptor: block memory key */
-
-  /**< permit chaining descriptors in a singly-linked list */
-  struct mstro_endpoint_descriptor_ *next; 
-    
-};
-
-typedef struct mstro_endpoint_descriptor_ *mstro_endpoint_descriptor;
 
 /** structure holding information about an endpoint */
 struct mstro_endpoint {
@@ -198,61 +132,39 @@ struct mstro_endpoint {
   /** the address to be passed to clients */
   /* void *addr;            /\**< address buffer *\/ */
   /* size_t addrlen;        /\**< address length *\/ */
-  char *addr_serialized; /**< string-form of an endpoint
+    /* protobuf-based descriptor */
+  Mstro__Endpoint        *pbep;    /**< endpoint protocol and address */
+  Mstro__OfiCredential   *cred;    /**< credential -- NULL if none */
+  Mstro__OfiMemoryRegion *inforeg; /**< config block MR location */
+
+  char *serialized; /**< string-form of an endpoint
                           * for passing OOB */
-  mstro_endpoint_descriptor descr; /**< descriptor */
+
   struct mstro_endpoint *next; /**< make structure sinly-linked list linkable */
 };
 
 
-/** intermediate datastructure used in serialization */
-struct serialized_endpoint_element {
-  int64_t  type;         /**< enum mstro_endpoint converted to int */
-  char    *strval;       /**< base64-encoded version of tpl_map encoded
-                          * endpoint address; exact tpl_map format string
-                          * depends on TYPE */
-  char    *oob_cookie;   /**< DRC or other cookie for endpoint */
-  uint64_t info_addr;    /** well-known app descriptor: block memory address */
-  uint64_t info_keysize; /** well-known app descriptor: block memory key size */
-  char     info_key[MSTRO_OFI_KEY_LEN_MAX];     /** well-known app descriptor: block memory key */
-};
-/** we serialize the linked list as a variable-length array of tag
- * (32-bit unsigned integer) and string (result of serializing the
- * individual entry, without the tag, and base64-encoding the result)
- * and the memory registration info necessary to do RDMA reads of the
- * @ref g_mstro_component_info block.
- *
- *Each descriptor type needs to implement a suitable
- * serialization for the descriptor slot it uses; many can share the
- * same one though.*/
 mstro_status
-mstro_ep_desc_serialize(char **result_p,
-                        const struct mstro_endpoint * ep);
+mstro_appinfo_deserialize(const char *serialized_eps,
+                          Mstro__AppInfo **result_p);
 
 mstro_status
-mstro_ep_desc_deserialize(mstro_endpoint_descriptor *result_p,
-                          const char *serialized_eps);
+mstro_appinfo_serialize(const struct mstro_endpoint * ep,
+                        char **result_p);
 
-const char *
-mstro_ep_desc_describe(mstro_endpoint_descriptor desc);
 
 const char *
-mstro_endpoint_describe(struct mstro_endpoint *ep);
+mstro_endpoint_describe(const struct mstro_endpoint *ep);
 
-mstro_status
-mstro_ep_desc_to_ofi_addr(uint32_t *addr_format,
-                          void **addr,
-                          size_t *addrlen,
-                          const mstro_endpoint_descriptor epd);
 
 mstro_status
 mstro_mr_key_get(struct fi_info* fi, struct fid_mr* mr, 
                  uint8_t** mr_key, size_t* keysize, uint64_t* addr);
 
-mstro_status
-mstro_ofi__check_compatibility(bool *suitable_p,
-                               struct mstro_endpoint_descriptor_ *remote,
-                               struct mstro_endpoint *local);
+/* mstro_status */
+/* mstro_ofi__check_compatibility(bool *suitable_p, */
+/*                                const struct mstro_endpoint *remote, */
+/*                                const struct mstro_endpoint *local); */
 
 /**@brief Initialize OFI layer.
  **
@@ -295,7 +207,7 @@ mstro_ofi__submit_message_nowait(struct mstro_endpoint *ep, fi_addr_t dst,
 
 /** select the best match between remote and local endpoints */
 mstro_status
-mstro_ofi__select_endpoint(struct mstro_endpoint_descriptor_ *remote,
+mstro_ofi__select_endpoint(const Mstro__AppInfo *remote,
                            struct mstro_endpoint **local_p,
                            fi_addr_t *remote_addr_p);
 
diff --git a/include/maestro/i_pool.h b/include/maestro/i_pool.h
index 70a3c02e331bed9305e69cecacdd3a0c5dc8bfc0..dec6f464e87e6e44266cb2a7df37dea1bd453e1e 100644
--- a/include/maestro/i_pool.h
+++ b/include/maestro/i_pool.h
@@ -214,6 +214,19 @@ mstro_status
 mstro_pool_finalize(void);
 
 
+/**@brief Find a CDO in the pool by CDOID with a certain local-id
+ **
+ ** Locates a CDO by its CDOID.
+ **
+ ** *result is set to the CDO handle if found, or NULL if not.
+ **
+ **@returns A status code, ::MSTRO_OK on success, ::MSTRO_FAIL if not found
+ **/
+mstro_status
+mstro_pool__find_cdo_with_local_id(
+                                const struct mstro_cdo_id *cdoid,
+                                mstro_cdo *result);
+
 /**@brief Find a CDO in the pool by CDOID that has data
  **
  ** Locates a CDO by its CDOID that can provide the data of the CDO.
diff --git a/include/maestro/i_pool_manager_registry.h b/include/maestro/i_pool_manager_registry.h
index e8f385f173a0668fb695edcba765f30b9579f5d2..05780aa324a2b426e980ccd0c6de6776f07d363e 100644
--- a/include/maestro/i_pool_manager_registry.h
+++ b/include/maestro/i_pool_manager_registry.h
@@ -66,7 +66,7 @@ typedef uint64_t mstro_app_id;
 
 
 /* FIXME: can go once the 'addr' is out of the mstro_pm_app_registry_entry */
-#include <rdma/fabric.h> 
+#include <rdma/fabric.h>
 
 #include "maestro/i_event.h"
 
@@ -90,6 +90,18 @@ struct mstro_pm_app_registry_entry {
   bool dead;                 /**< If an app LEAVES the app-registry entry stays around in 'dead' state. That allows us to resurrect it on re-JOIN */
 };
 
+/** mstro_pm_candidates is a structure of arrays holding the applications we
+ * are going to contact to transfer all the pieces needed for a distributed
+ * CDO.
+ *
+ * The number of elements of the arrays inside mstro_pm_candidates is given by n_sources;
+**/
+typedef struct mstro_pm_candidates {
+  size_t n_sources; /* number of apps */
+  struct per_app_cdo_entries ** app;
+  uint64_t * local_id;
+  bool is_distributed;
+} mstro_pm_candidates;
+
 
 /** @brief Register a new app.
  **
@@ -190,6 +202,7 @@ mstro_status mstro_pm_cdo_registry_store_attributes(
     const mstro_app_id app_id,
     Mstro__Pool__Attributes *attributes);
 
+
 /** @brief Update CDO state on CDOID for APP_ID.
  **
  **/
@@ -219,7 +232,7 @@ mstro_pm_handle_demand_queue(void);
  ** it has not been DECLARED/SEALED by the requestor in the CDO
  ** registry, the pool manager must have injected it in
  ** ::MSTRO_CDO_STATE_INJECTED.
- ** 
+ **
  ** it is a wise decision.
  **/
 mstro_status
@@ -264,14 +277,54 @@ mstro_pm_cdo_registry_transfer_completed(const struct mstro_cdo_id *cdo_id,
  * are error codes.  It is not an error to call this when ORIGIN has
  * no live entry for ID -- the return value will be MSTRO_NOMATCH in
  * that case.
- * 
+ *
  **/
 mstro_status
 mstro_pm_cdo_app_match(mstro_app_id origin, const struct mstro_cdo_id *id,
                        const struct mstro_cdo_selector_ *cdo_selector);
 
+/** Convert the mapping between two distributed layouts found by mmb_layout_compute_intersection
+  * to a list of candidates (apps, local cdo-ids, offsets, lengths) that satisfy
+  * the requested distribution from a different source distribution.
+  * Return MSTRO_OK on success.
+**/
+static inline
+mstro_status
+mstro_pm__mmbLayoutIntersecton_to_candidates(
+                            mmbLayoutIntersection *intersection,
+                            size_t dst_index,
+                            mmbLayout *src_layout,
+                            struct per_app_cdo_entries *app_to_attributes_table,
+                            mstro_pm_candidates **candidates);
+
+
+/** Find an offered distributed cdo with the exact required layout
+  * (including index), i.e., a 1:1 mapping
+**/
+static inline
+mstro_status
+mstro_pm__find_cdo_with_layout(
+                          struct per_app_cdo_entries *app_to_attributes_table,
+                          mmbLayout *s_layout,
+                          struct per_app_cdo_entries **app,
+                          uint64_t *local_id);
+
+
+/** Create and allocate the mstro_pm_candidates needed to hold the list of
+ * applications and local cdo ids that fulfill the current demand
+**/
+static inline
+mstro_status
+mstro__pool_create_candidates(mstro_pm_candidates **candidates, size_t length, bool distribution);
+
+/** Destroy mstro_pm_candidates **/
+static inline
+mstro_status
+mstro_pm_candidates_destroy(mstro_pm_candidates *candidates);
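+/* Expected lifecycle, sketched under the assumption that the pool manager
+ * fills the arrays between creation and destruction (error handling omitted):
+ *
+ *   mstro_pm_candidates *c = NULL;
+ *   mstro__pool_create_candidates(&c, n, true);
+ *   // ... fill c->app[i] and c->local_id[i] for all i < n ...
+ *   mstro_pm_candidates_destroy(c);
+ */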
+
+
+
 /**@} (end of group MSTRO_I_PM_Registry) */
 /**@} (end of group MSTRO_Internal) */
 
 #endif /* MSTRO_POOL_MANAGER_REGISTRY_H_ */
- 
diff --git a/include/maestro/i_statistics.h b/include/maestro/i_statistics.h
index cde6ef819e5aa7d47b64b74c477aa3c22b3e59c4..9180440a6ea6cc3b8262204f0935ecda5c743ac9 100644
--- a/include/maestro/i_statistics.h
+++ b/include/maestro/i_statistics.h
@@ -42,6 +42,8 @@ extern "C" {
 #endif
 
 #include "maestro/status.h"
+#include "maestro/core.h"
+
 #include <stdint.h>
 #include <stdio.h>
 
@@ -89,15 +91,6 @@ mstro_stats_init(void);
 mstro_status
 mstro_stats_finalize(void);
 
-/** type to express nanosecond time points (since an arbitrary point in time) */
-typedef uint64_t mstro_nanosec_t;
-
-/** Printing support */
-#define PRInanosec PRIu64
-
-/** Return the current time */
-mstro_nanosec_t
-mstro_clock(void);
 
 /** Define a new label to collecting information */
 mstro_status
diff --git a/include/maestro/i_tpl.h b/include/maestro/i_tpl.h
deleted file mode 100644
index e57d8b791ba616a6efcb54f73678c8ee662d679e..0000000000000000000000000000000000000000
--- a/include/maestro/i_tpl.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
-Copyright (c) 2005-2013, Troy D. Hanson     http://troydhanson.github.com/tpl/
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#ifndef TPL_H
-#define TPL_H 
-
-#include <stddef.h>     /* size_t */
-
-#include <stdarg.h>  /* va_list */
-
-//#ifdef __INTEL_COMPILER
-//#include <tbb/tbbmalloc_proxy.h>
-//#endif /* Intel Compiler efficient memcpy etc */
-
-#ifdef _MSC_VER
-typedef unsigned int uint32_t;
-#else
-#include <inttypes.h>   /* uint32_t */
-#endif
-
-#if defined __cplusplus
-extern "C" {
-#endif
-
-#ifdef _WIN32
-#ifdef TPL_EXPORTS
-#define TPL_API __declspec(dllexport)
-#else							/*  */
-#ifdef TPL_NOLIB
-#define TPL_API
-#else
-#define TPL_API __declspec(dllimport)
-#endif /* TPL_NOLIB */
-#endif	/* TPL_EXPORTS*/
-#else
-#define TPL_API
-#endif
-
-/* bit flags (external) */
-#define TPL_FILE      (1 << 0)
-#define TPL_MEM       (1 << 1)
-#define TPL_PREALLOCD (1 << 2)
-#define TPL_EXCESS_OK (1 << 3)
-#define TPL_FD        (1 << 4)
-#define TPL_UFREE     (1 << 5)  
-#define TPL_DATAPEEK  (1 << 6)  
-#define TPL_FXLENS    (1 << 7)  
-#define TPL_GETSIZE   (1 << 8)
-/* do not add flags here without renumbering the internal flags! */
-
-/* flags for tpl_gather mode */
-#define TPL_GATHER_BLOCKING    1
-#define TPL_GATHER_NONBLOCKING 2
-#define TPL_GATHER_MEM         3
-
-/* Hooks for error logging, memory allocation functions and fatal */
-typedef int (tpl_print_fcn)(const char *fmt, ...);
-typedef void *(tpl_malloc_fcn)(size_t sz);
-typedef void *(tpl_realloc_fcn)(void *ptr, size_t sz);
-typedef void (tpl_free_fcn)(void *ptr);
-typedef void (tpl_fatal_fcn)(const char *fmt, ...);
-
-typedef struct tpl_hook_t {
-    tpl_print_fcn *oops;
-    tpl_malloc_fcn *malloc;
-    tpl_realloc_fcn *realloc;
-    tpl_free_fcn *free;
-    tpl_fatal_fcn *fatal;
-    size_t gather_max;
-} tpl_hook_t;
-
-typedef struct tpl_node {
-    int type;
-    void *addr;
-    void *data;                  /* r:tpl_root_data*. A:tpl_atyp*. ow:szof type */
-    int num;                     /* length of type if its a C array */
-    size_t ser_osz;              /* serialization output size for subtree */
-    struct tpl_node *children;   /* my children; linked-list */
-    struct tpl_node *next,*prev; /* my siblings (next child of my parent) */
-    struct tpl_node *parent;     /* my parent */
-} tpl_node;
-
-/* used when un/packing 'B' type (binary buffers) */
-typedef struct tpl_bin {
-    void *addr;
-    uint32_t sz;
-} tpl_bin;
-
-/* for async/piecemeal reading of tpl images */
-typedef struct tpl_gather_t {
-    char *img;
-    int len;
-} tpl_gather_t;
-
-/* Callback used when tpl_gather has read a full tpl image */
-typedef int (tpl_gather_cb)(void *img, size_t sz, void *data);
-
-/* Prototypes */
-TPL_API tpl_node *tpl_map(char *fmt,...);       /* define tpl using format */
-TPL_API void tpl_free(tpl_node *r);             /* free a tpl map */
-TPL_API int tpl_pack(tpl_node *r, int i);       /* pack the n'th packable */
-TPL_API int tpl_unpack(tpl_node *r, int i);     /* unpack the n'th packable */
-TPL_API int tpl_dump(tpl_node *r, int mode, ...); /* serialize to mem/file */
-TPL_API int tpl_load(tpl_node *r, int mode, ...); /* set mem/file to unpack */
-TPL_API int tpl_Alen(tpl_node *r, int i);      /* array len of packable i */
-TPL_API char* tpl_peek(int mode, ...);         /* sneak peek at format string */
-TPL_API int tpl_gather( int mode, ...);        /* non-blocking image gather */
-TPL_API int tpl_jot(int mode, ...);            /* quick write a simple tpl */
-
-TPL_API tpl_node *tpl_map_va(char *fmt, va_list ap);
-
-#if defined __cplusplus
-    }
-#endif
-
-#endif /* TPL_H */
-
diff --git a/include/maestro/pool.h b/include/maestro/pool.h
index a37e1d2d31ae19e96c1c98baecfe657cdbc0eb21..532b7282f09c73bf78020579a03c6d89bb14cb90 100644
--- a/include/maestro/pool.h
+++ b/include/maestro/pool.h
@@ -387,6 +387,7 @@ mstro_subscribe(mstro_cdo_selector cdos, mstro_pool_event_kinds events,
 struct mstro_pool_event_ {
   enum mstro_pool_event_kind kind;  /**< the event kind */
   uint64_t serial;             /**< the event identifier */
+  mstro_nanosec_t  ctime;      /**< the time the event was created at the source */
   union {
     /* for DECLARE events */
     struct {
diff --git a/maestro/Makefile.am b/maestro/Makefile.am
index 0defd90d80479e1fa6748934199f114ee6416ecd..3287ce980df2e10034ac05400bc53aa1a60a4d5a 100644
--- a/maestro/Makefile.am
+++ b/maestro/Makefile.am
@@ -58,7 +58,7 @@ libmaestro_core_la_SOURCES = \
         uuid_str.c uuid_ui128.c uuid_ui64.c \
         base64.c\
         misc.c \
-	env.c logging.c tpl.c \
+	env.c logging.c \
         statistics.c \
         pool.c \
 	pool_manager_protocol.c \
diff --git a/maestro/attributes.c b/maestro/attributes.c
index 501170a41d27247172d53cbff399ab8567562a55..0952b4393390dd47f3ea34746832bd3c47e560df 100644
--- a/maestro/attributes.c
+++ b/maestro/attributes.c
@@ -5,21 +5,21 @@
 
 /*
  * Copyright (C) 2019 Cray Computer GmbH
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
- * 
+ *
  * 1. Redistributions of source code must retain the above copyright notice, this
  * list of conditions and the following disclaimer.
- * 
+ *
  * 2. Redistributions in binary form must reproduce the above copyright notice,
  * this list of conditions and the following disclaimer in the documentation
  * and/or other materials provided with the distribution.
- * 
+ *
  * 3. Neither the name of the copyright holder nor the names of its contributors
  * may be used to endorse or promote products derived from this software without
  * specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -37,7 +37,7 @@
 #include "attributes/maestro-schema.h"
 
 #include "c-timestamp/timestamp.h"
-
+#include <ctype.h>
 #include <inttypes.h>
 
 
@@ -59,10 +59,10 @@ const char *MSTRO_ATTR_CORE_CDO_ALLOCATE_NOW =
 const char *MSTRO_ATTR_CORE_CDO_MAESTRO_PROVIDED_STORAGE =
     ".maestro.core.cdo.maestro-provided-storage";
 
-const char *MSTRO_ATTR_CORE_CDO_RAW_PTR = 
+const char *MSTRO_ATTR_CORE_CDO_RAW_PTR =
     ".maestro.core.cdo.raw-ptr";
 
-const char *MSTRO_ATTR_CORE_CDO_MAMBA_ARRAY = 
+const char *MSTRO_ATTR_CORE_CDO_MAMBA_ARRAY =
     ".maestro.core.cdo.mamba-array";
 
 const char *MSTRO_ATTR_CORE_CDO_SCOPE_LOCAL_SIZE =
@@ -86,7 +86,11 @@ const char *MSTRO_ATTR_CORE_CDO_ISGROUP =
 const char *MSTRO_ATTR_CORE_CDO_GROUP_MEMBERS =
     ".maestro.core.cdo.group-members";
 
+const char *MSTRO_ATTR_CORE_CDO_ISDISTRIBUTED =
+    ".maestro.core.cdo.isdistributed";
 
+const char *MSTRO_ATTR_CORE_CDO_DIST_LAYOUT =
+      ".maestro.core.cdo.dist-layout";
 
 /* .... */
 
@@ -159,16 +163,37 @@ mstro_cdo_attribute_set(mstro_cdo cdo, const char* key, void* val, bool copy_val
   } else {
     fqkey=key;
   }
-  
+
   mstro_status status
       = mstro_attribute_dict_set(cdo->attributes, fqkey,
                                  MSTRO_CDO_ATTR_VALUE_INVALID, /* we dont help in checking */
                                  val, copy_value);
+
+  /* Automatically set the cdo.isdistributed attribute if the user sets any of the
+     distributed layout attributes */
+  if (status == MSTRO_OK) {
+    status = mstro_cdo_attribute_set_isdistributed(cdo, key);
+  }
+
   if(tmpfqkey)
     free(tmpfqkey);
   return status;
 }
 
+static inline
+mstro_status
+mstro_cdo_attribute_set_isdistributed(mstro_cdo cdo, const char* key)
+{
+  mstro_status status = MSTRO_OK;
+  bool val = true;
+
+  if ((strcmp(key, MSTRO_ATTR_CORE_CDO_DIST_LAYOUT) == 0)){
+    status = mstro_cdo_attribute_set(cdo, MSTRO_ATTR_CORE_CDO_ISDISTRIBUTED, &val, true);
+  }
+
+  return status;
+}
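+/* Usage sketch: setting the distributed-layout attribute is enough to mark a
+ * CDO as distributed; the layout string format is the one accepted by
+ * mstro_mmbLayout_parse() in this file. `cdo` is assumed to be declared.
+ *
+ *   const char *layout = "MMB_IRREGULAR, 1, 0, 8, 3, [0 1 2], [1 1 1]";
+ *   mstro_cdo_attribute_set(cdo, MSTRO_ATTR_CORE_CDO_DIST_LAYOUT,
+ *                           (void*)layout, true);
+ *   // .maestro.core.cdo.isdistributed is now set to true as well
+ */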
+
 mstro_status
 mstro_cdo_attribute_get(mstro_cdo cdo, const char* key,
                         enum mstro_cdo_attr_value_type *type,
@@ -215,14 +240,14 @@ mstro_cdo_attribute_set_default(mstro_cdo cdo)
   mstro_status s= mstro_attribute_dict_get_schema(cdo->attributes, &schema);
   if(s!=MSTRO_OK)
     return s;
-  
+
   return mstro_attribute_dict_set_defaults(schema,
                                            true,
                                            &(cdo->attributes));
 }
 
 mstro_status
-mstro_cdo_attribute_set_yaml(mstro_cdo cdo, const char* keyval_in_yaml) 
+mstro_cdo_attribute_set_yaml(mstro_cdo cdo, const char* keyval_in_yaml)
 {
   if(cdo==NULL || keyval_in_yaml==NULL)
     return MSTRO_INVARG;
@@ -236,7 +261,7 @@ mstro_cdo_attribute_set_yaml(mstro_cdo cdo, const char* keyval_in_yaml)
   mstro_status s= mstro_attribute_dict_get_schema(cdo->attributes, &schema);
   if(s!=MSTRO_OK)
     return s;
-  
+
   return mstro_attributes_parse(schema,
                                 keyval_in_yaml,
                                 &cdo->attributes,
@@ -257,7 +282,7 @@ mstro_timestamp_parse(const char *str, size_t len, mstro_timestamp *tsp)
   } else {
     ts = (timestamp_t*)tsp;
   }
-  
+
   int err = timestamp_parse(str, len, ts);
   if(err==0)
     return MSTRO_OK;
@@ -265,6 +290,137 @@ mstro_timestamp_parse(const char *str, size_t len, mstro_timestamp *tsp)
     return MSTRO_FAIL;
 }
 
+static inline
+mstro_status
+mstro_parse_number_array(char *str, size_t *num) {
+  char *token, *tmp;
+  const char *sep = " ";
+  size_t i = 0;
+  size_t len;
+  /* trim leading and trailing spaces */
+  while(isspace((unsigned char)*str)) str++;
+  len = strlen(str);
+  while(len > 0 && isspace((unsigned char)str[len-1])) len--;
+
+  /* the first and last remaining characters must be '[' and ']' */
+  assert(len >= 2);
+  assert(str[0] == '[');
+  assert(str[len-1] == ']');
+
+  /* exclude the first character '[' */
+  str++;
+  /* exclude the last character ']' */
+  str[len-2] = 0;
+
+  /* get the numbers one by one */
+  token = strtok_r(str,sep, &tmp);
+  while(token != NULL) {
+    sscanf(token, "%zu", &num[i]);
+    token = strtok_r(NULL,sep, &tmp);
+    i++;
+  }
+
+  return MSTRO_OK;
+}
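+/* Worked example (sketch): parsing one bracketed token into a caller-sized
+ * array. The caller must size `num` to n_blocks beforehand, as the function
+ * below does for offsets/lengths.
+ *
+ *   size_t num[3];
+ *   char buf[] = " [0 1 2]";
+ *   mstro_parse_number_array(buf, num);   // num = {0, 1, 2}
+ */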
+/* dist_irregular_1D ... mmbLayoutType, n_dims, index, element_size_bytes, n_blocks, offsets, lengths */
+/* example "MMB_IRREGULAR, 1, 0,  8, 3, [0 1 2], [1 1 1]"*/
+mstro_status
+mstro_parse_mmbLayout_irregular_1D(char *str, mmbLayout **dist_layout) {
+  const char *sep = ",";
+  char * token, *tmp;
+  mmbError mmb_status;
+  size_t index, temp_int;
+  size_t n_blocks;
+  size_t *offsets;
+  size_t *lengths;
+  size_t element_size_bytes;
+
+
+  token = strtok_r(str, sep, &tmp);
+  DEBUG("parsing token %s \n", token);
+
+  /** Reading layout type ... should be MMB_IRREGULAR **/
+  if(strcmp(token, "MMB_IRREGULAR") != 0 ) {
+    ERR("Cannot parse other mmb_layout types, only MMB_IRREGULAR is supported\n");
+    return MSTRO_INVARG;
+  }
+
+  /* reading n_dims ... should always be 1 */
+  token = strtok_r(NULL, sep, &tmp);
+  DEBUG("parsing token %s \n", token);
+  sscanf(token, "%zu", &temp_int);
+  if (temp_int != 1 ) {
+    ERR("Cannot parse %zu-dimensional layouts, only 1D mmbLayout is supported\n", temp_int);
+    return MSTRO_INVARG;
+  }
+
+  /*reading index ...*/
+  token = strtok_r(NULL, sep, &tmp);
+  DEBUG("parsing token %s \n", token);
+  sscanf(token, "%zu", &index);
+
+  /*reading element_size_bytes ...*/
+  token = strtok_r(NULL, sep, &tmp);
+  DEBUG("parsing token %s \n", token);
+  sscanf(token, "%zu", &element_size_bytes);
+  if(element_size_bytes == 0) {
+    ERR("mmbLayout element_size_bytes must be greater than 0\n");
+    return MSTRO_INVARG;
+  }
+
+  /*reading n_blocks ...*/
+  token = strtok_r(NULL, sep, &tmp);
+  DEBUG("parsing token %s \n", token);
+  sscanf(token, "%zu", &n_blocks);
+
+  offsets = malloc(n_blocks*sizeof(size_t));
+  lengths = malloc(n_blocks*sizeof(size_t));
+  if(offsets == NULL || lengths == NULL) {
+    ERR("Cannot allocate offsets/lengths arrays\n");
+    free(offsets);
+    free(lengths);
+    return MSTRO_NOMEM;
+  }
+
+  /*reading offsets ...*/
+  token = strtok_r(NULL, sep, &tmp);
+  DEBUG("parsing token %s \n", token);
+  mstro_parse_number_array(token, offsets);
+
+  /*reading lengths ...*/
+  token = strtok_r(NULL, sep, &tmp);
+  DEBUG("parsing token %s \n", token);
+  mstro_parse_number_array(token, lengths);
+
+  DEBUG("element_size_bytes %zu, index %zu, n_blocks %zu, offsets[0] %zu, lengths[0] %zu \n",
+               element_size_bytes, index, n_blocks, offsets[0],lengths[0]);
+  mmb_status = mmb_layout_create_dist_irregular_1d(element_size_bytes,
+                                      index, n_blocks,offsets,lengths,
+                                      dist_layout);
+
+  free(offsets);
+  free(lengths);
+
+  return (mmb_status == MMB_OK) ? MSTRO_OK:MSTRO_FAIL;
+}
+
+mstro_status
+mstro_mmbLayout_parse(const char *str, mmbLayout **dist_layout)
+{
+  DEBUG("parsing %s, with length %zu \n", str, strlen(str));
+  if(str==NULL) {
+    ERR("NULL string, cannot parse as mmbLayout \n");
+    return MSTRO_INVARG;
+  } else if(strcmp(str, "NIL")==0) {
+    INFO("NIL mmbLayout, returning a NULL layout\n");
+    *dist_layout = NULL;
+    return MSTRO_OK;
+  }
+
+  /*read the layout as a string */
+  /*mmbLayout: "mmbLayoutType, n_dims, index, mmbLayoutElement, mmbLayoutPadding, mmbLayoutBlock|mmbLayoutIrregular" */
+  /* FIXME only support Irregular_1D mmb_layout for distributed CDOs */
+  /* mmbLayoutIrregular: "n_blocks, offsets, lengths" */
+  DEBUG("I only support now irregular_1D mmbLayout \n");
+  /* work on a mutable copy of the string for parsing (and to bypass the const char warning) */
+  char *tmp = strdup(str);
+  if(tmp == NULL)
+    return MSTRO_NOMEM;
+  mstro_status s = mstro_parse_mmbLayout_irregular_1D(tmp, dist_layout);
+  free(tmp);
+  return s;
+}
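+/* Usage sketch (assumes mamba's mmb_layout_destroy() for cleanup):
+ *
+ *   mmbLayout *layout = NULL;
+ *   mstro_status s = mstro_mmbLayout_parse(
+ *       "MMB_IRREGULAR, 1, 0, 8, 3, [0 1 2], [1 1 1]", &layout);
+ *   if(s == MSTRO_OK && layout != NULL) {
+ *     // ... use layout ...
+ *     mmb_layout_destroy(layout);
+ *   }
+ */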
+
 mstro_status
 mstro_timestamp_valid(const mstro_timestamp *tsp, bool *valid)
 {
@@ -304,7 +460,7 @@ mstro_timestamp_compare(const mstro_timestamp *tsp1, const mstro_timestamp *tsp2
 mstro_status
 mstro_timestamp_to_tm_utc(const mstro_timestamp *tsp, struct tm *tmp)
 {
-  if(tsp==NULL) 
+  if(tsp==NULL)
     return MSTRO_INVARG;
   if(tmp==NULL)
     return MSTRO_INVOUT;
@@ -312,7 +468,7 @@ mstro_timestamp_to_tm_utc(const mstro_timestamp *tsp, struct tm *tmp)
   struct tm *res = timestamp_to_tm_utc((const timestamp_t*)tsp, tmp);
   if(res==NULL)
     return MSTRO_FAIL;
-  
+
   return MSTRO_OK;
 }
 
@@ -320,7 +476,7 @@ mstro_timestamp_to_tm_utc(const mstro_timestamp *tsp, struct tm *tmp)
 mstro_status
 mstro_timestamp_to_tm_local(const mstro_timestamp *tsp, struct tm *tmp)
 {
-  if(tsp==NULL) 
+  if(tsp==NULL)
     return MSTRO_INVARG;
   if(tmp==NULL)
     return MSTRO_INVOUT;
@@ -328,7 +484,7 @@ mstro_timestamp_to_tm_local(const mstro_timestamp *tsp, struct tm *tmp)
   struct tm *res = timestamp_to_tm_local((const timestamp_t*)tsp, tmp);
   if(res==NULL)
     return MSTRO_FAIL;
-  
+
   return MSTRO_OK;
 }
 
diff --git a/maestro/cdo.c b/maestro/cdo.c
index 9411dc8072b8c046d82728725edb1ae2f9989fb8..d3037585fa43b55702098bb7374b9a6932470700 100644
--- a/maestro/cdo.c
+++ b/maestro/cdo.c
@@ -231,15 +231,167 @@ mstro_cdo_state_check(mstro_cdo cdo, mstro_cdo_state s)
   return (state&s)!=0;
 }
 
+/** check whether state transition is valid */
+static inline
+bool
+mstro_pc_cdo__valid_state_transition(const struct mstro_cdo_id *cdoid,
+                                              mstro_cdo_state old_state,
+                                              mstro_cdo_state new_state)
+{
+  assert(cdoid!=NULL);
+  char *msg=NULL;
+  bool unchecked=false;
+  
+  switch(new_state) {
+    case MSTRO_CDO_STATE_INVALID:
+      msg = "Cannot transition to INVALID state on PC";
+      break;
+      
+    case MSTRO_CDO_STATE_CREATED:
+      if(  old_state != MSTRO_CDO_STATE_INVALID 
+        && !(old_state&MSTRO_CDO_STATE_DISPOSABLE))
+        msg = "Cannot step to CREATED state from any other valid state, unless in case of dispose-and-reuse";
+      break;
+      
+    case MSTRO_CDO_STATE_DECLARED:
+      if(old_state!=MSTRO_CDO_STATE_CREATED)
+        msg = "Cannot DECLARE unless freshly CREATED";
+      break;
+      
+    case MSTRO_CDO_STATE_SEALED:
+      if(old_state!=MSTRO_CDO_STATE_DECLARED)
+        msg = "Cannot SEAL unless freshly DECLARED";
+      break;
+      
+    case MSTRO_CDO_STATE_OFFERED: 
+      if(old_state != MSTRO_CDO_STATE_OFFERED_LOCALLY)
+        msg = "Cannot OFFER unless OFFERED_LOCALLY";
+      break;
+
+    case MSTRO_CDO_STATE_OFFERED_LOCALLY: 
+      if(old_state != MSTRO_CDO_STATE_SEALED)
+        msg = "Cannot OFFER_LOCALLY unless properly SEALED";
+      break;
+
+    case MSTRO_CDO_STATE_WITHDRAWN_GLOBALLY:
+      if(old_state != MSTRO_CDO_STATE_OFFERED)
+        msg = "Cannot WITHDRAW_GLOBALLY unless OFFERED (and no other flags set)";
+      break;
+
+    case MSTRO_CDO_STATE_WITHDRAWN:
+      if(old_state != MSTRO_CDO_STATE_WITHDRAWN_GLOBALLY)
+        msg = "Cannot WITHDRAW unless WITHDRAWN_GLOBALLY";
+      break;
+
+    case MSTRO_CDO_STATE_REQUIRED_LOCALLY:
+      if(old_state != MSTRO_CDO_STATE_DECLARED
+       &&old_state != MSTRO_CDO_STATE_SEALED)
+        msg = "Cannot REQUIRE unless DECLARED/SEALED";
+      break;
+
+    case MSTRO_CDO_STATE_REQUIRED:
+      if(old_state != MSTRO_CDO_STATE_REQUIRED_LOCALLY)
+        msg = "Cannot set REQUIRED unless REQUIRED_LOCALLY";
+      break;
+
+    case MSTRO_CDO_STATE_REQUIRED|MSTRO_CDO_STATE_IN_TRANSPORT:
+      if(old_state != MSTRO_CDO_STATE_REQUIRED)
+        msg = "Cannot flag REQUIRED as IN_TRANSPORT unless REQUIRED";
+      break;
+
+    case MSTRO_CDO_STATE_OFFERED|MSTRO_CDO_STATE_REQUIRED|MSTRO_CDO_STATE_SATISFIED:
+      if(! (old_state == (MSTRO_CDO_STATE_REQUIRED|MSTRO_CDO_STATE_IN_TRANSPORT)))
+	msg = "Cannot set OFFERED|REQUIRED|SATISFIED unless REQUIRED|IN_TRANSPORT";
+      break;
+
+    case MSTRO_CDO_STATE_DEMANDED:
+      if(old_state != (MSTRO_CDO_STATE_REQUIRED)
+       &&old_state != (MSTRO_CDO_STATE_OFFERED|MSTRO_CDO_STATE_REQUIRED|MSTRO_CDO_STATE_SATISFIED))
+	msg = "Cannot set DEMANDED unless REQUIRED or OFFERED|REQUIRED|SATISFIED";
+      break;
+    
+    case MSTRO_CDO_STATE_DEMANDED|MSTRO_CDO_STATE_IN_TRANSPORT:
+      if(! (old_state == (MSTRO_CDO_STATE_REQUIRED)))
+	msg = "Cannot set DEMANDED+IN-TRANSPORT unless REQUIRED (and no other flags set)";
+      break;
+
+    case MSTRO_CDO_STATE_RETRACTED_GLOBALLY:
+      if(! (   (old_state == MSTRO_CDO_STATE_REQUIRED)
+            || (old_state == (MSTRO_CDO_STATE_REQUIRED|MSTRO_CDO_STATE_IN_TRANSPORT))))
+        msg = "Cannot set RETRACTED_GLOBALLY unless REQUIRED or REQUIRED+IN-TRANSPORT (and no other flags set)";
+      break;
+  
+    case MSTRO_CDO_STATE_RETRACTED:
+      if(old_state != MSTRO_CDO_STATE_RETRACTED_GLOBALLY)
+        msg = "Cannot set RETRACTED unless RETRACTED GLOBALLY";
+      break;
+      
+    default:
+      msg="Unchecked state transition";
+      unchecked=true;
+      break;
+  }
+
+  if(msg==NULL) {
+    return true;
+  } else {
+    if(unchecked) {
+      WITH_CDO_ID_STR(
+          idstr, cdoid,
+          WARN("Unchecked state transition for %s, %s to %s: %s\n",
+               idstr,
+               mstro_cdo_state_describe(old_state),
+               mstro_cdo_state_describe(new_state),
+               msg););
+      return false;
+    } else {
+      WITH_CDO_ID_STR(
+          idstr, cdoid,
+          ERR("Illegal state transition for %s, %s (%" PRIu32 ") to %s (%" PRIu32 "): %s\n",
+              idstr, 
+              mstro_cdo_state_describe(old_state), old_state,
+              mstro_cdo_state_describe(new_state), new_state,
+              msg););
+      return false;
+    }
+  }
+}
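+/* For reference, the producer-side happy path accepted by the checker above,
+ * written out as the chain of states a CDO passes through:
+ *
+ *   INVALID -> CREATED -> DECLARED -> SEALED
+ *           -> OFFERED_LOCALLY -> OFFERED
+ *           -> WITHDRAWN_GLOBALLY -> WITHDRAWN
+ *
+ * Any edge not listed in the switch yields an "Illegal state transition".
+ */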
+
 /** set state. Notify all waiters on state change queue. Does not require CDO to be locked. */
 void
 mstro_cdo_state_set(mstro_cdo cdo, mstro_cdo_state s)
 {
-  atomic_store(&cdo->state, s);
+  assert(g_pool_app_id != MSTRO_APP_ID_MANAGER);
+  mstro_cdo_state tmp = atomic_exchange(&cdo->state, s);
+  DEBUG("Changed state from `%s` to `%s`\n",
+         mstro_cdo_state_describe(tmp),
+         mstro_cdo_state_describe(s));
+  assert(mstro_pc_cdo__valid_state_transition(&cdo->gid,tmp, s));
+
+  pthread_mutex_lock(&g_cdo_state_change_lock);
+  pthread_cond_broadcast(&g_cdo_state_change_cvar);
+  pthread_mutex_unlock(&g_cdo_state_change_lock);
+}
+
+void
+mstro_cdo_state_set_safe_flags(mstro_cdo cdo, mstro_cdo_state s)
+{
+  assert(g_pool_app_id != MSTRO_APP_ID_MANAGER);
+
+  mstro_cdo_state old_state = mstro_cdo_state_get(cdo);
+  mstro_cdo_state state_flags = (old_state & MSTRO_CDO_STATE_FLAGS);
+  mstro_cdo_state new_state = s | state_flags;
+  mstro_cdo_state tmp = atomic_exchange(&cdo->state, new_state);
+  assert (tmp == old_state); // nobody should come in-between
 
   pthread_mutex_lock(&g_cdo_state_change_lock);
   pthread_cond_broadcast(&g_cdo_state_change_cvar);
   pthread_mutex_unlock(&g_cdo_state_change_lock);
+
+  DEBUG("Changed state from `%s` to `%s`\n",
+         mstro_cdo_state_describe(old_state),
+         mstro_cdo_state_describe(new_state));
+  assert(mstro_pc_cdo__valid_state_transition(&cdo->gid, old_state, new_state));
 }
 
 mstro_cdo_state
@@ -294,84 +446,9 @@ mstro_cdo_name(mstro_cdo cdo)
 }
 
 mstro_status
-mstro_cdo_declare_async(const char *name,
-                        mstro_cdo_attributes attributes,
-		                    mstro_cdo *result,
-                        mstro_request *request)
-{
-   mstro_status status = MSTRO_UNIMPL;
-   status = mstro_cdo_declare_core(name, attributes, result);
-   /** if successful, fill in the request */
-   if(status == MSTRO_OK)
-   {
-     /** fill the request structure to be able to check on the operation later **/
-     DEBUG("Allocate mstro_request struct and save cdo handle and target state to it\n");
-     *request = malloc(sizeof(struct mstro_request_));
-     if (*request == NULL) {
-       ERR("Can not allocate memory for request object \n");
-     }
-     (*request)->cdo = *result;
-     (*request)->target_state = MSTRO_CDO_STATE_DECLARED;
-     //add to counter
-      mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PC_NUM_ASYNC_DECLARE, 1);
-   }
-   /** always return the output status*/
-   return status;
-}
-
-mstro_status
-mstro_cdo_declare(const char *name,
-                  mstro_cdo_attributes attributes,
-		  mstro_cdo *result) {
-
-  mstro_status status = MSTRO_UNIMPL;
-  status = mstro_cdo_declare_core(name, attributes, result);
-  /** if successful, increase statistics counter */
-  if(status == MSTRO_OK) {
-    mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PC_NUM_DECLARE, 1);
-
-    WITH_CDO_ID_STR(idstr, &(*result)->id,
-                  INFO("Declared CDO `%s', (local ID: %s)\n",
-                       (*result)->name, idstr);
-                  );
-  }
-
-  /** always return the output status*/
-  return status;
-
-}
-
-mstro_status
-mstro_cdo_declare_core(const char *name,
-                  mstro_cdo_attributes attributes,
+mstro_cdo_declare_propagate(const char *name,
 		  mstro_cdo *result)
 {
-  if(name==NULL) {
-    ERR("name cannot be NULL\n");
-    return MSTRO_INVARG;
-  }
-  if(result==NULL) {
-    ERR("declaration handle storage cannot be NULL\n");
-    return MSTRO_INVOUT;
-  }
-  if(attributes!=MSTRO_ATTR_DEFAULT) {
-    ERR("non-default declaration attributes unsupported, FIXME\n");
-    return MSTRO_UNIMPL;
-  }
-
-  *result = mstro_cdo__alloc();
-  if(*result==NULL) {
-    ERR("Cannot allocate for CDO declaration\n");
-    return MSTRO_NOMEM;
-  }
-
-  (*result)->name = strdup(name);
-  if((*result)->name==NULL)  {
-    ERR("Cannot allocate for CDO name\n");
-    mstro_cdo__free(result);
-    return MSTRO_NOMEM;
-  }
-
   mstro_cdo_state_set(*result, MSTRO_CDO_STATE_CREATED);
 
   (*result)->gid = MSTRO_CDO_ID_NULL;
@@ -392,6 +469,7 @@ mstro_cdo_declare_core(const char *name,
   }
   (*result)->id.local_id = serial;
   (*result)->serial = serial;
+  (*result)->n_segments = 0; /* initial value ... no outstanding communications */
 
 
   /*** inform pool manager ***/
@@ -437,7 +515,46 @@ mstro_cdo_declare_core(const char *name,
     }
   }
 
-  /* mamba array and raw pointer */
+  return MSTRO_OK;
+}
+
+mstro_status
+mstro_cdo_declare_core(const char *name,
+                  mstro_cdo_attributes attributes,
+		  mstro_cdo *result)
+{
+  if(name==NULL) {
+    ERR("name cannot be NULL\n");
+    return MSTRO_INVARG;
+  }
+  if(result==NULL) {
+    ERR("declaration handle storage cannot be NULL\n");
+    return MSTRO_INVOUT;
+  }
+  if(attributes!=MSTRO_ATTR_DEFAULT) {
+    ERR("non-default declaration attributes unsupported, FIXME\n");
+    return MSTRO_UNIMPL;
+  }
+
+  *result = mstro_cdo__alloc();
+  if(*result==NULL) {
+    ERR("Cannot allocate for CDO declaration\n");
+    return MSTRO_NOMEM;
+  }
+
+  (*result)->name = strdup(name);
+  if((*result)->name==NULL)  {
+    ERR("Cannot allocate for CDO name\n");
+    mstro_cdo__free(result);
+    return MSTRO_NOMEM;
+  }
+
+  mstro_status s;
+  s = mstro_cdo_declare_propagate(name, result);
+  if (s != MSTRO_OK)
+    return s;
+
+ /* mamba array and raw pointer */
   (*result)->mamba_array = NULL;
   (*result)->raw_ptr = NULL;
 
@@ -445,7 +562,7 @@ mstro_cdo_declare_core(const char *name,
   s = mstro_cdo_attribute_set_default(*result);
   if (s != MSTRO_OK)
     return s;
-  
+
   mstro_stats_add_counter(MSTRO_STATS_CAT_OBJECTS,
                           MSTRO_STATS_L_NUM_CDOS_CREATED,
                           1);
@@ -453,6 +570,54 @@ mstro_cdo_declare_core(const char *name,
   return MSTRO_OK;
 }
 
+mstro_status
+mstro_cdo_declare_async(const char *name,
+                        mstro_cdo_attributes attributes,
+		                    mstro_cdo *result,
+                        mstro_request *request)
+{
+   mstro_status status = MSTRO_UNIMPL;
+   status = mstro_cdo_declare_core(name, attributes, result);
+   /** if successful, fill in the request */
+   if(status == MSTRO_OK)
+   {
+     /** fill the request structure to be able to check on the operation later **/
+     DEBUG("Allocate mstro_request struct and save cdo handle and target state to it\n");
+     *request = malloc(sizeof(struct mstro_request_));
+     if (*request == NULL) {
+       ERR("Cannot allocate memory for request object\n");
+       return MSTRO_NOMEM;
+     }
+     (*request)->cdo = *result;
+     (*request)->target_state = MSTRO_CDO_STATE_DECLARED;
+     //add to counter
+      mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PC_NUM_ASYNC_DECLARE, 1);
+   }
+   /** always return the output status*/
+   return status;
+}
+
+mstro_status
+mstro_cdo_declare(const char *name,
+                  mstro_cdo_attributes attributes,
+		  mstro_cdo *result) {
+
+  mstro_status status = MSTRO_UNIMPL;
+  status = mstro_cdo_declare_core(name, attributes, result);
+  /** if successful, increase statistics counter */
+  if(status == MSTRO_OK) {
+    mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PC_NUM_DECLARE, 1);
+
+    WITH_CDO_ID_STR(idstr, &(*result)->id,
+                  INFO("Declared CDO `%s', (local ID: %s)\n",
+                       (*result)->name, idstr);
+                  );
+  }
+
+  /** always return the output status*/
+  return status;
+
+}
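+/* Hedged usage sketch of the two declare flavours; the completion-wait helper
+ * `mstro_request_wait` is a hypothetical name, not confirmed API:
+ *
+ *   mstro_cdo cdo;
+ *   mstro_cdo_declare("my-cdo", MSTRO_ATTR_DEFAULT, &cdo);   // synchronous
+ *
+ *   mstro_request req;
+ *   mstro_cdo_declare_async("my-cdo", MSTRO_ATTR_DEFAULT, &cdo, &req);
+ *   // ... overlap other work, then wait on `req` (e.g. mstro_request_wait)
+ */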
+
 /* Build the cdo->attributes_msg protobuf structure for all attributes of the CDO
  * If already non-NULL, warn.
  * We do not support updating it -- use mstro_cdo_attributes_merge() for that.
@@ -476,8 +641,8 @@ mstro_status
 mstro_cdo__sync_rawptr(mstro_cdo cdo)
 {
   if (cdo->raw_ptr != NULL) {
-    ERR("raw-ptr is already occupied\n");
-    return MSTRO_INVARG;
+    DEBUG("raw-ptr is already occupied\n");
+    return MSTRO_OK;
   }
 
   mstro_status status = MSTRO_OK;
@@ -683,23 +848,22 @@ mstro_cdo_seal_async(mstro_cdo cdo, mstro_request *request)
 
   DEBUG("Raw ptr %p, mamba_array %p\n", raw_ptr, mamba_array);
   if(raw_ptr!=NULL && mamba_array!=NULL) {
-    ERR("CDO `%s` has both raw-ptr and existing mamba-array set at SEAL time, unsupported\n",
+    INFO("CDO `%s` has both raw-ptr and existing mamba-array, assuming DISPOSE_AND_REUSE case\n",
         cdo->name);
-    status=MSTRO_FAIL;
-    goto BAILOUT;
   }
 
   /* we will cache these two attributes, as the user can not change
-   * them anymore and we'd like all mamba core code to be able to
-   * avoid dictionary lookups for them */
-  assert(cdo->raw_ptr==NULL && cdo->mamba_array==NULL);
+   * them anymore (until potential dispose_and_reuse) and we'd like all mamba
+   * core code to be able to avoid dictionary lookups for them */
+ // assert(cdo->raw_ptr==NULL && cdo->mamba_array==NULL);
 
-  /* one of the two will be NULL; both means type-0 CDO, which is also ok */
   cdo->raw_ptr = raw_ptr;
   cdo->mamba_array = mamba_array;
 
   if (cdo->mamba_array == NULL && cdo->raw_ptr == NULL) {
     INFO("Sealing CDO `%s`, but no raw-ptr (type 0)\n", cdo->name);
+  } else if (cdo->raw_ptr!=NULL && cdo->mamba_array!=NULL) {
+     /* do nothing, reusing resources */
   } else {
     status = mstro_cdo_allocate_data(cdo); /* sync raw-ptr+mamba_array+attributes */
     if (status != MSTRO_OK) {
@@ -757,7 +921,6 @@ mstro_cdo_seal_async(mstro_cdo cdo, mstro_request *request)
     cdoid.qw1 = cdo->gid.qw[1];
     cdoid.local_id = cdo->gid.local_id;
 
-
     status = mstro_cdo_ensure_attribute_msg(cdo);
     if(status!=MSTRO_OK) {
       ERR("Failed to serialize attributes\n");
@@ -779,10 +942,8 @@ mstro_cdo_seal_async(mstro_cdo cdo, mstro_request *request)
           seal.base.descriptor->name);
       return status;
     }
-
     /* FIXME: do we need an ACK here? */
     status = mstro_pmp_send_nowait(MSTRO_APP_ID_MANAGER, &msg);
-
     switch(status) {
       case MSTRO_OK:
         ;
@@ -1068,25 +1229,17 @@ mstro_cdo_declaration_seal(mstro_cdo cdo)
     }
   }
 
-  DEBUG("Raw ptr %p, mamba_array %p\n", raw_ptr, mamba_array);
-  if(raw_ptr!=NULL && mamba_array!=NULL) {
-    ERR("CDO `%s` has both raw-ptr and existing mamba-array set at SEAL time, unsupported\n",
-        cdo->name);
-    status=MSTRO_FAIL;
-    goto BAILOUT;
-  }
-
   /* we will cache these two attributes, as the user can not change
    * them anymore and we'd like all mamba core code to be able to
    * avoid dictionary lookups for them */
-  assert(cdo->raw_ptr==NULL && cdo->mamba_array==NULL);
-
-  /* one of the two will be NULL; both means type-0 CDO, which is also ok */
+//  assert(cdo->raw_ptr==NULL && cdo->mamba_array==NULL);
   cdo->raw_ptr = raw_ptr;
   cdo->mamba_array = mamba_array;
 
   if (cdo->mamba_array == NULL && cdo->raw_ptr == NULL) {
     DEBUG("Sealing CDO `%s`, but no raw-ptr (type 0)\n", cdo->name);
+  } else if(raw_ptr!=NULL && mamba_array!=NULL) { // dispose-and-reuse use-case
+    DEBUG("Sealing CDO `%s`, dispose-and-reuse fashion\n", cdo->name);
   } else {
     status = mstro_cdo_allocate_data(cdo); /* sync raw-ptr+mamba_array+attributes */
     if (status != MSTRO_OK) {
@@ -1171,10 +1324,8 @@ mstro_cdo_declaration_seal(mstro_cdo cdo)
           seal.base.descriptor->name);
       return status;
     }
-
     /* FIXME: do we need an ACK here? */
     status = mstro_pmp_send_nowait(MSTRO_APP_ID_MANAGER, &msg);
-
     switch(status) {
       case MSTRO_OK:
         ;
@@ -1326,6 +1477,10 @@ mstro_cdo_offer(mstro_cdo cdo)
     return MSTRO_INVARG;
   }
 
+  WITH_CDO_ID_STR(lidstr, &cdo->id, {
+      WITH_CDO_ID_STR(gidstr, &cdo->gid, {
+          DEBUG("Offer of %s (lid: %s, gid: %s)\n",
+                cdo->name, lidstr, gidstr);});});
 
   if(!mstro_cdo_state_check(cdo, MSTRO_CDO_STATE_SEALED)) {
     if(mstro_cdo_state_check(cdo,
@@ -1353,6 +1508,28 @@ mstro_cdo_offer(mstro_cdo cdo)
                 cdo->name, lidstr, gidstr);});});
 
   WITH_CDO_ID_STR(id, &cdo->gid, {
+
+      const int64_t* val;
+      int64_t cdo_size;
+      status = mstro_cdo_attribute_get(cdo, MSTRO_ATTR_CORE_CDO_SCOPE_LOCAL_SIZE,
+                                           NULL, (const void**)&val);
+      if(status!=MSTRO_OK) {
+        ERR("CDO has no local-size\n");
+        return MSTRO_FAIL;
+      }
+      cdo_size = *val;
+      if(cdo_size == -1){
+        cdo_size = 0;
+         /* -1 is the default cdo size; set it to zero to reflect the correct size of the data */
+         status = mstro_attribute_dict_set(cdo->attributes, MSTRO_ATTR_CORE_CDO_SCOPE_LOCAL_SIZE,
+                                 MSTRO_CDO_ATTR_VALUE_INVALID, /* we dont help in checking */
+                                &cdo_size, true);
+         if(status != MSTRO_OK) {
+           ERR("Cannot set the cdo size to zero\n");
+           return MSTRO_FAIL;
+         }
+       }
       status = mstro_pool__add(cdo, MSTRO_CDO_STATE_OFFERED_LOCALLY);
 
       if(status!=MSTRO_OK) {
@@ -1401,19 +1578,10 @@ mstro_cdo_offer(mstro_cdo cdo)
         mstro_cdo_block_until(cdo, MSTRO_CDO_STATE_OFFERED, "OFFERED");
 
         {
-          const int64_t* val;
-          enum mstro_cdo_attr_value_type type;
-          status = mstro_cdo_attribute_get(cdo, MSTRO_ATTR_CORE_CDO_SCOPE_LOCAL_SIZE,
-                                           NULL, (const void**)&val);
-          if(status!=MSTRO_OK) {
-            ERR("CDO has no local-size\n");
-            return MSTRO_FAIL;
-          }
-
+          
           status = mstro_stats_add_counter(MSTRO_STATS_CAT_POOL,
                                            MSTRO_STATS_L_BYTES_POOLED,
-                                           /* type 0 CDOs have no size */
-                                           *val==-1? 0 : *val);
+                                           cdo_size);
 
           mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PC_NUM_OFFER, 1);
         }
@@ -1547,7 +1715,7 @@ mstro_cdo_require_async(mstro_cdo cdo, mstro_request *request)
   WITH_CDO_ID_STR(
       id, &cdo->gid,
       {
-        status = mstro_pool__add(cdo, MSTRO_CDO_STATE_REQUIRED);
+        status = mstro_pool__add(cdo, MSTRO_CDO_STATE_REQUIRED_LOCALLY);
 
         if(status!=MSTRO_OK) {
           ERR("Failed to add CDO %s (id %s) to local pool\n", cdo->name, id);
@@ -1583,6 +1751,14 @@ mstro_cdo_require_async(mstro_cdo cdo, mstro_request *request)
           if(status!=MSTRO_OK) {
             if(status==MSTRO_NO_PM) {
               DEBUG("require local only, no PM\n");
+              mstro_cdo_state_set(cdo, MSTRO_CDO_STATE_REQUIRED);
+              status = mstro_pool__notify(cdo);
+  
+              WITH_CDO_ID_STR(idstr, &cdo->gid, {
+                  DEBUG("CDO %s now in state %s\n",
+                        idstr, mstro_cdo_state_describe(mstro_cdo_state_get(cdo)));
+                });
+
             } else {
               ERR("Failed to send REQUIRE message to pool manager: %d (%s)\n",
                   status, mstro_status_description(status));
@@ -1635,7 +1811,7 @@ mstro_cdo_require(mstro_cdo cdo)
   /* We have a sealed CDO. Good. */
 
   WITH_CDO_ID_STR(id, &cdo->gid, {
-      status = mstro_pool__add(cdo, MSTRO_CDO_STATE_REQUIRED);
+      status = mstro_pool__add(cdo, MSTRO_CDO_STATE_REQUIRED_LOCALLY);
       if(status!=MSTRO_OK) {
         ERR("Failed to add CDO %s (id %s) to local pool\n", cdo->name, id);
         return status;
@@ -1668,6 +1844,14 @@ mstro_cdo_require(mstro_cdo cdo)
           if(status!=MSTRO_OK) {
             if(status==MSTRO_NO_PM) {
               DEBUG("require local only, no PM\n");
+              mstro_cdo_state_set(cdo, MSTRO_CDO_STATE_REQUIRED);
+              status = mstro_pool__notify(cdo);
+  
+              WITH_CDO_ID_STR(idstr, &cdo->gid, {
+                  DEBUG("CDO %s now in state %s\n",
+                        idstr, mstro_cdo_state_describe(mstro_cdo_state_get(cdo)));
+                });
+
             } else {
               ERR("Failed to send REQUIRE message to pool manager: %d (%s)\n",
                   status, mstro_status_description(status));
@@ -1695,12 +1879,18 @@ mstro_cdo_demand(mstro_cdo cdo)
           DEBUG("Demanding %s (lid: %s, gid: %s)\n",
                 cdo->name, lidstr, gidstr);});});
 
-  if(! mstro_cdo_state_check(cdo, MSTRO_CDO_STATE_REQUIRED)) {
-    WITH_CDO_ID_STR(
-        idstr, &cdo->gid,
-        ERR("CDO `%s' (id %s) not REQUIRED in pool, cannot DEMAND it\n",
-            cdo->name, idstr););
-    return MSTRO_INVARG;
+  mstro_cdo_state tmp = mstro_cdo_state_get(cdo);
+  if(tmp != MSTRO_CDO_STATE_REQUIRED) {
+    if (tmp == MSTRO_CDO_STATE_REQUIRED_LOCALLY) {
+      /* PM-route: wait for ACK to set required state */
+      mstro_cdo_block_until(cdo, MSTRO_CDO_STATE_REQUIRED, "REQUIRED");
+    } else {
+      WITH_CDO_ID_STR(
+          idstr, &cdo->gid,
+          ERR("CDO `%s' (id %s) not REQUIRED in pool, cannot DEMAND it\n",
+              cdo->name, idstr););
+      return MSTRO_INVARG;
+    } 
   }
 
   mstro_status status = mstro_pool__demand(cdo, MSTRO_CDO_STATE_DEMANDED);
@@ -1737,12 +1927,18 @@ mstro_cdo_demand_async(mstro_cdo cdo, mstro_request *request)
       INFO("Demanding CDO `%s' (id %s) from pool\n",
             cdo->name, idstr);});
 
-  if(! mstro_cdo_state_check(cdo, MSTRO_CDO_STATE_REQUIRED)) {
-    WITH_CDO_ID_STR(
-        idstr, &cdo->gid,
-        ERR("CDO `%s' (id %s) not REQUIRED in pooled, cannot DEMAND it\n",
-            cdo->name, idstr););
-    return MSTRO_INVARG;
+  mstro_cdo_state tmp = mstro_cdo_state_get(cdo);
+  if(tmp != MSTRO_CDO_STATE_REQUIRED) {
+    if (tmp == MSTRO_CDO_STATE_REQUIRED_LOCALLY) {
+      /* PM-route: wait for ACK to set required state */
+      mstro_cdo_block_until(cdo, MSTRO_CDO_STATE_REQUIRED, "REQUIRED");
+    } else {
+      WITH_CDO_ID_STR(
+          idstr, &cdo->gid,
+          ERR("CDO `%s' (id %s) not REQUIRED in pool, cannot DEMAND it\n",
+              cdo->name, idstr););
+      return MSTRO_INVARG;
+    } 
   }
 
   status =  mstro_pool__demand_async(cdo, MSTRO_CDO_STATE_DEMANDED);
@@ -1784,14 +1980,16 @@ mstro_cdo_retract_async(mstro_cdo cdo, mstro_request *request)
       DEBUG("Retracting CDO `%s' (id %s) from pool\n",
             cdo->name, idstr);});
 
-  if(! mstro_cdo_state_check(cdo, MSTRO_CDO_STATE_REQUIRED)) {
+  mstro_cdo_state tmp = mstro_cdo_state_get(cdo);
+  if(tmp != MSTRO_CDO_STATE_REQUIRED
+   &&tmp != MSTRO_CDO_STATE_REQUIRED_LOCALLY) {
     WITH_CDO_ID_STR(
         idstr, &cdo->gid,
         ERR("CDO `%s' (id %s) not REQUIRED in pool, cannot RETRACT it\n",
             cdo->name, idstr););
     return MSTRO_INVARG;
-  }
-
+  } 
+  
   mstro_status status = mstro_pool__remove_async(cdo, MSTRO_CDO_STATE_RETRACTED);
 
   if (status == MSTRO_OK ){
@@ -1823,10 +2021,12 @@ mstro_cdo_retract(mstro_cdo cdo)
       DEBUG("Retracting CDO `%s' (id %s) from pool\n",
             cdo->name, idstr);});
 
-  if(! mstro_cdo_state_check(cdo, MSTRO_CDO_STATE_REQUIRED)) {
+  mstro_cdo_state tmp = mstro_cdo_state_get(cdo);
+  if(tmp != MSTRO_CDO_STATE_REQUIRED
+   &&tmp != MSTRO_CDO_STATE_REQUIRED_LOCALLY) {
     WITH_CDO_ID_STR(
         idstr, &cdo->gid,
-        ERR("CDO `%s' (id %s) not REQUIRED in pooled, cannot RETRACT it\n",
+        ERR("CDO `%s' (id %s) not REQUIRED in pool, cannot RETRACT it\n",
             cdo->name, idstr););
     return MSTRO_INVARG;
   }
@@ -1850,7 +2050,7 @@ mstro_cdo_retract(mstro_cdo cdo)
 }
 
 mstro_status
-mstro_cdo_dispose(mstro_cdo cdo)
+mstro_cdo_dispose_propagate(mstro_cdo cdo)
 {
   if(cdo==NULL) {
     ERR("NULL CDO invalid\n");
@@ -1934,6 +2134,20 @@ mstro_cdo_dispose(mstro_cdo cdo)
   pthread_mutex_unlock(&g_live_cdo_table_mtx);
   free(res);
 
+  return MSTRO_UNIMPL;
+}
+  
+mstro_status
+mstro_cdo_dispose(mstro_cdo cdo)
+{
+// Bookkeeping
+  mstro_status s;
+  s = mstro_cdo_dispose_propagate(cdo);
+  if (s != MSTRO_OK && s != MSTRO_UNIMPL)
+    // already printed an error
+    return s;
+
+// Free resources
   /* Careful! We need can only get name, id, etc. *before* the disposal
    * happens.
    * FIXME: This should actually be #ifdef-guarded depending on log
@@ -1948,14 +2162,38 @@ mstro_cdo_dispose(mstro_cdo cdo)
         if(status!=MSTRO_OK) {
           ERR("Failed to deallocate CDO (id %s)\n", idstr);
           return status;
-        } else {
-          return MSTRO_OK;
-        }
+        } 
       });
 
   mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PC_NUM_DISPOSE, 1);
 
-  return MSTRO_UNIMPL;
+  return MSTRO_OK;
+}
+
+// TODO make it more efficient by using one new pool message (UNSEAL?) to avoid
+// doing dispose and declare messages. Then *_propagate "bypass" functions can go.
+mstro_status
+mstro_cdo_dispose_and_reuse(mstro_cdo cdo)
+{
+  if(cdo==NULL) {
+    ERR("NULL CDO invalid\n");
+    return MSTRO_INVARG;
+  }
+  mstro_status s;
+
+  s = mstro_cdo_dispose_propagate(cdo); // will not free *cdo*
+  if (s != MSTRO_OK && s != MSTRO_UNIMPL)
+    // already printed an error
+    return s;
+
+  s = mstro_cdo_declare_propagate(cdo->name, &cdo); // will not alloc *cdo*, will not alter user attributes
+  if (s != MSTRO_OK)
+    // already printed an error
+    return s;
+
+  mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PC_NUM_DISPOSE, 1);
+
+  return MSTRO_OK;
 }
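+/* Sketch of the dispose-and-reuse cycle this enables (function names for
+ * offer/withdraw/seal follow the public pool API; error handling omitted):
+ *
+ *   mstro_cdo_offer(cdo);
+ *   mstro_cdo_withdraw(cdo);
+ *   mstro_cdo_dispose_and_reuse(cdo);   // handle survives, buffers kept
+ *   mstro_cdo_seal(cdo);                // reuse in the next iteration
+ */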
 
 static inline
@@ -2032,7 +2270,7 @@ mstro_cdo__adjust_space(mstro_cdo cdo, int64_t size, int policy)
 {
   DEBUG("Raw ptr %p, mamba_array %p\n", cdo->raw_ptr, cdo->mamba_array);
 
-  if (size < 0) 
+  if (size < 0)
     ERR("Invalid size (%" PRIx64 ")\n", size);
   if (size == 0) {
     NOISE("Receiving 0-size CDO\n");
@@ -2214,14 +2452,8 @@ mstro_cdo__satisfy(mstro_cdo src, mstro_cdo dst)
     WITH_CDO_ID_STR(idstr, &dst->gid,
                     ERR("Failed to update CDO |%s| (%p) attributes from src data: %d (%s)\n",
                         idstr, dst, s, mstro_status_description(s)););
-    return s;
   }
 
-BAILOUT:
-  if(s==MSTRO_OK) {
-    mstro_cdo_state_set(dst, (MSTRO_CDO_STATE_DEMANDED
-                              | MSTRO_CDO_STATE_SATISFIED));
-  }
   return s;
 }
 
@@ -2384,7 +2616,23 @@ mstro_cdo__mark_transfer_complete(mstro_cdo cdo)
   mstro_status status=MSTRO_UNIMPL;
 
   mstro_cdo_state s = cdo->state;
-  assert(cdo->state & MSTRO_CDO_STATE_IN_TRANSPORT);
+  if (!(cdo->state & MSTRO_CDO_STATE_IN_TRANSPORT))
+    WARN("Unexpected CDO state `%s`, it should be marked IN_TRANSPORT\n", mstro_cdo_state_describe(cdo->state));
+
+  int64_t n_pieces;
+
+  /* reduce the number of outstanding segments by 1 as we received one piece */
+  n_pieces = atomic_fetch_sub(&cdo->n_segments, 1);
+
+  assert(n_pieces > 0); /* there was some outstanding communication */
+  n_pieces--; /* decrement n_pieces to reflect cdo->n_segments after the fetch-and-sub */
+
+  if (n_pieces != 0)
+  {
+    DEBUG("There are %" PRId64 " outstanding pieces for CDO %" PRIu64 " \n", n_pieces, cdo->id.local_id);
+    return MSTRO_OK;
+  }
+  
 
   /* clear in-transport flag */
   s = s & ~(MSTRO_CDO_STATE_IN_TRANSPORT);
@@ -2428,13 +2676,13 @@ struct {
   {MSTRO_CDO_STATE_SEALED,             "|SEALED"},
   {MSTRO_CDO_STATE_OFFERED_LOCALLY,    "|OFFERED_LOCALLY"},
   {MSTRO_CDO_STATE_OFFERED,            "|OFFERED"},
+  {MSTRO_CDO_STATE_REQUIRED_LOCALLY,   "|REQUIRED_LOCALLY"},
   {MSTRO_CDO_STATE_REQUIRED,           "|REQUIRED"},
   {MSTRO_CDO_STATE_WITHDRAWN_GLOBALLY, "|WITHDRAWN_GLOBALLY"},
   {MSTRO_CDO_STATE_WITHDRAWN,          "|WITHDRAWN"},
   {MSTRO_CDO_STATE_DEMANDED,           "|DEMANDED"},
   {MSTRO_CDO_STATE_RETRACTED,          "|RETRACTED"},
   {MSTRO_CDO_STATE_RETRACTED_GLOBALLY, "|RETRACTED_GLOBALLY"},
-  {    (1U<<11),                       "|inv11"},
   {MSTRO_CDO_STATE_DEAD,               "|DEAD"},
   {    (1U<<13),                       "|inv13"},
   {    (1U<<14),                       "|inv14"},
@@ -2582,22 +2830,16 @@ mstro_cdo_attributes_update_incoming(mstro_cdo cdo,
         }
         if(i==num_precious_attr) {
           /* not a precious attribute */
-          if(strcmp(key, MSTRO_ATTR_CORE_CDO_RAW_PTR)==0) {
-            DEBUG("Skipping incoming raw pointer attribute\n");
-          } else if(strcmp(key, MSTRO_ATTR_CORE_CDO_MAMBA_ARRAY)==0) {
-            DEBUG("Skipping incoming mamba array attribute\n");
-          } else {
-            WITH_CDO_ID_STR(idstr, &cdo->gid, {
-                DEBUG("Updating attribute %s on CDO %s (%p)\n",
+          WITH_CDO_ID_STR(idstr, &cdo->gid, {
+              DEBUG("Updating attribute %s on CDO %s (%p)\n",
                       key, idstr, cdo);});
-            s = mstro_attribute_dict_set_kventry(cdo->attributes, entry);
-            if(s!=MSTRO_OK) {
-              WITH_CDO_ID_STR(idstr, &cdo->gid,{
-                  ERR("Failed to update attribute %s on CDO %s (%p): %d (%s)\n",
+          s = mstro_attribute_dict_set_kventry(cdo->attributes, entry);
+          if(s!=MSTRO_OK) {
+            WITH_CDO_ID_STR(idstr, &cdo->gid,{
+                ERR("Failed to update attribute %s on CDO %s (%p): %d (%s)\n",
                       key, idstr, cdo, s, mstro_status_description(s));
-                });
-              retstat|=s;
-            }
+            });
+            retstat|=s;
           }
         }
         break;
diff --git a/maestro/core.c b/maestro/core.c
index 1b4ea0542a648bf40854ddd86838b1ef15b69f44..2346858a3bebbf851fa6c5ba9274225209be291a 100644
--- a/maestro/core.c
+++ b/maestro/core.c
@@ -724,3 +724,57 @@ mstro_core_state_get(const struct mstro_core_initdata **res)
   return status;
 }
 
+
+
+
+/* timing stuff */
+#if defined(__linux)
+#  define HAVE_POSIX_TIMER
+#  include <time.h>
+#  ifdef CLOCK_MONOTONIC
+#     define CLOCKID CLOCK_MONOTONIC
+#  else
+#     define CLOCKID CLOCK_REALTIME
+#  endif
+  static struct timespec linux_rate;
+#elif defined(__APPLE__)
+#  include <time.h>
+#  define HAVE_MACH_TIMER
+#  ifdef CLOCK_MONOTONIC
+#     define CLOCKID CLOCK_MONOTONIC
+#  else
+#     define CLOCKID CLOCK_REALTIME
+#  endif
+#  include <mach/mach_time.h>
+   static mach_timebase_info_data_t info;
+#elif defined(_WIN32)
+#  define WIN32_LEAN_AND_MEAN
+#  include <windows.h>
+   static LARGE_INTEGER win_frequency;
+#else
+#error Unsupported system class
+#endif
+
+mstro_nanosec_t
+mstro_clock(void)
+{
+#if defined(__APPLE__)
+  mstro_nanosec_t now = clock_gettime_nsec_np(CLOCKID);
+    /* now = mach_absolute_time(); */
+    /* now *= info.numer; */
+    /* now /= info.denom; */
+    return now;
+#elif defined(__linux)
+    mstro_nanosec_t now;
+    struct timespec spec;
+    clock_gettime(CLOCKID, &spec);
+    now = (mstro_nanosec_t)spec.tv_sec * 1000000000ULL + spec.tv_nsec;
+    return now;
+#elif defined(_WIN32)
+    LARGE_INTEGER now;
+    QueryPerformanceCounter(&now);
+    return (uint64_t) ((1e9 * now.QuadPart)  / win_frequency.QuadPart);
+#else
+    #error Unsupported system class
+#endif
+}
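+/* Usage sketch: timing an interval with the monotonic clock above.
+ * PRInanosec is assumed to have moved with mstro_nanosec_t into
+ * maestro/core.h.
+ *
+ *   mstro_nanosec_t t0 = mstro_clock();
+ *   // ... work to be timed ...
+ *   mstro_nanosec_t t1 = mstro_clock();
+ *   printf("elapsed: %" PRInanosec " ns\n", t1 - t0);
+ */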
diff --git a/maestro/drc.c b/maestro/drc.c
index 8f95dca52b57b0f3cf8d67c51acdf85bdb13aabb..5556920bc31f36337bc9d17d7fc5e2af9be46065 100644
--- a/maestro/drc.c
+++ b/maestro/drc.c
@@ -55,8 +55,13 @@
 
 #ifdef HAVE_DRC
 #include <rdmacred.h>
-/* #include <rdma/fi_ext_gni.h> */
-#include <fi_ext_gni.h> /* use uninstalled one */
+
+#ifndef LOCAL_LIBFABRIC
+# include <rdma/fi_ext_gni.h>
+#else
+# include <fi_ext_gni.h> /* use uninstalled one */
+#endif
+
 #else
 
 /* define some placeholders; we want to compile code with dummy
@@ -69,12 +74,19 @@ struct fi_gni_auth_key {
 };
 
 #define DRC_SUCCESS 0
-#define DRC_FLAGS_TARGET_UID 0
-#define DRC_FLAGS_TARGET_UID 0
 
 #define GNIX_AKT_RAW 4711
 typedef void * drc_info_handle_t;
 
+enum {
+        DRC_FLAGS_FLEX_CREDENTIAL = 1 << 0,  /* acquire flag, flexible credential mode */
+        DRC_FLAGS_PERSISTENT = 1 << 1, /* acquire flag, persistent credential */
+        DRC_FLAGS_TARGET_WLM = 1 << 2, /* grant/revoke flag, value is WLM ID */
+        DRC_FLAGS_TARGET_UID = 1 << 3, /* grant/revoke flag, value is UID */
+        DRC_FLAGS_TARGET_GID = 1 << 4, /* grant/revoke flag, value is GID */
+        DRC_MAX_FLAGS
+};
+
 static inline
 int drc_acquire(uint32_t *credential, int flags)
 {
@@ -149,7 +161,18 @@ mstro_drc_init(mstro_drc_info *result_p)
   if(*result_p) {
     int ret;
     drc_info_handle_t info;
-    ret = drc_acquire(&(*result_p)->drc_id, 0);
+    char *do_nonflex = getenv(MSTRO_ENV_DRC_NON_FLEX);
+    if(do_nonflex!=NULL && atoi(do_nonflex)!=0
+       && do_nonflex[0]!='f' && do_nonflex[0]!='F' // fAlSe
+       && do_nonflex[0]!='d' && do_nonflex[0]!='D' // DiSabled
+       ) {
+      // if user requests it: use non-flex credentials
+      ret = drc_acquire(&(*result_p)->drc_id, 0);
+    } else {
+      // default: flex credentials, to allow multiple jobs on the same node ("DRC node insecure mode")
+      ret = drc_acquire(&(*result_p)->drc_id,  DRC_FLAGS_FLEX_CREDENTIAL);
+    }
+      
     if(ret!=DRC_SUCCESS) {
       ERR("Failed to drc_acquire a new credential: %d\n", ret);
       stat=MSTRO_FAIL; goto BAILOUT_FREE;
@@ -261,6 +284,17 @@ mstro_drc_get_oob_string(char **result_p,
     return MSTRO_OK;
 }
 
+mstro_status
+mstro_drc_get_credential(uint32_t *result_p,
+                         const mstro_drc_info info)
+{
+  if(result_p==NULL)
+    return MSTRO_INVOUT;
+
+  *result_p = info->drc_id;
+  return MSTRO_OK;
+}
+
 /** Create drc object from OOB string info */
 mstro_status
 mstro_drc_init_from_oob_string(mstro_drc_info *result_p,
@@ -309,6 +343,44 @@ BAILOUT:
   return stat;  
 }
 
+mstro_status
+mstro_drc_init_from_credential(mstro_drc_info *result_p,
+                               uint32_t credential)
+{
+  mstro_status stat = MSTRO_UNIMPL;
+  uint32_t id = credential;
+  int ret;
+
+  if(result_p==NULL)
+    return MSTRO_INVOUT;
+  
+  *result_p = malloc(sizeof(struct mstro_drc_info_));
+  if(*result_p) {
+    drc_info_handle_t info;
+    (*result_p)->drc_id = id;
+    
+    ret = drc_access((*result_p)->drc_id, 0, &info);
+    if(ret!=DRC_SUCCESS) {
+      ERR("Failed to drc_access credential %" PRIu32 ": %d\n", id, ret);
+      goto BAILOUT_FREE;
+    }
+
+    mstro_drc_init_internal(*result_p, info);
+
+    DEBUG("Accessing DRC credential %" PRIu32 " (OOB: %s, cookie %" PRIx64 ")\n",
+          (*result_p)->drc_id,
+          (*result_p)->serialized_id,
+          (*result_p)->auth_key.raw.protection_key);
+    stat = MSTRO_OK;
+  } else {
+    stat = MSTRO_NOMEM;
+  }
+  goto BAILOUT;
+BAILOUT_FREE:
+  NFREE(*result_p);
+BAILOUT:
+  return stat;  
+}
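+/* Hedged sketch of the intended credential hand-off between two processes;
+ * transporting the 32-bit value (e.g. via the pool manager protocol or the
+ * command line) is up to the caller:
+ *
+ *   // process A (owner of `info`)
+ *   uint32_t cred;
+ *   mstro_drc_get_credential(&cred, info);
+ *   // ... send `cred` out of band ...
+ *
+ *   // process B
+ *   mstro_drc_info peer_info;
+ *   mstro_drc_init_from_credential(&peer_info, cred);
+ */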
 
 
   
diff --git a/maestro/heartbeat.c b/maestro/heartbeat.c
index 0d00c5008f67d13c57e4fa7d66377a3a5503cb94..f19df6af47ecf00894d61d0165f5ed53b3469e04 100644
--- a/maestro/heartbeat.c
+++ b/maestro/heartbeat.c
@@ -62,7 +62,6 @@ static pthread_t g_heartbeat_thread;
 /** desired heartbeat interval */
 static struct timespec g_sleep_ts;
 
-#define NSEC_PER_SEC (1000000000)
 static
 void *
 mstro_heartbeat_threadfun(void *closure)
diff --git a/maestro/misc.c b/maestro/misc.c
index 1aebfbd0b0d7cf083cf3d886049cbf084bc1a577..67b743b100a44558b1546bde66490557401970ef 100644
--- a/maestro/misc.c
+++ b/maestro/misc.c
@@ -1,13 +1,15 @@
 #include "maestro/i_misc.h"
 #include "maestro/logging.h"
 
+#ifndef _POSIX_C_SOURCE
 #define _POSIX_C_SOURCE 1
+#endif
 #include <limits.h>
 
 #include <assert.h>
 #include <sys/stat.h>
 #include <errno.h>
-
+#include <string.h>
 
 /* simplify logging */
 #define NOISE(...) LOG_DEBUG(MSTRO_LOG_MODULE_CORE,__VA_ARGS__)
diff --git a/maestro/ofi.c b/maestro/ofi.c
index 8171beb935170b1fd3836c2a6167ee9c4121e723..1c71ae80c7c54f01631647cffcac1f2c8f55f35c 100644
--- a/maestro/ofi.c
+++ b/maestro/ofi.c
@@ -5,6 +5,7 @@
 
 /*
  * Copyright (C) 2019 Cray Computer GmbH
+ * Copyright (C) 2020-2021 Hewlett-Packard (Schweiz) GmbH
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -35,7 +36,6 @@
 #include "maestro/logging.h"
 #include "maestro/env.h"
 #include "maestro/i_utlist.h"
-#include "maestro/i_tpl.h"
 #include "maestro/i_base64.h"
 #include "maestro/i_pool_manager_protocol.h"
 #include "maestro/i_ofi.h"
@@ -43,6 +43,8 @@
 #include "maestro/i_pool.h"
 #include "maestro/i_memory.h"
 
+#include "protocols/maestro-endpoints.h"
+
 #include "i_pool_client.h"
 #include "i_pool_manager.h"
 #include "maestro/i_memlock.h"
@@ -105,7 +107,7 @@ struct mstro_endpoint_set {
 #define MSTRO_OFI_MODE    (FI_CONTEXT|FI_CONTEXT2|FI_RX_CQ_DATA)
 #define MSTRO_OFI_MRMODE  (FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_LOCAL | FI_MR_MMU_NOTIFY)
 #define MSTRO_OFI_EP_TYPE FI_EP_RDM
-#define MSTRO_OFI_VERSION FI_VERSION(1,10)
+#define MSTRO_OFI_VERSION FI_VERSION(1,14)
 
 static mstro_drc_info g_drc_info = NULL;
 
@@ -126,133 +128,55 @@ static struct fi_info *g_fi = NULL;
 
 
 
-static inline
-mstro_status
-mstro_ep_desc_free(mstro_endpoint_descriptor desc)
-{
-  if(desc) {
-    mstro_endpoint_descriptor el,tmp;
-    LL_FOREACH_SAFE(desc,el,tmp) {
-      if(el->name)
-        free(el->name);
-      if(el->oob_cookie)
-        free(el->oob_cookie);
-      free(el);
-    }
-  }
-  return MSTRO_OK;
-}
 
+static
 mstro_status
-mstro_ep_desc_create_ofi(mstro_endpoint_descriptor *result_p,
-                         const struct fi_info *fi, struct fid_ep *ep)
+mstro_epd_create_ofi(const struct fi_info *fi, struct fid_ep *ep,
+                     struct mstro_endpoint *dst)
 {
-  if(result_p==NULL)
+  if(dst==NULL)
     return MSTRO_INVOUT;
   if(fi==NULL || ep==NULL)
     return MSTRO_INVARG;
 
-  enum mstro_endpoint_type ept = MSTRO_EP_INVALID;
-
-  switch(fi->addr_format) {
-    /** This is the list documented for libfabric 1.5 */
-    case FI_SOCKADDR_IN:
-      ept = MSTRO_EP_OFI_IN4;        break;
-    case FI_SOCKADDR_IN6:
-      ept = MSTRO_EP_OFI_IN6;        break;
-    case FI_SOCKADDR_IB:
-      ept = MSTRO_EP_OFI_IB;         break;
-    case FI_ADDR_PSMX:
-      ept = MSTRO_EP_OFI_PSMX;       break;
-    case FI_ADDR_GNI:
-      ept = MSTRO_EP_OFI_GNI;        break;
-#if FI_VERSION_GE(FI_VERSION(FI_MAJOR_VERSION,FI_MINOR_VERSION), FI_VERSION(1,5))
-    case FI_ADDR_PSMX2:
-      ept = MSTRO_EP_OFI_PSMX2;      break;
-    case FI_ADDR_BGQ:
-      ept = MSTRO_EP_OFI_BGQ;        break;
-    case FI_ADDR_MLX:
-      ept = MSTRO_EP_OFI_MLX;        break;
-    case FI_ADDR_STR:
-      ept = MSTRO_EP_OFI_STR;        break;
-#endif
-      /* unsupported ones: */
-    case FI_FORMAT_UNSPEC:
-    case FI_SOCKADDR:
-      ERR("fi_info entry with unsupported format, ignoring it\n");
-      break;
-      /* unexpected ones: */
-    default:
-      ERR("fi_info entry with unexpected format %d, ignoring it\n");
-      break;
+
+  /* now build protobuf-based version */
+  Mstro__Endpoint *e=NULL;
+  mstro_status status = mstro_ep_fi_to_ep(fi, ep, &e);
+  if(status!=MSTRO_OK) {
+    ERR("Failed to build endpoint descriptor: %d (%s)\n",
+        status, mstro_status_description(status));
+  } else {
+    dst->pbep = e;
   }
-  if(ept!=MSTRO_EP_INVALID) {
-    mstro_endpoint_descriptor target
-        = calloc(1, sizeof(struct mstro_endpoint_descriptor_));
-    if(target==NULL) {
-      ERR("Failed to allocate ep descriptor entry\n");
-      /* FIXME: avoid leaking */
-      return MSTRO_FAIL;
-    }
-    target->type = ept;
-    target->name = NULL;
-    /** all targets will fit 2x uint64, which is what psmx2 allocates
-     * in the union, except for a string provider. For that we need to
-     * query here */
-    size_t addr_len;
-    char *buf;
-#if FI_VERSION_GE(FI_VERSION(FI_MAJOR_VERSION,FI_MINOR_VERSION), FI_VERSION(1,5))
-    if(fi->addr_format==FI_ADDR_STR) {
-      int s = fi_getname(&ep->fid, NULL, &addr_len);
-      if(s<0) {
-        if(s!=-FI_ETOOSMALL) {
-          ERR("Failed to obtain endpoint name: %d (%s)\n",
-              s, fi_strerror(-s));
-          return MSTRO_FAIL;
-        }
-      }
-      buf = malloc(addr_len+1);
-      if(buf==NULL) {
-        ERR("Failed to allocate endpoint name buffer\n");
-        free(target);
-        return MSTRO_NOMEM;
-      }
-      buf[addr_len]='\0';
-    } else
-#endif
-    {
-      /* gni has largest buffer */
-      buf = (char*)&target->gni[0];
-      addr_len = sizeof(target->gni);
+
+  return status;
+}
+
+/** free the toplevel of @arg epl, but don't descend into the
+ * ep slot that may be shared with a mstro_endpoint->pbep */
+static
+mstro_status
+mstro_epl__free_shallow(Mstro__EndpointList *epl)
+{
+  if(epl!=NULL) {
+    NFREE(epl->eps);
+    if(epl->credentials) {
+      for(size_t i=0; i<epl->n_credentials; i++) {
+        mstro__ofi_credential__free_unpacked(epl->credentials[i],
+                                             NULL);
+      }      
+      free(epl->credentials);
     }
-    size_t assumed_addr_len = addr_len;
-    int s = fi_getname(&ep->fid, buf, &addr_len);
-    if(s<0) {
-      if(s==-FI_ETOOSMALL) {
-        ERR("Unexpectedly large endpoint name requirement: Tried %zu, need %zu\n",
-            assumed_addr_len, addr_len);
-      } else {
-        ERR("Failed to obtain endpoint name: %d, (%s)\n",
-            s, fi_strerror(-s));
+    if(epl->inforegs) {
+      for(size_t i=0; i<epl->n_inforegs; i++) {
+        mstro__ofi_memory_region__free_unpacked(epl->inforegs[i],
+                                                NULL);
       }
-      free(target);
-      return MSTRO_FAIL;
-    }
-#if FI_VERSION_GE(FI_VERSION(FI_MAJOR_VERSION,FI_MINOR_VERSION), FI_VERSION(1,5))
-    if(fi->addr_format==FI_ADDR_STR)
-      target->str = (char*)buf;
-#endif
-    assert(g_drc_info!=NULL);
-    mstro_drc_get_oob_string(&target->oob_cookie, g_drc_info);
-    DEBUG("Saved DRC cookie %s in ep-desc\n", target->oob_cookie);
-    target->next = NULL;
-    target->name = NULL;
-
-    *result_p = target;
+      free(epl->inforegs);
+    }    
+    free(epl);
   }
-  /* DEBUG("built ep description for type %d (%s) EP\n", */
-  /*       (*result_p)->type, tntcl_ep_descriptor_names[(*result_p)->type]); */
-
   return MSTRO_OK;
 }
 
@@ -263,568 +187,237 @@ mstro_ep_desc_create_ofi(mstro_endpoint_descriptor *result_p,
  * If EP is a linked list of endpoints, serialize them in order.
  *
  * If *result_p is NULL it will be allocated, otherwise
- * overwritten. */
+ * overwritten.
+ *
+ * This function re-uses the pbep/credentials/inforegs slots of the
+ * @arg ep objects, so care must be taken when freeing the resulting
+ * @arg *result_p to not invalidate them. Consider using
+ * @ref mstro_epl__free_shallow().
+ */
 
 #define MSTRO_EP_STRING_MAX (8*1024)
 
-/* IPv4 sockaddr */
-static inline
+static
 mstro_status
-mstro_ep_desc_serialize__in4(char *dst,
-                             const struct sockaddr_in *sin)
+mstro_ep__to_el(const struct mstro_endpoint *ep,
+                Mstro__EndpointList **result_p)
 {
-  mstro_status stat = MSTRO_UNIMPL;
+  assert(result_p!=NULL);
+  assert(*result_p==NULL);
+  mstro_status stat=MSTRO_UNIMPL;
+  Mstro__EndpointList *epl;
 
-  /* struct sockaddr_in is defined platform-independent and
-   * already using network byte order as needed, so we can handle
-   * it as a struct */
-  tpl_node *tns;
-  tpl_bin tb;
-
-  tb.sz = sizeof(struct sockaddr_in);
-  tb.addr = sin;
-  tns = tpl_map("B", &tb);
-  tpl_pack(tns,0);
-  void *buf;
-  size_t buflen;
-  if(-1==tpl_dump(tns,TPL_MEM, &buf, &buflen)) {
-    ERR("Not enough space to pack IPv4 endpoint\n");
-    return MSTRO_FAIL;
-  }
-  /* DEBUG("tpl dump of in4 address has length %zu\n", buflen); */
-  /* b64 encode */
-  size_t needed;
-  unsigned char *encoded = base64_encode(buf,buflen,&needed);
-  if(needed>MSTRO_EP_STRING_MAX) {
-    ERR("Cannot b64 encode IPv4 endpoint data, need %d\n", needed);
-    stat=MSTRO_FAIL; goto BAILOUT;
-  }
-  if(encoded!=NULL) {
-    strcpy(dst, (const char*)encoded);
-    /* DEBUG("Encoded in4 address: %s\n", dst); */
-  } else {
-    ERR("Cannot b64 encode IPv4 endpoint data, no memory\n");
+  assert(ep!=NULL);
+
+  epl = malloc(sizeof(Mstro__EndpointList));
+  if(epl==NULL) {
+    ERR("Failed to allocate endpoint list\n");
     stat=MSTRO_NOMEM;
     goto BAILOUT;
-  }
-  stat=MSTRO_OK;
-BAILOUT:
-  NFREE(encoded);
-  /* clean up */
-  tpl_free(tns);
-  free(buf);
+  } else {
+    mstro__endpoint_list__init(epl);
 
-  return stat;
-}
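+    /* the EndpointList carries three parallel arrays (eps, inforegs,
+     * credentials), one entry per endpoint in the linked list */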
+    size_t count = 0;
+    const struct mstro_endpoint *tmp;
+    LL_COUNT(ep,tmp,count);
+    epl->eps = malloc(count * sizeof(Mstro__Endpoint*));
+    if(epl->eps==NULL) {
+      ERR("Failed to allocate endpoint\n");
+      stat=MSTRO_NOMEM;
+      goto BAILOUT;
+    }
 
-/** take base64-encoded string (0-terminated) and unpack it into *sin */
-static inline
-mstro_status
-mstro_ep_desc_deserialize__in4(struct sockaddr_in *sin,
-                               char *b64_strval)
-{
-  mstro_status stat=MSTRO_UNIMPL;
+    epl->inforegs = malloc(count * sizeof(Mstro__OfiMemoryRegion*));
+    if(epl->inforegs==NULL) {
+      ERR("Failed to allocate endpoint\n");
+      stat=MSTRO_NOMEM;
+      goto BAILOUT;
+    }
 
-  tpl_node *tn;
-  tpl_bin tb;
-  tn = tpl_map( "B", &tb );
+    epl->credentials = malloc(count * sizeof(Mstro__OfiCredential*));
+    if(epl->credentials==NULL) {
+      ERR("Failed to allocate endpoint\n");
+      stat=MSTRO_NOMEM;
+      goto BAILOUT;
+    }
 
-  /* undo base64 */
-  size_t buflen;
-  unsigned char *buf = base64_decode((unsigned char*)b64_strval,
-                                     strlen(b64_strval),
-                                     &buflen);
-  if(buf==NULL) {
-    stat=MSTRO_NOMEM;
-    goto BAILOUT;
-  }
-  /* DEBUG("b64decode: buf of length %zu\n", buflen); */
+    size_t i=0;
+    const struct mstro_endpoint *elt;
+    LL_FOREACH(ep,elt) {
+      assert(elt->pbep!=NULL);
+
+      epl->eps[i] = elt->pbep;
+      assert(elt->pbep->proto_case==MSTRO__ENDPOINT__PROTO_OFIPROTO);
+      epl->n_eps=i+1;
 
-  tpl_load( tn, TPL_MEM, buf, buflen);
-  tpl_unpack( tn, 0 );
-  tpl_free(tn);
+      epl->inforegs[i] = malloc(sizeof(Mstro__OfiMemoryRegion));
+      if(epl->inforegs[i]==NULL) {
+        ERR("Failed to allocate inforeg\n");
+        stat=MSTRO_NOMEM;
+        goto BAILOUT;
+      } else {
+        mstro__ofi_memory_region__init(epl->inforegs[i]);
+        epl->inforegs[i]->baseaddr = elt->component_info_addr;
+        epl->inforegs[i]->raw_key.len = elt->component_info_keysize;
+        epl->inforegs[i]->raw_key.data = malloc(epl->inforegs[i]->raw_key.len);
+        if(epl->inforegs[i]->raw_key.data==NULL) {
+          ERR("Failed to allocate space for raw MR key\n");
+          stat=MSTRO_NOMEM;
+          goto BAILOUT;
+        }
+        memcpy(epl->inforegs[i]->raw_key.data,
+               elt->component_info_raw_key,
+               epl->inforegs[i]->raw_key.len);
+        epl->n_inforegs=i+1;
+      }
 
-  assert(tb.sz==sizeof(struct sockaddr_in));
+      epl->credentials[i] = malloc(sizeof(Mstro__OfiCredential));
+      if(epl->credentials[i]==NULL) {
+        ERR("Failed to allocate credential region\n");
+        stat=MSTRO_NOMEM;
+        goto BAILOUT;
+      } else {
+        mstro__ofi_credential__init(epl->credentials[i]);
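+        /* only GNI endpoints carry a DRC credential; for all other
+         * transports the credential entry stays empty */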
+        switch(elt->pbep->ofiproto) {
+          case MSTRO__OFI_ENDPOINT_KIND__GNI:
+            epl->credentials[i]->val_case = MSTRO__OFI_CREDENTIAL__VAL_DRC;
+            epl->credentials[i]->drc = malloc(sizeof(Mstro__CredDRC));
+            if(epl->credentials[i]->drc==NULL) {
+              ERR("Failed to allocate credential space\n");
+              stat=MSTRO_NOMEM;
+              goto BAILOUT;
+            } else {
+              mstro__cred_drc__init(epl->credentials[i]->drc);
+              mstro_drc_get_credential(&epl->credentials[i]->drc->credential,
+                                       g_drc_info);
+            }
+            break;
+          default:
+            DEBUG("No credential support for endpoint type %d\n",
+                  elt->pbep->proto_case);
+        }
+        epl->n_credentials=i+1;
+      }
 
-  memcpy(sin, tb.addr, tb.sz);
+      i++;
+    }
+    assert(epl->n_eps == count);
+  }
   stat = MSTRO_OK;
-BAILOUT:
-  free(tb.addr);  /* our responsibility to free it */
+  *result_p = epl;
 
+BAILOUT:
+  if(stat!=MSTRO_OK) {
+    if(epl) {
+      if(epl->eps) {
+        free(epl->eps); /* don't descend, entries are owned by the ep->pbep slots */
+        epl->eps=NULL;
+        epl->n_eps=0;
+      }
+      mstro__endpoint_list__free_unpacked(epl, NULL);
+    }
+  }
   return stat;
 }
 
-static inline
+/** serialize all endpoints in the linked list of endpoints starting
+ * at @arg ep into a serialized and base64-encoded Mstro__AppInfo */
+static
 mstro_status
-mstro_ep_desc_serialize__in6(char *dst,
-                             const struct sockaddr_in6 *sin)
+mstro_ofi__epl_to_serialized_appinfo(char **result_p,
+                                     const struct mstro_endpoint *ep)
 {
-  mstro_status stat = MSTRO_UNIMPL;
-  /* struct sockaddr_in6 is defined platform-independent and
-   * already using network byte order as needed, so we can handle
-   * it as a struct */
-  tpl_node *tns;
-  tpl_bin tb;
-
-  DEBUG("sockaddr_in6 with port %d\n", ntohs(sin->sin6_port));
-
-  tb.sz = sizeof(struct sockaddr_in6);
-  tb.addr = sin;
-  tns = tpl_map("B", &tb);
-  tpl_pack(tns,0);
-  void *buf;
-  size_t buflen;
-  if(-1==tpl_dump(tns,TPL_MEM, &buf, &buflen)) {
-    ERR("Not enough space to pack IPv6 endpoint\n");
-    return MSTRO_FAIL;
-  }
-  /* b64 encode */
-  size_t needed;
-  unsigned char *encoded = base64_encode(buf, buflen, &needed);
-  if(needed>MSTRO_EP_STRING_MAX) {
-    ERR("Cannot b64 encode IPv6 endpoint data, need %d\n", needed);
-    stat=MSTRO_FAIL;
+  unsigned char *encoded=NULL;
+  uint8_t *pbbuf=NULL;
+  Mstro__AppInfo appinfo = MSTRO__APP_INFO__INIT;
+  
+  mstro_status status = mstro_ep__to_el(ep, &appinfo.eps);
+  if(status!=MSTRO_OK) {
+    ERR("Failed to create protobuf AppInfo for endpoints\n");
     goto BAILOUT;
   }
 
-  if(encoded!=NULL) {
-    strcpy(dst, (char*)encoded);
-  } else {
-    ERR("Cannot b64 encode IPv6 endpoint data, no memory\n");
-    stat=MSTRO_FAIL;
+  size_t pblen = mstro__app_info__get_packed_size(&appinfo);
+  DEBUG("Endpoint AppInfo/PM info packing needs %zu bytes \n", pblen);
+  pbbuf = malloc(sizeof(uint8_t)*pblen);
+  if(pbbuf==NULL) {
+    ERR("Failed to allocate protobuf buffer\n");
+    status=MSTRO_NOMEM;
     goto BAILOUT;
   }
-
-  stat=MSTRO_OK;
-BAILOUT:
-  /* clean up */
-  NFREE(encoded);
-  tpl_free(tns);
-  free(buf);
-  return stat;
-}
-
-static inline
-mstro_status
-mstro_ep_desc_deserialize__in6(struct sockaddr_in6 *sin,
-                               char *b64_strval)
-{
-  mstro_status stat=MSTRO_UNIMPL;
-  tpl_node *tn;
-  tpl_bin tb;
-  tn = tpl_map( "B", &tb );
-
-  /* undo base64 */
-  size_t buflen;
-  unsigned char *buf = base64_decode((unsigned char*)b64_strval,
-                                     strlen(b64_strval),
-                                     &buflen);
-
-  if(buf==NULL) {
-    stat=MSTRO_NOMEM;
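+  /* packing must produce exactly the byte count computed above */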
+  if(mstro__app_info__pack(&appinfo, pbbuf)!=pblen) {
+    ERR("Failed to pack appinfo\n");
+    status = MSTRO_FAIL;
     goto BAILOUT;
   }
 
-  tpl_load( tn, TPL_MEM, buf, buflen);
-  tpl_unpack( tn, 0 );
-  tpl_free(tn);
-
-  assert(tb.sz==sizeof(struct sockaddr_in6));
-
-  memcpy(sin, tb.addr, tb.sz);
-  stat=MSTRO_OK;
-BAILOUT:
-  free(tb.addr);  /* our responsibility to free it */
-
-  return stat;
-}
-
-mstro_status
-mstro_ep_desc_serialize(char **result_p,
-                        const struct mstro_endpoint *ep)
-{
-  unsigned char *encoded=NULL;
-  mstro_status stat=MSTRO_UNIMPL;
-
-  if(result_p==NULL)
-    return MSTRO_INVOUT;
-  if(ep==NULL)
-    return MSTRO_INVARG;
-  const struct mstro_endpoint_descriptor_ *epd = ep->descr;
-
-  /* Read the documentation at
-   * https://troydhanson.github.io/tpl/userguide.html#_linked_lists
-   * for details */
-
-  tpl_node *tn=NULL;
-  const struct mstro_endpoint_descriptor_ *elt;
-  struct serialized_endpoint_element serialized_element;
-
-  assert(MSTRO_EP__MAX<=INT_MAX);
-
-  tn = tpl_map("A(S(IssUUc#))", &serialized_element, MSTRO_OFI_KEY_LEN_MAX);
-  char *strval = alloca(MSTRO_EP_STRING_MAX*sizeof(char));
-  if(strval==NULL)
-    return MSTRO_NOMEM;
-
-
-  LL_FOREACH(epd,elt) {
-    /* each one should serialize into STRVAL buffer, base64-encoded,
-     * NULL-terminated */
-    /* for code-reuse we don't use a 'switch-case' here */
-    if(elt->type == MSTRO_EP_OFI_IN4) {
-      stat = mstro_ep_desc_serialize__in4(strval, &elt->in4);
-    } else if(elt->type == MSTRO_EP_OFI_IN6) {
-      stat = mstro_ep_desc_serialize__in6(strval, &elt->in6);
-    } else if(elt->type == MSTRO_EP_OFI_IB) {
-      ERR("Infiniband endpoints unsupported\n");
-      return MSTRO_FAIL;
-    } else if(elt->type == MSTRO_EP_OFI_PSMX
-              || elt->type == MSTRO_EP_OFI_PSMX2
-              || elt->type == MSTRO_EP_OFI_GNI
-              || elt->type == MSTRO_EP_OFI_BGQ
-              || elt->type == MSTRO_EP_OFI_MLX) {
-      /* uint64_t based endpoints */
-      tpl_node *tns;
-      //      uint64_t val;
-      /* abusing C union properties we can refer to all entries by the
-       * gni slots; we end up always transporting 6 uint64, but who cares */
-      tns = tpl_map("UUUUUU",
-                    & elt->gni[0], & elt->gni[1],
-                    & elt->gni[2], & elt->gni[3],
-                    & elt->gni[4], & elt->gni[5]);
-      tpl_pack(tns, 0);
-      void *buf=NULL;
-      size_t buflen;
-      if(-1==tpl_dump(tns,TPL_MEM, &buf, &buflen)) {
-        ERR("Not enough space to pack fabric endpoint\n");
-        return MSTRO_FAIL;
-      }
-      /* b64 encode */
-
-      size_t needed;
-      encoded = base64_encode(buf, buflen, &needed);
-      if(needed>MSTRO_EP_STRING_MAX) {
-        ERR("Cannot b64 encode fabric endpoint data, need %d\n", needed);
-        stat=MSTRO_FAIL;
-        return stat;
-      }
-      if(encoded!=NULL) {
-        strcpy(strval, (char*)encoded);
-      } else {
-        ERR("Cannot b64 encode fabric endpoint data, no memory\n");
-        stat=MSTRO_FAIL;
-        goto BAILOUT_1;
-      }
-      stat=MSTRO_OK;
-   BAILOUT_1:
-      /* clean up */
-      NFREE(encoded);
-      tpl_free(tns);
-      free(buf);
-    } else if(elt->type == MSTRO_EP_OFI_STR) {
-      /* string-based endpoint descriptors */
-      char *tmp;
-      switch(elt->type) {
-        /* string based methods */
-        case MSTRO_EP_OFI_STR:
-          tmp = elt->str;
-          break;
-        default:
-          ERR("Confusion\n");
-          return MSTRO_FAIL;
-      }
-      size_t len = strlen(tmp)+1;
-      if(len>MSTRO_EP_STRING_MAX) {
-        ERR("config endpoint string too long (%d) for serialization\n", len);
-        return MSTRO_FAIL;
-      }
-      /* b64 encode */
-      size_t needed;
-      encoded = base64_encode((unsigned char*)tmp,
-		              strlen(tmp)+1, &needed);
-      if(needed>MSTRO_EP_STRING_MAX) {
-        ERR("Cannot b64 encode fabric endpoint data, need %d\n", needed);
-        stat=MSTRO_FAIL;
-        goto BAILOUT_2;
-      }
-      if(encoded!=NULL) {
-        strcpy(strval, (char*)encoded);
-      } else {
-        ERR("Cannot b64 encode fabric endpoint data, no memory\n");
-        stat=MSTRO_NOMEM;
-        goto BAILOUT_2;
-      }
-
-      stat=MSTRO_OK;
-   BAILOUT_2:
-      NFREE(encoded);
-    } else {
-      ERR("Unsupported MSTRO_EP type: %d (%s)\n",
-          elt->type, mstro_ep_descriptor_names[elt->type]);
-      return MSTRO_FAIL;
-    }
-
-    serialized_element.type = elt->type;
-    serialized_element.strval=strdup(strval); /* shrink wrap size */
-    if(serialized_element.strval==NULL)
-      return MSTRO_NOMEM;
-
-    mstro_drc_get_oob_string(&serialized_element.oob_cookie, g_drc_info);
-    DEBUG("Added OOB info %s\n", serialized_element.oob_cookie);
-
-    /* keys */
-    serialized_element.info_addr = ep->component_info_addr;
-    serialized_element.info_keysize = ep->component_info_keysize;
-    memcpy(serialized_element.info_key,
-           ep->component_info_raw_key,
-           ep->component_info_keysize);
-    DEBUG("Serialized MR addr 0x%" PRIx64 ", len %zu\n",
-          serialized_element.info_addr, ep->component_info_keysize);
-
-    /* DEBUG("Serializing %d:%s\n", serialized_element.type, */
-    /*       serialized_element.strval); */
-    tpl_pack(tn,1);
-    free(serialized_element.strval);
-    free(serialized_element.oob_cookie);
-  }
-
-  /* now write all to a string */
-  size_t len;
-  void *buf=NULL;
-  tpl_dump(tn,TPL_MEM, &buf, &len);
-  if(len>MSTRO_EP_STRING_MAX) {
-    ERR("EP serialization took %zu bytes, but only %zu are supported\n",
-        len, MSTRO_EP_STRING_MAX);
-    return MSTRO_FAIL;
-  }
   /* b64 encode */
   size_t needed;
-  encoded = base64_encode(buf, len, &needed);
-  if(needed>MSTRO_EP_STRING_MAX) {
-    ERR("Cannot b64 encode endpoint descriptor list, need %d\n", needed);
-    stat=MSTRO_FAIL;
-    goto BAILOUT;
-  }
+  encoded = base64_encode(pbbuf, pblen, &needed);
   if(encoded==NULL) {
     ERR("Cannot b64 encode endpoint descriptor list, out of memory\n");
-    stat=MSTRO_NOMEM;
+    status = MSTRO_NOMEM;
     goto BAILOUT;
   } else {
     *result_p = (char*)encoded;
   }
-#if 0
+
+#if 1
   {
-    size_t count = 0;
-    mstro_endpoint_descriptor_t tmp;
+    size_t count=0;
+    const struct mstro_endpoint *tmp;
     LL_COUNT(ep,tmp,count);
     DEBUG("serialized %d EPs to |%s|\n",
           count, *result_p);
   }
 #endif
+
 BAILOUT:
-  if(buf)
-    free(buf);
-  if(tn)
-    tpl_free(tn);
-  return stat;
+  if(pbbuf)
+    free(pbbuf);
+  if(status!=MSTRO_OK) {
+    if(encoded)
+      free(encoded);
+  }
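+  /* appinfo.eps borrows the per-endpoint pbep pointers, so only the
+   * list scaffolding must be freed here */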
+  if(appinfo.eps) {
+    mstro_epl__free_shallow(appinfo.eps);
+  }
+          
+  return status;
 }
 
 mstro_status
-mstro_ofi_pm_info(char **result_p)
+mstro_appinfo_serialize(const struct mstro_endpoint *ep,
+                        char **result_p)
 {
-  void *strbuf=NULL;
+  if(result_p==NULL)
+    return MSTRO_INVOUT;
+  if(ep==NULL)
+    return MSTRO_INVARG;
+
+  return mstro_ofi__epl_to_serialized_appinfo(result_p, ep);
+}
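+/* Typical usage (a sketch, error handling elided; releasing the
+ * base64 buffer with free() is assumed to be appropriate):
+ *
+ *   char *info = NULL;
+ *   if(mstro_appinfo_serialize(ep, &info)==MSTRO_OK) {
+ *     // hand the base64 string to the peer out-of-band
+ *     free(info);
+ *   }
+ */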
+
+
 
+mstro_status
+mstro_ofi_pm_info(char **result_p)
+{
   if(result_p==NULL)
     return MSTRO_INVOUT;
-  mstro_status status = MSTRO_OK;
 
-  unsigned char *encoded=NULL;
   if(g_endpoints==NULL) {
     ERR("No endpoints configured -- did you call mstro_pm_start()?\n");
     return MSTRO_FAIL;
   }
 
-  /* Read the documentation at
-   * https://troydhanson.github.io/tpl/userguide.html#_linked_lists
-   * for details */
-
-  tpl_node *tn;
-  struct mstro_endpoint *elt;
-  struct serialized_endpoint_element serialized_element;
-
-  assert(MSTRO_EP__MAX<=INT_MAX);
-
-  tn = tpl_map("A(S(IssUUc#))", &serialized_element, MSTRO_OFI_KEY_LEN_MAX);
-  char *strval = alloca(MSTRO_EP_STRING_MAX*sizeof(char));
-  if(strval==NULL)
-    return MSTRO_NOMEM;
-
-  mstro_status stat;
-
-  LL_FOREACH(&(g_endpoints->eps[0]),elt) {
-    mstro_endpoint_descriptor d = elt->descr;
-    /* each one should serialize into STRVAL buffer, base64-encoded,
-     * NULL-terminated */
-    /* for code-reuse we don't use a 'switch-case' here */
-    if(d->type == MSTRO_EP_OFI_IN4) {
-      stat = mstro_ep_desc_serialize__in4(strval, &d->in4);
-    } else if(d->type == MSTRO_EP_OFI_IN6) {
-      stat = mstro_ep_desc_serialize__in6(strval, &d->in6);
-    } else if(d->type == MSTRO_EP_OFI_IB) {
-      ERR("Infiniband endpoints unsupported\n");
-      stat=MSTRO_FAIL;
-      goto BAILOUT;
-    } else if(d->type == MSTRO_EP_OFI_PSMX
-              || d->type == MSTRO_EP_OFI_PSMX2
-              || d->type == MSTRO_EP_OFI_GNI
-              || d->type == MSTRO_EP_OFI_BGQ
-              || d->type == MSTRO_EP_OFI_MLX) {
-      /* uint64_t based endpoints */
-      tpl_node *tns;
-      /* abusing C union properties we can refer to all entries by the
-       * gni slots; we end up always transporting 6 uint64, but who cares */
-      tns = tpl_map("UUUUUU",
-                    & d->gni[0], & d->gni[1],
-                    & d->gni[2], & d->gni[3],
-                    & d->gni[4], & d->gni[5]);
-      tpl_pack(tns, 0);
-      void *buf=NULL;
-      size_t buflen;
-      if(-1==tpl_dump(tns,TPL_MEM, &buf, &buflen)) {
-        ERR("Not enough space to pack fabric endpoint\n");
-        stat = MSTRO_FAIL;
-        goto BAILOUT;
-      }
-      /* b64 encode */
-
-      size_t needed;
-      encoded = base64_encode(buf, buflen, &needed);
-      if(needed>MSTRO_EP_STRING_MAX) {
-        ERR("Cannot b64 encode fabric endpoint data, need %d\n", needed);
-        stat=MSTRO_FAIL;
-        goto BAILOUT;
-      }
-      if(encoded!=NULL) {
-        strcpy(strval, (char*)encoded);
-      } else {
-        ERR("Cannot b64 encode fabric endpoint data, no memory\n");
-        stat=MSTRO_FAIL;
-        goto BAILOUT_1;
-      }
-      stat=MSTRO_OK;
-   BAILOUT_1:
-      /* clean up */
-      NFREE(encoded);
-      tpl_free(tns);
-      free(buf);
-    } else if(d->type == MSTRO_EP_OFI_STR) {
-      /* string-based endpoint descriptors */
-      char *tmp;
-      switch(d->type) {
-        /* string based methods */
-        case MSTRO_EP_OFI_STR:
-          tmp = d->str;
-          break;
-        default:
-          ERR("Confusion\n");
-          stat=MSTRO_FAIL;
-          goto BAILOUT;
-      }
-      size_t len = strlen(tmp)+1;
-      if(len>MSTRO_EP_STRING_MAX) {
-        ERR("config endpoint string too long (%d) for serialization\n", len);
-        stat=MSTRO_FAIL;
-        goto BAILOUT;
-      }
-      /* b64 encode */
-      size_t needed;
-      encoded = base64_encode((unsigned char*)tmp,
-		              strlen(tmp)+1, &needed);
-      if(needed>MSTRO_EP_STRING_MAX) {
-        ERR("Cannot b64 encode fabric endpoint data, need %d\n", needed);
-        stat=MSTRO_FAIL;
-        goto BAILOUT_2;
-      }
-      if(encoded!=NULL) {
-        strcpy(strval, (char*)encoded);
-      } else {
-        ERR("Cannot b64 encode fabric endpoint data, no memory\n");
-        stat=MSTRO_NOMEM;
-        goto BAILOUT_2;
-      }
-
-      stat=MSTRO_OK;
-   BAILOUT_2:
-      NFREE(encoded);
-    } else {
-      ERR("Unsupported MSTRO_EP type: %d\n",
-          d->type, mstro_ep_descriptor_names[d->type]);
-      stat = MSTRO_FAIL;
-      goto BAILOUT;
-    }
-
-    serialized_element.type = d->type;
-    serialized_element.strval=strdup(strval); /* shrink wrap size */
-    if(serialized_element.strval==NULL) {
-      stat=MSTRO_NOMEM;
-      goto BAILOUT;
-    }
-
-    mstro_drc_get_oob_string(&serialized_element.oob_cookie, g_drc_info);
-    DEBUG("Added OOB info %s\n", serialized_element.oob_cookie);
-
-    serialized_element.info_addr = elt->component_info_addr;
-    serialized_element.info_keysize = elt->component_info_keysize;
-    memcpy(serialized_element.info_key,
-           elt->component_info_raw_key,
-           elt->component_info_keysize);
-    DEBUG("Serialized for PM_INFO MR addr 0x%" PRIx64 ", len %zu\n",
-              serialized_element.info_addr, elt->component_info_keysize);
-
-    /* DEBUG("Serializing %d:%s\n", serialized_element.type, */
-    /*       serialized_element.strval); */
-    tpl_pack(tn,1);
-    free(serialized_element.strval);
-    free(serialized_element.oob_cookie);
-  }
-  /* now write all to a string */
-  size_t len;
-  tpl_dump(tn,TPL_MEM, &strbuf, &len);
-  if(len>MSTRO_EP_STRING_MAX) {
-    ERR("EP serialization took %zu bytes, but only %zu are supported\n",
-        len, MSTRO_EP_STRING_MAX);
-    stat = MSTRO_FAIL;
-    goto BAILOUT;
-  }
-  /* b64 encode */
-  size_t needed;
-  encoded = base64_encode(strbuf, len, &needed);
-  if(needed>MSTRO_EP_STRING_MAX) {
-    ERR("Cannot b64 encode endpoint descriptor list, need %d\n", needed);
-    stat=MSTRO_FAIL;
-    goto BAILOUT;
-  }
-  if(encoded==NULL) {
-    ERR("Cannot b64 encode endpoint descriptor list, out of memory\n");
-    stat=MSTRO_NOMEM;
-    goto BAILOUT;
-  } else {
-    *result_p = (char*)encoded;
-  }
-#if 1
-  {
-    DEBUG("serialized %d EPs to |%s|\n",
-          g_endpoints->size, *result_p);
-  }
-#endif
-BAILOUT:
-  if(strbuf)
-    free(strbuf);
-  if(tn)
-    tpl_free(tn);
-  return status;
 
+  /* we rely on endpoints being linked from #0 */
+  assert(g_endpoints->size>0);
+  assert(g_endpoints->eps[0].pbep!=NULL);
+  return mstro_ofi__epl_to_serialized_appinfo(result_p, &(g_endpoints->eps[0]));
 }
 
 
@@ -857,324 +450,263 @@ BAILOUT:
 }
 
 
-/** unpack one or two uint64 values. DST must be six uint64 entries
- * wide.  This abuses structure union layout, just like at packing
- * time to handle the case where only a 1-element address is
- * transported: the junk we packed will be unpacked into the second
- * element */
-static inline
+
+/* deserialize an AppInfo from base64-encoded OOB data */
 mstro_status
-mstro_ep_desc_deserialize__uint64(uint64_t (*dst)[6],
-                                  char *b64_strval)
+mstro_appinfo_deserialize(const char *serialized_eps,
+                          Mstro__AppInfo **result_p)
 {
-  tpl_node *tn=NULL;
-  if(dst==NULL ||*dst==NULL) {
-    ERR("dst or *dst is NULL\n");
+  if(result_p==NULL)
     return MSTRO_INVOUT;
-  }
-  if(b64_strval==NULL) {
-    ERR("b64strval is NULL\n");
+  if(serialized_eps==NULL)
     return MSTRO_INVARG;
-  }
   mstro_status stat=MSTRO_UNIMPL;
 
-
+  Mstro__AppInfo *appinfo=NULL;
+  
   size_t buflen;
-  unsigned char *buf = base64_decode((unsigned char*)b64_strval,
-                                     strlen(b64_strval),
+  unsigned char *buf = base64_decode((unsigned char*)serialized_eps,
+                                     strlen(serialized_eps),
                                      &buflen);
 
   if(buf==NULL) {
+    ERR("Failed to base64-decode endpoint description -- invalid data or out of memory\n");
     stat=MSTRO_NOMEM;
     goto BAILOUT;
   }
 
-  uint64_t v[6];
-  tn = tpl_map("UUUUUU", v, v+1, v+2, v+3, v+4, v+5);
-  tpl_load(tn, TPL_MEM, buf, buflen);
-  tpl_unpack( tn, 0 );
-  tpl_free(tn);
-  memcpy(*dst,v,sizeof(v));
+  appinfo = mstro__app_info__unpack(NULL, buflen, buf);
+  if(appinfo==NULL) {
+    ERR("Failed to parse endpoint description\n");
+    stat=MSTRO_FAIL;
+    goto BAILOUT;
+  }
 
+  if(appinfo->eps==NULL
+     || appinfo->eps->n_eps!=appinfo->eps->n_inforegs
+     || appinfo->eps->n_eps!=appinfo->eps->n_credentials) {
+    ERR("Invalid EndpointList\n");
+    stat=MSTRO_INVARG;
+    goto BAILOUT;
+  }
+  DEBUG("Parsed %zu endpoints\n", appinfo->eps->n_eps);
+
+  for(size_t i=0; i<appinfo->eps->n_eps; i++) {
+    MSTRO_EP__EL_DESCRIBE(MSTRO_LOG_DEBUG,MSTRO_LOG_MODULE_COMM,
+                          "Received endpoint descriptor",
+                          appinfo->eps, i);
+  }
+
+  *result_p = appinfo;
+  appinfo=NULL;
+  
   stat=MSTRO_OK;
+
 BAILOUT:
-  NFREE(buf);
+  if(buf)
+    free(buf);
+
+  if(appinfo)
+    mstro__app_info__free_unpacked(appinfo, NULL);
 
   return stat;
 }
 
 
-/** read the comments in mstro_ep_desc_serialize for the format */
+
+
+/** Extract the address from @arg epd and return it in @arg *addr.
+ *
+ * @arg *addr must have space preallocated, and the available
+ * space must be passed in @arg *addrlen. If it is insufficient,
+ * MSTRO_NOMEM will be returned and @arg *addrlen will be set to the
+ * required space.
+ */
+static
 mstro_status
-mstro_ep_desc_deserialize(mstro_endpoint_descriptor *result_p,
-                          const char *serialized_eps)
+mstro_epd_to_ofi_addr(const Mstro__Endpoint *epd,
+                      uint32_t *addr_format,
+                      void *addr,
+                      size_t *addrlen)
 {
-  size_t element_idx=0;
-  tpl_node *tn=NULL;
-  mstro_endpoint_descriptor *next = NULL;
-
-  if(result_p==NULL)
+  if(addr_format==NULL || addr==NULL || addrlen==NULL)
     return MSTRO_INVOUT;
-  if(serialized_eps==NULL)
+  if(epd==NULL)
     return MSTRO_INVARG;
-  mstro_status stat=MSTRO_UNIMPL;
 
-  size_t buflen;
-  unsigned char *buf = base64_decode((unsigned char*)serialized_eps,
-                                     strlen(serialized_eps),
-                                     &buflen);
+  assert(epd->proto_case==MSTRO__ENDPOINT__PROTO_OFIPROTO);
+  assert(epd->addr_case==MSTRO__ENDPOINT__ADDR_OFIADDR);
 
-  if(buf==NULL) {
-    stat=MSTRO_NOMEM;
-    goto BAILOUT;
-  }
-
-  struct serialized_endpoint_element serialized_element;
-  //DEBUG("SE size: %zu, expecting %zu\n", sizeof(serialized_element),
-  //      4+8+8+8+MSTRO_OFI_KEY_LEN_MAX);
-  tn=tpl_map("A(S(IssUUc#))", &serialized_element, MSTRO_OFI_KEY_LEN_MAX);
-  tpl_load( tn, TPL_MEM, buf, buflen);
-  DEBUG("endpoint deserialization: got %d EPs\n", tpl_Alen(tn,1));
-
-  /* target for list-append */
-  next = result_p;
-
-  while( tpl_unpack( tn, 1 ) > 0 ) {
-    /* got another element */
-    enum mstro_endpoint_type eptype
-        = (enum mstro_endpoint_type) serialized_element.type;
-    mstro_endpoint_descriptor target
-        = malloc(sizeof(struct mstro_endpoint_descriptor_));
-    if(target==NULL) {
-      ERR("Failed to allocate EP descriptor\n");
-      return MSTRO_NOMEM;
-    }
-
-    if(serialized_element.oob_cookie!=NULL) {
-      target->oob_cookie=strdup(serialized_element.oob_cookie);
-      DEBUG("oob cookie: %s\n", target->oob_cookie);
-    } else {
-      target->oob_cookie=NULL;
-      DEBUG("no cookie\n");
-    }
-
-    target->name=NULL;
-
-    if(eptype==MSTRO_EP_OFI_IN4) {
-      stat = mstro_ep_desc_deserialize__in4(&(target->in4),
-                                            serialized_element.strval);
-    } else if(eptype==MSTRO_EP_OFI_IN6) {
-      stat = mstro_ep_desc_deserialize__in6(&(target->in6),
-                                            serialized_element.strval);
-    } else if(eptype==MSTRO_EP_OFI_IB) {
-      ERR("Endpoint typpe OFI_IB unsupported");
-      stat = MSTRO_UNIMPL;
-    } else if(eptype==MSTRO_EP_OFI_PSMX
-              || eptype==MSTRO_EP_OFI_PSMX2
-              || eptype==MSTRO_EP_OFI_GNI
-              || eptype==MSTRO_EP_OFI_BGQ
-              || eptype==MSTRO_EP_OFI_MLX) {
-      /** transparently handles one .. six entries */
-      stat = mstro_ep_desc_deserialize__uint64(&target->gni,
-                                               serialized_element.strval);
-    } else if(eptype==MSTRO_EP_OFI_STR) {
-      target->str=NULL;
-      stat = mstro_ep_desc_deserialize__str(&(target->str),
-                                            serialized_element.strval);
-    } else {
-      ERR("Unsupported EP type: %d\n", eptype);
-      stat = MSTRO_UNIMPL;
-    }
-    DEBUG("Parsed ep %d\n", element_idx);
-    if(stat==MSTRO_OK) {
-      /* DEBUG("%d. endpoint type %d (%s)\n", */
-      /*       ++i, eptype, mstro_ep_descriptor_names[eptype]); */
-      target->type = eptype;
-      target->next = NULL;
-      *next = target;
-      next = &target->next;
-
-      DEBUG("Found MR addr 0x%" PRIx64 ", keylen %" PRIu64 "\n",
-            serialized_element.info_addr, serialized_element.info_keysize);
-      target->info_addr = serialized_element.info_addr;
-      target->info_keysize = serialized_element.info_keysize;
-      assert(MSTRO_OFI_KEY_LEN_MAX>=serialized_element.info_keysize);
-      memcpy(target->info_key,
-             serialized_element.info_key,
-             serialized_element.info_keysize);
-    } else {
-      DEBUG("dropped %zu. endpoint type %d (%s)\n",
-           element_idx, eptype, mstro_ep_descriptor_names[eptype]);
-      if(target->oob_cookie) free(target->oob_cookie);
-      free(target);
-    }
-    free(serialized_element.strval);
-
-    element_idx++;
-  }
-BAILOUT:
-  free(buf);
-
-  return stat;
-}
+  const Mstro__OfiAddr *a = epd->ofiaddr;
+  switch(a->val_case) {
+    case MSTRO__OFI_ADDR__VAL__NOT_SET:
+      ERR("No address set, that's an error\n");
+      return MSTRO_FAIL;
+    case MSTRO__OFI_ADDR__VAL_UNSPEC:
+      ERR("No address value specified\n");
+      return MSTRO_FAIL;
+    case MSTRO__OFI_ADDR__VAL_SOCK:
+      ERR("Unspecific sockaddr not supported\n");
+      return MSTRO_UNIMPL;
 
-#define MSTRO_EP_DESC_STRMAX 1024
-const char *
-mstro_ep_desc_describe(mstro_endpoint_descriptor desc)
-{
-  int s;
-  /* FIXME: May need locking */
-  if(desc==NULL)
-    return "(null)";
-  if(!desc->name) {
-    char *buf = malloc(3* MSTRO_EP_DESC_STRMAX);
-    if(buf!=NULL) {
-      char *host = buf+MSTRO_EP_DESC_STRMAX;
-      char *service = buf+2*MSTRO_EP_DESC_STRMAX;
-      switch(desc->type) {
-        case MSTRO_EP_OFI_IN4:
-          s=getnameinfo((struct sockaddr*)&desc->in4, sizeof(struct sockaddr_in),
-                      host, MSTRO_EP_DESC_STRMAX,
-                      service, MSTRO_EP_DESC_STRMAX,
-                      NI_NUMERICHOST | NI_NUMERICSERV);
-	  if(s!=0) {
-	    ERR("Failed to obtain IN4 address: %d (%s)\n",
-	        errno, strerror(errno));
-	    abort();
-	  }
-          break;
-        case MSTRO_EP_OFI_IN6:
-          s=getnameinfo((struct sockaddr*)&desc->in6, sizeof(struct sockaddr_in6),
-                      host, MSTRO_EP_DESC_STRMAX,
-                      service, MSTRO_EP_DESC_STRMAX,
-                      NI_NUMERICHOST | NI_NUMERICSERV);
-	  if(s!=0) {
-	    ERR("Failed to obtain IN6 address: %d (%s)\n",
-	        errno, strerror(errno));
-	    abort();
-	  }
-          break;
-#ifdef HAVE_IB
-          /* case FI_SOCKADDR_IB: */
-          /*   ept = MSTRO_EP_OFI_IB;         break; */
-#endif
-        case MSTRO_EP_OFI_PSMX:
-        case MSTRO_EP_OFI_BGQ:
-        case MSTRO_EP_OFI_MLX:
-          s=snprintf(host, MSTRO_EP_DESC_STRMAX,
-		     "[%" PRIu64 "]", desc->psmx);
-          if(s<0 || s>MSTRO_EP_DESC_STRMAX) {
-	    ERR("Failed to write u64 address: %d (errno %d = %s)\n",
-	        s, errno, strerror(errno));
-	    abort();
-	  }
-          service[0]='\0';
-          break;
-        case MSTRO_EP_OFI_PSMX2:
-          s=snprintf(host, MSTRO_EP_DESC_STRMAX,
-                   "[%"PRIu64":%"PRIu64"]", desc->psmx2[0], desc->psmx2[1]);
-          if(s<0 || s>MSTRO_EP_DESC_STRMAX) {
-	    ERR("Failed to write psmx2 address: %d (errno %d = %s)\n",
-	        s, errno, strerror(errno));
-	    abort();
-	  }
-          service[0]='\0';
-          break;
-        case MSTRO_EP_OFI_GNI:
-          s=snprintf(host, MSTRO_EP_DESC_STRMAX,
-                   "[%" PRIu64 ":%" PRIu64 ":%" PRIu64 ":%" PRIu64 ":%" PRIu64 ":%" PRIu64 "/%s]",
-                   desc->gni[0], desc->gni[1],
-                   desc->gni[2], desc->gni[3],
-                   desc->gni[4], desc->gni[5],
-                   desc->oob_cookie);
-          if(s<0 || s>MSTRO_EP_DESC_STRMAX) {
-	    ERR("Failed to write GNI address: %d (errno %d = %s)\n",
-	        s, errno, strerror(errno));
-	    abort();
-	  }
-          service[0]='\0';
-	  break;
-        case MSTRO_EP_OFI_STR:
-          strcpy(host,desc->str);
-          service[0]='\0';
-          break;
-        default:
-          strcpy(host, "FIXME");
-          service[0]='\0';
+    case MSTRO__OFI_ADDR__VAL_IN4: {
+      struct sockaddr_in res = {
+        .sin_family      = a->in4->sin_family,
+        .sin_addr.s_addr = a->in4->sin_addr,
+        .sin_port        = a->in4->sin_port
+      };
+      if(*addrlen<sizeof(res)) {
+        *addrlen = sizeof(res);
+        return MSTRO_NOMEM;
+      } else {
+        memcpy(addr, &res, sizeof(res));
+        *addr_format = FI_SOCKADDR_IN;
       }
-      s=snprintf(buf, MSTRO_EP_DESC_STRMAX, "EPD type %d (%s) addr %s:%s",
-	         desc->type, mstro_ep_descriptor_names[desc->type],
-                 host, service);
-      if(s<0 || s>MSTRO_EP_DESC_STRMAX) {
-	      ERR("Failed to write endpoint description: %d (errno %d = %s)\n",
-	      s, errno, strerror(errno));
-	      abort();
+      break;
+    }
+    case MSTRO__OFI_ADDR__VAL_IN6: {
+      struct sockaddr_in6 res = {
+        .sin6_family          = a->in6->sin6_family,
+        .sin6_port            = a->in6->sin6_port,
+        .sin6_flowinfo        = a->in6->sin6_flowinfo,
+        .sin6_addr.s6_addr[0] = (a->in6->sin6_addr_0 & 0x000000ff) >> 0,
+        .sin6_addr.s6_addr[1] = (a->in6->sin6_addr_0 & 0x0000ff00) >> 8,
+        .sin6_addr.s6_addr[2] = (a->in6->sin6_addr_0 & 0x00ff0000) >> 16,
+        .sin6_addr.s6_addr[3] = (a->in6->sin6_addr_0 & 0xff000000) >> 24,
+        .sin6_addr.s6_addr[4] = (a->in6->sin6_addr_1 & 0x000000ff) >> 0,
+        .sin6_addr.s6_addr[5] = (a->in6->sin6_addr_1 & 0x0000ff00) >> 8,
+        .sin6_addr.s6_addr[6] = (a->in6->sin6_addr_1 & 0x00ff0000) >> 16,
+        .sin6_addr.s6_addr[7] = (a->in6->sin6_addr_1 & 0xff000000) >> 24,
+        .sin6_scope_id = a->in6->sin6_scope_id
+      };
+      if(*addrlen<sizeof(res)) {
+        *addrlen = sizeof(res);
+        return MSTRO_NOMEM;
+      } else {
+        memcpy(addr, &res, sizeof(res));
+        *addr_format = FI_SOCKADDR_IN6;
       }
-
-      desc->name = buf;
+      break;
     }
-  }
-  return desc->name;
-}
-
-mstro_status
-mstro_ep_desc_to_ofi_addr(uint32_t *addr_format,
-                          void **addr,
-                          size_t *addrlen,
-                          const mstro_endpoint_descriptor epd)
-{
-  if(addr_format==NULL || addr==NULL || addrlen==NULL)
-    return MSTRO_INVOUT;
-  if(epd==NULL)
-    return MSTRO_INVARG;
 
-  switch(epd->type) {
-    case MSTRO_EP_OFI_IN4:
-      *addr_format = FI_SOCKADDR_IN;
-      *addr = &epd->in4;
-      *addrlen = sizeof(struct sockaddr_in);
-      break;
-    case MSTRO_EP_OFI_IN6:
-      *addr_format = FI_SOCKADDR_IN6;
-      *addr = &epd->in6;
-      *addrlen = sizeof(struct sockaddr_in6);
+    case MSTRO__OFI_ADDR__VAL_GNI: {
+      uint64_t res[6] = { a->gni->a0, a->gni->a1, a->gni->a2,
+                          a->gni->a3, a->gni->a4, a->gni->a5 };
+      if(*addrlen<sizeof(res)) {
+        *addrlen = sizeof(res);
+        return MSTRO_NOMEM;
+      } else {
+        memcpy(addr, res, sizeof(res));
+        *addr_format = FI_ADDR_GNI;
+      }
       break;
-#ifdef HAVE_IB
-    case MSTRO_EP_OFI_IB:
-      *addr_format = FI_SOCKADDR_IB;
-      *addr = &epd->ib;
-      *addrlen = sizeof(struct sockaddr_ib);
+    }
+      
+    case MSTRO__OFI_ADDR__VAL_STR: {
+      size_t l=strlen(a->str)+1;
+      if(*addrlen<l) {
+        *addrlen = l;
+        return MSTRO_NOMEM;
+      } else {
+        memcpy(addr, a->str, l);
+        *addr_format = FI_ADDR_STR;
+      }
       break;
-#endif
-    case MSTRO_EP_OFI_PSMX:
-      *addr_format = FI_ADDR_PSMX;
-      *addr = &epd->psmx;
-      *addrlen = sizeof(uint64_t);
+    }
+
+    case MSTRO__OFI_ADDR__VAL_PSMX: {
+      if(*addrlen<sizeof(uint64_t)) {
+        *addrlen=sizeof(uint64_t);
+        return MSTRO_NOMEM;
+      } else {
+        memcpy(addr, &a->psmx, sizeof(uint64_t));
+        *addr_format = FI_ADDR_PSMX;
+      }
+      break;        
+    }
+      
+    case MSTRO__OFI_ADDR__VAL_PSMX2: {
+      uint64_t res[2] = { a->psmx2->a0, a->psmx2->a1 };
+      if(*addrlen<sizeof(res)) {
+        *addrlen = sizeof(res);
+        return MSTRO_NOMEM;
+      } else {
+        memcpy(addr, res, sizeof(res));
+        *addr_format = FI_ADDR_PSMX2;
+      }
       break;
-#if FI_VERSION_GE(FI_VERSION(FI_MAJOR_VERSION,FI_MINOR_VERSION), FI_VERSION(1,5))
-    case MSTRO_EP_OFI_PSMX2:
-      *addr_format = FI_ADDR_PSMX2;
-      *addr = &epd->psmx2;
-      *addrlen = 2*sizeof(uint64_t);
+    }
+      
+    case MSTRO__OFI_ADDR__VAL_PSMX3: {
+      uint64_t res[2] = { a->psmx3->a0, a->psmx3->a1 };
+      if(*addrlen<sizeof(res)) {
+        *addrlen = sizeof(res);
+        return MSTRO_NOMEM;
+      } else {
+        memcpy(addr, res, sizeof(res));
+        *addr_format = FI_ADDR_PSMX3;
+      }
       break;
-#endif
-    case MSTRO_EP_OFI_GNI:
-      *addr_format = FI_ADDR_GNI;
-      *addr = &epd->gni;
-      *addrlen = 6*sizeof(uint64_t);
+    }
+      
+    case MSTRO__OFI_ADDR__VAL_IB: {
+      *addr_format = FI_SOCKADDR_IB;
+      ERR("IB addresses unimplemented\n");
+      return MSTRO_UNIMPL;
+    }
+    case MSTRO__OFI_ADDR__VAL_IB_UD: {
+      uint64_t res[4] = { a->ib_ud->a0, a->ib_ud->a1,
+                          a->ib_ud->a2, a->ib_ud->a3 };
+      if(*addrlen<sizeof(res)) {
+        *addrlen = sizeof(res);
+        return MSTRO_NOMEM;
+      } else {
+        memcpy(addr, res, sizeof(res));
+        *addr_format = FI_ADDR_IB_UD;
+      }
       break;
+    }
+      
+    case MSTRO__OFI_ADDR__VAL_BGQ:
+    case MSTRO__OFI_ADDR__VAL_MLX:
+    case MSTRO__OFI_ADDR__VAL_EFA:
     default:
-      ERR("Unimplemented mgr address format: %d\n", epd->type);
+      ERR("Unsupported OFI address kind: %d\n", a->val_case);
       return MSTRO_UNIMPL;
   }
   return MSTRO_OK;
 }
+
 
 
 
 /** the component descriptor read from the PM */
 union mstro_component_descriptor g_pm_component_descriptor;
 
+#define MSTRO_OFI_KEY_LEN_MAX 8
 
 mstro_status
 mstro_mr_key_get(struct fi_info* fi, struct fid_mr* mr, 
@@ -1484,14 +1016,13 @@ mstro_ep_build_from_ofi(struct mstro_endpoint *dst,
   dst->next = NULL;
 
   /* store serialized local address */
-  retstat = mstro_ep_desc_create_ofi(&dst->descr, fi, ep);
+  retstat = mstro_epd_create_ofi(fi, ep, dst);
   if(retstat!=MSTRO_OK) {
     ERR("Failed to construct worker mgmt ep description: %d\n", retstat);
     goto BAILOUT_FAIL;
   }
 
-  retstat = mstro_ep_desc_serialize(&dst->addr_serialized,
-                                    dst);
+  retstat = mstro_appinfo_serialize(dst, &dst->serialized);
   if(retstat!=MSTRO_OK) {
     ERR("Failed to serialize worker mgmt ep description: %d\n", retstat);
     goto BAILOUT_FAIL;
@@ -1530,7 +1061,7 @@ BAILOUT:
 #define EP_DESC_BUF_MAX 1024
 
 const char *
-mstro_endpoint_describe(struct mstro_endpoint *ep)
+mstro_endpoint_describe(const struct mstro_endpoint *ep)
 {
   /* FIXME: this should lock the EP, save the string there, ... */
   /* for now this is only a debugging hack */
@@ -1577,8 +1108,9 @@ DONE:
  */
 
 static
-int mstro_ofi__fi_info_cmp(const struct fi_info *x1,
-                           const struct fi_info *x2)
+int
+mstro_ofi__fi_info_cmp(const struct fi_info *x1,
+                       const struct fi_info *x2)
 {
   assert(x1!=NULL); assert(x2!=NULL);
 
@@ -1591,6 +1123,43 @@ int mstro_ofi__fi_info_cmp(const struct fi_info *x1,
 
   /* DEBUG("p1: %d, p2: %d\n", p1, p2); */
 
+  /* newer versions may introduce unknown providers, so check here: */  
+  assert(FI_VERSION_GE(FI_VERSION(1,14),
+                       FI_VERSION(FI_MAJOR_VERSION,FI_MINOR_VERSION)));
+#define MSTRO__FI_PROTO_MAX FI_PROTO_PSMX3
+  /* provider order: sorted by preference */
+  const int proto_order[MSTRO__FI_PROTO_MAX+1] = {
+    [FI_PROTO_SHM]            = 0,
+    [FI_PROTO_IB_UD]          = 5,
+    [FI_PROTO_GNI]            = 10,
+    [FI_PROTO_IB_RDM]         = 50,
+    [FI_PROTO_RXM]            = 70,
+    [FI_PROTO_UDP]            = 90,
+    [FI_PROTO_MRAIL]          = 100,
+    [FI_PROTO_RDMA_CM_IB_XRC] = 200,
+    [FI_PROTO_RDMA_CM_IB_RC]  = 250,
+    [FI_PROTO_PSMX3]          = 200,
+    [FI_PROTO_PSMX2]          = 300,
+    [FI_PROTO_PSMX]           = 400,
+    [FI_PROTO_EFA]            = 500,
+    // coming after 1.14: [FI_PROTO_RXM_TCP]        = 600
+    [FI_PROTO_SOCK_TCP]       = 650,
+    [FI_PROTO_NETWORKDIRECT]  = 900,
+    [FI_PROTO_MLX]            = 910,
+    [FI_PROTO_RXD]            = 920,
+    [FI_PROTO_RSTREAM]        = 940,
+    [FI_PROTO_IWARP]          = 970,
+    [FI_PROTO_MXM]            = 980,
+    [FI_PROTO_IWARP_RDM]      = 990,
+    [FI_PROTO_UNSPEC]         = 1000,
+  };
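+  /* smaller rank sorts earlier; protocols absent from the
+   * initializer default to rank 0 and therefore sort like
+   * FI_PROTO_SHM */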
+
+  if(p1!=p2) {
+    assert(p1<=MSTRO__FI_PROTO_MAX);
+    assert(p2<=MSTRO__FI_PROTO_MAX);
+    return proto_order[p1] - proto_order[p2];
+  }
+    
   /* break ties on anything IP-related to prefer localhost */
   {
     if(x1->addr_format==FI_SOCKADDR_IN
@@ -1642,46 +1211,7 @@ int mstro_ofi__fi_info_cmp(const struct fi_info *x1,
     }
   }
 
-  /* otherwise prefer the most performant interconnects */
-
-  /* shared mem */
-  if(p1==FI_PROTO_SHM)
-    return -1;
-  if(p2==FI_PROTO_SHM)
-    return +1;
-
-  /* Aries */
-  if(p1==FI_PROTO_GNI)
-    return -1;
-  if(p2==FI_PROTO_GNI)
-    return +1;
-
-  /* modern Mellanox */
-  if(p1==FI_PROTO_RDMA_CM_IB_RC)
-    return -1;
-  if(p2==FI_PROTO_RDMA_CM_IB_RC)
-    return +1;
-
-  if(p1==FI_PROTO_IB_UD)
-    return -1;
-  if(p2==FI_PROTO_IB_UD)
-    return +1;
-
-  if(p1==FI_PROTO_RXM)
-    return -1;
-  if(p2==FI_PROTO_RXM)
-    return +1;
-
-  if(p1==FI_PROTO_UDP)
-    return -1;
-  if(p2==FI_PROTO_UDP)
-    return +1;
-
-  if(p1==FI_PROTO_SOCK_TCP)
-    return -1;
-  if(p2==FI_PROTO_SOCK_TCP)
-    return +1;
-
+  
   DEBUG("not ordering: %d/%d vs %d/%d\n",
         p1, x1->addr_format, p2, x2->addr_format);
 
@@ -1703,6 +1233,32 @@ mstro_ofi__order_fi_list(struct fi_info **fi)
 }
 
 
+static
+mstro_status
+mstro_ofi__drop_unusable_fis(struct fi_info **fi)
+{
+  assert(fi!=NULL);
+  struct fi_info **el=fi;
+
+  /** basically an implementation of LL_DELETE_IF() */
+  while (*el) {
+    /* check if we like it */
+    /* IPv6 currently has issues: addresses are accepted by the
+     * endpoint, but local-scope ones may hang afterwards */ 
+    if((*el)->addr_format==FI_SOCKADDR_IN6)  {
+      struct fi_info *victim = *el;
+      *el = victim->next;
+      victim->next=NULL;
+      DEBUG("Dropping IPv6 endpoint %s\n", fi_tostr(victim, FI_TYPE_INFO));
+      fi_freeinfo(victim);
+    } else {
+      el = &(*el)->next;
+    }
+  }
+  return MSTRO_OK;    
+}
+
 
 /** Populate @ref g_endpoints with an enabled endpoint for each useful
  * OFI endpoint discovered */
@@ -1741,8 +1297,11 @@ mstro_ofi_init(void)
   hints->mode = MSTRO_OFI_MODE;
   hints->ep_attr->type = MSTRO_OFI_EP_TYPE;
   hints->domain_attr->mr_mode = MSTRO_OFI_MRMODE;
+  /* FIXME: We should be able to support MANUAL progress when properly multithreading PC/PM */
+  hints->domain_attr->control_progress = FI_PROGRESS_AUTO;
+  hints->domain_attr->data_progress = FI_PROGRESS_AUTO;
 
-  /* we really want 1.8 or above */
+  /* we really want 1.14 or above */
   stat = fi_getinfo(MSTRO_OFI_VERSION, NULL, NULL, 0, hints, &fi);
   fi_freeinfo(hints);
 
@@ -1751,6 +1310,14 @@ mstro_ofi_init(void)
     retstat=MSTRO_FAIL; goto BAILOUT_FAIL;
   }
 
+  /* pre-select (= drop some) endpoints */
+  stat = mstro_ofi__drop_unusable_fis(&fi);
+  if(stat!=MSTRO_OK) {
+    ERR("Failed to clean up FI-list\n");
+    retstat = MSTRO_FAIL;
+    goto BAILOUT_FAIL;
+  }
+
   /* order endpoint list.
    *
    * The goal is to have the high-speed ones first. Ideally OFI should
@@ -1872,8 +1439,14 @@ mstro_ofi_finalize(bool destroy_drc_info)
       int s;
       struct mstro_endpoint *e = &g_endpoints->eps[i];
 
-      if(e->addr_serialized)
-        free(e->addr_serialized);
+      if(e->serialized)
+        free(e->serialized);
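+      /* the protobuf-based descriptor pieces are owned by this
+       * endpoint; release them via their free_unpacked functions */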
+      if(e->pbep)
+        mstro__endpoint__free_unpacked(e->pbep, NULL);
+      if(e->cred)
+        mstro__ofi_credential__free_unpacked(e->cred, NULL);
+      if(e->inforeg)
+        mstro__ofi_memory_region__free_unpacked(e->inforeg, NULL);
 
       if(e->peer_info_mr) {
         DEBUG("closing RDMA peer_info MR for ep %zu\n", i);
@@ -1914,7 +1487,6 @@ mstro_ofi_finalize(bool destroy_drc_info)
       //      CLOSE_FID(fi,"FABRIC");
 #undef CLOSE_FID
 
-      mstro_ep_desc_free(e->descr);
     }
     free(g_endpoints);
   }
@@ -2052,17 +1624,102 @@ BAILOUT:
   return retval;
 }
 
+static
+mstro_status
+mstro_ofi__check_compatibility(bool *suitable_p,
+                               const Mstro__Endpoint *remote,
+                               const struct mstro_endpoint *local)
+{
+  if(suitable_p==NULL)
+    return MSTRO_INVOUT;
+  if(remote==NULL ||local==NULL)
+    return MSTRO_INVARG;
+  assert(remote->proto_case == MSTRO__ENDPOINT__PROTO_OFIPROTO);
+  assert(local->pbep->proto_case == MSTRO__ENDPOINT__PROTO_OFIPROTO);
+
+  *suitable_p = (remote->ofiproto == local->pbep->ofiproto);
+  return MSTRO_OK;
+}
+
+
+
+/** try to insert the address specified in @arg remote into @arg local_ep.
+ * On success returns the translated address in @arg *translated_addr_p */
+static
+mstro_status
+mstro_ofi__try_epd_addr(const struct mstro_endpoint *local_ep,
+                        const Mstro__Endpoint *remote,
+                        fi_addr_t *translated_addr_p)
+{
+  bool suitable=false;
+  mstro_status status;
+  
+  status = mstro_ofi__check_compatibility(&suitable, remote, local_ep);
+  if(status!=MSTRO_OK) {
+    ERR("Transport coupling check failed: %d\n", status);
+    goto BAILOUT;
+  }
+  
+  if(!suitable) {
+    DEBUG("endpoints not compatible\n");
+    status = MSTRO_FAIL;
+    goto BAILOUT;
+  }
+
+  /* try address insertion */
+  uint32_t addr_format;
+#define DEFAULT_ADDR_LEN 128
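+  /* 128 bytes should comfortably hold all fixed-size OFI address
+   * formats handled by mstro_epd_to_ofi_addr(); FI_ADDR_STR
+   * endpoints may need more and then hit the NOMEM path below */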
+  uint8_t remote_addr[DEFAULT_ADDR_LEN];
+  size_t addrlen=DEFAULT_ADDR_LEN;
+  
+  status = mstro_epd_to_ofi_addr(remote,
+                                 &addr_format,
+                                 remote_addr, &addrlen);
+  if(status!=MSTRO_OK) {
+    if(status==MSTRO_NOMEM) {
+      /* FIXME: if we get NOMEM we could retry with larger addrlen allocation */
+      WARN("FIXME: Endpoint buffer too small and not retrying with larger buffer (have %zu, need %zu)\n",
+           DEFAULT_ADDR_LEN, addrlen);
+    }
+    ERR("Cannot convert EPD to address: %d\n", status);
+    goto BAILOUT;
+  }
+  
+  /* try that address in the EP */
+  int ret = fi_av_insert(local_ep->av,
+                         remote_addr, 1,
+                         translated_addr_p,
+                         /* no flags, no context: we want
+                          * synchronous insert */
+                         0, NULL);
+  if(ret!=1) {
+    /* we asked to insert 1, so we should get 1 insert confirmed */
+    ERR("failed to insert source address into endpoint av: %d\n",
+        ret);
+    status = MSTRO_FAIL;
+    goto BAILOUT;
+  }
+  DEBUG("AV of ep %s accepted address\n",
+        mstro_endpoint_describe(local_ep));
+  status = MSTRO_OK;
+  
+BAILOUT:
+  return status;
+}
+
 /** select the best match between remote and local endpoints */
 mstro_status
-mstro_ofi__select_endpoint(struct mstro_endpoint_descriptor_ *remote,
+mstro_ofi__select_endpoint(const Mstro__AppInfo *remote,
                            struct mstro_endpoint **local_p,
                            fi_addr_t *remote_addr_p)
 {
   mstro_status stat = MSTRO_UNIMPL;
 
-  mstro_endpoint_descriptor tmp = NULL;
   struct mstro_endpoint *my_ep = NULL;
-
+  
   mstro_ofi_msg_context ctx=NULL;
   stat = mstro_ofi__msg_context_create(&ctx, NULL, true);
   if(stat!=MSTRO_OK) {
@@ -2071,167 +1728,145 @@ mstro_ofi__select_endpoint(struct mstro_endpoint_descriptor_ *remote,
     goto BAILOUT;
   };
 
-  LL_FOREACH(remote, tmp) {
-    DEBUG("Trying remote EP %s\n", mstro_ep_desc_describe(tmp));
+  for(size_t i=0; i<remote->eps->n_eps; i++) {
+    const Mstro__EndpointList *epl = remote->eps;
+    const Mstro__Endpoint *remote = epl->eps[i];
+    WITH_MSTRO_EPL_ENTRY_DESCRIPTION(buf,epl,i,{
+        DEBUG("Trying remote EP %s\n", buf);};);
+    
     LL_FOREACH(g_endpoints->eps, my_ep) {
       DEBUG("Trying local EP %s\n", mstro_endpoint_describe(my_ep));
-      bool suitable=false;
-      stat = mstro_ofi__check_compatibility(&suitable, tmp, my_ep);
+
+      fi_addr_t translated_addr=FI_ADDR_UNSPEC;
+
+      stat = mstro_ofi__try_epd_addr(my_ep, remote, &translated_addr);
       if(stat!=MSTRO_OK) {
-        ERR("Transport coupling check failed: %d\n", stat);
-      } else if(suitable) {
-        /* try transfer */
-        /* FIXME: generalize to non-OFI */
-        uint32_t addr_format;
-        void *remote_addr;
-        size_t addrlen;
-        mstro_status stat
-            = mstro_ep_desc_to_ofi_addr(&addr_format,
-                                        &remote_addr, &addrlen,
-                                        tmp);
-        if(stat!=MSTRO_OK) {
-          ERR("Cannot convert EPD to address: %d\n", stat);
-          continue;
-        }
-        /* try that address in the EP */
-        fi_addr_t translated_addr=FI_ADDR_UNSPEC;
-        int ret = fi_av_insert(my_ep->av,
-                               remote_addr, 1,
-                               &translated_addr,
-                               /* no flags, no context: we want
-                                * synchronous insert */
-                               0, NULL);
-        if(ret!=1) {
-          /* we asked to insert 1, so we should get 1 insert confirmed */
-          ERR("failed to insert source address into endpoint av, skipping: %d\n",
-              ret);
-          continue; /* inner FOREACH */
+        DEBUG("local EP did not accept remote addr, skipping\n");
+        continue;
+      }
+      /* FIXME: Once we move to separate handler threads for message processing this work-around can go away */
+      /* If we are handling a PM protocol message which causes a new
+       * OFI endpoint selection (like transport does) we are running
+       * on the PC thread. In that case we'd need to do the
+       * fi_cq_read here, instead of relying on the normal handler to
+       * do it for us. But that is extremely hard without a full
+       * refactor of the handler code. So we just skip it for
+       * now. ... */
+      if(pthread_equal(pthread_self(),g_pc_thread) || pthread_equal(pthread_self(),g_pm_thread)) {
+        DEBUG("fi_read from PC or PM thread, skipping config block verification\n");
+      } else {
+        int ret = pthread_mutex_lock(&ctx->lock);
+        if(ret!=0) {
+          ERR("Failed to acquire msg context lock: %d\n", ret);
+          stat = MSTRO_FAIL;
+          goto BAILOUT;
         }
-        DEBUG("AV of ep %s accepted address\n",
-              mstro_endpoint_describe(my_ep));
-
-
-        /* FIXME: Once we move to separate handler threads for message processing this work-around can go away */
-        /* If we are handling a PM protocol message which causes a new
-         * OFI endpoint selection (like transport does) we are running
-         * on the PC thread. In that case we'd need to do the
-         * fi_cq_read here, instead of relying the normal handler to
-         * do it for us. But that is extremely hard without a full
-         * refactor of the handler code. So we just skip it for
-         * now. ... */
-        if(pthread_self()==g_pc_thread || pthread_self()==g_pm_thread) {
-          DEBUG("fi_read from PC or PM thread, skipping config block verification\n");
-        } else {
-          ret = pthread_mutex_lock(&ctx->lock);
-          if(ret!=0) {
-            ERR("Failed to acquire msg context lock: %d\n", ret);
-            stat = MSTRO_FAIL;
-            goto BAILOUT;
-          }
+        
+        uint64_t mr_addr=0;
+        /* since the EP accepted the address we assume that
+         * MR_RAW will be set identically (while we really should be
+         * checking a flag sent in the OOB info about the MR) */
+        assert(!(my_ep->fi->domain_attr->mr_mode & FI_MR_RAW)); /* FIXME */
 
-          uint64_t mr_addr=0;
-          /* since the EP accepted the address we interpolate that
-           * MR_RAW will be set identically (while we really should be
-           * checking a flag sent in the OOB info about the MR) */
-          assert(!(my_ep->fi->domain_attr->mr_mode & FI_MR_RAW)); /* FIXME */
-          mr_addr = tmp->info_addr;         /** FIXME: with RAW we'd need to translate */
-
-          uint64_t mr_key=0;
-          switch(tmp->info_keysize) {
-            case 1: { uint8_t  x; memcpy(&x,tmp->info_key, sizeof(x)); mr_key = x; break; }
-            case 2: { uint16_t x; memcpy(&x,tmp->info_key, sizeof(x)); mr_key = x; break; }
-            case 4: { uint32_t x; memcpy(&x,tmp->info_key, sizeof(x)); mr_key = x; break; }
-            case 8: {             memcpy(&mr_key,tmp->info_key, sizeof(mr_key));   break; }
-            default: {
-              ERR("Unhandled MR key length -- likely needs OFI based translation\n");
-              stat=MSTRO_UNIMPL;
-              goto BAILOUT;
-            }
-          }
-          DEBUG("Checking for PM config block MR at (remote addr) 0x%" PRIx64 ", key %" PRIx64 "\n",
-                mr_addr, mr_key);
-
-          assert(ctx->msg==NULL);
-          assert(my_ep->peer_info_mr!=NULL); /* incoming buffer has been registered at local endpoint set creation */
-          void * local_buf_mr_desc = fi_mr_desc(my_ep->peer_info_mr);
-          DEBUG("FI_READ with ctx %p\n", ctx);
-
-          int num_retries = 3;
-       RETRY_RDMA_CONFIG_BLOCK_READ:
-          DEBUG("Doing fi_read into addr %p (%zu bytes), mr 0x%" PRIx64 ", desc %p\n",
-                &g_pm_component_descriptor, sizeof(g_pm_component_descriptor),
-                my_ep->peer_info_mr, local_buf_mr_desc);
-          ret = fi_read(my_ep->ep,
-                        &g_pm_component_descriptor, sizeof(g_pm_component_descriptor),
-                        local_buf_mr_desc,
-                        translated_addr, mr_addr, mr_key,
-                        ctx);
-          if(ret==-FI_EAGAIN) {
-            DEBUG("RDMA read for PM config block needs to be retried\n");
-            sleep(1);
-            if(num_retries-->0)
-              goto RETRY_RDMA_CONFIG_BLOCK_READ;
-          }
-          if(ret<0) {
-            ERR("Failed to do RDMA read for PM config block: %d (%s)\n",
-                ret, fi_strerror(-ret));
-            pthread_mutex_unlock(&ctx->lock);
-            continue;
-          }
-          DEBUG("going to sleep waiting for RDMA read to return\n");
-          ret = pthread_cond_wait(&ctx->completion, &ctx->lock);
-          if(ret!=0) {
-            ERR("Failed to wait on msg ctx cond var for RDMA read: %d\n", ret);
-            stat = MSTRO_FAIL;
+        const Mstro__OfiMemoryRegion *inforeg = epl->inforegs[i];
+        
+        mr_addr = inforeg->baseaddr; /** FIXME: with RAW we'd need to translate */
+        
+        /* key is in inforeg->raw_key.data */
+        uint64_t mr_key=0;
+        switch(inforeg->raw_key.len) {
+          case 1: { uint8_t  x; memcpy(&x,     inforeg->raw_key.data, sizeof(x)); mr_key = x; break; }
+          case 2: { uint16_t x; memcpy(&x,     inforeg->raw_key.data, sizeof(x)); mr_key = x; break; }
+          case 4: { uint32_t x; memcpy(&x,     inforeg->raw_key.data, sizeof(x)); mr_key = x; break; }
+          case 8: {             memcpy(&mr_key,inforeg->raw_key.data, sizeof(mr_key));   break; }
+          default: {
+            ERR("Unhandled MR key length -- likely needs OFI based translation\n");
+            stat=MSTRO_UNIMPL;
             goto BAILOUT;
           }
-
-	  ret = pthread_mutex_unlock(&ctx->lock);
-	  if(ret!=0) {
-	    ERR("Failed to unlock msg context lock: %d (%s)\n", ret, strerror(ret));
-	    stat = MSTRO_FAIL;
-	    goto BAILOUT;
-	  }
-          DEBUG("Found remote config block (%s). type %" PRIu64 ", protocol version: %d.%d.%d, workflow: %s, component: %s\n",
-                g_pm_component_descriptor.version,
-                g_pm_component_descriptor.type,
-                g_pm_component_descriptor.protocol_version.major,
-                g_pm_component_descriptor.protocol_version.minor,
-                g_pm_component_descriptor.protocol_version.patch,
-                g_pm_component_descriptor.workflow_name,
-                g_pm_component_descriptor.component_name);
-          if(g_pm_component_descriptor.type==MSTRO_COMPONENT_TYPE_UNDEF
-             || 0!=strcmp(g_pm_component_descriptor.workflow_name,g_component_descriptor.workflow_name)) {
-            DEBUG("Component descriptor mismatch, illegal type or other workflow, skipping\n");
-            continue;
-          }
-          DEBUG("PM schema list %s \n", g_pm_component_descriptor.schema_list);
-          DEBUG("Component schema list %s \n", g_component_descriptor.schema_list);
-          if(!schema_list_compatible_p(g_component_descriptor.schema_list,
-                                       g_pm_component_descriptor.schema_list)) {
-            DEBUG("Component descriptor mismatch, PM does not support the schema list of component, skipping\n");
-            continue;
-          }
-        } /* end of possible config block verification */
-
-        *remote_addr_p = translated_addr;
-        *local_p = my_ep;
-        stat=MSTRO_OK;
-        DEBUG("Using local endpoint %s to communicate to remote at %s\n",
-             mstro_endpoint_describe(*local_p),
-             mstro_ep_desc_describe(tmp));
-
-        goto BAILOUT;
-
-      } else {
-        DEBUG("(deemed unsuitable)\n");
-      }
+        }
+        
+        DEBUG("Checking for PM config block MR at (remote addr) 0x%" PRIx64 ", key of len %zu value %" PRIx64 "\n",
+              mr_addr, inforeg->raw_key.len, mr_key);
+        
+        assert(ctx->msg==NULL);
+        assert(my_ep->peer_info_mr!=NULL); /* incoming buffer has been registered at local endpoint set creation */
+        void * local_buf_mr_desc = fi_mr_desc(my_ep->peer_info_mr);
+        DEBUG("FI_READ with ctx %p\n", ctx);
+        
+        int num_retries = 3;
+     RETRY_RDMA_CONFIG_BLOCK_READ:
+        DEBUG("Doing fi_read into addr %p (%zu bytes), mr 0x%" PRIx64 ", desc %p\n",
+              &g_pm_component_descriptor, sizeof(g_pm_component_descriptor),
+              my_ep->peer_info_mr, local_buf_mr_desc);
+        ret = fi_read(my_ep->ep,
+                      &g_pm_component_descriptor, sizeof(g_pm_component_descriptor),
+                      local_buf_mr_desc,
+                      translated_addr, mr_addr, mr_key,
+                      ctx);
+        if(ret==-FI_EAGAIN) {
+          DEBUG("RDMA read for PM config block needs to be retried\n");
+          sleep(1);
+          if(num_retries-->0)
+            goto RETRY_RDMA_CONFIG_BLOCK_READ;
+        }
+        if(ret<0) {
+          ERR("Failed to do RDMA read for PM config block: %d (%s)\n",
+              ret, fi_strerror(-ret));
+          pthread_mutex_unlock(&ctx->lock);
+          continue;
+        }
+        DEBUG("going to sleep waiting for RDMA read to return\n");
+        ret = pthread_cond_wait(&ctx->completion, &ctx->lock);
+        if(ret!=0) {
+          ERR("Failed to wait on msg ctx cond var for RDMA read: %d\n", ret);
+          stat = MSTRO_FAIL;
+          goto BAILOUT;
+        }
+        
+        ret = pthread_mutex_unlock(&ctx->lock);
+        if(ret!=0) {
+          ERR("Failed to unlock msg context lock: %d (%s)\n", ret, strerror(ret));
+          stat = MSTRO_FAIL;
+          goto BAILOUT;
+        }
+        DEBUG("Found remote config block (%s). type %" PRIu64 ", protocol version: %d.%d.%d, workflow: %s, component: %s\n",
+              g_pm_component_descriptor.version,
+              g_pm_component_descriptor.type,
+              g_pm_component_descriptor.protocol_version.major,
+              g_pm_component_descriptor.protocol_version.minor,
+              g_pm_component_descriptor.protocol_version.patch,
+              g_pm_component_descriptor.workflow_name,
+              g_pm_component_descriptor.component_name);
+        if(g_pm_component_descriptor.type==MSTRO_COMPONENT_TYPE_UNDEF
+           || 0!=strcmp(g_pm_component_descriptor.workflow_name,g_component_descriptor.workflow_name)) {
+          DEBUG("Component descriptor mismatch, illegal type or other workflow, skipping\n");
+          continue;
+        }
+        DEBUG("PM schema list %s \n", g_pm_component_descriptor.schema_list);
+        DEBUG("Component schema list %s \n", g_component_descriptor.schema_list);
+        if(!schema_list_compatible_p(g_component_descriptor.schema_list,
+                                     g_pm_component_descriptor.schema_list)) {
+          DEBUG("Component descriptor mismatch, PM does not support the schema list of component, skipping\n");
+          continue;
+        }
+      } /* end of possible config block verification */
+      
+      *remote_addr_p = translated_addr;
+      *local_p = my_ep;
+      stat=MSTRO_OK;
+      WITH_MSTRO_EPL_ENTRY_DESCRIPTION(str,epl,i,{
+          DEBUG("Using local endpoint %s to communicate to remote at %s\n",
+                mstro_endpoint_describe(*local_p),
+                str);});
+      goto BAILOUT;
+      
     }
   }
-
+  
   ERR("No matching transport combination found\n");
   stat = MSTRO_FAIL;
-
+  
 BAILOUT:
   stat |= mstro_ofi__msg_context_destroy(ctx);
 
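The raw_key switch above widens a provider's variable-length memory-registration key into the uint64_t that fi_read() takes. A standalone sketch of that idiom, assuming the key bytes are in host byte order (as the memcpy-based code implies); widen_mr_key is an illustrative name, not a function from this tree:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* widen a 1/2/4/8-byte raw key buffer into a uint64_t */
static int widen_mr_key(const void *raw, size_t len, uint64_t *key)
{
  switch (len) {
    case 1: { uint8_t  x; memcpy(&x, raw, sizeof(x)); *key = x; return 0; }
    case 2: { uint16_t x; memcpy(&x, raw, sizeof(x)); *key = x; return 0; }
    case 4: { uint32_t x; memcpy(&x, raw, sizeof(x)); *key = x; return 0; }
    case 8: {             memcpy(key, raw, sizeof(*key));       return 0; }
    default: return -1;  /* FI_MR_RAW-style long keys need provider translation */
  }
}

int main(void)
{
  uint32_t provider_key = 0xdeadbeefu;  /* pretend 4-byte provider key */
  uint64_t key = 0;
  assert(widen_mr_key(&provider_key, sizeof(provider_key), &key) == 0);
  printf("widened key = 0x%llx\n", (unsigned long long)key);
  return 0;
}
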
@@ -2239,6 +1874,7 @@ BAILOUT:
 }
 
 
+
 static
 mstro_status
 mstro_ofi__submit_message(struct mstro_endpoint *ep,
@@ -2434,39 +2070,35 @@ mstro_pm__register_app(Mstro__Pool__Join *join_msg,
   assert(join_msg!=NULL);
 
   /* unpack serialized address */
-  struct mstro_endpoint_descriptor_ *epd=NULL;
-  s = mstro_ep_desc_deserialize(&epd, join_msg->serialized_endpoint);
+  Mstro__AppInfo *epd=NULL;
+  s = mstro_appinfo_deserialize(join_msg->serialized_endpoint, &epd);
   if(s!=MSTRO_OK) {
     ERR("Failed to deserialize endpoint in JOIN message\n");
     goto BAILOUT_FREE;
   }
-  DEBUG("EPD serial %s\n", join_msg->serialized_endpoint);
-  DEBUG("EPD parsed as %s\n", mstro_ep_desc_describe(epd));
-
-  uint32_t addr_format;
-  void *remote_addr;
-  size_t addrlen;
-  mstro_status stat
-      = mstro_ep_desc_to_ofi_addr(&addr_format,
-                                  &remote_addr, &addrlen,
-                                  epd);
-  if(stat!=MSTRO_OK) {
-    ERR("Cannot convert EPD to address: %d\n", stat);
+  DEBUG("incoming serialized EPD: %s\n", join_msg->serialized_endpoint);
+  if(epd->eps==NULL) {
+    ERR("Empty EPD\n");
+    s=MSTRO_FAIL;
     goto BAILOUT_FREE;
   }
 
-  /* insert in endpoint */
-  void *context=NULL;
-  uint64_t flags=0;
-  fi_addr_t translated_addr;
-  int ret = fi_av_insert(ep->av,
-                         remote_addr, 1,
-                         &translated_addr,
-                         flags, context);
-  if(ret!=1) {
-    /* we asked to insert 1, so we should see ret==1 */
-    ERR("Failed to insert source address into ep: %d\n", ret);
-    s = MSTRO_FAIL;
+  
+  WITH_MSTRO_EPL_DESCRIPTION(str,epd->eps,{
+      DEBUG("EPD parsed as %s\n", str);
+    };);
+
+  assert(epd->eps->n_eps>0);
+  if(epd->eps->n_eps>1) {
+    WARN("App provided %zu contact endpoints, only trying #0 -- FIXME\n",
+         epd->eps->n_eps);
+  }
+  
+  fi_addr_t translated_addr=FI_ADDR_UNSPEC;
+  s = mstro_ofi__try_epd_addr(ep, epd->eps->eps[0], &translated_addr);
+  if(s!=MSTRO_OK) {
+    DEBUG("local EP did not accept remote addr, can't welcome app %s:%zu\n",
+          join_msg->component_name, join_msg->component_index);
     goto BAILOUT_FREE;
   }
 
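mstro_ofi__try_epd_addr now encapsulates the address-vector insertion that the removed lines did inline. A minimal sketch of the underlying libfabric pattern; the wrapper shape shown here is an assumption, only the fi_av_insert call itself is the documented API:

#include <rdma/fi_domain.h>

static int try_insert_addr(struct fid_av *av, const void *remote_addr,
                           fi_addr_t *translated)
{
  *translated = FI_ADDR_UNSPEC;
  /* count == 1, flags == 0, context == NULL: synchronous insert */
  int ret = fi_av_insert(av, remote_addr, 1, translated, 0, NULL);
  /* we asked to insert one address, so exactly one must be confirmed */
  return (ret == 1) ? 0 : -1;
}
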
@@ -2514,21 +2146,7 @@ BAILOUT_FREE:
 }
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
 typedef mstro_status (*mstro_msg_handler)(const struct mstro_msg_envelope *envelope,
                                           struct mstro_endpoint *ep);
 
@@ -2715,7 +2333,10 @@ mstro_ofi__check_and_handle_cq(struct mstro_endpoint *ep,
     }
 
     if(re_post) {
-      ; /* fresh receive will be posted later, and slot messages have no completion watchers */
+       /* fresh receive has been posted earlier, and slot messages have no completion watchers */
+      DEBUG("slot message handled, cleaning up envelope and ctx\n");
+      mstro_msg_envelope_free(ctx->msg);
+      status = mstro_ofi__msg_context_destroy(ctx);
     } else {
       /* other recv */
       DEBUG("non-slot message, cleaning up\n");
@@ -2845,61 +2466,6 @@ mstro_ofi_pc_loop(_Atomic bool *terminate)
 }
 
 
-mstro_status
-mstro_ofi__check_compatibility(bool *suitable_p,
-                               struct mstro_endpoint_descriptor_ *remote,
-                               struct mstro_endpoint *local)
-{
-  if(suitable_p==NULL)
-    return MSTRO_INVOUT;
-  if(remote==NULL ||local==NULL)
-    return MSTRO_INVARG;
-  switch(remote->type) {
-    case MSTRO_EP_OFI_IN4:
-    case MSTRO_EP_OFI_IN6:
-    case MSTRO_EP_OFI_IB:
-    case MSTRO_EP_OFI_PSMX:
-    case MSTRO_EP_OFI_PSMX2:
-    case MSTRO_EP_OFI_GNI:
-    case MSTRO_EP_OFI_BGQ:
-    case MSTRO_EP_OFI_MLX:
-    case MSTRO_EP_OFI_STR: {
-      /* these have a chance against current OFI-only endpoints */
-      uint32_t addr_format;
-      void *addr;
-      size_t addrlen;
-      mstro_status stat
-          = mstro_ep_desc_to_ofi_addr(&addr_format, &addr, &addrlen, remote);
-      if(stat!=MSTRO_OK) {
-        ERR("Cannot convert EPD to address: %d\n", stat);
-        *suitable_p =false;
-/* #if !defined(OFI_ISSUE_5453_RESOLVED) */
-      /* disable IPv6 for now */
-      } else if(addr_format == FI_SOCKADDR_IN6) {
-        *suitable_p = false;
-/* #endif */
-      } else {
-        if(addr_format==local->fi->addr_format) {
-          *suitable_p = true;
-        } else
-          *suitable_p = false;
-      }
-      break;
-    }
-    default:
-      DEBUG("Non-OFI transport endpoint encountered\n");
-      *suitable_p = false;
-      break;
-  }
-  return MSTRO_OK;
-}
-
-
-
-
-
-
-
 
 
 /* FIXME: should go to pool.c */
@@ -3024,14 +2590,16 @@ BAILOUT:
 
 /* end of FIXME (code for pool.c) */
 
+
+
 /** Attach current application to a running pool manager */
 mstro_status
 mstro_pm_attach(const char *remote_pm_info)
 {
   /* GNI cookies are special. We need to have the matching cookie before we can enable the endpoint. */
-  mstro_endpoint_descriptor pm_epd=NULL;
+  Mstro__AppInfo *pm_epd=NULL;
 
-  mstro_status s=mstro_ep_desc_deserialize(&pm_epd, remote_pm_info);
+  mstro_status s=mstro_appinfo_deserialize(remote_pm_info, &pm_epd);
 
   if(s!=MSTRO_OK) {
     ERR("Failed to parse pool manager info: %d (%s%s)\n",
@@ -3040,43 +2608,44 @@ mstro_pm_attach(const char *remote_pm_info)
         s==MSTRO_NOMEM ? " or invalid pool-manager info data" : "");
     goto BAILOUT;
   }
-
-  mstro_endpoint_descriptor tmp;
-  size_t num_entries=0;
-  LL_COUNT(pm_epd, tmp, num_entries);
-  DEBUG("Parsed %d pool manager endpoints\n", num_entries);
-
+  const Mstro__EndpointList *epl = pm_epd->eps;
+    
+  DEBUG("Parsed %d pool manager endpoints\n", epl->n_eps);
+  assert(epl->n_eps==epl->n_inforegs);
+  assert(epl->n_eps==epl->n_credentials);
+  
   {
-    mstro_endpoint_descriptor tmp;
     size_t num_cookies = 0;
-    
-    LL_FOREACH(pm_epd, tmp) {
-      DEBUG("Pool manager endpoint: %s\n", mstro_ep_desc_describe(tmp));
-      
-      if(tmp->oob_cookie) {
-        /* FIXME: this only handles DRC cookies, not other OOB cookies
-         * that other interconnects could be requiring (we don't
-         * support any at this time) */
-        if(tmp->type==MSTRO_EP_OFI_GNI) {
-          DEBUG("GNI descriptor found, oob cookie %s\n", tmp->oob_cookie);
-        } else {
-          DEBUG("DRC cookie found in non-GNI EP desc: %s, will use DRC although we probably don't need it\n",
-                tmp->oob_cookie);
-        }
-        
-        if(num_cookies<1) {
-          s = mstro_drc_init_from_oob_string(&g_drc_info, tmp->oob_cookie);
-          if(s!=MSTRO_OK) {
-            ERR("Failed to use provided DRC cookie\n");
-            goto BAILOUT;
+
+    for(size_t i=0; i<epl->n_credentials; i++) {
+      WITH_MSTRO_EPL_ENTRY_DESCRIPTION(str, epl, i, {
+          DEBUG("Inspecting pool manager endpoint for credentials: %s\n", str);
+        });
+
+      if(epl->credentials[i]) {
+        if(epl->eps[i]->ofiproto == MSTRO__OFI_ENDPOINT_KIND__GNI) {
+          assert(epl->credentials[i]->val_case == MSTRO__OFI_CREDENTIAL__VAL_DRC);
+          DEBUG("GNI descriptor found, oob cookie %" PRIu32 "\n",
+                epl->credentials[i]->drc->credential);
+          if(num_cookies<1) {
+            s = mstro_drc_init_from_credential(&g_drc_info,
+                                               epl->credentials[i]->drc->credential);
+            if(s!=MSTRO_OK) {
+              ERR("Failed to use provided DRC credential\n");
+              goto BAILOUT;
+            } else {
+              num_cookies++;
+            }
           } else {
-            num_cookies++;
-          }
+            WARN("Multiple GNI cookies found, first one will win\n");
+          }          
         } else {
-          WARN("Multiple GNI cookies found, first one will win\n");
+          if(epl->credentials[i]->val_case != MSTRO__OFI_CREDENTIAL__VAL__NOT_SET) {
+            DEBUG("Credential found in non-GNI EP. This is unsupported -- FIXME\n");
+          }
         }
       } else {
-        if(tmp->type==MSTRO_EP_OFI_GNI) {
+        if(epl->eps[i]->ofiproto == MSTRO__OFI_ENDPOINT_KIND__GNI) {
           WARN("GNI endpoint and no DRC cookie, things likely will go wrong\n");
         }
       }
@@ -3102,7 +2671,6 @@ mstro_pm_attach(const char *remote_pm_info)
   }
   g_component_descriptor.type = MSTRO_COMPONENT_TYPE_APP;
 
-
   s=mstro_ofi__select_endpoint(pm_epd, &g_pm_endpoint, &g_pm_addr);
   if(s!=MSTRO_OK) {
     ERR("Failed to select a suitable endpoint to talk to pool manager\n");
@@ -3123,14 +2691,13 @@ mstro_pm_attach(const char *remote_pm_info)
 
   /* FIXME make function set_transport_default() beware protobuf stack init
    */
-  Mstro__Pool__TransportKind transport_kinds[] =
-      {
-	    MSTRO__POOL__TRANSPORT_KIND__OFI
-	    ,MSTRO__POOL__TRANSPORT_KIND__GFS
+  Mstro__Pool__TransportKind transport_kinds[] = {
+    MSTRO__POOL__TRANSPORT_KIND__OFI
+    ,MSTRO__POOL__TRANSPORT_KIND__GFS
 #ifdef HAVE_MIO
-        ,MSTRO__POOL__TRANSPORT_KIND__MIO
+    ,MSTRO__POOL__TRANSPORT_KIND__MIO
 #endif
-      };
+  };
   size_t num_available_transports = sizeof(transport_kinds)/sizeof(transport_kinds[0]);
 
   Mstro__Pool__TransportMethods transport_methods
@@ -3143,13 +2710,13 @@ mstro_pm_attach(const char *remote_pm_info)
   if (env_transport_default != NULL) {
     int i;
     int found = 0;
-    for (	i = 0;	i < MSTRO__POOL__TRANSPORT_KIND__NUMBER_OF_KINDS; i++ ) {
+    for (i = 0;	i < MSTRO__POOL__TRANSPORT_KIND__NUMBER_OF_KINDS; i++ ) {
       if (!strcmp(env_transport_default, g_transport_registry[i].name)) {
         transport_methods.supported[0] = (Mstro__Pool__TransportKind)i;
         found = 1;
         DEBUG("Setting default transport method to %s (env %s)\n",
-             g_transport_registry[i].name,
-             MSTRO_ENV_TRANSPORT_DEFAULT);
+              g_transport_registry[i].name,
+              MSTRO_ENV_TRANSPORT_DEFAULT);
       }
     }
     if (! found)
@@ -3163,7 +2730,15 @@ mstro_pm_attach(const char *remote_pm_info)
 
   Mstro__Pool__Join join = MSTRO__POOL__JOIN__INIT;
   join.protocol_version = MSTRO_POOL_PROTOCOL_VERSION;
-  join.serialized_endpoint = g_pm_endpoint->addr_serialized;
+  assert(g_endpoints->size>0);
+  assert(g_endpoints->eps[0].pbep!=NULL);
+  s = mstro_ofi__epl_to_serialized_appinfo(&join.serialized_endpoint,
+                                           &(g_endpoints->eps[0]));
+  if(s!=MSTRO_OK) {
+    ERR("Failed to package serialized endpoint info\n");
+    goto BAILOUT;
+  }
+
   join.transport_methods = &transport_methods;
   join.component_name = g_initdata->component_name;
   join.component_index = g_initdata->component_index;
@@ -3218,11 +2793,14 @@ mstro_pm_attach(const char *remote_pm_info)
 
 
   /* Welcome received */
-  INFO("Welcome message received, our app id is %"PRIu64"\n",
+  INFO("Welcome message received, our id is: app %" PRIappid "\n",
        g_pool_app_id);
 
 
 BAILOUT:
+  if(pm_epd) {
+    mstro__app_info__free_unpacked(pm_epd, NULL);
+  }
   return s;
 }
 
diff --git a/maestro/pool.c b/maestro/pool.c
index e9a55193079b0d2e28883359a10d61cdf0715afc..e7894c99a347ee9466c41fbe906a954e14468705 100644
--- a/maestro/pool.c
+++ b/maestro/pool.c
@@ -285,7 +285,7 @@ mstro_pool__add(mstro_cdo cdo, mstro_cdo_state new_state)
   }
 
   assert(new_state==MSTRO_CDO_STATE_OFFERED_LOCALLY
-         || new_state==MSTRO_CDO_STATE_REQUIRED);
+         || new_state==MSTRO_CDO_STATE_REQUIRED_LOCALLY);
 
   /* find ID in pool */
   s = pthread_mutex_lock(&g_mstro_pool_local_mtx);
@@ -318,7 +318,8 @@ mstro_pool__add(mstro_cdo cdo, mstro_cdo_state new_state)
   if(status!=MSTRO_OK) {
     ERR("Failed to add CDO to pool entry\n");
   } else {
-    cdo->state = new_state;
+    mstro_cdo_state_set(cdo, new_state);
+
     /* DEBUG("added CDO to pool entry; could inject into scheduler knowledge\n"); */
 
     /* WARN("NOT signaling pool manager that entry is in pool now\n"); */
@@ -774,13 +775,11 @@ mstro_pool__remove(mstro_cdo cdo, mstro_cdo_state new_state)
   /* find entry */
   mstro_status status = MSTRO_UNIMPL;
 
-  mstro_cdo_state orig_state;
-  if(mstro_cdo_state_check(cdo, MSTRO_CDO_STATE_OFFERED)) {
-    orig_state= MSTRO_CDO_STATE_OFFERED;
-  } else if (mstro_cdo_state_check(cdo, MSTRO_CDO_STATE_REQUIRED)) {
-    orig_state = MSTRO_CDO_STATE_REQUIRED;
-  } else {
-    ERR("Illegal CDO state: %d\n");
+  mstro_cdo_state orig_state = mstro_cdo_state_get(cdo); 
+  if(orig_state != MSTRO_CDO_STATE_OFFERED
+   &&orig_state != MSTRO_CDO_STATE_REQUIRED
+   &&orig_state != MSTRO_CDO_STATE_REQUIRED_LOCALLY) {
+    ERR("Illegal CDO state: %d\n", orig_state);
     return MSTRO_FAIL;
   }
   WITH_CDO_ID_STR(
@@ -827,7 +826,7 @@ mstro_pool__remove(mstro_cdo cdo, mstro_cdo_state new_state)
       break;
     }
     default:
-      ERR("illegal new_state: %d\n");
+      ERR("illegal new_state: %d\n", new_state);
       return MSTRO_FAIL;
   }
   /* send out request. The CDO state will be changed to
@@ -849,7 +848,7 @@ mstro_pool__remove(mstro_cdo cdo, mstro_cdo_state new_state)
             mstro_cdo_state_set(cdo, MSTRO_CDO_STATE_RETRACTED_GLOBALLY);
             break;
           default:
-            ERR("illegale new_state %d\n");
+            ERR("illegal new_state %d\n", new_state);
             return MSTRO_FAIL;
         }
         break;
@@ -1133,7 +1132,7 @@ mstro_pool__demand_no_pm(mstro_cdo cdo, mstro_cdo_state new_state)
   }
 
   /* have locked entry.
-   *
+   * FIXME this code should be refactored to use mstro_pool__find_source_cdo or similar 
    * Since we're PM-less we need to find a local * source or go to
    * sleep until one comes in */
   size_t src=0;
@@ -1199,7 +1198,7 @@ HAVE_SOURCE:
   /* drop our own entry */
   status = mstro_pool_entry__drop(e, cdo);
   if(status==MSTRO_OK) {
-    cdo->state = new_state;
+    mstro_cdo_state_set(cdo, new_state);
 
     /* tell all who want to know. Note that this is only relevant for
      * non-DEMAND waiters, like WITHDRAWs that could not proceed, and
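The pool code now routes every state change through mstro_cdo_state_set() instead of the bare cdo->state assignments it replaces. A purely illustrative sketch of what such a setter buys, validated transitions published atomically; the real implementation lives in the CDO layer and its transition table is much stricter:

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

typedef int cdo_state;                 /* stand-in for mstro_cdo_state */

struct cdo { _Atomic cdo_state state; };

static bool transition_legal(cdo_state from, cdo_state to)
{
  (void)from; (void)to;
  return true;                         /* the real table is much stricter */
}

static void cdo_state_set(struct cdo *c, cdo_state next)
{
  cdo_state cur = atomic_load(&c->state);
  assert(transition_legal(cur, next)); /* catch invalid changes early */
  atomic_store(&c->state, next);       /* publish atomically for readers */
}

int main(void)
{
  struct cdo c = { 0 };
  cdo_state_set(&c, 1);
  return 0;
}
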
@@ -1375,7 +1374,8 @@ HAVE_SOURCE:
   /* drop our own entry */
   status = mstro_pool_entry__drop(e, cdo);
   if(status==MSTRO_OK) {
-    cdo->state = request->target_state;
+    mstro_cdo_state_set(cdo, request->target_state);
+
     result = true;
 
     /* tell all who want to know. Note that this is only relevant for
@@ -1524,7 +1524,7 @@ HAVE_SOURCE:
   /* drop our own entry */
   status = mstro_pool_entry__drop(e, cdo);
   if(status==MSTRO_OK) {
-    cdo->state = request->target_state;
+    mstro_cdo_state_set(cdo, request->target_state);
 
     /* tell all who want to know. Note that this is only relevant for
      * non-DEMAND waiters, like WITHDRAWs that could not proceed, and
@@ -1676,7 +1676,7 @@ mstro_pool__demand_with_pm(mstro_cdo cdo, mstro_cdo_state new_state)
   /* drop our own entry */
   status = mstro_pool_entry__drop(e,cdo);
   if(status==MSTRO_OK) {
-    cdo->state = new_state;
+    mstro_cdo_state_set(cdo, new_state);
 
     /* tell all who want to know. Note that this is only relevant for
      * non-DEMAND waiters, like WITHDRAWs that could not proceed, and
@@ -1831,7 +1831,7 @@ void mstro_cdo_wait_demand_async_with_pm(mstro_request request)
   /* drop our own entry */
   status = mstro_pool_entry__drop(e,cdo);
   if(status==MSTRO_OK) {
-    cdo->state = request->target_state;
+    mstro_cdo_state_set(cdo, request->target_state);
 
     /* tell all who want to know. Note that this is only relevant for
      * non-DEMAND waiters, like WITHDRAWs that could not proceed, and
@@ -1925,6 +1925,72 @@ mstro_pool__retract(mstro_cdo cdo, mstro_cdo_state new_state)
   return mstro_pool__remove(cdo, new_state);
 }
 
+mstro_status
+mstro_pool__find_cdo_with_local_id( const struct mstro_cdo_id *cdoid, mstro_cdo *result) {
+  mstro_status status=MSTRO_UNIMPL;
+
+  if(cdoid==NULL) {
+    ERR("NULL CDOID\n");
+    return MSTRO_INVARG;
+  }
+  if(result==NULL) {
+    ERR("NULL output destination\n");
+    return MSTRO_INVOUT;
+  }
+
+  int s = pthread_mutex_lock(&g_mstro_pool_local_mtx);
+  if(s!=0) {
+    ERR("Failed to lock pool\n");
+    return MSTRO_FAIL;
+  }
+
+  struct mstro_pool_entry* e = NULL;
+
+  struct mstro_cdo_id head = *cdoid;
+  head.local_id = MSTRO_CDO_LOCAL_ID_NONE;
+
+  HASH_FIND(hh, g_mstro_pool_local,
+            &head, sizeof(struct mstro_cdo_id),
+            e);
+
+  if(e==NULL) {
+    WITH_CDO_ID_STR(idstr, &head,
+                    ERR("No entry found in local pool for CDO gid %s\n", idstr););
+    status=MSTRO_NOENT;
+    goto BAILOUT_UNLOCK;
+  }
+
+  size_t i;
+  for(i=0; i<e->num_cdo_handles; i++) {
+    WITH_CDO_ID_STR(idstr, &e->cdo_handles[i]->gid,
+                    DEBUG("Inspecting %s, state %s\n",
+                          idstr, mstro_cdo_state_describe(e->cdo_handles[i]->state)););
+    if(e->cdo_handles[i]->id.local_id == cdoid->local_id) {              
+      break;
+    }
+  }
+  if(i<e->num_cdo_handles) {
+    DEBUG("Found CDO at index %zu that can provide data\n", i);
+    *result = e->cdo_handles[i];
+    status = MSTRO_OK;
+  } else {
+    WITH_CDO_ID_STR(idstr, cdoid, {
+        ERR("CDO %s entry has no source handle available with local-id %zu \n",
+            idstr, cdoid->local_id);});
+    status = MSTRO_NOENT;
+    *result = NULL;
+  }
+
+BAILOUT_UNLOCK:
+  s=pthread_mutex_unlock(&g_mstro_pool_local_mtx);
+  if(s!=0) {
+    ERR("Failed to unlock pool mutex\n");
+    status = MSTRO_FAIL;
+  }
+
+  return status;
+}
+
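mstro_pool__find_cdo_with_local_id resolves an exact handle in two steps: hash on the global id with the local part masked to MSTRO_CDO_LOCAL_ID_NONE, then scan the entry's handles for the requested local id. A compilable toy version of that two-level lookup using uthash, with simplified stand-in types rather than the real maestro structs:

#include <stdio.h>
#include <string.h>
#include "uthash.h"

#define LOCAL_ID_NONE 0ul

struct cdo_id { unsigned long gid; unsigned long local_id; };

struct pool_entry {
  struct cdo_id key;              /* hashed with key.local_id == LOCAL_ID_NONE */
  unsigned long handle_local_ids[4];
  size_t num_handles;
  UT_hash_handle hh;
};

static struct pool_entry *pool = NULL;

static int find_with_local_id(struct cdo_id id, size_t *idx)
{
  struct cdo_id head = id;
  head.local_id = LOCAL_ID_NONE;  /* mask the local part, as above */
  struct pool_entry *e = NULL;
  HASH_FIND(hh, pool, &head, sizeof(head), e);
  if (e == NULL) return -1;       /* MSTRO_NOENT in the real code */
  for (size_t i = 0; i < e->num_handles; i++)
    if (e->handle_local_ids[i] == id.local_id) { *idx = i; return 0; }
  return -1;
}

int main(void)
{
  static struct pool_entry e;
  memset(&e, 0, sizeof(e));
  e.key.gid = 42; e.key.local_id = LOCAL_ID_NONE;
  e.handle_local_ids[0] = 7; e.num_handles = 1;
  HASH_ADD(hh, pool, key, sizeof(struct cdo_id), &e);

  size_t idx;
  struct cdo_id want = { .gid = 42, .local_id = 7 };
  printf("found: %s\n", find_with_local_id(want, &idx) == 0 ? "yes" : "no");
  return 0;
}
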
 
 mstro_status
 mstro_pool__find_source_cdo(
@@ -1977,16 +2043,6 @@ mstro_pool__find_source_cdo(
   }
   if(i<e->num_cdo_handles) {
     DEBUG("Found CDO at index %zu that can provide data\n", i);
-    if(mstro_cdo_state_get(e->cdo_handles[i])==MSTRO_CDO_STATE_OFFERED_LOCALLY) {
-      WITH_CDO_ID_STR(idstr, cdoid, {
-          DEBUG("Doing implicit OFFER-ACK for CDO %s at transport ticket creation (src side) time\n",
-                idstr);
-        });
-      mstro_cdo_state state_flags = (mstro_cdo_state_get(e->cdo_handles[i])
-                                     & MSTRO_CDO_STATE_FLAGS);
-      mstro_cdo_state_set(e->cdo_handles[i],
-                          MSTRO_CDO_STATE_OFFERED | state_flags);
-    }
     *result = e->cdo_handles[i];
   } else {
     WITH_CDO_ID_STR(idstr, cdoid, {
diff --git a/maestro/pool_client.c b/maestro/pool_client.c
index cc44e96d4076a6645639b29a028d6fe6603ae17d..6e354410779cf42c0e6bb7a0e51f8b28d287d538 100644
--- a/maestro/pool_client.c
+++ b/maestro/pool_client.c
@@ -189,7 +189,7 @@ mstro_pc__handle_poolop_ack(Mstro__Pool__PoolOpAck *poack)
       new_state = MSTRO_CDO_STATE_OFFERED;
       goto shared_cdo_state_update;
 
-    case MSTRO__POOL__POOL_OP_ACK__POOL_OP__REQUIRE:
+    case MSTRO__POOL__POOL_OP_ACK__POOL_OP__REQUIRE: ;
       new_state = MSTRO_CDO_STATE_REQUIRED;
       goto shared_cdo_state_update;
 
@@ -217,8 +217,13 @@ mstro_pc__handle_poolop_ack(Mstro__Pool__PoolOpAck *poack)
           assert(poack->op == MSTRO__POOL__POOL_OP_ACK__POOL_OP__SEAL);
           mstro_cdo__replace_attributes(cdo, new_attributes);
         }
-        mstro_cdo_state state_flags = (mstro_cdo_state_get(cdo)
-                                       & MSTRO_CDO_STATE_FLAGS);
+        if(new_state==MSTRO_CDO_STATE_REQUIRED) {
+          mstro_cdo_state tmp = mstro_cdo_state_get(cdo);
+          /* we do not wait for the REQUIRE-ack when retracting, so ignore a late one */
+          if (tmp == MSTRO_CDO_STATE_RETRACTED)
+          //  || tmp == MSTRO_CDO_STATE_DISPOSED)
+            goto BAILOUT;
+        }
         if(new_state==MSTRO_CDO_STATE_OFFERED
            && (mstro_cdo_state_get(cdo) & MSTRO_CDO_STATE_OFFERED)) {
           /* an implicit ack was performed at initiate-transfer time */
@@ -227,7 +232,7 @@ mstro_pc__handle_poolop_ack(Mstro__Pool__PoolOpAck *poack)
                     idstr);
             });
         } else {
-          mstro_cdo_state_set(cdo, new_state | state_flags);
+          mstro_cdo_state_set_safe_flags(cdo, new_state);
           WITH_CDO_ID_STR(idstr, &id, {
               DEBUG("CDO %s now in state %d (%s)\n",
                     idstr, new_state, mstro_cdo_state_describe(new_state));
@@ -249,6 +254,7 @@ mstro_pc__handle_poolop_ack(Mstro__Pool__PoolOpAck *poack)
       ERR("PoolOpAck for invalid pool operation %d\n", poack->operand_case);
       return MSTRO_INVMSG;
   }
+BAILOUT:
   return MSTRO_OK;
 }
 
@@ -322,9 +328,9 @@ mstro_pc__construct_gfs_path_for_cdo(const mstro_cdo src_cdo,
   /* make the directory path */
   WITH_CDO_ID_STR(idstr, &src_cdo->gid, {
     size_t n = snprintf(*path, l1, "%s%" PRIappid "/%s/",
-                     g_mstro_transport_gfs_dir,
-                     g_pool_app_id,
-                     idstr);
+                        g_mstro_transport_gfs_dir,
+                        g_pool_app_id,
+                        idstr);
     if(n>=l1) {
       ERR("GFS path truncated for CDO %s\n", idstr);
       free(*path);
@@ -340,10 +346,10 @@ mstro_pc__construct_gfs_path_for_cdo(const mstro_cdo src_cdo,
   /* this time make the file path */
   WITH_CDO_ID_STR(idstr, &src_cdo->gid, {
     size_t n = snprintf(*path, l, "%s%" PRIappid "/%s/%" PRIlocalid,
-                     g_mstro_transport_gfs_dir,
-                     g_pool_app_id,
-                     idstr,
-                     src_cdo->gid.local_id);
+                        g_mstro_transport_gfs_dir,
+                        g_pool_app_id,
+                        idstr,
+                        src_cdo->gid.local_id);
     if(n>=l) {
       ERR("GFS path truncated for CDO %s\n", idstr);
       free(*path);
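Both snprintf calls above rely on the same truncation check: snprintf reports the length it would have written, so a result at or beyond the buffer size means the path was cut short. A minimal standalone version of the idiom:

#include <stdio.h>

int main(void)
{
  char buf[8];
  int n = snprintf(buf, sizeof(buf), "%s/%d", "gfs", 123456);
  if (n < 0 || (size_t)n >= sizeof(buf))
    fprintf(stderr, "path truncated (needed %d bytes)\n", n + 1);
  return 0;
}
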
@@ -355,6 +361,105 @@ mstro_pc__construct_gfs_path_for_cdo(const mstro_cdo src_cdo,
   return MSTRO_OK;
 }
 
+static inline
+mstro_status
+mstro_pc__calculate_data_size_and_offsets(
+                                      mstro_cdo src_cdo,
+                                      const Mstro__Pool__InitiateTransfer* init, 
+                                      int64_t *data_length,
+                                      int64_t *src_offset,
+                                      int64_t *dst_offset) {
+  mstro_status status = MSTRO_OK;
+  /* find out if the CDO (src or dst) is distributed */
+  bool is_distributed = init->distributed_cdo;
+  Mstro__Pool__Attributes *dst_attributes = init->dst_attributes;
+  
+  
+  if(is_distributed) {
+    mmbLayout *src_layout = NULL;
+    mmbLayout *dst_layout = NULL;
+    mmbError stat = MMB_OK;
+    /* read source and dst layouts */
+    status = mstro_attribute_pool_find_dist_layout(src_cdo->attributes_msg, &src_layout);
+    assert(status == MSTRO_OK);
+    status = mstro_attribute_pool_find_dist_layout(dst_attributes, &dst_layout);
+    assert(status == MSTRO_OK);
+
+    if(src_layout == NULL){
+      /* cook a default layout for src from dst */
+      stat = mmb_layout_dist_create_default_layout(dst_layout, &src_layout);
+    }
+    else if(dst_layout == NULL) {
+      /* cook a default layout for dst from src */
+      stat = mmb_layout_dist_create_default_layout(src_layout, &dst_layout);
+    }
+    assert(stat == MMB_OK);
+    /* find the mapping between src and dst layouts */
+    mmbLayoutIntersection *out_li = NULL;
+    mmbError mmb_s;
+    mmb_s = mmb_layout_compute_intersection(src_layout, dst_layout, &out_li);
+    assert(MMB_OK == mmb_s);
+    DEBUG("Found the intersection between required layout and my layout \n");
+    if (out_li) {
+      /*calculate my index at the intersection object*/
+      size_t index; /*dst_layout_index * n_src_pieces + src_layout_index*/
+      index = dst_layout->index * out_li->n_src_pieces + src_layout->index;
+      if(out_li->overlap[index].length <= 0) {
+        ERR("Invalid intersection length of %zu \n", out_li->overlap[index].length);
+        status =  MSTRO_FAIL;
+      }
+      else {
+        /*read the length, number of segments, and src and dst offsets from the intersection object*/
+        *dst_offset = out_li->overlap[index].dst_offset * src_layout->element.size_bytes; /* offset in bytes */
+        *src_offset = out_li->overlap[index].src_offset * src_layout->element.size_bytes; /* offset in bytes */
+        /* data_length is equal to the length of intersection * size of element in bytes */
+        DEBUG("Intersection length %zu and size of element is %zu \n", out_li->overlap[index].length, src_layout->element.size_bytes);
+        *data_length = out_li->overlap[index].length * src_layout->element.size_bytes ; /* data size in bytes */
+      }
+      
+      mmb_layout_destroy_mmbLayoutIntersection(out_li);
+    }
+    else {
+      ERR("No intersection between dst layout and src layout ... incorrect ticket\n");
+      status = MSTRO_FAIL;
+    }
+
+    /* free up allocated layout objects*/
+    if(src_layout) {
+      mmb_layout_destroy(src_layout);
+    }
+    if(dst_layout) {
+      mmb_layout_destroy(dst_layout);
+    }
+
+  } /* not distributed */
+  else {
+    const void *size=NULL;
+    enum mstro_cdo_attr_value_type vt;
+    status = mstro_attribute_dict_get(src_cdo->attributes,
+                               MSTRO_ATTR_CORE_CDO_SCOPE_LOCAL_SIZE,
+                               &vt, &size, NULL, false);
+    if(status==MSTRO_NOENT && vt==MSTRO_CDO_ATTR_VALUE_INVALID) {
+      ERR("CDO has mamba-array but no local-size\n");
+      return MSTRO_FAIL;
+    }
+    if(status!=MSTRO_OK) {
+      ERR("Failed to retrieve local-size on CDO\n");
+      return MSTRO_FAIL;
+    }
+
+    *data_length = *(int64_t*)size;
+    if(*data_length==-1) {
+      DEBUG("Source CDO empty, doing NULL transfer\n");
+      *data_length = 0;
+    }
+
+    *dst_offset = 0; /* not distributed ... write from the beginning */
+    *src_offset = 0; /* not distributed ... read from the beginning */
+  }
+
+  return status;
+}
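A toy numeric check of the indexing and byte conversion above (illustrative values; mmbLayout internals are not needed): with 3 source pieces, dst piece 1 and src piece 2 select overlap slot 1*3 + 2 == 5, and element counts convert to bytes via the element size:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  /* overlap slots are laid out dst-major: dst_index * n_src_pieces + src_index */
  size_t n_src_pieces = 3, src_index = 2, dst_index = 1;
  size_t slot = dst_index * n_src_pieces + src_index;   /* == 5 */

  /* pretend intersection: 100 elements, src offset 40, dst offset 60, 8 B each */
  int64_t length = 100, src_off = 40, dst_off = 60, elem = 8;
  printf("slot %zu: %" PRId64 " bytes, src byte offset %" PRId64
         ", dst byte offset %" PRId64 "\n",
         slot, length * elem, src_off * elem, dst_off * elem);
  return 0;
}
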
 
 static inline
 mstro_status
@@ -380,17 +485,37 @@ mstro_pc__handle_initiate_transfer(const Mstro__Pool__InitiateTransfer* init)
       WITH_CDO_ID_STR(there_idstr, &dstcdoid, {
           NOISE("ticket for CDO gid %s (here), gid %s (there)\n",
                here_idstr, there_idstr);});});
-
-  /* we may see the ticket before OFFER ack. In that case
-   * find_source_cdo does an implicit ack by setting state to OFFERED
-   * (PM would not send ticket if it has not handled the OFFER) */
-  if (!(MSTRO_OK == mstro_pool__find_source_cdo(&srccdoid, init->dst_attributes,
-                                                &src_cdo))) {
-    WITH_CDO_ID_STR(idstr, &srccdoid,
+  if (srccdoid.local_id != MSTRO_CDO_LOCAL_ID_NONE) {  
+    /* We trust that the PM gave us the correct local-id */
+    if (!(MSTRO_OK == mstro_pool__find_cdo_with_local_id(&srccdoid, &src_cdo))) {
+      WITH_CDO_ID_STR(idstr, &srccdoid,
                     ERR("Cannot transfer CDO gid %s because not in local pool\n",
                         idstr););
-    return MSTRO_FAIL;
+      return MSTRO_FAIL;
+    }
   }
+  else { /* we do not have a valid local-id ... look for an appropriate CDO */
+    if (!(MSTRO_OK == mstro_pool__find_source_cdo(&srccdoid, init->dst_attributes, &src_cdo))) {
+      WITH_CDO_ID_STR(idstr, &srccdoid,
+                    ERR("Cannot transfer CDO gid %s because not in local pool\n",
+                        idstr););
+      return MSTRO_FAIL;
+    }
+  }
+
+  /* we may see the ticket before OFFER ack. In that case
+   * we do an implicit ack by setting state to OFFERED
+   * (the PM would not send a ticket if it had not handled the OFFER) */
+  if(mstro_cdo_state_get(src_cdo)==MSTRO_CDO_STATE_OFFERED_LOCALLY) {
+    WITH_CDO_ID_STR(idstr, &srccdoid, {
+        DEBUG("Doing implicit OFFER-ACK for CDO %s at transport ticket creation (src side) time\n",
+              idstr);
+      });
+    mstro_cdo_state state_flags = (mstro_cdo_state_get(src_cdo)
+                                   & MSTRO_CDO_STATE_FLAGS);
+    mstro_cdo_state_set(src_cdo,
+                        MSTRO_CDO_STATE_OFFERED | state_flags);
+  }
 
   if(init->dst_attributes==NULL) {
     WARN("No attributes on CDO\n");
@@ -402,7 +527,7 @@ mstro_pc__handle_initiate_transfer(const Mstro__Pool__InitiateTransfer* init)
     }
   }
 
-  DEBUG("Initiating transfer from src app %zu (me) to dst app %zu of CDO %s\n",
+  DEBUG("Initiating transfer from src app %" PRIappid " (me) to dst app %" PRIappid " of CDO %s\n",
         g_pool_app_id, init->dst_appid->id, src_cdo->name);
 
   if(g_pool_app_id==init->dst_appid->id) {
@@ -431,26 +556,22 @@ mstro_pc__handle_initiate_transfer(const Mstro__Pool__InitiateTransfer* init)
   ticket.srcid = &myid;
 
   mstro_status s=MSTRO_UNIMPL;
-  const void *size=NULL;
-  enum mstro_cdo_attr_value_type vt;
-  s = mstro_attribute_dict_get(src_cdo->attributes,
-                               MSTRO_ATTR_CORE_CDO_SCOPE_LOCAL_SIZE,
-                               &vt, &size, NULL, false);
-  if(s==MSTRO_NOENT && vt==MSTRO_CDO_ATTR_VALUE_INVALID) {
-    ERR("CDO has mamba-array but no local-size\n");
-    return MSTRO_FAIL;
-  }
-  if(s!=MSTRO_OK) {
-    ERR("Failed to retrieve local-size on CDO\n");
-    return MSTRO_FAIL;
+  int64_t realsize, src_offset, dst_offset;
+
+  /* Calculate the length of the data, the src and dst offsets, and the number of segments */
+  s = mstro_pc__calculate_data_size_and_offsets(
+              src_cdo, init, &realsize, &src_offset, &dst_offset);
+  if (s != MSTRO_OK) {
+    return s;
   }
+  
+  /* fill ticket with the gathered information */
+  ticket.src_offset = src_offset;
+  ticket.dst_offset = dst_offset;
+  ticket.n_segments = init->n_segments;
+  ticket.distributed_cdo = init->distributed_cdo;
 
-  int64_t realsize = *(int64_t*)size;
-  if(realsize==-1) {
-    DEBUG("Source CDO empty, doing NULL transfer\n");
-    realsize = 0;
-  }
-  if (   init->methods->supported[0] == MSTRO__POOL__TRANSPORT_KIND__MIO
+  if (init->methods->supported[0] == MSTRO__POOL__TRANSPORT_KIND__MIO
       && (!g_mio_available || (realsize % getpagesize()) != 0 )
       ){
     WARN("Not issuing a ticket with MIO. Either not available or CDO size (%zu)"
@@ -552,7 +673,7 @@ mstro_pc__handle_initiate_transfer(const Mstro__Pool__InitiateTransfer* init)
 	  struct mstro_pm_app_registry_entry *e;
       mstro_status status = mstro_pm_app_lookup(appid.id, &e);
 	  assert(e!=NULL); // we befriended dst app earlier, it should be in the registry
-      ticket.src_serialized_endpoint = e->ep->addr_serialized;
+      ticket.src_serialized_endpoint = e->ep->serialized;
       ofi.h = &rh; /* rest to be filled in transport_execute */
       ticket.want_completion = 1; /* so we can refcount-- */
       ticket.ofi = &ofi;
@@ -569,7 +690,7 @@ mstro_pc__handle_initiate_transfer(const Mstro__Pool__InitiateTransfer* init)
   }
   ticket.attributes = src_cdo->attributes_msg;
 
-  INFO("Issued ticket to App %" PRIu64 " for CDO %s, and starting execute process\n", init->dst_appid->id, src_cdo->name);
+  INFO("Issued ticket to app %" PRIu64 " for CDO %s, and starting execute process\n", init->dst_appid->id, src_cdo->name);
 
   NOISE("TransferTicket using path %s\n", ticket.gfs->path);
   NOISE("TransferTicket cdo size %" PRIi64 "\n", ticket.data_size);
@@ -686,7 +807,7 @@ mstro_pc__transport_send_completion(mstro_app_id srcappid,
       return MSTRO_FAIL;
     }
     WITH_CDO_ID_STR(idstr, srccdoid, {
-        DEBUG("Sent completion of transfer for %s to %" PRIappid "\n",
+        DEBUG("Sent completion of transfer for %s to app %" PRIappid "\n",
               idstr, srcappid);});
   }
 
@@ -718,15 +839,20 @@ mstro_pc__handle_transfer_ticket(Mstro__Pool__TransferTicket* ticket)
                            ticket->method))
                        ->name););
   mstro_cdo dst_cdo;
-  status = mstro_pool__find_sink_cdo(&cdoid, ticket->attributes,
+  status = mstro_pool__find_cdo_with_local_id(&cdoid, &dst_cdo);
+
+  if (status != MSTRO_OK) {
+    /* we did not find the cdo by local id */
+    status = mstro_pool__find_sink_cdo(&cdoid, ticket->attributes,
                                      &dst_cdo);
-  if(MSTRO_OK!=status && MSTRO_NOENT!=status) {
+    if(MSTRO_OK!=status && MSTRO_NOENT!=status) {
     WITH_CDO_ID_STR(idstr, &cdoid,
                     ERR("Cannot transfer CDO gid %s because not in local pool\n",
                         idstr););
     return MSTRO_FAIL;
+    }
   }
-
+  
   if(MSTRO_NOENT==status) {
     WITH_CDO_ID_STR(idstr, &cdoid, {
         WARN("Incoming ticket for CDO %s, but CDO not present in local pool, skipping ticket\n",
@@ -737,7 +863,7 @@ mstro_pc__handle_transfer_ticket(Mstro__Pool__TransferTicket* ticket)
     }
   } else {
     WITH_CDO_ID_STR(idstr, &dst_cdo->gid, {
-        DEBUG("Initiating incoming transfer to %" PRIappid " (me) from %zu for CDO %s (%s)\n",
+        DEBUG("Initiating incoming transfer to app %" PRIappid " (me) from %zu for CDO %s (%s)\n",
               g_pool_app_id, ticket->srcid->id, dst_cdo->name, idstr);});
 
     /* ensure we know how to talk to the recipient */
@@ -747,7 +873,16 @@ mstro_pc__handle_transfer_ticket(Mstro__Pool__TransferTicket* ticket)
         return status;
     }
 
-   /* Execute transport (non-blocking) */
+    /* Check and update the number of segments on the dst CDO.
+     * If n_segments of the CDO is zero, this is the first ticket and we
+     * take the n_segments value from the ticket; otherwise we are not
+     * the first ticket and leave it alone. On completion of every
+     * transport n_segments is decremented until it reaches zero,
+     * marking that all required data has arrived.
+     */
+    int64_t expected = 0;
+    atomic_compare_exchange_strong(&dst_cdo->n_segments, &expected, ticket->n_segments);
+    /* Execute transport (non-blocking) */
     status = mstro_transport_execute(dst_cdo, ticket);
     if(MSTRO_OK != status) {
       ERR("Failure in transport execute for CDO %s\n",
@@ -772,7 +907,7 @@ mstro_pc__handle_transfer_completed(Mstro__Pool__MstroMsg *msg)
   assert(completion!=NULL); assert(app_id!=MSTRO_APP_ID_INVALID);
   mstro_status s = MSTRO_UNIMPL;
 
-  DEBUG("Received transfer completion message from %zu\n", app_id);
+  DEBUG("Received transfer completion message from app %" PRIappid "\n", app_id);
 
   /* if this is an OFI ticket: call mstro_transport_rdma_src_execute_bh */
   struct mstro_transport_mreg_table_entry* regentry = NULL;
@@ -829,11 +964,12 @@ mstro_pc__handle_event(const Mstro__Pool__Event *ev)
     return MSTRO_INVMSG;
   }
 
-  DEBUG("Pool event for sid %" PRIu64 ", kind %d (payload case %d)\n",
-        ev->subscription_handle->id, ev->kind, ev->payload_case);
+  DEBUG("Pool event for sid %" PRIu64 ", kind %d (payload case %d), ctime %" PRIu64 "\n",
+        ev->subscription_handle->id, ev->kind, ev->payload_case,
+        ((uint64_t)ev->ctime->sec * NSEC_PER_SEC) + (uint64_t) ev->ctime->nsec);
 
   mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PC_NUM_POOL_EVENTS, 1);
-  uint64_t sid = ev->subscription_handle->id;
+  //uint64_t sid = ev->subscription_handle->id;
 
   return mstro_pool_event_consume(ev);
 }
diff --git a/maestro/pool_manager.c b/maestro/pool_manager.c
index 43922231b87fb1535e90cddd9c86a9ec7c3d2766..6106078b59cc41e7507f3d93f126fb0ecc5f6f43 100644
--- a/maestro/pool_manager.c
+++ b/maestro/pool_manager.c
@@ -186,7 +186,7 @@ mstro_pm__send_ack(mstro_app_id app_id,
   /* send it off: this function automatically packs the message and sends it. The   */
   status = mstro_pmp_send_nowait(app_id, &msg);
   if(status!=MSTRO_OK) {
-    ERR("Failed to send %s-ACK to %zu\n", app_id);
+    ERR("Failed to send %s-ACK to app %" PRIappid "\n", app_id);
   }
   
   return status; 
@@ -274,8 +274,8 @@ mstro_pm__event_bh(mstro_event event,
   DEBUG("continuation %p, handler %p\n", cont, cont->bh_handler);
   mstro_status s = cont->bh_handler(event, cont);
   if(s!=MSTRO_OK) {
-    ERR("Executing handler %p failed: %d (%s)\n",
-        cont->bh_handler, s, mstro_status_description(s));
+    ERR("Executing continuation handler failed: %d (%s)\n",
+        s, mstro_status_description(s));
   }  
 }
 
@@ -340,7 +340,15 @@ mstro_pm__continuation_create(
 
   return res;
 }
-    
+
+static inline
+void
+mstro_pm__continuation_destroy(
+  struct mstro_pm__continuation_ctx *ctx)
+{
+  assert(ctx!=NULL);
+  free(ctx);
+}   
 
 /* send event notification to subscribers.
  *
@@ -410,9 +418,17 @@ mstro_pm__event_notify_and_continue(Mstro__Pool__Event *pool_event_msg,
   if(s!=MSTRO_OK) {
     ERR("Failed to schedule event handler: %d (%s)\n",
         s, mstro_status_description(s));
-    return s;
+    goto BAILOUT_FREE;
   }
 
+  Mstro__Pool__Timestamp ctime = MSTRO__POOL__TIMESTAMP__INIT;
+  ctime.offset = 0;
+  mstro_nanosec_t tick = mstro_clock();
+  ctime.sec = tick/NSEC_PER_SEC;
+  ctime.nsec = tick - ctime.sec*NSEC_PER_SEC;
+  pool_event_msg->ctime = &ctime;
+
+
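A quick sanity check for the sec/nsec split above (assuming NSEC_PER_SEC is 10^9): nsec must be the remainder after removing whole seconds, which is what the corrected line computes:

#include <assert.h>
#include <stdint.h>

int main(void)
{
  const uint64_t NSEC_PER_SEC = 1000000000u;   /* assumed value */
  uint64_t tick = 5 * NSEC_PER_SEC + 123;      /* 5.000000123 s */
  uint64_t sec  = tick / NSEC_PER_SEC;
  uint64_t nsec = tick - sec * NSEC_PER_SEC;   /* == tick % NSEC_PER_SEC */
  assert(sec == 5 && nsec == 123);
  return 0;
}
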
   /* Send subscription info. We pass our continuation event so that
    *  the subscription engine can trigger our continuation when all
    *  the event acks have arrived. If none are needed the event will
@@ -429,9 +445,13 @@ mstro_pm__event_notify_and_continue(Mstro__Pool__Event *pool_event_msg,
   if(s!=MSTRO_OK) {
     ERR("Failed to advertise event: %d (%s)\n",
         s, mstro_status_description(s));
-    return s;
+    goto BAILOUT_FREE;
   }
   return MSTRO_OK;
+
+BAILOUT_FREE:
+  mstro_pm__continuation_destroy(ctx);
+  return s;
 }
 
 
@@ -479,6 +499,7 @@ mstro_pm__handle_declare_phase3(mstro_event event,
 {
   DEBUG("DECLARE phase 3/3, event %p\n", event);
   mstro_pm__msg_free(cont->msg);
+  mstro_pm__continuation_destroy(cont);
   mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PM_NUM_DECLARE, 1);
   return MSTRO_OK;
 }
@@ -527,7 +548,7 @@ mstro_pm__handle_declare_phase2(mstro_event event,
   }
 
   WITH_CDO_ID_STR(idstr, &id,
-                  INFO("App %" PRIu64 " declared CDO (global id: `%s')\n",
+                  INFO("app %" PRIappid " declared CDO (global id: `%s')\n",
                        app_id, idstr);
                   );
   
@@ -558,8 +579,8 @@ mstro_pm__handle_declare_phase2(mstro_event event,
 
 BAILOUT_FREE:
   mstro_pm__msg_free(cont->msg);
-
 DONE:
+  mstro_pm__continuation_destroy(cont);
   return status;
 }
 
@@ -571,7 +592,7 @@ mstro_pm__handle_declare(Mstro__Pool__MstroMsg *msg)
   mstro_app_id app_id = msg->token->appid->id;
   assert(declare!=NULL); assert(app_id!=MSTRO_APP_ID_INVALID);
   
-  DEBUG("CDO declaration from %zu for %s (serial %zu)\n",
+  DEBUG("CDO declaration from app %" PRIappid " for %s (serial %zu)\n",
         app_id, declare->cdo_name, declare->serial);
 
   Mstro__Pool__Event ev = MSTRO__POOL__EVENT__INIT;
@@ -596,6 +617,9 @@ mstro_pm__handle_declare(Mstro__Pool__MstroMsg *msg)
   }
 BAILOUT:
   return status;
+BAILOUT_FREE:
+  mstro_pm__msg_free(msg);
+  return status;
 }
 
 
@@ -664,7 +688,7 @@ mstro_pm__group_record_create(const struct mstro_cdo_id *cdoid,
     }
     WITH_CDO_ID_STR(cidstr, &c, {
         WITH_CDO_ID_STR(gidstr, cdoid, {
-            DEBUG("Batch-declared CDO |%s| (%s) for group %s from app %zu\n",
+            DEBUG("Batch-declared CDO |%s| (%s) for group %s from app %" PRIappid "\n",
                   name, cidstr, gidstr, app_id);});};);
     
     memcpy(&entry->member_ids[i+offset], &c, sizeof(struct mstro_cdo_id));
@@ -913,6 +937,7 @@ mstro_pm__handle_seal_phase3(mstro_event event,
 {
   DEBUG("SEAL phase 3/3, event %p\n", event);
   mstro_pm__msg_free(cont->msg);
+  mstro_pm__continuation_destroy(cont);
   mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PM_NUM_SEAL, 1);
   return MSTRO_OK;
 }
@@ -981,11 +1006,11 @@ mstro_pm__handle_seal_phase2(mstro_event event,
                                                     seal->attributes);
     if(status==MSTRO_OK) {
       WITH_CDO_ID_STR(idstr, &cdoid,{
-          DEBUG("Set attributes on %s for app %d\n", idstr, app_id);
+          DEBUG("Set attributes on %s for app %" PRIappid "\n", idstr, app_id);
         });
     } else {
       WITH_CDO_ID_STR(idstr, &cdoid,{
-          ERR("Failed to set attributes on %s for app %d\n", idstr, app_id);
+          ERR("Failed to set attributes on %s for app %" PRIappid "\n", idstr, app_id);
         });
       goto BAILOUT_FREE;
     }
@@ -1003,7 +1028,7 @@ mstro_pm__handle_seal_phase2(mstro_event event,
   }
 
   WITH_CDO_ID_STR(idstr, &cdoid,
-                  INFO("App %" PRIu64 " sealed CDO (global id: `%s')\n",
+                  INFO("app %" PRIappid " sealed CDO (global id: `%s')\n",
                        app_id, idstr);
                   );
 
@@ -1041,6 +1066,7 @@ mstro_pm__handle_seal_phase2(mstro_event event,
 BAILOUT_FREE:
   mstro_pm__msg_free(cont->msg);
 DONE:
+  mstro_pm__continuation_destroy(cont);
   return status;
 }
 
@@ -1052,7 +1078,7 @@ mstro_pm__handle_seal(Mstro__Pool__MstroMsg *msg)
   mstro_app_id app_id = msg->token->appid->id;
   assert(seal!=NULL); assert(app_id!=MSTRO_APP_ID_INVALID);
   
-  DEBUG("CDO SEAL from %zu\n", app_id);
+  DEBUG("CDO SEAL from %" PRIappid "\n", app_id);
   mstro_status s;
   
   if(seal->cdoid==NULL) {
@@ -1116,6 +1142,7 @@ mstro_pm__handle_offer_phase3(mstro_event event,
 {
   DEBUG("OFFER phase 3/3, event %p\n", event);
   mstro_pm__msg_free(cont->msg);
+  mstro_pm__continuation_destroy(cont);
   mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PM_NUM_OFFER, 1);
   return MSTRO_OK;
 }
@@ -1152,7 +1179,7 @@ mstro_pm__handle_offer_phase2(mstro_event event,
   }
   
   WITH_CDO_ID_STR(idstr, &cdoid,
-                  INFO("App %" PRIu64 " offered CDO (global id: `%s')\n",
+                  INFO("app %" PRIappid " offered CDO (global id: `%s')\n",
                        app_id, idstr);
                   );
 
@@ -1184,6 +1211,7 @@ mstro_pm__handle_offer_phase2(mstro_event event,
 BAILOUT_FREE:
   mstro_pm__msg_free(cont->msg);
 DONE:
+  mstro_pm__continuation_destroy(cont);
   return s;
 }
 
@@ -1198,7 +1226,7 @@ mstro_pm__handle_offer(Mstro__Pool__MstroMsg *msg)
                                 .qw[1] = offer->cdoid->qw1,
                                 .local_id = offer->cdoid->local_id };
   
-  DEBUG("CDO OFFER from %zu\n", app_id);
+  DEBUG("CDO OFFER from app %" PRIappid "\n", app_id);
   mstro_status s=MSTRO_FAIL;
   
   if(offer->cdoid==NULL) {
@@ -1247,6 +1275,7 @@ mstro_pm__handle_require_phase3(mstro_event event,
 {
   DEBUG("REQUIRE phase 3/3, event %p\n", event);
   mstro_pm__msg_free(cont->msg);
+  mstro_pm__continuation_destroy(cont);
   mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PM_NUM_REQUIRE, 1);
   return MSTRO_OK;
 }
@@ -1286,7 +1315,7 @@ mstro_pm__handle_require_phase2(mstro_event event,
   }
 
   WITH_CDO_ID_STR(idstr, &cdoid,
-                  INFO("App %" PRIu64 " required CDO (global id: `%s')\n",
+                  INFO("app %" PRIappid " required CDO (global id: `%s')\n",
                        app_id, idstr);
                   );
 
@@ -1317,8 +1346,8 @@ mstro_pm__handle_require_phase2(mstro_event event,
 
 BAILOUT_FREE:
   mstro_pm__msg_free(cont->msg);
-  
 DONE:
+  mstro_pm__continuation_destroy(cont);
   return s;
 }
 
@@ -1333,7 +1362,7 @@ mstro_pm__handle_require(Mstro__Pool__MstroMsg *msg)
                                 .qw[1] = require->cdoid->qw1,
                                 .local_id = require->cdoid->local_id };
 
-  DEBUG("CDO REQUIRE from %zu\n", app_id);
+  DEBUG("CDO REQUIRE from app %" PRIappid "\n", app_id);
   mstro_status s;
   
   if(require->cdoid==NULL) {
@@ -1380,6 +1409,7 @@ mstro_pm__handle_retract_phase3(mstro_event event,
 {
   DEBUG("RETRACT phase 3/3, event %p\n", event);
   mstro_pm__msg_free(cont->msg);
+  mstro_pm__continuation_destroy(cont); 
   mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PM_NUM_RETRACT, 1);
   return MSTRO_OK;
 }
@@ -1433,7 +1463,7 @@ mstro_pm__handle_retract_phase2(mstro_event event,
   }
 
   WITH_CDO_ID_STR(idstr, &cdoid,
-                  INFO("App %" PRIu64 " retracted CDO (global id: `%s')\n",
+                  INFO("app %" PRIappid " retracted CDO (global id: `%s')\n",
                        app_id, idstr);
                   );
 
@@ -1465,8 +1495,8 @@ mstro_pm__handle_retract_phase2(mstro_event event,
 
 BAILOUT_FREE:
   mstro_pm__msg_free(cont->msg);
-  
 DONE:
+  mstro_pm__continuation_destroy(cont); 
   return s;
 }
 
@@ -1524,6 +1554,7 @@ mstro_pm__handle_demand_phase3(mstro_event event,
 {
   DEBUG("DEMAND phase 3/3, event %p\n", event);
   mstro_pm__msg_free(cont->msg);
+  mstro_pm__continuation_destroy(cont); 
   mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PM_NUM_DEMAND, 1);
   return MSTRO_OK;
 }
@@ -1561,7 +1592,7 @@ mstro_pm__handle_demand_phase2(mstro_event event,
    * requestor is ready to send out the TRANSFER_COMPLETED */
 
   WITH_CDO_ID_STR(idstr, &cdoid,
-                  INFO("App %" PRIu64 " demanded CDO (global id: `%s')\n",
+                  INFO("app %" PRIappid " demanded CDO (global id: `%s')\n",
                        app_id, idstr);
                   );
 
@@ -1592,8 +1623,8 @@ mstro_pm__handle_demand_phase2(mstro_event event,
 
 BAILOUT_FREE:
   mstro_pm__msg_free(cont->msg);
-  
 DONE:
+  mstro_pm__continuation_destroy(cont); 
   return s;
 }
 
@@ -1609,7 +1640,7 @@ mstro_pm__handle_demand(Mstro__Pool__MstroMsg *msg)
                                 .qw[1] = demand->cdoid->qw1,
                                 .local_id = demand->cdoid->local_id };
   
-  DEBUG("CDO DEMAND from %zu\n", app_id);
+  DEBUG("CDO DEMAND from app %" PRIappid "\n", app_id);
   mstro_status s;
   
   if(demand->cdoid==NULL) {
@@ -1657,6 +1688,7 @@ mstro_pm__handle_withdraw_phase4(mstro_event event,
 {
   DEBUG("WITHDRAW phase 4/4, event %p\n", event);
   mstro_pm__msg_free(cont->msg);
+  mstro_pm__continuation_destroy(cont); 
   mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PM_NUM_WITHDRAW, 1);
   return MSTRO_OK;
 }
@@ -1685,7 +1717,7 @@ mstro_pm__handle_withdraw_phase3(mstro_event event,
   }
 
   WITH_CDO_ID_STR(idstr, &cdoid,
-                  INFO("App %" PRIu64 " withdrawn CDO (global id: `%s')\n",
+                  INFO("app %" PRIappid " withdrew CDO (global id: `%s')\n",
                        app_id, idstr);
                   );
 
@@ -1717,8 +1749,8 @@ mstro_pm__handle_withdraw_phase3(mstro_event event,
 BAILOUT_FREE:
   mstro_pm__msg_free(cont->msg);
 DONE:
+  mstro_pm__continuation_destroy(cont); 
   return s;
-
 }
 
 static 
@@ -1758,6 +1790,7 @@ mstro_pm__handle_withdraw_phase2(mstro_event event,
   
 BAILOUT_FREE:
   mstro_pm__msg_free(cont->msg);
+  mstro_pm__continuation_destroy(cont); 
 DONE:
   return s;
 }
@@ -1769,7 +1802,7 @@ mstro_pm__handle_withdraw(Mstro__Pool__MstroMsg *msg)
   Mstro__Pool__Withdraw *withdraw = msg->withdraw;
   mstro_app_id app_id = msg->token->appid->id;
   assert(withdraw!=NULL); assert(app_id!=MSTRO_APP_ID_INVALID);
-  DEBUG("CDO WITHDRAW from %zu\n", app_id);
+  DEBUG("CDO WITHDRAW from app %" PRIappid "\n", app_id);
   mstro_status s;
   
   if(withdraw->cdoid==NULL) {
@@ -1820,6 +1853,7 @@ mstro_pm__handle_dispose_phase3(mstro_event event,
 {
   DEBUG("DISPOSE phase 3/3, event %p\n", event);
   mstro_pm__msg_free(cont->msg);
+  mstro_pm__continuation_destroy(cont); 
   mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PM_NUM_DISPOSE, 1);
   return MSTRO_OK;
 }
@@ -1875,6 +1909,7 @@ mstro_pm__handle_dispose_phase2(mstro_event event,
 BAILOUT_FREE:
   mstro_pm__msg_free(cont->msg);
 DONE:
+  mstro_pm__continuation_destroy(cont); 
   return s;
 }
 
@@ -1930,6 +1965,7 @@ mstro_pm__handle_transfer_completed_phase3(mstro_event event,
 {
   DEBUG("TRANSFER_COMPLETED (3/3), event %p\n", event);
   mstro_pm__msg_free(cont->msg);
+  mstro_pm__continuation_destroy(cont); 
   mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PM_NUM_TRANSFER_COMPLETIONS, 1);
   return MSTRO_OK;
 }
@@ -1977,6 +2013,7 @@ mstro_pm__handle_transfer_completed_phase2(mstro_event event,
 BAILOUT_FREE:
   mstro_pm__msg_free(cont->msg);
 DONE:
+  mstro_pm__continuation_destroy(cont); 
   return s;
 }
 
@@ -2000,10 +2037,10 @@ mstro_pm__handle_transfer_completed(Mstro__Pool__MstroMsg *msg)
   
   WITH_CDO_ID_STR(srcstr, &srccdoid, {
           WITH_CDO_ID_STR(dststr, &dstcdoid, {
-              DEBUG("TRANSFER COMPLETION from %" PRIappid
-                    " for dst %s <- src %s (providing app: %zu)\n",
-                    app_id, dststr, srcstr,
-                    msg->transfer_completed->srcid->id);});});
+              INFO("TRANSFER COMPLETION from app %" PRIappid
+                   " for dst %s <- src %s (providing: app %" PRIappid ")\n",
+                   app_id, dststr, srcstr,
+                   msg->transfer_completed->srcid->id);});});
 
   /* produce completed:after event */
   Mstro__Pool__Event ev = MSTRO__POOL__EVENT__INIT;
@@ -2046,6 +2083,7 @@ mstro_pm__handle_leave_phase3(mstro_event event,
 {
   DEBUG("LEAVE phase 3/3, event %p\n", event);
   mstro_pm__msg_free(cont->msg);
+  mstro_pm__continuation_destroy(cont); 
   mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PM_NUM_LEAVE, 1);
   return MSTRO_OK;
 }
@@ -2067,10 +2105,10 @@ mstro_pm__handle_leave_phase2(mstro_event event,
    * and preemption on the message handlers at some point */
   mstro_status s = mstro_pm_app_deregister(app_id);
   if(s!=MSTRO_OK) {
-    ERR("Failed to deregister app %zu\n", app_id);
+    ERR("Failed to deregister app %" PRIappid "\n", app_id);
   }
 
-  INFO("Granting LEAVE request from app %zu\n", app_id);
+  INFO("Granting LEAVE request from app %" PRIappid "\n", app_id);
   
   Mstro__Pool__Bye bye = MSTRO__POOL__BYE__INIT;
 
@@ -2083,7 +2121,7 @@ mstro_pm__handle_leave_phase2(mstro_event event,
   }
   s = mstro_pmp_send_nowait(app_id, &msg_bye);
   if(s!=MSTRO_OK) {
-    ERR("Cannot send BYE reply to %zu: %d (%s)\n",
+    ERR("Cannot send BYE reply to app %" PRIappid ": %d (%s)\n",
         app_id, s, mstro_status_description(s));
     goto BAILOUT;
   }
@@ -2113,6 +2151,7 @@ BAILOUT:
 BAILOUT_FREE:
   mstro_pm__msg_free(cont->msg);
 DONE:
+  mstro_pm__continuation_destroy(cont); 
   return s;
 }
 
@@ -2178,8 +2217,8 @@ mstro_pm__handle_join_phase5(mstro_event event,
                              struct mstro_pm__continuation_ctx *cont)
 {
   DEBUG("JOIN phase 5/5, event %p\n", event);
-
   mstro_pm__msg_free(cont->msg);
+  mstro_pm__continuation_destroy(cont); 
   mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PM_NUM_JOIN, 1);
   return MSTRO_OK;
 }
@@ -2240,6 +2279,7 @@ mstro_pm__handle_join_phase4(mstro_event event,
   
 BAILOUT_FREE:
   mstro_pm__msg_free(cont->msg);
+  mstro_pm__continuation_destroy(cont);
 DONE:
   return status;
 }
@@ -2278,7 +2318,7 @@ mstro_pm__handle_join_phase3(mstro_event event,
   goto DONE;
 BAILOUT_FREE:
   mstro_pm__msg_free(cont->msg);
-
+  mstro_pm__continuation_destroy(cont);
 DONE:
   return status;
 }
@@ -2360,6 +2400,8 @@ mstro_pm__handle_join_phase2(mstro_event event,
   aid.id = regentry->appid;
   ev.origin_id = &aid;
 
+  INFO("JOIN message received. Caller %s:%d is now known as app %" PRIappid "\n",
+       ev.join->component_name, ev.join->component_index, regentry->appid);
 
   status = mstro_pm__event_notify_and_continue(
       &ev,
@@ -2375,9 +2417,10 @@ mstro_pm__handle_join_phase2(mstro_event event,
   goto DONE;
 BAILOUT_FREE:
   mstro_pm__msg_free(cont->msg);
-  /* FIXME: free regentry and context */
+  /* FIXME: free regentry*/
 
 DONE:
+  mstro_pm__continuation_destroy(cont);
   return status;
 }
 
@@ -2390,11 +2433,9 @@ mstro_pm__handle_join(Mstro__Pool__MstroMsg *msg,
   Mstro__Pool__Join *join = msg->join;
   mstro_status status = MSTRO_OK;
   
-  INFO("JOIN message received. Caller %s:%d\n", 
-       join->component_name, join->component_index);
-
-  DEBUG("(PM proto version %d) advertises endpoint %s\n",
-   join->protocol_version, join->serialized_endpoint);
+  DEBUG("New component %s:%d (PM proto version %06x) advertises endpoint %s\n",
+        join->component_name, join->component_index,
+        join->protocol_version, join->serialized_endpoint);
 
   if(join->transport_methods==NULL
      || (join->transport_methods->n_supported==0)) {
@@ -2442,13 +2483,13 @@ mstro_pm__handle_subscribe(Mstro__Pool__MstroMsg *msg)
   Mstro__Pool__Subscribe *subscribe = msg->subscribe;
   mstro_app_id app_id = msg->token->appid->id;
   assert(subscribe!=NULL); assert(app_id!=MSTRO_APP_ID_INVALID);
-  DEBUG("SUBSCRIBE from %zu\n", app_id);
+  DEBUG("SUBSCRIBE from app %" PRIappid "\n", app_id);
   mstro_status s=MSTRO_UNIMPL;
 
   Mstro__Pool__SubscriptionHandle *handle = NULL;
   s = mstro_subscription_message_register(subscribe, app_id, &handle);
   if(s!=MSTRO_OK) {
-    ERR("Failed to register subscription for app %" PRIu64 " (local id % " PRIu64 ")\n",
+    ERR("Failed to register subscription for app %" PRIappid " (local id %" PRIu64 ")\n",
         app_id, subscribe->local_id);
     goto BAILOUT;
   }
@@ -2467,7 +2508,8 @@ mstro_pm__handle_subscribe(Mstro__Pool__MstroMsg *msg)
   
   s = mstro_pmp_send_nowait(app_id, &msg_sack);
   if(s!=MSTRO_OK) {
-    ERR("Failed to send %s-ACK to %zu\n", app_id);
+    ERR("Failed to send %s-ACK to app %" PRIappid "\n",
+        sack.base.descriptor->name, app_id);
     goto BAILOUT;
   }
   
@@ -2505,6 +2547,8 @@ BAILOUT:
                          MSTRO__POOL__POOL_OP_ACK__POOL_OP__UNSUBSCRIBE,
                          NULL, handle, NULL, s);
   mstro_pm__msg_free(cont->msg);
+  mstro_pm__continuation_destroy(cont);
+
   return s;
 }
 
@@ -2515,7 +2559,7 @@ mstro_pm__handle_unsubscribe(Mstro__Pool__MstroMsg *msg)
   Mstro__Pool__Unsubscribe *unsubscribe = msg->unsubscribe;
   mstro_app_id app_id = msg->token->appid->id;
   assert(unsubscribe!=NULL); assert(app_id!=MSTRO_APP_ID_INVALID);
-  DEBUG("UNSUBSCRIBE from %zu\n", app_id);
+  DEBUG("UNSUBSCRIBE from app %" PRIappid "\n", app_id);
   mstro_status s=MSTRO_UNIMPL;
 
   /* broadcast JOIN event */
@@ -2557,7 +2601,7 @@ mstro_pm__handle_event_ack(Mstro__Pool__MstroMsg *msg)
   if(ack==NULL)
     return MSTRO_INVARG;
 
-  /* pass in to subscription engine. It will do teh accounting, and in
+  /* pass in to subscription engine. It will do the accounting, and in
    * the end trigger continuations for those handlers that sent the
    * notification event. */
   mstro_status s = mstro_subscription_message_event_ack(ack);
@@ -2579,7 +2623,7 @@ mstro_pm__handle_resolve(Mstro__Pool__MstroMsg *msg)
   Mstro__Pool__Resolve *resolve = msg->resolve;
   mstro_app_id app_id = msg->token->appid->id;
   assert(resolve!=NULL); assert(app_id!=MSTRO_APP_ID_INVALID);
-  DEBUG("RESOLVE from %zu\n", app_id);
+  DEBUG("RESOLVE from app %" PRIappid "\n", app_id);
   mstro_status s=MSTRO_UNIMPL;
 
   Mstro__Pool__ResolveReply reply = MSTRO__POOL__RESOLVE_REPLY__INIT;
@@ -2601,7 +2645,7 @@ mstro_pm__handle_resolve(Mstro__Pool__MstroMsg *msg)
         ; /* set in lookup call above */
       }
       WITH_CDO_ID_STR(idstr, &id, {
-          DEBUG("Resolved CDO ID %s to |%s| for app %" PRIi64 "\n",
+          DEBUG("Resolved CDO ID %s to |%s| for app %" PRIappid "\n",
                 idstr, reply.name, app_id);});
       s=MSTRO_OK;
       break;
@@ -2617,7 +2661,7 @@ mstro_pm__handle_resolve(Mstro__Pool__MstroMsg *msg)
       } else {
         reply.name = entry->component_name;
       }
-      DEBUG("Resolved APP ID %" PRIi64 " to |%s| for app %" PRIi64 "\n",
+      DEBUG("Resolved app %" PRIappid " to |%s| for app %" PRIappid "\n",
             appid, reply.name, app_id);
       s=MSTRO_OK;
       break;
@@ -2644,7 +2688,7 @@ mstro_pm__handle_resolve(Mstro__Pool__MstroMsg *msg)
   }
   s = mstro_pmp_send_nowait(app_id, &rmsg);
   if(s!=MSTRO_OK) {
-    ERR("Failed to send ResolveReply to %zu\n", app_id);
+    ERR("Failed to send ResolveReply to app %" PRIappid "\n", app_id);
     goto BAILOUT;
   }
   mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PM_NUM_RESOLVE, 1);
diff --git a/maestro/pool_manager_protocol.c b/maestro/pool_manager_protocol.c
index 547cb1cde92982669bd056411feb4ba06e8fd502..c200a3ce2099812e766c7e575571380c07a243cc 100644
--- a/maestro/pool_manager_protocol.c
+++ b/maestro/pool_manager_protocol.c
@@ -5,21 +5,21 @@
 
 /*
  * Copyright (C) 2019 Cray Computer GmbH
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
- * 
+ *
  * 1. Redistributions of source code must retain the above copyright notice, this
  * list of conditions and the following disclaimer.
- * 
+ *
  * 2. Redistributions in binary form must reproduce the above copyright notice,
  * this list of conditions and the following disclaimer in the documentation
  * and/or other materials provided with the distribution.
- * 
+ *
  * 3. Neither the name of the copyright holder nor the names of its contributors
  * may be used to endorse or promote products derived from this software without
  * specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -94,16 +94,14 @@ mstro_status
 mstro_pmp_send_nowait(mstro_app_id target, const Mstro__Pool__MstroMsg *msg)
 {
   size_t msgsize = mstro__pool__mstro_msg__get_packed_size(msg);
-
   assert(msg->msg_case!=MSTRO__POOL__MSTRO_MSG__MSG__NOT_SET);
-
   /* we rely on the true sub-message name being in any of the union
    * member's base slot, so DECLARE is as good as any */
   DEBUG("handling msg %s to %zu\n",
         msg->declare->base.descriptor->name, target);
 
   assert(target!=MSTRO_APP_ID_INVALID);
-  
+
   if(target==MSTRO_APP_ID_MANAGER && !g_mstro_pm_attached) {
     /* check if this is the JOIN message */
     if(msg->msg_case == MSTRO__POOL__MSTRO_MSG__MSG_JOIN) {
@@ -117,7 +115,7 @@ mstro_pmp_send_nowait(mstro_app_id target, const Mstro__Pool__MstroMsg *msg)
     /* non-PM target or PM attached */
     ;
   }
-  
+
   /* serialize it */
   size_t len = msgsize;
   if(len>MSTRO_MSG_SHORT_SIZE) {
@@ -133,7 +131,7 @@ mstro_pmp_send_nowait(mstro_app_id target, const Mstro__Pool__MstroMsg *msg)
   }
 
   struct mstro_msg_envelope *env=NULL;
-  
+
   mstro_status s = mstro_msg_envelope_allocate(&env);
   if(s!=MSTRO_OK) {
     ERR("Cannot allocate for pool message\n");
@@ -159,7 +157,7 @@ mstro_pmp_send_nowait(mstro_app_id target, const Mstro__Pool__MstroMsg *msg)
     return MSTRO_FAIL;
   } else {
     DEBUG("Target found in app table\n");
-  }   
+  }
 
   /* for non-completion contexts the message handler frees the context */
   s = mstro_ofi__submit_message_nowait(app_entry->ep, app_entry->addr,
@@ -171,7 +169,7 @@ mstro_pmp_send_nowait(mstro_app_id target, const Mstro__Pool__MstroMsg *msg)
     goto BAILOUT;
   }
   DEBUG("OFI message submitted\n");
-  
+
   s=MSTRO_OK;
 BAILOUT:
   /* envelope freed in completion handler */
@@ -187,7 +185,7 @@ mstro_pmp_send_nowait_ep(struct mstro_endpoint *ep, fi_addr_t addr, const Mstro_
   assert(msg->msg_case!=MSTRO__POOL__MSTRO_MSG__MSG__NOT_SET);
 
   assert(ep!=NULL);
-  
+
   /* serialize it */
   size_t len = msgsize;
   if(len>MSTRO_MSG_SHORT_SIZE) {
@@ -203,7 +201,7 @@ mstro_pmp_send_nowait_ep(struct mstro_endpoint *ep, fi_addr_t addr, const Mstro_
   }
 
   struct mstro_msg_envelope *env=NULL;
-  
+
   mstro_status s = mstro_msg_envelope_allocate(&env);
   if(s!=MSTRO_OK) {
     ERR("Cannot allocate for pool message\n");
@@ -232,7 +230,7 @@ mstro_pmp_send_nowait_ep(struct mstro_endpoint *ep, fi_addr_t addr, const Mstro_
     goto BAILOUT;
   }
   DEBUG("OFI message submitted\n");
-  
+
   s=MSTRO_OK;
 BAILOUT:
   /* envelope freed in completion handler */
@@ -250,7 +248,7 @@ mstro_status
 mstro_pmp_send_wait(mstro_app_id target, const Mstro__Pool__MstroMsg *msg)
 {
   size_t msgsize = mstro__pool__mstro_msg__get_packed_size(msg);
-  
+
   /* we rely on the true sub-message name being in any of the union
    * member's base slot, so DECLARE is as good as any */
   DEBUG("handling msg %s to %zu\n", msg->declare->base.descriptor->name, target);
@@ -293,7 +291,7 @@ mstro_pmp_send_wait(mstro_app_id target, const Mstro__Pool__MstroMsg *msg)
 
   DEBUG("Packed %s message (space required: %zu) into envelope (size %zu)\n",
         msg->base.descriptor->name, msgsize, env->size);
-  
+
   struct mstro_pm_app_registry_entry *app_entry = NULL;
   s = mstro_pm_app_lookup(target, &app_entry);
   if(app_entry==NULL) {
@@ -312,7 +310,7 @@ mstro_pmp_send_wait(mstro_app_id target, const Mstro__Pool__MstroMsg *msg)
     goto BAILOUT;
   }
   DEBUG("OFI message submitted, waited, and completed\n");
-  
+
   s=MSTRO_OK;
 BAILOUT:
   /* envelope freed in completion handler */
@@ -328,7 +326,7 @@ mstro_pmp_package(Mstro__Pool__MstroMsg *msg,
   if(msg==NULL || inner==NULL) {
     return MSTRO_INVARG;
   }
-  
+
   /* check that dst has been initialized somehow */
   assert(msg->base.descriptor == &mstro__pool__mstro_msg__descriptor);
 
@@ -340,12 +338,12 @@ mstro_pmp_package(Mstro__Pool__MstroMsg *msg,
    * mstro__pool__mstro_msg__free_unpacked() function will recurse ind
    * try to free these */
   msg->token = &g_pool_apptoken; /* populated at WELCOME time for clients, at PM start for pool manager */
-  
+
   /* security part of token would need to be set from security token
    * at some point */
   /* msg->token = ... */
   /* opts left at NULL means: not set */
-  
+
   /* no extra data (join sub-message contains all info) */
 
 
@@ -429,6 +427,5 @@ mstro_pmp_package(Mstro__Pool__MstroMsg *msg,
           inner->descriptor->name);
       return MSTRO_UNIMPL;
   }
-  return MSTRO_OK;  
+  return MSTRO_OK;
 }
-
diff --git a/maestro/pool_manager_registry.c b/maestro/pool_manager_registry.c
index 4d8bc766e6ac095dab9784b7695a1aa126298c5a..94d468693cee7368c23022c1fc94335621636e03 100644
--- a/maestro/pool_manager_registry.c
+++ b/maestro/pool_manager_registry.c
@@ -6,21 +6,21 @@
 /*
  * Copyright (C) 2019 Cray Computer GmbH
  * Copyright (C) 2021 HPE Switzerland GmbH
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
- * 
+ *
  * 1. Redistributions of source code must retain the above copyright notice, this
  * list of conditions and the following disclaimer.
- * 
+ *
  * 2. Redistributions in binary form must reproduce the above copyright notice,
  * this list of conditions and the following disclaimer in the documentation
  * and/or other materials provided with the distribution.
- * 
+ *
  * 3. Neither the name of the copyright holder nor the names of its contributors
  * may be used to endorse or promote products derived from this software without
  * specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -114,13 +114,13 @@ mstro_status
 mstro_pm_app_reg__new_entry(struct mstro_pm_app_registry_entry **entry_p, mstro_app_id id)
 {
   assert(entry_p!=NULL);
-  
+
   struct mstro_pm_app_registry_entry *e
       =malloc(sizeof(struct mstro_pm_app_registry_entry));
   if(e==NULL) {
     return MSTRO_NOMEM;
   }
-  
+
   e->appid = id;
   e->ep = NULL;
   e->serialized_desc = NULL;
@@ -143,19 +143,19 @@ mstro_pm_app_reg__entry_dispose(struct mstro_pm_app_registry_entry *e)
   if(e->ep) {
     ; /* owned by creator (?) */
   }
-  
+
   if(e->serialized_desc) {
     ; /* owned by creator (?) */
   }
   //    free(e->serialized_desc);
-  
+
   if(e->transport_methods) {
     ; /* owned by creator */
   }
-  
+
   if(e->component_name)
     free(e->component_name);
-  
+
   free(e);
   return MSTRO_OK;
 }
@@ -175,7 +175,7 @@ mstro_pm_app_register(struct mstro_endpoint *ep,
   if(id_p==NULL && entry_p==NULL)
     return MSTRO_INVOUT;
 
-  
+
   struct mstro_pm_app_registry_entry *e=NULL;
   mstro_status status
       = mstro_pm_app_reg__new_entry(&e, mstro_pm_app__next_id());
@@ -191,7 +191,7 @@ mstro_pm_app_register(struct mstro_endpoint *ep,
   e->transport_methods = (Mstro__Pool__TransportMethods *)transport_methods;
   e->component_name = strdup(component_name);
   e->component_index = component_index;
-  
+
   /* check for duplicates */
   WITH_LOCKED_APP_REGISTRY({
       struct mstro_pm_app_registry_entry *elt=NULL;
@@ -201,9 +201,9 @@ mstro_pm_app_register(struct mstro_endpoint *ep,
           if(elt->component_index == e->component_index) {
 	    if(elt->dead) {
 	      /* garbage-collect the old one; FIXME: could be smarter and re-use parts of it */
-	      DEBUG("Garbage-collecting previous app entry for %s/%zu (appid %" PRIappid ")\n",
-		    elt->component_name, elt->component_index);
+	      DEBUG("Garbage-collecting previous app entry for %s/%" PRIu64 " (app %" PRIappid ")\n",
+		    elt->component_name, elt->component_index, elt->appid);
-	      HASH_DEL(g_mstro_pm_app_registry, elt); 
+	      HASH_DEL(g_mstro_pm_app_registry, elt);
 	      goto unlock;
 	    } else {
 	      ERR("Duplicate component registration: %s:%" PRIu64         \
@@ -218,7 +218,7 @@ mstro_pm_app_register(struct mstro_endpoint *ep,
       }
    unlock:
       ;});
-  
+
   if(status!=MSTRO_OK) {
     mstro_pm_app_reg__entry_dispose(e);
     goto BAILOUT;
@@ -230,16 +230,15 @@ mstro_pm_app_register(struct mstro_endpoint *ep,
   e->transport_methods = (Mstro__Pool__TransportMethods *)transport_methods;
   e->component_name = strdup(component_name);
   e->component_index = component_index;
-  
-  
+
   DEBUG("Registered app %" PRIappid " for %s:%" PRIu64 " with transport methods %p\n",
        e->appid, e->component_name, e->component_index, e->transport_methods);
-  
+
   WITH_LOCKED_APP_REGISTRY({
       HASH_ADD(hh, g_mstro_pm_app_registry,
                appid, sizeof(mstro_app_id), e);
     });
-  
+
   /* return data to caller if interesed */
   if(id_p!=NULL) {
     *id_p = e->appid;
@@ -261,17 +260,21 @@ mstro_pc_app_befriend(mstro_app_id appid, const char* serialized_ep, const Mstro
     DEBUG("Found app %zu in local registry, good\n", appid);
   } else {
     DEBUG("Unknown app %zu, let's make friends\n", appid);
-    mstro_endpoint_descriptor dst_epd;
-    status=mstro_ep_desc_deserialize(&dst_epd, serialized_ep);
-  
+    Mstro__AppInfo *dst_epd=NULL;
+    status=mstro_appinfo_deserialize(serialized_ep, &dst_epd);
+
     if(status!=MSTRO_OK) {
       ERR("Failed to parse destination endpoint descriptor\n");
       return MSTRO_INVMSG;
     }
+
 
     struct mstro_endpoint *dst_ep=NULL;
     fi_addr_t dst_addr;
     status=mstro_ofi__select_endpoint(dst_epd, &dst_ep, &dst_addr);
+
+    mstro__app_info__free_unpacked(dst_epd, NULL);
+
     if(status!=MSTRO_OK) {
       ERR("DST endpoint not usable\n");
       return MSTRO_FAIL;
@@ -371,7 +374,7 @@ mstro_pm_app_lookup(mstro_app_id appid,
 {
   assert(app_entry_p!=NULL);
   mstro_status status=MSTRO_OK;
-  
+
   WITH_LOCKED_APP_REGISTRY({
       struct mstro_pm_app_registry_entry *e=NULL;
       HASH_FIND(hh, g_mstro_pm_app_registry, &appid, sizeof(mstro_app_id), e);
@@ -391,10 +394,10 @@ mstro_pm_app_lookup(mstro_app_id appid,
 /** pair of CDO name and chosen CDO ID */
 struct cdo_name_id_pair {
   struct cdo_name_id_pair *next;  /**< this is a linked list */
-  char *cdo_name;                 /**< the CDO name */ 
+  char *cdo_name;                 /**< the CDO name */
   struct mstro_cdo_id cdo_id;     /**< the CDO ID */
 };
-  
+
 /** Table of colliding names for the same CDO ID, and their resolution */
 struct cdo_id_collision_entry {
   UT_hash_handle hh;                   /**< hash on cdo_id */
@@ -412,8 +415,8 @@ struct cdo_handle_entry {
   uint64_t local_id; /**< the local-id part of the CDO id */
   mstro_cdo_state cdo_state;            /**< state of this handle */
   Mstro__Pool__Attributes *attributes;  /**< attributes reported from this app */
-  /* FIXME: add reference to distribution here; maybe use NULL to
-   * identify non-distributed CDOs (or add a flag to cdo_state?) */
+  int64_t n_segments; /**< number of transmissions required to fill this CDO -- mainly for distributed CDOs */
+  mmbLayout *dist_layout; /**< distributed layout handle */
 };
 
 /** Per-Application CDO knowledge structure. */
@@ -424,13 +427,14 @@ struct per_app_cdo_entries {
   /* we could have a separate mutex for the handles to that we can
    * local the per-app entry independently of the registry table */
 };
-  
+
 /** Entry used to keep all CDOs known to the pool manager in sync */
 struct mstro_pm_cdo_registry_entry {
   UT_hash_handle hh;                             /**< hash on CDO id */
   struct mstro_cdo_id cdo_id;                    /**< the global cdo id (local-part always MSTRO_CDO_LOCAL_ID_NONE */
   char *cdo_name;                                /**< the CDO printable name */
   struct per_app_cdo_entries *app_to_attributes; /**< app-to-attributes table for the cdo id */
+  bool is_distributed;                           /**< flag indicating a distributed CDO */
 };
 
 
@@ -471,6 +475,7 @@ struct mstro_pm_demand_queue_entry {
   struct mstro_cdo_id cdo_id; /**< the CDO ID */
   mstro_app_id requestor; /**< the requesting app */
   Mstro__Pool__Attributes *req_attributes; /**< the requesting side's CDO attributes */
+  mmbLayout *req_dist_layout; /**< the requesting side's distributed CDO layout handle */
   Mstro__Pool__TransportMethods *req_methods; /**< requesting side's transfer method wishes */
   mstro_app_id provider; /**< source satisfying this request */
 };
@@ -553,17 +558,17 @@ mstro_pm__withdraw_queue_entry_push(struct mstro_pm_withdraw_queue_entry *e)
   assert(e->next==NULL);
 
   mstro_status status = MSTRO_OK;
-  
+
   int s = pthread_mutex_lock(&g_mstro_pm_withdraw_queue_mtx);
   if(s!=0) {
     ERR("Failed to lock withdraw queue mutex: %d (%s)\n", s, strerror(s));
     status = MSTRO_FAIL;
     goto BAILOUT;
   }
-  
+
   e->next = g_mstro_pm_withdraw_queue;
   g_mstro_pm_withdraw_queue = e;
-  
+
   s = pthread_mutex_unlock(&g_mstro_pm_withdraw_queue_mtx);
   if(s!=0) {
     ERR("Failed to unlock withdraw queue mutex: %d (%s)\n", s, strerror(s));
@@ -584,7 +589,7 @@ mstro_pm__withdraw_queue_entry_concat(struct mstro_pm_withdraw_queue_entry *list
   assert(list!=NULL);
 
   mstro_status status = MSTRO_OK;
-  
+
   int s = pthread_mutex_lock(&g_mstro_pm_withdraw_queue_mtx);
   if(s!=0) {
     ERR("Failed to lock withdraw queue mutex: %d (%s)\n", s, strerror(s));
@@ -592,7 +597,7 @@ mstro_pm__withdraw_queue_entry_concat(struct mstro_pm_withdraw_queue_entry *list
     goto BAILOUT;
   }
   LL_CONCAT(g_mstro_pm_withdraw_queue, list);
-  
+
   s = pthread_mutex_unlock(&g_mstro_pm_withdraw_queue_mtx);
   if(s!=0) {
     ERR("Failed to unlock withdraw queue mutex: %d (%s)\n", s, strerror(s));
@@ -633,6 +638,324 @@ mstro_pm__notify_cdo_registry_change(void)
   return MSTRO_OK;
 }
 
+/** find the best candidate set to serve a distributed CDO for demand queue entry e
+ *
+ * Returns a candidate set in *candidates: either a single exact-layout
+ * match, or one provider per source piece overlapping the requested
+ * layout. *candidates is left NULL if no suitable provider exists.
+ *
+ * must be called under lock.
+ */
+static inline
+mstro_status
+mstro_pm__dist_cdo_find_best_provider(
+    struct per_app_cdo_entries *app_to_attributes_table,
+    struct mstro_pm_demand_queue_entry *e,
+    mstro_pm_candidates **candidates)
+{
+  struct per_app_cdo_entries *entry, *tmp;
+  struct per_app_cdo_entries *best;
+  uint64_t best_local_id;
+  best = NULL;
+  mstro_status status;
+  bool default_layout = false;
+  mmbLayout *src_layout = NULL;
+  mmbError stat;
+
+  /* FIXME: this is just greedy */
+  /* FIXME: this also does not return the actual local-id */
+  /* Try to find a 1:1 mapping for the required cdo layout */
+  status = mstro_pm__find_cdo_with_layout(
+                              app_to_attributes_table,
+                              e->req_dist_layout,
+                              &best,
+                              &best_local_id);
+  assert(status == MSTRO_OK);
+  if(best) {
+    /* create candidates out of best with length 1 */
+    status = mstro__pool_create_candidates(candidates, 1, true);
+    if (status != MSTRO_OK) {
+      return status;
+    }
+    (*candidates)->app[0] = best;
+    (*candidates)->local_id[0] = best_local_id;
+    DEBUG("Found entry with local-id %zu for app %" PRIappid " to provide CDO with the exact layout to app %" PRIappid "\n",
+          (*candidates)->local_id[0], (*candidates)->app[0]->app, e->requestor);
+    return MSTRO_OK;
+  }
+  else {
+    /* try to find a many:1 mapping for the required cdo layout */
+    HASH_ITER(hh, app_to_attributes_table, entry, tmp) {
+      struct cdo_handle_entry *h,*t;
+      HASH_ITER(hh, entry->handles, h, t) {
+        if(h->cdo_state==MSTRO_CDO_STATE_OFFERED)  {
+
+          if(!h->dist_layout) {
+            WARN("CDO with local-id %zu is flagged as distributed but has no distribution layout; creating a default one\n", h->local_id);
+            stat = mmb_layout_dist_create_default_layout(e->req_dist_layout, &src_layout);
+            assert(stat == MMB_OK);
+            default_layout = true;
+          }
+          else {
+            src_layout = h->dist_layout;
+            default_layout = false;
+          }
+
+          /** try to match the distribution */
+          mmbLayoutEquivalence diff;
+
+          stat = mmb_layout_cmp(src_layout, e->req_dist_layout, &diff);
+          assert(MMB_OK == stat);
+          assert(diff != MMB_LAYOUT_EQUAL); /* would have been caught above by mstro_pm__find_cdo_with_layout */
+          if (diff == MMB_LAYOUT_DIFF_FIELDS) {
+            DEBUG("Found entry with local-id %zu for app %" PRIappid " but with different layout\n",
+                h->local_id, entry->app);
+
+            /* FIXME make a hash table of the two layouts, so that we do not compute this intersection many times */
+            mmbLayoutIntersection *out_li = NULL;
+            stat = mmb_layout_compute_intersection(src_layout, e->req_dist_layout, &out_li);
+            assert(MMB_OK == stat);
+            DEBUG("checking layouts intersection\n");
+            if (out_li) {
+              mstro_pm__mmbLayoutIntersection_to_candidates(out_li, e->req_dist_layout->index, src_layout, app_to_attributes_table, candidates);
+              mmb_layout_destroy_mmbLayoutIntersection(out_li);
+            }
+          }
+          else { /* diff == MMB_LAYOUT_DIFF_INDEX or diff == MMB_LAYOUT_DIFF_TYPES */
+            /* do nothing: a layout that differs only in index has no
+             * intersection with the request, and intersections across
+             * MMB_LAYOUT_DIFF_TYPES are not supported (yet) */
+          }
+          /* destroy the default layout we created, on either branch */
+          if (default_layout) {
+            mmb_layout_destroy(src_layout);
+          }
+        }
+
+      }
+    }
+  }
+  return MSTRO_OK;
+}
+
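+/** allocate a candidate set with LENGTH provider slots
+ *
+ * DISTRIBUTION flags the set as describing a (re)distribution of a
+ * distributed CDO. The caller fills the app[] and local_id[] slots and
+ * releases the set with mstro_pm_candidates_destroy().
+ */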
+static inline
+mstro_status
+mstro__pool_create_candidates(mstro_pm_candidates **candidates, size_t length, bool distribution) {
+  mstro_status status = MSTRO_OK;
+  /* Initialize candidates */
+  *candidates = (mstro_pm_candidates *) malloc(sizeof(mstro_pm_candidates));
+  if (!(*candidates)) {
+    ERR("Not enough memory for candidates\n");
+    return MSTRO_NOMEM;
+  }
+  (*candidates)->n_sources = length; /* number of apps */
+  (*candidates)->is_distributed = distribution; /* a (re)distribution operation */
+  /* clear the array slots so a partially constructed set can be destroyed safely */
+  (*candidates)->app = NULL;
+  (*candidates)->local_id = NULL;
+  if (!((*candidates)->n_sources)) {
+    ERR("Refusing to create an empty candidate set\n");
+    mstro_pm_candidates_destroy(*candidates);
+    return MSTRO_INVARG;
+  }
+  (*candidates)->app = malloc(sizeof(struct per_app_cdo_entries *)*length);
+  if (!((*candidates)->app)) {
+    ERR("Not enough memory for candidates\n");
+    mstro_pm_candidates_destroy(*candidates);
+    return MSTRO_NOMEM;
+  }
+  (*candidates)->local_id = malloc(sizeof(uint64_t)*length);
+  if (!((*candidates)->local_id)) {
+    ERR("Not enough memory for candidates\n");
+    mstro_pm_candidates_destroy(*candidates);
+    return MSTRO_NOMEM;
+  }
+
+  return status;
+}
+
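+/** count the source pieces that overlap destination piece DST_INDEX
+ *
+ * Only intersections of non-zero length count; the result in *count is
+ * the number of transfers needed to assemble that destination piece.
+ */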
+static inline
+mstro_status
+mstro_pm__count_intersections(
+                        mmbLayoutIntersection *intersection,
+                        size_t dst_index,
+                        size_t *count) {
+  mstro_status status = MSTRO_OK;
+  *count = 0;
+  size_t n_src_pieces = intersection->n_src_pieces;
+  size_t index;
+  for (size_t i = 0; i < n_src_pieces; i++) {
+
+    index = dst_index*n_src_pieces + i;
+
+    /* we are only interested if the length of intersection is > 0*/
+    if (intersection->overlap[index].length > 0) {
+      *count += 1;
+    }
+  }
+
+  return status;
+}
+
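+/** translate a layout intersection into a candidate set
+ *
+ * For destination piece DST_INDEX, look up one offering app per source
+ * piece of SRC_LAYOUT that has a non-empty overlap. If any required
+ * piece is missing, *candidates is destroyed and reset to NULL.
+ */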
+static inline
+mstro_status
+mstro_pm__mmbLayoutIntersection_to_candidates(
+                            mmbLayoutIntersection *intersection,
+                            size_t dst_index,
+                            mmbLayout *src_layout,
+                            struct per_app_cdo_entries *app_to_attributes_table,
+                            mstro_pm_candidates **candidates) {
+
+  mstro_status status = MSTRO_OK;
+  size_t n_src_pieces = intersection->n_src_pieces;
+  struct per_app_cdo_entries *src;
+  uint64_t src_local_id;
+  size_t index;
+  mmbError stat;
+
+  /* create a copy of src layout to play with */
+  mmbLayout *target_layout = NULL;
+  stat = mmb_layout_create_copy(src_layout, &target_layout);
+  assert(stat == MMB_OK);
+
+  /*get true number of candidates */
+  size_t n_candidates = 0;
+  status = mstro_pm__count_intersections(intersection, dst_index, &n_candidates);
+  DEBUG("number of candidates %zu \n", n_candidates);
+  assert(status == MSTRO_OK);
+  /* Initialize candidates */
+  status = mstro__pool_create_candidates(candidates, n_candidates, true);
+  if (status != MSTRO_OK) {
+    return status;
+  }
+  size_t candidates_index = 0;
+  /* find all the required pieces for dst_index, aka our CDO, and fill the list of candidates */
+  for (size_t i = 0; i < n_src_pieces; i++) {
+
+    index = dst_index*n_src_pieces + i;
+    src = NULL; // initialize
+
+    /* we are only interested if the length of intersection is > 0*/
+    if (intersection->overlap[index].length > 0) {
+      DEBUG("finding piece %zu with src offset %zu and dst offset %zu, length %zu \n",
+                              i,
+                              intersection->overlap[index].src_offset,
+                              intersection->overlap[index].dst_offset,
+                              intersection->overlap[index].length);
+
+      /* point to the current src piece in the layout */
+      target_layout->index = i;
+      /* find the app and cdo id with a layout matching source distribution */
+      status = mstro_pm__find_cdo_with_layout(
+                                  app_to_attributes_table,
+                                  target_layout,
+                                  &src,
+                                  &src_local_id);
+      assert(status == MSTRO_OK);
+      if(src == NULL) {
+        DEBUG("One or more required pieces of this distribution are missing; cannot use it\n");
+        mstro_pm_candidates_destroy(*candidates);
+        *candidates = NULL;
+        mmb_layout_destroy(target_layout); /* release our working copy */
+        return status;
+      }
+      DEBUG("found piece %zu in app %" PRIappid " with local id %zu\n", candidates_index, src->app, src_local_id);
+      (*candidates)->app[candidates_index] = src;
+      (*candidates)->local_id[candidates_index] = src_local_id;
+      candidates_index++;
+
+    }
+    else { /* intersection is zero ...  nothing to do here */
+
+    }
+  }
+
+  stat = mmb_layout_destroy(target_layout);
+  assert(stat == MMB_OK);
+
+  return status;
+}
+
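+/** find an app offering the CDO with a layout equal to S_LAYOUT
+ *
+ * Scans all OFFERED handles and returns the first exact layout match in
+ * *app and *local_id; *app is left NULL if there is none.
+ *
+ * must be called under lock.
+ */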
+static inline
+mstro_status
+mstro_pm__find_cdo_with_layout(
+                            struct per_app_cdo_entries *app_to_attributes_table,
+                            mmbLayout *s_layout,
+                            struct per_app_cdo_entries **app,
+                            uint64_t *local_id) {
+
+  mstro_status status = MSTRO_OK;
+  mmbError stat;
+  mmbLayout *src_layout = NULL;
+  bool default_layout = false;
+  struct per_app_cdo_entries *entry, *tmp;
+  *app = NULL;
+
+  HASH_ITER(hh, app_to_attributes_table, entry, tmp) {
+    struct cdo_handle_entry *h,*t;
+    HASH_ITER(hh, entry->handles, h, t) {
+      if (h->cdo_state==MSTRO_CDO_STATE_OFFERED) {
+
+        if (!h->dist_layout) {
+          WARN("CDO with local-id %zu is flagged as distributed but has no distribution layout; creating a default one\n", h->local_id);
+          stat = mmb_layout_dist_create_default_layout(s_layout, &src_layout);
+          assert(stat == MMB_OK);
+          default_layout = true;
+        }
+        else {
+          src_layout = h->dist_layout;
+          default_layout = false;
+        }
+
+        /** try to match the distribution */
+        mmbLayoutEquivalence diff;
+        mmbError mmb_s;
+        mmb_s = mmb_layout_cmp(src_layout, s_layout, &diff);
+        assert(MMB_OK == mmb_s);
+        /** release the default layout we created */
+        if (default_layout) {
+          mmb_layout_destroy(src_layout);
+        }
+        if (diff == MMB_LAYOUT_EQUAL) {
+          DEBUG("Found entry with local-id %zu for app %" PRIappid " to provide CDO\n",
+                h->local_id, entry->app);
+          *app = entry;
+          *local_id = h->local_id;
+          /* best case */
+          return status;
+        }
+      }
+    }
+  }
+
+  return status;
+}
+
+
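+/** release a candidate set and its app/local-id arrays */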
+static inline
+mstro_status
+mstro_pm_candidates_destroy(mstro_pm_candidates *candidates) {
+  mstro_status status = MSTRO_OK;
+  DEBUG("Destroying candidate list\n");
+  if(candidates) {
+    if(candidates->app) {
+      free(candidates->app);
+    }
+    if (candidates->local_id) {
+      free(candidates->local_id);
+    }
+    free(candidates);
+  }
+
+  return status;
+}
+
 /** find best candidate to serve CDO to app_id
  *
- * Returns best match in *best, or MSTRO_APP_ID_INVALID if none exists.
+ * Returns a single-entry candidate set in *candidates; *candidates is
+ * left untouched if no app currently offers the CDO.
@@ -643,11 +966,10 @@ static inline
 mstro_status
 mstro_pm__find_best_provider(
     struct per_app_cdo_entries *app_to_attributes_table,
-    mstro_app_id app_id,
-    struct per_app_cdo_entries **best, uint64_t *best_local_id)
+    mstro_app_id app_id, mstro_pm_candidates **candidates)
 {
   struct per_app_cdo_entries *entry, *tmp;
-  *best = NULL;
+  mstro_status status;
   app_id=app_id; /* silence unused arg warning */
   /* FIXME: this is just greedy */
   /* FIXME: this also does not return the actual local-id */
@@ -655,19 +977,226 @@ mstro_pm__find_best_provider(
     struct cdo_handle_entry *h,*t;
     HASH_ITER(hh, entry->handles, h, t) {
       if(h->cdo_state==MSTRO_CDO_STATE_OFFERED) {
-        DEBUG("Found entry with local-id %zu for % " PRIappid " to provide CDO to % " PRIappid "\n",
+        DEBUG("Found entry with local-id %zu for app %" PRIappid " to provide CDO to app %" PRIappid "\n",
               h->local_id, entry->app, app_id);
-        *best = entry;
-        *best_local_id = h->local_id;
-        goto done;
+        status = mstro__pool_create_candidates(candidates, 1, false);
+        if (status != MSTRO_OK) {
+          return status;
+        }
+        (*candidates)->app[0] = entry;
+        (*candidates)->local_id[0] = h->local_id;
+        return MSTRO_OK;
       }
     }
   }
-done:
   return MSTRO_OK;
 }
-  
-  
+
+/** find a dist_layout for the current cdo
+ *
+ * Returns a pointer to dist_layout of the current cdo
+ *
+ * must be called under lock.
+ */
+static inline
+mstro_status
+mstro_pm__find_dist_layout(
+    struct per_app_cdo_entries *app_to_attributes_table,
+    mmbLayout **default_layout)
+{
+  struct per_app_cdo_entries *entry, *tmp;
+  mstro_status status;
+  /* FIXME: this is just greedy */
+  HASH_ITER(hh, app_to_attributes_table, entry, tmp) {
+    struct cdo_handle_entry *h,*t;
+    HASH_ITER(hh, entry->handles, h, t) {
+      if(h->dist_layout) {
+        *default_layout = h->dist_layout;
+        return MSTRO_OK;
+      }
+    }
+  }
+  /* We should not reach this point: if is_distributed is set, at least one handle has a distribution */
+  ERR("Cannot find a distribution for this CDO\n");
+  return MSTRO_FAIL;
+}
+
+/** look up per-app cdo entry. Must be called under CDO_REGISTRY_LOCK. */
+static inline
+mstro_status
+mstro_pm_cdo_app_lookup(const struct mstro_cdo_id* cdo_id,
+                        mstro_app_id app_id,
+                        struct per_app_cdo_entries **app_entry, struct cdo_handle_entry **handle_entry)
+{
+  struct mstro_pm_cdo_registry_entry *regentry=NULL;
+
+  struct mstro_cdo_id head = *cdo_id;
+  head.local_id = MSTRO_CDO_LOCAL_ID_NONE;
+
+  HASH_FIND(hh, g_mstro_pm_cdo_registry,
+            &head, sizeof(struct mstro_cdo_id), regentry);
+  if(!regentry) {
+    WITH_CDO_ID_STR(idstr, &head,
+                    DEBUG("No regentry for cdo %s\n", idstr););
+    *app_entry = NULL;
+    *handle_entry = NULL;
+    return MSTRO_FAIL;
+  } else {
+    HASH_FIND(hh, regentry->app_to_attributes,
+              &app_id, sizeof(app_id), *app_entry);
+    if(*app_entry==NULL) {
+      WITH_CDO_ID_STR(idstr, &head,
+                      DEBUG("No regentry for app %" PRIappid " for CDO %s\n", app_id, idstr););
+
+      *handle_entry = NULL;
+      return MSTRO_FAIL;
+    } else {
+      HASH_FIND(hh, (*app_entry)->handles,
+                &cdo_id->local_id, sizeof(cdo_id->local_id), *handle_entry);
+      if(*handle_entry)
+        return MSTRO_OK;
+      else {
+        WITH_CDO_ID_STR(idstr, &head,
+                        DEBUG("No handle entry for app %" PRIappid " for CDO %s, local-id %zu\n",
+                              app_id, idstr, cdo_id->local_id););
+        return MSTRO_FAIL;
+      }
+    }
+  }
+}
+
+
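+/** build and send an InitiateTransfer for one provider candidate
+ *
+ * Asks CANDIDATE to transfer its copy (local-id CANDIDATE_LOCAL_ID) of
+ * the CDO demanded in E to the requestor, tagging the ticket with the
+ * total segment count and the distributed-CDO flag.
+ */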
+static inline
+mstro_status
+mstro_pm__send_transfer_init(struct mstro_pm_demand_queue_entry *e,
+                             struct per_app_cdo_entries *candidate,
+                             uint64_t candidate_local_id,
+                             size_t n_segments,
+                             bool is_distributed) {
+
+  mstro_status status = MSTRO_OK;
+
+  WITH_CDO_ID_STR(dststr, &e->cdo_id, {
+      DEBUG("Selected provider for CDO %s, app %" PRIappid
+            " using local-id %zu of app %" PRIappid "\n",
+            dststr, e->requestor,
+            candidate_local_id, candidate->app);});
+
+  Mstro__Pool__CDOID srccdoid = MSTRO__POOL__CDOID__INIT;
+  srccdoid.qw0 = e->cdo_id.qw[0];
+  srccdoid.qw1 = e->cdo_id.qw[1];
+  srccdoid.local_id = candidate_local_id;
+
+  Mstro__Pool__CDOID dstcdoid = MSTRO__POOL__CDOID__INIT;
+  dstcdoid.qw0 = e->cdo_id.qw[0];
+  dstcdoid.qw1 = e->cdo_id.qw[1];
+  dstcdoid.local_id = e->cdo_id.local_id;
+
+
+  Mstro__Pool__Appid dst_appid = MSTRO__POOL__APPID__INIT;
+  dst_appid.id = e->requestor;
+  NOISE("Initiating a transfer for requestor app %" PRIappid "\n",
+       dst_appid.id);
+
+  Mstro__Pool__InitiateTransfer it
+      = MSTRO__POOL__INITIATE_TRANSFER__INIT;
+  it.srccdoid = &srccdoid;
+  it.dstcdoid = &dstcdoid;
+  it.dst_appid = &dst_appid;
+  it.dst_attributes = e->req_attributes;
+
+  /* set the number of tickets required to fill the data for this CDO*/
+  it.n_segments = (int64_t) n_segments;
+  /* flag this ticket as a (re)distribution or not */
+  it.distributed_cdo = is_distributed;
+
+  struct mstro_pm_app_registry_entry *app_entry = NULL;
+  mstro_status s;
+  s = mstro_pm_app_lookup(dst_appid.id, &app_entry);
+  if(s != MSTRO_OK || app_entry==NULL) {
+    ERR("Target app %" PRIappid " not in app table\n", e->requestor);
+    return MSTRO_FAIL;
+  }
+  DEBUG("InitiateTransfer msg packed with serialized ep desc %s\n",
+       app_entry->serialized_desc);
+  it.dst_serialized_endpoint = app_entry->serialized_desc;
+  DEBUG("entry has req methods %p\n", e->req_methods);
+  it.methods = e->req_methods;
+  it.cp = 0; // FIXME When PM starts doing smart stuff. I am
+             // here arbitrarily choosing not to keep a
+             // local copy on the src after
+             // transfer. Perhaps PM wants to communicate to
+             // keep the transport file/obj on the dst, in
+             // which TODO case add a "keep_file" field in
+             // the InitiateTransfer msg
+
+  Mstro__Pool__MstroMsg msg = MSTRO__POOL__MSTRO_MSG__INIT;
+
+  status = mstro_pmp_package(&msg, (ProtobufCMessage*)&it);
+  if(status!=MSTRO_OK) {
+    ERR("Failed to package %s into a pool manager message\n",
+        it.base.descriptor->name);
+    return MSTRO_FAIL;
+  }
+
+  status = mstro_pmp_send_nowait(candidate->app, &msg);
+
+  switch(status) {
+    case MSTRO_OK:
+      WITH_CDO_ID_STR(
+          idstr, &e->cdo_id,
+          INFO("Sent initiate-transfer for %s to app %" PRIappid " (for app %" PRIappid ")\n",
+               idstr,
+               candidate->app, it.dst_appid->id);
+                      );
+      break;
+    default:
+      ERR("Failed to send initiate-transfer message to app %" PRIappid ": %d (%s)\n",
+          candidate->app, status, mstro_status_description(status));
+      return MSTRO_FAIL;
+  }
+  mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PM_NUM_TICKETS, 1);
+
+  return status;
+}
+
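+/** send an InitiateTransfer to every candidate in the set
+ *
+ * Each provider delivers one segment; the requestor's handle records
+ * candidates->n_sources as the number of transmissions to expect. On
+ * success the demand queue entry E is disposed.
+ */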
+static inline
+mstro_status
+mstro_pm__send_transfer_init_to_candidates(
+                                      struct mstro_pm_demand_queue_entry *e,
+                                      mstro_pm_candidates *candidates) {
+  mstro_status status = MSTRO_OK;
+  if((!candidates) || (!e)){
+    ERR("Invalid list of candidates or required cdo\n");
+    return MSTRO_INVARG;
+  }
+  DEBUG("Number of candidates %zu\n", candidates->n_sources);
+  for (size_t i = 0; i < candidates->n_sources ; i++) {
+    status = mstro_pm__send_transfer_init(e, candidates->app[i], candidates->local_id[i], candidates->n_sources, candidates->is_distributed);
+    if (status != MSTRO_OK) {
+      return status;
+    }
+  }
+  /* set the number of required transmissions on cdo handle*/
+  DEBUG("Set the number of required transmissions on cdo handle\n");
+  struct per_app_cdo_entries *app_entry;
+  struct cdo_handle_entry *handle_entry;
+  status = mstro_pm_cdo_app_lookup(&e->cdo_id, e->requestor, &app_entry, &handle_entry);
+  if(status!=MSTRO_OK) {
+    ERR("Failed to find CDO entry\n");
+    return status;
+  }
+  handle_entry->n_segments = candidates->n_sources;
+
+  DEBUG("Disposing demand queue entry that has been handled\n");
+  status = mstro_pm_demand_queue_entry__destroy(e);
+  if(status!=MSTRO_OK) {
+    ERR("Failed to de-allocate demand queue entry\n");
+    return MSTRO_FAIL;
+  }
+  return status;
+}
 
 static inline
 mstro_status
@@ -677,8 +1206,11 @@ mstro_pm__handle_demands(struct mstro_pm_demand_queue_entry *q)
   struct mstro_pm_demand_queue_entry *unhandled=NULL;
   size_t len=0;
   struct mstro_pm_demand_queue_entry *e;
+  mmbError stat;
+  mmbLayout *default_layout = NULL;
+  mmbLayout *src_layout = NULL;
   LL_COUNT(q,e,len);
-  
+
   NOISE("Handling %d demand queue entries\n", len);
 
   while(q!=NULL) {
@@ -699,103 +1231,45 @@ mstro_pm__handle_demands(struct mstro_pm_demand_queue_entry *q)
                           ERR("No CDO registry entry for DEMANDed CDO %s\n",
                               idstr););
         } else {
-          struct per_app_cdo_entries *candidate=NULL;
-          uint64_t candidate_local_id;
-          mstro_pm__find_best_provider(regentry->app_to_attributes,
-                                       e->requestor,
-                                       &candidate, &candidate_local_id);
-          if(candidate==NULL) {
-            /* no offer for this demand */
-            LL_PREPEND(unhandled, e);
-          } else {
-            WITH_CDO_ID_STR(dststr, &e->cdo_id, {
-                DEBUG("Selected provider for CDO %s, app %" PRIappid
-                      " using local-id %zu of app %" PRIappid "\n",
-                      dststr, e->requestor,
-                      candidate_local_id, candidate->app);});
-                  
-            Mstro__Pool__CDOID srccdoid = MSTRO__POOL__CDOID__INIT;
-            srccdoid.qw0 = e->cdo_id.qw[0];
-            srccdoid.qw1 = e->cdo_id.qw[1];
-            srccdoid.local_id = candidate_local_id;
-
-            Mstro__Pool__CDOID dstcdoid = MSTRO__POOL__CDOID__INIT;
-            dstcdoid.qw0 = e->cdo_id.qw[0];
-            dstcdoid.qw1 = e->cdo_id.qw[1];
-            dstcdoid.local_id = e->cdo_id.local_id;
-
-
-            Mstro__Pool__Appid dst_appid = MSTRO__POOL__APPID__INIT;
-            dst_appid.id = e->requestor;
-            NOISE("Initiating a transfer for requestor %zu\n",
-                 dst_appid.id);
-            
-            Mstro__Pool__InitiateTransfer it
-                = MSTRO__POOL__INITIATE_TRANSFER__INIT;
-            it.srccdoid = &srccdoid;
-            it.dstcdoid = &dstcdoid;
-            it.dst_appid = &dst_appid;
-            it.dst_attributes = e->req_attributes;
-
-            struct mstro_pm_app_registry_entry *app_entry = NULL;
-            mstro_status s;
-            s = mstro_pm_app_lookup(dst_appid.id, &app_entry);
-            if(s != MSTRO_OK || app_entry==NULL) {
-              ERR("Target %zu not in app table\n", e->requestor);
-              return MSTRO_FAIL;
-            }
-            DEBUG("InitiateTransfer msg packed with serialized ep desc %s\n",
-                 app_entry->serialized_desc);
-            it.dst_serialized_endpoint = app_entry->serialized_desc;
-            DEBUG("entry has req methods %p\n", e->req_methods);
-            it.methods = e->req_methods;
-            it.cp = 0; // FIXME When PM starts doing smart stuff. I am
-                       // here arbitrarily choosing not to keep a
-                       // local copy on the src after
-                       // transfer. Perhaps PM wants to communicate to
-                       // keep the transport file/obj on the dst, in
-                       // which TODO case add a "keep_file" field in
-                       // the InitiateTransfer msg
-
-            Mstro__Pool__MstroMsg msg = MSTRO__POOL__MSTRO_MSG__INIT;
-
-            status = mstro_pmp_package(&msg, (ProtobufCMessage*)&it);
-            if(status!=MSTRO_OK) {
-              ERR("Failed to package %s into a pool manager message\n",
-                  it.base.descriptor->name);
-              goto BAILOUT_UNLOCK;
-            }
+          mstro_pm_candidates *candidates = NULL;
 
-            status = mstro_pmp_send_nowait(candidate->app, &msg);
-            
-            switch(status) {
-              case MSTRO_OK:
-                WITH_CDO_ID_STR(
-                    idstr, &e->cdo_id,
-                    INFO("Sent initiate-transfer for %s to app %zu (for app %zu)\n",
-                         idstr,
-                         candidate->app, it.dst_appid->id);
-                                );
-                break;
-              default:
-                ERR("Failed to send initiate-transfer message to app %zu: %d (%s)\n",
-                    candidate->app, status, mstro_status_description(status));
-                goto BAILOUT_UNLOCK;
+          if (regentry->is_distributed) {
+
+            if (!e->req_dist_layout) {
+              WARN("Required CDO is flagged as distributed but has no distribution layout; creating a default one\n");
+              /* look for a cdo distribution as a source dist */
+              status = mstro_pm__find_dist_layout(regentry->app_to_attributes, &src_layout);
+              assert(status == MSTRO_OK && src_layout != NULL);
+              stat = mmb_layout_dist_create_default_layout(src_layout, &default_layout);
+              assert(stat == MMB_OK);
+              e->req_dist_layout = default_layout;
             }
-	    mstro_stats_add_counter(MSTRO_STATS_CAT_PROTOCOL, MSTRO_STATS_L_PM_NUM_TICKETS, 1);
-            DEBUG("Disposing demand queue entry that has been handled\n");
-            status = mstro_pm_demand_queue_entry__destroy(e);
-            if(status!=MSTRO_OK) {
-              ERR("Failed to de-allocate demand queue entry\n");
+
+            /* distributed CDO -- need to handle the (re)distribution */
+            mstro_pm__dist_cdo_find_best_provider(regentry->app_to_attributes,
+                                              e,
+                                              &candidates);
+          }
+          else {
+            mstro_pm__find_best_provider(regentry->app_to_attributes,
+                                         e->requestor,
+                                         &candidates);
+          }
+          if(candidates==NULL) {
+            /* no offer for this demand */
+            LL_PREPEND(unhandled, e);
+          } else { // send transfer init
+            status = mstro_pm__send_transfer_init_to_candidates(e, candidates);
+            if (status != MSTRO_OK) {
+              mstro_pm_candidates_destroy(candidates);
               goto BAILOUT_UNLOCK;
             }
-          }
+            mstro_pm_candidates_destroy(candidates);
+          } // end else
         }
    BAILOUT_UNLOCK:
         ;
-                             );
+        );
   }
-  
+
   if(unhandled!=NULL) {
     /* append the leftovers to queue for another round later */
     int s=pthread_mutex_lock(&g_mstro_pm_demand_queue_mtx);
@@ -809,7 +1283,7 @@ mstro_pm__handle_demands(struct mstro_pm_demand_queue_entry *q)
       ERR("Failed to unlock demand queue mutex: %d\n", s);
       return MSTRO_FAIL;
     }
-  }    
+  }
 
   return status;
 }
@@ -862,7 +1336,7 @@ mstro_status
 mstro_pm__handle_withdraws(void)
 {
   mstro_status status = MSTRO_OK;
-  
+
   int s=pthread_mutex_lock(&g_mstro_pm_withdraw_queue_mtx);
   if(s!=0) {
     ERR("Failed to lock withdraw queue mutex: %d (%s)\n", s, strerror(s));
@@ -884,7 +1358,7 @@ BAILOUT:
   return status;
 }
 
-      
+
 /* This function is called by the transport initiator thread to process the demand queue once:
  *  wait on cvar, drain queue, unlock, process drained queue entries, quit */
 mstro_status
@@ -912,7 +1386,7 @@ mstro_pm_handle_demand_queue(void)
     status = MSTRO_FAIL;
     goto BAILOUT;
   }
-  
+
   while(g_mstro_pm_demand_queue!=NULL) {
     /* drain */
     q = g_mstro_pm_demand_queue;
@@ -923,7 +1397,7 @@ mstro_pm_handle_demand_queue(void)
       status = MSTRO_FAIL;
       goto BAILOUT;
     }
-    
+
     status = mstro_pm__handle_demands(q);
     if(status!=MSTRO_OK) {
       ERR("Failure processing demand queue entries\n");
@@ -936,7 +1410,7 @@ mstro_pm_handle_demand_queue(void)
       ERR("Failure processing withdraw-queue entries\n");
       goto BAILOUT;
     }
-    
+
 
     /* re-obtain lock */
     s = pthread_mutex_lock(&g_mstro_pm_demand_queue_mtx);
@@ -952,7 +1426,7 @@ mstro_pm_handle_demand_queue(void)
       status=MSTRO_FAIL;
       goto BAILOUT;
     }
-    
+
     /* this will jump into cleanup/non-local exit if we were canceled */
     pthread_testcancel();
     s=pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
@@ -964,7 +1438,7 @@ mstro_pm_handle_demand_queue(void)
     /* if someone added new entries we'll loop */
   }
   /* no entries during last processing step; we hold the lock: go to sleep */
-  
+
   s=pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
   if(s!=0) {
     ERR("failed to enable thread cancelation: %d\n", s);
@@ -982,7 +1456,7 @@ mstro_pm_handle_demand_queue(void)
   } else {
     DEBUG("Woke up on demand queue change\n");
   }
-  
+
   /* once we wake up we hold the lock again: Pull off one round of
    * entries, then return. We'll be called again by the thread running
    * it */
@@ -1004,7 +1478,7 @@ mstro_pm_handle_demand_queue(void)
     ERR("Failure processing demand queue entries\n");
     goto BAILOUT;
   }
-  
+
 BAILOUT:
   return status;
 }
@@ -1026,6 +1500,8 @@ mstro_pm__cdo_handle_entry__create(uint64_t local_id, mstro_cdo_state state,
   tmp->local_id = local_id;
   tmp->cdo_state = state;
   tmp->attributes = NULL;
+  tmp->n_segments = 0;
+  tmp->dist_layout = NULL;
   *result = tmp;
   return MSTRO_OK;
 }
@@ -1034,12 +1510,18 @@ static inline
 mstro_status
 mstro_pm__cdo_handle_entry__destroy(struct cdo_handle_entry *h)
 {
+  mstro_status m_status = MSTRO_OK;
+  mmbError stat = MMB_OK;
   if(h==NULL)
     return MSTRO_INVARG;
   if(h->attributes)
     mstro__pool__attributes__free_unpacked(h->attributes, NULL);
+  if(h->dist_layout){
+    stat = mmb_layout_destroy(h->dist_layout);
+    m_status = (stat == MMB_OK)? MSTRO_OK: MSTRO_FAIL;
+  }
   free(h);
-  return MSTRO_OK;
+  return m_status;
 }
 
 
@@ -1087,7 +1569,7 @@ mstro_pm_cdo_registry_declare(mstro_app_id app,
 {
   struct mstro_pm_app_registry_entry *e;
   assert(app!=MSTRO_APP_ID_INVALID);
-      
+
   if(cdo_name==NULL)
     return MSTRO_INVARG;
   if(cdo_id_p==NULL)
@@ -1131,10 +1613,11 @@ mstro_pm_cdo_registry_declare(mstro_app_id app,
         }
         memcpy(&(regentry->cdo_id), &head, sizeof(struct mstro_cdo_id));
         regentry->app_to_attributes = NULL;
+        regentry->is_distributed = false; /* init value*/
         HASH_ADD(hh, g_mstro_pm_cdo_registry,
                  cdo_id, sizeof(struct mstro_cdo_id), regentry);
       }
-      
+
       /* We now have a regentry, possibly without app registration */
       if(0!=strcmp(regentry->cdo_name, cdo_name)) {
         WITH_CDO_ID_STR(idstr, &head, {
@@ -1160,7 +1643,7 @@ mstro_pm_cdo_registry_declare(mstro_app_id app,
                     &local_id, sizeof(local_id), existing_entry);
           if(existing_entry) {
             WITH_CDO_ID_STR(idstr, cdo_id_p,
-                            ERR("Duplicate DECLARE for same handle: appid %" PRIappid ", id %s, local-id %" PRIx64 "\n",
+                            ERR("Duplicate DECLARE for same handle: app %" PRIappid ", id %s, local-id %" PRIx64 "\n",
                                 app, idstr, local_id););
           } else {
             HASH_ADD(hh, per_app_entries->handles,
@@ -1168,7 +1651,7 @@ mstro_pm_cdo_registry_declare(mstro_app_id app,
           }
         } else {
           /* first entry for thus app */
-          DEBUG("Creating first entry for %" PRIappid "\n", app); 
+          DEBUG("Creating first entry for app %" PRIappid "\n", app); 
           status = mstro_pm__per_app_entries__create(app, local_id, MSTRO_CDO_STATE_DECLARED,
                                                      &per_app_entries);
           if(status!=MSTRO_OK) {
@@ -1185,7 +1668,7 @@ mstro_pm_cdo_registry_declare(mstro_app_id app,
  BAILOUT_UNLOCK:
       ;
                            );
-  
+
 BAILOUT:
   return status;
 }
@@ -1194,12 +1677,12 @@ mstro_status
 mstro_pm_cdo_registry_find(mstro_app_id app, const struct mstro_cdo_id *id)
 {
   bool found=false;
-  
+
   struct mstro_pm_cdo_registry_entry *regentry=NULL;
 
   struct mstro_cdo_id tmp = *id;
   tmp.local_id = MSTRO_CDO_LOCAL_ID_NONE;
-  
+
   WITH_LOCKED_CDO_REGISTRY({
       HASH_FIND(hh, g_mstro_pm_cdo_registry,
                 &tmp, sizeof(struct mstro_cdo_id), regentry);
@@ -1211,7 +1694,7 @@ mstro_pm_cdo_registry_find(mstro_app_id app, const struct mstro_cdo_id *id)
           struct cdo_handle_entry *e;
           HASH_FIND(hh, per_app_entries->handles,
                     &(id->local_id), sizeof(id->local_id), e);
-          if(e) 
+          if(e)
             found=true;
         }
       }
@@ -1229,10 +1712,11 @@ mstro_pm_cdo_registry_store_attributes(const struct mstro_cdo_id *cdo_id,
 {
   struct mstro_pm_cdo_registry_entry *regentry=NULL;
   bool found=false;
-
+  mstro_status status;
+  mmbLayout *dist_layout = NULL;
   struct mstro_cdo_id head = *cdo_id;
   head.local_id = MSTRO_CDO_LOCAL_ID_NONE;
-  
+
   WITH_LOCKED_CDO_REGISTRY(
       HASH_FIND(hh, g_mstro_pm_cdo_registry,
                 &head, sizeof(struct mstro_cdo_id), regentry);
@@ -1248,29 +1732,35 @@ mstro_pm_cdo_registry_store_attributes(const struct mstro_cdo_id *cdo_id,
             if(e->attributes==NULL) {
               e->attributes = attributes;
               found=true;
+              mstro_attribute_pool_find_dist_layout(attributes, &dist_layout);
+              e->dist_layout = dist_layout;
+              /** set is_distributed flag on the global CDO entry*/
+              if(dist_layout) {
+                regentry->is_distributed = true;
+              }
             } else {
               WITH_CDO_ID_STR(idstr, cdo_id, {
-                  ERR("Trying to overwrite attributes for %s from app %d\n",
+                  ERR("Trying to overwrite attributes for %s from app %" PRIappid "\n",
                       idstr, app_id);
                 });
             }
           } else {
             WITH_CDO_ID_STR(idstr, cdo_id, {
-                ERR("No matching local-id handle for app %d for %s\n", app_id, idstr);
+                ERR("No matching local-id handle for app %" PRIappid " for %s\n", app_id, idstr);
               };);
           }
         } else {
           WITH_CDO_ID_STR(idstr, cdo_id, {
-              ERR("No entries at all for app %d for %s\n", app_id, idstr);
+              ERR("No entries at all for app %" PRIappid " for %s\n", app_id, idstr);
             };);
         }
       } else {
         WITH_CDO_ID_STR(headid, &head, {
             WITH_CDO_ID_STR(idstr, cdo_id, {
-                ERR("No entries at all for %s (tried on behalf of app %d, CDO %s)\n",
+                ERR("No entries at all for %s (tried on behalf of app %" PRIappid ", CDO %s)\n",
                     headid, app_id, idstr);});};);
       });
- 
+
   return found? MSTRO_OK : MSTRO_FAIL;
 }
 
@@ -1285,28 +1775,28 @@ mstro_pm_cdo_registry__valid_state_transition(const struct mstro_cdo_id *cdoid,
   assert(cdoid!=NULL);
   char *msg=NULL;
   bool unchecked=false;
-  
+
   switch(new_state) {
     case MSTRO_CDO_STATE_INVALID:
       msg = "Cannot transition to INVALID state on PM";
       break;
-      
+
     case MSTRO_CDO_STATE_CREATED:
       if(old_state!=MSTRO_CDO_STATE_INVALID)
         msg = "Cannot step to CREATED state from any other valid state";
       break;
-      
+
     case MSTRO_CDO_STATE_DECLARED:
       if(old_state!=MSTRO_CDO_STATE_CREATED)
         msg = "Cannot DECLARE unless freshly CREATED";
       break;
-      
+
     case MSTRO_CDO_STATE_SEALED:
       if(old_state!=MSTRO_CDO_STATE_DECLARED)
         msg = "Cannot SEAL unless freshly DECLARED";
       break;
-      
-    case MSTRO_CDO_STATE_OFFERED: 
+
+    case MSTRO_CDO_STATE_OFFERED:
       if(old_state != MSTRO_CDO_STATE_DECLARED)
         msg = "Cannot OFFER unless properly DECLARED";
       break;
@@ -1325,7 +1815,7 @@ mstro_pm_cdo_registry__valid_state_transition(const struct mstro_cdo_id *cdoid,
       if(!  (old_state == MSTRO_CDO_STATE_REQUIRED))
         msg = "Cannot flag REQUIRED as IN_TRANSPORT unless REQUIRED (and no other flags set)";
       break;
-    
+
     case MSTRO_CDO_STATE_DEMANDED|MSTRO_CDO_STATE_IN_TRANSPORT:
       if(! (old_state == (MSTRO_CDO_STATE_REQUIRED)))
 	msg = "Cannot set DEMANDED+IN-TRANSPORT unless REQUIRED (and no other flags set)";
@@ -1336,7 +1826,7 @@ mstro_pm_cdo_registry__valid_state_transition(const struct mstro_cdo_id *cdoid,
             || (old_state == (MSTRO_CDO_STATE_REQUIRED|MSTRO_CDO_STATE_IN_TRANSPORT))))
         msg = "Cannot set RETRACTED unless REQUIRED or REQUIRED+IN-TRANSPORT (and no other flags set)";
       break;
-      
+
     default:
       msg="Unchecked state transition";
       unchecked=true;
@@ -1390,7 +1880,7 @@ mstro_pm_cdo_registry__set_state(const struct mstro_cdo_id *cdoid,
                         appid, idstr););
     return MSTRO_NOENT;
   }
-  
+
   /* intentionally inside an assert, so that it can be compiled-out */
   assert(mstro_pm_cdo_registry__valid_state_transition(
       cdoid, appid,
@@ -1419,7 +1909,7 @@ mstro_pm_cdo_registry_update_state(const struct mstro_cdo_id *cdoid,
 
   struct mstro_cdo_id head = *cdoid;
   head.local_id = MSTRO_CDO_LOCAL_ID_NONE;
-  
+
   WITH_LOCKED_CDO_REGISTRY(
       HASH_FIND(hh, g_mstro_pm_cdo_registry,
                 &head, sizeof(struct mstro_cdo_id), regentry);
@@ -1444,7 +1934,7 @@ mstro_pm_cdo_registry_update_state(const struct mstro_cdo_id *cdoid,
       return status;
     }
   }
-  
+
   return found? MSTRO_OK : MSTRO_FAIL;
 }
 
@@ -1456,13 +1946,13 @@ mstro_pm__per_app_cdo_entry_dispose(struct per_app_cdo_entries *e)
     return MSTRO_INVARG;
   struct cdo_handle_entry *h,*tmp;
   mstro_status s=MSTRO_OK;
-  
+
   HASH_ITER(hh, e->handles, h, tmp) {
     HASH_DEL(e->handles, h);
     s |= mstro_pm__cdo_handle_entry__destroy(h);
   }
   free(e);
-  
+
   return s;
 }
 
@@ -1478,16 +1968,16 @@ mstro_pm_app_deregister(mstro_app_id key)
 
   WARN("Not checking whether app %zu holds permanent CDOs that need to be preserved\n", key);
   WARN("Not checking whether app %zu offers any CDOs that will be needed by others\n", key);
-  
+
   if(key==MSTRO_APP_ID_MANAGER) {
     DEBUG("De-registering pool manager connection\n");
   }
-  
+
   WITH_LOCKED_CDO_REGISTRY({
       struct mstro_pm_cdo_registry_entry *e=NULL;
       struct mstro_pm_cdo_registry_entry *tmp=NULL;
       DEBUG("Have %zu CDO registry entries\n", HASH_COUNT(g_mstro_pm_cdo_registry));
-      
+
       HASH_ITER(hh, g_mstro_pm_cdo_registry, e, tmp) {
         struct per_app_cdo_entries *app_record;
         WITH_CDO_ID_STR(idstr, &e->cdo_id,
@@ -1500,7 +1990,7 @@ mstro_pm_app_deregister(mstro_app_id key)
           ; /* this app not on list for this CDO */
         } else {
           WITH_CDO_ID_STR(idstr, &e->cdo_id,
-                          DEBUG("App %" PRIappid " listed for CDO %s with %zu handles\n",
+                          DEBUG("Found app %" PRIappid " listed for CDO %s with %zu handles\n",
                                 key, idstr, HASH_COUNT(app_record->handles)););
           struct cdo_handle_entry *h;
           struct cdo_handle_entry *tmp;
@@ -1512,7 +2002,7 @@ mstro_pm_app_deregister(mstro_app_id key)
                 break; /* ok, sealed and never offered */
               case MSTRO_CDO_STATE_OFFERED:
                 WITH_CDO_ID_STR(idstr, &e->cdo_id,
-                                ERR("LEAVE request from %" PRIappid " when CDO %s (local-id %" PRIx64 ") is still OFFERED\n",
+                                ERR("LEAVE request from app %" PRIappid " when CDO %s (local-id %" PRIx64 ") is still OFFERED\n",
                                     key, idstr, h->local_id);
                                 );
                 break;
@@ -1541,64 +2031,20 @@ mstro_pm_app_deregister(mstro_app_id key)
 
   /* We can not drop the reference to the app itself because we could
    * not send the BYE otherwise. Instead we mark the entry 'dead' and can recycle it on re-join */
-  WITH_LOCKED_APP_REGISTRY({ 
-    struct mstro_pm_app_registry_entry *e=NULL; 
-    HASH_FIND(hh, g_mstro_pm_app_registry, &key, sizeof(key), e); 
-    if(e!=NULL) { 
+  WITH_LOCKED_APP_REGISTRY({
+    struct mstro_pm_app_registry_entry *e=NULL;
+    HASH_FIND(hh, g_mstro_pm_app_registry, &key, sizeof(key), e);
+    if(e!=NULL) {
       e->dead = true;
-    } else { 
-      ERR("Registry entry for app %" PRIappid " vanished\n", key); 
-    }       
-  }); 
-  DEBUG("Dropped all registry entries for app %" PRIappid "\n", key); 
+    } else {
+      ERR("Registry entry for app %" PRIappid " vanished\n", key);
+    }
+  });
+  DEBUG("Dropped all registry entries for app %" PRIappid "\n", key);
 
   return MSTRO_OK;
 }
 
-/** look up per-app cdo entry. Must be called unter CDO_REGISTRY_LOCK. */
-static inline
-mstro_status
-mstro_pm_cdo_app_lookup(const struct mstro_cdo_id* cdo_id,
-                        mstro_app_id app_id,
-                        struct per_app_cdo_entries **app_entry, struct cdo_handle_entry **handle_entry)
-{
-  struct mstro_pm_cdo_registry_entry *regentry=NULL;
-
-  struct mstro_cdo_id head = *cdo_id;
-  head.local_id = MSTRO_CDO_LOCAL_ID_NONE;
-
-  HASH_FIND(hh, g_mstro_pm_cdo_registry,
-            &head, sizeof(struct mstro_cdo_id), regentry);
-  if(!regentry) {
-    WITH_CDO_ID_STR(idstr, &head, 
-                    DEBUG("No regentry for cdo %s\n", idstr););
-    *app_entry = NULL;
-    *handle_entry = NULL;
-    return MSTRO_FAIL;
-  } else {
-    struct per_app_cdo_entry *per_app_entry;
-    HASH_FIND(hh, regentry->app_to_attributes,
-              &app_id, sizeof(app_id), *app_entry);
-    if(*app_entry==NULL) {
-      WITH_CDO_ID_STR(idstr, &head, 
-                      DEBUG("No regentry for app %" PRIappid " for CDO %s\n", app_id, idstr););
-
-      *handle_entry = NULL;
-      return MSTRO_FAIL;
-    } else {
-      HASH_FIND(hh, (*app_entry)->handles,
-                &cdo_id->local_id, sizeof(cdo_id->local_id), *handle_entry);
-      if(*handle_entry)
-        return MSTRO_OK;
-      else {
-        WITH_CDO_ID_STR(idstr, &head, 
-                        DEBUG("No handle entry for app %" PRIappid " for CDO %s, local-id %zu\n",
-                              app_id, idstr, cdo_id->local_id););
-        return MSTRO_FAIL;
-      }
-    }
-  }
-}
 
 mstro_status
 mstro_pm_cdo_registry_cdo_name_lookup(const struct mstro_cdo_id *id,
@@ -1609,7 +2055,7 @@ mstro_pm_cdo_registry_cdo_name_lookup(const struct mstro_cdo_id *id,
 
   struct mstro_cdo_id head = *id;
   head.local_id = MSTRO_CDO_LOCAL_ID_NONE;
-  
+
   WITH_LOCKED_CDO_REGISTRY(
       HASH_FIND(hh, g_mstro_pm_cdo_registry,
                 &head, sizeof(struct mstro_cdo_id), regentry);
@@ -1628,6 +2074,7 @@ mstro_pm_demand_queue_entry__create(const struct mstro_cdo_id *cdo_id,
                                     mstro_app_id app,
                                     struct mstro_pm_demand_queue_entry **res)
 {
+  mmbLayout *dist_layout = NULL;
   if(res==NULL)
     return MSTRO_INVOUT;
   *res=malloc(sizeof(struct mstro_pm_demand_queue_entry));
@@ -1672,6 +2119,9 @@ mstro_pm_demand_queue_entry__create(const struct mstro_cdo_id *cdo_id,
             WARN("operation on CDO that is not marked 'in-transport'\n");
           }
           (*res)->req_attributes = handle_entry->attributes;
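+          /* also record the requester's distribution layout (if the
+           * attributes carry one) so the transfer side can match source and
+           * destination pieces; it stays NULL for non-distributed CDOs */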
+          mstro_attribute_pool_find_dist_layout(handle_entry->attributes, &dist_layout);
+          (*res)->req_dist_layout = dist_layout;
+
         }
       });
     return status;
@@ -1748,61 +2198,68 @@ mstro_pm_cdo_registry_transfer_completed(const struct mstro_cdo_id *cdo_id,
   mstro_status status = MSTRO_OK;
 
   DEBUG("FIXME: this code should use state checker function\n");
-  
+
   WITH_LOCKED_CDO_REGISTRY({
       struct per_app_cdo_entries *app_entry;
       struct cdo_handle_entry *handle_entry;
       mstro_status status = mstro_pm_cdo_app_lookup(cdo_id, app_id, &app_entry, &handle_entry);
       if(status!=MSTRO_OK) {
-        ERR("Failed to find CDO entry for app %zu\n", app_id);
+        ERR("Failed to find CDO entry for app %" PRIappid "\n", app_id);
         status = MSTRO_INVARG;
       } else {
-        if(! (handle_entry->cdo_state & MSTRO_CDO_STATE_IN_TRANSPORT)) {
-          if(handle_entry->cdo_state==MSTRO_CDO_STATE_RETRACTED) {
-            WITH_CDO_ID_STR(idstr, cdo_id,
+        /* decrement the number of outstanding transmissions */
+        handle_entry->n_segments--;
+        WITH_CDO_ID_STR(idstr, cdo_id,
+                        DEBUG("There are %zu outstanding pieces for (probably distributed) CDO %s\n",
+                              handle_entry->n_segments, idstr););
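+        /* the transfer counts as complete only once every outstanding piece
+         * has arrived; for a non-distributed CDO n_segments is expected to
+         * be 1, so this check reduces to the previous behavior */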
+        if (handle_entry->n_segments == 0) {
+          if(! (handle_entry->cdo_state & MSTRO_CDO_STATE_IN_TRANSPORT)) {
+            if(handle_entry->cdo_state==MSTRO_CDO_STATE_RETRACTED) {
+              WITH_CDO_ID_STR(idstr, cdo_id,
                             ERR("RETRACTED and not IN_TRANSPORT state for CDO %s (app %" PRIappid
                                 ": %d (%s), assuming retract overlapped with eager transfer.\n",
                                 idstr, app_id,
                                 handle_entry->cdo_state,
                                 mstro_cdo_state_describe(handle_entry->cdo_state)););
-          } else {
-            WITH_CDO_ID_STR(idstr, cdo_id,
+            } else {
+              WITH_CDO_ID_STR(idstr, cdo_id,
                             ERR("Unexpected PM cdo registry state for CDO %s (app %" PRIappid ": %d (%s)\n",
                                 idstr, app_id,
                                 handle_entry->cdo_state,
                                 mstro_cdo_state_describe(handle_entry->cdo_state)););
-            status = MSTRO_INVARG;
-          }
-        } else {
+              status = MSTRO_INVARG;
+            }
+          } else {
           /* ok, it was in-transport */
-          if(handle_entry->cdo_state & MSTRO_CDO_STATE_DEMANDED) {
-            WITH_CDO_ID_STR(idstr, cdo_id,
+            if(handle_entry->cdo_state & MSTRO_CDO_STATE_DEMANDED) {
+              WITH_CDO_ID_STR(idstr, cdo_id,
                             DEBUG("Clearing IN_TRANSPORT state from DEMANDED CDO %s\n",
                                   idstr););
-            handle_entry->cdo_state &= ~MSTRO_CDO_STATE_IN_TRANSPORT;
-          } else if(handle_entry->cdo_state & MSTRO_CDO_STATE_REQUIRED) {
-            WITH_CDO_ID_STR(idstr, cdo_id,
+              handle_entry->cdo_state &= ~MSTRO_CDO_STATE_IN_TRANSPORT;
+            } else if(handle_entry->cdo_state & MSTRO_CDO_STATE_REQUIRED) {
+              WITH_CDO_ID_STR(idstr, cdo_id,
                             DEBUG("Clearing IN_TRANSPORT state from REQUIRED CDO %s\n",
                                   idstr););
-            handle_entry->cdo_state &= ~MSTRO_CDO_STATE_IN_TRANSPORT;
-            handle_entry->cdo_state |= MSTRO_CDO_STATE_SATISFIED;
-          } else if(handle_entry->cdo_state & MSTRO_CDO_STATE_INJECTED) {
-            WITH_CDO_ID_STR(idstr, cdo_id,
+              handle_entry->cdo_state &= ~MSTRO_CDO_STATE_IN_TRANSPORT;
+              handle_entry->cdo_state |= MSTRO_CDO_STATE_SATISFIED;
+            } else if(handle_entry->cdo_state & MSTRO_CDO_STATE_INJECTED) {
+              WITH_CDO_ID_STR(idstr, cdo_id,
                             DEBUG("Clearing IN_TRANSPORT state from INJECTED CDO %s\n",
                                   idstr););
-            handle_entry->cdo_state &= ~MSTRO_CDO_STATE_IN_TRANSPORT;
-            handle_entry->cdo_state |= MSTRO_CDO_STATE_SATISFIED;
-          } else {
-            WITH_CDO_ID_STR(idstr, cdo_id,
+              handle_entry->cdo_state &= ~MSTRO_CDO_STATE_IN_TRANSPORT;
+              handle_entry->cdo_state |= MSTRO_CDO_STATE_SATISFIED;
+            } else {
+              WITH_CDO_ID_STR(idstr, cdo_id,
                             ERR("Unhandled CDO state %s for CDO %s\n",
                                 mstro_cdo_state_describe(handle_entry->cdo_state), idstr););
-            status = MSTRO_UNIMPL;
+              status = MSTRO_UNIMPL;
+            }
+            /* WITH_CDO_ID_STR(idstr, cdo_id, { */
+            /*     DEBUG("New state for CDO %s (app %" PRIappid ": %d (%s)\n", */
+            /*           idstr, app_id, */
+            /*           handle_entry->cdo_state, */
+            /*           mstro_cdo_state_describe(handle_entry->cdo_state));}); */
           }
-          /* WITH_CDO_ID_STR(idstr, cdo_id, { */
-          /*     DEBUG("New state for CDO %s (app %" PRIappid ": %d (%s)\n", */
-          /*           idstr, app_id, */
-          /*           handle_entry->cdo_state, */
-          /*           mstro_cdo_state_describe(handle_entry->cdo_state));}); */
         }
       }
     });
@@ -1836,7 +2293,7 @@ mstro_pm_cdo_app_match(mstro_app_id origin, const struct mstro_cdo_id *id,
   }
 
   /* WITH_CDO_ID_STR(str,id, { */
-  /*     DEBUG("Trying to match %s origin %" PRIappid " for selector %p, query |%s|\n", */
+  /*     DEBUG("Trying to match %s origin app %" PRIappid " for selector %p, query |%s|\n", */
   /*           str, origin, cdo_selector, cdo_selector->query); */
   /*   }); */
   mstro_status status = MSTRO_FAIL;
@@ -1848,13 +2305,201 @@ mstro_pm_cdo_app_match(mstro_app_id origin, const struct mstro_cdo_id *id,
       if(status!=MSTRO_OK) {
         ERR("Failed to find CDO entry for app %zu\n", origin);
       } else {
-        status = mstro_subscription_selector_eval(id, cdo_selector, 
+        status = mstro_subscription_selector_eval(id, cdo_selector,
                                                   handle_entry->attributes);
       }
     });
   return status;
 }
 
+/** evaluate whether we can immediately withdraw a distributed CDO. MUST be called within a locked registry block */
+static inline
+mstro_status
+mstro_pm_dist_cdo_registry_immediate_withdraw(
+                               const struct mstro_cdo_id *cdoid,
+                               mstro_app_id appid,
+                               struct mstro_pm_cdo_registry_entry *regentry,
+                               bool *immediate_withdraw) {
+
+    mstro_status status = MSTRO_OK;
+    *immediate_withdraw=false;
+
+    struct mstro_cdo_id head = *cdoid;
+    head.local_id = MSTRO_CDO_LOCAL_ID_NONE;
+
+    INFO("Withdrawing a piece of a distributed CDO\n");
+
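+    /* Tally the pool-wide state of this CDO across all apps and handles;
+     * the decision below only needs to know whether any REQUIRE is still
+     * outstanding or any DEMAND is in flight. */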
+    /* count OFFERs */
+    struct per_app_cdo_entries *entry;
+    struct per_app_cdo_entries *tmp;
+    size_t num_offers=0;
+    size_t num_required=0;
+    size_t num_retracted=0;
+    size_t num_in_flight=0; /* demanded and in-transport */
+    size_t num_req_in_transport=0; /* required and in-transport (eager sends) */
+    HASH_ITER(hh, regentry->app_to_attributes, entry, tmp) {
+      struct cdo_handle_entry *h;
+      struct cdo_handle_entry *t;
+      HASH_ITER(hh, entry->handles, h, t) {
+        /* WITH_CDO_ID_STR(idstr, &head, { */
+        /*     DEBUG("Inspecting %s for app %" PRIappid " local-id %zu: %d (%s)\n", */
+        /*           idstr, entry->app, h->local_id, */
+        /*           h->cdo_state, */
+        /*           mstro_cdo_state_describe(h->cdo_state));}); */
+        if(h->cdo_state & MSTRO_CDO_STATE_OFFERED) {
+          num_offers++;
+        }
+        if(h->cdo_state & MSTRO_CDO_STATE_RETRACTED) {
+          num_retracted++;
+        }
+        if(h->cdo_state & MSTRO_CDO_STATE_REQUIRED) {
+          num_required++;
+          if(h->cdo_state & MSTRO_CDO_STATE_IN_TRANSPORT) {
+            num_req_in_transport++;
+          }
+        }
+        if(   (h->cdo_state & MSTRO_CDO_STATE_DEMANDED)
+           && (h->cdo_state & MSTRO_CDO_STATE_IN_TRANSPORT)) {
+          num_in_flight++;
+        }
+      }
+    }
+    assert(num_offers>0); /* because we must have one outstanding OFFER for it from appid */
+    /* find our entry */
+    HASH_FIND(hh, regentry->app_to_attributes,
+              &appid, sizeof(appid), entry);
+    if(entry==NULL) {
+      ERR("Failed to find our per-app CDO registry entry (num_offers %zu)\n",
+          num_offers);
+      status = MSTRO_NOENT;
+      return status;
+    }
+
+    /* check whether there is any REQUIRE or DEMAND for it */
+    if(num_required==0 && num_in_flight==0) {
+      WITH_CDO_ID_STR(idstr, cdoid,
+                      DEBUG("No outstanding REQUIRE or in-flight DEMAND for CDO %s, permitting WITHDRAW\n", idstr););
+      mstro_pm_cdo_registry__set_state(cdoid, appid,
+                                       MSTRO_CDO_STATE_WITHDRAWN,
+                                       entry);
+      *immediate_withdraw=true;
+    } else {
+      WARN("FIXME we are blocking all instances of a distributed CDO if there is a required piece\n, we should only block the required piece \n");
+      WITH_CDO_ID_STR(idstr, cdoid,
+                      DEBUG("Still have %zu REQUIREs and %zu in-flight DEMANDs for CDO %s outstanding\n",
+                            num_required, num_in_flight, idstr););
+      if(num_req_in_transport < num_required) {
+        INFO("Some REQUIREs are not in-transport yet; could check whether this instance is worth copying\n");
+      }
+      /* can't withdraw right away, need to schedule a wakeup on this CDO */
+      *immediate_withdraw=false;
+    }
+
+    return status;
+}
+
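+/* Note: unlike the non-distributed variant below, no multiple-OFFER shortcut
+ * is taken here; presumably each OFFER of a distributed CDO covers a
+ * different piece, so a second OFFER is not a substitute copy. */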
+
+/** evaluate whether we can immediately withdraw. MUST be called within a locked registry block */
+static inline
+mstro_status
+mstro_pm_cdo_registry_immediate_withdraw(
+                               const struct mstro_cdo_id *cdoid,
+                               mstro_app_id appid,
+                               struct mstro_pm_cdo_registry_entry *regentry,
+                               bool *immediate_withdraw) {
+
+    mstro_status status = MSTRO_OK;
+    *immediate_withdraw=false;
+
+    struct mstro_cdo_id head = *cdoid;
+    head.local_id = MSTRO_CDO_LOCAL_ID_NONE;
+
+    /* count OFFERs */
+    struct per_app_cdo_entries *entry;
+    struct per_app_cdo_entries *tmp;
+    size_t num_offers=0;
+    size_t num_required=0;
+    size_t num_retracted=0;
+    size_t num_in_flight=0; /* demanded and in-transport */
+    size_t num_req_in_transport=0; /* required and in-transport (eager sends) */
+    HASH_ITER(hh, regentry->app_to_attributes, entry, tmp) {
+      struct cdo_handle_entry *h;
+      struct cdo_handle_entry *t;
+      HASH_ITER(hh, entry->handles, h, t) {
+        /* WITH_CDO_ID_STR(idstr, &head, { */
+        /*     DEBUG("Inspecting %s for app %" PRIappid " local-id %zu: %d (%s)\n", */
+        /*           idstr, entry->app, h->local_id, */
+        /*           h->cdo_state, */
+        /*           mstro_cdo_state_describe(h->cdo_state));}); */
+        if(h->cdo_state & MSTRO_CDO_STATE_OFFERED) {
+          num_offers++;
+        }
+        if(h->cdo_state & MSTRO_CDO_STATE_RETRACTED) {
+          num_retracted++;
+        }
+        if(h->cdo_state & MSTRO_CDO_STATE_REQUIRED) {
+          num_required++;
+          if(h->cdo_state & MSTRO_CDO_STATE_IN_TRANSPORT) {
+            num_req_in_transport++;
+          }
+        }
+        if(   (h->cdo_state & MSTRO_CDO_STATE_DEMANDED)
+           && (h->cdo_state & MSTRO_CDO_STATE_IN_TRANSPORT)) {
+          num_in_flight++;
+        }
+      }
+    }
+    assert(num_offers>0); /* because we must have one outstanding OFFER for it from appid */
+    /* find our entry */
+    HASH_FIND(hh, regentry->app_to_attributes,
+              &appid, sizeof(appid), entry);
+    if(entry==NULL) {
+      ERR("Failed to find our per-app CDO registry entry (num_offers %zu)\n",
+          num_offers);
+      status = MSTRO_NOENT;
+      return status;
+    }
+
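+    /* Decision rule: with more than one OFFER in the pool another copy
+     * survives the withdraw, so it may proceed immediately; with a single
+     * OFFER it may only proceed if no REQUIRE is outstanding and no DEMAND
+     * is in flight. */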
+    if(num_offers>1) {
+      /* easy, someone else has it too */
+      WITH_CDO_ID_STR(idstr, cdoid,
+                      DEBUG("Multiple OFFERs of CDO %s in pool\n", idstr););
+      INFO("FIXME: performing quick WITHDRAW for CDO from app %" PRIappid ";"
+           " could check whether this instance was worth copying\n",
+           appid);
+      assert(entry!=NULL);
+      mstro_pm_cdo_registry__set_state(cdoid, appid,
+                                       MSTRO_CDO_STATE_WITHDRAWN,
+                                       entry);
+      *immediate_withdraw=true;
+    } else {
+      /* only one offer */
+      WITH_CDO_ID_STR(idstr, cdoid, DEBUG("Only one OFFER of CDO %s in pool\n", idstr););
+      /* check whether there is any REQUIRE or DEMAND for it */
+      if(num_required==0 && num_in_flight==0) {
+        WITH_CDO_ID_STR(idstr, cdoid,
+                        DEBUG("No outstanding REQUIRE or in-flight DEMAND for CDO %s, permitting WITHDRAW\n", idstr););
+        mstro_pm_cdo_registry__set_state(cdoid, appid,
+                                         MSTRO_CDO_STATE_WITHDRAWN,
+                                         entry);
+        *immediate_withdraw=true;
+      } else {
+        WITH_CDO_ID_STR(idstr, cdoid,
+                        DEBUG("Still have %zu REQUIREs and %zu in-flight DEMANDs for CDO %s outstanding\n",
+                              num_required, num_in_flight, idstr););
+        if(num_req_in_transport < num_required) {
+          INFO("Some REQUIREs are not in-transport yet; could check whether this instance is worth copying\n");
+        }
+        /* can't withdraw right away, need to schedule a wakeup on this CDO */
+        *immediate_withdraw=false;
+      }
+    }
+
+    return status;
+}
+
 
 /** withdraw CDOID for APPID
  *
@@ -1911,7 +2556,7 @@ mstro_pm_cdo_registry_withdraw(const struct mstro_cdo_id *cdoid,
 
   struct mstro_cdo_id head = *cdoid;
   head.local_id = MSTRO_CDO_LOCAL_ID_NONE;
-  
+
   WITH_LOCKED_CDO_REGISTRY({
       struct mstro_pm_cdo_registry_entry *regentry=NULL;
       HASH_FIND(hh, g_mstro_pm_cdo_registry,
@@ -1921,93 +2566,21 @@ mstro_pm_cdo_registry_withdraw(const struct mstro_cdo_id *cdoid,
         WITH_CDO_ID_STR(idstr, cdoid,
                         ERR("No CDO registry entry for WITHDRAWN CDO %s\n",
                             idstr););
-        s=MSTRO_NOENT;
+        s = MSTRO_NOENT;
         goto unlock_fail;
       }
-
-      /* count OFFERs */
-      struct per_app_cdo_entries *entry;
-      struct per_app_cdo_entries *tmp;
-      size_t num_offers=0;
-      size_t num_required=0;
-      size_t num_retracted=0;
-      size_t num_in_flight=0; /* demanded and in-transport */
-      size_t num_req_in_transport=0; /* required and in-transport (eager sends) */
-      HASH_ITER(hh, regentry->app_to_attributes, entry, tmp) {
-        struct cdo_handle_entry *h;
-        struct cdo_handle_entry *t;
-        HASH_ITER(hh, entry->handles, h, t) {
-          /* WITH_CDO_ID_STR(idstr, &head, { */
-          /*     DEBUG("Inspecting %s for app %" PRIappid " local-id %zu: %d (%s)\n", */
-          /*           idstr, entry->app, h->local_id, */
-          /*           h->cdo_state, */
-          /*           mstro_cdo_state_describe(h->cdo_state));}); */
-          if(h->cdo_state & MSTRO_CDO_STATE_OFFERED) {
-            num_offers++;
-          }
-          if(h->cdo_state & MSTRO_CDO_STATE_RETRACTED) {
-            num_retracted++;
-          }
-          if(h->cdo_state & MSTRO_CDO_STATE_REQUIRED) {
-            num_required++;
-            if(h->cdo_state & MSTRO_CDO_STATE_IN_TRANSPORT) {
-              num_req_in_transport++;
-            }
-          }
-          if(   (h->cdo_state & MSTRO_CDO_STATE_DEMANDED)
-             && (h->cdo_state & MSTRO_CDO_STATE_IN_TRANSPORT)) {
-            num_in_flight++;
-          }
-        }
-      }
-      assert(num_offers>0); /* because we must have one outstanding OFFER for it from appid */
-      /* find our entry */
-      HASH_FIND(hh, regentry->app_to_attributes,
-                &appid, sizeof(appid), entry);
-      if(entry==NULL) {
-        ERR("Failed to find our per-app CDO registry entry (num_offers %zu)\n",
-            num_offers);
-        s=MSTRO_NOENT;
-        goto unlock_fail;
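+      /* dispatch on the is_distributed flag recorded when the CDO's
+       * attributes were stored */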
+      if(regentry->is_distributed) {
+        s = mstro_pm_dist_cdo_registry_immediate_withdraw(cdoid, appid, regentry, &immediate_withdraw);
       }
-      
-      if(num_offers>1) {
-        /* easy, someone else has it too */
-        WITH_CDO_ID_STR(idstr, cdoid,
-                        DEBUG("Multiple OFFERs of CDO %s in pool\n", idstr););
-        INFO("FIXME: performing quick WITHDRAW for CDO from app %d;"
-             " could check whether this instance was worth copying\n",
-             appid);
-        assert(entry!=NULL);
-        mstro_pm_cdo_registry__set_state(cdoid, appid,
-                                         MSTRO_CDO_STATE_WITHDRAWN,
-                                         entry);
-        immediate_withdraw=true;
-      } else {
-        /* only one offer */
-        WITH_CDO_ID_STR(idstr, cdoid, DEBUG("Only one OFFER of CDO %s in pool\n", idstr););
-        /* check whether there is any REQUIRE or DEMAND for it */
-        if(num_required==0 && num_in_flight==0) {
-          WITH_CDO_ID_STR(idstr, cdoid,
-                          DEBUG("No outstanding REQUIRE or in-flight DEMAND for CDO %s, permitting WITHDRAW\n", idstr););
-          mstro_pm_cdo_registry__set_state(cdoid, appid,
-                                           MSTRO_CDO_STATE_WITHDRAWN,
-                                           entry);
-          immediate_withdraw=true;
-        } else {
-          WITH_CDO_ID_STR(idstr, cdoid,
-                          DEBUG("Still have %zu REQUIREs and %zu in-flight DEMANDs for CDO %s outstanding\n",
-                                num_required, num_in_flight, idstr););
-          if(num_req_in_transport< num_required) {
-            INFO("Some REQUIREs are not in-transport yet; could check whether this instance is worth copying\n");
-          }
-          /* can't withdraw right away, need to schedule a wakeup on this CDO */
-          immediate_withdraw=false;
-        }
+      else {
+        s = mstro_pm_cdo_registry_immediate_withdraw(cdoid, appid, regentry, &immediate_withdraw);
       }
-   unlock_fail:
-      ;
-    });
+   unlock_fail:
+      ;
+    });
+
   if(s==MSTRO_OK) {
     if(immediate_withdraw) {
       /* CDO state changed, so can trigger caller continuation event */
@@ -2036,7 +2609,7 @@ mstro_pm_cdo_registry_withdraw(const struct mstro_cdo_id *cdoid,
       }
       mstro_stats_add_counter(MSTRO_STATS_CAT_POOL, MSTRO_STATS_L_PM_NUM_WITHDRAW_WAKEUPS, 1);
       WITH_CDO_ID_STR(idstr, cdoid,
-                      DEBUG("Scheduled retry of WITHDRAW for cdo %s for appid %zu\n", 
+                      DEBUG("Scheduled retry of WITHDRAW for cdo %s for appid %zu\n",
                             idstr, appid););
     }
   } else {
@@ -2082,6 +2655,8 @@ mstro_pm_cdo_registry_dispose(const struct mstro_cdo_id *cdo_id,
       }
       /* now delete from (app_to_attributes) table */
       HASH_DEL(app_entry->handles, handle_entry);
+      /* and kill entry */
+      s = mstro_pm__cdo_handle_entry__destroy(handle_entry);
       /* we don't remove the app entry from the overall CDO registry
        * at this point; this will be done by mstro_pm_app_deregister
        * which will notice we don't have any handles for any CDOs
diff --git a/maestro/statistics.c b/maestro/statistics.c
index 8b2465c9b04bbbe67bb772745c26b799e2f7160c..733f33aa42cd272b9dc3402bf387164b82301649 100644
--- a/maestro/statistics.c
+++ b/maestro/statistics.c
@@ -56,7 +56,6 @@
 #define WARN(...)  LOG_WARN(MSTRO_LOG_MODULE_STATS,__VA_ARGS__)
 #define ERR(...)   LOG_ERR(MSTRO_LOG_MODULE_STATS,__VA_ARGS__)
 
-
 /* timing stuff */
 #if defined(__linux)
 #  define HAVE_POSIX_TIMER
@@ -100,29 +99,6 @@ mstro_stats__init_clock(void)
 #endif
 }
 
-mstro_nanosec_t
-mstro_clock(void)
-{
-#if defined(__APPLE__)
-  mstro_nanosec_t now = clock_gettime_nsec_np(CLOCKID);
-    /* now = mach_absolute_time(); */
-    /* now *= info.numer; */
-    /* now /= info.denom; */
-    return now;
-#elif defined(__linux)
-    mstro_nanosec_t now;
-    struct timespec spec;
-    clock_gettime(CLOCKID, &spec);
-    now = spec.tv_sec * 1.0e9 + spec.tv_nsec;
-    return now;
-#elif defined(_WIN32)
-    LARGE_INTEGER now;
-    QueryPerformanceCounter(&now);
-    return (uint64_t) ((1e9 * now.QuadPart)  / win_frequency.QuadPart);
-#else
-    #error Unsupported system class
-#endif
-}
 
 #include <stdint.h>
 
diff --git a/maestro/subscription_registry.c b/maestro/subscription_registry.c
index bd3056a1bfec345e7cad2ab3623d8efb6dda6960..7ecd110919f1fb9fc3b7b1da41bcb05b9a4b3180 100644
--- a/maestro/subscription_registry.c
+++ b/maestro/subscription_registry.c
@@ -443,8 +443,8 @@ mstro_subscription_register__local(mstro_subscription subscription,
     }
     subscription->handle.id = id;
     
-    DEBUG("Assigned local unique id %" PRIu64 " (event %p) to subscription %p\n",
-          subscription->handle.id, e->subscription_event_handle, subscription);
+    INFO("Assigned local unique id %" PRIu64 " (event %p) to subscription %p from app %" PRIappid "\n",
+         subscription->handle.id, e->subscription_event_handle, subscription, origin);
   } else {
     /* proxied subscriptions, handle assigned by PM */
     e->subscription_event_handle=NULL;
@@ -1297,7 +1297,7 @@ mstro_subscription_message_event_ack(const Mstro__Pool__EventAck *msg)
   if(msg==NULL)
     return MSTRO_INVARG;
 
-  INFO("Incoming ack on subscription id %" PRIu64 "\n",
+  INFO("Incoming ack on subscription, event id %" PRIu64 "\n",
        msg->serial);
 
   assert(g_subscription_table.edom!=NULL);
@@ -1863,6 +1863,10 @@ mstro_pool_event_consume(const Mstro__Pool__Event *eventmsg)
         eventmsg->kind, ev->kind, mstro_pool_event_description(ev->kind));
   ev->serial = eventmsg->serial;
 
+  assert(eventmsg->ctime!=NULL);
+  
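+  /* fold the wire-format creation timestamp (seconds + nanoseconds) into
+   * the single nanosecond value used for application-side events */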
+  ev->ctime = (eventmsg->ctime->sec * NSEC_PER_SEC) + eventmsg->ctime->nsec;
+
   switch(ev->kind) {
     case MSTRO_POOL_EVENT_APP_JOIN:
       assert(eventmsg->payload_case==MSTRO__POOL__EVENT__PAYLOAD_JOIN);
@@ -1878,7 +1882,7 @@ mstro_pool_event_consume(const Mstro__Pool__Event *eventmsg)
         ev->join.appid = MSTRO_APP_ID_INVALID;
         
       
-      DEBUG("Event: %s JOINed (appid %" PRIappid ")\n",
+      DEBUG("Event: %s JOINed (app %" PRIappid ")\n",
             ev->join.component_name, ev->join.appid);
       break;
       
@@ -1912,11 +1916,13 @@ mstro_pool_event_consume(const Mstro__Pool__Event *eventmsg)
       const char* cdo_name;
       cdo_name = malloc(MSTRO_CDO_NAME_MAX);
       assert(cdo_name!=NULL);
-      struct mstro_cdo_id id = {
-        .qw[0] = eventmsg->offer->cdoid->qw0,
-        .qw[1] = eventmsg->offer->cdoid->qw1
-        //, .local_id = eventmsg->offer->cdoid->local_id;
-      };
+      /* struct mstro_cdo_id id = { */
+      /*   .qw[0] = eventmsg->offer->cdoid->qw0, */
+      /*   .qw[1] = eventmsg->offer->cdoid->qw1, */
+      /*   .local_id = MSTRO_CDO_LOCAL_ID_NONE */
+      /*   // FIXME: Why do we not set the actual local_id? (I don't remember --uuh) */
+      /*   //, .local_id = eventmsg->offer->cdoid->local_id; */
+      /* }; */
       /* we cannot call the resolver in here, as that would lock up
        * our thread in the PC. But CDO-related events should have a
        * NAME value */
@@ -1982,6 +1988,28 @@ mstro_pool_event_consume(const Mstro__Pool__Event *eventmsg)
             ev->require.cdo_name, ev->require.appid);
       break;
 
+    case MSTRO_POOL_EVENT_DEMAND:
+      assert(eventmsg->payload_case==MSTRO__POOL__EVENT__PAYLOAD_DEMAND);
+      assert(eventmsg->origin_id!=NULL
+             && eventmsg->origin_id->id!=MSTRO_APP_ID_INVALID);
+      assert(eventmsg->demand->cdoid!=NULL);
+      ev->demand.appid = eventmsg->origin_id->id;
+      if(eventmsg->cdo_name==NULL) {
+        ERR("DEMAND event missing a CDO name\n");
+        free(ev);
+        return MSTRO_FAIL;
+      } else {
+        ev->demand.cdo_name = strdup(eventmsg->cdo_name);
+        if(ev->demand.cdo_name == NULL) {
+          ERR("Failed to allocate event data\n");
+          free(ev);
+          return MSTRO_NOMEM;
+        }
+      }
+
+      DEBUG("Event: DEMAND for |%s| from app %" PRIappid "\n",
+            ev->demand.cdo_name, ev->demand.appid);
+      break;
+
     case MSTRO_POOL_EVENT_WITHDRAW:
       assert(eventmsg->payload_case==MSTRO__POOL__EVENT__PAYLOAD_WITHDRAW);
       assert(eventmsg->origin_id!=NULL
@@ -2013,7 +2041,7 @@ mstro_pool_event_consume(const Mstro__Pool__Event *eventmsg)
       break;
 
     case MSTRO_POOL_EVENT_SEAL_GROUP:
-    case MSTRO_POOL_EVENT_DEMAND:
     case MSTRO_POOL_EVENT_RETRACT:
     case MSTRO_POOL_EVENT_DISPOSE:
     case MSTRO_POOL_EVENT_TRANSPORT_INIT:
diff --git a/maestro/tpl.c b/maestro/tpl.c
deleted file mode 100644
index fe86b15fbc1c30cdd3d735a022974bb18d13971d..0000000000000000000000000000000000000000
--- a/maestro/tpl.c
+++ /dev/null
@@ -1,2481 +0,0 @@
-/*
-Copyright (c) 2005-2013, Troy D. Hanson     http://troydhanson.github.com/tpl/
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#define TPL_VERSION 1.6
-
-static const char id[]="$Id: tpl.c 192 2009-04-24 10:35:30Z thanson $";
-
-
-#include <stdlib.h>  /* malloc */
-#include <stdarg.h>  /* va_list */
-#include <string.h>  /* memcpy, memset, strchr */
-#include <stdio.h>   /* printf (tpl_hook.oops default function) */
-
-#ifndef _WIN32
-#include <unistd.h>     /* for ftruncate */
-#else
-#include <io.h>
-#define ftruncate(x,y) _chsize(x,y)
-#endif
-#include <sys/types.h>  /* for 'open' */
-#include <sys/stat.h>   /* for 'open' */
-#include <fcntl.h>      /* for 'open' */
-#include <errno.h>
-#ifndef _WIN32
-#include <inttypes.h>   /* uint32_t, uint64_t, etc */
-#else
-typedef unsigned short ushort;
-typedef __int16 int16_t;
-typedef __int32 int32_t;
-typedef __int64 int64_t;
-typedef unsigned __int16 uint16_t;
-typedef unsigned __int32 uint32_t;
-typedef unsigned __int64 uint64_t;
-#endif
-
-#ifndef S_ISREG
-#define S_ISREG(mode)  (((mode) & S_IFMT) == S_IFREG)
-#endif
-
-#if ( defined __CYGWIN__ || defined __MINGW32__ || defined _WIN32 )
-#include "win/mman.h"   /* mmap */
-#else
-#include <sys/mman.h>   /* mmap */
-#endif
-
-#include "maestro/i_tpl.h"
-
-#define TPL_GATHER_BUFLEN 8192
-#define TPL_MAGIC "tpl"
-
-/* macro to add a structure to a doubly-linked list */
-#define DL_ADD(head,add)                                        \
-    do {                                                        \
-        if (head) {                                             \
-            (add)->prev = (head)->prev;                         \
-            (head)->prev->next = (add);                         \
-            (head)->prev = (add);                               \
-            (add)->next = NULL;                                 \
-        } else {                                                \
-            (head)=(add);                                       \
-            (head)->prev = (head);                              \
-            (head)->next = NULL;                                \
-        }                                                       \
-    } while (0);
-
-#define fatal_oom() tpl_hook.fatal("out of memory\n")
-
-/* bit flags (internal). preceded by the external flags in tpl.h */
-#define TPL_WRONLY         (1 << 9)  /* app has initiated tpl packing  */
-#define TPL_RDONLY         (1 << 10)  /* tpl was loaded (for unpacking) */
-#define TPL_XENDIAN        (1 << 11)  /* swap endianness when unpacking */
-#define TPL_OLD_STRING_FMT (1 << 12) /* tpl has strings in 1.2 format */
-
-/* values for the flags byte that appears after the magic prefix */
-#define TPL_SUPPORTED_BITFLAGS 3
-#define TPL_FL_BIGENDIAN   (1 << 0)
-#define TPL_FL_NULLSTRINGS (1 << 1)
-
-/* char values for node type */
-#define TPL_TYPE_ROOT   0
-#define TPL_TYPE_INT32  1
-#define TPL_TYPE_UINT32 2
-#define TPL_TYPE_BYTE   3
-#define TPL_TYPE_STR    4
-#define TPL_TYPE_ARY    5
-#define TPL_TYPE_BIN    6
-#define TPL_TYPE_DOUBLE 7
-#define TPL_TYPE_INT64  8
-#define TPL_TYPE_UINT64 9
-#define TPL_TYPE_INT16  10
-#define TPL_TYPE_UINT16 11
-#define TPL_TYPE_POUND  12
-
-/* error codes */
-#define ERR_NOT_MINSIZE        (-1)
-#define ERR_MAGIC_MISMATCH     (-2)
-#define ERR_INCONSISTENT_SZ    (-3)
-#define ERR_FMT_INVALID        (-4)
-#define ERR_FMT_MISSING_NUL    (-5)
-#define ERR_FMT_MISMATCH       (-6)
-#define ERR_FLEN_MISMATCH      (-7)
-#define ERR_INCONSISTENT_SZ2   (-8)
-#define ERR_INCONSISTENT_SZ3   (-9)
-#define ERR_INCONSISTENT_SZ4   (-10)
-#define ERR_UNSUPPORTED_FLAGS  (-11)
-
-/* access to A(...) nodes by index */
-typedef struct tpl_pidx {
-    struct tpl_node *node;
-    struct tpl_pidx *next,*prev;
-} tpl_pidx;
-
-/* A(...) node datum */
-typedef struct tpl_atyp {
-    uint32_t num;    /* num elements */
-    size_t sz;       /* size of each backbone's datum */
-    struct tpl_backbone *bb,*bbtail; 
-    void *cur;                       
-} tpl_atyp;
-
-/* backbone to extend A(...) lists dynamically */
-typedef struct tpl_backbone {
-    struct tpl_backbone *next;
-    /* when this structure is malloc'd, extra space is alloc'd at the
-     * end to store the backbone "datum", and data points to it. */
-#if __STDC_VERSION__ < 199901
-    char *data;  
-#else
-    char data[];
-#endif
-} tpl_backbone;
-
-/* mmap record */
-typedef struct tpl_mmap_rec {
-    int fd;
-    void *text;
-    size_t text_sz;
-} tpl_mmap_rec;
-
-/* root node datum */
-typedef struct tpl_root_data {
-    int flags;
-    tpl_pidx *pidx;
-    tpl_mmap_rec mmap;
-    char *fmt;
-    int *fxlens, num_fxlens;
-} tpl_root_data;
-
-/* node type to size mapping */
-struct tpl_type_t {
-    char c;
-    int sz;
-};
-
-
-/* Internal prototypes */
-static tpl_node *tpl_node_new(tpl_node *parent);
-static tpl_node *tpl_find_i(tpl_node *n, int i);
-static void *tpl_cpv(void *datav, const void *data, size_t sz);
-static void *tpl_extend_backbone(tpl_node *n);
-static char *tpl_fmt(tpl_node *r);
-static void *tpl_dump_atyp(tpl_node *n, tpl_atyp* at, void *dv);
-static size_t tpl_ser_osz(tpl_node *n);
-static void tpl_free_atyp(tpl_node *n,tpl_atyp *atyp);
-static int tpl_dump_to_mem(tpl_node *r, void *addr, size_t sz);
-static int tpl_mmap_file(char *filename, tpl_mmap_rec *map_rec);
-static int tpl_mmap_output_file(char *filename, size_t sz, void **text_out);
-static int tpl_cpu_bigendian(void);
-static int tpl_needs_endian_swap(void *);
-static void tpl_byteswap(void *word, int len);
-static void tpl_fatal(const char *fmt, ...);
-static int tpl_serlen(tpl_node *r, tpl_node *n, void *dv, size_t *serlen);
-static int tpl_unpackA0(tpl_node *r);
-static int tpl_oops(const char *fmt, ...);
-static int tpl_gather_mem( char *buf, size_t len, tpl_gather_t **gs, tpl_gather_cb *cb, void *data);
-static int tpl_gather_nonblocking( int fd, tpl_gather_t **gs, tpl_gather_cb *cb, void *data);
-static int tpl_gather_blocking(int fd, void **img, size_t *sz);
-
-/* This is used internally to help calculate padding when a 'double' 
- * follows a smaller datatype in a structure. Normally under gcc
- * on x86, d will be aligned at +4, however use of -malign-double
- * causes d to be aligned at +8 (this is actually faster on x86).
- * Also SPARC and x86_64 seem to align always on +8. 
- */
-struct tpl_double_alignment_detector {
-    char a;
-    double d;  /* some platforms align this on +4, others on +8 */
-};
-
-/* this is another case where alignment varies. mac os x/gcc was observed
- * to align the int64_t at +4 under -m32 and at +8 under -m64 */
-struct tpl_int64_alignment_detector {
-    int i;
-    int64_t j;  /* some platforms align this on +4, others on +8 */
-};
-
-typedef struct {
-  size_t inter_elt_len; /* padded inter-element len; i.e. &a[1].field - &a[0].field */
-  tpl_node *iter_start_node; /* node to jump back to, as we start each new iteration */
-  size_t iternum; /* current iteration number (total req'd. iter's in n->num) */
-} tpl_pound_data;
-
-/* Hooks for customizing tpl mem alloc, error handling, etc. Set defaults. */
-tpl_hook_t tpl_hook = {
-    /* .oops =       */ tpl_oops,
-    /* .malloc =     */ malloc,
-    /* .realloc =    */ realloc,
-    /* .free =       */ free,
-    /* .fatal =      */ tpl_fatal,
-    /* .gather_max = */ 0 /* max tpl size (bytes) for tpl_gather */
-};
-
-static const char tpl_fmt_chars[] = "AS($)BiucsfIUjv#"; /* valid format chars */
-static const char tpl_S_fmt_chars[] = "iucsfIUjv#$()"; /* valid within S(...) */
-static const char tpl_datapeek_ok_chars[] = "iucsfIUjv"; /* valid in datapeek */
-static const struct tpl_type_t tpl_types[] = {
-    /* [TPL_TYPE_ROOT] =   */  {'r', 0},
-    /* [TPL_TYPE_INT32] =  */  {'i', sizeof(int32_t)},
-    /* [TPL_TYPE_UINT32] = */  {'u', sizeof(uint32_t)},
-    /* [TPL_TYPE_BYTE] =   */  {'c', sizeof(char)},
-    /* [TPL_TYPE_STR] =    */  {'s', sizeof(char*)},
-    /* [TPL_TYPE_ARY] =    */  {'A', 0},
-    /* [TPL_TYPE_BIN] =    */  {'B', 0},
-    /* [TPL_TYPE_DOUBLE] = */  {'f', 8}, /* not sizeof(double) as that varies */
-    /* [TPL_TYPE_INT64] =  */  {'I', sizeof(int64_t)},
-    /* [TPL_TYPE_UINT64] = */  {'U', sizeof(uint64_t)},
-    /* [TPL_TYPE_INT16] =  */  {'j', sizeof(int16_t)},
-    /* [TPL_TYPE_UINT16] = */  {'v', sizeof(uint16_t)},
-    /* [TPL_TYPE_POUND] =  */  {'#', 0},
-};
-
-/* default error-reporting function. Just writes to stderr. */
-static int tpl_oops(const char *fmt, ...) {
-    va_list ap;
-    va_start(ap,fmt);
-    vfprintf(stderr,fmt,ap);
-    va_end(ap);
-    return 0;
-}
-
-
-static tpl_node *tpl_node_new(tpl_node *parent) {
-    tpl_node *n;
-    if ((n=tpl_hook.malloc(sizeof(tpl_node))) == NULL) {
-        fatal_oom();
-    }
-    n->addr=NULL;
-    n->data=NULL;
-    n->num=1;
-    n->ser_osz=0;
-    n->children=NULL;
-    n->next=NULL;
-    n->parent=parent;
-    return n;
-}
-
-/* Used in S(..) formats to pack several fields from a structure based on 
- * only the structure address. We need to calculate field addresses 
- * manually taking into account the size of the fields and intervening padding.
- * The wrinkle is that double is not normally aligned on x86-32 but the
- * -malign-double compiler option causes it to be. Double are aligned
- * on Sparc, and apparently on 64 bit x86. We use a helper structure 
- * to detect whether double is aligned in this compilation environment.
- */
-char *calc_field_addr(tpl_node *parent, int type,char *struct_addr, int ordinal) {
-    tpl_node *prev;
-    int offset;
-    int align_sz;
-
-    if (ordinal == 1) return struct_addr;  /* first field starts on structure address */
-
-    /* generate enough padding so field addr is divisible by it's align_sz. 4, 8, etc */
-    prev = parent->children->prev; 
-    switch(type) {
-      case TPL_TYPE_DOUBLE:
-        align_sz = sizeof(struct tpl_double_alignment_detector) > 12 ? 8 : 4; 
-        break;
-      case TPL_TYPE_INT64:
-      case TPL_TYPE_UINT64:
-        align_sz = sizeof(struct tpl_int64_alignment_detector) > 12 ? 8 : 4; 
-        break;
-      default:
-        align_sz = tpl_types[type].sz;
-        break;
-    }
-    offset = ((uintptr_t)prev->addr - (uintptr_t)struct_addr)
-            + (tpl_types[prev->type].sz * prev->num);
-    offset = (offset + align_sz - 1) / align_sz * align_sz;
-    return struct_addr + offset;
-}
-
-TPL_API tpl_node *tpl_map(char *fmt,...) {
-  va_list ap;
-  tpl_node *tn;
-
-  va_start(ap,fmt);
-  tn = tpl_map_va(fmt, ap);
-  va_end(ap);
-  return tn;
-}
-
-TPL_API tpl_node *tpl_map_va(char *fmt, va_list ap) {
-    int lparen_level=0,expect_lparen=0,t=0,in_structure=0,ordinal=0;
-    int in_nested_structure=0;
-    char *c, *peek, *struct_addr=NULL, *struct_next;
-    tpl_node *root,*parent,*n=NULL,*preceding,*iter_start_node=NULL,
-             *struct_widest_node=NULL, *np; tpl_pidx *pidx;
-    tpl_pound_data *pd;
-    int *fxlens, num_fxlens, pound_num, pound_prod, applies_to_struct;
-    int contig_fxlens[10]; /* temp space for contiguous fxlens */
-    size_t num_contig_fxlens, i, j;
-    ptrdiff_t inter_elt_len=0; /* padded element length of contiguous structs in array */
-
-
-    root = tpl_node_new(NULL);
-    root->type = TPL_TYPE_ROOT; 
-    root->data = (tpl_root_data*)tpl_hook.malloc(sizeof(tpl_root_data));
-    if (!root->data) fatal_oom();
-    memset((tpl_root_data*)root->data,0,sizeof(tpl_root_data));
-
-    /* set up root nodes special ser_osz to reflect overhead of preamble */
-    root->ser_osz =  sizeof(uint32_t); /* tpl leading length */
-    root->ser_osz += strlen(fmt) + 1;  /* fmt + NUL-terminator */
-    root->ser_osz += 4;                /* 'tpl' magic prefix + flags byte */
-
-    parent=root;
-
-    c=fmt;
-    while (*c != '\0') {
-        switch (*c) {
-            case 'c':
-            case 'i':
-            case 'u':
-            case 'j':
-            case 'v':
-            case 'I':
-            case 'U':
-            case 'f':
-                if      (*c=='c') t=TPL_TYPE_BYTE;
-                else if (*c=='i') t=TPL_TYPE_INT32;
-                else if (*c=='u') t=TPL_TYPE_UINT32;
-                else if (*c=='j') t=TPL_TYPE_INT16;
-                else if (*c=='v') t=TPL_TYPE_UINT16;
-                else if (*c=='I') t=TPL_TYPE_INT64;
-                else if (*c=='U') t=TPL_TYPE_UINT64;
-                else if (*c=='f') t=TPL_TYPE_DOUBLE;
-
-                if (expect_lparen) goto fail;
-                n = tpl_node_new(parent);
-                n->type = t;
-                if (in_structure) {
-                    if (ordinal == 1) {
-                      /* for S(...)# iteration. Apply any changes to case 's' too!!! */
-                      iter_start_node = n; 
-                      struct_widest_node = n;
-                    }
-                    if (tpl_types[n->type].sz > tpl_types[struct_widest_node->type].sz) {
-                      struct_widest_node = n;
-                    }
-                    n->addr = calc_field_addr(parent,n->type,struct_addr,ordinal++);
-                } else n->addr = (void*)va_arg(ap,void*);
-                n->data = tpl_hook.malloc(tpl_types[t].sz);
-                if (!n->data) fatal_oom();
-                if (n->parent->type == TPL_TYPE_ARY) 
-                    ((tpl_atyp*)(n->parent->data))->sz += tpl_types[t].sz;
-                DL_ADD(parent->children,n);
-                break;
-            case 's':
-                if (expect_lparen) goto fail;
-                n = tpl_node_new(parent);
-                n->type = TPL_TYPE_STR;
-                if (in_structure) {
-                    if (ordinal == 1) {
-                      iter_start_node = n; /* for S(...)# iteration */
-                      struct_widest_node = n;
-                    }
-                    if (tpl_types[n->type].sz > tpl_types[struct_widest_node->type].sz) {
-                      struct_widest_node = n;
-                    }
-                    n->addr = calc_field_addr(parent,n->type,struct_addr,ordinal++);
-                } else n->addr = (void*)va_arg(ap,void*);
-                n->data = tpl_hook.malloc(sizeof(char*));
-                if (!n->data) fatal_oom();
-                *(char**)(n->data) = NULL;
-                if (n->parent->type == TPL_TYPE_ARY) 
-                    ((tpl_atyp*)(n->parent->data))->sz += sizeof(void*);
-                DL_ADD(parent->children,n);
-                break;
-            case '#':
-                /* apply a 'num' to preceding atom */
-                if (!parent->children) goto fail;
-                preceding = parent->children->prev; /* first child's prev is 'last child'*/
-                t = preceding->type;
-                applies_to_struct = (*(c-1) == ')') ? 1 : 0;
-                if (!applies_to_struct) {
-                  if (!(t == TPL_TYPE_BYTE   || t == TPL_TYPE_INT32 ||
-                        t == TPL_TYPE_UINT32 || t == TPL_TYPE_DOUBLE ||
-                        t == TPL_TYPE_UINT64 || t == TPL_TYPE_INT64 || 
-                        t == TPL_TYPE_UINT16 || t == TPL_TYPE_INT16 || 
-                        t == TPL_TYPE_STR )) goto fail;
-                }
-                /* count up how many contiguous # and form their product */
-                pound_prod=1;
-                num_contig_fxlens=0;
-                for(peek=c; *peek == '#'; peek++) {
-                  pound_num = va_arg(ap, int);
-                  if (pound_num < 1) {
-                    tpl_hook.fatal("non-positive iteration count %d\n", pound_num);
-                  }
-                  if (num_contig_fxlens >= (sizeof(contig_fxlens)/sizeof(contig_fxlens[0]))) {
-                    tpl_hook.fatal("contiguous # exceeds hardcoded limit\n");
-                  }
-                  contig_fxlens[num_contig_fxlens++] = pound_num;
-                  pound_prod *= pound_num;
-                }
-                /* increment c to skip contiguous # so its points to last one */
-                c = peek-1;
-                /* differentiate atom-# from struct-# by noting preceding rparen */
-                if (applies_to_struct) { /* insert # node to induce looping */
-                  n = tpl_node_new(parent);
-                  n->type = TPL_TYPE_POUND;
-                  n->num = pound_prod;
-                  n->data = tpl_hook.malloc(sizeof(tpl_pound_data));
-                  if (!n->data) fatal_oom();
-                  pd = (tpl_pound_data*)n->data;
-                  pd->inter_elt_len = inter_elt_len;
-                  pd->iter_start_node = iter_start_node; 
-                  pd->iternum = 0;
-                  DL_ADD(parent->children,n);
-                  /* multiply the 'num' and data space on each atom in the structure */
-                  for(np = iter_start_node; np != n; np = np->next) {
-                    if (n->parent->type == TPL_TYPE_ARY) {
-                      ((tpl_atyp*)(n->parent->data))->sz += 
-                         tpl_types[np->type].sz * (np->num * (n->num - 1));
-                    }
-                    np->data = tpl_hook.realloc(np->data, tpl_types[np->type].sz * 
-                                                          np->num * n->num);
-                    if (!np->data) fatal_oom();
-                    memset(np->data, 0, tpl_types[np->type].sz * np->num * n->num);
-                  }
-                } else { /* simple atom-# form does not require a loop */
-                  preceding->num = pound_prod;
-                  preceding->data = tpl_hook.realloc(preceding->data, 
-                      tpl_types[t].sz * preceding->num);
-                  if (!preceding->data) fatal_oom();
-                  memset(preceding->data,0,tpl_types[t].sz * preceding->num);
-                  if (n->parent->type == TPL_TYPE_ARY) {
-                      ((tpl_atyp*)(n->parent->data))->sz += tpl_types[t].sz * 
-                                                            (preceding->num-1);
-                  }
-                }
-                root->ser_osz += (sizeof(uint32_t) * num_contig_fxlens);
-
-                j = ((tpl_root_data*)root->data)->num_fxlens; /* before incrementing */
-                (((tpl_root_data*)root->data)->num_fxlens) += num_contig_fxlens;
-                num_fxlens = ((tpl_root_data*)root->data)->num_fxlens; /* new value */
-                fxlens = ((tpl_root_data*)root->data)->fxlens;
-                fxlens = tpl_hook.realloc(fxlens, sizeof(int) * num_fxlens);
-                if (!fxlens) fatal_oom();
-                ((tpl_root_data*)root->data)->fxlens = fxlens;
-                for(i=0; i < num_contig_fxlens; i++) fxlens[j++] = contig_fxlens[i];
-
-                break;
-            case 'B':
-                if (expect_lparen) goto fail;
-                if (in_structure) goto fail;
-                n = tpl_node_new(parent);
-                n->type = TPL_TYPE_BIN;
-                n->addr = (tpl_bin*)va_arg(ap,void*);
-                n->data = tpl_hook.malloc(sizeof(tpl_bin*));
-                if (!n->data) fatal_oom();
-                *((tpl_bin**)n->data) = NULL;
-                if (n->parent->type == TPL_TYPE_ARY) 
-                    ((tpl_atyp*)(n->parent->data))->sz += sizeof(tpl_bin);
-                DL_ADD(parent->children,n);
-                break;
-            case 'A':
-                if (in_structure) goto fail;
-                n = tpl_node_new(parent);
-                n->type = TPL_TYPE_ARY;
-                DL_ADD(parent->children,n);
-                parent = n;
-                expect_lparen=1;
-                pidx = (tpl_pidx*)tpl_hook.malloc(sizeof(tpl_pidx));
-                if (!pidx) fatal_oom();
-                pidx->node = n;
-                pidx->next = NULL;
-                DL_ADD(((tpl_root_data*)(root->data))->pidx,pidx);
-                /* set up the A's tpl_atyp */
-                n->data = (tpl_atyp*)tpl_hook.malloc(sizeof(tpl_atyp));
-                if (!n->data) fatal_oom();
-                ((tpl_atyp*)(n->data))->num = 0;
-                ((tpl_atyp*)(n->data))->sz = 0;
-                ((tpl_atyp*)(n->data))->bb = NULL;
-                ((tpl_atyp*)(n->data))->bbtail = NULL;
-                ((tpl_atyp*)(n->data))->cur = NULL;
-                if (n->parent->type == TPL_TYPE_ARY) 
-                    ((tpl_atyp*)(n->parent->data))->sz += sizeof(void*);
-                break;
-            case 'S':
-                if (in_structure) goto fail;
-                expect_lparen=1;
-                ordinal=1;  /* index upcoming atoms in S(..) */
-                in_structure=1+lparen_level; /* so we can tell where S fmt ends */
-                struct_addr = (char*)va_arg(ap,void*);
-                break;
-            case '$': /* nested structure */
-                if (!in_structure) goto fail;
-                expect_lparen=1;
-                in_nested_structure++;
-                break;
-            case ')':
-                lparen_level--;
-                if (lparen_level < 0) goto fail;
-                if (*(c-1) == '(') goto fail;
-                if (in_nested_structure) in_nested_structure--;
-                else if (in_structure && (in_structure-1 == lparen_level)) {
-                  /* calculate delta between contiguous structures in array */
-                  struct_next = calc_field_addr(parent, struct_widest_node->type, 
-                                                struct_addr, ordinal++);
-                  inter_elt_len = struct_next - struct_addr;
-                  in_structure=0;
-                }
-                else parent = parent->parent; /* rparen ends A() type, not S() type */
-                break;
-            case '(':
-                if (!expect_lparen) goto fail;
-                expect_lparen=0;
-                lparen_level++;
-                break;
-            default:
-                tpl_hook.oops("unsupported option %c\n", *c);
-                goto fail;
-        }
-        c++;
-    }
-    if (lparen_level != 0) goto fail;
-
-    /* copy the format string, save for convenience */
-    ((tpl_root_data*)(root->data))->fmt = tpl_hook.malloc(strlen(fmt)+1);
-    if (((tpl_root_data*)(root->data))->fmt == NULL) 
-        fatal_oom();
-    memcpy(((tpl_root_data*)(root->data))->fmt,fmt,strlen(fmt)+1);
-
-    return root;
-
-fail:
-    tpl_hook.oops("failed to parse %s\n", fmt);
-    tpl_free(root);
-    return NULL;
-}
-
-static int tpl_unmap_file( tpl_mmap_rec *mr) {
-
-    if ( munmap( mr->text, mr->text_sz ) == -1 ) {
-        tpl_hook.oops("Failed to munmap: %s\n", strerror(errno));
-    }
-    close(mr->fd);
-    mr->text = NULL;
-    mr->text_sz = 0;
-    return 0;
-}
-
-static void tpl_free_keep_map(tpl_node *r) {
-    int mmap_bits = (TPL_RDONLY|TPL_FILE);
-    int ufree_bits = (TPL_MEM|TPL_UFREE);
-    tpl_node *nxtc,*c;
-    int find_next_node=0,looking,i;
-    size_t sz;
-
-    /* For mmap'd files, or for 'ufree' memory images, do appropriate release */
-    if ((((tpl_root_data*)(r->data))->flags & mmap_bits) == mmap_bits) {
-        tpl_unmap_file( &((tpl_root_data*)(r->data))->mmap); 
-    } else if ((((tpl_root_data*)(r->data))->flags & ufree_bits) == ufree_bits) {
-        tpl_hook.free( ((tpl_root_data*)(r->data))->mmap.text );
-    }
-
-    c = r->children;
-    if (c) {
-        while(c->type != TPL_TYPE_ROOT) {    /* loop until we come back to root node */
-            switch (c->type) {
-                case TPL_TYPE_BIN:
-                    /* free any binary buffer hanging from tpl_bin */
-                    if ( *((tpl_bin**)(c->data)) ) {
-                        if ( (*((tpl_bin**)(c->data)))->addr ) {
-                            tpl_hook.free( (*((tpl_bin**)(c->data)))->addr );
-                        }
-                        *((tpl_bin**)c->data) = NULL; /* reset tpl_bin */
-                    }
-                    find_next_node=1;
-                    break;
-                case TPL_TYPE_STR:
-                    /* free any packed (copied) string */
-                    for(i=0; i < c->num; i++) {
-                      char *str = ((char**)c->data)[i];
-                      if (str) {
-                        tpl_hook.free(str);
-                        ((char**)c->data)[i] = NULL;
-                      }
-                    }
-                    find_next_node=1;
-                    break;
-                case TPL_TYPE_INT32:
-                case TPL_TYPE_UINT32:
-                case TPL_TYPE_INT64:
-                case TPL_TYPE_UINT64:
-                case TPL_TYPE_BYTE:
-                case TPL_TYPE_DOUBLE:
-                case TPL_TYPE_INT16:
-                case TPL_TYPE_UINT16:
-                case TPL_TYPE_POUND:
-                    find_next_node=1;
-                    break;
-                case TPL_TYPE_ARY:
-                    c->ser_osz = 0; /* zero out the serialization output size */
-
-                    sz = ((tpl_atyp*)(c->data))->sz;  /* save sz to use below */
-                    tpl_free_atyp(c,c->data);
-
-                    /* make new atyp */
-                    c->data = (tpl_atyp*)tpl_hook.malloc(sizeof(tpl_atyp));
-                    if (!c->data) fatal_oom();
-                    ((tpl_atyp*)(c->data))->num = 0;
-                    ((tpl_atyp*)(c->data))->sz = sz;  /* restore bb datum sz */
-                    ((tpl_atyp*)(c->data))->bb = NULL;
-                    ((tpl_atyp*)(c->data))->bbtail = NULL;
-                    ((tpl_atyp*)(c->data))->cur = NULL;
-
-                    c = c->children; 
-                    break;
-                default:
-                    tpl_hook.fatal("unsupported format character\n");
-                    break;
-            }
-
-            if (find_next_node) {
-                find_next_node=0;
-                looking=1;
-                while(looking) {
-                    if (c->next) {
-                        nxtc=c->next;
-                        c=nxtc;
-                        looking=0;
-                    } else {
-                        if (c->type == TPL_TYPE_ROOT) break; /* root node */
-                        else {
-                            nxtc=c->parent;
-                            c=nxtc;
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    ((tpl_root_data*)(r->data))->flags = 0;  /* reset flags */
-}
-
-TPL_API void tpl_free(tpl_node *r) {
-    int mmap_bits = (TPL_RDONLY|TPL_FILE);
-    int ufree_bits = (TPL_MEM|TPL_UFREE);
-    tpl_node *nxtc,*c;
-    int find_next_node=0,looking,i;
-    tpl_pidx *pidx,*pidx_nxt;
-
-    /* For mmap'd files, or for 'ufree' memory images, do appropriate release */
-    if ((((tpl_root_data*)(r->data))->flags & mmap_bits) == mmap_bits) {
-        tpl_unmap_file( &((tpl_root_data*)(r->data))->mmap); 
-    } else if ((((tpl_root_data*)(r->data))->flags & ufree_bits) == ufree_bits) {
-        tpl_hook.free( ((tpl_root_data*)(r->data))->mmap.text );
-    }
-
-    c = r->children;
-    if (c) {
-        while(c->type != TPL_TYPE_ROOT) {    /* loop until we come back to root node */
-            switch (c->type) {
-                case TPL_TYPE_BIN:
-                    /* free any binary buffer hanging from tpl_bin */
-                    if ( *((tpl_bin**)(c->data)) ) {
-                        if ( (*((tpl_bin**)(c->data)))->sz != 0 ) {
-                            tpl_hook.free( (*((tpl_bin**)(c->data)))->addr );
-                        }
-                        tpl_hook.free(*((tpl_bin**)c->data)); /* free tpl_bin */
-                    }
-                    tpl_hook.free(c->data);  /* free tpl_bin* */
-                    find_next_node=1;
-                    break;
-                case TPL_TYPE_STR:
-                    /* free any packed (copied) string */
-                    for(i=0; i < c->num; i++) {
-                      char *str = ((char**)c->data)[i];
-                      if (str) {
-                        tpl_hook.free(str);
-                        ((char**)c->data)[i] = NULL;
-                      }
-                    }
-                    tpl_hook.free(c->data);
-                    find_next_node=1;
-                    break;
-                case TPL_TYPE_INT32:
-                case TPL_TYPE_UINT32:
-                case TPL_TYPE_INT64:
-                case TPL_TYPE_UINT64:
-                case TPL_TYPE_BYTE:
-                case TPL_TYPE_DOUBLE:
-                case TPL_TYPE_INT16:
-                case TPL_TYPE_UINT16:
-                case TPL_TYPE_POUND:
-                    tpl_hook.free(c->data);
-                    find_next_node=1;
-                    break;
-                case TPL_TYPE_ARY:
-                    tpl_free_atyp(c,c->data);
-                    if (c->children) c = c->children; /* normal case */
-                    else find_next_node=1; /* edge case, handle bad format A() */
-                    break;
-                default:
-                    tpl_hook.fatal("unsupported format character\n");
-                    break;
-            }
-
-            if (find_next_node) {
-                find_next_node=0;
-                looking=1;
-                while(looking) {
-                    if (c->next) {
-                        nxtc=c->next;
-                        tpl_hook.free(c);
-                        c=nxtc;
-                        looking=0;
-                    } else {
-                        if (c->type == TPL_TYPE_ROOT) break; /* root node */
-                        else {
-                            nxtc=c->parent;
-                            tpl_hook.free(c);
-                            c=nxtc;
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    /* free root */
-    for(pidx=((tpl_root_data*)(r->data))->pidx; pidx; pidx=pidx_nxt) {
-        pidx_nxt = pidx->next;
-        tpl_hook.free(pidx);
-    }
-    tpl_hook.free(((tpl_root_data*)(r->data))->fmt);
-    if (((tpl_root_data*)(r->data))->num_fxlens > 0) {
-        tpl_hook.free(((tpl_root_data*)(r->data))->fxlens);
-    }
-    tpl_hook.free(r->data);  /* tpl_root_data */
-    tpl_hook.free(r);
-}
-
-
-/* Find the i'th packable ('A' node) */
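-/* (packable index i>0 selects the i'th 'A' node, in the order their pidx
-    records were appended while parsing the format string) */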
-static tpl_node *tpl_find_i(tpl_node *n, int i) {
-    int j=0;
-    tpl_pidx *pidx;
-    if (n->type != TPL_TYPE_ROOT) return NULL;
-    if (i == 0) return n;  /* packable 0 is root */
-    for(pidx=((tpl_root_data*)(n->data))->pidx; pidx; pidx=pidx->next) {
-        if (++j == i) return pidx->node;
-    }
-    return NULL;
-}
-
-static void *tpl_cpv(void *datav, const void *data, size_t sz) {
-    if (sz>0) memcpy(datav,data,sz);
-    return (void*)((uintptr_t)datav + sz);
-}
-
-static void *tpl_extend_backbone(tpl_node *n) {
-    tpl_backbone *bb;
-    bb = (tpl_backbone*)tpl_hook.malloc(sizeof(tpl_backbone) +
-      ((tpl_atyp*)(n->data))->sz );  /* datum hangs on coattails of bb */
-    if (!bb) fatal_oom();
-#if __STDC_VERSION__ < 199901
-    bb->data = (char*)((uintptr_t)bb + sizeof(tpl_backbone)); 
-#endif
-    memset(bb->data,0,((tpl_atyp*)(n->data))->sz);
-    bb->next = NULL;
-    /* Add the new backbone to the tail, also setting head if necessary  */
-    if (((tpl_atyp*)(n->data))->bb == NULL) {
-        ((tpl_atyp*)(n->data))->bb = bb;
-        ((tpl_atyp*)(n->data))->bbtail = bb;
-    } else {
-        ((tpl_atyp*)(n->data))->bbtail->next = bb;
-        ((tpl_atyp*)(n->data))->bbtail = bb;
-    }
-
-    ((tpl_atyp*)(n->data))->num++;
-    return bb->data;
-}
-
-/* Get the format string corresponding to a given tpl (root node) */
-static char *tpl_fmt(tpl_node *r) {
-    return ((tpl_root_data*)(r->data))->fmt;
-}
-
-/* Get the fmt # lengths as a contiguous buffer of ints (length num_fxlens) */
-static int *tpl_fxlens(tpl_node *r, int *num_fxlens) {
-    *num_fxlens = ((tpl_root_data*)(r->data))->num_fxlens;
-    return ((tpl_root_data*)(r->data))->fxlens;
-}
-
-/* called when serializing an 'A' type node into a buffer which has
- * already been set up with the proper space. The backbone, obtained
- * from the tpl_atyp header passed in, is walked.
- */
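-/* (each backbone link holds the packed data of one array element, appended
-    by tpl_pack via tpl_extend_backbone) */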
-static void *tpl_dump_atyp(tpl_node *n, tpl_atyp* at, void *dv) {
-    tpl_backbone *bb;
-    tpl_node *c;
-    void *datav;
-    uint32_t slen;
-    tpl_bin *binp;
-    char *strp;
-    tpl_atyp *atypp;
-    tpl_pound_data *pd;
-    int i;
-    size_t itermax;
-
-    /* handle 'A' nodes */
-    dv = tpl_cpv(dv,&at->num,sizeof(uint32_t));  /* array len */
-    for(bb=at->bb; bb; bb=bb->next) {
-        datav = bb->data;
-        c=n->children;
-        while(c) {
-            switch (c->type) {
-                case TPL_TYPE_BYTE:
-                case TPL_TYPE_DOUBLE:
-                case TPL_TYPE_INT32:
-                case TPL_TYPE_UINT32:
-                case TPL_TYPE_INT64:
-                case TPL_TYPE_UINT64:
-                case TPL_TYPE_INT16:
-                case TPL_TYPE_UINT16:
-                    dv = tpl_cpv(dv,datav,tpl_types[c->type].sz * c->num);
-                    datav = (void*)((uintptr_t)datav + tpl_types[c->type].sz * c->num);
-                    break;
-                case TPL_TYPE_BIN:
-                    /* dump the buffer length followed by the buffer */
-                    memcpy(&binp,datav,sizeof(tpl_bin*)); /* cp to aligned */
-                    slen = binp->sz;
-                    dv = tpl_cpv(dv,&slen,sizeof(uint32_t));
-                    dv = tpl_cpv(dv,binp->addr,slen);
-                    datav = (void*)((uintptr_t)datav + sizeof(tpl_bin*));
-                    break;
-                case TPL_TYPE_STR:
-                    /* dump the string length followed by the string */
-                    for(i=0; i < c->num; i++) {
-                      memcpy(&strp,datav,sizeof(char*)); /* cp to aligned */
-                      slen = strp ? (strlen(strp)+1) : 0;
-                      dv = tpl_cpv(dv,&slen,sizeof(uint32_t));
-                      if (slen > 1) dv = tpl_cpv(dv,strp,slen-1);
-                      datav = (void*)((uintptr_t)datav + sizeof(char*));
-                    }
-                    break;
-                case TPL_TYPE_ARY:
-                    memcpy(&atypp,datav,sizeof(tpl_atyp*)); /* cp to aligned */
-                    dv = tpl_dump_atyp(c,atypp,dv);
-                    datav = (void*)((uintptr_t)datav + sizeof(void*));
-                    break;
-                case TPL_TYPE_POUND:
-                    /* iterate over the preceding nodes */
-                    pd = (tpl_pound_data*)c->data;
-                    itermax = c->num;
-                    if (++(pd->iternum) < itermax) {
-                      c = pd->iter_start_node;
-                      continue;
-                    } else { /* loop complete. */
-                      pd->iternum = 0;
-                    }
-                    break;
-                default:
-                    tpl_hook.fatal("unsupported format character\n");
-                    break;
-            }
-            c=c->next;
-        }
-    }
-    return dv;
-}
-
-/* figure the serialization output size needed for tpl whose root is n */
-static size_t tpl_ser_osz(tpl_node *n) {
-    tpl_node *c, *np;
-    size_t sz, itermax;
-    tpl_bin *binp;
-    char *strp;
-    tpl_pound_data *pd;
-    int i;
-
-    /* handle the root node ONLY (subtrees' ser_osz values have been bubbled up) */
-    if (n->type != TPL_TYPE_ROOT) {
-        tpl_hook.fatal("internal error: tpl_ser_osz on non-root node\n");
-    }
-
-    sz = n->ser_osz;    /* start with fixed overhead, already stored */
-    c=n->children;
-    while (c) {
-        switch (c->type) {
-            case TPL_TYPE_BYTE:
-            case TPL_TYPE_DOUBLE:
-            case TPL_TYPE_INT32:
-            case TPL_TYPE_UINT32:
-            case TPL_TYPE_INT64:
-            case TPL_TYPE_UINT64:
-            case TPL_TYPE_INT16:
-            case TPL_TYPE_UINT16:
-                sz += tpl_types[c->type].sz * c->num;
-                break;
-            case TPL_TYPE_BIN:
-                sz += sizeof(uint32_t);  /* binary buf len */
-                memcpy(&binp,c->data,sizeof(tpl_bin*)); /* cp to aligned */
-                sz += binp->sz; 
-                break;
-            case TPL_TYPE_STR:
-                for(i=0; i < c->num; i++) {
-                  sz += sizeof(uint32_t);  /* string len */
-                  memcpy(&strp,&((char**)c->data)[i],sizeof(char*)); /* cp to aligned */
-                  sz += strp ? strlen(strp) : 0;
-                }
-                break;
-            case TPL_TYPE_ARY:
-                sz += sizeof(uint32_t);  /* array len */
-                sz += c->ser_osz;        /* bubbled-up child array ser_osz */
-                break;
-            case TPL_TYPE_POUND:
-                /* iterate over the preceding nodes */
-                itermax = c->num;
-                pd = (tpl_pound_data*)c->data;
-                if (++(pd->iternum) < itermax) {
-                  for(np=pd->iter_start_node; np != c; np = np->next) {
-                     np->data = (char*)(np->data) + 
-                                (tpl_types[np->type].sz * np->num);
-                  }
-                  c = pd->iter_start_node;
-                  continue;
-                } else { /* loop complete. */
-                  pd->iternum = 0;
-                  for(np=pd->iter_start_node; np != c; np = np->next) {
-                     np->data = (char*)(np->data) - ((itermax-1) * 
-                                                     tpl_types[np->type].sz * 
-                                                     np->num);
-                  }
-                }
-                break;
-            default:
-                tpl_hook.fatal("unsupported format character\n");
-                break;
-        }
-        c=c->next;
-    }
-    return sz;
-}
-
-
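-/* Serialize the tpl to a file, fd, or memory depending on mode; with
-   TPL_GETSIZE, only report the required size. (summary of the dispatch below) */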
-TPL_API int tpl_dump(tpl_node *r, int mode, ...) {
-    va_list ap;
-    char *filename, *bufv;
-    void **addr_out,*buf, *pa_addr;
-    int fd,rc=0;
-    size_t sz,*sz_out, pa_sz;
-    struct stat sbuf;
-
-    if (((tpl_root_data*)(r->data))->flags & TPL_RDONLY) {  /* unusual */
-        tpl_hook.oops("error: tpl_dump called for a loaded tpl\n");
-        return -1;
-    }
-
-    sz = tpl_ser_osz(r); /* compute the size needed to serialize  */
-
-    va_start(ap,mode);
-    if (mode & TPL_FILE) {
-        filename = va_arg(ap,char*);
-        fd = tpl_mmap_output_file(filename, sz, &buf);
-        if (fd == -1) rc = -1;
-        else {
-            rc = tpl_dump_to_mem(r,buf,sz);
-            if (msync(buf,sz,MS_SYNC) == -1) {
-                tpl_hook.oops("msync failed on fd %d: %s\n", fd, strerror(errno));
-            }
-            if (munmap(buf, sz) == -1) {
-                tpl_hook.oops("munmap failed on fd %d: %s\n", fd, strerror(errno));
-            }
-            close(fd);
-        }
-    } else if (mode & TPL_FD) {
-        fd = va_arg(ap, int);
-        if ( (buf = tpl_hook.malloc(sz)) == NULL) fatal_oom();
-        tpl_dump_to_mem(r,buf,sz);
-        bufv = buf;
-        do {
-            rc = write(fd,bufv,sz);
-            if (rc > 0) {
-                sz -= rc;
-                bufv += rc;
-            } else if (rc == -1) {
-                if (errno == EINTR || errno == EAGAIN) continue;
-                tpl_hook.oops("error writing to fd %d: %s\n", fd, strerror(errno));
-                free(buf);
-                /* attempt to rewind partial write to a regular file */
-                if (fstat(fd,&sbuf) == 0 && S_ISREG(sbuf.st_mode)) {
-                  if (ftruncate(fd,sbuf.st_size - (bufv-(char*)buf)) == -1) {
-                    tpl_hook.oops("can't rewind: %s\n", strerror(errno));
-                  }
-                }
-                return -1;
-            }
-        } while (sz > 0);
-        free(buf);
-        rc = 0;
-    } else if (mode & TPL_MEM) {
-        if (mode & TPL_PREALLOCD) { /* caller allocated */
-          pa_addr = (void*)va_arg(ap, void*);
-          pa_sz = va_arg(ap, size_t);
-          if (pa_sz < sz) {
-              tpl_hook.oops("tpl_dump: buffer too small, need %zu bytes\n", sz);
-              return -1;
-          }
-          rc=tpl_dump_to_mem(r,pa_addr,sz);
-        } else { /* we allocate */
-          addr_out = (void**)va_arg(ap, void*);
-          sz_out = va_arg(ap, size_t*);
-          if ( (buf = tpl_hook.malloc(sz)) == NULL) fatal_oom();
-          *sz_out = sz;
-          *addr_out = buf;
-          rc=tpl_dump_to_mem(r,buf,sz);
-        }
-    } else if (mode & TPL_GETSIZE) {
-        sz_out = va_arg(ap, size_t*);
-        *sz_out = sz;
-    } else {
-        tpl_hook.oops("unsupported tpl_dump mode %d\n", mode);
-        rc=-1;
-    }
-    va_end(ap);
-    return rc;
-}
-
-/* This function expects the caller to have set up a memory buffer of 
- * adequate size to hold the serialized tpl. The sz parameter must be
- * the result of tpl_ser_osz(r).
- */
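-/* The image written here (and later walked by tpl_sanity and
- * tpl_find_data_start below) is laid out as:
- *   [3-byte TPL_MAGIC][flags byte][uint32 total length]
- *   [format string + NUL][one uint32 per '#' in the format][packed data]
- */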
-static int tpl_dump_to_mem(tpl_node *r,void *addr,size_t sz) {
-    uint32_t slen, sz32;
-    int *fxlens, num_fxlens, i;
-    void *dv;
-    char *fmt,flags;
-    tpl_node *c, *np;
-    tpl_pound_data *pd;
-    size_t itermax;
-
-    fmt = tpl_fmt(r);
-    flags = 0;
-    if (tpl_cpu_bigendian()) flags |= TPL_FL_BIGENDIAN;
-    if (strchr(fmt,'s')) flags |= TPL_FL_NULLSTRINGS;
-    sz32 = sz; 
-
-    dv = addr;
-    dv = tpl_cpv(dv,TPL_MAGIC,3);         /* copy tpl magic prefix */
-    dv = tpl_cpv(dv,&flags,1);            /* copy flags byte */
-    dv = tpl_cpv(dv,&sz32,sizeof(uint32_t));/* overall length (inclusive) */
-    dv = tpl_cpv(dv,fmt,strlen(fmt)+1);   /* copy format with NUL-term */
-    fxlens = tpl_fxlens(r,&num_fxlens);
-    dv = tpl_cpv(dv,fxlens,num_fxlens*sizeof(uint32_t));/* fmt # lengths */
-
-    /* serialize the tpl content, iterating over direct children of root */
-    c = r->children;
-    while (c) {
-        switch (c->type) {
-            case TPL_TYPE_BYTE:
-            case TPL_TYPE_DOUBLE:
-            case TPL_TYPE_INT32:
-            case TPL_TYPE_UINT32:
-            case TPL_TYPE_INT64:
-            case TPL_TYPE_UINT64:
-            case TPL_TYPE_INT16:
-            case TPL_TYPE_UINT16:
-                dv = tpl_cpv(dv,c->data,tpl_types[c->type].sz * c->num);
-                break;
-            case TPL_TYPE_BIN:
-                slen = (*(tpl_bin**)(c->data))->sz;
-                dv = tpl_cpv(dv,&slen,sizeof(uint32_t));  /* buffer len */
-                dv = tpl_cpv(dv,(*(tpl_bin**)(c->data))->addr,slen); /* buf */
-                break;
-            case TPL_TYPE_STR:
-                for(i=0; i < c->num; i++) {
-                  char *str = ((char**)c->data)[i];
-                  slen = str ? strlen(str)+1 : 0;
-                  dv = tpl_cpv(dv,&slen,sizeof(uint32_t));  /* string len */
-                  if (slen>1) dv = tpl_cpv(dv,str,slen-1); /*string*/
-                }
-                break;
-            case TPL_TYPE_ARY:
-                dv = tpl_dump_atyp(c,(tpl_atyp*)c->data,dv);
-                break;
-            case TPL_TYPE_POUND:
-                 pd = (tpl_pound_data*)c->data;
-                 itermax = c->num;
-                 if (++(pd->iternum) < itermax) {
-
-                   /* in start or midst of loop. advance data pointers. */
-                   for(np=pd->iter_start_node; np != c; np = np->next) {
-                     np->data = (char*)(np->data) + 
-                                (tpl_types[np->type].sz * np->num);
-                   }
-                   /* do next iteration */
-                   c = pd->iter_start_node;
-                   continue;
-
-                 } else { /* loop complete. */
-                 
-                   /* reset iteration index and addr/data pointers. */
-                   pd->iternum = 0;
-                   for(np=pd->iter_start_node; np != c; np = np->next) {
-                     np->data = (char*)(np->data) - ((itermax-1) * 
-                                                     tpl_types[np->type].sz * 
-                                                     np->num);
-                   }
-
-                 }
-                 break;
-            default:
-                tpl_hook.fatal("unsupported format character\n");
-                break;
-        }
-        c = c->next;
-    }
-
-    return 0;
-}
-
-static int tpl_cpu_bigendian() {
-   unsigned i = 1;
-   char *c;
-   c = (char*)&i;
-   return (c[0] == 1 ? 0 : 1);
-}
-
-
-/*
- * algorithm for sanity-checking a tpl image:
- * scan the tpl whilst not exceeding the buffer size (bufsz),
- * formulating a calculated (expected) size of the tpl based
- * on walking its data. When calcsize has been calculated, it
- * should exactly match the buffer size (bufsz) and the internal
- * recorded size (intlsz).
- */
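-/* (excess_ok permits bufsz to exceed the serialized length, as used by
-   tpl_load's TPL_EXCESS_OK mode; otherwise the sizes must match exactly) */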
-static int tpl_sanity(tpl_node *r, int excess_ok) {
-    uint32_t intlsz;
-    int found_nul=0,rc, octothorpes=0, num_fxlens, *fxlens, flen;
-    void *d, *dv;
-    char intlflags, *fmt, c, *mapfmt;
-    size_t bufsz, serlen;
-
-    d = ((tpl_root_data*)(r->data))->mmap.text;
-    bufsz = ((tpl_root_data*)(r->data))->mmap.text_sz;
-
-    dv = d;
-    if (bufsz < (4 + sizeof(uint32_t) + 1)) return ERR_NOT_MINSIZE; /* min sz: magic+flags+len+nul */
-    if (memcmp(dv,TPL_MAGIC, 3) != 0) return ERR_MAGIC_MISMATCH; /* missing tpl magic prefix */
-    if (tpl_needs_endian_swap(dv)) ((tpl_root_data*)(r->data))->flags |= TPL_XENDIAN;
-    dv = (void*)((uintptr_t)dv + 3);
-    memcpy(&intlflags,dv,sizeof(char));  /* extract flags */
-    if (intlflags & ~TPL_SUPPORTED_BITFLAGS) return ERR_UNSUPPORTED_FLAGS;
-    /* TPL1.3 stores strings with a "length+1" prefix to distinguish NULL strings,
-       empty strings, and non-empty strings; TPL1.2 only handled the latter two.
-       So we need to be mindful of which string format we're reading from. */
-    if (!(intlflags & TPL_FL_NULLSTRINGS)) {
-      ((tpl_root_data*)(r->data))->flags |= TPL_OLD_STRING_FMT;
-    }
-    dv = (void*)((uintptr_t)dv + 1);
-    memcpy(&intlsz,dv,sizeof(uint32_t));  /* extract internal size */
-    if (((tpl_root_data*)(r->data))->flags & TPL_XENDIAN) tpl_byteswap(&intlsz, sizeof(uint32_t));
-    if (!excess_ok && (intlsz != bufsz)) return ERR_INCONSISTENT_SZ;  /* inconsistent buffer/internal size */
-    dv = (void*)((uintptr_t)dv + sizeof(uint32_t));
-
-    /* dv points to the start of the format string. Look for nul w/in buf sz */
-    fmt = (char*)dv;
-    while ((uintptr_t)dv-(uintptr_t)d < bufsz && !found_nul) {
-        if ( (c = *(char*)dv) != '\0') {
-            if (strchr(tpl_fmt_chars,c) == NULL) 
-               return ERR_FMT_INVALID;  /* invalid char in format string */
-            if ( (c = *(char*)dv) == '#') octothorpes++;
-            dv = (void*)((uintptr_t)dv + 1);
-        }
-        else found_nul = 1;
-    }
-    if (!found_nul) return ERR_FMT_MISSING_NUL;  /* runaway format string */
-    dv = (void*)((uintptr_t)dv + 1);   /* advance to octothorpe lengths buffer */
-    
-    /* compare the map format to the format of this tpl image */
-    mapfmt = tpl_fmt(r);
-    rc = strcmp(mapfmt,fmt);
-    if (rc != 0) return ERR_FMT_MISMATCH; 
-
-    /* compare octothorpe lengths in image to the mapped values */
-    if ((((uintptr_t)dv + (octothorpes * 4)) - (uintptr_t)d) > bufsz) return ERR_INCONSISTENT_SZ4;
-    fxlens = tpl_fxlens(r,&num_fxlens);  /* mapped fxlens */
-    while(num_fxlens--) {
-        memcpy(&flen,dv,sizeof(uint32_t)); /* stored flen */
-        if (((tpl_root_data*)(r->data))->flags & TPL_XENDIAN) tpl_byteswap(&flen, sizeof(uint32_t));
-        if (flen != *fxlens) return ERR_FLEN_MISMATCH;
-        dv = (void*)((uintptr_t)dv + sizeof(uint32_t));
-        fxlens++;
-    }
-
-    /* dv now points to beginning of data */
-    rc = tpl_serlen(r,r,dv,&serlen);  /* get computed serlen of data part */
-    if (rc == -1) return ERR_INCONSISTENT_SZ2; /* internal inconsistency in tpl image */
-    serlen += ((uintptr_t)dv - (uintptr_t)d);   /* add back serlen of preamble part */
-    if (excess_ok && (bufsz < serlen)) return ERR_INCONSISTENT_SZ3;  
-    if (!excess_ok && (serlen != bufsz)) return ERR_INCONSISTENT_SZ3;  /* buffer/internal sz does not match serlen */
-    return 0;
-}
-
-static void *tpl_find_data_start(void *d) {
-    int octothorpes=0;
-    d = (void*)((uintptr_t)d + 4); /* skip TPL_MAGIC and flags byte */
-    d = (void*)((uintptr_t)d + 4); /* skip uint32 overall len */
-    while(*(char*)d != '\0') {
-        if (*(char*)d == '#') octothorpes++;
-        d = (void*)((uintptr_t)d + 1);
-    }
-    d = (void*)((uintptr_t)d +  1);  /* skip NUL */
-    d = (void*)((uintptr_t)d +  (octothorpes * sizeof(uint32_t)));  /* skip # array lens */
-    return d;
-}
-
-static int tpl_needs_endian_swap(void *d) {
-    char *c;
-    int cpu_is_bigendian;
-    c = (char*)d;
-    cpu_is_bigendian = tpl_cpu_bigendian();
-    return ((c[3] & TPL_FL_BIGENDIAN) == cpu_is_bigendian) ? 0 : 1;
-}
-
-static size_t tpl_size_for(char c) {
-  size_t i;
-  for(i=0; i < sizeof(tpl_types)/sizeof(tpl_types[0]); i++) {
-    if (tpl_types[i].c == c) return tpl_types[i].sz;
-  }
-  return 0;
-}
-
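-/* Return a caller-owned copy of a tpl image's format string; TPL_FXLENS
-   additionally extracts the '#' lengths, TPL_DATAPEEK copies out leading
-   data elements. (summary of the logic below) */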
-TPL_API char* tpl_peek(int mode, ...) {
-    va_list ap;
-    int xendian=0,found_nul=0,old_string_format=0;
-    char *filename=NULL, *datapeek_f=NULL, *datapeek_c, *datapeek_s;
-    void *addr=NULL, *dv, *datapeek_p=NULL;
-    size_t sz=0, fmt_len, first_atom, num_fxlens=0;
-    uint32_t datapeek_ssz, datapeek_csz, datapeek_flen;
-    tpl_mmap_rec mr = {0,NULL,0};
-    char *fmt,*fmt_cpy=NULL,c;
-    uint32_t intlsz, **fxlens=NULL, *num_fxlens_out=NULL, *fxlensv;
-
-    va_start(ap,mode);
-    if ((mode & TPL_FXLENS) && (mode & TPL_DATAPEEK)) {
-        tpl_hook.oops("TPL_FXLENS and TPL_DATAPEEK mutually exclusive\n");
-        goto fail;
-    }
-    if (mode & TPL_FILE) filename = va_arg(ap,char *);
-    else if (mode & TPL_MEM) {
-        addr = va_arg(ap,void *);
-        sz = va_arg(ap,size_t);
-    } else {
-        tpl_hook.oops("unsupported tpl_peek mode %d\n", mode);
-        goto fail;
-    }
-    if (mode & TPL_DATAPEEK) {
-        datapeek_f = va_arg(ap, char*);
-    }
-    if (mode & TPL_FXLENS) {
-        num_fxlens_out = va_arg(ap,uint32_t *);
-        fxlens = va_arg(ap,uint32_t **);
-        *num_fxlens_out = 0;
-        *fxlens = NULL;
-    }
-
-    if (mode & TPL_FILE) {
-        if (tpl_mmap_file(filename, &mr) != 0) {
-            tpl_hook.oops("tpl_peek failed for file %s\n", filename);
-            goto fail;
-        }
-        addr = mr.text;
-        sz = mr.text_sz;
-    }
-
-    dv = addr;
-    if (sz < (4 + sizeof(uint32_t) + 1)) goto fail; /* min sz */
-    if (memcmp(dv,TPL_MAGIC, 3) != 0) goto fail; /* missing tpl magic prefix */
-    if (tpl_needs_endian_swap(dv)) xendian=1;
-    if ((((char*)dv)[3] & TPL_FL_NULLSTRINGS)==0) old_string_format=1;
-    dv = (void*)((uintptr_t)dv + 4);
-    memcpy(&intlsz,dv,sizeof(uint32_t));  /* extract internal size */
-    if (xendian) tpl_byteswap(&intlsz, sizeof(uint32_t));
-    if (intlsz != sz) goto fail;  /* inconsistent buffer/internal size */
-    dv = (void*)((uintptr_t)dv + sizeof(uint32_t));
-
-    /* dv points to the start of the format string. Look for nul w/in buf sz */
-    fmt = (char*)dv;
-    while ((uintptr_t)dv-(uintptr_t)addr < sz && !found_nul) {
-        if ( (c = *(char*)dv) == '\0') {
-            found_nul = 1;
-        } else if (c == '#') {
-          num_fxlens++;
-        }
-        dv = (void*)((uintptr_t)dv + 1);
-    }
-    if (!found_nul) goto fail;  /* runaway format string */
-    fmt_len = (char*)dv - fmt;  /* include space for \0 */
-    fmt_cpy = tpl_hook.malloc(fmt_len);
-    if (fmt_cpy == NULL) {
-        fatal_oom();
-    }
-    memcpy(fmt_cpy, fmt, fmt_len);
-
-    /* retrieve the octothorpic lengths if requested */
-    if (num_fxlens > 0) {
-      if (sz < ((uintptr_t)dv + (num_fxlens * sizeof(uint32_t)) - (uintptr_t)addr)) {
-        goto fail;
-      }
-    }
-    if ((mode & TPL_FXLENS) && (num_fxlens > 0)) {
-      *fxlens = tpl_hook.malloc(num_fxlens * sizeof(uint32_t));
-      if (*fxlens == NULL) tpl_hook.fatal("out of memory");
-      *num_fxlens_out = num_fxlens;
-      fxlensv = *fxlens;
-      while(num_fxlens--) {
-          memcpy(fxlensv,dv,sizeof(uint32_t)); 
-          if (xendian) tpl_byteswap(fxlensv, sizeof(uint32_t));
-          dv = (void*)((uintptr_t)dv + sizeof(uint32_t));
-          fxlensv++;
-      }
-    }
-    /* if caller requested, peek into the specified data elements */
-    if (mode & TPL_DATAPEEK) {
-
-       first_atom = strspn(fmt, "S()"); /* skip any leading S() */
-
-       datapeek_flen = strlen(datapeek_f);
-       if (strspn(datapeek_f, tpl_datapeek_ok_chars) < datapeek_flen) {
-         tpl_hook.oops("invalid TPL_DATAPEEK format: %s\n", datapeek_f);
-         tpl_hook.free(fmt_cpy); fmt_cpy = NULL; /* fail */
-         goto fail;
-       }
-
-       if (strncmp( &fmt[first_atom], datapeek_f, datapeek_flen) != 0) {
-         tpl_hook.oops("TPL_DATAPEEK format mismatches tpl image\n");
-         tpl_hook.free(fmt_cpy); fmt_cpy = NULL; /* fail */
-         goto fail;
-       }
-
-       /* advance to data start, then copy out requested elements */
-       dv = (void*)((uintptr_t)dv +  (num_fxlens * sizeof(uint32_t)));  
-       for(datapeek_c = datapeek_f; *datapeek_c != '\0'; datapeek_c++) {
-         datapeek_p = va_arg(ap, void*);
-         if (*datapeek_c == 's') {  /* special handling for strings */
-           if ((uintptr_t)dv-(uintptr_t)addr + sizeof(uint32_t) > sz) {
-             tpl_hook.oops("tpl_peek: tpl has insufficient length\n");
-             tpl_hook.free(fmt_cpy); fmt_cpy = NULL; /* fail */
-             goto fail;
-           }
-           memcpy(&datapeek_ssz,dv,sizeof(uint32_t)); /* get slen */
-           if (xendian) tpl_byteswap(&datapeek_ssz, sizeof(uint32_t));
-           if (old_string_format) datapeek_ssz++;
-           dv = (void*)((uintptr_t)dv + sizeof(uint32_t)); /* adv. to str */
-           if (datapeek_ssz == 0) datapeek_s = NULL;
-           else {
-             if ((uintptr_t)dv-(uintptr_t)addr + datapeek_ssz-1 > sz) {
-               tpl_hook.oops("tpl_peek: tpl has insufficient length\n");
-               tpl_hook.free(fmt_cpy); fmt_cpy = NULL; /* fail */
-               goto fail;
-             }
-             datapeek_s = tpl_hook.malloc(datapeek_ssz);
-             if (datapeek_s == NULL) fatal_oom();
-             memcpy(datapeek_s, dv, datapeek_ssz-1);
-             datapeek_s[datapeek_ssz-1] = '\0';
-             dv = (void*)((uintptr_t)dv + datapeek_ssz-1);
-           }
-           *(char**)datapeek_p = datapeek_s;
-         } else {
-           datapeek_csz = tpl_size_for(*datapeek_c);
-           if ((uintptr_t)dv-(uintptr_t)addr + datapeek_csz > sz) {
-             tpl_hook.oops("tpl_peek: tpl has insufficient length\n");
-             tpl_hook.free(fmt_cpy); fmt_cpy = NULL; /* fail */
-             goto fail;
-           }
-           memcpy(datapeek_p, dv, datapeek_csz);
-           if (xendian) tpl_byteswap(datapeek_p, datapeek_csz);
-           dv = (void*)((uintptr_t)dv + datapeek_csz);
-         }
-       }
-    }
-
-fail:
-    va_end(ap);
-    if ((mode & TPL_FILE) && mr.text != NULL) tpl_unmap_file( &mr );
-    return fmt_cpy;
-}
-
-/* tpl_jot(TPL_FILE, "file.tpl", "si", &s, &i); */
-/* tpl_jot(TPL_MEM, &buf, &sz, "si", &s, &i); */
-/* tpl_jot(TPL_FD, fd, "si", &s, &i); */
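-/* (convenience wrapper: maps fmt, packs index 0 once, dumps, and frees) */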
-TPL_API int tpl_jot(int mode, ...) {
-    va_list ap;
-    char *filename, *fmt;
-    size_t *sz;
-    int fd, rc=0;
-    void **buf;
-    tpl_node *tn;
-
-    va_start(ap,mode);
-    if (mode & TPL_FILE) {
-      filename = va_arg(ap,char*);
-      fmt = va_arg(ap,char*);
-      tn = tpl_map_va(fmt, ap);
-      if (tn == NULL) { rc=-1; goto fail;}
-      tpl_pack(tn, 0);
-      rc = tpl_dump(tn, TPL_FILE, filename);
-      tpl_free(tn);
-    } else if (mode & TPL_MEM) {
-      buf = va_arg(ap,void*);
-      sz = va_arg(ap,size_t*);
-      fmt = va_arg(ap,char*);
-      tn = tpl_map_va(fmt,ap);
-      if (tn == NULL) { rc=-1; goto fail;}
-      tpl_pack(tn,0);
-      rc = tpl_dump(tn, TPL_MEM, buf, sz);
-      tpl_free(tn);
-    } else if (mode & TPL_FD) {
-      fd = va_arg(ap,int);
-      fmt = va_arg(ap,char*);
-      tn = tpl_map_va(fmt,ap);
-      if (tn == NULL) { rc=-1; goto fail;}
-      tpl_pack(tn,0);
-      rc = tpl_dump(tn, TPL_FD, fd);
-      tpl_free(tn);
-    } else {
-      tpl_hook.fatal("invalid tpl_jot mode\n");
-    }
-
-fail:
-    va_end(ap);
-    return rc;
-}
-
-TPL_API int tpl_load(tpl_node *r, int mode, ...) {
-    va_list ap;
-    int rc=0,fd=0;
-    char *filename=NULL;
-    void *addr;
-    size_t sz;
-
-    va_start(ap,mode);
-    if (mode & TPL_FILE) filename = va_arg(ap,char *);
-    else if (mode & TPL_MEM) {
-        addr = va_arg(ap,void *);
-        sz = va_arg(ap,size_t);
-    } else if (mode & TPL_FD) {
-        fd = va_arg(ap,int);
-    } else {
-        tpl_hook.oops("unsupported tpl_load mode %d\n", mode);
-        return -1;
-    }
-    va_end(ap);
-
-    if (r->type != TPL_TYPE_ROOT) {
-        tpl_hook.oops("error: tpl_load to non-root node\n");
-        return -1;
-    }
-    if (((tpl_root_data*)(r->data))->flags & (TPL_WRONLY|TPL_RDONLY)) {
-        /* already packed or loaded, so reset it as if newly mapped */
-        tpl_free_keep_map(r);
-    }
-    if (mode & TPL_FILE) {
-        if (tpl_mmap_file(filename, &((tpl_root_data*)(r->data))->mmap) != 0) {
-            tpl_hook.oops("tpl_load failed for file %s\n", filename);
-            return -1;
-        }
-        if ( (rc = tpl_sanity(r, (mode & TPL_EXCESS_OK))) != 0) {
-            if (rc == ERR_FMT_MISMATCH) {
-                tpl_hook.oops("%s: format signature mismatch\n", filename);
-            } else if (rc == ERR_FLEN_MISMATCH) { 
-                tpl_hook.oops("%s: array lengths mismatch\n", filename);
-            } else { 
-                tpl_hook.oops("%s: not a valid tpl file, err %d\n", filename, rc); 
-            }
-            tpl_unmap_file( &((tpl_root_data*)(r->data))->mmap );
-            return -1;
-        }
-        ((tpl_root_data*)(r->data))->flags = (TPL_FILE | TPL_RDONLY);
-    } else if (mode & TPL_MEM) {
-        ((tpl_root_data*)(r->data))->mmap.text = addr;
-        ((tpl_root_data*)(r->data))->mmap.text_sz = sz;
-        if ( (rc = tpl_sanity(r, (mode & TPL_EXCESS_OK))) != 0) {
-            if (rc == ERR_FMT_MISMATCH) {
-                tpl_hook.oops("format signature mismatch\n");
-            } else { 
-                tpl_hook.oops("not a valid tpl file, err %d\n", rc); 
-            }
-            return -1;
-        }
-        ((tpl_root_data*)(r->data))->flags = (TPL_MEM | TPL_RDONLY);
-        if (mode & TPL_UFREE) ((tpl_root_data*)(r->data))->flags |= TPL_UFREE;
-    } else if (mode & TPL_FD) {
-        /* if fd read succeeds, resulting mem img is used for load */
-        if (tpl_gather(TPL_GATHER_BLOCKING,fd,&addr,&sz) > 0) {
-            return tpl_load(r, TPL_MEM|TPL_UFREE, addr, sz);
-        } else return -1;
-    } else {
-        tpl_hook.oops("invalid tpl_load mode %d\n", mode);
-        return -1;
-    }
-    /* this applies to TPL_MEM or TPL_FILE */
-    if (tpl_needs_endian_swap(((tpl_root_data*)(r->data))->mmap.text))
-        ((tpl_root_data*)(r->data))->flags |= TPL_XENDIAN;
-    tpl_unpackA0(r);   /* prepare root A nodes for use */
-    return 0;
-}
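-/* Illustrative load/unpack sequence for the API above (a sketch only; it
-   assumes the public tpl_map constructor defined elsewhere in this file):
-
-       int i;
-       tpl_node *tn = tpl_map("A(i)", &i);
-       if (tpl_load(tn, TPL_FILE, "demo.tpl") == 0) {
-           while (tpl_unpack(tn, 1) > 0) printf("%d\n", i);  // i receives each element
-       }
-       tpl_free(tn);
-*/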
-
-TPL_API int tpl_Alen(tpl_node *r, int i) {
-    tpl_node *n;
-
-    n = tpl_find_i(r,i);
-    if (n == NULL) {
-        tpl_hook.oops("invalid index %d to tpl_unpack\n", i);
-        return -1;
-    }
-    if (n->type != TPL_TYPE_ARY) return -1;
-    return ((tpl_atyp*)(n->data))->num;
-}
-
-static void tpl_free_atyp(tpl_node *n, tpl_atyp *atyp) {
-    tpl_backbone *bb,*bbnxt;
-    tpl_node *c;
-    void *dv;
-    tpl_bin *binp;
-    tpl_atyp *atypp;
-    char *strp;
-    size_t itermax;
-    tpl_pound_data *pd;
-    int i;
-
-    bb = atyp->bb;
-    while (bb) {
-        bbnxt = bb->next;
-        dv = bb->data;
-        c=n->children; 
-        while (c) {
-            switch (c->type) {
-                case TPL_TYPE_BYTE:
-                case TPL_TYPE_DOUBLE:
-                case TPL_TYPE_INT32:
-                case TPL_TYPE_UINT32:
-                case TPL_TYPE_INT64:
-                case TPL_TYPE_UINT64:
-                case TPL_TYPE_INT16:
-                case TPL_TYPE_UINT16:
-                    dv = (void*)((uintptr_t)dv + tpl_types[c->type].sz*c->num);
-                    break;
-                case TPL_TYPE_BIN:
-                    memcpy(&binp,dv,sizeof(tpl_bin*)); /* cp to aligned */
-                    if (binp->addr) tpl_hook.free( binp->addr ); /* free buf */
-                    tpl_hook.free(binp);  /* free tpl_bin */
-                    dv = (void*)((uintptr_t)dv + sizeof(tpl_bin*));
-                    break;
-                case TPL_TYPE_STR:
-                    for(i=0; i < c->num; i++) {
-                      memcpy(&strp,dv,sizeof(char*)); /* cp to aligned */
-                      if (strp) tpl_hook.free(strp); /* free string */
-                      dv = (void*)((uintptr_t)dv + sizeof(char*));
-                    }
-                    break;
-                case TPL_TYPE_POUND:
-                    /* iterate over the preceding nodes */
-                    itermax = c->num;
-                    pd = (tpl_pound_data*)c->data;
-                    if (++(pd->iternum) < itermax) {
-                      c = pd->iter_start_node;
-                      continue;
-                    } else { /* loop complete. */
-                      pd->iternum = 0;
-                    }
-                    break;
-                case TPL_TYPE_ARY:
-                    memcpy(&atypp,dv,sizeof(tpl_atyp*)); /* cp to aligned */
-                    tpl_free_atyp(c,atypp);  /* free atyp */
-                    dv = (void*)((uintptr_t)dv + sizeof(void*));
-                    break;
-                default:
-                    tpl_hook.fatal("unsupported format character\n");
-                    break;
-            }
-            c=c->next;
-        }
-        tpl_hook.free(bb);
-        bb = bbnxt;
-    }
-    tpl_hook.free(atyp);
-}
-
-/* determine (by walking) byte length of serialized r/A node at address dv 
- * returns 0 on success, or -1 if the tpl isn't trustworthy (fails consistency)
- */
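-/* (each read below is bounds-checked against buf_past, the end of the mapped
-   image, so corrupt length fields cannot walk past the buffer) */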
-static int tpl_serlen(tpl_node *r, tpl_node *n, void *dv, size_t *serlen) {
-    uint32_t slen;
-    int num=0,fidx;
-    tpl_node *c;
-    size_t len=0, alen, buf_past, itermax;
-    tpl_pound_data *pd;
-
-    buf_past = ((uintptr_t)((tpl_root_data*)(r->data))->mmap.text + 
-                      ((tpl_root_data*)(r->data))->mmap.text_sz);
-
-    if (n->type == TPL_TYPE_ROOT) num = 1;
-    else if (n->type == TPL_TYPE_ARY) {
-        if ((uintptr_t)dv + sizeof(uint32_t) > buf_past) return -1;
-        memcpy(&num,dv,sizeof(uint32_t));
-        if (((tpl_root_data*)(r->data))->flags & TPL_XENDIAN)
-             tpl_byteswap(&num, sizeof(uint32_t));
-        dv = (void*)((uintptr_t)dv + sizeof(uint32_t));
-        len += sizeof(uint32_t);
-    } else tpl_hook.fatal("internal error in tpl_serlen\n");
-
-    while (num-- > 0) {
-        c=n->children; 
-        while (c) {
-            switch (c->type) {
-                case TPL_TYPE_BYTE:
-                case TPL_TYPE_DOUBLE:
-                case TPL_TYPE_INT32:
-                case TPL_TYPE_UINT32:
-                case TPL_TYPE_INT64:
-                case TPL_TYPE_UINT64:
-                case TPL_TYPE_INT16:
-                case TPL_TYPE_UINT16:
-                    for(fidx=0; fidx < c->num; fidx++) {  /* octothorpe support */
-                        if ((uintptr_t)dv + tpl_types[c->type].sz > buf_past) return -1;
-                        dv = (void*)((uintptr_t)dv + tpl_types[c->type].sz);
-                        len += tpl_types[c->type].sz;
-                    }
-                    break;
-                case TPL_TYPE_BIN:
-                    len += sizeof(uint32_t);
-                    if ((uintptr_t)dv + sizeof(uint32_t) > buf_past) return -1;
-                    memcpy(&slen,dv,sizeof(uint32_t));
-                    if (((tpl_root_data*)(r->data))->flags & TPL_XENDIAN)
-                        tpl_byteswap(&slen, sizeof(uint32_t));
-                    len += slen;
-                    dv = (void*)((uintptr_t)dv + sizeof(uint32_t));
-                    if ((uintptr_t)dv + slen > buf_past) return -1;
-                    dv = (void*)((uintptr_t)dv + slen);
-                    break;
-                case TPL_TYPE_STR:
-                    for(fidx=0; fidx < c->num; fidx++) {  /* octothorpe support */
-                      len += sizeof(uint32_t);
-                      if ((uintptr_t)dv + sizeof(uint32_t) > buf_past) return -1;
-                      memcpy(&slen,dv,sizeof(uint32_t));
-                      if (((tpl_root_data*)(r->data))->flags & TPL_XENDIAN)
-                          tpl_byteswap(&slen, sizeof(uint32_t));
-                      if (!(((tpl_root_data*)(r->data))->flags & TPL_OLD_STRING_FMT))
-                         slen = (slen>1) ? (slen-1) : 0;
-                      len += slen;
-                      dv = (void*)((uintptr_t)dv + sizeof(uint32_t));
-                      if ((uintptr_t)dv + slen > buf_past) return -1;
-                      dv = (void*)((uintptr_t)dv + slen);
-                    }
-                    break;
-                case TPL_TYPE_ARY:
-                    if ( tpl_serlen(r,c,dv, &alen) == -1) return -1;
-                    dv = (void*)((uintptr_t)dv + alen);
-                    len += alen;
-                    break;
-                case TPL_TYPE_POUND:
-                    /* iterate over the preceding nodes */
-                    itermax = c->num;
-                    pd = (tpl_pound_data*)c->data;
-                    if (++(pd->iternum) < itermax) {
-                      c = pd->iter_start_node;
-                      continue;
-                    } else { /* loop complete. */
-                      pd->iternum = 0;
-                    }
-                    break;
-                default:
-                    tpl_hook.fatal("unsupported format character\n");
-                    break;
-            }
-            c=c->next;
-        }
-    }
-    *serlen = len;
-    return 0;
-}
-
-static int tpl_mmap_output_file(char *filename, size_t sz, void **text_out) {
-    void *text;
-    int fd,perms;
-
-#ifndef _WIN32
-    perms = S_IRUSR|S_IWUSR|S_IWGRP|S_IRGRP|S_IROTH;  /* ug+w o+r */
-    fd=open(filename,O_CREAT|O_TRUNC|O_RDWR,perms);
-#else
-	perms = _S_IWRITE;
-    fd=_open(filename,_O_CREAT|_O_TRUNC|_O_RDWR,perms);
-#endif
-
-    if ( fd == -1 ) {
-        tpl_hook.oops("Couldn't open file %s: %s\n", filename, strerror(errno));
-        return -1;
-    }
-
-    text = mmap(0, sz, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
-    if (text == MAP_FAILED) {
-        tpl_hook.oops("Failed to mmap %s: %s\n", filename, strerror(errno));
-        close(fd);
-        return -1;
-    }
-    if (ftruncate(fd,sz) == -1) {
-        tpl_hook.oops("ftruncate failed: %s\n", strerror(errno));
-        munmap( text, sz );
-        close(fd);
-        return -1;
-    }
-    *text_out = text;
-    return fd;
-}
-
-static int tpl_mmap_file(char *filename, tpl_mmap_rec *mr) {
-    struct stat stat_buf;
-
-    if ( (mr->fd = open(filename, O_RDONLY)) == -1 ) {
-        tpl_hook.oops("Couldn't open file %s: %s\n", filename, strerror(errno));
-        return -1;
-    }
-
-    if ( fstat(mr->fd, &stat_buf) == -1) {
-        close(mr->fd);
-        tpl_hook.oops("Couldn't stat file %s: %s\n", filename, strerror(errno));
-        return -1;
-    }
-
-    mr->text_sz = (size_t)stat_buf.st_size;  
-    mr->text = mmap(0, stat_buf.st_size, PROT_READ, MAP_PRIVATE, mr->fd, 0);
-    if (mr->text == MAP_FAILED) {
-        close(mr->fd);
-        tpl_hook.oops("Failed to mmap %s: %s\n", filename, strerror(errno));
-        return -1;
-    }
-
-    return 0;
-}
-
-TPL_API int tpl_pack(tpl_node *r, int i) {
-    tpl_node *n, *child, *np;
-    void *datav=NULL;
-    size_t sz, itermax;
-    uint32_t slen;
-    char *str;
-    tpl_bin *bin;
-    tpl_pound_data *pd;
-    int fidx;
-
-    n = tpl_find_i(r,i);
-    if (n == NULL) {
-        tpl_hook.oops("invalid index %d to tpl_pack\n", i);
-        return -1;
-    }
-
-    if (((tpl_root_data*)(r->data))->flags & TPL_RDONLY) {
-        /* convert to a writeable tpl, initially empty */
-        tpl_free_keep_map(r);
-    }
-
-    ((tpl_root_data*)(r->data))->flags |= TPL_WRONLY;
-
-    if (n->type == TPL_TYPE_ARY) datav = tpl_extend_backbone(n);
-    child = n->children;
-    while(child) {
-        switch(child->type) {
-            case TPL_TYPE_BYTE:
-            case TPL_TYPE_DOUBLE:
-            case TPL_TYPE_INT32:
-            case TPL_TYPE_UINT32:
-            case TPL_TYPE_INT64:
-            case TPL_TYPE_UINT64:
-            case TPL_TYPE_INT16:
-            case TPL_TYPE_UINT16:
-                /* no need to use fidx iteration here; we can copy multiple values in one memcpy */
-                memcpy(child->data,child->addr,tpl_types[child->type].sz * child->num);
-                if (datav) datav = tpl_cpv(datav,child->data,tpl_types[child->type].sz * child->num);
-                if (n->type == TPL_TYPE_ARY) n->ser_osz += tpl_types[child->type].sz * child->num;
-                break;
-            case TPL_TYPE_BIN:
-                /* copy the buffer to be packed */ 
-                slen = ((tpl_bin*)child->addr)->sz;
-                if (slen >0) {
-                    str = tpl_hook.malloc(slen);
-                    if (!str) fatal_oom();
-                    memcpy(str,((tpl_bin*)child->addr)->addr,slen);
-                } else str = NULL;
-                /* and make a tpl_bin to point to it */
-                bin = tpl_hook.malloc(sizeof(tpl_bin));
-                if (!bin) fatal_oom();
-                bin->addr = str;
-                bin->sz = slen;
-                /* now pack its pointer, first deep freeing any pre-existing bin */
-                if (*(tpl_bin**)(child->data) != NULL) {
-                    if ((*(tpl_bin**)(child->data))->sz != 0) {
-                            tpl_hook.free( (*(tpl_bin**)(child->data))->addr );
-                    }
-                    tpl_hook.free(*(tpl_bin**)(child->data));  
-                }
-                memcpy(child->data,&bin,sizeof(tpl_bin*));
-                if (datav) {
-                    datav = tpl_cpv(datav, &bin, sizeof(tpl_bin*));
-                    *(tpl_bin**)(child->data) = NULL;  
-                }
-                if (n->type == TPL_TYPE_ARY) {
-                    n->ser_osz += sizeof(uint32_t); /* binary buf len word */
-                    n->ser_osz += bin->sz;          /* binary buf */
-                }
-                break;
-            case TPL_TYPE_STR:
-                for(fidx=0; fidx < child->num; fidx++) {
-                  /* copy the string to be packed. slen includes \0. this 
-                     block also works if the string pointer is NULL. */
-                  char *caddr = ((char**)child->addr)[fidx];
-                  char **cdata = &((char**)child->data)[fidx];
-                  slen = caddr ?  (strlen(caddr) + 1) : 0;
-                  if (slen) {
-                    str = tpl_hook.malloc(slen);
-                    if (!str) fatal_oom();
-                    memcpy(str,caddr,slen); /* include \0 */
-                  } else {
-                    str = NULL;
-                  } 
-                  /* now pack its pointer, first freeing any pre-existing string */
-                  if (*cdata != NULL) {
-                      tpl_hook.free(*cdata);  
-                  }
-                  memcpy(cdata,&str,sizeof(char*));
-                  if (datav) {
-                      datav = tpl_cpv(datav, &str, sizeof(char*));
-                      *cdata = NULL;  
-                  }
-                  if (n->type == TPL_TYPE_ARY) {
-                      n->ser_osz += sizeof(uint32_t); /* string len word */
-                      if (slen>1) n->ser_osz += slen-1;/* string (without nul) */
-                  }
-                }
-                break;
-            case TPL_TYPE_ARY:
-                /* copy the child's tpl_atype* and reset it to empty */
-                if (datav) {
-                    sz = ((tpl_atyp*)(child->data))->sz;
-                    datav = tpl_cpv(datav, &child->data, sizeof(void*));
-                    child->data = tpl_hook.malloc(sizeof(tpl_atyp));
-                    if (!child->data) fatal_oom();
-                    ((tpl_atyp*)(child->data))->num = 0;
-                    ((tpl_atyp*)(child->data))->sz = sz;
-                    ((tpl_atyp*)(child->data))->bb = NULL;
-                    ((tpl_atyp*)(child->data))->bbtail = NULL;
-                }
-                /* parent is array? then bubble up child array's ser_osz */
-                if (n->type == TPL_TYPE_ARY) {
-                    n->ser_osz += sizeof(uint32_t); /* array len word */
-                    n->ser_osz += child->ser_osz;   /* child array ser_osz */
-                    child->ser_osz = 0;             /* reset child array ser_osz */
-                }
-                break;
-
-            case TPL_TYPE_POUND: 
-                /* we need to iterate n times over preceding nodes in S(...). 
-                 * each time through, we may be starting or in the midst of an iteration. */
-                 pd = (tpl_pound_data*)child->data;
-                 itermax = child->num;
-
-                 /* itermax is total num of iterations needed  */
-                 /* pd->iternum is current iteration index  */
-                 /* pd->inter_elt_len is element-to-element len of contiguous structs */
-                 /* pd->iter_start_node is where we jump to at each iteration. */
-
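-                 /* e.g. (assuming tpl's S(...)# fixed-array-of-structs syntax) a
-                    map such as "S(ui)#" packs a contiguous caller-side array of
-                    structs; each pass advances np->addr by inter_elt_len. */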
-                 if (++(pd->iternum) < itermax) {
-
-                   /* in start or midst of loop. advance addr/data pointers. */
-                   for(np=pd->iter_start_node; np != child; np = np->next) {
-                     np->data = (char*)(np->data) + 
-                          (tpl_types[np->type].sz * np->num);
-                     np->addr = (char*)(np->addr) + pd->inter_elt_len;
-                   }
-                   /* do next iteration */
-                   child = pd->iter_start_node;
-                   continue;
-
-                 } else { /* loop complete. */
-                 
-                   /* reset iteration index and addr/data pointers. */
-                   pd->iternum = 0;
-                   for(np=pd->iter_start_node; np != child; np = np->next) {
-                     np->data = (char*)(np->data) - ((itermax-1) *
-                                                      tpl_types[np->type].sz * 
-                                                      np->num);
-                     np->addr = (char*)(np->addr) - ((itermax-1) * pd->inter_elt_len);
-                   }
-
-                 }
-                break;
-            default:
-                tpl_hook.fatal("unsupported format character\n");
-                break;
-        }
-        child=child->next;
-    }
-    return 0;
-}
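-/* Pack/dump counterpart to the load/unpack sketch above (illustrative only):
-
-       int i;
-       tpl_node *tn = tpl_map("A(i)", &i);
-       for(i = 0; i < 10; i++) tpl_pack(tn, 1);   // append current i to array 1
-       tpl_dump(tn, TPL_FILE, "demo.tpl");
-       tpl_free(tn);
-*/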
-
-TPL_API int tpl_unpack(tpl_node *r, int i) {
-    tpl_node *n, *c, *np;
-    uint32_t slen;
-    int rc=1, fidx;
-    char *str;
-    void *dv=NULL, *caddr;
-    size_t A_bytes, itermax;
-    tpl_pound_data *pd;
-    void *img;
-    size_t sz;
-
-
-    /* handle unusual case of tpl_pack,tpl_unpack without an 
-     * intervening tpl_dump. do a dump/load implicitly. */
-    if (((tpl_root_data*)(r->data))->flags & TPL_WRONLY) {
-        if (tpl_dump(r,TPL_MEM,&img,&sz) != 0) return -1;
-        if (tpl_load(r,TPL_MEM|TPL_UFREE,img,sz) != 0) {
-            tpl_hook.free(img);
-            return -1;
-        };
-    }
-
-    n = tpl_find_i(r,i);
-    if (n == NULL) {
-        tpl_hook.oops("invalid index %d to tpl_unpack\n", i);
-        return -1;
-    }
-
-    /* either root node or an A node */
-    if (n->type == TPL_TYPE_ROOT) {
-        dv = tpl_find_data_start( ((tpl_root_data*)(n->data))->mmap.text );
-    } else if (n->type == TPL_TYPE_ARY) {
-        if (((tpl_atyp*)(n->data))->num <= 0) return 0; /* array consumed */
-        else rc = ((tpl_atyp*)(n->data))->num--;
-        dv = ((tpl_atyp*)(n->data))->cur;
-        if (!dv) tpl_hook.fatal("must unpack parent of node before node itself\n");
-    }
-
-    c = n->children;
-    while (c) {
-        switch (c->type) {
-            case TPL_TYPE_BYTE:
-            case TPL_TYPE_DOUBLE:
-            case TPL_TYPE_INT32:
-            case TPL_TYPE_UINT32:
-            case TPL_TYPE_INT64:
-            case TPL_TYPE_UINT64:
-            case TPL_TYPE_INT16:
-            case TPL_TYPE_UINT16:
-                /* unpack elements of cross-endian octothorpic array individually */
-                if (((tpl_root_data*)(r->data))->flags & TPL_XENDIAN) {
-                    for(fidx=0; fidx < c->num; fidx++) {
-                        caddr = (void*)((uintptr_t)c->addr + (fidx * tpl_types[c->type].sz));
-                        memcpy(caddr,dv,tpl_types[c->type].sz);
-                        tpl_byteswap(caddr, tpl_types[c->type].sz);
-                        dv = (void*)((uintptr_t)dv + tpl_types[c->type].sz);
-                    }
-                } else {
-                    /* bulk unpack ok if not cross-endian */
-                    memcpy(c->addr, dv, tpl_types[c->type].sz * c->num);
-                    dv = (void*)((uintptr_t)dv + tpl_types[c->type].sz * c->num);
-                }
-                break;
-            case TPL_TYPE_BIN:
-                memcpy(&slen,dv,sizeof(uint32_t));
-                if (((tpl_root_data*)(r->data))->flags & TPL_XENDIAN)
-                    tpl_byteswap(&slen, sizeof(uint32_t));
-                if (slen > 0) {
-                    str = (char*)tpl_hook.malloc(slen);
-                    if (!str) fatal_oom();
-                } else str=NULL;
-                dv = (void*)((uintptr_t)dv + sizeof(uint32_t));
-                if (slen>0) memcpy(str,dv,slen);
-                memcpy(&(((tpl_bin*)c->addr)->addr),&str,sizeof(void*));
-                memcpy(&(((tpl_bin*)c->addr)->sz),&slen,sizeof(uint32_t));
-                dv = (void*)((uintptr_t)dv + slen);
-                break;
-            case TPL_TYPE_STR:
-                for(fidx=0; fidx < c->num; fidx++) {
-                  memcpy(&slen,dv,sizeof(uint32_t));
-                  if (((tpl_root_data*)(r->data))->flags & TPL_XENDIAN)
-                      tpl_byteswap(&slen, sizeof(uint32_t));
-                  if (((tpl_root_data*)(r->data))->flags & TPL_OLD_STRING_FMT)
-                    slen += 1;
-                  dv = (void*)((uintptr_t)dv + sizeof(uint32_t));
-                  if (slen) {  /* slen includes \0 */
-                    str = (char*)tpl_hook.malloc(slen);
-                    if (!str) fatal_oom();
-                    if (slen>1) memcpy(str,dv,slen-1);
-                    str[slen-1] = '\0'; /* nul terminate */
-                    dv = (void*)((uintptr_t)dv + slen-1);
-                  } else str=NULL;
-                  memcpy(&((char**)c->addr)[fidx],&str,sizeof(char*));
-                }
-                break;
-            case TPL_TYPE_POUND:
-                /* iterate over preceding nodes */
-                pd = (tpl_pound_data*)c->data;
-                itermax = c->num;
-                if (++(pd->iternum) < itermax) {
-                  /* in start or midst of loop. advance addr/data pointers. */
-                  for(np=pd->iter_start_node; np != c; np = np->next) {
-                    np->addr = (char*)(np->addr) + pd->inter_elt_len;
-                  }
-                  /* do next iteration */
-                  c = pd->iter_start_node;
-                  continue;
-
-                } else { /* loop complete. */
-                
-                  /* reset iteration index and addr/data pointers. */
-                  pd->iternum = 0;
-                  for(np=pd->iter_start_node; np != c; np = np->next) {
-                    np->addr = (char*)(np->addr) - ((itermax-1) * pd->inter_elt_len);
-                  }
-
-                }
-                break;
-            case TPL_TYPE_ARY:
-                if (tpl_serlen(r,c,dv, &A_bytes) == -1) 
-                    tpl_hook.fatal("internal error in unpack\n");
-                memcpy( &((tpl_atyp*)(c->data))->num, dv, sizeof(uint32_t));
-                if (((tpl_root_data*)(r->data))->flags & TPL_XENDIAN)
-                    tpl_byteswap(&((tpl_atyp*)(c->data))->num, sizeof(uint32_t));
-                ((tpl_atyp*)(c->data))->cur = (void*)((uintptr_t)dv+sizeof(uint32_t));
-                dv = (void*)((uintptr_t)dv + A_bytes);
-                break;
-            default:
-                tpl_hook.fatal("unsupported format character\n");
-                break;
-        }
-
-        c = c->next;
-    }
-    if (n->type == TPL_TYPE_ARY) ((tpl_atyp*)(n->data))->cur = dv; /* next element */
-    return rc;
-}
-
-/* Specialized function that unpacks only the root's A nodes, after tpl_load  */
-static int tpl_unpackA0(tpl_node *r) {
-    tpl_node *n, *c;
-    uint32_t slen;
-    int rc=1,fidx,i;
-    void *dv;
-    size_t A_bytes, itermax;
-    tpl_pound_data *pd;
-
-    n = r;
-    dv = tpl_find_data_start( ((tpl_root_data*)(r->data))->mmap.text);
-
-    c=n->children;
-    while (c)  {
-        switch (c->type) {
-            case TPL_TYPE_BYTE:
-            case TPL_TYPE_DOUBLE:
-            case TPL_TYPE_INT32:
-            case TPL_TYPE_UINT32:
-            case TPL_TYPE_INT64:
-            case TPL_TYPE_UINT64:
-            case TPL_TYPE_INT16:
-            case TPL_TYPE_UINT16:
-                for(fidx=0;fidx < c->num; fidx++) {
-                    dv = (void*)((uintptr_t)dv + tpl_types[c->type].sz);
-                }
-                break;
-            case TPL_TYPE_BIN:
-                memcpy(&slen,dv,sizeof(uint32_t));
-                if (((tpl_root_data*)(r->data))->flags & TPL_XENDIAN)
-                    tpl_byteswap(&slen, sizeof(uint32_t));
-                dv = (void*)((uintptr_t)dv + sizeof(uint32_t));
-                dv = (void*)((uintptr_t)dv + slen);
-                break;
-            case TPL_TYPE_STR:
-                for(i=0; i<c->num; i++) {
-                  memcpy(&slen,dv,sizeof(uint32_t));
-                  if (((tpl_root_data*)(r->data))->flags & TPL_XENDIAN)
-                      tpl_byteswap(&slen, sizeof(uint32_t));
-                  if (((tpl_root_data*)(r->data))->flags & TPL_OLD_STRING_FMT)
-                    slen += 1;
-                  dv = (void*)((uintptr_t)dv + sizeof(uint32_t));
-                  if (slen>1) dv = (void*)((uintptr_t)dv + slen-1);
-                }
-                break;
-            case TPL_TYPE_POUND:
-                /* iterate over the preceding nodes */
-                itermax = c->num;
-                pd = (tpl_pound_data*)c->data;
-                if (++(pd->iternum) < itermax) {
-                  c = pd->iter_start_node;
-                  continue;
-                } else { /* loop complete. */
-                  pd->iternum = 0;
-                }
-                break;
-            case TPL_TYPE_ARY:
-                if ( tpl_serlen(r,c,dv, &A_bytes) == -1) 
-                    tpl_hook.fatal("internal error in unpackA0\n");
-                memcpy( &((tpl_atyp*)(c->data))->num, dv, sizeof(uint32_t));
-                if (((tpl_root_data*)(r->data))->flags & TPL_XENDIAN)
-                    tpl_byteswap(&((tpl_atyp*)(c->data))->num, sizeof(uint32_t));
-                ((tpl_atyp*)(c->data))->cur = (void*)((uintptr_t)dv+sizeof(uint32_t));
-                dv = (void*)((uintptr_t)dv + A_bytes);
-                break;
-            default:
-                tpl_hook.fatal("unsupported format character\n");
-                break;
-        }
-        c=c->next;
-    }
-    return rc;
-}
-
-/* In-place byte order swapping of a word of length "len" bytes */
-static void tpl_byteswap(void *word, int len) {
-    int i;
-    char c, *w;
-    w = (char*)word;
-    for(i=0; i<len/2; i++) {
-        c = w[i];
-        w[i] = w[len-1-i];
-        w[len-1-i] = c;
-    }
-}
-
-static void tpl_fatal(const char *fmt, ...) {
-    va_list ap;
-    char exit_msg[100];
-
-    va_start(ap,fmt);
-    vsnprintf(exit_msg, 100, fmt, ap);
-    va_end(ap);
-
-    tpl_hook.oops("%s", exit_msg);
-    exit(-1);
-}
-
-TPL_API int tpl_gather(int mode, ...) {
-    va_list ap;
-    int fd,rc=0;
-    size_t *szp,sz;
-    void **img,*addr,*data;
-    tpl_gather_t **gs;
-    tpl_gather_cb *cb;
-
-    va_start(ap,mode);
-    switch (mode) {
-        case TPL_GATHER_BLOCKING:
-            fd = va_arg(ap,int);
-            img = va_arg(ap,void*);
-            szp = va_arg(ap,size_t*);
-            rc = tpl_gather_blocking(fd,img,szp);
-            break;
-        case TPL_GATHER_NONBLOCKING:
-            fd = va_arg(ap,int);
-            gs = (tpl_gather_t**)va_arg(ap,void*);
-            cb = (tpl_gather_cb*)va_arg(ap,tpl_gather_cb*);
-            data = va_arg(ap,void*);
-            rc = tpl_gather_nonblocking(fd,gs,cb,data);
-            break;
-        case TPL_GATHER_MEM:
-            addr = va_arg(ap,void*);
-            sz = va_arg(ap,size_t);
-            gs = (tpl_gather_t**)va_arg(ap,void*);
-            cb = (tpl_gather_cb*)va_arg(ap,tpl_gather_cb*);
-            data = va_arg(ap,void*);
-            rc = tpl_gather_mem(addr,sz,gs,cb,data);
-            break;
-        default:
-            tpl_hook.fatal("unsupported tpl_gather mode %d\n",mode);
-            break;
-    }
-    va_end(ap);
-    return rc;
-}
-
-/* dequeue a tpl by reading until one full tpl image is obtained.
- * We take care not to read past the end of the tpl.
- * This is intended as a blocking call i.e. for use with a blocking fd.
- * It can be given a non-blocking fd, but the read spins if we have to wait.
- */
-static int tpl_gather_blocking(int fd, void **img, size_t *sz) {
-    char preamble[8];
-    int i=0, rc;
-    uint32_t tpllen;
-
-    do { 
-        rc = read(fd,&preamble[i],8-i);
-        i += (rc>0) ? rc : 0;
-    } while ((rc==-1 && (errno==EINTR||errno==EAGAIN)) || (rc>0 && i<8));
-
-    if (rc<0) {
-        tpl_hook.oops("tpl_gather_fd_blocking failed: %s\n", strerror(errno));
-        return -1;
-    } else if (rc == 0) {
-        /* tpl_hook.oops("tpl_gather_fd_blocking: eof\n"); */
-        return 0;
-    } else if (i != 8) {
-        tpl_hook.oops("internal error\n");
-        return -1;
-    }
-
-    if (preamble[0] == 't' && preamble[1] == 'p' && preamble[2] == 'l') {
-        memcpy(&tpllen,&preamble[4],4);
-        if (tpl_needs_endian_swap(preamble)) tpl_byteswap(&tpllen,4);
-    } else {
-        tpl_hook.oops("tpl_gather_fd_blocking: non-tpl input\n");
-        return -1;
-    }
-
-    /* malloc space for remainder of tpl image (overall length tpllen) 
-     * and read it in
-     */
-    if (tpl_hook.gather_max > 0 && 
-        tpllen > tpl_hook.gather_max) {
-        tpl_hook.oops("tpl exceeds max length %d\n", 
-            tpl_hook.gather_max);
-        return -2;
-    }
-    *sz = tpllen;
-    if ( (*img = tpl_hook.malloc(tpllen)) == NULL) {
-        fatal_oom();
-    }
-
-    memcpy(*img,preamble,8);  /* copy preamble to output buffer */
-    i=8;
-    do { 
-        rc = read(fd,&((*(char**)img)[i]),tpllen-i);
-        i += (rc>0) ? rc : 0;
-    } while ((rc==-1 && (errno==EINTR||errno==EAGAIN)) || (rc>0 && (size_t)i<(size_t)tpllen));
-
-    if (rc<0) {
-        tpl_hook.oops("tpl_gather_fd_blocking failed: %s\n", strerror(errno));
-        tpl_hook.free(*img);
-        return -1;
-    } else if (rc == 0) {
-        /* tpl_hook.oops("tpl_gather_fd_blocking: eof\n"); */
-        tpl_hook.free(*img);
-        return 0;
-    } else if ((size_t)i != (size_t)tpllen) {
-        tpl_hook.oops("internal error\n");
-        tpl_hook.free(*img);
-        return -1;
-    }
-
-    return 1;
-}
-
-/* Used by select()-driven apps which want to gather tpl images piecemeal */
- * the file descriptor must be non-blocking for this function to work. */
-static int tpl_gather_nonblocking( int fd, tpl_gather_t **gs, tpl_gather_cb *cb, void *data) {
-    char buf[TPL_GATHER_BUFLEN], *img, *tpl;
-    int rc, keep_looping, cbrc=0;
-    size_t catlen;
-    uint32_t tpllen;
-
-    while (1) {
-        rc = read(fd,buf,TPL_GATHER_BUFLEN);
-        if (rc == -1) {
-            if (errno == EINTR) continue;  /* got signal during read, ignore */
-            if (errno == EAGAIN) return 1; /* nothing to read right now */
-            else {
-                tpl_hook.oops("tpl_gather failed: %s\n", strerror(errno));
-                if (*gs) {
-                    tpl_hook.free((*gs)->img);
-                    tpl_hook.free(*gs);
-                    *gs = NULL;
-                }
-                return -1;                 /* error, caller should close fd  */
-            }
-        } else if (rc == 0) {
-            if (*gs) {
-                tpl_hook.oops("tpl_gather: partial tpl image precedes EOF\n");
-                tpl_hook.free((*gs)->img);
-                tpl_hook.free(*gs);
-                *gs = NULL;
-            }
-            return 0;                      /* EOF, caller should close fd */
-        } else {
-            /* concatenate any partial tpl from last read with new buffer */
-            if (*gs) {
-                catlen = (*gs)->len + rc;
-                if (tpl_hook.gather_max > 0 && 
-                    catlen > tpl_hook.gather_max) {
-                    tpl_hook.free( (*gs)->img );
-                    tpl_hook.free( (*gs) );
-                    *gs = NULL;
-                    tpl_hook.oops("tpl exceeds max length %d\n", 
-                        tpl_hook.gather_max);
-                    return -2;              /* error, caller should close fd */
-                }
-                if ( (img = tpl_hook.realloc((*gs)->img, catlen)) == NULL) {
-                    fatal_oom();
-                }
-                memcpy(img + (*gs)->len, buf, rc);
-                tpl_hook.free(*gs);
-                *gs = NULL;
-            } else {
-                img = buf;
-                catlen = rc;
-            }
-            /* isolate any full tpl(s) in img and invoke cb for each */
-            tpl = img;
-            keep_looping = (tpl+8 < img+catlen) ? 1 : 0;
-            while (keep_looping) {
-                if (strncmp("tpl", tpl, 3) != 0) {
-                    tpl_hook.oops("tpl prefix invalid\n");
-                    if (img != buf) tpl_hook.free(img);
-                    tpl_hook.free(*gs);
-                    *gs = NULL;
-                    return -3; /* error, caller should close fd */
-                }
-                memcpy(&tpllen,&tpl[4],4);
-                if (tpl_needs_endian_swap(tpl)) tpl_byteswap(&tpllen,4);
-                if (tpl+tpllen <= img+catlen) {
-                    cbrc = (cb)(tpl,tpllen,data);  /* invoke cb for tpl image */
-                    tpl += tpllen;                 /* point to next tpl image */
-                    if (cbrc < 0) keep_looping = 0;
-                    else keep_looping = (tpl+8 < img+catlen) ? 1 : 0;
-                } else keep_looping=0;
-            } 
-            /* check if app callback requested closure of tpl source */
-            if (cbrc < 0) {
-                tpl_hook.oops("tpl_fd_gather aborted by app callback\n");
-                if (img != buf) tpl_hook.free(img);
-                if (*gs) tpl_hook.free(*gs);
-                *gs = NULL;
-                return -4;
-            }
-            /* store any leftover, partial tpl fragment for next read */
-            if (tpl == img && img != buf) {  
-                /* consumed nothing from img!=buf */
-                if ( (*gs = tpl_hook.malloc(sizeof(tpl_gather_t))) == NULL ) {
-                    fatal_oom();
-                }
-                (*gs)->img = tpl;
-                (*gs)->len = catlen;
-            } else if (tpl < img+catlen) {  
-                /* consumed 1+ tpl(s) from img!=buf or 0 from img==buf */
-                if ( (*gs = tpl_hook.malloc(sizeof(tpl_gather_t))) == NULL ) {
-                    fatal_oom();
-                }
-                if ( ((*gs)->img = tpl_hook.malloc(img+catlen - tpl)) == NULL ) {
-                    fatal_oom();
-                }
-                (*gs)->len = img+catlen - tpl;
-                memcpy( (*gs)->img, tpl, img+catlen - tpl);
-                /* free partially consumed concat buffer if used */
-                if (img != buf) tpl_hook.free(img); 
-            } else {                        /* tpl(s) fully consumed */
-                /* free consumed concat buffer if used */
-                if (img != buf) tpl_hook.free(img); 
-            }
-        }
-    } 
-}
-
-/* gather tpl piecemeal from memory buffer (not fd) e.g., from a lower-level api */
-static int tpl_gather_mem( char *buf, size_t len, tpl_gather_t **gs, tpl_gather_cb *cb, void *data) {
-    char *img, *tpl;
-    int keep_looping, cbrc=0;
-    size_t catlen;
-    uint32_t tpllen;
-
-    /* concatenate any partial tpl from last read with new buffer */
-    if (*gs) {
-        catlen = (*gs)->len + len;
-        if (tpl_hook.gather_max > 0 && 
-            catlen > tpl_hook.gather_max) {
-            tpl_hook.free( (*gs)->img );
-            tpl_hook.free( (*gs) );
-            *gs = NULL;
-            tpl_hook.oops("tpl exceeds max length %d\n", 
-                tpl_hook.gather_max);
-            return -2;              /* error, caller should stop accepting input from source*/
-        }
-        if ( (img = tpl_hook.realloc((*gs)->img, catlen)) == NULL) {
-            fatal_oom();
-        }
-        memcpy(img + (*gs)->len, buf, len);
-        tpl_hook.free(*gs);
-        *gs = NULL;
-    } else {
-        img = buf;
-        catlen = len;
-    }
-    /* isolate any full tpl(s) in img and invoke cb for each */
-    tpl = img;
-    keep_looping = (tpl+8 < img+catlen) ? 1 : 0;
-    while (keep_looping) {
-        if (strncmp("tpl", tpl, 3) != 0) {
-            tpl_hook.oops("tpl prefix invalid\n");
-            if (img != buf) tpl_hook.free(img);
-            tpl_hook.free(*gs);
-            *gs = NULL;
-            return -3; /* error, caller should stop accepting input from source*/
-        }
-        memcpy(&tpllen,&tpl[4],4);
-        if (tpl_needs_endian_swap(tpl)) tpl_byteswap(&tpllen,4);
-        if (tpl+tpllen <= img+catlen) {
-            cbrc = (cb)(tpl,tpllen,data);  /* invoke cb for tpl image */
-            tpl += tpllen;               /* point to next tpl image */
-            if (cbrc < 0) keep_looping = 0;
-            else keep_looping = (tpl+8 < img+catlen) ? 1 : 0;
-        } else keep_looping=0;
-    } 
-    /* check if app callback requested closure of tpl source */
-    if (cbrc < 0) {
-        tpl_hook.oops("tpl_mem_gather aborted by app callback\n");
-        if (img != buf) tpl_hook.free(img);
-        if (*gs) tpl_hook.free(*gs);
-        *gs = NULL;
-        return -4;
-    }
-    /* store any leftover, partial tpl fragment for next read */
-    if (tpl == img && img != buf) {  
-        /* consumed nothing from img!=buf */
-        if ( (*gs = tpl_hook.malloc(sizeof(tpl_gather_t))) == NULL ) {
-            fatal_oom();
-        }
-        (*gs)->img = tpl;
-        (*gs)->len = catlen;
-    } else if (tpl < img+catlen) {  
-        /* consumed 1+ tpl(s) from img!=buf or 0 from img==buf */
-        if ( (*gs = tpl_hook.malloc(sizeof(tpl_gather_t))) == NULL ) {
-            fatal_oom();
-        }
-        if ( ((*gs)->img = tpl_hook.malloc(img+catlen - tpl)) == NULL ) {
-            fatal_oom();
-        }
-        (*gs)->len = img+catlen - tpl;
-        memcpy( (*gs)->img, tpl, img+catlen - tpl);
-        /* free partially consumed concat buffer if used */
-        if (img != buf) tpl_hook.free(img); 
-    } else {                        /* tpl(s) fully consumed */
-        /* free consumed concat buffer if used */
-        if (img != buf) tpl_hook.free(img); 
-    }
-    return 1;
-}
diff --git a/protocols/Makefile.am b/protocols/Makefile.am
index c5239e1c21c30000e6ce506a8ab2908a06ac02c5..40d1a63254fb0e198ce4d105f3484ea5c7077c8a 100644
--- a/protocols/Makefile.am
+++ b/protocols/Makefile.am
@@ -43,6 +43,9 @@ BUILT_SOURCES = \
 	     mstro_ep.pb-c.h mstro_ep.pb-c.c \
 	     protobuf-c/protobuf-c.c protobuf-c/protobuf-c.h
 
+SOURCES = \
+	maestro-endpoints.c maestro-endpoints.h
+
 # direct inclusion of protobuf-c utility code into our library saves us from
 # building dependency lib
 protobuf-c/protobuf-c.c: $(abs_top_srcdir)/deps/protobuf-c/protobuf-c/protobuf-c.c
@@ -51,7 +54,16 @@ protobuf-c/protobuf-c.h: $(abs_top_srcdir)/deps/protobuf-c/protobuf-c/protobuf-c
 	cp $< $@
 
 libmaestro_proto_la_CPPFLAGS = \
-                   -I/Users/uhaus/work/cray/maestro/deps/protobuf/dst/include
+                   -I$(abs_top_builddir)/deps/protobuf/dst/include \
+                   -I$(abs_top_srcdir)/include \
+                   -I$(top_srcdir)/deps/mamba/common \
+                   -I$(top_srcdir)/deps/mamba/memory
+
+
+if WITH_LOCAL_LIBFABRIC
+libmaestro_proto_la_CPPFLAGS += -I$(top_srcdir)/deps/libfabric/include -I$(top_srcdir)/deps/libfabric/prov/gni/include
+# don't add libfabric symbols, maestro core lib will do that for us
+endif
 
 # If you ever need to debug protobuf add this to the start of protobuf/protobuf-c.c:
 ##define PROTOBUF_C_UNPACK_ERROR(...)       do {                         \
@@ -66,7 +78,7 @@ libmaestro_proto_la_CPPFLAGS = \
 
 
 libmaestro_proto_la_SOURCES = \
-	$(BUILT_SOURCES)
+	$(BUILT_SOURCES) $(SOURCES)
 
 libmaestro_proto_la_LIBADD=
 
diff --git a/protocols/maestro-endpoints.c b/protocols/maestro-endpoints.c
new file mode 100644
index 0000000000000000000000000000000000000000..408ffbbe94b87dd93cf453ae8837bd611f5ca113
--- /dev/null
+++ b/protocols/maestro-endpoints.c
@@ -0,0 +1,454 @@
+/* -*- mode:c -*- */
+/** @file
+ ** @brief Maestro Endpoint (De-)Serialization
+ **/
+/*
+ * Copyright (C) 2021 Hewlett-Packard (Schweiz) GmbH
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "maestro-endpoints.h"
+#include <stdlib.h>
+#include "maestro/logging.h"
+
+
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/ip.h> 
+#include <arpa/inet.h>
+
+/* simplify logging */
+#define NOISE(...) LOG_NOISE(MSTRO_LOG_MODULE_COMM,__VA_ARGS__)
+#define DEBUG(...) LOG_DEBUG(MSTRO_LOG_MODULE_COMM,__VA_ARGS__)
+#define INFO(...)  LOG_INFO(MSTRO_LOG_MODULE_COMM,__VA_ARGS__)
+#define WARN(...)  LOG_WARN(MSTRO_LOG_MODULE_COMM,__VA_ARGS__)
+#define ERR(...)   LOG_ERR(MSTRO_LOG_MODULE_COMM,__VA_ARGS__)
+
+
+mstro_status
+mstro_ep_fi_to_ep(const struct fi_info *fi, struct fid_ep *ep,
+                  Mstro__Endpoint **result)
+{
+  mstro_status s=MSTRO_UNIMPL;
+  
+  if(fi==NULL)
+    return MSTRO_INVARG;
+  if(result==NULL)
+    return MSTRO_INVOUT;
+
+  Mstro__Endpoint *me = malloc(sizeof(Mstro__Endpoint));
+  if(me==NULL) {
+    s=MSTRO_NOMEM;
+    goto BAILOUT;
+  } else {
+    mstro__endpoint__init(me);
+    me->proto_case = MSTRO__ENDPOINT__PROTO_OFIPROTO;
+    me->addr_case  = MSTRO__ENDPOINT__ADDR_OFIADDR;
+  }
+
+  /* set endpoint protocol */
+  switch(fi->ep_attr->protocol) {
+    case FI_PROTO_UNSPEC:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__UNSPEC; break;
+    case FI_PROTO_RDMA_CM_IB_RC:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__RDMA_CM_IB_RC; break;
+    case FI_PROTO_IWARP:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__IWARP; break;
+    case FI_PROTO_IB_UD:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__IB_UD; break;
+    case FI_PROTO_PSMX:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__PSMX; break;
+    case FI_PROTO_UDP:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__UDP; break;
+    case FI_PROTO_SOCK_TCP:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__SOCK_TCP; break;
+    case FI_PROTO_MXM:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__MXM; break;
+    case FI_PROTO_IWARP_RDM:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__IWARP_RDM; break;
+    case FI_PROTO_IB_RDM:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__IB_RDM; break;
+    case FI_PROTO_GNI:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__GNI; break;
+    case FI_PROTO_RXM:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__RXM; break;
+    case FI_PROTO_RXD:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__RXD; break;
+    case FI_PROTO_MLX:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__MLX; break;
+    case FI_PROTO_NETWORKDIRECT:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__NETWORKDIRECT; break;
+    case FI_PROTO_PSMX2:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__PSMX2; break;
+    case FI_PROTO_SHM:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__SHM; break;
+    case FI_PROTO_MRAIL:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__MRAIL; break;
+    case FI_PROTO_RSTREAM:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__RSTREAM; break;
+    case FI_PROTO_RDMA_CM_IB_XRC:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__RDMA_CM_IB_XRC; break;
+    case FI_PROTO_EFA:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__EFA; break;
+    case FI_PROTO_PSMX3:
+      me->ofiproto = MSTRO__OFI_ENDPOINT_KIND__PSMX3; break;
+    default:
+      ERR("Unsupported OFI protocol: %d (%s)\n",
+          fi->ep_attr->protocol,
+          fi_tostr(&fi->ep_attr->protocol, FI_TYPE_PROTOCOL));
+      s=MSTRO_UNIMPL;
+      goto BAILOUT;
+  }
+  
+  Mstro__OfiAddr *addr = malloc(sizeof(Mstro__OfiAddr));
+  if(addr==NULL) {
+    s=MSTRO_NOMEM;
+    goto BAILOUT;
+  } else {
+    mstro__ofi_addr__init(addr);
+    me->ofiaddr = addr;
+  }
+
+
+  switch(fi->addr_format) {
+    case FI_FORMAT_UNSPEC:
+      ERR("Unsupported address format FI_FORMAT_UNSPEC\n");
+      s=MSTRO_UNIMPL;
+      goto BAILOUT;
+    case FI_SOCKADDR: {
+      ERR("Unusupported address format FI_SOCKADDR\n");
+      s=MSTRO_UNIMPL;
+      goto BAILOUT;
+    }
+    case FI_SOCKADDR_IN: {
+      size_t addrlen = sizeof(struct sockaddr_in);
+      struct sockaddr_in buf;
+      int r = fi_getname(&ep->fid, &buf, &addrlen);
+      if(r!=0) {
+        ERR("Error obtaining the endpoint name (=local addr): %d (%s)\n",
+            r, fi_strerror(-r));
+        s=MSTRO_FAIL;
+        goto BAILOUT;
+      }
+      addr->val_case = MSTRO__OFI_ADDR__VAL_IN4;
+      addr->in4 = malloc(sizeof(Mstro__AddrSockaddrIN4));
+      if(addr->in4==NULL) {
+        s=MSTRO_NOMEM;
+        goto BAILOUT;
+      } else {
+        mstro__addr_sockaddr__in4__init(addr->in4);
+        addr->in4->sin_family = buf.sin_family;
+        addr->in4->sin_port   = buf.sin_port;
+        addr->in4->sin_addr   = buf.sin_addr.s_addr;
+      }
+      break;
+    }
+    case FI_SOCKADDR_IN6: {
+      size_t addrlen = sizeof(struct sockaddr_in6);
+      struct sockaddr_in6 buf;
+      int r = fi_getname(&ep->fid, &buf, &addrlen);
+      if(r!=0) {
+        ERR("Error obtaining the endpoint name (=local addr): %d (%s)\n",
+            r, fi_strerror(-r));
+        s=MSTRO_FAIL;
+        goto BAILOUT;
+      }
+      addr->val_case = MSTRO__OFI_ADDR__VAL_IN6;
+      addr->in6 = malloc(sizeof(Mstro__AddrSockaddrIN6));
+      if(addr->in6==NULL) {
+        s=MSTRO_NOMEM;
+        goto BAILOUT;
+      } else {
+        mstro__addr_sockaddr__in6__init(addr->in6);
+        addr->in6->sin6_family = buf.sin6_family;
+        addr->in6->sin6_port   = buf.sin6_port;
+        addr->in6->sin6_flowinfo = buf.sin6_flowinfo;
+        /* things are already in network byte order */
+        addr->in6->sin6_addr_0 = (   buf.sin6_addr.s6_addr[0]
+                                  | (buf.sin6_addr.s6_addr[1] <<  8)
+                                  | (buf.sin6_addr.s6_addr[2] << 16)
+                                  | (buf.sin6_addr.s6_addr[3] << 24));
+        addr->in6->sin6_addr_1 = (   buf.sin6_addr.s6_addr[4]
+                                  | (buf.sin6_addr.s6_addr[5] <<  8)
+                                  | (buf.sin6_addr.s6_addr[6] << 16)
+                                  | (buf.sin6_addr.s6_addr[7] << 24));
+        addr->in6->sin6_scope_id = buf.sin6_scope_id;
+      }
+      break;
+    }
+    case FI_ADDR_GNI: {
+      uint64_t buf[6];
+      size_t addrlen = sizeof(buf);
+
+      int r = fi_getname(&ep->fid, &(buf[0]), &addrlen);
+      if(r!=0) {
+        ERR("Error obtaining the endpoint name (=local addr): %d (%s)\n",
+            r, fi_strerror(-r));
+        s=MSTRO_FAIL;
+        goto BAILOUT;
+      }
+      addr->val_case = MSTRO__OFI_ADDR__VAL_GNI;
+      addr->gni = malloc(sizeof(Mstro__AddrGNI));
+      if(addr->gni==NULL) {
+        s=MSTRO_NOMEM;
+        goto BAILOUT;
+      } else {
+        mstro__addr_gni__init(addr->gni);
+        addr->gni->a0 = buf[0];
+        addr->gni->a1 = buf[1];
+        addr->gni->a2 = buf[2];
+        addr->gni->a3 = buf[3];
+        addr->gni->a4 = buf[4];
+        addr->gni->a5 = buf[5];
+      }
+      break;
+      
+    }
+    case FI_ADDR_PSMX: {
+      uint64_t buf[1];
+      size_t addrlen = sizeof(buf);
+
+      int r = fi_getname(&ep->fid, &(buf[0]), &addrlen);
+      if(r!=0) {
+        ERR("Error obtaining the endpoint name (=local addr): %d (%s)\n",
+            r, fi_strerror(-r));
+        s=MSTRO_FAIL;
+        goto BAILOUT;
+      }
+      addr->val_case = MSTRO__OFI_ADDR__VAL_PSMX;
+      addr->psmx = buf[0];
+      break;
+    }
+      
+    case FI_ADDR_PSMX2: {
+      uint64_t buf[2];
+      size_t addrlen = sizeof(buf);
+
+      int r = fi_getname(&ep->fid, &(buf[0]), &addrlen);
+      if(r!=0) {
+        ERR("Error obtaining the endpoint name (=local addr): %d (%s)\n",
+            r, fi_strerror(-r));
+        s=MSTRO_FAIL;
+        goto BAILOUT;
+      }
+      addr->val_case = MSTRO__OFI_ADDR__VAL_PSMX2;
+      addr->psmx2 = malloc(sizeof(Mstro__AddrPSMX2));
+      if(addr->psmx2==NULL) {
+        s=MSTRO_NOMEM;
+        goto BAILOUT;
+      } else {
+        mstro__addr_psmx2__init(addr->psmx2);
+        addr->psmx2->a0 = buf[0];
+        addr->psmx2->a1 = buf[1];
+      }
+      break;
+    }
+    case FI_ADDR_PSMX3: {
+      uint64_t buf[2];
+      size_t addrlen = sizeof(buf);
+
+      int r = fi_getname(&ep->fid, &(buf[0]), &addrlen);
+      if(r!=0) {
+        ERR("Error obtaining the endpoint name (=local addr): %d (%s)\n",
+            r, fi_strerror(-r));
+        s=MSTRO_FAIL;
+        goto BAILOUT;
+      }
+      addr->val_case = MSTRO__OFI_ADDR__VAL_PSMX3;
+      addr->psmx3 = malloc(sizeof(Mstro__AddrPSMX3));
+      if(addr->psmx3==NULL) {
+        s=MSTRO_NOMEM;
+        goto BAILOUT;
+      } else {
+        mstro__addr_psmx3__init(addr->psmx3);
+        addr->psmx3->a0 = buf[0];
+        addr->psmx3->a1 = buf[1];
+      }
+      break;
+    }
+    case FI_ADDR_STR: {
+      size_t addrlen=0;
+      void *buf=NULL;
+      int r = fi_getname(&ep->fid, buf, &addrlen);
+      if(r!=0 && r!=-FI_ETOOSMALL) {
+        ERR("Error obtaining the endpoint name (=local addr): %d (%s)\n",
+            r, fi_strerror(-r));
+        s=MSTRO_FAIL;
+        goto BAILOUT;
+      }
+      if(r==-FI_ETOOSMALL) {
+        assert(addrlen>0);
+        buf=malloc(addrlen);
+        if(buf==NULL) {
+          ERR("Failed to allocate %zu bytes for string-format address\n", addrlen);
+          s=MSTRO_NOMEM;
+          goto BAILOUT;
+        }
+      }
+      assert(addrlen>0 && buf!=NULL);
+      r = fi_getname(&ep->fid, buf, &addrlen);
+      if(r!=0) {
+        ERR("Error obtaining the endpoint name (=local addr): %d (%s)\n",
+            r, fi_strerror(-r));
+        s=MSTRO_FAIL;
+        free(buf);
+        goto BAILOUT;
+      }
+      addr->val_case = MSTRO__OFI_ADDR__VAL_STR;
+      addr->str = buf;
+      break;
+    }
+      
+      /* unimplemented: */
+    case FI_SOCKADDR_IB:
+    case FI_ADDR_BGQ:
+    case FI_ADDR_MLX:
+    case FI_ADDR_IB_UD:
+    case FI_ADDR_EFA:
+        ERR("Unimplemented address format: %d (%s)\n",
+            fi->addr_format,
+            fi_tostr(&fi->addr_format, FI_TYPE_ADDR_FORMAT));
+        s=MSTRO_UNIMPL;
+        goto BAILOUT;
+    default:
+      ERR("Unknown address format: %d (%s)\n",
+          fi->addr_format,
+          fi_tostr(&fi->addr_format, FI_TYPE_ADDR_FORMAT));
+      s=MSTRO_UNIMPL;
+      goto BAILOUT;
+  }
+  
+  s=MSTRO_OK;
+
+BAILOUT:
+  if(s!=MSTRO_OK) {
+    if(me) {
+      if(me->ofiaddr)
+        free(me->ofiaddr);
+      free(me);
+    }
+  } else {
+    *result = me;
+  }
+  return s;
+}
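+
+/* A minimal usage sketch (illustrative only, not part of this file's
+ * API): assuming `fi` and `ep` were obtained via fi_getinfo() and
+ * fi_endpoint(), and the endpoint has been enabled with fi_enable()
+ * (fi_getname requires an enabled endpoint), serializing and
+ * describing the local address might look like this:
+ *
+ *   Mstro__Endpoint *pbep = NULL;
+ *   mstro_status s = mstro_ep_fi_to_ep(fi, ep, &pbep);
+ *   if (s == MSTRO_OK) {
+ *     char buf[MSTRO_EP__ADDR_STR_MAX];
+ *     mstro_ep__addr_describe(pbep, buf, sizeof(buf));
+ *     INFO("local endpoint address: %s\n", buf);
+ *   }
+ */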
+
+
+void
+mstro_ep__addr_describe(const Mstro__Endpoint *ep, char *buf, size_t buflen)
+{
+  assert(ep->addr_case==MSTRO__ENDPOINT__ADDR_OFIADDR); 
+  assert(ep->ofiaddr!=NULL);
+
+  assert(buf!=NULL);
+  assert(buflen>4);
+  size_t num_written;
+  const size_t unabbrev_len = buflen-3;
+  
+  switch(ep->ofiaddr->val_case) {
+    case MSTRO__OFI_ADDR__VAL__NOT_SET: 
+      num_written = snprintf(buf,unabbrev_len,"(unset)");
+      break;
+    case MSTRO__OFI_ADDR__VAL_UNSPEC:
+      num_written = snprintf(buf,unabbrev_len,"(unspecified)");
+      break;
+    case MSTRO__OFI_ADDR__VAL_IN4: {
+      uint32_t addr = ntohl(ep->ofiaddr->in4->sin_addr);
+      /* we don't print family as it needs to be AF_INET anyway */
+      num_written = snprintf(buf,unabbrev_len,"IPv4 %d.%d.%d.%d:%d",
+                             (addr & 0xff000000)>>24,
+                             (addr & 0x00ff0000)>>16,
+                             (addr & 0x0000ff00)>>8,
+                             (addr & 0x000000ff)>>0,
+                             ntohs((uint16_t)ep->ofiaddr->in4->sin_port));
+      break;
+    }
+    case MSTRO__OFI_ADDR__VAL_IN6: {
+      uint64_t
+          addr0 = ep->ofiaddr->in6->sin6_addr_0,
+          addr1 = ep->ofiaddr->in6->sin6_addr_1;
+      struct sockaddr_in6 tmp = { .sin6_family          = ep->ofiaddr->in6->sin6_family,
+                                  .sin6_port            = ep->ofiaddr->in6->sin6_port,
+                                  .sin6_flowinfo        = ep->ofiaddr->in6->sin6_flowinfo,
+                                  /* invert the byte packing done in
+                                   * mstro_ep_fi_to_ep(): byte i is
+                                   * stored in bits [8*i, 8*i+7] */
+                                  .sin6_addr.s6_addr[0] = (addr0      ) & 0xff,
+                                  .sin6_addr.s6_addr[1] = (addr0 >>  8) & 0xff,
+                                  .sin6_addr.s6_addr[2] = (addr0 >> 16) & 0xff,
+                                  .sin6_addr.s6_addr[3] = (addr0 >> 24) & 0xff,
+                                  .sin6_addr.s6_addr[4] = (addr1      ) & 0xff,
+                                  .sin6_addr.s6_addr[5] = (addr1 >>  8) & 0xff,
+                                  .sin6_addr.s6_addr[6] = (addr1 >> 16) & 0xff,
+                                  .sin6_addr.s6_addr[7] = (addr1 >> 24) & 0xff,
+                                  .sin6_scope_id        = ep->ofiaddr->in6->sin6_scope_id};
+      char astring[INET6_ADDRSTRLEN];
+      
+      inet_ntop(AF_INET6, &(tmp.sin6_addr), astring, INET6_ADDRSTRLEN);
+      num_written = snprintf(buf,unabbrev_len,"IPv6 %s", astring);
+      break;
+    }
+    case MSTRO__OFI_ADDR__VAL_GNI: {
+      /* see _gnix_ep_name_to_str() */
+      union gnix {
+        uint64_t raw;
+        struct {
+          uint32_t dev_addr;
+          uint32_t cdm_id;
+        };
+      };
+      union gnix tmp = {.raw = ep->ofiaddr->gni->a0 };
+      num_written = snprintf(buf,unabbrev_len,
+                             "0x%08" PRIx32 ";0x%08" PRIx32 "",
+                             tmp.dev_addr, tmp.cdm_id);
+      break;
+    }
+    case MSTRO__OFI_ADDR__VAL_STR:
+      num_written = snprintf(buf,unabbrev_len,"%s", ep->ofiaddr->str);
+      break;
+      
+    case MSTRO__OFI_ADDR__VAL_SOCK:
+    case MSTRO__OFI_ADDR__VAL_IB:
+    case MSTRO__OFI_ADDR__VAL_PSMX:
+    case MSTRO__OFI_ADDR__VAL_BGQ:
+    case MSTRO__OFI_ADDR__VAL_MLX:
+    case MSTRO__OFI_ADDR__VAL_PSMX2:
+    case MSTRO__OFI_ADDR__VAL_IB_UD:
+    case MSTRO__OFI_ADDR__VAL_EFA:
+    case MSTRO__OFI_ADDR__VAL_PSMX3:
+      num_written = snprintf(buf,unabbrev_len,"(FIXME)");
+      break;
+    default:
+      ERR("Unsupported address type: %d\n", ep->ofiaddr->val_case);
+      num_written = snprintf(buf,unabbrev_len,"(FIXME)");
+      break;
+  }
+  if(num_written>=unabbrev_len) 
+    strncpy(buf+unabbrev_len-1,"...",4);
+  
+  return;
+}
+      
diff --git a/protocols/maestro-endpoints.h b/protocols/maestro-endpoints.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d449ef18a53fa68e0b09ef8261b7d9d49d2eec9
--- /dev/null
+++ b/protocols/maestro-endpoints.h
@@ -0,0 +1,168 @@
+/* -*- mode:c -*- */
+/** @file
+ ** @brief Maestro Endpoint (De-)Serialization
+ **/
+/*
+ * Copyright (C) 2021 Hewlett-Packard (Schweiz) GmbH
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef MAESTRO_ENDPOINTS_H_
+#define MAESTRO_ENDPOINTS_H_ 1
+
+
+#include "rdma/fabric.h"
+#include "rdma/fi_cm.h"
+
+#include "maestro/i_base64.h"
+#include "mstro_ep.pb-c.h"
+#include "maestro/status.h"
+
+/** Create a protobuf endpoint descriptor for the fi_info and (enabled) endpoint */
+mstro_status
+mstro_ep_fi_to_ep(const struct fi_info *fi, struct fid_ep *ep,
+                  Mstro__Endpoint **result);
+
+/** maximal buffer size needed to print an endpoint's address */
+#define MSTRO_EP__ADDR_STR_MAX 512
+
+/** create human-readable description of endpoint's address in @arg buf (truncating if @arg buflen is too small) */
+void
+mstro_ep__addr_describe(const Mstro__Endpoint *ep, char *buf, size_t buflen);
+
+#define MSTRO_EP_DESC_STRMAX 1024
+
+
+/** Bind @arg buf to a temporary buffer with a human-readable endpoint
+ * description of @arg ep/@arg cred/@arg reg (prefixed by @arg prefix)
+ * while executing @arg body */
+#define WITH_MSTRO_EP_CRED_REG_DESCRIPTION(buf,ep,cred,reg,prefix,body) \
+  do {                                                                  \
+    assert(ep->proto_case==MSTRO__ENDPOINT__PROTO_OFIPROTO);            \
+    assert(ep->addr_case==MSTRO__ENDPOINT__ADDR_OFIADDR);               \
+    assert(ep->ofiaddr!=NULL);                                          \
+    assert(cred==NULL                                                   \
+           || cred->val_case==MSTRO__OFI_CREDENTIAL__VAL__NOT_SET       \
+           || (cred->val_case==MSTRO__OFI_CREDENTIAL__VAL_DRC           \
+               && cred->drc!=NULL));                                    \
+    assert(reg!=NULL);                                                  \
+                                                                        \
+    char addrbuf_[MSTRO_EP__ADDR_STR_MAX];                              \
+    mstro_ep__addr_describe(ep, addrbuf_, MSTRO_EP__ADDR_STR_MAX);      \
+                                                                        \
+    const size_t mrkeylen_ = 2*reg->raw_key.len + 1;                    \
+    char mrkeybuf_[mrkeylen_];                                          \
+    mrkeybuf_[0]='\0';                                                  \
+    for(size_t i_=0; i_<reg->raw_key.len; i_++) {                       \
+      snprintf(mrkeybuf_+2*i_, 3, "%02X",                               \
+               reg->raw_key.data[i_]);                                  \
+    }                                                                   \
+                                                                        \
+    do {                                                                \
+      size_t prefix_l_=strlen(prefix);                                  \
+      char buf[MSTRO_EP_DESC_STRMAX+prefix_l_];                         \
+      snprintf(buf, MSTRO_EP_DESC_STRMAX+prefix_l_,                     \
+               "%s"                                                     \
+               "proto %s addr %s "                                       \
+               "(credential: %" PRIu32 ", mraddr %" PRIx64 ", mrkey %s)\n", \
+               prefix,                                                  \
+               (protobuf_c_enum_descriptor_get_value(                   \
+                   &mstro__ofi_endpoint_kind__descriptor, ep->ofiproto) \
+                ->name),                                                \
+               addrbuf_,                                                \
+               (cred && cred->val_case==MSTRO__OFI_CREDENTIAL__VAL_DRC  \
+                ? cred->drc->credential : 0),                           \
+               reg->baseaddr,                                           \
+               mrkeybuf_);                                              \
+                                                                        \
+      body;                                                             \
+    } while(0);                                                         \
+  } while(0)
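+
+/* A hedged usage sketch for the macro above (the variables `ep`,
+ * `cred` and `reg` are hypothetical and must point at valid protobuf
+ * objects); `desc` is bound to the formatted, newline-terminated
+ * description while the body runs:
+ *
+ *   WITH_MSTRO_EP_CRED_REG_DESCRIPTION(
+ *       desc, ep, cred, reg, "peer ",
+ *       LOG_INFO(MSTRO_LOG_MODULE_COMM, "%s", desc));
+ */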
+
+
+/** Bind @arg buf to a temporary buffer with a human-readable endpoint
+ * description of @arg ep while executing @arg body */
+#define WITH_MSTRO_EP_DESCRIPTION(buf,ep,body) do {                     \
+    const Mstro__Endpoint *ep_ = ep->pbep;                              \
+    const Mstro__OfiMemoryRegion *omr_ = ep->inforeg;                   \
+    const Mstro__OfiCredential *oc_ = ep->cred;                         \
+    WITH_MSTRO_EP_CRED_REG_DESCRIPTION(buf,ep_,oc_,omr_,"",body);       \
+  } while(0)
+
+/** Bind @arg buf to a temporary buffer with a human-readable endpoint
+ * description of @arg epl (a Mstro__EndpointList*) while executing
+ * @arg body */
+#define WITH_MSTRO_EPL_DESCRIPTION(buf,epl,body) \
+  do {                                                                  \
+    assert(epl->n_eps==epl->n_credentials);                             \
+    assert(epl->n_eps==epl->n_inforegs);                                \
+    for(size_t i_=0; i_<epl->n_eps; i_++) {                             \
+      const Mstro__Endpoint *ep_ = epl->eps[i_];                        \
+      const Mstro__OfiMemoryRegion *omr_ = epl->inforegs[i_];           \
+      const Mstro__OfiCredential *oc_ = epl->credentials[i_];           \
+      char prefix_[16];                                                  \
+      snprintf(prefix_, 16, "EP %zu: ", i_);                             \
+      WITH_MSTRO_EP_CRED_REG_DESCRIPTION(buf,ep_,oc_,omr_,prefix_,body); \
+    }                                                                   \
+  } while(0)
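+
+/* Sketch (assuming `epl` points at a populated endpoint list whose
+ * eps/credentials/inforegs arrays are parallel, as the asserts above
+ * require): log every entry, each prefixed with its index:
+ *
+ *   WITH_MSTRO_EPL_DESCRIPTION(
+ *       desc, epl,
+ *       LOG_INFO(MSTRO_LOG_MODULE_COMM, "%s", desc));
+ */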
+
+
+/** log a human-readable description of the endpoint list @arg el
+ * value at index @arg idx with @arg loglevel under @arg logmodule
+ * and with prefix @arg header */
+#define MSTRO_EP__EL_DESCRIBE(loglevel,logmodule,header,el,idx) do {    \
+    assert(el!=NULL);                                                   \
+    assert(el->n_eps>idx); assert(el->n_inforegs>idx); assert(el->n_credentials>idx); \
+    assert(el->eps!=NULL); assert(el->inforegs!=NULL); assert(el->credentials!=NULL); \
+    assert(el->eps[idx]!=NULL); assert(el->inforegs[idx]!=NULL);        \
+                                                                        \
+    WITH_MSTRO_EP_CRED_REG_DESCRIPTION(                                 \
+        buf, el->eps[idx], el->credentials[idx], el->inforegs[idx],     \
+        "",                                                             \
+        LOG(logmodule, loglevel, "%s: %s\n", header, buf););            \
+  } while(0)
+
+/** Bind @arg buf to a human-readable description of the endpoint list @arg epl
+ * value at index @arg idx while executing @arg body */
+#define WITH_MSTRO_EPL_ENTRY_DESCRIPTION(buf,epl,idx,body) do {         \
+    assert(epl!=NULL);                                                  \
+    assert(epl->n_eps>idx); assert(epl->eps!=NULL);                     \
+    assert(epl->n_inforegs>idx); assert(epl->inforegs!=NULL);           \
+    assert(epl->n_credentials>idx); assert(epl->credentials!=NULL);     \
+    assert(epl->eps[idx]!=NULL); assert(epl->inforegs[idx]!=NULL);      \
+                                                                        \
+    WITH_MSTRO_EP_CRED_REG_DESCRIPTION(                                 \
+        buf, epl->eps[idx], epl->credentials[idx], epl->inforegs[idx],\
+        "",                                                             \
+        body);                                                          \
+  } while(0)
+
+
+
+#endif /* MAESTRO_ENDPOINTS_H_ */
diff --git a/protocols/mstro_ep.pb-c.c b/protocols/mstro_ep.pb-c.c
index 7e24c7ee3cfb3943fc6a01ee6196eeb5744250a5..c66ec4e5440a58f4bac936e0f9fee4825d66138c 100644
--- a/protocols/mstro_ep.pb-c.c
+++ b/protocols/mstro_ep.pb-c.c
@@ -7,364 +7,409 @@
 #endif
 
 #include "mstro_ep.pb-c.h"
-void   mstro__sockaddr__init
-                     (Mstro__Sockaddr         *message)
+void   mstro__addr_sockaddr__init
+                     (Mstro__AddrSockaddr         *message)
 {
-  static const Mstro__Sockaddr init_value = MSTRO__SOCKADDR__INIT;
+  static const Mstro__AddrSockaddr init_value = MSTRO__ADDR_SOCKADDR__INIT;
   *message = init_value;
 }
-size_t mstro__sockaddr__get_packed_size
-                     (const Mstro__Sockaddr *message)
+size_t mstro__addr_sockaddr__get_packed_size
+                     (const Mstro__AddrSockaddr *message)
 {
-  assert(message->base.descriptor == &mstro__sockaddr__descriptor);
+  assert(message->base.descriptor == &mstro__addr_sockaddr__descriptor);
   return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
 }
-size_t mstro__sockaddr__pack
-                     (const Mstro__Sockaddr *message,
+size_t mstro__addr_sockaddr__pack
+                     (const Mstro__AddrSockaddr *message,
                       uint8_t       *out)
 {
-  assert(message->base.descriptor == &mstro__sockaddr__descriptor);
+  assert(message->base.descriptor == &mstro__addr_sockaddr__descriptor);
   return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
 }
-size_t mstro__sockaddr__pack_to_buffer
-                     (const Mstro__Sockaddr *message,
+size_t mstro__addr_sockaddr__pack_to_buffer
+                     (const Mstro__AddrSockaddr *message,
                       ProtobufCBuffer *buffer)
 {
-  assert(message->base.descriptor == &mstro__sockaddr__descriptor);
+  assert(message->base.descriptor == &mstro__addr_sockaddr__descriptor);
   return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
 }
-Mstro__Sockaddr *
-       mstro__sockaddr__unpack
+Mstro__AddrSockaddr *
+       mstro__addr_sockaddr__unpack
                      (ProtobufCAllocator  *allocator,
                       size_t               len,
                       const uint8_t       *data)
 {
-  return (Mstro__Sockaddr *)
-     protobuf_c_message_unpack (&mstro__sockaddr__descriptor,
+  return (Mstro__AddrSockaddr *)
+     protobuf_c_message_unpack (&mstro__addr_sockaddr__descriptor,
                                 allocator, len, data);
 }
-void   mstro__sockaddr__free_unpacked
-                     (Mstro__Sockaddr *message,
+void   mstro__addr_sockaddr__free_unpacked
+                     (Mstro__AddrSockaddr *message,
                       ProtobufCAllocator *allocator)
 {
   if(!message)
     return;
-  assert(message->base.descriptor == &mstro__sockaddr__descriptor);
+  assert(message->base.descriptor == &mstro__addr_sockaddr__descriptor);
   protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
 }
-void   mstro__sockaddr_in4__init
-                     (Mstro__SockaddrIn4         *message)
+void   mstro__addr_sockaddr__in4__init
+                     (Mstro__AddrSockaddrIN4         *message)
 {
-  static const Mstro__SockaddrIn4 init_value = MSTRO__SOCKADDR_IN4__INIT;
+  static const Mstro__AddrSockaddrIN4 init_value = MSTRO__ADDR_SOCKADDR__IN4__INIT;
   *message = init_value;
 }
-size_t mstro__sockaddr_in4__get_packed_size
-                     (const Mstro__SockaddrIn4 *message)
+size_t mstro__addr_sockaddr__in4__get_packed_size
+                     (const Mstro__AddrSockaddrIN4 *message)
 {
-  assert(message->base.descriptor == &mstro__sockaddr_in4__descriptor);
+  assert(message->base.descriptor == &mstro__addr_sockaddr__in4__descriptor);
   return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
 }
-size_t mstro__sockaddr_in4__pack
-                     (const Mstro__SockaddrIn4 *message,
+size_t mstro__addr_sockaddr__in4__pack
+                     (const Mstro__AddrSockaddrIN4 *message,
                       uint8_t       *out)
 {
-  assert(message->base.descriptor == &mstro__sockaddr_in4__descriptor);
+  assert(message->base.descriptor == &mstro__addr_sockaddr__in4__descriptor);
   return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
 }
-size_t mstro__sockaddr_in4__pack_to_buffer
-                     (const Mstro__SockaddrIn4 *message,
+size_t mstro__addr_sockaddr__in4__pack_to_buffer
+                     (const Mstro__AddrSockaddrIN4 *message,
                       ProtobufCBuffer *buffer)
 {
-  assert(message->base.descriptor == &mstro__sockaddr_in4__descriptor);
+  assert(message->base.descriptor == &mstro__addr_sockaddr__in4__descriptor);
   return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
 }
-Mstro__SockaddrIn4 *
-       mstro__sockaddr_in4__unpack
+Mstro__AddrSockaddrIN4 *
+       mstro__addr_sockaddr__in4__unpack
                      (ProtobufCAllocator  *allocator,
                       size_t               len,
                       const uint8_t       *data)
 {
-  return (Mstro__SockaddrIn4 *)
-     protobuf_c_message_unpack (&mstro__sockaddr_in4__descriptor,
+  return (Mstro__AddrSockaddrIN4 *)
+     protobuf_c_message_unpack (&mstro__addr_sockaddr__in4__descriptor,
                                 allocator, len, data);
 }
-void   mstro__sockaddr_in4__free_unpacked
-                     (Mstro__SockaddrIn4 *message,
+void   mstro__addr_sockaddr__in4__free_unpacked
+                     (Mstro__AddrSockaddrIN4 *message,
                       ProtobufCAllocator *allocator)
 {
   if(!message)
     return;
-  assert(message->base.descriptor == &mstro__sockaddr_in4__descriptor);
+  assert(message->base.descriptor == &mstro__addr_sockaddr__in4__descriptor);
   protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
 }
-void   mstro__sockaddr_in6__init
-                     (Mstro__SockaddrIn6         *message)
+void   mstro__addr_sockaddr__in6__init
+                     (Mstro__AddrSockaddrIN6         *message)
 {
-  static const Mstro__SockaddrIn6 init_value = MSTRO__SOCKADDR_IN6__INIT;
+  static const Mstro__AddrSockaddrIN6 init_value = MSTRO__ADDR_SOCKADDR__IN6__INIT;
   *message = init_value;
 }
-size_t mstro__sockaddr_in6__get_packed_size
-                     (const Mstro__SockaddrIn6 *message)
+size_t mstro__addr_sockaddr__in6__get_packed_size
+                     (const Mstro__AddrSockaddrIN6 *message)
 {
-  assert(message->base.descriptor == &mstro__sockaddr_in6__descriptor);
+  assert(message->base.descriptor == &mstro__addr_sockaddr__in6__descriptor);
   return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
 }
-size_t mstro__sockaddr_in6__pack
-                     (const Mstro__SockaddrIn6 *message,
+size_t mstro__addr_sockaddr__in6__pack
+                     (const Mstro__AddrSockaddrIN6 *message,
                       uint8_t       *out)
 {
-  assert(message->base.descriptor == &mstro__sockaddr_in6__descriptor);
+  assert(message->base.descriptor == &mstro__addr_sockaddr__in6__descriptor);
   return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
 }
-size_t mstro__sockaddr_in6__pack_to_buffer
-                     (const Mstro__SockaddrIn6 *message,
+size_t mstro__addr_sockaddr__in6__pack_to_buffer
+                     (const Mstro__AddrSockaddrIN6 *message,
                       ProtobufCBuffer *buffer)
 {
-  assert(message->base.descriptor == &mstro__sockaddr_in6__descriptor);
+  assert(message->base.descriptor == &mstro__addr_sockaddr__in6__descriptor);
   return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
 }
-Mstro__SockaddrIn6 *
-       mstro__sockaddr_in6__unpack
+Mstro__AddrSockaddrIN6 *
+       mstro__addr_sockaddr__in6__unpack
                      (ProtobufCAllocator  *allocator,
                       size_t               len,
                       const uint8_t       *data)
 {
-  return (Mstro__SockaddrIn6 *)
-     protobuf_c_message_unpack (&mstro__sockaddr_in6__descriptor,
+  return (Mstro__AddrSockaddrIN6 *)
+     protobuf_c_message_unpack (&mstro__addr_sockaddr__in6__descriptor,
                                 allocator, len, data);
 }
-void   mstro__sockaddr_in6__free_unpacked
-                     (Mstro__SockaddrIn6 *message,
+void   mstro__addr_sockaddr__in6__free_unpacked
+                     (Mstro__AddrSockaddrIN6 *message,
                       ProtobufCAllocator *allocator)
 {
   if(!message)
     return;
-  assert(message->base.descriptor == &mstro__sockaddr_in6__descriptor);
+  assert(message->base.descriptor == &mstro__addr_sockaddr__in6__descriptor);
   protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
 }
-void   mstro__sockaddr_ib__init
-                     (Mstro__SockaddrIb         *message)
+void   mstro__addr_sockaddr__ib__init
+                     (Mstro__AddrSockaddrIB         *message)
 {
-  static const Mstro__SockaddrIb init_value = MSTRO__SOCKADDR_IB__INIT;
+  static const Mstro__AddrSockaddrIB init_value = MSTRO__ADDR_SOCKADDR__IB__INIT;
   *message = init_value;
 }
-size_t mstro__sockaddr_ib__get_packed_size
-                     (const Mstro__SockaddrIb *message)
+size_t mstro__addr_sockaddr__ib__get_packed_size
+                     (const Mstro__AddrSockaddrIB *message)
 {
-  assert(message->base.descriptor == &mstro__sockaddr_ib__descriptor);
+  assert(message->base.descriptor == &mstro__addr_sockaddr__ib__descriptor);
   return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
 }
-size_t mstro__sockaddr_ib__pack
-                     (const Mstro__SockaddrIb *message,
+size_t mstro__addr_sockaddr__ib__pack
+                     (const Mstro__AddrSockaddrIB *message,
                       uint8_t       *out)
 {
-  assert(message->base.descriptor == &mstro__sockaddr_ib__descriptor);
+  assert(message->base.descriptor == &mstro__addr_sockaddr__ib__descriptor);
   return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
 }
-size_t mstro__sockaddr_ib__pack_to_buffer
-                     (const Mstro__SockaddrIb *message,
+size_t mstro__addr_sockaddr__ib__pack_to_buffer
+                     (const Mstro__AddrSockaddrIB *message,
                       ProtobufCBuffer *buffer)
 {
-  assert(message->base.descriptor == &mstro__sockaddr_ib__descriptor);
+  assert(message->base.descriptor == &mstro__addr_sockaddr__ib__descriptor);
   return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
 }
-Mstro__SockaddrIb *
-       mstro__sockaddr_ib__unpack
+Mstro__AddrSockaddrIB *
+       mstro__addr_sockaddr__ib__unpack
                      (ProtobufCAllocator  *allocator,
                       size_t               len,
                       const uint8_t       *data)
 {
-  return (Mstro__SockaddrIb *)
-     protobuf_c_message_unpack (&mstro__sockaddr_ib__descriptor,
+  return (Mstro__AddrSockaddrIB *)
+     protobuf_c_message_unpack (&mstro__addr_sockaddr__ib__descriptor,
                                 allocator, len, data);
 }
-void   mstro__sockaddr_ib__free_unpacked
-                     (Mstro__SockaddrIb *message,
+void   mstro__addr_sockaddr__ib__free_unpacked
+                     (Mstro__AddrSockaddrIB *message,
                       ProtobufCAllocator *allocator)
 {
   if(!message)
     return;
-  assert(message->base.descriptor == &mstro__sockaddr_ib__descriptor);
+  assert(message->base.descriptor == &mstro__addr_sockaddr__ib__descriptor);
   protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
 }
-void   mstro__gni__init
-                     (Mstro__Gni         *message)
+void   mstro__addr_gni__init
+                     (Mstro__AddrGNI         *message)
 {
-  static const Mstro__Gni init_value = MSTRO__GNI__INIT;
+  static const Mstro__AddrGNI init_value = MSTRO__ADDR_GNI__INIT;
   *message = init_value;
 }
-size_t mstro__gni__get_packed_size
-                     (const Mstro__Gni *message)
+size_t mstro__addr_gni__get_packed_size
+                     (const Mstro__AddrGNI *message)
 {
-  assert(message->base.descriptor == &mstro__gni__descriptor);
+  assert(message->base.descriptor == &mstro__addr_gni__descriptor);
   return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
 }
-size_t mstro__gni__pack
-                     (const Mstro__Gni *message,
+size_t mstro__addr_gni__pack
+                     (const Mstro__AddrGNI *message,
                       uint8_t       *out)
 {
-  assert(message->base.descriptor == &mstro__gni__descriptor);
+  assert(message->base.descriptor == &mstro__addr_gni__descriptor);
   return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
 }
-size_t mstro__gni__pack_to_buffer
-                     (const Mstro__Gni *message,
+size_t mstro__addr_gni__pack_to_buffer
+                     (const Mstro__AddrGNI *message,
                       ProtobufCBuffer *buffer)
 {
-  assert(message->base.descriptor == &mstro__gni__descriptor);
+  assert(message->base.descriptor == &mstro__addr_gni__descriptor);
   return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
 }
-Mstro__Gni *
-       mstro__gni__unpack
+Mstro__AddrGNI *
+       mstro__addr_gni__unpack
                      (ProtobufCAllocator  *allocator,
                       size_t               len,
                       const uint8_t       *data)
 {
-  return (Mstro__Gni *)
-     protobuf_c_message_unpack (&mstro__gni__descriptor,
+  return (Mstro__AddrGNI *)
+     protobuf_c_message_unpack (&mstro__addr_gni__descriptor,
                                 allocator, len, data);
 }
-void   mstro__gni__free_unpacked
-                     (Mstro__Gni *message,
+void   mstro__addr_gni__free_unpacked
+                     (Mstro__AddrGNI *message,
                       ProtobufCAllocator *allocator)
 {
   if(!message)
     return;
-  assert(message->base.descriptor == &mstro__gni__descriptor);
+  assert(message->base.descriptor == &mstro__addr_gni__descriptor);
   protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
 }
-void   mstro__psmx2__init
-                     (Mstro__Psmx2         *message)
+void   mstro__addr_psmx2__init
+                     (Mstro__AddrPSMX2         *message)
 {
-  static const Mstro__Psmx2 init_value = MSTRO__PSMX2__INIT;
+  static const Mstro__AddrPSMX2 init_value = MSTRO__ADDR_PSMX2__INIT;
   *message = init_value;
 }
-size_t mstro__psmx2__get_packed_size
-                     (const Mstro__Psmx2 *message)
+size_t mstro__addr_psmx2__get_packed_size
+                     (const Mstro__AddrPSMX2 *message)
 {
-  assert(message->base.descriptor == &mstro__psmx2__descriptor);
+  assert(message->base.descriptor == &mstro__addr_psmx2__descriptor);
   return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
 }
-size_t mstro__psmx2__pack
-                     (const Mstro__Psmx2 *message,
+size_t mstro__addr_psmx2__pack
+                     (const Mstro__AddrPSMX2 *message,
                       uint8_t       *out)
 {
-  assert(message->base.descriptor == &mstro__psmx2__descriptor);
+  assert(message->base.descriptor == &mstro__addr_psmx2__descriptor);
   return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
 }
-size_t mstro__psmx2__pack_to_buffer
-                     (const Mstro__Psmx2 *message,
+size_t mstro__addr_psmx2__pack_to_buffer
+                     (const Mstro__AddrPSMX2 *message,
                       ProtobufCBuffer *buffer)
 {
-  assert(message->base.descriptor == &mstro__psmx2__descriptor);
+  assert(message->base.descriptor == &mstro__addr_psmx2__descriptor);
   return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
 }
-Mstro__Psmx2 *
-       mstro__psmx2__unpack
+Mstro__AddrPSMX2 *
+       mstro__addr_psmx2__unpack
                      (ProtobufCAllocator  *allocator,
                       size_t               len,
                       const uint8_t       *data)
 {
-  return (Mstro__Psmx2 *)
-     protobuf_c_message_unpack (&mstro__psmx2__descriptor,
+  return (Mstro__AddrPSMX2 *)
+     protobuf_c_message_unpack (&mstro__addr_psmx2__descriptor,
                                 allocator, len, data);
 }
-void   mstro__psmx2__free_unpacked
-                     (Mstro__Psmx2 *message,
+void   mstro__addr_psmx2__free_unpacked
+                     (Mstro__AddrPSMX2 *message,
                       ProtobufCAllocator *allocator)
 {
   if(!message)
     return;
-  assert(message->base.descriptor == &mstro__psmx2__descriptor);
+  assert(message->base.descriptor == &mstro__addr_psmx2__descriptor);
   protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
 }
-void   mstro__ib_ub__init
-                     (Mstro__IbUb         *message)
+void   mstro__addr_psmx3__init
+                     (Mstro__AddrPSMX3         *message)
 {
-  static const Mstro__IbUb init_value = MSTRO__IB_UB__INIT;
+  static const Mstro__AddrPSMX3 init_value = MSTRO__ADDR_PSMX3__INIT;
   *message = init_value;
 }
-size_t mstro__ib_ub__get_packed_size
-                     (const Mstro__IbUb *message)
+size_t mstro__addr_psmx3__get_packed_size
+                     (const Mstro__AddrPSMX3 *message)
 {
-  assert(message->base.descriptor == &mstro__ib_ub__descriptor);
+  assert(message->base.descriptor == &mstro__addr_psmx3__descriptor);
   return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
 }
-size_t mstro__ib_ub__pack
-                     (const Mstro__IbUb *message,
+size_t mstro__addr_psmx3__pack
+                     (const Mstro__AddrPSMX3 *message,
                       uint8_t       *out)
 {
-  assert(message->base.descriptor == &mstro__ib_ub__descriptor);
+  assert(message->base.descriptor == &mstro__addr_psmx3__descriptor);
   return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
 }
-size_t mstro__ib_ub__pack_to_buffer
-                     (const Mstro__IbUb *message,
+size_t mstro__addr_psmx3__pack_to_buffer
+                     (const Mstro__AddrPSMX3 *message,
                       ProtobufCBuffer *buffer)
 {
-  assert(message->base.descriptor == &mstro__ib_ub__descriptor);
+  assert(message->base.descriptor == &mstro__addr_psmx3__descriptor);
   return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
 }
-Mstro__IbUb *
-       mstro__ib_ub__unpack
+Mstro__AddrPSMX3 *
+       mstro__addr_psmx3__unpack
                      (ProtobufCAllocator  *allocator,
                       size_t               len,
                       const uint8_t       *data)
 {
-  return (Mstro__IbUb *)
-     protobuf_c_message_unpack (&mstro__ib_ub__descriptor,
+  return (Mstro__AddrPSMX3 *)
+     protobuf_c_message_unpack (&mstro__addr_psmx3__descriptor,
                                 allocator, len, data);
 }
-void   mstro__ib_ub__free_unpacked
-                     (Mstro__IbUb *message,
+void   mstro__addr_psmx3__free_unpacked
+                     (Mstro__AddrPSMX3 *message,
                       ProtobufCAllocator *allocator)
 {
   if(!message)
     return;
-  assert(message->base.descriptor == &mstro__ib_ub__descriptor);
+  assert(message->base.descriptor == &mstro__addr_psmx3__descriptor);
   protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
 }
-void   mstro__ofi__init
-                     (Mstro__Ofi         *message)
+void   mstro__addr_ib__ud__init
+                     (Mstro__AddrIBUD         *message)
 {
-  static const Mstro__Ofi init_value = MSTRO__OFI__INIT;
+  static const Mstro__AddrIBUD init_value = MSTRO__ADDR_IB__UD__INIT;
   *message = init_value;
 }
-size_t mstro__ofi__get_packed_size
-                     (const Mstro__Ofi *message)
+size_t mstro__addr_ib__ud__get_packed_size
+                     (const Mstro__AddrIBUD *message)
 {
-  assert(message->base.descriptor == &mstro__ofi__descriptor);
+  assert(message->base.descriptor == &mstro__addr_ib__ud__descriptor);
   return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
 }
-size_t mstro__ofi__pack
-                     (const Mstro__Ofi *message,
+size_t mstro__addr_ib__ud__pack
+                     (const Mstro__AddrIBUD *message,
                       uint8_t       *out)
 {
-  assert(message->base.descriptor == &mstro__ofi__descriptor);
+  assert(message->base.descriptor == &mstro__addr_ib__ud__descriptor);
   return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
 }
-size_t mstro__ofi__pack_to_buffer
-                     (const Mstro__Ofi *message,
+size_t mstro__addr_ib__ud__pack_to_buffer
+                     (const Mstro__AddrIBUD *message,
                       ProtobufCBuffer *buffer)
 {
-  assert(message->base.descriptor == &mstro__ofi__descriptor);
+  assert(message->base.descriptor == &mstro__addr_ib__ud__descriptor);
   return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
 }
-Mstro__Ofi *
-       mstro__ofi__unpack
+Mstro__AddrIBUD *
+       mstro__addr_ib__ud__unpack
                      (ProtobufCAllocator  *allocator,
                       size_t               len,
                       const uint8_t       *data)
 {
-  return (Mstro__Ofi *)
-     protobuf_c_message_unpack (&mstro__ofi__descriptor,
+  return (Mstro__AddrIBUD *)
+     protobuf_c_message_unpack (&mstro__addr_ib__ud__descriptor,
                                 allocator, len, data);
 }
-void   mstro__ofi__free_unpacked
-                     (Mstro__Ofi *message,
+void   mstro__addr_ib__ud__free_unpacked
+                     (Mstro__AddrIBUD *message,
                       ProtobufCAllocator *allocator)
 {
   if(!message)
     return;
-  assert(message->base.descriptor == &mstro__ofi__descriptor);
+  assert(message->base.descriptor == &mstro__addr_ib__ud__descriptor);
+  protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
+}
+void   mstro__ofi_addr__init
+                     (Mstro__OfiAddr         *message)
+{
+  static const Mstro__OfiAddr init_value = MSTRO__OFI_ADDR__INIT;
+  *message = init_value;
+}
+size_t mstro__ofi_addr__get_packed_size
+                     (const Mstro__OfiAddr *message)
+{
+  assert(message->base.descriptor == &mstro__ofi_addr__descriptor);
+  return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
+}
+size_t mstro__ofi_addr__pack
+                     (const Mstro__OfiAddr *message,
+                      uint8_t       *out)
+{
+  assert(message->base.descriptor == &mstro__ofi_addr__descriptor);
+  return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
+}
+size_t mstro__ofi_addr__pack_to_buffer
+                     (const Mstro__OfiAddr *message,
+                      ProtobufCBuffer *buffer)
+{
+  assert(message->base.descriptor == &mstro__ofi_addr__descriptor);
+  return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
+}
+Mstro__OfiAddr *
+       mstro__ofi_addr__unpack
+                     (ProtobufCAllocator  *allocator,
+                      size_t               len,
+                      const uint8_t       *data)
+{
+  return (Mstro__OfiAddr *)
+     protobuf_c_message_unpack (&mstro__ofi_addr__descriptor,
+                                allocator, len, data);
+}
+void   mstro__ofi_addr__free_unpacked
+                     (Mstro__OfiAddr *message,
+                      ProtobufCAllocator *allocator)
+{
+  if(!message)
+    return;
+  assert(message->base.descriptor == &mstro__ofi_addr__descriptor);
   protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
 }
 void   mstro__endpoint__init
@@ -457,94 +502,139 @@ void   mstro__ofi_memory_region__free_unpacked
   assert(message->base.descriptor == &mstro__ofi_memory_region__descriptor);
   protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
 }
-void   mstro__pm__init
-                     (Mstro__PM         *message)
+void   mstro__cred_drc__init
+                     (Mstro__CredDRC         *message)
 {
-  static const Mstro__PM init_value = MSTRO__PM__INIT;
+  static const Mstro__CredDRC init_value = MSTRO__CRED_DRC__INIT;
   *message = init_value;
 }
-size_t mstro__pm__get_packed_size
-                     (const Mstro__PM *message)
+size_t mstro__cred_drc__get_packed_size
+                     (const Mstro__CredDRC *message)
 {
-  assert(message->base.descriptor == &mstro__pm__descriptor);
+  assert(message->base.descriptor == &mstro__cred_drc__descriptor);
   return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
 }
-size_t mstro__pm__pack
-                     (const Mstro__PM *message,
+size_t mstro__cred_drc__pack
+                     (const Mstro__CredDRC *message,
                       uint8_t       *out)
 {
-  assert(message->base.descriptor == &mstro__pm__descriptor);
+  assert(message->base.descriptor == &mstro__cred_drc__descriptor);
   return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
 }
-size_t mstro__pm__pack_to_buffer
-                     (const Mstro__PM *message,
+size_t mstro__cred_drc__pack_to_buffer
+                     (const Mstro__CredDRC *message,
                       ProtobufCBuffer *buffer)
 {
-  assert(message->base.descriptor == &mstro__pm__descriptor);
+  assert(message->base.descriptor == &mstro__cred_drc__descriptor);
   return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
 }
-Mstro__PM *
-       mstro__pm__unpack
+Mstro__CredDRC *
+       mstro__cred_drc__unpack
                      (ProtobufCAllocator  *allocator,
                       size_t               len,
                       const uint8_t       *data)
 {
-  return (Mstro__PM *)
-     protobuf_c_message_unpack (&mstro__pm__descriptor,
+  return (Mstro__CredDRC *)
+     protobuf_c_message_unpack (&mstro__cred_drc__descriptor,
                                 allocator, len, data);
 }
-void   mstro__pm__free_unpacked
-                     (Mstro__PM *message,
+void   mstro__cred_drc__free_unpacked
+                     (Mstro__CredDRC *message,
                       ProtobufCAllocator *allocator)
 {
   if(!message)
     return;
-  assert(message->base.descriptor == &mstro__pm__descriptor);
+  assert(message->base.descriptor == &mstro__cred_drc__descriptor);
   protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
 }
-void   mstro__pm_info__init
-                     (Mstro__PmInfo         *message)
+void   mstro__ofi_credential__init
+                     (Mstro__OfiCredential         *message)
 {
-  static const Mstro__PmInfo init_value = MSTRO__PM_INFO__INIT;
+  static const Mstro__OfiCredential init_value = MSTRO__OFI_CREDENTIAL__INIT;
   *message = init_value;
 }
-size_t mstro__pm_info__get_packed_size
-                     (const Mstro__PmInfo *message)
+size_t mstro__ofi_credential__get_packed_size
+                     (const Mstro__OfiCredential *message)
 {
-  assert(message->base.descriptor == &mstro__pm_info__descriptor);
+  assert(message->base.descriptor == &mstro__ofi_credential__descriptor);
   return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
 }
-size_t mstro__pm_info__pack
-                     (const Mstro__PmInfo *message,
+size_t mstro__ofi_credential__pack
+                     (const Mstro__OfiCredential *message,
                       uint8_t       *out)
 {
-  assert(message->base.descriptor == &mstro__pm_info__descriptor);
+  assert(message->base.descriptor == &mstro__ofi_credential__descriptor);
   return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
 }
-size_t mstro__pm_info__pack_to_buffer
-                     (const Mstro__PmInfo *message,
+size_t mstro__ofi_credential__pack_to_buffer
+                     (const Mstro__OfiCredential *message,
                       ProtobufCBuffer *buffer)
 {
-  assert(message->base.descriptor == &mstro__pm_info__descriptor);
+  assert(message->base.descriptor == &mstro__ofi_credential__descriptor);
   return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
 }
-Mstro__PmInfo *
-       mstro__pm_info__unpack
+Mstro__OfiCredential *
+       mstro__ofi_credential__unpack
                      (ProtobufCAllocator  *allocator,
                       size_t               len,
                       const uint8_t       *data)
 {
-  return (Mstro__PmInfo *)
-     protobuf_c_message_unpack (&mstro__pm_info__descriptor,
+  return (Mstro__OfiCredential *)
+     protobuf_c_message_unpack (&mstro__ofi_credential__descriptor,
                                 allocator, len, data);
 }
-void   mstro__pm_info__free_unpacked
-                     (Mstro__PmInfo *message,
+void   mstro__ofi_credential__free_unpacked
+                     (Mstro__OfiCredential *message,
                       ProtobufCAllocator *allocator)
 {
   if(!message)
     return;
-  assert(message->base.descriptor == &mstro__pm_info__descriptor);
+  assert(message->base.descriptor == &mstro__ofi_credential__descriptor);
+  protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
+}
+void   mstro__endpoint_list__init
+                     (Mstro__EndpointList         *message)
+{
+  static const Mstro__EndpointList init_value = MSTRO__ENDPOINT_LIST__INIT;
+  *message = init_value;
+}
+size_t mstro__endpoint_list__get_packed_size
+                     (const Mstro__EndpointList *message)
+{
+  assert(message->base.descriptor == &mstro__endpoint_list__descriptor);
+  return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
+}
+size_t mstro__endpoint_list__pack
+                     (const Mstro__EndpointList *message,
+                      uint8_t       *out)
+{
+  assert(message->base.descriptor == &mstro__endpoint_list__descriptor);
+  return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
+}
+size_t mstro__endpoint_list__pack_to_buffer
+                     (const Mstro__EndpointList *message,
+                      ProtobufCBuffer *buffer)
+{
+  assert(message->base.descriptor == &mstro__endpoint_list__descriptor);
+  return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
+}
+Mstro__EndpointList *
+       mstro__endpoint_list__unpack
+                     (ProtobufCAllocator  *allocator,
+                      size_t               len,
+                      const uint8_t       *data)
+{
+  return (Mstro__EndpointList *)
+     protobuf_c_message_unpack (&mstro__endpoint_list__descriptor,
+                                allocator, len, data);
+}
+void   mstro__endpoint_list__free_unpacked
+                     (Mstro__EndpointList *message,
+                      ProtobufCAllocator *allocator)
+{
+  if(!message)
+    return;
+  assert(message->base.descriptor == &mstro__endpoint_list__descriptor);
   protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
 }
 void   mstro__app_info__init
@@ -592,7 +682,7 @@ void   mstro__app_info__free_unpacked
   assert(message->base.descriptor == &mstro__app_info__descriptor);
   protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
 }
-static const ProtobufCFieldDescriptor mstro__sockaddr__field_descriptors[2] =
+static const ProtobufCFieldDescriptor mstro__addr_sockaddr__field_descriptors[2] =
 {
   {
     "sa_family",
@@ -600,7 +690,7 @@ static const ProtobufCFieldDescriptor mstro__sockaddr__field_descriptors[2] =
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
     0,   /* quantifier_offset */
-    offsetof(Mstro__Sockaddr, sa_family),
+    offsetof(Mstro__AddrSockaddr, sa_family),
     NULL,
     NULL,
     0,             /* flags */
@@ -612,38 +702,38 @@ static const ProtobufCFieldDescriptor mstro__sockaddr__field_descriptors[2] =
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_BYTES,
     0,   /* quantifier_offset */
-    offsetof(Mstro__Sockaddr, sa_data),
+    offsetof(Mstro__AddrSockaddr, sa_data),
     NULL,
     NULL,
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
 };
-static const unsigned mstro__sockaddr__field_indices_by_name[] = {
+static const unsigned mstro__addr_sockaddr__field_indices_by_name[] = {
   1,   /* field[1] = sa_data */
   0,   /* field[0] = sa_family */
 };
-static const ProtobufCIntRange mstro__sockaddr__number_ranges[1 + 1] =
+static const ProtobufCIntRange mstro__addr_sockaddr__number_ranges[1 + 1] =
 {
   { 1, 0 },
   { 0, 2 }
 };
-const ProtobufCMessageDescriptor mstro__sockaddr__descriptor =
+const ProtobufCMessageDescriptor mstro__addr_sockaddr__descriptor =
 {
   PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-  "mstro.sockaddr",
-  "Sockaddr",
-  "Mstro__Sockaddr",
+  "mstro.AddrSockaddr",
+  "AddrSockaddr",
+  "Mstro__AddrSockaddr",
   "mstro",
-  sizeof(Mstro__Sockaddr),
+  sizeof(Mstro__AddrSockaddr),
   2,
-  mstro__sockaddr__field_descriptors,
-  mstro__sockaddr__field_indices_by_name,
-  1,  mstro__sockaddr__number_ranges,
-  (ProtobufCMessageInit) mstro__sockaddr__init,
+  mstro__addr_sockaddr__field_descriptors,
+  mstro__addr_sockaddr__field_indices_by_name,
+  1,  mstro__addr_sockaddr__number_ranges,
+  (ProtobufCMessageInit) mstro__addr_sockaddr__init,
   NULL,NULL,NULL    /* reserved[123] */
 };
-static const ProtobufCFieldDescriptor mstro__sockaddr_in4__field_descriptors[3] =
+static const ProtobufCFieldDescriptor mstro__addr_sockaddr__in4__field_descriptors[3] =
 {
   {
     "sin_family",
@@ -651,7 +741,7 @@ static const ProtobufCFieldDescriptor mstro__sockaddr_in4__field_descriptors[3]
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
     0,   /* quantifier_offset */
-    offsetof(Mstro__SockaddrIn4, sin_family),
+    offsetof(Mstro__AddrSockaddrIN4, sin_family),
     NULL,
     NULL,
     0,             /* flags */
@@ -663,7 +753,7 @@ static const ProtobufCFieldDescriptor mstro__sockaddr_in4__field_descriptors[3]
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_UINT32,
     0,   /* quantifier_offset */
-    offsetof(Mstro__SockaddrIn4, sin_port),
+    offsetof(Mstro__AddrSockaddrIN4, sin_port),
     NULL,
     NULL,
     0,             /* flags */
@@ -675,39 +765,39 @@ static const ProtobufCFieldDescriptor mstro__sockaddr_in4__field_descriptors[3]
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED32,
     0,   /* quantifier_offset */
-    offsetof(Mstro__SockaddrIn4, sin_addr),
+    offsetof(Mstro__AddrSockaddrIN4, sin_addr),
     NULL,
     NULL,
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
 };
-static const unsigned mstro__sockaddr_in4__field_indices_by_name[] = {
+static const unsigned mstro__addr_sockaddr__in4__field_indices_by_name[] = {
   2,   /* field[2] = sin_addr */
   0,   /* field[0] = sin_family */
   1,   /* field[1] = sin_port */
 };
-static const ProtobufCIntRange mstro__sockaddr_in4__number_ranges[1 + 1] =
+static const ProtobufCIntRange mstro__addr_sockaddr__in4__number_ranges[1 + 1] =
 {
   { 1, 0 },
   { 0, 3 }
 };
-const ProtobufCMessageDescriptor mstro__sockaddr_in4__descriptor =
+const ProtobufCMessageDescriptor mstro__addr_sockaddr__in4__descriptor =
 {
   PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-  "mstro.sockaddr_in4",
-  "SockaddrIn4",
-  "Mstro__SockaddrIn4",
+  "mstro.AddrSockaddr_IN4",
+  "AddrSockaddrIN4",
+  "Mstro__AddrSockaddrIN4",
   "mstro",
-  sizeof(Mstro__SockaddrIn4),
+  sizeof(Mstro__AddrSockaddrIN4),
   3,
-  mstro__sockaddr_in4__field_descriptors,
-  mstro__sockaddr_in4__field_indices_by_name,
-  1,  mstro__sockaddr_in4__number_ranges,
-  (ProtobufCMessageInit) mstro__sockaddr_in4__init,
+  mstro__addr_sockaddr__in4__field_descriptors,
+  mstro__addr_sockaddr__in4__field_indices_by_name,
+  1,  mstro__addr_sockaddr__in4__number_ranges,
+  (ProtobufCMessageInit) mstro__addr_sockaddr__in4__init,
   NULL,NULL,NULL    /* reserved[123] */
 };
-static const ProtobufCFieldDescriptor mstro__sockaddr_in6__field_descriptors[6] =
+static const ProtobufCFieldDescriptor mstro__addr_sockaddr__in6__field_descriptors[6] =
 {
   {
     "sin6_family",
@@ -715,7 +805,7 @@ static const ProtobufCFieldDescriptor mstro__sockaddr_in6__field_descriptors[6]
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
     0,   /* quantifier_offset */
-    offsetof(Mstro__SockaddrIn6, sin6_family),
+    offsetof(Mstro__AddrSockaddrIN6, sin6_family),
     NULL,
     NULL,
     0,             /* flags */
@@ -727,7 +817,7 @@ static const ProtobufCFieldDescriptor mstro__sockaddr_in6__field_descriptors[6]
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_UINT32,
     0,   /* quantifier_offset */
-    offsetof(Mstro__SockaddrIn6, sin6_port),
+    offsetof(Mstro__AddrSockaddrIN6, sin6_port),
     NULL,
     NULL,
     0,             /* flags */
@@ -739,7 +829,7 @@ static const ProtobufCFieldDescriptor mstro__sockaddr_in6__field_descriptors[6]
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED32,
     0,   /* quantifier_offset */
-    offsetof(Mstro__SockaddrIn6, sin6_flowinfo),
+    offsetof(Mstro__AddrSockaddrIN6, sin6_flowinfo),
     NULL,
     NULL,
     0,             /* flags */
@@ -751,7 +841,7 @@ static const ProtobufCFieldDescriptor mstro__sockaddr_in6__field_descriptors[6]
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
     0,   /* quantifier_offset */
-    offsetof(Mstro__SockaddrIn6, sin6_addr_0),
+    offsetof(Mstro__AddrSockaddrIN6, sin6_addr_0),
     NULL,
     NULL,
     0,             /* flags */
@@ -763,7 +853,7 @@ static const ProtobufCFieldDescriptor mstro__sockaddr_in6__field_descriptors[6]
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
     0,   /* quantifier_offset */
-    offsetof(Mstro__SockaddrIn6, sin6_addr_1),
+    offsetof(Mstro__AddrSockaddrIN6, sin6_addr_1),
     NULL,
     NULL,
     0,             /* flags */
@@ -775,14 +865,14 @@ static const ProtobufCFieldDescriptor mstro__sockaddr_in6__field_descriptors[6]
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED32,
     0,   /* quantifier_offset */
-    offsetof(Mstro__SockaddrIn6, sin6_scope_id),
+    offsetof(Mstro__AddrSockaddrIN6, sin6_scope_id),
     NULL,
     NULL,
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
 };
-static const unsigned mstro__sockaddr_in6__field_indices_by_name[] = {
+static const unsigned mstro__addr_sockaddr__in6__field_indices_by_name[] = {
   3,   /* field[3] = sin6_addr_0 */
   4,   /* field[4] = sin6_addr_1 */
   0,   /* field[0] = sin6_family */
@@ -790,27 +880,27 @@ static const unsigned mstro__sockaddr_in6__field_indices_by_name[] = {
   1,   /* field[1] = sin6_port */
   5,   /* field[5] = sin6_scope_id */
 };
-static const ProtobufCIntRange mstro__sockaddr_in6__number_ranges[1 + 1] =
+static const ProtobufCIntRange mstro__addr_sockaddr__in6__number_ranges[1 + 1] =
 {
   { 1, 0 },
   { 0, 6 }
 };
-const ProtobufCMessageDescriptor mstro__sockaddr_in6__descriptor =
+const ProtobufCMessageDescriptor mstro__addr_sockaddr__in6__descriptor =
 {
   PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-  "mstro.sockaddr_in6",
-  "SockaddrIn6",
-  "Mstro__SockaddrIn6",
+  "mstro.AddrSockaddr_IN6",
+  "AddrSockaddrIN6",
+  "Mstro__AddrSockaddrIN6",
   "mstro",
-  sizeof(Mstro__SockaddrIn6),
+  sizeof(Mstro__AddrSockaddrIN6),
   6,
-  mstro__sockaddr_in6__field_descriptors,
-  mstro__sockaddr_in6__field_indices_by_name,
-  1,  mstro__sockaddr_in6__number_ranges,
-  (ProtobufCMessageInit) mstro__sockaddr_in6__init,
+  mstro__addr_sockaddr__in6__field_descriptors,
+  mstro__addr_sockaddr__in6__field_indices_by_name,
+  1,  mstro__addr_sockaddr__in6__number_ranges,
+  (ProtobufCMessageInit) mstro__addr_sockaddr__in6__init,
   NULL,NULL,NULL    /* reserved[123] */
 };
-static const ProtobufCFieldDescriptor mstro__sockaddr_ib__field_descriptors[3] =
+static const ProtobufCFieldDescriptor mstro__addr_sockaddr__ib__field_descriptors[3] =
 {
   {
     "sib_family",
@@ -818,7 +908,7 @@ static const ProtobufCFieldDescriptor mstro__sockaddr_ib__field_descriptors[3] =
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
     0,   /* quantifier_offset */
-    offsetof(Mstro__SockaddrIb, sib_family),
+    offsetof(Mstro__AddrSockaddrIB, sib_family),
     NULL,
     NULL,
     0,             /* flags */
@@ -830,7 +920,7 @@ static const ProtobufCFieldDescriptor mstro__sockaddr_ib__field_descriptors[3] =
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
     0,   /* quantifier_offset */
-    offsetof(Mstro__SockaddrIb, sib_addr_0),
+    offsetof(Mstro__AddrSockaddrIB, sib_addr_0),
     NULL,
     NULL,
     0,             /* flags */
@@ -842,278 +932,329 @@ static const ProtobufCFieldDescriptor mstro__sockaddr_ib__field_descriptors[3] =
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
     0,   /* quantifier_offset */
-    offsetof(Mstro__SockaddrIb, sib_addr_1),
+    offsetof(Mstro__AddrSockaddrIB, sib_addr_1),
     NULL,
     NULL,
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
 };
-static const unsigned mstro__sockaddr_ib__field_indices_by_name[] = {
+static const unsigned mstro__addr_sockaddr__ib__field_indices_by_name[] = {
   1,   /* field[1] = sib_addr_0 */
   2,   /* field[2] = sib_addr_1 */
   0,   /* field[0] = sib_family */
 };
-static const ProtobufCIntRange mstro__sockaddr_ib__number_ranges[1 + 1] =
+static const ProtobufCIntRange mstro__addr_sockaddr__ib__number_ranges[1 + 1] =
 {
   { 1, 0 },
   { 0, 3 }
 };
-const ProtobufCMessageDescriptor mstro__sockaddr_ib__descriptor =
+const ProtobufCMessageDescriptor mstro__addr_sockaddr__ib__descriptor =
 {
   PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-  "mstro.sockaddr_ib",
-  "SockaddrIb",
-  "Mstro__SockaddrIb",
+  "mstro.AddrSockaddr_IB",
+  "AddrSockaddrIB",
+  "Mstro__AddrSockaddrIB",
   "mstro",
-  sizeof(Mstro__SockaddrIb),
+  sizeof(Mstro__AddrSockaddrIB),
   3,
-  mstro__sockaddr_ib__field_descriptors,
-  mstro__sockaddr_ib__field_indices_by_name,
-  1,  mstro__sockaddr_ib__number_ranges,
-  (ProtobufCMessageInit) mstro__sockaddr_ib__init,
+  mstro__addr_sockaddr__ib__field_descriptors,
+  mstro__addr_sockaddr__ib__field_indices_by_name,
+  1,  mstro__addr_sockaddr__ib__number_ranges,
+  (ProtobufCMessageInit) mstro__addr_sockaddr__ib__init,
   NULL,NULL,NULL    /* reserved[123] */
 };
-static const ProtobufCFieldDescriptor mstro__gni__field_descriptors[6] =
+static const ProtobufCFieldDescriptor mstro__addr_gni__field_descriptors[6] =
 {
   {
-    "a1",
+    "a0",
     1,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
     0,   /* quantifier_offset */
-    offsetof(Mstro__Gni, a1),
+    offsetof(Mstro__AddrGNI, a0),
     NULL,
     NULL,
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
   {
-    "a2",
+    "a1",
     2,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
     0,   /* quantifier_offset */
-    offsetof(Mstro__Gni, a2),
+    offsetof(Mstro__AddrGNI, a1),
     NULL,
     NULL,
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
   {
-    "a3",
+    "a2",
     3,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
     0,   /* quantifier_offset */
-    offsetof(Mstro__Gni, a3),
+    offsetof(Mstro__AddrGNI, a2),
     NULL,
     NULL,
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
   {
-    "a4",
+    "a3",
     4,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
     0,   /* quantifier_offset */
-    offsetof(Mstro__Gni, a4),
+    offsetof(Mstro__AddrGNI, a3),
     NULL,
     NULL,
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
   {
-    "a5",
+    "a4",
     5,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
     0,   /* quantifier_offset */
-    offsetof(Mstro__Gni, a5),
+    offsetof(Mstro__AddrGNI, a4),
     NULL,
     NULL,
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
   {
-    "a6",
+    "a5",
     6,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
     0,   /* quantifier_offset */
-    offsetof(Mstro__Gni, a6),
+    offsetof(Mstro__AddrGNI, a5),
     NULL,
     NULL,
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
 };
-static const unsigned mstro__gni__field_indices_by_name[] = {
-  0,   /* field[0] = a1 */
-  1,   /* field[1] = a2 */
-  2,   /* field[2] = a3 */
-  3,   /* field[3] = a4 */
-  4,   /* field[4] = a5 */
-  5,   /* field[5] = a6 */
+static const unsigned mstro__addr_gni__field_indices_by_name[] = {
+  0,   /* field[0] = a0 */
+  1,   /* field[1] = a1 */
+  2,   /* field[2] = a2 */
+  3,   /* field[3] = a3 */
+  4,   /* field[4] = a4 */
+  5,   /* field[5] = a5 */
 };
-static const ProtobufCIntRange mstro__gni__number_ranges[1 + 1] =
+static const ProtobufCIntRange mstro__addr_gni__number_ranges[1 + 1] =
 {
   { 1, 0 },
   { 0, 6 }
 };
-const ProtobufCMessageDescriptor mstro__gni__descriptor =
+const ProtobufCMessageDescriptor mstro__addr_gni__descriptor =
 {
   PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-  "mstro.gni",
-  "Gni",
-  "Mstro__Gni",
+  "mstro.AddrGNI",
+  "AddrGNI",
+  "Mstro__AddrGNI",
   "mstro",
-  sizeof(Mstro__Gni),
+  sizeof(Mstro__AddrGNI),
   6,
-  mstro__gni__field_descriptors,
-  mstro__gni__field_indices_by_name,
-  1,  mstro__gni__number_ranges,
-  (ProtobufCMessageInit) mstro__gni__init,
+  mstro__addr_gni__field_descriptors,
+  mstro__addr_gni__field_indices_by_name,
+  1,  mstro__addr_gni__number_ranges,
+  (ProtobufCMessageInit) mstro__addr_gni__init,
   NULL,NULL,NULL    /* reserved[123] */
 };
-static const ProtobufCFieldDescriptor mstro__psmx2__field_descriptors[2] =
+static const ProtobufCFieldDescriptor mstro__addr_psmx2__field_descriptors[2] =
 {
   {
-    "a1",
+    "a0",
     1,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
     0,   /* quantifier_offset */
-    offsetof(Mstro__Psmx2, a1),
+    offsetof(Mstro__AddrPSMX2, a0),
     NULL,
     NULL,
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
   {
-    "a2",
+    "a1",
     2,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
     0,   /* quantifier_offset */
-    offsetof(Mstro__Psmx2, a2),
+    offsetof(Mstro__AddrPSMX2, a1),
     NULL,
     NULL,
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
 };
-static const unsigned mstro__psmx2__field_indices_by_name[] = {
-  0,   /* field[0] = a1 */
-  1,   /* field[1] = a2 */
+static const unsigned mstro__addr_psmx2__field_indices_by_name[] = {
+  0,   /* field[0] = a0 */
+  1,   /* field[1] = a1 */
 };
-static const ProtobufCIntRange mstro__psmx2__number_ranges[1 + 1] =
+static const ProtobufCIntRange mstro__addr_psmx2__number_ranges[1 + 1] =
 {
   { 1, 0 },
   { 0, 2 }
 };
-const ProtobufCMessageDescriptor mstro__psmx2__descriptor =
+const ProtobufCMessageDescriptor mstro__addr_psmx2__descriptor =
 {
   PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-  "mstro.psmx2",
-  "Psmx2",
-  "Mstro__Psmx2",
+  "mstro.AddrPSMX2",
+  "AddrPSMX2",
+  "Mstro__AddrPSMX2",
   "mstro",
-  sizeof(Mstro__Psmx2),
+  sizeof(Mstro__AddrPSMX2),
   2,
-  mstro__psmx2__field_descriptors,
-  mstro__psmx2__field_indices_by_name,
-  1,  mstro__psmx2__number_ranges,
-  (ProtobufCMessageInit) mstro__psmx2__init,
+  mstro__addr_psmx2__field_descriptors,
+  mstro__addr_psmx2__field_indices_by_name,
+  1,  mstro__addr_psmx2__number_ranges,
+  (ProtobufCMessageInit) mstro__addr_psmx2__init,
   NULL,NULL,NULL    /* reserved[123] */
 };
-static const ProtobufCFieldDescriptor mstro__ib_ub__field_descriptors[4] =
+static const ProtobufCFieldDescriptor mstro__addr_psmx3__field_descriptors[2] =
 {
+  {
+    "a0",
+    1,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_FIXED64,
+    0,   /* quantifier_offset */
+    offsetof(Mstro__AddrPSMX3, a0),
+    NULL,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
   {
     "a1",
+    2,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_FIXED64,
+    0,   /* quantifier_offset */
+    offsetof(Mstro__AddrPSMX3, a1),
+    NULL,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+};
+static const unsigned mstro__addr_psmx3__field_indices_by_name[] = {
+  0,   /* field[0] = a0 */
+  1,   /* field[1] = a1 */
+};
+static const ProtobufCIntRange mstro__addr_psmx3__number_ranges[1 + 1] =
+{
+  { 1, 0 },
+  { 0, 2 }
+};
+const ProtobufCMessageDescriptor mstro__addr_psmx3__descriptor =
+{
+  PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
+  "mstro.AddrPSMX3",
+  "AddrPSMX3",
+  "Mstro__AddrPSMX3",
+  "mstro",
+  sizeof(Mstro__AddrPSMX3),
+  2,
+  mstro__addr_psmx3__field_descriptors,
+  mstro__addr_psmx3__field_indices_by_name,
+  1,  mstro__addr_psmx3__number_ranges,
+  (ProtobufCMessageInit) mstro__addr_psmx3__init,
+  NULL,NULL,NULL    /* reserved[123] */
+};
+static const ProtobufCFieldDescriptor mstro__addr_ib__ud__field_descriptors[4] =
+{
+  {
+    "a0",
     1,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
     0,   /* quantifier_offset */
-    offsetof(Mstro__IbUb, a1),
+    offsetof(Mstro__AddrIBUD, a0),
     NULL,
     NULL,
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
   {
-    "a2",
+    "a1",
     2,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
     0,   /* quantifier_offset */
-    offsetof(Mstro__IbUb, a2),
+    offsetof(Mstro__AddrIBUD, a1),
     NULL,
     NULL,
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
   {
-    "a3",
+    "a2",
     3,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
     0,   /* quantifier_offset */
-    offsetof(Mstro__IbUb, a3),
+    offsetof(Mstro__AddrIBUD, a2),
     NULL,
     NULL,
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
   {
-    "a4",
+    "a3",
     4,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
     0,   /* quantifier_offset */
-    offsetof(Mstro__IbUb, a4),
+    offsetof(Mstro__AddrIBUD, a3),
     NULL,
     NULL,
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
 };
-static const unsigned mstro__ib_ub__field_indices_by_name[] = {
-  0,   /* field[0] = a1 */
-  1,   /* field[1] = a2 */
-  2,   /* field[2] = a3 */
-  3,   /* field[3] = a4 */
+static const unsigned mstro__addr_ib__ud__field_indices_by_name[] = {
+  0,   /* field[0] = a0 */
+  1,   /* field[1] = a1 */
+  2,   /* field[2] = a2 */
+  3,   /* field[3] = a3 */
 };
-static const ProtobufCIntRange mstro__ib_ub__number_ranges[1 + 1] =
+static const ProtobufCIntRange mstro__addr_ib__ud__number_ranges[1 + 1] =
 {
   { 1, 0 },
   { 0, 4 }
 };
-const ProtobufCMessageDescriptor mstro__ib_ub__descriptor =
+const ProtobufCMessageDescriptor mstro__addr_ib__ud__descriptor =
 {
   PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-  "mstro.ib_ub",
-  "IbUb",
-  "Mstro__IbUb",
+  "mstro.AddrIB_UD",
+  "AddrIBUD",
+  "Mstro__AddrIBUD",
   "mstro",
-  sizeof(Mstro__IbUb),
+  sizeof(Mstro__AddrIBUD),
   4,
-  mstro__ib_ub__field_descriptors,
-  mstro__ib_ub__field_indices_by_name,
-  1,  mstro__ib_ub__number_ranges,
-  (ProtobufCMessageInit) mstro__ib_ub__init,
+  mstro__addr_ib__ud__field_descriptors,
+  mstro__addr_ib__ud__field_indices_by_name,
+  1,  mstro__addr_ib__ud__number_ranges,
+  (ProtobufCMessageInit) mstro__addr_ib__ud__init,
   NULL,NULL,NULL    /* reserved[123] */
 };
-static const ProtobufCFieldDescriptor mstro__ofi__field_descriptors[13] =
+static const ProtobufCFieldDescriptor mstro__ofi_addr__field_descriptors[14] =
 {
   {
     "unspec",
     1,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_BYTES,
-    offsetof(Mstro__Ofi, val_case),
-    offsetof(Mstro__Ofi, unspec),
+    offsetof(Mstro__OfiAddr, val_case),
+    offsetof(Mstro__OfiAddr, unspec),
     NULL,
     NULL,
     0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
@@ -1124,9 +1265,9 @@ static const ProtobufCFieldDescriptor mstro__ofi__field_descriptors[13] =
     2,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_MESSAGE,
-    offsetof(Mstro__Ofi, val_case),
-    offsetof(Mstro__Ofi, sock),
-    &mstro__sockaddr__descriptor,
+    offsetof(Mstro__OfiAddr, val_case),
+    offsetof(Mstro__OfiAddr, sock),
+    &mstro__addr_sockaddr__descriptor,
     NULL,
     0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
@@ -1136,9 +1277,9 @@ static const ProtobufCFieldDescriptor mstro__ofi__field_descriptors[13] =
     3,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_MESSAGE,
-    offsetof(Mstro__Ofi, val_case),
-    offsetof(Mstro__Ofi, in4),
-    &mstro__sockaddr_in4__descriptor,
+    offsetof(Mstro__OfiAddr, val_case),
+    offsetof(Mstro__OfiAddr, in4),
+    &mstro__addr_sockaddr__in4__descriptor,
     NULL,
     0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
@@ -1148,9 +1289,9 @@ static const ProtobufCFieldDescriptor mstro__ofi__field_descriptors[13] =
     4,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_MESSAGE,
-    offsetof(Mstro__Ofi, val_case),
-    offsetof(Mstro__Ofi, in6),
-    &mstro__sockaddr_in6__descriptor,
+    offsetof(Mstro__OfiAddr, val_case),
+    offsetof(Mstro__OfiAddr, in6),
+    &mstro__addr_sockaddr__in6__descriptor,
     NULL,
     0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
@@ -1160,9 +1301,9 @@ static const ProtobufCFieldDescriptor mstro__ofi__field_descriptors[13] =
     5,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_MESSAGE,
-    offsetof(Mstro__Ofi, val_case),
-    offsetof(Mstro__Ofi, ib),
-    &mstro__sockaddr_ib__descriptor,
+    offsetof(Mstro__OfiAddr, val_case),
+    offsetof(Mstro__OfiAddr, ib),
+    &mstro__addr_sockaddr__ib__descriptor,
     NULL,
     0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
@@ -1172,8 +1313,8 @@ static const ProtobufCFieldDescriptor mstro__ofi__field_descriptors[13] =
     6,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
-    offsetof(Mstro__Ofi, val_case),
-    offsetof(Mstro__Ofi, psmx),
+    offsetof(Mstro__OfiAddr, val_case),
+    offsetof(Mstro__OfiAddr, psmx),
     NULL,
     NULL,
     0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
@@ -1184,9 +1325,9 @@ static const ProtobufCFieldDescriptor mstro__ofi__field_descriptors[13] =
     7,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_MESSAGE,
-    offsetof(Mstro__Ofi, val_case),
-    offsetof(Mstro__Ofi, gni),
-    &mstro__gni__descriptor,
+    offsetof(Mstro__OfiAddr, val_case),
+    offsetof(Mstro__OfiAddr, gni),
+    &mstro__addr_gni__descriptor,
     NULL,
     0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
@@ -1196,8 +1337,8 @@ static const ProtobufCFieldDescriptor mstro__ofi__field_descriptors[13] =
     8,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
-    offsetof(Mstro__Ofi, val_case),
-    offsetof(Mstro__Ofi, bgq),
+    offsetof(Mstro__OfiAddr, val_case),
+    offsetof(Mstro__OfiAddr, bgq),
     NULL,
     NULL,
     0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
@@ -1208,8 +1349,8 @@ static const ProtobufCFieldDescriptor mstro__ofi__field_descriptors[13] =
     9,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
-    offsetof(Mstro__Ofi, val_case),
-    offsetof(Mstro__Ofi, mlx),
+    offsetof(Mstro__OfiAddr, val_case),
+    offsetof(Mstro__OfiAddr, mlx),
     NULL,
     NULL,
     0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
@@ -1220,8 +1361,8 @@ static const ProtobufCFieldDescriptor mstro__ofi__field_descriptors[13] =
     10,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_STRING,
-    offsetof(Mstro__Ofi, val_case),
-    offsetof(Mstro__Ofi, str),
+    offsetof(Mstro__OfiAddr, val_case),
+    offsetof(Mstro__OfiAddr, str),
     NULL,
     &protobuf_c_empty_string,
     0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
@@ -1232,21 +1373,21 @@ static const ProtobufCFieldDescriptor mstro__ofi__field_descriptors[13] =
     11,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_MESSAGE,
-    offsetof(Mstro__Ofi, val_case),
-    offsetof(Mstro__Ofi, psmx2),
-    &mstro__psmx2__descriptor,
+    offsetof(Mstro__OfiAddr, val_case),
+    offsetof(Mstro__OfiAddr, psmx2),
+    &mstro__addr_psmx2__descriptor,
     NULL,
     0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
   {
-    "ib_ub",
+    "ib_ud",
     12,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_MESSAGE,
-    offsetof(Mstro__Ofi, val_case),
-    offsetof(Mstro__Ofi, ib_ub),
-    &mstro__ib_ub__descriptor,
+    offsetof(Mstro__OfiAddr, val_case),
+    offsetof(Mstro__OfiAddr, ib_ud),
+    &mstro__addr_ib__ud__descriptor,
     NULL,
     0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
@@ -1256,71 +1397,97 @@ static const ProtobufCFieldDescriptor mstro__ofi__field_descriptors[13] =
     13,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_FIXED64,
-    offsetof(Mstro__Ofi, val_case),
-    offsetof(Mstro__Ofi, efa),
+    offsetof(Mstro__OfiAddr, val_case),
+    offsetof(Mstro__OfiAddr, efa),
     NULL,
     NULL,
     0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
+  {
+    "psmx3",
+    14,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_MESSAGE,
+    offsetof(Mstro__OfiAddr, val_case),
+    offsetof(Mstro__OfiAddr, psmx3),
+    &mstro__addr_psmx3__descriptor,
+    NULL,
+    0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
 };
-static const unsigned mstro__ofi__field_indices_by_name[] = {
+static const unsigned mstro__ofi_addr__field_indices_by_name[] = {
   7,   /* field[7] = bgq */
   12,   /* field[12] = efa */
   6,   /* field[6] = gni */
   4,   /* field[4] = ib */
-  11,   /* field[11] = ib_ub */
+  11,   /* field[11] = ib_ud */
   2,   /* field[2] = in4 */
   3,   /* field[3] = in6 */
   8,   /* field[8] = mlx */
   5,   /* field[5] = psmx */
   10,   /* field[10] = psmx2 */
+  13,   /* field[13] = psmx3 */
   1,   /* field[1] = sock */
   9,   /* field[9] = str */
   0,   /* field[0] = unspec */
 };
-static const ProtobufCIntRange mstro__ofi__number_ranges[1 + 1] =
+static const ProtobufCIntRange mstro__ofi_addr__number_ranges[1 + 1] =
 {
   { 1, 0 },
-  { 0, 13 }
+  { 0, 14 }
 };
-const ProtobufCMessageDescriptor mstro__ofi__descriptor =
+const ProtobufCMessageDescriptor mstro__ofi_addr__descriptor =
 {
   PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-  "mstro.Ofi",
-  "Ofi",
-  "Mstro__Ofi",
+  "mstro.OfiAddr",
+  "OfiAddr",
+  "Mstro__OfiAddr",
   "mstro",
-  sizeof(Mstro__Ofi),
-  13,
-  mstro__ofi__field_descriptors,
-  mstro__ofi__field_indices_by_name,
-  1,  mstro__ofi__number_ranges,
-  (ProtobufCMessageInit) mstro__ofi__init,
+  sizeof(Mstro__OfiAddr),
+  14,
+  mstro__ofi_addr__field_descriptors,
+  mstro__ofi_addr__field_indices_by_name,
+  1,  mstro__ofi_addr__number_ranges,
+  (ProtobufCMessageInit) mstro__ofi_addr__init,
   NULL,NULL,NULL    /* reserved[123] */
 };
-static const ProtobufCFieldDescriptor mstro__endpoint__field_descriptors[1] =
+static const ProtobufCFieldDescriptor mstro__endpoint__field_descriptors[2] =
 {
   {
-    "ofi",
+    "ofiproto",
     1,
     PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_ENUM,
+    offsetof(Mstro__Endpoint, proto_case),
+    offsetof(Mstro__Endpoint, ofiproto),
+    &mstro__ofi_endpoint_kind__descriptor,
+    NULL,
+    0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+  {
+    "ofiaddr",
+    2,
+    PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_MESSAGE,
-    offsetof(Mstro__Endpoint, kind_case),
-    offsetof(Mstro__Endpoint, ofi),
-    &mstro__ofi__descriptor,
+    offsetof(Mstro__Endpoint, addr_case),
+    offsetof(Mstro__Endpoint, ofiaddr),
+    &mstro__ofi_addr__descriptor,
     NULL,
     0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
 };
 static const unsigned mstro__endpoint__field_indices_by_name[] = {
-  0,   /* field[0] = ofi */
+  1,   /* field[1] = ofiaddr */
+  0,   /* field[0] = ofiproto */
 };
 static const ProtobufCIntRange mstro__endpoint__number_ranges[1 + 1] =
 {
   { 1, 0 },
-  { 0, 1 }
+  { 0, 2 }
 };
 const ProtobufCMessageDescriptor mstro__endpoint__descriptor =
 {
@@ -1330,14 +1497,14 @@ const ProtobufCMessageDescriptor mstro__endpoint__descriptor =
   "Mstro__Endpoint",
   "mstro",
   sizeof(Mstro__Endpoint),
-  1,
+  2,
   mstro__endpoint__field_descriptors,
   mstro__endpoint__field_indices_by_name,
   1,  mstro__endpoint__number_ranges,
   (ProtobufCMessageInit) mstro__endpoint__init,
   NULL,NULL,NULL    /* reserved[123] */
 };
-static const ProtobufCFieldDescriptor mstro__ofi_memory_region__field_descriptors[3] =
+static const ProtobufCFieldDescriptor mstro__ofi_memory_region__field_descriptors[2] =
 {
   {
     "baseaddr",
@@ -1351,21 +1518,9 @@ static const ProtobufCFieldDescriptor mstro__ofi_memory_region__field_descriptor
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
-  {
-    "keylen",
-    2,
-    PROTOBUF_C_LABEL_NONE,
-    PROTOBUF_C_TYPE_FIXED64,
-    0,   /* quantifier_offset */
-    offsetof(Mstro__OfiMemoryRegion, keylen),
-    NULL,
-    NULL,
-    0,             /* flags */
-    0,NULL,NULL    /* reserved1,reserved2, etc */
-  },
   {
     "raw_key",
-    3,
+    2,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_BYTES,
     0,   /* quantifier_offset */
@@ -1378,13 +1533,12 @@ static const ProtobufCFieldDescriptor mstro__ofi_memory_region__field_descriptor
 };
 static const unsigned mstro__ofi_memory_region__field_indices_by_name[] = {
   0,   /* field[0] = baseaddr */
-  1,   /* field[1] = keylen */
-  2,   /* field[2] = raw_key */
+  1,   /* field[1] = raw_key */
 };
 static const ProtobufCIntRange mstro__ofi_memory_region__number_ranges[1 + 1] =
 {
   { 1, 0 },
-  { 0, 3 }
+  { 0, 2 }
 };
 const ProtobufCMessageDescriptor mstro__ofi_memory_region__descriptor =
 {
@@ -1394,119 +1548,170 @@ const ProtobufCMessageDescriptor mstro__ofi_memory_region__descriptor =
   "Mstro__OfiMemoryRegion",
   "mstro",
   sizeof(Mstro__OfiMemoryRegion),
-  3,
+  2,
   mstro__ofi_memory_region__field_descriptors,
   mstro__ofi_memory_region__field_indices_by_name,
   1,  mstro__ofi_memory_region__number_ranges,
   (ProtobufCMessageInit) mstro__ofi_memory_region__init,
   NULL,NULL,NULL    /* reserved[123] */
 };
-static const ProtobufCFieldDescriptor mstro__pm__field_descriptors[2] =
+static const ProtobufCFieldDescriptor mstro__cred_drc__field_descriptors[1] =
 {
   {
-    "ep",
+    "credential",
     1,
-    PROTOBUF_C_LABEL_REPEATED,
-    PROTOBUF_C_TYPE_MESSAGE,
-    offsetof(Mstro__PM, n_ep),
-    offsetof(Mstro__PM, ep),
-    &mstro__endpoint__descriptor,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_FIXED32,
+    0,   /* quantifier_offset */
+    offsetof(Mstro__CredDRC, credential),
+    NULL,
     NULL,
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
+};
+static const unsigned mstro__cred_drc__field_indices_by_name[] = {
+  0,   /* field[0] = credential */
+};
+static const ProtobufCIntRange mstro__cred_drc__number_ranges[1 + 1] =
+{
+  { 1, 0 },
+  { 0, 1 }
+};
+const ProtobufCMessageDescriptor mstro__cred_drc__descriptor =
+{
+  PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
+  "mstro.CredDRC",
+  "CredDRC",
+  "Mstro__CredDRC",
+  "mstro",
+  sizeof(Mstro__CredDRC),
+  1,
+  mstro__cred_drc__field_descriptors,
+  mstro__cred_drc__field_indices_by_name,
+  1,  mstro__cred_drc__number_ranges,
+  (ProtobufCMessageInit) mstro__cred_drc__init,
+  NULL,NULL,NULL    /* reserved[123] */
+};
+static const ProtobufCFieldDescriptor mstro__ofi_credential__field_descriptors[1] =
+{
   {
-    "ofi",
-    2,
+    "drc",
+    1,
     PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_MESSAGE,
-    offsetof(Mstro__PM, inforeg_case),
-    offsetof(Mstro__PM, ofi),
-    &mstro__ofi_memory_region__descriptor,
+    offsetof(Mstro__OfiCredential, val_case),
+    offsetof(Mstro__OfiCredential, drc),
+    &mstro__cred_drc__descriptor,
     NULL,
     0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
 };
-static const unsigned mstro__pm__field_indices_by_name[] = {
-  0,   /* field[0] = ep */
-  1,   /* field[1] = ofi */
+static const unsigned mstro__ofi_credential__field_indices_by_name[] = {
+  0,   /* field[0] = drc */
 };
-static const ProtobufCIntRange mstro__pm__number_ranges[1 + 1] =
+static const ProtobufCIntRange mstro__ofi_credential__number_ranges[1 + 1] =
 {
   { 1, 0 },
-  { 0, 2 }
+  { 0, 1 }
 };
-const ProtobufCMessageDescriptor mstro__pm__descriptor =
+const ProtobufCMessageDescriptor mstro__ofi_credential__descriptor =
 {
   PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-  "mstro.PM",
-  "PM",
-  "Mstro__PM",
+  "mstro.OfiCredential",
+  "OfiCredential",
+  "Mstro__OfiCredential",
   "mstro",
-  sizeof(Mstro__PM),
-  2,
-  mstro__pm__field_descriptors,
-  mstro__pm__field_indices_by_name,
-  1,  mstro__pm__number_ranges,
-  (ProtobufCMessageInit) mstro__pm__init,
+  sizeof(Mstro__OfiCredential),
+  1,
+  mstro__ofi_credential__field_descriptors,
+  mstro__ofi_credential__field_indices_by_name,
+  1,  mstro__ofi_credential__number_ranges,
+  (ProtobufCMessageInit) mstro__ofi_credential__init,
   NULL,NULL,NULL    /* reserved[123] */
 };
-static const ProtobufCFieldDescriptor mstro__pm_info__field_descriptors[1] =
+static const ProtobufCFieldDescriptor mstro__endpoint_list__field_descriptors[3] =
 {
   {
-    "pm",
+    "eps",
     1,
     PROTOBUF_C_LABEL_REPEATED,
     PROTOBUF_C_TYPE_MESSAGE,
-    offsetof(Mstro__PmInfo, n_pm),
-    offsetof(Mstro__PmInfo, pm),
-    &mstro__pm__descriptor,
+    offsetof(Mstro__EndpointList, n_eps),
+    offsetof(Mstro__EndpointList, eps),
+    &mstro__endpoint__descriptor,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+  {
+    "inforegs",
+    2,
+    PROTOBUF_C_LABEL_REPEATED,
+    PROTOBUF_C_TYPE_MESSAGE,
+    offsetof(Mstro__EndpointList, n_inforegs),
+    offsetof(Mstro__EndpointList, inforegs),
+    &mstro__ofi_memory_region__descriptor,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+  {
+    "credentials",
+    3,
+    PROTOBUF_C_LABEL_REPEATED,
+    PROTOBUF_C_TYPE_MESSAGE,
+    offsetof(Mstro__EndpointList, n_credentials),
+    offsetof(Mstro__EndpointList, credentials),
+    &mstro__ofi_credential__descriptor,
     NULL,
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
 };
-static const unsigned mstro__pm_info__field_indices_by_name[] = {
-  0,   /* field[0] = pm */
+static const unsigned mstro__endpoint_list__field_indices_by_name[] = {
+  2,   /* field[2] = credentials */
+  0,   /* field[0] = eps */
+  1,   /* field[1] = inforegs */
 };
-static const ProtobufCIntRange mstro__pm_info__number_ranges[1 + 1] =
+static const ProtobufCIntRange mstro__endpoint_list__number_ranges[1 + 1] =
 {
   { 1, 0 },
-  { 0, 1 }
+  { 0, 3 }
 };
-const ProtobufCMessageDescriptor mstro__pm_info__descriptor =
+const ProtobufCMessageDescriptor mstro__endpoint_list__descriptor =
 {
   PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
-  "mstro.PmInfo",
-  "PmInfo",
-  "Mstro__PmInfo",
+  "mstro.EndpointList",
+  "EndpointList",
+  "Mstro__EndpointList",
   "mstro",
-  sizeof(Mstro__PmInfo),
-  1,
-  mstro__pm_info__field_descriptors,
-  mstro__pm_info__field_indices_by_name,
-  1,  mstro__pm_info__number_ranges,
-  (ProtobufCMessageInit) mstro__pm_info__init,
+  sizeof(Mstro__EndpointList),
+  3,
+  mstro__endpoint_list__field_descriptors,
+  mstro__endpoint_list__field_indices_by_name,
+  1,  mstro__endpoint_list__number_ranges,
+  (ProtobufCMessageInit) mstro__endpoint_list__init,
   NULL,NULL,NULL    /* reserved[123] */
 };
 static const ProtobufCFieldDescriptor mstro__app_info__field_descriptors[1] =
 {
   {
-    "ep",
+    "eps",
     1,
-    PROTOBUF_C_LABEL_REPEATED,
+    PROTOBUF_C_LABEL_NONE,
     PROTOBUF_C_TYPE_MESSAGE,
-    offsetof(Mstro__AppInfo, n_ep),
-    offsetof(Mstro__AppInfo, ep),
-    &mstro__endpoint__descriptor,
+    0,   /* quantifier_offset */
+    offsetof(Mstro__AppInfo, eps),
+    &mstro__endpoint_list__descriptor,
     NULL,
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
 };
 static const unsigned mstro__app_info__field_indices_by_name[] = {
-  0,   /* field[0] = ep */
+  0,   /* field[0] = eps */
 };
 static const ProtobufCIntRange mstro__app_info__number_ranges[1 + 1] =
 {
@@ -1528,3 +1733,71 @@ const ProtobufCMessageDescriptor mstro__app_info__descriptor =
   (ProtobufCMessageInit) mstro__app_info__init,
   NULL,NULL,NULL    /* reserved[123] */
 };
+static const ProtobufCEnumValue mstro__ofi_endpoint_kind__enum_values_by_number[22] =
+{
+  { "UNSPEC", "MSTRO__OFI_ENDPOINT_KIND__UNSPEC", 0 },
+  { "RDMA_CM_IB_RC", "MSTRO__OFI_ENDPOINT_KIND__RDMA_CM_IB_RC", 1 },
+  { "IWARP", "MSTRO__OFI_ENDPOINT_KIND__IWARP", 2 },
+  { "IB_UD", "MSTRO__OFI_ENDPOINT_KIND__IB_UD", 3 },
+  { "PSMX", "MSTRO__OFI_ENDPOINT_KIND__PSMX", 4 },
+  { "UDP", "MSTRO__OFI_ENDPOINT_KIND__UDP", 5 },
+  { "SOCK_TCP", "MSTRO__OFI_ENDPOINT_KIND__SOCK_TCP", 6 },
+  { "MXM", "MSTRO__OFI_ENDPOINT_KIND__MXM", 7 },
+  { "IWARP_RDM", "MSTRO__OFI_ENDPOINT_KIND__IWARP_RDM", 8 },
+  { "IB_RDM", "MSTRO__OFI_ENDPOINT_KIND__IB_RDM", 9 },
+  { "GNI", "MSTRO__OFI_ENDPOINT_KIND__GNI", 10 },
+  { "RXM", "MSTRO__OFI_ENDPOINT_KIND__RXM", 11 },
+  { "RXD", "MSTRO__OFI_ENDPOINT_KIND__RXD", 12 },
+  { "MLX", "MSTRO__OFI_ENDPOINT_KIND__MLX", 13 },
+  { "NETWORKDIRECT", "MSTRO__OFI_ENDPOINT_KIND__NETWORKDIRECT", 14 },
+  { "PSMX2", "MSTRO__OFI_ENDPOINT_KIND__PSMX2", 15 },
+  { "SHM", "MSTRO__OFI_ENDPOINT_KIND__SHM", 16 },
+  { "MRAIL", "MSTRO__OFI_ENDPOINT_KIND__MRAIL", 17 },
+  { "RSTREAM", "MSTRO__OFI_ENDPOINT_KIND__RSTREAM", 18 },
+  { "RDMA_CM_IB_XRC", "MSTRO__OFI_ENDPOINT_KIND__RDMA_CM_IB_XRC", 19 },
+  { "EFA", "MSTRO__OFI_ENDPOINT_KIND__EFA", 20 },
+  { "PSMX3", "MSTRO__OFI_ENDPOINT_KIND__PSMX3", 21 },
+};
+static const ProtobufCIntRange mstro__ofi_endpoint_kind__value_ranges[] = {
+{0, 0},{0, 22}
+};
+static const ProtobufCEnumValueIndex mstro__ofi_endpoint_kind__enum_values_by_name[22] =
+{
+  { "EFA", 20 },
+  { "GNI", 10 },
+  { "IB_RDM", 9 },
+  { "IB_UD", 3 },
+  { "IWARP", 2 },
+  { "IWARP_RDM", 8 },
+  { "MLX", 13 },
+  { "MRAIL", 17 },
+  { "MXM", 7 },
+  { "NETWORKDIRECT", 14 },
+  { "PSMX", 4 },
+  { "PSMX2", 15 },
+  { "PSMX3", 21 },
+  { "RDMA_CM_IB_RC", 1 },
+  { "RDMA_CM_IB_XRC", 19 },
+  { "RSTREAM", 18 },
+  { "RXD", 12 },
+  { "RXM", 11 },
+  { "SHM", 16 },
+  { "SOCK_TCP", 6 },
+  { "UDP", 5 },
+  { "UNSPEC", 0 },
+};
+const ProtobufCEnumDescriptor mstro__ofi_endpoint_kind__descriptor =
+{
+  PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC,
+  "mstro.OfiEndpointKind",
+  "OfiEndpointKind",
+  "Mstro__OfiEndpointKind",
+  "mstro",
+  22,
+  mstro__ofi_endpoint_kind__enum_values_by_number,
+  22,
+  mstro__ofi_endpoint_kind__enum_values_by_name,
+  1,
+  mstro__ofi_endpoint_kind__value_ranges,
+  NULL,NULL,NULL,NULL   /* reserved[1234] */
+};
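
The two sorted tables above are what protobuf-c's reflective enum lookups search. A minimal sketch of querying the generated descriptor in both directions, assuming the generated header is on the include path:

#include <stdio.h>
#include "mstro_ep.pb-c.h"   /* generated header; include path is an assumption */

static void demo_ofi_endpoint_kind_lookup(void)
{
  /* name -> value, via the sorted enum_values_by_name table */
  const ProtobufCEnumValue *v =
    protobuf_c_enum_descriptor_get_value_by_name(
      &mstro__ofi_endpoint_kind__descriptor, "GNI");
  if (v != NULL)
    printf("%s = %d\n", v->name, v->value);    /* prints: GNI = 10 */

  /* value -> name, via the value_ranges index */
  v = protobuf_c_enum_descriptor_get_value(
      &mstro__ofi_endpoint_kind__descriptor, 21);
  if (v != NULL)
    printf("21 is %s\n", v->name);             /* prints: 21 is PSMX3 */
}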
diff --git a/protocols/mstro_ep.pb-c.h b/protocols/mstro_ep.pb-c.h
index 856a8ee1fb488366f7af674d16837e634458f378..1bd476f2153c561111d829643afbb01f9d26f8cf 100644
--- a/protocols/mstro_ep.pb-c.h
+++ b/protocols/mstro_ep.pb-c.h
@@ -15,44 +15,77 @@ PROTOBUF_C__BEGIN_DECLS
 #endif
 
 
-typedef struct _Mstro__Sockaddr Mstro__Sockaddr;
-typedef struct _Mstro__SockaddrIn4 Mstro__SockaddrIn4;
-typedef struct _Mstro__SockaddrIn6 Mstro__SockaddrIn6;
-typedef struct _Mstro__SockaddrIb Mstro__SockaddrIb;
-typedef struct _Mstro__Gni Mstro__Gni;
-typedef struct _Mstro__Psmx2 Mstro__Psmx2;
-typedef struct _Mstro__IbUb Mstro__IbUb;
-typedef struct _Mstro__Ofi Mstro__Ofi;
+typedef struct _Mstro__AddrSockaddr Mstro__AddrSockaddr;
+typedef struct _Mstro__AddrSockaddrIN4 Mstro__AddrSockaddrIN4;
+typedef struct _Mstro__AddrSockaddrIN6 Mstro__AddrSockaddrIN6;
+typedef struct _Mstro__AddrSockaddrIB Mstro__AddrSockaddrIB;
+typedef struct _Mstro__AddrGNI Mstro__AddrGNI;
+typedef struct _Mstro__AddrPSMX2 Mstro__AddrPSMX2;
+typedef struct _Mstro__AddrPSMX3 Mstro__AddrPSMX3;
+typedef struct _Mstro__AddrIBUD Mstro__AddrIBUD;
+typedef struct _Mstro__OfiAddr Mstro__OfiAddr;
 typedef struct _Mstro__Endpoint Mstro__Endpoint;
 typedef struct _Mstro__OfiMemoryRegion Mstro__OfiMemoryRegion;
-typedef struct _Mstro__PM Mstro__PM;
-typedef struct _Mstro__PmInfo Mstro__PmInfo;
+typedef struct _Mstro__CredDRC Mstro__CredDRC;
+typedef struct _Mstro__OfiCredential Mstro__OfiCredential;
+typedef struct _Mstro__EndpointList Mstro__EndpointList;
 typedef struct _Mstro__AppInfo Mstro__AppInfo;
 
 
 /* --- enums --- */
 
+/*
+ ** Open Fabric endpoint protocols 
+ */
+typedef enum _Mstro__OfiEndpointKind {
+  /*
+   * endpoint types as of OFI 1.14 
+   */
+  MSTRO__OFI_ENDPOINT_KIND__UNSPEC = 0,
+  MSTRO__OFI_ENDPOINT_KIND__RDMA_CM_IB_RC = 1,
+  MSTRO__OFI_ENDPOINT_KIND__IWARP = 2,
+  MSTRO__OFI_ENDPOINT_KIND__IB_UD = 3,
+  MSTRO__OFI_ENDPOINT_KIND__PSMX = 4,
+  MSTRO__OFI_ENDPOINT_KIND__UDP = 5,
+  MSTRO__OFI_ENDPOINT_KIND__SOCK_TCP = 6,
+  MSTRO__OFI_ENDPOINT_KIND__MXM = 7,
+  MSTRO__OFI_ENDPOINT_KIND__IWARP_RDM = 8,
+  MSTRO__OFI_ENDPOINT_KIND__IB_RDM = 9,
+  MSTRO__OFI_ENDPOINT_KIND__GNI = 10,
+  MSTRO__OFI_ENDPOINT_KIND__RXM = 11,
+  MSTRO__OFI_ENDPOINT_KIND__RXD = 12,
+  MSTRO__OFI_ENDPOINT_KIND__MLX = 13,
+  MSTRO__OFI_ENDPOINT_KIND__NETWORKDIRECT = 14,
+  MSTRO__OFI_ENDPOINT_KIND__PSMX2 = 15,
+  MSTRO__OFI_ENDPOINT_KIND__SHM = 16,
+  MSTRO__OFI_ENDPOINT_KIND__MRAIL = 17,
+  MSTRO__OFI_ENDPOINT_KIND__RSTREAM = 18,
+  MSTRO__OFI_ENDPOINT_KIND__RDMA_CM_IB_XRC = 19,
+  MSTRO__OFI_ENDPOINT_KIND__EFA = 20,
+  MSTRO__OFI_ENDPOINT_KIND__PSMX3 = 21
+    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__OFI_ENDPOINT_KIND)
+} Mstro__OfiEndpointKind;
 
 /* --- messages --- */
 
 /*
  * see https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/sys_socket.h.html 
  */
-struct  _Mstro__Sockaddr
+struct  _Mstro__AddrSockaddr
 {
   ProtobufCMessage base;
   uint64_t sa_family;
   ProtobufCBinaryData sa_data;
 };
-#define MSTRO__SOCKADDR__INIT \
- { PROTOBUF_C_MESSAGE_INIT (&mstro__sockaddr__descriptor) \
+#define MSTRO__ADDR_SOCKADDR__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&mstro__addr_sockaddr__descriptor) \
     , 0, {0,NULL} }
 
 
 /*
  * see https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/netinet_in.h.html 
  */
-struct  _Mstro__SockaddrIn4
+struct  _Mstro__AddrSockaddrIN4
 {
   ProtobufCMessage base;
   /*
@@ -65,15 +98,15 @@ struct  _Mstro__SockaddrIn4
   uint32_t sin_port;
   uint32_t sin_addr;
 };
-#define MSTRO__SOCKADDR_IN4__INIT \
- { PROTOBUF_C_MESSAGE_INIT (&mstro__sockaddr_in4__descriptor) \
+#define MSTRO__ADDR_SOCKADDR__IN4__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&mstro__addr_sockaddr__in4__descriptor) \
     , 0, 0, 0 }
 
 
 /*
  * see https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/netinet_in.h.html 
  */
-struct  _Mstro__SockaddrIn6
+struct  _Mstro__AddrSockaddrIN6
 {
   ProtobufCMessage base;
   /*
@@ -92,143 +125,169 @@ struct  _Mstro__SockaddrIn6
   uint64_t sin6_addr_1;
   uint32_t sin6_scope_id;
 };
-#define MSTRO__SOCKADDR_IN6__INIT \
- { PROTOBUF_C_MESSAGE_INIT (&mstro__sockaddr_in6__descriptor) \
+#define MSTRO__ADDR_SOCKADDR__IN6__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&mstro__addr_sockaddr__in6__descriptor) \
     , 0, 0, 0, 0, 0, 0 }
 
 
 /*
  * see linux/latest/source/include/rdma/ib.h 
  */
-struct  _Mstro__SockaddrIb
+struct  _Mstro__AddrSockaddrIB
 {
   ProtobufCMessage base;
   uint64_t sib_family;
   uint64_t sib_addr_0;
   uint64_t sib_addr_1;
 };
-#define MSTRO__SOCKADDR_IB__INIT \
- { PROTOBUF_C_MESSAGE_INIT (&mstro__sockaddr_ib__descriptor) \
+#define MSTRO__ADDR_SOCKADDR__IB__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&mstro__addr_sockaddr__ib__descriptor) \
     , 0, 0, 0 }
 
 
 /*
  * address of a GNI adapter 
  */
-struct  _Mstro__Gni
+struct  _Mstro__AddrGNI
 {
   ProtobufCMessage base;
+  uint64_t a0;
   uint64_t a1;
   uint64_t a2;
   uint64_t a3;
   uint64_t a4;
   uint64_t a5;
-  uint64_t a6;
 };
-#define MSTRO__GNI__INIT \
- { PROTOBUF_C_MESSAGE_INIT (&mstro__gni__descriptor) \
+#define MSTRO__ADDR_GNI__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&mstro__addr_gni__descriptor) \
     , 0, 0, 0, 0, 0, 0 }
 
 
-struct  _Mstro__Psmx2
+struct  _Mstro__AddrPSMX2
 {
   ProtobufCMessage base;
+  uint64_t a0;
   uint64_t a1;
-  uint64_t a2;
 };
-#define MSTRO__PSMX2__INIT \
- { PROTOBUF_C_MESSAGE_INIT (&mstro__psmx2__descriptor) \
+#define MSTRO__ADDR_PSMX2__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&mstro__addr_psmx2__descriptor) \
     , 0, 0 }
 
 
-struct  _Mstro__IbUb
+struct  _Mstro__AddrPSMX3
 {
   ProtobufCMessage base;
+  uint64_t a0;
+  uint64_t a1;
+};
+#define MSTRO__ADDR_PSMX3__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&mstro__addr_psmx3__descriptor) \
+    , 0, 0 }
+
+
+struct  _Mstro__AddrIBUD
+{
+  ProtobufCMessage base;
+  uint64_t a0;
   uint64_t a1;
   uint64_t a2;
   uint64_t a3;
-  uint64_t a4;
 };
-#define MSTRO__IB_UB__INIT \
- { PROTOBUF_C_MESSAGE_INIT (&mstro__ib_ub__descriptor) \
+#define MSTRO__ADDR_IB__UD__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&mstro__addr_ib__ud__descriptor) \
     , 0, 0, 0, 0 }
 
 
 typedef enum {
-  MSTRO__OFI__VAL__NOT_SET = 0,
-  MSTRO__OFI__VAL_UNSPEC = 1,
-  MSTRO__OFI__VAL_SOCK = 2,
-  MSTRO__OFI__VAL_IN4 = 3,
-  MSTRO__OFI__VAL_IN6 = 4,
-  MSTRO__OFI__VAL_IB = 5,
-  MSTRO__OFI__VAL_PSMX = 6,
-  MSTRO__OFI__VAL_GNI = 7,
-  MSTRO__OFI__VAL_BGQ = 8,
-  MSTRO__OFI__VAL_MLX = 9,
-  MSTRO__OFI__VAL_STR = 10,
-  MSTRO__OFI__VAL_PSMX2 = 11,
-  MSTRO__OFI__VAL_IB_UB = 12,
-  MSTRO__OFI__VAL_EFA = 13
-    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__OFI__VAL)
-} Mstro__Ofi__ValCase;
+  MSTRO__OFI_ADDR__VAL__NOT_SET = 0,
+  MSTRO__OFI_ADDR__VAL_UNSPEC = 1,
+  MSTRO__OFI_ADDR__VAL_SOCK = 2,
+  MSTRO__OFI_ADDR__VAL_IN4 = 3,
+  MSTRO__OFI_ADDR__VAL_IN6 = 4,
+  MSTRO__OFI_ADDR__VAL_IB = 5,
+  MSTRO__OFI_ADDR__VAL_PSMX = 6,
+  MSTRO__OFI_ADDR__VAL_GNI = 7,
+  MSTRO__OFI_ADDR__VAL_BGQ = 8,
+  MSTRO__OFI_ADDR__VAL_MLX = 9,
+  MSTRO__OFI_ADDR__VAL_STR = 10,
+  MSTRO__OFI_ADDR__VAL_PSMX2 = 11,
+  MSTRO__OFI_ADDR__VAL_IB_UD = 12,
+  MSTRO__OFI_ADDR__VAL_EFA = 13,
+  MSTRO__OFI_ADDR__VAL_PSMX3 = 14
+    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__OFI_ADDR__VAL)
+} Mstro__OfiAddr__ValCase;
 
 /*
- ** Open Fabric endpoint 
+ ** Open Fabric endpoint address 
  */
-struct  _Mstro__Ofi
+struct  _Mstro__OfiAddr
 {
   ProtobufCMessage base;
-  Mstro__Ofi__ValCase val_case;
+  Mstro__OfiAddr__ValCase val_case;
   union {
     /*
-     * provider-specific address kinds as of OFI 1.10 
+     * provider-specific address kinds as of OFI 1.14 
      */
     ProtobufCBinaryData unspec;
-    Mstro__Sockaddr *sock;
-    Mstro__SockaddrIn4 *in4;
-    Mstro__SockaddrIn6 *in6;
-    Mstro__SockaddrIb *ib;
+    Mstro__AddrSockaddr *sock;
+    Mstro__AddrSockaddrIN4 *in4;
+    Mstro__AddrSockaddrIN6 *in6;
+    Mstro__AddrSockaddrIB *ib;
     uint64_t psmx;
     /*
      * 6 elements, to be exact 
      */
-    Mstro__Gni *gni;
+    Mstro__AddrGNI *gni;
     uint64_t bgq;
     uint64_t mlx;
     char *str;
     /*
-     * 2 elements, to be exact 
+     * 2 uint64 values 
      */
-    Mstro__Psmx2 *psmx2;
+    Mstro__AddrPSMX2 *psmx2;
     /*
-     * 4 elements, to be exact 
+     * 4 uint64 values 
      */
-    Mstro__IbUb *ib_ub;
+    Mstro__AddrIBUD *ib_ud;
     uint64_t efa;
+    /*
+     * 2 uint64 values 
+     */
+    Mstro__AddrPSMX3 *psmx3;
   };
 };
-#define MSTRO__OFI__INIT \
- { PROTOBUF_C_MESSAGE_INIT (&mstro__ofi__descriptor) \
-    , MSTRO__OFI__VAL__NOT_SET, {0} }
+#define MSTRO__OFI_ADDR__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&mstro__ofi_addr__descriptor) \
+    , MSTRO__OFI_ADDR__VAL__NOT_SET, {0} }
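
Oneof members share the anonymous union, discriminated by val_case; scalar arms are stored inline, message arms by pointer. A hedged sketch selecting the gni arm (all address words are placeholders, and buf is assumed large enough for the packed size):

static size_t pack_gni_addr(uint8_t *buf)
{
  Mstro__AddrGNI gni = MSTRO__ADDR_GNI__INIT;
  gni.a0 = 0x1; gni.a1 = 0x2; gni.a2 = 0x3;   /* placeholder values */
  gni.a3 = 0x4; gni.a4 = 0x5; gni.a5 = 0x6;

  Mstro__OfiAddr addr = MSTRO__OFI_ADDR__INIT;
  addr.val_case = MSTRO__OFI_ADDR__VAL_GNI;   /* select the union arm */
  addr.gni      = &gni;                       /* sub-message by pointer */
  return mstro__ofi_addr__pack(&addr, buf);
}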
 
 
 typedef enum {
-  MSTRO__ENDPOINT__KIND__NOT_SET = 0,
-  MSTRO__ENDPOINT__KIND_OFI = 1
-    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__ENDPOINT__KIND)
-} Mstro__Endpoint__KindCase;
+  MSTRO__ENDPOINT__PROTO__NOT_SET = 0,
+  MSTRO__ENDPOINT__PROTO_OFIPROTO = 1
+    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__ENDPOINT__PROTO)
+} Mstro__Endpoint__ProtoCase;
+
+typedef enum {
+  MSTRO__ENDPOINT__ADDR__NOT_SET = 0,
+  MSTRO__ENDPOINT__ADDR_OFIADDR = 2
+    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__ENDPOINT__ADDR)
+} Mstro__Endpoint__AddrCase;
 
 struct  _Mstro__Endpoint
 {
   ProtobufCMessage base;
-  Mstro__Endpoint__KindCase kind_case;
+  Mstro__Endpoint__ProtoCase proto_case;
   union {
-    Mstro__Ofi *ofi;
+    Mstro__OfiEndpointKind ofiproto;
+  };
+  Mstro__Endpoint__AddrCase addr_case;
+  union {
+    Mstro__OfiAddr *ofiaddr;
   };
 };
 #define MSTRO__ENDPOINT__INIT \
  { PROTOBUF_C_MESSAGE_INIT (&mstro__endpoint__descriptor) \
-    , MSTRO__ENDPOINT__KIND__NOT_SET, {0} }
+    , MSTRO__ENDPOINT__PROTO__NOT_SET, {0}, MSTRO__ENDPOINT__ADDR__NOT_SET, {0} }
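
An Endpoint now carries two independent oneofs, so both proto_case and addr_case have to be discriminated. A minimal sketch, reusing an OfiAddr built elsewhere:

static size_t endpoint_packed_size(Mstro__OfiAddr *addr)
{
  Mstro__Endpoint ep = MSTRO__ENDPOINT__INIT;
  ep.proto_case = MSTRO__ENDPOINT__PROTO_OFIPROTO;    /* protocol oneof */
  ep.ofiproto   = MSTRO__OFI_ENDPOINT_KIND__SOCK_TCP;
  ep.addr_case  = MSTRO__ENDPOINT__ADDR_OFIADDR;      /* address oneof */
  ep.ofiaddr    = addr;
  return mstro__endpoint__get_packed_size(&ep);
}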
 
 
 /*
@@ -238,223 +297,255 @@ struct  _Mstro__OfiMemoryRegion
 {
   ProtobufCMessage base;
   uint64_t baseaddr;
-  uint64_t keylen;
   ProtobufCBinaryData raw_key;
 };
 #define MSTRO__OFI_MEMORY_REGION__INIT \
  { PROTOBUF_C_MESSAGE_INIT (&mstro__ofi_memory_region__descriptor) \
-    , 0, 0, {0,NULL} }
+    , 0, {0,NULL} }
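
The explicit keylen field is gone: the key length now travels in raw_key.len of the ProtobufCBinaryData. A hedged sketch of filling the message (the caller keeps the key bytes alive; protobuf-c does not copy them on pack):

static void fill_ofi_memory_region(Mstro__OfiMemoryRegion *mr,
                                   uint64_t baseaddr,
                                   uint8_t *key, size_t key_len)
{
  mstro__ofi_memory_region__init(mr);
  mr->baseaddr     = baseaddr;
  mr->raw_key.data = key;       /* borrowed, not copied */
  mr->raw_key.len  = key_len;   /* replaces the removed keylen field */
}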
+
+
+struct  _Mstro__CredDRC
+{
+  ProtobufCMessage base;
+  uint32_t credential;
+};
+#define MSTRO__CRED_DRC__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&mstro__cred_drc__descriptor) \
+    , 0 }
 
 
 typedef enum {
-  MSTRO__PM__INFOREG__NOT_SET = 0,
-  MSTRO__PM__INFOREG_OFI = 2
-    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__PM__INFOREG)
-} Mstro__PM__InforegCase;
+  MSTRO__OFI_CREDENTIAL__VAL__NOT_SET = 0,
+  MSTRO__OFI_CREDENTIAL__VAL_DRC = 1
+    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__OFI_CREDENTIAL__VAL)
+} Mstro__OfiCredential__ValCase;
 
-/*
- ** A Pool Manager endpoint list and basic info 
- */
-struct  _Mstro__PM
+struct  _Mstro__OfiCredential
 {
   ProtobufCMessage base;
-  /*
-   ** the list of endpoints on which this PM entity is reachable 
-   */
-  size_t n_ep;
-  Mstro__Endpoint **ep;
-  Mstro__PM__InforegCase inforeg_case;
+  Mstro__OfiCredential__ValCase val_case;
   union {
-    Mstro__OfiMemoryRegion *ofi;
+    /*
+     * Cray DRC credential 
+     */
+    Mstro__CredDRC *drc;
   };
 };
-#define MSTRO__PM__INIT \
- { PROTOBUF_C_MESSAGE_INIT (&mstro__pm__descriptor) \
-    , 0,NULL, MSTRO__PM__INFOREG__NOT_SET, {0} }
+#define MSTRO__OFI_CREDENTIAL__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&mstro__ofi_credential__descriptor) \
+    , MSTRO__OFI_CREDENTIAL__VAL__NOT_SET, {0} }
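
A hedged sketch of wrapping a Cray DRC credential; the credential id is a placeholder:

static void fill_drc_credential(Mstro__OfiCredential *cred,
                                Mstro__CredDRC *drc, uint32_t id)
{
  mstro__cred_drc__init(drc);
  drc->credential = id;                             /* DRC credential id */

  mstro__ofi_credential__init(cred);
  cred->val_case = MSTRO__OFI_CREDENTIAL__VAL_DRC;  /* select the oneof arm */
  cred->drc      = drc;                             /* points at caller's drc */
}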
 
 
 /*
- ** A Pool Manager Info block 
+ ** An endpoint list and basic info. All three arrays are parallel and must have the same length. 
  */
-struct  _Mstro__PmInfo
+struct  _Mstro__EndpointList
 {
   ProtobufCMessage base;
   /*
-   ** each PM entity provides one PM entry 
+   ** the list of endpoints on which this PM entity is reachable 
    */
-  size_t n_pm;
-  Mstro__PM **pm;
+  size_t n_eps;
+  Mstro__Endpoint **eps;
+  /*
+   ** access information for the Info Region 
+   */
+  size_t n_inforegs;
+  Mstro__OfiMemoryRegion **inforegs;
+  /*
+   ** possibly access credentials 
+   */
+  size_t n_credentials;
+  Mstro__OfiCredential **credentials;
 };
-#define MSTRO__PM_INFO__INIT \
- { PROTOBUF_C_MESSAGE_INIT (&mstro__pm_info__descriptor) \
-    , 0,NULL }
+#define MSTRO__ENDPOINT_LIST__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&mstro__endpoint_list__descriptor) \
+    , 0,NULL, 0,NULL, 0,NULL }
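
The three arrays are parallel: entry i of eps, inforegs and credentials describes the same endpoint, so all three counts should agree. A minimal sketch that wires caller-owned arrays in place:

static void fill_endpoint_list(Mstro__EndpointList *list,
                               Mstro__Endpoint **eps,
                               Mstro__OfiMemoryRegion **inforegs,
                               Mstro__OfiCredential **credentials,
                               size_t n)
{
  mstro__endpoint_list__init(list);
  list->n_eps         = n;  list->eps         = eps;
  list->n_inforegs    = n;  list->inforegs    = inforegs;    /* parallel to eps */
  list->n_credentials = n;  list->credentials = credentials; /* parallel to eps */
}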
 
 
 /*
- ** An App Info block 
+ ** An application (including the pool manager) endpoint info block 
  */
 struct  _Mstro__AppInfo
 {
   ProtobufCMessage base;
-  size_t n_ep;
-  Mstro__Endpoint **ep;
+  Mstro__EndpointList *eps;
 };
 #define MSTRO__APP_INFO__INIT \
  { PROTOBUF_C_MESSAGE_INIT (&mstro__app_info__descriptor) \
-    , 0,NULL }
+    , NULL }
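
AppInfo now wraps a single EndpointList rather than a bare repeated Endpoint. A hedged round-trip sketch (allocation failure handling kept minimal):

#include <stdlib.h>

static int roundtrip_app_info(Mstro__EndpointList *list)
{
  Mstro__AppInfo info = MSTRO__APP_INFO__INIT;
  info.eps = list;

  size_t   len = mstro__app_info__get_packed_size(&info);
  uint8_t *buf = malloc(len);
  if (buf == NULL)
    return -1;
  mstro__app_info__pack(&info, buf);

  Mstro__AppInfo *rt = mstro__app_info__unpack(NULL, len, buf);  /* NULL: default allocator */
  int ok = (rt != NULL && rt->eps != NULL);
  mstro__app_info__free_unpacked(rt, NULL);
  free(buf);
  return ok ? 0 : -1;
}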
 
 
-/* Mstro__Sockaddr methods */
-void   mstro__sockaddr__init
-                     (Mstro__Sockaddr         *message);
-size_t mstro__sockaddr__get_packed_size
-                     (const Mstro__Sockaddr   *message);
-size_t mstro__sockaddr__pack
-                     (const Mstro__Sockaddr   *message,
+/* Mstro__AddrSockaddr methods */
+void   mstro__addr_sockaddr__init
+                     (Mstro__AddrSockaddr         *message);
+size_t mstro__addr_sockaddr__get_packed_size
+                     (const Mstro__AddrSockaddr   *message);
+size_t mstro__addr_sockaddr__pack
+                     (const Mstro__AddrSockaddr   *message,
                       uint8_t             *out);
-size_t mstro__sockaddr__pack_to_buffer
-                     (const Mstro__Sockaddr   *message,
+size_t mstro__addr_sockaddr__pack_to_buffer
+                     (const Mstro__AddrSockaddr   *message,
                       ProtobufCBuffer     *buffer);
-Mstro__Sockaddr *
-       mstro__sockaddr__unpack
+Mstro__AddrSockaddr *
+       mstro__addr_sockaddr__unpack
                      (ProtobufCAllocator  *allocator,
                       size_t               len,
                       const uint8_t       *data);
-void   mstro__sockaddr__free_unpacked
-                     (Mstro__Sockaddr *message,
+void   mstro__addr_sockaddr__free_unpacked
+                     (Mstro__AddrSockaddr *message,
                       ProtobufCAllocator *allocator);
-/* Mstro__SockaddrIn4 methods */
-void   mstro__sockaddr_in4__init
-                     (Mstro__SockaddrIn4         *message);
-size_t mstro__sockaddr_in4__get_packed_size
-                     (const Mstro__SockaddrIn4   *message);
-size_t mstro__sockaddr_in4__pack
-                     (const Mstro__SockaddrIn4   *message,
+/* Mstro__AddrSockaddrIN4 methods */
+void   mstro__addr_sockaddr__in4__init
+                     (Mstro__AddrSockaddrIN4         *message);
+size_t mstro__addr_sockaddr__in4__get_packed_size
+                     (const Mstro__AddrSockaddrIN4   *message);
+size_t mstro__addr_sockaddr__in4__pack
+                     (const Mstro__AddrSockaddrIN4   *message,
                       uint8_t             *out);
-size_t mstro__sockaddr_in4__pack_to_buffer
-                     (const Mstro__SockaddrIn4   *message,
+size_t mstro__addr_sockaddr__in4__pack_to_buffer
+                     (const Mstro__AddrSockaddrIN4   *message,
                       ProtobufCBuffer     *buffer);
-Mstro__SockaddrIn4 *
-       mstro__sockaddr_in4__unpack
+Mstro__AddrSockaddrIN4 *
+       mstro__addr_sockaddr__in4__unpack
                      (ProtobufCAllocator  *allocator,
                       size_t               len,
                       const uint8_t       *data);
-void   mstro__sockaddr_in4__free_unpacked
-                     (Mstro__SockaddrIn4 *message,
+void   mstro__addr_sockaddr__in4__free_unpacked
+                     (Mstro__AddrSockaddrIN4 *message,
                       ProtobufCAllocator *allocator);
-/* Mstro__SockaddrIn6 methods */
-void   mstro__sockaddr_in6__init
-                     (Mstro__SockaddrIn6         *message);
-size_t mstro__sockaddr_in6__get_packed_size
-                     (const Mstro__SockaddrIn6   *message);
-size_t mstro__sockaddr_in6__pack
-                     (const Mstro__SockaddrIn6   *message,
+/* Mstro__AddrSockaddrIN6 methods */
+void   mstro__addr_sockaddr__in6__init
+                     (Mstro__AddrSockaddrIN6         *message);
+size_t mstro__addr_sockaddr__in6__get_packed_size
+                     (const Mstro__AddrSockaddrIN6   *message);
+size_t mstro__addr_sockaddr__in6__pack
+                     (const Mstro__AddrSockaddrIN6   *message,
                       uint8_t             *out);
-size_t mstro__sockaddr_in6__pack_to_buffer
-                     (const Mstro__SockaddrIn6   *message,
+size_t mstro__addr_sockaddr__in6__pack_to_buffer
+                     (const Mstro__AddrSockaddrIN6   *message,
                       ProtobufCBuffer     *buffer);
-Mstro__SockaddrIn6 *
-       mstro__sockaddr_in6__unpack
+Mstro__AddrSockaddrIN6 *
+       mstro__addr_sockaddr__in6__unpack
                      (ProtobufCAllocator  *allocator,
                       size_t               len,
                       const uint8_t       *data);
-void   mstro__sockaddr_in6__free_unpacked
-                     (Mstro__SockaddrIn6 *message,
+void   mstro__addr_sockaddr__in6__free_unpacked
+                     (Mstro__AddrSockaddrIN6 *message,
                       ProtobufCAllocator *allocator);
-/* Mstro__SockaddrIb methods */
-void   mstro__sockaddr_ib__init
-                     (Mstro__SockaddrIb         *message);
-size_t mstro__sockaddr_ib__get_packed_size
-                     (const Mstro__SockaddrIb   *message);
-size_t mstro__sockaddr_ib__pack
-                     (const Mstro__SockaddrIb   *message,
+/* Mstro__AddrSockaddrIB methods */
+void   mstro__addr_sockaddr__ib__init
+                     (Mstro__AddrSockaddrIB         *message);
+size_t mstro__addr_sockaddr__ib__get_packed_size
+                     (const Mstro__AddrSockaddrIB   *message);
+size_t mstro__addr_sockaddr__ib__pack
+                     (const Mstro__AddrSockaddrIB   *message,
                       uint8_t             *out);
-size_t mstro__sockaddr_ib__pack_to_buffer
-                     (const Mstro__SockaddrIb   *message,
+size_t mstro__addr_sockaddr__ib__pack_to_buffer
+                     (const Mstro__AddrSockaddrIB   *message,
                       ProtobufCBuffer     *buffer);
-Mstro__SockaddrIb *
-       mstro__sockaddr_ib__unpack
+Mstro__AddrSockaddrIB *
+       mstro__addr_sockaddr__ib__unpack
                      (ProtobufCAllocator  *allocator,
                       size_t               len,
                       const uint8_t       *data);
-void   mstro__sockaddr_ib__free_unpacked
-                     (Mstro__SockaddrIb *message,
+void   mstro__addr_sockaddr__ib__free_unpacked
+                     (Mstro__AddrSockaddrIB *message,
                       ProtobufCAllocator *allocator);
-/* Mstro__Gni methods */
-void   mstro__gni__init
-                     (Mstro__Gni         *message);
-size_t mstro__gni__get_packed_size
-                     (const Mstro__Gni   *message);
-size_t mstro__gni__pack
-                     (const Mstro__Gni   *message,
+/* Mstro__AddrGNI methods */
+void   mstro__addr_gni__init
+                     (Mstro__AddrGNI         *message);
+size_t mstro__addr_gni__get_packed_size
+                     (const Mstro__AddrGNI   *message);
+size_t mstro__addr_gni__pack
+                     (const Mstro__AddrGNI   *message,
                       uint8_t             *out);
-size_t mstro__gni__pack_to_buffer
-                     (const Mstro__Gni   *message,
+size_t mstro__addr_gni__pack_to_buffer
+                     (const Mstro__AddrGNI   *message,
                       ProtobufCBuffer     *buffer);
-Mstro__Gni *
-       mstro__gni__unpack
+Mstro__AddrGNI *
+       mstro__addr_gni__unpack
                      (ProtobufCAllocator  *allocator,
                       size_t               len,
                       const uint8_t       *data);
-void   mstro__gni__free_unpacked
-                     (Mstro__Gni *message,
+void   mstro__addr_gni__free_unpacked
+                     (Mstro__AddrGNI *message,
                       ProtobufCAllocator *allocator);
-/* Mstro__Psmx2 methods */
-void   mstro__psmx2__init
-                     (Mstro__Psmx2         *message);
-size_t mstro__psmx2__get_packed_size
-                     (const Mstro__Psmx2   *message);
-size_t mstro__psmx2__pack
-                     (const Mstro__Psmx2   *message,
+/* Mstro__AddrPSMX2 methods */
+void   mstro__addr_psmx2__init
+                     (Mstro__AddrPSMX2         *message);
+size_t mstro__addr_psmx2__get_packed_size
+                     (const Mstro__AddrPSMX2   *message);
+size_t mstro__addr_psmx2__pack
+                     (const Mstro__AddrPSMX2   *message,
                       uint8_t             *out);
-size_t mstro__psmx2__pack_to_buffer
-                     (const Mstro__Psmx2   *message,
+size_t mstro__addr_psmx2__pack_to_buffer
+                     (const Mstro__AddrPSMX2   *message,
                       ProtobufCBuffer     *buffer);
-Mstro__Psmx2 *
-       mstro__psmx2__unpack
+Mstro__AddrPSMX2 *
+       mstro__addr_psmx2__unpack
                      (ProtobufCAllocator  *allocator,
                       size_t               len,
                       const uint8_t       *data);
-void   mstro__psmx2__free_unpacked
-                     (Mstro__Psmx2 *message,
+void   mstro__addr_psmx2__free_unpacked
+                     (Mstro__AddrPSMX2 *message,
                       ProtobufCAllocator *allocator);
-/* Mstro__IbUb methods */
-void   mstro__ib_ub__init
-                     (Mstro__IbUb         *message);
-size_t mstro__ib_ub__get_packed_size
-                     (const Mstro__IbUb   *message);
-size_t mstro__ib_ub__pack
-                     (const Mstro__IbUb   *message,
+/* Mstro__AddrPSMX3 methods */
+void   mstro__addr_psmx3__init
+                     (Mstro__AddrPSMX3         *message);
+size_t mstro__addr_psmx3__get_packed_size
+                     (const Mstro__AddrPSMX3   *message);
+size_t mstro__addr_psmx3__pack
+                     (const Mstro__AddrPSMX3   *message,
                       uint8_t             *out);
-size_t mstro__ib_ub__pack_to_buffer
-                     (const Mstro__IbUb   *message,
+size_t mstro__addr_psmx3__pack_to_buffer
+                     (const Mstro__AddrPSMX3   *message,
                       ProtobufCBuffer     *buffer);
-Mstro__IbUb *
-       mstro__ib_ub__unpack
+Mstro__AddrPSMX3 *
+       mstro__addr_psmx3__unpack
                      (ProtobufCAllocator  *allocator,
                       size_t               len,
                       const uint8_t       *data);
-void   mstro__ib_ub__free_unpacked
-                     (Mstro__IbUb *message,
+void   mstro__addr_psmx3__free_unpacked
+                     (Mstro__AddrPSMX3 *message,
                       ProtobufCAllocator *allocator);
-/* Mstro__Ofi methods */
-void   mstro__ofi__init
-                     (Mstro__Ofi         *message);
-size_t mstro__ofi__get_packed_size
-                     (const Mstro__Ofi   *message);
-size_t mstro__ofi__pack
-                     (const Mstro__Ofi   *message,
+/* Mstro__AddrIBUD methods */
+void   mstro__addr_ib__ud__init
+                     (Mstro__AddrIBUD         *message);
+size_t mstro__addr_ib__ud__get_packed_size
+                     (const Mstro__AddrIBUD   *message);
+size_t mstro__addr_ib__ud__pack
+                     (const Mstro__AddrIBUD   *message,
                       uint8_t             *out);
-size_t mstro__ofi__pack_to_buffer
-                     (const Mstro__Ofi   *message,
+size_t mstro__addr_ib__ud__pack_to_buffer
+                     (const Mstro__AddrIBUD   *message,
                       ProtobufCBuffer     *buffer);
-Mstro__Ofi *
-       mstro__ofi__unpack
+Mstro__AddrIBUD *
+       mstro__addr_ib__ud__unpack
                      (ProtobufCAllocator  *allocator,
                       size_t               len,
                       const uint8_t       *data);
-void   mstro__ofi__free_unpacked
-                     (Mstro__Ofi *message,
+void   mstro__addr_ib__ud__free_unpacked
+                     (Mstro__AddrIBUD *message,
+                      ProtobufCAllocator *allocator);
+/* Mstro__OfiAddr methods */
+void   mstro__ofi_addr__init
+                     (Mstro__OfiAddr         *message);
+size_t mstro__ofi_addr__get_packed_size
+                     (const Mstro__OfiAddr   *message);
+size_t mstro__ofi_addr__pack
+                     (const Mstro__OfiAddr   *message,
+                      uint8_t             *out);
+size_t mstro__ofi_addr__pack_to_buffer
+                     (const Mstro__OfiAddr   *message,
+                      ProtobufCBuffer     *buffer);
+Mstro__OfiAddr *
+       mstro__ofi_addr__unpack
+                     (ProtobufCAllocator  *allocator,
+                      size_t               len,
+                      const uint8_t       *data);
+void   mstro__ofi_addr__free_unpacked
+                     (Mstro__OfiAddr *message,
                       ProtobufCAllocator *allocator);
 /* Mstro__Endpoint methods */
 void   mstro__endpoint__init
@@ -494,43 +585,62 @@ Mstro__OfiMemoryRegion *
 void   mstro__ofi_memory_region__free_unpacked
                      (Mstro__OfiMemoryRegion *message,
                       ProtobufCAllocator *allocator);
-/* Mstro__PM methods */
-void   mstro__pm__init
-                     (Mstro__PM         *message);
-size_t mstro__pm__get_packed_size
-                     (const Mstro__PM   *message);
-size_t mstro__pm__pack
-                     (const Mstro__PM   *message,
+/* Mstro__CredDRC methods */
+void   mstro__cred_drc__init
+                     (Mstro__CredDRC         *message);
+size_t mstro__cred_drc__get_packed_size
+                     (const Mstro__CredDRC   *message);
+size_t mstro__cred_drc__pack
+                     (const Mstro__CredDRC   *message,
+                      uint8_t             *out);
+size_t mstro__cred_drc__pack_to_buffer
+                     (const Mstro__CredDRC   *message,
+                      ProtobufCBuffer     *buffer);
+Mstro__CredDRC *
+       mstro__cred_drc__unpack
+                     (ProtobufCAllocator  *allocator,
+                      size_t               len,
+                      const uint8_t       *data);
+void   mstro__cred_drc__free_unpacked
+                     (Mstro__CredDRC *message,
+                      ProtobufCAllocator *allocator);
+/* Mstro__OfiCredential methods */
+void   mstro__ofi_credential__init
+                     (Mstro__OfiCredential         *message);
+size_t mstro__ofi_credential__get_packed_size
+                     (const Mstro__OfiCredential   *message);
+size_t mstro__ofi_credential__pack
+                     (const Mstro__OfiCredential   *message,
                       uint8_t             *out);
-size_t mstro__pm__pack_to_buffer
-                     (const Mstro__PM   *message,
+size_t mstro__ofi_credential__pack_to_buffer
+                     (const Mstro__OfiCredential   *message,
                       ProtobufCBuffer     *buffer);
-Mstro__PM *
-       mstro__pm__unpack
+Mstro__OfiCredential *
+       mstro__ofi_credential__unpack
                      (ProtobufCAllocator  *allocator,
                       size_t               len,
                       const uint8_t       *data);
-void   mstro__pm__free_unpacked
-                     (Mstro__PM *message,
+void   mstro__ofi_credential__free_unpacked
+                     (Mstro__OfiCredential *message,
                       ProtobufCAllocator *allocator);
-/* Mstro__PmInfo methods */
-void   mstro__pm_info__init
-                     (Mstro__PmInfo         *message);
-size_t mstro__pm_info__get_packed_size
-                     (const Mstro__PmInfo   *message);
-size_t mstro__pm_info__pack
-                     (const Mstro__PmInfo   *message,
+/* Mstro__EndpointList methods */
+void   mstro__endpoint_list__init
+                     (Mstro__EndpointList         *message);
+size_t mstro__endpoint_list__get_packed_size
+                     (const Mstro__EndpointList   *message);
+size_t mstro__endpoint_list__pack
+                     (const Mstro__EndpointList   *message,
                       uint8_t             *out);
-size_t mstro__pm_info__pack_to_buffer
-                     (const Mstro__PmInfo   *message,
+size_t mstro__endpoint_list__pack_to_buffer
+                     (const Mstro__EndpointList   *message,
                       ProtobufCBuffer     *buffer);
-Mstro__PmInfo *
-       mstro__pm_info__unpack
+Mstro__EndpointList *
+       mstro__endpoint_list__unpack
                      (ProtobufCAllocator  *allocator,
                       size_t               len,
                       const uint8_t       *data);
-void   mstro__pm_info__free_unpacked
-                     (Mstro__PmInfo *message,
+void   mstro__endpoint_list__free_unpacked
+                     (Mstro__EndpointList *message,
                       ProtobufCAllocator *allocator);
 /* Mstro__AppInfo methods */
 void   mstro__app_info__init
@@ -553,29 +663,32 @@ void   mstro__app_info__free_unpacked
                       ProtobufCAllocator *allocator);
 /* --- per-message closures --- */
 
-typedef void (*Mstro__Sockaddr_Closure)
-                 (const Mstro__Sockaddr *message,
+typedef void (*Mstro__AddrSockaddr_Closure)
+                 (const Mstro__AddrSockaddr *message,
+                  void *closure_data);
+typedef void (*Mstro__AddrSockaddrIN4_Closure)
+                 (const Mstro__AddrSockaddrIN4 *message,
                   void *closure_data);
-typedef void (*Mstro__SockaddrIn4_Closure)
-                 (const Mstro__SockaddrIn4 *message,
+typedef void (*Mstro__AddrSockaddrIN6_Closure)
+                 (const Mstro__AddrSockaddrIN6 *message,
                   void *closure_data);
-typedef void (*Mstro__SockaddrIn6_Closure)
-                 (const Mstro__SockaddrIn6 *message,
+typedef void (*Mstro__AddrSockaddrIB_Closure)
+                 (const Mstro__AddrSockaddrIB *message,
                   void *closure_data);
-typedef void (*Mstro__SockaddrIb_Closure)
-                 (const Mstro__SockaddrIb *message,
+typedef void (*Mstro__AddrGNI_Closure)
+                 (const Mstro__AddrGNI *message,
                   void *closure_data);
-typedef void (*Mstro__Gni_Closure)
-                 (const Mstro__Gni *message,
+typedef void (*Mstro__AddrPSMX2_Closure)
+                 (const Mstro__AddrPSMX2 *message,
                   void *closure_data);
-typedef void (*Mstro__Psmx2_Closure)
-                 (const Mstro__Psmx2 *message,
+typedef void (*Mstro__AddrPSMX3_Closure)
+                 (const Mstro__AddrPSMX3 *message,
                   void *closure_data);
-typedef void (*Mstro__IbUb_Closure)
-                 (const Mstro__IbUb *message,
+typedef void (*Mstro__AddrIBUD_Closure)
+                 (const Mstro__AddrIBUD *message,
                   void *closure_data);
-typedef void (*Mstro__Ofi_Closure)
-                 (const Mstro__Ofi *message,
+typedef void (*Mstro__OfiAddr_Closure)
+                 (const Mstro__OfiAddr *message,
                   void *closure_data);
 typedef void (*Mstro__Endpoint_Closure)
                  (const Mstro__Endpoint *message,
@@ -583,11 +696,14 @@ typedef void (*Mstro__Endpoint_Closure)
 typedef void (*Mstro__OfiMemoryRegion_Closure)
                  (const Mstro__OfiMemoryRegion *message,
                   void *closure_data);
-typedef void (*Mstro__PM_Closure)
-                 (const Mstro__PM *message,
+typedef void (*Mstro__CredDRC_Closure)
+                 (const Mstro__CredDRC *message,
+                  void *closure_data);
+typedef void (*Mstro__OfiCredential_Closure)
+                 (const Mstro__OfiCredential *message,
                   void *closure_data);
-typedef void (*Mstro__PmInfo_Closure)
-                 (const Mstro__PmInfo *message,
+typedef void (*Mstro__EndpointList_Closure)
+                 (const Mstro__EndpointList *message,
                   void *closure_data);
 typedef void (*Mstro__AppInfo_Closure)
                  (const Mstro__AppInfo *message,
@@ -598,18 +714,21 @@ typedef void (*Mstro__AppInfo_Closure)
 
 /* --- descriptors --- */
 
-extern const ProtobufCMessageDescriptor mstro__sockaddr__descriptor;
-extern const ProtobufCMessageDescriptor mstro__sockaddr_in4__descriptor;
-extern const ProtobufCMessageDescriptor mstro__sockaddr_in6__descriptor;
-extern const ProtobufCMessageDescriptor mstro__sockaddr_ib__descriptor;
-extern const ProtobufCMessageDescriptor mstro__gni__descriptor;
-extern const ProtobufCMessageDescriptor mstro__psmx2__descriptor;
-extern const ProtobufCMessageDescriptor mstro__ib_ub__descriptor;
-extern const ProtobufCMessageDescriptor mstro__ofi__descriptor;
+extern const ProtobufCEnumDescriptor    mstro__ofi_endpoint_kind__descriptor;
+extern const ProtobufCMessageDescriptor mstro__addr_sockaddr__descriptor;
+extern const ProtobufCMessageDescriptor mstro__addr_sockaddr__in4__descriptor;
+extern const ProtobufCMessageDescriptor mstro__addr_sockaddr__in6__descriptor;
+extern const ProtobufCMessageDescriptor mstro__addr_sockaddr__ib__descriptor;
+extern const ProtobufCMessageDescriptor mstro__addr_gni__descriptor;
+extern const ProtobufCMessageDescriptor mstro__addr_psmx2__descriptor;
+extern const ProtobufCMessageDescriptor mstro__addr_psmx3__descriptor;
+extern const ProtobufCMessageDescriptor mstro__addr_ib__ud__descriptor;
+extern const ProtobufCMessageDescriptor mstro__ofi_addr__descriptor;
 extern const ProtobufCMessageDescriptor mstro__endpoint__descriptor;
 extern const ProtobufCMessageDescriptor mstro__ofi_memory_region__descriptor;
-extern const ProtobufCMessageDescriptor mstro__pm__descriptor;
-extern const ProtobufCMessageDescriptor mstro__pm_info__descriptor;
+extern const ProtobufCMessageDescriptor mstro__cred_drc__descriptor;
+extern const ProtobufCMessageDescriptor mstro__ofi_credential__descriptor;
+extern const ProtobufCMessageDescriptor mstro__endpoint_list__descriptor;
 extern const ProtobufCMessageDescriptor mstro__app_info__descriptor;
 
 PROTOBUF_C__END_DECLS
diff --git a/protocols/mstro_ep.proto b/protocols/mstro_ep.proto
index 2f0c236de8fef69f403aaf58eda9e750c81ee1eb..b6b5623381a9950509034809c51a6857821dea65 100644
--- a/protocols/mstro_ep.proto
+++ b/protocols/mstro_ep.proto
@@ -17,20 +17,20 @@ option optimize_for = SPEED;
  **/
 
 /* see https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/sys_socket.h.html */
-message sockaddr {
+message AddrSockaddr {
   fixed64 sa_family = 1;
   bytes   sa_data   = 2;
 };
 
 /* see https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/netinet_in.h.html */
-message sockaddr_in4 {
+message AddrSockaddr_IN4 {
   fixed64 sin_family = 1; /* * Posix does not specify exact size, but darwin has 1 byte, linux 2, so 8 should be plenty */
   uint32  sin_port   = 2; /* 16 bit, actually */
   fixed32 sin_addr   = 3;
 };
 
 /* see https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/netinet_in.h.html */
-message sockaddr_in6 {
+message AddrSockaddr_IN6 {
   fixed64 sin6_family   = 1; /* posix does not specify exact size */
   uint32  sin6_port     = 2; /* 16 bit, actually */
   fixed32 sin6_flowinfo = 3;
@@ -41,84 +41,123 @@ message sockaddr_in6 {
 };
 
 /* see linux/latest/source/include/rdma/ib.h */
-message sockaddr_ib {
+message AddrSockaddr_IB {
   fixed64 sib_family  = 1;
   fixed64 sib_addr_0  = 2;
   fixed64 sib_addr_1  = 3;
 };
 
 /* address of a GNI adapter */
-message gni {
-  fixed64 a1 = 1;
-  fixed64 a2 = 2;
-  fixed64 a3 = 3;
-  fixed64 a4 = 4;
-  fixed64 a5 = 5;
-  fixed64 a6 = 6;
+message AddrGNI {
+  fixed64 a0 = 1;
+  fixed64 a1 = 2;
+  fixed64 a2 = 3;
+  fixed64 a3 = 4;
+  fixed64 a4 = 5;
+  fixed64 a5 = 6;
 };
 
-message psmx2 {
-  fixed64 a1 = 1;
-  fixed64 a2 = 2;
+message AddrPSMX2 {
+  fixed64 a0 = 1;
+  fixed64 a1 = 2;
 };
 
-message ib_ub {
-  fixed64 a1 = 1;
-  fixed64 a2 = 2;
-  fixed64 a3 = 3;
-  fixed64 a4 = 4;
+message AddrPSMX3 {
+  fixed64 a0 = 1;
+  fixed64 a1 = 2;
 };
 
-/** Open Fabric endpoint */
-message Ofi {
+message AddrIB_UD {
+  fixed64 a0 = 1;
+  fixed64 a1 = 2;
+  fixed64 a2 = 3;
+  fixed64 a3 = 4;
+};
+
+/** Open Fabric endpoint address */
+message OfiAddr {
   oneof val {
-    /* provider-specific address kinds as of OFI 1.10 */
+    /* provider-specific address kinds as of OFI 1.14 */
     bytes            unspec = 1;
-    sockaddr         sock   = 2;
-    sockaddr_in4     in4    = 3;
-    sockaddr_in6     in6    = 4;
-    sockaddr_ib      ib     = 5;
+    AddrSockaddr     sock   = 2;
+    AddrSockaddr_IN4 in4    = 3;
+    AddrSockaddr_IN6 in6    = 4;
+    AddrSockaddr_IB  ib     = 5;
     fixed64          psmx   = 6;
-    gni              gni    = 7;  /* 6 elements, to be exact */
+    AddrGNI          gni    = 7;  /* 6 elements, to be exact */
     fixed64          bgq    = 8;
     fixed64          mlx    = 9;
     string           str    = 10;
-    psmx2            psmx2  = 11; /* 2 elements, to be exact */
-    ib_ub            ib_ub  = 12; /* 4 elements, to be exact */
+    AddrPSMX2        psmx2  = 11; /* 2 uint64 values */
+    AddrIB_UD        ib_ud  = 12; /* 4 uint64 values */
     fixed64          efa    = 13;
+    AddrPSMX3        psmx3  = 14; /* 2 uint64 values */
   }
 };
 
+/** Open Fabric endpoint protocols */
+enum OfiEndpointKind {
+  /* endpoint types as of OFI 1.14 */
+  UNSPEC          = 0;
+  RDMA_CM_IB_RC   = 1;
+  IWARP           = 2;
+  IB_UD           = 3;
+  PSMX            = 4;
+  UDP             = 5;
+  SOCK_TCP        = 6;
+  MXM             = 7;
+  IWARP_RDM       = 8;
+  IB_RDM          = 9;
+  GNI             = 10;
+  RXM             = 11;
+  RXD             = 12;
+  MLX             = 13;
+  NETWORKDIRECT   = 14;
+  PSMX2           = 15;
+  SHM             = 16;
+  MRAIL           = 17;
+  RSTREAM         = 18;
+  RDMA_CM_IB_XRC  = 19;
+  EFA             = 20;
+  PSMX3           = 21;
+}
+
 message Endpoint {
-  oneof kind {
-    Ofi ofi = 1;
+  oneof proto {
+    OfiEndpointKind ofiproto = 1;
+  }
+  oneof addr {
+    OfiAddr ofiaddr = 2;
   }
 };
 
 /** Information about an OFI RDMA region */
 message OfiMemoryRegion {
   fixed64 baseaddr = 1;
-  fixed64 keylen = 2;
-  bytes   raw_key = 3;
+  bytes   raw_key = 2;
 }
 
-/** A Pool Manager endpoint list and basic info */
-message PM {
-  /** the list of endpoints on which this PM entity is reachable */
-  repeated Endpoint ep = 1;
-  /** access information for the PM Info Region */
-  oneof inforeg {
-    OfiMemoryRegion ofi = 2;
+message CredDRC {
+  fixed32 credential = 1;
+}
+
+message OfiCredential {
+  oneof val {
+    CredDRC drc = 1; /* Cray DRC credential */
   }
-};
+}
 
-/** A Pool Manager Info block */
-message PmInfo {
-  /** each PM entity provides one PM entry */
-  repeated PM pm = 1;
+/** An endpoint list and basic info. All three arrays are parallel and must have the same length. */
+message EndpointList {
+  /** the list of endpoints on which this PM entity is reachable */
+  repeated Endpoint eps = 1;
+  /** access information for the Info Region */
+  repeated OfiMemoryRegion inforegs = 2;
+  /** possibly access credentials */
+  repeated OfiCredential credentials = 3;
 };
 
-/** An App Info block */
+/** An application (including the pool manager) endpoint info block */
 message AppInfo {
-  repeated Endpoint ep = 1;
+  EndpointList eps = 1;
 };
diff --git a/protocols/mstro_pool.pb-c.c b/protocols/mstro_pool.pb-c.c
index b03e5ed7d1080e930270ad0cd8c99595693bc654..be5d2552512c60db9d90a6e75ae13720f0be4b16 100644
--- a/protocols/mstro_pool.pb-c.c
+++ b/protocols/mstro_pool.pb-c.c
@@ -637,6 +637,186 @@ void   mstro__pool__timestamp__free_unpacked
   assert(message->base.descriptor == &mstro__pool__timestamp__descriptor);
   protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
 }
+void   mstro__pool__mmb_layout_irregular__init
+                     (Mstro__Pool__MmbLayoutIrregular         *message)
+{
+  static const Mstro__Pool__MmbLayoutIrregular init_value = MSTRO__POOL__MMB_LAYOUT_IRREGULAR__INIT;
+  *message = init_value;
+}
+size_t mstro__pool__mmb_layout_irregular__get_packed_size
+                     (const Mstro__Pool__MmbLayoutIrregular *message)
+{
+  assert(message->base.descriptor == &mstro__pool__mmb_layout_irregular__descriptor);
+  return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
+}
+size_t mstro__pool__mmb_layout_irregular__pack
+                     (const Mstro__Pool__MmbLayoutIrregular *message,
+                      uint8_t       *out)
+{
+  assert(message->base.descriptor == &mstro__pool__mmb_layout_irregular__descriptor);
+  return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
+}
+size_t mstro__pool__mmb_layout_irregular__pack_to_buffer
+                     (const Mstro__Pool__MmbLayoutIrregular *message,
+                      ProtobufCBuffer *buffer)
+{
+  assert(message->base.descriptor == &mstro__pool__mmb_layout_irregular__descriptor);
+  return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
+}
+Mstro__Pool__MmbLayoutIrregular *
+       mstro__pool__mmb_layout_irregular__unpack
+                     (ProtobufCAllocator  *allocator,
+                      size_t               len,
+                      const uint8_t       *data)
+{
+  return (Mstro__Pool__MmbLayoutIrregular *)
+     protobuf_c_message_unpack (&mstro__pool__mmb_layout_irregular__descriptor,
+                                allocator, len, data);
+}
+void   mstro__pool__mmb_layout_irregular__free_unpacked
+                     (Mstro__Pool__MmbLayoutIrregular *message,
+                      ProtobufCAllocator *allocator)
+{
+  if(!message)
+    return;
+  assert(message->base.descriptor == &mstro__pool__mmb_layout_irregular__descriptor);
+  protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
+}
+void   mstro__pool__mmb_dimensions__init
+                     (Mstro__Pool__MmbDimensions         *message)
+{
+  static const Mstro__Pool__MmbDimensions init_value = MSTRO__POOL__MMB_DIMENSIONS__INIT;
+  *message = init_value;
+}
+size_t mstro__pool__mmb_dimensions__get_packed_size
+                     (const Mstro__Pool__MmbDimensions *message)
+{
+  assert(message->base.descriptor == &mstro__pool__mmb_dimensions__descriptor);
+  return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
+}
+size_t mstro__pool__mmb_dimensions__pack
+                     (const Mstro__Pool__MmbDimensions *message,
+                      uint8_t       *out)
+{
+  assert(message->base.descriptor == &mstro__pool__mmb_dimensions__descriptor);
+  return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
+}
+size_t mstro__pool__mmb_dimensions__pack_to_buffer
+                     (const Mstro__Pool__MmbDimensions *message,
+                      ProtobufCBuffer *buffer)
+{
+  assert(message->base.descriptor == &mstro__pool__mmb_dimensions__descriptor);
+  return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
+}
+Mstro__Pool__MmbDimensions *
+       mstro__pool__mmb_dimensions__unpack
+                     (ProtobufCAllocator  *allocator,
+                      size_t               len,
+                      const uint8_t       *data)
+{
+  return (Mstro__Pool__MmbDimensions *)
+     protobuf_c_message_unpack (&mstro__pool__mmb_dimensions__descriptor,
+                                allocator, len, data);
+}
+void   mstro__pool__mmb_dimensions__free_unpacked
+                     (Mstro__Pool__MmbDimensions *message,
+                      ProtobufCAllocator *allocator)
+{
+  if(!message)
+    return;
+  assert(message->base.descriptor == &mstro__pool__mmb_dimensions__descriptor);
+  protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
+}
+void   mstro__pool__mmb_layout_block__init
+                     (Mstro__Pool__MmbLayoutBlock         *message)
+{
+  static const Mstro__Pool__MmbLayoutBlock init_value = MSTRO__POOL__MMB_LAYOUT_BLOCK__INIT;
+  *message = init_value;
+}
+size_t mstro__pool__mmb_layout_block__get_packed_size
+                     (const Mstro__Pool__MmbLayoutBlock *message)
+{
+  assert(message->base.descriptor == &mstro__pool__mmb_layout_block__descriptor);
+  return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
+}
+size_t mstro__pool__mmb_layout_block__pack
+                     (const Mstro__Pool__MmbLayoutBlock *message,
+                      uint8_t       *out)
+{
+  assert(message->base.descriptor == &mstro__pool__mmb_layout_block__descriptor);
+  return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
+}
+size_t mstro__pool__mmb_layout_block__pack_to_buffer
+                     (const Mstro__Pool__MmbLayoutBlock *message,
+                      ProtobufCBuffer *buffer)
+{
+  assert(message->base.descriptor == &mstro__pool__mmb_layout_block__descriptor);
+  return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
+}
+Mstro__Pool__MmbLayoutBlock *
+       mstro__pool__mmb_layout_block__unpack
+                     (ProtobufCAllocator  *allocator,
+                      size_t               len,
+                      const uint8_t       *data)
+{
+  return (Mstro__Pool__MmbLayoutBlock *)
+     protobuf_c_message_unpack (&mstro__pool__mmb_layout_block__descriptor,
+                                allocator, len, data);
+}
+void   mstro__pool__mmb_layout_block__free_unpacked
+                     (Mstro__Pool__MmbLayoutBlock *message,
+                      ProtobufCAllocator *allocator)
+{
+  if(!message)
+    return;
+  assert(message->base.descriptor == &mstro__pool__mmb_layout_block__descriptor);
+  protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
+}
+void   mstro__pool__mmblayout__init
+                     (Mstro__Pool__Mmblayout         *message)
+{
+  static const Mstro__Pool__Mmblayout init_value = MSTRO__POOL__MMBLAYOUT__INIT;
+  *message = init_value;
+}
+size_t mstro__pool__mmblayout__get_packed_size
+                     (const Mstro__Pool__Mmblayout *message)
+{
+  assert(message->base.descriptor == &mstro__pool__mmblayout__descriptor);
+  return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
+}
+size_t mstro__pool__mmblayout__pack
+                     (const Mstro__Pool__Mmblayout *message,
+                      uint8_t       *out)
+{
+  assert(message->base.descriptor == &mstro__pool__mmblayout__descriptor);
+  return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
+}
+size_t mstro__pool__mmblayout__pack_to_buffer
+                     (const Mstro__Pool__Mmblayout *message,
+                      ProtobufCBuffer *buffer)
+{
+  assert(message->base.descriptor == &mstro__pool__mmblayout__descriptor);
+  return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
+}
+Mstro__Pool__Mmblayout *
+       mstro__pool__mmblayout__unpack
+                     (ProtobufCAllocator  *allocator,
+                      size_t               len,
+                      const uint8_t       *data)
+{
+  return (Mstro__Pool__Mmblayout *)
+     protobuf_c_message_unpack (&mstro__pool__mmblayout__descriptor,
+                                allocator, len, data);
+}
+void   mstro__pool__mmblayout__free_unpacked
+                     (Mstro__Pool__Mmblayout *message,
+                      ProtobufCAllocator *allocator)
+{
+  if(!message)
+    return;
+  assert(message->base.descriptor == &mstro__pool__mmblayout__descriptor);
+  protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
+}
 void   mstro__pool__group_members__init
                      (Mstro__Pool__GroupMembers         *message)
 {
@@ -3143,6 +3323,330 @@ const ProtobufCMessageDescriptor mstro__pool__timestamp__descriptor =
   (ProtobufCMessageInit) mstro__pool__timestamp__init,
   NULL,NULL,NULL    /* reserved[123] */
 };
+static const ProtobufCFieldDescriptor mstro__pool__mmb_layout_irregular__field_descriptors[3] =
+{
+  {
+    "n_blocks",
+    1,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_FIXED64,
+    0,   /* quantifier_offset */
+    offsetof(Mstro__Pool__MmbLayoutIrregular, n_blocks),
+    NULL,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+  {
+    "offsets",
+    2,
+    PROTOBUF_C_LABEL_REPEATED,
+    PROTOBUF_C_TYPE_FIXED64,
+    offsetof(Mstro__Pool__MmbLayoutIrregular, n_offsets),
+    offsetof(Mstro__Pool__MmbLayoutIrregular, offsets),
+    NULL,
+    NULL,
+    0 | PROTOBUF_C_FIELD_FLAG_PACKED,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+  {
+    "lengths",
+    3,
+    PROTOBUF_C_LABEL_REPEATED,
+    PROTOBUF_C_TYPE_FIXED64,
+    offsetof(Mstro__Pool__MmbLayoutIrregular, n_lengths),
+    offsetof(Mstro__Pool__MmbLayoutIrregular, lengths),
+    NULL,
+    NULL,
+    0 | PROTOBUF_C_FIELD_FLAG_PACKED,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+};
+static const unsigned mstro__pool__mmb_layout_irregular__field_indices_by_name[] = {
+  2,   /* field[2] = lengths */
+  0,   /* field[0] = n_blocks */
+  1,   /* field[1] = offsets */
+};
+static const ProtobufCIntRange mstro__pool__mmb_layout_irregular__number_ranges[1 + 1] =
+{
+  { 1, 0 },
+  { 0, 3 }
+};
+const ProtobufCMessageDescriptor mstro__pool__mmb_layout_irregular__descriptor =
+{
+  PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
+  "mstro.pool.mmbLayoutIrregular",
+  "MmbLayoutIrregular",
+  "Mstro__Pool__MmbLayoutIrregular",
+  "mstro.pool",
+  sizeof(Mstro__Pool__MmbLayoutIrregular),
+  3,
+  mstro__pool__mmb_layout_irregular__field_descriptors,
+  mstro__pool__mmb_layout_irregular__field_indices_by_name,
+  1,  mstro__pool__mmb_layout_irregular__number_ranges,
+  (ProtobufCMessageInit) mstro__pool__mmb_layout_irregular__init,
+  NULL,NULL,NULL    /* reserved[123] */
+};
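
offsets and lengths are packed repeated fixed64 fields and are parallel per block, with n_blocks mirroring their common length. A hedged sketch with placeholder geometry, assuming the generated header is available:

#include "mstro_pool.pb-c.h"   /* generated header; include path is an assumption */

static void fill_irregular_layout(Mstro__Pool__MmbLayoutIrregular *irr)
{
  static uint64_t offsets[3] = {   0, 128, 512 };  /* placeholder offsets */
  static uint64_t lengths[3] = { 128, 384,  64 };  /* placeholder lengths */

  mstro__pool__mmb_layout_irregular__init(irr);
  irr->n_blocks  = 3;                      /* must match the array lengths */
  irr->n_offsets = 3;  irr->offsets = offsets;
  irr->n_lengths = 3;  irr->lengths = lengths;
}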
+static const ProtobufCFieldDescriptor mstro__pool__mmb_dimensions__field_descriptors[1] =
+{
+  {
+    "d",
+    1,
+    PROTOBUF_C_LABEL_REPEATED,
+    PROTOBUF_C_TYPE_FIXED64,
+    offsetof(Mstro__Pool__MmbDimensions, n_d),
+    offsetof(Mstro__Pool__MmbDimensions, d),
+    NULL,
+    NULL,
+    0 | PROTOBUF_C_FIELD_FLAG_PACKED,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+};
+static const unsigned mstro__pool__mmb_dimensions__field_indices_by_name[] = {
+  0,   /* field[0] = d */
+};
+static const ProtobufCIntRange mstro__pool__mmb_dimensions__number_ranges[1 + 1] =
+{
+  { 1, 0 },
+  { 0, 1 }
+};
+const ProtobufCMessageDescriptor mstro__pool__mmb_dimensions__descriptor =
+{
+  PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
+  "mstro.pool.mmbDimensions",
+  "MmbDimensions",
+  "Mstro__Pool__MmbDimensions",
+  "mstro.pool",
+  sizeof(Mstro__Pool__MmbDimensions),
+  1,
+  mstro__pool__mmb_dimensions__field_descriptors,
+  mstro__pool__mmb_dimensions__field_indices_by_name,
+  1,  mstro__pool__mmb_dimensions__number_ranges,
+  (ProtobufCMessageInit) mstro__pool__mmb_dimensions__init,
+  NULL,NULL,NULL    /* reserved[123] */
+};
+static const ProtobufCEnumValue mstro__pool__mmb_layout_block__mmb_layout_order__enum_values_by_number[5] =
+{
+  { "MMB_LAYOUT_ORDER_NONE", "MSTRO__POOL__MMB_LAYOUT_BLOCK__MMB_LAYOUT_ORDER__MMB_LAYOUT_ORDER_NONE", 0 },
+  { "MMB_ROWMAJOR", "MSTRO__POOL__MMB_LAYOUT_BLOCK__MMB_LAYOUT_ORDER__MMB_ROWMAJOR", 1 },
+  { "MMB_COLMAJOR", "MSTRO__POOL__MMB_LAYOUT_BLOCK__MMB_LAYOUT_ORDER__MMB_COLMAJOR", 2 },
+  { "MMB_GENERIC_ND", "MSTRO__POOL__MMB_LAYOUT_BLOCK__MMB_LAYOUT_ORDER__MMB_GENERIC_ND", 3 },
+  { "MMB_LAYOUT_ORDER_MAX", "MSTRO__POOL__MMB_LAYOUT_BLOCK__MMB_LAYOUT_ORDER__MMB_LAYOUT_ORDER_MAX", 4 },
+};
+static const ProtobufCIntRange mstro__pool__mmb_layout_block__mmb_layout_order__value_ranges[] = {
+{0, 0},{0, 5}
+};
+static const ProtobufCEnumValueIndex mstro__pool__mmb_layout_block__mmb_layout_order__enum_values_by_name[5] =
+{
+  { "MMB_COLMAJOR", 2 },
+  { "MMB_GENERIC_ND", 3 },
+  { "MMB_LAYOUT_ORDER_MAX", 4 },
+  { "MMB_LAYOUT_ORDER_NONE", 0 },
+  { "MMB_ROWMAJOR", 1 },
+};
+const ProtobufCEnumDescriptor mstro__pool__mmb_layout_block__mmb_layout_order__descriptor =
+{
+  PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC,
+  "mstro.pool.mmbLayoutBlock.mmbLayoutOrder",
+  "mmbLayoutOrder",
+  "Mstro__Pool__MmbLayoutBlock__MmbLayoutOrder",
+  "mstro.pool",
+  5,
+  mstro__pool__mmb_layout_block__mmb_layout_order__enum_values_by_number,
+  5,
+  mstro__pool__mmb_layout_block__mmb_layout_order__enum_values_by_name,
+  1,
+  mstro__pool__mmb_layout_block__mmb_layout_order__value_ranges,
+  NULL,NULL,NULL,NULL   /* reserved[1234] */
+};
+static const ProtobufCFieldDescriptor mstro__pool__mmb_layout_block__field_descriptors[2] =
+{
+  {
+    "order",
+    1,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_ENUM,
+    0,   /* quantifier_offset */
+    offsetof(Mstro__Pool__MmbLayoutBlock, order),
+    &mstro__pool__mmb_layout_block__mmb_layout_order__descriptor,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+  {
+    "dimensions",
+    2,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_MESSAGE,
+    0,   /* quantifier_offset */
+    offsetof(Mstro__Pool__MmbLayoutBlock, dimensions),
+    &mstro__pool__mmb_dimensions__descriptor,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+};
+static const unsigned mstro__pool__mmb_layout_block__field_indices_by_name[] = {
+  1,   /* field[1] = dimensions */
+  0,   /* field[0] = order */
+};
+static const ProtobufCIntRange mstro__pool__mmb_layout_block__number_ranges[1 + 1] =
+{
+  { 1, 0 },
+  { 0, 2 }
+};
+const ProtobufCMessageDescriptor mstro__pool__mmb_layout_block__descriptor =
+{
+  PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
+  "mstro.pool.mmbLayoutBlock",
+  "MmbLayoutBlock",
+  "Mstro__Pool__MmbLayoutBlock",
+  "mstro.pool",
+  sizeof(Mstro__Pool__MmbLayoutBlock),
+  2,
+  mstro__pool__mmb_layout_block__field_descriptors,
+  mstro__pool__mmb_layout_block__field_indices_by_name,
+  1,  mstro__pool__mmb_layout_block__number_ranges,
+  (ProtobufCMessageInit) mstro__pool__mmb_layout_block__init,
+  NULL,NULL,NULL    /* reserved[123] */
+};
+static const ProtobufCEnumValue mstro__pool__mmblayout__mmb_layout_type__enum_values_by_number[5] =
+{
+  { "MMB_LAYOUT_NONE", "MSTRO__POOL__MMBLAYOUT__MMB_LAYOUT_TYPE__MMB_LAYOUT_NONE", 0 },
+  { "MMB_REGULAR", "MSTRO__POOL__MMBLAYOUT__MMB_LAYOUT_TYPE__MMB_REGULAR", 1 },
+  { "MMB_REGULAR_BLOCK", "MSTRO__POOL__MMBLAYOUT__MMB_LAYOUT_TYPE__MMB_REGULAR_BLOCK", 2 },
+  { "MMB_IRREGULAR", "MSTRO__POOL__MMBLAYOUT__MMB_LAYOUT_TYPE__MMB_IRREGULAR", 3 },
+  { "MMB_LAYOUT_TYPE_MAX", "MSTRO__POOL__MMBLAYOUT__MMB_LAYOUT_TYPE__MMB_LAYOUT_TYPE_MAX", 4 },
+};
+static const ProtobufCIntRange mstro__pool__mmblayout__mmb_layout_type__value_ranges[] = {
+{0, 0},{0, 5}
+};
+static const ProtobufCEnumValueIndex mstro__pool__mmblayout__mmb_layout_type__enum_values_by_name[5] =
+{
+  { "MMB_IRREGULAR", 3 },
+  { "MMB_LAYOUT_NONE", 0 },
+  { "MMB_LAYOUT_TYPE_MAX", 4 },
+  { "MMB_REGULAR", 1 },
+  { "MMB_REGULAR_BLOCK", 2 },
+};
+const ProtobufCEnumDescriptor mstro__pool__mmblayout__mmb_layout_type__descriptor =
+{
+  PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC,
+  "mstro.pool.mmblayout.mmbLayoutType",
+  "mmbLayoutType",
+  "Mstro__Pool__Mmblayout__MmbLayoutType",
+  "mstro.pool",
+  5,
+  mstro__pool__mmblayout__mmb_layout_type__enum_values_by_number,
+  5,
+  mstro__pool__mmblayout__mmb_layout_type__enum_values_by_name,
+  1,
+  mstro__pool__mmblayout__mmb_layout_type__value_ranges,
+  NULL,NULL,NULL,NULL   /* reserved[1234] */
+};
+static const ProtobufCFieldDescriptor mstro__pool__mmblayout__field_descriptors[6] =
+{
+  {
+    "type",
+    1,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_ENUM,
+    0,   /* quantifier_offset */
+    offsetof(Mstro__Pool__Mmblayout, type),
+    &mstro__pool__mmblayout__mmb_layout_type__descriptor,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+  {
+    "n_dims",
+    2,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_FIXED64,
+    0,   /* quantifier_offset */
+    offsetof(Mstro__Pool__Mmblayout, n_dims),
+    NULL,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+  {
+    "index",
+    3,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_FIXED64,
+    0,   /* quantifier_offset */
+    offsetof(Mstro__Pool__Mmblayout, index),
+    NULL,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+  {
+    "element_size_bytes",
+    4,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_FIXED64,
+    0,   /* quantifier_offset */
+    offsetof(Mstro__Pool__Mmblayout, element_size_bytes),
+    NULL,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+  {
+    "block",
+    5,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_MESSAGE,
+    offsetof(Mstro__Pool__Mmblayout, layout_case),
+    offsetof(Mstro__Pool__Mmblayout, block),
+    &mstro__pool__mmb_layout_block__descriptor,
+    NULL,
+    0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+  {
+    "irregular",
+    6,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_MESSAGE,
+    offsetof(Mstro__Pool__Mmblayout, layout_case),
+    offsetof(Mstro__Pool__Mmblayout, irregular),
+    &mstro__pool__mmb_layout_irregular__descriptor,
+    NULL,
+    0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+};
+static const unsigned mstro__pool__mmblayout__field_indices_by_name[] = {
+  4,   /* field[4] = block */
+  3,   /* field[3] = element_size_bytes */
+  2,   /* field[2] = index */
+  5,   /* field[5] = irregular */
+  1,   /* field[1] = n_dims */
+  0,   /* field[0] = type */
+};
+static const ProtobufCIntRange mstro__pool__mmblayout__number_ranges[1 + 1] =
+{
+  { 1, 0 },
+  { 0, 6 }
+};
+const ProtobufCMessageDescriptor mstro__pool__mmblayout__descriptor =
+{
+  PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
+  "mstro.pool.mmblayout",
+  "Mmblayout",
+  "Mstro__Pool__Mmblayout",
+  "mstro.pool",
+  sizeof(Mstro__Pool__Mmblayout),
+  6,
+  mstro__pool__mmblayout__field_descriptors,
+  mstro__pool__mmblayout__field_indices_by_name,
+  1,  mstro__pool__mmblayout__number_ranges,
+  (ProtobufCMessageInit) mstro__pool__mmblayout__init,
+  NULL,NULL,NULL    /* reserved[123] */
+};
 static const ProtobufCFieldDescriptor mstro__pool__group_members__field_descriptors[3] =
 {
   {
@@ -3207,7 +3711,7 @@ const ProtobufCMessageDescriptor mstro__pool__group_members__descriptor =
   (ProtobufCMessageInit) mstro__pool__group_members__init,
   NULL,NULL,NULL    /* reserved[123] */
 };
-static const ProtobufCFieldDescriptor mstro__pool__aval__field_descriptors[10] =
+static const ProtobufCFieldDescriptor mstro__pool__aval__field_descriptors[11] =
 {
   {
     "bool",
@@ -3329,6 +3833,18 @@ static const ProtobufCFieldDescriptor mstro__pool__aval__field_descriptors[10] =
     0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
+  {
+    "mmbLayout",
+    11,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_MESSAGE,
+    offsetof(Mstro__Pool__AVal, val_case),
+    offsetof(Mstro__Pool__AVal, mmblayout),
+    &mstro__pool__mmblayout__descriptor,
+    NULL,
+    0 | PROTOBUF_C_FIELD_FLAG_ONEOF,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
 };
 static const unsigned mstro__pool__aval__field_indices_by_name[] = {
   0,   /* field[0] = bool */
@@ -3337,6 +3853,7 @@ static const unsigned mstro__pool__aval__field_indices_by_name[] = {
   5,   /* field[5] = float */
   1,   /* field[1] = int32 */
   2,   /* field[2] = int64 */
+  10,   /* field[10] = mmbLayout */
   7,   /* field[7] = string */
   9,   /* field[9] = timestamp */
   3,   /* field[3] = uint32 */
@@ -3345,7 +3862,7 @@ static const unsigned mstro__pool__aval__field_indices_by_name[] = {
 static const ProtobufCIntRange mstro__pool__aval__number_ranges[1 + 1] =
 {
   { 1, 0 },
-  { 0, 10 }
+  { 0, 11 }
 };
 const ProtobufCMessageDescriptor mstro__pool__aval__descriptor =
 {
@@ -3355,7 +3872,7 @@ const ProtobufCMessageDescriptor mstro__pool__aval__descriptor =
   "Mstro__Pool__AVal",
   "mstro.pool",
   sizeof(Mstro__Pool__AVal),
-  10,
+  11,
   mstro__pool__aval__field_descriptors,
   mstro__pool__aval__field_indices_by_name,
   1,  mstro__pool__aval__number_ranges,
@@ -4128,7 +4645,7 @@ const ProtobufCMessageDescriptor mstro__pool__transport_methods__descriptor =
   (ProtobufCMessageInit) mstro__pool__transport_methods__init,
   NULL,NULL,NULL    /* reserved[123] */
 };
-static const ProtobufCFieldDescriptor mstro__pool__initiate_transfer__field_descriptors[8] =
+static const ProtobufCFieldDescriptor mstro__pool__initiate_transfer__field_descriptors[10] =
 {
   {
     "srccdoid",
@@ -4226,21 +4743,47 @@ static const ProtobufCFieldDescriptor mstro__pool__initiate_transfer__field_desc
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
+  {
+    "n_segments",
+    9,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_SFIXED64,
+    0,   /* quantifier_offset */
+    offsetof(Mstro__Pool__InitiateTransfer, n_segments),
+    NULL,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+  {
+    "distributed_cdo",
+    10,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_BOOL,
+    0,   /* quantifier_offset */
+    offsetof(Mstro__Pool__InitiateTransfer, distributed_cdo),
+    NULL,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
 };
 static const unsigned mstro__pool__initiate_transfer__field_indices_by_name[] = {
   6,   /* field[6] = cp */
+  9,   /* field[9] = distributed_cdo */
   2,   /* field[2] = dst_appid */
   4,   /* field[4] = dst_attributes */
   3,   /* field[3] = dst_serialized_endpoint */
   1,   /* field[1] = dstcdoid */
   7,   /* field[7] = force_offer */
   5,   /* field[5] = methods */
+  8,   /* field[8] = n_segments */
   0,   /* field[0] = srccdoid */
 };
 static const ProtobufCIntRange mstro__pool__initiate_transfer__number_ranges[1 + 1] =
 {
   { 1, 0 },
-  { 0, 8 }
+  { 0, 10 }
 };
 const ProtobufCMessageDescriptor mstro__pool__initiate_transfer__descriptor =
 {
@@ -4250,7 +4793,7 @@ const ProtobufCMessageDescriptor mstro__pool__initiate_transfer__descriptor =
   "Mstro__Pool__InitiateTransfer",
   "mstro.pool",
   sizeof(Mstro__Pool__InitiateTransfer),
-  8,
+  10,
   mstro__pool__initiate_transfer__field_descriptors,
   mstro__pool__initiate_transfer__field_indices_by_name,
   1,  mstro__pool__initiate_transfer__number_ranges,
@@ -4513,7 +5056,7 @@ const ProtobufCMessageDescriptor mstro__pool__transfer_ticket_ofi__descriptor =
   (ProtobufCMessageInit) mstro__pool__transfer_ticket_ofi__init,
   NULL,NULL,NULL    /* reserved[123] */
 };
-static const ProtobufCFieldDescriptor mstro__pool__transfer_ticket__field_descriptors[14] =
+static const ProtobufCFieldDescriptor mstro__pool__transfer_ticket__field_descriptors[18] =
 {
   {
     "srccdoid",
@@ -4683,17 +5226,69 @@ static const ProtobufCFieldDescriptor mstro__pool__transfer_ticket__field_descri
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
+  {
+    "n_segments",
+    15,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_SFIXED64,
+    0,   /* quantifier_offset */
+    offsetof(Mstro__Pool__TransferTicket, n_segments),
+    NULL,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+  {
+    "src_offset",
+    16,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_SFIXED64,
+    0,   /* quantifier_offset */
+    offsetof(Mstro__Pool__TransferTicket, src_offset),
+    NULL,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+  {
+    "dst_offset",
+    17,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_SFIXED64,
+    0,   /* quantifier_offset */
+    offsetof(Mstro__Pool__TransferTicket, dst_offset),
+    NULL,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+  {
+    "distributed_cdo",
+    18,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_BOOL,
+    0,   /* quantifier_offset */
+    offsetof(Mstro__Pool__TransferTicket, distributed_cdo),
+    NULL,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
 };
 static const unsigned mstro__pool__transfer_ticket__field_indices_by_name[] = {
   12,   /* field[12] = attributes */
   11,   /* field[11] = data_size */
+  17,   /* field[17] = distributed_cdo */
+  16,   /* field[16] = dst_offset */
   1,   /* field[1] = dstcdoid */
   3,   /* field[3] = dstid */
   13,   /* field[13] = force_offer */
   7,   /* field[7] = gfs */
   6,   /* field[6] = method */
   9,   /* field[9] = mio */
+  14,   /* field[14] = n_segments */
   10,   /* field[10] = ofi */
+  15,   /* field[15] = src_offset */
   5,   /* field[5] = src_serialized_endpoint */
   0,   /* field[0] = srccdoid */
   2,   /* field[2] = srcid */
@@ -4703,7 +5298,7 @@ static const unsigned mstro__pool__transfer_ticket__field_indices_by_name[] = {
 static const ProtobufCIntRange mstro__pool__transfer_ticket__number_ranges[1 + 1] =
 {
   { 1, 0 },
-  { 0, 14 }
+  { 0, 18 }
 };
 const ProtobufCMessageDescriptor mstro__pool__transfer_ticket__descriptor =
 {
@@ -4713,7 +5308,7 @@ const ProtobufCMessageDescriptor mstro__pool__transfer_ticket__descriptor =
   "Mstro__Pool__TransferTicket",
   "mstro.pool",
   sizeof(Mstro__Pool__TransferTicket),
-  14,
+  18,
   mstro__pool__transfer_ticket__field_descriptors,
   mstro__pool__transfer_ticket__field_indices_by_name,
   1,  mstro__pool__transfer_ticket__number_ranges,
@@ -5091,7 +5686,7 @@ const ProtobufCMessageDescriptor mstro__pool__unsubscribe__descriptor =
   (ProtobufCMessageInit) mstro__pool__unsubscribe__init,
   NULL,NULL,NULL    /* reserved[123] */
 };
-static const ProtobufCFieldDescriptor mstro__pool__event__field_descriptors[24] =
+static const ProtobufCFieldDescriptor mstro__pool__event__field_descriptors[25] =
 {
   {
     "subscription_handle",
@@ -5153,6 +5748,18 @@ static const ProtobufCFieldDescriptor mstro__pool__event__field_descriptors[24]
     0,             /* flags */
     0,NULL,NULL    /* reserved1,reserved2, etc */
   },
+  {
+    "ctime",
+    6,
+    PROTOBUF_C_LABEL_NONE,
+    PROTOBUF_C_TYPE_MESSAGE,
+    0,   /* quantifier_offset */
+    offsetof(Mstro__Pool__Event, ctime),
+    &mstro__pool__timestamp__descriptor,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
   {
     "declare",
     16,
@@ -5383,37 +5990,38 @@ static const ProtobufCFieldDescriptor mstro__pool__event__field_descriptors[24]
   },
 };
 static const unsigned mstro__pool__event__field_indices_by_name[] = {
-  21,   /* field[21] = bye */
+  22,   /* field[22] = bye */
   4,   /* field[4] = cdo_name */
-  5,   /* field[5] = declare */
-  6,   /* field[6] = declare_ack */
-  12,   /* field[12] = demand */
-  14,   /* field[14] = dispose */
-  18,   /* field[18] = join */
+  5,   /* field[5] = ctime */
+  6,   /* field[6] = declare */
+  7,   /* field[7] = declare_ack */
+  13,   /* field[13] = demand */
+  15,   /* field[15] = dispose */
+  19,   /* field[19] = join */
   2,   /* field[2] = kind */
-  20,   /* field[20] = leave */
-  9,   /* field[9] = offer */
+  21,   /* field[21] = leave */
+  10,   /* field[10] = offer */
   3,   /* field[3] = origin_id */
-  10,   /* field[10] = require */
-  13,   /* field[13] = retract */
-  7,   /* field[7] = seal */
-  8,   /* field[8] = seal_group */
+  11,   /* field[11] = require */
+  14,   /* field[14] = retract */
+  8,   /* field[8] = seal */
+  9,   /* field[9] = seal_group */
   1,   /* field[1] = serial */
-  22,   /* field[22] = subscribe */
+  23,   /* field[23] = subscribe */
   0,   /* field[0] = subscription_handle */
-  17,   /* field[17] = transfer_completed */
-  15,   /* field[15] = transport_init */
-  16,   /* field[16] = transport_ticket */
-  23,   /* field[23] = unsubscribe */
-  19,   /* field[19] = welcome */
-  11,   /* field[11] = withdraw */
+  18,   /* field[18] = transfer_completed */
+  16,   /* field[16] = transport_init */
+  17,   /* field[17] = transport_ticket */
+  24,   /* field[24] = unsubscribe */
+  20,   /* field[20] = welcome */
+  12,   /* field[12] = withdraw */
 };
 static const ProtobufCIntRange mstro__pool__event__number_ranges[3 + 1] =
 {
   { 1, 0 },
-  { 16, 5 },
-  { 32, 18 },
-  { 0, 24 }
+  { 16, 6 },
+  { 32, 19 },
+  { 0, 25 }
 };
 const ProtobufCMessageDescriptor mstro__pool__event__descriptor =
 {
@@ -5423,7 +6031,7 @@ const ProtobufCMessageDescriptor mstro__pool__event__descriptor =
   "Mstro__Pool__Event",
   "mstro.pool",
   sizeof(Mstro__Pool__Event),
-  24,
+  25,
   mstro__pool__event__field_descriptors,
   mstro__pool__event__field_indices_by_name,
   3,  mstro__pool__event__number_ranges,
diff --git a/protocols/mstro_pool.pb-c.h b/protocols/mstro_pool.pb-c.h
index 6ef9ea2eab47da2f1c6d56d417fe98704d8cc845..98e013e3fe84a3667377477dc314e18ebca4a6f1 100644
--- a/protocols/mstro_pool.pb-c.h
+++ b/protocols/mstro_pool.pb-c.h
@@ -10,70 +10,90 @@ PROTOBUF_C__BEGIN_DECLS
 
 #if PROTOBUF_C_VERSION_NUMBER < 1003000
 # error This file was generated by a newer version of protoc-c which is incompatible with your libprotobuf-c headers. Please update your headers.
-#elif 1003002 < PROTOBUF_C_MIN_COMPILER_VERSION
+#elif 1004000 < PROTOBUF_C_MIN_COMPILER_VERSION
 # error This file was generated by an older version of protoc-c which is incompatible with your libprotobuf-c headers. Please regenerate this file with a newer version of protoc-c.
 #endif
 
 
-typedef struct _Mstro__Pool__CDOID Mstro__Pool__CDOID;
-typedef struct _Mstro__Pool__Appid Mstro__Pool__Appid;
-typedef struct _Mstro__Pool__Apptoken Mstro__Pool__Apptoken;
-typedef struct _Mstro__Pool__VSMAnnouncement Mstro__Pool__VSMAnnouncement;
-typedef struct _Mstro__Pool__Join Mstro__Pool__Join;
-typedef struct _Mstro__Pool__Welcome Mstro__Pool__Welcome;
-typedef struct _Mstro__Pool__Leave Mstro__Pool__Leave;
-typedef struct _Mstro__Pool__EmergencyDetach Mstro__Pool__EmergencyDetach;
-typedef struct _Mstro__Pool__Resolve Mstro__Pool__Resolve;
-typedef struct _Mstro__Pool__ResolveReply Mstro__Pool__ResolveReply;
-typedef struct _Mstro__Pool__Bye Mstro__Pool__Bye;
-typedef struct _Mstro__Pool__Declare Mstro__Pool__Declare;
-typedef struct _Mstro__Pool__DeclareAck Mstro__Pool__DeclareAck;
-typedef struct _Mstro__Pool__Timestamp Mstro__Pool__Timestamp;
-typedef struct _Mstro__Pool__GroupMembers Mstro__Pool__GroupMembers;
-typedef struct _Mstro__Pool__AVal Mstro__Pool__AVal;
-typedef struct _Mstro__Pool__KvEntry Mstro__Pool__KvEntry;
-typedef struct _Mstro__Pool__Attributes Mstro__Pool__Attributes;
-typedef struct _Mstro__Pool__Attributes__Map Mstro__Pool__Attributes__Map;
-typedef struct _Mstro__Pool__Seal Mstro__Pool__Seal;
-typedef struct _Mstro__Pool__SealGroup Mstro__Pool__SealGroup;
-typedef struct _Mstro__Pool__Offer Mstro__Pool__Offer;
-typedef struct _Mstro__Pool__Withdraw Mstro__Pool__Withdraw;
-typedef struct _Mstro__Pool__Dispose Mstro__Pool__Dispose;
-typedef struct _Mstro__Pool__Require Mstro__Pool__Require;
-typedef struct _Mstro__Pool__Retract Mstro__Pool__Retract;
-typedef struct _Mstro__Pool__PoolOpAck Mstro__Pool__PoolOpAck;
-typedef struct _Mstro__Pool__Demand Mstro__Pool__Demand;
-typedef struct _Mstro__Pool__DemandAttr Mstro__Pool__DemandAttr;
-typedef struct _Mstro__Pool__DemandAttrRes Mstro__Pool__DemandAttrRes;
-typedef struct _Mstro__Pool__TransportMethods Mstro__Pool__TransportMethods;
-typedef struct _Mstro__Pool__InitiateTransfer Mstro__Pool__InitiateTransfer;
-typedef struct _Mstro__Pool__TransferTicketGFS Mstro__Pool__TransferTicketGFS;
-typedef struct _Mstro__Pool__TransferTicketUDJ Mstro__Pool__TransferTicketUDJ;
-typedef struct _Mstro__Pool__TransferTicketMIO Mstro__Pool__TransferTicketMIO;
-typedef struct _Mstro__Pool__RDMAHandle Mstro__Pool__RDMAHandle;
-typedef struct _Mstro__Pool__TransferTicketOFI Mstro__Pool__TransferTicketOFI;
-typedef struct _Mstro__Pool__TransferTicket Mstro__Pool__TransferTicket;
-typedef struct _Mstro__Pool__TransferCompleted Mstro__Pool__TransferCompleted;
-typedef struct _Mstro__Pool__CDOSelector Mstro__Pool__CDOSelector;
-typedef struct _Mstro__Pool__SubscriptionHandle Mstro__Pool__SubscriptionHandle;
-typedef struct _Mstro__Pool__Subscribe Mstro__Pool__Subscribe;
-typedef struct _Mstro__Pool__SubscribeAck Mstro__Pool__SubscribeAck;
-typedef struct _Mstro__Pool__Unsubscribe Mstro__Pool__Unsubscribe;
-typedef struct _Mstro__Pool__Event Mstro__Pool__Event;
-typedef struct _Mstro__Pool__EventAck Mstro__Pool__EventAck;
-typedef struct _Mstro__Pool__Configure Mstro__Pool__Configure;
-typedef struct _Mstro__Pool__CDODemandable Mstro__Pool__CDODemandable;
-typedef struct _Mstro__Pool__CDOWithdrawable Mstro__Pool__CDOWithdrawable;
-typedef struct _Mstro__Pool__Log Mstro__Pool__Log;
-typedef struct _Mstro__Pool__Query Mstro__Pool__Query;
-typedef struct _Mstro__Pool__QueryRes Mstro__Pool__QueryRes;
-typedef struct _Mstro__Pool__MstroMsg Mstro__Pool__MstroMsg;
-typedef struct _Mstro__Pool__MstroMsg__Opts Mstro__Pool__MstroMsg__Opts;
-typedef struct _Mstro__Pool__FragmentedMsg Mstro__Pool__FragmentedMsg;
+typedef struct Mstro__Pool__CDOID Mstro__Pool__CDOID;
+typedef struct Mstro__Pool__Appid Mstro__Pool__Appid;
+typedef struct Mstro__Pool__Apptoken Mstro__Pool__Apptoken;
+typedef struct Mstro__Pool__VSMAnnouncement Mstro__Pool__VSMAnnouncement;
+typedef struct Mstro__Pool__Join Mstro__Pool__Join;
+typedef struct Mstro__Pool__Welcome Mstro__Pool__Welcome;
+typedef struct Mstro__Pool__Leave Mstro__Pool__Leave;
+typedef struct Mstro__Pool__EmergencyDetach Mstro__Pool__EmergencyDetach;
+typedef struct Mstro__Pool__Resolve Mstro__Pool__Resolve;
+typedef struct Mstro__Pool__ResolveReply Mstro__Pool__ResolveReply;
+typedef struct Mstro__Pool__Bye Mstro__Pool__Bye;
+typedef struct Mstro__Pool__Declare Mstro__Pool__Declare;
+typedef struct Mstro__Pool__DeclareAck Mstro__Pool__DeclareAck;
+typedef struct Mstro__Pool__Timestamp Mstro__Pool__Timestamp;
+typedef struct Mstro__Pool__MmbLayoutIrregular Mstro__Pool__MmbLayoutIrregular;
+typedef struct Mstro__Pool__MmbDimensions Mstro__Pool__MmbDimensions;
+typedef struct Mstro__Pool__MmbLayoutBlock Mstro__Pool__MmbLayoutBlock;
+typedef struct Mstro__Pool__Mmblayout Mstro__Pool__Mmblayout;
+typedef struct Mstro__Pool__GroupMembers Mstro__Pool__GroupMembers;
+typedef struct Mstro__Pool__AVal Mstro__Pool__AVal;
+typedef struct Mstro__Pool__KvEntry Mstro__Pool__KvEntry;
+typedef struct Mstro__Pool__Attributes Mstro__Pool__Attributes;
+typedef struct Mstro__Pool__Attributes__Map Mstro__Pool__Attributes__Map;
+typedef struct Mstro__Pool__Seal Mstro__Pool__Seal;
+typedef struct Mstro__Pool__SealGroup Mstro__Pool__SealGroup;
+typedef struct Mstro__Pool__Offer Mstro__Pool__Offer;
+typedef struct Mstro__Pool__Withdraw Mstro__Pool__Withdraw;
+typedef struct Mstro__Pool__Dispose Mstro__Pool__Dispose;
+typedef struct Mstro__Pool__Require Mstro__Pool__Require;
+typedef struct Mstro__Pool__Retract Mstro__Pool__Retract;
+typedef struct Mstro__Pool__PoolOpAck Mstro__Pool__PoolOpAck;
+typedef struct Mstro__Pool__Demand Mstro__Pool__Demand;
+typedef struct Mstro__Pool__DemandAttr Mstro__Pool__DemandAttr;
+typedef struct Mstro__Pool__DemandAttrRes Mstro__Pool__DemandAttrRes;
+typedef struct Mstro__Pool__TransportMethods Mstro__Pool__TransportMethods;
+typedef struct Mstro__Pool__InitiateTransfer Mstro__Pool__InitiateTransfer;
+typedef struct Mstro__Pool__TransferTicketGFS Mstro__Pool__TransferTicketGFS;
+typedef struct Mstro__Pool__TransferTicketUDJ Mstro__Pool__TransferTicketUDJ;
+typedef struct Mstro__Pool__TransferTicketMIO Mstro__Pool__TransferTicketMIO;
+typedef struct Mstro__Pool__RDMAHandle Mstro__Pool__RDMAHandle;
+typedef struct Mstro__Pool__TransferTicketOFI Mstro__Pool__TransferTicketOFI;
+typedef struct Mstro__Pool__TransferTicket Mstro__Pool__TransferTicket;
+typedef struct Mstro__Pool__TransferCompleted Mstro__Pool__TransferCompleted;
+typedef struct Mstro__Pool__CDOSelector Mstro__Pool__CDOSelector;
+typedef struct Mstro__Pool__SubscriptionHandle Mstro__Pool__SubscriptionHandle;
+typedef struct Mstro__Pool__Subscribe Mstro__Pool__Subscribe;
+typedef struct Mstro__Pool__SubscribeAck Mstro__Pool__SubscribeAck;
+typedef struct Mstro__Pool__Unsubscribe Mstro__Pool__Unsubscribe;
+typedef struct Mstro__Pool__Event Mstro__Pool__Event;
+typedef struct Mstro__Pool__EventAck Mstro__Pool__EventAck;
+typedef struct Mstro__Pool__Configure Mstro__Pool__Configure;
+typedef struct Mstro__Pool__CDODemandable Mstro__Pool__CDODemandable;
+typedef struct Mstro__Pool__CDOWithdrawable Mstro__Pool__CDOWithdrawable;
+typedef struct Mstro__Pool__Log Mstro__Pool__Log;
+typedef struct Mstro__Pool__Query Mstro__Pool__Query;
+typedef struct Mstro__Pool__QueryRes Mstro__Pool__QueryRes;
+typedef struct Mstro__Pool__MstroMsg Mstro__Pool__MstroMsg;
+typedef struct Mstro__Pool__MstroMsg__Opts Mstro__Pool__MstroMsg__Opts;
+typedef struct Mstro__Pool__FragmentedMsg Mstro__Pool__FragmentedMsg;
 
 
 /* --- enums --- */
 
+typedef enum _Mstro__Pool__MmbLayoutBlock__MmbLayoutOrder {
+  MSTRO__POOL__MMB_LAYOUT_BLOCK__MMB_LAYOUT_ORDER__MMB_LAYOUT_ORDER_NONE = 0,
+  MSTRO__POOL__MMB_LAYOUT_BLOCK__MMB_LAYOUT_ORDER__MMB_ROWMAJOR = 1,
+  MSTRO__POOL__MMB_LAYOUT_BLOCK__MMB_LAYOUT_ORDER__MMB_COLMAJOR = 2,
+  MSTRO__POOL__MMB_LAYOUT_BLOCK__MMB_LAYOUT_ORDER__MMB_GENERIC_ND = 3,
+  MSTRO__POOL__MMB_LAYOUT_BLOCK__MMB_LAYOUT_ORDER__MMB_LAYOUT_ORDER_MAX = 4
+    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__MMB_LAYOUT_BLOCK__MMB_LAYOUT_ORDER)
+} Mstro__Pool__MmbLayoutBlock__MmbLayoutOrder;
+typedef enum _Mstro__Pool__Mmblayout__MmbLayoutType {
+  MSTRO__POOL__MMBLAYOUT__MMB_LAYOUT_TYPE__MMB_LAYOUT_NONE = 0,
+  MSTRO__POOL__MMBLAYOUT__MMB_LAYOUT_TYPE__MMB_REGULAR = 1,
+  MSTRO__POOL__MMBLAYOUT__MMB_LAYOUT_TYPE__MMB_REGULAR_BLOCK = 2,
+  MSTRO__POOL__MMBLAYOUT__MMB_LAYOUT_TYPE__MMB_IRREGULAR = 3,
+  MSTRO__POOL__MMBLAYOUT__MMB_LAYOUT_TYPE__MMB_LAYOUT_TYPE_MAX = 4
+    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__MMBLAYOUT__MMB_LAYOUT_TYPE)
+} Mstro__Pool__Mmblayout__MmbLayoutType;
 typedef enum _Mstro__Pool__PoolOpAck__PoolOpStatus {
   MSTRO__POOL__POOL_OP_ACK__POOL_OP_STATUS__OK = 0,
   /*
@@ -227,7 +247,7 @@ typedef enum _Mstro__Pool__EventKind {
  **
  ** A uuid, two quadwords
  */
-struct  _Mstro__Pool__CDOID
+struct  Mstro__Pool__CDOID
 {
   ProtobufCMessage base;
   uint64_t qw0;
@@ -250,7 +270,7 @@ struct  _Mstro__Pool__CDOID
  **
  ** Assigned at the time the Apptoken is handed out to new participants.
  */
-struct  _Mstro__Pool__Appid
+struct  Mstro__Pool__Appid
 {
   ProtobufCMessage base;
   uint64_t id;
@@ -267,7 +287,7 @@ struct  _Mstro__Pool__Appid
  ** particular an appid at WELCOME time. The reserved fields are for
  ** authorization token usage (rdma credential and a cookie)
  */
-struct  _Mstro__Pool__Apptoken
+struct  Mstro__Pool__Apptoken
 {
   ProtobufCMessage base;
   Mstro__Pool__Appid *appid;
@@ -290,7 +310,7 @@ struct  _Mstro__Pool__Apptoken
  ** pre-agreed OFI tag) to segregate all fragments from normal
  ** traffic.
  */
-struct  _Mstro__Pool__VSMAnnouncement
+struct  Mstro__Pool__VSMAnnouncement
 {
   ProtobufCMessage base;
   /*
@@ -317,7 +337,7 @@ struct  _Mstro__Pool__VSMAnnouncement
 /*
  ** A join message. *
  */
-struct  _Mstro__Pool__Join
+struct  Mstro__Pool__Join
 {
   ProtobufCMessage base;
   /*
@@ -349,7 +369,7 @@ struct  _Mstro__Pool__Join
 /*
  ** Welcome message: assigns the app token *
  */
-struct  _Mstro__Pool__Welcome
+struct  Mstro__Pool__Welcome
 {
   ProtobufCMessage base;
   /*
@@ -369,7 +389,7 @@ struct  _Mstro__Pool__Welcome
 /*
  ** Leave message: app wants to detach from pool *
  */
-struct  _Mstro__Pool__Leave
+struct  Mstro__Pool__Leave
 {
   ProtobufCMessage base;
 };
@@ -382,7 +402,7 @@ struct  _Mstro__Pool__Leave
  ** Emergency Detach: app is crashing, but tries to let us know in
  * case we can deal with it 
  */
-struct  _Mstro__Pool__EmergencyDetach
+struct  Mstro__Pool__EmergencyDetach
 {
   ProtobufCMessage base;
   char *reason;
@@ -396,14 +416,14 @@ typedef enum {
   MSTRO__POOL__RESOLVE__QUERY__NOT_SET = 0,
   MSTRO__POOL__RESOLVE__QUERY_CDOID = 2,
   MSTRO__POOL__RESOLVE__QUERY_APPID = 3
-    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__RESOLVE__QUERY)
+    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__RESOLVE__QUERY__CASE)
 } Mstro__Pool__Resolve__QueryCase;
 
 /*
  ** Resolve data that is opaque to the end user but appears in
  * protocol messages 
  */
-struct  _Mstro__Pool__Resolve
+struct  Mstro__Pool__Resolve
 {
   ProtobufCMessage base;
   /*
@@ -424,13 +444,13 @@ struct  _Mstro__Pool__Resolve
 typedef enum {
   MSTRO__POOL__RESOLVE_REPLY__RESULT__NOT_SET = 0,
   MSTRO__POOL__RESOLVE_REPLY__RESULT_NAME = 2
-    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__RESOLVE_REPLY__RESULT)
+    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__RESOLVE_REPLY__RESULT__CASE)
 } Mstro__Pool__ResolveReply__ResultCase;
 
 /*
  ** Reply to a resolve request 
  */
-struct  _Mstro__Pool__ResolveReply
+struct  Mstro__Pool__ResolveReply
 {
   ProtobufCMessage base;
   Mstro__Pool__Resolve *query;
@@ -447,7 +467,7 @@ struct  _Mstro__Pool__ResolveReply
 /*
  ** Bye message: Pool manager agrees that app has been detached. *
  */
-struct  _Mstro__Pool__Bye
+struct  Mstro__Pool__Bye
 {
   ProtobufCMessage base;
 };
@@ -462,7 +482,7 @@ struct  _Mstro__Pool__Bye
  ** A unique (for the sender) serial number and the CDO name are sent
  ** to the pool manager *
  */
-struct  _Mstro__Pool__Declare
+struct  Mstro__Pool__Declare
 {
   ProtobufCMessage base;
   uint64_t serial;
@@ -479,7 +499,7 @@ struct  _Mstro__Pool__Declare
  ** The serial is the number used in the DECLARE call
  ** previously. The reply will contain the pool-manager assigned uuid.
  */
-struct  _Mstro__Pool__DeclareAck
+struct  Mstro__Pool__DeclareAck
 {
   ProtobufCMessage base;
   /*
@@ -505,7 +525,7 @@ struct  _Mstro__Pool__DeclareAck
 /*
  ** Built-in type corresponding to @ref mstro_timestamp 
  */
-struct  _Mstro__Pool__Timestamp
+struct  Mstro__Pool__Timestamp
 {
   ProtobufCMessage base;
   int64_t sec;
@@ -517,7 +537,83 @@ struct  _Mstro__Pool__Timestamp
     , 0, 0, 0 }
 
 
-struct  _Mstro__Pool__GroupMembers
+/*
+ ** Definition of irregular mamba layout 
+ */
+struct  Mstro__Pool__MmbLayoutIrregular
+{
+  ProtobufCMessage base;
+  uint64_t n_blocks;
+  size_t n_offsets;
+  uint64_t *offsets;
+  size_t n_lengths;
+  uint64_t *lengths;
+};
+#define MSTRO__POOL__MMB_LAYOUT_IRREGULAR__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&mstro__pool__mmb_layout_irregular__descriptor) \
+    , 0, 0,NULL, 0,NULL }
+
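+/* Usage sketch (illustrative, not part of the generated code): assuming the
+ * offsets and lengths arrays are parallel, with one entry per block, an
+ * irregular layout of two blocks could be filled in as
+ *
+ *   uint64_t offs[2] = { 0, 4096 };
+ *   uint64_t lens[2] = { 4096, 1024 };
+ *   Mstro__Pool__MmbLayoutIrregular irr = MSTRO__POOL__MMB_LAYOUT_IRREGULAR__INIT;
+ *   irr.n_blocks  = 2;
+ *   irr.n_offsets = 2; irr.offsets = offs;
+ *   irr.n_lengths = 2; irr.lengths = lens;
+ */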
+
+/*
+ ** Definition of mmbDimensions 
+ */
+struct  Mstro__Pool__MmbDimensions
+{
+  ProtobufCMessage base;
+  size_t n_d;
+  uint64_t *d;
+};
+#define MSTRO__POOL__MMB_DIMENSIONS__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&mstro__pool__mmb_dimensions__descriptor) \
+    , 0,NULL }
+
+
+/*
+ ** Definition of mmbLayoutBlock layout 
+ */
+struct  Mstro__Pool__MmbLayoutBlock
+{
+  ProtobufCMessage base;
+  Mstro__Pool__MmbLayoutBlock__MmbLayoutOrder order;
+  /*
+   * Regular Block-centric 
+   */
+  Mstro__Pool__MmbDimensions *dimensions;
+};
+#define MSTRO__POOL__MMB_LAYOUT_BLOCK__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&mstro__pool__mmb_layout_block__descriptor) \
+    , MSTRO__POOL__MMB_LAYOUT_BLOCK__MMB_LAYOUT_ORDER__MMB_LAYOUT_ORDER_NONE, NULL }
+
+
+typedef enum {
+  MSTRO__POOL__MMBLAYOUT__LAYOUT__NOT_SET = 0,
+  MSTRO__POOL__MMBLAYOUT__LAYOUT_BLOCK = 5,
+  MSTRO__POOL__MMBLAYOUT__LAYOUT_IRREGULAR = 6
+    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__MMBLAYOUT__LAYOUT__CASE)
+} Mstro__Pool__Mmblayout__LayoutCase;
+
+/*
+ ** Built-in type corresponding to @ref mmbLayout 
+ */
+struct  Mstro__Pool__Mmblayout
+{
+  ProtobufCMessage base;
+  Mstro__Pool__Mmblayout__MmbLayoutType type;
+  uint64_t n_dims;
+  uint64_t index;
+  uint64_t element_size_bytes;
+  Mstro__Pool__Mmblayout__LayoutCase layout_case;
+  union {
+    Mstro__Pool__MmbLayoutBlock *block;
+    Mstro__Pool__MmbLayoutIrregular *irregular;
+  };
+};
+#define MSTRO__POOL__MMBLAYOUT__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&mstro__pool__mmblayout__descriptor) \
+    , MSTRO__POOL__MMBLAYOUT__MMB_LAYOUT_TYPE__MMB_LAYOUT_NONE, 0, 0, 0, MSTRO__POOL__MMBLAYOUT__LAYOUT__NOT_SET, {0} }
+
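+/* Usage sketch (illustrative): as with any protobuf-c oneof, the layout_case
+ * discriminator and the matching union member must be set together, e.g. for
+ * a regular block layout:
+ *
+ *   Mstro__Pool__MmbLayoutBlock blk = MSTRO__POOL__MMB_LAYOUT_BLOCK__INIT;
+ *   Mstro__Pool__Mmblayout lay = MSTRO__POOL__MMBLAYOUT__INIT;
+ *   lay.type = MSTRO__POOL__MMBLAYOUT__MMB_LAYOUT_TYPE__MMB_REGULAR_BLOCK;
+ *   lay.layout_case = MSTRO__POOL__MMBLAYOUT__LAYOUT_BLOCK;
+ *   lay.block = &blk;
+ */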
+
+struct  Mstro__Pool__GroupMembers
 {
   ProtobufCMessage base;
   size_t n_declared_members;
@@ -542,11 +638,12 @@ typedef enum {
   MSTRO__POOL__AVAL__VAL_DOUBLE = 7,
   MSTRO__POOL__AVAL__VAL_STRING = 8,
   MSTRO__POOL__AVAL__VAL_BYTES = 9,
-  MSTRO__POOL__AVAL__VAL_TIMESTAMP = 10
-    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__AVAL__VAL)
+  MSTRO__POOL__AVAL__VAL_TIMESTAMP = 10,
+  MSTRO__POOL__AVAL__VAL_MMB_LAYOUT = 11
+    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__AVAL__VAL__CASE)
 } Mstro__Pool__AVal__ValCase;
 
-struct  _Mstro__Pool__AVal
+struct  Mstro__Pool__AVal
 {
   ProtobufCMessage base;
   Mstro__Pool__AVal__ValCase val_case;
@@ -564,6 +661,7 @@ struct  _Mstro__Pool__AVal
      */
     ProtobufCBinaryData bytes;
     Mstro__Pool__Timestamp *timestamp;
+    Mstro__Pool__Mmblayout *mmblayout;
   };
 };
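+/* Usage sketch (illustrative): an attribute value carrying a layout sets the
+ * new oneof case and the mmblayout union member together:
+ *
+ *   Mstro__Pool__AVal v = MSTRO__POOL__AVAL__INIT;
+ *   v.val_case  = MSTRO__POOL__AVAL__VAL_MMB_LAYOUT;
+ *   v.mmblayout = &lay;   // an Mstro__Pool__Mmblayout as sketched above
+ */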
 #define MSTRO__POOL__AVAL__INIT \
@@ -574,7 +672,7 @@ struct  _Mstro__Pool__AVal
 /*
  ** Attribute key-value entry 
  */
-struct  _Mstro__Pool__KvEntry
+struct  Mstro__Pool__KvEntry
 {
   ProtobufCMessage base;
   char *key;
@@ -585,7 +683,7 @@ struct  _Mstro__Pool__KvEntry
     , (char *)protobuf_c_empty_string, NULL }
 
 
-struct  _Mstro__Pool__Attributes__Map
+struct  Mstro__Pool__Attributes__Map
 {
   ProtobufCMessage base;
   size_t n_map;
@@ -600,7 +698,7 @@ typedef enum {
   MSTRO__POOL__ATTRIBUTES__VAL__NOT_SET = 0,
   MSTRO__POOL__ATTRIBUTES__VAL_YAML_STRING = 2,
   MSTRO__POOL__ATTRIBUTES__VAL_KV_MAP = 3
-    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__ATTRIBUTES__VAL)
+    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__ATTRIBUTES__VAL__CASE)
 } Mstro__Pool__Attributes__ValCase;
 
 /*
@@ -609,7 +707,7 @@ typedef enum {
 * As usual, keys are interpreted as absolute if they start with '.'
  * (PERIOD), and relative to the DEFAULT_NAMESPACE if not.
  */
-struct  _Mstro__Pool__Attributes
+struct  Mstro__Pool__Attributes
 {
   ProtobufCMessage base;
   char *default_namespace;
@@ -629,7 +727,7 @@ struct  _Mstro__Pool__Attributes
  **
  ** Seal can be a variable-sized message.
  */
-struct  _Mstro__Pool__Seal
+struct  Mstro__Pool__Seal
 {
   ProtobufCMessage base;
   Mstro__Pool__CDOID *cdoid;
@@ -645,7 +743,7 @@ struct  _Mstro__Pool__Seal
  **
  ** SealGroup can be a variable-sized message.
  */
-struct  _Mstro__Pool__SealGroup
+struct  Mstro__Pool__SealGroup
 {
   ProtobufCMessage base;
   Mstro__Pool__CDOID *cdoid;
@@ -659,7 +757,7 @@ struct  _Mstro__Pool__SealGroup
 /*
  ** Offer a CDO 
  */
-struct  _Mstro__Pool__Offer
+struct  Mstro__Pool__Offer
 {
   ProtobufCMessage base;
   Mstro__Pool__CDOID *cdoid;
@@ -672,7 +770,7 @@ struct  _Mstro__Pool__Offer
 /*
  ** Withdraw a CDO. 
  */
-struct  _Mstro__Pool__Withdraw
+struct  Mstro__Pool__Withdraw
 {
   ProtobufCMessage base;
   Mstro__Pool__CDOID *cdoid;
@@ -685,7 +783,7 @@ struct  _Mstro__Pool__Withdraw
 /*
  ** Dispose a CDO. 
  */
-struct  _Mstro__Pool__Dispose
+struct  Mstro__Pool__Dispose
 {
   ProtobufCMessage base;
   Mstro__Pool__CDOID *cdoid;
@@ -698,7 +796,7 @@ struct  _Mstro__Pool__Dispose
 /*
  ** Require a CDO. 
  */
-struct  _Mstro__Pool__Require
+struct  Mstro__Pool__Require
 {
   ProtobufCMessage base;
   Mstro__Pool__CDOID *cdoid;
@@ -712,7 +810,7 @@ struct  _Mstro__Pool__Require
  ** Retract a CDO.
  **
  */
-struct  _Mstro__Pool__Retract
+struct  Mstro__Pool__Retract
 {
   ProtobufCMessage base;
   Mstro__Pool__CDOID *cdoid;
@@ -726,13 +824,13 @@ typedef enum {
   MSTRO__POOL__POOL_OP_ACK__OPERAND__NOT_SET = 0,
   MSTRO__POOL__POOL_OP_ACK__OPERAND_CDOID = 2,
   MSTRO__POOL__POOL_OP_ACK__OPERAND_HANDLE = 3
-    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__POOL_OP_ACK__OPERAND)
+    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__POOL_OP_ACK__OPERAND__CASE)
 } Mstro__Pool__PoolOpAck__OperandCase;
 
 typedef enum {
   MSTRO__POOL__POOL_OP_ACK__PAYLOAD__NOT_SET = 0,
   MSTRO__POOL__POOL_OP_ACK__PAYLOAD_UPDATED_ATTRIBUTES = 5
-    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__POOL_OP_ACK__PAYLOAD)
+    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__POOL_OP_ACK__PAYLOAD__CASE)
 } Mstro__Pool__PoolOpAck__PayloadCase;
 
 /*
@@ -742,7 +840,7 @@ typedef enum {
  ** CDO it's sufficient to have a single ACK message type for all of
  ** them.
  */
-struct  _Mstro__Pool__PoolOpAck
+struct  Mstro__Pool__PoolOpAck
 {
   ProtobufCMessage base;
   /*
@@ -774,7 +872,7 @@ struct  _Mstro__Pool__PoolOpAck
 /*
  ** Demand a CDO. Start of triangle-4step-protocol. 
  */
-struct  _Mstro__Pool__Demand
+struct  Mstro__Pool__Demand
 {
   ProtobufCMessage base;
   Mstro__Pool__CDOID *cdoid;
@@ -787,7 +885,7 @@ struct  _Mstro__Pool__Demand
 /*
  ** Demand a CDO's attributes 
  */
-struct  _Mstro__Pool__DemandAttr
+struct  Mstro__Pool__DemandAttr
 {
   ProtobufCMessage base;
   Mstro__Pool__CDOID *cdoid;
@@ -800,7 +898,7 @@ struct  _Mstro__Pool__DemandAttr
 /*
  ** Result of a DemandAttr request. May be a VSM message. 
  */
-struct  _Mstro__Pool__DemandAttrRes
+struct  Mstro__Pool__DemandAttrRes
 {
   ProtobufCMessage base;
   Mstro__Pool__CDOID *cdoid;
@@ -814,7 +912,7 @@ struct  _Mstro__Pool__DemandAttrRes
 /*
  ** A way to inform about the order of preference of transport 
  */
-struct  _Mstro__Pool__TransportMethods
+struct  Mstro__Pool__TransportMethods
 {
   ProtobufCMessage base;
   /*
@@ -837,7 +935,7 @@ struct  _Mstro__Pool__TransportMethods
  ** message. Request is to create a ticket for the app referenced in
  ** dst_serialized_endpoint.
  */
-struct  _Mstro__Pool__InitiateTransfer
+struct  Mstro__Pool__InitiateTransfer
 {
   ProtobufCMessage base;
   /*
@@ -874,13 +972,24 @@ struct  _Mstro__Pool__InitiateTransfer
    * it being available at DST_APPID after completed transfer 
    */
   protobuf_c_boolean force_offer;
+  /*
+   ** Number of segments or pieces needed to obtain the CDO.
+   * Should be 1 for non-distributed CDOs and >= 1 for distributed CDOs.
+   * The CDO is fully received when the dst_cdo has received all pieces as defined here.
+   */
+  int64_t n_segments;
+  /*
+   ** Flag this ticket as part of a (re)distribution, i.e. either the
+   * source or destination CDO (or both) is a distributed CDO
+   */
+  protobuf_c_boolean distributed_cdo;
 };
 #define MSTRO__POOL__INITIATE_TRANSFER__INIT \
  { PROTOBUF_C_MESSAGE_INIT (&mstro__pool__initiate_transfer__descriptor) \
-    , NULL, NULL, NULL, (char *)protobuf_c_empty_string, NULL, NULL, 0, 0 }
+    , NULL, NULL, NULL, (char *)protobuf_c_empty_string, NULL, NULL, 0, 0, 0, 0 }
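+/* Usage sketch (illustrative): the INIT macro leaves n_segments at 0, so a
+ * plain, non-distributed transfer request should set it to 1 explicitly:
+ *
+ *   Mstro__Pool__InitiateTransfer it = MSTRO__POOL__INITIATE_TRANSFER__INIT;
+ *   it.n_segments      = 1;   // single piece
+ *   it.distributed_cdo = 0;   // neither endpoint is a distributed CDO
+ */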
 
 
-struct  _Mstro__Pool__TransferTicketGFS
+struct  Mstro__Pool__TransferTicketGFS
 {
   ProtobufCMessage base;
   char *path;
@@ -891,7 +1000,7 @@ struct  _Mstro__Pool__TransferTicketGFS
     , (char *)protobuf_c_empty_string, 0 }
 
 
-struct  _Mstro__Pool__TransferTicketUDJ
+struct  Mstro__Pool__TransferTicketUDJ
 {
   ProtobufCMessage base;
   char *reserved;
@@ -901,7 +1010,7 @@ struct  _Mstro__Pool__TransferTicketUDJ
     , (char *)protobuf_c_empty_string }
 
 
-struct  _Mstro__Pool__TransferTicketMIO
+struct  Mstro__Pool__TransferTicketMIO
 {
   ProtobufCMessage base;
   ProtobufCBinaryData objid;
@@ -920,14 +1029,14 @@ struct  _Mstro__Pool__TransferTicketMIO
  * Encapsulating the data obtained by fi_mr_reg (and fi_mr_regv later)
 * for an RDMA memory registration
  */
-struct  _Mstro__Pool__RDMAHandle
+struct  Mstro__Pool__RDMAHandle
 {
   ProtobufCMessage base;
   /*
    *  oneof addr {
    */
   /*
-   * placeholder for repeated add/offset pairs 'iov' 
+   * placeholder for repeated addr/offset pairs 'iov'
    *  };
    */
   uint64_t single;
@@ -945,7 +1054,7 @@ struct  _Mstro__Pool__RDMAHandle
 /*
 * The simplest possible ticket on OFI: SRC provides RDMA READ information for DST
  */
-struct  _Mstro__Pool__TransferTicketOFI
+struct  Mstro__Pool__TransferTicketOFI
 {
   ProtobufCMessage base;
   Mstro__Pool__RDMAHandle *h;
@@ -962,7 +1071,7 @@ typedef enum {
   MSTRO__POOL__TRANSFER_TICKET__TICKET_UDJ = 9,
   MSTRO__POOL__TRANSFER_TICKET__TICKET_MIO = 10,
   MSTRO__POOL__TRANSFER_TICKET__TICKET_OFI = 11
-    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__TRANSFER_TICKET__TICKET)
+    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__TRANSFER_TICKET__TICKET__CASE)
 } Mstro__Pool__TransferTicket__TicketCase;
 
 /*
@@ -970,7 +1079,7 @@ typedef enum {
  **
  ** This is step 3 of the triangle-4step-protocol.
  */
-struct  _Mstro__Pool__TransferTicket
+struct  Mstro__Pool__TransferTicket
 {
   ProtobufCMessage base;
   Mstro__Pool__CDOID *srccdoid;
@@ -999,6 +1108,22 @@ struct  _Mstro__Pool__TransferTicket
    * it being available at DST_APPID after completed transfer 
    */
   protobuf_c_boolean force_offer;
+  /*
+   ** Number of segments or pieces needed to obtain the CDO.
+   * Should be 1 for non-distributed CDOs and >= 1 for distributed CDOs.
+   * The CDO is fully received when the dst_cdo has received all pieces as defined here.
+   */
+  int64_t n_segments;
+  /*
+   * offsets at src and dst at which the current data payload is read and written
+   */
+  int64_t src_offset;
+  int64_t dst_offset;
+  /*
+   ** Flag this ticket as part of a (re)distribution, i.e. either the
+   * source or destination CDO (or both) is a distributed CDO
+   */
+  protobuf_c_boolean distributed_cdo;
   Mstro__Pool__TransferTicket__TicketCase ticket_case;
   union {
     Mstro__Pool__TransferTicketGFS *gfs;
@@ -1009,7 +1134,7 @@ struct  _Mstro__Pool__TransferTicket
 };
 #define MSTRO__POOL__TRANSFER_TICKET__INIT \
  { PROTOBUF_C_MESSAGE_INIT (&mstro__pool__transfer_ticket__descriptor) \
-    , NULL, NULL, NULL, NULL, 0, (char *)protobuf_c_empty_string, MSTRO__POOL__TRANSPORT_KIND__INVALID_TRANSPORT_KIND, 0, NULL, 0, MSTRO__POOL__TRANSFER_TICKET__TICKET__NOT_SET, {0} }
+    , NULL, NULL, NULL, NULL, 0, (char *)protobuf_c_empty_string, MSTRO__POOL__TRANSPORT_KIND__INVALID_TRANSPORT_KIND, 0, NULL, 0, 0, 0, 0, 0, MSTRO__POOL__TRANSFER_TICKET__TICKET__NOT_SET, {0} }
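+/* Usage sketch (illustrative): one ticket of a distributed transfer split
+ * into several segments; src_offset/dst_offset locate this piece within the
+ * source and destination CDOs (values here are made up):
+ *
+ *   Mstro__Pool__TransferTicket t = MSTRO__POOL__TRANSFER_TICKET__INIT;
+ *   t.distributed_cdo = 1;
+ *   t.n_segments = 4;        // destination completes after 4 pieces
+ *   t.src_offset = 524288;   // where this piece starts in the source CDO
+ *   t.dst_offset = 524288;   // where it lands in the destination CDO
+ */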
 
 
 /*
@@ -1021,7 +1146,7 @@ struct  _Mstro__Pool__TransferTicket
  ** source. dst_serialized_endpoint is an identifier for the app who
  ** has successfully received the data.
  */
-struct  _Mstro__Pool__TransferCompleted
+struct  Mstro__Pool__TransferCompleted
 {
   ProtobufCMessage base;
   /*
@@ -1051,7 +1176,7 @@ typedef enum {
   MSTRO__POOL__CDOSELECTOR__SEL_REGEX = 1,
   MSTRO__POOL__CDOSELECTOR__SEL_CDOID = 2,
   MSTRO__POOL__CDOSELECTOR__SEL_QUERY = 3
-    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__CDOSELECTOR__SEL)
+    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__CDOSELECTOR__SEL__CASE)
 } Mstro__Pool__CDOSelector__SelCase;
 
 /*
@@ -1060,7 +1185,7 @@ typedef enum {
  ** Currently supports a regex on the name or a list of CDO-IDs, but
  ** could be extended to a Boolean predicate query.
  */
-struct  _Mstro__Pool__CDOSelector
+struct  Mstro__Pool__CDOSelector
 {
   ProtobufCMessage base;
   Mstro__Pool__CDOSelector__SelCase sel_case;
@@ -1081,7 +1206,7 @@ struct  _Mstro__Pool__CDOSelector
 /*
  ** Subscriptions are referred to by a pool-manager assigned handle 
  */
-struct  _Mstro__Pool__SubscriptionHandle
+struct  Mstro__Pool__SubscriptionHandle
 {
   ProtobufCMessage base;
   uint64_t id;
@@ -1094,7 +1219,7 @@ struct  _Mstro__Pool__SubscriptionHandle
 /*
  ** CDO event subscription request 
  */
-struct  _Mstro__Pool__Subscribe
+struct  Mstro__Pool__Subscribe
 {
   ProtobufCMessage base;
   /*
@@ -1138,7 +1263,7 @@ struct  _Mstro__Pool__Subscribe
  ** The serial is the number used in the SUBSCRIBE message
  ** previously. The reply will contain the pool-manager assigned handle.
  */
-struct  _Mstro__Pool__SubscribeAck
+struct  Mstro__Pool__SubscribeAck
 {
   ProtobufCMessage base;
   /*
@@ -1158,7 +1283,7 @@ struct  _Mstro__Pool__SubscribeAck
 /*
  ** CDO event unsubscription 
  */
-struct  _Mstro__Pool__Unsubscribe
+struct  Mstro__Pool__Unsubscribe
 {
   ProtobufCMessage base;
   Mstro__Pool__SubscriptionHandle *subscription_handle;
@@ -1189,13 +1314,13 @@ typedef enum {
   MSTRO__POOL__EVENT__PAYLOAD_BYE = 35,
   MSTRO__POOL__EVENT__PAYLOAD_SUBSCRIBE = 36,
   MSTRO__POOL__EVENT__PAYLOAD_UNSUBSCRIBE = 37
-    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__EVENT__PAYLOAD)
+    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__EVENT__PAYLOAD__CASE)
 } Mstro__Pool__Event__PayloadCase;
 
 /*
  ** Events that match a subscription 
  */
-struct  _Mstro__Pool__Event
+struct  Mstro__Pool__Event
 {
   ProtobufCMessage base;
   /*
@@ -1217,6 +1342,10 @@ struct  _Mstro__Pool__Event
    ** the CDO concerned. Will be unset for non-CDO events 
    */
   char *cdo_name;
+  /*
+   ** the timestamp on the origin application 
+   */
+  Mstro__Pool__Timestamp *ctime;
   Mstro__Pool__Event__PayloadCase payload_case;
   union {
     /*
@@ -1302,10 +1431,10 @@ struct  _Mstro__Pool__Event
 };
 #define MSTRO__POOL__EVENT__INIT \
  { PROTOBUF_C_MESSAGE_INIT (&mstro__pool__event__descriptor) \
-    , NULL, 0, MSTRO__POOL__EVENT_KIND__INVALID_EVENT, NULL, (char *)protobuf_c_empty_string, MSTRO__POOL__EVENT__PAYLOAD__NOT_SET, {0} }
+    , NULL, 0, MSTRO__POOL__EVENT_KIND__INVALID_EVENT, NULL, (char *)protobuf_c_empty_string, NULL, MSTRO__POOL__EVENT__PAYLOAD__NOT_SET, {0} }
 
 
-struct  _Mstro__Pool__EventAck
+struct  Mstro__Pool__EventAck
 {
   ProtobufCMessage base;
   /*
@@ -1321,7 +1450,7 @@ struct  _Mstro__Pool__EventAck
 /*
  ** Configure messages from pool manager to apps 
  */
-struct  _Mstro__Pool__Configure
+struct  Mstro__Pool__Configure
 {
   ProtobufCMessage base;
 };
@@ -1333,7 +1462,7 @@ struct  _Mstro__Pool__Configure
 /*
  ** advise that a CDO which has been required can now be demanded cheaply 
  */
-struct  _Mstro__Pool__CDODemandable
+struct  Mstro__Pool__CDODemandable
 {
   ProtobufCMessage base;
   Mstro__Pool__CDOID *cdoid;
@@ -1346,7 +1475,7 @@ struct  _Mstro__Pool__CDODemandable
 /*
  ** advise that a CDO which has been offered can now be withdrawn cheaply 
  */
-struct  _Mstro__Pool__CDOWithdrawable
+struct  Mstro__Pool__CDOWithdrawable
 {
   ProtobufCMessage base;
   Mstro__Pool__CDOID *cdoid;
@@ -1362,7 +1491,7 @@ struct  _Mstro__Pool__CDOWithdrawable
  ** This may be a VSM message. If so, it will use the dedicated LOG
  ** channel.
  */
-struct  _Mstro__Pool__Log
+struct  Mstro__Pool__Log
 {
   ProtobufCMessage base;
 };
@@ -1374,7 +1503,7 @@ struct  _Mstro__Pool__Log
 /*
  ** Group construction by querying pool 
  */
-struct  _Mstro__Pool__Query
+struct  Mstro__Pool__Query
 {
   ProtobufCMessage base;
   /*
@@ -1396,7 +1525,7 @@ struct  _Mstro__Pool__Query
  **
  ** The usage is very similar to DECLARE
  */
-struct  _Mstro__Pool__QueryRes
+struct  Mstro__Pool__QueryRes
 {
   ProtobufCMessage base;
   uint64_t serial;
@@ -1411,7 +1540,7 @@ struct  _Mstro__Pool__QueryRes
 /*
  ** Message options type 
  */
-struct  _Mstro__Pool__MstroMsg__Opts
+struct  Mstro__Pool__MstroMsg__Opts
 {
   ProtobufCMessage base;
   /*
@@ -1457,10 +1586,10 @@ typedef enum {
   MSTRO__POOL__MSTRO_MSG__MSG_DETACH = 35,
   MSTRO__POOL__MSTRO_MSG__MSG_RESOLVE = 36,
   MSTRO__POOL__MSTRO_MSG__MSG_RESOLVE_REPLY = 37
-    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__MSTRO_MSG__MSG)
+    PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(MSTRO__POOL__MSTRO_MSG__MSG__CASE)
 } Mstro__Pool__MstroMsg__MsgCase;
 
-struct  _Mstro__Pool__MstroMsg
+struct  Mstro__Pool__MstroMsg
 {
   ProtobufCMessage base;
   /*
@@ -1552,7 +1681,7 @@ struct  _Mstro__Pool__MstroMsg
     , NULL, NULL, 0,NULL, MSTRO__POOL__MSTRO_MSG__MSG__NOT_SET, {0} }
 
 
-struct  _Mstro__Pool__FragmentedMsg
+struct  Mstro__Pool__FragmentedMsg
 {
   ProtobufCMessage base;
   /*
@@ -1561,7 +1690,7 @@ struct  _Mstro__Pool__FragmentedMsg
   Mstro__Pool__VSMAnnouncement *vsm_data;
   /*
    ** Payload
-   **@} (end of group MSTRO_Pool_Protocol) 
+   **@} (end of group MSTRO_Pool_Protocol)
    **@} (end of addtogroup MSTRO_Core)
    */
   size_t n_data;
@@ -1838,6 +1967,82 @@ Mstro__Pool__Timestamp *
 void   mstro__pool__timestamp__free_unpacked
                      (Mstro__Pool__Timestamp *message,
                       ProtobufCAllocator *allocator);
+/* Mstro__Pool__MmbLayoutIrregular methods */
+void   mstro__pool__mmb_layout_irregular__init
+                     (Mstro__Pool__MmbLayoutIrregular         *message);
+size_t mstro__pool__mmb_layout_irregular__get_packed_size
+                     (const Mstro__Pool__MmbLayoutIrregular   *message);
+size_t mstro__pool__mmb_layout_irregular__pack
+                     (const Mstro__Pool__MmbLayoutIrregular   *message,
+                      uint8_t             *out);
+size_t mstro__pool__mmb_layout_irregular__pack_to_buffer
+                     (const Mstro__Pool__MmbLayoutIrregular   *message,
+                      ProtobufCBuffer     *buffer);
+Mstro__Pool__MmbLayoutIrregular *
+       mstro__pool__mmb_layout_irregular__unpack
+                     (ProtobufCAllocator  *allocator,
+                      size_t               len,
+                      const uint8_t       *data);
+void   mstro__pool__mmb_layout_irregular__free_unpacked
+                     (Mstro__Pool__MmbLayoutIrregular *message,
+                      ProtobufCAllocator *allocator);
+/* Mstro__Pool__MmbDimensions methods */
+void   mstro__pool__mmb_dimensions__init
+                     (Mstro__Pool__MmbDimensions         *message);
+size_t mstro__pool__mmb_dimensions__get_packed_size
+                     (const Mstro__Pool__MmbDimensions   *message);
+size_t mstro__pool__mmb_dimensions__pack
+                     (const Mstro__Pool__MmbDimensions   *message,
+                      uint8_t             *out);
+size_t mstro__pool__mmb_dimensions__pack_to_buffer
+                     (const Mstro__Pool__MmbDimensions   *message,
+                      ProtobufCBuffer     *buffer);
+Mstro__Pool__MmbDimensions *
+       mstro__pool__mmb_dimensions__unpack
+                     (ProtobufCAllocator  *allocator,
+                      size_t               len,
+                      const uint8_t       *data);
+void   mstro__pool__mmb_dimensions__free_unpacked
+                     (Mstro__Pool__MmbDimensions *message,
+                      ProtobufCAllocator *allocator);
+/* Mstro__Pool__MmbLayoutBlock methods */
+void   mstro__pool__mmb_layout_block__init
+                     (Mstro__Pool__MmbLayoutBlock         *message);
+size_t mstro__pool__mmb_layout_block__get_packed_size
+                     (const Mstro__Pool__MmbLayoutBlock   *message);
+size_t mstro__pool__mmb_layout_block__pack
+                     (const Mstro__Pool__MmbLayoutBlock   *message,
+                      uint8_t             *out);
+size_t mstro__pool__mmb_layout_block__pack_to_buffer
+                     (const Mstro__Pool__MmbLayoutBlock   *message,
+                      ProtobufCBuffer     *buffer);
+Mstro__Pool__MmbLayoutBlock *
+       mstro__pool__mmb_layout_block__unpack
+                     (ProtobufCAllocator  *allocator,
+                      size_t               len,
+                      const uint8_t       *data);
+void   mstro__pool__mmb_layout_block__free_unpacked
+                     (Mstro__Pool__MmbLayoutBlock *message,
+                      ProtobufCAllocator *allocator);
+/* Mstro__Pool__Mmblayout methods */
+void   mstro__pool__mmblayout__init
+                     (Mstro__Pool__Mmblayout         *message);
+size_t mstro__pool__mmblayout__get_packed_size
+                     (const Mstro__Pool__Mmblayout   *message);
+size_t mstro__pool__mmblayout__pack
+                     (const Mstro__Pool__Mmblayout   *message,
+                      uint8_t             *out);
+size_t mstro__pool__mmblayout__pack_to_buffer
+                     (const Mstro__Pool__Mmblayout   *message,
+                      ProtobufCBuffer     *buffer);
+Mstro__Pool__Mmblayout *
+       mstro__pool__mmblayout__unpack
+                     (ProtobufCAllocator  *allocator,
+                      size_t               len,
+                      const uint8_t       *data);
+void   mstro__pool__mmblayout__free_unpacked
+                     (Mstro__Pool__Mmblayout *message,
+                      ProtobufCAllocator *allocator);
 /* Mstro__Pool__GroupMembers methods */
 void   mstro__pool__group_members__init
                      (Mstro__Pool__GroupMembers         *message);
@@ -2629,6 +2834,18 @@ typedef void (*Mstro__Pool__DeclareAck_Closure)
 typedef void (*Mstro__Pool__Timestamp_Closure)
                  (const Mstro__Pool__Timestamp *message,
                   void *closure_data);
+typedef void (*Mstro__Pool__MmbLayoutIrregular_Closure)
+                 (const Mstro__Pool__MmbLayoutIrregular *message,
+                  void *closure_data);
+typedef void (*Mstro__Pool__MmbDimensions_Closure)
+                 (const Mstro__Pool__MmbDimensions *message,
+                  void *closure_data);
+typedef void (*Mstro__Pool__MmbLayoutBlock_Closure)
+                 (const Mstro__Pool__MmbLayoutBlock *message,
+                  void *closure_data);
+typedef void (*Mstro__Pool__Mmblayout_Closure)
+                 (const Mstro__Pool__Mmblayout *message,
+                  void *closure_data);
 typedef void (*Mstro__Pool__GroupMembers_Closure)
                  (const Mstro__Pool__GroupMembers *message,
                   void *closure_data);
@@ -2774,6 +2991,12 @@ extern const ProtobufCMessageDescriptor mstro__pool__bye__descriptor;
 extern const ProtobufCMessageDescriptor mstro__pool__declare__descriptor;
 extern const ProtobufCMessageDescriptor mstro__pool__declare_ack__descriptor;
 extern const ProtobufCMessageDescriptor mstro__pool__timestamp__descriptor;
+extern const ProtobufCMessageDescriptor mstro__pool__mmb_layout_irregular__descriptor;
+extern const ProtobufCMessageDescriptor mstro__pool__mmb_dimensions__descriptor;
+extern const ProtobufCMessageDescriptor mstro__pool__mmb_layout_block__descriptor;
+extern const ProtobufCEnumDescriptor    mstro__pool__mmb_layout_block__mmb_layout_order__descriptor;
+extern const ProtobufCMessageDescriptor mstro__pool__mmblayout__descriptor;
+extern const ProtobufCEnumDescriptor    mstro__pool__mmblayout__mmb_layout_type__descriptor;
 extern const ProtobufCMessageDescriptor mstro__pool__group_members__descriptor;
 extern const ProtobufCMessageDescriptor mstro__pool__aval__descriptor;
 extern const ProtobufCMessageDescriptor mstro__pool__kv_entry__descriptor;
diff --git a/protocols/mstro_pool.proto b/protocols/mstro_pool.proto
index 9ef7c35f089575ce544aa09a5db7d74d3af31af9..56ebeba04fcbebcc10b63694fadcc790e55d1110 100644
--- a/protocols/mstro_pool.proto
+++ b/protocols/mstro_pool.proto
@@ -3,7 +3,7 @@
  * each app to a central pool manager, plus 3 dedicated channels:
  *
  * - INFO for communicating info messages (bidirectional)
- * 
+ *
  * - LOG for communicating logging info (to pool manager)
  *
  * - MGMT for pool manager messages to apps (bidirectional, but mostly
@@ -87,7 +87,7 @@ message VSMAnnouncement {
   /** Total number of fragments. Optional for fully dynamic fragmentation */
   fixed64 num_total_fragments = 2;
   /** One or more fragment sizes of subsequent messages */
-  repeated fixed64 fragment_size = 3; 
+  repeated fixed64 fragment_size = 3;
 };
 
 
@@ -96,15 +96,15 @@ message VSMAnnouncement {
 /** A join message. **/
 message Join {
   /** protocol version of app */
-  fixed32 protocol_version   = 1; 
+  fixed32 protocol_version   = 1;
   /** list of endpoints of app */
-  string serialized_endpoint = 2; 
+  string serialized_endpoint = 2;
   /** transport methods that this application is ready to accept */
   TransportMethods transport_methods = 3;
   /** the name of the component */
   string component_name = 4;
   /** the index among all JOINs using the same component_name (for multiple 'ranks' inside one component) */
-  fixed64 component_index = 5;      
+  fixed64 component_index = 5;
 };
 
 /** Welcome message: assigns the app token **/
@@ -142,7 +142,7 @@ message ResolveReply {
     string name = 2;
   }
 }
-  
+
 
 /** Bye message: Pool manager agrees that app has been detached. **/
 message Bye {
@@ -179,6 +179,52 @@ message Timestamp {
   sint32   offset = 3;
 };
 
+/** Definition of irregular mamba layout */
+message mmbLayoutIrregular {
+  fixed64   n_blocks = 1;
+  repeated fixed64  offsets = 2;
+  repeated fixed64  lengths = 3;
+};
+
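+/* Note (an assumption from the field names, not stated in the message
+ * definition): offsets and lengths are parallel arrays with one entry per
+ * block, i.e. n_blocks entries each. */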
+/** Definition of mmbDimensions */
+message mmbDimensions {
+  repeated fixed64  d = 1;
+};
+
+
+/** Definition of mmbLayoutBlock layout */
+message mmbLayoutBlock {
+  enum mmbLayoutOrder {
+    MMB_LAYOUT_ORDER_NONE = 0;
+    MMB_ROWMAJOR = 1;
+    MMB_COLMAJOR = 2;
+    MMB_GENERIC_ND = 3;
+    MMB_LAYOUT_ORDER_MAX = 4;
+  }
+  mmbLayoutOrder order = 1;
+  /* Regular Block-centric */
+  mmbDimensions dimensions = 2;
+};
+
+/** Built-in type corresponding to @ref mmbLayout */
+message mmblayout {
+  enum mmbLayoutType {
+    MMB_LAYOUT_NONE = 0;
+    MMB_REGULAR = 1;
+    MMB_REGULAR_BLOCK = 2;
+    MMB_IRREGULAR = 3;
+    MMB_LAYOUT_TYPE_MAX = 4;
+  }
+  mmbLayoutType  type = 1;
+  fixed64   n_dims = 2;
+  fixed64   index = 3;
+  fixed64 element_size_bytes = 4;
+  oneof layout {
+    mmbLayoutBlock block = 5;
+    mmbLayoutIrregular irregular = 6;
+  };
+};
+
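+/* Example (illustrative, text-format style): a 2-dimensional row-major block
+ * layout of 100x50 elements of 8 bytes each would look like
+ *   type: MMB_REGULAR_BLOCK  n_dims: 2  element_size_bytes: 8
+ *   block { order: MMB_ROWMAJOR  dimensions { d: 100  d: 50 } } */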
 message GroupMembers {
   repeated CDOID declared_members = 1;
   repeated string undeclared_members = 2;
@@ -194,7 +240,7 @@ message GroupMembers {
 message AVal {
   oneof val {
     bool       bool   = 1;
-    
+
     sfixed32   int32  = 2;
     sfixed64   int64  = 3;
     fixed32    uint32 = 4;
@@ -206,6 +252,7 @@ message AVal {
     string     string = 8;
     bytes      bytes  = 9; /* protobuf limits this to 2^32; this is used for mstro_blob attributes */
     Timestamp  timestamp = 10;
+    mmblayout  mmbLayout = 11;
   };
 };
 
@@ -222,7 +269,7 @@ message KvEntry {
  * (PERIOD), and relative to the DEFAULT_NAMESPACE if not.
  **/
 message Attributes {
-  
+
   message Map {
     repeated KvEntry map = 1;
   };
@@ -305,11 +352,11 @@ message PoolOpAck {
     /* Note: Subscribe is acknowledged by SubscribeAck */
     UNSUBSCRIBE     = 6;
   };
-  
+
   PoolOp        op     = 1;    /**< CDO operation this refers to */
   oneof operand {
     CDOID          cdoid  = 2;    /**< ID that the event relates to */
-    SubscriptionHandle handle = 3; 
+    SubscriptionHandle handle = 3;
   };
   PoolOpStatus  status = 4;    /**< status value */
   /** SEAL may send updated attributes for a group-CDO */
@@ -381,6 +428,15 @@ message InitiateTransfer {
   /** Ensure an OFFER happens at the recipient, so that PM can rely on
    * it being available at DST_APPID after completed transfer */
   bool force_offer = 8;
+
+  /** Number of segments or pieces needed to obtain the CDO.
+    * Should be 1 for all non-distributed CDOs and >= 1 for distributed CDOs.
+    * The CDO is fully received when the dst_cdo receives all pieces as defined here */
+  sfixed64 n_segments = 9;
+  
+  /** Flag this ticket as part of a (re)distribution, i.e. either
+    * the source or destination CDO (or both) is a distributed CDO */
+  bool distributed_cdo = 10;
 };
 
 message TransferTicketGFS {
@@ -403,7 +459,7 @@ message TransferTicketMIO {
 message RDMAHandle {
 //  oneof addr {
     fixed64 single = 1;
-    // placeholder for repeated add/offset pairs 'iov' 
+    // placeholder for repeated add/offset pairs 'iov'
 //  };
   /* the memory access key obtained, length is endpoint specific (see
    * from fi->domain_attr->mr_key_size) */
@@ -435,15 +491,15 @@ message TransferTicket {
   /* the kind of transport method chosen */
   /* FIXME: this duplicates some of the enum type that the oneof union generates */
   TransportKind method = 7;
-  
+
   /* one or more of the following will be filled by the sender, each non-null one can provide the data */
   oneof ticket {
     TransferTicketGFS gfs = 8;
     TransferTicketUDJ udj = 9;
     TransferTicketMIO mio = 10;
     TransferTicketOFI ofi = 11;
-  };  
- 
+  };
+
   sfixed64 data_size = 12;
   /* FIXME: proper attribute structure, plus eventual Mamba array info */
 
@@ -452,6 +508,19 @@ message TransferTicket {
    * it being available at DST_APPID after completed transfer */
   bool force_offer = 14;
 
+  /** Number of segments or pieces needed to obtain the CDO.
+    * Should be 1 for all non-distributed CDOs and >= 1 for distributed CDOs.
+    * The CDO is fully received when the dst_cdo receives all pieces as defined here */
+  sfixed64 n_segments = 15;
+
+  /* offsets at src and dst at which to read/write the current payload */
+  sfixed64 src_offset = 16;
+  sfixed64 dst_offset = 17;
+
+  /** Flag this ticket as part of a (re)distribution, i.e. either
+   * the source or destination CDO (or both) is a distributed CDO */
+  bool distributed_cdo = 18;
+
 };
 
 
@@ -509,7 +578,7 @@ message TransferCompleted {
 //   /** an unsubscribe was posted */
 //   UNSUBSCRIBE = 6;
 //   /** pool checkpoint starting */
-//   POOL_CHECKPOINT = 16; 
+//   POOL_CHECKPOINT = 16;
 //   /* there is no POOL_CHECKPOINT_DONE event as no listener would be
 //    * able to take notice of it */
 // }
@@ -588,7 +657,7 @@ enum EventKind {
   /** an unsubscribe was posted */
   UNSUBSCRIBE = 37;
   /** pool checkpoint starting */
-  CHECKPOINT  = 38; 
+  CHECKPOINT  = 38;
   /* there is no CHECKPOINT_DONE event as no listener would be
    * able to take notice of it */
 };
@@ -693,6 +762,8 @@ message Event {
   Appid origin_id                        = 4;
   /** the CDO concerned. Will be unset for non-CDO events */
   string cdo_name                        = 5;
+  /** the timestamp on the origin application */
+  Timestamp ctime                        = 6;
   /** the data associated with the event
    *
    * In most cases this conincides with the payload sent int the
@@ -755,7 +826,7 @@ message EventAck {
 /** @defgroup MstroPmprotoMgmt Management messages
  ** @{
  **/
-/** Configure messages from pool manager to apps */ 
+/** Configure messages from pool manager to apps */
 message Configure {
 }
 
@@ -774,7 +845,7 @@ message CDOWithdrawable {
 
 /** @defgroup MstroPmprotoLog Logging messages */
 /** @{ */
-    
+
 /** Log to a central destination by sending log info to the pool manager.
  **
  ** This may be a VSM message. If so, it will use the dedicated LOG
@@ -815,7 +886,7 @@ message QueryRes {
  ** All choices of the 'oneof msg' union are supposed to have a size
  ** sufficiently small to make the MstroMsg itself be small enough to
  ** fit the chosen buffer size. The leftover size is available for the
- ** 'immediate' payload in a MstroMsg. 
+ ** 'immediate' payload in a MstroMsg.
  **
  ** Message types that have a potentially large payload will need to
  ** use the VSMAnnouncement option and have a way to send fragmented
@@ -831,17 +902,17 @@ message QueryRes {
 
 message MstroMsg {
   /** the app token received during the JOIN/WELCOME handshake */
-  Apptoken token = 1;             
+  Apptoken token = 1;
 
   /** Message options type */
   message Opts {
     /** will message be followed by VSM messages? */
-    VSMAnnouncement vsm_data = 1; 
+    VSMAnnouncement vsm_data = 1;
   };
-  
+
   /** Message options */
   Opts opts = 2;
-  
+
   /* protobuf tag #3 is the payload at the end */
 
   oneof msg {
@@ -850,9 +921,9 @@ message MstroMsg {
     DeclareAck declare_ack   =  5;
     Seal       seal          =  6;
     SealGroup  seal_group    =  7;
-    
+
     /* get and set attribute is application-local. */
-       
+
     /* FIXME: what about the group equivalent of that. Or are they
      * perfectly immutable?
      */
@@ -866,19 +937,19 @@ message MstroMsg {
 
     Demand     demand        =  12;
     Withdraw   withdraw      =  13;
-    
+
     Dispose    dispose       = 15;
 
-    /** ack for OFFER/REQUIRE/RETRACT/WITHDRAW */ 
+    /** ack for OFFER/REQUIRE/RETRACT/WITHDRAW */
     PoolOpAck  ack           = 16;
-    
+
     /* transport */
     InitiateTransfer  initiate_transfer  = 17;
     TransferTicket    transfer_ticket    = 18;
     TransferCompleted transfer_completed = 19; // Does this message convey more semantics than "ack" above?
 
     /* XXX transformations messages placeholder */
-    
+
     /* infrequent messages */
     Join           join            = 20;
     Welcome        welcome         = 21;
@@ -886,7 +957,7 @@ message MstroMsg {
     Bye            bye             = 23;
     Subscribe      subscribe       = 24;
     SubscribeAck   subscribe_ack   = 25;
-    
+
     Unsubscribe    unsubscribe     = 26;
     //    Getinfo        get_info        = 27;
     Event          event           = 28;
@@ -897,7 +968,7 @@ message MstroMsg {
     DemandAttrRes  demand_attr_res = 32;
 
     Query          query           = 33;
-    QueryRes       query_res       = 34; 
+    QueryRes       query_res       = 34;
 
     /* potentially useful: emergency detach */
     EmergencyDetach detach         = 35;
@@ -906,7 +977,7 @@ message MstroMsg {
     Resolve        resolve         = 36;
     /** Resolve opaque data */
     ResolveReply   resolve_reply   = 37;
-    
+
     /* Workflow Management Entity related
     **
     ** PMInfo, checkpoint
@@ -920,13 +991,13 @@ message MstroMsg {
     // PMInfo	   pm_info	   = 30;
     // /* WF->App: This is the PMInfo */
     // AppGetsPMInfo  app_gets_pm_info = 31;
-    // /* PM should tell WF when its leaving, but we can use the "Leave" message already defined */ 
+    // /* PM should tell WF when its leaving, but we can use the "Leave" message already defined */
 
     // /* checkpoint initiated by the workflow management entity*/
     // ChkNotice	   chk_notice	   = 32;
     // ChkReady	   chk_ready	   = 33;
 
-    /* XXX failure messages placeholder */    
+    /* XXX failure messages placeholder */
 
 
   };
@@ -944,10 +1015,8 @@ message MstroMsg {
 message FragmentedMsg {
  VSMAnnouncement vsm_data = 1; /** more VSM data coming up? */
  repeated bytes data = 2; /** Payload
-                           **@} (end of group MSTRO_Pool_Protocol) 
+                           **@} (end of group MSTRO_Pool_Protocol)
                            **@} (end of addtogroup MSTRO_Core)
                            */
 
 };
-
-
diff --git a/tests/.gitignore b/tests/.gitignore
index d00a315df8fcf1f5770f8869f25ef604c329b98c..233d09578b1d20459d44b18979f8f875a00a67ac 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -39,3 +39,5 @@ colliding_client_1
 colliding_client_2
 reentrant_client
 check_pm_redundant_interlock.sh
+decode_pminfo
+check_decode_pminfo.sh
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 42b1b3b7d5f4861363b8cf9ef93f34aa92e304c2..9ffc798c0b65aa802c020ec5ea86b85b18445349 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -46,6 +46,10 @@ TEST_INCLUDES = -I$(top_srcdir)/include \
 	    -I$(top_srcdir)/attributes \
 	    -I$(top_builddir)/attributes   # schema_type_parse.h header is compile-time-generated
 
+if WITH_LOCAL_LIBFABRIC
+TEST_INCLUDES += -I$(top_srcdir)/deps/libfabric/include -I$(top_srcdir)/deps/libfabric/prov/gni/include
+endif
+
 AM_CPPFLAGS = $(TEST_INCLUDES) -DTOPSRCDIR=$(top_srcdir)
 
 LDADD = $(top_builddir)/libmaestro.la
@@ -60,9 +64,11 @@ TESTS = check_version check_init check_uuid \
 	check_symtab \
 	check_events \
 	check_protobuf_c \
- 	check_transport_gfs \
+	check_transport_gfs \
 	check_layout \
+	check_dispose_reuse \
         check_declare \
+	check_decode_pminfo.sh \
 	check_pool_local \
 	check_pool_local_putget \
 	check_pool_local_stress \
@@ -79,11 +85,12 @@ TESTS = check_version check_init check_uuid \
 	check_pm_interlock.sh \
 	check_pm_redundant_interlock.sh \
 	check_pm_interlock_async.sh \
+	check_pm_dist_cdo.sh \
 	check_subscribe.sh \
 	check_ecmwf_attr.sh \
 	check_ecmwf_events.sh \
 	check_ecmwf_handle.sh \
-	check_pm_declare_group.sh 
+	check_pm_declare_group.sh
 
 XFAIL_TESTS = \
 	check_events \
@@ -98,6 +105,7 @@ check_PROGRAMS = check_version check_init check_uuid \
 		 coverage check_memlock \
 		 check_protobuf_c \
 		 check_layout \
+		 check_dispose_reuse \
 		 check_transport_gfs \
                  check_declare \
 		 check_schema_parse \
@@ -116,16 +124,21 @@ check_PROGRAMS = check_version check_init check_uuid \
 		 check_subscribe_local \
 		 demo_mvp_d3_2 \
 		 simple_pool_manager \
+		 decode_pminfo \
 		 simple_client \
 		 simple_group_client \
 		 simple_group_injector \
 		 simple_interlock_client_1 \
+		 redundant_interlock_client_1 \
 		 simple_interlock_client_2 \
+		 redundant_interlock_client_2 \
 		 colliding_client_1 \
 		 colliding_client_2 \
 		 reentrant_client \
 		 simple_interlock_async_client_1 \
 		 simple_interlock_async_client_2 \
+		 simple_dist_client_1 \
+		 simple_dist_client_2 \
 		 simple_injector \
 		 simple_archiver \
 		 simple_telemetry_listener \
@@ -135,7 +148,7 @@ check_PROGRAMS = check_version check_init check_uuid \
 		 ecmwf_handle_producer \
 		 ecmwf_handle_consumer \
 		 ecmwf_events_producer \
-		 ecmwf_events_consumer 
+		 ecmwf_events_consumer
 
 
 if WITH_MIO
@@ -154,6 +167,16 @@ CLIENT1_OPS="DECLARE cdo1 1025\
              DISPOSE cdo1 -1\
              DISPOSE cdo2 -1"
 
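+# op scripts for the redundant interlock clients: the redundant instance of
+# client 1 additionally offers cdo1a, which client 2 demands (instead of
+# retracting it as in the plain interlock test)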
+R_CLIENT1_OPS="DECLARE cdo1a 11\
+             SEAL cdo1a -1 \
+             DECLARE cdo2 20480\
+             REQUIRE cdo2 -1\
+             OFFER cdo1a -1 \
+             DEMAND cdo2 -1\
+             WITHDRAW cdo1a -1\
+             DISPOSE cdo1a -1\
+             DISPOSE cdo2 -1"
+
 CLIENT2_OPS="DECLARE cdo2 20480\
              SEAL cdo2 -1 \
              DECLARE cdo1 1025\
@@ -161,72 +184,83 @@ CLIENT2_OPS="DECLARE cdo2 20480\
              DECLARE cdo1a 11     REQUIRE cdo1a -1   \
              OFFER cdo2 -1 \
              DEMAND cdo1 -1\
-	     			 RETRACT cdo1a -1       DISPOSE cdo1a -1 \
+	     	 RETRACT cdo1a -1      DISPOSE cdo1a -1 \
              WITHDRAW cdo2 -1\
              DISPOSE cdo2 -1\
              DISPOSE cdo1 -1"
 
-
-ASYNC_CLIENT1_OPS="DECLARE cdo1 1025\
-						 DECLARE cdo2 1024\
-		         DECLARE cdo3 20480\
-						 WAIT cdo1 -1\
-						 SEAL cdo1 -1\
-						 WAIT cdo2 -1\
-						 SEAL cdo2 -1\
-						 WAIT cdo3 -1\
-						 SEAL cdo3 -1\
-						 WAIT cdo3 -1\
-						 REQUIRE cdo3 -1\
-						 WAIT cdo1 -1\
-						 WAIT cdo2 -1\
-						 WAIT cdo3 -1\
-						 OFFER cdo1 -1\
-						 WAIT cdo1 -1\
-						 DEMAND cdo3 -1\
-						 WAIT cdo3 -1\
-						 WITHDRAW cdo1 -1\
-						 WAIT cdo1 -1\
-						 DISPOSE cdo1 -1\
-						 DISPOSE cdo2 -1\
-						 DISPOSE cdo3 -1"
-
-ASYNC_CLIENT2_OPS="DECLARE cdo3 20480\
-	           DECLARE cdo2 1024\
-						 WAIT cdo2 -1\
-						 WAIT cdo3 -1\
-						 SEAL cdo3 -1\
+R_CLIENT2_OPS="DECLARE cdo2 20480\
+             SEAL cdo2 -1 \
              DECLARE cdo1 1025\
-						 WAIT cdo1 -1\
-						 SEAL cdo1 -1\
-						 SEAL cdo2 -1\
-						 WAIT cdo1 -1\
-						 WAIT cdo2 -1\
              REQUIRE cdo1 -1\
-						 REQUIRE cdo2 -1\
-						 WAIT cdo3 -1\
-						 WAIT cdo1 -1\
-             OFFER cdo3 -1\
-						 WAIT cdo3 -1\
+             DECLARE cdo1a 11     REQUIRE cdo1a -1   \
+             OFFER cdo2 -1 \
              DEMAND cdo1 -1\
-						 WAIT cdo1 -1\
-             WITHDRAW cdo3 -1\
-						 WAIT cdo2 -1\
-						 RETRACT cdo2 -1\
-						 WAIT cdo2 -1\
-						 WAIT cdo3 -1\
+	     	 DEMAND cdo1a -1       DISPOSE cdo1a -1 \
+             WITHDRAW cdo2 -1\
              DISPOSE cdo2 -1\
-						 DISPOSE cdo3 -1\
              DISPOSE cdo1 -1"
 
+ASYNC_CLIENT1_OPS="	DECLARE cdo1 1025\
+				   	DECLARE cdo2 1024\
+		           	DECLARE cdo3 20480\
+				 	WAIT cdo1 -1\
+					SEAL cdo1 -1\
+					WAIT cdo2 -1\
+					SEAL cdo2 -1\
+					WAIT cdo3 -1\
+					SEAL cdo3 -1\
+					WAIT cdo3 -1\
+					REQUIRE cdo3 -1\
+					WAIT cdo1 -1\
+					WAIT cdo2 -1\
+					WAIT cdo3 -1\
+					OFFER cdo1 -1\
+					WAIT cdo1 -1\
+					DEMAND cdo3 -1\
+					WAIT cdo3 -1\
+					WITHDRAW cdo1 -1\
+					WAIT cdo1 -1\
+					DISPOSE cdo1 -1\
+					DISPOSE cdo2 -1\
+					DISPOSE cdo3 -1"
+
+ASYNC_CLIENT2_OPS=" DECLARE cdo3 20480\
+	           		DECLARE cdo2 1024\
+					WAIT cdo2 -1\
+					WAIT cdo3 -1\
+					SEAL cdo3 -1\
+             		DECLARE cdo1 1025\
+					WAIT cdo1 -1\
+					SEAL cdo1 -1\
+					SEAL cdo2 -1\
+					WAIT cdo1 -1\
+					WAIT cdo2 -1\
+             		REQUIRE cdo1 -1\
+					REQUIRE cdo2 -1\
+					WAIT cdo3 -1\
+					WAIT cdo1 -1\
+             		OFFER cdo3 -1\
+					WAIT cdo3 -1\
+             		DEMAND cdo1 -1\
+					WAIT cdo1 -1\
+             		WITHDRAW cdo3 -1\
+					WAIT cdo2 -1\
+					RETRACT cdo2 -1\
+					WAIT cdo2 -1\
+					WAIT cdo3 -1\
+             		DISPOSE cdo2 -1\
+					DISPOSE cdo3 -1\
+             		DISPOSE cdo1 -1"
+
 INJECTOR_OPS="\
 			       DECLARE cdo1 1023      SEAL cdo1 -1       OFFER cdo1 -1    \
 			       DECLARE cdo2 1023000   SEAL cdo2 -1       OFFER cdo2 -1    \
 			       DECLARE cdo3 102       SEAL cdo3 -1       OFFER cdo3 -1    \
 			       DECLARE cdo1a 107      REQUIRE cdo1a -1                    \
-			 	     RETRACT cdo1a -1       DISPOSE cdo1a -1                    \
+			 	   RETRACT cdo1a -1       DISPOSE cdo1a -1                    \
 			       WITHDRAW cdo1 -1       WITHDRAW cdo2 -1   WITHDRAW cdo3 -1 \
-			 	     DISPOSE  cdo3 -1       DISPOSE  cdo2 -1   DISPOSE  cdo1 -1"
+			 	   DISPOSE  cdo3 -1       DISPOSE  cdo2 -1   DISPOSE  cdo1 -1"
 
 
 
@@ -236,6 +270,11 @@ simple_interlock_client_1_CPPFLAGS = $(AM_CPPFLAGS)  -DCOMPONENT_ID=1 -DCLIENT_A
 simple_interlock_client_2_SOURCES = simple_interlock_client.c
 simple_interlock_client_2_CPPFLAGS = $(AM_CPPFLAGS)  -DCOMPONENT_ID=2 -DCLIENT_ARGS=$(CLIENT2_OPS)
 
+redundant_interlock_client_1_SOURCES = simple_interlock_client.c
+redundant_interlock_client_1_CPPFLAGS = $(AM_CPPFLAGS)  -DCOMPONENT_ID=1 -DCLIENT_ARGS=$(R_CLIENT1_OPS)
+redundant_interlock_client_2_SOURCES = simple_interlock_client.c
+redundant_interlock_client_2_CPPFLAGS = $(AM_CPPFLAGS)  -DCOMPONENT_ID=2 -DCLIENT_ARGS=$(R_CLIENT2_OPS)
+
 colliding_client_1_SOURCES = colliding_client.c
 colliding_client_1_CPPFLAGS = $(AM_CPPFLAGS)  -DREPETITION=1
 colliding_client_2_SOURCES = colliding_client.c
@@ -248,10 +287,12 @@ simple_interlock_async_client_1_CPPFLAGS = $(AM_CPPFLAGS) -DCLIENT_ARGS=$(ASYNC_
 simple_interlock_async_client_2_SOURCES = simple_interlock_async_client.c
 simple_interlock_async_client_2_CPPFLAGS = $(AM_CPPFLAGS) -DCLIENT_ARGS=$(ASYNC_CLIENT2_OPS)
 
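+# two builds of the distributed-CDO test client; the role each instance
+# plays is selected at compile time via COMPONENT_ID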
+simple_dist_client_1_SOURCES = simple_dist_client.c
+simple_dist_client_1_CPPFLAGS = $(AM_CPPFLAGS)  -DCOMPONENT_ID=1
+simple_dist_client_2_SOURCES = simple_dist_client.c
+simple_dist_client_2_CPPFLAGS = $(AM_CPPFLAGS)  -DCOMPONENT_ID=2
 simple_injector_SOURCES = simple_interlock_client.c
 simple_injector_CPPFLAGS = $(AM_CPPFLAGS) -DCOMPONENT_ID=0 -DCLIENT_ARGS=$(INJECTOR_OPS)
-
-
 simple_group_injector_SOURCES = simple_group_client.c
 simple_group_injector_CPPFLAGS = $(AM_CPPFLAGS) -DINJECT_GROUP_MEMBERS=1
 
@@ -285,7 +326,7 @@ clean-local-check:
 all:
 
 if WITH_MIO
-check_PROGRAMS += mio-config-default.yaml mio-config-C1.yaml mio-config-C2.yaml mio-config-C1a.yaml mio-config-C2a.yaml mio-config-PM1.yaml mio-config-PM2.yaml mio-config-PM3.yaml mio-config-CINJ.yaml mio-config-CARCH.yaml 
+check_PROGRAMS += mio-config-default.yaml mio-config-C1.yaml mio-config-C2.yaml mio-config-C1a.yaml mio-config-C2a.yaml mio-config-PM1.yaml mio-config-PM2.yaml mio-config-PM3.yaml mio-config-CINJ.yaml mio-config-CARCH.yaml
 
 # avoid getting weird default values from automake for these
 mio_config_default_yaml_SOURCES =# empty
@@ -325,5 +366,3 @@ AM_TESTS_ENVIRONMENT=  MSTRO_MIO_CONFIG=mio-config-default.yaml; export MSTRO_MI
 # some tests set their component-name specific config when they run to replace this
 
 endif
-
-
diff --git a/tests/check_decode_pminfo.sh.in b/tests/check_decode_pminfo.sh.in
new file mode 100644
index 0000000000000000000000000000000000000000..4f877e200fa93ab0655d1f642d3184c1611fb88f
--- /dev/null
+++ b/tests/check_decode_pminfo.sh.in
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+#
+# Check decoding of PMINFO works
+#
+
+
+# Copyright (C) 2021 Hewlett-Packard (Schweiz) GmbH
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+#    contributors may be used to endorse or promote products derived from
+#    this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# aggressive error tracing
+set -euo pipefail
+
+DECODER_CMD="@top_builddir@/tests/decode_pminfo"
+
+#
+# Sample input
+PMINFO="Cq4CChoIEBIWUhRmaV9zaG06Ly85MzM0OjEwMDA6MAoaCBASFlIUZmlfc2htOi8vOTMzNDox
+MDAwOjEKGAgGEhQaEgkCAAAAAAAAABCptwEdfwAAAQoYCAYSFBoSCQIAAAAAAAAAEJ+HAR0K
+AAIPChMIBhIPIg0JCgAAAAAAAAAQn7sBCiEIBhIdIhsJCgAAAAAAAAAQs98BIf6AAAAAAAAA
+NQIAAAASEwlAqEeDDX8AABIIKgAAAAAAAAASEwlAqEeDDX8AABIIKgAAAAAAAAASEwlAqEeD
+DX8AABIIKgAAAAAAAAASEwlAqEeDDX8AABIIKgAAAAAAAAASEwlAqEeDDX8AABIIKgAAAAAA
+AAASEwlAqEeDDX8AABIIKgAAAAAAAAAaABoAGgAaABoAGgA="
+
+# expected parse result
+EXPECTED="Decoded EP 0: proto SHM addr fi_shm://9334:1000:0 (credential: 0, mraddr 7f0d8347a840, mrkey 2A00000000000000)
+Decoded EP 1: proto SHM addr fi_shm://9334:1000:1 (credential: 0, mraddr 7f0d8347a840, mrkey 2A00000000000000)
+Decoded EP 2: proto SOCK_TCP addr IPv4 127.0.0.1:23465 (credential: 0, mraddr 7f0d8347a840, mrkey 2A00000000000000)
+Decoded EP 3: proto SOCK_TCP addr IPv4 10.0.2.15:17311 (credential: 0, mraddr 7f0d8347a840, mrkey 2A00000000000000)
+Decoded EP 4: proto SOCK_TCP addr IPv6 :: (credential: 0, mraddr 7f0d8347a840, mrkey 2A00000000000000)
+Decoded EP 5: proto SOCK_TCP addr IPv6 fe00:: (credential: 0, mraddr 7f0d8347a840, mrkey 2A00000000000000)"
+
+DECODED=$(echo "$PMINFO" | $DECODER_CMD)
+
+if test x"$EXPECTED" = x"$DECODED"; then
+	exit 0;
+else 
+	diff -u <(echo "$EXPECTED") <(echo "$DECODED") >&2
+	exit 1;
+fi
diff --git a/tests/check_dispose_reuse.c b/tests/check_dispose_reuse.c
new file mode 100644
index 0000000000000000000000000000000000000000..9202630d1bf22b14135aa6dc19b808d88aa8d47f
--- /dev/null
+++ b/tests/check_dispose_reuse.c
@@ -0,0 +1,107 @@
+/* check maestro dispose and reuse */
+/*
+ * Copyright (C) 2019 Cray Computer GmbH
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* needed before inclusion of cheat.h: */
+#ifndef __BASE_FILE__
+#define __BASE_FILE__ __FILE__
+#endif
+
+#include "cheat.h"
+
+#include "maestro.h"
+
+#include <string.h>
+#include <inttypes.h>
+#include <sys/stat.h>
+#include <errno.h>
+#include "maestro/logging.h"
+
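+/* one dimension of the N x N test array of doubles */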
+#define N 10
+
+CHEAT_TEST(dispose_reuse_works,
+  size_t data_count = N*N;
+  int64_t bytes = data_count*sizeof(double);
+  double* src_data = malloc(bytes);
+  for(size_t i=0; i<data_count; i++) {
+    src_data[i]=random();
+  }
+  enum mstro_cdo_attr_value_type type;
+  const void* val;
+
+  cheat_assert(MSTRO_OK == mstro_init("Tests","DISPOSE_AND_REUSE",0));
+  char name[] = "recycling_pioneer";
+
+  mstro_cdo cdo_src=NULL;
+
+  cheat_assert(MSTRO_OK == mstro_cdo_declare(name, MSTRO_ATTR_DEFAULT, &cdo_src));
+  cheat_assert(MSTRO_OK == mstro_cdo_attribute_set(cdo_src,
+                                                   MSTRO_ATTR_CORE_CDO_RAW_PTR,
+                                                   src_data, false));
+  cheat_assert(MSTRO_OK == mstro_cdo_attribute_set(cdo_src,
+                                                   MSTRO_ATTR_CORE_CDO_SCOPE_LOCAL_SIZE,
+                                                   (void**)&bytes, true));
+  int64_t user_attr = 0;
+  int i;
+  for (i=0; i<10; i++) {
+    user_attr++;
+    /* We'll hijack the ELEMENT_SIZE attribute for this test's purpose.
+     * The aim is to show the user can fiddle with attributes even after the
+     * CDO has been SEALED/OFFERED/WITHDRAWN, without re-declaring and
+     * setting the attributes all over again */
+    cheat_assert(MSTRO_OK == mstro_cdo_attribute_set(cdo_src,
+                             MSTRO_ATTR_CORE_CDO_LAYOUT_ELEMENT_SIZE, &user_attr, true));
+
+    cheat_assert(MSTRO_OK == mstro_cdo_attribute_get(
+                             cdo_src, MSTRO_ATTR_CORE_CDO_LAYOUT_ELEMENT_SIZE, &type, &val));
+    fprintf(stdout, "CDO %s *user_attr* now has value: %" PRIi64 "\n",
+            name, *(const int64_t*)val);
+
+  /* Producer side (there won't be any consumer here) */
+    cheat_assert(MSTRO_OK == mstro_cdo_declaration_seal(cdo_src));
+    cheat_assert(MSTRO_OK == mstro_cdo_offer(cdo_src));
+    cheat_assert(MSTRO_OK == mstro_cdo_withdraw(cdo_src));
+
+  /* unsealing *cdo_src* */
+    cheat_assert(MSTRO_OK == mstro_cdo_dispose_and_reuse(cdo_src));
+  }
+
+  /* Free resources */
+  cheat_assert(MSTRO_OK == mstro_cdo_dispose(cdo_src));
+
+  free(src_data);
+
+  cheat_assert(MSTRO_OK == mstro_finalize());
+
+  )
diff --git a/tests/check_pm_dist_cdo.sh.in b/tests/check_pm_dist_cdo.sh.in
new file mode 100644
index 0000000000000000000000000000000000000000..0214e2710e833336608f2af3bc3acf3de7216d94
--- /dev/null
+++ b/tests/check_pm_dist_cdo.sh.in
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+#
+# Run pool manager and 2 clients that interlock dist cdo OFFER/DEMAND
+#
+
+# Copyright (C) 2020 Cray Computer GmbH
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+#    contributors may be used to endorse or promote products derived from
+#    this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# aggressive error tracing
+set -euo pipefail
+
+# set a workflow name
+MSTRO_WORKFLOW_NAME="check_pm_dist_cdo_wf_$$"
+export MSTRO_WORKFLOW_NAME
+
+# ensure error and warnings get colors
+MSTRO_LOG_COLOR_ERRORS=1
+export MSTRO_LOG_COLOR_ERRORS
+
+# pool manager binary (started below via process substitution)
+PM_CMD="@top_builddir@/tests/simple_pool_manager"
+# distributed-CDO test clients; each binary's role is fixed at build
+# time via COMPONENT_ID (see tests/Makefile.am)
+CLIENT_CMD="@top_builddir@/tests/simple_dist_client"
+
+# start pool manager, connect its output to fd 3:
+# (we need to run in a subshell to start a new process group)
+exec 3< <(env MSTRO_LOG_LEVEL=${MSTRO_LOG_LEVEL:-2} ${PM_CMD})
+PM_PID=$!
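+# (bash sets $! to the PID of the most recent process substitution, so
+# this captures the pool manager started by the exec above)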
+
+terminate () {
+    # $1 is PM_PID, $2 is the desired exit code. Note that killing PM_PID is
+    # not enough: the actual PM will likely be a child of this one,
+    # and if we kill it, it will be reparented to init(1).
+    echo "exit, stopping pool manager"
+    # close stdout of the pool manager; its heartbeat output on stdout
+    # will make it receive a SIGPIPE eventually, which initiates
+    # termination.
+    exec 3<&-
+    exit ${2:-99}
+}
+
+# ensure USR2 is not caught by this shell
+trap '' USR2
+
+# trap error exit of script: close pipe to pool manager and return error
+trap "terminate ${PM_PID} 99" err
+
+# read pm info: It comes as a CSV of MSTRO_POOL_INFO; base64 text;
+# Potentially we might (later) have the pool manager report changed
+# info over time which we could read with the same read command.
+#
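+# For example, the stream on fd 3 begins like this (base64 payload
+# abbreviated):
+#   MSTRO_POOL_MANAGER_INFO;Cq4CChoIEBIW...;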
+read -d ';' -u 3 pm_info_varname
+read -d ';' -u 3 pm_info
+echo "PM info: $pm_info"
+
+if test x"${pm_info_varname}" != xMSTRO_POOL_MANAGER_INFO; then
+    exit 99;
+fi
+
+
+MSTRO_POOL_MANAGER_INFO="$pm_info"
+export MSTRO_POOL_MANAGER_INFO
+
+# start client 1
+(env MSTRO_TRANSPORT_DEFAULT=RDMA MSTRO_COMPONENT_NAME="producer" MSTRO_LOG_COLOR_ERRORS=1 MSTRO_LOG_COLOR="CYAN" ${CLIENT_CMD}_1 -n) || exit 99  &
+
+# start client 2
+(env MSTRO_TRANSPORT_DEFAULT=RDMA MSTRO_COMPONENT_NAME="consumer"  MSTRO_LOG_COLOR_ERRORS=1 MSTRO_LOG_COLOR="GREEN" ${CLIENT_CMD}_2 -n) || exit 99  &
+
+wait %1 || exit 99
+wait %2 || exit 99
+
+
+# trap normal script termination: close pipe to pool manager
+terminate ${PM_PID} 0
diff --git a/tests/check_pm_redundant_interlock.sh.in b/tests/check_pm_redundant_interlock.sh.in
index 6e14f68cb0df3383a9785b8498be3973dbb8c285..10d03f739d822296b0cc0bafae646a6f93deccdd 100644
--- a/tests/check_pm_redundant_interlock.sh.in
+++ b/tests/check_pm_redundant_interlock.sh.in
@@ -49,6 +49,8 @@ PM_CMD="@top_builddir@/tests/simple_pool_manager"
 #  'O' -- offer 'R' -- require -- 'D' demand -- 'W' withdraw
 #  'S' -- sleep
 CLIENT_CMD="@top_builddir@/tests/simple_interlock_client"
+R_CLIENT_CMD="@top_builddir@/tests/redundant_interlock_client"
+
 
 # start pool manager, connect its output to fd 3:
 # (we need to run in a subshell to start a new process group)
@@ -95,13 +97,14 @@ export MSTRO_POOL_MANAGER_INFO
 (env MSTRO_COMPONENT_NAME="Client1" MSTRO_TRANSPORT_DEFAULT="MIO" MSTRO_MIO_CONFIG="./mio-config-C1a.yaml"  MSTRO_LOG_COLOR_ERRORS=1 MSTRO_LOG_COLOR="BLUE" ${CLIENT_CMD}_1 -n) || exit 99  &
 
 # start client 1 again (simulating redundant participants)
-(env MSTRO_COMPONENT_NAME="Client1a" MSTRO_TRANSPORT_DEFAULT="MIO" MSTRO_MIO_CONFIG="./mio-config-C1a.yaml"  MSTRO_LOG_COLOR_ERRORS=1 MSTRO_LOG_COLOR="BLUE" ${CLIENT_CMD}_1 -n) || exit 99  &
+(env MSTRO_COMPONENT_NAME="Client1a" MSTRO_TRANSPORT_DEFAULT="MIO" MSTRO_MIO_CONFIG="./mio-config-C1a.yaml"  MSTRO_LOG_COLOR_ERRORS=1 MSTRO_LOG_COLOR="BLUE" ${R_CLIENT_CMD}_1 -n) || exit 99  &
 
 # start client 2
-(env MSTRO_COMPONENT_NAME="Client2" MSTRO_TRANSPORT_DEFAULT="MIO" MSTRO_MIO_CONFIG="./mio-config-C2a.yaml"  MSTRO_LOG_COLOR_ERRORS=1 MSTRO_LOG_COLOR="CYAN" ${CLIENT_CMD}_2 -n) || exit 99  &
+(env MSTRO_COMPONENT_NAME="Client2" MSTRO_TRANSPORT_DEFAULT="MIO" MSTRO_MIO_CONFIG="./mio-config-C2a.yaml"  MSTRO_LOG_COLOR_ERRORS=1 MSTRO_LOG_COLOR="CYAN" ${R_CLIENT_CMD}_2 -n) || exit 99  &
 
 wait %1 || exit 99
 wait %2 || exit 99
+wait %3 || exit 99
 
 # trap normal script termination: close pipe to pool manager
 terminate ${PM_PID} 0
diff --git a/tests/check_type_parser.c b/tests/check_type_parser.c
index 472b7db602bcdcd111ac533d21fdd9af39f4cb2c..95e4893273b095ce87929b05199ce18098a38380 100644
--- a/tests/check_type_parser.c
+++ b/tests/check_type_parser.c
@@ -1,6 +1,6 @@
 /* -*- mode:c -*- */
 /** @file
- ** @brief  check schema type parsing 
+ ** @brief  check schema type parsing
  **/
 
 /*
@@ -52,7 +52,7 @@
 
 
 #define XSTRINGIFY(s) #s
-#define STRINGIFY(s) XSTRINGIFY(s) 
+#define STRINGIFY(s) XSTRINGIFY(s)
 
 CHEAT_DECLARE(
     const char *strings[] = {
@@ -62,7 +62,7 @@ CHEAT_DECLARE(
       "timestamp()",
       /* regex is also built-in but can't be used without args;
        * see below */
-          
+
       /* typical restricted types */
       "uint(max=10, min=7)",
       "int(min=-17, max=42)",
@@ -72,6 +72,7 @@ CHEAT_DECLARE(
       "str(max=8)",
       "str(max=8, min=8)",
       "\tblob ( max = 4711) ",
+      "mmblayout (max = 4711)",
       "regex(';.*$',ignore_case=False, name='lisp comment')",
       "regex(';.*$',ignore_case=True, name='lisp comment')",
       /* multiple patterns */
@@ -95,4 +96,3 @@ CHEAT_TEST(core_schema_parse,
              cheat_yield();
            }
            )
-           
diff --git a/tests/decode_pminfo.c b/tests/decode_pminfo.c
new file mode 100644
index 0000000000000000000000000000000000000000..af435cee7da8a9342f20d98012e6cdf9c3df0e3a
--- /dev/null
+++ b/tests/decode_pminfo.c
@@ -0,0 +1,136 @@
+/* -*- mode:c -*- */
+/** @file
+ ** @brief Tool to decode serialized pool manager info
+ **/
+
+/*
+ * Copyright (C) 2021 Hewlett Packard (Schweiz) GmbH
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "maestro.h"
+#include "maestro/logging.h"
+#include "maestro/env.h"
+
+#include "protocols/maestro-endpoints.h"
+#include "maestro/i_ofi.h"
+
+#include <stdbool.h>
+#include <stdatomic.h>
+#include <signal.h>
+#include <string.h>
+#include <unistd.h>
+
+
+/* simplify logging */
+#define DEBUG(...) LOG_DEBUG(MSTRO_LOG_MODULE_USER,__VA_ARGS__)
+#define INFO(...)  LOG_INFO(MSTRO_LOG_MODULE_USER,__VA_ARGS__)
+#define WARN(...)  LOG_WARN(MSTRO_LOG_MODULE_USER,__VA_ARGS__)
+#define ERR(...)   LOG_ERR(MSTRO_LOG_MODULE_USER,__VA_ARGS__)
+
+
+
+int
+main(int argc, char ** argv)
+{
+  if(MSTRO_OK!=mstro_init("", "PMINFO decoder", 0))
+    exit(EXIT_FAILURE);
+
+  if(! (argc==1 || argc==2)) {
+usage:
+    fprintf(stderr, "Usage: %s [PM-INFO.txt]\n", argv[0]);
+    exit(1);
+  }
+
+  char* pm_info=NULL;
+
+  if(argc==1) {
+    /* read from stdin */
+#define BLOCKSIZE 1023
+    size_t total_size=0;
+    size_t pos=0;
+    int c;
+    for(;;) {
+      if(pos==total_size) {
+        total_size+=BLOCKSIZE;
+        pm_info=realloc(pm_info,total_size+1); // one extra so that there's always space for a NUL
+        if(pm_info==NULL) {
+          fprintf(stderr, "Failed to allocate buffer for input data\n");
+          exit(EXIT_FAILURE);
+        }
+      }
+      /* read as int so that EOF is distinguishable from a 0xFF data byte */
+      c = fgetc(stdin);
+      if(c==EOF)
+        break;
+      pm_info[pos++]=(char)c;
+    }
+    pm_info[pos]='\0';
+  } else {
+    /* read from file */
+    FILE *f = fopen (argv[1], "r");
+    if(!f) {
+      goto usage;
+    }
+    fseek (f, 0, SEEK_END);
+    long length = ftell (f);
+    fseek (f, 0, SEEK_SET);
+    pm_info = malloc (length+1);
+    if (!pm_info) {
+      fprintf(stderr, "Failed to allocate buffer for file content of %s\n", argv[1]);
+      exit(EXIT_FAILURE);
+    }
+    size_t items_read = fread (pm_info, length, 1, f);
+    if(items_read!=1) {
+      fprintf(stderr, "Failed to read PMINFO: %d (%s)\n",
+              errno, strerror(errno));
+      exit(EXIT_FAILURE);
+    }
+    pm_info[length] = '\0';
+    fclose (f);
+  }
+
+  /* parse */
+  Mstro__AppInfo *epd=NULL;
+  mstro_status s=mstro_appinfo_deserialize(pm_info, &epd);
+  if(pm_info)
+    free(pm_info);
+
+  if(s!=MSTRO_OK)
+    exit(EXIT_FAILURE);
+
+
+  /* print */
+  WITH_MSTRO_EPL_DESCRIPTION(str, epd->eps, {
+      fprintf(stdout, "Decoded %s", str);
+    });
+
+  mstro__app_info__free_unpacked(epd, NULL);
+
+  if(MSTRO_OK!=mstro_finalize())
+    exit(EXIT_FAILURE);
+
+  return EXIT_SUCCESS;
+}
diff --git a/tests/simple_dist_client.c b/tests/simple_dist_client.c
new file mode 100644
index 0000000000000000000000000000000000000000..15b1623d5817c5356ff759605e0af928c4eaed7d
--- /dev/null
+++ b/tests/simple_dist_client.c
@@ -0,0 +1,270 @@
+/* -*- mode:c -*- */
+/** @file
+ ** @brief Connect to pool manager and offer/demand distributed CDOs
+ **/
+
+/*
+ * Copyright (C) 2018-2020 Cray Computer GmbH
+ * Copyright (C) 2021 HPE Switzerland GmbH
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* needed before inclusion of cheat.h: */
+#ifndef __BASE_FILE__
+#define __BASE_FILE__ __FILE__
+#endif
+
+#include "cheat.h"
+#include "mamba.h"
+#include "maestro.h"
+#include "maestro/i_utlist.h"
+#include "maestro/i_uthash.h"
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#ifndef COMPONENT_ID
+#error You need to define COMPONENT_ID when compiling this file
+#endif
+
+
+
+
+#define XSTRINGIFY(s) #s
+#define STRINGIFY(s) XSTRINGIFY(s)
+
+
+CHEAT_DECLARE(
+void fill_data(double *data, mmbLayout *layout) {
+   /* fill this piece's block with globally consecutive values */
+   size_t data_count = layout->irregular.lengths[layout->index];
+   size_t offset = layout->irregular.offsets[layout->index];
+   for (size_t i = 0; i < data_count; i++) {
+      data[i] = i + offset;
+   }
+}
+
+void print_data(double *data, mmbLayout *layout) {
+   size_t data_count = layout->irregular.lengths[layout->index];
+   printf("data length = %zu\n", data_count);
+   for (size_t i = 0; i < data_count; i++) {
+      printf("%lf ", data[i]);
+      if(i%10 == 9) {   /* line break every ten values */
+         printf("\n");
+      }
+   }
+   printf("\n--------------------------\n");
+}
+)
+
+CHEAT_TEST(simple_dist_cdo,
+
+           char *component_id = STRINGIFY(COMPONENT_ID);
+           int my_id = atoi(component_id);
+           mmbLayoutEquivalence diff;
+
+           size_t *offsets = malloc(sizeof(size_t) *2);
+           size_t *sizes = malloc(sizeof(size_t) *2);
+
+           size_t *offsets2 = malloc(sizeof(size_t) *2);
+           size_t *sizes2 = malloc(sizeof(size_t) *2);
+
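+           /* block decompositions of a 150-element array (all counts in
+            * elements): layout 1 covers [0,100)+[100,150), layouts 2/3
+            * cover [0,75)+[75,150) */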
+           offsets[0] = 0;
+           offsets[1] = 100;
+           sizes[0] = 100;
+           sizes[1] = 50;
+
+           offsets2[0] = 0;
+           offsets2[1] = 75;
+           sizes2[0] = 75;
+           sizes2[1] = 75;
+
+           size_t *offsets3 = malloc(sizeof(size_t) *3);
+           size_t *sizes3 = malloc(sizeof(size_t) *3);
+           
+           offsets3[0] = 0;
+           offsets3[1] = 50;
+           offsets3[2] = 100;
+           sizes3[0] = 50;
+           sizes3[1] = 50;
+           sizes3[2] = 50;
+
+           double *data;   /* consumer-side pointer to received data */
+
+           double *dist_buf, *dist_buf2, *dist_buf3;
+           int64_t dist_size = 50 * sizeof(double);
+           int64_t dist_size2 = 75 * sizeof(double);
+           int64_t dist_size3 = 75 * sizeof(double);
+           mmbLayout *dist_layout1, *dist_layout2, *dist_layout3, *dist_layout4;
+           cheat_assert(MMB_OK == mmb_layout_create_dist_irregular_1d(sizeof(double),
+                                                1,
+                                                2,
+                                                offsets,
+                                                sizes,
+                                                &dist_layout1));
+
+
+           cheat_assert(MMB_OK == mmb_layout_create_dist_irregular_1d(sizeof(double),
+                                                1,
+                                                2,
+                                                offsets2,
+                                                sizes2,
+                                                &dist_layout2));
+
+           cheat_assert(MMB_OK == mmb_layout_create_dist_irregular_1d(sizeof(double),
+                                                        0,
+                                                        2,
+                                                        offsets2,
+                                                        sizes2,
+                                                        &dist_layout3));
+
+
+           cheat_assert(MMB_OK == mmb_layout_cmp(dist_layout1, dist_layout3, &diff));
+
+           cheat_assert(MMB_LAYOUT_DIFF_FIELDS == diff);
+           cheat_assert(MSTRO_OK == mstro_init(NULL,NULL,my_id));
+           cheat_yield();
+
+           mstro_cdo sync_cdo, dist_cdo, dist_cdo3, dist_cdo2;
+           char sync_cdo_name[20];
+           char dist_cdo_name[20];
+
+
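+           /* Component 1 acts as the producer: it declares, seals and
+            * offers the distributed CDO pieces. Component 2 acts as the
+            * consumer: it demands the same CDO under a different 3-block
+            * layout, forcing a redistribution of the pieces. */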
+           switch (my_id) {
+             case 1:
+
+             cheat_assert(0 == posix_memalign((void**)&dist_buf, (size_t) sysconf(_SC_PAGESIZE), dist_size));
+             cheat_assert(0 == posix_memalign((void**)&dist_buf2, (size_t) sysconf(_SC_PAGESIZE), dist_size2));
+             cheat_assert(0 == posix_memalign((void**)&dist_buf3, (size_t) sysconf(_SC_PAGESIZE), dist_size3));
+
+             fill_data(dist_buf, dist_layout1); 
+             fill_data(dist_buf2, dist_layout2);
+             fill_data(dist_buf3, dist_layout3);
+             print_data(dist_buf2,dist_layout2);
+             print_data(dist_buf3,dist_layout3);
+             sprintf(dist_cdo_name, "dist-cdo-%d", my_id);
+             
+             cheat_assert(MSTRO_OK == mstro_cdo_declare(dist_cdo_name, MSTRO_ATTR_DEFAULT, &dist_cdo));
+             cheat_assert(MSTRO_OK == mstro_cdo_declare(dist_cdo_name, MSTRO_ATTR_DEFAULT, &dist_cdo2));
+             cheat_assert(MSTRO_OK == mstro_cdo_declare(dist_cdo_name, MSTRO_ATTR_DEFAULT, &dist_cdo3));
+             cheat_assert(MSTRO_OK==mstro_cdo_attribute_set(dist_cdo, MSTRO_ATTR_CORE_CDO_DIST_LAYOUT, dist_layout1, true));
+             cheat_assert(MSTRO_OK==mstro_cdo_attribute_set(dist_cdo2, MSTRO_ATTR_CORE_CDO_DIST_LAYOUT, dist_layout2, true));
+             cheat_assert(MSTRO_OK==mstro_cdo_attribute_set(dist_cdo3, MSTRO_ATTR_CORE_CDO_DIST_LAYOUT, dist_layout3, true));
+             /*add distributed cdo layout*/
+             cheat_assert(MSTRO_OK==mstro_cdo_attribute_set(dist_cdo, MSTRO_ATTR_CORE_CDO_SCOPE_LOCAL_SIZE, &dist_size, true));
+             cheat_assert(MSTRO_OK==mstro_cdo_attribute_set(dist_cdo, MSTRO_ATTR_CORE_CDO_RAW_PTR, dist_buf, false));
+             cheat_assert(MSTRO_OK==mstro_cdo_attribute_set(dist_cdo2, MSTRO_ATTR_CORE_CDO_SCOPE_LOCAL_SIZE, &dist_size2, true));
+             cheat_assert(MSTRO_OK==mstro_cdo_attribute_set(dist_cdo2, MSTRO_ATTR_CORE_CDO_RAW_PTR, dist_buf2, false));
+             cheat_assert(MSTRO_OK==mstro_cdo_attribute_set(dist_cdo3, MSTRO_ATTR_CORE_CDO_SCOPE_LOCAL_SIZE, &dist_size3, true));
+             cheat_assert(MSTRO_OK==mstro_cdo_attribute_set(dist_cdo3, MSTRO_ATTR_CORE_CDO_RAW_PTR, dist_buf3, false));
+
+             cheat_assert(MSTRO_OK == mstro_cdo_seal(dist_cdo));
+             cheat_assert(MSTRO_OK == mstro_cdo_seal(dist_cdo2));
+             cheat_assert(MSTRO_OK == mstro_cdo_seal(dist_cdo3));
+             // Syncing with the consumer
+             sprintf(sync_cdo_name, "sync-cdo-%d", my_id);
+             cheat_assert(MSTRO_OK == mstro_cdo_declare(sync_cdo_name, MSTRO_ATTR_DEFAULT, &sync_cdo));
+             cheat_assert(MSTRO_OK == mstro_cdo_require(sync_cdo));
+
+             //cheat_assert(MSTRO_OK == mstro_cdo_offer(dist_cdo));
+             cheat_assert(MSTRO_OK == mstro_cdo_offer(dist_cdo2));
+             cheat_assert(MSTRO_OK == mstro_cdo_offer(dist_cdo3));
+             
+             cheat_assert(MSTRO_OK == mstro_cdo_demand(sync_cdo));
+
+             cheat_yield();
+             //cheat_assert(MSTRO_OK == mstro_cdo_withdraw(dist_cdo));
+             cheat_assert(MSTRO_OK == mstro_cdo_withdraw(dist_cdo2));
+             cheat_assert(MSTRO_OK == mstro_cdo_withdraw(dist_cdo3));
+             
+
+             // Clean up
+             cheat_assert(MSTRO_OK == mstro_cdo_dispose(sync_cdo));
+             
+             cheat_assert(MSTRO_OK == mstro_cdo_dispose(dist_cdo));
+             cheat_assert(MSTRO_OK == mstro_cdo_dispose(dist_cdo2));
+             cheat_assert(MSTRO_OK == mstro_cdo_dispose(dist_cdo3));
+
+             break;
+
+            case 2:
+               
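+              /* consumer-side layout: the same 150 elements, split into
+               * three 50-element blocks; this client holds block index 1
+               * of that decomposition */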
+              cheat_assert(MMB_OK == mmb_layout_create_dist_irregular_1d(sizeof(double),
+                                                 1,
+                                                 3,
+                                                 offsets3,
+                                                 sizes3,
+                                                 &dist_layout4));
+              
+              sprintf(sync_cdo_name, "sync-cdo-%d", my_id-1);
+              cheat_assert(MSTRO_OK == mstro_cdo_declare(sync_cdo_name, MSTRO_ATTR_DEFAULT, &sync_cdo));
+              
+              sprintf(dist_cdo_name, "dist-cdo-%d", my_id-1);
+              cheat_assert(MSTRO_OK == mstro_cdo_declare(dist_cdo_name, MSTRO_ATTR_DEFAULT, &dist_cdo));
+              cheat_assert(MSTRO_OK==mstro_cdo_attribute_set(dist_cdo, MSTRO_ATTR_CORE_CDO_DIST_LAYOUT, dist_layout4, true));
+              
+              cheat_assert(MSTRO_OK == mstro_cdo_seal(dist_cdo));
+              cheat_assert(MSTRO_OK == mstro_cdo_require(dist_cdo));
+              
+              cheat_assert(MSTRO_OK == mstro_cdo_offer(sync_cdo));
+              cheat_assert(MSTRO_OK == mstro_cdo_demand(dist_cdo));
+              
+
+              cheat_assert(MSTRO_OK == mstro_cdo_withdraw(sync_cdo));
+              cheat_yield();
+               
+              cheat_assert(MSTRO_OK == mstro_cdo_attribute_get(dist_cdo, MSTRO_ATTR_CORE_CDO_RAW_PTR, NULL,(const void **) &data));
+              print_data(data,dist_layout4);
+              // Clean up
+              cheat_assert(MSTRO_OK == mstro_cdo_dispose(sync_cdo));
+              cheat_assert(MSTRO_OK == mstro_cdo_dispose(dist_cdo));
+              cheat_assert(MMB_OK == mmb_layout_destroy(dist_layout4));
+              
+              cheat_yield();
+              break;
+           }
+
+           cheat_assert(MSTRO_OK == mstro_finalize());
+           cheat_assert(MMB_OK == mmb_layout_destroy(dist_layout1));
+           cheat_assert(MMB_OK == mmb_layout_destroy(dist_layout2));
+           cheat_assert(MMB_OK == mmb_layout_destroy(dist_layout3));
+
+
+           )
diff --git a/tests/simple_telemetry_listener.c b/tests/simple_telemetry_listener.c
index 93273138e32b7befa4df09269859602e196ee880..102b443c7cc911f36e9e9d55ac0cee46cf68bda4 100644
--- a/tests/simple_telemetry_listener.c
+++ b/tests/simple_telemetry_listener.c
@@ -46,7 +46,6 @@
 #include <stdarg.h>
 
 
-#define NSEC_PER_SEC ((mstro_nanosec_t)1000*1000*1000)
 const mstro_nanosec_t  DEFAULT_MAX_WAIT = ((mstro_nanosec_t)15)*NSEC_PER_SEC; /* 15s */
 
 /** Configurable settings */
@@ -400,7 +399,7 @@ event_loop(void)
             if(g_conf_terminate_after) {
               if(g_conf_terminate_after_appid==tmp->leave.appid) {
                 if(g_verbose) {
-                  fprintf(stdout, "LEAVE of %" PRIappid " triggers termination of telemetry listener\n",
+                  fprintf(stdout, "LEAVE of app %" PRIappid " triggers termination of telemetry listener\n",
                           tmp->leave.appid);
                 }
                 done=true;
diff --git a/transport/gfs.c b/transport/gfs.c
index 45f44fcabc6d7dbb4b7aede42c6cc47e0f633417..6a396f3476fc4484d1351b9c02ce7f8e456d2c87 100644
--- a/transport/gfs.c
+++ b/transport/gfs.c
@@ -82,8 +82,11 @@ mstro_transport_gfs_src_execute(mstro_cdo src, Mstro__Pool__TransferTicket* tick
 	    return MSTRO_FAIL;
 	}
        
+    /* add src offset to read from the correct place for dist_cdos */
+    void *src_ptr = (void *) ((char *) dl.data + ticket->src_offset);
 RETRY_GFS_TRANSPORT_WRITE: ;
-    size_t bytes_written = fwrite(dl.data, sizeof(char), dl.len, f);
+    size_t bytes_written = fwrite(src_ptr, sizeof(char), dl.len, f);
 	if (bytes_written != dl.len) {
       if (errno == EAGAIN)
 	    goto RETRY_GFS_TRANSPORT_WRITE;
@@ -145,7 +148,7 @@ mstro_transport_gfs_dst_execute(mstro_cdo dst,
     return MSTRO_FAIL;
   }
 RETRY_GFS_TRANSPORT_READ: ;
-  size_t bytes_read = fread(data, sizeof(char), len, f);
+  size_t bytes_read = fread((char*)data + ticket->dst_offset, sizeof(char), len, f);
   if (bytes_read != (size_t)len) {
     if (errno == EAGAIN)
 	  goto RETRY_GFS_TRANSPORT_READ;
diff --git a/transport/mio.c b/transport/mio.c
index 0e2a1e62d08b15f13f8b4cbcaa590f737f2bc511..52ba5f1cdfbe629f0b8904328e70bdbebe0367fc 100644
--- a/transport/mio.c
+++ b/transport/mio.c
@@ -393,7 +393,7 @@ mstro_mio_obj_write_async(mstro_cdo src,
   const int64_t* val;
   
 	struct mio_iovec mvec;
-	mvec.miov_base = (char*)data; // cast to MIO's type
+	mvec.miov_base = (char*)data + ticket->src_offset; // cast to MIO's type, plus src offset for dist cdos
         /* FIXME: should use constant value sym for attribute name */
 	if (!(MSTRO_OK == mstro_cdo_attribute_get(
                 src, ".maestro.core.cdo.layout.pre-pad", &type, (const void**)&val))) {
@@ -446,11 +446,15 @@ DEBUG ("Async write went ok (see callback for further report)\n");
 }
 
 mstro_status
-mstro_mio_obj_read_sync(mstro_cdo dst, struct mio_obj_id* oid, size_t len, struct mio_obj_id* semid)
+mstro_mio_obj_read_sync(mstro_cdo dst, 
+                        Mstro__Pool__TransferTicket* ticket,
+                        struct mio_obj_id* oid,
+                        struct mio_obj_id* semid)
 {
   if (dst == NULL || oid == NULL || semid == NULL)
     return MSTRO_INVARG;
   
+  int64_t len = ticket->data_size;
   void* data = NULL;
   mstro_status s;
   if (! (MSTRO_OK == mstro_transport_get_dst_buffer(dst, (void*)&data))) {
@@ -458,22 +462,14 @@ mstro_mio_obj_read_sync(mstro_cdo dst, struct mio_obj_id* oid, size_t len, struc
     return MSTRO_FAIL;
   }
   
-  /* fake local-size to the expected len */
-  s = mstro_attribute_dict_set(dst->attributes,
-                               MSTRO_ATTR_CORE_CDO_SCOPE_LOCAL_SIZE,
-                               MSTRO_CDO_ATTR_VALUE_INVALID,
-                               &len, true);
-  if(s!=MSTRO_OK) {
-    ERR("Failed to set local size to to non-padded size\n");
-    return MSTRO_FAIL;
-  }
-  
   DEBUG("Sync read for now\n");
   /* retrieve specific data layout attributes FIXME redundancy */
   enum mstro_cdo_attr_value_type type;
   const int64_t* val;
-  struct mio_iovec mvec;	
-  mvec.miov_base = (char*)data; // cast to MIO's type 
+  struct mio_iovec mvec;
+  /* apply the dst offset so distributed cdos land at the right place */
+  data = (void *) ((char *) data + ticket->dst_offset);
+  mvec.miov_base = (char*)data; // cast to MIO's type
   if (!(MSTRO_OK == mstro_cdo_attribute_get(
           dst, ".maestro.core.cdo.layout.pre-pad", &type, (const void**)&val))) {
     ERR("Failed to get size attribute pre-padding of CDO %s for transport\n", dst->name);
@@ -586,7 +582,7 @@ mstro_transport_mio_src_execute(mstro_cdo src, Mstro__Pool__TransferTicket* tick
 	struct mio_obj_id oid;
 	struct mio_obj_id semid;
 
-        assert(sizeof(struct mstro_cdo_id)==sizeof(struct mio_obj_id));
+	assert(sizeof(struct mstro_cdo_id)==sizeof(struct mio_obj_id));
 	memcpy(&semid, ticket->mio->semid.data, ticket->mio->semid.len);
 	memcpy(&oid, ticket->mio->objid.data, ticket->mio->objid.len);
 
@@ -625,7 +621,7 @@ mstro_transport_mio_dst_execute(mstro_cdo dst,
   }
   INFO("Executing mio transport dst side for CDO %s\n", dst->name);
 
-  int64_t len = ticket->data_size;
+  int64_t len = ticket->data_size; /* still needed by the DEBUG below */
   DEBUG("receiving %zu bytes\n", len);
   
   struct mstro_cdo_id oid;
@@ -640,7 +636,7 @@ mstro_transport_mio_dst_execute(mstro_cdo dst,
                    INFO("Sanity check object ID from ticket: %s\n",
                         idstr););
   
-  if (! (MSTRO_OK == mstro_mio_obj_read_sync(dst, (struct mio_obj_id*)&oid, len, (struct mio_obj_id*)&semid))) {
+  if (! (MSTRO_OK == mstro_mio_obj_read_sync(dst, ticket, (struct mio_obj_id*)&oid, (struct mio_obj_id*)&semid))) {
     /* Already printed an error */
     return MSTRO_FAIL;
   }
diff --git a/transport/rdma.c b/transport/rdma.c
index 6ad623a187a412ec575af7b0ba5f5ec5d6fc9704..8afb5cafa72be0908a9e9250a36803dfc6a3207a 100644
--- a/transport/rdma.c
+++ b/transport/rdma.c
@@ -222,7 +222,9 @@ FRESH_REGISTRATION:
       ERR("Couldn't get CDO data and size for RDMA transport (status: %s)\n", mstro_status_description(status));
       goto BAILOUT_UNLOCK;
     }
-    regentry->addr=dl.data;
+    /*Calculate the src ptr taking into account source offsets for dist_cdos */
+    regentry->addr=(void *) ((char *) dl.data+ticket->src_offset);
+    DEBUG("Moving cdo ptr by %zu as src offset \n", ticket->src_offset);
     regentry->len=dl.len;
 
     if (mstro_memlock(dl.data, dl.len) != MSTRO_OK) {
@@ -234,7 +236,7 @@ FRESH_REGISTRATION:
     /*  register this address for OFI */
     struct fid_mr *mr;
     uint64_t requested_key = e->ep->fi->domain_attr->mr_mode & FI_MR_PROV_KEY ? 0 : mstro_memory_new_key();
-    int err = fi_mr_reg(e->ep->domain, dl.data, dl.len,
+    int err = fi_mr_reg(e->ep->domain, regentry->addr, dl.len,
                         FI_REMOTE_READ, 0, requested_key, 0, &mr, NULL);
     if(err) {
       ERR("Couldn't register memory region for RDMA transport (err: %d)\n", err);
@@ -300,7 +302,7 @@ mstro_transport_rdma_src_execute_bh(Mstro__Pool__TransferCompleted *tc)
   struct mstro_pm_app_registry_entry *e;
   status = mstro_pm_app_lookup(tc->dstid->id, &e);
   if(e==NULL) {
-    ERR("Target %" PRIappid " not in app table\n", tc->dstid->id);
+    ERR("Target app %" PRIappid " not in app table\n", tc->dstid->id);
     return MSTRO_FAIL;
   } 
 
@@ -426,8 +428,12 @@ mstro_transport_rdma_dst_execute(mstro_cdo cdo_dst, Mstro__Pool__TransferTicket*
   }
 */
 
+  /* write data at the correct dst offset for distributed CDOs; cast via char* to keep the pointer arithmetic legal and silence compiler warnings */
+  void *dst_ptr = (void *)((char *)cdo_dst->raw_ptr + ticket->dst_offset);
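+  /* note: only the base pointer moves by dst_offset; the registered length is unchanged */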
+
   uint64_t requested_key = app_entry->ep->fi->domain_attr->mr_mode & FI_MR_PROV_KEY ? 0 : mstro_memory_new_key();
-  int err = fi_mr_reg(app_entry->ep->domain, cdo_dst->raw_ptr, len,
+  int err = fi_mr_reg(app_entry->ep->domain, dst_ptr, len,
                   FI_READ, 0, requested_key, 0, &mr, NULL);
   if (err) {
 	ERR("Couldn't register memory region for RDMA transport (err: %d, %s)\n", 
@@ -511,9 +516,9 @@ mstro_transport_rdma_dst_execute(mstro_cdo cdo_dst, Mstro__Pool__TransferTicket*
     goto BAILOUT;
   }
 
   int num_retries = 3;
  RETRY_RDMA_TRANSPORT_READ: ;
-  ret = fi_read(app_entry->ep->ep, cdo_dst->raw_ptr, len, local_buf_mr_desc,
+  ret = fi_read(app_entry->ep->ep, dst_ptr, len, local_buf_mr_desc,
                     app_entry->addr, mr_addr, mr_key, ctx);
 
   if(ret==-FI_EAGAIN) {
diff --git a/transport/transport.c b/transport/transport.c
index dde01505c35a182329fd470273f0b25fe30f6ffb..0ed0cf0d5fb025b87b74d35d26e203be733ff0df 100644
--- a/transport/transport.c
+++ b/transport/transport.c
@@ -132,6 +132,8 @@ mstro_transport__src_datalen_get(mstro_cdo src,
 {
   enum mstro_cdo_attr_value_type type;
   const int64_t* val;
+  bool is_distributed = ticket->distributed_cdo;
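+  /* a distributed ticket describes the intersection of src and dst pieces, not the full allocation */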
 
   const void *sizep=NULL;
   int64_t src_size;
@@ -148,30 +149,39 @@ mstro_transport__src_datalen_get(mstro_cdo src,
     return MSTRO_FAIL;
   }
 
-  if(src->mamba_array!=NULL) {
+  if((!is_distributed) && (src->mamba_array!=NULL)) { /* Not distributed and mamba array */
     /* mamba array wins */
     mmbArray* ma = src->mamba_array;
     dl->data = (char*)ma->allocation->ptr;
     dl->len = ma->allocation->n_bytes;
     DEBUG("CDO size is %zu (from mamba, attributes say %" PRIi64 ", ticket %" PRIi64"), mamba array at %p\n",
           dl->len, src_size, ticket->data_size, ma);
-  } else if(src->raw_ptr != NULL) {
+  } else if (is_distributed && (src->mamba_array!=NULL)) { /* Distributed and mamba array */
+    /* mamba array wins */
+    mmbArray* ma = src->mamba_array;
+    dl->data = (char*)ma->allocation->ptr;
+    dl->len = ticket->data_size; /* the ticket carries the actual size of the intersection */
+    DEBUG("Intersection size is %zu (from mamba, attributes say cdo size %" PRIi64 ", ticket %" PRIi64 "), mamba array at %p\n",
+          dl->len, src_size, ticket->data_size, ma);
+  } else if(src->raw_ptr != NULL) {
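+    /* NB: distributed CDOs are expected to be backed by mamba arrays and are
+       handled above; this raw_ptr path still assumes a full-size transfer */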
     assert(src_size==ticket->data_size);
     dl->len = ticket->data_size;
     dl->data = src->raw_ptr;
     DEBUG("CDO size is %zu, raw data at %p\n",
           dl->len, dl->data);
-  } else {
+  } else {
     /* both NULL */
     assert(ticket->data_size==0);
     dl->len = ticket->data_size;
     dl->data = src->raw_ptr;
     DEBUG("CDO size is %zu (from ticket, attributes say %" PRIi64 "), raw data at %p\n",
           dl->len, src_size, dl->data);
-  } 
-  
+  }
+
   NOISE("CDO size: %zu\n", dl->len);
-  
+
   return MSTRO_OK;
 }
 
@@ -281,7 +289,7 @@ mstro_transport_ticket_issue(
     return MSTRO_FAIL;
   }
   ticket->data_size = *(size_t*)val;
-  
+
   switch (ticket->ticket_case) {
     case MSTRO__POOL__TRANSFER_TICKET__TICKET_GFS: ;
       //char* filename = malloc(sizeof(char)*MSTRO_MAX_PATH_LEN);
@@ -289,16 +297,16 @@ mstro_transport_ticket_issue(
       //            gfs.path = filename;
       ticket->gfs->path = MSTRO_TRANSPORT_DEBUG_PATH;
       ticket->gfs->keep_file = 0; // Arbitrarily rm the transport file on dst
-      break; 
-      
+      break;
+
     case MSTRO__POOL__TRANSFER_TICKET__TICKET_MIO: ; //FIXME PM chooses a semid at initiate-transfer time
-      struct mstro_cdo_id* semid; 
+      struct mstro_cdo_id* semid;
       semid = malloc(sizeof(struct mio_obj_id));
       if (semid == NULL) {
         ERR("No more memory.\n");
         return MSTRO_FAIL;
       }
-      struct mstro_cdo_id* objid; 
+      struct mstro_cdo_id* objid;
       objid = malloc(sizeof(struct mio_obj_id));
       if (objid == NULL) {
         ERR("No more memory.\n");
@@ -317,38 +325,38 @@ mstro_transport_ticket_issue(
         ERR("Couldn't make an id from name for semaphore obj\n");
         return MSTRO_FAIL;
       }
-      
+
       WITH_MIO_OBJ_STR(idstr, (struct mstro_cdo_id*)semid,
                        INFO("Semaphore has ID: %s\n",
                             idstr););
       WITH_CDO_ID_STR(idstr, &(src_cdo->gid),
                       INFO("(CDO associated has ID: %s)\n",
                            idstr););
-      
+
       assert(sizeof(struct mstro_cdo_id) == 2*sizeof(uint64_t));
       assert(sizeof(struct mstro_cdo_id) == sizeof(struct mio_obj_id));
-      
+
       ticket->mio->semid.len = sizeof(struct mstro_cdo_id);
       ticket->mio->semid.data = semid;
       ticket->mio->objid.len = sizeof(struct mstro_cdo_id);
       objid->qw[0] = ticket->cdoid->qw0;
       objid->qw[1] = ticket->cdoid->qw1;
       ticket->mio->objid.data = objid;
-      
+
       ticket->mio->keep_obj = 0;
       break;
-      
+
     default:
       ERR("No ticket issueing case for this transport\n");
       return MSTRO_UNIMPL;
   }
 #endif
-  return MSTRO_UNIMPL; /* Issuing is done in maestro/ofi.c */ 
+  return MSTRO_UNIMPL; /* Issuing is done in maestro/ofi.c */
 }
 
 mstro_status
 mstro_transport_execute(
-    mstro_cdo  cdo, 
+    mstro_cdo  cdo,
     Mstro__Pool__TransferTicket* ticket)
 {
   if(cdo == NULL || ticket == NULL) {
@@ -356,7 +364,9 @@ mstro_transport_execute(
     return MSTRO_INVARG;
   }
   mstro_status s = MSTRO_OK;
-
+  /* Is this a ticket for a distributed CDO? */
+  bool is_distributed = ticket->distributed_cdo;
+
   if (cdo->state & MSTRO_CDO_STATE_OFFERED) {
     /* we are called on the source side of transport */
     {
@@ -365,25 +375,30 @@ mstro_transport_execute(
       const void *sizep=NULL;
       int64_t src_size;
       mstro_status s = mstro_attribute_dict_get(cdo->attributes,
-	  MSTRO_ATTR_CORE_CDO_SCOPE_LOCAL_SIZE,
-	  &type, &sizep, NULL, false);
+                                                MSTRO_ATTR_CORE_CDO_SCOPE_LOCAL_SIZE,
+                                                &type, &sizep, NULL, false);
       if(s==MSTRO_OK) {
-	src_size=*(int64_t*)sizep;
+        src_size=*(int64_t*)sizep;
       } else if(s==MSTRO_NOENT && type==MSTRO_CDO_ATTR_VALUE_INVALID) {
-	ERR("Attribute not found or value invalid\n");
-	src_size=-1;
+	      ERR("Attribute not found or value invalid\n");
+	      src_size=-1;
       } else {
-	ERR("Failed to look up %s (%d: %s)\n",
-	    MSTRO_ATTR_CORE_CDO_SCOPE_LOCAL_SIZE, s, mstro_status_description(s));
-	return MSTRO_FAIL;
+	      ERR("Failed to look up %s (%d: %s)\n",
+	      MSTRO_ATTR_CORE_CDO_SCOPE_LOCAL_SIZE, s, mstro_status_description(s));
+	      return MSTRO_FAIL;
       }
-
+
       if (ticket->data_size == 0) {
-	DEBUG("0-transport\n");
-      } else if (src_size != ticket->data_size) {
-	ERR("SRC cdo has a size (%" PRIi64 ") that doesn't match ticket-specified size (%" PRIi64 ")\n",
-	    src_size, ticket->data_size);
-	return MSTRO_FAIL;
+	      DEBUG("0-transport\n");
+      } else if ((src_size != ticket->data_size) && (!is_distributed)) {
+	      ERR("SRC cdo has a size (%" PRIi64 ") that doesn't match ticket-specified size (%" PRIi64 ")\n",
+	      src_size, ticket->data_size);
+        return MSTRO_FAIL;
+      } else if ((src_size < ticket->data_size) && (is_distributed)) {
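+        /* a distributed ticket may cover less than the full local size, but never more */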
+        ERR("SRC cdo size (%" PRIi64 ") is less than ticket-specified intersection length (%" PRIi64 ")\n",
+	      src_size, ticket->data_size);
+        return MSTRO_FAIL;
       }
     }
     mstro_status (*f)(mstro_cdo src, Mstro__Pool__TransferTicket *t)
@@ -394,7 +408,7 @@ mstro_transport_execute(
       return MSTRO_FAIL;
     }
     if (ticket->data_size > 0) {
-      s=f(cdo, ticket);	
+      s=f(cdo, ticket);
     } else {
       DEBUG("Skipping (SRC-side) 0-length transfer\n");
     }
@@ -414,7 +428,47 @@ mstro_transport_execute(
 
     /* FIXME: we should also decide the most suitable memory layer
      * here for our space adjustment */
-    s = mstro_cdo__adjust_space(cdo, ticket->data_size, policy);
+
+    /* for a distributed CDO the size must be calculated from the distribution,
+     * not from the size received in the ticket, which may cover only one piece of the required data */
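+    /* e.g., for a 1-D block distribution of 1000 elements over 4 ranks each
+     * local piece holds 250 elements, while a single ticket may describe just
+     * the overlap of one source piece with this destination piece */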
+
+    if(is_distributed) {
+      /* calculate the CDO size from the distribution */
+      int64_t cdo_size = 0;
+      mmbLayout *dst_layout = NULL;
+      mmbLayout *src_layout = NULL;
+      mmbError stat;
+      s = mstro_attribute_pool_find_dist_layout(cdo->attributes_msg, &dst_layout);
+      assert(s == MSTRO_OK);
+
+      if(dst_layout == NULL) {
+        /* cook a default layout for dst from src */
+        s = mstro_attribute_pool_find_dist_layout(ticket->attributes, &src_layout);
+        assert(s == MSTRO_OK);
+        stat = mmb_layout_dist_create_default_layout(src_layout, &dst_layout);
+        assert(stat == MMB_OK);
+      }
+      stat = mmb_layout_dist_find_piece_size(dst_layout, &cdo_size);
+      assert(stat == MMB_OK);
+      DEBUG("Size of my piece as calculated from my distribution layout is %"PRId64" \n", cdo_size);
+
+      /* free up allocated layout objects */
+      if(src_layout) {
+        mmb_layout_destroy(src_layout);
+      }
+      if(dst_layout) {
+        mmb_layout_destroy(dst_layout);
+      }
+
+      s = mstro_cdo__adjust_space(cdo, cdo_size, policy);
+    }
+    else {
+      s = mstro_cdo__adjust_space(cdo, ticket->data_size, policy);
+    }
+
     if(s!=MSTRO_OK) {
       ERR("Failed to adjust space of CDO destination handle\n");
       return s;
@@ -437,8 +488,8 @@ mstro_transport_execute(
     if (ticket->data_size > 0) {
       s=f(cdo, ticket);
       if(s!=MSTRO_OK) {
-	ERR("Incoming transfer failed\n");
-	goto BAILOUT;
+	      ERR("Incoming transfer failed\n");
+	      goto BAILOUT;
       }
     } else {
       DEBUG("Skipping (DST-side) 0-length transfer\n");
@@ -453,7 +504,7 @@ mstro_transport_execute(
 
     /* layout-related attributes and a few more are always precious, as
      * transformations will have handled them */
-#define NUM_PRECIOUS 7
+#define NUM_PRECIOUS 8
     const size_t num_precious_attr = NUM_PRECIOUS;
     const char *precious_attributes[NUM_PRECIOUS] = {
       MSTRO_ATTR_CORE_CDO_RAW_PTR,
@@ -462,7 +513,9 @@ mstro_transport_execute(
       MSTRO_ATTR_CORE_CDO_LAYOUT_ELEMENT_SIZE,
       MSTRO_ATTR_CORE_CDO_LAYOUT_NDIMS,
       MSTRO_ATTR_CORE_CDO_LAYOUT_ORDER,
-      MSTRO_ATTR_CORE_CDO_LAYOUT_DIMS_SIZE
+      MSTRO_ATTR_CORE_CDO_LAYOUT_DIMS_SIZE,
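+      /* NB: precious so that the dst's own distribution layout is not clobbered by the src's */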
+      MSTRO_ATTR_CORE_CDO_DIST_LAYOUT
     };
 
     s = mstro_cdo_attributes_update_incoming(cdo, ticket->attributes,
@@ -488,19 +540,19 @@ mstro_transport_execute(
       struct mstro_cdo_id srccdoid = {.qw[0] = ticket->srccdoid->qw0,
                                       .qw[1] = ticket->srccdoid->qw1,
                                       .local_id = ticket->srccdoid->local_id };
-                                      
+
       s = mstro_pc__transport_send_completion(ticket->srcid->id,
                                               &srccdoid,&cdo->gid,
                                               ticket->want_completion);
       if (s!= MSTRO_OK) {
-	ERR("Couldn't send completion for RDMA transport of CDO `%s`\n", cdo->name);
-	goto BAILOUT;
+	      ERR("Couldn't send completion for RDMA transport of CDO `%s`\n", cdo->name);
+	      goto BAILOUT;
       }
 
       /* indicate that CDO is filled */
       s = mstro_cdo__mark_transfer_complete(cdo);
       if(s!=MSTRO_OK) {
-	ERR("Failed to indicate completion of incoming CDO: %d (%s)\n",
+	      ERR("Failed to indicate completion of incoming CDO: %d (%s)\n",
 	    s, mstro_status_description(s));
       }
     }
@@ -509,9 +561,7 @@ mstro_transport_execute(
 	cdo->name, cdo->state);
     return MSTRO_FAIL;
   }
-  
-BAILOUT:  
+
+BAILOUT:
   return s;
 }
-
-