From 0a87bdf3e9d3a71e8caa4a2849f1e1e682a73737 Mon Sep 17 00:00:00 2001
From: George Katevenis
Date: Thu, 31 Oct 2024 11:03:35 +0200
Subject: [PATCH] coll/xhc: Bring in latest developments

Multiple changes, new features, and enhancements

Notable new features:

- Inlining data in control path for extra small messages.
- Allreduce goes through Bcast's implementation instead of duplicating it.
- Fine-grained op-wise tuning for hierarchy, chunk size, cico threshold.
- Reduce reworked & improved, with double buffering and other optimizations.
- When smsc support is not present, handle only small messages (below cico
  threshold), instead of not at all.

Signed-off-by: George Katevenis
---
 ompi/mca/coll/xhc/Makefile.am          |    6 +-
 ompi/mca/coll/xhc/README.md            |  461 +++++---
 ompi/mca/coll/xhc/coll_xhc.c           |  965 ++++++++--------
 ompi/mca/coll/xhc/coll_xhc.h           |  625 ++++++----
 ompi/mca/coll/xhc/coll_xhc_allreduce.c | 1474 ++++++++++++++----------
 ompi/mca/coll/xhc/coll_xhc_atomic.h    |  116 --
 ompi/mca/coll/xhc/coll_xhc_barrier.c   |   99 +-
 ompi/mca/coll/xhc/coll_xhc_bcast.c     |  516 +++++----
 ompi/mca/coll/xhc/coll_xhc_comm.c      |  382 ++++++
 ompi/mca/coll/xhc/coll_xhc_component.c |  570 ++++++---
 ompi/mca/coll/xhc/coll_xhc_hierarchy.c |  403 +++++++
 ompi/mca/coll/xhc/coll_xhc_intrinsic.h |  151 +++
 ompi/mca/coll/xhc/coll_xhc_module.c    |  718 +++---------
 ompi/mca/coll/xhc/coll_xhc_reduce.c    |   15 +-
 ompi/mca/coll/xhc/help-coll-xhc.txt    |   37 +-
 15 files changed, 3935 insertions(+), 2603 deletions(-)
 delete mode 100644 ompi/mca/coll/xhc/coll_xhc_atomic.h
 create mode 100644 ompi/mca/coll/xhc/coll_xhc_comm.c
 create mode 100644 ompi/mca/coll/xhc/coll_xhc_hierarchy.c
 create mode 100644 ompi/mca/coll/xhc/coll_xhc_intrinsic.h

diff --git a/ompi/mca/coll/xhc/Makefile.am b/ompi/mca/coll/xhc/Makefile.am
index 35db0b89c12..0dab908bc75 100644
--- a/ompi/mca/coll/xhc/Makefile.am
+++ b/ompi/mca/coll/xhc/Makefile.am
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV)
+# Copyright (c) 2021-2024 Computer Architecture and VLSI Systems (CARV)
 #                         Laboratory, ICS Forth. All rights reserved.
 # $COPYRIGHT$
 #
@@ -12,8 +12,10 @@ dist_opaldata_DATA = help-coll-xhc.txt
 sources = \
     coll_xhc.h \
-    coll_xhc_atomic.h \
+    coll_xhc_intrinsic.h \
     coll_xhc.c \
+    coll_xhc_comm.c \
+    coll_xhc_hierarchy.c \
     coll_xhc_component.c \
     coll_xhc_module.c \
     coll_xhc_bcast.c \
diff --git a/ompi/mca/coll/xhc/README.md b/ompi/mca/coll/xhc/README.md
index 325170b7179..213062a5edc 100644
--- a/ompi/mca/coll/xhc/README.md
+++ b/ompi/mca/coll/xhc/README.md
@@ -1,64 +1,79 @@
 # XHC: XPMEM-based Hierarchical Collectives
 
-The XHC component, implements hierarchical & topology-aware intra-node MPI
-collectives, utilizing XPMEM in order to achieve efficient shared address space
-memory access between processes.
+XHC implements hierarchical & topology-aware intra-node MPI collectives,
+utilizing XPMEM for efficient shared address space memory access between
+processes.
 
 ## Main features
 
-* Constructs an **n-level hierarchy** (i.e. no algorithmic limitation on level
-count), following the system's hardware topology. Ranks/processes are grouped
-together according to their relative locations; this information is known
-thanks to Hwloc, and is obtained via OpenMPI's integrated book-keeping.
- - Topological features that can currently be defined (configurable via MCA params): - - - NUMA node - - CPU Socket - - L1, L2, L3 cache - - Hwthread, core - - Node/flat (no hierarchy) - - Example of a 3-level XHC hierarchy (sensitivity to numa & socket locality): - - ![Example of 3-level XHC hierarchy](resources/xhc-hierarchy.svg) - - Furthermore, support for custom virtual user-defined hierarchies is - available, to aid when fine-grained control over the communication pattern - is necessary. - -* Support for both **zero-copy** and **copy-in-copy-out** data transportation. - - Switchover at static but configurable message size. - - - CICO buffers permanently attached at module initialization - - - Application buffers attached on the fly the first time they appear, saved - on and recovered from registration cache in subsequent appearances. - (assuming smsc/xpmem) - -* Integration with Open MPI's `opal/smsc` (shared-memory-single-copy) -framework. Selection of `smsc/xpmem` is highly recommended. - - - Bcast support: XPMEM, CMA, KNEM - - Allreduce support: XPMEM - - Barrier support: *(all, irrelevant)* - -* Data-wise **pipelining** across all levels of the hierarchy allows for -lowering hierarchy-induced start-up overheads. Pipelining also allows for -interleaving of operations in some collectives (reduce+bcast in allreduce). - -* **Lock-free** single-writer synchronization, with cache-line separation where -necessary/beneficial. Consistency ensured via lightweight memory barriers. +* XHC constructs an **n-level hierarchy** (i.e. no algorithmic limitation on +level count), based on intra-node topological features. Rank/process locality +information is known thanks to Hwloc, and is obtained from Open MPI's +integrated book-keeping. + + Topological features that can currently be defined: + + - NUMA node + - CPU Socket + - L1/L2/L3 cache + - Hwthread/core + - Node (all ranks *are* in same node --> flat, no hierarchy at all) + + Example of a 3-level XHC hierarchy (numa+socket+node configuration): + + ![Example of 3-level XHC hierarchy](resources/xhc-hierarchy.svg) + + Furthermore, support for custom virtual user-defined hierarchies is + available, to allow fine-grained control over the communication pattern. + +* **Single-copy** transportation + + - Supported through integration with Open MPI's `opal/smsc` + (shared-memory-single-copy) framework. Selecting `smsc/xpmem` is highly + recommended. + + - Bcast support: XPMEM, CMA, KNEM + - Allreduce/Reduce support: XPMEM + - Barrier support: *(irrelevant)* + + - Application buffers are attached on the fly the first time they appear, + saved on and recovered from the registration cache in subsequent + appearances. (assuming smsc/xpmem) + +* **Copy-in-copy-out (CICO)** transportation + + - Through shared memory buffers that remain active throughout the + component's lifetime. + + - Switchover with single-copy at configurable message size. + + - Supported in all ops, regardless of smsc support or XPMEM presence (up to + maximum allowed message size). + +* **Inline** transportation + + - For especially small messages, payload data is inlined in the same cache + line as the control data. + + - Supported in all ops, regardless of smsc support or XPMEM presence (up to + maximum allowed message size). + +* Data-wise **pipelining** across all levels of the hierarchy. Allows for +lowering hierarchy-induced start-up overheads, and interleaving of operations +in applicable operations (e.g. reduce+bcast in allreduce). 
+ +* **Lock-free** single-writer synchronization, with appropriate cache-line +separation where necessary. Consistency ensured via lightweight *read* or +*write* memory barriers. ## Configuration options -- MCA params XHC can be customized via a number of standard Open MPI MCA parameters, though defaults that should satisfy a wide number of systems are in place. -The available parameters: +The available parameters (also found in `coll_xhc_component.c`): #### *(prepend with "coll_xhc_")* -*(list may be outdated, please also check `ompi_info` and `coll_xhc_component.c`)* * **priority** (default `0`): The priority of the coll/xhc component, used during the component selection process. @@ -70,138 +85,146 @@ hierarchy and its configuration. used for XHC's synchronization fields and CICO buffers. * **dynamic_leader** (default `false`): Enables the feature that dynamically -elects an XHC group leader at each collective (currently only applicable -to bcast). - -* **dynamic_reduce** (default `1`=`non-float`): Controls the -feature that allows for out-of-order reduction. XHC ranks reduce chunks -directly from multiple peers' buffers; dynamic reduction allows them to -temporarily skip a peer when the expected data is not yet prepared, instead of -stalling. Setting to `2`=`all`, might/will harm reproducibility of float-based -reductions. - -* **coll_xhc_lb_reduce_leader_assist** (default `top,first`): Controls the -leader-to-member load balancing mode in reductions. If set to none/empty (`""`) -only non-leader group members perform reductions. With `top` in the list, the -leader of the top-most level also performs reductions in his group. With -`first` in the list, leaders will help in the reduction workload for just one -chunk at the beginning of the operation. If `all` is specified, all group -members, including the leaders, perform reductions indiscriminately. - -* **force_reduce** (default `false`): Force enable the "special" Reduce -implementation for all calls to MPI_Reduce. This implementation assumes that -the `rbuf` parameter to MPI_Reduce is valid and appropriately sized for all -ranks; not just the root -- you have to make sure that this is indeed the case -with the application at hand. Only works with `root = 0`. +elects an XHC-communicator leader at each collective (currently only applicable +for bcast). + +* **dynamic_reduce** (default `1`=`non-float`): Enables support for +out-of-order reduction. Ranks fetch data to reduce from multiple peers; +out-of-order reduction allows them to temporarily skip a peer when the expected +data is not yet prepared, instead of stalling. The default value auto-enables +it when the data is of non-float type; setting to `2`=`enabled for all types`, +might/will harm reproducibility of reductions with float types. + +* **reduce_load_balance** (default `0`=`non-leader`): Controls the +leader-to-member load balancing mode in reductions. Under `non-leader`, the +members, and not the leaders, perform reductions. With `top-level`, all members +as well as the leader of the top-most level perform reductions. With +`first-chunk`, leaders perform a single reduction on each level for a single +chunk at the beginning of the operation. `top+first` combines `top-level` and +`first-chunk`. Finally, with `all`, all ranks perform reductions equally. * **hierarchy** (default `"numa,socket"`): A comma separated list of topological feature to which XHC's hierarchy-building algorithm should be sensitive. `ompi_info` reports the possible values for the parameter. 
- - - In some ways, this is "just" a suggestion. The resulting hierarchy may - not exactly match the requested one. Reasons that this will occur: - - - A requested topological feature does not effectively segment the set - of ranks. (eg. `numa` was specified, but all ranks reside in the same - NUMA node) - - - No feature that all ranks have in common was provided. This a more - intrinsic detail, that you probably don't need to be aware of, but you - might come across if eg. you investigate the output of `print_info`. An - additional level will automatically be added in this case, no need to - worry about it. - - For all intents and purposes, a hierarchy of `numa,socket` is - interpreted as "segment the ranks according to NUMA node locality, - and then further segment them according to CPU socket locality". - Three groups will be created: the intra-NUMA one, the intra-socket - one, and an intra-node one. - - - The provided features will automatically be re-ordered when their - order does not match their order in the physical system. (unless a - virtual hierarchy feature is present in the list) - - - *Virtual Hierarchies*: The string may alternatively also contain "rank - lists" which specify exactly which ranks to group together, as well as some - other special modifiers. See in - `coll_xhc_component.c:xhc_component_parse_hierarchy()` for further - explanation as well as syntax information. + + - In some ways, this is "just" a suggestion. The resulting hierarchy may + not exactly match the requested one. Reasons that this will occur: + + - A requested topological feature does not effectively segment the set + of ranks. (eg. `numa` was specified, but all ranks reside in the same + NUMA node) + + - No feature that all ranks have in common was provided. This a more + intrinsic detail, that you probably don't need to be aware of, but you + might come across if eg. you investigate the output of `print_info`. An + additional level will automatically be added in this case, no need to + worry about it. + + For all intents and purposes, a hierarchy of `numa,socket` is + interpreted as "segment the ranks according to NUMA node locality, + and then further segment them according to CPU socket locality". + + - The provided features will automatically be re-ordered when their + order does not match their order in the physical system. (unless a + virtual hierarchy is present in the list) + + - *Virtual Hierarchies*: The string may alternatively also contain "rank + lists" which specify exactly which ranks to group together, as well as some + other special modifiers. See + `coll_xhc_component.c:xhc_component_parse_hierarchy()` for further + explanation as well as syntax information. * **chunk_size** (default `16K`): The chunk size for the pipelining process. Data is processed (eg broadcast, reduced) in this-much sized pieces at once. - - - It's possible to have a different chunk size for each level of the - hierarchy, achieved via providing a comma-separated list of sizes (eg. - `"16K,16K,128K"`) instead of single one. The sizes in this list's *DO NOT* - correspond to the items on hierarchy list; the hierarchy keys might be - re-ordered or reduced to match the system, but the chunk sizes will be - consumed in the order they are given, left-to-right -> bottom-to-top. + + - It's possible to have a different chunk size for each level of the + hierarchy, achieved via providing a comma-separated list of sizes (eg. + `"16K,16K,128K"`) instead of single one. 
The sizes in this list *DO NOT*
+    correspond to the items on the hierarchy list; the hierarchy keys might be
+    re-ordered or reduced to match the system, but the chunk sizes will be
+    consumed in the order they are given, left-to-right -> bottom-to-top.
 
 * **uniform_chunks** (default `true`): Automatically optimize the chunk size
 in reduction collectives, according to the message size, so that all members
 will perform equal work.
 
 * **uniform_chunks_min** (default `1K`): The lowest allowed value for the chunk
-size when uniform chunks are enabled. Each worker will reduce at least this much
-data, or we don't bother splitting the workload up.
+size when uniform chunks are enabled.
 
-* **cico_max** (default `1K`): Copy-in-copy-out, instead of single-copy, will be
-used for messages of *cico_max* or less bytes.
+* **cico_max** (default `1K`): Copy-in-copy-out, instead of single-copy, will
+be used for messages of *cico_max* or less bytes.
 
 *(Removed Parameters)*
 
 * **rcache_max**, **rcache_max_global** *(REMOVED with shift to opal/smsc)*:
 Limit to number of attachments that the registration cache should hold.
-
-    - A case can be made about their usefulness. If desired, should be
-    re-implemented at smsc-level.
+
+    - A case can be made about their usefulness. If desired, shall be
+    re-implemented at smsc-level.
 
 ## Limitations
 
 - *Intra-node support only*
-    - Usage in multi-node scenarios is possible via OpenMPI's HAN.
+    - Define XHC as `coll/HAN`'s intra-node component to reap its benefits in
+    multi-node runs.
 
 - **Heterogeneity**: XHC does not support nodes with non-uniform (rank-wise)
-datatype representations. (determined according to `proc_arch` field)
+datatype representations. (determined according to Open MPI's `proc_arch`)
 
 - **Non-commutative** operators are not supported by XHC's reduction
-collectives. In past versions, they were supported, but only with the flat
-hierarchy configuration; this could make a return at some point.
+collectives. In past versions, they were, but only with a flat hierarchy; this
+could make a return at some point.
+
+- **Derived Datatypes** are currently not supported.
 
-- XHC's Reduce is not fully complete. Instead, it is a "special" implementation
-of MPI_Reduce, that is realized as a sub-case of XHC's Allreduce.
+- XHC's Reduce currently only supports rank 0 as the root, and will
+automatically fall back to another component for other cases.
 
-    - If the caller guarantees that the `rbuf` parameter is valid for all ranks
-    (not just the root), like in Allreduce, this special Reduce can be invoked
-    by specifying `root=-1`, which will trigger a Reduce to rank `0` (the only
-    one currently supported).
+## Building
 
-    - Current prime use-case: HAN's Allreduce
+This section describes how to compile the XHC component.
 
-    - Furthermore, if it is guaranteed that all Reduce calls in an application
-    satisfy the above criteria, see about the `force_reduce` MCA parameter.
+XPMEM support in Open MPI is required to reap the full benefits of XHC.
+
+- The XHC component will build and work without XPMEM support, but for large
+messages (i.e. ones above the CICO threshold) Allreduce/Reduce will be
+disabled, and Broadcast will fall back to less efficient mechanisms.
 
-    - XHC's Reduce is not yet fully optimized for small messages.
+- XPMEM can be obtained from , and then
+compiled like a common kernel module. You might need to manually point Open
+MPI's configure script to XPMEM's installation location, via the
+`--with-xpmem=` parameter.
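    - As an illustration only (the installation prefix below is a placeholder,
    not a requirement), the configure step might include something like
    `./configure --with-xpmem=/opt/xpmem ...`, combined with your usual
    Open MPI configure options.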
-## Building +- At run-time, you will need to insert the kernel module and obtain proper +access rights to `/dev/xpmem`. -XHC is built as a standard mca/coll component. +Apart from instructing Open MPI to include XPMEM support, the rest of the build +process is standard. General information on building Open MPI can be found in +its documentation. -To reap its full benefits, XPMEM support in OpenMPI is required. XHC will build -and work without it, but the reduction operations will be disabled and -broadcast will fall-back to less efficient mechanisms (CMA, KNEM). + + + ## Running -In order for the XHC component to be chosen, make sure that its priority is -higher than other components that provide the collectives of interest; use the -`coll_xhc_priority` MCA parameter. If a list of collective modules is included -via the `coll` MCA parameter, make sure XHC is in the list. +General information on running Open MPI jobs can be found here: + + + +`mpirun`'s man page will also be useful: + -* You may also want to add the `--bind-to core` param. Otherwise, the reported -process localities might be too general, preventing XHC from correctly -segmenting the system. (`coll_xhc_print_info` will report the generated -hierarchy) +In order for the XHC component to be chosen, its priority must be manually set +higher than other collectives components that implement the same primitives, +via the `coll_xhc_priority` MCA param. + + - Example: `--mca coll_xhc_priority 100` + +* Most likely, you will also want the `--bind-to core` param. Otherwise, the +reported process localities might be too general, preventing XHC from correctly +segmenting the system. (MCA `coll_xhc_print_info` will report the generated +hierarchy if you wish to experiment) ### Tuning @@ -209,74 +232,140 @@ hierarchy) XHC's hierarchy should conform to. The default is `numa,socket`, which will group the processes according to NUMA locality and then further group them according to socket locality. See the `coll_xhc_hierarchy` param. - - - Example: `--mca coll_xhc_hierarchy numa,socket` - - Example: `--mca coll_xhc_hierarchy numa` - - Example: `--mca coll_xhc_hierarchy flat` - - In some systems, small-message Broadcast or the Barrier operation might - perform better with a flat tree instead of a hierarchical one. Currently, - manual benchmarking is required to accurately determine this. + + - Example: `--mca coll_xhc_hierarchy numa,socket` + - Example: `--mca coll_xhc_hierarchy numa` + - Example: `--mca coll_xhc_hierarchy flat` + + In some systems, small-message Broadcast or the Barrier operation might + perform better with a flat tree instead of a hierarchical one. Currently, + manual benchmarking is required to accurately determine this. * Optional: You might wish to tune XHC's chunk size (default `16K`). Use the `coll_xhc_chunk_size` param, and try values close to the default and see if -improvements are observed. You may even try specifying different chunk sizes -for each hierarchy level -- use the same process, starting from the same chunk -size for all levels and decreasing/increasing from there. - - - Example: `--mca coll_xhc_chunk_size 16K` - - Example: `--mca coll_xhc_chunk_size 16K,32K,128K` +improvements are observed. -* Optional: If you wish to focus on latencies of small messages, you can try -altering the cico-to-zcopy switchover point (`coll_xhc_cico_max`, default -`1K`). 
+ - Example: `--mca coll_xhc_chunk_size 16K`
 
- - Example: `--mca coll_xhc_cico_max 1K`
+* Optional: If you wish to focus on latencies of small/medium size messages,
+you can try altering the cico-to-zcopy switchover point (MCA
+`coll_xhc_cico_max`, default `1K`).
+
+ - Example: `--mca coll_xhc_cico_max 1K`
 
 * Optional: If your application is heavy in Broadcast calls and you suspect
 that specific ranks might be joining the collective with delay and causing
-others to stall waiting for them, you could try enabling dynamic leadership
-(`coll_xhc_dynamic_leader`), and seeing if it marks an improvement.
-
- - Example: `--mca coll_xhc_dynamic_leader 1`
+others to stall waiting for them, try enabling dynamic leadership (MCA
+`coll_xhc_dynamic_leader`), and see if it makes an improvement. Please let
+us know if it does :-).
+
+ - Example: `--mca coll_xhc_dynamic_leader 1`
 
 ### Example command lines
 
 *Assuming `PATH` and `LD_LIBRARY_PATH` have been set appropriately.*
 
 Default XHC configuration:
-`$ mpirun --mca coll libnbc,basic,xhc --mca coll_xhc_priority 100 --bind-to core `
+`$ mpirun --mca coll_xhc_priority 100 --bind-to core `
 
 XHC w/ numa-sensitive hierarchy, chunk size @ 16K:
-`$ mpirun --mca coll libnbc,basic,xhc --mca coll_xhc_priority 100 --mca coll_xhc_hierarchy numa --mca coll_xhc_chunk_size 16K --bind-to core `
+`$ mpirun --mca coll_xhc_priority 100 --mca coll_xhc_hierarchy numa --mca coll_xhc_chunk_size 16K --bind-to core `
 
 XHC with flat hierarchy (ie. none at all):
-`$ mpirun --mca coll libnbc,basic,xhc --mca coll_xhc_priority 100 --mca coll_xhc_hierarchy node [--bind-to core] `
+`$ mpirun --mca coll_xhc_priority 100 --mca coll_xhc_hierarchy node [--bind-to core] `
 
-## Publications
+## Benchmarking
 
-1. **A framework for hierarchical single-copy MPI collectives on multicore nodes**,
-*George Katevenis, Manolis Ploumidis, Manolis Marazakis*,
-IEEE Cluster 2022, Heidelberg, Germany.
-https://ieeexplore.ieee.org/document/9912729
+This section outlines some tips for benchmarking XHC and intra-node MPI
+collectives in general.
 
-## Contact
+### Micro-Benchmarks
 
-- George Katevenis (gkatev@ics.forth.gr)
-- Manolis Ploumidis (ploumid@ics.forth.gr)
+For our micro-benchmarking purposes, we have been using [OSU's microbenchmark
+suite](https://mvapich.cse.ohio-state.edu/benchmarks/). However, when
+micro-benchmarking intra-node collectives, there are some important details
+that one needs to look out for.
 
-Computer Architecture and VLSI Systems (CARV) Laboratory, ICS Forth
+**CPU Cache** An issue with the OSU micro-benchmarks is that they use the same
+buffer for each iteration without altering it. Since modern processors
+implicitly cache data, this can lead to false/unrealistic/unrepresentative
+results, given that actual real-world applications do not (usually/optimally!)
+perform duplicate operations.
+
+Availability of collective operation source data on a processor's local cache
+hierarchy will cause certain phenomena (e.g. slow path memory transactions)
+and their effects to remain hidden and undetected in the micro-benchmarking
+process, even though they *will* negatively impact performance in actual
+applications.
+
+We have created "data-varying" (`_dv` suffix) benchmarks to counter this
+problem, which will alter the data before each iteration.
+
+**Microbenchmark's pre-op Barrier** One also needs to be aware of how the
+barrier that appears before each iteration in the OSU micro-benchmarks affects
+the result, especially so when latencies of small messages are concerned.
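To make the role of this pre-op barrier concrete, the measurement loop is
roughly shaped as follows (an illustrative sketch, not the actual OSU code;
`buf`, `msg_size` and `iterations` are placeholder names):

```c
#include <mpi.h>

/* Sketch of an OSU-style latency loop. The barrier before every timed
 * iteration "lines up" the ranks; how fast and how synchronized they leave
 * it directly colors the reported latency of the benchmarked collective. */
static double time_bcast(void *buf, int msg_size, int iterations, MPI_Comm comm)
{
    double t_total = 0.0;

    for (int i = 0; i < iterations; i++) {
        MPI_Barrier(comm);                    /* pre-op barrier */

        double t_start = MPI_Wtime();
        MPI_Bcast(buf, msg_size, MPI_CHAR, 0, comm);
        t_total += MPI_Wtime() - t_start;     /* time attributed to the bcast */
    }

    return t_total / iterations;              /* mean per-iteration latency */
}
```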
+The underlying implementation of this barrier and the speed/efficiency of its
+"release stage" will affect how fast and how synchronized ranks will exit the
+barrier, and therefore how fast/synchronized they will enter the benchmarked
+collective operation.
+
+For as accurate/clean performance reporting as possible, use a barrier
+implementation that has as low a latency as possible. Furthermore, ideally,
+all ranks should exit the barrier at the exact same time -- this is more
+complex to measure, but can make a difference. In order to have a common
+baseline when benchmarking and comparing multiple collective implementations,
+use this same barrier implementation for all benchmark scenarios.
+
+In the environments we tested, XHC's barrier was the best-performing one. To
+make using this barrier easier, we have put together a small new collective
+component, `XB` (= xhc barrier).
 
-## Acknowledgments
+XB creates a new nested (duplicate) communicator with a hint to prioritize XHC,
+and delegates barrier operations to it. A slightly inconvenient side-effect is
+that XHC needs to be on the coll list (MCA `--mca coll`); it doesn't need to
+have a high priority, though it can't be less than 0.
 
-We thankfully acknowledge the support of the European Commission and the Greek
-General Secretariat for Research and Innovation under the EuroHPC Programme
-through the **DEEP-SEA** project (GA 955606). National contributions from the
-involved state members (including the Greek General Secretariat for Research
-and Innovation) match the EuroHPC funding.
+* To benchmark Open MPI's `coll/tuned` with XB: `--mca coll basic,libnbc,tuned,xb,xhc --mca coll_xhc_priority 0 --mca coll_xb_priority 95 --mca coll_tuned_priority 90`
 
-This work is partly supported by project **EUPEX**, which has received funding
-from the European High-Performance Computing Joint Undertaking (JU) under grant
-agreement No 101033975. The JU receives support from the European Union's
-Horizon 2020 re-search and innovation programme and France, Germany, Italy,
-Greece, United Kingdom, Czech Republic, Croatia.
+* Or XHC itself, with XB: `--mca coll basic,libnbc,xb,xhc --mca coll_xhc_priority 90 --mca coll_xb_priority 95`
+
+It is also possible to specify the hierarchy to be used for XB's barrier (the
+request will be passed in string form to XHC, only for the nested communicator)
+via the `coll_xb_hierarchy` MCA parameter.
+
+In our fork of the OSU micro-benchmarks, you will also find
+"integrity-checking" variants (`_integrity` suffix). These can help verify that
+collective operations complete successfully without data corruption.
+
+Our OSU micro-benchmarks fork:
+
+
+The XB component:
+
+
+### Applications
+
+We expect to see meaningful performance improvement with XHC in actual
+applications only if they spend a non-insignificant percentage of their
+runtime in the collective operations that XHC implements: Broadcast, Barrier,
+Allreduce, Reduce.
+
+One known such application is [miniAMR](https://github.com/Mantevo/miniAMR).
+The application parameters (e.g. the refine count and frequency) will affect
+the amount of time spent in the Allreduce primitive.
+
+Another one is Microsoft's [CNTK](https://github.com/microsoft/CNTK), also
+heavy in Allreduce, though it actually makes use of the non-blocking
+`Iallreduce` variant. However, it can easily be converted to use the blocking
+variant instead (contact for patch).
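For illustration, the conversion amounts to something like the following
sketch (not the actual CNTK patch; the function and buffer names are
placeholders for CNTK's gradient aggregation):

```c
#include <mpi.h>

/* Before: non-blocking allreduce that is immediately waited upon. */
static void sum_gradients_nonblocking(const float *sendbuf, float *recvbuf,
        int count, MPI_Comm comm)
{
    MPI_Request req;
    MPI_Iallreduce(sendbuf, recvbuf, count, MPI_FLOAT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);
}

/* After: blocking equivalent, which XHC's Allreduce can then service. */
static void sum_gradients_blocking(const float *sendbuf, float *recvbuf,
        int count, MPI_Comm comm)
{
    MPI_Allreduce(sendbuf, recvbuf, count, MPI_FLOAT, MPI_SUM, comm);
}
```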
Comparing the performance of the +unmodified CNTK with OpenMPI's `coll/libnbc`, versus that of the patched CNTK +with XHC reveals that this modification is sensible and beneficial. + +Finally, while we have not yet rigorously evaluated it, +[PiSvM](http://pisvm.sourceforge.net/) is another candidate, with intense use +of MPI Broadcast. + +--- + +Contact: George Katevenis (gkatev@ics.forth.gr), Manolis Ploumidis (ploumid@ics.forth.gr) +Computer Architecture and VLSI Systems (CARV) Laboratory, ICS Forth diff --git a/ompi/mca/coll/xhc/coll_xhc.c b/ompi/mca/coll/xhc/coll_xhc.c index d7221ffb37a..e91e1e8124d 100644 --- a/ompi/mca/coll/xhc/coll_xhc.c +++ b/ompi/mca/coll/xhc/coll_xhc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Copyright (c) 2021-2024 Computer Architecture and VLSI Systems (CARV) * Laboratory, ICS Forth. All rights reserved. * $COPYRIGHT$ * @@ -9,577 +9,645 @@ */ #include "ompi_config.h" - #include "mpi.h" + #include "ompi/constants.h" #include "ompi/communicator/communicator.h" #include "ompi/mca/coll/coll.h" -#include "ompi/mca/coll/base/base.h" +#include "opal/class/opal_hash_table.h" #include "opal/mca/rcache/rcache.h" #include "opal/mca/shmem/base/base.h" #include "opal/mca/smsc/smsc.h" -#include "opal/include/opal/align.h" #include "opal/util/show_help.h" #include "opal/util/minmax.h" #include "coll_xhc.h" -static int xhc_comms_make(ompi_communicator_t *ompi_comm, - xhc_peer_info_t *peer_info, xhc_comm_t **comms_dst, - int *comm_count_dst, xhc_loc_t *hierarchy, int hierarchy_len); -static void xhc_comms_destroy(xhc_comm_t *comms, int comm_count); +// ------------------------------------------------ + +static int xhc_alloc_bcast_cico(xhc_module_t *module, + ompi_communicator_t *comm); -static int xhc_print_info(xhc_module_t *module, - ompi_communicator_t *comm, xhc_data_t *data); +static int xhc_print_config_info(xhc_module_t *module, + ompi_communicator_t *comm); +static int xhc_print_op_info(xhc_module_t *module, + ompi_communicator_t *comm, XHC_COLLTYPE_T colltype); +static int xhc_print_op_hierarchy_dot(xhc_module_t *module, + ompi_communicator_t *comm, XHC_COLLTYPE_T colltype); -static void *xhc_shmem_create(opal_shmem_ds_t *seg_ds, size_t size, - ompi_communicator_t *ompi_comm, const char *name_chr_s, int name_chr_i); -static void *xhc_shmem_attach(opal_shmem_ds_t *seg_ds); static mca_smsc_endpoint_t *xhc_smsc_ep(xhc_peer_info_t *peer_info); // ------------------------------------------------ int mca_coll_xhc_lazy_init(xhc_module_t *module, ompi_communicator_t *comm) { + xhc_peer_info_t *peer_info = NULL; int comm_size = ompi_comm_size(comm); int rank = ompi_comm_rank(comm); - xhc_peer_info_t *peer_info = module->peer_info; + int err, return_code = OMPI_SUCCESS; - opal_shmem_ds_t *peer_cico_ds = NULL; - xhc_data_t *data = NULL; + errno = 0; - xhc_coll_fns_t xhc_fns; + if(module->init) + return OMPI_SUCCESS; + else if(module->error) + return OMPI_ERROR; - int return_code = OMPI_SUCCESS; - int ret; + // --- - errno = 0; + peer_info = calloc(comm_size, sizeof(xhc_peer_info_t)); + if(!peer_info) RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + + for(int r = 0; r < comm_size; r++) { + peer_info[r].proc = ompi_comm_peer_lookup(comm, r); + peer_info[r].locality = peer_info[r].proc->super.proc_flags; + } + + peer_info[rank].locality |= ((1 << XHC_LOC_EXT_BITS) - 1) << XHC_LOC_EXT_START; + + // --- + + OBJ_CONSTRUCT(&module->hierarchy_cache, opal_hash_table_t); + err = 
opal_hash_table_init(&module->hierarchy_cache, XHC_COLLCOUNT); + if(err != OPAL_SUCCESS) RETURN_WITH_ERROR(return_code, err, end); + + // --- + + module->comm_size = comm_size; + module->rank = rank; + + module->peer_info = peer_info; + module->init = true; + + if((mca_coll_xhc_component.print_info & XHC_PRINT_INFO_CONFIG) && rank == 0) { + err = xhc_print_config_info(module, comm); + + if(err != OMPI_SUCCESS) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: xhc_print_config_info() " + "failed with error code %d", err); + } + } + + // --- + + end: + + if(return_code != OMPI_SUCCESS) { + free(peer_info); + + opal_show_help("help-coll-xhc.txt", "xhc-init-failed", true, + return_code, errno, strerror(errno)); - // ---- + module->error = true; + } + + return return_code; +} - /* XHC requires rank communication during its initialization. - * Temporarily apply the saved fallback collective modules, - * and restore XHC's after initialization is done. */ - xhc_module_install_fallback_fns(module, comm, &xhc_fns); +int mca_coll_xhc_init_op(xhc_module_t *module, + ompi_communicator_t *comm, XHC_COLLTYPE_T colltype) { - // ---- + int err, return_code = OMPI_SUCCESS; - ret = xhc_module_prepare_hierarchy(module, comm); - if(ret != OMPI_SUCCESS) { - RETURN_WITH_ERROR(return_code, ret, end); + if(!module->init && !module->error) { + err = xhc_lazy_init(module, comm); + if(err != OMPI_SUCCESS) return err; } - // ---- + if(module->op_data[colltype].init) + return OMPI_SUCCESS; + + // --- + + xhc_op_config_t *config = &module->op_config[colltype]; + xhc_op_data_t *data = &module->op_data[colltype]; - data = malloc(sizeof(xhc_data_t)); - peer_cico_ds = malloc(comm_size * sizeof(opal_shmem_ds_t)); - if(!data || !peer_cico_ds) { - RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + data->colltype = colltype; + data->seq = 0; + + if(colltype == XHC_BCAST) { + err = xhc_alloc_bcast_cico(module, comm); + if(err != OMPI_SUCCESS) RETURN_WITH_ERROR(return_code, err, end); } - *data = (xhc_data_t) { - .comms = NULL, - .comm_count = -1, + // --- - .pvt_coll_seq = 0 - }; + /* Check if a hierarchy for this hierarchy string has already been + * prepared, and use (borrow) it instead of re-creating it. 
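+     * E.g. if bcast and allreduce are configured with the same hierarchy
+     * string, the hierarchy computed for whichever of them is initialized
+     * first is simply reused for the other.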
*/ - // ---- + xhc_op_config_t *lender; - if(OMPI_XHC_CICO_MAX > 0) { - opal_shmem_ds_t cico_ds; + err = opal_hash_table_get_value_ptr(&module->hierarchy_cache, + config->hierarchy_string, strlen(config->hierarchy_string), + (void **) &lender); - void *my_cico = xhc_shmem_create(&cico_ds, - OMPI_XHC_CICO_MAX, comm, "cico", 0); - if(!my_cico) { - RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + if(err == OPAL_SUCCESS) { + config->hierarchy = lender->hierarchy; + config->hierarchy_len = lender->hierarchy_len; + } else { + if(err != OPAL_ERR_NOT_FOUND) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: opal_hash_table_get_value_ptr() " + "returned %d", err); } - /* Manually "touch" to assert allocation in local NUMA node - * (assuming linux's default firt-touch-alloc NUMA policy) */ - memset(my_cico, 0, OMPI_XHC_CICO_MAX); + err = xhc_hierarchy_make(module, comm, config->hierarchy_string, + &config->hierarchy, &config->hierarchy_len); + if(err != OMPI_SUCCESS) RETURN_WITH_ERROR(return_code, err, end); - ret = comm->c_coll->coll_allgather(&cico_ds, - sizeof(opal_shmem_ds_t), MPI_BYTE, peer_cico_ds, - sizeof(opal_shmem_ds_t), MPI_BYTE, comm, - comm->c_coll->coll_allgather_module); - if(ret != OMPI_SUCCESS) { - RETURN_WITH_ERROR(return_code, ret, end); + err = opal_hash_table_set_value_ptr(&module->hierarchy_cache, + config->hierarchy_string, strlen(config->hierarchy_string), + (void *) config); + + if(err != OPAL_SUCCESS) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: opal_hash_table_set_value_ptr() " + "returned %d", err); } + } + + // --- + + err = xhc_comms_make(comm, module, config, data); + if(err != OMPI_SUCCESS) RETURN_WITH_ERROR(return_code, err, end); - for(int r = 0; r < comm_size; r++) { - peer_info[r].cico_ds = peer_cico_ds[r]; + data->init = true; + + if(mca_coll_xhc_component.print_info & (1 << colltype)) { + err = xhc_print_op_info(module, comm, colltype); + + if(err != OMPI_SUCCESS) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: xhc_print_op_info() " + "failed with error code %d", err); } - peer_info[rank].cico_buffer = my_cico; + if(mca_coll_xhc_component.print_info & XHC_PRINT_INFO_HIER_DOT) { + err = xhc_print_op_hierarchy_dot(module, comm, colltype); + + if(err != OMPI_SUCCESS) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: xhc_print_op_hierarchy_dot() " + "failed with error code %d", err); + } + } } - // ---- + // --- - /* An XHC communicator is created for each level of the hierarchy. - * The hierachy must be in an order of most-specific to most-general. 
*/ + end: - ret = xhc_comms_make(comm, peer_info, &data->comms, &data->comm_count, - module->hierarchy, module->hierarchy_len); - if(ret != OMPI_SUCCESS) { - RETURN_WITH_ERROR(return_code, ret, end); + if(return_code != OMPI_SUCCESS) { + opal_show_help("help-coll-xhc.txt", "xhc-op-init-failed", true, + xhc_colltype_to_str(colltype), return_code, errno, strerror(errno)); } - for(int i = 0, c = 0; i < data->comm_count; i++) { - data->comms[i].chunk_size = module->chunks[c]; - c = opal_min(c + 1, module->chunks_len - 1); - } + return OMPI_SUCCESS; +} - if(module->chunks_len < data->comm_count) { - opal_output_verbose(MCA_BASE_VERBOSE_WARN, - ompi_coll_base_framework.framework_output, - "coll:xhc: Warning: The chunk sizes count is shorter than the " - "hierarchy size; filling in with the last entry provided"); - } else if(module->chunks_len > data->comm_count) { - opal_output_verbose(MCA_BASE_VERBOSE_WARN, - ompi_coll_base_framework.framework_output, - "coll:xhc: Warning: The chunk size count is larger than the " - "hierarchy size; omitting last entries"); - } +static int xhc_alloc_bcast_cico(xhc_module_t *module, ompi_communicator_t *comm) { + opal_shmem_ds_t *ds_list = NULL; + opal_shmem_ds_t cico_ds; + void *cico_buffer = NULL; + xhc_coll_fns_t xhc_fns; - // ---- + int err, return_code = OMPI_SUCCESS; - if(mca_coll_xhc_component.print_info) { - ret = xhc_print_info(module, comm, data); - if(ret != OMPI_SUCCESS) { - RETURN_WITH_ERROR(return_code, ret, end); - } - } + int comm_size = ompi_comm_size(comm); + int rank = ompi_comm_rank(comm); - // ---- + size_t cico_size = module->op_config[XHC_BCAST].cico_max; - module->data = data; - module->init = true; + if(cico_size == 0) + return OMPI_SUCCESS; - end: + xhc_module_set_coll_fns(comm, &module->prev_colls, &xhc_fns); - xhc_module_install_fns(module, comm, xhc_fns); + // -- - free(peer_cico_ds); + ds_list = malloc(comm_size * sizeof(opal_shmem_ds_t)); + if(!ds_list) return OMPI_ERR_OUT_OF_RESOURCE; - if(return_code != 0) { - opal_show_help("help-coll-xhc.txt", "xhc-init-failed", true, - return_code, errno, strerror(errno)); + cico_buffer = xhc_shmem_create(&cico_ds, cico_size, comm, "cico", 0, 0); + if(!cico_buffer) RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + + /* Manually "touch" to assert allocation in local NUMA node + * (assuming linux's default first-touch-alloc policy). 
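+     * Touching the whole buffer here also means its pages are faulted in
+     * by the owner rank, rather than by whichever peer writes them first.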
*/ + memset(cico_buffer, 0, cico_size); + + err = comm->c_coll->coll_allgather(&cico_ds, sizeof(opal_shmem_ds_t), + MPI_BYTE, ds_list, sizeof(opal_shmem_ds_t), MPI_BYTE, comm, + comm->c_coll->coll_allgather_module); + if(err != OMPI_SUCCESS) RETURN_WITH_ERROR(return_code, err, end); + + module->peer_info[rank].cico_ds = cico_ds; + module->peer_info[rank].cico_buffer = cico_buffer; - xhc_fini(module); + for(int r = 0; r < comm_size; r++) { + if(r == rank) continue; + module->peer_info[r].cico_ds = ds_list[r]; } - return return_code; -} + // -- -void mca_coll_xhc_fini(mca_coll_xhc_module_t *module) { - if(module->data) { - xhc_data_t *data = module->data; + end: - if(data->comm_count >= 0) { - xhc_comms_destroy(data->comms, data->comm_count); - } + free(ds_list); + xhc_module_set_coll_fns(comm, &xhc_fns, NULL); - free(data->comms); - free(data); + if(return_code != OMPI_SUCCESS) { + if(cico_buffer) { + opal_shmem_unlink(&cico_ds); + opal_shmem_segment_detach(&cico_ds); + } } + return return_code; +} + +void mca_coll_xhc_fini(mca_coll_xhc_module_t *module) { if(module->peer_info) { for(int r = 0; r < module->comm_size; r++) { + if(module->peer_info[r].smsc_ep) + MCA_SMSC_CALL(return_endpoint, module->peer_info[r].smsc_ep); + if(module->peer_info[r].cico_buffer) { - if(r == module->rank) { - // OMPI issue #11123 - // opal_shmem_unlink(&module->peer_info[r].cico_ds); - } + // OMPI #11123 + /* if(r == module->rank) + opal_shmem_unlink(&module->peer_info[r].cico_ds); */ opal_shmem_segment_detach(&module->peer_info[r].cico_ds); } - - if(module->peer_info[r].smsc_ep) { - MCA_SMSC_CALL(return_endpoint, module->peer_info[r].smsc_ep); - } } + + free(module->peer_info); } -} -// ------------------------------------------------ + xhc_op_config_t *ht_config_value; + void *ht_key; -/* This method is where the hierarchy of XHC is constructed; it receives - * the hierarchy specifications (hierarchy param) and groups ranks together - * among them. The process begins with the first locality in the list. All - * ranks that share this locality (determined via the relative peer to peer - * distances) become siblings. The one amongst them with the lowest rank - * number becomes the manager/leader of the group. The members don't really - * need to keep track of the actual ranks of their siblings -- only the rank - * of the group's leader/manager, the size of the group, and their own member - * ID. The process continues with the next locality, only that now only the - * ranks that became leaders in the previous level are eligible (determined - * via comm_candidate, see inline comments). 
*/ -static int xhc_comms_make(ompi_communicator_t *ompi_comm, - xhc_peer_info_t *peer_info, xhc_comm_t **comms_dst, - int *comm_count_dst, xhc_loc_t *hierarchy, int hierarchy_len) { - - int ompi_rank = ompi_comm_rank(ompi_comm); - int ompi_size = ompi_comm_size(ompi_comm); - - xhc_comm_t *comms = NULL; - int comms_size = 0; - int comm_count = 0; - - opal_shmem_ds_t *comm_ctrl_ds; - bool *comm_candidate; - - size_t smsc_reg_size = 0; - - int return_code = OMPI_SUCCESS; - int ret; - - comms = malloc((comms_size = 5) * sizeof(xhc_comm_t)); - comm_ctrl_ds = malloc(ompi_size * sizeof(opal_shmem_ds_t)); - comm_candidate = malloc(ompi_size * sizeof(bool)); - - if(!comms || !comm_ctrl_ds || !comm_candidate) { - RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); - } + OPAL_HASH_TABLE_FOREACH_PTR(ht_key, ht_config_value, + &module->hierarchy_cache, { + free(ht_config_value->hierarchy); + }); - if(mca_smsc_base_has_feature(MCA_SMSC_FEATURE_REQUIRE_REGISTRATION)) { - smsc_reg_size = mca_smsc_base_registration_data_size(); - } + OBJ_DESTRUCT(&module->hierarchy_cache); - for(int h = 0; h < hierarchy_len; h++) { - xhc_comm_t *xc = &comms[comm_count]; + for(int t = 0; t < XHC_COLLCOUNT; t++) { + if(module->op_data[t].init) { + xhc_comms_destroy(module->op_data[t].comms, + module->op_data[t].comm_count); - if(comm_count == comms_size) { - void *tmp = realloc(comms, (comms_size *= 2) * sizeof(xhc_comm_t)); - if(!tmp) { - RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); - } - comms = tmp; + free(module->op_data[t].comms); } + } - *xc = (xhc_comm_t) { - .locality = hierarchy[h], + free(module->rbuf); +} - .size = 0, - .manager_rank = -1, +// ------------------------------------------------ - .member_info = NULL, - .reduce_queue = NULL, +int mca_coll_xhc_read_op_config(xhc_module_t *module, + ompi_communicator_t *comm, XHC_COLLTYPE_T colltype) { - .comm_ctrl = NULL, - .member_ctrl = NULL, + /* Assimilate the tuning parameters here, based on the various MCA params + * and any provided info keys. We want to determine the hierarchy, chunk + * size, and cico threshold, for each supported primitive. The info keys + * take precedence over the MCA params, with op-specific info keys having + * higher priority than global ones. Note that the op-specific and global + * MCA params are mutually exclusive; this is implemented in xhc_register(). + */ - .ctrl_ds = (opal_shmem_ds_t) {0} - }; + xhc_op_config_t *config = &module->op_config[colltype]; - // ---- + const char *param[] = {"hierarchy", "chunk_size", "cico_max"}; + for(size_t p = 0; p < sizeof(param)/sizeof(param[0]); p++) { + xhc_op_mca_t op_mca = mca_coll_xhc_component.op_mca[colltype]; + xhc_config_source_t config_source; - /* Only ranks that were leaders in the previous level are candidates - * for this one. Every rank advertises whether others may consider - * it for inclusion via an Allgather. 
*/ + opal_cstring_t *info_val; + int info_flag = 0; - bool is_candidate = (comm_count == 0 - || comms[comm_count - 1].manager_rank == ompi_rank); + int err; - ret = ompi_comm->c_coll->coll_allgather(&is_candidate, 1, - MPI_C_BOOL, comm_candidate, 1, MPI_C_BOOL, - ompi_comm, ompi_comm->c_coll->coll_allgather_module); - if(ret != OMPI_SUCCESS) { - RETURN_WITH_ERROR(return_code, ret, comm_error); - } + if(comm->super.s_info) { + char *key; - for(int r = 0; r < ompi_size; r++) { + // Op-specific info key + err = opal_asprintf(&key, "ompi_comm_coll_xhc_%s_%s", + xhc_colltype_to_str(colltype), param[p]); + if(err < 0) return OMPI_ERR_OUT_OF_RESOURCE; - /* If on a non-bottom comm, only managers of the previous - * comm are "full" members. However, this procedure also has - * to take place for the bottom-most comm; even if this is the - * current rank's bottom-most comm, it may not actually be so, - * for another rank (eg. with some non-symmetric hierarchies). */ - if(comm_candidate[r] == false) { - continue; - } + opal_info_get(comm->super.s_info, + key, &info_val, &info_flag); + free(key); - // Non-local --> not part of the comm :/ - if(!PEER_IS_LOCAL(peer_info, r, xc->locality)) { - continue; - } + if(info_flag) { + config_source = XHC_CONFIG_SOURCE_INFO_OP; + } else { + // Non-specific info key + err = opal_asprintf(&key, "ompi_comm_coll_xhc_%s", + param[p]); + if(err < 0) return OMPI_ERR_OUT_OF_RESOURCE; - /* The member ID means slightly different things whether on the - * bottom-most comm or not. On the bottom-most comm, a rank can - * either be a "full" member or not. However, on higher-up comms, - * if a rank was not a manager on the previous comm, it will not - * a "full" member. Instead, it will be a "potential" member, in - * that it keeps information about this comm, and is ready to - * take over duties and act as a normal member for a specific - * collective (eg. dynamic leader feature, or root != manager). */ - if(r == ompi_rank || (comm_count > 0 && r == comms[comm_count - 1].manager_rank)) { - xc->member_id = xc->size; - } + opal_info_get(comm->super.s_info, key, + &info_val, &info_flag); + free(key); - // First rank to join the comm becomes the manager - if(xc->manager_rank == -1) { - xc->manager_rank = r; + if(info_flag) + config_source = XHC_CONFIG_SOURCE_INFO_GLOBAL; } - - xc->size++; } - /* If there are no local peers in regards to this locality, no - * XHC comm is created for this process on this level. */ - if(xc->size <= 1) { - opal_output_verbose(MCA_BASE_VERBOSE_WARN, - ompi_coll_base_framework.framework_output, - "coll:xhc: Warning: Locality 0x%04x does not result " - "in any new groupings; skipping it", xc->locality); - - /* All ranks must participate in the "control struct sharing" - * allgather, even if useless to this rank to some of them */ - - ret = ompi_comm->c_coll->coll_allgather(&xc->ctrl_ds, - sizeof(opal_shmem_ds_t), MPI_BYTE, comm_ctrl_ds, - sizeof(opal_shmem_ds_t), MPI_BYTE, ompi_comm, - ompi_comm->c_coll->coll_allgather_module); - if(ret != OMPI_SUCCESS) { - RETURN_WITH_ERROR(return_code, ret, comm_error); - } + if(!info_flag) { + const mca_base_var_t *var_global; - xhc_comms_destroy(xc, 1); - continue; - } + err = mca_base_var_get(mca_base_var_find("ompi", + "coll", "xhc", param[p]), &var_global); + if(err != OPAL_SUCCESS) return err; - // ---- + config_source = (var_global->mbv_source != MCA_BASE_VAR_SOURCE_DEFAULT + ? 
XHC_CONFIG_SOURCE_MCA_GLOBAL : XHC_CONFIG_SOURCE_MCA_OP); + } - /* Init comm stuff */ + switch(p) { + case 0: + config->hierarchy_string = strdup(info_flag ? + info_val->string : op_mca.hierarchy); - xc->member_info = calloc(xc->size, sizeof(xhc_member_info_t)); - if(xc->member_info == NULL) { - RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, comm_error); - } + if(info_flag) + OBJ_RELEASE(info_val); - xc->reduce_queue = OBJ_NEW(opal_list_t); - if(!xc->reduce_queue) { - RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, comm_error); - } + if(!config->hierarchy_string) + return OMPI_ERR_OUT_OF_RESOURCE; - for(int m = 0; m < xc->size - 1; m++) { - xhc_rq_item_t *item = OBJ_NEW(xhc_rq_item_t); - if(!item) { - RETURN_WITH_ERROR(return_code, - OMPI_ERR_OUT_OF_RESOURCE, comm_error); - } + config->hierarchy_source = config_source; - opal_list_append(xc->reduce_queue, (opal_list_item_t *) item); - } + break; + case 1: + /* We don't *need* to keep record of the chunk string, since + * we parse it here. But it's good to have; we currently use + * it when showing information (print_info). */ + config->chunk_string = strdup(info_flag ? + info_val->string : op_mca.chunk_size); - // ---- + if(info_flag) + OBJ_RELEASE(info_val); - // Create shared structs - if(ompi_rank == xc->manager_rank) { - size_t ctrl_len = sizeof(xhc_comm_ctrl_t) + smsc_reg_size - + xc->size * sizeof(xhc_member_ctrl_t); + if(!config->chunk_string) + return OMPI_ERR_OUT_OF_RESOURCE; - char *ctrl_base = xhc_shmem_create(&xc->ctrl_ds, ctrl_len, - ompi_comm, "ctrl", comm_count); - if(ctrl_base == NULL) { - RETURN_WITH_ERROR(return_code, OMPI_ERROR, comm_error); - } + err = xhc_component_parse_chunk_sizes(config->chunk_string, + &config->chunks, &config->chunks_len); - /* Manually "touch" to assert allocation in local NUMA node - * (assuming linux's default firt-touch-alloc NUMA policy) */ - memset(ctrl_base, 0, ctrl_len); + if(err != OMPI_SUCCESS) + return err; - xc->comm_ctrl = (void *) ctrl_base; - xc->member_ctrl = (void *) (ctrl_base - + sizeof(xhc_comm_ctrl_t) + smsc_reg_size); - } + config->chunk_source = config_source; - /* The comm's managers share the details of the communication structs - * with their children, so that they may attach to them. Because - * there's not any MPI communicator formed that includes (only) the - * members of the XHC comm, the sharing is achieved with a single - * Allgather, instead of a Broadcast inside each XHC comm. 
*/ - - ret = ompi_comm->c_coll->coll_allgather(&xc->ctrl_ds, - sizeof(opal_shmem_ds_t), MPI_BYTE, comm_ctrl_ds, - sizeof(opal_shmem_ds_t), MPI_BYTE, ompi_comm, - ompi_comm->c_coll->coll_allgather_module); - if(ret != OMPI_SUCCESS) { - RETURN_WITH_ERROR(return_code, ret, comm_error); - } + break; + case 2: + if(info_flag) { + err = xhc_component_parse_cico_max( + info_val->string, &config->cico_max); + OBJ_RELEASE(info_val); - // Attach to manager's shared structs - if(ompi_rank != xc->manager_rank) { - xc->ctrl_ds = comm_ctrl_ds[xc->manager_rank]; + if(err != OMPI_SUCCESS) + return err; + } else + config->cico_max = op_mca.cico_max; - char *ctrl_base = xhc_shmem_attach(&xc->ctrl_ds); - if(ctrl_base == NULL) { - RETURN_WITH_ERROR(return_code, OMPI_ERROR, comm_error); - } + config->cico_max_source = config_source; - xc->comm_ctrl = (void *) ctrl_base; - xc->member_ctrl = (void *) (ctrl_base - + sizeof(xhc_comm_ctrl_t) + smsc_reg_size); + break; } + } - xc->my_member_ctrl = &xc->member_ctrl[xc->member_id]; - xc->my_member_info = &xc->member_info[xc->member_id]; - - // ---- + // --- - comm_count++; + /* Enforce a resonable minimum chunk size */ - continue; + if(colltype != XHC_BARRIER) { + bool altered_chunks = false; + for(int i = 0; i < config->chunks_len; i++) { + if(config->chunks[i] < XHC_MIN_CHUNK_SIZE) { + config->chunks[i] = XHC_MIN_CHUNK_SIZE; + altered_chunks = true; + } + } - comm_error: { - xhc_comms_destroy(comms, comm_count+1); - comm_count = -1; + if(altered_chunks) { + char *tmp, *str = strdup(""); + for(int i = 0; i < config->chunks_len; i++) { + int err = opal_asprintf(&tmp, "%s%s%zu", str, + (i > 0 ? "," : ""), config->chunks[i]); + free(str); str = tmp; + if(err < 0) return OMPI_ERR_OUT_OF_RESOURCE; + } - goto end; + free(config->chunk_string); + config->chunk_string = str; } } - REALLOC(comms, comm_count, xhc_comm_t); + // --- - *comms_dst = comms; - *comm_count_dst = comm_count; + return OMPI_SUCCESS; +} - end: +// ------------------------------------------------ - free(comm_ctrl_ds); - free(comm_candidate); +static int xhc_print_config_info(xhc_module_t *module, ompi_communicator_t *comm) { + char *drval_str, *lb_policy_str, *un_min_str; + int err; + + switch(mca_coll_xhc_component.dynamic_reduce) { + case XHC_DYNAMIC_REDUCE_DISABLED: + drval_str = "OFF"; break; + case XHC_DYNAMIC_REDUCE_NON_FLOAT: + drval_str = "ON (non-float)"; break; + case XHC_DYNAMIC_REDUCE_ALL: + drval_str = "ON (all)"; break; + default: + drval_str = "???"; + } - if(return_code != OMPI_SUCCESS) { - free(comms); + switch((int) mca_coll_xhc_component.reduce_load_balance) { + case 0: + lb_policy_str = "none"; break; + case XHC_REDUCE_LB_LEADER_ASSIST_TOP_LEVEL: + lb_policy_str = "top level"; break; + case XHC_REDUCE_LB_LEADER_ASSIST_FIRST_CHUNK: + lb_policy_str = "first chunk"; break; + case XHC_REDUCE_LB_LEADER_ASSIST_TOP_LEVEL + | XHC_REDUCE_LB_LEADER_ASSIST_FIRST_CHUNK: + lb_policy_str = "top level + first chunk"; break; + case XHC_REDUCE_LB_LEADER_ASSIST_ALL: + lb_policy_str = "all"; break; + default: + lb_policy_str = "???"; } - return return_code; + err = opal_asprintf(&un_min_str, " (min %zu bytes)", + mca_coll_xhc_component.uniform_chunks_min); + if(err < 0) return OMPI_ERR_OUT_OF_RESOURCE; + + printf("------------------------------------------------\n" + "OMPI coll/xhc @ %s, priority %d\n" + " dynamic leader '%s', dynamic reduce '%s'\n" + " reduce load balance '%s'\n" + " allreduce uniform chunks '%s'%s\n", + comm->c_name, mca_coll_xhc_component.priority, + 
(mca_coll_xhc_component.dynamic_leader ? "ON" : "OFF"), + drval_str, lb_policy_str, + (mca_coll_xhc_component.uniform_chunks ? "ON" : "OFF"), + (mca_coll_xhc_component.uniform_chunks ? un_min_str : "")); + + free(un_min_str); + + for(int t = 0; t < XHC_COLLCOUNT; t++) { + xhc_op_config_t *config = &module->op_config[t]; + + if(t == XHC_BARRIER) { + printf("\n" + " [%s]\n" + " Hierarchy: %s (source: %s)\n", + xhc_colltype_to_str(t), + config->hierarchy_string, xhc_config_source_to_str(config->hierarchy_source)); + } else { + printf("\n" + " [%s]\n" + " Hierarchy: %s (source: %s)\n" + " Chunk size(s): %s (source: %s)\n" + " CICO: Up to %zu bytes (source: %s)\n", + xhc_colltype_to_str(t), + config->hierarchy_string, xhc_config_source_to_str(config->hierarchy_source), + config->chunk_string, xhc_config_source_to_str(config->chunk_source), + config->cico_max, xhc_config_source_to_str(config->cico_max_source)); + } + } + + printf("------------------------------------------------\n"); + + return OMPI_SUCCESS; } -static void xhc_comms_destroy(xhc_comm_t *comms, int comm_count) { - bool is_manager = true; +static int xhc_print_op_info(xhc_module_t *module, + ompi_communicator_t *comm, XHC_COLLTYPE_T colltype) { + + xhc_comm_t *comms = module->op_data[colltype].comms; + int comm_count = module->op_data[colltype].comm_count; + + int rank = ompi_comm_rank(comm); for(int i = 0; i < comm_count; i++) { - xhc_comm_t *xc = &comms[i]; + char *memb_list = NULL, *tmp; + int err; - if(xc->member_id != 0) { - is_manager = false; - } + err = opal_asprintf(&memb_list, "%d", comms[i].owner_rank); + if(err < 0) return OMPI_ERR_OUT_OF_RESOURCE; - free(xc->member_info); + for(int m = 1; m < comms[i].size; m++) { + if(m == comms[i].my_id) { + if(i == 0 || comms[i-1].owner_rank == rank) + err = opal_asprintf(&tmp, "%s %d", memb_list, rank); + else + err = opal_asprintf(&tmp, "%s _", memb_list); + } else + err = opal_asprintf(&tmp, "%s x", memb_list); - if(xc->reduce_queue) { - OPAL_LIST_RELEASE(xc->reduce_queue); - } + free(memb_list); + memb_list = tmp; - if(xc->comm_ctrl) { - if(is_manager) { - // OMPI issue #11123 - // opal_shmem_unlink(&xc->ctrl_ds); - (void) is_manager; - } + if(err < 0) + return OMPI_ERR_OUT_OF_RESOURCE; + } - opal_shmem_segment_detach(&xc->ctrl_ds); + if(colltype == XHC_BARRIER) { + printf("XHC_COMM ompi_comm=%s rank=%d op=%s loc=0x%08x members=%d [%s]\n", + comm->c_name, rank, xhc_colltype_to_str(colltype), comms[i].locality, + comms[i].size, memb_list); + } else { + printf("XHC_COMM ompi_comm=%s rank=%d op=%s loc=0x%08x chunk_size=%zu " + "cico_size=%zu members=%d [%s]\n", comm->c_name, rank, + xhc_colltype_to_str(colltype), comms[i].locality, comms[i].chunk_size, + comms[i].cico_size, comms[i].size, memb_list); } - *xc = (xhc_comm_t) {0}; + free(memb_list); } + + return OMPI_SUCCESS; } -static int xhc_print_info(xhc_module_t *module, - ompi_communicator_t *comm, xhc_data_t *data) { +static int xhc_print_op_hierarchy_dot(xhc_module_t *module, + ompi_communicator_t *comm, XHC_COLLTYPE_T colltype) { - int rank = ompi_comm_rank(comm); - int ret; - - if(rank == 0) { - char *drval_str; - char *lb_rla_str; - char *un_min_str; - - switch(mca_coll_xhc_component.dynamic_reduce) { - case OMPI_XHC_DYNAMIC_REDUCE_DISABLED: - drval_str = "OFF"; break; - case OMPI_XHC_DYNAMIC_REDUCE_NON_FLOAT: - drval_str = "ON (non-float)"; break; - case OMPI_XHC_DYNAMIC_REDUCE_ALL: - drval_str = "ON (all)"; break; - default: - drval_str = "???"; - } + xhc_coll_fns_t xhc_fns; + xhc_comm_t *comms = 
module->op_data[colltype].comms; - switch(mca_coll_xhc_component.lb_reduce_leader_assist) { - case OMPI_XHC_LB_RLA_TOP_LEVEL: - lb_rla_str = "top level"; break; - case OMPI_XHC_LB_RLA_FIRST_CHUNK: - lb_rla_str = "first chunk"; break; - case OMPI_XHC_LB_RLA_TOP_LEVEL | OMPI_XHC_LB_RLA_FIRST_CHUNK: - lb_rla_str = "top level + first chunk"; break; - case OMPI_XHC_LB_RLA_ALL: - lb_rla_str = "all"; break; - default: - lb_rla_str = "???"; - } + int *src_rank = NULL; + int my_src = -1; - ret = opal_asprintf(&un_min_str, " (min '%zu' bytes)", - mca_coll_xhc_component.uniform_chunks_min); - if(ret < 0) { - return OMPI_ERR_OUT_OF_RESOURCE; - } + FILE *outfile = stdout; - printf("------------------------------------------------\n" - "OMPI coll/xhc @ %s, priority %d\n" - " dynamic leader '%s', dynamic reduce '%s'\n" - " reduce load-balancing leader-assist '%s'\n" - " allreduce uniform chunks '%s'%s\n" - " CICO up until %zu bytes, barrier root %d\n\n" - "------------------------------------------------\n", - comm->c_name, mca_coll_xhc_component.priority, - (mca_coll_xhc_component.dynamic_leader ? "ON" : "OFF"), - drval_str, lb_rla_str, - (mca_coll_xhc_component.uniform_chunks ? "ON" : "OFF"), - (mca_coll_xhc_component.uniform_chunks ? un_min_str : ""), - mca_coll_xhc_component.cico_max, - mca_coll_xhc_component.barrier_root); - - free(un_min_str); + if(module->rank == 0) { + src_rank = malloc(module->comm_size * sizeof(int)); + if(!src_rank) return OMPI_ERR_OUT_OF_RESOURCE; } - for(int i = 0; i < data->comm_count; i++) { - char *mlist = NULL; - char *tmp; - - ret = opal_asprintf(&mlist, "%d", data->comms[i].manager_rank); - if(ret < 0) { - return OMPI_ERR_OUT_OF_RESOURCE; + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { + if(xc->my_id != 0) { + my_src = xc->owner_rank; + break; } + } - for(int m = 1; m < data->comms[i].size; m++) { - if(m == data->comms[i].member_id) { - if(i == 0 || data->comms[i-1].manager_rank == rank) { - ret = opal_asprintf(&tmp, "%s %d", mlist, rank); - } else { - ret = opal_asprintf(&tmp, "%s _", mlist); - } - } else { - ret = opal_asprintf(&tmp, "%s x", mlist); - } + xhc_module_set_coll_fns(comm, &module->prev_colls, &xhc_fns); - free(mlist); - mlist = tmp; + int err = comm->c_coll->coll_gather(&my_src, + 1, MPI_INT, src_rank, 1, MPI_INT, 0, + comm, comm->c_coll->coll_gather_module); - if(ret < 0) { - return OMPI_ERR_OUT_OF_RESOURCE; - } + xhc_module_set_coll_fns(comm, &xhc_fns, NULL); + + if(err != OMPI_SUCCESS) { + if(module->rank == 0) + free(src_rank); + return err; + } + + if(module->rank == 0) { + char *dir; + + switch(colltype) { + case XHC_BCAST: + dir = "forward"; break; + case XHC_REDUCE: case XHC_ALLREDUCE: + dir = "back"; break; + case XHC_BARRIER: + dir = "both"; break; + default: + dir = "none"; } - printf("XHC comm loc=0x%08x chunk_size=%zu with %d members [%s]\n", - data->comms[i].locality, data->comms[i].chunk_size, - data->comms[i].size, mlist); + fprintf(outfile, "digraph xhc_%s_hierarchy {\n", xhc_colltype_to_str(colltype)); + + for(int r = 1; r < module->comm_size; r++) + fprintf(outfile, "\t%d -> %d [dir=%s];\n", src_rank[r], r, dir); + + fprintf(outfile, "}\n"); - free(mlist); + free(src_rank); } return OMPI_SUCCESS; @@ -587,31 +655,31 @@ static int xhc_print_info(xhc_module_t *module, // ------------------------------------------------ -static void *xhc_shmem_create(opal_shmem_ds_t *seg_ds, size_t size, - ompi_communicator_t *ompi_comm, const char *name_chr_s, int name_chr_i) { +void *mca_coll_xhc_shmem_create(opal_shmem_ds_t *seg_ds, size_t size, + 
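The DOT output produced by xhc_print_op_hierarchy_dot() above can be reproduced with a few lines of plain C; the sketch below assumes a simple parent[] array in place of the gathered src_rank values, and the names used here are illustrative, not XHC's.

    /* Illustrative sketch: emit the same kind of DOT graph from a plain
     * parent[] array (rank 0 has no parent). */
    #include <stdio.h>

    static void print_hierarchy_dot(const int *parent, int nranks, const char *dir) {
        printf("digraph xhc_hierarchy {\n");
        for (int r = 1; r < nranks; r++)
            printf("\t%d -> %d [dir=%s];\n", parent[r], r, dir);
        printf("}\n");
    }

    int main(void) {
        /* ranks 1-3 hang under rank 0, ranks 5-7 under rank 4,
         * and rank 4 (a group leader) hangs under rank 0 */
        int parent[8] = {-1, 0, 0, 0, 0, 4, 4, 4};
        print_hierarchy_dot(parent, 8, "forward");
        return 0;
    }

The resulting text renders directly with Graphviz (e.g. `dot -Tsvg`), giving a per-op picture of the hierarchy.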
ompi_communicator_t *ompi_comm, const char *name, int id1, int id2) { char *shmem_file; - int ret; + int err; - // xhc_shmem_seg.@..:_: + // xhc_shmem_seg.@..:_:: - ret = opal_asprintf(&shmem_file, "%s" OPAL_PATH_SEP "xhc_shmem_seg.%u@%s.%x.%d:%d_%s:%d", - mca_coll_xhc_component.shmem_backing, geteuid(), opal_process_info.nodename, - OPAL_PROC_MY_NAME.jobid, ompi_comm_rank(MPI_COMM_WORLD), ompi_comm_get_local_cid(ompi_comm), - name_chr_s, name_chr_i); + err = opal_asprintf(&shmem_file, "%s" OPAL_PATH_SEP + "xhc_shmem_seg.%u@%s.%x.%d:%d_%s:%d:%d", mca_coll_xhc_component.shmem_backing, + geteuid(), opal_process_info.nodename, OPAL_PROC_MY_NAME.jobid, + ompi_comm_rank(MPI_COMM_WORLD), ompi_comm_get_local_cid(ompi_comm), + name, id1, id2); - if(ret < 0) { + if(err < 0) return NULL; - } // Not 100% sure what this does!, copied from btl/sm opal_pmix_register_cleanup(shmem_file, false, false, false); - ret = opal_shmem_segment_create(seg_ds, shmem_file, size); + err = opal_shmem_segment_create(seg_ds, shmem_file, size); free(shmem_file); - if(ret != OPAL_SUCCESS) { + if(err != OPAL_SUCCESS) { opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_coll_base_framework.framework_output, "coll:xhc: Error: Could not create shared memory segment"); @@ -621,14 +689,13 @@ static void *xhc_shmem_create(opal_shmem_ds_t *seg_ds, size_t size, void *addr = xhc_shmem_attach(seg_ds); - if(addr == NULL) { + if(addr == NULL) opal_shmem_unlink(seg_ds); - } return addr; } -static void *xhc_shmem_attach(opal_shmem_ds_t *seg_ds) { +void *mca_coll_xhc_shmem_attach(opal_shmem_ds_t *seg_ds) { void *addr = opal_shmem_segment_attach(seg_ds); if(addr == NULL) { @@ -640,6 +707,15 @@ static void *xhc_shmem_attach(opal_shmem_ds_t *seg_ds) { return addr; } +// ------------------------------------------------ + +void *mca_coll_xhc_get_cico(xhc_peer_info_t *peer_info, int rank) { + if(peer_info[rank].cico_buffer == NULL) + peer_info[rank].cico_buffer = xhc_shmem_attach(&peer_info[rank].cico_ds); + + return peer_info[rank].cico_buffer; +} + static mca_smsc_endpoint_t *xhc_smsc_ep(xhc_peer_info_t *peer_info) { if(!peer_info->smsc_ep) { peer_info->smsc_ep = MCA_SMSC_CALL(get_endpoint, &peer_info->proc->super); @@ -656,21 +732,9 @@ static mca_smsc_endpoint_t *xhc_smsc_ep(xhc_peer_info_t *peer_info) { return peer_info->smsc_ep; } -// ------------------------------------------------ +int mca_coll_xhc_copy_expose_region(void *base, + size_t len, xhc_copy_data_t **region_data) { -void *mca_coll_xhc_get_cico(xhc_peer_info_t *peer_info, int rank) { - if(OMPI_XHC_CICO_MAX == 0) { - return NULL; - } - - if(peer_info[rank].cico_buffer == NULL) { - peer_info[rank].cico_buffer = xhc_shmem_attach(&peer_info[rank].cico_ds); - } - - return peer_info[rank].cico_buffer; -} - -int mca_coll_xhc_copy_expose_region(void *base, size_t len, xhc_copy_data_t **region_data) { if(mca_smsc_base_has_feature(MCA_SMSC_FEATURE_REQUIRE_REGISTRATION)) { void *data = MCA_SMSC_CALL(register_region, base, len); @@ -697,9 +761,8 @@ int mca_coll_xhc_copy_from(xhc_peer_info_t *peer_info, mca_smsc_endpoint_t *smsc_ep = xhc_smsc_ep(peer_info); - if(smsc_ep == NULL) { + if(smsc_ep == NULL) return -1; - } int status = MCA_SMSC_CALL(copy_from, smsc_ep, dst, src, size, access_token); @@ -717,32 +780,22 @@ void *mca_coll_xhc_get_registration(xhc_peer_info_t *peer_info, mca_smsc_endpoint_t *smsc_ep = xhc_smsc_ep(peer_info); - if(smsc_ep == NULL) { + if(smsc_ep == NULL) return NULL; - } - - /* MCA_RCACHE_FLAGS_PERSIST will cause the registration to stick around. 
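A minimal sketch of the lazy-attach pattern used by mca_coll_xhc_get_cico() above: a peer's shared buffer is only mapped the first time it is needed, and the mapping is cached for later calls. Here malloc() stands in for opal_shmem_segment_attach(), and peer_t/attach_segment()/get_cico() are illustrative names only.

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct {
        void  *cico_buffer;   /* NULL until the peer's buffer is first needed */
        size_t cico_size;     /* stands in for the opal_shmem_ds_t descriptor */
    } peer_t;

    /* Stand-in for opal_shmem_segment_attach(); here it just allocates. */
    static void *attach_segment(const peer_t *peer) {
        return malloc(peer->cico_size);
    }

    static void *get_cico(peer_t *peers, int rank) {
        if (peers[rank].cico_buffer == NULL)
            peers[rank].cico_buffer = attach_segment(&peers[rank]);
        return peers[rank].cico_buffer;
    }

    int main(void) {
        peer_t peers[2] = {{NULL, 4096}, {NULL, 4096}};
        void *a = get_cico(peers, 1);
        void *b = get_cico(peers, 1);    /* reuses the cached mapping */
        printf("%s\n", a == b ? "cached" : "re-attached");
        free(peers[0].cico_buffer);
        free(peers[1].cico_buffer);
        return 0;
    }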
- * Though actually, because smsc/xpmem initializes the ref count to 2, - * as a means of keeping the registration around (instead of using the - * flag), our flag here doesn't have much effect. If at some point we - * would wish to actually detach memory in some or all cases, we should - * either call the unmap method twice, or reach out to Open MPI devs and - * inquire about the ref count. */ void *local_ptr; *reg = MCA_SMSC_CALL(map_peer_region, smsc_ep, MCA_RCACHE_FLAGS_PERSIST, peer_vaddr, size, &local_ptr); - if(*reg == NULL) { + if(*reg == NULL) return NULL; - } return local_ptr; } -/* Won't actually unmap/detach, since we've set - * the "persist" flag while creating the mapping */ void mca_coll_xhc_return_registration(xhc_reg_t *reg) { + /* Won't actually unmap/detach, since we've set the + * MCA_RCACHE_FLAGS_PERSIST flag to map_peer_region */ MCA_SMSC_CALL(unmap_peer_region, reg); } diff --git a/ompi/mca/coll/xhc/coll_xhc.h b/ompi/mca/coll/xhc/coll_xhc.h index 0160aa37369..bf6baea634d 100644 --- a/ompi/mca/coll/xhc/coll_xhc.h +++ b/ompi/mca/coll/xhc/coll_xhc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Copyright (c) 2021-2024 Computer Architecture and VLSI Systems (CARV) * Laboratory, ICS Forth. All rights reserved. * $COPYRIGHT$ * @@ -21,14 +21,19 @@ #include "ompi/mca/mca.h" #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/base.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/coll/base/coll_base_util.h" + #include "ompi/communicator/communicator.h" #include "ompi/datatype/ompi_datatype.h" #include "ompi/op/op.h" +#include "opal/class/opal_hash_table.h" #include "opal/mca/shmem/shmem.h" #include "opal/mca/smsc/smsc.h" +#include "opal/util/minmax.h" -#include "coll_xhc_atomic.h" +#include "coll_xhc_intrinsic.h" #define RETURN_WITH_ERROR(var, err, label) do {(var) = (err); goto label;} \ while(0) @@ -41,60 +46,54 @@ #define PEER_IS_LOCAL(peer_info, rank, loc) \ (((peer_info)[(rank)].locality & (loc)) == (loc)) -#define OMPI_XHC_LOC_EXT_BITS (8*(sizeof(xhc_loc_t) - sizeof(opal_hwloc_locality_t))) -#define OMPI_XHC_LOC_EXT_START (8*sizeof(opal_hwloc_locality_t)) +#define XHC_LOC_EXT_BITS (8*(sizeof(xhc_loc_t) - sizeof(opal_hwloc_locality_t))) +#define XHC_LOC_EXT_START (8*sizeof(opal_hwloc_locality_t)) + +#define XHC_CALL_FALLBACK(fns, colltype, op, ...) \ + ((mca_coll_base_module_ ## op ## _fn_t) (fns).coll_fn[colltype]) \ + (__VA_ARGS__, (mca_coll_base_module_t *) (fns).coll_module[colltype]) + +// Change the pointers so that the fallback is always called in the future +#define XHC_INSTALL_FALLBACK(module, comm, colltype, op) do { \ + (comm)->c_coll->coll_ ## op = (mca_coll_base_module_ ## op ## _fn_t) \ + (module)->prev_colls.coll_fn[colltype]; \ + (comm)->c_coll->coll_ ## op ## _module = (mca_coll_base_module_t *) \ + (module)->prev_colls.coll_module[colltype]; \ +} while(0) + +#define WARN_ONCE(...) do { \ + static bool warn_shown = false; \ + if(!warn_shown) { \ + opal_output_verbose(MCA_BASE_VERBOSE_WARN, \ + ompi_coll_base_framework.framework_output, __VA_ARGS__); \ + warn_shown = true; \ + } \ +} while(0) // --- -#define OMPI_XHC_ACK_WIN 0 +// Set according to CPU cache line (safe bet) +#define XHC_ALIGN 128 -// Align to CPU cache line (portable way to obtain it?) 
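The WARN_ONCE macro introduced above relies on a function-local static flag, so a given warning is emitted at most once per call site. A self-contained sketch of the same pattern, with fprintf() standing in for opal_output_verbose():

    #include <stdio.h>

    #define WARN_ONCE(...) do {             \
        static int warned = 0;              \
        if (!warned) {                      \
            fprintf(stderr, __VA_ARGS__);   \
            warned = 1;                     \
        }                                   \
    } while (0)

    int main(void) {
        for (int i = 0; i < 3; i++)
            WARN_ONCE("xhc: falling back for this message size\n");  /* printed once */
        return 0;
    }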
-#define OMPI_XHC_ALIGN 64 +// Chunk size can't be set lower than this +#define XHC_MIN_CHUNK_SIZE 64 // Call opal_progress every this many ticks when busy-waiting -#define OMPI_XHC_OPAL_PROGRESS_CYCLE 10000 - -/* Reduction leader-member load balancing, AKA should leaders reduce data? - * Normally, non-leaders reduce and leaders propagate. But there are instances - * where leaders can/should also help with the group's reduction load. - * - * OMPI_XHC_LB_RLA_TOP_LEVEL: The top level's leader performs reductions - * on the top level as if a common member - * - * OMPI_XHC_LB_RLA_FIRST_CHUNK: Leaders reduce only a single chunk, on - * each level, at the beginning of the operation - * - * (OMPI_XHC_LB_RLA_TOP_LEVEL and OMPI_XHC_LB_RLA_FIRST_CHUNK are combinable) - * - * OMPI_XHC_LB_RLM_ALL: All leaders performs reductions exactly as if - * common members - * - * Generally, we might not want leaders reducing, as that may lead to load - * imbalance, since they will also have to reduce the comm's result(s) - * on upper levels. Unless a leader is also one on all levels! (e.g. the - * top-level leader). This leader should probably be assisting in the - * reduction; otherwise, the only thing he will be doing is checking - * and updating synchronization flags. - * - * Regarding the load balancing problem, the leaders will actually not have - * anything to do until the first chunk is reduced, so they might as well be - * made to help the other members with this first chunk. Keep in mind though, - * this might increase the memory load, and cause this first chunk to take - * slightly more time to be produced. */ -#define OMPI_XHC_LB_RLA_TOP_LEVEL 0x01 -#define OMPI_XHC_LB_RLA_FIRST_CHUNK 0x02 -#define OMPI_XHC_LB_RLA_ALL 0x80 - -enum { - OMPI_XHC_DYNAMIC_REDUCE_DISABLED, - OMPI_XHC_DYNAMIC_REDUCE_NON_FLOAT, - OMPI_XHC_DYNAMIC_REDUCE_ALL -}; - -#define OMPI_XHC_CICO_MAX (mca_coll_xhc_component.cico_max) - -/* For other configuration options and default - * values check coll_xhc_component.c */ +#define XHC_OPAL_PROGRESS_CYCLE 10000 + +#define XHC_PRINT_INFO_HIER_DOT (1 << (sizeof(mca_coll_xhc_component.print_info) * 8 - 4)) +#define XHC_PRINT_INFO_CONFIG (1 << (sizeof(mca_coll_xhc_component.print_info) * 8 - 3)) +#define XHC_PRINT_INFO_ALL (1 << (sizeof(mca_coll_xhc_component.print_info) * 8 - 2)) + +/* While we treat the cache line as 128 bytes (XHC_ALIGN), we calculate + * IMM_SIZE according to conservative 64 bytes, to ensure the immediate data + * is always inside one cache line. Though, it might also be okay if it also + * occupied the next cache line, thanks to the L2 adjacent line prefetcher. + * The results were mixed and thus we chose the simplest of the two options. 
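The XHC_*_IMM_SIZE macros defined just below compute how many payload bytes fit in the remainder of the 64-byte line that starts at the seq field. A toy example of the same offsetof() arithmetic; the struct layout here is illustrative, not xhc_comm_ctrl_t's, and the printed value depends on the ABI.

    #include <stdio.h>
    #include <stddef.h>
    #include <stdint.h>

    typedef struct {
        uint64_t seq;           /* starts the (conservatively 64-byte) line */
        int      leader_rank;
        uint64_t bytes_ready;
        void    *data_vaddr;
        char     imm_data[4];   /* declared small, but may use the rest of the line */
    } toy_ctrl_t;

    int main(void) {
        size_t imm_size = 64 - (offsetof(toy_ctrl_t, imm_data)
                                - offsetof(toy_ctrl_t, seq));
        printf("immediate payload per line: %zu bytes\n", imm_size);
        return 0;
    }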
*/ +#define XHC_BCAST_IMM_SIZE (64 - (offsetof(xhc_comm_ctrl_t, \ + imm_data) - offsetof(xhc_comm_ctrl_t, seq))) +#define XHC_REDUCE_IMM_SIZE (64 - (offsetof(xhc_member_ctrl_t, \ + imm_data) - offsetof(xhc_member_ctrl_t, seq))) // --- @@ -106,6 +105,8 @@ typedef uint32_t xhc_loc_t; typedef void xhc_reg_t; typedef void xhc_copy_data_t; +typedef struct xhc_hierarchy_t xhc_hierarchy_t; + typedef struct mca_coll_xhc_component_t mca_coll_xhc_component_t; typedef struct mca_coll_xhc_module_t mca_coll_xhc_module_t; typedef struct mca_coll_xhc_module_t xhc_module_t; @@ -113,15 +114,18 @@ typedef struct mca_coll_xhc_module_t xhc_module_t; typedef struct xhc_coll_fns_t xhc_coll_fns_t; typedef struct xhc_peer_info_t xhc_peer_info_t; -typedef struct xhc_data_t xhc_data_t; -typedef struct xhc_comm_t xhc_comm_t; +typedef struct xhc_op_mca_t xhc_op_mca_t; +typedef struct xhc_op_config_t xhc_op_config_t; +typedef struct xhc_op_data_t xhc_op_data_t; +typedef struct xhc_comm_t xhc_comm_t; typedef struct xhc_comm_ctrl_t xhc_comm_ctrl_t; typedef struct xhc_member_ctrl_t xhc_member_ctrl_t; typedef struct xhc_member_info_t xhc_member_info_t; -typedef struct xhc_reduce_area_t xhc_reduce_area_t; +typedef struct xhc_sh_slice_t xhc_sh_slice_t; typedef struct xhc_reduce_queue_item_t xhc_rq_item_t; +typedef struct xhc_reduce_area_t xhc_reduce_area_t; typedef struct xhc_rank_range_t xhc_rank_range_t; typedef struct xhc_loc_def_t xhc_loc_def_t; @@ -133,44 +137,94 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(xhc_loc_def_item_t); // ---------------------------------------- -struct xhc_coll_fns_t { - mca_coll_base_module_allreduce_fn_t coll_allreduce; - mca_coll_base_module_t *coll_allreduce_module; - - mca_coll_base_module_barrier_fn_t coll_barrier; - mca_coll_base_module_t *coll_barrier_module; - - mca_coll_base_module_bcast_fn_t coll_bcast; - mca_coll_base_module_t *coll_bcast_module; - - mca_coll_base_module_reduce_fn_t coll_reduce; - mca_coll_base_module_t *coll_reduce_module; -}; +typedef enum xhc_dynamic_reduce_enum_t { + XHC_DYNAMIC_REDUCE_DISABLED = 0, + XHC_DYNAMIC_REDUCE_NON_FLOAT, + XHC_DYNAMIC_REDUCE_ALL +} xhc_dynamic_reduce_enum_t; + +/* Reduce load balancing, AKA what should leaders do? Leaders propagate the + * group's results, and perform reductions on upper level(s). Thus we generally + * don't want them to not do reductions inside the group, so they can focus on + * the upper ones. However, in certain instances, like at the start of the op + * (when no chunks are ready to be reduced on the upper levels), or on the top + * level, the leaders do not actually have any other work to do, so might as + * well enlist the for the intra-group reductions. + * + * - ASSIST_TOP_LEVEL: The top level's leader performs reductions + * (on the top level), as if a common member. + * - ASSIST_FIRST_CHUNK: Leaders reduce only a single chunk, + * on each level, at the beginning of the operation. + * - ASSIST_ALL: All leaders performs reductions exactly as if common members + * + * (ASSIST_TOP_LEVEL and ASSIST_FIRST_CHUNK are combinable) */ +typedef enum xhc_reduce_load_balance_enum_t { + XHC_REDUCE_LB_LEADER_ASSIST_TOP_LEVEL = 0x01, + XHC_REDUCE_LB_LEADER_ASSIST_FIRST_CHUNK = 0x02, + XHC_REDUCE_LB_LEADER_ASSIST_ALL = 0x80 +} xhc_reduce_load_balance_enum_t; + +/* Changes to XHC_COLLTYPE_T have to be reflected in: + * 1. xhc_colltype_to_universal_map[] + * 2. xhc_colltype_to_c_coll_fn_offset_map[] + * 3. xhc_colltype_to_c_coll_module_offset_map[] + * 4. 
xhc_colltype_to_coll_base_fn_offset_map[] */ +typedef enum XHC_COLLTYPE_T { + XHC_BCAST = 0, + XHC_BARRIER, + XHC_REDUCE, + XHC_ALLREDUCE, + + XHC_COLLCOUNT +} XHC_COLLTYPE_T; + +typedef enum xhc_config_source_t { + XHC_CONFIG_SOURCE_INFO_GLOBAL = 0, + XHC_CONFIG_SOURCE_INFO_OP, + XHC_CONFIG_SOURCE_MCA_GLOBAL, + XHC_CONFIG_SOURCE_MCA_OP, + + XHC_CONFIG_SOURCE_COUNT +} xhc_config_source_t; + +typedef enum xhc_copy_method_t { + /* 0 is reserved */ + XHC_COPY_IMM = 1, + XHC_COPY_CICO, + XHC_COPY_SMSC_NO_MAP, + XHC_COPY_SMSC_MAP, + XHC_COPY_SMSC, /* kind of a catch-all value for ops that don't + * make a distinction between map/no_map */ +} xhc_copy_method_t; struct mca_coll_xhc_component_t { mca_coll_base_component_t super; int priority; - bool print_info; + uint print_info; char *shmem_backing; + size_t memcpy_chunk_size; + bool dynamic_leader; int barrier_root; + int allreduce_root; - int dynamic_reduce; - int lb_reduce_leader_assist; - - bool force_reduce; + xhc_dynamic_reduce_enum_t dynamic_reduce; + xhc_reduce_load_balance_enum_t reduce_load_balance; bool uniform_chunks; size_t uniform_chunks_min; - size_t cico_max; + struct xhc_op_mca_t { + char *hierarchy; + char *chunk_size; + size_t cico_max; + } op_mca[XHC_COLLCOUNT]; - char *hierarchy_mca; - char *chunk_size_mca; + xhc_op_mca_t op_mca_global; }; struct mca_coll_xhc_module_t { @@ -178,172 +232,225 @@ struct mca_coll_xhc_module_t { /* pointers to functions/modules of * previous coll components for fallback */ - xhc_coll_fns_t prev_colls; + struct xhc_coll_fns_t { + void (*coll_fn[XHC_COLLCOUNT])(void); + void *coll_module[XHC_COLLCOUNT]; + } prev_colls; - // copied from comm + // copied from OMPI comm int comm_size; int rank; - // list of localities to consider during grouping - char *hierarchy_string; - xhc_loc_t *hierarchy; - int hierarchy_len; + // --- - // list of requested chunk sizes, to be applied to comms - size_t *chunks; - int chunks_len; + bool zcopy_support; + bool zcopy_map_support; // temporary (private) internal buffer, for methods like Reduce void *rbuf; size_t rbuf_size; - // xhc-specific info for every other rank in the comm - xhc_peer_info_t *peer_info; + // book-keeping for info on other ranks + struct xhc_peer_info_t { + xhc_loc_t locality; + ompi_proc_t *proc; - xhc_data_t *data; + mca_smsc_endpoint_t *smsc_ep; - bool init; -}; + opal_shmem_ds_t cico_ds; + void *cico_buffer; + } *peer_info; -struct xhc_peer_info_t { - xhc_loc_t locality; + // --- - ompi_proc_t *proc; - mca_smsc_endpoint_t *smsc_ep; + opal_hash_table_t hierarchy_cache; - opal_shmem_ds_t cico_ds; - void *cico_buffer; -}; + struct xhc_op_config_t { + char *hierarchy_string; + xhc_loc_t *hierarchy; + int hierarchy_len; -struct xhc_data_t { - xhc_comm_t *comms; - int comm_count; + char *chunk_string; + size_t *chunks; + int chunks_len; + + size_t cico_max; + + xhc_config_source_t hierarchy_source; + xhc_config_source_t chunk_source; + xhc_config_source_t cico_max_source; + } op_config[XHC_COLLCOUNT]; + + struct xhc_op_data_t { + XHC_COLLTYPE_T colltype; + xf_sig_t seq; + + xhc_comm_t *comms; + int comm_count; + + bool init; + } op_data[XHC_COLLCOUNT]; - xf_sig_t pvt_coll_seq; + bool init; + bool error; }; +// ----- + struct xhc_comm_t { xhc_loc_t locality; + size_t chunk_size; + size_t cico_size; + int my_id; int size; - int manager_rank; - int member_id; + + int owner_rank; // --- + /* Op state (any) */ + // Am I a leader in the current collective? - bool is_coll_leader; - - // Have handshaked with all members in the current op? 
(useful to leader) - bool all_joined; - - /* A reduce set defines a range/area of data to be reduced, and its - * settings. We require multiple areas, because there might be different - * circumstances: - * - * 1. Under certain load balancing policies, leaders perform reductions - * for the just one chunk, and then they don't. Thus, the worker count - * changes, and the settings have to recomputed for the next areas. - * - * 2. During the "middle" of the operation, all members continuously - * reduce data in maximum-sized pieces (according to the configured - * chunk size). But, towards the end of the operation, the remaining - * elements are less than ((workers * elem_chunk)), we have to - * recalculate `elem_chunk`, so that all workers will perform - * equal work. */ + bool is_leader; + + // Op-specific state tracking + int op_state; + + // Book-keeping for multi-sliced shared areas + struct xhc_sh_slice_t { + bool in_use; // slice currently in use + + xf_sig_t seq; // if in use, the seq number of the op + size_t len; // if in use, the message size of the op + bool is_cico; // if in use, whether the op is CICO type + } *slices; + int n_slices; + + int slice_id; // designated slice ID for this op + bool slice_ready; // is the designed slice ready to use? + + // --- + + /* Reduce op state */ + + int leader_id; + bool do_all_work; + + // see init_reduce_areas() struct xhc_reduce_area_t { size_t start; // where the area begins size_t len; // the size of the area int workers; // how many processes perform reductions in the area size_t stride; /* how much to advance inside the area after - * each reduction, unused for non-combo areas */ + * each reduction, unused for non-combo areas */ // local process settings size_t work_begin; // where to begin the first reduction from size_t work_end; // up to where to reduce size_t work_chunk; // how much to reduce each time size_t work_leftover; /* assigned leftover elements to include as - * part of the last reduction in the area */ - } reduce_area[3]; + * part of the last reduction in the area */ + } reduce_areas[3]; int n_reduce_areas; + // To keep track of reduce progress for different peers + opal_list_t *reduce_queue; + + /* These are cached copies of the respective fields in my_ctrl. + * We want to minimize accesses to the shared variables. 
*/ + xf_size_t reduce_ready; + xf_size_t reduce_done; + struct xhc_member_info_t { xhc_reg_t *sbuf_reg, *rbuf_reg; void *sbuf, *rbuf; - bool init; + bool attach, join; } *member_info; - // Queue to keep track of individual reduction progress for different peers - opal_list_t *reduce_queue; + xhc_member_info_t *my_info; // = &member_info[my_id] // --- + /* Shared structs */ + xhc_comm_ctrl_t *comm_ctrl; + xhc_member_ctrl_t *member_ctrl; + xhc_member_ctrl_t *my_ctrl; // = &member_ctrl[my_id] - opal_shmem_ds_t ctrl_ds; + // Shared CICO buffer for sharing data to be reduced + volatile char *reduce_buffer; + + /* Original addresses of these potentially multi-sliced fields, as + * the respective pointers above may be modified in per-op basis */ + void *comm_ctrl_base; + void *member_ctrl_base; + void *reduce_buffer_base; + + opal_shmem_ds_t comm_ds; // --- - xhc_member_ctrl_t *my_member_ctrl; // = &member_ctrl[member_id] - xhc_member_info_t *my_member_info; // = &member_info[member_id] + xhc_comm_t *up, *down; + xhc_comm_t *top, *bottom; + bool is_top, is_bottom; }; -struct xhc_comm_ctrl_t { - // We want leader_seq, coll_ack, coll_seq to all lie in their own cache lines +// ----- +struct xhc_comm_ctrl_t { + // leader_seq, ack, seq in different cache lines volatile xf_sig_t leader_seq; + volatile xf_sig_t ack __attribute__((aligned(XHC_ALIGN))); + volatile xf_sig_t seq __attribute__((aligned(XHC_ALIGN))); - volatile xf_sig_t coll_ack __attribute__((aligned(OMPI_XHC_ALIGN))); - - volatile xf_sig_t coll_seq __attribute__((aligned(OMPI_XHC_ALIGN))); - - /* - Reason *NOT* to keep below fields in the same cache line as coll_seq: - * - * While members busy-wait on leader's coll_seq, initializing the rest of - * the fields will trigger cache-coherency-related "invalidate" and then - * "read miss" messages, for each store. - * - * - Reason to *DO* keep below fields in the same cache line as coll_seq: - * - * Members load from coll_seq, and implicitly fetch the entire cache - * line, which also contains the values of the other fields, that will - * also need to be loaded soon. - * - * (not 100% sure of my description here) - * - * Bcast seemed to perform better with the second option, so I went with - * that one. The best option might also be influenced by the ranks' order - * of entering in the operation. - */ - - // "Guarded" by members' coll_seq - volatile int leader_id; - volatile int leader_rank; - volatile int cico_id; - - void* volatile data_vaddr; - volatile xf_size_t bytes_ready; - - char access_token[]; -} __attribute__((aligned(OMPI_XHC_ALIGN))); + // below fields in same cache line as seq -struct xhc_member_ctrl_t { - volatile xf_sig_t member_ack; // written by member + union { + struct { + volatile int leader_rank; + + volatile xf_size_t bytes_ready; + void* volatile data_vaddr; + }; - // written by member, at beginning of operation - volatile xf_sig_t member_seq __attribute__((aligned(OMPI_XHC_ALIGN))); - volatile int rank; + /* sized 4 here, but will actually use all that's left + * of the cache line. Keep it last in the struct. */ + volatile char imm_data[4]; + }; - void* volatile sbuf_vaddr; - void* volatile rbuf_vaddr; - volatile int cico_id; + /* imm_data can overwrite access_token. That's + * okay, the two aren't used at the same time. 
*/ + volatile char access_token[]; +} __attribute__((aligned(XHC_ALIGN))); + +struct xhc_member_ctrl_t { + volatile xf_sig_t ack; + volatile xf_sig_t seq __attribute__((aligned(XHC_ALIGN))); + + // below fields in same cache line as seq - // reduction progress counters, written by member volatile xf_size_t reduce_ready; volatile xf_size_t reduce_done; -} __attribute__((aligned(OMPI_XHC_ALIGN))); + + union { + struct { + void* volatile sbuf_vaddr; + void* volatile rbuf_vaddr; + + volatile int rank; + volatile bool is_leader; + }; + + /* sized 4 here, but will actually use all that's left + * of the cache line. Keep it last in the struct. */ + volatile char imm_data[4]; + }; +} __attribute__((aligned(XHC_ALIGN))); + +// ----- struct xhc_reduce_queue_item_t { opal_list_item_t super; @@ -352,18 +459,15 @@ struct xhc_reduce_queue_item_t { int area_id; // current reduce area }; -// ---------------------------------------- - -struct xhc_rank_range_t { - int start_rank, end_rank; -}; - struct xhc_loc_def_t { opal_list_item_t super; opal_hwloc_locality_t named_loc; - xhc_rank_range_t *rank_list; + struct xhc_rank_range_t { + int start_rank, end_rank; + } *rank_list; + int rank_list_len; int split; @@ -374,27 +478,74 @@ struct xhc_loc_def_t { // ---------------------------------------- +typedef struct xhc_bcast_ctx_t { + void *buf; + size_t datacount; + ompi_datatype_t *datatype; + int root; + ompi_communicator_t *ompi_comm; + xhc_module_t *module; + + int rank; + + xhc_op_data_t *data; + + xf_sig_t seq; + xhc_comm_t *comms; + + xhc_comm_t *src_comm; + xhc_copy_method_t method; + + void *self_cico; + void *src_buffer; + + xhc_copy_data_t *region_data; + xhc_reg_t *reg; + + size_t bytes_total; + size_t bytes_avail; + size_t bytes_done; +} xhc_bcast_ctx_t; + +// ---------------------------------------- + // coll_xhc_component.c // -------------------- -#define xhc_component_parse_hierarchy(...) mca_coll_xhc_component_parse_hierarchy(__VA_ARGS__) -#define xhc_component_parse_chunk_sizes(...) mca_coll_xhc_component_parse_chunk_sizes(__VA_ARGS__) +#define xhc_colltype_to_universal(...) \ + mca_coll_xhc_colltype_to_universal(__VA_ARGS__) +#define xhc_colltype_to_str(...) \ + mca_coll_xhc_colltype_to_str(__VA_ARGS__) +#define xhc_config_source_to_str(...) \ + mca_coll_xhc_config_source_to_str(__VA_ARGS__) + +#define xhc_component_parse_hierarchy(...) \ + mca_coll_xhc_component_parse_hierarchy(__VA_ARGS__) +#define xhc_component_parse_chunk_sizes(...) \ + mca_coll_xhc_component_parse_chunk_sizes(__VA_ARGS__) +#define xhc_component_parse_cico_max(...) \ + mca_coll_xhc_component_parse_cico_max(__VA_ARGS__) int mca_coll_xhc_component_init_query(bool enable_progress_threads, bool enable_mpi_threads); +COLLTYPE_T mca_coll_xhc_colltype_to_universal(XHC_COLLTYPE_T xhc_colltype); +const char *mca_coll_xhc_colltype_to_str(XHC_COLLTYPE_T colltype); +const char *mca_coll_xhc_config_source_to_str(xhc_config_source_t source); int mca_coll_xhc_component_parse_hierarchy(const char *val_str, opal_list_t **level_defs_dst, int *nlevel_defs_dst); int mca_coll_xhc_component_parse_chunk_sizes(const char *val_str, size_t **vals_dst, int *len_dst); +int mca_coll_xhc_component_parse_cico_max(const char *val_str, + size_t *cico_max_dst); // coll_xhc_module.c // ----------------- -#define xhc_module_install_fns(...) mca_coll_xhc_module_install_fns(__VA_ARGS__) -#define xhc_module_install_fallback_fns(...) mca_coll_xhc_module_install_fallback_fns(__VA_ARGS__) - -#define xhc_module_prepare_hierarchy(...) 
mca_coll_xhc_module_prepare_hierarchy(__VA_ARGS__) +#define xhc_module_set_coll_fns(...) \ + mca_coll_xhc_module_set_coll_fns(__VA_ARGS__) +#define xhc_module_prepare_hierarchy(...) \ + mca_coll_xhc_module_prepare_hierarchy(__VA_ARGS__) mca_coll_base_module_t *mca_coll_xhc_module_comm_query( ompi_communicator_t *comm, int *priority); @@ -404,22 +555,22 @@ int mca_coll_xhc_module_enable(mca_coll_base_module_t *module, int mca_coll_xhc_module_disable(mca_coll_base_module_t *module, ompi_communicator_t *comm); -void mca_coll_xhc_module_install_fallback_fns(xhc_module_t *module, - ompi_communicator_t *comm, xhc_coll_fns_t *prev_fns_dst); -void mca_coll_xhc_module_install_fns(xhc_module_t *module, - ompi_communicator_t *comm, xhc_coll_fns_t fns); - -int mca_coll_xhc_module_prepare_hierarchy(mca_coll_xhc_module_t *module, - ompi_communicator_t *comm); +void mca_coll_xhc_module_set_coll_fns(ompi_communicator_t *comm, + xhc_coll_fns_t *new_fns, xhc_coll_fns_t *save_fns); // coll_xhc.c // ---------- #define xhc_lazy_init(...) mca_coll_xhc_lazy_init(__VA_ARGS__) +#define xhc_init_op(...) mca_coll_xhc_init_op(__VA_ARGS__) #define xhc_fini(...) mca_coll_xhc_fini(__VA_ARGS__) +#define xhc_read_op_config(...) mca_coll_xhc_read_op_config(__VA_ARGS__) #define xhc_get_cico(...) mca_coll_xhc_get_cico(__VA_ARGS__) +#define xhc_shmem_create(...) mca_coll_xhc_shmem_create(__VA_ARGS__) +#define xhc_shmem_attach(...) mca_coll_xhc_shmem_attach(__VA_ARGS__) + #define xhc_copy_expose_region(...) mca_coll_xhc_copy_expose_region(__VA_ARGS__) #define xhc_copy_region_post(...) mca_coll_xhc_copy_region_post(__VA_ARGS__) #define xhc_copy_from(...) mca_coll_xhc_copy_from(__VA_ARGS__) @@ -429,11 +580,21 @@ int mca_coll_xhc_module_prepare_hierarchy(mca_coll_xhc_module_t *module, #define xhc_return_registration(...) mca_coll_xhc_return_registration(__VA_ARGS__) int mca_coll_xhc_lazy_init(mca_coll_xhc_module_t *module, ompi_communicator_t *comm); +int mca_coll_xhc_init_op(xhc_module_t *module, ompi_communicator_t *comm, + XHC_COLLTYPE_T colltype); void mca_coll_xhc_fini(mca_coll_xhc_module_t *module); +int mca_coll_xhc_read_op_config(xhc_module_t *module, + ompi_communicator_t *comm, XHC_COLLTYPE_T colltype); + +void *mca_coll_xhc_shmem_create(opal_shmem_ds_t *seg_ds, size_t size, + ompi_communicator_t *ompi_comm, const char *name, int id1, int id2); +void *mca_coll_xhc_shmem_attach(opal_shmem_ds_t *seg_ds); + void *mca_coll_xhc_get_cico(xhc_peer_info_t *peer_info, int rank); -int mca_coll_xhc_copy_expose_region(void *base, size_t len, xhc_copy_data_t **region_data); +int mca_coll_xhc_copy_expose_region(void *base, size_t len, + xhc_copy_data_t **region_data); void mca_coll_xhc_copy_region_post(void *dst, xhc_copy_data_t *region_data); int mca_coll_xhc_copy_from(xhc_peer_info_t *peer_info, void *dst, void *src, size_t size, void *access_token); @@ -443,6 +604,26 @@ void *mca_coll_xhc_get_registration(xhc_peer_info_t *peer_info, void *peer_vaddr, size_t size, xhc_reg_t **reg); void mca_coll_xhc_return_registration(xhc_reg_t *reg); +// coll_xhc_comm.c +// -------------------- + +#define xhc_comms_make(...) mca_coll_xhc_comms_make(__VA_ARGS__) +#define xhc_comms_destroy(...) mca_coll_xhc_comms_destroy(__VA_ARGS__) + +int mca_coll_xhc_comms_make(ompi_communicator_t *ompi_comm, + xhc_module_t *module, xhc_op_config_t *config, xhc_op_data_t *data); + +void mca_coll_xhc_comms_destroy(xhc_comm_t *comms, int comm_count); + +// coll_xhc_hierarchy.c +// -------------------- + +#define xhc_hierarchy_make(...) 
mca_coll_xhc_hierarchy_make(__VA_ARGS__) + +int mca_coll_xhc_hierarchy_make(xhc_module_t *module, + ompi_communicator_t *comm, const char *hierarchy_string, + xhc_loc_t **hierarchy_dst, int *hierarchy_len_dst); + // Primitives (respective file) // ---------------------------- @@ -460,8 +641,30 @@ int mca_coll_xhc_allreduce(const void *sbuf, void *rbuf, size_t count, ompi_datatype_t *datatype, ompi_op_t *op, ompi_communicator_t *comm, mca_coll_base_module_t *module); -// Miscellaneous -// ------------- +// coll_xhc_bcast.c +// ---------------- + +#define xhc_bcast_notify(...) mca_coll_xhc_bcast_notify(__VA_ARGS__) +#define xhc_bcast_init(...) mca_coll_xhc_bcast_init(__VA_ARGS__) +#define xhc_bcast_start(...) mca_coll_xhc_bcast_start(__VA_ARGS__) +#define xhc_bcast_work(...) mca_coll_xhc_bcast_work(__VA_ARGS__) +#define xhc_bcast_ack(...) mca_coll_xhc_bcast_ack(__VA_ARGS__) +#define xhc_bcast_fini(...) mca_coll_xhc_bcast_fini(__VA_ARGS__) + +void mca_coll_xhc_bcast_notify(xhc_bcast_ctx_t *ctx, + xhc_comm_t *xc, size_t bytes_ready); + +int mca_coll_xhc_bcast_init(void *buf, size_t count, ompi_datatype_t *datatype, + int root, ompi_communicator_t *ompi_comm, xhc_module_t *module, + xhc_bcast_ctx_t *ctx_dst); + +int mca_coll_xhc_bcast_start(xhc_bcast_ctx_t *ctx); +int mca_coll_xhc_bcast_work(xhc_bcast_ctx_t *ctx); +void mca_coll_xhc_bcast_ack(xhc_bcast_ctx_t *ctx); +void mca_coll_xhc_bcast_fini(xhc_bcast_ctx_t *ctx); + +// coll_xhc_allreduce.c +// -------------------- #define xhc_allreduce_internal(...) mca_coll_xhc_allreduce_internal(__VA_ARGS__) @@ -471,7 +674,8 @@ int mca_coll_xhc_allreduce_internal(const void *sbuf, void *rbuf, size_t count, // ---------------------------------------- -// Rollover-safe check that flag has reached/exceeded thresh, with max deviation +/* Rollover-safe check that _flag_ has reached _thresh_, + * without having exceeded it by more than _win_. */ static inline bool CHECK_FLAG(volatile xf_sig_t *flag, xf_sig_t thresh, xf_sig_t win) { @@ -485,7 +689,7 @@ static inline void WAIT_FLAG(volatile xf_sig_t *flag, bool ready = false; do { - for(int i = 0; i < OMPI_XHC_OPAL_PROGRESS_CYCLE; i++) { + for(int i = 0; i < XHC_OPAL_PROGRESS_CYCLE; i++) { if(CHECK_FLAG(flag, thresh, win)) { ready = true; break; @@ -501,12 +705,31 @@ static inline void WAIT_FLAG(volatile xf_sig_t *flag, "thresh = %d\n", win, f, thresh); */ } - if(!ready) { + if(!ready) opal_progress(); - } } while(!ready); } +/* Modeled after smsc/xpmem. We don't currently want to deviate from its + * behaviour. But this adds a second parameter (additionally to smsc/xpmem's) + * to set if one wants to adjust the limit. Ideally there would be a singular + * opal-level MCA parameter for this, and/or an opal_memcpy that would do the + * job that xhc_mempy and smsc/xpmem's respective function do. 
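The CHECK_FLAG/WAIT_FLAG primitives above compare monotonically increasing sequence numbers that may eventually wrap around. A sketch of a rollover-safe check with the semantics described there ("reached thresh, without exceeding it by more than win"), assuming a 32-bit counter; the real xf_sig_t width may differ.

    #include <stdio.h>
    #include <stdint.h>

    static int check_flag(uint32_t flag, uint32_t thresh, uint32_t win) {
        /* unsigned subtraction keeps the comparison valid across wraparound */
        return (uint32_t) (flag - thresh) <= win;
    }

    int main(void) {
        printf("%d\n", check_flag(5, 5, 0));             /* 1: exactly reached */
        printf("%d\n", check_flag(4, 5, 0));             /* 0: not there yet */
        printf("%d\n", check_flag(2, 0xFFFFFFFFu, 3));   /* 1: reached after rollover */
        return 0;
    }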
*/ +static inline void xhc_memcpy(void *dst, const void *src, size_t size) { + for(size_t copied = 0; copied < size; ) { + size_t chunk = opal_min(size - copied, + mca_coll_xhc_component.memcpy_chunk_size); + + memcpy((char *) dst + copied, (char *) src + copied, chunk); + copied += chunk; + } +} + +static inline void xhc_memcpy_offset(void *dst, const void *src, + size_t offset, size_t size) { + xhc_memcpy((char *) dst + offset, (char *) src + offset, size); +} + // ---------------------------------------- END_C_DECLS diff --git a/ompi/mca/coll/xhc/coll_xhc_allreduce.c b/ompi/mca/coll/xhc/coll_xhc_allreduce.c index 65bdaa9848c..9eb633a2e47 100644 --- a/ompi/mca/coll/xhc/coll_xhc_allreduce.c +++ b/ompi/mca/coll/xhc/coll_xhc_allreduce.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Copyright (c) 2021-2024 Computer Architecture and VLSI Systems (CARV) * Laboratory, ICS Forth. All rights reserved. * $COPYRIGHT$ * @@ -22,40 +22,59 @@ #include "coll_xhc.h" -#define MAX_REDUCE_AREAS(comm) \ - ((int)(sizeof((comm)->reduce_area)/sizeof((comm)->reduce_area[0]))) +// ----------------------------- + +#define COMM_ALL_JOINED 0x01 +#define COMM_REDUCE_FINI 0x02 OBJ_CLASS_INSTANCE(xhc_rq_item_t, opal_list_item_t, NULL, NULL); +static inline char *CICO_BUFFER(xhc_comm_t *xc, int member) { + return (char *) xc->reduce_buffer + member * xc->cico_size; +} + // ----------------------------- -/* For the reduction areas, see comments in xhc_reduce_area_t's definition. - * For the leader reduction assistance policies see the flag definitions. */ -static void init_reduce_areas(xhc_comm_t *comms, - int comm_count, int allreduce_count, size_t dtype_size) { +/* Calculate/generate the reduce sets/areas. A reduce set defines a + * range/area of data to be reduced, and its settings. We require + * multiple areas, because there might be different circumstances: + * + * 1. Under certain load balancing policies, leaders perform reductions + * for just one chunk, and then they don't. Thus, the worker count + * changes, and the settings have to recomputed for the next areas. + * + * 2. During the "middle" of the operation, all members continuously + * reduce data in maximum-sized pieces (according to the configured + * chunk size). But, towards the end of the operation, the remaining + * elements are less than ((workers * elem_chunk)), we have to + * recalculate `elem_chunk`, so that all workers will perform + * equal work. + * + * See also the comments in the reduce area struct definition for the different + * fields, and the comments in the defintion of xhc_reduce_load_balance_enum_t. 
*/ +static void init_reduce_areas(xhc_comm_t *comms, size_t allreduce_count, + size_t dtype_size, xhc_reduce_load_balance_enum_t lb_policy) { bool uniform_chunks = mca_coll_xhc_component.uniform_chunks; - int lb_rla = mca_coll_xhc_component.lb_reduce_leader_assist; - for(int i = 0; i < comm_count; i++) { - xhc_comm_t *xc = &comms[i]; + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { + int max_reduce_areas = sizeof(xc->reduce_areas)/sizeof(xc->reduce_areas[0]); + int avail_workers[max_reduce_areas]; - int avail_workers[MAX_REDUCE_AREAS(xc)]; - - for(int area_id = 0; area_id < MAX_REDUCE_AREAS(xc); area_id++) { + for(int area_id = 0; area_id < max_reduce_areas; area_id++) { int workers = xc->size - 1; - if(lb_rla & OMPI_XHC_LB_RLA_TOP_LEVEL) { - if(i == comm_count - 1 && workers < xc->size) + if(lb_policy & XHC_REDUCE_LB_LEADER_ASSIST_TOP_LEVEL) { + if(xc->is_top && workers < xc->size) workers++; } - if(lb_rla & OMPI_XHC_LB_RLA_FIRST_CHUNK) { + if(lb_policy & XHC_REDUCE_LB_LEADER_ASSIST_FIRST_CHUNK) { if(area_id == 0 && workers < xc->size) workers++; } - if(lb_rla & OMPI_XHC_LB_RLA_ALL) { + if(lb_policy & XHC_REDUCE_LB_LEADER_ASSIST_ALL) { workers = xc->size; } @@ -63,23 +82,24 @@ static void init_reduce_areas(xhc_comm_t *comms, } // Min/max work that a worker may perform (one step) - int min_elems = mca_coll_xhc_component.uniform_chunks_min / dtype_size; - int max_elems = xc->chunk_size / dtype_size; + size_t min_elems = mca_coll_xhc_component.uniform_chunks_min / dtype_size; + size_t max_elems = xc->chunk_size / dtype_size; - int area_id = 0, el_idx = 0; + int area_id = 0; + size_t el_idx = 0; - while(area_id < MAX_REDUCE_AREAS(xc) && el_idx < allreduce_count) { - xhc_reduce_area_t *area = &xc->reduce_area[area_id]; + while(area_id < max_reduce_areas && el_idx < allreduce_count) { + xhc_reduce_area_t *area = &xc->reduce_areas[area_id]; *area = (xhc_reduce_area_t) {0}; - int remaining = allreduce_count - el_idx; + size_t remaining = allreduce_count - el_idx; int workers = avail_workers[area_id]; - int elems_per_member; - int repeat = 0; + size_t elems_per_member; + size_t repeat = 0; - int area_elems = opal_min(max_elems * workers, remaining); + size_t area_elems = opal_min(max_elems * workers, remaining); /* We should consider the future size of the next area. If it's * too small in relation to the minimum chunk (min_elems), some @@ -90,20 +110,21 @@ static void init_reduce_areas(xhc_comm_t *comms, * especially few, we make this area absorb the next one. * Specifically, we absorb it if the increase of each worker's * load is no more than 10% of the maximum load set. */ - if(uniform_chunks && area_id < MAX_REDUCE_AREAS(xc) - 1) { + if(uniform_chunks && area_id < max_reduce_areas - 1) { int next_workers = avail_workers[area_id+1]; - int next_remaining = allreduce_count - (el_idx + area_elems); + size_t next_remaining = allreduce_count - (el_idx + area_elems); if(next_remaining < next_workers * min_elems) { - if(next_remaining/workers <= max_elems/10) { + if(next_remaining/workers <= max_elems/10) area_elems += next_remaining; - } else { - int ideal_donate = next_workers * min_elems - next_remaining; + else { + size_t ideal_donate = + next_workers * min_elems - next_remaining; /* Don't donate so much elements that this area * won't cover its own min reduction chunk size */ - int max_donate = area_elems - workers * min_elems; - max_donate = (max_donate > 0 ? 
max_donate : 0); + ssize_t max_donate = area_elems - workers * min_elems; + max_donate = opal_max(max_donate, 0); area_elems -= opal_min(ideal_donate, max_donate); } @@ -126,8 +147,8 @@ static void init_reduce_areas(xhc_comm_t *comms, // If this is the middle area, try to maximize its size if(area_id == 1 && workers > 0) { - int set = workers * elems_per_member; - repeat = (int)((remaining-area_elems)/set); + size_t set = workers * elems_per_member; + repeat = (size_t)((remaining-area_elems)/set); area_elems += repeat * set; } @@ -140,7 +161,7 @@ static void init_reduce_areas(xhc_comm_t *comms, * the one with ID=0, because currently only member 0 becomes * the leader, and the leader is the only one that might not * be reducing. */ - int worker_id = xc->member_id - (xc->size - avail_workers[area_id]); + int worker_id = xc->my_id - (xc->size - avail_workers[area_id]); area->work_begin = el_idx + worker_id * elems_per_member; area->work_chunk = (worker_id >= 0 && worker_id < workers ? @@ -148,12 +169,11 @@ static void init_reduce_areas(xhc_comm_t *comms, area->work_leftover = 0; - int leftover_elems = (workers > 0 ? + size_t leftover_elems = (workers > 0 ? (area_elems % (workers * elems_per_member)) : area_elems); if(leftover_elems) { - if(worker_id == (uniform_chunks ? workers - 1 : workers)) { + if(worker_id == (uniform_chunks ? workers - 1 : workers)) area->work_leftover = leftover_elems; - } } area->work_end = area->work_begin + (repeat * area->stride) @@ -169,334 +189,493 @@ static void init_reduce_areas(xhc_comm_t *comms, // Erase zero-work areas while(xc->n_reduce_areas > 0 - && xc->reduce_area[xc->n_reduce_areas - 1].work_chunk == 0 - && xc->reduce_area[xc->n_reduce_areas - 1].work_leftover == 0) { + && xc->reduce_areas[xc->n_reduce_areas - 1].work_chunk == 0 + && xc->reduce_areas[xc->n_reduce_areas - 1].work_leftover == 0) { xc->n_reduce_areas--; } /* If not a leader on this comm, nothing * to do on next ones whatsoever */ - if(!xc->is_coll_leader) { + if(!xc->is_leader) break; - } } } -static void xhc_allreduce_init_local(xhc_comm_t *comms, int comm_count, - size_t allreduce_count, size_t dtype_size, xf_sig_t seq) { +static void xhc_allreduce_init_local(xhc_comm_t *comms, + size_t allreduce_count, size_t dtype_size, XHC_COLLTYPE_T colltype, + xhc_reduce_load_balance_enum_t lb_policy, xf_sig_t seq) { - for(int i = 0; i < comm_count; i++) { - xhc_comm_t *xc = &comms[i]; + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { + // Non-leader by default + xc->is_leader = false; + xc->leader_id = -1; - xc->is_coll_leader = false; + xc->op_state = 0; - for(int m = 0; m < xc->size; m++) { - xc->member_info[m] = (xhc_member_info_t) {0}; - } + /* Reduce is multi-sliced. In Allreduce, the slice's + * readiness is guaranteed with other mechanisms. */ + xc->slice_ready = (colltype != XHC_REDUCE); + xc->slice_id = seq % xc->n_slices; - xc->all_joined = false; - } + /* Set pointers to the current slice's shared sync structs */ - for(int i = 0; i < comm_count; i++) { - xhc_comm_t *xc = &comms[i]; + xc->member_ctrl = (void *) ((char *) xc->member_ctrl_base + + xc->size * sizeof(xhc_member_ctrl_t) * xc->slice_id); + xc->my_ctrl = &xc->member_ctrl[xc->my_id]; + + xc->reduce_buffer = (char *) xc->reduce_buffer_base + + xc->size * xc->cico_size * xc->slice_id; + + for(int m = 0; m < xc->size; m++) + xc->member_info[m] = (xhc_member_info_t) {0}; + } - /* The manager is the leader. Even in the dynamic reduce case, + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { + /* The owner is the leader. 
Even in the dynamic reduce case, * there (currently) shouldn't be any real benefit from the * leader being dynamic in allreduce. */ - if(xc->member_id != 0) { + if(xc->my_id != 0) break; - } - xc->comm_ctrl->leader_seq = seq; - xc->is_coll_leader = true; + xc->is_leader = true; + xc->leader_id = xc->my_id; + xc->do_all_work = false; } - init_reduce_areas(comms, comm_count, allreduce_count, dtype_size); - - for(int i = 0; i < comm_count; i++) { - xhc_comm_t *xc = &comms[i]; + init_reduce_areas(comms, allreduce_count, dtype_size, lb_policy); - size_t initial_count = (xc->n_reduce_areas > 0 ? - xc->reduce_area[0].work_begin : allreduce_count); + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { + size_t init_count = (xc->n_reduce_areas > 0 ? + xc->reduce_areas[0].work_begin : allreduce_count); int m = 0; OPAL_LIST_FOREACH_DECL(item, xc->reduce_queue, xhc_rq_item_t) { - if(m == xc->member_id) { - m++; - } + if(m == xc->my_id) m++; *item = (xhc_rq_item_t) {.super = item->super, .member = m++, - .count = initial_count, .area_id = 0}; + .count = init_count, .area_id = 0}; } - if(!xc->is_coll_leader) { - break; + /* Check if this member is the only one that does any reductions + * on this comm. Useful on the top level to know if the results + * can be placed directly on the root's rbuf. */ + xc->do_all_work = true; + for(int a = 0; a < xc->n_reduce_areas; a++) { + xhc_reduce_area_t *area = &xc->reduce_areas[a]; + + if(area->work_begin != area->start + || area->work_end != area->len) { + xc->do_all_work = false; + break; + } } + + if(!xc->is_leader) + break; } } -static void xhc_allreduce_init_comm(xhc_comm_t *comms, int comm_count, - void *rbuf, bool do_cico, int ompi_rank, xf_sig_t seq) { +static void xhc_allreduce_init_comm(xhc_comm_t *comms, + int rank, xf_sig_t seq) { - for(int i = 0; i < comm_count; i++) { - xhc_comm_t *xc = &comms[i]; + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { + if(!xc->is_leader) break; - if(!xc->is_coll_leader) { - break; - } + WAIT_FLAG(&xc->comm_ctrl->ack, seq - 1, 0); - WAIT_FLAG(&xc->comm_ctrl->coll_ack, seq - 1, 0); + /* The rest of the fields in comm_ctrl ended up not being necessary. + * And since other operations have their own comm ctrl, there's no + * problem not keeping fields like comm seq up to date. */ + } +} - /* Because there is a control dependency with the load - * from coll_ack above and the code below, and because - * it is a load-store one (not load-load), I declare - * that a read-memory-barrier is not required here. */ +static int xhc_allreduce_init_member(xhc_comm_t *xc, xhc_peer_info_t *peer_info, + void *sbuf, void *rbuf, size_t allreduce_count, size_t dtype_size, + xhc_copy_method_t method, xhc_copy_method_t bcast_method, int rank, + xf_sig_t seq, bool should_block) { + + /* Only makes sense to init a comm on which a member is participating, + * i.e. either any comm that it's a leader on, or the lowest comm on + * which it's not a leader (or comm 0, if not a leader on any level). */ + assert(xc->is_bottom || xc->down->is_leader); + + /* Make sure that the previous owner of my member ctrl is not still + * using it (tip: can occur with dynamic leadership (or non-zero + * root?!), when it is implemented ^^) . Note that thanks to the + * broadcast step in Allreduce, which has some synchronizing + * properties, by the time that bcast ends and member ack is set, + * it's guaranteed that no other member in a previous collective + * is accessing the member's flags or data. 
In reduce, where there + * is no broadcast step, this guarantee is kept by only setting + * member ack *after* comm ack. */ + + /* In the multi-slice approach, the seq number of the last operation + * that made use of the current slice depends on the number of slices. */ + while(!CHECK_FLAG(&xc->my_ctrl->ack, seq - xc->n_slices, 0)) + if(!should_block) return OMPI_ERR_WOULD_BLOCK; + + xhc_rq_item_t *rq_first = (xhc_rq_item_t *) + opal_list_get_first(xc->reduce_queue); - xc->comm_ctrl->leader_id = xc->member_id; - xc->comm_ctrl->leader_rank = ompi_rank; - xc->comm_ctrl->data_vaddr = (!do_cico ? rbuf : NULL); - xc->comm_ctrl->bytes_ready = 0; + // --- - xhc_atomic_wmb(); + if(method == XHC_COPY_IMM) { + memcpy((void *) xc->my_ctrl->imm_data, sbuf, allreduce_count * dtype_size); + } else { + xc->my_ctrl->rank = rank; + xc->my_ctrl->is_leader = (xc->is_leader); - xc->comm_ctrl->coll_seq = seq; + if(method == XHC_COPY_SMSC) { + xc->my_ctrl->sbuf_vaddr = (xc->is_bottom ? sbuf : rbuf); + xc->my_ctrl->rbuf_vaddr = (xc->is_leader ? rbuf : NULL); + } } -} - -static void xhc_allreduce_init_member(xhc_comm_t *comms, int comm_count, - xhc_peer_info_t *peer_info, void *sbuf, void *rbuf, size_t allreduce_count, - bool do_cico, int ompi_rank, xf_sig_t seq) { - - for(int i = 0; i < comm_count; i++) { - xhc_comm_t *xc = &comms[i]; - /* Essentially the value of reduce area-0's - * work_begin, as set in init_local() */ - int rq_first_count = ((xhc_rq_item_t *) - opal_list_get_first(xc->reduce_queue))->count; + /* In single or imm copy, the data on the bottom comm is + * already in place and ready to be read by other members. */ + xc->my_ctrl->reduce_ready = (xc->reduce_ready = (xc->is_bottom + && method != XHC_COPY_CICO ? allreduce_count : 0)); - /* Make sure that the previous owner of my member ctrl (tip: can - * occur with dynamic leadership (or non-zero root!?), when it is - * implemented ^^) is not still using it. Also not that this - * previous owner will set member_ack only after the comm's coll_ack - * is set, so it also guarantees that no other member in the comm is - * accessing the member's flags from a previous collective. */ - WAIT_FLAG(&xc->my_member_ctrl->member_ack, seq - 1, 0); + xc->my_ctrl->reduce_done = (xc->reduce_done = rq_first->count); - xc->my_member_ctrl->reduce_done = rq_first_count; - xc->my_member_ctrl->reduce_ready = (i == 0 && !do_cico ? allreduce_count : 0); + xhc_atomic_wmb(); + xc->my_ctrl->seq = seq; - xc->my_member_ctrl->rank = ompi_rank; + // --- - if(!do_cico) { - xc->my_member_ctrl->sbuf_vaddr = (i == 0 ? sbuf : rbuf); - xc->my_member_ctrl->rbuf_vaddr = (xc->is_coll_leader ? rbuf : NULL); + switch(method) { + case XHC_COPY_IMM: + xc->my_info->sbuf = (void *) xc->my_ctrl->imm_data; + xc->my_info->rbuf = rbuf; - xc->my_member_ctrl->cico_id = -1; + break; - xc->my_member_info->sbuf = (i == 0 ? sbuf : rbuf); - xc->my_member_info->rbuf = rbuf; - } else { - xc->my_member_ctrl->sbuf_vaddr = NULL; - xc->my_member_ctrl->rbuf_vaddr = NULL; + case XHC_COPY_CICO: + xc->my_info->sbuf = CICO_BUFFER(xc, xc->my_id); - int cico_id = (i == 0 ? ompi_rank : comms[i-1].manager_rank); - xc->my_member_ctrl->cico_id = cico_id; + if(xc->up) + xc->my_info->rbuf = CICO_BUFFER(xc->up, xc->up->my_id); + else { + xc->my_info->rbuf = (xc->do_all_work ? 
rbuf + : CICO_BUFFER(xc, xc->my_id)); + } - xc->my_member_info->sbuf = xhc_get_cico(peer_info, cico_id); - xc->my_member_info->rbuf = xhc_get_cico(peer_info, ompi_rank); - } + break; - xhc_atomic_wmb(); - xc->my_member_ctrl->member_seq = seq; + case XHC_COPY_SMSC: + xc->my_info->sbuf = (xc->is_bottom ? sbuf : rbuf); + xc->my_info->rbuf = rbuf; - if(!xc->is_coll_leader) { break; - } + + default: + assert(0); } + + /* If we're doing CICO broadcast, arrange for the top-comm + * results to be directly placed on the root's CICO buffer. */ + if(bcast_method == XHC_COPY_CICO && xc->is_top && xc->is_leader) + xc->my_info->rbuf = xhc_get_cico(peer_info, rank); + + xc->my_info->attach = true; + xc->my_info->join = true; + + return OMPI_SUCCESS; } // ----------------------------- static int xhc_allreduce_attach_member(xhc_comm_t *xc, int member, - xhc_peer_info_t *peer_info, size_t bytes, bool do_cico, xf_sig_t seq) { + xhc_peer_info_t *peer_info, size_t bytes, xhc_copy_method_t method, + xhc_copy_method_t bcast_method, xf_sig_t seq) { - if(xc->member_info[member].init) { - return 0; - } + xhc_member_info_t *m_info = &xc->member_info[member]; + xhc_member_ctrl_t *m_ctrl = &xc->member_ctrl[member]; - if(!do_cico) { - int member_rank = xc->member_ctrl[member].rank; + if(m_info->attach) + return OMPI_SUCCESS; - void *sbuf_vaddr = xc->member_ctrl[member].sbuf_vaddr; - void *rbuf_vaddr = xc->member_ctrl[member].rbuf_vaddr; + /* If we're doing CICO broadcast, arrange for the top-comm + * results to be directly placed on the root's CICO buffer. */ + if(bcast_method == XHC_COPY_CICO && xc->is_top && m_ctrl->is_leader) + m_info->rbuf = xhc_get_cico(peer_info, m_ctrl->rank); - xc->member_info[member].sbuf = xhc_get_registration( - &peer_info[member_rank], sbuf_vaddr, bytes, - &xc->member_info[member].sbuf_reg); + switch(method) { + case XHC_COPY_IMM: + m_info->sbuf = (void *) m_ctrl->imm_data; + break; - if(xc->member_info[member].sbuf == NULL) { - return -1; - } + case XHC_COPY_CICO: + m_info->sbuf = CICO_BUFFER(xc, member); - // Leaders will also share their rbuf - if(rbuf_vaddr) { - if(rbuf_vaddr != sbuf_vaddr) { - xc->member_info[member].rbuf = xhc_get_registration( - &peer_info[member_rank], rbuf_vaddr, bytes, - &xc->member_info[member].rbuf_reg); + if(m_ctrl->is_leader && !m_info->rbuf) { + if(xc->up) { + /* On the comm on the next level, all members + * of this one appear under the same ID. */ + m_info->rbuf = CICO_BUFFER(xc->up, xc->up->my_id); + } else + m_info->rbuf = CICO_BUFFER(xc, member); + } - if(xc->member_info[member].rbuf == NULL) { - return -1; - } - } else - xc->member_info[member].rbuf = xc->member_info[member].sbuf; - } - } else { - /* Here's the deal with CICO buffers and the comm's manager: In order - * to avoid excessive amounts of attachments, ranks that are - * foreign to a comm only attach to the comm's manager's CICO buffer, - * instead of to every member's. Therefore, members will place their - * final data in the manager's CICO buffer, instead of the leader's - * (even though the leader and the manager actually very often are one - * and the same..). 
*/ - - xc->member_info[member].sbuf = xhc_get_cico(peer_info, - xc->member_ctrl[member].cico_id); - - if(CHECK_FLAG(&xc->comm_ctrl->coll_seq, seq, 0) - && member == xc->comm_ctrl->leader_id) { - xc->member_info[member].rbuf = xhc_get_cico(peer_info, xc->manager_rank); + break; + + case XHC_COPY_SMSC: { + void *sbuf_vaddr = m_ctrl->sbuf_vaddr; + void *rbuf_vaddr = m_ctrl->rbuf_vaddr; + + m_info->sbuf = xhc_get_registration(&peer_info[m_ctrl->rank], + sbuf_vaddr, bytes, &m_info->sbuf_reg); + + if(m_info->sbuf == NULL) + return OMPI_ERR_UNREACH; + + if(m_ctrl->is_leader && !m_info->rbuf) { + if(rbuf_vaddr != sbuf_vaddr) { + m_info->rbuf = xhc_get_registration(&peer_info[m_ctrl->rank], + rbuf_vaddr, bytes, &m_info->rbuf_reg); + + if(m_info->rbuf == NULL) + return OMPI_ERR_UNREACH; + } else + m_info->rbuf = m_info->sbuf; + } + + break; } + + default: + assert(0); } - xc->member_info[member].init = true; + // In imm, is_leader is repurposed for payload storage + if(method != XHC_COPY_IMM && m_ctrl->is_leader) + xc->leader_id = member; + + m_info->attach = true; - return 0; + return OMPI_SUCCESS; } -static void xhc_allreduce_leader_check_all_joined(xhc_comm_t *xc, xf_sig_t seq) { +static bool xhc_allreduce_check_all_joined(xhc_comm_t *xc, xf_sig_t seq) { + bool status = true; + for(int m = 0; m < xc->size; m++) { - if(m == xc->member_id) { + if(xc->member_info[m].join) continue; - } - if(!CHECK_FLAG(&xc->member_ctrl[m].member_seq, seq, 0)) { - return; - } + if(CHECK_FLAG(&xc->member_ctrl[m].seq, seq, 0)) + xc->member_info[m].join = true; + else + status = false; } - xc->all_joined = true; + return status; } -static void xhc_allreduce_disconnect_peers(xhc_comm_t *comms, int comm_count) { - xhc_comm_t *xc = comms; +static void xhc_allreduce_slice_gc(xhc_comm_t *xc) { + xf_sig_t ack = xc->comm_ctrl->ack; - while(xc && xc->is_coll_leader) { - xc = (xc != &comms[comm_count-1] ? xc + 1 : NULL); - } + for(int s = 0; s < xc->n_slices; s++) { + xhc_sh_slice_t *slice = &xc->slices[s]; - if(xc == NULL) { - return; - } + if(slice->in_use && ack >= slice->seq) { + if(slice->is_cico) { + char *buffer = (char *) xc->reduce_buffer_base + + xc->size * xc->cico_size * (slice->seq % xc->n_slices) + + xc->my_id * xc->cico_size; - xhc_reg_t *reg; + /* Prefetch RFO the reclaimed slice; the 1st + * and 2nd slices into L2, the rest into L3. */ + xhc_prefetchw(buffer, slice->len, (s < 2 ? 2 : 3)); + } - for(int m = 0; m < xc->size; m++) { - if(m == xc->member_id) { - continue; + *slice = (xhc_sh_slice_t) {.in_use = false}; } + } +} - if((reg = xc->member_info[m].sbuf_reg)) { - xhc_return_registration(reg); - } +static bool xhc_allreduce_slice_claim(xhc_comm_t *xc, xhc_peer_info_t *peer_info, + void *sbuf, void *rbuf, size_t allreduce_count, size_t dtype_size, + xhc_copy_method_t method, xhc_copy_method_t bcast_method, int rank, + xf_sig_t seq) { + + xhc_sh_slice_t *slice = &xc->slices[xc->slice_id]; + + /* The slice to use has already been determined in init_local() according + * to the sequence number. We don't just use any one available slice, + * because we want to be predictable, so that (1) we know the slice early, + * and (2) so that others know it for certain, without communication. */ + + /* If not already recorded as available, check + * the ack number and try to reclaim slices. 
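The slice bookkeeping above (xhc_allreduce_slice_gc(), and slice_claim() just below) pins each operation to slice seq % n_slices and reclaims a slice only once the comm-wide ack has caught up with the seq that last used it. A reduced sketch of that logic, with illustrative names and without the CICO prefetching or length tracking:

    #include <stdio.h>
    #include <stdbool.h>
    #include <stdint.h>

    typedef struct {
        bool     in_use;
        uint32_t seq;     /* seq number of the op that owns the slice */
    } slice_t;

    static void slice_gc(slice_t *slices, int n_slices, uint32_t comm_ack) {
        for (int s = 0; s < n_slices; s++)
            if (slices[s].in_use && comm_ack >= slices[s].seq)
                slices[s].in_use = false;   /* that op has been fully acked */
    }

    static bool slice_claim(slice_t *slices, int n_slices, uint32_t seq) {
        slice_t *slice = &slices[seq % n_slices];   /* slice is fixed by seq */
        if (slice->in_use)
            return false;                           /* previous op still owns it */
        *slice = (slice_t) {.in_use = true, .seq = seq};
        return true;
    }

    int main(void) {
        slice_t slices[2] = {{false, 0}, {false, 0}};
        slice_claim(slices, 2, 1);                   /* op #1 takes slice 1 */
        slice_claim(slices, 2, 2);                   /* op #2 takes slice 0 */
        printf("%d\n", slice_claim(slices, 2, 3));   /* 0: slice 1 not yet acked */
        slice_gc(slices, 2, 2);                      /* acks up to op #2 */
        printf("%d\n", slice_claim(slices, 2, 3));   /* 1: slice 1 reclaimed */
        return 0;
    }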
*/ + if(slice->in_use) + xhc_allreduce_slice_gc(xc); + + if(!slice->in_use) { + int err = OMPI_SUCCESS; - if((reg = xc->member_info[m].rbuf_reg)) { - xhc_return_registration(reg); + /* slice_claim() might be called for a comm on which this process is + * not a participant; recall that this happens because the reduction + * result is placed directly on the CICO reduce buffer on the next + * level. For this type of call, we don't call init_member(). */ + + if(xc->is_bottom || xc->down->is_leader) + err = xhc_allreduce_init_member(xc, peer_info, sbuf, rbuf, + allreduce_count, dtype_size, method, bcast_method, + rank, seq, false); + + if(err == OMPI_SUCCESS) { + *slice = (xhc_sh_slice_t) { + .in_use = true, .seq = seq, + .len = allreduce_count * dtype_size, + .is_cico = (method == XHC_COPY_CICO) + }; + + xc->slice_ready = true; } } + + return xc->slice_ready; } -// ----------------------------- +static void xhc_allreduce_disconnect_peers(xhc_comm_t *comms) { + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { + xhc_reg_t *reg; -static xhc_comm_t *xhc_allreduce_bcast_src_comm(xhc_comm_t *comms, int comm_count) { - xhc_comm_t *s = NULL; + for(int m = 0; m < xc->size; m++) { + if(!xc->member_info[m].attach) + continue; - for(int i = 0; i < comm_count; i++) { - if(!comms[i].is_coll_leader) { - s = &comms[i]; - break; + if((reg = xc->member_info[m].sbuf_reg)) + xhc_return_registration(reg); + + if((reg = xc->member_info[m].rbuf_reg)) + xhc_return_registration(reg); } } - - return s; } -static void xhc_allreduce_do_ack(xhc_comm_t *comms, int comm_count, xf_sig_t seq) { - for(int i = 0; i < comm_count; i++) { - xhc_comm_t *xc = &comms[i]; +// ----------------------------- + +static void xhc_allreduce_ack(xhc_comm_t *comms, + xf_sig_t seq, xhc_bcast_ctx_t *bcast_ctx) { - xc->my_member_ctrl->member_ack = seq; + // Set personal ack(s), in the (all)reduce hierarchy + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { + xc->my_ctrl->ack = seq; - if(!xc->is_coll_leader) { + if(!xc->is_leader) break; - } - for(int m = 0; m < xc->size; m++) { - if(m == xc->member_id) { - continue; - } + xhc_prefetchw((void *) &xc->comm_ctrl->ack, + sizeof(xc->comm_ctrl->ack), 1); + } - WAIT_FLAG(&xc->member_ctrl[m].member_ack, seq, OMPI_XHC_ACK_WIN); - } + /* Do the ACK process for the broadcast operation. This is necessary + * in order to appropriately set of the fields in the bcast hierarchy. + * Furthermore, we also leverage it for synchronizing the end of the + * allreduce operation; the broadcast being completed for all ranks, + * means that the allreduce is also completed for all. */ + + xhc_bcast_ack(bcast_ctx); - xc->comm_ctrl->coll_ack = seq; + /* Once xhc_bcast_ack completed, all ranks have acknowledged to have + * finished the collective. No need to check their member ack fields. + * Set ack appropriately. */ + + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { + if(!xc->is_leader) break; + xc->comm_ctrl->ack = seq; + } + + /* Note that relying on bcast's ack procedure like above, means that + * comm ack will be set only after the prodedure has finished on ALL + * levels of the bcast hierarchy. Theoretically, this might slighly + * delay the assignment of comm ack on some levels. One alternative + * could be to monitor acknowledgements in the broadcast ACK process + * as they happen, and when there's an xhc comm in the allreduce + * hierarchy whose all members have acknowledged, set comm ack. 
This + * is quite complex due to all the necessary translation, and might + * involve taking into account multiple pieces of information (e.g. + * dynamic leadership). Another alternative would be to do both the + * broadcast and the allreduce ack phases in parallel in non-blocking + * manner. But this would have us doing 2 similar ack phases, while + * only of them is really necessary, ultimately leading to increased + * coherency traffic and allegedly higher latency for the ack phase. + * I don't currently believe that delaying comm acks ever so slightly + * is too big a problem to warrant these more drastic alternatives. */ +} + +static void xhc_reduce_ack(xhc_comm_t *comms, + xhc_copy_method_t method, xf_sig_t seq) { + + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { + /* In single-copy mode we must absolutely wait for the leader to let + * us know that reductions are over before exiting. Otherwise, a peer + * might get stale data when accessing our app-level buffer (since the + * application is free to modify it). In CICO mode, we have increased + * control over our internal buffer(s), and members are allowed to exit + * even before all reductions have been completed. */ + if(!xc->is_leader && method == XHC_COPY_SMSC) + WAIT_FLAG(&xc->comm_ctrl->ack, seq, 0); + + /* Check the ack number of the comm, to reclaim previously thought + * in-use slices. We do this opportunistically at the finish of this + * operation, with the hope that we won't have to do it before the + * next one and delay its start. */ + xhc_allreduce_slice_gc(xc); + + /* Recall that in CICO mode, even if not a leader, we claim a slice + * on the next level, to directly place the reduction results there. */ + if(!xc->is_leader && xc->up && xc->up->slice_ready) + xhc_allreduce_slice_gc(xc->up); + + xc->my_ctrl->ack = seq; + + if(!xc->is_leader) + break; } } // ----------------------------- static void xhc_allreduce_cico_publish(xhc_comm_t *xc, void *data_src, - xhc_peer_info_t *peer_info, int ompi_rank, int allreduce_count, + xhc_peer_info_t *peer_info, int rank, size_t allreduce_count, size_t dtype_size) { - int ready = xc->my_member_ctrl->reduce_ready; + size_t ready = xc->reduce_ready; - /* The chunk size here is just a means of pipelining the CICO - * publishing, for whichever case this might be necessary in. - * There isn't really any reason to consult reduce areas and - * their chunk sizes here.*/ - int elements = opal_min(xc->chunk_size/dtype_size, allreduce_count - ready); + /* The chunk size here is just a means of pipelining the CICO publishing, + * for whichever case this might be necessary in. 
There isn't really any + * reason to consult reduce areas and their chunk sizes here.*/ + size_t elements = opal_min(xc->chunk_size/dtype_size, allreduce_count - ready); void *src = (char *) data_src + ready * dtype_size; - void *dst = (char *) xhc_get_cico(peer_info, ompi_rank) + ready * dtype_size; + void *dst = (char *) CICO_BUFFER(xc, xc->my_id) + ready * dtype_size; - memcpy(dst, src, elements * dtype_size); + xhc_memcpy(dst, src, elements * dtype_size); xhc_atomic_wmb(); - volatile xf_size_t *rrp = &xc->my_member_ctrl->reduce_ready; - xhc_atomic_store_size_t(rrp, ready + elements); + xhc_atomic_store_size_t(&xc->my_ctrl->reduce_ready, + (xc->reduce_ready = ready + elements)); } -static int xhc_allreduce_reduce_get_next(xhc_comm_t *xc, - xhc_peer_info_t *peer_info, size_t allreduce_count, - size_t dtype_size, bool do_cico, bool out_of_order_reduce, - xf_sig_t seq, xhc_rq_item_t **item_dst) { - - xhc_rq_item_t *member_item = NULL; - int stalled_member = xc->size; +static int xhc_allreduce_reduce_get_next(xhc_comm_t *xc, xhc_peer_info_t *peer_info, + size_t allreduce_count, size_t dtype_size, xhc_copy_method_t method, + xhc_copy_method_t bcast_method, bool out_of_order_reduce, xf_sig_t seq, + xhc_rq_item_t **item_dst) { /* Iterate the reduce queue, to determine which member's data to reduce, * and from what index. The reduction queue aids in the implementation of * the rationale that members that are not ready at some point should be * temporarily skipped, to prevent stalling in the collective. Reasons * that a member may not be "ready" are (1) it has not yet joined the - * collective, (2) the necessary data have not yet been produced (eg. + * collective, (2) the necessary data has not yet been produced (eg. * because the member's children have not finished their reduction on the - * previous communicator) or have not been copied to the CICO buffer. + * previous communicator) or has not been copied to the CICO buffer. * However, when floating point data is concerned, skipping members and * therefore doing certain reductions in non-deterministic order results - * to reproducibility problems. Hence the existence of the "dynamic reduce" + * to reproducibility concerns. Hence the existence of the "dynamic reduce" * switch; when enabled, members are skipped when not ready. When disabled, * members are skipped, but only the data of members with a lower ID that * the one that has stalled can be reduced (eg. member 2 has stalled, but @@ -505,34 +684,35 @@ static int xhc_allreduce_reduce_get_next(xhc_comm_t *xc, * reduction queue is sorted according to the reduction progress counter in * each entry. This helps ensure fully reduced chunks are generated as soon * as possible, so that leaders can quickly propagate them upwards. 
*/ + + xhc_rq_item_t *member_item = NULL; + int stalled_member = xc->size; + OPAL_LIST_FOREACH_DECL(item, xc->reduce_queue, xhc_rq_item_t) { int member = item->member; - if(!xc->member_info[member].init - && CHECK_FLAG(&xc->member_ctrl[member].member_seq, seq, 0)) { + if(!xc->member_info[member].attach + && CHECK_FLAG(&xc->member_ctrl[member].seq, seq, 0)) { xhc_atomic_rmb(); - int ret = xhc_allreduce_attach_member(xc, member, peer_info, - allreduce_count * dtype_size, do_cico, seq); + int err = xhc_allreduce_attach_member(xc, member, peer_info, + allreduce_count * dtype_size, method, bcast_method, seq); - if(ret != 0) { - return ret; - } + if(err != OMPI_SUCCESS) + return err; } - if(xc->member_info[member].init && item->count < allreduce_count) { - xhc_reduce_area_t *area = &xc->reduce_area[item->area_id]; - int elements = area->work_chunk; + if(xc->member_info[member].attach && item->count < allreduce_count) { + xhc_reduce_area_t *area = &xc->reduce_areas[item->area_id]; + size_t elements = area->work_chunk; - if(item->count + elements + area->work_leftover == area->work_end) { + if(item->count + elements + area->work_leftover == area->work_end) elements += area->work_leftover; - } - size_t self_ready = xc->my_member_ctrl->reduce_ready; - - volatile xf_size_t *rrp = &xc->member_ctrl[member].reduce_ready; - size_t member_ready = xhc_atomic_load_size_t(rrp); + size_t self_ready = xc->reduce_ready; + size_t member_ready = xhc_atomic_load_size_t( + &xc->member_ctrl[member].reduce_ready); if(self_ready >= item->count + elements && member_ready >= item->count + elements @@ -543,89 +723,83 @@ static int xhc_allreduce_reduce_get_next(xhc_comm_t *xc, } } - if(!out_of_order_reduce) { + if(!out_of_order_reduce) stalled_member = opal_min(stalled_member, member); - } } if(member_item) { opal_list_remove_item(xc->reduce_queue, (opal_list_item_t *) member_item); + *item_dst = member_item; } - *item_dst = member_item; - - return 0; + return OMPI_SUCCESS; } -static void xhc_allreduce_rq_item_analyze(xhc_comm_t *xc, xhc_rq_item_t *item, - bool *first_reduction, bool *last_reduction) { +static void xhc_allreduce_do_reduce(xhc_comm_t *xc, xhc_rq_item_t *member_item, + void *tmp_rbuf, size_t allreduce_count, ompi_datatype_t *dtype, + size_t dtype_size, ompi_op_t *op, xhc_copy_method_t method) { + + xhc_reduce_area_t *area = &xc->reduce_areas[member_item->area_id]; + + size_t elements = area->work_chunk; + + if(member_item->count + elements + area->work_leftover == area->work_end) + elements += area->work_leftover; - *first_reduction = false; - *last_reduction = false; + // --- + + bool first_reduction = false; + bool last_reduction = false; + + /* Remember that member_item is not currently in + * the queue, as we removed it in get_next(). */ if(opal_list_get_size(xc->reduce_queue) == 0) { - *first_reduction = true; - *last_reduction = true; + first_reduction = true; + last_reduction = true; } else { - xhc_rq_item_t *first_item = (xhc_rq_item_t *) + xhc_rq_item_t *rq_first = (xhc_rq_item_t *) opal_list_get_first(xc->reduce_queue); - - xhc_rq_item_t *last_item = (xhc_rq_item_t *) + xhc_rq_item_t *rq_last = (xhc_rq_item_t *) opal_list_get_last(xc->reduce_queue); /* If this count is equal or larger than the last one, it means that * no other count in the queue is larger than it. Therefore, this is the - * first reduction taking place for the "member_item->count" chunk idx. 
*/ - if(item->count >= last_item->count) { - *first_reduction = true; - } + * first reduction taking place for the 'member_item->count' chunk idx. */ + if(member_item->count >= rq_last->count) + first_reduction = true; /* If this count is uniquely minimum in the queue, this is the * last reduction taking place for this specific chunk index. */ - if(item->count < first_item->count) { - *last_reduction = true; - } + if(member_item->count < rq_first->count) + last_reduction = true; } -} - -static void xhc_allreduce_do_reduce(xhc_comm_t *xc, xhc_rq_item_t *member_item, - int allreduce_count, ompi_datatype_t *dtype, size_t dtype_size, - ompi_op_t *op) { - xhc_reduce_area_t *area = &xc->reduce_area[member_item->area_id]; - int elements = area->work_chunk; - - if(member_item->count + elements + area->work_leftover == area->work_end) { - elements += area->work_leftover; - } + // --- + char *src, *dst, *src2 = NULL; size_t offset = member_item->count * dtype_size; - char *src = (char *) xc->member_info[member_item->member].sbuf + offset; - - char *dst; - char *src2 = NULL; + src = (char *) xc->member_info[member_item->member].sbuf + offset; - bool first_reduction, last_reduction; + /* In cico & zcopy, the leader is discovered during attach() (one of + * the members has member_ctrl->is_leader=true). In imm, is_leader is + * repurposed for the payload, but in that case we force the leader to + * do all reductions, so really he's the only that needs to know. */ + assert(method != XHC_COPY_IMM || xc->is_leader); + assert(!last_reduction || xc->leader_id >= 0); - xhc_allreduce_rq_item_analyze(xc, member_item, - &first_reduction, &last_reduction); + if(last_reduction) + dst = (char *) xc->member_info[xc->leader_id].rbuf + offset; + else + dst = (char *) tmp_rbuf + offset; - /* Only access comm_ctrl when it's the last reduction. Otherwise, - * it's not guaranteed that the leader will have initialized it yet.*/ - if(last_reduction) { - dst = (char *) xc->member_info[xc->comm_ctrl->leader_id].rbuf + offset; - } else { - dst = (char *) xc->my_member_info->rbuf + offset; - } + if(first_reduction) + src2 = (char *) xc->my_info->sbuf + offset; + else if(last_reduction) + src2 = (char *) tmp_rbuf + offset; - if(first_reduction) { - src2 = (char *) xc->my_member_info->sbuf + offset; - } else if(last_reduction) { - src2 = (char *) xc->my_member_info->rbuf + offset; - } - - // Happens under certain circumstances with MPI_IN_PLACE or with CICO + // Might happen under MPI_IN_PLACE or CICO if(src2 == dst) { src2 = NULL; } else if(src == dst) { @@ -635,29 +809,28 @@ static void xhc_allreduce_do_reduce(xhc_comm_t *xc, xhc_rq_item_t *member_item, xhc_atomic_rmb(); - if(src2) { - ompi_3buff_op_reduce(op, src2, src, dst, elements, dtype); - } else { - ompi_op_reduce(op, src, dst, elements, dtype); - } + if(src2) ompi_3buff_op_reduce(op, src2, src, dst, elements, dtype); + else ompi_op_reduce(op, src, dst, elements, dtype); + + // --- /* If we reached the end of the area after this reduction, switch * to the next one, or mark completion if it was the last one. * Otherwise, adjust the count according to the area's parameters. 
*/ + if(member_item->count + elements == area->work_end) { if(member_item->area_id < xc->n_reduce_areas - 1) { member_item->area_id++; - member_item->count = xc->reduce_area[member_item->area_id].work_begin; - } else { + member_item->count = xc->reduce_areas[member_item->area_id].work_begin; + } else member_item->count = allreduce_count; - } - } else { + } else member_item->count += area->stride; - } -} -static void xhc_allreduce_reduce_return_item(xhc_comm_t *xc, - xhc_rq_item_t *member_item) { + // --- + + /* Place the item back in the queue, keeping + * it incrementally sorted by item->count. */ bool placed = false; @@ -673,58 +846,20 @@ static void xhc_allreduce_reduce_return_item(xhc_comm_t *xc, } } - if(!placed) { + if(!placed) opal_list_prepend(xc->reduce_queue, (opal_list_item_t *) member_item); - } - xhc_rq_item_t *first_item = (xhc_rq_item_t *) - opal_list_get_first(xc->reduce_queue); - - if(first_item->count > xc->my_member_ctrl->reduce_done) { - xhc_atomic_wmb(); + /* Find the new min count in the queue after this reduction, + * and appropriately adjust reduce_done if/as necessary. */ - volatile xf_size_t *rdp = &xc->my_member_ctrl->reduce_done; - xhc_atomic_store_size_t(rdp, first_item->count); - } -} - -static void xhc_allreduce_do_bcast(xhc_comm_t *comms, int comm_count, - xhc_comm_t *src_comm, size_t bytes_total, size_t *bcast_done, - const void *bcast_src, void *bcast_dst, void *bcast_cico) { - - size_t copy_size = opal_min(src_comm->chunk_size, bytes_total - *bcast_done); - - volatile xf_size_t *brp = &src_comm->comm_ctrl->bytes_ready; - - if(xhc_atomic_load_size_t(brp) - *bcast_done >= copy_size) { - void *src = (char *) bcast_src + *bcast_done; - void *dst = (char *) bcast_dst + *bcast_done; - void *cico_dst = (char *) bcast_cico + *bcast_done; - - xhc_atomic_rmb(); - - if(bcast_cico && comms[0].is_coll_leader) { - memcpy(cico_dst, src, copy_size); - } else { - memcpy(dst, src, copy_size); - } - - *bcast_done += copy_size; + xhc_rq_item_t *rq_first = (xhc_rq_item_t *) + opal_list_get_first(xc->reduce_queue); + if(rq_first->count > xc->reduce_done) { xhc_atomic_wmb(); - for(int i = 0; i < comm_count; i++) { - if(!comms[i].is_coll_leader) { - break; - } - - volatile xf_size_t *brp_d = &comms[i].comm_ctrl->bytes_ready; - xhc_atomic_store_size_t(brp_d, *bcast_done); - } - - if(bcast_cico && comms[0].is_coll_leader) { - memcpy(dst, cico_dst, copy_size); - } + xhc_atomic_store_size_t(&xc->my_ctrl->reduce_done, + (xc->reduce_done = rq_first->count)); } } @@ -734,254 +869,313 @@ int mca_coll_xhc_allreduce_internal(const void *sbuf, void *rbuf, size_t count, ompi_datatype_t *datatype, ompi_op_t *op, ompi_communicator_t *ompi_comm, mca_coll_base_module_t *ompi_module, bool require_bcast) { + XHC_COLLTYPE_T colltype = (require_bcast ? 
XHC_ALLREDUCE : XHC_REDUCE); xhc_module_t *module = (xhc_module_t *) ompi_module; - if(!module->init) { - int ret = xhc_lazy_init(module, ompi_comm); - if(ret != OMPI_SUCCESS) { - return ret; - } - } - - if(!ompi_datatype_is_predefined(datatype)) { - static bool warn_shown = false; - - if(!warn_shown) { - opal_output_verbose(MCA_BASE_VERBOSE_WARN, - ompi_coll_base_framework.framework_output, - "coll:xhc: Warning: XHC does not currently support " - "derived datatypes; utilizing fallback component"); - warn_shown = true; - } + int err; - xhc_coll_fns_t fallback = module->prev_colls; + // --- - if(require_bcast) { - return fallback.coll_allreduce(sbuf, rbuf, count, datatype, - op, ompi_comm, fallback.coll_allreduce_module); - } else { - return fallback.coll_reduce(sbuf, rbuf, count, datatype, - op, 0, ompi_comm, fallback.coll_reduce_module); - } + if(!ompi_datatype_is_predefined(datatype)) { + WARN_ONCE("coll:xhc: Warning: XHC does not currently support " + "derived datatypes; utilizing fallback component"); + goto _fallback; } if(!ompi_op_is_commute(op)) { - static bool warn_shown = false; - - if(!warn_shown) { - opal_output_verbose(MCA_BASE_VERBOSE_WARN, - ompi_coll_base_framework.framework_output, - "coll:xhc: Warning: (all)reduce does not support non-commutative " - "operators; utilizing fallback component"); - warn_shown = true; + WARN_ONCE("coll:xhc: Warning: (all)reduce does not support " + "non-commutative operators; utilizing fallback component"); + goto _fallback; + } + + if(!module->zcopy_support) { + size_t dtype_size; ompi_datatype_type_size(datatype, &dtype_size); + size_t cico_size = module->op_config[colltype].cico_max; + if(count * dtype_size > cico_size) { + WARN_ONCE("coll:xhc: Warning: No smsc support; utilizing fallback " + "component for %s greater than %zu bytes", (require_bcast ? + "allreduce" : "reduce"), cico_size); + goto _fallback; } + } - xhc_coll_fns_t fallback = module->prev_colls; + if(!module->op_data[colltype].init) { + err = xhc_init_op(module, ompi_comm, colltype); + if(err != OMPI_SUCCESS) goto _fallback_permanent; + } - if(require_bcast) { - return fallback.coll_allreduce(sbuf, rbuf, count, datatype, - op, ompi_comm, fallback.coll_allreduce_module); - } else { - return fallback.coll_reduce(sbuf, rbuf, count, datatype, - op, 0, ompi_comm, fallback.coll_reduce_module); - } + if(require_bcast && !module->op_data[XHC_BCAST].init) { + err = xhc_init_op(module, ompi_comm, XHC_BCAST); + if(err != OMPI_SUCCESS) goto _fallback_permanent_bcast; } - // ---- + // --- xhc_peer_info_t *peer_info = module->peer_info; - xhc_data_t *data = module->data; + xhc_op_data_t *op_data = &module->op_data[colltype]; - xhc_comm_t *comms = data->comms; - int comm_count = data->comm_count; + xhc_comm_t *comms = op_data->comms; size_t dtype_size, bytes_total; ompi_datatype_type_size(datatype, &dtype_size); bytes_total = count * dtype_size; - bool do_cico = (bytes_total <= OMPI_XHC_CICO_MAX); + xhc_copy_method_t method; + bool out_of_order_reduce = false; + xhc_reduce_load_balance_enum_t lb_policy; int rank = ompi_comm_rank(ompi_comm); - // ---- + /* Currently hard-coded. Okay for Allreduce. For reduce, it's + * because we don't yet support non-zero root... 
(TODO) */ + int root = comms->top->owner_rank; + + // --- switch(mca_coll_xhc_component.dynamic_reduce) { - case OMPI_XHC_DYNAMIC_REDUCE_DISABLED: + case XHC_DYNAMIC_REDUCE_DISABLED: out_of_order_reduce = false; break; - case OMPI_XHC_DYNAMIC_REDUCE_NON_FLOAT: - out_of_order_reduce = !(datatype->super.flags & OMPI_DATATYPE_FLAG_DATA_FLOAT); + case XHC_DYNAMIC_REDUCE_NON_FLOAT: + out_of_order_reduce = !(datatype->super.flags + & OMPI_DATATYPE_FLAG_DATA_FLOAT); break; - case OMPI_XHC_DYNAMIC_REDUCE_ALL: + case XHC_DYNAMIC_REDUCE_ALL: out_of_order_reduce = true; break; } - // ---- - - // rbuf won't be present for non-root ranks in MPI_Reduce - if(rbuf == NULL && !do_cico) { + if(bytes_total <= XHC_REDUCE_IMM_SIZE) + method = XHC_COPY_IMM; + else if(bytes_total <= comms[0].cico_size) + method = XHC_COPY_CICO; + else + method = XHC_COPY_SMSC; + + /* In XHC_COPY_IMM, we force the leaders to perform the reductions, + * to greatly simplify imm buffer management. Don't see any reason + * for any other member to do them anyway... */ + if(method == XHC_COPY_IMM) lb_policy = XHC_REDUCE_LB_LEADER_ASSIST_ALL; + else lb_policy = mca_coll_xhc_component.reduce_load_balance; + + /* We require a buffer to store intermediate data. In cases like MPI_Ruduce, + * non-root ranks don't normally have an rbuf, so allocate an internal one. + * TODO: Strictly speaking, the members that won't do reductions, shouldn't + * require an rbuf; consult this and don't allocate one for them?? */ + if(rbuf == NULL) { if(module->rbuf_size < bytes_total) { - void *tmp = realloc(module->rbuf, bytes_total); + void *new_rbuf = realloc(module->rbuf, bytes_total); + if(!new_rbuf) return OPAL_ERR_OUT_OF_RESOURCE; - if(tmp != NULL) { - module->rbuf = tmp; - module->rbuf_size = bytes_total; - } else { - return OPAL_ERR_OUT_OF_RESOURCE; - } + module->rbuf = new_rbuf; + module->rbuf_size = bytes_total; } rbuf = module->rbuf; } - // ---- - - xf_sig_t pvt_seq = ++data->pvt_coll_seq; - - if(sbuf == MPI_IN_PLACE) { + if(sbuf == MPI_IN_PLACE) sbuf = rbuf; - } - xhc_allreduce_init_local(comms, comm_count, count, dtype_size, pvt_seq); - xhc_allreduce_init_comm(comms, comm_count, rbuf, do_cico, rank, pvt_seq); - xhc_allreduce_init_member(comms, comm_count, peer_info, - (void *) sbuf, rbuf, count, do_cico, rank, pvt_seq); + // --- + + xf_sig_t seq = ++op_data->seq; - void *local_cico = xhc_get_cico(peer_info, comms[0].manager_rank); + xhc_allreduce_init_local(comms, count, dtype_size, colltype, lb_policy, seq); + xhc_allreduce_init_comm(comms, rank, seq); // My conscience is clear! - if(require_bcast) { - goto _allreduce; - } else { - goto _reduce; - } + if(require_bcast) goto _allreduce; + else goto _reduce; // ============================================================================= _allreduce: { - xhc_comm_t *bcast_comm = - xhc_allreduce_bcast_src_comm(comms, comm_count); - - bool bcast_leader_joined = false; + xhc_bcast_ctx_t bcast_ctx; + bool bcast_started = false; + + void *bcast_buf = rbuf; + + /* In CICO, the reduced data is placed on the root's cico reduce buffer, + * unless the root does all the reduction, in which case just have him + * place the final data directly on his rbuf instead. Determine which + * case we're in, and supply the appropriate buffer to bcast. Even if + * the final data is in the reduce buffer and we have to copy it to rbuf, + * better do it after the bcast has started, rather than delay it. 
FYI, + * if bcast is also CICO, the data will end up getting placed directly + * on the root's bcast cico buffer (not a problem for setting bcast_buf, + * just so you know there's a bit more to it!). */ + if(method == XHC_COPY_CICO && rank == root && !comms->top->do_all_work) + bcast_buf = CICO_BUFFER(comms->top, comms->top->my_id); + + xhc_bcast_init(bcast_buf, count, datatype, root, + ompi_comm, module, &bcast_ctx); + + /* Allreduce is not multi-sliced (no perf benefit from multi-slicing?), + * so init the member struct on all comms here, in blocking manner. */ + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { + xhc_allreduce_init_member(xc, peer_info, (void *) sbuf, rbuf, count, + dtype_size, method, bcast_ctx.method, rank, seq, true); + if(!xc->is_leader) break; + } for(size_t bytes_done = 0; bytes_done < bytes_total; ) { - for(int i = 0; i < comm_count; i++) { - xhc_comm_t *xc = &comms[i]; - xhc_comm_t *xnc = (i < comm_count - 1 ? &comms[i+1] : NULL); - if(do_cico && i == 0 && xc->my_member_ctrl->reduce_ready < count) { - xhc_allreduce_cico_publish(xc, (void *) sbuf, - peer_info, rank, count, dtype_size); - } + // CICO mode, copy-in phase + if(method == XHC_COPY_CICO && comms->bottom->reduce_ready < count) + xhc_allreduce_cico_publish(comms->bottom, (void *) sbuf, + peer_info, rank, count, dtype_size); - if(xc->is_coll_leader) { - size_t completed = 0; + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { - if(!xc->all_joined) { - xhc_allreduce_leader_check_all_joined(xc, pvt_seq); - } + if(xc->reduce_done < count) { + xhc_rq_item_t *member_item = NULL; - if(xc->all_joined) { - completed = count; + err = xhc_allreduce_reduce_get_next(xc, peer_info, count, + dtype_size, method, bcast_ctx.method, out_of_order_reduce, + seq, &member_item); + if(err != OMPI_SUCCESS) return err; - for(int m = 0; m < xc->size; m++) { - volatile xf_size_t *rdp = &xc->member_ctrl[m].reduce_done; - size_t member_done = xhc_atomic_load_size_t(rdp); + if(member_item) + xhc_allreduce_do_reduce(xc, member_item, rbuf, + count, datatype, dtype_size, op, method); + } - /* Watch out for double evaluation here, don't perform - * sensitive loads inside opal_min()'s parameter list. */ - completed = opal_min(completed, member_done); - } - } + /* If not a leader in this comm, no propagation to + * do, and not participating in higher-up comms. */ + if(!xc->is_leader) break; - if(xnc && completed > xnc->my_member_ctrl->reduce_ready) { - volatile xf_size_t *rrp = &xnc->my_member_ctrl->reduce_ready; - xhc_atomic_store_size_t(rrp, completed); - } else if(!xnc) { - size_t bytes_fully_reduced = completed * dtype_size; - - // Broadcast fully reduced data - if(bytes_fully_reduced > bytes_done) { - for(int k = 0; k < comm_count; k++) { - volatile xf_size_t *brp = - &comms[k].comm_ctrl->bytes_ready; - xhc_atomic_store_size_t(brp, bytes_fully_reduced); - } - - if(do_cico) { - void *src = (char *) local_cico + bytes_done; - void *dst = (char *) rbuf + bytes_done; - memcpy(dst, src, bytes_fully_reduced - bytes_done); - } - - bytes_done = bytes_fully_reduced; - } - } + /* Leaders monitor children's progress and appropriately + * propagate towards upper levels. The top level leader + * initiates the broadcast of fully reduced chunks. */ + + // --- + + if(xc->op_state & COMM_REDUCE_FINI) + continue; + + /* Check if all have joined the collective, so that we may safely + * access their information in member_ctrl, plus we don't waste our + * time attempting propagation; reductions can't have finished if + * not all members have joined. 
*/ + if(!(xc->op_state & COMM_ALL_JOINED)) { + if(xhc_allreduce_check_all_joined(xc, seq)) + xc->op_state |= COMM_ALL_JOINED; + else continue; } - // Is the reduction phase completed? - if(xc->my_member_ctrl->reduce_done < count) { - xhc_rq_item_t *member_item = NULL; + /* Don't bother checking the members' shared counters (potentially + * expensive), if mine is about to hold the process back any way. */ + if(!((xc->up && xc->reduce_done > xc->up->reduce_ready) + || (xc->is_top && xc->reduce_done * dtype_size > bytes_done) + || (xc->reduce_done >= count))) + continue; + + // --- + + size_t completed = count; + size_t completed_bytes; + + for(int m = 0; m < xc->size; m++) { + size_t member_done = (m == xc->my_id ? xc->reduce_done : + xhc_atomic_load_size_t(&xc->member_ctrl[m].reduce_done)); - int ret = xhc_allreduce_reduce_get_next(xc, - peer_info, count, dtype_size, do_cico, - out_of_order_reduce, pvt_seq, &member_item); + /* Watch out for double evaluation here, don't perform + * sensitive loads inside opal_min()'s parameter list. */ + completed = opal_min(completed, member_done); + } + + completed_bytes = completed * dtype_size; - if(ret != 0) { - return OMPI_ERROR; + if(xc->up && completed > xc->up->reduce_ready) { + /* In imm mode: copy the data into my ctrl on + * the next level as part of the propagation. */ + if(method == XHC_COPY_IMM) { + size_t up_bytes_ready = xc->up->reduce_ready * dtype_size; + xhc_memcpy_offset((void *) xc->up->my_ctrl->imm_data, + xc->my_info->rbuf, up_bytes_ready, + completed_bytes - up_bytes_ready); } - if(member_item) { - xhc_allreduce_do_reduce(xc, member_item, - count, datatype, dtype_size, op); + xhc_atomic_store_size_t(&xc->up->my_ctrl->reduce_ready, + (xc->up->reduce_ready = completed)); + } else if(xc->is_top && completed_bytes > bytes_done) { + for(xhc_comm_t *bxc = bcast_ctx.comms->top; bxc; bxc = bxc->down) + xhc_bcast_notify(&bcast_ctx, bxc, completed_bytes); - xhc_allreduce_reduce_return_item(xc, member_item); + if(xc->my_info->rbuf != rbuf) { + xhc_memcpy_offset(rbuf, xc->my_info->rbuf, + bytes_done, completed_bytes - bytes_done); } - } - /* If not a leader in this comm, not - * participating in higher-up ones. */ - if(!xc->is_coll_leader) { - break; + bytes_done = completed_bytes; + bcast_ctx.bytes_done = completed_bytes; } + + /* As soon as reduction and propagation is fully finished on + * this comm, no reason to 'visit' it anymore. Furthermore, + * once all reductions are done, it's possible that members + * will receive (through bcast) all final data and exit the + * collective. And they may well enter a new collective and + * start initializing their member_ctrl, so it's not + * appropriate to keep reading their ctrl data. */ + if(completed >= count) + xc->op_state |= COMM_REDUCE_FINI; } - if(bcast_comm && !bcast_leader_joined) { - if(CHECK_FLAG(&bcast_comm->comm_ctrl->coll_seq, pvt_seq, 0)) { - xhc_atomic_rmb(); + // Broadcast + // --------- - int leader = bcast_comm->comm_ctrl->leader_id; + if(!bcast_started) { + err = xhc_bcast_start(&bcast_ctx); + if(err == OMPI_SUCCESS) bcast_started = true; + else if(err != OMPI_ERR_WOULD_BLOCK) return err; + } - if(!bcast_comm->member_info[leader].init) { - WAIT_FLAG(&bcast_comm->member_ctrl[leader].member_seq, - pvt_seq, 0); + if(bcast_ctx.src_comm && bcast_ctx.bytes_done < bcast_ctx.bytes_total) { + /* Currently, in single-copy mode, even though we already have + * some established xpmem attachments, these might need to be + * re-established in bcast. 
Some form of small/quick caching + * could be implemented in get_registration to avoid this. Or + * the allreduce implementation could hint to broadcast that + * registrations are available. */ - xhc_atomic_rmb(); + err = xhc_bcast_work(&bcast_ctx); - xhc_allreduce_attach_member(bcast_comm, leader, - peer_info, bytes_total, do_cico, pvt_seq); - } + if(err != OMPI_SUCCESS && err != OMPI_ERR_WOULD_BLOCK) + return err; - bcast_leader_joined = true; - } + bytes_done = bcast_ctx.bytes_done; } + } - if(bcast_comm && bcast_leader_joined) { - int leader = bcast_comm->comm_ctrl->leader_id; + // --- - xhc_allreduce_do_bcast(comms, comm_count, - bcast_comm, bytes_total, &bytes_done, - bcast_comm->member_info[leader].rbuf, - rbuf, (do_cico ? local_cico : NULL)); - } + /* This is theoretically necessary, for the case that a leader has copied + * all chunks and thus exited the loop, but hasn't yet notified some of + * its children (e.g. because they were still active in a previous op). */ + while(!bcast_started) { + err = xhc_bcast_start(&bcast_ctx); + if(err == OMPI_SUCCESS) bcast_started = true; + else if(err != OMPI_ERR_WOULD_BLOCK) return err; } - xhc_allreduce_do_ack(comms, comm_count, pvt_seq); + xhc_allreduce_ack(comms, seq, &bcast_ctx); + + xhc_bcast_fini(&bcast_ctx); + + /* See respective comment in xhc_bcast_fini(). Note that prefetchw is also + * used in Reduce, but that happens inside xhc_allreduce_slice_gc(). */ + if(method == XHC_COPY_CICO) { + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { + xhc_prefetchw(CICO_BUFFER(xc, xc->my_id), bytes_total, 2); + if(!xc->is_leader) break; + } + } goto _finish; } @@ -990,114 +1184,137 @@ _allreduce: { _reduce: { - size_t cico_copied = 0; - int completed_comms = 0; - - while(completed_comms < comm_count) { - for(int i = completed_comms; i < comm_count; i++) { - xhc_comm_t *xc = &comms[i]; - xhc_comm_t *xnc = (i < comm_count - 1 ? &comms[i+1] : NULL); + for(size_t bytes_done = 0; bytes_done < bytes_total; ) { + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { + + /* We can't really proceed if the slice is not reserved. + * Recall that a number of counters (like reduce_ready) are + * initialized inside init_member(), which is asynchronously + * called from slice_claim() when possible. */ + if(!xc->slice_ready) { + xhc_allreduce_slice_claim(xc, peer_info, (void *) sbuf, rbuf, + count, dtype_size, method, 0, rank, seq); + + if(!xc->slice_ready) + break; + } - if(do_cico && i == 0 && xc->my_member_ctrl->reduce_ready < count) { + /* CICO mode, copy-in phase. Watch out, unlike Allreduce, + * we want this here, *after* we've claimed the slice. */ + if(method == XHC_COPY_CICO && xc->is_bottom && xc->reduce_ready < count) xhc_allreduce_cico_publish(xc, (void *) sbuf, peer_info, rank, count, dtype_size); - } - if(xc->is_coll_leader) { - size_t completed = 0; + if(xc->reduce_done < count) { + /* In CICO mode, reductions results are stored directly + * on the buffer on the next level; make sure we are + * allowed to do this (by reserving a slice). 
*/ + if(method == XHC_COPY_CICO && xc->up && !xc->up->slice_ready) { + xhc_allreduce_slice_claim(xc->up, peer_info, (void *) sbuf, + rbuf, count, dtype_size, method, 0, rank, seq); - if(!xc->all_joined) { - xhc_allreduce_leader_check_all_joined(xc, pvt_seq); + if(!xc->up->slice_ready) + break; } - if(xc->all_joined) { - completed = count; + xhc_rq_item_t *member_item = NULL; - for(int m = 0; m < xc->size; m++) { - volatile xf_size_t *rdp = &xc->member_ctrl[m].reduce_done; - size_t member_done = xhc_atomic_load_size_t(rdp); + err = xhc_allreduce_reduce_get_next(xc, peer_info, count, + dtype_size, method, 0, out_of_order_reduce, seq, &member_item); + if(err != OMPI_SUCCESS) return err; - /* Watch out for double evaluation here, don't perform - * sensitive loads inside opal_min()'s parameter list. */ - completed = opal_min(completed, member_done); - } - } + if(member_item) + xhc_allreduce_do_reduce(xc, member_item, rbuf, + count, datatype, dtype_size, op, method); + } - if(xnc && completed > xnc->my_member_ctrl->reduce_ready) { - volatile xf_size_t *rrp = &xnc->my_member_ctrl->reduce_ready; - xhc_atomic_store_size_t(rrp, completed); - } else if(!xnc) { - size_t completed_bytes = completed * dtype_size; + if(!xc->is_leader) { + /* When both of these reach `count`, all my tasks + * are done, and am free to exit the loop. */ + bytes_done = opal_min(xc->reduce_ready, + xc->reduce_done) * dtype_size; - if(do_cico && completed_bytes > cico_copied) { - void *src = (char *) local_cico + cico_copied; - void *dst = (char *) rbuf + cico_copied; + /* Not a leader in this comm, so no propagation, + * and not participating in higher-up ones. */ + break; + } - memcpy(dst, src, completed_bytes - cico_copied); - cico_copied = completed_bytes; - } - } + // --- - if(completed >= count) { - xc->comm_ctrl->coll_ack = pvt_seq; - completed_comms++; - } + if(xc->op_state & COMM_REDUCE_FINI) + continue; + + /* Check if all have joined the collective, so that we may safely + * access their information in member_ctrl, plus we don't waste our + * time attempting propagation; reductions can't have finished if + * not all members have joined. */ + if(!(xc->op_state & COMM_ALL_JOINED)) { + if(xhc_allreduce_check_all_joined(xc, seq)) + xc->op_state |= COMM_ALL_JOINED; + else continue; } - // Is the reduction phase completed? - if(xc->my_member_ctrl->reduce_done < count) { - xhc_rq_item_t *member_item = NULL; + /* Don't bother checking the members' shared counters (potentially + * expensive), if mine is about to hold the process back any way. 
*/ + if(!((xc->up && xc->reduce_done > xc->up->reduce_ready) + || (xc->is_top && xc->reduce_done * dtype_size > bytes_done) + || (xc->reduce_done >= count))) + continue; - int ret = xhc_allreduce_reduce_get_next(xc, - peer_info, count, dtype_size, do_cico, - out_of_order_reduce, pvt_seq, &member_item); + if(xc->up && !xc->up->slice_ready) { + xhc_allreduce_slice_claim(xc->up, peer_info, (void *) sbuf, + rbuf, count, dtype_size, method, 0, rank, seq); - if(ret != 0) { - return OMPI_ERROR; - } + if(!xc->up->slice_ready) + break; + } - if(member_item) { - xhc_allreduce_do_reduce(xc, member_item, - count, datatype, dtype_size, op); + // --- - xhc_allreduce_reduce_return_item(xc, member_item); - } - } + size_t completed = count; + size_t completed_bytes; - if(!xc->is_coll_leader) { - /* If all reduction-related tasks are done, and - * not a leader on the next comm, can exit */ - if(xc->my_member_ctrl->reduce_done >= count - && xc->my_member_ctrl->reduce_ready >= count) { - goto _reduce_done; - } + for(int m = 0; m < xc->size; m++) { + size_t member_done = (m == xc->my_id ? xc->reduce_done : + xhc_atomic_load_size_t(&xc->member_ctrl[m].reduce_done)); - /* Not a leader in this comm, so not - * participating in higher-up ones. */ - break; + /* Watch out for double evaluation here, don't perform + * sensitive loads inside opal_min()'s parameter list. */ + completed = opal_min(completed, member_done); } - } - } - _reduce_done: + completed_bytes = completed * dtype_size; - for(int i = 0; i < comm_count; i++) { - xhc_comm_t *xc = &comms[i]; + if(xc->up && completed > xc->up->reduce_ready) { + /* In imm mode: copy the data into my ctrl on + * the next level as part of the propagation. */ + if(method == XHC_COPY_IMM) { + size_t up_bytes_ready = xc->up->reduce_ready * dtype_size; + xhc_memcpy_offset((void *) xc->up->my_ctrl->imm_data, + xc->my_info->rbuf, up_bytes_ready, + completed_bytes - up_bytes_ready); + } - /* Wait for the leader to give the signal that reduction - * has finished on this comm and members are free to exit */ - if(!xc->is_coll_leader) { - WAIT_FLAG(&xc->comm_ctrl->coll_ack, pvt_seq, OMPI_XHC_ACK_WIN); - } + xhc_atomic_store_size_t(&xc->up->my_ctrl->reduce_ready, + (xc->up->reduce_ready = completed)); + } else if(xc->is_top && completed_bytes > bytes_done) { + if(xc->my_info->rbuf != rbuf) { + xhc_memcpy_offset(rbuf, xc->my_info->rbuf, + bytes_done, completed_bytes - bytes_done); + } - // load-store control dependency with coll_ack; no need for barrier - xc->my_member_ctrl->member_ack = pvt_seq; + bytes_done = completed_bytes; + } - if(!xc->is_coll_leader) { - break; + if(completed >= count) { + xc->comm_ctrl->ack = seq; + xc->op_state |= COMM_REDUCE_FINI; + } } } + xhc_reduce_ack(comms, method, seq); + goto _finish; } @@ -1105,17 +1322,40 @@ _reduce: { _finish: - if(!do_cico) { - xhc_allreduce_disconnect_peers(comms, comm_count); - } + if(method == XHC_COPY_SMSC) + xhc_allreduce_disconnect_peers(comms); return OMPI_SUCCESS; + +_fallback_permanent_bcast: + + XHC_INSTALL_FALLBACK(module, ompi_comm, XHC_BCAST, bcast); + +_fallback_permanent: + + if(require_bcast) { + XHC_INSTALL_FALLBACK(module, + ompi_comm, XHC_ALLREDUCE, allreduce); + } else { + XHC_INSTALL_FALLBACK(module, + ompi_comm, XHC_REDUCE, reduce); + } + +_fallback: + + if(require_bcast) { + return XHC_CALL_FALLBACK(module->prev_colls, XHC_ALLREDUCE, + allreduce, sbuf, rbuf, count, datatype, op, ompi_comm); + } else { + return XHC_CALL_FALLBACK(module->prev_colls, XHC_REDUCE, + reduce, sbuf, rbuf, count, datatype, op, 0, 
ompi_comm); + } } int mca_coll_xhc_allreduce(const void *sbuf, void *rbuf, size_t count, ompi_datatype_t *datatype, ompi_op_t *op, ompi_communicator_t *ompi_comm, mca_coll_base_module_t *ompi_module) { - return xhc_allreduce_internal(sbuf, rbuf, - count, datatype, op, ompi_comm, ompi_module, true); -} + return xhc_allreduce_internal(sbuf, rbuf, count, + datatype, op, ompi_comm, ompi_module, true); +} diff --git a/ompi/mca/coll/xhc/coll_xhc_atomic.h b/ompi/mca/coll/xhc/coll_xhc_atomic.h deleted file mode 100644 index 79f1dce98cb..00000000000 --- a/ompi/mca/coll/xhc/coll_xhc_atomic.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) - * Laboratory, ICS Forth. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_COLL_XHC_ATOMIC_EXPORT_H -#define MCA_COLL_XHC_ATOMIC_EXPORT_H - -#include -#include "opal/sys/atomic.h" - -// ---------------------------------------- - -#define IS_SIG_ATOMIC_X_BITS(x) \ - (SIG_ATOMIC_MAX == INT ## x ## _MAX) || (SIG_ATOMIC_MAX == UINT ## x ## _MAX) - -// ---------------------------------------- - -// If xf_sig_t is ever re-defined to be signed, - // CHECK_FLAGS()'s comparisons must be adjusted -#if IS_SIG_ATOMIC_X_BITS(64) - typedef uint64_t xf_sig_t; -#elif IS_SIG_ATOMIC_X_BITS(32) - typedef uint32_t xf_sig_t; -#elif IS_SIG_ATOMIC_X_BITS(16) - typedef uint16_t xf_sig_t; -#elif IS_SIG_ATOMIC_X_BITS(8) - typedef uint8_t xf_sig_t; -#endif - -typedef int __attribute__((aligned(SIZEOF_INT))) xf_int_t; -typedef size_t __attribute__((aligned(SIZEOF_SIZE_T))) xf_size_t; - -// ---------------------------------------- - -#define xhc_atomic_rmb opal_atomic_rmb -#define xhc_atomic_wmb opal_atomic_wmb -#define xhc_atomic_fmb opal_atomic_mb - -// https://github.com/open-mpi/ompi/issues/9722 - -#if OPAL_USE_GCC_BUILTIN_ATOMICS || OPAL_USE_C11_ATOMICS - #define xhc_atomic_load_int(addr) __atomic_load_n(addr, __ATOMIC_RELAXED) - #define xhc_atomic_store_int(addr, val) __atomic_store_n(addr, val, __ATOMIC_RELAXED) - - #define xhc_atomic_load_size_t(addr) __atomic_load_n(addr, __ATOMIC_RELAXED) - #define xhc_atomic_store_size_t(addr, val) __atomic_store_n(addr, val, __ATOMIC_RELAXED) -#else - #define xhc_atomic_load_int(addr) (*(addr)) - #define xhc_atomic_store_int(addr, val) (*(addr) = (val)) - - #define xhc_atomic_load_size_t(addr) (*(addr)) - #define xhc_atomic_store_size_t(addr, val) (*(addr) = (val)) - - #warning "GCC or the C11 atomics backend was not found. 
XHC might not function correctly" -/* #else - #error "XHC atomics do not yet work without the GCC or the C11 backend" */ -#endif - - -// If/when opal atomic load/store size_t is added - -/* #define xhc_atomic_load_size_t(addr) \ - opal_atomic_load_size_t ((opal_atomic_size_t *) addr) -#define xhc_atomic_store_size_t(addr, val) \ - opal_atomic_store_size_t ((opal_atomic_size_t *) addr, val) */ - - -// If/when opal atomic load/store is added, and if opal atomic load/store int is not - -/* #if SIZEOF_INT == 4 - #define xhc_atomic_load_int(addr) opal_atomic_load_32 ((opal_atomic_int32_t *) addr) - #define xhc_atomic_store_int(addr, val) opal_atomic_store_32 ((opal_atomic_int32_t *) addr, val) -#elif SIZEOF_INT == 8 - #define xhc_atomic_load_int(addr) opal_atomic_load_64 ((opal_atomic_int64_t *) addr) - #define xhc_atomic_store_int(addr, val) opal_atomic_store_64 ((opal_atomic_int64_t *) addr, val) -#else - #error "Unsupported int size" -#endif */ - - -// If/when opal atomic load/store is added, and if opal atomic load/store size_t is not - -/* #if SIZEOF_SIZE_T == 4 - #define xhc_atomic_load_size_t(addr) opal_atomic_load_32 ((opal_atomic_int32_t *) addr) - #define xhc_atomic_store_size_t(addr, val) opal_atomic_store_32 ((opal_atomic_int32_t *) addr, val) -#elif SIZEOF_SIZE_T == 8 - #define xhc_atomic_load_size_t(addr) opal_atomic_load_64 ((opal_atomic_int64_t *) addr) - #define xhc_atomic_store_size_t(addr, val) opal_atomic_store_64 ((opal_atomic_int64_t *) addr, val) -#else - #error "Unsupported size_t size" -#endif */ - -static inline bool xhc_atomic_cmpxchg_strong_relaxed(volatile xf_sig_t *addr, - xf_sig_t *oldval, xf_sig_t newval) { - - #if OPAL_USE_GCC_BUILTIN_ATOMICS || OPAL_USE_C11_ATOMICS - return __atomic_compare_exchange_n(addr, oldval, newval, - false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); - #else - #if IS_SIG_ATOMIC_X_BITS(32) - return opal_atomic_compare_exchange_strong_32(addr, oldval, newval); - #elif IS_SIG_ATOMIC_X_BITS(64) - return opal_atomic_compare_exchange_strong_64(addr, oldval, newval); - #else - #error "Unsupported sig_atomic_t size" - #endif - #endif -} - -#endif diff --git a/ompi/mca/coll/xhc/coll_xhc_barrier.c b/ompi/mca/coll/xhc/coll_xhc_barrier.c index ade1300134a..92bf356bc14 100644 --- a/ompi/mca/coll/xhc/coll_xhc_barrier.c +++ b/ompi/mca/coll/xhc/coll_xhc_barrier.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Copyright (c) 2021-2024 Computer Architecture and VLSI Systems (CARV) * Laboratory, ICS Forth. All rights reserved. * $COPYRIGHT$ * @@ -19,120 +19,117 @@ static void xhc_barrier_leader(xhc_comm_t *comms, int comm_count, xhc_peer_info_t *peer_info, int rank, int root, xf_sig_t seq) { - // Non-leader by default - for(int i = 0; i < comm_count; i++) { - comms[i].is_coll_leader = false; + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { + // Non-leader by default + xc->is_leader = false; } - for(int i = 0; i < comm_count; i++) { + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { // I'm the root and therefore always a leader if(rank == root) { - comms[i].comm_ctrl->leader_seq = seq; - comms[i].is_coll_leader = true; + xc->comm_ctrl->leader_seq = seq; + xc->is_leader = true; continue; } // The root takes leadership precedence when local - if(PEER_IS_LOCAL(peer_info, root, comms[i].locality)) { + if(PEER_IS_LOCAL(peer_info, root, xc->locality)) break; - } - // The member with the lowest ID (ie. 
the manager) becomes the leader - if(comms[i].member_id == 0) { - comms[i].comm_ctrl->leader_seq = seq; - comms[i].is_coll_leader = true; + // The member with the lowest ID (ie. the owner) becomes the leader + if(xc->my_id == 0) { + xc->comm_ctrl->leader_seq = seq; + xc->is_leader = true; } // Non-leaders exit; they can't become leaders on higher levels - if(comms[i].is_coll_leader == false) { + if(xc->is_leader == false) break; - } } } /* Hierarchical Barrier with seq/ack flags - * --------------------------------------- - * 1. Ranks write their coll_seq field to signal they have joined + * ----------------------------------------------------------------- + * 1. Ranks write their member seq field to signal they have joined * the collective. Leaders propagate this information towards * the top-most comm's leader using the same method. * - * 2. The top-most comm's leader (root) sets the comm's coll_ack + * 2. The top-most comm's leader (root) sets the comm's comm ack * field to signal, that all ranks have joined the barrier. * * 3. Leaders propagate the info towards the bottom-most comm, using - * the same method. Ranks wait on thei coll_ack flag, set their + * the same method. Ranks wait on their comm ack flag, set their * own ack, and exit the collective. - * --------------------------------------- */ + * ----------------------------------------------------------------- */ int mca_coll_xhc_barrier(ompi_communicator_t *ompi_comm, mca_coll_base_module_t *ompi_module) { xhc_module_t *module = (xhc_module_t *) ompi_module; - if(!module->init) { - int ret = xhc_lazy_init(module, ompi_comm); - if(ret != OMPI_SUCCESS) return ret; + if(!module->op_data[XHC_BARRIER].init) { + int err = xhc_init_op(module, ompi_comm, XHC_BARRIER); + if(err != OMPI_SUCCESS) goto _fallback_permanent; } xhc_peer_info_t *peer_info = module->peer_info; - xhc_data_t *data = module->data; + xhc_op_data_t *data = &module->op_data[XHC_BARRIER]; xhc_comm_t *comms = data->comms; int comm_count = data->comm_count; int rank = ompi_comm_rank(ompi_comm); - xf_sig_t pvt_seq = ++data->pvt_coll_seq; + xf_sig_t seq = ++data->seq; xhc_barrier_leader(comms, comm_count, peer_info, rank, - mca_coll_xhc_component.barrier_root, pvt_seq); + mca_coll_xhc_component.barrier_root, seq); // 1. Upwards SEQ Wave - for(int i = 0; i < comm_count; i++) { - xhc_comm_t *xc = &comms[i]; + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { + xc->my_ctrl->seq = seq; - xc->my_member_ctrl->member_seq = pvt_seq; - - if(!xc->is_coll_leader) { + if(!xc->is_leader) break; - } for(int m = 0; m < xc->size; m++) { - if(m == xc->member_id) { + if(m == xc->my_id) continue; - } /* Poll comm members and wait for them to join the barrier. * No need for windowed comparison here; Ranks won't exit the - * barrier before the leader has set the coll_seq flag. */ - WAIT_FLAG(&xc->member_ctrl[m].member_seq, pvt_seq, 0); + * barrier before the leader has set the comm ack flag. */ + WAIT_FLAG(&xc->member_ctrl[m].seq, seq, 0); } } // 2. Wait for ACK (root won't wait!) - for(int i = 0; i < comm_count; i++) { - xhc_comm_t *xc = &comms[i]; - - if(xc->is_coll_leader == false) { - WAIT_FLAG(&xc->comm_ctrl->coll_ack, pvt_seq, 0); + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { + if(xc->is_leader == false) { + WAIT_FLAG(&xc->comm_ctrl->ack, seq, 0); break; } } // 3. 
Trigger ACK Wave - for(int i = 0; i < comm_count; i++) { - xhc_comm_t *xc = &comms[i]; - - /* Not actually necessary for the barrier operation, but - * good for consistency between all seq/ack numbers */ - xc->my_member_ctrl->member_ack = pvt_seq; - - if(!xc->is_coll_leader) { + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { + if(!xc->is_leader) break; - } - xc->comm_ctrl->coll_ack = pvt_seq; + xc->comm_ctrl->ack = seq; } return OMPI_SUCCESS; + + // --- + +_fallback_permanent: + + XHC_INSTALL_FALLBACK(module, + ompi_comm, XHC_BARRIER, barrier); + +// _fallback: + + return XHC_CALL_FALLBACK(module->prev_colls, + XHC_BARRIER, barrier, ompi_comm); } diff --git a/ompi/mca/coll/xhc/coll_xhc_bcast.c b/ompi/mca/coll/xhc/coll_xhc_bcast.c index 006a1dbc797..d47c55dd50f 100644 --- a/ompi/mca/coll/xhc/coll_xhc_bcast.c +++ b/ompi/mca/coll/xhc/coll_xhc_bcast.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Copyright (c) 2021-2024 Computer Architecture and VLSI Systems (CARV) * Laboratory, ICS Forth. All rights reserved. * $COPYRIGHT$ * @@ -19,51 +19,58 @@ #include "coll_xhc.h" -/* When dynamic leadership is enabled, the first rank of each - * xhc comm to join the collective will become its leader */ -static void xhc_bcast_try_leader(xhc_comm_t *comms, int comm_count, - xhc_peer_info_t *peer_info, int rank, int root, xf_sig_t seq) { +// ------------------------------------------------ - // Non-leader by default - for(int i = 0; i < comm_count; i++) { - comms[i].is_coll_leader = false; +#define COMM_PREV_FINI 0x01 +#define COMM_CTRL_INIT 0x02 + +// ------------------------------------------------ + +/* When dynamic leadership is enabled, the first rank of + * each xhc comm to join the collective becomes leader */ +static void xhc_bcast_init_local(xhc_comm_t *comms, xhc_peer_info_t *peer_info, + int rank, int root, xf_sig_t seq) { + + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { + // Non-leader by default + xc->is_leader = false; + + xc->op_state = 0; } - for(int i = 0; i < comm_count; i++) { + for(xhc_comm_t *xc = comms; xc; xc = xc->up) { // I'm the root and therefore always a leader if(rank == root) { - comms[i].comm_ctrl->leader_seq = seq; - comms[i].is_coll_leader = true; + xc->comm_ctrl->leader_seq = seq; + xc->is_leader = true; continue; } // The root takes leadership precedence when local - if(PEER_IS_LOCAL(peer_info, root, comms[i].locality)) { + if(PEER_IS_LOCAL(peer_info, root, xc->locality)) break; - } if(mca_coll_xhc_component.dynamic_leader == false) { /* If dynamic leadership is disabled, the member with - * the lowest ID (ie. the manager) becomes the leader */ - if(comms[i].member_id == 0) { - comms[i].comm_ctrl->leader_seq = seq; - comms[i].is_coll_leader = true; + * the lowest ID (ie. the owner) becomes the leader */ + if(xc->my_id == 0) { + xc->comm_ctrl->leader_seq = seq; + xc->is_leader = true; } } else { // An opportunity exists to become the leader - if(comms[i].comm_ctrl->leader_seq != seq) { + if(xc->comm_ctrl->leader_seq != seq) { xf_sig_t oldval = seq - 1; - comms[i].is_coll_leader = xhc_atomic_cmpxchg_strong_relaxed( - &comms[i].comm_ctrl->leader_seq, &oldval, seq); + xc->is_leader = xhc_atomic_cmpxchg_strong_relaxed( + &xc->comm_ctrl->leader_seq, &oldval, seq); } } // Non-leaders exit; they can't become leaders on higher levels - if(comms[i].is_coll_leader == false) { + if(xc->is_leader == false) break; - } } /* The writes and the cmpxchg to comm_ctrl->leader_seq, are relaxed. 
@@ -73,269 +80,368 @@ static void xhc_bcast_try_leader(xhc_comm_t *comms, int comm_count, * or not. Only the result of the cmp operation is utilized. */ } -static void xhc_bcast_children_init(xhc_comm_t *comms, int comm_count, - void *buffer, size_t bytes_ready, xhc_copy_data_t *region_data, - bool do_cico, int rank, xf_sig_t seq) { - - for(int i = comm_count - 1; i >= 0; i--) { - xhc_comm_t *xc = &comms[i]; +// ------------------------------------------------ - if(!xc->is_coll_leader) { - continue; - } - - WAIT_FLAG(&xc->comm_ctrl->coll_ack, seq - 1, 0); - - /* Because there is a control dependency with the loads - * from coll_ack above and the code below, and because it - * is a load-store one (not load-load), I declare that a - * read-memory-barrier is not required here. */ +void mca_coll_xhc_bcast_notify(xhc_bcast_ctx_t *ctx, + xhc_comm_t *xc, size_t bytes_ready) { - xc->comm_ctrl->leader_id = xc->member_id; - xc->comm_ctrl->leader_rank = rank; + if(!(xc->op_state & COMM_PREV_FINI)) + return; - xc->comm_ctrl->cico_id = (do_cico ? comms[0].manager_rank : -1); + if(xc->op_state & COMM_CTRL_INIT) { + assert(ctx->method != XHC_COPY_IMM); + xhc_atomic_store_size_t(&xc->comm_ctrl->bytes_ready, bytes_ready); + } else { + if(ctx->method == XHC_COPY_IMM) { + xhc_memcpy((void *) xc->comm_ctrl->imm_data, ctx->buf, bytes_ready); + } else { + xc->comm_ctrl->leader_rank = ctx->rank; + xc->comm_ctrl->bytes_ready = bytes_ready; - xc->comm_ctrl->data_vaddr = (!do_cico ? buffer : NULL); - xc->comm_ctrl->bytes_ready = bytes_ready; + if(ctx->method == XHC_COPY_SMSC_MAP + || ctx->method == XHC_COPY_SMSC_NO_MAP) { + xc->comm_ctrl->data_vaddr = ctx->buf; - if(region_data != NULL) { - xhc_copy_region_post(xc->comm_ctrl->access_token, region_data); + if(ctx->region_data != NULL) + xhc_copy_region_post((void *) xc->comm_ctrl->access_token, + ctx->region_data); + } } - /* The above comm_ctrl stores must have finished before the - * peers are notified to attach/copy. We don't need an atomic - * store to bytes_ready here, since it is guarded by coll_seq. */ + /* Make sure the above stores complete + * before the one to the control flag */ xhc_atomic_wmb(); - xc->comm_ctrl->coll_seq = seq; + xc->comm_ctrl->seq = ctx->seq; + xc->op_state |= COMM_CTRL_INIT; } } -static void xhc_bcast_children_set_bytes_ready(xhc_comm_t *comms, - int comm_count, size_t bytes) { +// ------------------------------------------------ - for(int i = comm_count - 1; i >= 0; i--) { - xhc_comm_t *xc = &comms[i]; +int mca_coll_xhc_bcast_init(void *buf, size_t count, ompi_datatype_t *datatype, + int root, ompi_communicator_t *ompi_comm, xhc_module_t *module, + xhc_bcast_ctx_t *ctx) { - if(!xc->is_coll_leader) { - continue; - } + ctx->buf = buf; + ctx->datacount = count; + ctx->root = root; + ctx->datatype = datatype; + ctx->ompi_comm = ompi_comm; + ctx->module = module; - volatile xf_size_t *brp = &xc->comm_ctrl->bytes_ready; - xhc_atomic_store_size_t(brp, bytes); - } + ctx->rank = ompi_comm_rank(ompi_comm); - /* Not much reason for a wmb() here or inside the loop. - * The stores may be reordered after any following stores, - * and within themselves. 
*/ -} + ctx->src_comm = NULL; -static void xhc_bcast_do_ack(xhc_comm_t *comms, - int comm_count, xf_sig_t seq) { + ctx->region_data = NULL; + ctx->reg = NULL; - // Set Ack(s) - for(int i = 0; i < comm_count; i++) { - xhc_comm_t *xc = &comms[i]; + ctx->bytes_done = 0; + ctx->bytes_avail = 0; - xc->my_member_ctrl->member_ack = seq; + // -- - if(!xc->is_coll_leader) { - break; - } - } + size_t dtype_size; + ompi_datatype_type_size(datatype, &dtype_size); + ctx->bytes_total = count * dtype_size; - // Gather members' Ack(s) and set coll_ack - for(int i = 0; i < comm_count; i++) { - xhc_comm_t *xc = &comms[i]; + ctx->data = &module->op_data[XHC_BCAST]; - if(!xc->is_coll_leader) { - break; - } + ctx->seq = ++(ctx->data->seq); + ctx->comms = ctx->data->comms; - for(int m = 0; m < xc->size; m++) { - if(m == xc->member_id) { - continue; - } - - WAIT_FLAG(&xc->member_ctrl[m].member_ack, seq, OMPI_XHC_ACK_WIN); - } + if(ctx->bytes_total <= XHC_BCAST_IMM_SIZE) { + ctx->method = XHC_COPY_IMM; + ctx->bytes_avail = ctx->bytes_total; + } else if(ctx->bytes_total <= ctx->comms[0].cico_size) { + ctx->method = XHC_COPY_CICO; + ctx->self_cico = xhc_get_cico(module->peer_info, ctx->rank); + } else { + ctx->method = (module->zcopy_map_support ? + XHC_COPY_SMSC_MAP : XHC_COPY_SMSC_NO_MAP); - xc->comm_ctrl->coll_ack = seq; + int err = xhc_copy_expose_region(ctx->buf, + ctx->bytes_total, &ctx->region_data); + if(err != 0) return OMPI_ERROR; } -} -static xhc_comm_t *xhc_bcast_src_comm(xhc_comm_t *comms, int comm_count) { - xhc_comm_t *s = NULL; + // -- - for(int i = 0; i < comm_count; i++) { - if(!comms[i].is_coll_leader) { - s = &comms[i]; + xhc_bcast_init_local(ctx->comms, ctx->module->peer_info, + ctx->rank, ctx->root, ctx->seq); + + for(xhc_comm_t *xc = ctx->comms; xc; xc = xc->up) { + if(!xc->is_leader) { + ctx->src_comm = xc; break; } } - return s; + return OMPI_SUCCESS; } -int mca_coll_xhc_bcast(void *buf, size_t count, ompi_datatype_t *datatype, int root, - ompi_communicator_t *ompi_comm, mca_coll_base_module_t *ompi_module) { - - xhc_module_t *module = (xhc_module_t *) ompi_module; +int mca_coll_xhc_bcast_start(xhc_bcast_ctx_t *ctx) { + int err = OMPI_SUCCESS; - if(!module->init) { - int ret = xhc_lazy_init(module, ompi_comm); - if(ret != OMPI_SUCCESS) return ret; - } + for(xhc_comm_t *xc = ctx->comms->top; xc; xc = xc->down) { + if(!xc->is_leader || (xc->op_state & COMM_PREV_FINI)) + continue; - if(!ompi_datatype_is_predefined(datatype)) { - static bool warn_shown = false; - - if(!warn_shown) { - opal_output_verbose(MCA_BASE_VERBOSE_WARN, - ompi_coll_base_framework.framework_output, - "coll:xhc: Warning: XHC does not currently support " - "derived datatypes; utilizing fallback component"); - warn_shown = true; + if(!CHECK_FLAG(&xc->comm_ctrl->ack, ctx->seq - 1, 0)) { + err = OMPI_ERR_WOULD_BLOCK; + continue; } - xhc_coll_fns_t fallback = ((xhc_module_t *) module)->prev_colls; - return fallback.coll_bcast(buf, count, datatype, root, - ompi_comm, fallback.coll_bcast_module); + /* Load-store control dependency between the load for comm ack + * and the stores in xhc_bcast_notify(); no barrier required */ + + xc->op_state |= COMM_PREV_FINI; + + /* If comm ctrl is not initialized here/now through xhc_bcast_notify(), + * it will be later, lazily. With XHC_COPY_SMSC_MAP, children attach to + * their parents' buffers while waiting for the bytes_ready signal. 
In + * other cases, they don't do anything after receiving comm seq and + * before polling bytes_ready; avoid writing the cache line and thus + * triggering the cache coherency protocol twice. */ + + if(ctx->bytes_done > 0 || ctx->method == XHC_COPY_SMSC_MAP) + xhc_bcast_notify(ctx, xc, ctx->bytes_done); } - // ---- + return err; +} - xhc_peer_info_t *peer_info = module->peer_info; - xhc_data_t *data = module->data; +int mca_coll_xhc_bcast_work(xhc_bcast_ctx_t *ctx) { + xhc_comm_t *src_comm = ctx->src_comm; + xhc_comm_ctrl_t *src_ctrl = src_comm->comm_ctrl; - xhc_comm_t *comms = data->comms; - int comm_count = data->comm_count; + xhc_peer_info_t *peer_info = ctx->module->peer_info; - size_t dtype_size, bytes_total; - ompi_datatype_type_size(datatype, &dtype_size); - bytes_total = count * dtype_size; + int err; - int rank = ompi_comm_rank(ompi_comm); + // --- - bool do_cico = (bytes_total <= OMPI_XHC_CICO_MAX); - void *local_cico = xhc_get_cico(peer_info, comms[0].manager_rank); - void *src_buffer; + // Check if leader has joined (without blocking) + if(!(src_comm->op_state & COMM_CTRL_INIT)) { + if(CHECK_FLAG(&src_ctrl->seq, ctx->seq, 0)) { + switch(ctx->method) { + case XHC_COPY_IMM: + ctx->src_buffer = (void *) src_ctrl->imm_data; + break; - // Only really necessary for smsc/knem - xhc_copy_data_t *region_data = NULL; + case XHC_COPY_CICO: + ctx->src_buffer = xhc_get_cico(peer_info, src_ctrl->leader_rank); + if(ctx->src_buffer == NULL) return OMPI_ERR_OUT_OF_RESOURCE; + break; - // ---- + case XHC_COPY_SMSC_MAP: + ctx->src_buffer = xhc_get_registration( + &ctx->module->peer_info[src_ctrl->leader_rank], + src_ctrl->data_vaddr, ctx->bytes_total, &ctx->reg); + if(ctx->src_buffer == NULL) return OMPI_ERROR; + break; - xf_sig_t pvt_seq = ++data->pvt_coll_seq; + case XHC_COPY_SMSC_NO_MAP: + ctx->src_buffer = src_ctrl->data_vaddr; + break; - xhc_bcast_try_leader(comms, comm_count, peer_info, rank, root, pvt_seq); + default: + assert(0); + } - // No chunking for now... TODO? - if(rank == root && do_cico) { - memcpy(local_cico, buf, bytes_total); - } + src_comm->op_state |= COMM_CTRL_INIT; + } else + return OPAL_ERR_WOULD_BLOCK; - if(!do_cico) { - int err = xhc_copy_expose_region(buf, bytes_total, ®ion_data); - if(err != 0) { - return OMPI_ERROR; - } + xhc_atomic_rmb(); } - xhc_bcast_children_init(comms, comm_count, buf, - (rank == root ? bytes_total : 0), region_data, do_cico, rank, pvt_seq); + // --- + + size_t copy_size = opal_min(src_comm->chunk_size, + ctx->bytes_total - ctx->bytes_done); + + // Check if data's ready to be copied (without blocking) + if(ctx->bytes_avail < copy_size) { + ctx->bytes_avail = xhc_atomic_load_size_t( + &src_ctrl->bytes_ready) - ctx->bytes_done; + + if(ctx->bytes_avail < copy_size) + return OMPI_ERR_WOULD_BLOCK; - if(rank == root) { - goto coll_finish; + xhc_atomic_rmb(); } - // ---- + void *data_src = (char *) ctx->src_buffer + ctx->bytes_done; + void *data_dst = (char *) ctx->buf + ctx->bytes_done; + void *cico_dst = (char *) ctx->self_cico + ctx->bytes_done; - /* Not actually necessary for the broadcast operation, but - * good for consistency between all seq/ack numbers */ - for(int i = 0; i < comm_count; i++) { - comms[i].my_member_ctrl->member_seq = pvt_seq; - if(!comms[i].is_coll_leader) { + bool is_leader = (ctx->comms[0].is_leader); + + /* Pipelined copy not necessary when + * non-leader; copy all at once. 
*/ + if(!is_leader) + copy_size = ctx->bytes_avail; + + switch(ctx->method) { + case XHC_COPY_IMM: + case XHC_COPY_SMSC_MAP: + xhc_memcpy(data_dst, data_src, copy_size); break; - } - } - xhc_comm_t *src_comm = xhc_bcast_src_comm(comms, comm_count); - xhc_comm_ctrl_t *src_ctrl = src_comm->comm_ctrl; + case XHC_COPY_CICO: + xhc_memcpy((is_leader ? cico_dst : data_dst), data_src, copy_size); + break; - WAIT_FLAG(&src_ctrl->coll_seq, pvt_seq, 0); - xhc_atomic_rmb(); + case XHC_COPY_SMSC_NO_MAP: + err = xhc_copy_from(&peer_info[src_ctrl->leader_rank], data_dst, + data_src, copy_size, (void *) src_ctrl->access_token); + if(err != 0) return OMPI_ERROR; - if(!do_cico) { - src_buffer = src_ctrl->data_vaddr; - } else { - src_buffer = xhc_get_cico(peer_info, src_ctrl->cico_id); - if(src_buffer == NULL) return OMPI_ERR_OUT_OF_RESOURCE; + break; + + default: + assert(0); } - size_t bytes_done = 0; - size_t bytes_available = 0; + ctx->bytes_done += copy_size; + ctx->bytes_avail -= copy_size; - while(bytes_done < bytes_total) { - size_t copy_size = opal_min(src_comm->chunk_size, bytes_total - bytes_done); + /* Make sure the memcpy has completed before + * writing to bytes_ready (in xhc_bcast_notify) */ + xhc_atomic_wmb(); - void *data_dst = (char *) buf + bytes_done; - void *data_src = (char *) src_buffer + bytes_done; - void *data_cico_dst = (char *) local_cico + bytes_done; + // Notify any children + for(xhc_comm_t *xc = src_comm->down; xc; xc = xc->down) + xhc_bcast_notify(ctx, xc, ctx->bytes_done); - if(bytes_available < copy_size) { - do { - volatile xf_size_t *brp = &src_ctrl->bytes_ready; - bytes_available = xhc_atomic_load_size_t(brp) - bytes_done; - } while(bytes_available < copy_size); + if(ctx->method == XHC_COPY_CICO && is_leader) + xhc_memcpy(data_dst, cico_dst, copy_size); - // Wait on loads inside the loop - xhc_atomic_rmb(); - } + return OMPI_SUCCESS; +} - /* Pipelining is not necessary on the bottom - * level, copy all available at once */ - if(!comms[0].is_coll_leader) { - copy_size = bytes_available; - } +void mca_coll_xhc_bcast_ack(xhc_bcast_ctx_t *ctx) { - if(!do_cico) { - int err = xhc_copy_from(&peer_info[src_ctrl->leader_rank], - data_dst, data_src, copy_size, src_ctrl->access_token); - if(err != 0) { - return OMPI_ERROR; - } - } else { - memcpy((comms[0].is_coll_leader - ? data_cico_dst : data_dst), data_src, copy_size); + // Set personal ack(s) + for(xhc_comm_t *xc = ctx->comms; xc; xc = xc->up) { + xc->my_ctrl->ack = ctx->seq; + + if(!xc->is_leader) + break; + } + + // Gather members' acks and set comm ack + for(xhc_comm_t *xc = ctx->comms; xc; xc = xc->up) { + if(!xc->is_leader) + break; + + /* We are about to write this after gathering + * the members' acks, let's prefetch it now! */ + xhc_prefetchw((void *) &xc->comm_ctrl->ack, + sizeof(xc->comm_ctrl->ack), 1); + + for(int m = 0; m < xc->size; m++) { + if(m == xc->my_id) + continue; + + WAIT_FLAG(&xc->member_ctrl[m].ack, ctx->seq, 0); } - bytes_done += copy_size; - bytes_available -= copy_size; + xc->comm_ctrl->ack = ctx->seq; + } +} + +void mca_coll_xhc_bcast_fini(xhc_bcast_ctx_t *ctx) { + if(ctx->reg) + xhc_return_registration(ctx->reg); - /* Do make sure the memcpy has completed before - * writing to the peers' bytes_ready. 
*/ - xhc_atomic_wmb(); + if(ctx->region_data) + xhc_copy_close_region(ctx->region_data); - xhc_bcast_children_set_bytes_ready(comms, comm_count, bytes_done); + /* The operation is done, all children have copied from the leader's CICO + * buffer, and thus the cache lines comprising it are in various caches, + * probably in shared state. Do an RFO-prefetch, to claim them back in the + * local cache in exclusive state, to speed up writing to the CICO buffer + * in the next op. Specify prefetch to L2, to try to avoid cache pollution */ + if(ctx->method == XHC_COPY_CICO && ctx->comms[0].is_leader) + xhc_prefetchw(ctx->self_cico, ctx->bytes_total, 2); +} + +// ------------------------------------------------ + +int mca_coll_xhc_bcast(void *buf, size_t count, ompi_datatype_t *datatype, int root, + ompi_communicator_t *ompi_comm, mca_coll_base_module_t *ompi_module) { + + xhc_module_t *module = (xhc_module_t *) ompi_module; + + // --- + + if(!ompi_datatype_is_predefined(datatype)) { + WARN_ONCE("coll:xhc: Warning: XHC does not currently support " + "derived datatypes; utilizing fallback component"); + goto _fallback; + } - if(do_cico && comms[0].is_coll_leader) { - memcpy(data_dst, data_cico_dst, copy_size); + if(!module->zcopy_support) { + size_t dtype_size; ompi_datatype_type_size(datatype, &dtype_size); + size_t cico_size = module->op_config[XHC_BCAST].cico_max; + if(count * dtype_size > cico_size) { + WARN_ONCE("coll:xhc: Warning: No smsc support; utilizing fallback " + "component for bcast greater than %zu bytes", cico_size); + goto _fallback; } } - if(!do_cico) { - xhc_copy_close_region(region_data); + // --- + + xhc_bcast_ctx_t ctx; + int err; + + if(!module->op_data[XHC_BCAST].init) { + err = xhc_init_op(module, ompi_comm, XHC_BCAST); + if(err != OMPI_SUCCESS) goto _fallback_permanent; } - coll_finish: + err = xhc_bcast_init(buf, count, datatype, + root, ompi_comm, module, &ctx); + if(err != OMPI_SUCCESS) return err; + + /* Safe to alter the CICO buffer without checking any flags, + * because this is this rank's personal buffer. In any past + * ops where others copied from it, the rank has gathered + * acks that these copies have completed. */ + if(ctx.rank == root && ctx.method == XHC_COPY_CICO) + xhc_memcpy(ctx.self_cico, ctx.buf, ctx.bytes_total); - /* No wmb() necessary before sending ACK, as all operations - * that should be waited on (reads from shared buffers) have - * explicit barriers following them. */ + if(ctx.rank == root) + ctx.bytes_done = ctx.bytes_total; - xhc_bcast_do_ack(comms, comm_count, pvt_seq); + while((err = xhc_bcast_start(&ctx)) != OMPI_SUCCESS) + if(err != OMPI_ERR_WOULD_BLOCK) return err; + + while(ctx.bytes_done < ctx.bytes_total) { + err = xhc_bcast_work(&ctx); + + if(err != OMPI_SUCCESS && err != OMPI_ERR_WOULD_BLOCK) + return err; + } + + xhc_bcast_ack(&ctx); + xhc_bcast_fini(&ctx); return OMPI_SUCCESS; + + // --- + +_fallback_permanent: + + XHC_INSTALL_FALLBACK(module, + ompi_comm, XHC_BCAST, bcast); + +_fallback: + + return XHC_CALL_FALLBACK(module->prev_colls, XHC_BCAST, + bcast, buf, count, datatype, root, ompi_comm); } diff --git a/ompi/mca/coll/xhc/coll_xhc_comm.c b/ompi/mca/coll/xhc/coll_xhc_comm.c new file mode 100644 index 00000000000..7569e32f705 --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc_comm.c @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2021-2024 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+#include "mpi.h"
+
+#include "ompi/constants.h"
+#include "ompi/communicator/communicator.h"
+#include "ompi/mca/coll/coll.h"
+#include "ompi/mca/coll/base/base.h"
+#include "opal/mca/shmem/base/base.h"
+#include "opal/util/minmax.h"
+
+#include "coll_xhc.h"
+
+// ------------------------------------------------
+
+/* This is the method that constructs XHC's hierarchies. It receives a
+ * config object that contains the specs (a list of localities) for the
+ * new hierarchy. The algorithm groups ranks together according to it,
+ * and outputs the resulting list of xhc communicators. See also the
+ * inline comments for more specific implementation details. */
+int mca_coll_xhc_comms_make(ompi_communicator_t *ompi_comm,
+    xhc_module_t *module, xhc_op_config_t *config, xhc_op_data_t *data) {
+
+    xhc_peer_info_t *peer_info = module->peer_info;
+    xhc_coll_fns_t xhc_fns;
+
+    int rank = ompi_comm_rank(ompi_comm);
+    int n_ranks = ompi_comm_size(ompi_comm);
+
+    xhc_comm_t *comms = NULL;
+    int comms_size = 0;
+    int comm_count = 0;
+
+    opal_shmem_ds_t *ds_list;
+    bool *candidate_list;
+
+    size_t smsc_reg_size = 0;
+
+    int n_slices = 1;
+
+    int return_code = OMPI_SUCCESS;
+    int err;
+
+    xhc_module_set_coll_fns(ompi_comm, &module->prev_colls, &xhc_fns);
+
+    comms = malloc((comms_size = 5) * sizeof(xhc_comm_t));
+    ds_list = malloc(n_ranks * sizeof(opal_shmem_ds_t));
+    candidate_list = malloc(n_ranks * sizeof(bool));
+
+    if(!comms || !ds_list || !candidate_list)
+        RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end);
+
+    if(mca_smsc_base_has_feature(MCA_SMSC_FEATURE_REQUIRE_REGISTRATION))
+        smsc_reg_size = mca_smsc_base_registration_data_size();
+
+    // MPI_Reduce implementation is 'multi-sliced'!
+    if(data->colltype == XHC_REDUCE)
+        n_slices = 2;
+
+    // Initialize seq in a way that will have the first op use slice 0
+    data->seq = n_slices - 1;
+
+    /* We process each locality in the spec in order, and place all ranks
+     * that share it in the same group. The one amongst them with the lowest
+     * rank number becomes the 'owner' of the group, i.e. the one that will
+     * allocate the shared resources. The owner is also the de facto leader,
+     * only stepping aside in cases of dynamic leadership, or when another
+     * rank in the xhc comm is the root of the collective.
+     *
+     * For each locality, only the ranks that were owners in the comms
+     * resulting from the preceding locality are considered on this one. */
+
+    for(int h = 0; h < config->hierarchy_len; h++) {
+        xhc_comm_t *xc = &comms[comm_count];
+
+        if(comm_count == comms_size) {
+            void *tmp = realloc(comms, (comms_size *= 2) * sizeof(xhc_comm_t));
+            if(!tmp) RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end);
+            comms = tmp;
+        }
+
+        *xc = (xhc_comm_t) {
+            .locality = config->hierarchy[h],
+
+            .chunk_size = (comm_count < config->chunks_len ?
+                config->chunks[comm_count] : comms[comm_count-1].chunk_size),
+
+            .cico_size = config->cico_max,
+
+            .my_id = -1,
+            .size = 0,
+
+            .owner_rank = -1,
+
+            .slices = NULL,
+            .n_slices = n_slices,
+
+            .reduce_queue = NULL,
+            .reduce_buffer = NULL
+        };
+
+        xc->slices = calloc(xc->n_slices, sizeof(xhc_sh_slice_t));
+        if(!xc->slices) RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end);
+
+        // ----
+
+        /* Only ranks that were leaders in the previous level are candidates
+         * for this one. Every rank advertises whether others may consider
+         * it for inclusion on this one via an Allgather.
*/
+
+        bool is_candidate = (comm_count == 0
+            || rank == comms[comm_count - 1].owner_rank);
+
+        err = ompi_comm->c_coll->coll_allgather(&is_candidate, 1,
+            MPI_C_BOOL, candidate_list, 1, MPI_C_BOOL,
+            ompi_comm, ompi_comm->c_coll->coll_allgather_module);
+        if(err != OMPI_SUCCESS)
+            RETURN_WITH_ERROR(return_code, err, comm_error);
+
+        for(int r = 0; r < n_ranks; r++) {
+            /* Only consider ranks that were leaders on the previous comm.
+             * Don't get tempted to omit this check for the bottom comm; even
+             * if this is the local rank's bottom comm, it may not be for one
+             * of its peers (e.g. with some non-symmetric hierarchies). */
+            if(candidate_list[r] == false)
+                continue;
+
+            // Non-local --> not part of the comm :/
+            if(!PEER_IS_LOCAL(peer_info, r, xc->locality))
+                continue;
+
+            /* The member ID will mean slightly different things whether on
+             * the bottom comm or on higher up ones. On the bottom comm,
+             * each member ID corresponds to a single rank. On higher-up comms,
+             * each member ID represents not a single process, but a whole
+             * comm for the preceding level. */
+            if(r == rank || (comm_count > 0 && r == comms[comm_count - 1].owner_rank))
+                xc->my_id = xc->size;
+
+            // First rank to join the comm becomes the owner
+            if(xc->owner_rank == -1)
+                xc->owner_rank = r;
+
+            xc->size++;
+        }
+
+        /* If there are no local peers with regard to this locality, no
+         * XHC comm is created for this process on this level. */
+        if(xc->size <= 1) {
+            opal_output_verbose(MCA_BASE_VERBOSE_WARN,
+                ompi_coll_base_framework.framework_output,
+                "coll:xhc: Warning: Locality 0x%04x does not result "
+                "in any new groupings; skipping it", xc->locality);
+
+            /* Even though there was no other rank local to this one for
+             * this locality, and thus no XHC comm was created, this might
+             * not be the case for foreign ranks. We are obligated to
+             * participate in the Allgather they'll do on the ompi comm in
+             * order to share control structures, even if it's useless to us.
*/
+
+            err = ompi_comm->c_coll->coll_allgather(&xc->comm_ds,
+                sizeof(opal_shmem_ds_t), MPI_BYTE, ds_list,
+                sizeof(opal_shmem_ds_t), MPI_BYTE, ompi_comm,
+                ompi_comm->c_coll->coll_allgather_module);
+            if(err != OMPI_SUCCESS)
+                RETURN_WITH_ERROR(return_code, err, comm_error);
+
+            xhc_comms_destroy(xc, 1);
+            continue;
+        }
+
+        // ----
+
+        /* Init comm stuff */
+
+        xc->member_info = calloc(xc->size, sizeof(xhc_member_info_t));
+        if(xc->member_info == NULL)
+            RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, comm_error);
+
+        xc->my_info = &xc->member_info[xc->my_id];
+
+        if(data->colltype == XHC_REDUCE || data->colltype == XHC_ALLREDUCE) {
+            xc->reduce_queue = OBJ_NEW(opal_list_t);
+            if(!xc->reduce_queue) RETURN_WITH_ERROR(return_code,
+                OMPI_ERR_OUT_OF_RESOURCE, comm_error);
+
+            for(int m = 0; m < xc->size - 1; m++) {
+                xhc_rq_item_t *item = OBJ_NEW(xhc_rq_item_t);
+                if(!item) RETURN_WITH_ERROR(return_code,
+                    OMPI_ERR_OUT_OF_RESOURCE, comm_error);
+
+                opal_list_append(xc->reduce_queue, (opal_list_item_t *) item);
+            }
+        }
+
+        // ----
+
+        char *ds_base = NULL;
+
+        // Create shared structs
+        if(rank == xc->owner_rank) {
+            size_t ds_len = sizeof(xhc_comm_ctrl_t) + smsc_reg_size +
+                xc->size * sizeof(xhc_member_ctrl_t) * xc->n_slices;
+
+            if(data->colltype == XHC_REDUCE || data->colltype == XHC_ALLREDUCE)
+                ds_len += xc->size * xc->cico_size * xc->n_slices;
+
+            ds_base = xhc_shmem_create(&xc->comm_ds, ds_len,
+                ompi_comm, "ctrl", data->colltype, comm_count);
+            if(ds_base == NULL)
+                RETURN_WITH_ERROR(return_code, OMPI_ERROR, comm_error);
+
+            /* Manually 'touch' to assert allocation in local NUMA node
+             * (assuming Linux's default first-touch-alloc NUMA policy) */
+            memset(ds_base, 0, ds_len);
+
+            /* Initialize comm/member ctrl */
+
+            xhc_comm_ctrl_t *c_ctrl = (void *) ds_base;
+
+            for(int s = 0; s < n_slices; s++) {
+                xhc_member_ctrl_t *m_ctrl = (void *) (ds_base +
+                    sizeof(xhc_comm_ctrl_t) + smsc_reg_size +
+                    xc->size * sizeof(xhc_member_ctrl_t) * s);
+
+                for(int m = 0; m < xc->size; m++) {
+                    /* At the beginning of an OP with seq 'x', a member
+                     * will check for completion of previous ops using
+                     * this slice by looking for an ack of 'x - n_slices'.
+                     * The first time ever that each slice _s_ will be
+                     * used, the seq number will be 'n_slices + s'
+                     * (data->seq is initialized to n_slices - 1). */
+                    m_ctrl[m] = (xhc_member_ctrl_t) {
+                        .seq = (xf_sig_t) s,
+                        .ack = (xf_sig_t) s,
+                    };
+                }
+            }
+
+            *c_ctrl = (xhc_comm_ctrl_t) {
+                .seq = data->seq,
+                .ack = data->seq,
+                .leader_seq = data->seq
+            };
+        }
+
+        /* The comm's owners share the details of the communication structs
+         * with their children, so that they may attach to them. There's no
+         * MPI communicator that only includes the members of the XHC comm,
+         * so a single Allgather on the original MPI comm is performed.
*/ + + err = ompi_comm->c_coll->coll_allgather(&xc->comm_ds, + sizeof(opal_shmem_ds_t), MPI_BYTE, ds_list, + sizeof(opal_shmem_ds_t), MPI_BYTE, ompi_comm, + ompi_comm->c_coll->coll_allgather_module); + if(err != OMPI_SUCCESS) + RETURN_WITH_ERROR(return_code, err, comm_error); + + // Attach to owner's shared structs + if(rank != xc->owner_rank) { + xc->comm_ds = ds_list[xc->owner_rank]; + + ds_base = xhc_shmem_attach(&xc->comm_ds); + if(ds_base == NULL) + RETURN_WITH_ERROR(return_code, OMPI_ERROR, comm_error); + } + + // ---- + + xc->comm_ctrl_base = (void *) ds_base; + xc->member_ctrl_base = (void *) (ds_base + + sizeof(xhc_comm_ctrl_t) + smsc_reg_size); + + if(data->colltype == XHC_REDUCE || data->colltype == XHC_ALLREDUCE) { + xc->reduce_buffer_base = (void *) (ds_base + + sizeof(xhc_comm_ctrl_t) + smsc_reg_size + + xc->size * sizeof(xhc_member_ctrl_t) * xc->n_slices); + } + + // Assign to first slice by default + xc->comm_ctrl = xc->comm_ctrl_base; + xc->member_ctrl = xc->member_ctrl_base; + xc->my_ctrl = &xc->member_ctrl[xc->my_id]; + xc->reduce_buffer = xc->reduce_buffer_base; + + // ---- + + comm_count++; + continue; + + comm_error: { + xhc_comms_destroy(comms, comm_count+1); + comm_count = -1; + + goto end; + } + } + + for(int i = 0; i < comm_count; i++) { + comms[i].up = (i < comm_count - 1 ? &comms[i + 1] : NULL); + comms[i].down = (i > 0 ? &comms[i - 1] : NULL); + + comms[i].top = &comms[comm_count - 1]; + comms[i].bottom = &comms[0]; + + comms[i].is_top = (comms[i].up == NULL); + comms[i].is_bottom = (comms[i].down == NULL); + } + + if(config->chunks_len > 1 && config->chunks_len < comm_count) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: The chunk sizes count (%d) is shorter than the " + "hierarchy size (%d); filling in with the last entry provided", + config->chunks_len, comm_count); + } else if(config->chunks_len > comm_count) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: The chunk size count (%d) is larger than the " + "hierarchy size (%d); omitting last entries", + config->chunks_len, comm_count); + } + + // -- + + REALLOC(comms, comm_count, xhc_comm_t); + + data->comms = comms; + data->comm_count = comm_count; + + // -- + + end: + + xhc_module_set_coll_fns(ompi_comm, &xhc_fns, NULL); + + free(candidate_list); + free(ds_list); + + if(return_code != OMPI_SUCCESS) + free(comms); + + return return_code; +} + +void mca_coll_xhc_comms_destroy(xhc_comm_t *comms, int comm_count) { + bool is_owner = true; + + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + if(xc->my_id != 0) + is_owner = false; + + free(xc->slices); + free(xc->member_info); + + if(xc->reduce_queue) + OPAL_LIST_RELEASE(xc->reduce_queue); + + if(xc->comm_ctrl) { + if(is_owner) { + // OMPI issue #11123 + // opal_shmem_unlink(&xc->comm_ds); + (void) is_owner; + } + + opal_shmem_segment_detach(&xc->comm_ds); + } + + *xc = (xhc_comm_t) {0}; + } +} diff --git a/ompi/mca/coll/xhc/coll_xhc_component.c b/ompi/mca/coll/xhc/coll_xhc_component.c index 4981d8643bd..c6de4b7a7de 100644 --- a/ompi/mca/coll/xhc/coll_xhc_component.c +++ b/ompi/mca/coll/xhc/coll_xhc_component.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Copyright (c) 2021-2024 Computer Architecture and VLSI Systems (CARV) * Laboratory, ICS Forth. All rights reserved. 
* $COPYRIGHT$ * @@ -13,21 +13,26 @@ #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/base.h" +#include "ompi/mca/coll/base/coll_base_functions.h" +#include "ompi/mca/coll/base/coll_base_util.h" +#include "opal/include/opal/align.h" #include "opal/mca/shmem/base/base.h" #include "opal/util/show_help.h" #include "coll_xhc.h" +// ----------------------------- + typedef int (*csv_parse_conv_fn_t)(char *str, void *dst); typedef void (*csv_parse_destruct_fn_t)(void *data); static int xhc_register(void); +static int xhc_var_check_exclusive(const char *param_a, const char *param_b); -const char *mca_coll_xhc_component_version_string = - "Open MPI xhc collective MCA component version " OMPI_VERSION; +// ----------------------------- -static const char *hwloc_topo_str[] = { +static const char *xhc_topo_str[] = { "node", "flat", "socket", "numa", @@ -38,7 +43,7 @@ static const char *hwloc_topo_str[] = { "hwthread", "thread" }; -static const xhc_loc_t hwloc_topo_val[] = { +static const xhc_loc_t xhc_topo_val[] = { OPAL_PROC_ON_NODE, OPAL_PROC_ON_NODE, OPAL_PROC_ON_SOCKET, OPAL_PROC_ON_NUMA, @@ -49,6 +54,23 @@ static const xhc_loc_t hwloc_topo_val[] = { OPAL_PROC_ON_HWTHREAD, OPAL_PROC_ON_HWTHREAD }; +static const COLLTYPE_T xhc_colltype_to_universal_map[XHC_COLLCOUNT] = { + [XHC_BCAST] = BCAST, + [XHC_BARRIER] = BARRIER, + [XHC_REDUCE] = REDUCE, + [XHC_ALLREDUCE] = ALLREDUCE +}; + +static const char *xhc_config_source_to_str_map[XHC_CONFIG_SOURCE_COUNT] = { + [XHC_CONFIG_SOURCE_INFO_GLOBAL] = "info/global", + [XHC_CONFIG_SOURCE_INFO_OP] = "info/op", + [XHC_CONFIG_SOURCE_MCA_GLOBAL] = "mca/global", + [XHC_CONFIG_SOURCE_MCA_OP] = "mca/op" +}; + +const char *mca_coll_xhc_component_version_string = + "Open MPI xhc collective MCA component version " OMPI_VERSION; + mca_coll_xhc_component_t mca_coll_xhc_component = { .super = { .collm_version = { @@ -70,198 +92,409 @@ mca_coll_xhc_component_t mca_coll_xhc_component = { }, .priority = 0, - .print_info = false, + .print_info = 0, .shmem_backing = NULL, + .memcpy_chunk_size = 256 << 10, + .dynamic_leader = false, .barrier_root = 0, + .allreduce_root = 0, - .dynamic_reduce = OMPI_XHC_DYNAMIC_REDUCE_NON_FLOAT, - .lb_reduce_leader_assist = - (OMPI_XHC_LB_RLA_TOP_LEVEL | OMPI_XHC_LB_RLA_FIRST_CHUNK), + .dynamic_reduce = XHC_DYNAMIC_REDUCE_NON_FLOAT, + .reduce_load_balance = (XHC_REDUCE_LB_LEADER_ASSIST_TOP_LEVEL + | XHC_REDUCE_LB_LEADER_ASSIST_FIRST_CHUNK), + + .uniform_chunks = true, + .uniform_chunks_min = 4096, - .force_reduce = false, + .op_mca[XHC_BCAST] = { + .hierarchy = "numa,socket", + .chunk_size = "16K", + .cico_max = 256 + }, - .cico_max = 1024, + .op_mca[XHC_BARRIER] = { + .hierarchy = "numa,socket", + .chunk_size = "1", + .cico_max = 0 + }, - .uniform_chunks = true, - .uniform_chunks_min = 1024, + .op_mca[XHC_REDUCE] = { + .hierarchy = "l3,numa,socket", + .chunk_size = "16K", + .cico_max = 4096 + }, - /* These are the parameters that will need - * processing, and their default values. */ - .hierarchy_mca = "numa,socket", - .chunk_size_mca = "16K" + .op_mca[XHC_ALLREDUCE] = { + .hierarchy = "l3,numa,socket", + .chunk_size = "16K", + .cico_max = 4096 + } }; +// ----------------------------- + /* Initial query function that is invoked during MPI_INIT, allowing * this component to disqualify itself if it doesn't support the * required level of thread support. 
*/ int mca_coll_xhc_component_init_query(bool enable_progress_threads, bool enable_mpi_threads) { - return OMPI_SUCCESS; } -static mca_base_var_enum_value_t dynamic_reduce_options[] = { - {OMPI_XHC_DYNAMIC_REDUCE_DISABLED, "disabled"}, - {OMPI_XHC_DYNAMIC_REDUCE_NON_FLOAT, "non-float"}, - {OMPI_XHC_DYNAMIC_REDUCE_ALL, "all"}, - {0, NULL} -}; +COLLTYPE_T mca_coll_xhc_colltype_to_universal(XHC_COLLTYPE_T xhc_colltype) { + if(xhc_colltype < 0 || xhc_colltype >= XHC_COLLCOUNT) + return -1; + return xhc_colltype_to_universal_map[xhc_colltype]; +} -static mca_base_var_enum_value_flag_t lb_reduce_leader_assist_options[] = { - {OMPI_XHC_LB_RLA_TOP_LEVEL, "top", OMPI_XHC_LB_RLA_ALL}, - {OMPI_XHC_LB_RLA_FIRST_CHUNK, "first", OMPI_XHC_LB_RLA_ALL}, - {OMPI_XHC_LB_RLA_ALL, "all", - (OMPI_XHC_LB_RLA_TOP_LEVEL | OMPI_XHC_LB_RLA_FIRST_CHUNK)}, - {0, NULL, 0} -}; +const char *mca_coll_xhc_colltype_to_str(XHC_COLLTYPE_T colltype) { + return mca_coll_base_colltype_to_str(xhc_colltype_to_universal(colltype)); +} + +const char *mca_coll_xhc_config_source_to_str(xhc_config_source_t source) { + return (source >= 0 && source < XHC_CONFIG_SOURCE_COUNT ? + xhc_config_source_to_str_map[source] : NULL); +} static int xhc_register(void) { mca_base_var_enum_t *var_enum; mca_base_var_enum_flag_t *var_enum_flag; - char *tmp, *desc; - int ret; + const mca_base_var_t *var = NULL; + int vari; + + char *tmp, *name, *desc; + int err; /* Priority */ + // ----------- - (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, - "priority", "Priority of the xhc component", + mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "priority", "Priority of the xhc component.", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_2, MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.priority); - /* Info */ - - (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, - "print_info", "Print information during initialization", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.print_info); - /* SHM Backing dir */ + // ------------------ mca_coll_xhc_component.shmem_backing = (access("/dev/shm", W_OK) == 0 ? 
"/dev/shm" : opal_process_info.job_session_dir); - (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, "shmem_backing", "Directory to place backing files for shared-memory" - " control-data communication", MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + " control-data communication.", MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.shmem_backing); - /* Dynamic leader */ + /* Memcpy limit (see smsc_xpmem_memcpy_chunk_size) */ + // -------------------------------------------------- - (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, - "dynamic_leader", "Enable dynamic operation-wise group-leader selection", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.dynamic_leader); + int var_index = mca_base_var_find("opal", "smsc", "xpmem", "memcpy_chunk_size"); - /* Dynamic reduce */ + if(var_index >= 0) { + size_t *var_value_ptr; - ret = mca_base_var_enum_create("coll_xhc_dynamic_reduce_options", - dynamic_reduce_options, &var_enum); - if(ret != OPAL_SUCCESS) { - return ret; + err = mca_base_var_get_value(var_index, &var_value_ptr, NULL, NULL); + + if(err == OPAL_SUCCESS) + mca_coll_xhc_component.memcpy_chunk_size = *var_value_ptr; + else { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, + ompi_coll_base_framework.framework_output, "coll:xhc:component: " + "Can't get smsc_xpmem_memcpy_chunk_size MCA param value " + "from var index %d (%d); using xhc default", var_index, err); + } + } else { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, + ompi_coll_base_framework.framework_output, "coll:xhc:component: " + "Can't find smsc_xpmem_memcpy_chunk_size MCA param (%d); " + "using xhc default", var_index); } - /* Barrier root */ + mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "memcpy_chunk_size", "Maximum size to copy with a single call to memcpy, " + "following smsc/xpmem's paradigm. 
A smaller/larger value may provide " + "better performance on some systems.", MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, + MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, + &mca_coll_xhc_component.memcpy_chunk_size); + + /* Print Info */ + // ------------- + + mca_base_var_enum_value_flag_t print_info_options[XHC_COLLCOUNT + 4] = { + [XHC_COLLCOUNT + 0] = {XHC_PRINT_INFO_HIER_DOT, "dot", 0}, + [XHC_COLLCOUNT + 1] = {XHC_PRINT_INFO_CONFIG, "config", 0}, + [XHC_COLLCOUNT + 2] = {XHC_PRINT_INFO_ALL, "all", 0}, + [XHC_COLLCOUNT + 3] = {0, NULL, 0} + }; + + for(int t = 0; t < XHC_COLLCOUNT; t++) { + print_info_options[t] = (mca_base_var_enum_value_flag_t) { + 1 << t, xhc_colltype_to_str(t), 0}; + } - (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, - "barrier_root", "Internal root for the barrier operation (rank ID)", + err = mca_base_var_enum_create_flag("coll_xhc_print_info", + print_info_options, &var_enum_flag); + if(err != OPAL_SUCCESS) return err; + + mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "print_info", "Print information during initialization.", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, &var_enum_flag->super, 0, 0, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.print_info); + + if(mca_coll_xhc_component.print_info & XHC_PRINT_INFO_ALL) { + for(mca_base_var_enum_value_flag_t *flag = print_info_options; + flag->string; flag++) { + mca_coll_xhc_component.print_info |= flag->flag; + } + } + + OBJ_RELEASE(var_enum_flag); + + /* Root ranks for unrooted collectives */ + // -------------------------------------- + + mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "barrier_root", "Internal root for the barrier operation (rank ID).", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.barrier_root); - (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, - "dynamic_reduce", "Dynamic/out-of-order intra-group reduction", + // Currently, only rank 0 is supported in allreduce. 
+ mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "allreduce_root", "Internal root for the allreduce operation (rank ID).", + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_DEFAULT_ONLY, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.allreduce_root); + + /* Dynamic leader */ + // ----------------- + + mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "dynamic_leader", "Enable dynamic operation-wise group-leader selection.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.dynamic_leader); + + /* Dynamic reduce */ + // ----------------- + + static mca_base_var_enum_value_t dynamic_reduce_options[] = { + {XHC_DYNAMIC_REDUCE_DISABLED, "disabled"}, + {XHC_DYNAMIC_REDUCE_NON_FLOAT, "non-float"}, + {XHC_DYNAMIC_REDUCE_ALL, "all"}, + {0, NULL} + }; + + err = mca_base_var_enum_create("coll_xhc_dynamic_reduce_options", + dynamic_reduce_options, &var_enum); + if(err != OPAL_SUCCESS) return err; + + mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "dynamic_reduce", "Dynamic/out-of-order intra-group reduction.", MCA_BASE_VAR_TYPE_INT, var_enum, 0, 0, OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.dynamic_reduce); OBJ_RELEASE(var_enum); - /* Load balancing: Reduce leader assistance */ + /* Load balancing: Reduction leader assistance */ + // ---------------------------------------------- - ret = mca_base_var_enum_create_flag("coll_xhc_lb_reduce_leader_assist", - lb_reduce_leader_assist_options, &var_enum_flag); - if(ret != OPAL_SUCCESS) { - return ret; - } + mca_base_var_enum_value_flag_t reduce_load_balance_options[] = { + {XHC_REDUCE_LB_LEADER_ASSIST_TOP_LEVEL, "top", XHC_REDUCE_LB_LEADER_ASSIST_ALL}, + {XHC_REDUCE_LB_LEADER_ASSIST_FIRST_CHUNK, "first", XHC_REDUCE_LB_LEADER_ASSIST_ALL}, + {XHC_REDUCE_LB_LEADER_ASSIST_ALL, "all", (XHC_REDUCE_LB_LEADER_ASSIST_TOP_LEVEL | + XHC_REDUCE_LB_LEADER_ASSIST_FIRST_CHUNK)}, + {0, NULL, 0} + }; - (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, - "lb_reduce_leader_assist", "Reduction leader assistance modes for load balancing", + err = mca_base_var_enum_create_flag("coll_xhc_reduce_load_balance", + reduce_load_balance_options, &var_enum_flag); + if(err != OPAL_SUCCESS) return err; + + mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "reduce_load_balance", "Reduction leader assistance modes for load balancing.", MCA_BASE_VAR_TYPE_INT, &var_enum_flag->super, 0, 0, OPAL_INFO_LVL_6, - MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.lb_reduce_leader_assist); + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.reduce_load_balance); OBJ_RELEASE(var_enum_flag); - /* Force enable "hacky" reduce */ + /* (All)reduce uniform chunks */ + // --------------------------- + + mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "uniform_chunks", "Automatically optimize chunk size in reduction " + "collectives according to message size, for load balancing.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.uniform_chunks); + + mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "uniform_chunks_min", "Minimum chunk size for reduction collectives, " + "when \"uniform chunks\" are enabled.", MCA_BASE_VAR_TYPE_SIZE_T, + NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, + 
&mca_coll_xhc_component.uniform_chunks_min); + + /* Hierarchy */ + // ------------ + + char *topo_list = NULL; + + for(size_t i = 0; i < sizeof(xhc_topo_str)/sizeof(char *); i++) { + err = opal_asprintf(&tmp, "%s%s%s", (i > 0 ? topo_list : ""), + (i > 0 ? ", " : ""), xhc_topo_str[i]); + free(topo_list); topo_list = tmp; + if(err < 0) return OPAL_ERR_OUT_OF_RESOURCE; + } + + err = opal_asprintf(&desc, "Comma-separated list of topology features to " + "consider for the hierarchy (%s), for all collectives. Mutually " + "exclusive with respective op-specific params.", topo_list); + if(err < 0) {free(topo_list); return OMPI_ERR_OUT_OF_RESOURCE;} + + vari = mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "hierarchy", desc, MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_DEF_UNSET, + OPAL_INFO_LVL_4, MCA_BASE_VAR_SCOPE_READONLY, + &mca_coll_xhc_component.op_mca_global.hierarchy); + + free(desc); - (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, - "force_reduce", "Force enable the \"special\" Reduce for all calls", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.force_reduce); + mca_base_var_get(vari, &var); - /* Hierarchy features */ + for(int t = 0; t < XHC_COLLCOUNT; t++) { + err = opal_asprintf(&name, "%s_hierarchy", xhc_colltype_to_str(t)); + if(err < 0) {free(topo_list); return OPAL_ERR_OUT_OF_RESOURCE;} - desc = NULL; + err = opal_asprintf(&desc, "Comma-separated list of topology features to " + "consider for the hierarchy (%s), for %s.", topo_list, xhc_colltype_to_str(t)); + if(err < 0) {free(topo_list); free(name); return OMPI_ERR_OUT_OF_RESOURCE;} - for(size_t i = 0; i < sizeof(hwloc_topo_str)/sizeof(char *); i++) { - ret = opal_asprintf(&tmp, "%s%s%s", (i > 0 ? desc : ""), - (i > 0 ? ", " : ""), hwloc_topo_str[i]); - free(desc); desc = tmp; - if(ret < 0) { - return OPAL_ERR_OUT_OF_RESOURCE; + mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + name, desc, MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_7, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.op_mca[t].hierarchy); + + /* The op-specific MCA vars are mutually exclusive with + * the global one. Otherwise things get complicated. */ + err = xhc_var_check_exclusive("hierarchy", name); + + free(name); + free(desc); + + if(err != OPAL_SUCCESS) + return err; + + /* If the global MCA var was set (because the user set it, it's unset + * by default), it overrides the op-specific ones. (and both of them + * can't be set at the same time, as they are mutually exclusive. */ + if(var && var->mbv_source != MCA_BASE_VAR_SOURCE_DEFAULT) { + mca_coll_xhc_component.op_mca[t].hierarchy = + strdup(mca_coll_xhc_component.op_mca_global.hierarchy); } } - ret = opal_asprintf(&tmp, "Comma-separated list of topology features to " - "consider for the hierarchy (%s)", desc); - free(desc); desc = tmp; - if(ret < 0) { - return OMPI_ERR_OUT_OF_RESOURCE; + free(topo_list); + + /* Chunk size */ + // --------------- + + vari = mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "chunk_size", "Chunk size(s) for the pipeline (single value, or comma-separated " + "list for different hierarchy levels (bottom to top)), for all collectives. 
" + "Mutually exclusive with respective op-specific params.", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_DEF_UNSET, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.op_mca_global.chunk_size); + + mca_base_var_get(vari, &var); + + for(int t = 0; t < XHC_COLLCOUNT; t++) { + if(t == XHC_BARRIER) + continue; + + err = opal_asprintf(&name, "%s_chunk_size", xhc_colltype_to_str(t)); + if(err < 0) return OPAL_ERR_OUT_OF_RESOURCE; + + err = opal_asprintf(&desc, "Chunk size(s) for the pipeline (single " + "value, or comma-separated list for different hierarchy levels " + "(bottom to top)), for %s.", xhc_colltype_to_str(t)); + if(err < 0) {free(name); return OMPI_ERR_OUT_OF_RESOURCE;} + + mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + name, desc, MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_8, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.op_mca[t].chunk_size); + + err = xhc_var_check_exclusive("chunk_size", name); + + free(name); + free(desc); + + if(err != OPAL_SUCCESS) + return err; + + if(var && var->mbv_source != MCA_BASE_VAR_SOURCE_DEFAULT) { + mca_coll_xhc_component.op_mca[t].chunk_size = + strdup(mca_coll_xhc_component.op_mca_global.chunk_size); + } } - (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, - "hierarchy", desc, MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_4, - MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.hierarchy_mca); + /* CICO threshold */ + // ----------------- - free(desc); + vari = mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "cico_max", "Maximum message size up to which to use CICO, for all collectives. " + "Mutually exclusive with respective op-specific params.", + MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, MCA_BASE_VAR_FLAG_DEF_UNSET, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.op_mca_global.cico_max); - /* Chunk size(s) */ + mca_coll_xhc_component.op_mca_global.cico_max = OPAL_ALIGN( + mca_coll_xhc_component.op_mca_global.cico_max, XHC_ALIGN, size_t); - (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, - "chunk_size", "The chunk size(s) to be used for the pipeline " - "(single value, or comma separated list for different hierarchy levels " - "(bottom to top))", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.chunk_size_mca); + mca_base_var_get(vari, &var); - /* Allreduce uniform chunks */ + for(int t = 0; t < XHC_COLLCOUNT; t++) { + if(t == XHC_BARRIER) + continue; - (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, - "uniform_chunks", "Automatically optimize chunk size in reduction " - "collectives according to message size, for load balancing", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.uniform_chunks); + err = opal_asprintf(&name, "%s_cico_max", xhc_colltype_to_str(t)); + if(err < 0) return OPAL_ERR_OUT_OF_RESOURCE; - /* Allreduce uniform chunks min size */ + err = opal_asprintf(&desc, "Maximum message size up to which " + "to use CICO, for %s.", xhc_colltype_to_str(t)); + if(err < 0) {free(name); return OMPI_ERR_OUT_OF_RESOURCE;} - (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, - "uniform_chunks_min", "Minimum chunk size for reduction collectives, " - "when \"uniform chunks\" are enabled", MCA_BASE_VAR_TYPE_SIZE_T, - NULL, 0, 0, OPAL_INFO_LVL_5, 
MCA_BASE_VAR_SCOPE_READONLY, - &mca_coll_xhc_component.uniform_chunks_min); + mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + name, desc, MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, + OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_READONLY, + &mca_coll_xhc_component.op_mca[t].cico_max); + + mca_coll_xhc_component.op_mca[t].cico_max = OPAL_ALIGN( + mca_coll_xhc_component.op_mca[t].cico_max, XHC_ALIGN, size_t); + + err = xhc_var_check_exclusive("cico_max", name); - /* CICO threshold (inclusive) */ + free(name); + free(desc); - (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, - "cico_max", "Maximum message size up to which to use CICO", - MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.cico_max); + if(err != OPAL_SUCCESS) + return err; + + if(var && var->mbv_source != MCA_BASE_VAR_SOURCE_DEFAULT) { + mca_coll_xhc_component.op_mca[t].cico_max = + mca_coll_xhc_component.op_mca_global.cico_max; + } + } return OMPI_SUCCESS; } +static int xhc_var_check_exclusive(const char *param_a, const char *param_b) { + return mca_base_var_check_exclusive("ompi", + mca_coll_xhc_component.super.collm_version.mca_type_name, + mca_coll_xhc_component.super.collm_version.mca_component_name, param_a, + mca_coll_xhc_component.super.collm_version.mca_type_name, + mca_coll_xhc_component.super.collm_version.mca_component_name, param_b); +} + +// ----------------------------- + static int parse_csv(const char *csv_orig, char sep, char ignore_start, char ignore_end, void **vals_dst, int *len_dst, size_t type_size, csv_parse_conv_fn_t conv_fn, csv_parse_destruct_fn_t destructor_fn, @@ -281,13 +514,11 @@ static int parse_csv(const char *csv_orig, char sep, char ignore_start, int return_code = OMPI_SUCCESS; - if(!(csv = strdup(csv_orig))) { + if(!(csv = strdup(csv_orig))) RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); - } - if(!(vals = malloc((vals_size = 5) * type_size))) { + if(!(vals = malloc((vals_size = 5) * type_size))) RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); - } int ignore_cnt = 0; char *token = csv; @@ -299,22 +530,18 @@ static int parse_csv(const char *csv_orig, char sep, char ignore_start, if(ntokens == vals_size) { void *tmp = realloc(vals, (vals_size *= 2) * sizeof(type_size)); - if(!tmp) { - RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); - } + if(!tmp) RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); vals = tmp; } if(ignore_start != 0) { - if(*c == ignore_start) { + if(*c == ignore_start) ignore_cnt++; - } else if(*c == ignore_end) { + else if(*c == ignore_end) ignore_cnt--; - } - if(ignore_cnt < 0) { + if(ignore_cnt < 0) RETURN_WITH_ERROR(return_code, OMPI_ERR_BAD_PARAM, end); - } } if(ignore_cnt == 0 && (*c == sep || *c == '\0')) { @@ -324,10 +551,9 @@ static int parse_csv(const char *csv_orig, char sep, char ignore_start, int status = conv_fn(token, (char *) vals + ntokens*type_size); if(status != OMPI_SUCCESS) { - if(err_help_header) { + if(err_help_header) opal_show_help("help-coll-xhc.txt", err_help_header, true, token, csv_orig); - } RETURN_WITH_ERROR(return_code, status, end); } @@ -348,9 +574,8 @@ static int parse_csv(const char *csv_orig, char sep, char ignore_start, if(return_code != OMPI_SUCCESS) { if(vals && destructor_fn) { - for(int i = 0; i < ntokens; i++) { + for(int i = 0; i < ntokens; i++) destructor_fn((char *) vals + i*type_size); - } } free(vals); @@ -359,6 +584,8 @@ static int parse_csv(const char *csv_orig, char 
sep, char ignore_start, return return_code; } +// ----------------------------- + static int conv_xhc_loc_def_rank_list(char *str, void *result) { char *strs[2] = {str, NULL}; int nums[2] = {-1, -1}; @@ -377,9 +604,8 @@ static int conv_xhc_loc_def_rank_list(char *str, void *result) { nums[i] = strtol(strs[i], &endptr, 10); - if(endptr[0] != '\0' || nums[i] < 0) { + if(endptr[0] != '\0' || nums[i] < 0) RETURN_WITH_ERROR(return_code, OMPI_ERR_BAD_PARAM, end); - } } ((xhc_rank_range_t *) result)->start_rank = nums[0]; @@ -387,9 +613,8 @@ static int conv_xhc_loc_def_rank_list(char *str, void *result) { end: - if(range_op_pos) { + if(range_op_pos) *range_op_pos = '.'; - } return return_code; } @@ -416,9 +641,8 @@ static int conv_xhc_loc_def(char *str, void *result) { char *s = strdup(str); xhc_loc_def_t *def = OBJ_NEW(xhc_loc_def_t); - if(!s || !def) { + if(!s || !def) RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); - } /* Parse modifiers and remove them from string */ @@ -430,23 +654,19 @@ static int conv_xhc_loc_def(char *str, void *result) { char *colon_pos = strrchr(s, ':'); char *qmark_pos = strrchr(s, '?'); - if(colon_pos && qmark_pos) { + if(colon_pos && qmark_pos) RETURN_WITH_ERROR(return_code, OMPI_ERR_BAD_PARAM, end); - } else if(colon_pos || qmark_pos) { + else if(colon_pos || qmark_pos) { char *numstr = (colon_pos ? colon_pos : qmark_pos); char *endptr; int num = strtol(numstr + 1, &endptr, 10); - if(endptr[0] != '\0' || num <= 0) { + if(endptr[0] != '\0' || num <= 0) RETURN_WITH_ERROR(return_code, OMPI_ERR_BAD_PARAM, end); - } - if(colon_pos) { - def->split = num; - } else { - def->max_ranks = num; - } + if(colon_pos) def->split = num; + else def->max_ranks = num; *numstr = '\0'; } @@ -454,9 +674,8 @@ static int conv_xhc_loc_def(char *str, void *result) { /* Parse locality definition */ if(s[0] == '[') { - if(def->repeat) { // repeat only makes sense with named localities + if(def->repeat) // repeat only makes sense with named localities RETURN_WITH_ERROR(return_code, OMPI_ERR_BAD_PARAM, end); - } s[strlen(s) - 1] = '\0'; @@ -464,23 +683,21 @@ static int conv_xhc_loc_def(char *str, void *result) { &def->rank_list_len, sizeof(xhc_rank_range_t), conv_xhc_loc_def_rank_list, NULL, NULL); - if(status != OMPI_SUCCESS) { + if(status != OMPI_SUCCESS) RETURN_WITH_ERROR(return_code, status, end); - } } else { bool found = false; - for(size_t i = 0; i < sizeof(hwloc_topo_str)/sizeof(char *); i++) { - if(strcasecmp(s, hwloc_topo_str[i]) == 0) { - def->named_loc = hwloc_topo_val[i]; + for(size_t i = 0; i < sizeof(xhc_topo_str)/sizeof(char *); i++) { + if(strcasecmp(s, xhc_topo_str[i]) == 0) { + def->named_loc = xhc_topo_val[i]; found = true; break; } } - if(!found) { + if(!found) RETURN_WITH_ERROR(return_code, OMPI_ERR_BAD_PARAM, end); - } } * (xhc_loc_def_t **) result = def; @@ -489,9 +706,8 @@ static int conv_xhc_loc_def(char *str, void *result) { free(s); - if(return_code != OMPI_SUCCESS) { + if(return_code != OMPI_SUCCESS) OBJ_RELEASE_IF_NOT_NULL(def); - } return return_code; } @@ -507,16 +723,13 @@ static int conv_xhc_loc_def_combination(char *str, void *result) { int status = parse_csv(str, '+', 0, 0, (void **) &defs, &ndefs, sizeof(xhc_loc_def_t *), conv_xhc_loc_def, destruct_xhc_loc_def, NULL); - if(status != OMPI_SUCCESS) { - return status; - } + if(status != OMPI_SUCCESS) return status; opal_list_t *def_list = (opal_list_t *) result; OBJ_CONSTRUCT(def_list, opal_list_t); - for(int i = 0; i < ndefs; i++) { + for(int i = 0; i < ndefs; i++) opal_list_append(def_list, 
(opal_list_item_t *) defs[i]); - } free(defs); @@ -545,7 +758,7 @@ int mca_coll_xhc_component_parse_hierarchy(const char *val_str, * * Each item in this '+'-separated list, can be of the following types: * 1. A "named locality", e.g. hwloc's localities (only ones currently - * available), see hwloc_topo_str[]. + * available), see xhc_topo_str[]. * 2. A list of ranks that should be grouped together. This is a comma- * separated list of integers, enclosed in [] (I know, list-ception!). * It may also contain range operators (..), to select multiple ranks @@ -555,8 +768,8 @@ int mca_coll_xhc_component_parse_hierarchy(const char *val_str, * Finally, each such item may be suffixed by a special modifier: * 1. The split modifier (:) specifies to group according to the * locality it refers to, but to split each such group into multiple - * parts. E.g. the locality 'numa:2' will group ranks into half-numas - * group, such that for each NUMA node, half the ranks are in one + * parts. E.g. the locality 'numa:2' will group ranks into half-numa + * groups, such that for each NUMA node, half the ranks are in one * group, and the rest are in another. * 2. The max-ranks modifier (?) works similarly to the split modifier, * only that it specifies that at most _n_ ranks should be placed in @@ -572,10 +785,10 @@ int mca_coll_xhc_component_parse_hierarchy(const char *val_str, * example, "numa", even though it is a single key, means to group * all ranks that are in the same NUMA together, which will lead to * multiple groups if multiple NUMA nodes are present. This is in - * contract to rank lists, which only create a single group, containing + * contrast to rank lists, which only create a single group, containing * the ranks specified in it. The different items in the '+'-separated * list are consumed in-order left-to-right, and any named localities - * are automatically repeated to apply all ranks that are not included + * are automatically repeated to apply to all ranks that are not included * in other items. When multiple named localities are present one after * the other, the last one is repeated, unless another repetition was * explicitly requested via the repeat modifier. 
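To make the grammar documented above more concrete, here is a small usage sketch of the parser. It assumes the entry point and output convention introduced by this patch (mca_coll_xhc_component_parse_hierarchy() filling an array of opal_list_t, one list of xhc_loc_def_t items per comma-separated level); the spec string and the helper function itself are hypothetical examples, not part of the change.

#include "ompi/mca/coll/base/base.h"
#include "coll_xhc.h"

/* Parse an example hierarchy spec and report how many levels it yields.
 * "numa:2,socket" is a sample string following the grammar above:
 * half-NUMA groups at the bottom level, with sockets above them. */
static int xhc_hierarchy_spec_demo(void) {
    opal_list_t *level_defs = NULL;
    int nlevel_defs = 0;

    int err = mca_coll_xhc_component_parse_hierarchy("numa:2,socket",
        &level_defs, &nlevel_defs);
    if(err != OMPI_SUCCESS) return err;

    for(int i = 0; i < nlevel_defs; i++) {
        /* Each list holds the '+'-separated xhc_loc_def_t items
         * that make up hierarchy level i (bottom to top). */
        opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT,
            ompi_coll_base_framework.framework_output,
            "coll:xhc: level %d has %d definition(s)", i,
            (int) opal_list_get_size(&level_defs[i]));

        OPAL_LIST_DESTRUCT(&level_defs[i]);
    }

    free(level_defs);
    return OMPI_SUCCESS;
}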
@@ -619,7 +832,9 @@ int mca_coll_xhc_component_parse_hierarchy(const char *val_str, return status; } -static int conv_chunk_size(char *str, void *result) { +// ----------------------------- + +static int conv_num_size(char *str, void *result) { size_t last_idx = strlen(str) - 1; char saved_char = str[last_idx]; @@ -660,9 +875,8 @@ int mca_coll_xhc_component_parse_chunk_sizes(const char *val_str, if(val_str == NULL) { *chunks_dst = malloc(sizeof(size_t)); - if(*chunks_dst == NULL) { + if(*chunks_dst == NULL) return OMPI_ERR_OUT_OF_RESOURCE; - } (*chunks_dst)[0] = (size_t) -1; *len_dst = 1; @@ -671,7 +885,23 @@ int mca_coll_xhc_component_parse_chunk_sizes(const char *val_str, } int status = parse_csv(val_str, ',', 0, 0, (void **) chunks_dst, len_dst, - sizeof(size_t), conv_chunk_size, NULL, "bad-chunk-size-item"); + sizeof(size_t), conv_num_size, NULL, "bad-chunk-size-item"); + + return status; +} + +int mca_coll_xhc_component_parse_cico_max(const char *val_str, + size_t *cico_max_dst) { + + int status; + + if(val_str) { + status = conv_num_size((char *) val_str, cico_max_dst); + } else + status = OMPI_ERR_BAD_PARAM; + + if(status == OMPI_ERR_BAD_PARAM) + opal_show_help("help-coll-xhc.txt", "bad_cico_max", true, val_str); return status; } diff --git a/ompi/mca/coll/xhc/coll_xhc_hierarchy.c b/ompi/mca/coll/xhc/coll_xhc_hierarchy.c new file mode 100644 index 00000000000..f413c8cdd35 --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc_hierarchy.c @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2021-2024 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "mpi.h" + +#include "ompi/constants.h" +#include "ompi/communicator/communicator.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/coll/base/base.h" +#include "opal/util/show_help.h" +#include "opal/util/minmax.h" + +#include "coll_xhc.h" + +// ------------------------------------------------ + +static int xhc_hierarchy_create(xhc_module_t *module, ompi_communicator_t *comm, + opal_list_t *level_defs, int nlevel_defs, xhc_loc_t **hierarchy_dst, + int *hierarchy_len_dst); + +static int xhc_hierarchy_sort(mca_coll_xhc_module_t *module, + ompi_communicator_t *comm, xhc_loc_t **hierarchy_dst, + int *hierarchy_len_dst); + +// ------------------------------------------------ + +int mca_coll_xhc_hierarchy_make(xhc_module_t *module, + ompi_communicator_t *comm, const char *hierarchy_string, + xhc_loc_t **hierarchy_dst, int *hierarchy_len_dst) { + + xhc_loc_t *hierarchy = NULL; + int hierarchy_len; + + opal_list_t *level_defs = NULL; + int nlevel_defs = 0; + + int err, return_code = OMPI_SUCCESS; + + // --- + + err = xhc_component_parse_hierarchy(hierarchy_string, + &level_defs, &nlevel_defs); + if(err != OMPI_SUCCESS) RETURN_WITH_ERROR(return_code, err, end); + + err = xhc_hierarchy_create(module, comm, level_defs, + nlevel_defs, &hierarchy, &hierarchy_len); + if(err != OMPI_SUCCESS) RETURN_WITH_ERROR(return_code, err, end); + + err = xhc_hierarchy_sort(module, comm, &hierarchy, &hierarchy_len); + if(err != OMPI_SUCCESS) RETURN_WITH_ERROR(return_code, err, end); + + // --- + + *hierarchy_dst = hierarchy; + *hierarchy_len_dst = hierarchy_len; + + end: + + for(int i = 0; i < nlevel_defs; i++) + OPAL_LIST_DESTRUCT(&level_defs[i]); + free(level_defs); + + if(err != OMPI_SUCCESS) + free(hierarchy); + + return return_code; +} + +// ------------------------------------------------ + +static int 
xhc_hierarchy_create(xhc_module_t *module, ompi_communicator_t *comm, + opal_list_t *level_defs, int nlevel_defs, xhc_loc_t **hierarchy_dst, + int *hierarchy_len_dst) { + + xhc_peer_info_t *peer_info = module->peer_info; + xhc_coll_fns_t xhc_fns; + + int comm_size = ompi_comm_size(comm); + int rank = ompi_comm_rank(comm); + + xhc_loc_t *hierarchy = NULL; + int nvirt_hiers = 0; + + int *rank_list; + + opal_hwloc_locality_t *loc_list; + ompi_datatype_t *hwloc_locality_type = NULL; + + int err, return_code = OMPI_SUCCESS; + + xhc_module_set_coll_fns(comm, &module->prev_colls, &xhc_fns); + + hierarchy = malloc(nlevel_defs * sizeof(xhc_loc_t)); + rank_list = malloc(comm_size * sizeof(int)); + loc_list = malloc(comm_size * sizeof(opal_hwloc_locality_t)); + + if(!hierarchy || !rank_list || !loc_list) + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + + switch(sizeof(opal_hwloc_locality_t)) { + case 1: hwloc_locality_type = MPI_UINT8_T; break; + case 2: hwloc_locality_type = MPI_UINT16_T; break; + case 4: hwloc_locality_type = MPI_UINT32_T; break; + case 8: hwloc_locality_type = MPI_UINT64_T; break; + } + assert(hwloc_locality_type); + + for(int h = 0; h < nlevel_defs; h++) { + opal_list_t *defs = &level_defs[h]; + + xhc_loc_def_t *my_def = NULL; + xhc_loc_t locality; + + xhc_loc_def_t *def_0 = (xhc_loc_def_t *) opal_list_get_first(defs); + + bool is_virtual = (opal_list_get_size(defs) > 1 || def_0->rank_list + || def_0->split > 1 || def_0->max_ranks > 0); + + if(is_virtual) { + if(nvirt_hiers == XHC_LOC_EXT_BITS) { + opal_show_help("help-coll-xhc.txt", "too-many-virt-hiers", true); + RETURN_WITH_ERROR(return_code, OMPI_ERR_NOT_SUPPORTED, end); + } + + locality = 1 << (XHC_LOC_EXT_START + nvirt_hiers); + nvirt_hiers++; + } else + locality = def_0->named_loc; + + hierarchy[h] = locality; + def_0 = NULL; + + xhc_loc_def_t *def, *def_next; + + /* Handle rank lists; take note if I belong + * in one, and remove them from the mix */ + OPAL_LIST_FOREACH_SAFE(def, def_next, defs, xhc_loc_def_t) { + if(def->rank_list) { + if(!my_def) { + for(int rl = 0; rl < def->rank_list_len; rl++) { + if(rank >= def->rank_list[rl].start_rank + && rank <= def->rank_list[rl].end_rank) { + my_def = def; + break; + } + } + } + + opal_list_remove_item(defs, (opal_list_item_t *) def); + if(def != my_def) OBJ_RELEASE(def); + } + } + + bool dir_fwd = true; + + /* When multiple locality defitions are present, they are assigned + * to groups in a left-to-right fashion. At every turn, the first + * rank (determined by the minimum ID) that's still not part of + * a locality, as well as the other ranks that are local with it, + * claim/consume the next locality from the list. The direction + * serves to implement the repeat modifier. When it is located, + * the process starts taking place right-to-left following the max + * ID. At the end and after the loop, the repeated locality will + * be the only one left and all remaining ranks will follow it. */ + while(opal_list_get_size(defs) > 1) { + def = (xhc_loc_def_t *) (dir_fwd ? opal_list_get_first(defs) + : opal_list_get_last(defs)); + + if(dir_fwd && def->repeat) { + dir_fwd = false; + continue; + } + + int ticket = (my_def == NULL ? rank : (dir_fwd ? comm_size : -1)); + int chosen; + + err = comm->c_coll->coll_allreduce(&ticket, &chosen, 1, + MPI_INT, (dir_fwd ? 
MPI_MIN : MPI_MAX), comm, + comm->c_coll->coll_allreduce_module); + if(err != OMPI_SUCCESS) + RETURN_WITH_ERROR(return_code, err, end); + + if(chosen >= 0 && chosen < comm_size + && PEER_IS_LOCAL(peer_info, chosen, def->named_loc)) + my_def = def; + + opal_list_remove_item(defs, (opal_list_item_t *) def); + if(def != my_def) OBJ_RELEASE(def); + } + + if(opal_list_get_size(defs) > 0 && !my_def) { + my_def = (xhc_loc_def_t *) opal_list_get_first(defs); + opal_list_remove_item(defs, (opal_list_item_t *) my_def); + } + + /* Share which named locality each rank follows; ranks that + * follow different localities shouldn't be grouped together */ + opal_hwloc_locality_t follow_loc = (my_def ? my_def->named_loc : 0); + err = comm->c_coll->coll_allgather(&follow_loc, 1, + hwloc_locality_type, loc_list, 1, hwloc_locality_type, + comm, comm->c_coll->coll_allgather_module); + if(err != OMPI_SUCCESS) + RETURN_WITH_ERROR(return_code, err, end); + + if(my_def == NULL) + continue; + + int my_id = -1; + int members = 0; + + // If working with rank list, set the ranks from the list as "local" + if(my_def->rank_list) { + for(int i = 0; i < my_def->rank_list_len; i++) { + for(int r = my_def->rank_list[i].start_rank; + r <= my_def->rank_list[i].end_rank && r < comm_size; r++) { + if(r == rank) + my_id = members; + + peer_info[r].locality |= locality; + rank_list[members++] = r; + } + } + } else if(is_virtual) { + /* We might have a named locality instead of a rank list, but if + * we still needed to create a virtual one, we need to apply it. */ + for(int r = 0; r < comm_size; r++) { + if(loc_list[r] != my_def->named_loc) + continue; + + if(!PEER_IS_LOCAL(peer_info, r, my_def->named_loc)) + continue; + + if(r == rank) + my_id = members; + + peer_info[r].locality |= locality; + rank_list[members++] = r; + } + } + + /* If split or max ranks was specified, must partition the locality + * and remove the previously added locality mapping to some ranks. */ + if(my_def->split > 1) { + int piece_size = members / my_def->split; + int leftover = members % my_def->split; + + for(int m = 0, next_border = 0; m < members; m++) { + if(m == next_border) { + next_border += piece_size + (leftover > 0 ? 
1 : 0); + if(leftover > 0) leftover--; + + if(my_id >= m && my_id < next_border) { + m = next_border - 1; + continue; + } + } + + peer_info[rank_list[m]].locality &= ~locality; + } + } else if(my_def->max_ranks > 1) { + for(int m = 0; m < members; m++) { + if(m % my_def->max_ranks == 0) { + if(my_id >= m && my_id - m < my_def->max_ranks) { + m += my_def->max_ranks - 1; + continue; + } + } + + peer_info[rank_list[m]].locality &= ~locality; + } + } + + OBJ_RELEASE_IF_NOT_NULL(my_def); + } + + *hierarchy_dst = hierarchy; + *hierarchy_len_dst = nlevel_defs; + +end: + + xhc_module_set_coll_fns(comm, &xhc_fns, NULL); + + free(rank_list); + free(loc_list); + + if(return_code != OMPI_SUCCESS) + free(hierarchy); + + return return_code; +} + +static int xhc_hierarchy_sort(mca_coll_xhc_module_t *module, + ompi_communicator_t *comm, xhc_loc_t **hierarchy_dst, + int *hierarchy_len_dst) { + + xhc_peer_info_t *peer_info = module->peer_info; + int comm_size = ompi_comm_size(comm); + + xhc_loc_t *old_hier = *hierarchy_dst; + int hier_len = *hierarchy_len_dst; + + xhc_loc_t *new_hier = NULL; + bool *hier_done = NULL; + + int return_code = OMPI_SUCCESS; + + new_hier = malloc((hier_len + 1) * sizeof(xhc_loc_t)); + hier_done = calloc(hier_len, sizeof(bool)); + + if(new_hier == NULL || hier_done == NULL) + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + + bool has_virtual = false; + for(int i = 0; i < hier_len; i++) { + if(old_hier[i] >= (1 << XHC_LOC_EXT_START)) { + has_virtual = true; + break; + } + } + + /* If any virtual hierarchy is involved, attempting to sort it is likely + * asking for trouble. Skip the sorting, and only consider adding a top + * common locality. There is a chance it wasn't actually necessary, but + * it never hurts. */ + + if(has_virtual) { + memcpy(new_hier, old_hier, hier_len * sizeof(xhc_loc_t)); + } else { + for(int new_idx = hier_len - 1; new_idx >= 0; new_idx--) { + int max_matches_count = -1; + int max_matches_hier_idx = -1; + + for(int i = 0; i < hier_len; i++) { + if(hier_done[i]) + continue; + + int matches = 0; + + for(int r = 0; r < comm_size; r++) { + if(PEER_IS_LOCAL(peer_info, r, old_hier[i])) + matches++; + } + + if(matches > max_matches_count) { + max_matches_count = matches; + max_matches_hier_idx = i; + } + } + + assert(max_matches_count != -1); + + new_hier[new_idx] = old_hier[max_matches_hier_idx]; + hier_done[max_matches_hier_idx] = true; + } + } + + xhc_loc_t common_locality = (xhc_loc_t) -1; + + for(int r = 0; r < comm_size; r++) { + ompi_proc_t *proc = ompi_comm_peer_lookup(comm, r); + common_locality &= proc->super.proc_flags; + } + + if(common_locality == 0) { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, + ompi_coll_base_framework.framework_output, + "coll:xhc: Error: There is no locality common " + "to all ranks in the communicator"); + + RETURN_WITH_ERROR(return_code, OMPI_ERR_NOT_SUPPORTED, end); + } + + if(hier_len == 0 || (common_locality & new_hier[hier_len - 1]) + != new_hier[hier_len - 1]) { + + new_hier[hier_len] = common_locality; + hier_len++; + } + + REALLOC(new_hier, hier_len, xhc_loc_t); + + free(old_hier); + + *hierarchy_dst = new_hier; + *hierarchy_len_dst = hier_len; + +end: + + free(hier_done); + + if(return_code != OMPI_SUCCESS) + free(new_hier); + + return return_code; +} diff --git a/ompi/mca/coll/xhc/coll_xhc_intrinsic.h b/ompi/mca/coll/xhc/coll_xhc_intrinsic.h new file mode 100644 index 00000000000..490b57407b5 --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc_intrinsic.h @@ -0,0 +1,151 @@ +/* + * Copyright (c) 
2021-2024 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_COLL_XHC_ATOMIC_EXPORT_H +#define MCA_COLL_XHC_ATOMIC_EXPORT_H + +#include +#include "opal/sys/atomic.h" + +// ---------------------------------------- + +#define IS_SIG_ATOMIC_X_BITS(x) \ + (SIG_ATOMIC_MAX == INT ## x ## _MAX) || (SIG_ATOMIC_MAX == UINT ## x ## _MAX) + +// If xf_sig_t is ever re-defined to be signed, + // CHECK_FLAGS()'s comparisons must be adjusted +#if IS_SIG_ATOMIC_X_BITS(64) + typedef uint64_t xf_sig_t; +#elif IS_SIG_ATOMIC_X_BITS(32) + typedef uint32_t xf_sig_t; +#elif IS_SIG_ATOMIC_X_BITS(16) + typedef uint16_t xf_sig_t; +#elif IS_SIG_ATOMIC_X_BITS(8) + typedef uint8_t xf_sig_t; +#endif + +typedef int __attribute__((aligned(SIZEOF_INT))) xf_int_t; +typedef size_t __attribute__((aligned(SIZEOF_SIZE_T))) xf_size_t; + +// ---------------------------------------- + +#define xhc_atomic_rmb opal_atomic_rmb +#define xhc_atomic_wmb opal_atomic_wmb +#define xhc_atomic_fmb opal_atomic_mb + +// https://github.com/open-mpi/ompi/issues/9722 + +#if OPAL_USE_GCC_BUILTIN_ATOMICS || OPAL_USE_C11_ATOMICS + #define xhc_atomic_load_int(addr) __atomic_load_n(addr, __ATOMIC_RELAXED) + #define xhc_atomic_store_int(addr, val) __atomic_store_n(addr, val, __ATOMIC_RELAXED) + + #define xhc_atomic_load_size_t(addr) __atomic_load_n(addr, __ATOMIC_RELAXED) + #define xhc_atomic_store_size_t(addr, val) __atomic_store_n(addr, val, __ATOMIC_RELAXED) +#else + #define xhc_atomic_load_int(addr) (*(addr)) + #define xhc_atomic_store_int(addr, val) (*(addr) = (val)) + + #define xhc_atomic_load_size_t(addr) (*(addr)) + #define xhc_atomic_store_size_t(addr, val) (*(addr) = (val)) + + #warning "GCC or the C11 atomics backend was not found. 
XHC might not function correctly" +/* #else + #error "XHC atomics do not yet work without the GCC or the C11 backend" */ +#endif + + +// If/when opal atomic load/store size_t is added + +/* #define xhc_atomic_load_size_t(addr) \ + opal_atomic_load_size_t ((opal_atomic_size_t *) addr) +#define xhc_atomic_store_size_t(addr, val) \ + opal_atomic_store_size_t ((opal_atomic_size_t *) addr, val) */ + + +// If/when opal atomic load/store is added, and if opal atomic load/store int is not + +/* #if SIZEOF_INT == 4 + #define xhc_atomic_load_int(addr) opal_atomic_load_32 ((opal_atomic_int32_t *) addr) + #define xhc_atomic_store_int(addr, val) opal_atomic_store_32 ((opal_atomic_int32_t *) addr, val) +#elif SIZEOF_INT == 8 + #define xhc_atomic_load_int(addr) opal_atomic_load_64 ((opal_atomic_int64_t *) addr) + #define xhc_atomic_store_int(addr, val) opal_atomic_store_64 ((opal_atomic_int64_t *) addr, val) +#else + #error "Unsupported int size" +#endif */ + + +// If/when opal atomic load/store is added, and if opal atomic load/store size_t is not + +/* #if SIZEOF_SIZE_T == 4 + #define xhc_atomic_load_size_t(addr) opal_atomic_load_32 ((opal_atomic_int32_t *) addr) + #define xhc_atomic_store_size_t(addr, val) opal_atomic_store_32 ((opal_atomic_int32_t *) addr, val) +#elif SIZEOF_SIZE_T == 8 + #define xhc_atomic_load_size_t(addr) opal_atomic_load_64 ((opal_atomic_int64_t *) addr) + #define xhc_atomic_store_size_t(addr, val) opal_atomic_store_64 ((opal_atomic_int64_t *) addr, val) +#else + #error "Unsupported size_t size" +#endif */ + +static inline bool xhc_atomic_cmpxchg_strong_relaxed(volatile xf_sig_t *addr, + xf_sig_t *oldval, xf_sig_t newval) { + + #if OPAL_USE_GCC_BUILTIN_ATOMICS || OPAL_USE_C11_ATOMICS + return __atomic_compare_exchange_n(addr, oldval, newval, + false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); + #else + #if IS_SIG_ATOMIC_X_BITS(32) + return opal_atomic_compare_exchange_strong_32(addr, oldval, newval); + #elif IS_SIG_ATOMIC_X_BITS(64) + return opal_atomic_compare_exchange_strong_64(addr, oldval, newval); + #else + #error "Unsupported sig_atomic_t size" + #endif + #endif +} + +// ---------------------------------------- + +static inline void xhc_prefetchw(void *addr, size_t len, int target) { + for(char *p = (char *) ((uintptr_t) addr & ~63); + p < (char *) addr + len; p += 64) { + + switch(target) { + case 0: + case 1: // L1 cache + #if defined(PLATFORM_ARCH_X86) || defined(PLATFORM_ARCH_X86_64) + __asm__ __volatile__("prefetchw (%1)" : : "i" (0), "r" (p)); + #elif defined(PLATFORM_ARCH_AARCH64) + __asm__ __volatile__("prfm pstl1keep, %a0\n" : : "p" (p)); + #endif + break; + + case 2: // L2 cache + #if defined(PLATFORM_ARCH_X86) || defined(PLATFORM_ARCH_X86_64) + __asm__ __volatile__("prefetchwt1 (%1)" : : "i" (0), "r" (p)); + #elif defined(PLATFORM_ARCH_AARCH64) + __asm__ __volatile__("prfm pstl2keep, %a0\n" : : "p" (p)); + #endif + break; + + case 3: // L3 cache + default: + #if defined(PLATFORM_ARCH_X86) || defined(PLATFORM_ARCH_X86_64) + // no such thing as a 'prefetchwt2' + __asm__ __volatile__("prefetchwt1 (%1)" : : "i" (0), "r" (p)); + #elif defined(PLATFORM_ARCH_AARCH64) + __asm__ __volatile__("prfm pstl3keep, %a0\n" : : "p" (p)); + #endif + break; + } + } +} + +#endif diff --git a/ompi/mca/coll/xhc/coll_xhc_module.c b/ompi/mca/coll/xhc/coll_xhc_module.c index 56fc563f8e6..40325c508b1 100644 --- a/ompi/mca/coll/xhc/coll_xhc_module.c +++ b/ompi/mca/coll/xhc/coll_xhc_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Copyright 
(c) 2021-2024 Computer Architecture and VLSI Systems (CARV) * Laboratory, ICS Forth. All rights reserved. * Copyright (c) 2024 Jeffrey M. Squyres. All rights reserved. * $COPYRIGHT$ @@ -26,37 +26,83 @@ #include "coll_xhc.h" -static int xhc_module_save_fallback_fns( - xhc_module_t *module, ompi_communicator_t *comm); +// ----------------------------- -static int xhc_module_create_hierarchy(mca_coll_xhc_module_t *module, - ompi_communicator_t *comm, opal_list_t *level_defs, int nlevel_defs, - xhc_loc_t **hierarchy_dst, int *hierarchy_len_dst); +static void mca_coll_xhc_module_construct(mca_coll_xhc_module_t *module); +static void mca_coll_xhc_module_destruct(mca_coll_xhc_module_t *module); -static int xhc_module_sort_hierarchy(mca_coll_xhc_module_t *module, - ompi_communicator_t *comm, xhc_loc_t **hierarchy_dst, int *hierarchy_len_dst); +OBJ_CLASS_INSTANCE(mca_coll_xhc_module_t, mca_coll_base_module_t, + mca_coll_xhc_module_construct, mca_coll_xhc_module_destruct); // ----------------------------- -static void xhc_module_clear(xhc_module_t *module) { - memset(&module->prev_colls, 0, sizeof(module->prev_colls)); +static size_t xhc_colltype_to_c_coll_fn_offset_map[XHC_COLLCOUNT] = { + [XHC_BCAST] = offsetof(mca_coll_base_comm_coll_t, coll_bcast), + [XHC_BARRIER] = offsetof(mca_coll_base_comm_coll_t, coll_barrier), + [XHC_REDUCE] = offsetof(mca_coll_base_comm_coll_t, coll_reduce), + [XHC_ALLREDUCE] = offsetof(mca_coll_base_comm_coll_t, coll_allreduce) +}; + +static size_t xhc_colltype_to_c_coll_module_offset_map[XHC_COLLCOUNT] = { + [XHC_BCAST] = offsetof(mca_coll_base_comm_coll_t, coll_bcast_module), + [XHC_BARRIER] = offsetof(mca_coll_base_comm_coll_t, coll_barrier_module), + [XHC_REDUCE] = offsetof(mca_coll_base_comm_coll_t, coll_reduce_module), + [XHC_ALLREDUCE] = offsetof(mca_coll_base_comm_coll_t, coll_allreduce_module) +}; + +static size_t xhc_colltype_to_base_module_fn_offset_map[XHC_COLLCOUNT] = { + [XHC_BCAST] = offsetof(mca_coll_base_module_t, coll_bcast), + [XHC_BARRIER] = offsetof(mca_coll_base_module_t, coll_barrier), + [XHC_REDUCE] = offsetof(mca_coll_base_module_t, coll_reduce), + [XHC_ALLREDUCE] = offsetof(mca_coll_base_module_t, coll_allreduce) +}; + +static inline void (*MODULE_COLL_FN(xhc_module_t *module, + XHC_COLLTYPE_T colltype))(void) { + + return * (void (**)(void)) ((uintptr_t) &module->super + + xhc_colltype_to_base_module_fn_offset_map[colltype]); +} + +static inline void INSTALL_COLL_API(ompi_communicator_t *comm, + XHC_COLLTYPE_T colltype, void (*coll_fn)(void), void *coll_module) { + + * (void (**)(void)) ((uintptr_t) comm->c_coll + + xhc_colltype_to_c_coll_fn_offset_map[colltype]) = coll_fn; + * (void **) ((uintptr_t) comm->c_coll + + xhc_colltype_to_c_coll_module_offset_map[colltype]) = coll_module; +} + +static inline void GET_COLL_API(ompi_communicator_t *comm, XHC_COLLTYPE_T colltype, + void (**coll_fn_dst)(void), void **coll_module_dst) { + + *coll_fn_dst = * (void (**)(void)) ((uintptr_t) comm->c_coll + + xhc_colltype_to_c_coll_fn_offset_map[colltype]); + *coll_module_dst = * (void **) ((uintptr_t) comm->c_coll + + xhc_colltype_to_c_coll_module_offset_map[colltype]); +} + +// ----------------------------- + +static void xhc_module_clear(xhc_module_t *module) { module->comm_size = 0; module->rank = -1; - module->hierarchy_string = NULL; - module->hierarchy = NULL; - module->hierarchy_len = 0; - - module->chunks = NULL; - module->chunks_len = 0; + module->zcopy_support = false; + module->zcopy_map_support = false; module->rbuf = NULL; module->rbuf_size = 0; 
module->peer_info = NULL; - module->data = NULL; + + memset(&module->prev_colls, 0, sizeof(module->prev_colls)); + memset(&module->op_config, 0, sizeof(module->op_config)); + memset(&module->op_data, 0, sizeof(module->op_data)); + module->init = false; + module->error = false; } static void mca_coll_xhc_module_construct(mca_coll_xhc_module_t *module) { @@ -64,30 +110,33 @@ static void mca_coll_xhc_module_construct(mca_coll_xhc_module_t *module) { } static void mca_coll_xhc_module_destruct(mca_coll_xhc_module_t *module) { - xhc_fini(module); + /* Anything that's allocated during the module's creation/enable, is + * deallocated here. The stuff that's allocated lazily inside/under + * xhc_lazy_init and xhc_init_op, is deallocated inside xhc_fini. */ - free(module->hierarchy_string); - free(module->hierarchy); - free(module->chunks); - free(module->rbuf); - free(module->peer_info); + if(module->init) + xhc_fini(module); + + for(int t = 0; t < XHC_COLLCOUNT; t++) { + free(module->op_config[t].hierarchy_string); + free(module->op_config[t].chunk_string); + free(module->op_config[t].chunks); + } xhc_module_clear(module); } -OBJ_CLASS_INSTANCE(mca_coll_xhc_module_t, mca_coll_base_module_t, - mca_coll_xhc_module_construct, mca_coll_xhc_module_destruct); - // ----------------------------- mca_coll_base_module_t *mca_coll_xhc_module_comm_query(ompi_communicator_t *comm, int *priority) { - if((*priority = mca_coll_xhc_component.priority) < 0) { + if((*priority = mca_coll_xhc_component.priority) < 0) return NULL; - } - if(OMPI_COMM_IS_INTER(comm) || ompi_comm_size(comm) == 1 + int comm_size = ompi_comm_size(comm); + + if(OMPI_COMM_IS_INTER(comm) || comm_size == 1 || ompi_group_have_remote_peers (comm->c_local_group)) { opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, @@ -99,7 +148,6 @@ mca_coll_base_module_t *mca_coll_xhc_module_comm_query(ompi_communicator_t *comm return NULL; } - int comm_size = ompi_comm_size(comm); for(int r = 0; r < comm_size; r++) { ompi_proc_t *proc = ompi_comm_peer_lookup(comm, r); @@ -113,612 +161,116 @@ mca_coll_base_module_t *mca_coll_xhc_module_comm_query(ompi_communicator_t *comm } } - mca_coll_base_module_t *module = - (mca_coll_base_module_t *) OBJ_NEW(mca_coll_xhc_module_t); + mca_coll_xhc_module_t *module = OBJ_NEW(mca_coll_xhc_module_t); - if(module == NULL) { + if(module == NULL) return NULL; - } - - module->coll_module_enable = mca_coll_xhc_module_enable; - module->coll_module_disable = mca_coll_xhc_module_disable; - module->coll_barrier = mca_coll_xhc_barrier; + module->zcopy_support = (mca_smsc != NULL); + module->zcopy_map_support = mca_smsc_base_has_feature(MCA_SMSC_FEATURE_CAN_MAP); - if(mca_smsc == NULL) { + if(!module->zcopy_support) { opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, ompi_coll_base_framework.framework_output, "coll:xhc: Warning: No opal/smsc support found; " - "only barrier will be enabled"); - - return module; - } - - module->coll_bcast = mca_coll_xhc_bcast; - - if(!mca_smsc_base_has_feature(MCA_SMSC_FEATURE_CAN_MAP)) { + "xhc will only work in CICO mode"); + } else if(!module->zcopy_map_support) { opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, ompi_coll_base_framework.framework_output, - "coll:xhc: Warning: opal/smsc module is not CAN_MAP capable; " - "(all)reduce will be disabled, bcast might see reduced performance"); - - return module; + "coll:xhc: Warning: opal/smsc module isn't CAN_MAP " + "capable; reduced performance is to be expected"); } - module->coll_allreduce = mca_coll_xhc_allreduce; - module->coll_reduce = mca_coll_xhc_reduce; 
+ module->super.coll_module_enable = mca_coll_xhc_module_enable; + module->super.coll_module_disable = mca_coll_xhc_module_disable; - return module; -} + module->super.coll_bcast = mca_coll_xhc_bcast; + module->super.coll_barrier = mca_coll_xhc_barrier; + module->super.coll_allreduce = mca_coll_xhc_allreduce; + module->super.coll_reduce = mca_coll_xhc_reduce; -#define COLL_FN_HELPER(_m, _api) .coll_ ## _api = (_m)->coll_ ## _api, \ - .coll_ ## _api ## _module = (_m) + return &module->super; +} int mca_coll_xhc_module_enable(mca_coll_base_module_t *ompi_module, ompi_communicator_t *comm) { xhc_module_t *module = (xhc_module_t *) ompi_module; - int ret; - // --- - ret = xhc_module_save_fallback_fns(module, comm); - - /* This can/will happen often (see #9885), but theoretically - * isn't a problem, as in these cases the component wouldn't - * end up getting used anyway. */ - if(ret != OMPI_SUCCESS) { - opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, - ompi_coll_base_framework.framework_output, - "coll:xhc:module_enable (%s/%s): No previous fallback component " - "found; disabling myself", ompi_comm_print_cid(comm), comm->c_name); - - return ret; + /* Assimilate the various MCA parameters. We do this inside module_enable + * rather than lazy_init, so we may use the values as early as possible + * (e.g. checking cico and single-copy support at the beginning of ops). */ + for(int t = 0; t < XHC_COLLCOUNT; t++) { + int err = xhc_read_op_config(module, comm, t); + if(err != OMPI_SUCCESS) + return err; } // --- - module->comm_size = ompi_comm_size(comm); - module->rank = ompi_comm_rank(comm); - - module->peer_info = calloc(module->comm_size, sizeof(xhc_peer_info_t)); - - for(int r = 0; r < module->comm_size; r++) { - ompi_proc_t *peer_proc = ompi_comm_peer_lookup(comm, r); - - module->peer_info[r].proc = peer_proc; - module->peer_info[r].locality = peer_proc->super.proc_flags; - } - - module->peer_info[module->rank].locality |= - ((1 << OMPI_XHC_LOC_EXT_BITS) - 1) << OMPI_XHC_LOC_EXT_START; - - // --- - - /* This needs to happen here, and we need to save the hierarchy string, - * because the info value will have been gone by the time lazy_init is - * called. Furthermore, we can't prepeare the hierarchy here, as it might - * required communication (allgather) with the other ranks. 
*/ - - const char *hier_mca = mca_coll_xhc_component.hierarchy_mca; + for(int t = 0; t < XHC_COLLCOUNT; t++) { + /* Don't want to save a fallback for + * any op that we won't support */ + if(MODULE_COLL_FN(module, t) == NULL) + continue; - opal_cstring_t *hier_info; - int hier_info_flag = 0; + void (*fallback_fn)(void), *fallback_module; + GET_COLL_API(comm, t, &fallback_fn, &fallback_module); - if(comm->super.s_info != NULL) { - opal_info_get(comm->super.s_info, "ompi_comm_coll_xhc_hierarchy", - &hier_info, &hier_info_flag); + if(fallback_fn == NULL || fallback_module == NULL) { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, + ompi_coll_base_framework.framework_output, + "coll:xhc:module_enable (%s/%s): No previous fallback component " + "found; disabling myself", ompi_comm_print_cid(comm), comm->c_name); - if(hier_info_flag) { - hier_mca = hier_info->string; + return OMPI_ERR_NOT_FOUND; } - } - - module->hierarchy_string = strdup(hier_mca); - - if(hier_info_flag) { - OBJ_RELEASE(hier_info); - } - if(!module->hierarchy_string) { - return OMPI_ERR_OUT_OF_RESOURCE; + module->prev_colls.coll_fn[t] = fallback_fn; + module->prev_colls.coll_module[t] = fallback_module; } - // --- - - ret = xhc_component_parse_chunk_sizes(mca_coll_xhc_component.chunk_size_mca, - &module->chunks, &module->chunks_len); - if(ret != OMPI_SUCCESS) { - return ret; + /* We perform the pointer installation last, after we've + * successfully captured all the fallback pointers we need, + * and we know xhc_module_enable can no longer fail. */ + for(int t = 0; t < XHC_COLLCOUNT; t++) { + void (*fn)(void) = MODULE_COLL_FN(module, t); + if(fn) INSTALL_COLL_API(comm, t, fn, module); } // --- - xhc_coll_fns_t xhc_fns = (xhc_coll_fns_t) { - COLL_FN_HELPER(ompi_module, allreduce), - COLL_FN_HELPER(ompi_module, barrier), - COLL_FN_HELPER(ompi_module, bcast), - COLL_FN_HELPER(ompi_module, reduce) - }; - - xhc_module_install_fns(module, comm, xhc_fns); - return OMPI_SUCCESS; } int mca_coll_xhc_module_disable(mca_coll_base_module_t *ompi_module, ompi_communicator_t *comm) { - xhc_module_t *module = (xhc_module_t *) ompi_module; - - xhc_module_install_fallback_fns(module, comm, NULL); - mca_coll_xhc_module_destruct(module); - + mca_coll_xhc_module_destruct((xhc_module_t *) ompi_module); return OMPI_SUCCESS; } // ----------------------------- -#define SAVE_FALLBACK_COLL(_comm, _m, _dst, _api) do { \ - if((_m)->coll_ ## _api) { \ - MCA_COLL_SAVE_API(_comm, _api, (_dst).coll_ ## _api, \ - (_dst).coll_ ## _api ## _module, "xhc"); \ - \ - if(!(_dst).coll_ ## _api || !(_dst).coll_ ## _api ## _module) { \ - _save_status = OMPI_ERR_NOT_FOUND; \ - } \ - } \ -} while(0) - -#define INSTALL_FALLBACK_COLL(_comm, _m, _saved, _new, _api) do { \ - if((_comm)->c_coll->coll_ ## _api ## _module == (_m)) { \ - MCA_COLL_SAVE_API(_comm, _api, (_saved).coll_ ## _api, \ - (_saved).coll_ ## _api ## _module, "xhc"); \ - MCA_COLL_INSTALL_API(_comm, _api, (_new).coll_ ## _api, \ - (_new).coll_ ## _api ## _module, "xhc"); \ - } \ -} while(0) - -#define INSTALL_COLL(_comm, _src, _api) do { \ - if((_src).coll_ ## _api) { \ - MCA_COLL_INSTALL_API(_comm, _api, (_src).coll_ ## _api, \ - (_src).coll_ ## _api ## _module, "xhc"); \ - } \ -} while(0) - -/* Save the function pointers of the previous module, in XHC's - * struct. Only the functions that XHC will provide are saved. 
*/ -static int xhc_module_save_fallback_fns( - xhc_module_t *module, ompi_communicator_t *comm) { - - mca_coll_base_module_t *ompi_module = (mca_coll_base_module_t *) module; - - xhc_coll_fns_t colls = {0}; - int _save_status = OMPI_SUCCESS; - - SAVE_FALLBACK_COLL(comm, ompi_module, colls, allreduce); - SAVE_FALLBACK_COLL(comm, ompi_module, colls, barrier); - SAVE_FALLBACK_COLL(comm, ompi_module, colls, bcast); - SAVE_FALLBACK_COLL(comm, ompi_module, colls, reduce); - - if(_save_status == OMPI_SUCCESS) { - module->prev_colls = colls; - } - - return _save_status; -} - -/* Replace XHC's pointers in c_coll with those from the fallback - * component saved earlier. XHC's pointers are conveniently returned - * in prev_fns_dst, to later pass to xhc_module_install_fns. */ -void mca_coll_xhc_module_install_fallback_fns(xhc_module_t *module, - ompi_communicator_t *comm, xhc_coll_fns_t *prev_fns_dst) { - - mca_coll_base_module_t *ompi_module = (mca_coll_base_module_t *) module; +void mca_coll_xhc_module_set_coll_fns(ompi_communicator_t *comm, + xhc_coll_fns_t *new_fns, xhc_coll_fns_t *saved_fns_dst) { xhc_coll_fns_t saved = {0}; - INSTALL_FALLBACK_COLL(comm, ompi_module, saved, module->prev_colls, allreduce); - INSTALL_FALLBACK_COLL(comm, ompi_module, saved, module->prev_colls, barrier); - INSTALL_FALLBACK_COLL(comm, ompi_module, saved, module->prev_colls, bcast); - INSTALL_FALLBACK_COLL(comm, ompi_module, saved, module->prev_colls, reduce); - - if(prev_fns_dst) { - *prev_fns_dst = saved; - } -} - -/* */ -void mca_coll_xhc_module_install_fns(xhc_module_t *module, - ompi_communicator_t *comm, xhc_coll_fns_t fns) { - - (void) module; - - INSTALL_COLL(comm, fns, allreduce); - INSTALL_COLL(comm, fns, barrier); - INSTALL_COLL(comm, fns, bcast); - INSTALL_COLL(comm, fns, reduce); -} - -// ----------------------------- - -int mca_coll_xhc_module_prepare_hierarchy( - xhc_module_t *module, ompi_communicator_t *comm) { - - int ret; - - opal_list_t *level_defs; - int nlevel_defs; - - ret = xhc_component_parse_hierarchy(module->hierarchy_string, - &level_defs, &nlevel_defs); - if(ret != OMPI_SUCCESS) { - return ret; - } - - ret = xhc_module_create_hierarchy(module, comm, level_defs, - nlevel_defs, &module->hierarchy, &module->hierarchy_len); - if(ret != OMPI_SUCCESS) { - return ret; - } - - for(int i = 0; i < nlevel_defs; i++) - OPAL_LIST_DESTRUCT(&level_defs[i]); - free(level_defs); - - ret = xhc_module_sort_hierarchy(module, comm, - &module->hierarchy, &module->hierarchy_len); - if(ret != OMPI_SUCCESS) { - return ret; - } - - return OMPI_SUCCESS; -} - -static int xhc_module_create_hierarchy(xhc_module_t *module, - ompi_communicator_t *comm, opal_list_t *level_defs, int nlevel_defs, - xhc_loc_t **hierarchy_dst, int *hierarchy_len_dst) { - - xhc_peer_info_t *peer_info = module->peer_info; - - int comm_size = ompi_comm_size(comm); - int rank = ompi_comm_rank(comm); - - xhc_loc_t *hierarchy = NULL; - int nvirt_hiers = 0; - - int *rank_list; - - opal_hwloc_locality_t *loc_list; - ompi_datatype_t *hwloc_locality_type = NULL; - - int ret, return_code = OMPI_SUCCESS; - - hierarchy = malloc(nlevel_defs * sizeof(xhc_loc_t)); - rank_list = malloc(comm_size * sizeof(int)); - loc_list = malloc(comm_size * sizeof(opal_hwloc_locality_t)); - - if(!hierarchy || !rank_list || !loc_list) { - RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); - } - - switch(sizeof(opal_hwloc_locality_t)) { - case 1: hwloc_locality_type = MPI_UINT8_T; break; - case 2: hwloc_locality_type = MPI_UINT16_T; break; - case 4: 
hwloc_locality_type = MPI_UINT32_T; break; - case 8: hwloc_locality_type = MPI_UINT64_T; break; - } - assert(hwloc_locality_type); - - for(int h = 0; h < nlevel_defs; h++) { - opal_list_t *defs = &level_defs[h]; - - xhc_loc_def_t *my_def = NULL; - xhc_loc_t locality; - - xhc_loc_def_t *def_0 = (xhc_loc_def_t *) opal_list_get_first(defs); - - bool is_virtual = (opal_list_get_size(defs) > 1 || def_0->rank_list - || def_0->split > 1 || def_0->max_ranks > 0); - - if(is_virtual) { - if(nvirt_hiers == OMPI_XHC_LOC_EXT_BITS) { - opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, - ompi_coll_base_framework.framework_output, - "coll:xhc: Error: Too many virtual hierarchies"); - - RETURN_WITH_ERROR(return_code, OMPI_ERR_NOT_SUPPORTED, end); - } - - locality = 1 << (OMPI_XHC_LOC_EXT_START + nvirt_hiers); - nvirt_hiers++; - } else { - locality = def_0->named_loc; - } - - hierarchy[h] = locality; - def_0 = NULL; - - xhc_loc_def_t *def, *def_next; - - /* Handle rank lists; take note if I belong - * in one, and remove them from the mix */ - OPAL_LIST_FOREACH_SAFE(def, def_next, defs, xhc_loc_def_t) { - if(def->rank_list) { - if(!my_def) { - for(int rl = 0; rl < def->rank_list_len; rl++) { - if(rank >= def->rank_list[rl].start_rank - && rank <= def->rank_list[rl].end_rank) { - my_def = def; - break; - } - } - } - - opal_list_remove_item(defs, (opal_list_item_t *) def); - if(def != my_def) { - OBJ_RELEASE(def); - } - } - } - - bool dir_fwd = true; - - /* When multiple locality defitions are present, they are assigned - * to groups in a left-to-right fashion. At every turn, the first - * rank (determined by the minimum ID) that's still not part of - * a locality, as well as the other ranks that are local with it, - * claim/consume the next locality from the list. The direction - * serves to implement the repeat modifier. When it is located, - * the process starts taking place right-to-left following the max - * ID. At the end and after the loop, the repeated locality will - * be the only one left and all remaining ranks will follow it. */ - while(opal_list_get_size(defs) > 1) { - def = (xhc_loc_def_t *) (dir_fwd ? opal_list_get_first(defs) - : opal_list_get_last(defs)); - - if(dir_fwd && def->repeat) { - dir_fwd = false; - continue; - } - - int ticket = (my_def == NULL ? rank : (dir_fwd ? comm_size : -1)); - int chosen; - - ret = comm->c_coll->coll_allreduce(&ticket, &chosen, 1, - MPI_INT, (dir_fwd ? MPI_MIN : MPI_MAX), comm, - comm->c_coll->coll_allreduce_module); - if(ret != OMPI_SUCCESS) { - RETURN_WITH_ERROR(return_code, ret, end); - } - - if(chosen >= 0 && chosen < comm_size - && PEER_IS_LOCAL(peer_info, chosen, def->named_loc)) { - - my_def = def; - } - - opal_list_remove_item(defs, (opal_list_item_t *) def); - if(def != my_def) { - OBJ_RELEASE(def); - } - } - - if(opal_list_get_size(defs) > 0 && !my_def) { - my_def = (xhc_loc_def_t *) opal_list_get_first(defs); - opal_list_remove_item(defs, (opal_list_item_t *) my_def); - } - - /* Share which named locality each rank follows; ranks that - * follow different localities shouldn't be grouped together */ - opal_hwloc_locality_t follow_loc = (my_def ? 
my_def->named_loc : 0); - ret = comm->c_coll->coll_allgather(&follow_loc, 1, - hwloc_locality_type, loc_list, 1, hwloc_locality_type, - comm, comm->c_coll->coll_allgather_module); - if(ret != OMPI_SUCCESS) { - RETURN_WITH_ERROR(return_code, ret, end); - } - - if(my_def == NULL) { + for(int t = 0; t < XHC_COLLCOUNT; t++) { + if(!new_fns->coll_fn[t]) continue; - } - - int member_id = -1; - int members = 0; - - // If working with rank list, set the ranks from the list as "local" - if(my_def->rank_list) { - for(int i = 0; i < my_def->rank_list_len; i++) { - for(int r = my_def->rank_list[i].start_rank; - r <= my_def->rank_list[i].end_rank && r < comm_size; r++) { - if(r == rank) { - member_id = members; - } - - peer_info[r].locality |= locality; - rank_list[members++] = r; - } - } - } else if(is_virtual) { - /* We might have a named locality instead of a rank list, but if - * we still needed to create a virtual one, we need to apply it */ - for(int r = 0; r < comm_size; r++) { - if(loc_list[r] != my_def->named_loc) { - continue; - } - - if(!PEER_IS_LOCAL(peer_info, r, my_def->named_loc)) { - continue; - } - - if(r == rank) { - member_id = members; - } - - peer_info[r].locality |= locality; - rank_list[members++] = r; - } - } - - assert(member_id != -1); - - /* If split or max ranks was specified, math partition the locality - * and remove the previously added locality mapping to some ranks */ - if(my_def->split > 1) { - int piece_size = members / my_def->split; - int leftover = members % my_def->split; - - for(int m = 0, next_border = 0; m < members; m++) { - if(m == next_border) { - next_border += piece_size + (leftover > 0 ? 1 : 0); - if(leftover > 0) { - leftover--; - } - - if(member_id >= m && member_id < next_border) { - m = next_border - 1; - continue; - } - } - - peer_info[rank_list[m]].locality &= ~locality; - } - } else if(my_def->max_ranks > 1) { - for(int m = 0; m < members; m++) { - if(m % my_def->max_ranks == 0) { - if(member_id >= m && member_id - m < my_def->max_ranks) { - m += my_def->max_ranks - 1; - continue; - } - } - - peer_info[rank_list[m]].locality &= ~locality; - } - } - - OBJ_RELEASE_IF_NOT_NULL(my_def); - } - - *hierarchy_dst = hierarchy; - *hierarchy_len_dst = nlevel_defs; - -end: - - free(rank_list); - - if(return_code != OMPI_SUCCESS) { - free(hierarchy); - } - - return return_code; -} - -static int xhc_module_sort_hierarchy(xhc_module_t *module, - ompi_communicator_t *comm, xhc_loc_t **hierarchy_dst, - int *hierarchy_len_dst) { - - xhc_peer_info_t *peer_info = module->peer_info; - int comm_size = ompi_comm_size(comm); - - xhc_loc_t *old_hier = *hierarchy_dst; - int hier_len = *hierarchy_len_dst; - - xhc_loc_t *new_hier = NULL; - bool *hier_done = NULL; - - int return_code = OMPI_SUCCESS; - - new_hier = malloc((hier_len + 1) * sizeof(xhc_loc_t)); - hier_done = calloc(hier_len, sizeof(bool)); - - if(new_hier == NULL || hier_done == NULL) { - RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); - } - - bool has_virtual = false; - for(int i = 0; i < hier_len; i++) { - if(old_hier[i] >= (1 << OMPI_XHC_LOC_EXT_START)) { - has_virtual = true; - break; - } - } - - /* If any virtual hierarchy is involved, attempting to sort it is likely - * asking for trouble. Skip the sorting, and only consider adding a top - * common locality. There is a chance it wasn't actually necessary, but - * it never hurts. 
*/ - - if(has_virtual) { - memcpy(new_hier, old_hier, hier_len * sizeof(xhc_loc_t)); - } else { - for(int new_idx = hier_len - 1; new_idx >= 0; new_idx--) { - int max_matches_count = -1; - int max_matches_hier_idx = -1; - - for(int i = 0; i < hier_len; i++) { - if(hier_done[i]) { - continue; - } - - int matches = 0; - - for(int r = 0; r < comm_size; r++) { - if(PEER_IS_LOCAL(peer_info, r, old_hier[i])) { - matches++; - } - } - - if(matches > max_matches_count) { - max_matches_count = matches; - max_matches_hier_idx = i; - } - } - - assert(max_matches_count != -1); - - new_hier[new_idx] = old_hier[max_matches_hier_idx]; - hier_done[max_matches_hier_idx] = true; - } - } - - xhc_loc_t common_locality = (xhc_loc_t) -1; - - for(int r = 0; r < comm_size; r++) { - ompi_proc_t *proc = ompi_comm_peer_lookup(comm, r); - common_locality &= proc->super.proc_flags; - } - - if(common_locality == 0) { - opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, - ompi_coll_base_framework.framework_output, - "coll:xhc: Error: There is no locality common " - "to all ranks in the communicator"); - - RETURN_WITH_ERROR(return_code, OMPI_ERR_NOT_SUPPORTED, end); - } - - if(hier_len == 0 || (common_locality & new_hier[hier_len - 1]) - != new_hier[hier_len - 1]) { - - new_hier[hier_len] = common_locality; - hier_len++; - } - - REALLOC(new_hier, hier_len, xhc_loc_t); - - free(old_hier); - - *hierarchy_dst = new_hier; - *hierarchy_len_dst = hier_len; - -end: - free(hier_done); + if(saved_fns_dst) + GET_COLL_API(comm, t, &saved.coll_fn[t], + &saved.coll_module[t]); - if(return_code != OMPI_SUCCESS) { - free(new_hier); + INSTALL_COLL_API(comm, t, new_fns->coll_fn[t], + new_fns->coll_module[t]); } - return return_code; + if(saved_fns_dst) + *saved_fns_dst = saved; } diff --git a/ompi/mca/coll/xhc/coll_xhc_reduce.c b/ompi/mca/coll/xhc/coll_xhc_reduce.c index e74ab26aefe..739249336e9 100644 --- a/ompi/mca/coll/xhc/coll_xhc_reduce.c +++ b/ompi/mca/coll/xhc/coll_xhc_reduce.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Copyright (c) 2021-2024 Computer Architecture and VLSI Systems (CARV) * Laboratory, ICS Forth. All rights reserved. * $COPYRIGHT$ * @@ -28,14 +28,15 @@ int mca_coll_xhc_reduce(const void *sbuf, void *rbuf, xhc_module_t *module = (xhc_module_t *) ompi_module; - // Currently, XHC's reduce only supports root = 0 + /* Currently, XHC's reduce only supports the top-level + * owner as the root (typically rank 0). */ if(root == 0) { - return xhc_allreduce_internal(sbuf, rbuf, count, + return mca_coll_xhc_allreduce_internal(sbuf, rbuf, count, datatype, op, ompi_comm, ompi_module, false); } else { - xhc_coll_fns_t fallback = module->prev_colls; - - return fallback.coll_reduce(sbuf, rbuf, count, datatype, - op, root, ompi_comm, fallback.coll_reduce_module); + WARN_ONCE("coll:xhc: Warning: XHC does not currently support " + "non-zero-root reduce; utilizing fallback component"); + return XHC_CALL_FALLBACK(module->prev_colls, XHC_REDUCE, + reduce, sbuf, rbuf, count, datatype, op, root, ompi_comm); } } diff --git a/ompi/mca/coll/xhc/help-coll-xhc.txt b/ompi/mca/coll/xhc/help-coll-xhc.txt index 453a96df4fc..c367f62d699 100644 --- a/ompi/mca/coll/xhc/help-coll-xhc.txt +++ b/ompi/mca/coll/xhc/help-coll-xhc.txt @@ -1,5 +1,5 @@ # -# Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) +# Copyright (c) 2021-2024 Computer Architecture and VLSI Systems (CARV) # Laboratory, ICS Forth. All rights reserved. 
# $COPYRIGHT$ # @@ -9,16 +9,35 @@ # [bad-hierarchy-item] -WARNING (coll/xhc) -Unrecognized locality definition '%s' in hierarchy parameter string '%s' -The component won't load +ERROR (coll/xhc) +Unrecognized locality definition: '%s' +In hierarchy parameter string: '%s' +Falling back to another component # [bad-chunk-size-item] -WARNING (coll/xhc) -Malformed item '%s' in chunk size parameter string '%s' -The component won't load +ERROR (coll/xhc) +Malformed chunk size item: '%s' +In chunk size parameter string: '%s' +Falling back to another component +# +[bad-cico-max] +ERROR (coll/xhc) +Malformed cico_max parameter '%s' +Falling back to another component # [xhc-init-failed] -WARNING (coll/xhc) -Component initialization failed with error code %d +ERROR (coll/xhc) +Initialization failed, error code %d +Falling back to another component +Errno: %d (%s) +# +[xhc-op-init-failed] +ERROR (coll/xhc) +Initialization for %s failed, error code %d +Falling back to another component Errno: %d (%s) +# +[too-many-virt-hiers] +ERROR (coll/xhc) +Too many virtual hierarchies requested +Falling back to another component
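
For reference, below is a minimal standalone sketch of the offsetof-table dispatch pattern that coll_xhc_module.c above adopts for its per-operation fallback handling (the INSTALL_COLL_API / GET_COLL_API helpers): the per-collective function pointer and module pointer are read and written generically by indexing arrays of offsetof() values with the collective type. All names here (toy_*, my_bcast) are illustrative toy types, not OMPI code, and this sketch is not part of the patch.

/* Minimal sketch of the offsetof-table technique; toy types only. */

#include <stddef.h>
#include <stdio.h>

typedef enum { TOY_BCAST, TOY_BARRIER, TOY_COLLCOUNT } toy_coll_t;

typedef struct {
    void (*coll_bcast)(void);
    void *coll_bcast_module;
    void (*coll_barrier)(void);
    void *coll_barrier_module;
} toy_comm_coll_t;

/* One offset per collective, for the function pointer and for the module
 * pointer respectively; indexed by toy_coll_t. */
static const size_t fn_offset[TOY_COLLCOUNT] = {
    [TOY_BCAST]   = offsetof(toy_comm_coll_t, coll_bcast),
    [TOY_BARRIER] = offsetof(toy_comm_coll_t, coll_barrier),
};

static const size_t module_offset[TOY_COLLCOUNT] = {
    [TOY_BCAST]   = offsetof(toy_comm_coll_t, coll_bcast_module),
    [TOY_BARRIER] = offsetof(toy_comm_coll_t, coll_barrier_module),
};

/* Analogue of INSTALL_COLL_API: write both pointers through the offsets. */
static void toy_install(toy_comm_coll_t *c, toy_coll_t t,
        void (*fn)(void), void *module) {
    * (void (**)(void)) ((char *) c + fn_offset[t]) = fn;
    * (void **) ((char *) c + module_offset[t]) = module;
}

/* Analogue of GET_COLL_API: read both pointers back. */
static void toy_get(toy_comm_coll_t *c, toy_coll_t t,
        void (**fn_dst)(void), void **module_dst) {
    *fn_dst = * (void (**)(void)) ((char *) c + fn_offset[t]);
    *module_dst = * (void **) ((char *) c + module_offset[t]);
}

static void my_bcast(void) { puts("bcast called"); }

int main(void) {
    toy_comm_coll_t c = {0};
    int fake_module = 0;

    toy_install(&c, TOY_BCAST, my_bcast, &fake_module);

    void (*fn)(void);
    void *mod;
    toy_get(&c, TOY_BCAST, &fn, &mod);
    (void) mod;

    if(fn) fn();   /* prints "bcast called" */
    return 0;
}

The same idea generalizes to saving a previous component's pointers before installing new ones, as module_enable does above: call the "get" helper first, stash the results, and only then overwrite the table entries.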