From 684ff97052e0ff091e072de93aaa6068c9bec39d Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Wed, 12 Jun 2024 10:53:35 -0400
Subject: [PATCH 1/5] Add stream operations to accelerator components

- Stream-based alloc and free
- Stream-based memmove
- Wait for stream to complete

Also, enable querying for number of devices and memory bandwidth.
These operations are needed for operation device offloading.

Co-authored-by: Phuong Nguyen <phuong.nguyen@icl.utk.edu>
Signed-off-by: Joseph Schuchart <schuchart@icl.utk.edu>
---
 opal/mca/accelerator/accelerator.h            | 107 ++++++++++-
 opal/mca/accelerator/cuda/accelerator_cuda.c  | 181 ++++++++++++++++--
 opal/mca/accelerator/cuda/accelerator_cuda.h  |  14 +-
 .../cuda/accelerator_cuda_component.c         |  88 ++++++++-
 .../null/accelerator_null_component.c         |  64 ++++++-
 opal/mca/accelerator/rocm/accelerator_rocm.h  |   7 +-
 .../rocm/accelerator_rocm_component.c         |  53 ++++-
 .../rocm/accelerator_rocm_module.c            | 162 ++++++++++++++--
 .../accelerator/ze/accelerator_ze_module.c    | 132 ++++++++++---
 9 files changed, 724 insertions(+), 84 deletions(-)

diff --git a/opal/mca/accelerator/accelerator.h b/opal/mca/accelerator/accelerator.h
index 0d660725acc..6279b7c615e 100644
--- a/opal/mca/accelerator/accelerator.h
+++ b/opal/mca/accelerator/accelerator.h
@@ -5,6 +5,9 @@
  * Copyright (c)           Amazon.com, Inc. or its affiliates.
  *                         All Rights reserved.
  * Copyright (c) 2023      Advanced Micro Devices, Inc. All Rights reserved.
+ * Copyright (c) 2024      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
  *
  * $COPYRIGHT$
  *
@@ -184,6 +187,16 @@ typedef int (*opal_accelerator_base_module_check_addr_fn_t)(
 typedef int (*opal_accelerator_base_module_create_stream_fn_t)(
     int dev_id, opal_accelerator_stream_t **stream);
 
+/**
+ * Wait for the completion of all operations inserted into the stream.
+ *
+ * @param[IN] stram          The stream to wait for.
+ *
+ * @return                   OPAL_SUCCESS or error status on failure
+ */
+typedef int (*opal_accelerator_base_module_sync_stream_fn_t)(
+    opal_accelerator_stream_t *stream);
+
 /**
  * Creates an event. An event is a synchronization marker that can be
  * appended to a stream to monitor device progress or synchronize the
@@ -193,7 +206,7 @@ typedef int (*opal_accelerator_base_module_create_stream_fn_t)(
  * @param[IN]  dev_id        Associated device for the event or
  *                           MCA_ACCELERATOR_NO_DEVICE_ID
  * @param[OUT] event         Event to create
- * @param[IN]  enable_ipc    support inter-process tracking of the event 
+ * @param[IN]  enable_ipc    support inter-process tracking of the event
  *
  * @return                   OPAL_SUCCESS or error status on failure.
  */
@@ -310,6 +323,31 @@ typedef int (*opal_accelerator_base_module_memmove_fn_t)(
     int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
     opal_accelerator_transfer_type_t type);
 
+
+/**
+ * Copies memory asynchronously from src to dest. Memory of dest and src
+ * may overlap. Optionally can specify the transfer type to
+ * avoid pointer detection for performance. The operations will be enqueued
+ * into the provided stream but are not guaranteed to be complete upon return.
+ *
+ * @param[IN] dest_dev_id    Associated device to copy to or
+ *                           MCA_ACCELERATOR_NO_DEVICE_ID
+ * @param[IN] src_dev_id     Associated device to copy from or
+ *                           MCA_ACCELERATOR_NO_DEVICE_ID
+ * @param[IN] dest           Destination to copy memory to
+ * @param[IN] src            Source to copy memory from
+ * @param[IN] size           Size of memory to copy
+ * @param[IN] stream         Stream to perform asynchronous move on
+ * @param[IN] type           Transfer type field for performance
+ *                           Can be set to MCA_ACCELERATOR_TRANSFER_UNSPEC
+ *                           if caller is unsure of the transfer direction.
+ *
+ * @return                   OPAL_SUCCESS or error status on failure
+ */
+typedef int (*opal_accelerator_base_module_memmove_async_fn_t)(
+    int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
+    opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type);
+
 /**
  * Allocates size bytes memory from the device and sets ptr to the
  * pointer of the allocated memory. The memory is not initialized.
@@ -340,6 +378,46 @@ typedef int (*opal_accelerator_base_module_mem_alloc_fn_t)(
 typedef int (*opal_accelerator_base_module_mem_release_fn_t)(
     int dev_id, void *ptr);
 
+
+/**
+ * Allocates size bytes memory from the device and sets ptr to the
+ * pointer of the allocated memory. The memory is not initialized.
+ * The allocation request is placed into the stream object.
+ * Any use of the memory must succeed the completion of this
+ * operation on the stream.
+ *
+ * @param[IN] dev_id         Associated device for the allocation or
+ *                           MCA_ACCELERATOR_NO_DEVICE_ID
+ * @param[OUT] ptr           Returns pointer to allocated memory
+ * @param[IN] size           Size of memory to allocate
+ * @param[IN] stream         Stream into which to insert the allocation request
+ *
+ * @return                   OPAL_SUCCESS or error status on failure
+ */
+typedef int (*opal_accelerator_base_module_mem_alloc_stream_fn_t)(
+    int dev_id, void **ptr, size_t size, opal_accelerator_stream_t *stream);
+
+/**
+ * Frees the memory space pointed to by ptr which has been returned by
+ * a previous call to an opal_accelerator_base_module_mem_alloc_stream_fn_t().
+ * If the function is called on a ptr that has already been freed,
+ * undefined behavior occurs. If ptr is NULL, no operation is performed,
+ * and the function returns OPAL_SUCCESS.
+ * The release of the memory will be inserted into the stream and occurs after
+ * all previous operations have completed.
+ *
+ * @param[IN] dev_id         Associated device for the allocation or
+ *                           MCA_ACCELERATOR_NO_DEVICE_ID
+ * @param[IN] ptr            Pointer to free
+ * @param[IN] stream         Stream into which to insert the free operation
+ *
+ * @return                   OPAL_SUCCESS or error status on failure
+ */
+typedef int (*opal_accelerator_base_module_mem_release_stream_fn_t)(
+    int dev_id, void *ptr, opal_accelerator_stream_t *stream);
+
+
+
 /**
  * Retrieves the base address and/or size of a memory allocation of the
  * device.
@@ -557,6 +635,26 @@ typedef int (*opal_accelerator_base_module_device_can_access_peer_fn_t)(
 typedef int (*opal_accelerator_base_module_get_buffer_id_fn_t)(
     int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
 
+/**
+ * Get the number of devices available.
+ *
+ * @param[OUT] stram         Number of devices.
+ *
+ * @return                   OPAL_SUCCESS or error status on failure
+ */
+typedef int (*opal_accelerator_base_module_get_num_devices_fn_t)(int *num_devices);
+
+/**
+ * Get the memory bandwidth of the device.
+ *
+ * @param[IN] device         The device to query.
+ * @param[OUT] bw            The returned bandwidth for the device.
+ *
+ * @return                   OPAL_SUCCESS or error status on failure
+ */
+typedef int (*opal_accelerator_base_module_get_mem_bw_fn_t)(int device, float *bw);
+
+
 /*
  * the standard public API data structure
  */
@@ -565,6 +663,7 @@ typedef struct {
     opal_accelerator_base_module_check_addr_fn_t check_addr;
 
     opal_accelerator_base_module_create_stream_fn_t create_stream;
+    opal_accelerator_base_module_sync_stream_fn_t sync_stream;
     opal_accelerator_base_module_create_event_fn_t create_event;
     opal_accelerator_base_module_record_event_fn_t record_event;
     opal_accelerator_base_module_query_event_fn_t query_event;
@@ -572,10 +671,13 @@ typedef struct {
 
     opal_accelerator_base_module_memcpy_async_fn_t mem_copy_async;
     opal_accelerator_base_module_memcpy_fn_t mem_copy;
+    opal_accelerator_base_module_memmove_async_fn_t mem_move_async;
     opal_accelerator_base_module_memmove_fn_t mem_move;
 
     opal_accelerator_base_module_mem_alloc_fn_t mem_alloc;
     opal_accelerator_base_module_mem_release_fn_t mem_release;
+    opal_accelerator_base_module_mem_alloc_stream_fn_t mem_alloc_stream;
+    opal_accelerator_base_module_mem_release_stream_fn_t mem_release_stream;
     opal_accelerator_base_module_get_address_range_fn_t get_address_range;
 
     opal_accelerator_base_module_is_ipc_enabled_fn_t is_ipc_enabled;
@@ -595,6 +697,9 @@ typedef struct {
     opal_accelerator_base_module_device_can_access_peer_fn_t device_can_access_peer;
 
     opal_accelerator_base_module_get_buffer_id_fn_t get_buffer_id;
+
+    opal_accelerator_base_module_get_num_devices_fn_t num_devices;
+    opal_accelerator_base_module_get_mem_bw_fn_t get_mem_bw;
 } opal_accelerator_base_module_t;
 
 /**
diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c
index ce8b0f77484..6e1a94e8fa0 100644
--- a/opal/mca/accelerator/cuda/accelerator_cuda.c
+++ b/opal/mca/accelerator/cuda/accelerator_cuda.c
@@ -7,6 +7,9 @@
  * Copyright (c)           Amazon.com, Inc. or its affiliates.
  *                         All Rights reserved.
  * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
+ * Copyright (c) 2024      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -36,10 +39,16 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void *
                                   opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type);
 static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src,
                             size_t size, opal_accelerator_transfer_type_t type);
+static int accelerator_cuda_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src,
+                                          size_t size, opal_accelerator_stream_t *stream,
+                                          opal_accelerator_transfer_type_t type);
 static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
                              opal_accelerator_transfer_type_t type);
 static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size);
 static int accelerator_cuda_mem_release(int dev_id, void *ptr);
+static int accelerator_cuda_mem_alloc_stream(int dev_id, void **ptr, size_t size,
+                                             opal_accelerator_stream_t *stream);
+static int accelerator_cuda_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream);
 static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void **base,
                                               size_t *size);
 
@@ -68,6 +77,11 @@ static int accelerator_cuda_device_can_access_peer( int *access, int dev1, int d
 
 static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
 
+static int accelerator_cuda_sync_stream(opal_accelerator_stream_t *stream);
+static int accelerator_cuda_get_num_devices(int *num_devices);
+static int accelerator_cuda_get_mem_bw(int device, float *bw);
+
+
 #define GET_STREAM(_stream) (_stream == MCA_ACCELERATOR_STREAM_DEFAULT ? 0 : *((CUstream *)_stream->stream))
 
 opal_accelerator_base_module_t opal_accelerator_cuda_module =
@@ -75,6 +89,7 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module =
     accelerator_cuda_check_addr,
 
     accelerator_cuda_create_stream,
+    accelerator_cuda_sync_stream,
 
     accelerator_cuda_create_event,
     accelerator_cuda_record_event,
@@ -83,9 +98,12 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module =
 
     accelerator_cuda_memcpy_async,
     accelerator_cuda_memcpy,
+    accelerator_cuda_memmove_async,
     accelerator_cuda_memmove,
     accelerator_cuda_mem_alloc,
     accelerator_cuda_mem_release,
+    accelerator_cuda_mem_alloc_stream,
+    accelerator_cuda_mem_release_stream,
     accelerator_cuda_get_address_range,
 
     accelerator_cuda_is_ipc_enabled,
@@ -104,7 +122,10 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module =
     accelerator_cuda_get_device_pci_attr,
     accelerator_cuda_device_can_access_peer,
 
-    accelerator_cuda_get_buffer_id
+    accelerator_cuda_get_buffer_id,
+
+    accelerator_cuda_get_num_devices,
+    accelerator_cuda_get_mem_bw
 };
 
 static inline opal_accelerator_cuda_delayed_init_check(void)
@@ -115,6 +136,24 @@ static inline opal_accelerator_cuda_delayed_init_check(void)
     return OPAL_SUCCESS;
 }
 
+static int accelerator_cuda_get_device_id(CUcontext mem_ctx) {
+    /* query the device from the context */
+    int dev_id = -1;
+    CUdevice ptr_dev;
+    cuCtxPushCurrent(mem_ctx);
+    cuCtxGetDevice(&ptr_dev);
+    for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) {
+        CUdevice dev;
+        cuDeviceGet(&dev, i);
+        if (dev == ptr_dev) {
+            dev_id = i;
+            break;
+        }
+    }
+    cuCtxPopCurrent(&mem_ctx);
+    return dev_id;
+}
+
 static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags)
 {
     CUresult result;
@@ -163,6 +202,9 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
     } else if (0 == mem_type) {
         /* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */
         return 0;
+    } else {
+        /* query the device from the context */
+        *dev_id = accelerator_cuda_get_device_id(mem_ctx);
     }
     /* Must be a device pointer */
     assert(CU_MEMORYTYPE_DEVICE == mem_type);
@@ -178,6 +220,10 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
     } else if (CU_MEMORYTYPE_HOST == mem_type) {
         /* Host memory, nothing to do here */
         return 0;
+    } else {
+        result = cuPointerGetAttribute(&mem_ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dbuf);
+        /* query the device from the context */
+        *dev_id = accelerator_cuda_get_device_id(mem_ctx);
     }
     /* Must be a device pointer */
     assert(CU_MEMORYTYPE_DEVICE == mem_type);
@@ -225,7 +271,7 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
         }
     }
 
-    /* WORKAROUND - They are times when the above code determines a pice of memory
+    /* WORKAROUND - There are times when the above code determines a pice of memory
      * is GPU memory, but it actually is not.  That has been seen on multi-GPU systems
      * with 6 or 8 GPUs on them. Therefore, we will do this extra check.  Note if we
      * made it this far, then the assumption at this point is we have GPU memory.
@@ -452,13 +498,13 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest,
      * Additionally, cuMemcpy is not necessarily always synchronous. See:
      * https://docs.nvidia.com/cuda/cuda-driver-api/api-sync-behavior.html
      * TODO: Add optimizations for type field */
-    result = cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream);
+    result = cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, GET_STREAM(opal_accelerator_cuda_memcpy_stream.super));
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, src,
                        size, result);
         return OPAL_ERROR;
     }
-    result = cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream);
+    result = cuStreamSynchronize(GET_STREAM(opal_accelerator_cuda_memcpy_stream.super));
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true,
                        OPAL_PROC_MY_HOSTNAME, result);
@@ -467,11 +513,14 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest,
     return OPAL_SUCCESS;
 }
 
-static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
-                             opal_accelerator_transfer_type_t type)
+static int accelerator_cuda_memmove_async(int dest_dev_id, int src_dev_id, void *dest,
+                                          const void *src, size_t size,
+                                          opal_accelerator_stream_t *stream,
+                                          opal_accelerator_transfer_type_t type)
 {
     CUdeviceptr tmp;
     CUresult result;
+    void *ptr;
 
     int delayed_init = opal_accelerator_cuda_delayed_init_check();
     if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) {
@@ -482,29 +531,41 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest,
         return OPAL_ERR_BAD_PARAM;
     }
 
-    result = cuMemAlloc(&tmp, size);
-    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+    result = accelerator_cuda_mem_alloc_stream(src_dev_id, &ptr, size, stream);
+    if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) {
         return OPAL_ERROR;
     }
-    result = cuMemcpyAsync(tmp, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream);
+    tmp = (CUdeviceptr)ptr;
+    result = cuMemcpyAsync(tmp, (CUdeviceptr) src, size, *(CUstream*)stream->stream);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, tmp, src, size,
                        result);
         return OPAL_ERROR;
     }
-    result = cuMemcpyAsync((CUdeviceptr) dest, tmp, size, opal_accelerator_cuda_memcpy_stream);
+    result = cuMemcpyAsync((CUdeviceptr) dest, tmp, size, *(CUstream*)stream->stream);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, tmp,
                        size, result);
         return OPAL_ERROR;
     }
-    result = cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream);
-    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+    return accelerator_cuda_mem_release_stream(src_dev_id, ptr, stream);
+}
+
+static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
+                                    opal_accelerator_transfer_type_t type)
+{
+    int ret;
+
+    ret = accelerator_cuda_memmove_async(dest_dev_id, src_dev_id, dest, src, size, &opal_accelerator_cuda_memcpy_stream.base, type);
+    if (OPAL_SUCCESS != ret) {
+        return OPAL_ERROR;
+    }
+    ret = accelerator_cuda_sync_stream(&opal_accelerator_cuda_memcpy_stream.base);
+    if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true,
                        OPAL_PROC_MY_HOSTNAME, result);
         return OPAL_ERROR;
     }
-    cuMemFree(tmp);
     return OPAL_SUCCESS;
 }
 
@@ -521,15 +582,35 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size)
         return OPAL_ERR_BAD_PARAM;
     }
 
-    if (size > 0) {
-        result = cuMemAlloc((CUdeviceptr *) ptr, size);
-        if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
-            opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true,
-                           OPAL_PROC_MY_HOSTNAME, result);
-            return OPAL_ERROR;
-        }
+    result = cuMemAlloc((CUdeviceptr *) ptr, size);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true,
+                        OPAL_PROC_MY_HOSTNAME, result);
+        return OPAL_ERROR;
     }
-    return 0;
+    return OPAL_SUCCESS;
+}
+
+
+
+static int accelerator_cuda_mem_alloc_stream(int dev_id, void **addr, size_t size,
+                                             opal_accelerator_stream_t *stream)
+{
+
+    int delayed_init = opal_accelerator_cuda_delayed_init();
+    if (OPAL_UNLIKELY(0 != delayed_init)) {
+        return delayed_init;
+    }
+
+    /* fall-back to regular stream allocation */
+
+    CUresult result = cuMemAllocAsync((CUdeviceptr*)addr, size, *(CUstream*)stream->stream);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true,
+                        OPAL_PROC_MY_HOSTNAME, result);
+        return OPAL_ERROR;
+    }
+    return OPAL_SUCCESS;
 }
 
 static int accelerator_cuda_mem_release(int dev_id, void *ptr)
@@ -546,6 +627,38 @@ static int accelerator_cuda_mem_release(int dev_id, void *ptr)
     return 0;
 }
 
+static int accelerator_cuda_mem_release_stream(int dev_id, void *addr,
+                                               opal_accelerator_stream_t *stream)
+{
+    CUresult result;
+
+    if (NULL == stream || NULL == addr) {
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    result = cuMemFreeAsync((CUdeviceptr)addr, *(CUstream*)stream->stream);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_show_help("help-accelerator-cuda.txt", "cuMemFree failed", true,
+                        OPAL_PROC_MY_HOSTNAME, result);
+        return OPAL_ERROR;
+    }
+    return OPAL_SUCCESS;
+}
+
+
+static int accelerator_cuda_sync_stream(opal_accelerator_stream_t *stream)
+{
+    CUresult result;
+    result = cuStreamSynchronize(*(CUstream*)stream->stream);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true,
+                       OPAL_PROC_MY_HOSTNAME, result);
+        return OPAL_ERROR;
+    }
+    return OPAL_SUCCESS;
+}
+
+
 static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void **base,
                                        size_t *size)
 {
@@ -883,3 +996,29 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc
     }
     return OPAL_SUCCESS;
 }
+
+
+
+static int accelerator_cuda_get_num_devices(int *num_devices)
+{
+
+    int delayed_init = opal_accelerator_cuda_delayed_init();
+    if (OPAL_UNLIKELY(0 != delayed_init)) {
+        return delayed_init;
+    }
+
+    *num_devices = opal_accelerator_cuda_num_devices;
+    return OPAL_SUCCESS;
+}
+
+static int accelerator_cuda_get_mem_bw(int device, float *bw)
+{
+    int delayed_init = opal_accelerator_cuda_delayed_init();
+    if (OPAL_UNLIKELY(0 != delayed_init)) {
+        return delayed_init;
+    }
+    assert(opal_accelerator_cuda_mem_bw != NULL);
+
+    *bw = opal_accelerator_cuda_mem_bw[device];
+    return OPAL_SUCCESS;
+}
diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.h b/opal/mca/accelerator/cuda/accelerator_cuda.h
index 8d3529ce5ff..f22fe76f233 100644
--- a/opal/mca/accelerator/cuda/accelerator_cuda.h
+++ b/opal/mca/accelerator/cuda/accelerator_cuda.h
@@ -2,7 +2,13 @@
  * Copyright (c) 2014      Intel, Inc.  All rights reserved.
  * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates.
  *                         All Rights reserved.
+<<<<<<< HEAD
  * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
+=======
+ * Copyright (c) 2024      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+>>>>>>> 26185d6108 (Add stream operations to accelerator components)
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -51,7 +57,7 @@ typedef struct opal_accelerator_cuda_ipc_event_handle_t opal_accelerator_cuda_ip
 OBJ_CLASS_DECLARATION(opal_accelerator_cuda_ipc_event_handle_t);
 
 /* Declare extern variables, defined in accelerator_cuda_component.c */
-extern CUstream opal_accelerator_cuda_memcpy_stream;
+extern opal_accelerator_cuda_stream_t opal_accelerator_cuda_memcpy_stream;
 extern opal_mutex_t opal_accelerator_cuda_stream_lock;
 extern bool mca_accelerator_cuda_init_complete;
 
@@ -61,6 +67,12 @@ extern opal_accelerator_base_module_t opal_accelerator_cuda_module;
 
 extern int opal_accelerator_cuda_delayed_init(void);
 
+OPAL_DECLSPEC extern int opal_accelerator_cuda_num_devices;
+
+OPAL_DECLSPEC extern float *opal_accelerator_cuda_mem_bw;
+
+OPAL_DECLSPEC extern int opal_accelerator_cuda_delayed_init(void);
+
 END_C_DECLS
 
 #endif /* MCA_ACCELERATOR_CUDA_H */
diff --git a/opal/mca/accelerator/cuda/accelerator_cuda_component.c b/opal/mca/accelerator/cuda/accelerator_cuda_component.c
index b7baffd6aec..aabc681d554 100644
--- a/opal/mca/accelerator/cuda/accelerator_cuda_component.c
+++ b/opal/mca/accelerator/cuda/accelerator_cuda_component.c
@@ -7,6 +7,9 @@
  * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates.
  *                         All Rights reserved.
  * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
+ * Copyright (c) 2024      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -35,13 +38,15 @@
 #include "opal/sys/atomic.h"
 
 /* Define global variables, used in accelerator_cuda.c */
-CUstream opal_accelerator_cuda_memcpy_stream = NULL;
-opal_mutex_t opal_accelerator_cuda_stream_lock = {0};
+opal_accelerator_cuda_stream_t opal_accelerator_cuda_memcpy_stream = {0};
+int opal_accelerator_cuda_num_devices = 0;
 
 /* Initialization lock for delayed cuda initialization */
 static opal_mutex_t accelerator_cuda_init_lock;
 bool mca_accelerator_cuda_init_complete = false;
 
+float *opal_accelerator_cuda_mem_bw = NULL;
+
 #define STRINGIFY2(x) #x
 #define STRINGIFY(x)  STRINGIFY2(x)
 
@@ -123,6 +128,7 @@ static int accelerator_cuda_component_register(void)
 int opal_accelerator_cuda_delayed_init()
 {
     int result = OPAL_SUCCESS;
+    int prio_lo, prio_hi;
     CUcontext cuContext;
 
     /* Double checked locking to avoid having to
@@ -138,6 +144,8 @@ int opal_accelerator_cuda_delayed_init()
         goto out;
     }
 
+    cuDeviceGetCount(&opal_accelerator_cuda_num_devices);
+
     /* Check to see if this process is running in a CUDA context.  If
      * so, all is good.  If not, then disable registration of memory. */
     result = cuCtxGetCurrent(&cuContext);
@@ -147,19 +155,50 @@ int opal_accelerator_cuda_delayed_init()
         goto out;
     } else if ((CUDA_SUCCESS == result) && (NULL == cuContext)) {
         opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent returned NULL context");
-        result = OPAL_ERROR;
-        goto out;
+
+        /* create a context for each device */
+        for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) {
+            CUdevice dev;
+            result = cuDeviceGet(&dev, i);
+            if (CUDA_SUCCESS != result) {
+                opal_output_verbose(20, opal_accelerator_base_framework.framework_output,
+                                    "CUDA: cuDeviceGet failed");
+                result = OPAL_ERROR;
+                goto out;
+            }
+            result = cuDevicePrimaryCtxRetain(&cuContext, dev);
+            if (CUDA_SUCCESS != result) {
+                opal_output_verbose(20, opal_accelerator_base_framework.framework_output,
+                                    "CUDA: cuDevicePrimaryCtxRetain failed");
+                result = OPAL_ERROR;
+                goto out;
+            }
+            if (0 == i) {
+                result = cuCtxPushCurrent(cuContext);
+                if (CUDA_SUCCESS != result) {
+                    opal_output_verbose(20, opal_accelerator_base_framework.framework_output,
+                                        "CUDA: cuCtxPushCurrent failed");
+                    result = OPAL_ERROR;
+                    goto out;
+                }
+            }
+        }
+
+
     } else {
         opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent succeeded");
     }
 
     /* Create stream for use in cuMemcpyAsync synchronous copies */
-    result = cuStreamCreate(&opal_accelerator_cuda_memcpy_stream, 0);
+    CUstream memcpy_stream;
+    result = cuStreamCreate(&memcpy_stream, 0);
     if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
         opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true,
                        OPAL_PROC_MY_HOSTNAME, result);
         goto out;
     }
+    opal_accelerator_cuda_memcpy_stream.base.stream = malloc(sizeof(CUstream));
+    *(CUstream*)opal_accelerator_cuda_memcpy_stream.base.stream = memcpy_stream;
 
     result = cuMemHostRegister(&checkmem, sizeof(int), 0);
     if (result != CUDA_SUCCESS) {
@@ -167,11 +206,36 @@ int opal_accelerator_cuda_delayed_init()
          * This is not a fatal error. */
         opal_show_help("help-accelerator-cuda.txt", "cuMemHostRegister during init failed", true,
                        &checkmem, sizeof(int), OPAL_PROC_MY_HOSTNAME, result, "checkmem");
-
     } else {
         opal_output_verbose(20, opal_accelerator_base_framework.framework_output,
                             "CUDA: cuMemHostRegister OK on test region");
     }
+
+    /* determine the memory bandwidth */
+    opal_accelerator_cuda_mem_bw = malloc(sizeof(float)*opal_accelerator_cuda_num_devices);
+    for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) {
+        CUdevice dev;
+        result = cuDeviceGet(&dev, i);
+        if (CUDA_SUCCESS != result) {
+            opal_output_verbose(20, opal_accelerator_base_framework.framework_output,
+                                "CUDA: cuDeviceGet failed");
+            goto out;
+        }
+        int mem_clock_rate; // kHz
+        result = cuDeviceGetAttribute(&mem_clock_rate,
+                                CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
+                                dev);
+        int bus_width; // bit
+        result = cuDeviceGetAttribute(&bus_width,
+                                CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
+                                dev);
+        /* bw = clock_rate * bus width * 2bit multiplier
+         * See https://forums.developer.nvidia.com/t/memory-clock-rate/107940
+         */
+        float bw = ((float)mem_clock_rate*(float)bus_width*2.0) / 1024 / 1024 / 8;
+        opal_accelerator_cuda_mem_bw[i] = bw;
+    }
+
     result = OPAL_SUCCESS;
     opal_atomic_wmb();
     mca_accelerator_cuda_init_complete = true;
@@ -182,8 +246,9 @@ int opal_accelerator_cuda_delayed_init()
 
 static opal_accelerator_base_module_t* accelerator_cuda_init(void)
 {
-    OBJ_CONSTRUCT(&opal_accelerator_cuda_stream_lock, opal_mutex_t);
     OBJ_CONSTRUCT(&accelerator_cuda_init_lock, opal_mutex_t);
+    OBJ_CONSTRUCT(&opal_accelerator_cuda_memcpy_stream, opal_accelerator_cuda_stream_t);
+
     /* First check if the support is enabled.  In the case that the user has
      * turned it off, we do not need to continue with any CUDA specific
      * initialization.  Do this after MCA parameter registration. */
@@ -207,11 +272,14 @@ static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module)
     if (CUDA_SUCCESS != result) {
         ctx_ok = 0;
     }
-    if ((NULL != opal_accelerator_cuda_memcpy_stream) && ctx_ok) {
-        cuStreamDestroy(opal_accelerator_cuda_memcpy_stream);
+
+    if ((NULL != opal_accelerator_cuda_memcpy_stream.base.stream) && ctx_ok) {
+        OBJ_DESTRUCT(&opal_accelerator_cuda_memcpy_stream);
     }
 
-    OBJ_DESTRUCT(&opal_accelerator_cuda_stream_lock);
+    free(opal_accelerator_cuda_mem_bw);
+    opal_accelerator_cuda_mem_bw = NULL;
+
     OBJ_DESTRUCT(&accelerator_cuda_init_lock);
     return;
 }
diff --git a/opal/mca/accelerator/null/accelerator_null_component.c b/opal/mca/accelerator/null/accelerator_null_component.c
index 1bd6e0e2811..8a6f0f8d810 100644
--- a/opal/mca/accelerator/null/accelerator_null_component.c
+++ b/opal/mca/accelerator/null/accelerator_null_component.c
@@ -9,6 +9,9 @@
  * Copyright (c)           Amazon.com, Inc. or its affiliates.
  *                         All Rights reserved.
  * Copyright (c) 2023      Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2024      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -50,11 +53,15 @@ static int accelerator_null_memcpy_async(int dest_dev_id, int src_dev_id, void *
                                          opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type);
 static int accelerator_null_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src,
                                    size_t size, opal_accelerator_transfer_type_t type);
+static int accelerator_null_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
+                                          opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type);
 static int accelerator_null_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
                                     opal_accelerator_transfer_type_t type);
 
 static int accelerator_null_mem_alloc(int dev_id, void **ptr, size_t size);
 static int accelerator_null_mem_release(int dev_id, void *ptr);
+static int accelerator_null_mem_alloc_stream(int dev_id, void **ptr, size_t size, opal_accelerator_stream_t* stream);
+static int accelerator_null_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream);
 static int accelerator_null_get_address_range(int dev_id, const void *ptr, void **base, size_t *size);
 
 static bool accelerator_null_is_ipc_enabled(void);
@@ -82,6 +89,12 @@ static int accelerator_null_device_can_access_peer(int *access, int dev1, int de
 
 static int accelerator_null_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
 
+static int accelerator_null_sync_stream(opal_accelerator_stream_t *stream);
+
+static int accelerator_null_get_num_devices(int *num_devices);
+
+static int accelerator_null_get_mem_bw(int device, float *bw);
+
 /*
  * Instantiate the public struct with all of our public information
  * and pointers to our public functions in it
@@ -125,6 +138,7 @@ opal_accelerator_base_module_t opal_accelerator_null_module =
     accelerator_null_check_addr,
 
     accelerator_null_create_stream,
+    accelerator_null_sync_stream,
 
     accelerator_null_create_event,
     accelerator_null_record_event,
@@ -133,9 +147,12 @@ opal_accelerator_base_module_t opal_accelerator_null_module =
 
     accelerator_null_memcpy_async,
     accelerator_null_memcpy,
+    accelerator_null_memmove_async,
     accelerator_null_memmove,
     accelerator_null_mem_alloc,
     accelerator_null_mem_release,
+    accelerator_null_mem_alloc_stream,
+    accelerator_null_mem_release_stream,
     accelerator_null_get_address_range,
 
     accelerator_null_is_ipc_enabled,
@@ -154,7 +171,10 @@ opal_accelerator_base_module_t opal_accelerator_null_module =
     accelerator_null_get_device_pci_attr,
     accelerator_null_device_can_access_peer,
 
-    accelerator_null_get_buffer_id
+    accelerator_null_get_buffer_id,
+
+    accelerator_null_get_num_devices,
+    accelerator_null_get_mem_bw
 };
 
 static int accelerator_null_open(void)
@@ -237,6 +257,13 @@ static int accelerator_null_memmove(int dest_dev_id, int src_dev_id, void *dest,
     return OPAL_SUCCESS;
 }
 
+static int accelerator_null_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
+                                          opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type)
+{
+    memmove(dest, src, size);
+    return OPAL_SUCCESS;
+}
+
 static int accelerator_null_mem_alloc(int dev_id, void **ptr, size_t size)
 {
     *ptr = malloc(size);
@@ -249,6 +276,23 @@ static int accelerator_null_mem_release(int dev_id, void *ptr)
     return OPAL_SUCCESS;
 }
 
+
+static int accelerator_null_mem_alloc_stream(int dev_id, void **ptr, size_t size,
+                                             opal_accelerator_stream_t *stream)
+{
+    (void)stream;
+    *ptr = malloc(size);
+    return OPAL_SUCCESS;
+}
+
+static int accelerator_null_mem_release_stream(int dev_id, void *ptr,
+                                               opal_accelerator_stream_t *stream)
+{
+    (void)stream;
+    free(ptr);
+    return OPAL_SUCCESS;
+}
+
 static int accelerator_null_get_address_range(int dev_id, const void *ptr, void **base,
                                               size_t *size)
 {
@@ -331,3 +375,21 @@ static int accelerator_null_get_buffer_id(int dev_id, const void *addr, opal_acc
 {
     return OPAL_ERR_NOT_IMPLEMENTED;
 }
+
+static int accelerator_null_sync_stream(opal_accelerator_stream_t *stream)
+{
+    return OPAL_SUCCESS;
+}
+
+static int accelerator_null_get_num_devices(int *num_devices)
+{
+    *num_devices = 0;
+    return OPAL_SUCCESS;
+}
+
+
+static int accelerator_null_get_mem_bw(int device, float *bw)
+{
+    *bw = 1.0; // return something that is not 0
+    return OPAL_SUCCESS;
+}
diff --git a/opal/mca/accelerator/rocm/accelerator_rocm.h b/opal/mca/accelerator/rocm/accelerator_rocm.h
index 8eab728b4d2..4d4e1f0e65f 100644
--- a/opal/mca/accelerator/rocm/accelerator_rocm.h
+++ b/opal/mca/accelerator/rocm/accelerator_rocm.h
@@ -1,5 +1,8 @@
 /*
  * Copyright (c) 2022-2023  Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2024      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
  *
  * $COPYRIGHT$
  *
@@ -67,11 +70,13 @@ struct opal_accelerator_rocm_ipc_event_handle_t {
 typedef struct opal_accelerator_rocm_ipc_event_handle_t opal_accelerator_rocm_ipc_event_handle_t;
 OBJ_CLASS_DECLARATION(opal_accelerator_rocm_ipc_event_handle_t);
 
-extern hipStream_t opal_accelerator_rocm_MemcpyStream;
+extern hipStream_t *opal_accelerator_rocm_MemcpyStream;
 extern int opal_accelerator_rocm_memcpy_async;
 extern int opal_accelerator_rocm_verbose;
 extern size_t opal_accelerator_rocm_memcpyH2D_limit;
 extern size_t opal_accelerator_rocm_memcpyD2H_limit;
+extern int opal_accelerator_rocm_num_devices;
+extern float *opal_accelerator_rocm_mem_bw;
 
 extern int opal_accelerator_rocm_lazy_init(void);
 
diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_component.c b/opal/mca/accelerator/rocm/accelerator_rocm_component.c
index 2f40c0e35f5..4e3fb88e175 100644
--- a/opal/mca/accelerator/rocm/accelerator_rocm_component.c
+++ b/opal/mca/accelerator/rocm/accelerator_rocm_component.c
@@ -7,6 +7,9 @@
  * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates.
  *                         All Rights reserved.
  * Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights reserved.
+ * Copyright (c) 2024      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -19,7 +22,9 @@
 #include <stdio.h>
 
 #include "opal/mca/dl/base/base.h"
+#include "opal/mca/accelerator/base/base.h"
 #include "opal/runtime/opal_params.h"
+#include "opal/util/proc.h"
 #include "accelerator_rocm.h"
 
 int opal_accelerator_rocm_memcpy_async = 0;
@@ -31,7 +36,10 @@ size_t opal_accelerator_rocm_memcpyH2D_limit=1048576;
 static opal_mutex_t accelerator_rocm_init_lock;
 static bool accelerator_rocm_init_complete = false;
 
-hipStream_t opal_accelerator_rocm_MemcpyStream = NULL;
+/* Define global variables, used in accelerator_rocm.c */
+int opal_accelerator_rocm_num_devices = 0;
+float *opal_accelerator_rocm_mem_bw = NULL;
+hipStream_t *opal_accelerator_rocm_MemcpyStream = NULL;
 
 /*
  * Public string showing the accelerator rocm component version number
@@ -174,14 +182,41 @@ int opal_accelerator_rocm_lazy_init()
         goto out;
     }
 
-    err = hipStreamCreate(&opal_accelerator_rocm_MemcpyStream);
+    err = hipGetDeviceCount(&opal_accelerator_rocm_num_devices);
     if (hipSuccess != err) {
-	err = OPAL_ERROR;  // we got hipErrorInvalidValue, pretty bad
-        opal_output(0, "Could not create hipStream, err=%d %s\n",
+        opal_output(0, "Failed to query device count, err=%d %s\n",
                 err, hipGetErrorString(err));
+	    err = OPAL_ERROR;
         goto out;
     }
 
+    hipStream_t memcpy_stream;
+    err = hipStreamCreate(&memcpy_stream);
+    if (hipSuccess != err) {
+        opal_output(0, "Could not create hipStream, err=%d %s\n",
+                err, hipGetErrorString(err));
+	    err = OPAL_ERROR;  // we got hipErrorInvalidValue, pretty bad
+        goto out;
+    }
+    opal_accelerator_rocm_MemcpyStream = malloc(sizeof(hipStream_t));
+    *opal_accelerator_rocm_MemcpyStream = memcpy_stream;
+
+    opal_accelerator_rocm_mem_bw = malloc(sizeof(float)*opal_accelerator_rocm_num_devices);
+    for (int i = 0; i < opal_accelerator_rocm_num_devices; ++i) {
+        int mem_clock_rate; // kHz
+        err = hipDeviceGetAttribute(&mem_clock_rate,
+                                    hipDeviceAttributeMemoryClockRate,
+                                    i);
+        int bus_width; // bit
+        err = hipDeviceGetAttribute(&bus_width,
+                                    hipDeviceAttributeMemoryBusWidth,
+                                    i);
+        /* bw = clock_rate * bus width * 2bit multiplier
+         * See https://forums.developer.nvidia.com/t/memory-clock-rate/107940
+         */
+        float bw = ((float)mem_clock_rate*(float)bus_width*2.0) / 1024 / 1024 / 8;
+        opal_accelerator_rocm_mem_bw[i] = bw;
+    }
     err = OPAL_SUCCESS;
     opal_atomic_wmb();
     accelerator_rocm_init_complete = true;
@@ -193,7 +228,7 @@ int opal_accelerator_rocm_lazy_init()
 static opal_accelerator_base_module_t* accelerator_rocm_init(void)
 {
     OBJ_CONSTRUCT(&accelerator_rocm_init_lock, opal_mutex_t);
-    
+
     hipError_t err;
 
     if (opal_rocm_runtime_initialized) {
@@ -215,12 +250,16 @@ static opal_accelerator_base_module_t* accelerator_rocm_init(void)
 
 static void accelerator_rocm_finalize(opal_accelerator_base_module_t* module)
 {
-    if (NULL != (void*)opal_accelerator_rocm_MemcpyStream) {
-        hipError_t err = hipStreamDestroy(opal_accelerator_rocm_MemcpyStream);
+    if (NULL != opal_accelerator_rocm_MemcpyStream) {
+        hipError_t err = hipStreamDestroy(*opal_accelerator_rocm_MemcpyStream);
         if (hipSuccess != err) {
             opal_output_verbose(10, 0, "hip_dl_finalize: error while destroying the hipStream\n");
         }
+        free(opal_accelerator_rocm_MemcpyStream);
         opal_accelerator_rocm_MemcpyStream = NULL;
+
+        free(opal_accelerator_rocm_mem_bw);
+        opal_accelerator_rocm_mem_bw = NULL;
     }
 
     OBJ_DESTRUCT(&accelerator_rocm_init_lock);
diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_module.c b/opal/mca/accelerator/rocm/accelerator_rocm_module.c
index 6db5e0d4927..608d2183e20 100644
--- a/opal/mca/accelerator/rocm/accelerator_rocm_module.c
+++ b/opal/mca/accelerator/rocm/accelerator_rocm_module.c
@@ -1,6 +1,9 @@
 /*
  * Copyright (c) 2022-2023  Advanced Micro Devices, Inc. All Rights reserved.
  * $COPYRIGHT$
+ * Copyright (c) 2024      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
  *
  * Additional copyrights may follow
  *
@@ -27,10 +30,17 @@ static int mca_accelerator_rocm_memcpy_async(int dest_dev_id, int src_dev_id, vo
                                   opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type);
 static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src,
                             size_t size, opal_accelerator_transfer_type_t type);
+static int mca_accelerator_rocm_memmove_async(int dest_dev_id, int src_dev_id, void *dest,
+                                              const void *src, size_t size,
+                                              opal_accelerator_stream_t *stream,
+                                              opal_accelerator_transfer_type_t type);
 static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
                                         opal_accelerator_transfer_type_t type);
 static int mca_accelerator_rocm_mem_alloc(int dev_id, void **ptr, size_t size);
 static int mca_accelerator_rocm_mem_release(int dev_id, void *ptr);
+static int mca_accelerator_rocm_mem_alloc_stream(int dev_id, void **ptr, size_t size,
+                                                 opal_accelerator_stream_t *stream);
+static int mca_accelerator_rocm_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream);
 static int mca_accelerator_rocm_get_address_range(int dev_id, const void *ptr, void **base,
                                                   size_t *size);
 
@@ -59,6 +69,11 @@ static int mca_accelerator_rocm_device_can_access_peer( int *access, int dev1, i
 
 static int mca_accelerator_rocm_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
 
+static int mca_accelerator_rocm_sync_stream(opal_accelerator_stream_t *stream);
+
+static int mca_accelerator_rocm_get_num_devices(int *num_devices);
+
+static int mca_accelerator_rocm_get_mem_bw(int device, float *bw);
 
 #define GET_STREAM(_stream) (_stream == MCA_ACCELERATOR_STREAM_DEFAULT ? 0 : *((hipStream_t *)_stream->stream))
 
@@ -67,6 +82,7 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module =
     mca_accelerator_rocm_check_addr,
 
     mca_accelerator_rocm_create_stream,
+    mca_accelerator_rocm_sync_stream,
 
     mca_accelerator_rocm_create_event,
     mca_accelerator_rocm_record_event,
@@ -75,9 +91,12 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module =
 
     mca_accelerator_rocm_memcpy_async,
     mca_accelerator_rocm_memcpy,
+    mca_accelerator_rocm_memmove_async,
     mca_accelerator_rocm_memmove,
     mca_accelerator_rocm_mem_alloc,
     mca_accelerator_rocm_mem_release,
+    mca_accelerator_rocm_mem_alloc_stream,
+    mca_accelerator_rocm_mem_release_stream,
     mca_accelerator_rocm_get_address_range,
 
     mca_accelerator_rocm_is_ipc_enabled,
@@ -96,7 +115,10 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module =
     mca_accelerator_rocm_get_device_pci_attr,
     mca_accelerator_rocm_device_can_access_peer,
 
-    mca_accelerator_rocm_get_buffer_id
+    mca_accelerator_rocm_get_buffer_id,
+
+    mca_accelerator_rocm_get_num_devices,
+    mca_accelerator_rocm_get_mem_bw
 };
 
 
@@ -233,7 +255,7 @@ OBJ_CLASS_INSTANCE(
     opal_accelerator_event_t,
     NULL,
     mca_accelerator_rocm_event_destruct);
- 
+
 static int mca_accelerator_rocm_record_event(int dev_id, opal_accelerator_event_t *event,
                                              opal_accelerator_stream_t *stream)
 {
@@ -348,14 +370,14 @@ static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *de
 
     if (opal_accelerator_rocm_memcpy_async) {
         err = hipMemcpyAsync(dest, src, size, hipMemcpyDefault,
-                                       opal_accelerator_rocm_MemcpyStream);
+                                       *opal_accelerator_rocm_MemcpyStream);
         if (hipSuccess != err ) {
             opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
                                 "error starting async copy\n");
             return OPAL_ERROR;
         }
 
-        err = hipStreamSynchronize(opal_accelerator_rocm_MemcpyStream);
+        err = hipStreamSynchronize(*opal_accelerator_rocm_MemcpyStream);
         if (hipSuccess != err ) {
             opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
                                 "error synchronizing stream after async copy\n");
@@ -373,6 +395,44 @@ static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *de
     return OPAL_SUCCESS;
 }
 
+static int mca_accelerator_rocm_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src,
+                                              size_t size, opal_accelerator_stream_t *stream,
+                                              opal_accelerator_transfer_type_t type)
+{
+    hipDeviceptr_t tmp;
+    hipError_t result;
+    int ret;
+    void *ptr;
+
+    int delayed_init = opal_accelerator_rocm_lazy_init();
+    if (OPAL_UNLIKELY(0 != delayed_init)) {
+        return delayed_init;
+    }
+
+    if (NULL == dest || NULL == src || size <= 0) {
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    ret = mca_accelerator_rocm_mem_alloc_stream(src_dev_id, &ptr, size, stream);
+    if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
+        return OPAL_ERROR;
+    }
+    tmp = (hipDeviceptr_t)ptr;
+    result = hipMemcpyAsync(tmp, (hipDeviceptr_t) src, size, hipMemcpyDefault, *(hipStream_t*)stream->stream);
+    if (OPAL_UNLIKELY(hipSuccess != result)) {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "error during synchronous copy\n");
+        return OPAL_ERROR;
+    }
+    result = hipMemcpyAsync((hipDeviceptr_t) dest, tmp, size, hipMemcpyDefault, *(hipStream_t*)stream->stream);
+    if (OPAL_UNLIKELY(hipSuccess != result)) {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "error during synchronous copy\n");
+        return OPAL_ERROR;
+    }
+    return mca_accelerator_rocm_mem_release_stream(src_dev_id, ptr, stream);
+}
+
 static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *dest,
                                         const void *src, size_t size,
                                         opal_accelerator_transfer_type_t type)
@@ -393,7 +453,7 @@ static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *d
 
     if (opal_accelerator_rocm_memcpy_async) {
         err = hipMemcpyAsync(tmp, src, size, hipMemcpyDefault,
-                                       opal_accelerator_rocm_MemcpyStream);
+                                       *opal_accelerator_rocm_MemcpyStream);
         if (hipSuccess != err ) {
             opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
                                 "error in async memcpy for memmove\n");
@@ -401,14 +461,14 @@ static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *d
         }
 
         err = hipMemcpyAsync(dest, tmp, size, hipMemcpyDefault,
-                                       opal_accelerator_rocm_MemcpyStream);
+                                       *opal_accelerator_rocm_MemcpyStream);
         if (hipSuccess != err ) {
             opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
                                 "error in async memcpy for memmove\n");
             return OPAL_ERROR;
         }
 
-        err = hipStreamSynchronize(opal_accelerator_rocm_MemcpyStream);
+        err = hipStreamSynchronize(*opal_accelerator_rocm_MemcpyStream);
         if (hipSuccess != err ) {
             opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
                                 "error synchronizing stream for memmove\n");
@@ -535,7 +595,7 @@ static int mca_accelerator_rocm_get_ipc_handle(int dev_id, void *dev_ptr,
                             "Error in hipIpcGetMemHandle dev_ptr %p", dev_ptr);
         OBJ_DESTRUCT(rocm_handle);
         return OPAL_ERROR;
-    }    
+    }
     memcpy(rocm_handle->base.handle, &rocm_ipc_handle, IPC_MAX_HANDLE_SIZE);
 
     return OPAL_SUCCESS;
@@ -597,7 +657,7 @@ static int mca_accelerator_rocm_compare_ipc_handles(uint8_t handle_1[IPC_MAX_HAN
 
 static void mca_accelerator_rocm_ipc_event_handle_destruct(opal_accelerator_rocm_ipc_handle_t *handle)
 {
-    // Just a place holder, there is no hipIpcCloseEventHandle. 
+    // Just a place holder, there is no hipIpcCloseEventHandle.
 }
 
 OBJ_CLASS_INSTANCE(
@@ -617,7 +677,7 @@ static int mca_accelerator_rocm_get_ipc_event_handle(opal_accelerator_event_t *e
     hipIpcEventHandle_t rocm_ipc_handle;
     opal_accelerator_rocm_ipc_event_handle_t *rocm_handle = (opal_accelerator_rocm_ipc_event_handle_t *) handle;
     OBJ_CONSTRUCT(rocm_handle, opal_accelerator_rocm_ipc_event_handle_t);
-    
+
     memset(rocm_ipc_handle.reserved, 0, HIP_IPC_HANDLE_SIZE);
     hipError_t err = hipIpcGetEventHandle(&rocm_ipc_handle,
                                           *((hipEvent_t *)event->event));
@@ -626,7 +686,7 @@ static int mca_accelerator_rocm_get_ipc_event_handle(opal_accelerator_event_t *e
                             "error in hipIpcGetEventHandle");
         OBJ_DESTRUCT(rocm_handle);
         return OPAL_ERROR;
-    }    
+    }
     memcpy(rocm_handle->base.handle, &rocm_ipc_handle, IPC_MAX_HANDLE_SIZE);
 
     return OPAL_SUCCESS;
@@ -664,7 +724,7 @@ static int mca_accelerator_rocm_open_ipc_event_handle(opal_accelerator_ipc_event
         opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
                             "error in hipIpcOpenEventHandle");
         return OPAL_ERROR;
-    }    
+    }
 
     return OPAL_SUCCESS;
 }
@@ -802,3 +862,81 @@ static int mca_accelerator_rocm_get_buffer_id(int dev_id, const void *addr, opal
 #endif
     return OPAL_SUCCESS;
 }
+
+
+static int mca_accelerator_rocm_mem_alloc_stream(
+    int dev_id,
+    void **addr,
+    size_t size,
+    opal_accelerator_stream_t *stream)
+{
+    hipError_t result;
+
+    int delayed_init = opal_accelerator_rocm_lazy_init();
+    if (OPAL_UNLIKELY(0 != delayed_init)) {
+        return delayed_init;
+    }
+
+    if (NULL == stream || NULL == addr || 0 == size) {
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    result = hipMallocAsync(addr, size, *(hipStream_t*)stream->stream);
+    if (OPAL_UNLIKELY(hipSuccess != result)) {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "error allocating memory\n");
+        return OPAL_ERROR;
+    }
+    return OPAL_SUCCESS;
+}
+
+static int mca_accelerator_rocm_mem_release_stream(
+    int dev_id,
+    void *addr,
+    opal_accelerator_stream_t *stream)
+{
+    hipError_t result;
+
+    if (NULL == stream || NULL == addr) {
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    result = hipFreeAsync(addr, *(hipStream_t*)stream->stream);
+    if (OPAL_UNLIKELY(hipSuccess != result)) {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "error freeing memory\n");
+        return OPAL_ERROR;
+    }
+    return OPAL_SUCCESS;
+}
+
+static int mca_accelerator_rocm_sync_stream(opal_accelerator_stream_t *stream)
+{
+    hipError_t result;
+    result = hipStreamSynchronize(*(hipStream_t*)stream->stream);
+    if (OPAL_UNLIKELY(hipSuccess != result)) {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "error synchronizing stream\n");
+        return OPAL_ERROR;
+    }
+    return OPAL_SUCCESS;
+}
+
+
+static int mca_accelerator_rocm_get_num_devices(int *num_devices)
+{
+    *num_devices = opal_accelerator_rocm_num_devices;
+    return OPAL_SUCCESS;
+}
+
+static int mca_accelerator_rocm_get_mem_bw(int device, float *bw)
+{
+    int delayed_init = opal_accelerator_rocm_lazy_init();
+    if (OPAL_UNLIKELY(0 != delayed_init)) {
+        return delayed_init;
+    }
+    assert(opal_accelerator_rocm_mem_bw != NULL);
+
+    *bw = opal_accelerator_rocm_mem_bw[device];
+    return OPAL_SUCCESS;
+}
\ No newline at end of file
diff --git a/opal/mca/accelerator/ze/accelerator_ze_module.c b/opal/mca/accelerator/ze/accelerator_ze_module.c
index b4d019f67c4..a5f7f37d5ac 100644
--- a/opal/mca/accelerator/ze/accelerator_ze_module.c
+++ b/opal/mca/accelerator/ze/accelerator_ze_module.c
@@ -32,10 +32,17 @@ static int mca_accelerator_ze_memcpy_async(int dest_dev_id, int src_dev_id, void
                                   opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type);
 static int mca_accelerator_ze_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src,
                             size_t size, opal_accelerator_transfer_type_t type);
+static int mca_accelerator_ze_memmove_async(int dest_dev_id, int src_dev_id, void *dest,
+                                            const void *src, size_t size,
+                                            opal_accelerator_stream_t *stream,
+                                            opal_accelerator_transfer_type_t type);
 static int mca_accelerator_ze_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
                                         opal_accelerator_transfer_type_t type);
 static int mca_accelerator_ze_mem_alloc(int dev_id, void **ptr, size_t size);
 static int mca_accelerator_ze_mem_release(int dev_id, void *ptr);
+static int mca_accelerator_ze_mem_alloc_stream(int dev_id, void **ptr, size_t size,
+                                               opal_accelerator_stream_t *stream);
+static int mca_accelerator_ze_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream);
 static int mca_accelerator_ze_get_address_range(int dev_id, const void *ptr, void **base,
                                                   size_t *size);
 
@@ -65,11 +72,18 @@ static int mca_accelerator_ze_get_device_pci_attr(int dev_id, opal_accelerator_p
 
 static int mca_accelerator_ze_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
 
+static int mca_accelerator_ze_sync_stream(opal_accelerator_stream_t *stream);
+
+static int mca_accelerator_ze_get_num_devices(int *num_devices);
+
+static int mca_accelerator_ze_get_mem_bw(int device, float *bw);
+
 opal_accelerator_base_module_t opal_accelerator_ze_module =
 {
     .check_addr = mca_accelerator_ze_check_addr,
 
     .create_stream = mca_accelerator_ze_create_stream,
+    .sync_stream = mca_accelerator_ze_sync_stream,
     .create_event = mca_accelerator_ze_create_event,
     .record_event = mca_accelerator_ze_record_event,
     .query_event = mca_accelerator_ze_query_event,
@@ -77,10 +91,13 @@ opal_accelerator_base_module_t opal_accelerator_ze_module =
 
     .mem_copy_async = mca_accelerator_ze_memcpy_async,
     .mem_copy = mca_accelerator_ze_memcpy,
+    .mem_move_async = mca_accelerator_ze_memmove_async,
     .mem_move = mca_accelerator_ze_memmove,
 
     .mem_alloc = mca_accelerator_ze_mem_alloc,
     .mem_release = mca_accelerator_ze_mem_release,
+    .mem_alloc_stream = mca_accelerator_ze_mem_alloc_stream,
+    .mem_release_stream = mca_accelerator_ze_mem_release_stream,
     .get_address_range = mca_accelerator_ze_get_address_range,
 
     .is_ipc_enabled = mca_accelerator_ze_is_ipc_enabled,
@@ -99,7 +116,9 @@ opal_accelerator_base_module_t opal_accelerator_ze_module =
     .get_device_pci_attr = mca_accelerator_ze_get_device_pci_attr,
     .device_can_access_peer = mca_accelerator_ze_device_can_access_peer,
 
-    .get_buffer_id = mca_accelerator_ze_get_buffer_id
+    .get_buffer_id = mca_accelerator_ze_get_buffer_id,
+    .num_devices = mca_accelerator_ze_get_num_devices,
+    .get_mem_bw = mca_accelerator_ze_get_mem_bw
 };
 
 static int accelerator_ze_dev_handle_to_dev_id(ze_device_handle_t hDevice)
@@ -137,7 +156,7 @@ static int mca_accelerator_ze_check_addr (const void *addr, int *dev_id, uint64_
 
     memset(&attr, 0, sizeof(ze_memory_allocation_properties_t));
 
-    zret = zeMemGetAllocProperties(opal_accelerator_ze_context, 
+    zret = zeMemGetAllocProperties(opal_accelerator_ze_context,
                                   addr,
                                   &attr,
                                   &hDevice);
@@ -200,7 +219,7 @@ static int mca_accelerator_ze_create_stream(int dev_id, opal_accelerator_stream_
         OBJ_RELEASE(*stream);
         return OPAL_ERR_OUT_OF_RESOURCE;
     }
-   
+
     if (MCA_ACCELERATOR_NO_DEVICE_ID == dev_id) {
         hDevice = opal_accelerator_ze_devices_handle[0];
     } else {
@@ -208,9 +227,9 @@ static int mca_accelerator_ze_create_stream(int dev_id, opal_accelerator_stream_
     }
     ze_stream->dev_id = dev_id;
 
-    zret = zeCommandQueueCreate(opal_accelerator_ze_context, 
+    zret = zeCommandQueueCreate(opal_accelerator_ze_context,
                                 hDevice,
-                                &cmdQueueDesc, 
+                                &cmdQueueDesc,
                                 &ze_stream->hCommandQueue);
     if (ZE_RESULT_SUCCESS != zret) {
         opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
@@ -226,12 +245,12 @@ static int mca_accelerator_ze_create_stream(int dev_id, opal_accelerator_stream_
         .stype =  ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC,
         .pNext = NULL,
         .commandQueueGroupOrdinal = 0,
-        .flags = 0, 
+        .flags = 0,
     };
 
-    zret = zeCommandListCreate(opal_accelerator_ze_context, 
-                               opal_accelerator_ze_devices_handle[0], 
-                               &commandListDesc, 
+    zret = zeCommandListCreate(opal_accelerator_ze_context,
+                               opal_accelerator_ze_devices_handle[0],
+                               &commandListDesc,
                                &ze_stream->hCommandList);
     if (ZE_RESULT_SUCCESS != zret) {
         opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
@@ -359,7 +378,7 @@ static int mca_accelerator_ze_record_event(int dev_id, opal_accelerator_event_t
                            "zeCommandListClose returned %d", zret);
         return OPAL_ERROR;
     }
-    
+
     zret = zeCommandQueueExecuteCommandLists(ze_stream->hCommandQueue,
                                              1,
                                              &ze_stream->hCommandList,
@@ -469,7 +488,7 @@ static int mca_accelerator_ze_memcpy(int dest_dev_id, int src_dev_id, void *dest
 
     if (NULL == src || NULL == dest || size <0) {
         return OPAL_ERR_BAD_PARAM;
-    }                           
+    }
     if (0 == size) {
         return OPAL_SUCCESS;
     }
@@ -486,7 +505,7 @@ static int mca_accelerator_ze_memcpy(int dest_dev_id, int src_dev_id, void *dest
         if (OPAL_SUCCESS != ret) {
             return ret;
         }
-    }            
+    }
 
     ze_stream = (opal_accelerator_ze_stream_t *)opal_accelerator_ze_MemcpyStream[dev_id]->stream;
     zret = zeCommandListAppendMemoryCopy(ze_stream->hCommandList,
@@ -509,8 +528,8 @@ static int mca_accelerator_ze_memcpy(int dest_dev_id, int src_dev_id, void *dest
         return OPAL_ERROR;
     }
 
-    zret = zeCommandQueueExecuteCommandLists(ze_stream->hCommandQueue, 
-                                             1, 
+    zret = zeCommandQueueExecuteCommandLists(ze_stream->hCommandQueue,
+                                             1,
                                              &ze_stream->hCommandList,
                                              NULL);
     if (ZE_RESULT_SUCCESS != zret) {
@@ -548,12 +567,23 @@ static int mca_accelerator_ze_memmove(int dest_dev_id, int src_dev_id, void *des
     return OPAL_ERR_NOT_IMPLEMENTED;
 }
 
+static int mca_accelerator_ze_memmove_async(int dest_dev_id, int src_dev_id, void *dest,
+                                            const void *src, size_t size,
+                                            opal_accelerator_stream_t *stream,
+                                            opal_accelerator_transfer_type_t type)
+{
+    /*
+     * TODO
+     */
+    return OPAL_ERR_NOT_IMPLEMENTED;
+}
+
 static int mca_accelerator_ze_mem_alloc(int dev_id, void **ptr, size_t size)
 {
    ze_result_t zret;
    size_t mem_alignment;
    ze_device_handle_t hDevice;
-   
+
    ze_device_mem_alloc_desc_t device_desc = {
         .stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
         .pNext = NULL,
@@ -570,10 +600,10 @@ static int mca_accelerator_ze_mem_alloc(int dev_id, void **ptr, size_t size)
     /* Currently ZE ignores this argument and uses an internal alignment
      * value. However, this behavior can change in the future. */
     mem_alignment = 1;
-    zret = zeMemAllocDevice(opal_accelerator_ze_context, 
-                           &device_desc, 
-                           size, 
-                           mem_alignment, 
+    zret = zeMemAllocDevice(opal_accelerator_ze_context,
+                           &device_desc,
+                           size,
+                           mem_alignment,
                            hDevice,
                            ptr);
     if (ZE_RESULT_SUCCESS != zret) {
@@ -603,6 +633,23 @@ static int mca_accelerator_ze_mem_release(int dev_id, void *ptr)
     return OPAL_ERROR;
 }
 
+static int mca_accelerator_ze_mem_alloc_stream(int dev_id, void **ptr, size_t size,
+                                               opal_accelerator_stream_t *stream)
+{
+    /*
+     * TODO
+     */
+    return OPAL_ERR_NOT_IMPLEMENTED;
+}
+
+static int mca_accelerator_ze_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream)
+{
+    /*
+     * TODO
+     */
+    return OPAL_ERR_NOT_IMPLEMENTED;
+}
+
 static int mca_accelerator_ze_get_address_range(int dev_id, const void *ptr, void **base,
 						  size_t *size)
 {
@@ -615,7 +662,7 @@ static int mca_accelerator_ze_get_address_range(int dev_id, const void *ptr, voi
     }
 
     zret = zeMemGetAddressRange(opal_accelerator_ze_context,
-                                ptr,                             
+                                ptr,
                                 &pBase,
                                 &pSize);
     if (ZE_RESULT_SUCCESS != zret) {
@@ -694,7 +741,7 @@ static int mca_accelerator_ze_host_unregister(int dev_id, void *ptr)
 static int mca_accelerator_ze_get_device(int *dev_id)
 {
     /*
-     * this method does not map to the Zero Level API, just return 0.  
+     * this method does not map to the Zero Level API, just return 0.
      * This may just work if the runtime is use the ZE_AFFINITY_MASK
      * environment variable to control the visible PV(s) for a given process.
      */
@@ -709,15 +756,15 @@ static int mca_accelerator_ze_get_device(int *dev_id)
 }
 
 static int mca_accelerator_ze_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr)
-{                                       
+{
     ze_result_t zret;
     ze_device_handle_t hDevice;
     ze_pci_ext_properties_t pPciProperties;
-    
+
     if (NULL == pci_attr) {
         return OPAL_ERR_BAD_PARAM;
     }
-    
+
     if (MCA_ACCELERATOR_NO_DEVICE_ID == dev_id) {
         hDevice = opal_accelerator_ze_devices_handle[0];
     } else {
@@ -730,15 +777,15 @@ static int mca_accelerator_ze_get_device_pci_attr(int dev_id, opal_accelerator_p
                             "zeDevicePciGetPropertiesExt returned %d", zret);
         return OPAL_ERROR;
     }
-    
+
     pci_attr->domain_id = (uint16_t)pPciProperties.address.domain;
     pci_attr->bus_id = (uint8_t) pPciProperties.address.bus;
     pci_attr->device_id = (uint8_t)pPciProperties.address.device;
     pci_attr->function_id = (uint8_t)pPciProperties.address.function;
 
     return OPAL_SUCCESS;
-}       
-            
+}
+
 
 /*
  * could zeDeviceGetP2PProperties be used instead here?
@@ -756,7 +803,7 @@ static int mca_accelerator_ze_device_can_access_peer(int *access, int dev1, int
 
     hDevice = opal_accelerator_ze_devices_handle[dev1];
     hPeerDevice = opal_accelerator_ze_devices_handle[dev2];
-    
+
     zret = zeDeviceCanAccessPeer(hDevice,
                                  hPeerDevice,
                                  &value);
@@ -781,7 +828,7 @@ static int mca_accelerator_ze_get_buffer_id(int dev_id, const void *addr, opal_a
         return OPAL_ERR_BAD_PARAM;
     }
 
-    if (MCA_ACCELERATOR_NO_DEVICE_ID == dev_id) { 
+    if (MCA_ACCELERATOR_NO_DEVICE_ID == dev_id) {
         hDevice = opal_accelerator_ze_devices_handle[0];
     } else {
         hDevice = opal_accelerator_ze_devices_handle[dev_id];
@@ -798,6 +845,31 @@ static int mca_accelerator_ze_get_buffer_id(int dev_id, const void *addr, opal_a
     }
 
     *buf_id = pMemAllocProperties.id;
-                                   
+
     return OPAL_SUCCESS;
 }
+
+
+static int mca_accelerator_ze_wait_stream(opal_accelerator_stream_t *stream)
+{
+    /*
+     * TODO
+     */
+    return OPAL_ERR_NOT_IMPLEMENTED;
+}
+
+static int mca_accelerator_ze_get_num_devices(int *num_devices)
+{
+    /*
+     * TODO
+     */
+    return OPAL_ERR_NOT_IMPLEMENTED;
+}
+
+static int mca_accelerator_ze_get_mem_bw(int device, float *bw)
+{
+    /*
+     * TODO
+     */
+    return OPAL_ERR_NOT_IMPLEMENTED;
+}
\ No newline at end of file

From 3b3f03d5590218f914b2bc61a5c94fb9825f6d59 Mon Sep 17 00:00:00 2001
From: Wenduo Wang <wenduwan@amazon.com>
Date: Thu, 13 Jun 2024 14:03:25 +0000
Subject: [PATCH 2/5] accelerator/cuda: fix typos

Fix a couple minor typos to make compiler happy

Signed-off-by: Wenduo Wang <wenduwan@amazon.com>
---
 opal/mca/accelerator/cuda/accelerator_cuda.c  | 21 ++++++++++---------
 opal/mca/accelerator/cuda/accelerator_cuda.h  |  3 ---
 .../cuda/accelerator_cuda_component.c         |  1 -
 3 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c
index 6e1a94e8fa0..d09850bcdcd 100644
--- a/opal/mca/accelerator/cuda/accelerator_cuda.c
+++ b/opal/mca/accelerator/cuda/accelerator_cuda.c
@@ -81,8 +81,8 @@ static int accelerator_cuda_sync_stream(opal_accelerator_stream_t *stream);
 static int accelerator_cuda_get_num_devices(int *num_devices);
 static int accelerator_cuda_get_mem_bw(int device, float *bw);
 
-
-#define GET_STREAM(_stream) (_stream == MCA_ACCELERATOR_STREAM_DEFAULT ? 0 : *((CUstream *)_stream->stream))
+#define GET_STREAM(_stream) \
+    ((_stream) == MCA_ACCELERATOR_STREAM_DEFAULT ? 0 : *((CUstream *) (_stream)->stream))
 
 opal_accelerator_base_module_t opal_accelerator_cuda_module =
 {
@@ -128,7 +128,7 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module =
     accelerator_cuda_get_mem_bw
 };
 
-static inline opal_accelerator_cuda_delayed_init_check(void)
+static inline int opal_accelerator_cuda_delayed_init_check(void)
 {
     if (OPAL_UNLIKELY(true != mca_accelerator_cuda_init_complete)) {
         return opal_accelerator_cuda_delayed_init();
@@ -314,7 +314,7 @@ static int accelerator_cuda_create_stream(int dev_id, opal_accelerator_stream_t
     }
 
     result = cuStreamCreate((*stream)->stream, 0);
-    if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true,
                        OPAL_PROC_MY_HOSTNAME, result);
         free((*stream)->stream);
@@ -498,13 +498,14 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest,
      * Additionally, cuMemcpy is not necessarily always synchronous. See:
      * https://docs.nvidia.com/cuda/cuda-driver-api/api-sync-behavior.html
      * TODO: Add optimizations for type field */
-    result = cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, GET_STREAM(opal_accelerator_cuda_memcpy_stream.super));
+    result = cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size,
+                           (CUstream *) opal_accelerator_cuda_memcpy_stream.base.stream);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, src,
                        size, result);
         return OPAL_ERROR;
     }
-    result = cuStreamSynchronize(GET_STREAM(opal_accelerator_cuda_memcpy_stream.super));
+    result = cuStreamSynchronize((CUstream *) opal_accelerator_cuda_memcpy_stream.base.stream);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true,
                        OPAL_PROC_MY_HOSTNAME, result);
@@ -532,7 +533,7 @@ static int accelerator_cuda_memmove_async(int dest_dev_id, int src_dev_id, void
     }
 
     result = accelerator_cuda_mem_alloc_stream(src_dev_id, &ptr, size, stream);
-    if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) {
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         return OPAL_ERROR;
     }
     tmp = (CUdeviceptr)ptr;
@@ -561,9 +562,9 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest,
         return OPAL_ERROR;
     }
     ret = accelerator_cuda_sync_stream(&opal_accelerator_cuda_memcpy_stream.base);
-    if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) {
+    if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
         opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true,
-                       OPAL_PROC_MY_HOSTNAME, result);
+                       OPAL_PROC_MY_HOSTNAME, ret);
         return OPAL_ERROR;
     }
     return OPAL_SUCCESS;
@@ -982,7 +983,7 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc
     }
 
     result = cuPointerGetAttribute((unsigned long long *)buf_id, CU_POINTER_ATTRIBUTE_BUFFER_ID, (CUdeviceptr) addr);
-    if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "bufferID failed", true, OPAL_PROC_MY_HOSTNAME,
                        result);
         return OPAL_ERROR;
diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.h b/opal/mca/accelerator/cuda/accelerator_cuda.h
index f22fe76f233..70091729918 100644
--- a/opal/mca/accelerator/cuda/accelerator_cuda.h
+++ b/opal/mca/accelerator/cuda/accelerator_cuda.h
@@ -2,13 +2,10 @@
  * Copyright (c) 2014      Intel, Inc.  All rights reserved.
  * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates.
  *                         All Rights reserved.
-<<<<<<< HEAD
  * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
-=======
  * Copyright (c) 2024      The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
->>>>>>> 26185d6108 (Add stream operations to accelerator components)
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
diff --git a/opal/mca/accelerator/cuda/accelerator_cuda_component.c b/opal/mca/accelerator/cuda/accelerator_cuda_component.c
index aabc681d554..e93e61a4a0d 100644
--- a/opal/mca/accelerator/cuda/accelerator_cuda_component.c
+++ b/opal/mca/accelerator/cuda/accelerator_cuda_component.c
@@ -128,7 +128,6 @@ static int accelerator_cuda_component_register(void)
 int opal_accelerator_cuda_delayed_init()
 {
     int result = OPAL_SUCCESS;
-    int prio_lo, prio_hi;
     CUcontext cuContext;
 
     /* Double checked locking to avoid having to

From 2514a936223a831f360584981c0abfd6455fa417 Mon Sep 17 00:00:00 2001
From: Alex Margolin <alex.margolin@mail.huji.ac.il>
Date: Fri, 14 Jun 2024 22:16:14 +0300
Subject: [PATCH 3/5] dist/buildrpm: fix incorrect test, which passed even with
 an empty path

Signed-off-by: Alex Margolin <alex.margolin@mail.huji.ac.il>
---
 contrib/dist/linux/buildrpm.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/contrib/dist/linux/buildrpm.sh b/contrib/dist/linux/buildrpm.sh
index c5daec67188..3fcb5b147bf 100755
--- a/contrib/dist/linux/buildrpm.sh
+++ b/contrib/dist/linux/buildrpm.sh
@@ -253,7 +253,7 @@ echo "--> Found specfile: $specfile"
 #
 # try to find Libfabric lib subir
 #
-if test -n $libfabric_path; then
+if test -n "$libfabric_path"; then
     # does lib64 exist?
     if test -d $libfabric_path/lib64; then
         # yes, so I will use lib64 as include dir

From 25bf9e4f320652a7a70c8d234113dfe3926504ad Mon Sep 17 00:00:00 2001
From: Jeff Squyres <jeff@squyres.com>
Date: Mon, 17 Jun 2024 16:57:07 -0400
Subject: [PATCH 4/5] wrappers: install the pkgconfig scripts better

Whenever we're installing binaries, install the pkgconfig scripts.
Remove the AM conditional logic from deep inside other conditionals
and just make it standalone at the top of the file (because installing
the .pc files really does not depend on whether we're installing the
script wrappers or binary wrappers).

Signed-off-by: Jeff Squyres <jeff@squyres.com>
---
 ompi/tools/wrappers/Makefile.am | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/ompi/tools/wrappers/Makefile.am b/ompi/tools/wrappers/Makefile.am
index 0c785631804..1d5b24a9372 100644
--- a/ompi/tools/wrappers/Makefile.am
+++ b/ompi/tools/wrappers/Makefile.am
@@ -22,6 +22,21 @@
 # $HEADER$
 #
 
+if OPAL_INSTALL_BINARIES
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = ompi.pc ompi-c.pc
+
+if OMPI_HAVE_CXX_COMPILER
+pkgconfig_DATA += ompi-cxx.pc
+endif
+
+if OMPI_HAVE_FORTRAN_COMPILER
+pkgconfig_DATA += ompi-fort.pc
+endif
+endif # OPAL_INSTALL_BINARIES
+
+#-----------------
+
 if OPAL_WANT_SCRIPT_WRAPPER_COMPILERS
 
 bin_SCRIPTS = ompi_wrapper_script
@@ -70,17 +85,6 @@ else # OPAL_WANT_SCRIPT_WRAPPER_COMPILERS
 
 if OPAL_INSTALL_BINARIES
 
-pkgconfigdir = $(libdir)/pkgconfig
-pkgconfig_DATA = ompi.pc ompi-c.pc
-
-if OMPI_HAVE_CXX_COMPILER
-pkgconfig_DATA += ompi-cxx.pc
-endif
-
-if OMPI_HAVE_FORTRAN_COMPILER
-pkgconfig_DATA += ompi-fort.pc
-endif
-
 if OMPI_WANT_JAVA_BINDINGS
 bin_SCRIPTS = mpijavac.pl
 endif

From 63cc112751de8755d92792b0b4156e8c18a6d6bb Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Thu, 20 Jun 2024 10:50:06 -0400
Subject: [PATCH 5/5] accelerator/cuca: Dereference pointer to stream

cuMemcpyAsync and cuStreamSynchronize take a CUstream, not a pointer
to CUstream.

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 opal/mca/accelerator/cuda/accelerator_cuda.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c
index d09850bcdcd..0e80a64085c 100644
--- a/opal/mca/accelerator/cuda/accelerator_cuda.c
+++ b/opal/mca/accelerator/cuda/accelerator_cuda.c
@@ -499,13 +499,13 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest,
      * https://docs.nvidia.com/cuda/cuda-driver-api/api-sync-behavior.html
      * TODO: Add optimizations for type field */
     result = cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size,
-                           (CUstream *) opal_accelerator_cuda_memcpy_stream.base.stream);
+                           *(CUstream *) opal_accelerator_cuda_memcpy_stream.base.stream);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, src,
                        size, result);
         return OPAL_ERROR;
     }
-    result = cuStreamSynchronize((CUstream *) opal_accelerator_cuda_memcpy_stream.base.stream);
+    result = cuStreamSynchronize(*(CUstream *) opal_accelerator_cuda_memcpy_stream.base.stream);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true,
                        OPAL_PROC_MY_HOSTNAME, result);