Merge pull request #12841 from Akshay-Venkatesh/topic/4.1.x/handle-masync-assign-ctx

opal/cuda: Handle stream-ordered allocations and assign primary device context
bwbarrett authored Oct 8, 2024
2 parents 467fbb9 + a697a27 commit f3bda96
Showing 1 changed file with 117 additions and 10 deletions.
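Background for the change: stream-ordered allocations (cudaMallocAsync / cuMemAllocAsync, CUDA 11.2 and later) are served from a CUDA memory pool and are not tied to any CUcontext, which the pre-existing pointer classification in common_cuda.c did not account for; this commit adds a memory-pool probe and, for such context-less buffers, makes the owning device's primary context current instead of erroring out. The sketch below is a hypothetical user-side example of the kind of buffer this commit handles; the function name, stream handling, and message size are illustrative and not part of Open MPI.

#include <cuda_runtime.h>
#include <mpi.h>

/* Illustrative sketch only: an application hands MPI a buffer obtained from
 * cudaMallocAsync().  The buffer lives in the device's default memory pool
 * and has no owning CUcontext, which is the case handled by this commit. */
static void send_async_buffer(int peer, size_t nbytes, cudaStream_t stream)
{
    void *buf = NULL;

    cudaMallocAsync(&buf, nbytes, stream);   /* stream-ordered allocation */
    /* ... produce data into buf on 'stream' ... */
    cudaStreamSynchronize(stream);           /* data must be valid before MPI reads it */

    MPI_Send(buf, (int) nbytes, MPI_BYTE, peer, 0, MPI_COMM_WORLD);

    cudaFreeAsync(buf, stream);              /* stream-ordered free */
}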
127 changes: 117 additions & 10 deletions opal/mca/common/cuda/common_cuda.c
@@ -108,6 +108,10 @@ struct cudaFunctionTable {
#if OPAL_CUDA_GET_ATTRIBUTES
int (*cuPointerGetAttributes)(unsigned int, CUpointer_attribute *, void **, CUdeviceptr);
#if OPAL_CUDA_VMM_SUPPORT
int (*cuDevicePrimaryCtxRetain)(CUcontext*, CUdevice);
int (*cuDevicePrimaryCtxGetState)(CUdevice, unsigned int*, int*);
int (*cuMemPoolGetAccess)(CUmemAccess_flags*, CUmemoryPool, CUmemLocation*);
int (*cuDeviceGetAttribute)(int*, CUdevice_attribute, CUdevice);
int (*cuDeviceGetCount)(int*);
int (*cuMemRelease)(CUmemGenericAllocationHandle);
int (*cuMemRetainAllocationHandle)(CUmemGenericAllocationHandle*, void*);
@@ -488,6 +492,10 @@ int mca_common_cuda_stage_one_init(void)
OPAL_CUDA_DLSYM(libcuda_handle, cuPointerGetAttributes);
#endif /* OPAL_CUDA_GET_ATTRIBUTES */
#if OPAL_CUDA_VMM_SUPPORT
OPAL_CUDA_DLSYM(libcuda_handle, cuDevicePrimaryCtxRetain);
OPAL_CUDA_DLSYM(libcuda_handle, cuDevicePrimaryCtxGetState);
OPAL_CUDA_DLSYM(libcuda_handle, cuMemPoolGetAccess);
OPAL_CUDA_DLSYM(libcuda_handle, cuDeviceGetAttribute);
OPAL_CUDA_DLSYM(libcuda_handle, cuDeviceGetCount);
OPAL_CUDA_DLSYM(libcuda_handle, cuMemRelease);
OPAL_CUDA_DLSYM(libcuda_handle, cuMemRetainAllocationHandle);
@@ -1745,7 +1753,90 @@ static float mydifftime(opal_timer_t ts_start, opal_timer_t ts_end) {
}
#endif /* OPAL_ENABLE_DEBUG */

static int mca_common_cuda_check_vmm(CUdeviceptr dbuf, CUmemorytype *mem_type)
static int mca_common_cuda_check_mpool(CUdeviceptr dbuf, CUmemorytype *mem_type,
int *dev_id)
{
#if OPAL_CUDA_VMM_SUPPORT
static int device_count = -1;
static int mpool_supported = -1;
CUresult result;
CUmemoryPool mpool;
CUmemAccess_flags flags;
CUmemLocation location;

if (mpool_supported <= 0) {
if (mpool_supported == -1) {
if (device_count == -1) {
result = cuFunc.cuDeviceGetCount(&device_count);
if (result != CUDA_SUCCESS || (0 == device_count)) {
mpool_supported = 0; /* never check again */
device_count = 0;
return 0;
}
}

/* assume uniformity of devices */
result = cuFunc.cuDeviceGetAttribute(&mpool_supported,
CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, 0);
if (result != CUDA_SUCCESS) {
mpool_supported = 0;
}
}
if (0 == mpool_supported) {
return 0;
}
}

result = cuFunc.cuPointerGetAttribute(&mpool,
CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE,
dbuf);
if (CUDA_SUCCESS != result) {
return 0;
}

/* check if device has access */
for (int i = 0; i < device_count; i++) {
location.type = CU_MEM_LOCATION_TYPE_DEVICE;
location.id = i;
result = cuFunc.cuMemPoolGetAccess(&flags, mpool, &location);
if ((CUDA_SUCCESS == result) &&
(CU_MEM_ACCESS_FLAGS_PROT_READWRITE == flags)) {
*mem_type = CU_MEMORYTYPE_DEVICE;
*dev_id = i;
return 1;
}
}

/* host must have access as device access possibility is exhausted */
*mem_type = CU_MEMORYTYPE_HOST;
*dev_id = -1;
return 0;
#endif

return 0;
}
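For readers without the surrounding file: the new mca_common_cuda_check_mpool() asks the driver whether the pointer carries a memory-pool handle (which only stream-ordered allocations do) and, if so, walks the devices looking for one with read/write access to that pool. Below is a self-contained restatement of the same probe, assuming direct linking against libcuda instead of the cuFunc dlsym table used above; probe_mempool_device is an illustrative name, not an OPAL symbol.

#include <cuda.h>

/* Sketch: return the first device with read/write access to ptr's memory
 * pool, or -1 if ptr is not a stream-ordered (mempool) allocation. */
static int probe_mempool_device(CUdeviceptr ptr)
{
    CUmemoryPool pool;
    int ndev = 0;

    if (CUDA_SUCCESS != cuPointerGetAttribute(&pool,
                            CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE, ptr)) {
        return -1;                  /* no mempool handle: not stream-ordered */
    }
    if (CUDA_SUCCESS != cuDeviceGetCount(&ndev)) {
        return -1;
    }
    for (int dev = 0; dev < ndev; dev++) {
        CUmemLocation loc = { .type = CU_MEM_LOCATION_TYPE_DEVICE, .id = dev };
        CUmemAccess_flags flags;

        if (CUDA_SUCCESS == cuMemPoolGetAccess(&flags, pool, &loc) &&
            CU_MEM_ACCESS_FLAGS_PROT_READWRITE == flags) {
            return dev;             /* this device can read/write the pool */
        }
    }
    return -1;                      /* no device access; treat as host-reachable only */
}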

static int mca_common_cuda_get_primary_context(CUdevice dev_id, CUcontext *pctx)
{
CUresult result;
unsigned int flags;
int active;

result = cuFunc.cuDevicePrimaryCtxGetState(dev_id, &flags, &active);
if (CUDA_SUCCESS != result) {
return OPAL_ERROR;
}

if (active) {
result = cuFunc.cuDevicePrimaryCtxRetain(pctx, dev_id);
return OPAL_SUCCESS;
}

return OPAL_ERROR;
}
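The new mca_common_cuda_get_primary_context() only retains a device's primary context when cuDevicePrimaryCtxGetState() reports it active, so it never forces initialization of an idle device. The following is a hedged sketch of the general driver-API retain/use/release pattern, not code from this commit; with_primary_context is an illustrative helper, and each successful retain should normally be balanced by cuDevicePrimaryCtxRelease().

#include <cuda.h>

static int with_primary_context(CUdevice dev)
{
    CUcontext ctx;
    unsigned int flags;
    int active = 0;

    if (CUDA_SUCCESS != cuDevicePrimaryCtxGetState(dev, &flags, &active) || !active) {
        return -1;                          /* device not yet in use by the application */
    }
    if (CUDA_SUCCESS != cuDevicePrimaryCtxRetain(&ctx, dev)) {
        return -1;
    }

    cuCtxSetCurrent(ctx);                   /* make it current for this thread */
    /* ... driver-API work against the primary context ... */

    cuDevicePrimaryCtxRelease(dev);         /* balance the retain when done */
    return 0;
}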

static int mca_common_cuda_check_vmm(CUdeviceptr dbuf, CUmemorytype *mem_type,
int *dev_id)
{
#if OPAL_CUDA_VMM_SUPPORT
static int device_count = -1;
@@ -1775,6 +1866,7 @@ static int mca_common_cuda_check_vmm(CUdeviceptr dbuf, CUmemorytype *mem_type)

if (prop.location.type == CU_MEM_LOCATION_TYPE_DEVICE) {
*mem_type = CU_MEMORYTYPE_DEVICE;
*dev_id = prop.location.id;
cuFunc.cuMemRelease(alloc_handle);
return 1;
}
@@ -1788,6 +1880,7 @@ static int mca_common_cuda_check_vmm(CUdeviceptr dbuf, CUmemorytype *mem_type)
if ((CUDA_SUCCESS == result) &&
(CU_MEM_ACCESS_FLAGS_PROT_READWRITE == flags)) {
*mem_type = CU_MEMORYTYPE_DEVICE;
*dev_id = i;
cuFunc.cuMemRelease(alloc_handle);
return 1;
}
@@ -1796,6 +1889,7 @@ static int mca_common_cuda_check_vmm(CUdeviceptr dbuf, CUmemorytype *mem_type)

/* host must have access as device access possibility is exhausted */
*mem_type = CU_MEMORYTYPE_HOST;
*dev_id = -1;
cuFunc.cuMemRelease(alloc_handle);
return 1;

@@ -1809,12 +1903,17 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t
{
int res;
int is_vmm = 0;
int is_mpool = 0;
CUmemorytype vmm_mem_type = 0;
CUmemorytype mpool_mem_type = 0;
CUmemorytype memType = 0;
int vmm_dev_id = -1;
int mpool_dev_id = -1;
CUdeviceptr dbuf = (CUdeviceptr)pUserBuf;
CUcontext ctx = NULL, memCtx = NULL;

is_vmm = mca_common_cuda_check_vmm(dbuf, &vmm_mem_type);
is_vmm = mca_common_cuda_check_vmm(dbuf, &vmm_mem_type, &vmm_dev_id);
is_mpool = mca_common_cuda_check_mpool(dbuf, &mpool_mem_type, &mpool_dev_id);

#if OPAL_CUDA_GET_ATTRIBUTES
uint32_t isManaged = 0;
@@ -1844,6 +1943,8 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t
} else if (memType == CU_MEMORYTYPE_HOST) {
if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE)) {
memType = CU_MEMORYTYPE_DEVICE;
} else if (is_mpool && (mpool_mem_type == CU_MEMORYTYPE_DEVICE)) {
memType = CU_MEMORYTYPE_DEVICE;
} else {
/* Host memory, nothing to do here */
return 0;
@@ -1864,6 +1965,8 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t
} else if (memType == CU_MEMORYTYPE_HOST) {
if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE)) {
memType = CU_MEMORYTYPE_DEVICE;
} else if (is_mpool && (mpool_mem_type == CU_MEMORYTYPE_DEVICE)) {
memType = CU_MEMORYTYPE_DEVICE;
} else {
/* Host memory, nothing to do here */
return 0;
@@ -1893,14 +1996,18 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t
return OPAL_ERROR;
}
#endif /* OPAL_CUDA_GET_ATTRIBUTES */
if (is_vmm) {
/* This function is expected to set context if pointer is device
* accessible but VMM allocations have NULL context associated
* which cannot be set against the calling thread */
opal_output(0,
"CUDA: unable to set context with the given pointer"
"ptr=%p aborting...", dbuf);
return OPAL_ERROR;
if (is_vmm || is_mpool) {
if (OPAL_SUCCESS ==
mca_common_cuda_get_primary_context(
is_vmm ? vmm_dev_id : mpool_dev_id, &memCtx)) {
/* As VMM/mempool allocations have no context associated
* with them, check if device primary context can be set */
} else {
opal_output(0,
"CUDA: unable to set ctx with the given pointer"
"ptr=%p aborting...", pUserBuf);
return OPAL_ERROR;
}
}

res = cuFunc.cuCtxSetCurrent(memCtx);
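Net effect of the last hunk: where the old code aborted outright for any context-less (VMM) pointer, the new code first tries to retain the primary context of the device that owns the VMM or mempool allocation via mca_common_cuda_get_primary_context(), and only errors out if no active primary context exists; the retained context is then made current with cuCtxSetCurrent() so the rest of the CUDA buffer-handling path can proceed.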
