Skip to content

Commit

Permalink
CUDA: Also detect NVSwitch when checking the number of gpus sharing a…
Browse files Browse the repository at this point in the history
… bus
  • Loading branch information
sthibaul committed Mar 12, 2024
1 parent 84bf645 commit ecb87bc
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 4 deletions.
5 changes: 4 additions & 1 deletion configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -1686,6 +1686,9 @@ if test x$enable_cuda = xyes; then
], [], [[#include <nvml.h>]])
AC_DEFINE([STARPU_HAVE_LIBNVIDIA_ML], [1], [Define to 1 if you have the nvidia-ml library])
STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lnvidia-ml"
else
AC_MSG_WARN([nvidia-ml could not be found. This will prevent from correct understanding of the machine topology.])
NO_NVML="Warning: no nvidia-ml found"
fi
AC_MSG_CHECKING(whether nvidia-ml should be used)
AC_MSG_RESULT($have_valid_nvml)
Expand Down Expand Up @@ -4499,7 +4502,7 @@ AC_OUTPUT([
AC_MSG_NOTICE([
CPUs enabled: $enable_cpu
CUDA enabled: $enable_cuda
CUDA enabled: $enable_cuda $NO_NVML
HIP enabled: $enable_hip
OpenCL enabled: $enable_opencl
Max FPGA enabled: $enable_max_fpga
Expand Down
57 changes: 54 additions & 3 deletions src/drivers/cuda/driver_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -236,12 +236,13 @@ void _starpu_cuda_discover_devices(struct _starpu_machine_config *config)
}

#ifdef STARPU_HAVE_LIBNVIDIA_ML
static int _starpu_cuda_direct_link(unsigned devid1, unsigned devid2)
static int _starpu_cuda_direct_link(struct _starpu_machine_config *config, unsigned devid1, unsigned devid2)
{
unsigned i;
struct cudaDeviceProp props_dev1;
struct cudaDeviceProp props_dev2;
cudaError_t cures;
int nvswitch = 0;

cures = cudaGetDeviceProperties(&props_dev1, devid1);
if (cures != cudaSuccess)
Expand All @@ -267,14 +268,64 @@ static int _starpu_cuda_direct_link(unsigned devid1, unsigned devid2)

nvmlPciInfo_t pci;
nvmlDeviceGetNvLinkRemotePciInfo(nvml_dev1, i, &pci);

hwloc_obj_t obj = hwloc_get_pcidev_by_busid(config->topology.hwtopology,
pci.domain, pci.bus, pci.device, 0);
if (obj && obj->type == HWLOC_OBJ_PCI_DEVICE
&& (obj->attr->pcidev.class_id >> 8 == 0x06)
&& (obj->attr->pcidev.vendor_id == 0x10de))
{
/* This is an NVIDIA PCI bridge, i.e. an NVSwitch */
/* NVSwitch */
nvswitch = 1;
break;
}

if ((int) pci.domain == props_dev2.pciDomainID &&
(int) pci.bus == props_dev2.pciBusID &&
(int) pci.device == props_dev2.pciDeviceID)
/* We have a direct NVLink! */
return 1;
}

/* No direct NVLink found */
if (!nvswitch)
{
/* No direct NVLink or NVSwitch found for dev1 */
return 0;
}

nvmlDevice_t nvml_dev2 = _starpu_cuda_get_nvmldev(&props_dev2);

if (!nvml_dev2)
return 0;

for (i = 0; i < NVML_NVLINK_MAX_LINKS; i++)
{
nvmlEnableState_t active;
nvmlReturn_t ret;
ret = nvmlDeviceGetNvLinkState(nvml_dev2, i, &active);
if (ret == NVML_ERROR_NOT_SUPPORTED)
continue;
if (active != NVML_FEATURE_ENABLED)
continue;

nvmlPciInfo_t pci;
nvmlDeviceGetNvLinkRemotePciInfo(nvml_dev2, i, &pci);

hwloc_obj_t obj = hwloc_get_pcidev_by_busid(config->topology.hwtopology,
pci.domain, pci.bus, pci.device, 0);
if (obj && obj->type == HWLOC_OBJ_PCI_DEVICE
&& (obj->attr->pcidev.class_id >> 8 == 0x06)
&& (obj->attr->pcidev.vendor_id == 0x10de))
{
/* This is an NVIDIA PCI bridge, i.e. an NVSwitch */
/* NVSwitch */
/* TODO: follow answers to https://forums.developer.nvidia.com/t/how-to-distinguish-different-nvswitch/241983 */
return 1;
}
}

/* No NVSwitch found for dev2 */
return 0;
}
#endif
Expand Down Expand Up @@ -531,7 +582,7 @@ void _starpu_cuda_init_worker_memory(struct _starpu_machine_config *config, int
_starpu_cuda_bus_ids[devid+STARPU_MAXNUMANODES][devid2+STARPU_MAXNUMANODES] = bus12;
#ifndef STARPU_SIMGRID
#ifdef STARPU_HAVE_LIBNVIDIA_ML
if (_starpu_cuda_direct_link(devid, devid2))
if (_starpu_cuda_direct_link(config, devid, devid2))
{
starpu_bus_set_ngpus(bus21, 1);
starpu_bus_set_ngpus(bus12, 1);
Expand Down

0 comments on commit ecb87bc

Please sign in to comment.