handle failing amdsmi functions
IlyasMoutawwakil committed Nov 23, 2023
1 parent 2a57ebf commit ee198c2
Showing 4 changed files with 48 additions and 12 deletions.
5 changes: 4 additions & 1 deletion optimum_benchmark/backends/base.py
@@ -113,7 +113,10 @@ def check_continuous_isolation(self) -> None:
         if self.device == "cuda":
             self.isolation_process = Process(
                 target=check_cuda_continuous_isolation,
-                kwargs={"isolated_pid": os.getpid()},
+                kwargs={
+                    "isolated_pid": os.getpid(),
+                    "isolation_check_interval": self.config.isolation_check_interval,
+                },
                 daemon=True,
             )
             self.isolation_process.start()
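Note: a minimal standalone sketch of the wiring above, with a stand-in watcher function in place of check_cuda_continuous_isolation. The daemon flag means the watcher is killed automatically when the benchmark process exits, and the new interval is simply forwarded as a keyword argument:

import os
import time
from multiprocessing import Process


def sketch_watcher(isolated_pid: int, isolation_check_interval: int = 1) -> None:
    # stand-in for check_cuda_continuous_isolation: poll at the configured interval
    while True:
        time.sleep(isolation_check_interval)


if __name__ == "__main__":
    isolation_process = Process(
        target=sketch_watcher,
        kwargs={"isolated_pid": os.getpid(), "isolation_check_interval": 1},
        daemon=True,  # terminated automatically when the parent process exits
    )
    isolation_process.start()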
4 changes: 4 additions & 0 deletions optimum_benchmark/backends/config.py
@@ -18,6 +18,7 @@ class BackendConfig(ABC):

     # device isolation options
     continuous_isolation: bool = True
+    isolation_check_interval: Optional[int] = None

     # clean up options
     delete_cache: bool = False
@@ -31,5 +32,8 @@ def __post_init__(self):
         if self.intra_op_num_threads == -1:
             self.intra_op_num_threads = cpu_count()

+        if self.isolation_check_interval is None:
+            self.isolation_check_interval = 1  # 1 second
+

 BackendConfigT = TypeVar("BackendConfigT", bound=BackendConfig)
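Note: a minimal sketch (not the library's code) of the defaulting behavior added in __post_init__: leaving isolation_check_interval unset falls back to a one-second interval, while an explicit value is kept as-is.

from dataclasses import dataclass
from typing import Optional


@dataclass
class SketchBackendConfig:
    # mirrors the new BackendConfig field; None means "use the default interval"
    isolation_check_interval: Optional[int] = None

    def __post_init__(self):
        if self.isolation_check_interval is None:
            self.isolation_check_interval = 1  # 1 second between isolation checks


assert SketchBackendConfig().isolation_check_interval == 1
assert SketchBackendConfig(isolation_check_interval=5).isolation_check_interval == 5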
42 changes: 33 additions & 9 deletions optimum_benchmark/backends/isolation_utils.py
@@ -58,11 +58,22 @@ def check_cuda_isolation(isolated_devices: List[int], permitted_pids: List[int])
             devices_handles = amdsmi.amdsmi_get_processor_handles()
             for device_id in isolated_devices:
                 device_handle = devices_handles[device_id]
-                processes_handles = amdsmi.amdsmi_get_gpu_process_list(device_handle)
+                try:
+                    # these functions fail a lot for no apparent reason
+                    processes_handles = amdsmi.amdsmi_get_gpu_process_list(device_handle)
+                except Exception:
+                    continue

                 for process_handle in processes_handles:
-                    info = amdsmi.amdsmi_get_gpu_process_info(device_handle, process_handle)
+                    try:
+                        # these functions fail a lot for no apparent reason
+                        info = amdsmi.amdsmi_get_gpu_process_info(device_handle, process_handle)
+                    except Exception:
+                        continue
+
+                    if info["memory_usage"]["vram_mem"] == 4096:
+                        continue

                     if info["pid"] not in permitted_pids:
                         LOGGER.warning(f"Found unexpected process {info['pid']} on device {device_id}.")
                         LOGGER.warning(f"Process info: {info}")
@@ -72,11 +83,22 @@ def check_cuda_isolation(isolated_devices: List[int], permitted_pids: List[int])
             devices_handles = amdsmi.amdsmi_get_device_handles()
             for device_id in isolated_devices:
                 device_handle = devices_handles[device_id]
-                processes_handles = amdsmi.amdsmi_get_process_list(device_handle)
+                try:
+                    # these functions fail a lot for no apparent reason
+                    processes_handles = amdsmi.amdsmi_get_process_list(device_handle)
+                except Exception:
+                    continue

                 for process_handle in processes_handles:
-                    info = amdsmi.amdsmi_get_process_info(device_handle, process_handle)
+                    try:
+                        # these functions fail a lot for no apparent reason
+                        info = amdsmi.amdsmi_get_process_info(device_handle, process_handle)
+                    except Exception:
+                        continue
+
+                    if info["memory_usage"]["vram_mem"] == 4096:
+                        continue

                     if info["pid"] not in permitted_pids:
                         LOGGER.warning(f"Found unexpected process {info['pid']} on device {device_id}.")
                         LOGGER.warning(f"Process info: {info}")
@@ -99,7 +121,7 @@ def check_cuda_isolation(isolated_devices: List[int], permitted_pids: List[int])
         raise RuntimeError(error_message)


-def check_cuda_continuous_isolation(isolated_pid: int) -> None:
+def check_cuda_continuous_isolation(isolated_pid: int, isolation_check_interval: int = 1) -> None:
     """
     Kills the isolated process if any other process than the permitted ones is running on the specified CUDA devices.
     """
@@ -138,8 +160,10 @@ def check_cuda_continuous_isolation(isolated_pid: int) -> None:
     while True:
         try:
             check_cuda_isolation(isolated_devices, permitted_pids)
-            time.sleep(0.1)
-        except Exception as e:
-            LOGGER.error(f"Error while checking CUDA isolation: {e}")
+            time.sleep(isolation_check_interval)
+        except RuntimeError as e:
+            LOGGER.error("Error while checking CUDA isolation:")
+            LOGGER.error(e)
             LOGGER.error("Killing isolated process...")
             os.kill(isolated_pid, signal.SIGTERM)  # graceful kill, will trigger the backend cleanup
             exit(1)
+            e.with_traceback()
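Note: taken together, the watcher loop now polls at the configurable interval and only tears the benchmark down when the isolation check raises a RuntimeError. A condensed, self-contained sketch of that control flow, with a hypothetical stand-in in place of check_cuda_isolation:

import os
import signal
import time


def run_isolation_check() -> None:
    # hypothetical stand-in: raises RuntimeError when a non-permitted process is found
    pass


def sketch_watch_loop(isolated_pid: int, isolation_check_interval: int = 1) -> None:
    while True:
        try:
            run_isolation_check()
            time.sleep(isolation_check_interval)
        except RuntimeError as error:
            # an unexpected process was found: log, then terminate the benchmark process
            print(f"Error while checking CUDA isolation: {error}")
            os.kill(isolated_pid, signal.SIGTERM)  # graceful kill, triggers backend cleanup
            raise SystemExit(1)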
9 changes: 7 additions & 2 deletions optimum_benchmark/backends/pytorch/backend.py
@@ -36,8 +36,13 @@ def configure(self, config: PyTorchConfig) -> None:
         # for now we rely on this env variable to know if we're in a distributed setting
         if os.environ.get("LOCAL_WORLD_SIZE", None) is not None:
             LOGGER.info(f"\t+ Detected local world size: {os.environ['LOCAL_WORLD_SIZE']}")
-            LOGGER.info(f"\t+ Setting device to its corresponding local rank: {os.environ['LOCAL_RANK']}")
-            torch.cuda.set_device(int(os.environ.get("LOCAL_RANK", None)))
+            local_rank = int(os.environ["LOCAL_RANK"])
+            LOGGER.info(f"\t+ Detected local rank: {local_rank}")
+            available_devices = list(map(int, os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")))
+            LOGGER.info(f"\t+ Detected available devices: {available_devices}")
+            default_device = available_devices[local_rank]
+            LOGGER.info(f"\t+ Setting default device to: {default_device}")
+            torch.cuda.set_device(default_device)

         # Gradients options
         if self.config.disable_grad:
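Note: a worked example of the new device selection, using hypothetical values: with CUDA_VISIBLE_DEVICES="2,3" and LOCAL_RANK=1, the process now binds to device 3 instead of its raw local rank.

import os

# hypothetical environment, for illustration only
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
os.environ["LOCAL_RANK"] = "1"

local_rank = int(os.environ["LOCAL_RANK"])
available_devices = list(map(int, os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")))
default_device = available_devices[local_rank]

assert available_devices == [2, 3]
assert default_device == 3  # the old code would have passed the raw local rank, 1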
