From b1255667a26f0804225163044dd4cb9ab2c1ff63 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Wed, 14 Feb 2024 07:03:56 +0000
Subject: [PATCH] AMD GPU memory deallocation

---
 optimum_benchmark/env_utils.py    | 35 ++++++++++---------
 optimum_benchmark/experiment.py   | 57 +++----------------------
 optimum_benchmark/import_utils.py | 39 +++++++++++++++++++++-
 tests/test_api.py                 | 14 +++++---
 4 files changed, 71 insertions(+), 74 deletions(-)

diff --git a/optimum_benchmark/env_utils.py b/optimum_benchmark/env_utils.py
index 9d332eab..43f82b3c 100644
--- a/optimum_benchmark/env_utils.py
+++ b/optimum_benchmark/env_utils.py
@@ -2,7 +2,6 @@
 import re
 import platform
 import subprocess
-import importlib.util
 from typing import Optional, List
 
 from .import_utils import is_pynvml_available, is_amdsmi_available, torch_version
@@ -184,19 +183,21 @@ def get_cuda_device_ids() -> str:
     return device_ids
 
 
-def get_git_revision_hash(package_name: str) -> Optional[str]:
-    """
-    Returns the git commit SHA of a package installed from a git repository.
-    """
-
-    try:
-        path = importlib.util.find_spec(package_name).origin
-    except Exception:
-        return None
-
-    try:
-        git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=path).decode().strip()
-    except Exception:
-        return None
-
-    return git_hash
+def get_system_info() -> dict:
+    system_dict = {
+        "cpu": get_cpu(),
+        "cpu_count": os.cpu_count(),
+        "cpu_ram_mb": get_cpu_ram_mb(),
+        "system": platform.system(),
+        "machine": platform.machine(),
+        "platform": platform.platform(),
+        "processor": platform.processor(),
+        "python_version": platform.python_version(),
+    }
+
+    if is_nvidia_system() or is_rocm_system():
+        system_dict["gpu"] = get_gpus()
+        system_dict["gpu_count"] = len(system_dict["gpu"])
+        system_dict["gpu_vram_mb"] = get_gpu_vram_mb()
+
+    return system_dict
diff --git a/optimum_benchmark/experiment.py b/optimum_benchmark/experiment.py
index c9b6d733..2fd0345e 100644
--- a/optimum_benchmark/experiment.py
+++ b/optimum_benchmark/experiment.py
@@ -1,33 +1,18 @@
 import os
-import platform
 from logging import getLogger
 from tempfile import TemporaryDirectory
 from dataclasses import dataclass, field
 from typing import Any, Dict, Type, Optional, TYPE_CHECKING
 
 from hydra.utils import get_class
+from transformers.configuration_utils import PushToHubMixin
 
+from .env_utils import get_system_info
+from .import_utils import get_hf_libs_info
 from .benchmarks.report import BenchmarkReport
 from .benchmarks.config import BenchmarkConfig
 from .launchers.config import LauncherConfig
 from .backends.config import BackendConfig
-from .import_utils import (
-    transformers_version,
-    accelerate_version,
-    diffusers_version,
-    optimum_version,
-    timm_version,
-    peft_version,
-)
-from .env_utils import (
-    get_git_revision_hash,
-    is_nvidia_system,
-    is_rocm_system,
-    get_gpu_vram_mb,
-    get_cpu_ram_mb,
-    get_gpus,
-    get_cpu,
-)
 
 if TYPE_CHECKING:
     # avoid importing any torch to be able to set
@@ -42,7 +27,7 @@
 
 
 @dataclass
-class ExperimentConfig:
+class ExperimentConfig(PushToHubMixin):
     # BACKEND CONFIGURATION
     backend: Any  # https://github.com/facebookresearch/hydra/issues/1722#issuecomment-883568386
     # LAUNCHER CONFIGURATION
@@ -59,39 +44,7 @@ class ExperimentConfig:
     library: Optional[str] = None  # deprecated
 
     # ENVIRONMENT CONFIGURATION
-    environment: Dict = field(
-        default_factory=lambda: {
-            "cpu": get_cpu(),
-            "cpu_count": os.cpu_count(),
-            "cpu_ram_mb": get_cpu_ram_mb(),
-            "system": platform.system(),
-            "python_version": platform.python_version(),
-            # libraries
"transformers_version": transformers_version(), - "transformers_commit": get_git_revision_hash("transformers"), - "accelerate_version": accelerate_version(), - "accelerate_commit": get_git_revision_hash("accelerate"), - "diffusers_version": diffusers_version(), - "diffusers_commit": get_git_revision_hash("diffusers"), - "optimum_version": optimum_version(), - "optimum_commit": get_git_revision_hash("optimum"), - "timm_version": timm_version(), - "timm_commit": get_git_revision_hash("timm"), - "peft_version": peft_version(), - "peft_commit": get_git_revision_hash("peft"), - } - ) - - def __post_init__(self): - # adding GPU information to the environment - if is_nvidia_system() or is_rocm_system(): - available_gpus = get_gpus() - if len(available_gpus) > 0: - self.environment["gpu"] = available_gpus[0] - self.environment["gpu_count"] = len(available_gpus) - self.environment["gpu_vram_mb"] = get_gpu_vram_mb() - else: - LOGGER.warning("Detected NVIDIA or ROCm system, but no GPUs found.") + environment: Dict = field(default_factory=lambda: {**get_system_info(), **get_hf_libs_info()}) def run(benchmark_config: BenchmarkConfig, backend_config: BackendConfig) -> BenchmarkReport: diff --git a/optimum_benchmark/import_utils.py b/optimum_benchmark/import_utils.py index 2bf3f811..914d7c9e 100644 --- a/optimum_benchmark/import_utils.py +++ b/optimum_benchmark/import_utils.py @@ -1,6 +1,7 @@ +from typing import Optional import importlib.metadata import importlib.util - +import subprocess _transformers_available = importlib.util.find_spec("transformers") is not None _accelerate_available = importlib.util.find_spec("accelerate") is not None @@ -178,3 +179,38 @@ def peft_version(): def tesnorrt_llm_version(): if _tensorrt_llm_available: return importlib.metadata.version("tensorrt_llm") + + +def get_git_revision_hash(package_name: str) -> Optional[str]: + """ + Returns the git commit SHA of a package installed from a git repository. 
+ """ + + try: + path = importlib.util.find_spec(package_name).origin + except Exception: + return None + + try: + git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=path).decode().strip() + except Exception: + return None + + return git_hash + + +def get_hf_libs_info(): + return { + "transformers_version": transformers_version(), + "transformers_commit": get_git_revision_hash("transformers"), + "accelerate_version": accelerate_version(), + "accelerate_commit": get_git_revision_hash("accelerate"), + "diffusers_version": diffusers_version(), + "diffusers_commit": get_git_revision_hash("diffusers"), + "optimum_version": optimum_version(), + "optimum_commit": get_git_revision_hash("optimum"), + "timm_version": timm_version(), + "timm_commit": get_git_revision_hash("timm"), + "peft_version": peft_version(), + "peft_commit": get_git_revision_hash("peft"), + } diff --git a/tests/test_api.py b/tests/test_api.py index ab7feea6..7f36165d 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -93,10 +93,14 @@ def test_api_memory_tracker(device, backend): final_memory = tracker.get_max_memory() final_memory.log() - if device == "cuda" and backend == "pytorch": - measured_memory = final_memory.max_allocated - initial_memory.max_allocated - elif device == "cuda": - measured_memory = final_memory.max_vram - initial_memory.max_vram + if device == "cuda": + if backend == "pytorch": + measured_memory = final_memory.max_allocated - initial_memory.max_allocated + else: + measured_memory = final_memory.max_vram - initial_memory.max_vram + if torch.version.hip is not None: + # something is wrong with amdsmi + measured_memory -= 1600 else: measured_memory = final_memory.max_ram - initial_memory.max_ram @@ -105,6 +109,8 @@ def test_api_memory_tracker(device, backend): del array gc.collect() + if device == "cuda": + torch.cuda.empty_cache() @pytest.mark.parametrize("library,task,model", LIBRARIES_TASKS_MODELS)