diff --git a/.gitignore b/.gitignore index ff31100b..8d507699 100644 --- a/.gitignore +++ b/.gitignore @@ -168,6 +168,7 @@ data/ version.txt .engine/ +actions-runner-duplicate/ actions-runner/ experiments/ amdsmi/ diff --git a/Makefile b/Makefile index 0253c183..86176c0a 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,5 @@ # List of targets that are not associated with files -.PHONY: quality style install \ - build_docker_cpu, build_docker_cuda, build_docker_rocm, \ - test_cli_cpu_pytorch, test_cli_rocm_pytorch, \ - test_cli_cpu_neural_compressor, test_cli_cpu_onnxruntime, test_cli_cpu_openvino, \ - test_api_cpu, test_api_cuda, test_api_rocm, test_api_misc +.PHONY: quality style install build_docker_cpu build_docker_cuda build_docker_rocm test_cli_cpu_neural_compressor test_cli_cpu_onnxruntime test_cli_cpu_openvino test_cli_cpu_pytorch test_cli_rocm_pytorch test_cli_cuda_pytorch test_api_cpu test_api_cuda test_api_rocm test_api_misc quality: ruff check . @@ -28,6 +24,7 @@ build_docker_rocm: test_cli_cpu_neural_compressor: docker run \ --rm \ + --pid=host \ --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ @@ -36,6 +33,7 @@ test_cli_cpu_neural_compressor: test_cli_cpu_onnxruntime: docker run \ --rm \ + --pid=host \ --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ @@ -44,6 +42,7 @@ test_cli_cpu_onnxruntime: test_cli_cpu_openvino: docker run \ --rm \ + --pid=host \ --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ @@ -52,6 +51,7 @@ test_cli_cpu_openvino: test_cli_cpu_pytorch: docker run \ --rm \ + --pid=host \ --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ @@ -60,6 +60,7 @@ test_cli_cpu_pytorch: test_cli_rocm_pytorch: docker run \ --rm \ + --pid=host \ --device=/dev/kfd \ --device /dev/dri/renderD128 \ --device /dev/dri/renderD129 \ @@ -72,6 +73,7 @@ test_cli_rocm_pytorch: test_cli_cuda_pytorch: docker run \ --rm \ + --pid=host \ --gpus '"device=0,1"' \ --entrypoint /bin/bash \ --volume $(PWD):/workspace \ @@ -81,6 +83,7 @@ test_cli_cuda_pytorch: test_api_cpu: docker run \ --rm \ + --pid=host \ --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ @@ -89,6 +92,7 @@ test_api_cpu: test_api_cuda: docker run \ --rm \ + --pid=host \ --gpus '"device=0,1"' \ --entrypoint /bin/bash \ --volume $(PWD):/workspace \ @@ -98,6 +102,7 @@ test_api_cuda: test_api_rocm: docker run \ --rm \ + --pid=host \ --device=/dev/kfd \ --device /dev/dri/renderD128 \ --device /dev/dri/renderD129 \ @@ -110,6 +115,7 @@ test_api_rocm: test_api_misc: docker run \ --rm \ + --pid=host \ --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ diff --git a/optimum_benchmark/backends/base.py b/optimum_benchmark/backends/base.py index 2be47a11..1e55bdc9 100644 --- a/optimum_benchmark/backends/base.py +++ b/optimum_benchmark/backends/base.py @@ -10,7 +10,7 @@ from ..task_utils import get_automodel_class_for_task from .config import BackendConfigT -from .diffusers_utils import extract_diffusers_shapes_from_config, get_diffusers_pretrained_config +from .diffusers_utils import extract_diffusers_shapes_from_model, get_diffusers_pretrained_config from .timm_utils import extract_timm_shapes_from_config, get_timm_pre_processor, get_timm_pretrained_config from .transformers_utils import ( PretrainedProcessor, @@ -41,7 +41,7 @@ def __init__(self, config: BackendConfigT): if self.config.library == "diffusers": self.pretrained_config = get_diffusers_pretrained_config(self.config.model, **self.config.hub_kwargs) - self.model_shapes = extract_diffusers_shapes_from_config(self.config.model, **self.config.hub_kwargs) + self.model_shapes = extract_diffusers_shapes_from_model(self.config.model, **self.config.hub_kwargs) self.model_type = self.config.task self.generation_config = None self.pre_processor = None diff --git a/optimum_benchmark/backends/diffusers_utils.py b/optimum_benchmark/backends/diffusers_utils.py index 5b0f56ce..042ea3d0 100644 --- a/optimum_benchmark/backends/diffusers_utils.py +++ b/optimum_benchmark/backends/diffusers_utils.py @@ -12,7 +12,7 @@ def get_diffusers_pretrained_config(model: str, **kwargs) -> Dict[str, int]: return diffusers.DiffusionPipeline.load_config(model, **kwargs) -def extract_diffusers_shapes_from_config(model: str, **kwargs) -> Dict[str, int]: +def extract_diffusers_shapes_from_model(model: str, **kwargs) -> Dict[str, int]: config = diffusers.DiffusionPipeline.load_config(model, **kwargs) shapes = {} diff --git a/optimum_benchmark/backends/onnxruntime/backend.py b/optimum_benchmark/backends/onnxruntime/backend.py index 0d2fc857..33e2694a 100644 --- a/optimum_benchmark/backends/onnxruntime/backend.py +++ b/optimum_benchmark/backends/onnxruntime/backend.py @@ -332,13 +332,14 @@ def prepare_for_inference(self, **kwargs) -> None: def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: if self.config.library == "diffusers": - return {"prompt": inputs["prompt"]} + return inputs LOGGER.info(f"\t+ Moving inputs tensors to device {self.config.device}") for key, value in list(inputs.items()): if key in self.inputs_names: inputs[key] = value.to(self.config.device) else: + LOGGER.warning(f"Input {key} is not in expected inputs names. Removing it.") inputs.pop(key) return inputs diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py index 6835617a..ea2ffde6 100644 --- a/optimum_benchmark/backends/transformers_utils.py +++ b/optimum_benchmark/backends/transformers_utils.py @@ -42,7 +42,7 @@ def get_transformers_pre_processor(model: str, **kwargs) -> Optional["Pretrained try: # sometimes contains information about the model's input shapes that are not available in the config return AutoProcessor.from_pretrained(model, **kwargs) - except ValueError: + except Exception: return None diff --git a/optimum_benchmark/benchmarks/inference/benchmark.py b/optimum_benchmark/benchmarks/inference/benchmark.py index 07c4f9ee..d9420b0b 100644 --- a/optimum_benchmark/benchmarks/inference/benchmark.py +++ b/optimum_benchmark/benchmarks/inference/benchmark.py @@ -11,6 +11,7 @@ from ..base import Benchmark from ..report import BenchmarkMeasurements, BenchmarkReport from .config import InferenceConfig +from .inputs_utils import extract_text_generation_inputs if is_torch_distributed_available(): import torch.distributed @@ -72,8 +73,6 @@ def run(self, backend: Backend[BackendConfigT]) -> None: "The batch size must be divisible by the number of processes in a distributed environment" ) self.config.input_shapes["batch_size"] //= torch.distributed.get_world_size() - if backend.config.device == "cuda" and backend.config.task in TEXT_GENERATION_TASKS: - TEXT_GENERATION_TASKS["synced_gpus"] = True LOGGER.info("\t+ Creating input generator") self.input_generator = InputGenerator( @@ -81,28 +80,28 @@ def run(self, backend: Backend[BackendConfigT]) -> None: ) if backend.config.task in TEXT_GENERATION_TASKS: - LOGGER.info("\t+ Generating and preparing Text Generation input") - self.forward_inputs = self.input_generator(mode="forward") - self.generate_input = self.input_generator(mode="generate") + LOGGER.info("\t+ Generating and preparing Text Generation inputs") + self.forward_inputs = self.input_generator() self.forward_inputs = backend.prepare_inputs(self.forward_inputs) - self.generate_input = backend.prepare_inputs(self.generate_input) + self.generate_inputs = extract_text_generation_inputs(self.forward_inputs) LOGGER.info("\t+ Updating Text Generation kwargs with default values") self.config.generate_kwargs = {**TEXT_GENERATION_KWARGS, **self.config.generate_kwargs} LOGGER.info("\t+ Initializing Text Generation report") self.report = TextGenerationReport(prefill=BenchmarkMeasurements(), decode=BenchmarkMeasurements()) elif backend.config.task in IMAGE_DIFFUSION_TASKS: - LOGGER.info("\t+ Generating and preparing Image Diffusion input") - self.diffuse_input = self.input_generator(mode="call") - self.diffuse_input = backend.prepare_inputs(self.diffuse_input) + LOGGER.info("\t+ Generating Image Diffusion inputs") + self.call_inputs = self.input_generator() + self.call_inputs = backend.prepare_inputs(self.call_inputs) + self.call_inputs = {"prompt": self.call_inputs["prompt"]} LOGGER.info("\t+ Updating Image Diffusion kwargs with default values") - self.config.forward_kwargs = {**IMAGE_DIFFUSION_KWARGS, **self.config.forward_kwargs} + self.config.call_kwargs = {**IMAGE_DIFFUSION_KWARGS, **self.config.call_kwargs} LOGGER.info("\t+ Initializing Image Diffusion report") self.report = ImageDiffusionReport(call=BenchmarkMeasurements()) else: - LOGGER.info("\t+ Generating and preparing Inference input") - self.forward_inputs = self.input_generator(mode="forward") + LOGGER.info("\t+ Generating and preparing Inference inputs") + self.forward_inputs = self.input_generator() self.forward_inputs = backend.prepare_inputs(self.forward_inputs) LOGGER.info("\t+ Initializing Inference report") self.report = InferenceReport(forward=BenchmarkMeasurements()) @@ -111,16 +110,17 @@ def run(self, backend: Backend[BackendConfigT]) -> None: backend.prepare_for_inference( **backend.model_shapes, **self.config.input_shapes, - **self.config.forward_kwargs, **self.config.generate_kwargs, + **self.config.forward_kwargs, + **self.config.call_kwargs, ) LOGGER.info("\t+ Warming up backend for Inference") for _ in range(self.config.warmup_runs): if backend.config.task in TEXT_GENERATION_TASKS: - _ = backend.generate(self.generate_input, {"max_new_tokens": 2, "min_new_tokens": 2}) + _ = backend.generate(self.generate_inputs, {"max_new_tokens": 2, "min_new_tokens": 2}) elif backend.config.task in IMAGE_DIFFUSION_TASKS: - _ = backend.call(self.diffuse_input, {"num_inference_steps": 2}) + _ = backend.call(self.call_inputs, {"num_inference_steps": 2}) else: _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) @@ -164,8 +164,6 @@ def run(self, backend: Backend[BackendConfigT]) -> None: self.report.log_energy() self.report.log_efficiency() - self.report.log() - ## Memory tracking def run_text_generation_memory_tracking(self, backend: Backend): LOGGER.info("\t+ Running memory tracking") @@ -177,7 +175,7 @@ def run_text_generation_memory_tracking(self, backend: Backend): self.memory_tracker.reset() with self.memory_tracker.track(): - _ = backend.generate(self.generate_input, self.config.generate_kwargs) + _ = backend.generate(self.generate_inputs, self.config.generate_kwargs) self.report.decode.memory = self.memory_tracker.get_max_memory() @@ -185,7 +183,7 @@ def run_image_diffusion_memory_tracking(self, backend: Backend): LOGGER.info("\t+ Running memory tracking") self.memory_tracker.reset() with self.memory_tracker.track(): - _ = backend.call(self.diffuse_input, self.config.forward_kwargs) + _ = backend.call(self.call_inputs, self.config.call_kwargs) self.report.call.memory = self.memory_tracker.get_max_memory() @@ -205,7 +203,9 @@ def run_text_generation_latency_tracking(self, backend: Backend): with self.latency_tracker.track(): _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) - self.report.prefill.latency = self.latency_tracker.get_latency() + forward_latency = self.latency_tracker.get_latency() + forward_latency.log(prefix="forward") + self.report.prefill.latency = forward_latency self.report.prefill.throughput = self.latency_tracker.get_throughput( volume=self.prefill_volume, unit=PREFILL_THROUGHPUT_UNIT ) @@ -213,9 +213,11 @@ def run_text_generation_latency_tracking(self, backend: Backend): self.latency_tracker.reset() while self.latency_tracker.get_elapsed_time() < self.config.duration: with self.latency_tracker.track(): - _ = backend.generate(self.generate_input, self.config.generate_kwargs) + _ = backend.generate(self.generate_inputs, self.config.generate_kwargs) - self.report.decode.latency = self.latency_tracker.get_latency() - self.report.prefill.latency.mean + generate_latency = self.latency_tracker.get_latency() + generate_latency.log(prefix="generate") + self.report.decode.latency = generate_latency - self.report.prefill.latency.mean self.report.decode.throughput = Throughput.from_latency( self.report.decode.latency, self.decode_volume, unit=DECODE_THROUGHPUT_UNIT ) @@ -225,7 +227,7 @@ def run_image_diffusion_latency_tracking(self, backend: Backend): self.latency_tracker.reset() while self.latency_tracker.get_elapsed_time() < self.config.duration: with self.latency_tracker.track(): - _ = backend.call(self.diffuse_input, self.config.forward_kwargs) + _ = backend.call(self.call_inputs, self.config.call_kwargs) self.report.call.latency = self.latency_tracker.get_latency() self.report.call.throughput = Throughput.from_latency( @@ -258,7 +260,7 @@ def run_text_generation_energy_tracking(self, backend: Backend): self.energy_tracker.reset() with self.energy_tracker.track(): - _ = backend.generate(self.generate_input, self.config.generate_kwargs) + _ = backend.generate(self.generate_inputs, self.config.generate_kwargs) self.report.decode.energy = self.energy_tracker.get_energy() - self.report.prefill.energy self.report.decode.efficiency = Efficiency.from_energy( @@ -269,7 +271,7 @@ def run_image_diffusion_energy_tracking(self, backend: Backend): LOGGER.info("\t+ Running energy tracking") self.energy_tracker.reset() with self.energy_tracker.track(): - _ = backend.call(self.diffuse_input, self.config.forward_kwargs) + _ = backend.call(self.call_inputs, self.config.call_kwargs) self.report.call.energy = self.energy_tracker.get_energy() self.report.call.efficiency = Efficiency.from_energy( @@ -297,7 +299,7 @@ def prefill_volume(self) -> int: # in tokens @property def call_volume(self) -> int: # in images - return self.config.input_shapes["batch_size"] * self.config.forward_kwargs["num_images_per_prompt"] + return self.config.input_shapes["batch_size"] * self.config.call_kwargs["num_images_per_prompt"] @property def decode_volume(self) -> int: # in tokens diff --git a/optimum_benchmark/benchmarks/inference/inputs_utils.py b/optimum_benchmark/benchmarks/inference/inputs_utils.py new file mode 100644 index 00000000..f4dc5bd1 --- /dev/null +++ b/optimum_benchmark/benchmarks/inference/inputs_utils.py @@ -0,0 +1,17 @@ +def extract_text_generation_inputs(inputs): + if "pixel_values" in inputs: + # image input + text_generation_inputs = {"inputs": inputs["pixel_values"]} + elif "input_values" in inputs: + # speech input + text_generation_inputs = {"inputs": inputs["input_values"]} + elif "input_features" in inputs: + # waveform input + text_generation_inputs = {"inputs": inputs["input_features"]} + elif "input_ids" in inputs: + # text input + text_generation_inputs = {"inputs": inputs["input_ids"]} + else: + raise ValueError("Could not find any valid text generation inputs.") + + return text_generation_inputs diff --git a/optimum_benchmark/generators/input_generator.py b/optimum_benchmark/generators/input_generator.py index 0dfc3050..8c09802b 100644 --- a/optimum_benchmark/generators/input_generator.py +++ b/optimum_benchmark/generators/input_generator.py @@ -22,23 +22,6 @@ def __init__(self, task: str, input_shapes: Dict[str, int], model_shapes: Dict[s "please submit a PR or a feature request to optimum-benchmark. " ) - def __call__(self, mode: str) -> Dict[str, Any]: + def __call__(self) -> Dict[str, Any]: task_input = self.task_generator() - - if mode == "generate": - if "pixel_values" in task_input: - # image input - task_input = {"inputs": task_input["pixel_values"]} - elif "input_values" in task_input: - # speech input - task_input = {"inputs": task_input["input_values"]} - elif "input_features" in task_input: - # waveform input - task_input = {"inputs": task_input["input_features"]} - elif "input_ids" in task_input: - # text input - task_input = {"inputs": task_input["input_ids"]} - elif mode == "call": - task_input = {"prompt": task_input["prompt"]} - return task_input diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py index e076875f..e7a10f6d 100644 --- a/optimum_benchmark/trackers/latency.py +++ b/optimum_benchmark/trackers/latency.py @@ -140,14 +140,13 @@ def _cpu_latency(self): self.end_events.append(end) def get_elapsed_time(self) -> float: - # we measured in cpu to not synchronize all events + # we measure it in cpu to not synchronize all events return time.perf_counter() - self.start_time def get_latency(self) -> Latency: if self.backend == "pytorch" and self.device == "cuda": - # synchronize the last event to make sure it has been recorded - self.start_events[-1].synchronize() - self.end_events[-1].synchronize() + # synchronize the device to make sure all events have been recorded + torch.cuda.synchronize() latencies_list = [ self.start_events[i].elapsed_time(self.end_events[i]) / 1e3 for i in range(len(self.start_events)) @@ -210,12 +209,7 @@ def __init__(self, device: str, backend: str): self.reset() def reset(self): - if self.device == "cuda" and self.backend == "pytorch": - event = torch.cuda.Event(enable_timing=True) - event.record() - self.events = [event] - else: - self.events = [time.perf_counter()] + self.events: List[Union[float, torch.cuda.Event]] = [] def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): if self.device == "cuda" and self.backend == "pytorch": diff --git a/pyproject.toml b/pyproject.toml index 58e5b284..1f9b5b08 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,7 @@ -# [tool.isort] -# profile = "ruff" -# lines_after_imports = 2 -# known_first_party = "optimum_benchmark" - [tool.ruff] line-length = 120 -ignore = ["C901", "E501", "E741", "W605"] -select = ["C", "E", "F", "I", "W", "I001"] +lint.ignore = ["C901", "E501"] +lint.select = ["C", "E", "F", "I", "W", "I001"] [tool.ruff.format] line-ending = "auto" diff --git a/tests/configs/_bert_sweep_.yaml b/tests/configs/_bert_sweep_.yaml index c4986d0d..f618a34f 100644 --- a/tests/configs/_bert_sweep_.yaml +++ b/tests/configs/_bert_sweep_.yaml @@ -1,5 +1,5 @@ hydra: sweeper: params: - backend.model: hf-internal-testing/tiny-random-bert backend.task: fill-mask,text-classification,token-classification,question-answering + backend.model: hf-internal-testing/tiny-random-bert,hf-internal-testing/tiny-random-roberta diff --git a/tests/configs/_diffusers_.yaml b/tests/configs/_diffusers_.yaml index 5429fdc5..0c836c16 100644 --- a/tests/configs/_diffusers_.yaml +++ b/tests/configs/_diffusers_.yaml @@ -4,5 +4,5 @@ backend: model: hf-internal-testing/tiny-stable-diffusion-torch benchmark: - forward_kwargs: + call_kwargs: num_inference_steps: 2 diff --git a/tests/test_api.py b/tests/test_api.py index b076bb54..19d95021 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -5,6 +5,10 @@ import pytest import torch +from optimum_benchmark.backends.diffusers_utils import ( + extract_diffusers_shapes_from_model, + get_diffusers_pretrained_config, +) from optimum_benchmark.backends.pytorch.config import PyTorchConfig from optimum_benchmark.backends.timm_utils import extract_timm_shapes_from_config, get_timm_pretrained_config from optimum_benchmark.backends.transformers_utils import ( @@ -19,7 +23,6 @@ from optimum_benchmark.launchers.inline.config import InlineConfig from optimum_benchmark.launchers.process.config import ProcessConfig from optimum_benchmark.launchers.torchrun.config import TorchrunConfig -from optimum_benchmark.task_utils import IMAGE_DIFFUSION_TASKS, TEXT_GENERATION_TASKS from optimum_benchmark.trackers.latency import LatencyTracker from optimum_benchmark.trackers.memory import MemoryTracker @@ -112,18 +115,17 @@ def test_api_input_generator(library, task, model): elif library == "timm": model_config = get_timm_pretrained_config(model) model_shapes = extract_timm_shapes_from_config(model_config) - else: - raise ValueError(f"Unknown library {library}") + elif library == "diffusers": + model_config = get_diffusers_pretrained_config(model) + model_shapes = extract_diffusers_shapes_from_model(model) - generator = InputGenerator(task=task, input_shapes=INPUT_SHAPES, model_shapes=model_shapes) + input_generator = InputGenerator(task=task, input_shapes=INPUT_SHAPES, model_shapes=model_shapes) + generated_inputs = input_generator() - if task in TEXT_GENERATION_TASKS: - _ = generator(mode="forward") - _ = generator(mode="generate") - elif task in IMAGE_DIFFUSION_TASKS: - _ = generator(mode="call") - else: - _ = generator(mode="forward") + assert len(generated_inputs) > 0, "No inputs were generated" + + for key in generated_inputs: + assert len(generated_inputs[key]) == INPUT_SHAPES["batch_size"], "Incorrect batch size" @pytest.mark.parametrize("library,task,model", LIBRARIES_TASKS_MODELS)