Skip to content

Commit

Permalink
Fix ort inputs filtering (#129)
Browse files Browse the repository at this point in the history
  • Loading branch information
IlyasMoutawwakil authored Feb 20, 2024
1 parent 06dab18 commit 924a4c7
Show file tree
Hide file tree
Showing 14 changed files with 85 additions and 84 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ data/
version.txt

.engine/
actions-runner-duplicate/
actions-runner/
experiments/
amdsmi/
16 changes: 11 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
# List of targets that are not associated with files
.PHONY: quality style install \
build_docker_cpu, build_docker_cuda, build_docker_rocm, \
test_cli_cpu_pytorch, test_cli_rocm_pytorch, \
test_cli_cpu_neural_compressor, test_cli_cpu_onnxruntime, test_cli_cpu_openvino, \
test_api_cpu, test_api_cuda, test_api_rocm, test_api_misc
.PHONY: quality style install build_docker_cpu build_docker_cuda build_docker_rocm test_cli_cpu_neural_compressor test_cli_cpu_onnxruntime test_cli_cpu_openvino test_cli_cpu_pytorch test_cli_rocm_pytorch test_cli_cuda_pytorch test_api_cpu test_api_cuda test_api_rocm test_api_misc

quality:
ruff check .
Expand All @@ -28,6 +24,7 @@ build_docker_rocm:
test_cli_cpu_neural_compressor:
docker run \
--rm \
--pid=host \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
--workdir /workspace \
Expand All @@ -36,6 +33,7 @@ test_cli_cpu_neural_compressor:
test_cli_cpu_onnxruntime:
docker run \
--rm \
--pid=host \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
--workdir /workspace \
Expand All @@ -44,6 +42,7 @@ test_cli_cpu_onnxruntime:
test_cli_cpu_openvino:
docker run \
--rm \
--pid=host \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
--workdir /workspace \
Expand All @@ -52,6 +51,7 @@ test_cli_cpu_openvino:
test_cli_cpu_pytorch:
docker run \
--rm \
--pid=host \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
--workdir /workspace \
Expand All @@ -60,6 +60,7 @@ test_cli_cpu_pytorch:
test_cli_rocm_pytorch:
docker run \
--rm \
--pid=host \
--device=/dev/kfd \
--device /dev/dri/renderD128 \
--device /dev/dri/renderD129 \
Expand All @@ -72,6 +73,7 @@ test_cli_rocm_pytorch:
test_cli_cuda_pytorch:
docker run \
--rm \
--pid=host \
--gpus '"device=0,1"' \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
Expand All @@ -81,6 +83,7 @@ test_cli_cuda_pytorch:
test_api_cpu:
docker run \
--rm \
--pid=host \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
--workdir /workspace \
Expand All @@ -89,6 +92,7 @@ test_api_cpu:
test_api_cuda:
docker run \
--rm \
--pid=host \
--gpus '"device=0,1"' \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
Expand All @@ -98,6 +102,7 @@ test_api_cuda:
test_api_rocm:
docker run \
--rm \
--pid=host \
--device=/dev/kfd \
--device /dev/dri/renderD128 \
--device /dev/dri/renderD129 \
Expand All @@ -110,6 +115,7 @@ test_api_rocm:
test_api_misc:
docker run \
--rm \
--pid=host \
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
--workdir /workspace \
Expand Down
4 changes: 2 additions & 2 deletions optimum_benchmark/backends/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from ..task_utils import get_automodel_class_for_task
from .config import BackendConfigT
from .diffusers_utils import extract_diffusers_shapes_from_config, get_diffusers_pretrained_config
from .diffusers_utils import extract_diffusers_shapes_from_model, get_diffusers_pretrained_config
from .timm_utils import extract_timm_shapes_from_config, get_timm_pre_processor, get_timm_pretrained_config
from .transformers_utils import (
PretrainedProcessor,
Expand Down Expand Up @@ -41,7 +41,7 @@ def __init__(self, config: BackendConfigT):

if self.config.library == "diffusers":
self.pretrained_config = get_diffusers_pretrained_config(self.config.model, **self.config.hub_kwargs)
self.model_shapes = extract_diffusers_shapes_from_config(self.config.model, **self.config.hub_kwargs)
self.model_shapes = extract_diffusers_shapes_from_model(self.config.model, **self.config.hub_kwargs)
self.model_type = self.config.task
self.generation_config = None
self.pre_processor = None
Expand Down
2 changes: 1 addition & 1 deletion optimum_benchmark/backends/diffusers_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def get_diffusers_pretrained_config(model: str, **kwargs) -> Dict[str, int]:
return diffusers.DiffusionPipeline.load_config(model, **kwargs)


def extract_diffusers_shapes_from_config(model: str, **kwargs) -> Dict[str, int]:
def extract_diffusers_shapes_from_model(model: str, **kwargs) -> Dict[str, int]:
config = diffusers.DiffusionPipeline.load_config(model, **kwargs)

shapes = {}
Expand Down
3 changes: 2 additions & 1 deletion optimum_benchmark/backends/onnxruntime/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,13 +332,14 @@ def prepare_for_inference(self, **kwargs) -> None:

def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
if self.config.library == "diffusers":
return {"prompt": inputs["prompt"]}
return inputs

LOGGER.info(f"\t+ Moving inputs tensors to device {self.config.device}")
for key, value in list(inputs.items()):
if key in self.inputs_names:
inputs[key] = value.to(self.config.device)
else:
LOGGER.warning(f"Input {key} is not in expected inputs names. Removing it.")
inputs.pop(key)

return inputs
Expand Down
2 changes: 1 addition & 1 deletion optimum_benchmark/backends/transformers_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def get_transformers_pre_processor(model: str, **kwargs) -> Optional["Pretrained
try:
# sometimes contains information about the model's input shapes that are not available in the config
return AutoProcessor.from_pretrained(model, **kwargs)
except ValueError:
except Exception:
return None


Expand Down
54 changes: 28 additions & 26 deletions optimum_benchmark/benchmarks/inference/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from ..base import Benchmark
from ..report import BenchmarkMeasurements, BenchmarkReport
from .config import InferenceConfig
from .inputs_utils import extract_text_generation_inputs

if is_torch_distributed_available():
import torch.distributed
Expand Down Expand Up @@ -72,37 +73,35 @@ def run(self, backend: Backend[BackendConfigT]) -> None:
"The batch size must be divisible by the number of processes in a distributed environment"
)
self.config.input_shapes["batch_size"] //= torch.distributed.get_world_size()
if backend.config.device == "cuda" and backend.config.task in TEXT_GENERATION_TASKS:
TEXT_GENERATION_TASKS["synced_gpus"] = True

LOGGER.info("\t+ Creating input generator")
self.input_generator = InputGenerator(
task=backend.config.task, model_shapes=backend.model_shapes, input_shapes=self.config.input_shapes
)

if backend.config.task in TEXT_GENERATION_TASKS:
LOGGER.info("\t+ Generating and preparing Text Generation input")
self.forward_inputs = self.input_generator(mode="forward")
self.generate_input = self.input_generator(mode="generate")
LOGGER.info("\t+ Generating and preparing Text Generation inputs")
self.forward_inputs = self.input_generator()
self.forward_inputs = backend.prepare_inputs(self.forward_inputs)
self.generate_input = backend.prepare_inputs(self.generate_input)
self.generate_inputs = extract_text_generation_inputs(self.forward_inputs)
LOGGER.info("\t+ Updating Text Generation kwargs with default values")
self.config.generate_kwargs = {**TEXT_GENERATION_KWARGS, **self.config.generate_kwargs}
LOGGER.info("\t+ Initializing Text Generation report")
self.report = TextGenerationReport(prefill=BenchmarkMeasurements(), decode=BenchmarkMeasurements())

elif backend.config.task in IMAGE_DIFFUSION_TASKS:
LOGGER.info("\t+ Generating and preparing Image Diffusion input")
self.diffuse_input = self.input_generator(mode="call")
self.diffuse_input = backend.prepare_inputs(self.diffuse_input)
LOGGER.info("\t+ Generating Image Diffusion inputs")
self.call_inputs = self.input_generator()
self.call_inputs = backend.prepare_inputs(self.call_inputs)
self.call_inputs = {"prompt": self.call_inputs["prompt"]}
LOGGER.info("\t+ Updating Image Diffusion kwargs with default values")
self.config.forward_kwargs = {**IMAGE_DIFFUSION_KWARGS, **self.config.forward_kwargs}
self.config.call_kwargs = {**IMAGE_DIFFUSION_KWARGS, **self.config.call_kwargs}
LOGGER.info("\t+ Initializing Image Diffusion report")
self.report = ImageDiffusionReport(call=BenchmarkMeasurements())

else:
LOGGER.info("\t+ Generating and preparing Inference input")
self.forward_inputs = self.input_generator(mode="forward")
LOGGER.info("\t+ Generating and preparing Inference inputs")
self.forward_inputs = self.input_generator()
self.forward_inputs = backend.prepare_inputs(self.forward_inputs)
LOGGER.info("\t+ Initializing Inference report")
self.report = InferenceReport(forward=BenchmarkMeasurements())
Expand All @@ -111,16 +110,17 @@ def run(self, backend: Backend[BackendConfigT]) -> None:
backend.prepare_for_inference(
**backend.model_shapes,
**self.config.input_shapes,
**self.config.forward_kwargs,
**self.config.generate_kwargs,
**self.config.forward_kwargs,
**self.config.call_kwargs,
)

LOGGER.info("\t+ Warming up backend for Inference")
for _ in range(self.config.warmup_runs):
if backend.config.task in TEXT_GENERATION_TASKS:
_ = backend.generate(self.generate_input, {"max_new_tokens": 2, "min_new_tokens": 2})
_ = backend.generate(self.generate_inputs, {"max_new_tokens": 2, "min_new_tokens": 2})
elif backend.config.task in IMAGE_DIFFUSION_TASKS:
_ = backend.call(self.diffuse_input, {"num_inference_steps": 2})
_ = backend.call(self.call_inputs, {"num_inference_steps": 2})
else:
_ = backend.forward(self.forward_inputs, self.config.forward_kwargs)

Expand Down Expand Up @@ -164,8 +164,6 @@ def run(self, backend: Backend[BackendConfigT]) -> None:
self.report.log_energy()
self.report.log_efficiency()

self.report.log()

## Memory tracking
def run_text_generation_memory_tracking(self, backend: Backend):
LOGGER.info("\t+ Running memory tracking")
Expand All @@ -177,15 +175,15 @@ def run_text_generation_memory_tracking(self, backend: Backend):

self.memory_tracker.reset()
with self.memory_tracker.track():
_ = backend.generate(self.generate_input, self.config.generate_kwargs)
_ = backend.generate(self.generate_inputs, self.config.generate_kwargs)

self.report.decode.memory = self.memory_tracker.get_max_memory()

def run_image_diffusion_memory_tracking(self, backend: Backend):
LOGGER.info("\t+ Running memory tracking")
self.memory_tracker.reset()
with self.memory_tracker.track():
_ = backend.call(self.diffuse_input, self.config.forward_kwargs)
_ = backend.call(self.call_inputs, self.config.call_kwargs)

self.report.call.memory = self.memory_tracker.get_max_memory()

Expand All @@ -205,17 +203,21 @@ def run_text_generation_latency_tracking(self, backend: Backend):
with self.latency_tracker.track():
_ = backend.forward(self.forward_inputs, self.config.forward_kwargs)

self.report.prefill.latency = self.latency_tracker.get_latency()
forward_latency = self.latency_tracker.get_latency()
forward_latency.log(prefix="forward")
self.report.prefill.latency = forward_latency
self.report.prefill.throughput = self.latency_tracker.get_throughput(
volume=self.prefill_volume, unit=PREFILL_THROUGHPUT_UNIT
)

self.latency_tracker.reset()
while self.latency_tracker.get_elapsed_time() < self.config.duration:
with self.latency_tracker.track():
_ = backend.generate(self.generate_input, self.config.generate_kwargs)
_ = backend.generate(self.generate_inputs, self.config.generate_kwargs)

self.report.decode.latency = self.latency_tracker.get_latency() - self.report.prefill.latency.mean
generate_latency = self.latency_tracker.get_latency()
generate_latency.log(prefix="generate")
self.report.decode.latency = generate_latency - self.report.prefill.latency.mean
self.report.decode.throughput = Throughput.from_latency(
self.report.decode.latency, self.decode_volume, unit=DECODE_THROUGHPUT_UNIT
)
Expand All @@ -225,7 +227,7 @@ def run_image_diffusion_latency_tracking(self, backend: Backend):
self.latency_tracker.reset()
while self.latency_tracker.get_elapsed_time() < self.config.duration:
with self.latency_tracker.track():
_ = backend.call(self.diffuse_input, self.config.forward_kwargs)
_ = backend.call(self.call_inputs, self.config.call_kwargs)

self.report.call.latency = self.latency_tracker.get_latency()
self.report.call.throughput = Throughput.from_latency(
Expand Down Expand Up @@ -258,7 +260,7 @@ def run_text_generation_energy_tracking(self, backend: Backend):

self.energy_tracker.reset()
with self.energy_tracker.track():
_ = backend.generate(self.generate_input, self.config.generate_kwargs)
_ = backend.generate(self.generate_inputs, self.config.generate_kwargs)

self.report.decode.energy = self.energy_tracker.get_energy() - self.report.prefill.energy
self.report.decode.efficiency = Efficiency.from_energy(
Expand All @@ -269,7 +271,7 @@ def run_image_diffusion_energy_tracking(self, backend: Backend):
LOGGER.info("\t+ Running energy tracking")
self.energy_tracker.reset()
with self.energy_tracker.track():
_ = backend.call(self.diffuse_input, self.config.forward_kwargs)
_ = backend.call(self.call_inputs, self.config.call_kwargs)

self.report.call.energy = self.energy_tracker.get_energy()
self.report.call.efficiency = Efficiency.from_energy(
Expand Down Expand Up @@ -297,7 +299,7 @@ def prefill_volume(self) -> int: # in tokens

@property
def call_volume(self) -> int: # in images
return self.config.input_shapes["batch_size"] * self.config.forward_kwargs["num_images_per_prompt"]
return self.config.input_shapes["batch_size"] * self.config.call_kwargs["num_images_per_prompt"]

@property
def decode_volume(self) -> int: # in tokens
Expand Down
17 changes: 17 additions & 0 deletions optimum_benchmark/benchmarks/inference/inputs_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
def extract_text_generation_inputs(inputs):
    """Reduce a full generated-inputs dict to the single tensor `generate` expects.

    Scans *inputs* for the first recognized modality key, in priority order
    (vision, then audio variants, then text), and returns it under the
    generic ``"inputs"`` key that ``generate``-style APIs accept.

    Args:
        inputs: mapping of model input names to tensors/arrays.

    Returns:
        dict with a single ``"inputs"`` entry holding the selected value.

    Raises:
        ValueError: if none of the recognized input keys is present.
    """
    # Priority order matters: a multimodal input dict may carry several of
    # these keys at once, and the first match wins.
    for candidate_key in ("pixel_values", "input_values", "input_features", "input_ids"):
        if candidate_key in inputs:
            return {"inputs": inputs[candidate_key]}

    raise ValueError("Could not find any valid text generation inputs.")
19 changes: 1 addition & 18 deletions optimum_benchmark/generators/input_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,23 +22,6 @@ def __init__(self, task: str, input_shapes: Dict[str, int], model_shapes: Dict[s
"please submit a PR or a feature request to optimum-benchmark. "
)

def __call__(self, mode: str) -> Dict[str, Any]:
def __call__(self) -> Dict[str, Any]:
task_input = self.task_generator()

if mode == "generate":
if "pixel_values" in task_input:
# image input
task_input = {"inputs": task_input["pixel_values"]}
elif "input_values" in task_input:
# speech input
task_input = {"inputs": task_input["input_values"]}
elif "input_features" in task_input:
# waveform input
task_input = {"inputs": task_input["input_features"]}
elif "input_ids" in task_input:
# text input
task_input = {"inputs": task_input["input_ids"]}
elif mode == "call":
task_input = {"prompt": task_input["prompt"]}

return task_input
14 changes: 4 additions & 10 deletions optimum_benchmark/trackers/latency.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,14 +140,13 @@ def _cpu_latency(self):
self.end_events.append(end)

def get_elapsed_time(self) -> float:
# we measured in cpu to not synchronize all events
# we measure it in cpu to not synchronize all events
return time.perf_counter() - self.start_time

def get_latency(self) -> Latency:
if self.backend == "pytorch" and self.device == "cuda":
# synchronize the last event to make sure it has been recorded
self.start_events[-1].synchronize()
self.end_events[-1].synchronize()
# synchronize the device to make sure all events have been recorded
torch.cuda.synchronize()

latencies_list = [
self.start_events[i].elapsed_time(self.end_events[i]) / 1e3 for i in range(len(self.start_events))
Expand Down Expand Up @@ -210,12 +209,7 @@ def __init__(self, device: str, backend: str):
self.reset()

def reset(self):
if self.device == "cuda" and self.backend == "pytorch":
event = torch.cuda.Event(enable_timing=True)
event.record()
self.events = [event]
else:
self.events = [time.perf_counter()]
self.events: List[Union[float, torch.cuda.Event]] = []

def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
if self.device == "cuda" and self.backend == "pytorch":
Expand Down
Loading

0 comments on commit 924a4c7

Please sign in to comment.