Implement baselines as a fixture and with simple rebase support #1732

Merged (10 commits) on Feb 7, 2025
conftest.py (90 additions, 6 deletions)
@@ -1,3 +1,76 @@
import json
import logging
from pathlib import Path

import pytest


BASELINE_DIRECTORY = Path(__file__).parent.resolve() / Path("tests") / Path("baselines") / Path("fixture")


def walk_path(path: Path):
    """
    Taken from https://stackoverflow.com/a/76236680

    Path.walk() is not available until python 3.12
    """
    subdirs = [d for d in path.iterdir() if d.is_dir()]
    files = [f for f in path.iterdir() if f.is_file()]
    yield path, subdirs, files
    for s in subdirs:
        yield from walk_path(s)


class Baseline:
    def __init__(self, session):
        self.rebase = session.config.option.rebase
        self.references = {}

        if BASELINE_DIRECTORY.exists():
            for root, dirs, files in walk_path(BASELINE_DIRECTORY):
                for name in files:
                    with (root / name).open() as f:
                        self.references.update(json.load(f))

    def get_reference(self, addr, context=[]):
        reference = self.references.setdefault(addr, {})
        for c in context:
            reference = reference.setdefault(c, {})
        return reference

    def finalize(self):
        if self.rebase:
            # aggregate refs by test file
            refsbyfile = {}
            for case, ref in self.references.items():
                key = case.split("::")[0]
                reffile = BASELINE_DIRECTORY / Path(key).with_suffix(".json")
                refsbyfile.setdefault(reffile, {})[case] = ref

            # dump aggregated refs into their own files
            for reffile, refs in refsbyfile.items():
                reffile.parent.mkdir(parents=True, exist_ok=True)
                with reffile.open("w+") as f:
                    json.dump(refs, f, indent=2, sort_keys=True)


class BaselineRequest:
    def __init__(self, request):
        self.baseline = request.session.stash["baseline"]
        self.addr = request.node.nodeid

    def assertRef(self, compare, context=[], **kwargs):
        reference = self.baseline.get_reference(self.addr, context)
        if self.baseline.rebase:
            reference.update(**kwargs)

        for key, actual in kwargs.items():
            ref = reference.get(key, None)
            logging.getLogger().info(f"{'.'.join(context + [key])}:actual = {actual}")
            logging.getLogger().info(f"{'.'.join(context + [key])}:ref = {ref}")
            assert compare(actual, ref)


class Secret:
    """
    Taken from: https://stackoverflow.com/a/67393351
@@ -15,11 +88,22 @@ def __str___(self):

def pytest_addoption(parser):
    parser.addoption("--token", action="store", default=None)
    parser.addoption("--rebase", action="store_true", help="rebase baseline references from current run")


@pytest.fixture
def token(request):
    return Secret(request.config.option.token)


def pytest_sessionstart(session):
    session.stash["baseline"] = Baseline(session)


def pytest_sessionfinish(session):
    session.stash["baseline"].finalize()


def pytest_generate_tests(metafunc):
    # This is called for every test. Only get/set command line arguments
    # if the argument is specified in the list of test "fixturenames".
    option_value = Secret(metafunc.config.option.token)
    if "token" in metafunc.fixturenames:
        metafunc.parametrize("token", [option_value])
@pytest.fixture
def baseline(request):
    return BaselineRequest(request)
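
For reference, a minimal sketch (not part of this PR) of how a test could consume the new baseline fixture. The test function, metric value, and use of operator.ge are illustrative assumptions; the "gaudi2" context key mirrors the baseline JSON files added below.

import operator


def test_throughput_example(baseline):
    # Hypothetical measurement; real tests derive this from a benchmark run.
    measured_throughput = 77.9

    # Compare the measured value against the stored reference for this test's node id,
    # nested under the "gaudi2" context. With `pytest --rebase`, the reference is first
    # overwritten with the measured value here and then written back at session finish
    # to tests/baselines/fixture/<path of the test file>.json.
    baseline.assertRef(compare=operator.ge, context=["gaudi2"], throughput=measured_throughput)

Running pytest with --rebase therefore regenerates the corresponding JSON baseline files, such as the ones added below, from the values observed in the current run.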
tests/baselines/fixture/tests/test_encoder_decoder.json (32 additions, 0 deletions)
@@ -0,0 +1,32 @@
{
"tests/test_encoder_decoder.py::TestEncoderDecoderModels::test_text_summarization_bf16[facebook/bart-large-cnn-Habana/bart-2-2]": {
"gaudi1": {
"predict_rougeLsum": 29.174,
"predict_samples_per_second": 2.304
},
"gaudi2": {
"predict_rougeLsum": 28.9801,
"predict_samples_per_second": 4.339
}
},
"tests/test_encoder_decoder.py::TestEncoderDecoderModels::test_text_summarization_bf16[t5-3b-Habana/t5-2-1]": {
"gaudi1": {
"predict_rougeLsum": 21.7286,
"predict_samples_per_second": 1.005
},
"gaudi2": {
"predict_rougeLsum": 21.8877,
"predict_samples_per_second": 3.848
}
},
"tests/test_encoder_decoder.py::TestEncoderDecoderModels::test_text_translation_bf16[t5-small-Habana/t5-2-1]": {
"gaudi1": {
"predict_bleu": 11.6126,
"predict_samples_per_second": 9.188
},
"gaudi2": {
"predict_bleu": 11.7277,
"predict_samples_per_second": 11.648
}
}
}
tests/baselines/fixture/tests/test_fp8_examples.json (8 additions, 0 deletions)
@@ -0,0 +1,8 @@
{
"tests/test_fp8_examples.py::test_fp8_train[mistralai/Mistral-7B-Instruct-v0.2-tatsu-lab/alpaca--language-modeling-8-8-run_lora_clm.py]": {
"gaudi2": {
"eval_accuracy": 0.7538,
"train_samples_per_second": 12.373
}
}
}
tests/baselines/fixture/tests/test_fsdp_examples.json (14 additions, 0 deletions)
@@ -0,0 +1,14 @@
{
"tests/test_fsdp_examples.py::test_fsdp_bf16[bert-base-uncased-Habana/bert-base-uncased-question-answering-24-8-run_qa.py-full_shard]": {
"gaudi2": {
"eval_f1": 85.7077,
"train_samples_per_second": 2983.533
}
},
"tests/test_fsdp_examples.py::test_fsdp_bf16[meta-llama/Llama-2-7b-hf--language-modeling-8-8-run_lora_clm.py-auto_wrap]": {
"gaudi2": {
"train_loss": 0.9093,
"train_samples_per_second": 85.016
}
}
}
tests/baselines/fixture/tests/test_image_to_text_example.json (94 additions, 0 deletions)
@@ -0,0 +1,94 @@
{
"tests/test_image_to_text_example.py::test_image_to_text_bf16[HuggingFaceM4/idefics2-8b-1]": {
"gaudi2": {
"throughput": 21.89944593215077
}
},
"tests/test_image_to_text_example.py::test_image_to_text_bf16[Qwen/Qwen2-VL-2B-Instruct-1]": {
"gaudi2": {
"throughput": 28.755882208438422
}
},
"tests/test_image_to_text_example.py::test_image_to_text_bf16[Qwen/Qwen2-VL-7B-Instruct-1]": {
"gaudi2": {
"throughput": 19.32562189532818
}
},
"tests/test_image_to_text_example.py::test_image_to_text_bf16[google/paligemma-3b-mix-224-1]": {
"gaudi2": {
"throughput": 132.8949150246155
}
},
"tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-1.5-13b-hf-1]": {
"gaudi1": {
"throughput": 16.704731010481538
},
"gaudi2": {
"throughput": 48.54364937033955
}
},
"tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-1.5-7b-hf-1]": {
"gaudi1": {
"throughput": 28.04096918512148
},
"gaudi2": {
"throughput": 77.98733740859008
}
},
"tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-v1.6-mistral-7b-hf-1]": {
"gaudi1": {
"throughput": 10.759228696741
},
"gaudi2": {
"throughput": 33.17984878151546
}
},
"tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-v1.6-vicuna-13b-hf-1]": {
"gaudi1": {
"throughput": 6.96732060769783
},
"gaudi2": {
"throughput": 23.527610042925
}
},
"tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-v1.6-vicuna-7b-hf-1]": {
"gaudi2": {
"throughput": 35.00608681379742
}
},
"tests/test_image_to_text_example.py::test_image_to_text_bf16[meta-llama/Llama-3.2-11B-Vision-Instruct-1]": {
"gaudi2": {
"throughput": 18.974541922240313
}
},
"tests/test_image_to_text_example.py::test_image_to_text_bf16[tiiuae/falcon-11B-vlm-1]": {
"gaudi2": {
"throughput": 23.69260849957278
}
},
"tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-1.5-13b-hf-1]": {
"gaudi2": {
"throughput": 67.20488222876344
}
},
"tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-1.5-7b-hf-1]": {
"gaudi2": {
"throughput": 98.72578382705062
}
},
"tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-v1.6-mistral-7b-hf-1]": {
"gaudi2": {
"throughput": 45.011551008367086
}
},
"tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-v1.6-vicuna-13b-hf-1]": {
"gaudi2": {
"throughput": 30.9535718774675
}
},
"tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-v1.6-vicuna-7b-hf-1]": {
"gaudi2": {
"throughput": 45.18544502949674
}
}
}
tests/baselines/fixture/tests/test_openclip_vqa.json (18 additions, 0 deletions)
@@ -0,0 +1,18 @@
{
"tests/test_openclip_vqa.py::test_openclip_vqa_bf16[laion/CLIP-ViT-g-14-laion2B-s12B-b42K]": {
"gaudi1": {
"throughput": 550
},
"gaudi2": {
"throughput": 1472
}
},
"tests/test_openclip_vqa.py::test_openclip_vqa_bf16[microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224]": {
"gaudi1": {
"throughput": 1200
},
"gaudi2": {
"throughput": 1816
}
}
}
tests/baselines/fixture/tests/test_pipeline.json (17 additions, 0 deletions)
@@ -0,0 +1,17 @@
{
"tests/test_pipeline.py::TestGaudiPipeline::test_image_to_text[Salesforce/blip-image-captioning-base-44]": {
"generated_text": "a soccer player is playing a game on the app"
},
"tests/test_pipeline.py::TestGaudiPipeline::test_image_to_text[nlpconnect/vit-gpt2-image-captioning-44]": {
"generated_text": "a soccer game with a player jumping to catch"
},
"tests/test_pipeline.py::TestGaudiPipeline::test_text_to_speech[facebook/hf-seamless-m4t-medium]": {
"sampling_rate": 16000
},
"tests/test_pipeline.py::TestGaudiPipeline::test_text_to_speech[facebook/mms-tts-eng]": {
"sampling_rate": 16000
},
"tests/test_pipeline.py::TestGaudiPipeline::test_text_to_speech[microsoft/speecht5_tts]": {
"sampling_rate": 16000
}
}