-
Notifications
You must be signed in to change notification settings - Fork 233
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
124 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
import json | ||
import os | ||
import re | ||
import subprocess | ||
from pathlib import Path | ||
from tempfile import TemporaryDirectory | ||
|
||
import pytest | ||
|
||
from .test_examples import ACCURACY_PERF_FACTOR, TIME_PERF_FACTOR | ||
|
||
|
||
# Each entry is a tuple of, in order:
#   (model_name, gaudi_config, baseline, baseline_acc, task,
#    bs_train, bs_eval, script, policy)
# matching the parameter names used by the parametrized test below.
if os.environ.get("GAUDI2_CI", "0") == "1":
    # Gaudi2 CI baselines
    MODELS_TO_TEST = {
        "bf16": [
            (
                "bert-base-uncased",
                "Habana/bert-base-uncased",
                2807,
                85.4688,
                "question-answering",
                24,
                8,
                "run_qa.py",
                "full_shard",
            ),
        ],
    }
else:
    # Gaudi1 CI baselines: the throughput baseline is the Gaudi2 one divided by 3
    MODELS_TO_TEST = {
        "bf16": [
            (
                "bert-base-uncased",
                "Habana/bert-base-uncased",
                2807 / 3.0,
                85.4688,
                "question-answering",
                24,
                8,
                "run_qa.py",
                "full_shard",
            ),
        ],
    }
|
||
|
||
def _test_fsdp(
    model_name: str,
    gaudi_config: str,
    baseline: float,
    baseline_acc: float,
    task: str,
    batch_size_train: int,
    batch_size_eval: int,
    script: str,
    policy: str,
    world_size: int = 8,
):
    """Run an example script with FSDP and check its throughput and accuracy.

    Installs the requirements of the ``task`` example, launches ``script``
    through ``gaudi_spawn.py --use_mpi`` with the given FSDP policy, then reads
    ``all_results.json`` from the run's temporary output directory and compares
    the reported metrics against the baselines.

    Args:
        model_name: Value passed as ``--model_name_or_path``.
        gaudi_config: Value passed as ``--gaudi_config_name``.
        baseline: Reference training throughput; the run must report a
            ``train_samples_per_second`` of at least
            ``(2 - TIME_PERF_FACTOR) * baseline``.
        baseline_acc: Reference score; the run must report an ``eval_f1`` of
            at least ``ACCURACY_PERF_FACTOR * baseline_acc``.
        task: Name of the example sub-directory that holds ``script``,
            ``requirements.txt`` and ``fsdp_config.json``.
        batch_size_train: Value passed as ``--per_device_train_batch_size``.
        batch_size_eval: Value passed as ``--per_device_eval_batch_size``.
        script: File name of the example script to run.
        policy: Value passed (wrapped in single quotes) as ``--fsdp``.
        world_size: Value passed as ``--world_size`` to ``gaudi_spawn.py``.

    Raises:
        AssertionError: If the requirements installation or the training run
            exits with a non-zero code, or if a baseline is not met.
    """
    path_to_example_dir = Path(__file__).resolve().parent.parent / "examples"

    # Install the example's requirements. The argument list is built directly
    # (instead of splitting a formatted string) so that a checkout path
    # containing whitespace still reaches pip as a single argument.
    requirements_file = path_to_example_dir / task / "requirements.txt"
    install = subprocess.run(["pip", "install", "-r", str(requirements_file)])
    assert install.returncode == 0

    # Each item is a "flag value" fragment; fragments are tokenized below.
    command = [
        "python3",
        f"{path_to_example_dir / 'gaudi_spawn.py'}",
        "--use_mpi",
        f"--world_size {world_size}",
        f"{path_to_example_dir / task / script}",
        f"--model_name_or_path {model_name}",
        "--do_train",
        "--dataset_name squad",
        "--max_seq_length 384",
        "--use_lazy_mode",
        f"--per_device_eval_batch_size {batch_size_eval}",
        f"--per_device_train_batch_size {batch_size_train}",
        "--learning_rate 3e-05",
        "--num_train_epochs 2.0",
        "--logging_steps 20",
        "--save_steps 5000",
        "--seed 42",
        "--doc_stride 128",
        "--use_habana",
        "--overwrite_output_dir",
        f"--gaudi_config_name {gaudi_config}",
        "--throughput_warmup_steps 100",
        f"--fsdp_config {path_to_example_dir / task / 'fsdp_config.json'}",
        f"--fsdp '{policy}'",
        "--do_eval",
        "--torch_compile_backend aot_hpu_training_backend",
        "--torch_compile",
    ]

    # Splits on whitespace; a quoted substring is captured by the group and is
    # therefore kept as one token, with its quotes included.
    pattern = re.compile(r"([\"\'].+?[\"\'])|\s")

    with TemporaryDirectory() as tmp_dir:
        command.append(f"--output_dir {tmp_dir}")
        print(f"\n\nCommand to test: {' '.join(command)}\n")

        argv = [token for fragment in command for token in re.split(pattern, fragment) if token]

        proc = subprocess.run(argv)

        # Ensure the run finished without any issue
        # Use try-except to avoid logging the token if used
        try:
            assert proc.returncode == 0
        except AssertionError as e:
            # `e.args` is empty when the assert is not rewritten by pytest, so
            # guard before indexing to avoid masking the failure with an
            # IndexError.
            if e.args and "'--token', 'hf_" in str(e.args[0]):
                # Redact the value that follows `--token` wherever it appears;
                # the token is not necessarily the last argument.
                redacted = [
                    "<redacted>" if index > 0 and argv[index - 1] == "--token" else token
                    for index, token in enumerate(argv)
                ]
                e.args = (f"The following command failed:\n{' '.join(redacted)}",)
            raise

        # Must be read before the temporary directory is cleaned up
        with open(Path(tmp_dir) / "all_results.json", encoding="utf-8") as fp:
            results = json.load(fp)

        # Ensure performance requirements (throughput and accuracy) are met
        assert results["train_samples_per_second"] >= (2 - TIME_PERF_FACTOR) * baseline
        assert results["eval_f1"] >= ACCURACY_PERF_FACTOR * baseline_acc
|
||
|
||
@pytest.mark.parametrize(
    "model_name, gaudi_config, baseline, baseline_acc, task, bs_train, bs_eval, script, policy", MODELS_TO_TEST["bf16"]
)
def test_fsdp_bf16(
    model_name: str,
    gaudi_config: str,
    baseline: float,
    baseline_acc: float,
    task: str,
    bs_train: int,
    bs_eval: int,
    script: str,
    policy: str,
):
    """Run the FSDP check once for every entry of ``MODELS_TO_TEST["bf16"]``."""
    # Keyword arguments make the bs_train/bs_eval -> batch_size_* mapping explicit.
    _test_fsdp(
        model_name=model_name,
        gaudi_config=gaudi_config,
        baseline=baseline,
        baseline_acc=baseline_acc,
        task=task,
        batch_size_train=bs_train,
        batch_size_eval=bs_eval,
        script=script,
        policy=policy,
    )