Skip to content

Commit

Permalink
Add FSDP tests
Browse files Browse the repository at this point in the history
  • Loading branch information
vivekgoe committed Dec 14, 2023
1 parent e2cfa29 commit ffaeef8
Showing 1 changed file with 124 additions and 0 deletions.
124 changes: 124 additions & 0 deletions tests/test_fsdp_examples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import json
import os
import re
import subprocess
from pathlib import Path
from tempfile import TemporaryDirectory

import pytest

from .test_examples import ACCURACY_PERF_FACTOR, TIME_PERF_FACTOR


# Each entry is consumed positionally by the parametrized test below, in this order:
# (model_name, gaudi_config, baseline, baseline_acc, task, bs_train, bs_eval, script, policy)
if os.environ.get("GAUDI2_CI", "0") == "1":
    # Gaudi2 CI baselines
    MODELS_TO_TEST = {
        "bf16": [
            (
                "bert-base-uncased",
                "Habana/bert-base-uncased",
                2807,
                85.4688,
                "question-answering",
                24,
                8,
                "run_qa.py",
                "full_shard",
            ),
        ],
    }
else:
    # Gaudi1 CI baselines: the throughput baseline is a third of the Gaudi2 figure
    MODELS_TO_TEST = {
        "bf16": [
            (
                "bert-base-uncased",
                "Habana/bert-base-uncased",
                2807 / 3.0,
                85.4688,
                "question-answering",
                24,
                8,
                "run_qa.py",
                "full_shard",
            ),
        ],
    }


def _test_fsdp(
    model_name: str,
    gaudi_config: str,
    baseline: float,
    baseline_acc: float,
    task: str,
    batch_size_train: int,
    batch_size_eval: int,
    script: str,
    policy: str,
    world_size: int = 8,
):
    """Run an FSDP training example end to end and check its reported metrics.

    The example's requirements are installed with pip, then the example script
    is launched through ``gaudi_spawn.py`` with ``--use_mpi``. After the run,
    ``all_results.json`` is read from the temporary output directory and the
    reported throughput and F1 score are compared against the baselines.

    Args:
        model_name: Value passed as ``--model_name_or_path``.
        gaudi_config: Value passed as ``--gaudi_config_name``.
        baseline: Reference value for ``train_samples_per_second``.
        baseline_acc: Reference value for ``eval_f1``.
        task: Sub-directory of ``examples`` holding the script, its
            ``requirements.txt`` and its ``fsdp_config.json``.
        batch_size_train: Value passed as ``--per_device_train_batch_size``.
        batch_size_eval: Value passed as ``--per_device_eval_batch_size``.
        script: File name of the example script inside the task directory.
        policy: Value passed (single-quoted) as ``--fsdp``.
        world_size: Value passed as ``--world_size`` to ``gaudi_spawn.py``.

    Raises:
        AssertionError: If pip or the training run exits with a non-zero
            code, or if a metric does not meet its baseline.
    """
    path_to_example_dir = Path(__file__).resolve().parent.parent / "examples"

    # Install the example's requirements. The argv is built as a list (not by
    # splitting a formatted string) so a path containing whitespace stays intact.
    requirements_file = path_to_example_dir / task / "requirements.txt"
    install = subprocess.run(["pip", "install", "-r", str(requirements_file)])
    assert install.returncode == 0

    command = ["python3"]

    # Launcher arguments
    command += [
        f"{path_to_example_dir / 'gaudi_spawn.py'}",
        "--use_mpi",
        f"--world_size {world_size}",
    ]

    # Example script and its arguments
    command += [
        f"{path_to_example_dir / task / script}",
        f"--model_name_or_path {model_name}",
        "--do_train",
        "--dataset_name squad",
        "--max_seq_length 384",
        "--use_lazy_mode",
        f"--per_device_eval_batch_size {batch_size_eval}",
        f"--per_device_train_batch_size {batch_size_train}",
        "--learning_rate 3e-05",
        "--num_train_epochs 2.0",
        "--logging_steps 20",
        "--save_steps 5000",
        "--seed 42",
        "--doc_stride 128",
        "--use_habana",
        "--overwrite_output_dir",
        f"--gaudi_config_name {gaudi_config}",
        "--throughput_warmup_steps 100",
        f"--fsdp_config {path_to_example_dir / task / 'fsdp_config.json'}",
        f"--fsdp '{policy}'",
        "--do_eval",
        "--torch_compile_backend aot_hpu_training_backend",
        "--torch_compile",
    ]

    with TemporaryDirectory() as tmp_dir:
        command.append(f"--output_dir {tmp_dir}")
        print(f"\n\nCommand to test: {' '.join(command)}\n")

        # Split every "--flag value" fragment into separate argv tokens while
        # keeping quoted substrings (quotes included) as a single token.
        pattern = re.compile(r"([\"\'].+?[\"\'])|\s")
        command = [x for y in command for x in re.split(pattern, y) if x]

        proc = subprocess.run(command)

        # Ensure the run finished without any issue
        # Use try-except to avoid logging the token if used
        try:
            assert proc.returncode == 0
        except AssertionError as e:
            # A bare assert carries no message unless something (e.g. pytest's
            # assertion rewriting) attaches one, so guard the lookup instead of
            # letting an IndexError hide the real failure.
            message = e.args[0] if e.args else ""
            if isinstance(message, str) and "'--token', 'hf_" in message:
                # NOTE(review): this function never appends a token, so the last
                # two tokens dropped here are the --output_dir pair -- confirm
                # this slice is still the intended redaction.
                e.args = (f"The following command failed:\n{' '.join(command[:-2])}",)
            raise

        # The results file lives in the temporary directory, so it must be
        # read before the directory is cleaned up.
        with open(Path(tmp_dir) / "all_results.json") as fp:
            results = json.load(fp)

        # Ensure performance requirements (throughput) are met
        assert results["train_samples_per_second"] >= (2 - TIME_PERF_FACTOR) * baseline
        # Ensure the accuracy requirement (F1 score) is met
        assert results["eval_f1"] >= ACCURACY_PERF_FACTOR * baseline_acc


@pytest.mark.parametrize(
    "model_name, gaudi_config, baseline, baseline_acc, task, bs_train, bs_eval, script, policy", MODELS_TO_TEST["bf16"]
)
def test_fsdp_bf16(
    model_name: str,
    gaudi_config: str,
    baseline: float,
    baseline_acc: float,
    task: str,
    bs_train: int,
    bs_eval: int,
    script: str,
    policy: str,
):
    """Run the FSDP check once per entry of ``MODELS_TO_TEST["bf16"]``."""
    _test_fsdp(
        model_name=model_name,
        gaudi_config=gaudi_config,
        baseline=baseline,
        baseline_acc=baseline_acc,
        task=task,
        batch_size_train=bs_train,
        batch_size_eval=bs_eval,
        script=script,
        policy=policy,
    )

0 comments on commit ffaeef8

Please sign in to comment.