From a18e2c0db226b7118ed7ebbaecd8edb57dc59335 Mon Sep 17 00:00:00 2001
From: jacobfulano <62222220+jacobfulano@users.noreply.github.com>
Date: Thu, 3 Aug 2023 14:14:45 -0400
Subject: [PATCH] Update BERT yamls for compatibility with main branch (#427)

* typo

* update bf16 --> amp_bf16 in bert yamls

* update mcloud yamls to run on mosaicml/examples main branch with correct file structure

* updated git repo branch in 40gb yaml
---
 .../benchmarks/bert/yamls/main/mcloud_run_a100_40gb.yaml | 9 +++++----
 .../benchmarks/bert/yamls/main/mcloud_run_a100_80gb.yaml | 9 +++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/examples/benchmarks/bert/yamls/main/mcloud_run_a100_40gb.yaml b/examples/benchmarks/bert/yamls/main/mcloud_run_a100_40gb.yaml
index d945eacc4..02880d0d7 100644
--- a/examples/benchmarks/bert/yamls/main/mcloud_run_a100_40gb.yaml
+++ b/examples/benchmarks/bert/yamls/main/mcloud_run_a100_40gb.yaml
@@ -21,7 +21,7 @@ compute:
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/examples
-  git_branch: v0.0.4 # use your branch
+  git_branch: main # use your branch
   # git_commit: # OR use your commit hash
   pip_install: -e .[bert]
   ssh_clone: false # Should be true if using a private repo
@@ -31,8 +31,9 @@ integrations:
 # For real training runs, follow the instructions in `examples/bert/README.md`
 # to convert and host the full 'train' dataset.
 command: |
-  cd examples/examples/bert
-  python ../common/convert_dataset.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val
+  cd examples/examples/benchmarks/bert
+  pip install -r requirements.txt
+  python src/convert_dataset.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val
 
   composer main.py /mnt/config/parameters.yaml \
     train_loader.dataset.split=train_small
@@ -117,7 +118,7 @@ parameters:
   seed: 17
   device_eval_batch_size: 128
   device_train_microbatch_size: 128
-  precision: bf16
+  precision: amp_bf16 # update for Composer release/v0.12.1
 
   # Logging
   progress_bar: false
diff --git a/examples/benchmarks/bert/yamls/main/mcloud_run_a100_80gb.yaml b/examples/benchmarks/bert/yamls/main/mcloud_run_a100_80gb.yaml
index 324caae3f..48f2b0563 100644
--- a/examples/benchmarks/bert/yamls/main/mcloud_run_a100_80gb.yaml
+++ b/examples/benchmarks/bert/yamls/main/mcloud_run_a100_80gb.yaml
@@ -22,7 +22,7 @@ compute:
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/examples
-  git_branch: v0.0.4 # use your branch
+  git_branch: main # use your branch
   # git_commit: # OR use your commit hash
   pip_install: -e .[bert]
   ssh_clone: false # Should be true if using a private repo
@@ -32,8 +32,9 @@ integrations:
 # For real training runs, follow the instructions in `examples/bert/README.md`
 # to convert and host the full 'train' dataset.
 command: |
-  cd examples/examples/bert
-  python ../common/convert_dataset.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val
+  cd examples/examples/benchmarks/bert
+  pip install -r requirements.txt
+  python src/convert_dataset.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val
 
   composer main.py /mnt/config/parameters.yaml \
     train_loader.dataset.split=train_small
@@ -117,7 +118,7 @@ parameters:
   seed: 17
   device_eval_batch_size: 512
   device_train_microbatch_size: 512
-  precision: bf16
+  precision: amp_bf16 # update for Composer release/v0.12.1
 
   # Logging
   progress_bar: false
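
Usage note: these mcloud YAMLs are submitted with the MosaicML CLI rather than run directly. A minimal sketch, assuming the `mosaicml-cli` package is installed and you are already authenticated against a cluster matching the YAML's `compute` block (the path below reflects the repo layout on the main branch):

    # Submit the updated 40GB config; mcli reads the integrations, command,
    # and parameters sections from the YAML and schedules the run remotely.
    mcli run -f examples/benchmarks/bert/yamls/main/mcloud_run_a100_40gb.yaml

The 80GB variant is launched the same way with mcloud_run_a100_80gb.yaml.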