From c86aa52f8607afad634d81695c528e03b4a56c5c Mon Sep 17 00:00:00 2001 From: Cory Stephenson Date: Wed, 9 Aug 2023 19:24:31 +0000 Subject: [PATCH 1/5] Add sleeps after dataloader inits --- diffusion/train.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/diffusion/train.py b/diffusion/train.py index fb1bdd60..1614d2c6 100644 --- a/diffusion/train.py +++ b/diffusion/train.py @@ -4,6 +4,7 @@ """Train model.""" import operator +import time from collections.abc import Iterable from typing import Any, Dict, List, Optional, Union @@ -38,6 +39,8 @@ def train(config: DictConfig) -> None: config.dataset.train_dataset, batch_size=config.dataset.train_batch_size // dist.get_world_size(), ) + # Need to sleep for a bit to avoid dataloader crash + time.sleep(10) # Composer can take dataloaders, dataspecs, evaluators, or list of evaluators eval_set: Optional[Union[DataSpec, List[Evaluator]]] = None @@ -52,6 +55,8 @@ def train(config: DictConfig) -> None: config.dataset.eval_batch_size // dist.get_world_size(), ) evaluator = hydra.utils.instantiate(eval_conf.evaluator, dataloader=eval_dataloader) + # Need to sleep for a bit to avoid dataloader crash + time.sleep(10) evaluators.append(evaluator) eval_set = evaluators @@ -59,6 +64,8 @@ def train(config: DictConfig) -> None: else: eval_set = hydra.utils.instantiate(config.dataset.eval_dataset, batch_size=config.dataset.eval_batch_size // dist.get_world_size()) + # Need to sleep for a bit to avoid dataloader crash + time.sleep(10) # Build list of loggers, callbacks, and algorithms to pass to trainer logger: List[LoggerDestination] = [] From a26e5e5273d51615fb11ea8bccbad6d3209a7162 Mon Sep 17 00:00:00 2001 From: Cory Stephenson Date: Wed, 9 Aug 2023 19:28:39 +0000 Subject: [PATCH 2/5] Update reqs for h100s --- setup.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index d05090ce..fe95510d 100644 --- a/setup.py +++ b/setup.py @@ -6,16 +6,16 @@ from setuptools import find_packages, setup install_requires = [ - 'mosaicml@git+https://github.com/mosaicml/composer.git@6cf3d3a1aa300834c650f89460b5ac9bbc5a1e46', + 'mosaicml', 'mosaicml-streaming>=0.4.0,<1.0', 'hydra-core>=1.2', 'hydra-colorlog>=1.1.0', - 'diffusers[torch]==0.16.0', - 'transformers[torch]==4.29.2', + 'diffusers[torch]==0.19.3', + 'transformers[torch]==4.31.0', 'wandb==0.15.4', - 'xformers==0.0.16', + 'xformers==0.0.20', 'triton==2.0.0', - 'torchmetrics[image]==0.11.3', + 'torchmetrics[image]==0.11.4', 'clean-fid', 'clip@git+https://github.com/openai/CLIP.git', ] From 9cc41d07f62af8d1db106e16881ca4c678d4dea3 Mon Sep 17 00:00:00 2001 From: Cory Stephenson Date: Thu, 10 Aug 2023 05:16:13 +0000 Subject: [PATCH 3/5] Pin composer version as well --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fe95510d..f76492fd 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ from setuptools import find_packages, setup install_requires = [ - 'mosaicml', + 'mosaicml==0.15.1', 'mosaicml-streaming>=0.4.0,<1.0', 'hydra-core>=1.2', 'hydra-colorlog>=1.1.0', From e1b3c61d644f2899476a41ff9b17dcd24cc2704b Mon Sep 17 00:00:00 2001 From: Cory Stephenson Date: Thu, 10 Aug 2023 17:40:51 +0000 Subject: [PATCH 4/5] Update readme with H100 instructions --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 0b13e385..b1f8668e 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,19 @@ Here are the system settings we recommend to start training your own diffusion m - Ubuntu Version: 20.04 - Use a system with NVIDIA GPUs +- For running on NVIDIA H100s, use a docker image with PyTorch 1.13+ e.g. [MosaicML's PyTorch base image](https://hub.docker.com/r/mosaicml/pytorch/tags) + - Recommended tag: `mosaicml/pytorch_vision:2.0.1_cu118-python3.10-ubuntu20.04` + - This image comes pre-configured with the following dependencies: + - PyTorch Version: 2.0.1 + - CUDA Version: 11.8 + - Python Version: 3.10 + - Ubuntu Version: 20.04 + - Depending on the training config, an additional install of `xformers` may be needed: + ``` + pip install -U ninja + pip install -U git+https://github.com/facebookresearch/xformers + ``` + # How many GPUs do I need? We benchmarked the U-Net training throughput as we scale the number of A100 GPUs from 8 to 128. Our time estimates are based on training Stable Diffusion 2.0 base on 1,126,400,000 images at 256x256 resolution and 1,740,800,000 images at 512x512 resolution. Our cost estimates are based on $2 / A100-hour. Since the time and cost estimates are for the U-Net only, these only hold if the VAE and CLIP latents are computed before training. It took 3,784 A100-hours (cost of $7,600) to pre-compute the VAE and CLIP latents offline. If you are computing VAE and CLIP latents while training, expect a 1.4x increase in time and cost. From 36aac39db496a54c87be3948e7124ed8773a3246 Mon Sep 17 00:00:00 2001 From: Cory Stephenson Date: Thu, 10 Aug 2023 17:42:23 +0000 Subject: [PATCH 5/5] Fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b1f8668e..14f09e9e 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ Here are the system settings we recommend to start training your own diffusion m - Ubuntu Version: 20.04 - Use a system with NVIDIA GPUs -- For running on NVIDIA H100s, use a docker image with PyTorch 1.13+ e.g. [MosaicML's PyTorch base image](https://hub.docker.com/r/mosaicml/pytorch/tags) +- For running on NVIDIA H100s, use a docker image with PyTorch 2.0+ e.g. [MosaicML's PyTorch base image](https://hub.docker.com/r/mosaicml/pytorch/tags) - Recommended tag: `mosaicml/pytorch_vision:2.0.1_cu118-python3.10-ubuntu20.04` - This image comes pre-configured with the following dependencies: - PyTorch Version: 2.0.1