From 9d9d4ff7b427512ffe3300bd247b98fa20f63091 Mon Sep 17 00:00:00 2001 From: Zaccharie Ramzi Date: Tue, 12 May 2020 12:31:16 +0200 Subject: [PATCH] Tf example dask (#30) --- .gitignore | 139 ++++++++++++++++++ examples/tf/README.md | 35 ++++- examples/tf/dask_script.py | 61 ++++++++ examples/tf/mnist_example.py | 24 ++- examples/tf/mnist_submission_script.slurm | 1 + .../mnist_submission_script_multi_gpus.slurm | 1 + 6 files changed, 251 insertions(+), 10 deletions(-) create mode 100644 .gitignore create mode 100644 examples/tf/dask_script.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b26ab7e --- /dev/null +++ b/.gitignore @@ -0,0 +1,139 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ diff --git a/examples/tf/README.md b/examples/tf/README.md index 45c0f57..cc0d290 100644 --- a/examples/tf/README.md +++ b/examples/tf/README.md @@ -1,6 +1,6 @@ # Tensorflow example script -To run this script you will need to first install click in your environment. 
+To run the examples you will need to first install `click` in your environment. ``` module load python/3.7.5 &&\ pip install click @@ -12,7 +12,38 @@ cd $WORK &&\ git clone https://github.com/jean-zay-users/jean-zay-doc.git ``` -Finally you can just launch the batch job (single GPU) via: +## Classical examples + +For the single GPU job you can do: ``` sbatch jean-zay-doc/examples/tf/mnist_submission_script.slurm ``` + +For the multi GPU job you can do: +``` +sbatch jean-zay-doc/examples/tf/mnist_submission_script_multi_gpus.slurm +``` + +## Dask example + +To run the dask example you will additionally need to install `dask-jobqueue` in your environment. +Note that this time you need to use the python module with tensorflow loaded, because [dask will +by default use the same python for the worker as the one you used for the +scheduler](https://jobqueue.dask.org/en/latest/debug.html). +See this [GitHub issue](https://github.com/dask/dask-jobqueue/issues/408) for more information. +``` +module load tensorflow-gpu/py3/2.1.0 &&\ +pip install click dask-jobqueue +``` + +You can then do: +``` +python jean-zay-doc/examples/tf/dask_script.py 64 +``` + +where 64 is the batch size you want to run the mnist example with. +To run several batch sizes, pass them as space-separated arguments (e.g. `64 128 256`); one job is launched per batch size. + +Be sure to load the tensorflow module before launching the dask script because otherwise TensorFlow will not be available in the dask workers. +This is because the python executable used to launch the dask worker is the same as the one used to launch the scheduler by default. +You can set a different executable in the cluster definition if you want something more tailored. 
diff --git a/examples/tf/dask_script.py b/examples/tf/dask_script.py new file mode 100644 index 0000000..ddf665d --- /dev/null +++ b/examples/tf/dask_script.py @@ -0,0 +1,61 @@ +import click +from dask.distributed import Client +from dask_jobqueue import SLURMCluster + +from mnist_example import train_dense_model + + +@click.command() +@click.argument( + 'batch_sizes', + nargs=-1, + type=int, +) +@click.option( + 'save', + '-s', + '--save', + is_flag=True, + help='Whether you want to save the models or not', +) +def launch_dask_tasks(batch_sizes, save): + job_name = 'dask_mnist_tf_example' + + cluster = SLURMCluster( + cores=1, + job_cpu=10, + memory='10GB', + job_name=job_name, + walltime='1:00:00', + interface='ib0', + job_extra=[ + f'--gres=gpu:1', + '--qos=qos_gpu-dev', + '--distribution=block:block', + '--hint=nomultithread', + '--output=%x_%j.out', + ], + ) + n_jobs = len(batch_sizes) + cluster.scale(jobs=n_jobs) + print(cluster.job_script()) + + client = Client(cluster) + futures = [client.submit( + # function to execute + train_dense_model, + # *args + None, save, batch_size, + # this function has potential side effects + pure=not save, + ) for batch_size in batch_sizes] + job_result = client.gather(futures) + if all(job_result): + print('All jobs finished without errors') + else: + print('One job errored out') + print('Shutting down dask workers') + + +if __name__ == '__main__': + launch_dask_tasks() diff --git a/examples/tf/mnist_example.py b/examples/tf/mnist_example.py index 78bd906..66f4ce7 100644 --- a/examples/tf/mnist_example.py +++ b/examples/tf/mnist_example.py @@ -1,10 +1,6 @@ # all taken from https://www.tensorflow.org/guide/keras/functional -import os - import click -import tensorflow as tf -from tensorflow import keras -from tensorflow.keras import layers + @click.command() @click.option( @@ -20,8 +16,19 @@ is_flag=True, help='Whether you want to save the model or not', ) -def train_dense_model(cuda_visible_devices, save): +def 
train_dense_model_click(cuda_visible_devices, save): + return train_dense_model(cuda_visible_devices, save, batch_size=64) + + +def train_dense_model(cuda_visible_devices, save, batch_size): + # limit imports outside the call to the function, in order to launch quickly + # when using dask + import tensorflow as tf + from tensorflow import keras + from tensorflow.keras import layers + if cuda_visible_devices is not None: + import os os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices # model building tf.keras.backend.clear_session() # For easy reset of notebook state. @@ -45,7 +52,7 @@ def train_dense_model(cuda_visible_devices, save): optimizer=keras.optimizers.RMSprop(), metrics=['accuracy']) history = model.fit(x_train, y_train, - batch_size=64, + batch_size=batch_size, epochs=5, validation_split=0.2) test_scores = model.evaluate(x_test, y_test, verbose=2) @@ -55,6 +62,7 @@ def train_dense_model(cuda_visible_devices, save): # saving if save: model.save(os.environ['SCRATCH']) + return True if __name__ == '__main__': - train_dense_model() + train_dense_model_click() diff --git a/examples/tf/mnist_submission_script.slurm b/examples/tf/mnist_submission_script.slurm index 66a4f38..c84554b 100644 --- a/examples/tf/mnist_submission_script.slurm +++ b/examples/tf/mnist_submission_script.slurm @@ -10,6 +10,7 @@ #SBATCH --time=3:00:00 # maximum execution time (HH:MM:SS) #SBATCH --output=tf_mnist%j.out # output file name #SBATCH --error=tf_mnist%j.out # error file name +#SBATCH --qos=qos_gpu-dev # we are submitting a test job set -x cd $WORK/jean-zay-doc/examples/tf diff --git a/examples/tf/mnist_submission_script_multi_gpus.slurm b/examples/tf/mnist_submission_script_multi_gpus.slurm index 31fa04e..045aa19 100644 --- a/examples/tf/mnist_submission_script_multi_gpus.slurm +++ b/examples/tf/mnist_submission_script_multi_gpus.slurm @@ -11,6 +11,7 @@ #SBATCH --output=tf_mnist_multi_gpus%A_%a.out # output file name #SBATCH --error=tf_mnist_multi_gpus%A_%a.out # error file 
name #SBATCH --array=0-1 # one job array with 2 jobs +#SBATCH --qos=qos_gpu-dev # we are submitting a test job set -x cd $WORK/jean-zay-doc/examples/tf