Tf example dask #30

Merged 27 commits into master from tf-example-dask on May 12, 2020.

Changes shown are from 13 of the 27 commits.

Commits
67eb3b7
added dask script to repo
zaccharieramzi Apr 9, 2020
9febbce
separated mnist tf example in 2 parts: one click and the other not
zaccharieramzi Apr 9, 2020
afe1217
corrected typo in dask script
zaccharieramzi Apr 9, 2020
fe6746f
made mnist example faster when using dask
zaccharieramzi Apr 9, 2020
f249666
corrected job name and clarified submit in dask script
zaccharieramzi Apr 9, 2020
4af4261
added a python gitignore to this specific example
zaccharieramzi Apr 9, 2020
8f90524
added job check at the end of mnist
zaccharieramzi Apr 9, 2020
5d31698
made the dask script a click script
zaccharieramzi Apr 9, 2020
69fa1db
corrected typo in dask script
zaccharieramzi Apr 9, 2020
f8f93e5
corrected typos in dask script + changed job name in case of multi gpu
zaccharieramzi Apr 9, 2020
8f9a31a
added dask example and multi gpu example in the readme
zaccharieramzi Apr 9, 2020
dba5c57
corrected job name for multi gpu
zaccharieramzi Apr 9, 2020
cf00644
reduced qos for slurm script
zaccharieramzi Apr 9, 2020
7554159
moved the gitignore file to root
zaccharieramzi Apr 9, 2020
dfbd8ff
changed comment about qos in slurm scripts
zaccharieramzi Apr 9, 2020
45fde29
masked gpus in multi gpu setting
zaccharieramzi Apr 9, 2020
9917bbb
made dask script use multiple sbatch rather than multiple dask workers
zaccharieramzi Apr 9, 2020
ae4a071
changed n gpu to n jobs in the client submit to have something more u…
zaccharieramzi Apr 10, 2020
b6990e3
added batch size as an arg to the training of the mnist example and us…
zaccharieramzi Apr 10, 2020
ebb0542
corrected dask script in batch sizes
zaccharieramzi Apr 10, 2020
37b2959
updated readme with new script organisation
zaccharieramzi Apr 10, 2020
e5aeaea
removed help in click argument
zaccharieramzi Apr 10, 2020
1b0a051
added module load exact python for the scheduler (and then the worker…
zaccharieramzi Apr 10, 2020
a52d125
removed the module loading from the dask extra because the python is …
zaccharieramzi May 7, 2020
8affb25
corrected the test partition to dev
zaccharieramzi May 7, 2020
8083b22
added some extra warning about python executable
zaccharieramzi May 7, 2020
f9e090a
Merge branch 'master' into tf-example-dask
lesteve May 12, 2020
139 changes: 139 additions & 0 deletions examples/tf/.gitignore
@@ -0,0 +1,139 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
24 changes: 22 additions & 2 deletions examples/tf/README.md
@@ -1,6 +1,6 @@
# Tensorflow example script

To run this script you will need to first install click in your environment.
To run the examples you will need to first install `click` in your environment.
```
module load python/3.7.5 &&\
pip install click
@@ -12,7 +12,27 @@ cd $WORK &&\
git clone https://github.com/jean-zay-users/jean-zay-doc.git
```

Finally you can just launch the batch job (single GPU) via:
## Classical examples

For the single GPU job you can do:
```
sbatch jean-zay-doc/examples/tf/mnist_submission_script.slurm
```

For the multi GPU job you can do:
```
sbatch jean-zay-doc/examples/tf/mnist_submission_script_multi_gpus.slurm
```

## Dask example

To run the dask example you will additionally need to install `dask-jobqueue` in your environment.
```
module load python/3.7.5 &&\
pip install click dask-jobqueue
```

You can then do:
```
python jean-zay-doc/examples/tf/dask_script.py
```
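
The script accepts the options defined in `dask_script.py`: `-n` for the number of GPUs on which to run in parallel, and `-s`/`--save` to save the trained models. For instance, to train on 2 GPUs and save the models:
```
python jean-zay-doc/examples/tf/dask_script.py -n 2 -s
```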
73 changes: 73 additions & 0 deletions examples/tf/dask_script.py
@@ -0,0 +1,73 @@
import click
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

from mnist_example import train_dense_model


@click.command()
@click.option(
'n_gpus',
'-n',
default=1,
help='The number of GPUs on which to run the mnist examples in parallel. Defaults to 1.',
type=int,
)
@click.option(
'save',
'-s',
'--save',
is_flag=True,
help='Whether you want to save the models or not',
)
def launch_dask_tasks(n_gpus, save):
assert 0 < n_gpus < 5, 'You need to request between 1 and 4 GPUs.'

job_name = 'dask_mnist_tf_example'
if n_gpus > 1:
job_name += '_multi_gpus'

cluster = SLURMCluster(
n_workers=n_gpus,
cores=n_gpus,
job_cpu=10,
memory='10GB',
job_name=job_name,
walltime='1:00:00',
interface='ib0',
job_extra=[
f'--gres=gpu:{n_gpus}',
'--qos=qos_gpu-dev',
'--distribution=block:block',
lesteve (Member) commented:

I am curious about --distribution and nomultithread below; have you ever tried not using them to see what happens?

My guess is that it should not make a huge difference, but we copy-paste them from the Jean Zay documentation because we are not sure ...

zaccharieramzi (Author) replied:

Yes, that is my feeling too. I feel like these examples are a bit too simple to test that, though; some bad behaviours might only show up in longer runs.

lesteve (Member) replied:

By the way, I asked by email and they told me that --distribution=block:block can be removed now. On the other hand, --hint=nomultithread is necessary ...

'--hint=nomultithread',
'--output=%x_%j.out',
],
env_extra=[
'module purge',
'module load tensorflow-gpu/py3/2.1.0',
],
extra=[f'--resources GPU={n_gpus}'],
)

print(cluster.job_script())

client = Client(cluster)
futures = [client.submit(
# function to execute
train_dense_model,
# *args
None, save,
# this function has potential side effects
pure=not save,
lesteve (Member) commented:

Out of curiosity again, can you elaborate a bit more on why this function has potential side effects?

zaccharieramzi (Author) replied:

So I took the definition of a pure function from here, and in their understanding a side effect could be something like writing a log (or, in this case, the weights of an NN).

lesteve (Member) replied:

OK, fair enough; I think you read the doc too much ... pure only matters if you are calling .submit with the same function and the same arguments. Dask has a caching mechanism where it may not recompute the results (see https://docs.dask.org/en/latest/caching.html).

But now I am a bit confused: I thought train_dense_model would be called with different parameters (e.g. learning rate), and it seems like it is called with the same parameters ... Is there a use case for this? Am I missing something?

zaccharieramzi (Author) replied:

Oh yes, this is purely for example's sake; in real life you would indeed call it with different arguments. I just didn't want to complicate things too much, but I can try using different arguments if you think it would be clearer.

lesteve (Member) replied:

Different arguments would be better IMO, because it would look more like a real-life example and be easier to relate to as a DL practitioner.
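
To make the pure/caching behaviour discussed in this thread concrete, here is a minimal sketch, independent of this PR; the `slow_square` function and the local `Client` are illustrative assumptions, not part of the example scripts:

```python
from dask.distributed import Client

client = Client()  # small local cluster, just for demonstration

def slow_square(x):
    return x ** 2

# pure=True (the default): identical function + arguments hash to the
# same task key, so dask computes the result only once and reuses it.
f1 = client.submit(slow_square, 3)
f2 = client.submit(slow_square, 3)
assert f1.key == f2.key  # second submit reuses the first result

# pure=False: every submit gets a unique key and is recomputed, which is
# what you want when the function has side effects (e.g. saving weights).
f3 = client.submit(slow_square, 3, pure=False)
assert f3.key != f1.key
print(f1.result())  # 9
```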

resources={'GPU': 1},
) for _ in range(n_gpus)]
job_result = client.gather(futures)
if all(job_result):
print('All jobs finished without errors')
else:
print('One job errored out')
print('Shutting down dask workers')


if __name__ == '__main__':
launch_dask_tasks()
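
As a follow-up to the review thread above, a hedged sketch of what the submit loop could look like with different arguments per task; the extra `batch_size` argument of `train_dense_model` is hypothetical at this point in the PR (a later commit, "added batch size as an arg to the training of the mnist example", moves in this direction):

```python
# Sketch of the "different arguments" variant suggested in the review.
# Assumes the `client` built in launch_dask_tasks above; the batch_size
# argument of train_dense_model is hypothetical at this stage of the PR.
batch_sizes = [32, 64, 128, 256]  # one task per batch size
futures = [
    client.submit(
        train_dense_model,
        None,        # cuda_visible_devices, as in the original loop
        save,
        batch_size,  # hypothetical extra argument
        resources={'GPU': 1},
    )
    for batch_size in batch_sizes
]
# The arguments now differ between submits, so the pure/caching question
# from the thread above no longer arises.
job_result = client.gather(futures)
```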
20 changes: 14 additions & 6 deletions examples/tf/mnist_example.py
@@ -1,10 +1,6 @@
# all taken from https://www.tensorflow.org/guide/keras/functional
import os

import click
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


@click.command()
@click.option(
@@ -20,8 +16,19 @@
is_flag=True,
help='Whether you want to save the model or not',
)
def train_dense_model_click(cuda_visible_devices, save):
return train_dense_model(cuda_visible_devices, save)


def train_dense_model(cuda_visible_devices, save):
# limit imports outside the call to the function, in order to launch quickly
# when using dask
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

if cuda_visible_devices is not None:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices
# model building
tf.keras.backend.clear_session() # For easy reset of notebook state.
@@ -55,6 +62,7 @@ def train_dense_model(cuda_visible_devices, save):
# saving
if save:
model.save(os.environ['SCRATCH'])
return True

if __name__ == '__main__':
train_dense_model()
train_dense_model_click()
1 change: 1 addition & 0 deletions examples/tf/mnist_submission_script.slurm
@@ -10,6 +10,7 @@
#SBATCH --time=3:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=tf_mnist%j.out # output file name
#SBATCH --error=tf_mnist%j.out # error file name
#SBATCH --qos=qos_gpu-t4 # it's a testy job
lesteve (Member) commented (Apr 9, 2020):

What do you mean by testy (you know the different queues on Jean Zay better than I do)? Maybe not this?

[image]

So maybe this is a queue dedicated to testing (or something like this)?

zaccharieramzi (Author) replied:

No, I didn't mean that, haha; more "test-y", as in relating to tests in an informal fashion. I will change it to something better suited.

zaccharieramzi (Author) added:

How does that work for you?


set -x
cd $WORK/jean-zay-doc/examples/tf
1 change: 1 addition & 0 deletions examples/tf/mnist_submission_script_multi_gpus.slurm
@@ -10,6 +10,7 @@
#SBATCH --time=3:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=tf_mnist_multi_gpus%j.out # output file name
#SBATCH --error=tf_mnist_multi_gpus%j.out # error file name
#SBATCH --qos=qos_gpu-t4 # it's a testy job
zaccharieramzi marked this conversation as resolved.

set -x
cd $WORK/jean-zay-doc/examples/tf