From 9d9d4ff7b427512ffe3300bd247b98fa20f63091 Mon Sep 17 00:00:00 2001
From: Zaccharie Ramzi <zaccharie.ramzi@gmail.com>
Date: Tue, 12 May 2020 12:31:16 +0200
Subject: [PATCH] Tf example dask (#30)

---
 .gitignore                                    | 139 ++++++++++++++++++
 examples/tf/README.md                         |  35 ++++-
 examples/tf/dask_script.py                    |  61 ++++++++
 examples/tf/mnist_example.py                  |  24 ++-
 examples/tf/mnist_submission_script.slurm     |   1 +
 .../mnist_submission_script_multi_gpus.slurm  |   1 +
 6 files changed, 251 insertions(+), 10 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 examples/tf/dask_script.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b26ab7e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,139 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
diff --git a/examples/tf/README.md b/examples/tf/README.md
index 45c0f57..cc0d290 100644
--- a/examples/tf/README.md
+++ b/examples/tf/README.md
@@ -1,6 +1,6 @@
 # Tensorflow example script
 
-To run this script you will need to first install click in your environment.
+To run the examples you will need to first install `click` in your environment.
 ```
 module load python/3.7.5 &&\
 pip install click
@@ -12,7 +12,38 @@ cd $WORK &&\
 git clone https://github.com/jean-zay-users/jean-zay-doc.git
 ```
 
-Finally you can just launch the batch job (single GPU) via:
+## Classical examples
+
+For the single GPU job you can do:
 ```
 sbatch jean-zay-doc/examples/tf/mnist_submission_script.slurm
 ```
+
+For the multi GPU job you can do:
+```
+sbatch jean-zay-doc/examples/tf/mnist_submission_script_multi_gpus.slurm
+```
+
+## Dask example
+
+To run the dask example you will additionally need to install `dask-jobqueue` in your environment.
+Note that this time you need to load the python module that ships with tensorflow, because [dask will
+by default use the same python for the workers as the one you used for the
+scheduler](https://jobqueue.dask.org/en/latest/debug.html).
+See this [GitHub issue](https://github.com/dask/dask-jobqueue/issues/408) for more information.
+```
+module load tensorflow-gpu/py3/2.1.0 &&\
+pip install click dask-jobqueue
+```
+
+You can then do:
+```
+python jean-zay-doc/examples/tf/dask_script.py 64
+```
+
+where 64 is the batch size you want to run the mnist example with.
+To run several batch sizes, pass them as space-separated arguments (e.g. `python jean-zay-doc/examples/tf/dask_script.py 32 64 128`).
+
+Be sure to load the tensorflow module before launching the dask script because otherwise Tensorflow will not be loaded.
+This is because the python executable used to launch the dask worker is the same as the one used to launch the scheduler by default.
+You can set it otherwise in the cluster if you want something more tailored.
diff --git a/examples/tf/dask_script.py b/examples/tf/dask_script.py
new file mode 100644
index 0000000..ddf665d
--- /dev/null
+++ b/examples/tf/dask_script.py
@@ -0,0 +1,61 @@
+import click
+from dask.distributed import Client
+from dask_jobqueue import SLURMCluster
+
+from mnist_example import train_dense_model
+
+
+@click.command()
+@click.argument(
+    'batch_sizes',
+    nargs=-1,
+    type=int,
+)
+@click.option(
+    'save',
+    '-s',
+    '--save',
+    is_flag=True,
+    help='Whether you want to save the models or not',
+)
+def launch_dask_tasks(batch_sizes, save):
+    """Train the MNIST example once per batch size, one SLURM GPU job each."""
+    cluster = SLURMCluster(
+        cores=1,
+        job_cpu=10,
+        memory='10GB',
+        job_name='dask_mnist_tf_example',
+        walltime='1:00:00',
+        interface='ib0',
+        job_extra=[
+            '--gres=gpu:1',
+            '--qos=qos_gpu-dev',
+            '--distribution=block:block',
+            '--hint=nomultithread',
+            '--output=%x_%j.out',
+        ],
+    )
+    # one SLURM job (each requesting one GPU) per requested batch size
+    cluster.scale(jobs=len(batch_sizes))
+    print(cluster.job_script())
+
+    # the context managers close the client and cluster even if a task raises
+    with cluster, Client(cluster) as client:
+        futures = [client.submit(
+            train_dense_model,
+            # positional args: cuda_visible_devices, save, batch_size
+            None, save, batch_size,
+            # saving is a side effect, so such tasks must not be treated as pure
+            pure=not save,
+        ) for batch_size in batch_sizes]
+        # gather blocks until all tasks are done and re-raises a task exception
+        job_result = client.gather(futures)
+        if all(job_result):
+            print('All jobs finished without errors')
+        else:
+            print('One job errored out')
+        print('Shutting down dask workers')
+
+
+if __name__ == '__main__':
+    launch_dask_tasks()
diff --git a/examples/tf/mnist_example.py b/examples/tf/mnist_example.py
index 78bd906..66f4ce7 100644
--- a/examples/tf/mnist_example.py
+++ b/examples/tf/mnist_example.py
@@ -1,10 +1,6 @@
 # all taken from https://www.tensorflow.org/guide/keras/functional
-import os
-
 import click
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
+
 
 @click.command()
 @click.option(
@@ -20,8 +16,19 @@
     is_flag=True,
     help='Whether you want to save the model or not',
 )
-def train_dense_model(cuda_visible_devices, save):
+def train_dense_model_click(cuda_visible_devices, save):
+    return train_dense_model(cuda_visible_devices, save, batch_size=64)
+
+
+def train_dense_model(cuda_visible_devices, save, batch_size):
+    # Imports are deferred to call time so this module loads fast under dask;
+    # os is imported unconditionally because the save branch below needs it.
+    import os
+    import tensorflow as tf
+    from tensorflow import keras
+    from tensorflow.keras import layers
+
     if cuda_visible_devices is not None:
         os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices
     # model building
     tf.keras.backend.clear_session()  # For easy reset of notebook state.
@@ -45,7 +52,7 @@ def train_dense_model(cuda_visible_devices, save):
                   optimizer=keras.optimizers.RMSprop(),
                   metrics=['accuracy'])
     history = model.fit(x_train, y_train,
-                        batch_size=64,
+                        batch_size=batch_size,
                         epochs=5,
                         validation_split=0.2)
     test_scores = model.evaluate(x_test, y_test, verbose=2)
@@ -55,6 +62,7 @@ def train_dense_model(cuda_visible_devices, save):
     # saving
     if save:
         model.save(os.environ['SCRATCH'])
+    return True
 
 if __name__ == '__main__':
-    train_dense_model()
+    train_dense_model_click()
diff --git a/examples/tf/mnist_submission_script.slurm b/examples/tf/mnist_submission_script.slurm
index 66a4f38..c84554b 100644
--- a/examples/tf/mnist_submission_script.slurm
+++ b/examples/tf/mnist_submission_script.slurm
@@ -10,6 +10,7 @@
 #SBATCH --time=3:00:00              # maximum execution time (HH:MM:SS)
 #SBATCH --output=tf_mnist%j.out # output file name
 #SBATCH --error=tf_mnist%j.out  # error file name
+#SBATCH --qos=qos_gpu-dev         # we are submitting a test job
 
 set -x
 cd $WORK/jean-zay-doc/examples/tf
diff --git a/examples/tf/mnist_submission_script_multi_gpus.slurm b/examples/tf/mnist_submission_script_multi_gpus.slurm
index 31fa04e..045aa19 100644
--- a/examples/tf/mnist_submission_script_multi_gpus.slurm
+++ b/examples/tf/mnist_submission_script_multi_gpus.slurm
@@ -11,6 +11,7 @@
 #SBATCH --output=tf_mnist_multi_gpus%A_%a.out # output file name
 #SBATCH --error=tf_mnist_multi_gpus%A_%a.out  # error file name
 #SBATCH --array=0-1            # one job array with 2 jobs
+#SBATCH --qos=qos_gpu-dev         # we are submitting a test job
 
 set -x
 cd $WORK/jean-zay-doc/examples/tf