Tf example dask (#30)
zaccharieramzi authored May 12, 2020
1 parent 3684909 commit 9d9d4ff
Showing 6 changed files with 251 additions and 10 deletions.
139 changes: 139 additions & 0 deletions .gitignore
@@ -0,0 +1,139 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
35 changes: 33 additions & 2 deletions examples/tf/README.md
@@ -1,6 +1,6 @@
# Tensorflow example script

To run this script you will need to first install click in your environment.
To run the examples, you will first need to install `click` in your environment.
```
module load python/3.7.5 &&\
pip install click
@@ -12,7 +12,38 @@ cd $WORK &&\
git clone https://github.com/jean-zay-users/jean-zay-doc.git
```

Finally you can just launch the batch job (single GPU) via:
## Classical examples

For the single GPU job you can do:
```
sbatch jean-zay-doc/examples/tf/mnist_submission_script.slurm
```

For the multi GPU job you can do:
```
sbatch jean-zay-doc/examples/tf/mnist_submission_script_multi_gpus.slurm
```

## Dask example

To run the dask example you will additionally need to install `dask-jobqueue` in your environment.
Note that this time you need to use the Python provided by the TensorFlow module, because [dask will
by default use the same Python for the workers as the one you used for the
scheduler](https://jobqueue.dask.org/en/latest/debug.html).
See this [GitHub issue](https://github.com/dask/dask-jobqueue/issues/408) for more information; you can also double-check which interpreter the workers run with the sketch after the install command below.
```
module load tensorflow-gpu/py3/2.1.0 &&\
pip install click dask-jobqueue
```
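
If you want to verify which interpreter the workers actually run, here is a quick sketch (the cluster below is a stripped-down stand-in for the one defined in `dask_script.py`; adapt the resources to your account):
```
import sys

from dask.distributed import Client
from dask_jobqueue import SLURMCluster

# Stripped-down cluster, just for the check.
cluster = SLURMCluster(cores=1, memory='10GB', walltime='0:10:00')
cluster.scale(jobs=1)

client = Client(cluster)
# Ask every worker which Python executable it is running.
print(client.run(lambda: sys.executable))
```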

You can then do:
```
python jean-zay-doc/examples/tf/dask_script.py 64
```

where 64 is the batch size you want to run the MNIST example with.
If you want to try multiple batch sizes, just pass them space-separated, for example `python jean-zay-doc/examples/tf/dask_script.py 32 64 128`.

Be sure to load the TensorFlow module before launching the dask script, otherwise TensorFlow will not be available on the workers.
This is because, by default, the Python executable used to launch the dask workers is the same as the one used to launch the scheduler.
You can override it in the cluster configuration if you want something more tailored, as sketched below.
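
For instance, `dask-jobqueue` accepts a `python` argument that sets the executable used to launch the workers. A minimal sketch, assuming you want to pin a specific interpreter (the path below is only a placeholder, not a real Jean Zay path):
```
from dask_jobqueue import SLURMCluster

# Minimal sketch: force the workers to use a specific Python executable
# instead of the one that launched the scheduler.
cluster = SLURMCluster(
    cores=1,
    memory='10GB',
    walltime='1:00:00',
    # placeholder path: use `which python` after loading the module you want
    python='/path/to/the/python/you/want',
)
```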
61 changes: 61 additions & 0 deletions examples/tf/dask_script.py
@@ -0,0 +1,61 @@
import click
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

from mnist_example import train_dense_model


@click.command()
@click.argument(
    'batch_sizes',
    nargs=-1,
    type=int,
)
@click.option(
    'save',
    '-s',
    '--save',
    is_flag=True,
    help='Whether you want to save the models or not',
)
def launch_dask_tasks(batch_sizes, save):
    job_name = 'dask_mnist_tf_example'

    cluster = SLURMCluster(
        cores=1,
        job_cpu=10,
        memory='10GB',
        job_name=job_name,
        walltime='1:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:1',
            '--qos=qos_gpu-dev',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
    )
    n_jobs = len(batch_sizes)
    cluster.scale(jobs=n_jobs)
    print(cluster.job_script())

    client = Client(cluster)
    futures = [client.submit(
        # function to execute
        train_dense_model,
        # *args
        None, save, batch_size,
        # this function has potential side effects
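        # (pure=False marks the call as impure: dask will not reuse a cached
        # result for an identical call, so the model really gets saved)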
        pure=not save,
    ) for batch_size in batch_sizes]
    job_result = client.gather(futures)
    if all(job_result):
        print('All jobs finished without errors')
    else:
        print('One job errored out')
    print('Shutting down dask workers')


if __name__ == '__main__':
    launch_dask_tasks()
24 changes: 16 additions & 8 deletions examples/tf/mnist_example.py
@@ -1,10 +1,6 @@
# all taken from https://www.tensorflow.org/guide/keras/functional
import os

import click
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


@click.command()
@click.option(
@@ -20,8 +16,19 @@
    is_flag=True,
    help='Whether you want to save the model or not',
)
def train_dense_model(cuda_visible_devices, save):
def train_dense_model_click(cuda_visible_devices, save):
    return train_dense_model(cuda_visible_devices, save, batch_size=64)


def train_dense_model(cuda_visible_devices, save, batch_size):
    # limit imports outside the call to the function, in order to launch quickly
    # when using dask
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers

    if cuda_visible_devices is not None:
        import os
        os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices
    # model building
    tf.keras.backend.clear_session()  # For easy reset of notebook state.
@@ -45,7 +52,7 @@ def train_dense_model(cuda_visible_devices, save):
                  optimizer=keras.optimizers.RMSprop(),
                  metrics=['accuracy'])
    history = model.fit(x_train, y_train,
                        batch_size=64,
                        batch_size=batch_size,
                        epochs=5,
                        validation_split=0.2)
    test_scores = model.evaluate(x_test, y_test, verbose=2)
@@ -55,6 +62,7 @@ def train_dense_model(cuda_visible_devices, save):
    # saving
    if save:
        model.save(os.environ['SCRATCH'])
    return True

if __name__ == '__main__':
    train_dense_model()
    train_dense_model_click()
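
With this split, the click wrapper stays the command-line entry point while the plain `train_dense_model` function can be imported and called directly, which is what the dask script above relies on. A quick illustration, assuming the TensorFlow module is loaded and you are in `examples/tf` (the values are only examples):
```
from mnist_example import train_dense_model

# Call the training function directly, bypassing click:
# no CUDA_VISIBLE_DEVICES override, no model saving, batch size 32.
ok = train_dense_model(None, False, 32)
print(ok)  # True once training and evaluation have finished
```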
1 change: 1 addition & 0 deletions examples/tf/mnist_submission_script.slurm
@@ -10,6 +10,7 @@
#SBATCH --time=3:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=tf_mnist%j.out # output file name
#SBATCH --error=tf_mnist%j.out # error file name
#SBATCH --qos=qos_gpu-dev # we are submitting a test job

set -x
cd $WORK/jean-zay-doc/examples/tf
1 change: 1 addition & 0 deletions examples/tf/mnist_submission_script_multi_gpus.slurm
@@ -11,6 +11,7 @@
#SBATCH --output=tf_mnist_multi_gpus%A_%a.out # output file name
#SBATCH --error=tf_mnist_multi_gpus%A_%a.out # error file name
#SBATCH --array=0-1 # one job array with 2 jobs
#SBATCH --qos=qos_gpu-dev # we are submitting a test job

set -x
cd $WORK/jean-zay-doc/examples/tf
