
gihanpanapitiya patch 1 #95

Open
wants to merge 655 commits into base: master

655 commits
4750804
Simplify PID handling
j-woz Dec 23, 2022
d0cf3c9
Update Swift/T for Polaris
j-woz Dec 23, 2022
1242847
Propagate CANDLE/IMPROVE settings
j-woz Dec 23, 2022
ecad83b
Spelling
j-woz Dec 23, 2022
d6482d0
Fix header
j-woz Dec 23, 2022
00c3add
Better PARAM_SET_FILE handling
j-woz Dec 23, 2022
8310f24
Do not need PYTHONPATH setting here for Polaris
j-woz Dec 23, 2022
3c4f7e8
Initial test for GraphDRP
j-woz Dec 23, 2022
c57ca41
Clean up and support IMPROVE
j-woz Dec 23, 2022
aecff19
New set-pythonpath to set the PYTHONPATH
j-woz Dec 23, 2022
da919ea
Fix header
j-woz Dec 23, 2022
b1e89dd
Fix PPN for single-node runs
j-woz Dec 23, 2022
62d38e0
Clean up
j-woz Dec 23, 2022
602d3a6
Add node/rank log model.log
j-woz Dec 23, 2022
1530cfd
Add better mlrMBO output
j-woz Dec 23, 2022
b82668f
Fix comments
j-woz Dec 23, 2022
1330abc
Make GDRP a little bigger
Dec 31, 2022
be3b526
Move APP_PYTHONPATH logic
j-woz Jan 9, 2023
2dbd5cb
o working version on lambda0
rajeeja Jan 11, 2023
9863d05
Configure EQ-Py on Lambda
j-woz Jan 11, 2023
554a328
Add Lambda to list of systems with Swift/T PYTHONPATH
j-woz Jan 11, 2023
97b67b4
o Changes for GA
rajeeja Jan 11, 2023
627483f
Drop known-benchmarks - see set-pythonpath.sh
j-woz Jan 11, 2023
f904a5d
o Run one job at a time and don't use Benchmarks/common for candle lib
rajeeja Jan 11, 2023
c790d40
Merge branch 'develop' of https://github.com/ECP-CANDLE/Supervisor in…
rajeeja Jan 11, 2023
85b126b
Improve log_path() for unset variables
j-woz Jan 11, 2023
105a04e
Initial CANDLE-compliant model
j-woz Jan 12, 2023
670bc00
First model call works
j-woz Jan 12, 2023
84ad741
o Moving towards containers (Singularity) for GA
rajeeja Jan 13, 2023
3df1628
o Add GraphDRP Singularity with GA workflow
rajeeja Jan 15, 2023
e9baca4
o GA problem for Polaris GraphDRP
Jan 17, 2023
2d1ce31
Provide EQ/Py; export PYTHONPATH
j-woz Jan 18, 2023
889ccb2
Update header
j-woz Jan 18, 2023
7557d31
Shorten job name to fit for PBS
j-woz Jan 18, 2023
861fa8d
Use single-node job for debugging
j-woz Jan 18, 2023
17734e8
o Increase time and make epochs constant for test runs on polaris
Jan 18, 2023
b9b5471
o Add One D test model
rajeeja Jan 19, 2023
3beb13c
o Bigger GraphDRP HPO setup
Jan 19, 2023
f7a9088
o Few fixes, need more epoch stuff with history object
rajeeja Jan 19, 2023
16a0f19
o Add R file and fix one baseline run, mlrMBO still has some Error in…
rajeeja Jan 20, 2023
440ecce
o Working version of oneD, needed to remove impute.y.fun and transfor…
rajeeja Jan 24, 2023
d6a191f
o OneDim problem now works with GA also.
rajeeja Jan 24, 2023
554cbf5
Report critical paths
j-woz Jan 24, 2023
4a883ff
o Add python settings
rajeeja Jan 24, 2023
8d33489
o bring back impute, still avoid transformation(MBO)
rajeeja Jan 25, 2023
4ee1a59
Merge - start on extracting learning rate (lr)
j-woz Jan 25, 2023
f291302
Merge in branch jpg_crusher
j-woz Feb 7, 2023
931059d
Handle errors in workflow.sh parameters
j-woz Feb 8, 2023
a031e6f
Clean up comments
j-woz Feb 8, 2023
67a70da
Add comment about additional env variables
j-woz Feb 8, 2023
7193674
Don't run UPF in mode for SINGULARITY by default
j-woz Feb 8, 2023
574acad
Attempt to fix error capture
j-woz Feb 8, 2023
1dcb5cd
Add more comments, etc.
j-woz Feb 13, 2023
1c565b0
Better log parsing
j-woz Feb 13, 2023
6d3be63
Merge branch 'develop' of github.com:ECP-CANDLE/Supervisor into develop
j-woz Feb 13, 2023
5e9e6bc
WIP Comparator
j-woz Feb 14, 2023
9fc53e0
Basic Crusher tests
j-woz Feb 14, 2023
5598477
Settings for Crusher
j-woz Feb 14, 2023
d958ae4
New utility scripts
j-woz Feb 14, 2023
5f4da72
Add usage note
j-woz Feb 15, 2023
61105c7
Check for python
j-woz Feb 15, 2023
a25a41e
Fix output file location
j-woz Feb 15, 2023
dcccde7
Support node selection in print-node-info
j-woz Feb 16, 2023
3317309
Document node selection
j-woz Feb 16, 2023
b0c4345
Merge branch 'develop' of github.com:ECP-CANDLE/Supervisor into develop
j-woz Feb 16, 2023
8164bff
Handle user error
j-woz Feb 16, 2023
2b0f4dd
Fix comment
j-woz Feb 17, 2023
d592047
Quick restart example
j-woz Feb 17, 2023
ba7f98b
WS
j-woz Feb 17, 2023
d50f77f
Zero-pad node IDs
j-woz Feb 17, 2023
097cc92
Remove debug output
j-woz Feb 17, 2023
756efaa
Merge branch 'develop' of github.com:ECP-CANDLE/Supervisor into develop
j-woz Feb 17, 2023
f91c08f
Fix parent weights location
j-woz Feb 20, 2023
d81a7c5
Drop obj_prio()
j-woz Feb 21, 2023
f0ffa6b
Log the result from model.sh
j-woz Feb 21, 2023
8f265c8
Fix call signature between obj_app and obj_py
j-woz Feb 21, 2023
0cce568
Fix return if NVM is not enabled
j-woz Feb 21, 2023
e5b2133
Fix out-*.txt if not on Summit
j-woz Feb 21, 2023
54d31e5
Better error handling
j-woz Feb 21, 2023
3320349
Simple test for Lambda
j-woz Feb 21, 2023
8fc7ca6
Draft settings for Frontier
j-woz Feb 22, 2023
5b4b269
Drop reference to EQR here
j-woz Feb 22, 2023
2d380a8
Fix bad merge
j-woz Feb 22, 2023
7f1aa8f
Add PY for Frontier
j-woz Feb 22, 2023
793e0cc
Fix obj_py() for iter_indiv_id
j-woz Feb 22, 2023
9cbaf4d
Fix bad merge
j-woz Feb 22, 2023
a1a59b1
Turn off UNBUFFERED
j-woz Feb 22, 2023
a03387d
Fixes for Frontier
j-woz Feb 23, 2023
93648e5
Fixes for Frontier
j-woz Feb 23, 2023
6662ba3
Default to 1 hour walltime
j-woz Feb 23, 2023
e751f88
Set GPUs
j-woz Feb 25, 2023
8e2481d
Add Uno to PYTHONPATH
j-woz Feb 25, 2023
1212fc2
PYTHONPATH setting for Frontier
j-woz Feb 25, 2023
d455ecd
Add warning message
j-woz Feb 25, 2023
b057e4c
Use smaller job name
j-woz Feb 25, 2023
a63c999
Set default PROCS, PPN to 8 on Frontier
j-woz Feb 25, 2023
e717b6f
Record new PLAN_JSON
j-woz Feb 25, 2023
c02933a
Set default walltime to 1h
j-woz Feb 25, 2023
d88e64e
Check for directory
j-woz Feb 25, 2023
1f316ac
Fail fast if CANDLE_DATA_DIR is not set
j-woz Feb 25, 2023
8886f6c
Handle normal timeouts on Frontier
j-woz Feb 27, 2023
6e363a0
Be more verbose
j-woz Feb 27, 2023
de17c76
Remove generated training data by default
j-woz Feb 27, 2023
38bc5e9
Clean up
j-woz Feb 27, 2023
5507a76
Support big plan N=16
j-woz Feb 27, 2023
42fce3d
Support multi-digit node components
j-woz Feb 27, 2023
642d944
sbcast test works
j-woz Mar 1, 2023
180d804
Adding hook-1.tcl mpi-io.sh
j-woz Mar 1, 2023
e8a2868
Adding README.adoc
j-woz Mar 1, 2023
2e0c410
Respect user TURBINE_PRELAUNCH
j-woz Mar 1, 2023
a811803
Say "guide"
j-woz Mar 1, 2023
a880664
o Few steps to get the comparator workflow working
rajeeja Mar 4, 2023
9cea247
Adding cmp-cv/swift/workflow.sh
j-woz Mar 8, 2023
bf00a43
Adding cmp-cv/swift/workflow.swift
j-woz Mar 8, 2023
83e628c
Initial structure
j-woz Mar 8, 2023
9454737
o Use CANDLE_OUTPUT_DIR as this is a required variable for sending o/…
rajeeja Mar 15, 2023
3b1c4c7
o Add export of CANDLE_OUTPUT_DIR to workflow.sh for non SINGULARITY …
rajeeja Mar 15, 2023
bf222c0
o Use model.sh to populate CANDLE_OUTPUT_DIR
rajeeja Mar 15, 2023
f0e4eae
WIP cmp-cv workflow
j-woz Mar 16, 2023
853b065
workflow to find domain errors
Mar 22, 2023
33d0ca2
readme updated
Mar 22, 2023
cd127f3
readme updated
Mar 22, 2023
458581f
readme updated
Mar 22, 2023
50f55ef
readme updated
Mar 22, 2023
c048739
instructions to run the examplle
Mar 22, 2023
bceb974
Update README.adoc
gihanpanapitiya Mar 22, 2023
f7d99f0
Update README.adoc
gihanpanapitiya Mar 22, 2023
3d7e503
Update README.adoc
gihanpanapitiya Mar 22, 2023
fad4c98
Update model.sh
gihanpanapitiya Mar 22, 2023
72ea275
model.sh, obj_app.swift
Mar 23, 2023
dfb18a9
obj_container
Mar 23, 2023
8e89a8c
about compare.py
Mar 24, 2023
a3ba809
Merge pull request #94 from gihanpanapitiya/gihan_cmp
rajeeja Mar 24, 2023
3a8779e
o Fix comments formatting etc.
rajeeja Mar 24, 2023
c7786d8
o set SWIFT_IMPL, more fixes needed for CDD and singularity runs
rajeeja Mar 24, 2023
d6005e9
o Try to run singularity workflows
rajeeja Mar 24, 2023
987cece
o get candle model type from swift file
rajeeja Mar 25, 2023
2fded36
o change the default setting logic for CMT
rajeeja Mar 25, 2023
5205a2d
o set model name to cmp if none specified for comparison workflows
rajeeja Mar 25, 2023
341367e
o Fix flags for cmp workflow and singularity
rajeeja Mar 25, 2023
e8a4e6d
o Changes for Mac OSX M1 and also fixes for GA test-1 that would fix …
rajeeja Apr 3, 2023
7e1f810
o Remove hard coded name
rajeeja Apr 3, 2023
691e659
o Get cmp-cv to run
Apr 4, 2023
43b1ede
Merge branch 'develop' of https://github.com/ECP-CANDLE/Supervisor in…
Apr 4, 2023
e6684e6
Do not quote strings
j-woz Apr 4, 2023
25eceae
WS
j-woz Apr 4, 2023
e7f08af
Add numbers
j-woz Apr 4, 2023
4a31351
o Write empty results.txt if nothing found, add more Params to singul…
Apr 5, 2023
61d14e3
o Move compare.py to cmp-cv workflow level
Apr 5, 2023
a842222
o enable drug_features.csv, a manual process to put the file in CDD f…
Apr 6, 2023
2c100d7
o Some fixes to the noise workflow
rajeeja Apr 12, 2023
324d2ac
o Add new workflow cross-study-generalization CSG, also add new param…
Apr 12, 2023
5833ddd
o Fix args
Apr 12, 2023
614ef9f
o Add comments
Apr 12, 2023
8ae9d8b
o Fixes for various workflows GA, Noise, CSG on site Polaris ANL
Apr 12, 2023
f51f443
Initial dense-noise workflow
j-woz Apr 18, 2023
c54222c
Call everything CANDLE_MODEL_IMPL, "container" is now one of these
j-woz Apr 25, 2023
c64b70b
Update Swift/T/Frontier
j-woz Apr 26, 2023
beb84f2
Better messaging on Frontier
j-woz Apr 26, 2023
b675a35
Fix error message
j-woz Apr 26, 2023
1c80ec1
More checks
j-woz Apr 26, 2023
9214ee7
Shorten job name on Frontier
j-woz Apr 26, 2023
4edc456
WS
j-woz Apr 26, 2023
f1a4960
Update function name
j-woz Apr 26, 2023
8f23034
Support for model.log
j-woz May 3, 2023
0d9efa8
o obj is not called candle_model_train with an extra argument model_name
rajeeja May 3, 2023
713dbb3
o fix duration variable not declared error
rajeeja May 3, 2023
495fbf5
o Change workflow for GA with new args for candle_model_train
rajeeja May 3, 2023
d7b01fa
Fixes for Frontier
j-woz May 8, 2023
2fef679
Merge branch 'develop' of github.com:ECP-CANDLE/Supervisor into develop
j-woz May 8, 2023
a90343b
Merge
j-woz May 8, 2023
ec7752c
Use new Swift/T
j-woz May 11, 2023
615d6cf
Use plan file from NVMe
j-woz May 11, 2023
193a34c
o Fix things as per new defs
May 12, 2023
b9bd396
o Fix cmp-cv as per candle_model..
May 12, 2023
eaab019
Handle more errors
j-woz May 17, 2023
e618201
Try new Swift/T
j-woz May 17, 2023
9ad1888
Merge branch 'develop' of github.com:ECP-CANDLE/Supervisor into develop
j-woz May 17, 2023
07a750a
Backup DB and its log
j-woz May 17, 2023
39451c3
Better output and logging
j-woz May 17, 2023
b9adec3
Better output and logging
j-woz May 17, 2023
8e59501
Minor changes
j-woz May 17, 2023
2c4f786
Update for new log format
j-woz May 17, 2023
424d085
Better logging
j-woz May 22, 2023
14fac21
Merge branch 'develop' of github.com:ECP-CANDLE/Supervisor into develop
j-woz May 22, 2023
b2ada6e
Merge
j-woz May 22, 2023
0381057
Set default timeout
j-woz May 23, 2023
bf7432d
New Swift/T
j-woz May 24, 2023
1e986ec
Update header
j-woz May 24, 2023
89e744d
Update to new names
j-woz May 24, 2023
f8c95f4
Back to CANDLE_MODEL_IMPL="container"
j-woz May 24, 2023
36ba170
Merge
j-woz May 24, 2023
a9ecbc7
Set CANDLE_MODEL_IMPL
j-woz May 24, 2023
2c4e0cb
Reduce get_expid arguments
j-woz May 24, 2023
f447109
Improve error message
j-woz May 24, 2023
95b98ca
Fix usage message
j-woz May 24, 2023
48417b8
Set PARAM_SET_FILE for now
j-woz May 24, 2023
feff4e2
Rename to MODEL_RETURN
j-woz May 24, 2023
fac09e4
Provide PARAM_SET_FILE=graphdrp_small.R
j-woz May 24, 2023
1369a4f
Rename to MODEL_RETURN
j-woz May 24, 2023
39c9dc2
Set MODEL_RETURN
j-woz May 24, 2023
b7c8428
Auto-set CANDLE_MODEL_IMPL="container" when CANDLE_MODEL_TYPE==SINGUL…
j-woz May 24, 2023
dc4abf1
Readability improvement; drop TURBINE_STDOUT for now
j-woz May 24, 2023
aa09e78
Change back to IMPROVE_RESULT for now
j-woz May 25, 2023
ddbbdac
Fix experiments directory for Singularity runs
j-woz May 26, 2023
6d62bdd
Settings for Lambda7
May 26, 2023
c1c02d0
Merge branch 'develop' of github.com:ECP-CANDLE/Supervisor into develop
j-woz May 26, 2023
044ecb2
Fix syntax
May 26, 2023
83b80d6
New test from Wilke
j-woz May 26, 2023
0fdd2cc
Adding random_baseline_keras2.py
j-woz May 30, 2023
8fc43d7
get_expid() takes 1 argument
j-woz May 30, 2023
35622c6
Clean up
j-woz May 30, 2023
92e45fe
Allow this to be unset - user can set it
j-woz May 30, 2023
df4f3ac
Settings for Lambda7
j-woz May 30, 2023
8e40608
Better structure when not using a Benchmark; add Random
j-woz May 30, 2023
90ed602
Better messaging and comments
j-woz May 30, 2023
13d7d72
Update names
j-woz May 30, 2023
8d7b0f2
New generic test for GA
j-woz May 30, 2023
8a9dff0
Clean up
j-woz May 30, 2023
558af6e
Remove PARAM_SET_FILE
j-woz May 30, 2023
ae05b5b
Add nice output log
j-woz May 30, 2023
4a3ba55
Add human-readable report at end
j-woz May 30, 2023
1741625
Prevent "SUCCESS" on job failure
j-woz May 30, 2023
68a8d0d
Clean up
j-woz May 30, 2023
5695aa6
Update names
j-woz May 30, 2023
04a1b7f
Fix typo
j-woz May 30, 2023
d17d647
New GA test for SIFs
j-woz May 30, 2023
0e8e840
New param space for HiDRA
j-woz May 31, 2023
9cf04f2
Probably better error handling
j-woz May 31, 2023
ba12653
Better final output
j-woz May 31, 2023
e85c949
Update header
j-woz Jun 1, 2023
16ffa52
Add an iteration report
j-woz Jun 1, 2023
c1df72c
Merge
j-woz Jun 1, 2023
c2ac94c
Merge
j-woz Jun 1, 2023
2db6133
Adding data/paccman_param_space.json data/graphdrp_param_space.json
j-woz Jun 2, 2023
326fe31
Adding data/paccmann_param_space.json
j-woz Jun 2, 2023
1d6d108
o Add IGTD for lambda
Jun 3, 2023
1fede6f
Drop- misspelled
j-woz Jun 5, 2023
3859ef4
Use modern Swift
j-woz Jun 5, 2023
d6a122c
Initial tests for Paccmann and tCNNS
j-woz Jun 5, 2023
9c1b24b
Merge branch 'develop' of github.com:ECP-CANDLE/Supervisor into develop
j-woz Jun 5, 2023
7d5f299
Adding data/tcnns_param_space.json
j-woz Jun 5, 2023
bc7ba09
Enable error handling in DEAP
j-woz Jun 6, 2023
7cd829f
Merge
j-woz Jun 6, 2023
5be9fb4
New Swift/T for Polaris
j-woz Jun 6, 2023
ab525e1
Clean up
j-woz Jun 6, 2023
0f07c1f
Ignore errors in tCNNS
j-woz Jun 6, 2023
20455ab
Support random fake crashes
j-woz Jun 6, 2023
a410b6c
Enable TURBINE_STDOUT for cmp-cv
j-woz Jun 9, 2023
522e0dc
Update compare.py
gihanpanapitiya Jun 9, 2023
15 changes: 15 additions & 0 deletions .github/workflows/pre-commit.yml
@@ -0,0 +1,15 @@
name: pre-commit

on:
  pull_request:
  push:
    branches:
      - master

jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/[email protected]
      - uses: pre-commit/[email protected]
22 changes: 22 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,22 @@
repos:
  - repo: https://github.com/pre-commit/mirrors-yapf # To format the code to conform to YAPF
    rev: v0.31.0
    hooks:
      - id: yapf
        args: ['--in-place', '--recursive', '--style', 'google']

  - repo: https://github.com/myint/docformatter # To format the docstrings to conform to PEP 257
    rev: v1.4
    hooks:
      - id: docformatter
        args: [--in-place]

  - repo: https://github.com/pre-commit/pre-commit-hooks # Some common pre-commit hooks
    rev: v3.4.0
    hooks:
      - id: check-yaml # Checks the syntax of .yaml files.
        args: [--allow-multiple-documents]
        exclude: 'meta.yaml' # Excluded because the '%' in line 1 causes an error that has not been fixed yet
      - id: end-of-file-fixer # Makes sure files end with a newline.
      - id: trailing-whitespace # Checks for any tabs or spaces after the last non-whitespace character on the line.
      - id: check-docstring-first # Checks that code comes after the docstrings.
38 changes: 38 additions & 0 deletions README.adoc
@@ -1 +1,39 @@
See the https://ecp-candle.github.io/Supervisor/home.html[Home Page] for more information.

# Running the feature-domain-based comparison

- Create the CANDLE_DATA_DIR. Place drug_features.csv in the CANDLE_DATA_DIR.
- drug_features.csv should contain the drug features of at least the test-set drug molecules.
- The paths of the model directories have to be added to the PYTHONPATH in workflow.sh.
- Start the run using the command ./test-small-1.sh SITE, where SITE is the name of the computing system. test-small-1.sh is at workflows/cmp-cv/test.
- upf-1.txt is used as the input file to specify the model hyperparameters as well as the model name and candle_image location.

```
{"id": "RUN000", "epochs": 1, "model_name": "DrugCell", "candle_image": "/path/to/sif/DrugCell.sif"}
{"id": "RUN001", "epochs": 2, "model_name": "DrugCell", "candle_image": "/path/to/sif/DrugCell.sif"}
{"id": "RUN002", "epochs": 1, "model_name": "SWnet_CCLE", "candle_image": "/path/to/sif/SWnet.sif"}
{"id": "RUN003", "epochs": 2, "model_name": "SWnet_CCLE", "candle_image": "/path/to/sif/SWnet.sif"}
```
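Each line of upf-1.txt is a standalone JSON object describing one run. As a minimal sketch of how such a file can be consumed (this is illustrative only, not the Supervisor implementation; the function name `read_upf` is made up):

```python
import json

def read_upf(path):
    """Parse an unrolled parameter file (UPF): one JSON object per line.

    Returns a list of dicts, one per run, e.g. with keys "id", "epochs",
    "model_name", and "candle_image".
    """
    runs = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines
            runs.append(json.loads(line))
    return runs
```

Each resulting dict then supplies the hyperparameters and container location for one model invocation.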

### Running the specific example at workflows/cmp-cv/test

- Clone Supervisor from https://github.com/ECP-CANDLE/Supervisor
- Clone the DrugCell and SWnet model directories from https://github.com/gihanpanapitiya/DrugCell/tree/to_candle and https://github.com/gihanpanapitiya/SWnet/tree/to_candle
- Check out the to_candle branches and create the Singularity containers (.sif files) using the commands:

```
singularity build --fakeroot /path/for/sif/DrugCell.sif /path/to/DrugCell.def
singularity build --fakeroot /path/for/sif/SWnet.sif /path/to/SWnet.def
```

- Add /path/for/sif/DrugCell.sif and /path/for/sif/SWnet.sif to the PYTHONPATH in workflow.sh
- Create the CANDLE_DATA_DIR. Place drug_features.csv in the CANDLE_DATA_DIR
- Run the command ./test-small-1.sh SITE


#### Known issues

- Some input files required for analysis have to be added manually to the CANDLE_DATA_DIR
- Outputs get written to 'experiments', not the CANDLE_DATA_DIR
- Python paths have to be specified explicitly in workflow.sh
- The Singularity container is not used even though CANDLE_MODEL_TYPE=SINGULARITY is specified
6 changes: 3 additions & 3 deletions archives/py-loc/p.swift
@@ -5,11 +5,11 @@ import location;

L0 = locationFromRank(0);
L1 = locationFromRank(1);

@location=L0 python_persist("L = []");
@location=L1 python_persist("L = []");
string D[];
foreach j in [0:9] {
L = locationFromRank(j%%2);
D[j] = @location=L python_persist("L.append(repr(2+%i)) " % j);
}
22 changes: 11 additions & 11 deletions archives/templates/README.md
@@ -16,17 +16,17 @@ In more detail, here are the steps required for running an arbitrary workflow on
1. Ensure the `$SITE` and `$CANDLE` variables are exported to the environment as specified [here](#CANDLE-settings-at-different-SITEs).
1. Copy the submission script `$CANDLE/Supervisor/templates/submit_candle_job.sh` to a working directory.
1. Specify the model in the submission script:
1. Set the `$MODEL_PYTHON_SCRIPT` variable to one of the models in the `$CANDLE/Supervisor/templates/models` directory (currently either "resnet", "unet", "uno", or "mnist_mlp"). Or, specify your own [CANDLE-compliant](https://ecp-candle.github.io/Candle/html/tutorials/writing_candle_code.html) Python model by setting both the `$MODEL_PYTHON_DIR` and `$MODEL_PYTHON_SCRIPT` variables as appropriate.
1. Specify the corresponding default model parameters by setting the `$DEFAULT_PARAMS_FILE` variable to one of the files in the `$CANDLE/Supervisor/templates/model_params` directory. Or, copy one of these template files to the working directory, modify it accordingly, and point the `$DEFAULT_PARAMS_FILE` variable to this file.
1. Specify the workflow in the submission script:
1. Set the `$WORKFLOW_TYPE` variable as appropriate (currently supported are "upf" and, to a less-tested extent, "mlrMBO").
1. Specify the corresponding workflow settings by setting the `$WORKFLOW_SETTINGS_FILE` variable to one of the files in the `$CANDLE/Supervisor/templates/workflow_settings` directory. Or, copy one of these template files to the working directory, modify it accordingly, and point the `$WORKFLOW_SETTINGS_FILE` variable to this file.
1. Adjust any other variables in the submission script such as the output directory (specified by `$EXPERIMENTS`), the scheduler settings, etc.
1. Run the script from a submit node like `./submit_candle_job.sh`.

## Background

In general, it would be nice to allow for an arbitrary model (U-Net, ResNet, etc.) to be run using an arbitrary workflow (UPF, mlrMBO, etc.), all in an external working directory. For example, here is a sample submission script:

```bash
#!/bin/bash
@@ -60,13 +60,13 @@ export WORKFLOW_SETTINGS_FILE="/home/weismanal/notebook/2019-02-28/unet/upf1.txt
$CANDLE/Supervisor/workflows/$WORKFLOW_TYPE/swift/workflow.sh $SITE -a $CANDLE/Supervisor/workflows/common/sh/cfg-sys-$SITE.sh $WORKFLOW_SETTINGS_FILE
```

When this script is run (no arguments accepted) on a Biowulf submit node, the file `$MODEL_PYTHON_DIR/$MODEL_PYTHON_SCRIPT.py`, which must be [CANDLE-compliant](https://ecp-candle.github.io/Candle/html/tutorials/writing_candle_code.html), will be run using the default parameters specified in `$DEFAULT_PARAMS_FILE`. The CANDLE workflow used will be UPF (specified by `$WORKFLOW_TYPE`) and will be run using the parameters specified in `$WORKFLOW_SETTINGS_FILE`. The results of the job will be output in `$EXPERIMENTS`. Note that we can choose a different workflow by simply changing the value of the `$WORKFLOW_TYPE` variable, e.g.,

```bash
export WORKFLOW_TYPE="mlrMBO"
```

In the sample submission script above, the Python script containing the model (my_specialized_unet.py), the default model parameters (default_params.txt), and the unrolled parameter file (upf1.txt) are all specified in the "unet" subdirectory of the working directory "/home/weismanal/notebook/2019-02-28". However, often a model, its default parameters, and a workflow's settings can be reused.

Thus, we provide templates of these three types of files in the `$CANDLE/Supervisor/templates` directory, the current structure of which is:

@@ -102,7 +102,7 @@ export WORKFLOW_SETTINGS_FILE="/home/weismanal/notebook/2019-02-28/unet/upf1.txt
export WORKFLOW_SETTINGS_FILE="$CANDLE/Supervisor/templates/workflow_settings/upf1.txt"
```

The template submission script located at `$CANDLE/Supervisor/templates/submit_candle_job.sh` utilizes all three of these types of templates and will just work (running an HPO on the MNIST dataset) as long as the `$CANDLE` and `$SITE` variables are set correctly.

## Notes

@@ -119,10 +119,10 @@ mymodel_common = candle.Benchmark(file_path, os.getenv("DEFAULT_PARAMS_FILE"), '

I'd recommend this be added to the standard method for making a model [CANDLE-compliant](https://ecp-candle.github.io/Candle/html/tutorials/writing_candle_code.html).

Note further that `$DEFAULT_PARAMS_FILE` must be a full pathname. Otherwise, if we just used the filename "default_params.txt" hardcoded into the `$MODEL_PYTHON_SCRIPT`, the script would look for this global parameter file in the same directory that it's in (i.e., `$MODEL_PYTHON_DIR`), but that would preclude using a `$MODEL_PYTHON_SCRIPT` that's a symbolic link. In that case, we'd have to always copy the `$MODEL_PYTHON_SCRIPT` to the current working directory, which is inefficient because this leads to unnecessary duplication of code.
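The symbolic-link point can be seen in a small runnable sketch (the paths and file names below are hypothetical, made up for illustration):

```python
import os
import tempfile

# Hypothetical layout: the symlink lives in the working directory, while the
# real script lives in $MODEL_PYTHON_DIR.
workdir = tempfile.mkdtemp()   # stands in for the user's working directory
modeldir = tempfile.mkdtemp()  # stands in for $MODEL_PYTHON_DIR
script = os.path.join(modeldir, "model.py")
open(script, "w").close()
link = os.path.join(workdir, "model.py")
os.symlink(script, link)

# The common idiom for locating files "next to" a script follows symlinks:
resolved_dir = os.path.dirname(os.path.realpath(link))

# So a hardcoded relative "default_params.txt" would be looked up in
# $MODEL_PYTHON_DIR, not in the working directory where the symlink sits:
assert resolved_dir == os.path.realpath(modeldir)
assert resolved_dir != os.path.realpath(workdir)
```

This is why passing a full pathname via `$DEFAULT_PARAMS_FILE` avoids copying the script into every working directory.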

### CANDLE settings at different SITEs

| `$SITE` | `$CANDLE` |
| :-----: | :--------------------------: |
| biowulf | /data/BIDS-HPC/public/candle |
2 changes: 1 addition & 1 deletion archives/templates/language_agnostic/submit_candle_job.sh
@@ -8,7 +8,7 @@ export SITE="biowulf"
# Job specification
export EXPERIMENTS="$MY_DIR"
#TODO GZ: These 2 variables are not needed
export MODEL_NAME="mnist_upf_test"
export OBJ_RETURN="val_loss"

# Scheduler settings
8 changes: 4 additions & 4 deletions archives/templates/language_agnostic/train_model.py
@@ -1,8 +1,8 @@
import os
import pickle
import random
import sys

# Generate a random loss function
print(str(sys.argv))
print(random.uniform(0, 1))
2 changes: 1 addition & 1 deletion archives/templates/model_params/mnist1.txt
@@ -3,4 +3,4 @@ epochs=20
batch_size=128
activation='relu'
optimizer='rmsprop'
num_filters=32
2 changes: 1 addition & 1 deletion archives/templates/model_params/uno1.txt
@@ -51,4 +51,4 @@ use_landmark_genes = True
validation_split = 0.2
verbose = None
warmup_lr = False
save='save/uno'
12 changes: 7 additions & 5 deletions archives/templates/models/mnist/mnist.py
@@ -1,13 +1,14 @@
# add candle_keras library in path
candle_lib = "/data/BIDS-HPC/public/candle/Candle/common"
import sys

sys.path.append(candle_lib)

import os

# import sys
file_path = os.path.dirname(os.path.realpath(__file__))
lib_path = os.path.abspath(os.path.join(file_path, "..", "..", "common"))
sys.path.append(lib_path)

import candle_keras as candle
@@ -19,10 +20,11 @@
additional_definitions = None
required = None


class MNIST(candle.Benchmark):

def set_locals(self):
if required is not None:
self.required = set(required)
if additional_definitions is not None:
self.additional_definitions = additional_definitions

75 changes: 41 additions & 34 deletions archives/templates/models/mnist/mnist_mlp.py
@@ -1,62 +1,64 @@
import os

import mnist
from keras import backend as K
from keras.callbacks import CSVLogger


def initialize_parameters():
mnist_common = mnist.MNIST(
mnist.file_path,
os.getenv("DEFAULT_PARAMS_FILE"),
"keras",
prog="mnist_mlp",
desc="MNIST example",
)

import candle_keras as candle

# Initialize parameters
gParameters = candle.initialize_parameters(mnist_common)
csv_logger = CSVLogger("{}/params.log".format(gParameters))

return gParameters


def run(gParameters):
##########################################
# Your DL start here. See mnist_mlp.py #
##########################################
"""Trains a simple deep NN on the MNIST dataset.

Gets to 98.40% test accuracy after 20 epochs (there is *a lot* of
margin for parameter tuning). 2 seconds per epoch on a K520 GPU.
"""

# from __future__ import print_function

import keras
from keras.datasets import mnist
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizers import RMSprop

batch_size = gParameters["batch_size"]
num_classes = 10
epochs = gParameters["epochs"]

activation = gParameters["activation"]
optimizer = gParameters["optimizer"]

# the data, split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

x_train = x_train.reshape(60000, 784)
x_test = x_test.reshape(10000, 784)
x_train = x_train.astype("float32")
x_test = x_test.astype("float32")
x_train /= 255
x_test /= 255
print(x_train.shape[0], "train samples")
print(x_test.shape[0], "test samples")

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
@@ -67,32 +69,37 @@ def run(gParameters):
model.add(Dropout(0.2))
model.add(Dense(512, activation=activation))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation="softmax"))

model.summary()

model.compile(loss="categorical_crossentropy",
optimizer=optimizer,
metrics=["accuracy"])

history = model.fit(
x_train,
y_train,
batch_size=batch_size,
epochs=epochs,
verbose=1,
validation_data=(x_test, y_test),
)
score = model.evaluate(x_test, y_test, verbose=0)
print("Test loss:", score[0])
print("Test accuracy:", score[1])
##########################################
# End of mnist_mlp.py ####################
##########################################
return history


def main():
gParameters = initialize_parameters()
run(gParameters)


if __name__ == "__main__":
main()
try:
K.clear_session()