Merge pull request #14 from picciama/feature/curvecurator_module
Feature/curvecurator module
JudithBernett authored Jan 30, 2025
2 parents 9f502d6 + 1176dc4 commit f69bf66
Showing 19 changed files with 272 additions and 23 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -14,6 +14,7 @@ Initial release of nf-core/drugresponseeval, created with the [nf-core](https://
- Added the docker container and the conda env.yml in the nextflow.config. We just need one container for all
processes as this pipeline automates the PyPI package drevalpy.
- Added usage and output documentation.
- Added CurveCurator to preprocess curves of custom datasets.

### `Fixed`

4 changes: 4 additions & 0 deletions CITATIONS.md
@@ -18,6 +18,10 @@

> Bernett, J, Iversen, P, Picciani, M, Wilhelm, M, Baum, K, List, M. Will be published soon.
- [CurveCurator](https://www.nature.com/articles/s41467-023-43696-z): For custom curve fitting on custom datasets. We also used it to re-process the response curves of GDSC1, GDSC2, CCLE, and CTRP.

> Bayer, F.P., Gander, M., Kuster, B., The, M. CurveCurator: a recalibrated F-statistic to assess, classify, and explore significance of dose–response curves. Nature Communications. 2023 Nov;14(7902).
- [DIPK](https://doi.org/10.1093/bib/bbae153): Implemented model in the pipeline.

> Li P, Jiang Z, Liu T, Liu X, Qiao H, Yao X. Improving drug response prediction via integrating gene relationships with deep learning. Briefings in Bioinformatics. 2024 May;25(3):bbae153.
10 changes: 8 additions & 2 deletions bin/load_response.py
@@ -15,12 +15,18 @@ def get_parser():
help="List of datasets to use to evaluate predictions across studies. "
"Default is empty list which means no cross-study datasets are used.",
)
parser.add_argument(
"--measure",
type=str,
default="LN_IC50",
help="Name of the column in the dataset containing the drug response measures."
)
return parser


def main(args):
response_data = load_dataset(dataset_name=args.dataset_name, path_data=args.path_data)
cross_study_datasets = [load_dataset(dataset_name=ds, path_data=args.path_data) for ds in args.cross_study_datasets]
response_data = load_dataset(dataset_name=args.dataset_name, path_data=args.path_data, measure=args.measure)
cross_study_datasets = [load_dataset(dataset_name=ds, path_data=args.path_data, measure=args.measure) for ds in args.cross_study_datasets]

# Pickle the object to a file
with open("response_dataset.pkl", "wb") as f:
19 changes: 19 additions & 0 deletions bin/postprocess_curvecurator_output.py
@@ -0,0 +1,19 @@
#!/usr/bin/env python
from drevalpy.datasets.curvecurator import postprocess
import argparse


def get_parser():
parser = argparse.ArgumentParser(description="Postprocess CurveCurator viability data.")
parser.add_argument("--dataset_name", type=str, required=True, help="Dataset name.")
return parser


def main(args):
postprocess(output_folder='./', dataset_name=args.dataset_name)


if __name__ == "__main__":
arg_parser = get_parser()
args = arg_parser.parse_args()
main(args)
26 changes: 26 additions & 0 deletions bin/preprocess_raw_viability.py
@@ -0,0 +1,26 @@
#!/usr/bin/env python
from drevalpy.datasets.curvecurator import preprocess
from pathlib import Path
import argparse

def get_parser():
parser = argparse.ArgumentParser(description="Preprocess CurveCurator viability data.")
parser.add_argument("--path_data", type=str, default="", help="Path to base folder containing datasets.")
parser.add_argument("--dataset_name", type=str, required=True, help="Dataset name.")
parser.add_argument("--cores", type=int, default=0, help="The number of cores used for CurveCurator fitting.")
return parser


def main(args):
input_file = Path(args.path_data) / args.dataset_name / f"{args.dataset_name}_raw.csv"
preprocess(
input_file=input_file,
output_dir=args.dataset_name,
dataset_name=args.dataset_name,
cores=args.cores
)

if __name__ == "__main__":
arg_parser = get_parser()
args = arg_parser.parse_args()
main(args)
5 changes: 5 additions & 0 deletions conf/base.config
@@ -60,4 +60,9 @@ process {
withLabel:process_gpu {
ext.use_gpu = { use_gpu }
}
withLabel:high_cpu_low_mem {
cpus = { 32 * task.attempt }
memory = { 16.GB * task.attempt }
time = { 6.h * task.attempt }
}
}
16 changes: 16 additions & 0 deletions conf/modules.config
@@ -26,6 +26,22 @@ process {
]
}

withName: 'PREPROCESS_RAW_VIABILITY' {
publishDir = [
path: { params.path_data },
mode: params.publish_dir_mode,
saveAs: { filename -> null }
]
}

withName: 'FIT_CURVES' {
publishDir = [
path: { params.path_data },
mode: params.publish_dir_mode,
saveAs: { filename -> null }
]
}

withName: 'CV_SPLIT' {
publishDir = [
path: { params.path_data },
56 changes: 51 additions & 5 deletions docs/output.md
@@ -13,7 +13,11 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d

1. [Parameter check](#parameter-check): Several parameters are validated to ensure that the pipeline can run
successfully.
2. `RUN_CV` subworkflow: Finds the optimal hyperparameters for each model in a cross-validation setting.
2. `PREPROCESS_CUSTOM` subworkflow: This subworkflow is only triggered if a custom dataset is used and its folder contains a file named `[dataset_name]_raw.csv`. In that case, CurveCurator is run on the raw data.
- [Preprocess raw viability](#preprocess-raw-viability): The raw viability data is put in a format suitable for CurveCurator.
- [Fit curves](#fit-curves): Curves are fitted using CurveCurator.
- [Postprocess CurveCurator data](#postprocess-curvecurator-data): The individual curves.tsv files are collected and one output file is written.
3. `RUN_CV` subworkflow: Finds the optimal hyperparameters for each model in a cross-validation setting.
- [Load response](#load-response): The response data is loaded.
- [CV split](#cv-split): The response data is split into cross-validation folds.
- [Make model channel](#make-model-channel): From the input baseline and model names, channels are created. This
@@ -23,7 +27,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [Train and predict CV](#train-and-predict-cv): All models are trained and evaluated in a cross-validation setting.
- [Evaluate and find max](#evaluate-and-find-max): For each CV split, the best hyperparameters are determined
using a grid search per model
3. `MODEL_TESTING` subworkflow: The best hyperparameters are used to train the models on the full training set
4. `MODEL_TESTING` subworkflow: The best hyperparameters are used to train the models on the full training set
and predict the test set. Optionally, randomization and robustness tests are performed.
- [Predict full](#predict-full): The model is trained on the full training set (train & validation) with the best
hyperparameters to predict the test set.
@@ -37,7 +41,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [Evaluate final](#evaluate-final): The performance of the models is calculated on the test set results.
- [Collect results](#collect-results): The results of the evaluation metrics per model are collected into four
overview tables.
4. `VISUALIZATION` subworkflow: Plots are created summarizing the results.
5. `VISUALIZATION` subworkflow: Plots are created summarizing the results.
- [Critical difference plot](#critical-difference): A critical difference plot is created to compare the performance
of the models.
- [Violin plot](#violin-plot): A violin plot is created to compare the performance of the models over the CV folds.
@@ -49,14 +53,15 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [Save tables](#save-tables): Saves the performance metrics of the models in a table.
- [Write html](#write-html): Writes the plots to an HTML file per setting (LPO/LCO/LDO).
- [Write index](#write-index): Writes an index.html file that links to all the HTML files.
5. [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
6. [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution

### Parameter check

The process `PARAMS_CHECK` performs the following checks:

- `--models` / `--baselines`: Check if the model and baseline names are valid (for valid names, see the [usage](usage.md) page).
- `--test_mode`: Check whether the test mode is LPO, LCO, LDO or a combination of these.
- `--path_data`: Check if the path to the data is valid.
- `--dataset_name`: Check if the dataset name is valid, i.e., GDSC1, GDSC2, or CCLE.
- `--cross_study_datasets`: If supplied, check if the datasets are valid, i.e., GDSC1, GDSC2, or CCLE or a
combination of these.
@@ -68,14 +73,55 @@
Partial_Correlation.
- `--response_transformation`: If supplied, checks whether the response transformation is either standard,
minmax, or robust.
- `--measure`: Which measure of drug response should be used as the target variable. Available options are "LN_IC50", "EC50", "IC50", "pEC50", "AUC", "response". Default: "LN_IC50".
- `--curve_curator`: Whether to run CurveCurator on a custom dataset. Default: false. This requires raw viability data to be located at "<path_data>/<dataset_name>/<dataset_name>\_raw.csv".

It emits the path to the data, mainly so that the other processes wait for `PARAMS_CHECK` to finish before starting.
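
As a rough illustration of the `--curve_curator` requirement above, the check boils down to verifying that the raw viability file exists at the expected location. This is a minimal sketch with a hypothetical helper name; the actual `PARAMS_CHECK` implementation may differ.

```python
# Minimal sketch of the raw-viability check implied by --curve_curator.
# Function name and error handling are hypothetical; see bin/ for the real scripts.
from pathlib import Path


def check_raw_viability_file(path_data: str, dataset_name: str) -> Path:
    """Return the expected raw viability file, failing loudly if it is missing."""
    raw_file = Path(path_data) / dataset_name / f"{dataset_name}_raw.csv"
    if not raw_file.is_file():
        raise FileNotFoundError(f"--curve_curator requires raw viability data at {raw_file}")
    return raw_file
```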

### Subworkflow `PREPROCESS_CUSTOM`

This subworkflow is only triggered if a custom dataset is used and its folder contains a file named `[dataset_name]_raw.csv`.

#### Preprocess raw viability

The file is processed to be in a format suitable for CurveCurator. One process will be started per dosage.

<details markdown="1">
<summary>Output files</summary>

- "${dataset_name}/\*/config.toml": Configuration files for CurveCurator. Each subdirectory corresponds to a different dosage.
- "${dataset_name}/\*/curvecurator_input.tsv": Input file for CurveCurator. Each subdirectory corresponds to a different dosage.

</details>

#### Fit curves

CurveCurator is run on the input files to fit the curves.

<details markdown="1">
<summary>Output files</summary>

- "curves.tsv": The fitted curves. These are collected and postprocessed in the next step.
- "mad.txt": Median absolute deviation analysis is performed to detect problematic experiments; the results are stored in this file.
- "dashboard.html": A dashboard with an overview of the fitted curves.
- "curveCurator.log": Log file of the CurveCurator run.

</details>

#### Postprocess CurveCurator data

The individual curves.tsv files are collected and one output file is written to `path_data/dataset_name/dataset_name.csv`.
This file contains the newly fitted measures: pEC50 and AUC (internally renamed to pEC50_curvecurator and AUC_curvecurator).

<details markdown="1">
<summary>Output files</summary>

- "dataset_name.csv": The postprocessed data, exported to the path_data folder.

</details>
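
As a quick sanity check on the postprocessed file, a minimal pandas sketch can be used. The dataset name is a placeholder, and only the renamed measure columns are guaranteed by the description above; other columns may differ.

```python
# Minimal sketch: inspect the postprocessed CurveCurator output.
# "MyDataset" is a placeholder; only pEC50_curvecurator / AUC_curvecurator
# are taken from the description above, other columns are not guaranteed.
import pandas as pd

df = pd.read_csv("path_data/MyDataset/MyDataset.csv")
print(df.columns.tolist())
print(df[["pEC50_curvecurator", "AUC_curvecurator"]].describe())
```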

### Subworkflow `RUN_CV`

#### Load response

The response data is loaded into the pipeline. The downloaded data is exported to `--path_data`
The response data is loaded into the pipeline. If the data is not already present in `--path_data`, it is downloaded and exported to
`--path_data`.
This step is necessary to provide the pipeline with the response data that will be used to train and evaluate the models.

<details markdown="1">
29 changes: 29 additions & 0 deletions docs/usage.md
@@ -142,6 +142,35 @@ The following datasets are available and can be supplied via `--dataset_name`:
Our pipeline also supports cross-study prediction, i.e., training on one dataset and testing on another (or multiple
others) to assess the generalization of the model. This dataset name can be supplied via `--cross_study_datasets`.

The drug response measure that you want to use as the target variable can be specified via the `--measure` parameter.
Available measures are `["AUC", "pEC50", "EC50", "IC50"]`.

We have re-fitted all the curves in the available datasets with <b>CurveCurator</b> to ensure that the data is processed
consistently across datasets. If you want to use those refit measures, enable the `--curve_curator` flag.
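
Internally, enabling `--curve_curator` switches the target column to the refit variant by appending a `_curvecurator` suffix to the chosen measure. A minimal sketch is shown below; the `load_dataset` import path is an assumption, as only the call signature appears in `bin/load_response.py`.

```python
# Sketch of the measure naming with --curve_curator enabled.
# The import path is an assumption; bin/load_response.py only shows the call itself.
from drevalpy.datasets.dataset import load_dataset  # hypothetical import path

measure = "AUC"
refit_measure = f"{measure}_curvecurator"  # -> "AUC_curvecurator"
response = load_dataset(dataset_name="GDSC2", path_data="data", measure=refit_measure)
```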

#### Custom datasets

You can also provide your own custom dataset via the `--dataset_name` parameter by specifying a name that is not in the list of the available datasets.
This can be prefit data (not recommended for comparability reasons) or raw viability data that is automatically fit
with the exact same procedure that was used to refit the available datasets in the previous section.

<i>Raw viability data</i>

We expect a csv-formatted file in the location `<path_data>/<dataset_name>/<dataset_name>_raw.csv`
(corresponding to the `--path_data` and `--dataset_name` options), which contains the raw viability data in long format
with the columns `["dose", "response", "sample", "drug"]` and an optional "replicate" column.
If replicates are provided, the procedure will fit one curve per sample/drug pair using all replicates.
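
For example, a minimal raw viability file in the expected long format could be produced like this; all values and the dataset name `MyDataset` are made-up placeholders.

```python
# Sketch: write a minimal raw viability file in the expected long format.
# Dataset name, samples, drugs, and values are placeholders.
from pathlib import Path

import pandas as pd

raw = pd.DataFrame(
    {
        "dose": [0.01, 0.1, 1.0, 0.01, 0.1, 1.0],
        "response": [0.98, 0.75, 0.20, 0.95, 0.80, 0.30],
        "sample": ["CellLineA"] * 3 + ["CellLineB"] * 3,
        "drug": ["DrugX"] * 6,
        # "replicate": [1] * 6,  # optional; one curve is fit per sample/drug pair across replicates
    }
)

out_dir = Path("path_data") / "MyDataset"
out_dir.mkdir(parents=True, exist_ok=True)
raw.to_csv(out_dir / "MyDataset_raw.csv", index=False)
```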

The pipeline then fits the curves using CurveCurator and saves the processed file to `<path_data>/<dataset_name>/<dataset_name>.csv`.
For individual results, look in the work directories.

<i>Prefit viability data</i>

We expect a csv-formatted file in the location `<path_data>/<dataset_name>/<dataset_name>.csv`
(corresponding to the `--path_data` and `--dataset_name` options), with at least the columns `["cell_line_id", "drug_id", "<measure>"]`,
where `<measure>` is replaced with the name of the measure you provide (`["AUC", "pEC50", "EC50", "IC50"]`).
If you use the `--cross_study_datasets` option, you must use measure names that also work with the available datasets.
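
A corresponding prefit file could look like this minimal sketch; values are placeholders, and the measure column is named after the `--measure` value (here `IC50`).

```python
# Sketch: a minimal prefit response file with the required columns.
# Values are placeholders; the measure column must match the --measure option.
import pandas as pd

prefit = pd.DataFrame(
    {
        "cell_line_id": ["CellLineA", "CellLineB"],
        "drug_id": ["DrugX", "DrugX"],
        "IC50": [0.42, 1.37],
    }
)
prefit.to_csv("path_data/MyDataset/MyDataset.csv", index=False)
```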

### Available Randomization Tests

We have several randomization modes and types available.
1 change: 0 additions & 1 deletion env.yml
@@ -3,7 +3,6 @@ channels:
- conda-forge
- defaults
dependencies:
- ray-tune
- pip
- pip:
- drevalpy==1.1.1
17 changes: 17 additions & 0 deletions modules/local/fit_curves/main.nf
@@ -0,0 +1,17 @@
process FIT_CURVES {
tag "$dir_name"
label 'high_cpu_low_mem'

input:
val dataset_name
tuple val(dir_name), path(toml), path(curvecurator_input)

output:
path("curves.tsv"), emit: path_to_curvecurator_out
tuple path("mad.txt"), path("dashboard.html"), path("curveCurator.log") // other output

script:
"""
CurveCurator ${toml} --mad
"""
}
3 changes: 2 additions & 1 deletion modules/local/load_response/main.nf
@@ -18,7 +18,8 @@ process LOAD_RESPONSE {
load_response.py \\
--dataset_name ${dataset_name} \\
--path_data ${work_path} \\
${cross_study_datasets != '' ? '--cross_study_datasets ' + cross_study_datasets.replace(',', ' ') : ''}
${cross_study_datasets != '' ? '--cross_study_datasets ' + cross_study_datasets.replace(',', ' ') : ''} \\
--measure ${measure}
"""

}
4 changes: 2 additions & 2 deletions modules/local/params_check/main.nf
@@ -34,11 +34,11 @@ process PARAMS_CHECK {
--n_trials_robustness $n_trials_robustness \\
--dataset_name $dataset_name \\
${cross_study_datasets != '' ? '--cross_study_datasets ' + cross_study_datasets.replace(',', ' ') : ''} \\
${curve_curator ? '--curve_curator' : ''} \\
${curve_curator ? '--curve_curator --curve_curator_cores 1' : ''} \\
--path_data $work_path \\
--measure $measure \\
--optim_metric $optim_metric \\
--n_cv_splits $n_cv_splits \\
--response_transformation $response_transformation
--response_transformation $response_transformation \\
"""
}
18 changes: 18 additions & 0 deletions modules/local/postprocess_curvecurator_output/main.nf
@@ -0,0 +1,18 @@
process POSTPROCESS_CURVECURATOR_DATA {
label 'process_single'
publishDir "${params.path_data}/${dataset_name}", mode: 'copy'

input:
val dataset_name
path(curve_data, stageAs: "?/*")
val measure

output:
path "${dataset_name}.csv", emit: path_to_dataset
val "${measure}" + "_curvecurator", emit: measure

script:
"""
postprocess_curvecurator_output.py --dataset_name ${dataset_name}
"""
}
17 changes: 17 additions & 0 deletions modules/local/preprocess_raw_viability/main.nf
@@ -0,0 +1,17 @@
process PREPROCESS_RAW_VIABILITY {
label 'process_low'

input:
val(dataset_name)
path(work_path)
val useless_count

output:
path "${dataset_name}/*/config.toml", emit: path_to_toml
path "${dataset_name}/*/curvecurator_input.tsv", emit: curvecurator_input

script:
"""
preprocess_raw_viability.py --path_data ${work_path} --dataset_name ${dataset_name} --cores ${task.cpus}
"""
}
18 changes: 9 additions & 9 deletions nextflow_schema.json
@@ -39,7 +39,7 @@
"dataset_name": {
"type": "string",
"description": "Name of the dataset.",
"help_text": "Name of the dataset used for the pipeline. Allowed values are GDSC1, GDSC2, and Custom."
"help_text": "Name of the dataset used for the pipeline. This can be either one of the provided datasets ('GDSC1', 'GDSC2', 'CCLE', 'CTRPv2', 'Toy_Data'), in which case the dataset with the fitted curves is downloaded, or a custom dataset name pointing either to raw viability measurements for automatic curve fitting (see 'curve_curator' option, which is required in this case) or to prefit data (not recommended for dataset comparability reasons due to potential differences in fitting procedures)."
},
"outdir": {
"type": "string",
@@ -116,6 +116,12 @@
"description": "Path to the data directory.",
"help_text": "Path to the data directory."
},
"measure": {
"type": "string",
"default": "LN_IC50",
"description": "The name of the measure to use",
"help": "The name of the measure to use, default 'LN_IC50'. If using one of the available datasets (see 'dataset_name' option), this is restricted to one of ['LN_IC50', 'EC50', 'IC50', 'pEC50', 'AUC', 'response']. This corresponds to the names of the columns that contain these measures in the provided input dataset. If providing a custom dataset (see 'dataset_name' option), this may differ. If the option 'curve_curator' is set, the suffix '_curvecurator' is automatically appended, e.g. 'LN_IC50_curvecurator', to allow using the refit measures instead of the ones originally published for the available datasets (see 'dataset_name' option for details), allowing for better dataset comparability (refit measures are already provided in the available datasets or computed as part of the fitting procedure when providing custom raw viability datasets, see 'curve_curator' option for details)."
},
"cross_study_datasets": {
"type": "string",
"description": "Datasets for cross-study prediction.",
@@ -131,14 +137,8 @@
"properties": {
"curve_curator": {
"type": "boolean",
"description": "Run the curve curator.",
"help_text": "Whether to run \" \"CurveCurator \" \"to sort out \" \"non-reactive \" \"curves"
},
"measure": {
"type": "string",
"description": "Which measure of drug response should be used. Only 'LN_IC50', 'EC50', 'IC50', 'pEC50', 'AUC', 'response' or their equivalents including the '_curvecurator' suffix are allowed.",
"default": "LN_IC50",
"help_text": "Measure to use for the pipeline. Default is LN_IC50."
"description": "If True, use refit measures instead of original measures reported by the authors for the available datasets, or invoke automatic fitting of custom raw viability data.",
"help_text": "This allows using refit measures (see 'measure' option for details) for available datasets, which allows better comparability between datasets. When providing a custom dataset (see 'dataset_name' option), setting this to True expects a csv-formatted file at <path_data>/<dataset_name>/<dataset_name>_raw.csv (also see 'path_data' option), which is fitted automatically with the same procedure as the available datasets, to provide fair comparison. The fitted data will then be stored at <path_data>/<dataset_name>/<dataset_name>.csv."
},
"optim_metric": {
"type": "string",