Merge pull request #14 from picciama/feature/curvecurator_module
Feature/curvecurator module
JudithBernett authored Jan 30, 2025
2 parents 9f502d6 + 1176dc4 commit f69bf66
Showing 19 changed files with 272 additions and 23 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -14,6 +14,7 @@ Initial release of nf-core/drugresponseeval, created with the [nf-core](https://
- Added the docker container and the conda env.yml in the nextflow.config. We just need one container for all
processes as this pipeline automates the PyPI package drevalpy.
- Added usage and output documentation.
- Added CurveCurator to preprocess curves of custom datasets.

### `Fixed`

4 changes: 4 additions & 0 deletions CITATIONS.md
@@ -18,6 +18,10 @@

> Bernett, J, Iversen, P, Picciani, M, Wilhelm, M, Baum, K, List, M. Will be published soon.
- [CurveCurator](https://www.nature.com/articles/s41467-023-43696-z): For custom curve fitting on custom datasets. We also used it to re-process the response curves of GDSC1, GDSC2, CCLE, and CTRP.

> Bayer, F.P., Gander, M., Kuster, B., The, M. CurveCurator: a recalibrated F-statistic to assess, classify, and explore significance of dose–response curves. Nature Communications. 2023 Nov;14(7902).
- [DIPK](https://doi.org/10.1093/bib/bbae153): Implemented model in the pipeline.

> Li P, Jiang Z, Liu T, Liu X, Qiao H, Yao X. Improving drug response prediction via integrating gene relationships with deep learning. Briefings in Bioinformatics. 2024 May;25(3):bbae153.
10 changes: 8 additions & 2 deletions bin/load_response.py
@@ -15,12 +15,18 @@ def get_parser():
help="List of datasets to use to evaluate predictions across studies. "
"Default is empty list which means no cross-study datasets are used.",
)
parser.add_argument(
"--measure",
type=str,
default="LN_IC50",
help="Name of the column in the dataset containing the drug response measures."
)
return parser


def main(args):
response_data = load_dataset(dataset_name=args.dataset_name, path_data=args.path_data)
cross_study_datasets = [load_dataset(dataset_name=ds, path_data=args.path_data) for ds in args.cross_study_datasets]
response_data = load_dataset(dataset_name=args.dataset_name, path_data=args.path_data, measure=args.measure)
cross_study_datasets = [load_dataset(dataset_name=ds, path_data=args.path_data, measure=args.measure) for ds in args.cross_study_datasets]

# Pickle the object to a file
with open("response_dataset.pkl", "wb") as f:
19 changes: 19 additions & 0 deletions bin/postprocess_curvecurator_output.py
@@ -0,0 +1,19 @@
#!/usr/bin/env python
from drevalpy.datasets.curvecurator import postprocess
import argparse


def get_parser():
parser = argparse.ArgumentParser(description="Postprocess CurveCurator viability data.")
parser.add_argument("--dataset_name", type=str, required=True, help="Dataset name.")
return parser


def main(args):
postprocess(output_folder='./', dataset_name=args.dataset_name)


if __name__ == "__main__":
arg_parser = get_parser()
args = arg_parser.parse_args()
main(args)
26 changes: 26 additions & 0 deletions bin/preprocess_raw_viability.py
@@ -0,0 +1,26 @@
#!/usr/bin/env python
from drevalpy.datasets.curvecurator import preprocess
from pathlib import Path
import argparse

def get_parser():
parser = argparse.ArgumentParser(description="Preprocess CurveCurator viability data.")
parser.add_argument("--path_data", type=str, default="", help="Path to base folder containing datasets.")
parser.add_argument("--dataset_name", type=str, required=True, help="Dataset name.")
parser.add_argument("--cores", type=int, default=0, help="The number of cores used for CurveCurator fitting.")
return parser


def main(args):
input_file = Path(args.path_data) / args.dataset_name / f"{args.dataset_name}_raw.csv"
preprocess(
input_file=input_file,
output_dir=args.dataset_name,
dataset_name=args.dataset_name,
cores=args.cores
)

if __name__ == "__main__":
arg_parser = get_parser()
args = arg_parser.parse_args()
main(args)
5 changes: 5 additions & 0 deletions conf/base.config
@@ -60,4 +60,9 @@ process {
withLabel:process_gpu {
ext.use_gpu = { use_gpu }
}
withLabel:high_cpu_low_mem {
cpus = { 32 * task.attempt }
memory = { 16.GB * task.attempt }
time = { 6.h * task.attempt }
}
}
16 changes: 16 additions & 0 deletions conf/modules.config
@@ -26,6 +26,22 @@ process {
]
}

withName: 'PREPROCESS_RAW_VIABILITY' {
publishDir = [
path: { params.path_data },
mode: params.publish_dir_mode,
saveAs: { filename -> null }
]
}

withName: 'FIT_CURVES' {
publishDir = [
path: { params.path_data },
mode: params.publish_dir_mode,
saveAs: { filename -> null }
]
}

withName: 'CV_SPLIT' {
publishDir = [
path: { params.path_data },
56 changes: 51 additions & 5 deletions docs/output.md
@@ -13,7 +13,11 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d

1. [Parameter check](#parameter-check): Several parameters are validated to ensure that the pipeline can run
successfully.
2. `RUN_CV` subworkflow: Finds the optimal hyperparameters for each model in a cross-validation setting.
2. `PREPROCESS_CUSTOM` subworkflow: This subworkflow is only triggered if a custom dataset is used and its folder contains a file named `[dataset_name]_raw.csv`. In that case, CurveCurator is run on the raw data.
- [Preprocess raw viability](#preprocess-raw-viability): The raw viability data is put in a format suitable for CurveCurator.
- [Fit curves](#fit-curves): Curves are fitted using CurveCurator.
- [Postprocess CurveCurator data](#postprocess-curvecurator-data): The individual curves.tsv files are collected and one output file is written.
3. `RUN_CV` subworkflow: Finds the optimal hyperparameters for each model in a cross-validation setting.
- [Load response](#load-response): The response data is loaded.
- [CV split](#cv-split): The response data is split into cross-validation folds.
- [Make model channel](#make-model-channel): From the input baseline and model names, channels are created. This
@@ -23,7 +27,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [Train and predict CV](#train-and-predict-cv): All models are trained and evaluated in a cross-validation setting.
- [Evaluate and find max](#evaluate-and-find-max): For each CV split, the best hyperparameters are determined
using a grid search per model
3. `MODEL_TESTING` subworkflow: The best hyperparameters are used to train the models on the full training set
4. `MODEL_TESTING` subworkflow: The best hyperparameters are used to train the models on the full training set
and predict the test set. Optionally, randomization and robustness tests are performed.
- [Predict full](#predict-full): The model is trained on the full training set (train & validation) with the best
hyperparameters to predict the test set.
@@ -37,7 +41,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [Evaluate final](#evaluate-final): The performance of the models is calculated on the test set results.
- [Collect results](#collect-results): The results of the evaluation metrics per model are collected into four
overview tables.
4. `VISUALIZATION` subworkflow: Plots are created summarizing the results.
5. `VISUALIZATION` subworkflow: Plots are created summarizing the results.
- [Critical difference plot](#critical-difference): A critical difference plot is created to compare the performance
of the models.
- [Violin plot](#violin-plot): A violin plot is created to compare the performance of the models over the CV folds.
@@ -49,14 +53,15 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [Save tables](#save-tables): Saves the performance metrics of the models in a table.
- [Write html](#write-html): Writes the plots to an HTML file per setting (LPO/LCO/LDO).
- [Write index](#write-index): Writes an index.html file that links to all the HTML files.
5. [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
6. [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution

### Parameter check

The process `PARAMS_CHECK` performs the following checks:

- `--models` / `--baselines`: Check if the model and baseline names are valid (for valid names, see the [usage](usage.md) page).
- `--test_mode`: Check whether the test mode is LPO, LCO, LDO or a combination of these.
- `--path_data`: Check if the path to the data is valid.
- `--dataset_name`: Check if the dataset name is valid, i.e., GDSC1, GDSC2, or CCLE.
- `--cross_study_datasets`: If supplied, check if the datasets are valid, i.e., GDSC1, GDSC2, or CCLE or a
combination of these.
@@ -68,14 +73,55 @@
Partial_Correlation.
- `--response_transformation`: If supplied, checks whether the response transformation is either standard,
minmax, or robust.
- `--measure`: Which measure of drug response should be used as the target variable. Available options are "LN_IC50", "EC50", "IC50", "pEC50", "AUC", "response". Default: "LN_IC50".
- `--curve_curator`: Whether to run CurveCurator on a custom dataset. Default: false. This requires raw viability data to be located at "<path_data>/<dataset_name>/<dataset_name>\_raw.csv".

It emits the path to the data, mainly so that the other processes wait for `PARAMS_CHECK` to finish before starting.
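
As a rough illustration of the `--curve_curator` requirement above, the check boils down to verifying that the raw viability file exists at the expected location. This is a minimal sketch with a hypothetical helper name; the actual `PARAMS_CHECK` implementation may differ.

```python
# Minimal sketch of the raw-viability check implied by --curve_curator.
# Function name and error handling are hypothetical; see bin/ for the real scripts.
from pathlib import Path


def check_raw_viability_file(path_data: str, dataset_name: str) -> Path:
    """Return the expected raw viability file, failing loudly if it is missing."""
    raw_file = Path(path_data) / dataset_name / f"{dataset_name}_raw.csv"
    if not raw_file.is_file():
        raise FileNotFoundError(f"--curve_curator requires raw viability data at {raw_file}")
    return raw_file
```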

### Subworkflow `PREPROCESS_CUSTOM`

This subworkflow is only triggered if a custom dataset is used and its folder contains a file named `[dataset_name]_raw.csv`.

#### Preprocess raw viability

The file is processed to be in a format suitable for CurveCurator. One process will be started per dosage.

<details markdown="1">
<summary>Output files</summary>

- "${dataset_name}/\*/config.toml": Configuration files for CurveCurator. Each subdirectory corresponds to a different dosage.
- "${dataset_name}/\*/curvecurator_input.tsv": Input file for CurveCurator. Each subdirectory corresponds to a different dosage.

</details>

#### Fit curves

CurveCurator is run on the input files to fit the curves.

<details markdown="1">
<summary>Output files</summary>

- "curves.tsv": The fitted curves. These are collected and postprocessed in the next step.
- "mad.txt": Median absolute deviation analysis is performed to detect problematic experiments; the results are stored in this file.
- "dashboard.html": A dashboard with an overview of the fitted curves.
- "curveCurator.log": Log file of the CurveCurator run.

</details>

#### Postprocess CurveCurator data

The individual curves.tsv files are collected and one output file is written to `path_data/dataset_name/dataset_name.csv`.
This file contains the newly fitted measures: pEC50 and AUC (internally renamed to pEC50_curvecurator and AUC_curvecurator).

<details markdown="1">
<summary>Output files</summary>

- "dataset_name.csv": The postprocessed data, exported to the path_data folder.

</details>
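
As a quick sanity check on the postprocessed file, a minimal pandas sketch can be used. The dataset name is a placeholder, and only the renamed measure columns are guaranteed by the description above; other columns may differ.

```python
# Minimal sketch: inspect the postprocessed CurveCurator output.
# "MyDataset" is a placeholder; only pEC50_curvecurator / AUC_curvecurator
# are taken from the description above, other columns are not guaranteed.
import pandas as pd

df = pd.read_csv("path_data/MyDataset/MyDataset.csv")
print(df.columns.tolist())
print(df[["pEC50_curvecurator", "AUC_curvecurator"]].describe())
```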

### Subworkflow `RUN_CV`

#### Load response

The response data is loaded into the pipeline. The downloaded data is exported to `--path_data`
The response data is loaded into the pipeline. If the data is not already present in `--path_data`, it is downloaded and exported to
`--path_data`.
This step is necessary to provide the pipeline with the response data that will be used to train and evaluate the models.

<details markdown="1">
29 changes: 29 additions & 0 deletions docs/usage.md
@@ -142,6 +142,35 @@ The following datasets are available and can be supplied via `--dataset_name`:
Our pipeline also supports cross-study prediction, i.e., training on one dataset and testing on another (or multiple
others) to assess the generalization of the model. This dataset name can be supplied via `--cross_study_datasets`.

The drug response measure that you want to use as the target variable can be specified via the `--measure` parameter.
Available measures are `["AUC", "pEC50", "EC50", "IC50"]`.

We have re-fitted all the curves in the available datasets with <b>CurveCurator</b> to ensure that the data is processed
consistently across datasets. If you want to use those refit measures, enable the `--curve_curator` flag.
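
Internally, enabling `--curve_curator` switches the target column to the refit variant by appending a `_curvecurator` suffix to the chosen measure. A minimal sketch is shown below; the `load_dataset` import path is an assumption, as only the call signature appears in `bin/load_response.py`.

```python
# Sketch of the measure naming with --curve_curator enabled.
# The import path is an assumption; bin/load_response.py only shows the call itself.
from drevalpy.datasets.dataset import load_dataset  # hypothetical import path

measure = "AUC"
refit_measure = f"{measure}_curvecurator"  # -> "AUC_curvecurator"
response = load_dataset(dataset_name="GDSC2", path_data="data", measure=refit_measure)
```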

#### Custom datasets

You can also provide your own custom dataset via the `--dataset_name` parameter by specifying a name that is not in the list of the available datasets.
This can be prefit data (not recommended for comparability reasons) or raw viability data that is automatically fit
with the exact same procedure that was used to refit the available datasets in the previous section.

<i>Raw viability data</i>

We expect a csv-formatted file in the location `<path_data>/<dataset_name>/<dataset_name>_raw.csv`
(corresponding to the `--path_data` and `--dataset_name` options), which contains the raw viability data in long format
with the columns `["dose", "response", "sample", "drug"]` and an optional "replicate" column.
If replicates are provided, the procedure will fit one curve per sample/drug pair using all replicates.
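
For example, a minimal raw viability file in the expected long format could be produced like this; all values and the dataset name `MyDataset` are made-up placeholders.

```python
# Sketch: write a minimal raw viability file in the expected long format.
# Dataset name, samples, drugs, and values are placeholders.
from pathlib import Path

import pandas as pd

raw = pd.DataFrame(
    {
        "dose": [0.01, 0.1, 1.0, 0.01, 0.1, 1.0],
        "response": [0.98, 0.75, 0.20, 0.95, 0.80, 0.30],
        "sample": ["CellLineA"] * 3 + ["CellLineB"] * 3,
        "drug": ["DrugX"] * 6,
        # "replicate": [1] * 6,  # optional; one curve is fit per sample/drug pair across replicates
    }
)

out_dir = Path("path_data") / "MyDataset"
out_dir.mkdir(parents=True, exist_ok=True)
raw.to_csv(out_dir / "MyDataset_raw.csv", index=False)
```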

The pipeline then fits the curves using CurveCurator and saves the processed file to `<path_data>/<dataset_name>/<dataset_name>.csv`.
For individual results, look in the work directories.

<i>Prefit viability data</i>

We expect a csv-formatted file in the location `<path_data>/<dataset_name>/<dataset_name>.csv`
(corresponding to the `--path_data` and `--dataset_name` options), with at least the columns `["cell_line_id", "drug_id", "<measure>"]`,
where `<measure>` is replaced with the name of the measure you provide (`["AUC", "pEC50", "EC50", "IC50"]`).
If you use the `--cross_study_datasets` option, you must use measure names that also work with the available datasets.
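
A corresponding prefit file could look like this minimal sketch; values are placeholders, and the measure column is named after the `--measure` value (here `IC50`).

```python
# Sketch: a minimal prefit response file with the required columns.
# Values are placeholders; the measure column must match the --measure option.
import pandas as pd

prefit = pd.DataFrame(
    {
        "cell_line_id": ["CellLineA", "CellLineB"],
        "drug_id": ["DrugX", "DrugX"],
        "IC50": [0.42, 1.37],
    }
)
prefit.to_csv("path_data/MyDataset/MyDataset.csv", index=False)
```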

### Available Randomization Tests

We have several randomization modes and types available.
1 change: 0 additions & 1 deletion env.yml
@@ -3,7 +3,6 @@ channels:
- conda-forge
- defaults
dependencies:
- ray-tune
- pip
- pip:
- drevalpy==1.1.1
17 changes: 17 additions & 0 deletions modules/local/fit_curves/main.nf
@@ -0,0 +1,17 @@
process FIT_CURVES {
tag "$dir_name"
label 'high_cpu_low_mem'

input:
val dataset_name
tuple val(dir_name), path(toml), path(curvecurator_input)

output:
path("curves.tsv"), emit: path_to_curvecurator_out
tuple path("mad.txt"), path("dashboard.html"), path("curveCurator.log") // other output

script:
"""
CurveCurator ${toml} --mad
"""
}
3 changes: 2 additions & 1 deletion modules/local/load_response/main.nf
@@ -18,7 +18,8 @@ process LOAD_RESPONSE {
load_response.py \\
--dataset_name ${dataset_name} \\
--path_data ${work_path} \\
${cross_study_datasets != '' ? '--cross_study_datasets ' + cross_study_datasets.replace(',', ' ') : ''}
${cross_study_datasets != '' ? '--cross_study_datasets ' + cross_study_datasets.replace(',', ' ') : ''} \\
--measure ${measure}
"""

}
4 changes: 2 additions & 2 deletions modules/local/params_check/main.nf
@@ -34,11 +34,11 @@ process PARAMS_CHECK {
--n_trials_robustness $n_trials_robustness \\
--dataset_name $dataset_name \\
${cross_study_datasets != '' ? '--cross_study_datasets ' + cross_study_datasets.replace(',', ' ') : ''} \\
${curve_curator ? '--curve_curator' : ''} \\
${curve_curator ? '--curve_curator --curve_curator_cores 1' : ''} \\
--path_data $work_path \\
--measure $measure \\
--optim_metric $optim_metric \\
--n_cv_splits $n_cv_splits \\
--response_transformation $response_transformation
--response_transformation $response_transformation \\
"""
}
18 changes: 18 additions & 0 deletions modules/local/postprocess_curvecurator_output/main.nf
@@ -0,0 +1,18 @@
process POSTPROCESS_CURVECURATOR_DATA {
label 'process_single'
publishDir "${params.path_data}/${dataset_name}", mode: 'copy'

input:
val dataset_name
path(curve_data, stageAs: "?/*")
val measure

output:
path "${dataset_name}.csv", emit: path_to_dataset
val "${measure}" + "_curvecurator", emit: measure

script:
"""
postprocess_curvecurator_output.py --dataset_name ${dataset_name}
"""
}
17 changes: 17 additions & 0 deletions modules/local/preprocess_raw_viability/main.nf
@@ -0,0 +1,17 @@
process PREPROCESS_RAW_VIABILITY {
label 'process_low'

input:
val(dataset_name)
path(work_path)
val useless_count

output:
path "${dataset_name}/*/config.toml", emit: path_to_toml
path "${dataset_name}/*/curvecurator_input.tsv", emit: curvecurator_input

script:
"""
preprocess_raw_viability.py --path_data ${work_path} --dataset_name ${dataset_name} --cores ${task.cpus}
"""
}
18 changes: 9 additions & 9 deletions nextflow_schema.json
@@ -39,7 +39,7 @@
"dataset_name": {
"type": "string",
"description": "Name of the dataset.",
"help_text": "Name of the dataset used for the pipeline. Allowed values are GDSC1, GDSC2, and Custom."
"help_text": "Name of the dataset used for the pipeline. This can be either one of the provided datasets ('GDSC1', 'GDSC2', 'CCLE', 'CTRPv2', 'Toy_Data'), in which case the dataset with the fitted curves is downloaded, or a custom dataset name pointing either to raw viability measurements for automatic curve fitting (see 'curve_curator' option, which is required in this case) or to prefit data (not recommended for dataset comparability reasons due to potential differences in fitting procedures)."
},
"outdir": {
"type": "string",
@@ -116,6 +116,12 @@
"description": "Path to the data directory.",
"help_text": "Path to the data directory."
},
"measure": {
"type": "string",
"default": "LN_IC50",
"description": "The name of the measure to use",
"help": "The name of the measure to use, default 'LN_IC50'. If using one of the available datasets (see 'dataset_name' option), this is restricted to one of ['LN_IC50', 'EC50', 'IC50', 'pEC50', 'AUC', 'response']. This corresponds to the names of the columns that contain these measures in the provided input dataset. If providing a custom dataset (see 'dataset_name' option), this may differ. If the option 'curve_curator' is set, the suffix '_curvecurator' is automatically appended, e.g. 'LN_IC50_curvecurator', to allow using the refit measures instead of the ones originally published for the available datasets (see 'dataset_name' option for details), allowing for better dataset comparability (refit measures are already provided in the available datasets or computed as part of the fitting procedure when providing custom raw viability datasets, see 'curve_curator' option for details)."
},
"cross_study_datasets": {
"type": "string",
"description": "Datasets for cross-study prediction.",
@@ -131,14 +137,8 @@
"properties": {
"curve_curator": {
"type": "boolean",
"description": "Run the curve curator.",
"help_text": "Whether to run \" \"CurveCurator \" \"to sort out \" \"non-reactive \" \"curves"
},
"measure": {
"type": "string",
"description": "Which measure of drug response should be used. Only 'LN_IC50', 'EC50', 'IC50', 'pEC50', 'AUC', 'response' or their equivalents including the '_curvecurator' suffix are allowed.",
"default": "LN_IC50",
"help_text": "Measure to use for the pipeline. Default is LN_IC50."
"description": "If True, use refit measures instead of original measures reported by the authors for the available datasets, or invoke automatic fitting of custom raw viability data.",
"help_text": "This allows using refit measures (see 'measure' option for details) for available datasets, which allows better comparability between datasets. When providing a custom dataset (see 'dataset_name' option), setting this to True expects a csv-formatted file at <path_data>/<dataset_name>/<dataset_name>_raw.csv (also see 'path_data' option), which is fitted automatically with the same procedure as the available datasets, to provide fair comparison. The fitted data will then be stored at <path_data>/<dataset_name>/<dataset_name>.csv."
},
"optim_metric": {
"type": "string",