diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index fc370376..6522ff26 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -48,6 +48,8 @@ jobs:
             exec_profile: singularity
           - test_profile: test_lfq
             exec_profile: conda
+          - test_profile: test_dda_id
+            exec_profile: conda
     steps:
       - name: Check out pipeline code
         uses: actions/checkout@v4
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ac206374..135dc698 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Added`
 
 - [#386](https://github.com/bigbio/quantms/pull/386) Make validation of ontology terms optional
+- [#398](https://github.com/bigbio/quantms/pull/398) Moved Python scripts to the quantms-utils package.
+- [#389](https://github.com/bigbio/quantms/pull/389) Introduced DIA-NN 1.9.1 to the pipeline, currently only available in Singularity.
+- [#397](https://github.com/bigbio/quantms/pull/397) Added more options to SDRF validation.
 
 ### Fixed
 
@@ -15,12 +18,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Changed`
 
+- [#391](https://github.com/bigbio/quantms/pull/391) Move mzML statistics from CSV to Parquet files.
+- [#386](https://github.com/bigbio/quantms/pull/386) Make validation of ontology terms in the input SDRF file optional.
+- [#374](https://github.com/bigbio/quantms/pull/374) Create the common MSGF+ database in one step before MSGF+ runs on each MS run file.
+
 ### `Fixed`
 
 - [#396](https://github.com/bigbio/quantms/pull/396) Added verification of tar archive unpacking to prevent silent failures.
 
 ### `Dependencies`
 
+- quantms-utils==0.0.7
+- diann==1.9.1
+
 ### `Parameters`
 
 - `validate_ontologies`: enable or disable validating ontologies in the input SDRF file.
diff --git a/bin/add_sage_feature.py b/bin/add_sage_feature.py
deleted file mode 100755
index 8d53fa06..00000000
--- a/bin/add_sage_feature.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/env python
-# Add extra features to Sage idXML.
Adding extra feature in Sage isn't known input for PSMFeatureExtractor - -import sys - -import pandas as pd -import pyopenms as oms - - -def add_feature(idx_file, output_file, feat_file): - extra_feat = [] - feat = pd.read_csv(feat_file, sep="\t") - for _, row in feat.iterrows(): - if row["feature_generator"] == "psm_file": - continue - else: - extra_feat.append(row["feature_name"]) - print("Adding extra feature: {}".format(extra_feat)) - protein_ids = [] - peptide_ids = [] - oms.IdXMLFile().load(idx_file, protein_ids, peptide_ids) - SearchParameters = protein_ids[0].getSearchParameters() - features = SearchParameters.getMetaValue("extra_features") - extra_features = features + "," + ",".join(extra_feat) - SearchParameters.setMetaValue("extra_features", extra_features) - protein_ids[0].setSearchParameters(SearchParameters) - oms.IdXMLFile().store(output_file, protein_ids, peptide_ids) - print("Done") - - -def main(): - idx_file = sys.argv[1] - output_file = sys.argv[2] - feat_file = sys.argv[3] - add_feature(idx_file, output_file, feat_file) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py deleted file mode 100755 index e7157b25..00000000 --- a/bin/check_samplesheet.py +++ /dev/null @@ -1,223 +0,0 @@ -#!/usr/bin/env python - -# nf-core: Update the script to check the sdrf -# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv - -import errno -import os -import sys - -import click -import pandas as pd -from sdrf_pipelines.sdrf.sdrf import SdrfDataFrame -from sdrf_pipelines.sdrf.sdrf_schema import DEFAULT_TEMPLATE, MASS_SPECTROMETRY - -CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) - - -@click.group(context_settings=CONTEXT_SETTINGS) -def cli(): - """ - This is the main tool that gives access to all commands to convert SDRF files into pipelines-specific configuration - files. - """ - pass - - -def make_dir(path): - if len(path) > 0: - try: - os.makedirs(path) - except OSError as exception: - if exception.errno != errno.EEXIST: - raise exception - - -def print_error(error, context="Line", context_str=""): - error_str = "ERROR: Please check samplesheet -> {}".format(error) - if context != "" and context_str != "": - error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format( - error, context.strip(), context_str.strip() - ) - print(error_str) - sys.exit(1) - - -def check_sdrf( - input_sdrf: str, - skip_ms_validation: bool = False, - skip_factor_validation: bool = False, - skip_experimental_design_validation: bool = False, - use_ols_cache_only: bool = False, - skip_sdrf_validation: bool = False, -): - """ - Check the SDRF file for errors. If any errors are found, print them and exit with a non-zero status code. - @param input_sdrf: Path to the SDRF file to check - @param skip_ms_validation: Disable the validation of mass spectrometry fields in SDRF (e.g. 
posttranslational modifications) - @param skip_factor_validation: Disable the validation of factor values in SDRF - @param skip_experimental_design_validation: Disable the validation of experimental design - @param use_ols_cache_only: Use ols cache for validation of the terms and not OLS internet service - @param skip_sdrf_validation: Disable the validation of SDRF - """ - if skip_sdrf_validation: - print("No SDRF validation was performed.") - sys.exit(0) - - df = SdrfDataFrame.parse(input_sdrf) - errors = df.validate(DEFAULT_TEMPLATE, use_ols_cache_only) - - if not skip_ms_validation: - errors = errors + df.validate(MASS_SPECTROMETRY, use_ols_cache_only) - - if not skip_factor_validation: - errors = errors + df.validate_factor_values() - - if not skip_experimental_design_validation: - errors = errors + df.validate_experimental_design() - - for error in errors: - print(error) - - sys.exit(bool(errors)) - - -def check_expdesign(expdesign): - """ - Check the expdesign file for errors. If any errors are found, print them and exit with a non-zero status code. - @param expdesign: Path to the expdesign file to check - """ - data = pd.read_csv(expdesign, sep="\t", header=0, dtype=str) - data = data.dropna() - schema_file = ["Fraction_Group", "Fraction", "Spectra_Filepath", "Label", "Sample"] - schema_sample = ["Sample", "MSstats_Condition", "MSstats_BioReplicate"] - - # check table format: two table - with open(expdesign, "r") as f: - lines = f.readlines() - try: - empty_row = lines.index("\n") - except ValueError: - print( - "the one-table format parser is broken in OpenMS2.5, please use one-table or sdrf" - ) - sys.exit(1) - if lines.index("\n") >= len(lines): - print( - "the one-table format parser is broken in OpenMS2.5, please use one-table or sdrf" - ) - sys.exit(1) - - s_table = [i.replace("\n", "").split("\t") for i in lines[empty_row + 1 :]][1:] - s_header = lines[empty_row + 1].replace("\n", "").split("\t") - s_DataFrame = pd.DataFrame(s_table, columns=s_header) - - # check missed mandatory column - missed_columns = set(schema_file) - set(data.columns) - if len(missed_columns) != 0: - print("{0} column missed".format(" ".join(missed_columns))) - sys.exit(1) - - missed_columns = set(schema_sample) - set(s_DataFrame.columns) - if len(missed_columns) != 0: - print("{0} column missed".format(" ".join(missed_columns))) - sys.exit(1) - - if len(set(data.Label)) != 1 and "MSstats_Mixture" not in s_DataFrame.columns: - print("MSstats_Mixture column missed in ISO experiments") - sys.exit(1) - - # check a logical problem: may be improved - check_expdesign_logic(data, s_DataFrame) - - -def check_expdesign_logic(fTable, sTable): - if int(max(fTable.Fraction_Group)) > len(set(fTable.Fraction_Group)): - print("Fraction_Group discontinuous!") - sys.exit(1) - fTable_D = fTable.drop_duplicates(["Fraction_Group", "Fraction", "Label", "Sample"]) - if fTable_D.shape[0] < fTable.shape[0]: - print( - "Existing duplicate entries in Fraction_Group, Fraction, Label and Sample" - ) - sys.exit(1) - if len(set(sTable.Sample)) < sTable.shape[0]: - print("Existing duplicate Sample in sample table!") - sys.exit(1) - - -@click.command( - "validate", short_help="Reformat nf-core/quantms sdrf file and check its contents." 
-) -@click.option("--exp_design", help="SDRF/Expdesign file to be validated") -@click.option("--is_sdrf", help="SDRF file or Expdesign file", is_flag=True) -@click.option( - "--skip_sdrf_validation", help="Disable the validation of SDRF", is_flag=True -) -@click.option( - "--skip_ms_validation", - help="Disable the validation of mass spectrometry fields in SDRF (e.g. posttranslational modifications)", - is_flag=True, -) -@click.option( - "--skip_factor_validation", - help="Disable the validation of factor values in SDRF", - is_flag=True, -) -@click.option( - "--skip_experimental_design_validation", - help="Disable the validation of experimental design", - is_flag=True, -) -@click.option( - "--use_ols_cache_only", - help="Use ols cache for validation of the terms and not OLS internet service", - is_flag=True, -) -def validate( - exp_design: str, - is_sdrf: bool = False, - skip_sdrf_validation: bool = False, - skip_ms_validation: bool = False, - skip_factor_validation: bool = False, - skip_experimental_design_validation: bool = False, - use_ols_cache_only: bool = False, -): - """ - Reformat nf-core/quantms sdrf file and check its contents. - @param exp_design: SDRF/Expdesign file to be validated - @param is_sdrf: SDRF file or Expdesign file - @param skip_sdrf_validation: Disable the validation of SDRF - @param skip_ms_validation: Disable the validation of mass spectrometry fields in SDRF (e.g. posttranslational modifications) - @param skip_factor_validation: Disable the validation of factor values in SDRF - @param skip_experimental_design_validation: Disable the validation of experimental design - @param use_ols_cache_only: Use ols cache for validation of the terms and not OLS internet service - - """ - # TODO validate expdesign file - if is_sdrf: - check_sdrf( - input_sdrf=exp_design, - skip_sdrf_validation=skip_sdrf_validation, - skip_ms_validation=skip_ms_validation, - skip_factor_validation=skip_factor_validation, - skip_experimental_design_validation=skip_experimental_design_validation, - use_ols_cache_only=use_ols_cache_only, - ) - else: - check_expdesign(exp_design) - - -cli.add_command(validate) - - -def main(): - try: - cli() - except SystemExit as e: - if e.code != 0: - raise - - -if __name__ == "__main__": - main() diff --git a/bin/diann_convert.py b/bin/diann_convert.py deleted file mode 100755 index becf0b4a..00000000 --- a/bin/diann_convert.py +++ /dev/null @@ -1,1633 +0,0 @@ -#!/usr/bin/env python -""" -This script converts the output from DIA-NN into three standard formats: MSstats, Triqler and mzTab. -License: Apache 2.0 -Authors: Hong Wong, Yasset Perez-Riverol -Revisions: - 2023-Aug-05: J. 
Sebastian Paez -""" -import logging -import os -import re -import warnings -from pathlib import Path -from typing import Any, Dict, List, Set, Tuple, Union - -import click -import numpy as np -import pandas as pd -from pyopenms import AASequence, FASTAFile, ModificationsDB -from pyopenms.Constants import PROTON_MASS_U - -pd.set_option("display.max_rows", 500) -pd.set_option("display.max_columns", 500) -pd.set_option("display.width", 1000) - -CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) -REVISION = "0.1.1" - -logging.basicConfig( - format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG -) -logger = logging.getLogger(__name__) - - -@click.group(context_settings=CONTEXT_SETTINGS) -def cli(): - pass - - -@click.command("convert") -@click.option("--folder", "-f") -@click.option("--exp_design", "-d") -@click.option("--diann_version", "-v") -@click.option("--dia_params", "-p") -@click.option("--charge", "-c") -@click.option("--missed_cleavages", "-m") -@click.option("--qvalue_threshold", "-q", type=float) -@click.pass_context -def convert( - ctx, - folder, - exp_design, - dia_params, - diann_version, - charge, - missed_cleavages, - qvalue_threshold, -): - """ - Convert DIA-NN output to MSstats, Triqler or mzTab. - The output formats are used for quality control and downstream analysis. - - :param folder: DiannConvert specifies the folder where the required file resides. The folder contains - the DiaNN main report, protein matrix, precursor matrix, experimental design file, protein sequence - FASTA file, version file of DiaNN and ms_info TSVs - :type folder: str - :param dia_params: A list contains DIA parameters - :type dia_params: list - :param diann_version: Path to a version file of DIA-NN - :type diann_version: str - :param charge: The charge assigned by DIA-NN(max_precursor_charge) - :type charge: int - :param missed_cleavages: Allowed missed cleavages assigned by DIA-NN - :type missed_cleavages: int - :param qvalue_threshold: Threshold for filtering q value - :type qvalue_threshold: float - """ - logger.debug(f"Revision {REVISION}") - logger.debug("Reading input files...") - diann_directory = DiannDirectory(folder, diann_version_file=diann_version) - report = diann_directory.main_report_df(qvalue_threshold=qvalue_threshold) - s_DataFrame, f_table = get_exp_design_dfs(exp_design) - - # Convert to MSstats - msstats_columns_keep = [ - "Protein.Names", - "Modified.Sequence", - "Precursor.Charge", - "Precursor.Quantity", - "File.Name", - "Run", - ] - - logger.debug("Converting to MSstats format...") - out_msstats = report[msstats_columns_keep] - out_msstats.columns = [ - "ProteinName", - "PeptideSequence", - "PrecursorCharge", - "Intensity", - "Reference", - "Run", - ] - out_msstats = out_msstats[out_msstats["Intensity"] != 0] - - # Q: What is this line doing? 
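-    # A (presumably): it parses each modified sequence with pyopenms'
-    # AASequence and writes it back, normalizing the modification
-    # notation to the OpenMS spelling before the MSstats export.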
- out_msstats.loc[:, "PeptideSequence"] = out_msstats.apply( - lambda x: AASequence.fromString(x["PeptideSequence"]).toString(), axis=1 - ) - out_msstats["FragmentIon"] = "NA" - out_msstats["ProductCharge"] = "0" - out_msstats["IsotopeLabelType"] = "L" - unique_reference_map = { - k: os.path.basename(k) for k in out_msstats["Reference"].unique() - } - out_msstats["Reference"] = out_msstats["Reference"].map(unique_reference_map) - del unique_reference_map - - logger.debug("\n\nReference Column >>>") - logger.debug(out_msstats["Reference"]) - - logger.debug(f"\n\nout_msstats ({out_msstats.shape}) >>>") - logger.debug(out_msstats.head(5)) - - logger.debug(f"\n\nf_table ({f_table.shape})>>>") - logger.debug(f_table.head(5)) - - logger.debug(f"\n\ns_DataFrame ({s_DataFrame.shape})>>>") - logger.debug(s_DataFrame.head(5)) - - logger.debug("Adding Fraction, BioReplicate, Condition columns") - # Changing implementation from apply to merge went from several minutes to - # ~50ms - out_msstats = out_msstats.merge( - ( - s_DataFrame[["Sample", "MSstats_Condition", "MSstats_BioReplicate"]] - .merge(f_table[["Fraction", "Sample", "run"]], on="Sample") - .rename( - columns={ - "run": "Run", - "MSstats_BioReplicate": "BioReplicate", - "MSstats_Condition": "Condition", - } - ) - .drop(columns=["Sample"]) - ), - on="Run", - validate="many_to_one", - ) - exp_out_prefix = Path(exp_design).stem - out_msstats.to_csv(exp_out_prefix + "_msstats_in.csv", sep=",", index=False) - logger.info(f"MSstats input file is saved as {exp_out_prefix}_msstats_in.csv") - - # Convert to Triqler - triqler_cols = [ - "ProteinName", - "PeptideSequence", - "PrecursorCharge", - "Intensity", - "Run", - "Condition", - ] - out_triqler = out_msstats[triqler_cols] - del out_msstats - out_triqler.columns = [ - "proteins", - "peptide", - "charge", - "intensity", - "run", - "condition", - ] - out_triqler = out_triqler[out_triqler["intensity"] != 0] - - out_triqler.loc[:, "searchScore"] = report["Q.Value"] - out_triqler.loc[:, "searchScore"] = 1 - out_triqler["searchScore"] - out_triqler.to_csv(exp_out_prefix + "_triqler_in.tsv", sep="\t", index=False) - logger.info(f"Triqler input file is saved as {exp_out_prefix}_triqler_in.tsv") - del out_triqler - - mztab_out = f"{Path(exp_design).stem}_out.mzTab" - # Convert to mzTab - diann_directory.convert_to_mztab( - report=report, - f_table=f_table, - charge=charge, - missed_cleavages=missed_cleavages, - dia_params=dia_params, - out=mztab_out, - ) - - -def _true_stem(x): - """ - Return the true stem of a file name, i.e. the - file name without the extension. - - :param x: The file name - :type x: str - :return: The true stem of the file name - :rtype: str - - Examples: - >>> _true_stem("foo.mzML") - 'foo' - >>> _true_stem("foo.d.tar") - 'foo' - - These examples can be tested with pytest: - $ pytest -v --doctest-modules - """ - split = os.path.basename(x).split(".") - stem = split[0] - - # Should I check here that the extensions are - # allowed? I can see how this would break if the - # file name contains a period. 
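-    # Note: everything after the first "." is dropped, so "foo.d.tar"
-    # gives "foo" as intended, but a name like "my.sample.mzML" would
-    # also be truncated to "my".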
- return stem - - -def get_exp_design_dfs(exp_design_file): - logger.info(f"Reading experimental design file: {exp_design_file}") - with open(exp_design_file, "r") as f: - data = f.readlines() - empty_row = data.index("\n") - f_table = [i.replace("\n", "").split("\t") for i in data[1:empty_row]] - f_header = data[0].replace("\n", "").split("\t") - f_table = pd.DataFrame(f_table, columns=f_header) - f_table.loc[:, "run"] = f_table.apply( - lambda x: _true_stem(x["Spectra_Filepath"]), axis=1 - ) - - s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1 :]][1:] - s_header = data[empty_row + 1].replace("\n", "").split("\t") - s_DataFrame = pd.DataFrame(s_table, columns=s_header) - - return s_DataFrame, f_table - - -def compute_mass_modified_peptide(peptide_seq: str) -> float: - """ - Function that takes a peptide sequence including modifications and compute the mass using the AASequence class from - pyopenms. The notation of a peptidoform for pyopenms is the following: - - if not modifications is present: - AVQVHQDTLRTMYFAXR -> AVQVHQDTLRTMYFAX[178.995499]R - if modification is present in Methionine: - AVQVHQDTLRTM(Oxidation)YFAXR -> AVQVHQDTLRTM(Oxidation)YFAX[178.995499]R - - @param peptide_seq: str, peptide sequence - @return: float, mass of the peptide - """ - peptide_parts: List[str] = [] - not_mod = True - aa_mass = { - "X": "X[178.98493453312]", # 196.995499 - 17.003288 - 1.00727646688 - "U": "X[132.94306553312]", # 150.95363 - 17.003288 - 1.00727646688 - "O": "X[237.14773053312]", # 255.158295 - 17.003288 - 1.00727646688 - } - for aa in peptide_seq: - # Check if the letter is in aminoacid - if aa == "(": - not_mod = False - elif aa == ")": - not_mod = True - # Check aminoacid letter - if aa in aa_mass and not_mod: - aa = aa_mass[aa] - elif ( - aa - not in [ - "G", - "A", - "V", - "L", - "I", - "F", - "M", - "P", - "W", - "S", - "C", - "T", - "Y", - "N", - "Q", - "D", - "E", - "K", - "R", - "H", - ] - and not_mod - and aa != ")" - ): - logger.info(f"Unknown amino acid with mass not known:{aa}") - peptide_parts.append(aa) - new_peptide_seq = "".join(peptide_parts) - mass = AASequence.fromString(new_peptide_seq).getMonoWeight() - logger.debug(new_peptide_seq + ":" + str(mass)) - return mass - - -class DiannDirectory: - def __init__(self, base_path, diann_version_file): - self.base_path = Path(base_path) - if not self.base_path.exists() and not self.base_path.is_dir(): - raise NotADirectoryError(f"Path {self.base_path} does not exist") - self.diann_version_file = Path(diann_version_file) - if not self.diann_version_file.is_file(): - raise FileNotFoundError(f"Path {self.diann_version_file} does not exist") - - def find_first_file_with_suffix(self, suffix: str) -> os.PathLike: - """Finds a file with a given suffix in the directory. 
- - :param suffix: The suffix to search for - :type suffix: str - - :raises FileNotFoundError: If no file with the given suffix is found - """ - try: - return next(self.base_path.glob(f"**/*{suffix}")) - except StopIteration: - raise FileNotFoundError(f"Could not find file with suffix {suffix}") - - @property - def report(self) -> os.PathLike: - return self.find_first_file_with_suffix("report.tsv") - - @property - def pg_matrix(self) -> os.PathLike: - return self.find_first_file_with_suffix("pg_matrix.tsv") - - @property - def pr_matrix(self) -> os.PathLike: - return self.find_first_file_with_suffix("pr_matrix.tsv") - - @property - def fasta(self) -> os.PathLike: - try: - return self.find_first_file_with_suffix(".fasta") - except FileNotFoundError: - return self.find_first_file_with_suffix(".fa") - - @property - def ms_info(self) -> os.PathLike: - return self.find_first_file_with_suffix("ms_info.tsv") - - @property - def diann_version(self) -> str: - logger.debug("Validating DIANN version") - diann_version_id = None - with open(self.diann_version_file) as f: - for line in f: - if "DIA-NN" in line: - logger.debug(f"Found DIA-NN version: {line}") - diann_version_id = line.rstrip("\n").split(": ")[1] - - if diann_version_id is None: - raise ValueError( - f"Could not find DIA-NN version in file {self.diann_version_file}" - ) - - return diann_version_id - - def validate_diann_version(self) -> None: - supported_diann_versions = ["1.8.1", "1.9.beta.1"] - if self.diann_version not in supported_diann_versions: - raise ValueError(f"Unsupported DIANN version {self.diann_version}") - - def convert_to_mztab( - self, - report, - f_table, - charge: int, - missed_cleavages: int, - dia_params: List[Any], - out: os.PathLike, - ) -> None: - logger.info("Converting to mzTab") - self.validate_diann_version() - - # This could be a branching point if we want to support other versions - # of DIA-NN, maybe something like this: - # if diann_version_id == "1.8.1": - # self.convert_to_mztab_1_8_1(report, f_table, charge, missed_cleavages, dia_params) - # else: - # raise ValueError(f"Unsupported DIANN version {diann_version_id}, supported versions are 1.8.1 ...") - - logger.info(f"Reading fasta file: {self.fasta}") - entries: list = [] - f = FASTAFile() - f.load(str(self.fasta), entries) - fasta_entries = [(e.identifier, e.sequence, len(e.sequence)) for e in entries] - fasta_df = pd.DataFrame(fasta_entries, columns=["id", "seq", "len"]) - - logger.info("Mapping run information to report") - index_ref = f_table.copy() - index_ref.rename( - columns={ - "Fraction_Group": "ms_run", - "Sample": "study_variable", - "run": "Run", - }, - inplace=True, - ) - index_ref["ms_run"] = index_ref["ms_run"].astype("int") - index_ref["study_variable"] = index_ref["study_variable"].astype("int") - report = report.merge( - index_ref[["ms_run", "Run", "study_variable"]], - on="Run", - validate="many_to_one", - ) - - MTD, database = mztab_MTD( - index_ref, dia_params, str(self.fasta), charge, missed_cleavages - ) - pg = pd.read_csv( - self.pg_matrix, - sep="\t", - header=0, - ) - PRH = mztab_PRH(report, pg, index_ref, database, fasta_df) - del pg - pr = pd.read_csv( - self.pr_matrix, - sep="\t", - header=0, - ) - precursor_list = list(report["Precursor.Id"].unique()) - PEH = mztab_PEH(report, pr, precursor_list, index_ref, database) - del pr - PSH = mztab_PSH(report, str(self.base_path), database) - del report - MTD.loc["", :] = "" - PRH.loc[len(PRH) + 1, :] = "" - PEH.loc[len(PEH) + 1, :] = "" - with open(out, "w", newline="") as f: - 
-            MTD.to_csv(f, mode="w", sep="\t", index=False, header=False)
-            PRH.to_csv(f, mode="w", sep="\t", index=False, header=True)
-            PEH.to_csv(f, mode="w", sep="\t", index=False, header=True)
-            PSH.to_csv(f, mode="w", sep="\t", index=False, header=True)
-
-        logger.info(f"mzTab file generated successfully at {out}")
-
-    def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame:
-        remain_cols = [
-            "File.Name",
-            "Run",
-            "Protein.Group",
-            "Protein.Names",
-            "Protein.Ids",
-            "First.Protein.Description",
-            "PG.MaxLFQ",
-            "RT.Start",
-            "Global.Q.Value",
-            "Lib.Q.Value",
-            "PEP",
-            "Precursor.Normalised",
-            "Precursor.Id",
-            "Q.Value",
-            "Modified.Sequence",
-            "Stripped.Sequence",
-            "Precursor.Charge",
-            "Precursor.Quantity",
-            "Global.PG.Q.Value",
-        ]
-        report = pd.read_csv(self.report, sep="\t", header=0, usecols=remain_cols)
-
-        # filter based on qvalue parameter for downstream analysis
-        logger.debug(
-            f"Filtering report based on qvalue threshold: {qvalue_threshold}, {len(report)} rows"
-        )
-        report = report[report["Q.Value"] < qvalue_threshold]
-        logger.debug(f"Report filtered, {len(report)} rows remaining")
-
-        logger.debug("Calculating Precursor.Mz")
-        # Making the map is 10x faster with respect to the previous
-        # implementation, and includes the mass of the modification.
-        uniq_masses = {
-            k: compute_mass_modified_peptide(k)
-            for k in report["Modified.Sequence"].unique()
-        }
-        mass_vector = report["Modified.Sequence"].map(uniq_masses)
-        report["Calculate.Precursor.Mz"] = (
-            mass_vector + (PROTON_MASS_U * report["Precursor.Charge"])
-        ) / report["Precursor.Charge"]
-
-        logger.debug("Indexing Precursors")
-        # Making the map is 1500x faster
-        precursor_index_map = {
-            k: i for i, k in enumerate(report["Precursor.Id"].unique())
-        }
-        report["precursor.Index"] = report["Precursor.Id"].map(precursor_index_map)
-
-        logger.debug(f"Shape of main report {report.shape}")
-        logger.debug(str(report.head()))
-
-        return report
-
-
-def MTD_mod_info(fix_mod, var_mod):
-    """
-    Convert fixed and variable modifications to the format required by the MTD sub-table.
-
-    :param fix_mod: Fixed modifications from DIA parameter list
-    :type fix_mod: str
-    :param var_mod: Variable modifications from DIA parameter list
-    :type var_mod: str
-    :return: A tuple contains fixed and variable modifications, and flags indicating whether they are null
-    :rtype: tuple
-    """
-    var_ptm = []
-    fix_ptm = []
-    mods_db = ModificationsDB()
-
-    if fix_mod != "null":
-        fix_flag = 1
-        for mod in fix_mod.split(","):
-            mod_obj = mods_db.getModification(mod)
-            mod_name = mod_obj.getId()
-            mod_accession = mod_obj.getUniModAccession()
-            site = mod_obj.getOrigin()
-            fix_ptm.append(
-                ("[UNIMOD, " + mod_accession.upper() + ", " + mod_name + ", ]", site)
-            )
-    else:
-        fix_flag = 0
-        fix_ptm.append("[MS, MS:1002453, No fixed modifications searched, ]")
-
-    if var_mod != "null":
-        var_flag = 1
-        for mod in var_mod.split(","):
-            mod_obj = mods_db.getModification(mod)
-            mod_name = mod_obj.getId()
-            mod_accession = mod_obj.getUniModAccession()
-            site = mod_obj.getOrigin()
-            var_ptm.append(
-                ("[UNIMOD, " + mod_accession.upper() + ", " + mod_name + ", ]", site)
-            )
-    else:
-        var_flag = 0
-        var_ptm.append("[MS, MS:1002454, No variable modifications searched, ]")
-
-    return fix_ptm, var_ptm, fix_flag, var_flag
-
-
-def mztab_MTD(index_ref, dia_params, fasta, charge, missed_cleavages):
-    """
-    Construct MTD sub-table.
- - :param index_ref: On the basis of f_table, two columns "MS_run" and "study_variable" are added for matching - :type index_ref: pandas.core.frame.DataFrame - :param dia_params: A list contains DIA parameters - :type dia_params: list - :param fasta: Fasta file path - :type fasta: str - :param charge: Charges set by Dia-NN - :type charge: int - :param missed_cleavages: Missed cleavages set by Dia-NN - :type missed_cleavages: int - :return: MTD sub-table - :rtype: pandas.core.frame.DataFrame - """ - logger.info("Constructing MTD sub-table...") - dia_params_list = dia_params.split(";") - dia_params_list = ["null" if i == "" else i for i in dia_params_list] - FragmentMassTolerance = dia_params_list[0] - FragmentMassToleranceUnit = dia_params_list[1] - PrecursorMassTolerance = dia_params_list[2] - PrecursorMassToleranceUnit = dia_params_list[3] - Enzyme = dia_params_list[4] - FixedModifications = dia_params_list[5] - VariableModifications = dia_params_list[6] - out_mztab_MTD = pd.DataFrame() - out_mztab_MTD.loc[1, "mzTab-version"] = "1.0.0" - out_mztab_MTD.loc[1, "mzTab-mode"] = "Summary" - out_mztab_MTD.loc[1, "mzTab-type"] = "Quantification" - out_mztab_MTD.loc[1, "title"] = "ConsensusMap export from OpenMS" - out_mztab_MTD.loc[1, "description"] = "OpenMS export from consensusXML" - out_mztab_MTD.loc[1, "protein_search_engine_score[1]"] = ( - "[, , DIA-NN Global.PG.Q.Value, ]" - ) - out_mztab_MTD.loc[1, "peptide_search_engine_score[1]"] = ( - "[, , DIA-NN Q.Value (minimum of the respective precursor q-values), ]" - ) - out_mztab_MTD.loc[1, "psm_search_engine_score[1]"] = ( - "[MS, MS:MS:1001869, protein-level q-value, ]" - ) - out_mztab_MTD.loc[1, "software[1]"] = "[MS, MS:1003253, DIA-NN, Release (v1.8.1)]" - out_mztab_MTD.loc[1, "software[1]-setting[1]"] = fasta - out_mztab_MTD.loc[1, "software[1]-setting[2]"] = "db_version:null" - out_mztab_MTD.loc[1, "software[1]-setting[3]"] = ( - "fragment_mass_tolerance:" + FragmentMassTolerance - ) - out_mztab_MTD.loc[1, "software[1]-setting[4]"] = ( - "fragment_mass_tolerance_unit:" + FragmentMassToleranceUnit - ) - out_mztab_MTD.loc[1, "software[1]-setting[5]"] = ( - "precursor_mass_tolerance:" + PrecursorMassTolerance - ) - out_mztab_MTD.loc[1, "software[1]-setting[6]"] = ( - "precursor_mass_tolerance_unit:" + PrecursorMassToleranceUnit - ) - out_mztab_MTD.loc[1, "software[1]-setting[7]"] = "enzyme:" + Enzyme - out_mztab_MTD.loc[1, "software[1]-setting[8]"] = "enzyme_term_specificity:full" - out_mztab_MTD.loc[1, "software[1]-setting[9]"] = "charges:" + str(charge) - out_mztab_MTD.loc[1, "software[1]-setting[10]"] = "missed_cleavages:" + str( - missed_cleavages - ) - out_mztab_MTD.loc[1, "software[1]-setting[11]"] = ( - "fixed_modifications:" + FixedModifications - ) - out_mztab_MTD.loc[1, "software[1]-setting[12]"] = ( - "variable_modifications:" + VariableModifications - ) - - (fixed_mods, variable_mods, fix_flag, var_flag) = MTD_mod_info( - FixedModifications, VariableModifications - ) - if fix_flag == 1: - for i in range(1, len(fixed_mods) + 1): - out_mztab_MTD.loc[1, "fixed_mod[" + str(i) + "]"] = fixed_mods[i - 1][0] - out_mztab_MTD.loc[1, "fixed_mod[" + str(i) + "]-site"] = fixed_mods[i - 1][ - 1 - ] - out_mztab_MTD.loc[1, "fixed_mod[" + str(i) + "]-position"] = "Anywhere" - else: - out_mztab_MTD.loc[1, "fixed_mod[1]"] = fixed_mods[0] - - if var_flag == 1: - for i in range(1, len(variable_mods) + 1): - out_mztab_MTD.loc[1, "variable_mod[" + str(i) + "]"] = variable_mods[i - 1][ - 0 - ] - out_mztab_MTD.loc[1, "variable_mod[" + str(i) + 
"]-site"] = variable_mods[ - i - 1 - ][1] - out_mztab_MTD.loc[1, "variable_mod[" + str(i) + "]-position"] = "Anywhere" - else: - out_mztab_MTD.loc[1, "variable_mod[1]"] = variable_mods[0] - - out_mztab_MTD.loc[1, "quantification_method"] = ( - "[MS, MS:1001834, LC-MS label-free quantitation analysis, ]" - ) - out_mztab_MTD.loc[1, "protein-quantification_unit"] = "[, , Abundance, ]" - out_mztab_MTD.loc[1, "peptide-quantification_unit"] = "[, , Abundance, ]" - - for i in range(1, max(index_ref["ms_run"]) + 1): - out_mztab_MTD.loc[1, "ms_run[" + str(i) + "]-format"] = ( - "[MS, MS:1000584, mzML file, ]" - ) - out_mztab_MTD.loc[1, "ms_run[" + str(i) + "]-location"] = ( - "file://" - + index_ref[index_ref["ms_run"] == i]["Spectra_Filepath"].values[0] - ) - out_mztab_MTD.loc[1, "ms_run[" + str(i) + "]-id_format"] = ( - "[MS, MS:1000777, spectrum identifier nativeID format, ]" - ) - out_mztab_MTD.loc[1, "assay[" + str(i) + "]-quantification_reagent"] = ( - "[MS, MS:1002038, unlabeled sample, ]" - ) - out_mztab_MTD.loc[1, "assay[" + str(i) + "]-ms_run_ref"] = ( - "ms_run[" + str(i) + "]" - ) - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - # This is used here in order to ignore performance warnings from pandas. - for i in range(1, max(index_ref["study_variable"]) + 1): - study_variable = [] - for j in list(index_ref[index_ref["study_variable"] == i]["ms_run"].values): - study_variable.append("assay[" + str(j) + "]") - out_mztab_MTD.loc[1, "study_variable[" + str(i) + "]-assay_refs"] = ( - ",".join(study_variable) - ) - out_mztab_MTD.loc[1, "study_variable[" + str(i) + "]-description"] = ( - "no description given" - ) - - # The former loop makes a very sharded frame, this - # makes the frame more compact in memory. - out_mztab_MTD = out_mztab_MTD.copy() - out_mztab_MTD.loc[2, :] = "MTD" - - # Transpose out_mztab_MTD - col = list(out_mztab_MTD.columns) - row = list(out_mztab_MTD.index) - out_mztab_MTD_T = pd.DataFrame(out_mztab_MTD.values.T, index=col, columns=row) - out_mztab_MTD_T.columns = ["inf", "index"] - out_mztab_MTD_T.insert(0, "title", out_mztab_MTD_T.index) - index = out_mztab_MTD_T.loc[:, "index"] - out_mztab_MTD_T.drop(labels=["index"], axis=1, inplace=True) - out_mztab_MTD_T.insert(0, "index", index) - database = os.path.basename(fasta.split(".")[-2]) - - return out_mztab_MTD_T, database - - -def mztab_PRH(report, pg, index_ref, database, fasta_df): - """ - Construct PRH sub-table. 
- - :param report: Dataframe for Dia-NN main report - :type report: pandas.core.frame.DataFrame - :param pg: Dataframe for Dia-NN protein groups matrix - :type pg: pandas.core.frame.DataFrame - :param index_ref: On the basis of f_table, two columns "ms_run" and "study_variable" are added for matching - :type index_ref: pandas.core.frame.DataFrame - :param database: Path to fasta file - :type database: str - :param fasta_df: A dataframe contains protein IDs, sequences and lengths - :type fasta_df: pandas.core.frame.DataFrame - :return: PRH sub-table - :rtype: pandas.core.frame.DataFrame - """ - logger.info("Constructing PRH sub-table...") - logger.debug( - f"Input report shape: {report.shape}," - f" input pg shape: {pg.shape}," - f" input index_ref shape: {index_ref.shape}," - f" input fasta_df shape: {fasta_df.shape}" - ) - file = list(pg.columns[5:]) - col = {} - for i in file: - col[i] = ( - "protein_abundance_assay[" - + str(index_ref[index_ref["Run"] == _true_stem(i)]["ms_run"].values[0]) - + "]" - ) - - pg.rename(columns=col, inplace=True) - - logger.debug("Classifying results type ...") - pg["opt_global_result_type"] = "single_protein" - pg.loc[pg["Protein.Group"].str.contains(";"), "opt_global_result_type"] = ( - "indistinguishable_protein_group" - ) - - out_mztab_PRH = pg - del pg - out_mztab_PRH = out_mztab_PRH.drop(["Protein.Names"], axis=1) - out_mztab_PRH.rename( - columns={"First.Protein.Description": "description"}, inplace=True - ) - out_mztab_PRH.loc[:, "database"] = database - - null_col = [ - "taxid", - "species", - "database_version", - "search_engine", - "opt_global_Posterior_Probability_score", - "opt_global_nr_found_peptides", - "opt_global_cv_PRIDE:0000303_decoy_hit", - ] - for i in null_col: - out_mztab_PRH.loc[:, i] = "null" - - logger.debug("Extracting accession values (keeping first)...") - out_mztab_PRH.loc[:, "accession"] = out_mztab_PRH.apply( - lambda x: x["Protein.Group"].split(";")[0], axis=1 - ) - - protein_details_df = out_mztab_PRH[ - out_mztab_PRH["opt_global_result_type"] == "indistinguishable_protein_group" - ] - prh_series = ( - protein_details_df["Protein.Group"] - .str.split(";", expand=True) - .stack() - .reset_index(level=1, drop=True) - ) - prh_series.name = "accession" - protein_details_df = ( - protein_details_df.drop("accession", axis=1) - .join(prh_series) - .reset_index() - .drop(columns="index") - ) - if len(protein_details_df) > 0: - logger.info(f"Found {len(protein_details_df)} indistinguishable protein groups") - # The Following line fails if there are no indistinguishable protein groups - protein_details_df.loc[:, "col"] = "protein_details" - # protein_details_df = protein_details_df[-protein_details_df["accession"].str.contains("-")] - out_mztab_PRH = pd.concat([out_mztab_PRH, protein_details_df]).reset_index( - drop=True - ) - else: - logger.info("No indistinguishable protein groups found") - - logger.debug("Calculating protein coverage (bottleneck)...") - # This is a bottleneck - # reimplementation runs in 67s vs 137s (old) in my data - out_mztab_PRH.loc[:, "protein_coverage"] = calculate_protein_coverages( - report=report, out_mztab_PRH=out_mztab_PRH, fasta_df=fasta_df - ) - - logger.debug("Getting ambiguity members...") - # IN THEORY this should be the same as - # out_mztab_PRH["ambiguity_members"] = out_mztab_PRH["Protein.Ids"] - # out_mztab_PRH.loc[out_mztab_PRH["opt_global_result_type"] == "single_protein", "ambiguity_members"] = "null" - # or out_mztab_PRH.loc[out_mztab_PRH["Protein.Ids"] == out_mztab_PRH["accession"], 
"ambiguity_members"] = "null" - out_mztab_PRH.loc[:, "ambiguity_members"] = out_mztab_PRH.apply( - lambda x: ( - x["Protein.Group"] - if x["opt_global_result_type"] == "indistinguishable_protein_group" - else "null" - ), - axis=1, - ) - - logger.debug("Matching PRH to best search engine score...") - score_looker = ModScoreLooker(report) - out_mztab_PRH[["modifiedSequence", "best_search_engine_score[1]"]] = ( - out_mztab_PRH.apply( - lambda x: score_looker.get_score(x["Protein.Group"]), - axis=1, - result_type="expand", - ) - ) - - logger.debug("Matching PRH to modifications...") - out_mztab_PRH.loc[:, "modifications"] = out_mztab_PRH.apply( - lambda x: find_modification(x["modifiedSequence"]), axis=1, result_type="expand" - ) - - logger.debug("Matching PRH to protein quantification...") - ## quantity at protein level: PG.MaxLFQ - # This used to be a bottleneck in performance - # This implementation drops the run time from 57s to 25ms - protein_agg_report = ( - report[["PG.MaxLFQ", "Protein.Group", "study_variable"]] - .groupby(["study_variable", "Protein.Group"]) - .agg({"PG.MaxLFQ": ["mean", "std", "sem"]}) - .reset_index() - .pivot(columns=["study_variable"], index="Protein.Group") - .reset_index() - ) - protein_agg_report.columns = [ - "::".join([str(s) for s in col]).strip() - for col in protein_agg_report.columns.values - ] - subname_mapper = { - "Protein.Group::::": "Protein.Group", - "PG.MaxLFQ::mean": "protein_abundance_study_variable", - "PG.MaxLFQ::std": "protein_abundance_stdev_study_variable", - "PG.MaxLFQ::sem": "protein_abundance_std_error_study_variable", - } - name_mapper = name_mapper_builder(subname_mapper) - protein_agg_report.rename(columns=name_mapper, inplace=True) - # out_mztab_PRH has columns accession and Protein.Ids; 'Q9NZJ9', 'A0A024RBG1;Q9NZJ9;Q9NZJ9-2'] - # the report table has 'Protein.Group' and 'Protein.Ids': 'Q9NZJ9', 'A0A024RBG1;Q9NZJ9;Q9NZJ9-2' - # Oddly enough the last implementation mapped the the accession (Q9NZJ9) in the mztab - # to the Protein.Ids (A0A024RBG1;Q9NZJ9;Q9NZJ9-2), leading to A LOT of missing values. - out_mztab_PRH = out_mztab_PRH.merge( - protein_agg_report, - on="Protein.Group", - how="left", - validate="many_to_one", - copy=True, - ) - del name_mapper - del subname_mapper - del protein_agg_report - # end of (former) bottleneck - - out_mztab_PRH.loc[:, "PRH"] = "PRT" - index = out_mztab_PRH.loc[:, "PRH"] - out_mztab_PRH.drop( - ["PRH", "Genes", "modifiedSequence", "Protein.Group"], axis=1, inplace=True - ) - out_mztab_PRH.insert(0, "PRH", index) - out_mztab_PRH.fillna("null", inplace=True) - out_mztab_PRH.loc[:, "database"] = database - new_cols = [col for col in out_mztab_PRH.columns if not col.startswith("opt_")] + [ - col for col in out_mztab_PRH.columns if col.startswith("opt_") - ] - out_mztab_PRH = out_mztab_PRH[new_cols] - return out_mztab_PRH - - -def mztab_PEH( - report: pd.DataFrame, - pr: pd.DataFrame, - precursor_list: List[str], - index_ref: pd.DataFrame, - database: os.PathLike, -) -> pd.DataFrame: - """ - Construct PEH sub-table. 
- - :param report: Dataframe for Dia-NN main report - :type report: pandas.core.frame.DataFrame - :param pr: Dataframe for Dia-NN precursors matrix - :type pr: pandas.core.frame.DataFrame - :param precursor_list: A list contains all precursor IDs - :type precursor_list: list - :param index_ref: On the basis of f_table, two columns "ms_run" and "study_variable" are added for matching - :type index_ref: pandas.core.frame.DataFrame - :param database: Path to fasta file - :type database: str - :return: PEH sub-table - :rtype: pandas.core.frame.DataFrame - """ - logger.info("Constructing PEH sub-table...") - logger.debug( - f"report.shape: {report.shape}, " - f" pr.shape: {pr.shape}," - f" len(precursor_list): {len(precursor_list)}," - f" index_ref.shape: {index_ref.shape}" - ) - out_mztab_PEH = pd.DataFrame() - out_mztab_PEH = pr.iloc[:, 0:10] - out_mztab_PEH.drop( - ["Protein.Ids", "Protein.Names", "First.Protein.Description", "Proteotypic"], - axis=1, - inplace=True, - ) - out_mztab_PEH.rename( - columns={ - "Stripped.Sequence": "sequence", - "Protein.Group": "accession", - "Modified.Sequence": "opt_global_cv_MS:1000889_peptidoform_sequence", - "Precursor.Charge": "charge", - }, - inplace=True, - ) - - logger.debug("Finding modifications...") - out_mztab_PEH.loc[:, "modifications"] = out_mztab_PEH.apply( - lambda x: find_modification(x["opt_global_cv_MS:1000889_peptidoform_sequence"]), - axis=1, - result_type="expand", - ) - - logger.debug("Extracting sequence...") - out_mztab_PEH.loc[:, "opt_global_cv_MS:1000889_peptidoform_sequence"] = ( - out_mztab_PEH.apply( - lambda x: AASequence.fromString( - x["opt_global_cv_MS:1000889_peptidoform_sequence"] - ).toString(), - axis=1, - ) - ) - - logger.debug("Checking accession uniqueness...") - out_mztab_PEH.loc[:, "unique"] = out_mztab_PEH.apply( - lambda x: "0" if ";" in str(x["accession"]) else "1", - axis=1, - result_type="expand", - ) - - null_col = [ - "database_version", - "search_engine", - "retention_time_window", - "mass_to_charge", - "opt_global_feature_id", - ] - for i in null_col: - out_mztab_PEH.loc[:, i] = "null" - out_mztab_PEH.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = "0" - - logger.debug("Matching precursor IDs...") - # Pre-calculating the indices and using a lookup table drops run time from - # ~6.5s to 11ms - precursor_indices = {k: i for i, k in enumerate(precursor_list)} - pr_ids = out_mztab_PEH["Precursor.Id"].map(precursor_indices) - out_mztab_PEH["pr_id"] = pr_ids - del precursor_indices - - logger.debug("Getting scores per run") - # This implementation is 422-700x faster than the apply-based one - tmp = ( - report.groupby(["precursor.Index", "ms_run"]) - .agg({"Q.Value": ["min"]}) - .reset_index() - .pivot(columns=["ms_run"], index="precursor.Index") - .reset_index() - ) - tmp.columns = pd.Index( - ["::".join([str(s) for s in col]).strip() for col in tmp.columns.values] - ) - subname_mapper = { - "precursor.Index::::": "precursor.Index", - "Q.Value::min": "search_engine_score[1]_ms_run", - } - name_mapper = name_mapper_builder(subname_mapper) - tmp.rename(columns=name_mapper, inplace=True) - out_mztab_PEH = out_mztab_PEH.merge( - tmp.rename(columns={"precursor.Index": "pr_id"}), - on="pr_id", - validate="one_to_one", - ) - del tmp - del subname_mapper - del name_mapper - - logger.debug("Getting peptide abundances per study variable") - pep_study_report = per_peptide_study_report(report) - out_mztab_PEH = out_mztab_PEH.merge( - pep_study_report, on="pr_id", how="left", validate="one_to_one", copy=True - ) - del 
pep_study_report - - logger.debug("Getting peptide properties...") - # Re-implementing this section from apply -> assign to groupby->agg - # speeds up the process from 11s to 25ms in my data (~440x faster) - # Notably, this changes slightly... - # "opt_global_q-value" was the FIRST "Global.Q.Value", now its the min - # "opt_global_SpecEValue_score" was the FIRST "Lib.Q.Value" now its the min - # I believe picking the first is inconsistent because no sorting is checked - # and the first is arbitrary. - - aggtable = ( - report.groupby(["precursor.Index"]) - .agg( - { - "Q.Value": "min", - "RT.Start": "mean", - "Global.Q.Value": "min", - "Lib.Q.Value": "min", - "Calculate.Precursor.Mz": "mean", - } - ) - .reset_index() - .rename( - columns={ - "precursor.Index": "pr_id", - "Q.Value": "best_search_engine_score[1]", - "RT.Start": "retention_time", - "Global.Q.Value": "opt_global_q-value", - "Lib.Q.Value": "opt_global_SpecEValue_score", - "Calculate.Precursor.Mz": "mass_to_charge", - } - ) - ) - del out_mztab_PEH["mass_to_charge"] - out_mztab_PEH = out_mztab_PEH.merge(aggtable, on="pr_id", validate="one_to_one") - - logger.debug("Re-ordering columns...") - out_mztab_PEH.loc[:, "PEH"] = "PEP" - out_mztab_PEH.loc[:, "database"] = str(database) - index = out_mztab_PEH.loc[:, "PEH"] - out_mztab_PEH.drop(["PEH", "Precursor.Id", "Genes", "pr_id"], axis=1, inplace=True) - out_mztab_PEH.insert(0, "PEH", index) - out_mztab_PEH.fillna("null", inplace=True) - new_cols = [col for col in out_mztab_PEH.columns if not col.startswith("opt_")] + [ - col for col in out_mztab_PEH.columns if col.startswith("opt_") - ] - out_mztab_PEH = out_mztab_PEH[new_cols] - - return out_mztab_PEH - - -def mztab_PSH(report, folder, database): - """ - Construct PSH sub-table. - - :param report: Dataframe for Dia-NN main report - :type report: pandas.core.frame.DataFrame - :param folder: DiannConvert specifies the folder where the required file resides. The folder contains - the DiaNN main report, protein matrix, precursor matrix, experimental design file, protein sequence - FASTA file, version file of DiaNN and ms_info TSVs - :type folder: str - :param database: Path to fasta file - :type database: str - :return: PSH sub-table - :rtype: pandas.core.frame.DataFrame - """ - logger.info("Constructing PSH sub-table") - - def __find_info(directory, n): - # This line matches n="220101_myfile", folder="." to - # "myfolder/220101_myfile_ms_info.tsv" - files = list(Path(directory).rglob(f"{n}_ms_info.parquet")) - # Check that it matches one and only one file - if not files: - raise ValueError(f"Could not find {n} info file in {directory}") - if len(files) > 1: - raise ValueError(f"Found multiple {n} info files in {directory}: {files}") - - return files[0] - - out_mztab_PSH = pd.DataFrame() - for n, group in report.groupby(["Run"]): - if isinstance(n, tuple) and len(n) == 1: - # This is here only to support versions of pandas where the groupby - # key is a tuple. 
- # related: https://github.com/pandas-dev/pandas/pull/51817 - n = n[0] - - file = __find_info(folder, n) - target = pd.read_parquet(file) - group.sort_values(by="RT.Start", inplace=True) - target = target[["Retention_Time", "SpectrumID", "Exp_Mass_To_Charge"]] - target.columns = [ - "RT.Start", - "opt_global_spectrum_reference", - "exp_mass_to_charge", - ] - # Standardize spectrum identifier format for bruker data - if type(target.loc[0, "opt_global_spectrum_reference"]) != str: - target.loc[:, "opt_global_spectrum_reference"] = "scan=" + target.loc[ - :, "opt_global_spectrum_reference" - ].astype(str) - - # TODO seconds returned from precursor.getRT() - target.loc[:, "RT.Start"] = target.apply(lambda x: x["RT.Start"] / 60, axis=1) - out_mztab_PSH = pd.concat( - [ - out_mztab_PSH, - pd.merge_asof(group, target, on="RT.Start", direction="nearest"), - ] - ) - del report - - ## Score at PSM level: Q.Value - out_mztab_PSH = out_mztab_PSH[ - [ - "Stripped.Sequence", - "Protein.Group", - "Q.Value", - "RT.Start", - "Precursor.Charge", - "Calculate.Precursor.Mz", - "exp_mass_to_charge", - "Modified.Sequence", - "PEP", - "Global.Q.Value", - "Global.Q.Value", - "opt_global_spectrum_reference", - "ms_run", - ] - ] - out_mztab_PSH.columns = [ - "sequence", - "accession", - "search_engine_score[1]", - "retention_time", - "charge", - "calc_mass_to_charge", - "exp_mass_to_charge", - "opt_global_cv_MS:1000889_peptidoform_sequence", - "opt_global_SpecEValue_score", - "opt_global_q-value", - "opt_global_q-value_score", - "opt_global_spectrum_reference", - "ms_run", - ] - - out_mztab_PSH.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = "0" - out_mztab_PSH.loc[:, "PSM_ID"] = out_mztab_PSH.index - out_mztab_PSH.loc[:, "unique"] = out_mztab_PSH.apply( - lambda x: "0" if ";" in str(x["accession"]) else "1", - axis=1, - result_type="expand", - ) - out_mztab_PSH.loc[:, "database"] = database - - null_col = [ - "database_version", - "search_engine", - "pre", - "post", - "start", - "end", - "opt_global_feature_id", - "opt_global_map_index", - ] - for i in null_col: - out_mztab_PSH.loc[:, i] = "null" - - logger.info("Finding Modifications ...") - out_mztab_PSH.loc[:, "modifications"] = out_mztab_PSH.apply( - lambda x: find_modification(x["opt_global_cv_MS:1000889_peptidoform_sequence"]), - axis=1, - result_type="expand", - ) - - out_mztab_PSH.loc[:, "spectra_ref"] = out_mztab_PSH.apply( - lambda x: "ms_run[{}]:".format(x["ms_run"]) - + x["opt_global_spectrum_reference"], - axis=1, - result_type="expand", - ) - - out_mztab_PSH.loc[:, "opt_global_cv_MS:1000889_peptidoform_sequence"] = ( - out_mztab_PSH.apply( - lambda x: AASequence.fromString( - x["opt_global_cv_MS:1000889_peptidoform_sequence"] - ).toString(), - axis=1, - result_type="expand", - ) - ) - - out_mztab_PSH.loc[:, "PSH"] = "PSM" - index = out_mztab_PSH.loc[:, "PSH"] - out_mztab_PSH.drop(["PSH", "ms_run"], axis=1, inplace=True) - out_mztab_PSH.insert(0, "PSH", index) - out_mztab_PSH.fillna("null", inplace=True) - new_cols = [col for col in out_mztab_PSH.columns if not col.startswith("opt_")] + [ - col for col in out_mztab_PSH.columns if col.startswith("opt_") - ] - out_mztab_PSH = out_mztab_PSH[new_cols] - # out_mztab_PSH.to_csv("./out_psms.mztab", sep=",", index=False) - - return out_mztab_PSH - - -def add_info(target, index_ref): - """ - On the basis of f_table, two columns "ms_run" and "study_variable" are added for matching. 
-
-    :param target: The value of "Run" column in f_table
-    :type target: str
-    :param index_ref: A dataframe on the basis of f_table
-    :type index_ref: pandas.core.frame.DataFrame
-    :return: A tuple contains ms_run and study_variable
-    :rtype: tuple
-    """
-    match = index_ref[index_ref["Run"] == target]
-    ms_run = match["ms_run"].values[0]
-    study_variable = match["study_variable"].values[0]
-
-    return ms_run, study_variable
-
-
-def classify_result_type(target):
-    """Classify proteins
-
-    :param target: The target dataframe contains "Protein.Group" and "Protein.Ids"
-    :type target: pandas.core.frame.DataFrame
-    :return: A string implying the protein type
-    :rtype: str
-    """
-    if ";" in target["Protein.Group"]:
-        return "indistinguishable_protein_group"
-    return "single_protein"
-
-
-def match_in_report(report, target, max_, flag, level):
-    """
-    This function is used to match the columns "ms_run" and "study_variable" from the report and
-    get the corresponding information for the mztab ms_run and study_values metadata values.
-
-    :param report: Dataframe for Dia-NN main report
-    :type report: pandas.core.frame.DataFrame
-    :param target: The value of "pr_id" column in out_mztab_PEH(level="peptide") or the "accession" column in out_mztab_PRH(level="protein")
-    :type target: str
-    :param max_: max_assay or max_study_variable
-    :type max_: int
-    :param flag: Match the "study_variable" column(flag=1) or the "ms_run" column(flag=0) in the filter result
-    :type flag: int
-    :param level: "pep" or "protein"
-    :type level: str
-    :return: A tuple contains multiple messages
-    :rtype: tuple
-    """  # noqa
-    if flag == 1 and level == "pep":
-        result = report[report["precursor.Index"] == target]
-        PEH_params = []
-        for i in range(1, max_ + 1):
-            match = result[result["study_variable"] == i]
-            PEH_params.extend(
-                [
-                    match["Precursor.Normalised"].mean(),
-                    "null",
-                    "null",
-                    "null",
-                    match["RT.Start"].mean(),
-                ]
-            )
-
-        return tuple(PEH_params)
-
-    if flag == 0 and level == "pep":
-        result = report[report["precursor.Index"] == target]
-        q_value = []
-        for i in range(1, max_ + 1):
-            match = result[result["ms_run"] == i]
-            q_value.append(
-                match["Q.Value"].values[0]
-                if match["Q.Value"].values.size > 0
-                else np.nan
-            )
-
-        return tuple(q_value)
-
-    if flag == 1 and level == "protein":
-        result = report[report["Protein.Group"] == target]
-        PRH_params = []
-        for i in range(1, max_ + 1):
-            match = result[result["study_variable"] == i]
-            PRH_params.extend([match["PG.MaxLFQ"].mean(), "null", "null"])
-
-        return tuple(PRH_params)
-
-
-class ModScoreLooker:
-    """
-    Class used to cache the lookup table of accessions to best scores and their
-    respective mod sequences.
-
-    Pre-computing the lookup table leverages a lot of speedup and vectorized
-    operations from pandas, and is much faster than doing the lookup on the fly
-    in a loop.
- - :param report: Dataframe for Dia-NN main report - :type report: pandas.core.frame.DataFrame - """ - - def __init__(self, report: pd.DataFrame) -> None: - self.lookup_dict = self.make_lookup_dict(report) - - def make_lookup_dict(self, report) -> Dict[str, Tuple[str, float]]: - grouped_df = ( - report[["Modified.Sequence", "Protein.Group", "Global.PG.Q.Value"]] - .sort_values("Global.PG.Q.Value", ascending=True) - .groupby(["Protein.Group"]) - .head(1) - ) - # Modified.Sequence Protein.Group Global.PG.Q.Value - # 78265 LFNEQNFFQR Q8IV63;Q8IV63-2;Q8IV63-3 0.000252 - # 103585 NPTIVNFPITNVDLR Q53GS9;Q53GS9-2 0.000252 - # 103586 NPTWKPLIR Q7Z4Q2;Q7Z4Q2-2 0.000252 - # 103588 NPVGYPLAWQFLR Q9NZ08;Q9NZ08-2 0.000252 - - out = { - row["Protein.Group"]: (row["Modified.Sequence"], row["Global.PG.Q.Value"]) - for _, row in grouped_df.iterrows() - } - return out - - def get_score(self, protein_id: str) -> Tuple[Union[str, float], float]: - """Returns a tuple contains modified sequences and the score at protein level. - - Gets the best score and corresponding peptide for a given protein_id - - Note that protein id can be something like 'Q8IV63;Q8IV63-2;Q8IV63-3' - - Note2: This implementation also fixes a bug where the function would - return the first peptide in the report, not the best one. (but with the - score of the best one for that accession) - - :param protein_id: The value of "accession" column in report - :type target: str - :return: A tuple that contains (best modified sequence, best score) - if the accession is not found, (np.nan, np.nan) is returned. - :rtype: tuple - """ - # Q: in what cases can the accession not exist in the table? - # or an accession not have peptides? - val = self.lookup_dict.get(protein_id, (np.nan, np.nan)) - return val - - -# Pre-compiling the regex makes the next function 2x faster -# in my benchmarking - JSPP -MODIFICATION_PATTERN = re.compile(r"\((.*?)\)") - - -def find_modification(peptide): - """ - Identify the modification site based on the peptide containing modifications. - - :param peptide: Sequences of peptides - :type peptide: str - :return: Modification sites - :rtype: str - - Examples: - >>> find_modification("PEPM(UNIMOD:35)IDE") - '4-UNIMOD:35' - >>> find_modification("SM(UNIMOD:35)EWEIRDS(UNIMOD:21)EPTIDEK") - '2-UNIMOD:35,9-UNIMOD:21' - """ - peptide = str(peptide) - original_mods = MODIFICATION_PATTERN.findall(peptide) - peptide = MODIFICATION_PATTERN.sub(".", peptide) - position = [i for i, x in enumerate(peptide) if x == "."] - for j in range(1, len(position)): - position[j] -= j - - for k in range(0, len(original_mods)): - original_mods[k] = str(position[k]) + "-" + original_mods[k].upper() - - original_mods = ( - ",".join(str(i) for i in original_mods) if len(original_mods) > 0 else "null" - ) - - return original_mods - - -def name_mapper_builder(subname_mapper): - """Returns a function that renames the columns of the grouped table to match the ones - in the final table. - - Examples: - >>> mapping_dict = { - ... "precursor.Index::::": "pr_id", - ... "Precursor.Normalised::mean": "peptide_abundance_study_variable" - ... } - >>> name_mapper = name_mapper_builder(mapping_dict) - >>> name_mapper("precursor.Index::::") - "pr_id" - >>> name_mapper("Precursor.Normalised::mean::1") - "peptide_abundance_study_variable[1]" - """ - num_regex = re.compile(r"(.*)::(\d+)$") - - def name_mapper(x): - """Renames the columns of the grouped table to match the ones - in the final table. 
-
-        Examples:
-            >>> name_mapper("precursor.Index::::")
-            "pr_id"
-            >>> name_mapper("Precursor.Normalised::mean::1")
-            "peptide_abundance_study_variable[1]"
-        """
-        orig_x = x
-        for k, v in subname_mapper.items():
-            if k in x:
-                x = x.replace(k, v)
-        out = num_regex.sub(r"\1[\2]", x)
-        if out == orig_x:
-            # This should never happen but I am adding it here
-            # to prevent myself from shooting myself in the foot in the future.
-            raise ValueError(f"Column name {x} not found in subname_mapper")
-        return out
-
-    return name_mapper
-
-
-def per_peptide_study_report(report: pd.DataFrame) -> pd.DataFrame:
-    """Summarizes the report at peptide/study level and flattens the columns.
-
-    This function was implemented to replace an 'apply -> filter' approach.
-    In my benchmarking it went from 35.23 seconds for 4 samples, 4 conditions to
-    0.007 seconds.
-
-    This implementation differs in several aspects in the output values:
-    1. in the fact that it actually gets values for the m/z
-    2. always returns a float, whilst the apply version returns an 'object' dtype.
-    3. In the original implementation, missing values had the string 'null'; here they have the value np.nan.
-    4. The order of the final output is different; the original orders columns by
-       study variables > calculated value, this one is calculated value > study variables.
-
-    Calculates the mean, standard deviation and std error of the precursor
-    abundances, as well as the mean retention time and m/z.
-
-    The names in the end are called "peptide" but technically they are at the
-    precursor level (peptide+charge combinations).
-
-    The columns will look like this in the end:
-    [
-        'pr_id',
-        'peptide_abundance_study_variable[1]',
-        ...
-        'peptide_abundance_stdev_study_variable[1]',
-        ...
-        'peptide_abundance_std_error_study_variable[1]',
-        ...
-        'opt_global_retention_time_study_variable[1]',
-        ...
-        'opt_global_mass_to_charge_study_variable[1]',
-        ...
-    ]
-    """
-    pep_study_grouped = (
-        report.groupby(["study_variable", "precursor.Index"])
-        .agg(
-            {
-                "Precursor.Normalised": ["mean", "std", "sem"],
-                "RT.Start": ["mean"],
-                "Calculate.Precursor.Mz": ["mean"],
-            }
-        )
-        .reset_index()
-        .pivot(columns=["study_variable"], index="precursor.Index")
-        .reset_index()
-    )
-    pep_study_grouped.columns = pd.Index(
-        [
-            "::".join([str(s) for s in col]).strip()
-            for col in pep_study_grouped.columns.values
-        ]
-    )
-    # Columns here would be like:
-    # [
-    #     "precursor.Index::::",
-    #     "Precursor.Normalised::mean::1",
-    #     "Precursor.Normalised::mean::2",
-    #     "Precursor.Normalised::std::1",
-    #     "Precursor.Normalised::std::2",
-    #     "Precursor.Normalised::sem::1",
-    #     "Precursor.Normalised::sem::2",
-    #     "RT.Start::mean::1",
-    #     "RT.Start::mean::2",
-    # ]
-    # So the right names need to be given and the table can be joined with the other one
-    subname_mapper = {
-        "precursor.Index::::": "pr_id",
-        "Precursor.Normalised::mean": "peptide_abundance_study_variable",
-        "Precursor.Normalised::std": "peptide_abundance_stdev_study_variable",
-        "Precursor.Normalised::sem": "peptide_abundance_std_error_study_variable",
-        "Calculate.Precursor.Mz::mean": "opt_global_mass_to_charge_study_variable",
-        "RT.Start::mean": "opt_global_retention_time_study_variable",
-    }
-    name_mapper = name_mapper_builder(subname_mapper)
-
-    pep_study_grouped.rename(
-        columns=name_mapper,
-        inplace=True,
-    )
-
-    return pep_study_grouped
-
-
-def calculate_coverage(ref_sequence: str, sequences: Set[str]):
-    """
-    Calculates the coverage of the reference sequence by the given sequences.
-
-    Examples:
-        >>> calculate_coverage("WATEROVERTHEDUCKSBACK", {"WATER", "DUCK"})
-        0.42857142857142855
-        >>> calculate_coverage("DUCKDUCKDUCK", {"DUCK"})
-        1.0
-        >>> calculate_coverage("WATEROVERTHEDUCK", {"DUCK"})
-        0.25
-        >>> calculate_coverage("WATER", {"WAT", "TER"})
-        1.0
-        >>> calculate_coverage("WATERGLASS", {"WAT", "TER"})
-        0.5
-    """
-    starts = []
-    lengths = []
-    for sequence in sequences:
-        local_start = 0
-        while True:
-            local_start = ref_sequence.find(sequence, local_start)
-            if local_start == -1:
-                break
-            starts.append(local_start)
-            lengths.append(len(sequence))
-            local_start += 1
-
-    # merge overlapping intervals
-    merged_starts: list = []
-    merged_lengths: list = []
-    for start, length in sorted(zip(starts, lengths)):
-        if merged_starts and merged_starts[-1] + merged_lengths[-1] >= start:
-            merged_lengths[-1] = (
-                max(merged_starts[-1] + merged_lengths[-1], start + length)
-                - merged_starts[-1]
-            )
-        else:
-            merged_starts.append(start)
-            merged_lengths.append(length)
-
-    # calculate coverage
-    coverage = sum(merged_lengths) / len(ref_sequence)
-    return coverage
-
-
-def calculate_protein_coverages(
-    report: pd.DataFrame, out_mztab_PRH: pd.DataFrame, fasta_df: pd.DataFrame
-) -> List[str]:
-    """Calculates protein coverages for the PRH table.
-
-    The protein coverage is calculated as the fraction of the protein sequence
-    in the fasta df, covered by the peptides in the report table, for every
-    protein in the PRH table (defined by accession, not protein.ids).
-    """
-    nested_df = (
-        report[["Protein.Group", "Stripped.Sequence"]]
-        .groupby("Protein.Group")
-        .agg({"Stripped.Sequence": set})
-        .reset_index()
-    )
-    # Protein.Group Stripped.Sequence
-    # 0 A0A024RBG1;Q9NZJ9;Q9NZJ9-2 {SEQEDEVLLVSSSR}
-    # 1 A0A096LP49;A0A096LP49-2 {SPWAMTERKHSSLER}
-    # 2 A0AVT1;A0AVT1-2 {EDFTLLDFINAVK, KPDHVPISSEDER, QDVIITALDNVEAR,...
-    ids_to_seqs = dict(zip(nested_df["Protein.Group"], nested_df["Stripped.Sequence"]))
-    acc_to_ids = dict(zip(out_mztab_PRH["accession"], out_mztab_PRH["Protein.Group"]))
-    fasta_id_to_seqs = dict(zip(fasta_df["id"], fasta_df["seq"]))
-    acc_to_fasta_ids: dict = {}
-
-    # Since fasta ids are something like sp|P51451|BLK_HUMAN but
-    # accessions are something like Q9Y6V7-2, we need to find a
-    # partial string match between the two (the best one)
-    for acc in acc_to_ids:
-        # I am pretty sure this is the slowest part of the code
-        matches = fasta_df[fasta_df["id"].str.contains(acc)]["id"]
-        if len(matches) == 0:
-            logger.warning(
-                f"Could not find fasta id for accession {acc} in the fasta file."
-            )
-            acc_to_fasta_ids[acc] = None
-        elif len(matches) == 1:
-            acc_to_fasta_ids[acc] = matches.iloc[0]
-        else:
-            # If multiple, find the best match, e.g. pick Q9Y6V7 over Q9Y6V7-2.
-            # This can be achieved by finding the shortest string, since
-            # it entails fewer un-matched characters.
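The interval merge inside `calculate_coverage` above is the step that stops repeated or overlapping matches from being counted twice. A self-contained sketch of just that step, with invented interval values:

```python
from typing import List, Tuple


def merge_intervals(intervals: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
    """Merge overlapping (start, length) intervals, as in calculate_coverage."""
    starts: List[int] = []
    lengths: List[int] = []
    for start, length in sorted(intervals):
        if starts and starts[-1] + lengths[-1] >= start:
            # Extend the previous interval instead of double-counting overlap.
            lengths[-1] = max(starts[-1] + lengths[-1], start + length) - starts[-1]
        else:
            starts.append(start)
            lengths.append(length)
    return list(zip(starts, lengths))


# "WAT" (start 0, length 3) and "TER" (start 2, length 3) in "WATERGLASS"
# overlap by one residue; merged they cover positions 0-4, so coverage is
# 5 / 10 = 0.5, matching the doctest above.
assert merge_intervals([(0, 3), (2, 3)]) == [(0, 5)]
```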
- acc_to_fasta_ids[acc] = min(matches, key=len) - - out: List[str] = [""] * len(out_mztab_PRH["accession"]) - - for i, acc in enumerate(out_mztab_PRH["accession"]): - f_id = acc_to_fasta_ids[acc] - if f_id is None: - out_cov = "null" - else: - cov = calculate_coverage( - fasta_id_to_seqs[f_id], ids_to_seqs[acc_to_ids[acc]] - ) - out_cov = format(cov, ".03f") - - out[i] = out_cov - - return out - - -cli.add_command(convert) - -if __name__ == "__main__": - cli() diff --git a/bin/extract_sample.py b/bin/extract_sample.py deleted file mode 100755 index 84270d5c..00000000 --- a/bin/extract_sample.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python - -import argparse -import errno -import os -import sys -from pathlib import Path - -import pandas as pd - - -def parse_args(args=None): - Description = "Extract sample information from an experiment design file" - Epilog = "Example usage: python extract_sample.py " - - parser = argparse.ArgumentParser(description=Description, epilog=Epilog) - parser.add_argument("EXP", help="Expdesign file to be extracted") - return parser.parse_args(args) - - -def extract_sample(expdesign): - data = pd.read_csv(expdesign, sep="\t", header=0, dtype=str) - fTable = data.dropna() - - # two table format - with open(expdesign, "r") as f: - lines = f.readlines() - empty_row = lines.index("\n") - s_table = [i.replace("\n", "").split("\t") for i in lines[empty_row + 1 :]][1:] - s_header = lines[empty_row + 1].replace("\n", "").split("\t") - s_DataFrame = pd.DataFrame(s_table, columns=s_header) - - sample_dt = pd.DataFrame() - if "MSstats_Mixture" not in s_DataFrame.columns: - fTable = fTable[["Spectra_Filepath", "Sample"]] - fTable.to_csv(f"{Path(expdesign).stem}_sample.csv", sep="\t", index=False) - else: - fTable.drop_duplicates(subset=["Spectra_Filepath"], inplace=True) - for _, row in fTable.iterrows(): - mixture_id = s_DataFrame[s_DataFrame["Sample"] == row["Sample"]][ - "MSstats_Mixture" - ] - sample_dt = sample_dt.append( - {"Spectra_Filepath": row["Spectra_Filepath"], "Sample": mixture_id}, - ignore_index=True, - ) - sample_dt.to_csv(f"{Path(expdesign).stem}_sample.csv", sep="\t", index=False) - - -def main(args=None): - args = parse_args(args) - extract_sample(args.EXP) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/ms2rescore_cli.py b/bin/ms2rescore_cli.py deleted file mode 100755 index aa9196c0..00000000 --- a/bin/ms2rescore_cli.py +++ /dev/null @@ -1,239 +0,0 @@ -#!/usr/bin/env python -# Written by Jonas Scheid under the MIT license - - -import importlib.resources -import json -import logging -import sys -from typing import List - -import click -import pandas as pd -import pyopenms as oms -from ms2rescore import package_data, rescore -from psm_utils import PSMList -from psm_utils.io.idxml import IdXMLReader, IdXMLWriter - -logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") - - -def parse_cli_arguments_to_config(**kwargs): - """Update default MS²Rescore config with CLI arguments""" - config = json.load( - importlib.resources.open_text(package_data, "config_default.json") - ) - - for key, value in kwargs.items(): - # Skip these arguments since they need to set in a nested dict of feature_generators - if key in ["ms2pip_model", "ms2_tolerance", "rng", "calibration_set_size"]: - continue - - elif key == "feature_generators": - feature_generators = value.split(",") - # Reset feature generator dict since there might be default generators we don't want - config["ms2rescore"]["feature_generators"] = {} - if 
"basic" in feature_generators: - config["ms2rescore"]["feature_generators"]["basic"] = {} - if "ms2pip" in feature_generators: - config["ms2rescore"]["feature_generators"]["ms2pip"] = { - "model_dir": kwargs["ms2pip_model_dir"], - "model": kwargs["ms2pip_model"], - "ms2_tolerance": kwargs["ms2_tolerance"], - } - if "deeplc" in feature_generators: - config["ms2rescore"]["feature_generators"]["deeplc"] = { - "deeplc_retrain": False, - "calibration_set_size": kwargs["calibration_set_size"], - } - if "maxquant" in feature_generators: - config["ms2rescore"]["feature_generators"]["maxquant"] = {} - if "ionmob" in feature_generators: - config["ms2rescore"]["feature_generators"]["ionmob"] = {} - - elif key == "rescoring_engine": - # Reset rescoring engine dict we want to allow only computing features - config["ms2rescore"]["rescoring_engine"] = {} - if value == "mokapot": - config["ms2rescore"]["rescoring_engine"]["mokapot"] = { - "write_weights": True, - "write_txt": False, - "write_flashlfq": False, - "rng": kwargs["rng"], - "test_fdr": kwargs["test_fdr"], - "max_workers": kwargs["processes"], - } - if value == "percolator": - logging.info( - "Percolator rescoring engine has been specified. Use the idXML containing rescoring features and run Percolator in a separate step." - ) - continue - - else: - config["ms2rescore"][key] = value - - return config - - -def rescore_idxml(input_file, output_file, config) -> None: - """Rescore PSMs in an idXML file and keep other information unchanged.""" - # Read PSMs - reader = IdXMLReader(input_file) - psm_list = reader.read_file() - - # Rescore - rescore(config, psm_list) - - # Filter out PeptideHits within PeptideIdentification(s) that could not be processed by all feature generators - peptide_ids_filtered = filter_out_artifact_psms(psm_list, reader.peptide_ids) - - # Write - writer = IdXMLWriter(output_file, reader.protein_ids, peptide_ids_filtered) - writer.write_file(psm_list) - - -def filter_out_artifact_psms( - psm_list: PSMList, peptide_ids: List[oms.PeptideIdentification] -) -> List[oms.PeptideIdentification]: - """Filter out PeptideHits that could not be processed by all feature generators""" - num_mandatory_features = max([len(psm.rescoring_features) for psm in psm_list]) - new_psm_list = PSMList( - psm_list=[ - psm - for psm in psm_list - if len(psm.rescoring_features) == num_mandatory_features - ] - ) - - # get differing peptidoforms of both psm lists - psm_list_peptides = set( - [next(iter(psm.provenance_data.items()))[1] for psm in psm_list] - ) - new_psm_list_peptides = set( - [next(iter(psm.provenance_data.items()))[1] for psm in new_psm_list] - ) - not_supported_peptides = psm_list_peptides - new_psm_list_peptides - - # no need to filter if all peptides are supported - if len(not_supported_peptides) == 0: - return peptide_ids - # Create new peptide ids and filter out not supported peptides - new_peptide_ids = [] - for peptide_id in peptide_ids: - new_hits = [] - for hit in peptide_id.getHits(): - if hit.getSequence().toString() in not_supported_peptides: - continue - new_hits.append(hit) - if len(new_hits) == 0: - continue - peptide_id.setHits(new_hits) - new_peptide_ids.append(peptide_id) - logging.info( - f"Removed {len(psm_list_peptides) - len(new_psm_list_peptides)} PSMs. 
Peptides not supported: {not_supported_peptides}" - ) - return new_peptide_ids - - -@click.command() -@click.option( - "-p", - "--psm_file", - help="Path to PSM file (PIN, mzIdentML, MaxQuant msms, X!Tandem XML, idXML)", - required=True, -) -@click.option( - "-s", - "--spectrum_path", - help="Path to MGF/mzML spectrum file or directory with spectrum files (default: derived from identification file)", - required=True, -) -@click.option( - "-o", - "--output_path", - help="Path and stem for output file names (default: derive from identification file)", -) -@click.option( - "-l", "--log_level", help="Logging level (default: `info`)", default="info" -) -@click.option( - "-n", - "--processes", - help="Number of parallel processes available to MS²Rescore", - type=int, - default=16, -) -@click.option("-f", "--fasta_file", help="Path to FASTA file") -@click.option( - "-t", - "--test_fdr", - help="The false-discovery rate threshold at which to evaluate the learned models. (default: 0.05)", - default=0.05, -) -@click.option( - "-fg", - "--feature_generators", - help="Comma-separated list of feature generators to use (default: `ms2pip,deeplc`). See ms2rescore doc for further information", - default="", -) -@click.option( - "-pipm", - "--ms2pip_model", - help="MS²PIP model (default: `Immuno-HCD`)", - type=str, - default="Immuno-HCD", -) -@click.option( - "-md", - "--ms2pip_model_dir", - help="The path of MS²PIP model (default: `./`)", - type=str, - default="./", -) -@click.option( - "-ms2tol", - "--ms2_tolerance", - help="Fragment mass tolerance [Da](default: `0.02`)", - type=float, - default=0.02, -) -@click.option( - "-cs", - "--calibration_set_size", - help="Percentage of number of calibration set for DeepLC (default: `0.15`)", - default=0.15, -) -@click.option( - "-re", - "--rescoring_engine", - help="Either mokapot or percolator (default: `mokapot`)", - default="mokapot", -) -@click.option( - "-rng", - "--rng", - help="Seed for mokapot's random number generator (default: `4711`)", - type=int, - default=4711, -) -@click.option( - "-d", - "--id_decoy_pattern", - help="Regex decoy pattern (default: `DECOY_`)", - default="^DECOY_", -) -@click.option( - "-lsb", - "--lower_score_is_better", - help="Interpretation of primary search engine score (default: True)", - default=True, -) -def main(**kwargs): - config = parse_cli_arguments_to_config(**kwargs) - logging.info("MS²Rescore config:") - logging.info(config) - rescore_idxml(kwargs["psm_file"], kwargs["output_path"], config) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/mzml_statistics.py b/bin/mzml_statistics.py deleted file mode 100755 index f738fb98..00000000 --- a/bin/mzml_statistics.py +++ /dev/null @@ -1,233 +0,0 @@ -#!/usr/bin/env python -""" -This script parses the mass spectrum file and generates a set of statistics about the file. -License: Apache 2.0 -Authors: Hong Wong, Yasset Perez-Riverol -""" -import re -import sqlite3 -import sys -from pathlib import Path - -import pandas as pd -import pyarrow -from pyopenms import MSExperiment, MzMLFile - - -def ms_dataframe(ms_path: str, id_only: bool = False) -> None: - """ - Parse the mass spectrum file and generate a set of statistics about the file. 
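For reference, the nested dictionary that `parse_cli_arguments_to_config` assembles for `--feature_generators ms2pip,deeplc` looks roughly as follows. This is a sketch using the CLI defaults shown above; the real config starts from MS²Rescore's packaged `config_default.json` rather than an empty dict:

```python
import json

feature_generators = "ms2pip,deeplc".split(",")

config = {"ms2rescore": {"feature_generators": {}}}
if "ms2pip" in feature_generators:
    config["ms2rescore"]["feature_generators"]["ms2pip"] = {
        "model_dir": "./",  # --ms2pip_model_dir default
        "model": "Immuno-HCD",  # --ms2pip_model default
        "ms2_tolerance": 0.02,  # --ms2_tolerance default
    }
if "deeplc" in feature_generators:
    config["ms2rescore"]["feature_generators"]["deeplc"] = {
        "deeplc_retrain": False,
        "calibration_set_size": 0.15,  # --calibration_set_size default
    }

print(json.dumps(config, indent=2))
```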
- @param ms_path: The file name of the mass spectrum file - @param id_only: If True, it will save a csv with the spectrum id, mz and intensity - - """ - file_columns = [ - "SpectrumID", - "MSLevel", - "Charge", - "MS_peaks", - "Base_Peak_Intensity", - "Summed_Peak_Intensities", - "Retention_Time", - "Exp_Mass_To_Charge", - "AcquisitionDateTime", - ] - - def parse_mzml(file_name: str, file_columns: list, id_only: bool = False): - """ - Parse mzML file and return a pandas DataFrame with the information. If id_only is True, it will also save a csv. - @param file_name: The file name of the mzML file - @param file_columns: The columns of the DataFrame - @param id_only: If True, it will save a csv with the spectrum id, mz and intensity - @return: A pandas DataFrame with the information of the mzML file - """ - - info = [] - psm_part_info = [] - exp = MSExperiment() - acquisition_datetime = exp.getDateTime().get() - MzMLFile().load(file_name, exp) - for spectrum in exp: - id_ = spectrum.getNativeID() - MSLevel = spectrum.getMSLevel() - rt = spectrum.getRT() if spectrum.getRT() else None - - peaks_tuple = spectrum.get_peaks() - peak_per_ms = len(peaks_tuple[0]) - - if not spectrum.metaValueExists("base peak intensity"): - bpc = max(peaks_tuple[1]) if len(peaks_tuple[1]) > 0 else None - else: - bpc = spectrum.getMetaValue("base peak intensity") - - if not spectrum.metaValueExists("total ion current"): - tic = sum(peaks_tuple[1]) if len(peaks_tuple[1]) > 0 else None - else: - tic = spectrum.getMetaValue("total ion current") - - if MSLevel == 1: - info_list = [ - id_, - MSLevel, - None, - peak_per_ms, - bpc, - tic, - rt, - None, - acquisition_datetime, - ] - elif MSLevel == 2: - charge_state = spectrum.getPrecursors()[0].getCharge() - emz = ( - spectrum.getPrecursors()[0].getMZ() - if spectrum.getPrecursors()[0].getMZ() - else None - ) - info_list = [ - id_, - MSLevel, - charge_state, - peak_per_ms, - bpc, - tic, - rt, - emz, - acquisition_datetime, - ] - mz_array = peaks_tuple[0] - intensity_array = peaks_tuple[1] - else: - info_list = [ - id_, - MSLevel, - None, - None, - None, - None, - rt, - None, - acquisition_datetime, - ] - - if id_only and MSLevel == 2: - psm_part_info.append( - [ - re.findall(r"[scan|spectrum]=(\d+)", id_)[0], - MSLevel, - mz_array, - intensity_array, - ] - ) - info.append(info_list) - - if id_only and len(psm_part_info) > 0: - pd.DataFrame( - psm_part_info, columns=["scan", "ms_level", "mz", "intensity"] - ).to_parquet(f"{Path(ms_path).stem}_spectrum_df.parquet", index=False) - - return pd.DataFrame(info, columns=file_columns) - - def parse_bruker_d(file_name: str, file_columns: list): - sql_filepath = f"{file_name}/analysis.tdf" - if not Path(sql_filepath).exists(): - msg = f"File '{sql_filepath}' not found" - raise FileNotFoundError(msg) - conn = sqlite3.connect(sql_filepath) - c = conn.cursor() - - datetime_cmd = ( - "SELECT Value FROM GlobalMetadata WHERE key='AcquisitionDateTime'" - ) - AcquisitionDateTime = c.execute(datetime_cmd).fetchall()[0][0] - - df = pd.read_sql_query( - "SELECT Id, MsMsType, NumPeaks, MaxIntensity, SummedIntensities, Time FROM frames", - conn, - ) - df["AcquisitionDateTime"] = AcquisitionDateTime - - # {8:'DDA-PASEF', 9:'DIA-PASEF'} - if 8 in df["MsMsType"].values: - mslevel_map = {0: 1, 8: 2} - elif 9 in df["MsMsType"].values: - mslevel_map = {0: 1, 9: 2} - else: - msg = f"Unrecognized ms type '{df['MsMsType'].values}'" - raise ValueError(msg) - df["MsMsType"] = df["MsMsType"].map(mslevel_map) - - try: - # This line raises an sqlite error if 
the table does not exist
-            _ = conn.execute("SELECT * from Precursors LIMIT 1").fetchall()
-            precursor_df = pd.read_sql_query("SELECT * from Precursors", conn)
-        except sqlite3.OperationalError as e:
-            if "no such table: Precursors" in str(e):
-                print(
-                    f"No precursors recorded in {file_name}; this is normal for DIA data."
-                )
-                precursor_df = pd.DataFrame()
-            else:
-                raise
-
-        if len(df) == len(precursor_df):
-            df = pd.concat([df, precursor_df[["Charge", "MonoisotopicMz"]]], axis=1)
-            df["Charge"] = df["Charge"].fillna(0)
-        else:
-            df[["Charge", "Exp_Mass_To_Charge"]] = None, None
-
-        df = df[
-            [
-                "Id",
-                "MsMsType",
-                "Charge",
-                "NumPeaks",
-                "MaxIntensity",
-                "SummedIntensities",
-                "Time",
-                "Exp_Mass_To_Charge",
-                "AcquisitionDateTime",
-            ]
-        ]
-        df.columns = pd.Index(file_columns)
-
-        return df
-
-    if not (Path(ms_path).exists()):
-        print(f"Not found '{ms_path}', trying to find alias")
-        ms_path_path = Path(ms_path)
-        path_stem = str(ms_path_path.stem)
-        candidates = (
-            list(ms_path_path.parent.glob("*.d"))
-            + list(ms_path_path.parent.glob("*.mzml"))
-            + list(ms_path_path.parent.glob("*.mzML"))
-        )
-
-        candidates = [c for c in candidates if path_stem in str(c)]
-
-        if len(candidates) == 1:
-            ms_path = str(candidates[0].resolve())
-        else:
-            raise FileNotFoundError()
-
-    if Path(ms_path).suffix == ".d" and Path(ms_path).is_dir():
-        ms_df = parse_bruker_d(ms_path, file_columns)
-    elif Path(ms_path).suffix in [".mzML", ".mzml"]:
-        ms_df = parse_mzml(ms_path, file_columns, id_only)
-    else:
-        msg = f"Unrecognized file type, or the mass spec file '{ms_path}' does not exist"
-        raise RuntimeError(msg)
-
-    ms_df.to_parquet(
-        f"{Path(ms_path).stem}_ms_info.parquet", engine="pyarrow", index=False
-    )
-
-
-def main():
-    ms_path = sys.argv[1]
-    id_only = sys.argv[2]
-    ms_dataframe(ms_path, id_only)
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/bin/prepare_diann_parameters.py b/bin/prepare_diann_parameters.py
deleted file mode 100755
index 1c7eddf4..00000000
--- a/bin/prepare_diann_parameters.py
+++ /dev/null
@@ -1,139 +0,0 @@
-#!/usr/bin/env python
-
-"""
-This script converts SDRF parameters to DIA-NN parameters
-License: Apache 2.0
-Authors: Dai Chengxin, Yasset Perez-Riverol
-"""
-
-import re
-from typing import List, Tuple
-
-import click
-from sdrf_pipelines.openms.unimod import UnimodDatabase
-
-CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
-
-
-@click.group(context_settings=CONTEXT_SETTINGS)
-def cli():
-    pass
-
-
-@click.command("generate")
-@click.option("--enzyme", "-e", help="")
-@click.option("--fix_mod", "-f", help="")
-@click.option("--var_mod", "-v", help="")
-@click.pass_context
-def generate_cfg(ctx, enzyme, fix_mod, var_mod):
-    cut = enzyme_cut(enzyme)
-    unimod_database = UnimodDatabase()
-    fix_ptm, var_ptm = convert_mod(unimod_database, fix_mod, var_mod)
-
-    var_ptm_str = " --var-mod "
-    fix_ptm_str = " --fixed-mod "
-    diann_fix_ptm = ""
-    diann_var_ptm = ""
-    for mod in fix_ptm:
-        diann_fix_ptm += fix_ptm_str + mod
-    for mod in var_ptm:
-        diann_var_ptm += var_ptm_str + mod
-
-    with open("diann_config.cfg", "w") as file:
-        file.write("--cut " + cut + diann_fix_ptm + diann_var_ptm)
-
-
-def convert_mod(unimod_database, fix_mod: str, var_mod: str) -> Tuple[List, List]:
-    pattern = re.compile(r"\((.*?)\)")
-    var_ptm = []
-    fix_ptm = []
-
-    if fix_mod != "":
-        for mod in fix_mod.split(","):
-            tag = 0
-            for modification in unimod_database.modifications:
-                if modification.get_name() == mod.split(" ")[0]:
-                    diann_mod = (
-                        modification.get_name()
-                        + ","
-                        + 
str(modification._delta_mono_mass) - ) - tag = 1 - break - if tag == 0: - print( - "Warning: Currently only supported unimod modifications for DIA pipeline. Skipped: " - + mod - ) - continue - site = re.findall(pattern, " ".join(mod.split(" ")[1:]))[0] - if site == "Protein N-term": - site = "*n" - elif site == "N-term": - site = "n" - - if ( - "TMT" in diann_mod - or "Label" in diann_mod - or "iTRAQ" in diann_mod - or "mTRAQ" in diann_mod - ): - fix_ptm.append(diann_mod + "," + site + "," + "label") - else: - fix_ptm.append(diann_mod + "," + site) - - if var_mod != "": - for mod in var_mod.split(","): - tag = 0 - for modification in unimod_database.modifications: - if modification.get_name() == mod.split(" ")[0]: - diann_mod = ( - modification.get_name() - + "," - + str(modification._delta_mono_mass) - ) - tag = 1 - break - if tag == 0: - print( - "Warning: Currently only supported unimod modifications for DIA pipeline. Skipped: " - + mod - ) - continue - site = re.findall(pattern, " ".join(mod.split(" ")[1:]))[0] - if site == "Protein N-term": - site = "*n" - elif site == "N-term": - site = "n" - - if ( - "TMT" in diann_mod - or "Label" in diann_mod - or "iTRAQ" in diann_mod - or "mTRAQ" in diann_mod - ): - var_ptm.append(diann_mod + "," + site + "," + "label") - else: - var_ptm.append(diann_mod + "," + site) - - return fix_ptm, var_ptm - - -_ENZYME_SPECIFICITY = { - "Trypsin": "K*,R*,!*P", - "Trypsin/P": "K*,R*", - "Arg-C": "R*,!*P", - "Asp-N": "*B,*D", - "Chymotrypsin": "F*,W*,Y*,L*,!*P", - "Lys-C": "K*,!*P", -} - - -def enzyme_cut(enzyme: str) -> str: - return _ENZYME_SPECIFICITY.get(enzyme) or "--cut" - - -cli.add_command(generate_cfg) - -if __name__ == "__main__": - cli() diff --git a/bin/psm_conversion.py b/bin/psm_conversion.py deleted file mode 100755 index 79ac1517..00000000 --- a/bin/psm_conversion.py +++ /dev/null @@ -1,174 +0,0 @@ -#!/usr/bin/env python - -import os -import re -import sys -from pathlib import Path - -import numpy as np -import pandas as pd -import pyopenms as oms - -_parquet_field = [ - "sequence", - "protein_accessions", - "protein_start_positions", - "protein_end_positions", - "modifications", - "retention_time", - "charge", - "exp_mass_to_charge", - "reference_file_name", - "scan_number", - "peptidoform", - "posterior_error_probability", - "global_qvalue", - "is_decoy", - "consensus_support", - "mz_array", - "intensity_array", - "num_peaks", - "search_engines", - "id_scores", - "hit_rank", -] - - -def mods_position(peptide): - if peptide.startswith("."): - peptide = peptide[1:] - pattern = re.compile(r"\((.*?)\)") - original_mods = pattern.findall(peptide) - peptide = re.sub(r"\(.*?\)", ".", peptide) - position = [i.start() for i in re.finditer(r"\.", peptide)] - for j in range(1, len(position)): - position[j] -= j - - for k in range(0, len(original_mods)): - original_mods[k] = str(position[k]) + "-" + original_mods[k] - - original_mods = ( - [str(i) for i in original_mods] if len(original_mods) > 0 else np.nan - ) - - return original_mods - - -def convert_psm(idxml, spectra_file, export_decoy_psm): - prot_ids = [] - pep_ids = [] - parquet_data = [] - consensus_support = np.nan - mz_array = [] - intensity_array = [] - num_peaks = np.nan - id_scores = [] - search_engines = [] - - oms.IdXMLFile().load(idxml, prot_ids, pep_ids) - if "ConsensusID" in prot_ids[0].getSearchEngine(): - if prot_ids[0].getSearchParameters().metaValueExists("SE:MS-GF+"): - search_engines = ["MS-GF+"] - if prot_ids[0].getSearchParameters().metaValueExists("SE:Comet"): - 
search_engines.append("Comet") - if prot_ids[0].getSearchParameters().metaValueExists("SE:Sage"): - search_engines.append("Sage") - else: - search_engines = [prot_ids[0].getSearchEngine()] - - reference_file_name = os.path.splitext( - prot_ids[0].getMetaValue("spectra_data")[0].decode("UTF-8") - )[0] - spectra_df = pd.read_parquet(spectra_file) if spectra_file else None - - for peptide_id in pep_ids: - retention_time = peptide_id.getRT() - exp_mass_to_charge = peptide_id.getMZ() - scan_number = int( - re.findall( - r"(spectrum|scan)=(\d+)", peptide_id.getMetaValue("spectrum_reference") - )[0][1] - ) - - if isinstance(spectra_df, pd.DataFrame): - spectra = spectra_df[spectra_df["scan"] == scan_number] - mz_array = spectra["mz"].values - intensity_array = spectra["intensity"].values - num_peaks = len(mz_array) - - for hit in peptide_id.getHits(): - # if remove decoy when mapped to target+decoy? - is_decoy = 0 if hit.getMetaValue("target_decoy") == "target" else 1 - if export_decoy_psm == "false" and is_decoy: - continue - global_qvalue = np.nan - if len(search_engines) > 1: - if "q-value" in peptide_id.getScoreType(): - global_qvalue = hit.getScore() - consensus_support = hit.getMetaValue("consensus_support") - elif search_engines == "Comet": - id_scores = ["Comet:Expectation value: " + str(hit.getScore())] - elif search_engines == "MS-GF+": - id_scores = ["MS-GF:SpecEValue: " + str(hit.getScore())] - elif search_engines == "Sage": - id_scores = ["Sage:hyperscore: " + str(hit.getScore())] - - if hit.metaValueExists("MS:1001491"): - global_qvalue = hit.getMetaValue("MS:1001491") - - charge = hit.getCharge() - peptidoform = hit.getSequence().toString() - modifications = mods_position(peptidoform) - sequence = hit.getSequence().toUnmodifiedString() - protein_accessions = [ - ev.getProteinAccession() for ev in hit.getPeptideEvidences() - ] - posterior_error_probability = hit.getMetaValue( - "Posterior Error Probability_score" - ) - protein_start_positions = [ - ev.getStart() for ev in hit.getPeptideEvidences() - ] - protein_end_positions = [ev.getEnd() for ev in hit.getPeptideEvidences()] - hit_rank = hit.getRank() - - parquet_data.append( - [ - sequence, - protein_accessions, - protein_start_positions, - protein_end_positions, - modifications, - retention_time, - charge, - exp_mass_to_charge, - reference_file_name, - scan_number, - peptidoform, - posterior_error_probability, - global_qvalue, - is_decoy, - consensus_support, - mz_array, - intensity_array, - num_peaks, - search_engines, - id_scores, - hit_rank, - ] - ) - - pd.DataFrame(parquet_data, columns=_parquet_field).to_csv( - f"{Path(idxml).stem}_psm.csv", mode="w", index=False, header=True - ) - - -def main(): - idxml_path = sys.argv[1] - spectra_file = sys.argv[2] - export_decoy_psm = sys.argv[3] - convert_psm(idxml_path, spectra_file, export_decoy_psm) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/modules/local/diannconvert/main.nf b/modules/local/diannconvert/main.nf index 8ef5b3d1..e2adcaee 100644 --- a/modules/local/diannconvert/main.nf +++ b/modules/local/diannconvert/main.nf @@ -2,11 +2,10 @@ process DIANNCONVERT { tag "$meta.experiment_id" label 'process_medium' - conda "conda-forge::pandas_schema conda-forge::lzstring bioconda::pmultiqc=0.0.25" + conda "bioconda::quantms-utils=0.0.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/pmultiqc:0.0.25--pyhdfd78af_0' : - 'biocontainers/pmultiqc:0.0.25--pyhdfd78af_0' }" - + 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.7--pyhdfd78af_0' : + 'biocontainers/quantms-utils:0.0.7--pyhdfd78af_0' }" input: path(report) @@ -34,7 +33,7 @@ process DIANNCONVERT { meta.precursormasstoleranceunit,meta.enzyme,meta.fixedmodifications,meta.variablemodifications].join(';') """ - diann_convert.py convert \\ + quantmsutilsc diann2mztab \\ --folder ./ \\ --exp_design ${exp_design} \\ --diann_version ./version/versions.yml \\ diff --git a/modules/local/extract_psm/main.nf b/modules/local/extract_psm/main.nf index 32f4ceb8..83768eca 100644 --- a/modules/local/extract_psm/main.nf +++ b/modules/local/extract_psm/main.nf @@ -2,10 +2,10 @@ process PSMCONVERSION { tag "$meta.mzml_id" label 'process_medium' - conda "bioconda::pmultiqc=0.0.25" + conda "bioconda::quantms-utils=0.0.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pmultiqc:0.0.25--pyhdfd78af_0' : - 'biocontainers/pmultiqc:0.0.25--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.7--pyhdfd78af_0' : + 'biocontainers/quantms-utils:0.0.7--pyhdfd78af_0' }" input: @@ -19,12 +19,13 @@ process PSMCONVERSION { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.mzml_id}" + def string_export_decoy_psm = params.export_decoy_psm == true ? "--export_decoy_psm" : "" """ - psm_conversion.py "${idxml_file}" \\ - ${spectrum_df} \\ - $params.export_decoy_psm \\ + quantmsutilsc psmconvert --idxml "${idxml_file}" \\ + --spectra_file ${spectrum_df} \\ + ${string_export_decoy_psm} \\ 2>&1 | tee extract_idxml.log cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/generate_diann_cfg/main.nf b/modules/local/generate_diann_cfg/main.nf index 9cddd307..bbf44d65 100644 --- a/modules/local/generate_diann_cfg/main.nf +++ b/modules/local/generate_diann_cfg/main.nf @@ -2,11 +2,10 @@ process GENERATE_DIANN_CFG { tag "$meta.experiment_id" label 'process_low' - conda 'conda-forge::pandas_schema bioconda::sdrf-pipelines=0.0.29' + conda "bioconda::quantms-utils=0.0.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.29--pyhdfd78af_0' : - 'biocontainers/sdrf-pipelines:0.0.29--pyhdfd78af_0' }" - + 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.7--pyhdfd78af_0' : + 'biocontainers/quantms-utils:0.0.7--pyhdfd78af_0' }" input: val(meta) @@ -20,7 +19,7 @@ process GENERATE_DIANN_CFG { def args = task.ext.args ?: '' """ - prepare_diann_parameters.py generate \\ + quantmsutilsc dianncfg \\ --enzyme "${meta.enzyme}" \\ --fix_mod "${meta.fixedmodifications}" \\ --var_mod "${meta.variablemodifications}" \\ diff --git a/modules/local/ms2rescore/main.nf b/modules/local/ms2rescore/main.nf index 473a2e63..4e7c1508 100644 --- a/modules/local/ms2rescore/main.nf +++ b/modules/local/ms2rescore/main.nf @@ -2,10 +2,10 @@ process MS2RESCORE { tag "$meta.mzml_id" label 'process_high' - conda "bioconda::ms2rescore=3.0.3 bioconda::psm-utils=0.8.2 conda-forge::pydantic=1.10.14 pygam=0.9.1 bioconda::deeplc=2.2.27 bioconda::ms2pip=4.0.0.dev8 bioconda::deeplcretrainer=0.2.11 conda-forge::scikit-learn=1.4.2 conda-forge::scipy=1.13.0" + conda "bioconda::quantms-utils=0.0.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ms2rescore:3.0.3--pyhdfd78af_0': - 'biocontainers/ms2rescore:3.0.3--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.7--pyhdfd78af_0' : + 'biocontainers/quantms-utils:0.0.7--pyhdfd78af_0' }" // userEmulation settings when docker is specified containerOptions = (workflow.containerEngine == 'docker') ? '-u $(id -u) -e "HOME=${HOME}" -v /etc/passwd:/etc/passwd:ro -v /etc/shadow:/etc/shadow:ro -v /etc/group:/etc/group:ro -v $HOME:$HOME' : '' @@ -43,7 +43,7 @@ process MS2RESCORE { } """ - ms2rescore_cli.py \\ + quantmsutilsc ms2rescore \\ --psm_file $idxml \\ --spectrum_path . \\ --ms2_tolerance $ms2_tolerence \\ diff --git a/modules/local/mzmlstatistics/main.nf b/modules/local/mzmlstatistics/main.nf index 31bb55dd..8cd15d16 100644 --- a/modules/local/mzmlstatistics/main.nf +++ b/modules/local/mzmlstatistics/main.nf @@ -3,10 +3,10 @@ process MZMLSTATISTICS { label 'process_medium' label 'process_single' - conda "bioconda::quantms-utils=0.0.2" + conda "bioconda::quantms-utils=0.0.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.2--pyhdfd78af_0' : - 'biocontainers/quantms-utils:0.0.2--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.7--pyhdfd78af_0' : + 'biocontainers/quantms-utils:0.0.7--pyhdfd78af_0' }" input: tuple val(meta), path(ms_file) @@ -20,10 +20,11 @@ process MZMLSTATISTICS { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.mzml_id}" + def string_id_only = params.id_only == true ? 
"--id_only" : "" """ - mzml_statistics.py "${ms_file}" \\ - $params.id_only \\ + quantmsutilsc mzmlstats --ms_path "${ms_file}" \\ + ${string_id_only} \\ 2>&1 | tee mzml_statistics.log cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/preprocess_expdesign.nf b/modules/local/preprocess_expdesign.nf index 61842c08..5fb26703 100644 --- a/modules/local/preprocess_expdesign.nf +++ b/modules/local/preprocess_expdesign.nf @@ -6,10 +6,10 @@ process PREPROCESS_EXPDESIGN { tag "$design.Name" label 'process_low' - conda "bioconda::sdrf-pipelines=0.0.29" + conda "bioconda::quantms-utils=0.0.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.29--pyhdfd78af_0' : - 'biocontainers/sdrf-pipelines:0.0.29--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.7--pyhdfd78af_0' : + 'biocontainers/quantms-utils:0.0.7--pyhdfd78af_0' }" input: path design diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index 11614b61..c53c4049 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -3,10 +3,10 @@ process SAMPLESHEET_CHECK { tag "$input_file" label 'process_single' - conda "bioconda::sdrf-pipelines=0.0.29" + conda "bioconda::quantms-utils=0.0.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.29--pyhdfd78af_0' : - 'biocontainers/sdrf-pipelines:0.0.29--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.7--pyhdfd78af_0' : + 'biocontainers/quantms-utils:0.0.7--pyhdfd78af_0' }" input: path input_file @@ -32,7 +32,7 @@ process SAMPLESHEET_CHECK { def string_is_sdrf = is_sdrf == true ? "--is_sdrf" : "" """ - check_samplesheet.py validate --exp_design "${input_file}" ${string_is_sdrf} \\ + quantmsutilsc checksamplesheet --exp_design "${input_file}" ${string_is_sdrf} \\ ${string_skip_sdrf_validation} \\ ${string_skip_ms_validation} \\ ${string_skip_factor_validation} \\ diff --git a/modules/local/sdrfparsing/main.nf b/modules/local/sdrfparsing/main.nf index c90a6f9a..fa2acc54 100644 --- a/modules/local/sdrfparsing/main.nf +++ b/modules/local/sdrfparsing/main.nf @@ -2,10 +2,10 @@ process SDRFPARSING { tag "$sdrf.Name" label 'process_low' - conda "conda-forge::pandas_schema bioconda::sdrf-pipelines=0.0.29" + conda "bioconda::quantms-utils=0.0.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.29--pyhdfd78af_0' : - 'biocontainers/sdrf-pipelines:0.0.29--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/quantms-utils:0.0.7--pyhdfd78af_0' : + 'biocontainers/quantms-utils:0.0.7--pyhdfd78af_0' }" input: path sdrf