update: ibaq_usecase

bigbio · Jan 15, 2025 · 331f43f · 331f43f
1 parent 357ac11
commit 331f43f
Show file tree

Hide file tree

Showing 4 changed files with 141 additions and 3 deletions.
diff --git a/docs/ibaq_usecase.adoc b/docs/ibaq_usecase.adoc
@@ -0,0 +1,88 @@
+== IBAQ Usage
+
+Users can now convert existing public datasets in PRIDE Archive to peptide feature files using the quantms.io toolkit, remap the proteins to a new version of the UniProt database, and recompute IBAQ values using ibaqpy after annotating the SDRF file. ibaqpy benefits from SDRF's compatibility with public proteomics repositories like PRIDE, facilitating seamless integration into submission workflows and downstream analyses. The use of SDRF automates the parsing of metadata, reducing manual effort and user error while enabling better cross-study comparisons.
+
+=== convert to feature
+==== quantms
+
+nf-core/quantms is a bioinformatics best-practice analysis pipeline for Quantitative Mass Spectrometry (MS). Currently, the workflow supports three major MS-based analytical methods: (i) Data dependent acquisition (DDA) label-free quantification; (ii) DDA isobaric quantitation (e.g. TMT, iTRAQ); (iii) Data independent acquisition (DIA) label-free quantification.
+
+===== Data dependent acquisition
+In experiments analyzed using quantms with data-dependent acquisition, you need to download the following files:
+
+* mzTab file: mzTab is intended as a lightweight supplement to the existing standard mzML to store and represent peptide and protein identifications together with experimental metadata and basic quantitative information.
+* MSstats file: MSstats is generated for all three pipelines DDA-LFQ, DDA-ISO and DIA-LFQ. A simple tsv file ready to be read by the OpenMStoMSstats function of the MSstats R package. It should hold the same quantities as the consensusXML but rearranged in a “long” table format with additional information about the experimental design used by MSstats.
+* SDRF file: SDRF is a file format based on MAGE-TAB, designed to record sample metadata and the relationships between samples and data in proteomics experiments. The main purpose of SDRF is to enhance the interpretability and re-analyzability of proteomics data, as well as to facilitate integration with other omics data.
+
+
+====== convert report to feature
+
+[source, shell]
+----
+quantmsioc convert-feature 
+--sdrf_file PXD004452-Hella-trypsin.sdrf.tsv 
+--msstats_file PXD004452-Hella-trypsin.sdrf_openms_design_msstats_in.csv
+--mztab_file PXD004452-Hella-trypsin.sdrf_openms_design_openms.mzTab 
+--file_num 30 
+--output_folder res 
+--duckdb_max_memory 64GB 
+--output_prefix_file PXD004452
+----
+
+===== Data independent acquisition
+
+The DIA experiment was processed by the DIANN software, so you need to obtain the `diann_report` file and the `SDRF` file. You also need to download the mass spectrometry information for all runs into a folder, which is passed to the command via `--mzml_info_folder`.
+
+[source, shell]
+----
+quantmsioc convert-diann 
+--report_path diann_report.tsv 
+--qvalue_threshold 0.05 
+--mzml_info_folder mzml 
+--sdrf_path PXD048325-Hela.sdrf.tsv 
+--output_folder res1 
+--output_prefix_file PXD048325 
+--duckdb_max_memory 64GB 
+--file_num 30
+----
+
+==== MaxQuant
+
+In MaxQuant, you need to download the `evidence` file and the `SDRF` file.
+
+[source, shell]
+----
+quantmsioc convert-maxquant-feature
+--evidence_file evidence.txt
+--sdrf_file PXD014414.sdrf.tsv
+--output_folder result
+--output_prefix_file PXD014414
+--chunksize 1000000
+----
+
+
+=== Map to the latest UniProt database
+
+You can download the latest `UniProt FASTA` file to remap the peptides and proteins.
+
+[source, shell]
+----
+quantmsioc map-latest-uniport
+--feature_file PXD048325-29419338-aa11-47fb-b183-db1650465b57.feature.parquet
+--fasta uniprotkb_Human_AND_reviewed_true_AND_m_2024_12_02.fasta
+--output_folder result
+--output_prefix_file PXD048325
+----
+
+=== Obtain a feature use case for calculating IBAQ
+
+Finally, for use with `ibaqpy`, you need to inject experimental information into the `feature` file using the `SDRF` file.
+
+[source, shell]
+----
+quantmsioc convert-ibaq 
+--feature_file res/PXD048325-29419338-aa11-47fb-b183-db1650465b57.feature.parquet 
+--sdrf_file PXD048325-Hela.sdrf.tsv 
+--output_folder ibaq 
+--output_prefix_file PXD048325
+----
diff --git a/quantmsio/commands/map_latest_uniport_command.py b/quantmsio/commands/map_latest_uniport_command.py
@@ -0,0 +1,50 @@
+import click
+from quantmsio.core.project import create_uuid_filename
+from quantmsio.operate.tools import map_peptide_to_protein
+
+
+@click.command(
+    "map-latest-uniport",
+    short_help="Map the peptides to the latest UniProt Fasta file.",
+)
+@click.option(
+    "--feature_file",
+    help="feature file",
+    required=True,
+)
+@click.option(
+    "--fasta",
+    help="the latest UniProt Fasta file",
+    required=True,
+)
+@click.option(
+    "--output_folder",
+    help="Folder where the Json file will be generated",
+    required=True,
+)
+@click.option(
+    "--output_prefix_file",
+    help="Prefix of the parquet file needed to generate the file name",
+    required=False,
+)
+def map_latest_uniport(
+    feature_file: str,
+    fasta: str,
+    output_folder: str,
+    output_prefix_file: str,
+):
+    """
+    Map the peptides of a feature file to the proteins of a UniProt Fasta file.
+
+    :param feature_file: feature file
+    :param fasta: the latest UniProt Fasta file
+    :param output_folder: Folder where the output file will be generated
+    :param output_prefix_file: Prefix used to generate the name of the output
+        ``.feature.parquet`` file; falls back to "feature" when not provided
+    """
+
+    # These three options are declared with required=True above, so this is a
+    # defensive re-check of the same condition.
+    if feature_file is None or fasta is None or output_folder is None:
+        raise click.UsageError("Please provide all the required parameters")
+
+    # --output_prefix_file is optional; use a generic prefix when it is missing or empty.
+    if not output_prefix_file:
+        output_prefix_file = "feature"
+
+    # Build the output file name from the prefix and the ".feature.parquet" extension,
+    # then hand it to map_peptide_to_protein together with the inputs and output folder.
+    filename = create_uuid_filename(output_prefix_file, ".feature.parquet")
+    map_peptide_to_protein(feature_file, fasta, output_folder, filename)
diff --git a/quantmsio/operate/tools.py b/quantmsio/operate/tools.py
@@ -131,12 +131,11 @@ def get_peptide_map(unique_peptides, fasta):
                 peptide_map[peptide].append(accession)
     return peptide_map
 
-def map_peptide_to_protein(parquet_file: str, fasta: str, output_folder, label="feature"):
+def map_peptide_to_protein(parquet_file: str, fasta: str, output_folder: str, filename: str, label="feature"):
     p = Query(parquet_file)
     unique_peptides = p.get_unique_peptides()
     peptide_map = get_peptide_map(unique_peptides, fasta)
     pqwriter = None
-    filename = os.path.basename(parquet_file)
     for table in p.iter_chunk(batch_size=2000000):
         table["pg_accessions"] = table["sequence"].map(peptide_map)
         table = table[table['pg_accessions'].apply(lambda x: len(x) > 0)]

diff --git a/quantmsio/quantmsioc.py b/quantmsio/quantmsioc.py
@@ -19,7 +19,7 @@
 from quantmsio.commands.statistic_command import statistics
 from quantmsio.commands.maxquant_command import convert_maxquant_psm, convert_maxquant_feature
 from quantmsio.commands.ibaq_command import convert_ibaq_file
-
+from quantmsio.commands.map_latest_uniport_command import map_latest_uniport
 
 CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
 
@@ -49,6 +49,7 @@ def cli():
 cli.add_command(convert_maxquant_psm)
 cli.add_command(convert_maxquant_feature)
 cli.add_command(convert_ibaq_file)
+cli.add_command(map_latest_uniport)
 
 
 def quantms_io_main():