Merge remote-tracking branch 'origin/dev' into dev
# Conflicts:
#	quantmsio/quantmsioc.py
ypriverol committed Jan 15, 2025
2 parents 9f6e958 + 331f43f commit 2338a32
Showing 14 changed files with 1,518 additions and 1,113 deletions.
88 changes: 88 additions & 0 deletions docs/ibaq_usecase.adoc
@@ -0,0 +1,88 @@
== IBAQ Usage

Users can now convert existing public datasets in the PRIDE Archive to peptide feature files with the quantms.io toolkit, remap the proteins to a new version of the UniProt database, and recompute IBAQ values with ibaqpy after annotating the SDRF file. ibaqpy benefits from SDRF's compatibility with public proteomics repositories such as PRIDE, facilitating seamless integration into submission workflows and downstream analyses. The use of SDRF automates the parsing of metadata, reducing manual effort and user error while enabling better cross-study comparisons.
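
The overall workflow consists of three quantms.io steps, each detailed in the sections below. The following sketch uses placeholder file names in angle brackets; the final IBAQ computation itself is performed by ibaqpy and is not shown here:

[source, shell]
----
# 1. Convert the quantms output (mzTab + MSstats + SDRF) into a feature file
quantmsioc convert-feature --sdrf_file <sdrf.tsv> --msstats_file <msstats_in.csv> \
  --mztab_file <report.mzTab> --output_folder res --output_prefix_file <accession>

# 2. Remap the peptides and proteins to the latest UniProt FASTA file
quantmsioc map-latest-uniport --feature_file res/<accession>-<uuid>.feature.parquet \
  --fasta <latest_uniprot.fasta> --output_folder result --output_prefix_file <accession>

# 3. Inject the SDRF sample metadata to obtain the IBAQ-ready feature file for ibaqpy
quantmsioc convert-ibaq --feature_file result/<accession>-<uuid>.feature.parquet \
  --sdrf_file <sdrf.tsv> --output_folder ibaq --output_prefix_file <accession>
----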

=== Convert to feature
==== quantms

nf-core/quantms is a bioinformatics best-practice analysis pipeline for quantitative mass spectrometry (MS). Currently, the workflow supports three major MS-based analytical methods: (i) data-dependent acquisition (DDA) label-free quantification; (ii) DDA isobaric quantification (e.g. TMT, iTRAQ); and (iii) data-independent acquisition (DIA) label-free quantification.

===== Data-dependent acquisition
For experiments analyzed with quantms in data-dependent acquisition mode, you need to download the following files:

* mzTab file: mzTab is intended as a lightweight supplement to the existing standard mzML, storing and representing peptide and protein identifications together with experimental metadata and basic quantitative information.
* MSstats file: The MSstats input is generated for all three pipelines (DDA-LFQ, DDA-ISO and DIA-LFQ). It is a simple TSV file ready to be read by the OpenMStoMSstats function of the MSstats R package. It holds the same quantities as the consensusXML but rearranged in a "long" table format, with additional information about the experimental design used by MSstats.
* SDRF file: SDRF is a file format based on MAGE-TAB, designed to record sample metadata and the relationships between samples and data in proteomics experiments. The main purpose of SDRF is to enhance the interpretability and re-analyzability of proteomics data, as well as to facilitate integration with other omics data.


====== Convert report to feature

[source, shell]
----
quantmsioc convert-feature \
--sdrf_file PXD004452-Hella-trypsin.sdrf.tsv \
--msstats_file PXD004452-Hella-trypsin.sdrf_openms_design_msstats_in.csv \
--mztab_file PXD004452-Hella-trypsin.sdrf_openms_design_openms.mzTab \
--file_num 30 \
--output_folder res \
--duckdb_max_memory 64GB \
--output_prefix_file PXD004452
----

===== Data-independent acquisition

DIA experiments are processed with the DIA-NN software; you need to obtain the `diann_report` file and the `SDRF` file, as well as the mzML information for all mass spectrometry runs (passed via the `--mzml_info_folder` option).

[source, shell]
----
quantmsioc convert-diann \
--report_path diann_report.tsv \
--qvalue_threshold 0.05 \
--mzml_info_folder mzml \
--sdrf_path PXD048325-Hela.sdrf.tsv \
--output_folder res1 \
--output_prefix_file PXD048325 \
--duckdb_max_memory 64GB \
--file_num 30
----

==== MaxQuant

For data analyzed with MaxQuant, you need the `evidence.txt` file and the `SDRF` file.

[source, shell]
----
quantmsioc convert-maxquant-feature \
--evidence_file evidence.txt \
--sdrf_file PXD014414.sdrf.tsv \
--output_folder result \
--output_prefix_file PXD014414 \
--chunksize 1000000
----


=== Map to the latest UniProt database

You can download the latest `UniProt FASTA` file to remap the peptides and proteins.

[source, shell]
----
quantmsioc map-latest-uniport \
--feature_file PXD048325-29419338-aa11-47fb-b183-db1650465b57.feature.parquet \
--fasta uniprotkb_Human_AND_reviewed_true_AND_m_2024_12_02.fasta \
--output_folder result \
--output_prefix_file PXD048325
----

=== Obtain a feature file for calculating IBAQ

Finally, for use with `ibaqpy`, you need to inject experimental information into the `feature` file using the `SDRF` file.

[source, shell]
----
quantmsioc convert-ibaq \
--feature_file res/PXD048325-29419338-aa11-47fb-b183-db1650465b57.feature.parquet \
--sdrf_file PXD048325-Hela.sdrf.tsv \
--output_folder ibaq \
--output_prefix_file PXD048325
----
59 changes: 59 additions & 0 deletions quantmsio/commands/diann_command.py
@@ -114,3 +114,62 @@ def diann_convert_to_parquet(
file_num=file_num,
protein_file=protein_file,
)


@click.command(
"convert-diann-pg",
short_help="Convert diann_report to pg file of quantms.io format",
)
@click.option(
"--report_path",
help="the diann report file path",
required=True,
)
@click.option(
"--output_folder",
help="Folder where the Json file will be generated",
required=True,
)
@click.option(
"--output_prefix_file",
help="Prefix of the Json file needed to generate the file name",
required=False,
)
@click.option(
"--duckdb_max_memory", help="The maximum amount of memory allocated by the DuckDB engine (e.g 4GB)", required=False
)
@click.option("--duckdb_threads", help="The number of threads for the DuckDB engine (e.g 4)", required=False)
@click.option(
"--file_num",
help="The number of files being processed at the same time",
default=100,
)
def diann_pg_convert_to_parquet(
report_path: str,
output_folder: str,
output_prefix_file: str,
duckdb_max_memory: str,
duckdb_threads: int,
file_num: int,
):
if report_path is None or output_folder is None:
raise click.UsageError("Please provide all the required parameters")

if not os.path.exists(output_folder):
os.makedirs(output_folder)

if not output_prefix_file:
output_prefix_file = "pg"
filename = create_uuid_filename(output_prefix_file, ".pg.parquet")
pg_output_path = output_folder + "/" + filename

dia_nn = DiaNNConvert(
diann_report=report_path,
sdrf_path=None,
duckdb_max_memory=duckdb_max_memory,
duckdb_threads=duckdb_threads,
)
dia_nn.write_pg_matrix_to_file(
output_path=pg_output_path,
file_num=file_num,
)
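
The new `convert-diann-pg` command is not yet covered by the documentation above. A hypothetical invocation, assembled only from the options declared in this file (file names and values are placeholders), might look like:

[source, shell]
----
quantmsioc convert-diann-pg \
--report_path diann_report.tsv \
--output_folder res \
--output_prefix_file PXD048325 \
--duckdb_max_memory 64GB \
--duckdb_threads 4 \
--file_num 30
----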
2 changes: 1 addition & 1 deletion quantmsio/commands/ibaq_command.py
@@ -47,4 +47,4 @@ def convert_ibaq_file(
output_prefix_file = ""

output_path = output_folder + "/" + create_uuid_filename(output_prefix_file, ".ibaq.parquet")
write_ibaq_feature(feature_file, sdrf_file, output_path)
write_ibaq_feature(sdrf_file, feature_file, output_path)
50 changes: 50 additions & 0 deletions quantmsio/commands/map_latest_uniport_command.py
@@ -0,0 +1,50 @@
import click
from quantmsio.core.project import create_uuid_filename
from quantmsio.operate.tools import map_peptide_to_protein


@click.command(
"map-latest-uniport",
short_help="Map the peptides to the latest UniProt Fasta file.",
)
@click.option(
"--feature_file",
help="feature file",
required=True,
)
@click.option(
"--fasta",
help="the latest UniProt Fasta file",
required=True,
)
@click.option(
"--output_folder",
help="Folder where the Json file will be generated",
required=True,
)
@click.option(
"--output_prefix_file",
help="Prefix of the parquet file needed to generate the file name",
required=False,
)
def map_latest_uniport(
feature_file: str,
fasta: str,
output_folder: str,
output_prefix_file: str,
):
"""
:param feature_file: feature file
:param sdrf_file: the SDRF file needed to extract some of the metadata
:param output_folder: Folder where the Json file will be generated
:param output_prefix_file: Prefix of the Json file needed to generate the file name
"""

if feature_file is None or fasta is None or output_folder is None:
raise click.UsageError("Please provide all the required parameters")

if not output_prefix_file:
output_prefix_file = "feature"

filename = create_uuid_filename(output_prefix_file, ".feature.parquet")
map_peptide_to_protein(feature_file, fasta, output_folder, filename)
4 changes: 2 additions & 2 deletions quantmsio/core/ae.py
@@ -17,7 +17,7 @@
def get_ibaq_columns(path):
with open(path) as f:
line = f.readline()
return line.split("\n")[0].split(",")
return line.split("\n")[0].split("\t")


class AbsoluteExpressionHander:
@@ -66,7 +66,7 @@ def load_ibaq_file(self, path, protein_str=None):
for col in usecols:
if col not in ibaq_columns:
raise Exception(f"Not found {col} in ibaq file")
ibaqs = pd.read_csv(path, usecols=usecols)
ibaqs = pd.read_csv(path, usecols=usecols, sep="\t")
ibaqs.rename(columns=AbsoluteExpressionHander.LABEL_MAP, inplace=True)
if protein_str:
ibaqs = ibaqs[ibaqs["protein"].str.contains(f"{protein_str}", na=False)]
19 changes: 18 additions & 1 deletion quantmsio/core/common.py
@@ -1,5 +1,5 @@
from quantmsio import __version__
from quantmsio.core.format import PSM_FIELDS, FEATURE_FIELDS, IBAQ_FIELDS
from quantmsio.core.format import PSM_FIELDS, FEATURE_FIELDS, IBAQ_FIELDS, PG_FIELDS
import pyarrow as pa

PSM_MAP = {
@@ -66,7 +66,19 @@
"Genes": "gg_names",
"Run": "run",
}
DIANN_PG_MAP = {
"Protein.Group": "pg_accessions",
"Protein.Names": "pg_names",
"Genes": "gg_accessions",
"Run": "reference_file_name",
"Global.PG.Q.Value": "global_qvalue",
"PG.Quantity": "intensity",
"PG.Normalised": "normalize_intensity",
"PG.MaxLFQ": "lfq",
"PG.Q.Value": "qvalue",
}
DIANN_USECOLS = list(DIANN_MAP.keys())
DIANN_PG_USECOLS = list(DIANN_PG_MAP.keys())

MAXQUANT_PSM_MAP = {
"Sequence": "sequence",
@@ -135,3 +147,8 @@
IBAQ_FIELDS,
metadata={"description": "ibaq file in quantms.io format"},
)
PG_SCHEMA = pa.schema(
PG_FIELDS,
metadata={"description": "PG file in quantms.io format"},
)
