Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
zprobot committed Dec 12, 2024
1 parent 8ae62c6 commit ae65a4d
Show file tree
Hide file tree
Showing 9 changed files with 305 additions and 28 deletions.
59 changes: 59 additions & 0 deletions quantmsio/commands/diann_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,62 @@ def diann_convert_to_parquet(
file_num=file_num,
protein_file=protein_file,
)


@click.command(
    "convert-diann-pg",
    short_help="Convert diann_report to pg file of quantms.io format",
)
@click.option(
    "--report_path",
    help="the diann report file path",
    required=True,
)
@click.option(
    "--output_folder",
    help="Folder where the Json file will be generated",
    required=True,
)
@click.option(
    "--output_prefix_file",
    help="Prefix of the Json file needed to generate the file name",
    required=False,
)
@click.option(
    "--duckdb_max_memory", help="The maximum amount of memory allocated by the DuckDB engine (e.g 4GB)", required=False
)
@click.option("--duckdb_threads", help="The number of threads for the DuckDB engine (e.g 4)", required=False)
@click.option(
    "--file_num",
    help="The number of files being processed at the same time",
    default=100,
)
def diann_pg_convert_to_parquet(
    report_path: str,
    output_folder: str,
    output_prefix_file: str,
    duckdb_max_memory: str,
    duckdb_threads: int,
    file_num: int,
):
    """Convert a DIA-NN report into a protein-group (pg) parquet file in quantms.io format.

    :param report_path: path to the DIA-NN report file
    :param output_folder: folder where the parquet file will be written
    :param output_prefix_file: optional prefix for the output file name (defaults to "pg")
    :param duckdb_max_memory: maximum memory allocated to the DuckDB engine (e.g. "4GB")
    :param duckdb_threads: number of threads for the DuckDB engine
    :param file_num: number of run files processed per batch
    """
    # BUG FIX: the original condition read `report_path is None is None`, a
    # chained comparison that only worked by accident (it reduces to
    # `report_path is None`).
    if report_path is None or output_folder is None:
        raise click.UsageError("Please provide all the required parameters")

    # exist_ok avoids a race between the exists() check and makedirs().
    os.makedirs(output_folder, exist_ok=True)

    if not output_prefix_file:
        output_prefix_file = "pg"
    filename = create_uuid_filename(output_prefix_file, ".pg.parquet")
    # os.path.join is safer than manual "/" concatenation across platforms.
    pg_output_path = os.path.join(output_folder, filename)

    # sdrf_path is not needed for the pg matrix; DiaNNConvert tolerates None.
    dia_nn = DiaNNConvert(
        diann_report=report_path,
        sdrf_path=None,
        duckdb_max_memory=duckdb_max_memory,
        duckdb_threads=duckdb_threads,
    )
    dia_nn.write_pg_matrix_to_file(
        output_path=pg_output_path,
        file_num=file_num,
    )
4 changes: 2 additions & 2 deletions quantmsio/core/ae.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
def get_ibaq_columns(path):
    """Return the column names from the header line of a tab-separated ibaq file.

    :param path: path to the ibaq file
    :return: list of column names parsed from the first line
    """
    with open(path) as f:
        line = f.readline()
    # Strip the trailing newline, then split on tabs (ibaq files are TSV;
    # the diff artifact duplicating an old comma-split return line is removed).
    return line.split("\n")[0].split("\t")


class AbsoluteExpressionHander:
Expand Down Expand Up @@ -66,7 +66,7 @@ def load_ibaq_file(self, path, protein_str=None):
for col in usecols:
if col not in ibaq_columns:
raise Exception(f"Not found {col} in ibaq file")
ibaqs = pd.read_csv(path, usecols=usecols)
ibaqs = pd.read_csv(path, usecols=usecols, sep="\t")
ibaqs.rename(columns=AbsoluteExpressionHander.LABEL_MAP, inplace=True)
if protein_str:
ibaqs = ibaqs[ibaqs["protein"].str.contains(f"{protein_str}", na=False)]
Expand Down
19 changes: 18 additions & 1 deletion quantmsio/core/common.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from quantmsio import __version__
from quantmsio.core.format import PSM_FIELDS, FEATURE_FIELDS, IBAQ_FIELDS
from quantmsio.core.format import PSM_FIELDS, FEATURE_FIELDS, IBAQ_FIELDS, PG_FIELDS
import pyarrow as pa

PSM_MAP = {
Expand Down Expand Up @@ -66,7 +66,19 @@
"Genes": "gg_names",
"Run": "run",
}
# Mapping from DIA-NN protein-group report columns to quantms.io pg field names.
# CONSISTENCY FIX: use double quotes throughout, matching DIANN_MAP and the
# rest of this module; added trailing comma per the file's dict style.
DIANN_PG_MAP = {
    "Protein.Group": "pg_accessions",
    "Protein.Names": "pg_names",
    "Genes": "gg_accessions",
    "Run": "reference_file_name",
    "Global.PG.Q.Value": "global_qvalue",
    "PG.Quantity": "intensity",
    "PG.Normalised": "normalize_intensity",
    "PG.MaxLFQ": "lfq",
    "PG.Q.Value": "qvalue",
}
# Column lists used to build the SELECT clauses of the DuckDB report queries.
DIANN_USECOLS = list(DIANN_MAP.keys())
DIANN_PG_USECOLS = list(DIANN_PG_MAP.keys())

MAXQUANT_PSM_MAP = {
"Sequence": "sequence",
Expand Down Expand Up @@ -135,3 +147,8 @@
IBAQ_FIELDS,
metadata={"description": "ibaq file in quantms.io format"},
)
# Arrow schema for protein-group (pg) parquet files in quantms.io format.
PG_SCHEMA = pa.schema(
    PG_FIELDS,
    metadata={"description": "PG file in quantms.io format"},
)

85 changes: 70 additions & 15 deletions quantmsio/core/diann.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
import numpy as np
import pandas as pd
import os
import pyarrow as pa
import pyarrow.parquet as pq
import concurrent.futures
from pathlib import Path
from collections import defaultdict
from pyopenms import AASequence
from pyopenms.Constants import PROTON_MASS_U
from quantmsio.operate.tools import get_ahocorasick
Expand All @@ -15,27 +17,22 @@
from quantmsio.core.feature import Feature
from quantmsio.core.duckdb import DuckDB
from quantmsio.utils.pride_utils import generate_scan_number
from quantmsio.core.common import DIANN_MAP, DIANN_USECOLS
from quantmsio.core.common import DIANN_MAP, DIANN_USECOLS, DIANN_PG_MAP, DIANN_PG_USECOLS, PG_SCHEMA

DIANN_SQL = ", ".join([f'"{name}"' for name in DIANN_USECOLS])

DIANN_PG_SQL = ", ".join([f'"{name}"' for name in DIANN_PG_USECOLS])

class DiaNNConvert(DuckDB):

def __init__(self, diann_report, sdrf_path, duckdb_max_memory="16GB", duckdb_threads=4):
def __init__(self, diann_report, sdrf_path=None, duckdb_max_memory="16GB", duckdb_threads=4):
super(DiaNNConvert, self).__init__(diann_report, duckdb_max_memory, duckdb_threads)
self.init_duckdb()
self._sdrf = SDRFHandler(sdrf_path)
self._mods_map = self._sdrf.get_mods_dict()
self._automaton = get_ahocorasick(self._mods_map)
self._sample_map = self._sdrf.get_sample_map_run()

def init_duckdb(self):

self._duckdb.execute("""CREATE INDEX idx_precursor_q ON report ("Precursor.Id", "Q.Value")""")
self._duckdb.execute("""CREATE INDEX idx_run ON report ("Run")""")
if sdrf_path:
self._sdrf = SDRFHandler(sdrf_path)
self._mods_map = self._sdrf.get_mods_dict()
self._automaton = get_ahocorasick(self._mods_map)
self._sample_map = self._sdrf.get_sample_map_run()

def get_report_from_database(self, runs: list) -> pd.DataFrame:
def get_report_from_database(self, runs: list, sql:str = DIANN_SQL) -> pd.DataFrame:
"""
This function loads the report from the duckdb database for a group of ms_runs.
:param runs: A list of ms_runs
Expand All @@ -48,7 +45,7 @@ def get_report_from_database(self, runs: list) -> pd.DataFrame:
from report
where Run IN {}
""".format(
DIANN_SQL, tuple(runs)
sql, tuple(runs)
)
)
report = database.df()
Expand Down Expand Up @@ -90,6 +87,46 @@ def get_peptide_map_from_database(self):
logging.info("Time to load peptide map {} seconds".format(et))
return best_ref_map

def get_peptide_count(self, df):
    """Count, per protein accession, how many report rows reference it.

    :param df: frame with a "pg_accessions" column of ";"-joined accessions
    :return: defaultdict mapping protein accession -> number of rows mentioning it
    """
    counts = defaultdict(int)
    # Flatten every ";"-joined accession group into individual accessions.
    all_accessions = (
        accession
        for group in df["pg_accessions"]
        for accession in group.split(";")
    )
    for accession in all_accessions:
        counts[accession] += 1
    return counts

def generate_pg_matrix(self, report):
    """Transform a renamed DIA-NN pg report slice into quantms.io pg rows.

    Expects `report` to already use quantms.io column names (see DIANN_PG_MAP).
    Mutates `report` in place and returns it.

    :param report: pandas DataFrame slice for one reference file
    :return: the same DataFrame with pg columns reshaped for PG_SCHEMA
    """
    # Count rows per protein BEFORE deduplicating, so every row of the
    # report contributes to the per-protein count.
    peptide_count = self.get_peptide_count(report)
    report.drop_duplicates(subset=["pg_accessions"],inplace=True)
    # ";"-joined accession strings become string arrays (list columns).
    report["gg_accessions"] = report["gg_accessions"].str.split(";")
    report["pg_names"] = report["pg_names"].str.split(";")
    # Keep only the part of the run name before the first "." (drops extension).
    report["reference_file_name"] = report["reference_file_name"].apply(lambda x: x.split(".")[0])
    # NOTE: split pg_accessions last — peptide_count keys were built from the
    # joined string form above.
    report["pg_accessions"] = report["pg_accessions"].str.split(";")
    # One {protein_name, peptide_count} struct per protein in the group.
    report.loc[:,"peptides"] = report["pg_accessions"].apply(
        lambda proteins: [{
            "protein_name": protein,
            "peptide_count": peptide_count[protein]
        } for protein in proteins]
    )
    # DIA-NN reports targets only here, so is_decoy is constant 0.
    report.loc[:, "is_decoy"] = 0
    # Pack the normalized and MaxLFQ intensities into the
    # additional_intensities struct-list column expected by PG_SCHEMA.
    report.loc[:, "additional_intensities"] = report[["normalize_intensity","lfq"]].apply(
        lambda rows: [
            {
                "intensity_name": name,
                "intensity_value": rows[name]
            } for name in ["normalize_intensity","lfq"]
        ],
        axis=1,
    )
    # Per-run PG q-value goes into the generic additional_scores column.
    report.loc[:, "additional_scores"] = report["qvalue"].apply(
        lambda value: [{
            "score_name": "qvalue",
            "score_value": value
        }]
    )
    # Not provided by the DIA-NN report; filled with nulls to satisfy the schema.
    report.loc[:, "contaminant"] = None
    report.loc[:, "anchor_protein"] = None
    return report

def main_report_df(self, qvalue_threshold: float, mzml_info_folder: str, file_num: int, protein_str: str = None):
def intergrate_msg(n):
nonlocal report
Expand Down Expand Up @@ -127,6 +164,7 @@ def intergrate_msg(n):
for refs in info_list:
report = self.get_report_from_database(refs)
report.rename(columns=DIANN_MAP, inplace=True)
report.dropna(subset=["pg_accessions"], inplace=True)
if protein_str:
report = report[report["pg_accessions"].str.contains(f"{protein_str}", na=False)]
# restrict
Expand Down Expand Up @@ -228,6 +266,23 @@ def generate_feature(
logging.info("Time to generate psm and feature file {} seconds".format(et))
yield report

def write_pg_matrix_to_file(self, output_path: str, file_num=20):
    """Stream per-run protein-group matrices into a single parquet file.

    :param output_path: destination parquet file path
    :param file_num: number of runs fetched from DuckDB per batch
    """
    runs = self.get_unique_references("Run")
    # Query DuckDB in batches of `file_num` runs to bound memory usage.
    batches = [runs[start : start + file_num] for start in range(0, len(runs), file_num)]
    writer = None
    for batch in batches:
        batch_report = self.get_report_from_database(batch, DIANN_PG_SQL)
        batch_report.rename(columns=DIANN_PG_MAP, inplace=True)
        batch_report.dropna(subset=["pg_accessions"], inplace=True)
        for run in batch:
            run_df = self.generate_pg_matrix(
                batch_report[batch_report["reference_file_name"] == run].copy()
            )
            table = pa.Table.from_pandas(run_df, schema=PG_SCHEMA)
            if writer is None:
                # Open the writer lazily so its schema comes from the first table.
                writer = pq.ParquetWriter(output_path, table.schema)
            writer.write_table(table)
    close_file(pqwriter=writer)
    self.destroy_duckdb_database()
def write_feature_to_file(
self,
qvalue_threshold: float,
Expand Down
62 changes: 62 additions & 0 deletions quantmsio/core/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,68 @@
pa.field("intensity", pa.float32(), metadata={"description": "intensity value"}),
]

# Arrow fields for the protein-group (pg) quantms.io table.
# FIX: several metadata descriptions were copy-pasted from the peptide/PSM
# schemas and wrongly referred to "the peptide" / "the PSM"; they now
# describe the protein group this schema actually models.
PG_FIELDS = [
    pa.field(
        "pg_accessions",
        pa.list_(pa.string()),
        metadata={"description": "Protein group accessions of all the proteins in the group"},
    ),
    pa.field(
        "pg_names",
        pa.list_(pa.string()),
        metadata={"description": "Protein group names"},
    ),
    pa.field(
        "gg_accessions",
        pa.list_(pa.string()),
        metadata={"description": "Gene group accessions, as a string array"},
    ),
    pa.field(
        "reference_file_name",
        pa.string(),
        metadata={"description": "Spectrum file name with no path information and not including the file extension"},
    ),
    pa.field(
        "global_qvalue",
        pa.float32(),
        metadata={"description": "Global q-value of the protein group at the experiment level"},
    ),
    pa.field(
        "intensity",
        pa.float32(),
        metadata={"description": "the intensity-based abundance of the protein group in the reference file"},
    ),
    pa.field(
        "additional_intensities",
        pa.list_(pa.struct([("intensity_name", pa.string()), ("intensity_value", pa.float32())])),
        metadata={"description": "Additional intensity-based abundances of the protein group (e.g. normalized, MaxLFQ)"},
    ),
    pa.field(
        "is_decoy",
        pa.int32(),
        metadata={"description": "Decoy indicator, 1 if the protein group is a decoy, 0 target"},
    ),
    pa.field(
        "contaminant",
        pa.int32(),
        metadata={"description": "If the protein is a contaminant"},
    ),
    pa.field(
        "peptides",
        pa.list_(pa.struct([("protein_name", pa.string()), ("peptide_count", pa.int32())])),
        metadata={"description": "Number of peptides per protein in the protein group"},
    ),
    pa.field(
        "anchor_protein",
        pa.string(),
        metadata={"description": "The anchor protein of the protein group, leading protein or representative"},
    ),
    pa.field(
        "additional_scores",
        pa.list_(pa.struct([("score_name", pa.string()), ("score_value", pa.float32())])),
        metadata={"description": "List of structures, each structure contains two fields: name and value"},
    ),
]

PSM_FIELDS = PEPTIDE_FIELDS + PSM_UNIQUE_FIELDS

Expand Down
Loading

0 comments on commit ae65a4d

Please sign in to comment.