Skip to content

Commit

Permalink
update: mq
Browse files Browse the repository at this point in the history
  • Loading branch information
zprobot committed Oct 17, 2024
1 parent 9f990ce commit 8c451e0
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 113 deletions.
65 changes: 25 additions & 40 deletions docs/README.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -103,48 +103,33 @@ Those three properties can be combined, for example, in a string like one string
When represented in parquet files <<psm>>, <<feature>>, modification details will be a list of struct:

```json
{
"name": "modifications",
"type": {
"type": "array",
"items": {
"type": "record",
"name": "modification_details",
"fields": [
{
"name": "accession",
"type": "string",
"doc": "Accession number of the modification (e.g., UNIMOD:35)"
},
{
"name": "positions",
"type": {
"type": "array",
"items": {
"type": "record",
"name": "PositionDetails",
"fields": [
{
"name": "position",
"type": "int",
"doc": "Position of the modification on the peptide"
},
{
"name": "localization_probability",
"type": "float",
"doc": "Probability that the modification is localized at this position"
}
]
}
},
"doc": "Positions and corresponding localization probabilitie"
}
]
[{
"name": "UNIMOD:35",
"fields": [
{
"position": 2,
"localization_probability": 0.94
},
{
"position": 12,
"localization_probability": 0.06
}
  ]
},
{
"name": "UNIMOD:0",
"fields": [
{
"position": 0,
"localization_probability": 0.92
},
{
"position": 16,
"localization_probability": 0.08
}
]
}

]
```

[[scan]]
Expand Down
11 changes: 4 additions & 7 deletions quantmsio/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,20 +61,17 @@
}
MAXQUANT_MAP = {
"Sequence": "sequence",
"Proteins": "pg_accessions",
"Proteins": "mp_accessions",
"PEP": "posterior_error_probability",
"Modifications": "modifications",
"Reverse": "is_decoy",
"m/z": "calculated_mz",
"MS/MS m/z": "observed_mz",
"Gene names": "gg_names",
"Calibrated retention time": "rt",
"Calibrated retention time start": "rt_start",
"Calibrated retention time finish": "rt_stop",
"Scan number": "scan",
"Retention time": "rt",
"Charge": "precursor_charge",
"Modified sequence": "peptidoform",
"Raw file": "reference_file_name",
"Intensity": "intensity",
"Score": "additional_scores"
}


Expand Down
80 changes: 45 additions & 35 deletions quantmsio/core/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
),
pa.field(
"modification_details",
pa.list_(pa.string()),
pa.list_(pa.struct([("name", pa.string()), ("fields", pa.list_(pa.struct([("position", pa.int32()), ("localization_probability", pa.float32())])))])),
metadata={
"description": "List of alternative site probabilities for the modification format: read the specification for more details"
},
Expand Down Expand Up @@ -46,20 +46,30 @@
},
),
pa.field(
"additional_scores",
"best_id_score",
pa.list_(pa.struct([("name", pa.string()), ("value", pa.float32())])),
metadata={"description": "List of structures, each structure contains two fields: name and value"},
metadata={"description": "A named score type and value representing an identification's measure of confidence or input feature"},
),
pa.field(
"pg_accessions",
pa.list_(pa.string()),
metadata={"description": "Protein group accessions of all the proteins that the peptide maps to"},
"additional_scores",
pa.list_(pa.struct([("name", pa.string()), ("value", pa.float32())])),
metadata={"description": "List of structures, each structure contains two fields: name and value"},
),
pa.field(
"pg_positions",
pa.list_(pa.string()),
metadata={"description": "Protein start and end positions written as start_post:end_post"},
),
"consensus_support",
pa.float32(),
metadata={"description": "Consensus support for the given peptide spectrum match, when multiple search engines are used"},
),
# pa.field(
# "pg_accessions",
# pa.list_(pa.string()),
# metadata={"description": "Protein group accessions of all the proteins that the peptide maps to"},
# ),
# pa.field(
# "pg_positions",
# pa.list_(pa.string()),
# metadata={"description": "Protein start and end positions written as start_post:end_post"},
# ),
pa.field(
"unique",
pa.int32(),
Expand All @@ -68,20 +78,25 @@
},
),
pa.field(
"protein_global_qvalue",
"pg_global_qvalue",
pa.float32(),
metadata={"description": "Global q-value of the protein group at the experiment level"},
),
pa.field(
"gg_accessions",
pa.list_(pa.string()),
metadata={"description": "Gene accessions, as string array"},
),
pa.field(
"gg_names",
"mp_accessions",
pa.list_(pa.string()),
metadata={"description": "Gene names, as string array"},
),
metadata={"description": "Protein accessions of all the proteins that the peptide maps to"},
),
# pa.field(
# "gg_accessions",
# pa.list_(pa.string()),
# metadata={"description": "Gene accessions, as string array"},
# ),
# pa.field(
# "gg_names",
# pa.list_(pa.string()),
# metadata={"description": "Gene names, as string array"},
# ),
pa.field(
"precursor_charge",
pa.int32(),
Expand All @@ -102,11 +117,6 @@
pa.float32(),
metadata={"description": "Predicted retention time of the peptide (in seconds)"},
),
pa.field(
"quantmsio_version",
pa.string(),
metadata={"description": "The version of quantms.io"},
),
]

PSM_UNIQUE_FIELDS = [
Expand All @@ -116,17 +126,25 @@
metadata={"description": "Spectrum file name with no path information and not including the file extension"},
),
pa.field(
"scan_number",
"scan",
pa.string(),
metadata={"description": "Scan number of the spectrum"},
metadata={"description": "Scan index (number of nativeId) of the spectrum identified"},
),
pa.field(
"rank", pa.int32(), metadata={"description": "Rank of the peptide spectrum match in the search engine output"}
),
pa.field(
"cv_params",
pa.list_(pa.struct([("name", pa.string()), ("value", pa.string())])),
metadata={"description": "Optional list of CV parameters for additional metadata"},
),
pa.field(
"ion_mobility",
pa.float32(),
metadata={"description": "Ion mobility value for the precursor ion"},
),
pa.field(
"num_peaks",
"number_peaks",
pa.int32(),
metadata={"description": "Number of peaks in the spectrum used for the peptide spectrum match"},
),
Expand All @@ -140,14 +158,6 @@
pa.list_(pa.float32()),
metadata={"description": "Array of intensity values for the spectrum used for the peptide spectrum match"},
),
pa.field(
"rank", pa.int32(), metadata={"description": "Rank of the peptide spectrum match in the search engine output"}
),
pa.field(
"cv_params",
pa.list_(pa.struct([("name", pa.string()), ("value", pa.string())])),
metadata={"description": "Optional list of CV parameters for additional metadata"},
),
]


Expand Down
109 changes: 86 additions & 23 deletions quantmsio/core/maxquant.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from quantmsio.utils.pride_utils import get_peptidoform_proforma_version_in_mztab
from quantmsio.core.common import QUANTMSIO_VERSION, MAXQUANT_MAP, MAXQUANT_USECOLS
from quantmsio.core.feature import Feature

from quantmsio.core.psm import Psm
# format the log entries
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)

Expand Down Expand Up @@ -76,17 +76,23 @@ def generate_mods(row, mod_map):


class MaxQuant:
def __init__(self, sdrf_path, evidence_path):
def __init__(self, sdrf_path, msms_path):
self._modifications = SDRFHandler(sdrf_path).get_mods_dict()
self._sdrf_path = sdrf_path
self._evidence_path = evidence_path
self._msms_path = msms_path
self.mods_map = get_mod_map(sdrf_path)

def iter_batch(self, chunksize: int = 100000):
col_df = pd.read_csv(self._msms_path, sep="\t", nrows=1)
cols = []
for key in self.mods_map.keys():
col = f"{key} Probabilities"
if col in col_df.columns:
cols.append(col)
for df in pd.read_csv(
self._evidence_path,
self._msms_path,
sep="\t",
usecols=MAXQUANT_USECOLS + ["Potential contaminant"],
usecols=MAXQUANT_USECOLS + cols,
low_memory=False,
chunksize=chunksize,
):
Expand All @@ -97,15 +103,49 @@ def open_from_zip_archive(self, zip_file, file_name):
"""Open file from zip archive."""
with zipfile.ZipFile(zip_file) as z:
with z.open(file_name) as f:
df = pd.read_csv(f, sep="\t", usecols=MAXQUANT_USECOLS + ["Potential contaminant"], low_memory=False)
df = pd.read_csv(f, sep="\t", usecols=MAXQUANT_USECOLS, low_memory=False)
return df

def generate_modification_details(self, df):
keys = {}
pattern = r'\((\d+\.?\d*)\)'
for key in self.mods_map.keys():
col = f"{key} Probabilities"
if col in df.columns:
keys[key] = col
def get_details(rows):
modification_details = []
for key, col in keys.items():
modification_map = {
'name': self.mods_map[key]
}
details = []
seq = rows[col]
if(not isinstance(seq, str)):
continue
match_obj = re.search(pattern, seq)
while(match_obj):
details.append({
"position": match_obj.start(0),
"localization_probability": float(match_obj.group(1))
})
seq = seq.replace(match_obj.group(0),'',1)
match_obj = re.search(pattern, seq)
modification_map['fields'] = details
modification_details.append(modification_map)
if len(modification_details) == 0:
return None
else:
return modification_details
if len(keys.values()) != 0:
df.loc[:, 'modification_details'] = df[keys.values()].apply(get_details,axis=1)
else:
df.loc[:, 'modification_details'] = None

def main_operate(self, df: pd.DataFrame):
df.loc[:, "Modifications"] = df[["Modified sequence", "Modifications"]].apply(
lambda row: generate_mods(row, self.mods_map), axis=1
)
df = df.query('`Potential contaminant`!="+"')
df = df.drop("Potential contaminant", axis=1)
df = df[df["PEP"] < 0.05]
df = df.rename(columns=MAXQUANT_MAP)
df.loc[:, "peptidoform"] = df[["sequence", "modifications"]].apply(
Expand All @@ -114,18 +154,27 @@ def main_operate(self, df: pd.DataFrame):
),
axis=1,
)
df.loc[:, "is_decoy"] = df["is_decoy"].map({None: "0", np.nan: "0", "+": "1"})
df["unique"] = df["pg_accessions"].apply(lambda x: "0" if ";" in str(x) else "1")
df.loc[:, "gg_names"] = df["gg_names"].str.split(",")
df.loc[:, "additional_scores"] = None
df.loc[:, "modification_details"] = None
df["is_decoy"] = df["is_decoy"].map({None: "0", np.nan: "0", "+": "1"})
self.generate_modification_details(df)
df.loc[:, 'unique'] = df["mp_accessions"].apply(lambda x: "0" if ";" in str(x) else "1")
df["additional_scores"] = df["additional_scores"].apply(lambda x: [{"name": "maxquant", "value": np.float32(x)}])
df.loc[:, "best_id_score"] = None
df.loc[:, "cv_params"] = None
df.loc[:, "quantmsio_version"] = QUANTMSIO_VERSION
df.loc[:, "gg_accessions"] = None
df.loc[:, "consensus_support"] = None
df.loc[:, "predicted_rt"] = None
df.loc[:, "channel"] = "LFQ"
df.loc[:, "global_qvalue"] = None
df.loc[:, "observed_mz"] = None
df.loc[:, "pg_global_qvalue"] = None
return df

@staticmethod
def transform_psm(df: pd.DataFrame):
df.loc[:, "intensity_array"] = None
df.loc[:, "ion_mobility"] = None
df.loc[:, "mz_array"] = None
df.loc[:, "number_peaks"] = None
df.loc[:, "rank"] = None

def transform_feature(self, df: pd.DataFrame):
df = self.merge_sdrf(df)
df.loc[:, "global_qvalue"] = None
Expand Down Expand Up @@ -157,11 +206,25 @@ def merge_sdrf(self, df: pd.DataFrame):
def convert_to_parquet(self, output_path: str, chunksize: int = None):
pqwriter = None
for df in self.iter_batch(chunksize=chunksize):
df = self.transform_feature(df)
Feature.convert_to_parquet_format(df, self._modifications)
feature = Feature.transform_feature(df)
self.transform_psm(df)
Psm.convert_to_parquet_format(df,self._modifications)
parquet = Psm.transform_parquet(df)
if not pqwriter:
pqwriter = pq.ParquetWriter(output_path, feature.schema)
pqwriter.write_table(feature)
if pqwriter:
pqwriter.close()
pqwriter = pq.ParquetWriter(output_path, parquet.schema)
pqwriter.write_table(parquet)
if pqwriter:
pqwriter.close()


# def convert_to_parquet(self, output_path: str, chunksize: int = None):
# pqwriter = None
# for df in self.iter_batch(chunksize=chunksize):
# df = self.transform_feature(df)
# Feature.convert_to_parquet_format(df, self._modifications)
# feature = Feature.transform_feature(df)
# if not pqwriter:
# pqwriter = pq.ParquetWriter(output_path, feature.schema)
# pqwriter.write_table(feature)
# if pqwriter:
# pqwriter.close()

Loading

0 comments on commit 8c451e0

Please sign in to comment.