diff --git a/.gitignore b/.gitignore index 381d127..476aec7 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ build .idea data docs/.vscode -__pycache__ \ No newline at end of file +__pycache__ +.vscode \ No newline at end of file diff --git a/docs/feature.avsc b/docs/feature.avsc index 826afa3..397673b 100644 --- a/docs/feature.avsc +++ b/docs/feature.avsc @@ -19,8 +19,8 @@ {"name": "pg_positions", "type": ["null", {"type": "array","items": "string"}], "doc": "Protein start and end positions written as start_post:end_post"}, {"name": "unique", "type": ["null", "int"], "doc": "Unique peptide indicator, if the peptide maps to a single protein, the value is 1, otherwise 0"}, {"name": "protein_global_qvalue", "type": ["null", "float32"], "doc": "Global q-value of the protein group at the experiment level"}, - {"name": "gene_accessions", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene accessions, as string array"}, - {"name": "gene_names", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene names, as string array"}, + {"name": "gg_accessions", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene accessions, as string array"}, + {"name": "gg_names", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene names, as string array"}, {"name": "precursor_charge", "type": "int", "doc": "Precursor charge"}, {"name": "observed_mz", "type": "float32", "doc": "Experimental peptide mass-to-charge ratio of identified peptide (in Da)"}, diff --git a/docs/peptide.avsc b/docs/peptide.avsc index 983156b..7e0a0c6 100644 --- a/docs/peptide.avsc +++ b/docs/peptide.avsc @@ -19,8 +19,8 @@ {"name": "pg_positions", "type": ["null",{"type": "array","items": "string"}], "doc": "Protein start and end positions written as start_post:end_post"}, {"name": "unique", "type": ["null", "int"], "doc": "Unique peptide indicator, if the peptide maps to a single protein, the value is 1, otherwise 0"}, {"name": "protein_global_qvalue", "type": ["null", "float32"], "doc": "Global q-value of the protein group at the experiment level"}, - {"name": "gene_accessions", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene accessions, as string array"}, - {"name": "gene_names", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene names, as string array"}, + {"name": "gg_accessions", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene accessions, as string array"}, + {"name": "gg_names", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene names, as string array"}, {"name": "precursor_charge", "type": "int", "doc": "Precursor charge"}, {"name": "observed_mz", "type": "float32", "doc": "Experimental peptide mass-to-charge ratio of identified peptide (in Da)"}, diff --git a/docs/protein.avsc b/docs/protein.avsc index aedbace..6f4220c 100644 --- a/docs/protein.avsc +++ b/docs/protein.avsc @@ -9,8 +9,8 @@ {"name": "global_qvalue","type": ["null", "float32"], "doc": "The global qvalue for a given protein or protein groups"}, {"name": "is_decoy","type": ["null", "int"], "doc": "If the protein is decoy"}, {"name": "best_id_score", "type": "string", "doc": "The best search engine score for the identification"}, - {"name": "gene_accessions", "type": ["null", {"type": "array","items": "string"}], "doc": "The gene accessions corresponding to every protein"}, - {"name": "gene_names", "type": ["null", {"type": "array","items": "string"}], "doc": "The gene names corresponding to every protein"}, + {"name": "gg_accessions", "type": ["null", {"type": "array","items": "string"}], "doc": "The gene accessions corresponding to every protein"}, + {"name": "gg_names", "type": ["null", {"type": "array","items": "string"}], "doc": "The gene names corresponding to every protein"}, {"name": "number_peptides","type": ["null", "int"], "doc": "The total number of peptides for a give protein"}, {"name": "number_psms","type": ["null", "int"], "doc": "The total number of peptide spectrum matches"}, {"name": "number_unique_peptides","type": ["null", "int"], "doc": "The total number of unique peptides"}, diff --git a/docs/psm.avsc b/docs/psm.avsc index 25279ad..508ba6a 100644 --- a/docs/psm.avsc +++ b/docs/psm.avsc @@ -28,8 +28,8 @@ {"name": "pg_positions", "type": ["null",{"type": "array","items": "string"}], "doc": "Protein start and end positions written as start_post:end_post"}, {"name": "unique", "type": ["null", "int"], "doc": "Unique peptide indicator, if the peptide maps to a single protein, the value is 1, otherwise 0"}, {"name": "protein_global_qvalue", "type": ["null", "float32"], "doc": "Global q-value of the protein group at the experiment level"}, - {"name": "gene_accessions", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene accessions, as string array"}, - {"name": "gene_names", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene names, as string array"}, + {"name": "gg_accessions", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene accessions, as string array"}, + {"name": "gg_names", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene names, as string array"}, {"name": "precursor_charge", "type": "int", "doc": "Precursor charge"}, {"name": "observed_mz", "type": "float32", "doc": "Experimental peptide mass-to-charge ratio of identified peptide (in Da)"}, diff --git a/quantmsio/temp_core/common.py b/quantmsio/temp_core/common.py new file mode 100644 index 0000000..559084f --- /dev/null +++ b/quantmsio/temp_core/common.py @@ -0,0 +1,42 @@ +PSM_MAP = { + "sequence": "sequence", + #"opt_global_cv_MS:1000889_peptidoform_sequence": "peptidoform", + "modifications": "modifications", + + "opt_global_Posterior_Error_Probability_score": "posterior_error_probability", + "opt_global_q-value": "global_qvalue", + "opt_global_cv_MS:1002217_decoy_peptide": "is_decoy", + "calc_mass_to_charge": "calculated_mz", + "accession": "pg_accessions", + "unique": "unique", + "charge": "precursor_charge", + "exp_mass_to_charge": "observed_mz", + "retention_time": "rt", +} + +PSM_USECOLS = list(PSM_MAP.keys()) + [ + "spectra_ref", + "start", + "end", + "search_engine", + "search_engine_score[1]", +] + +ADDITIONS = [ + "peptidoform", + "modification_details", + "additional_scores", + "pg_positions", + "protein_global_qvalue", + "gg_accessions", + "gg_names", + "predicted_rt" + "reference_file_name" + "scan_number" + "ion_mobility" + "num_peaks" + "mz_array" + "intensity_array" + "rank" + "cv_params" + ] \ No newline at end of file diff --git a/quantmsio/temp_core/format.py b/quantmsio/temp_core/format.py new file mode 100644 index 0000000..b2b05c7 --- /dev/null +++ b/quantmsio/temp_core/format.py @@ -0,0 +1,165 @@ +import pyarrow as pa +PEPTIDE_FIELDS = [ + pa.field( + "sequence", + pa.string(), + metadata={"description": "The peptide’s sequence corresponding to the PSM"}, + ), + pa.field( + "peptidoform", + pa.string(), + metadata={"description": "Peptide sequence with modifications: Read the specification for more details"}, + ), + pa.field( + "modifications", + pa.list_(pa.string()), + metadata={"description": "List of modifications as string array, easy for search and filter"}, + ), + pa.field( + "modification_details", + pa.list_(pa.string()), + metadata={"description": "List of alternative site probabilities for the modification format: read the specification for more details"}, + ), + pa.field( + "posterior_error_probability", + pa.float32(), + metadata={"description": "Posterior error probability for the given peptide spectrum match"}, + ), + pa.field("global_qvalue", pa.float32(), metadata={"description": "Global q-value of the peptide or psm at the level of the experiment"}), + + pa.field( + "is_decoy", + pa.int32(), + metadata={"description": "Decoy indicator, 1 if the PSM is a decoy, 0 target"}, + ), + pa.field( + "calculated_mz", + pa.float32(), + metadata={"description": "Theoretical peptide mass-to-charge ratio based on identified sequence and modifications"}, + ), + pa.field( + "additional_scores", + pa.list_( + pa.struct([ + ("name", pa.string()), + ("score", pa.float32()) + ]) + ), + metadata={"description": "List of structures, each structure contains two fields: name and value"}, + ), + + pa.field( + "pg_accessions", + pa.list_(pa.string()), + metadata={"description": "Protein group accessions of all the proteins that the peptide maps to"}, + ), + pa.field( + "pg_positions", + pa.list_(pa.string()), + metadata={"description": "Protein start and end positions written as start_post:end_post"}, + ), + pa.field( + "unique", + pa.int32(), + metadata={"description": "Unique peptide indicator, if the peptide maps to a single protein, the value is 1, otherwise 0"}, + ), + pa.field( + "protein_global_qvalue", + pa.float32(), + metadata={"description": "Global q-value of the protein group at the experiment level"}, + ), + pa.field( + "gg_accessions", + pa.list_(pa.string()), + metadata={"description": "Gene accessions, as string array"}, + ), + pa.field( + "gg_names", + pa.list_(pa.string()), + metadata={"description": "Gene names, as string array"}, + ), + + pa.field( + "precursor_charge", + pa.int32(), + metadata={"description": "Precursor charge"}, + ), + pa.field( + "observed_mz", + pa.float32(), + metadata={"description": "Experimental peptide mass-to-charge ratio of identified peptide (in Da)"}, + ), + pa.field( + "rt", + pa.float32(), + metadata={"description": "MS2 scan’s precursor retention time (in seconds)"}, + ), + pa.field( + "predicted_rt", + pa.float32(), + metadata={"description": "Predicted retention time of the peptide (in seconds)"}, + ), + pa.field( + "quantmsio_version", + pa.string(), + metadata={"description": "The version of quantms.io"}, + ) +] + +PSM_UNIQUE_FIELDS = [ + pa.field( + "reference_file_name", + pa.string(), + metadata={"description": "Spectrum file name with no path information and not including the file extension"}, + ), + pa.field( + "scan_number", + pa.string(), + metadata={"description": "Scan number of the spectrum"}, + ), + pa.field( + "ion_mobility", + pa.float32(), + metadata={"description": "Ion mobility value for the precursor ion"}, + ), + pa.field("num_peaks", pa.int32(), metadata={"description": "Number of peaks in the spectrum used for the peptide spectrum match"}), + pa.field( + "mz_array", + pa.list_(pa.float32()), + metadata={"description": "Array of m/z values for the spectrum used for the peptide spectrum match"}, + ), + pa.field( + "intensity_array", + pa.list_(pa.float32()), + metadata={"description": "Array of intensity values for the spectrum used for the peptide spectrum match"}, + ), + pa.field("rank", pa.int32(), metadata={"description": "Rank of the peptide spectrum match in the search engine output"}), + + pa.field( + "cv_params", + pa.list_( + pa.struct([ + ("name", pa.string()), + ("value", pa.string()) + ]) + ), + metadata={"description": "Optional list of CV parameters for additional metadata"}, + ), +] + + + + + + + + + + + + + + + + +PSM_FIELDS = PEPTIDE_FIELDS + PSM_UNIQUE_FIELDS \ No newline at end of file diff --git a/quantmsio/temp_core/mzTab.py b/quantmsio/temp_core/mzTab.py new file mode 100644 index 0000000..42b87e5 --- /dev/null +++ b/quantmsio/temp_core/mzTab.py @@ -0,0 +1,209 @@ +import codecs +import os +import re +import numpy as np +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq +from quantmsio.utils.pride_utils import get_quantmsio_modifications + +def fetch_modifications_from_mztab_line(line: str, _modifications: dict) -> dict: + """ + get the modifications from a mztab line. An mzTab modification could be a fixed or variable modification. + The structure of a fixed is the following: + MTD fixed_mod[1] [UNIMOD, UNIMOD:4, Carbamidomethyl, ] + MTD fixed_mod[1]-site C + MTD fixed_mod[1]-position Anywhere + while the structure of a variable modification is the following: + MTD var_mod[1] [UNIMOD, UNIMOD:21, Phospho, ] + MTD var_mod[1]-site S + MTD var_mod[1]-position Anywhere + + :param line: mztab line + :param _modifications: modifications dictionary + :return: modification dictionary + """ + line = line.strip() + line_parts = line.split("\t") + if line_parts[0] == "MTD" and "_mod[" in line_parts[1]: + if "site" not in line_parts[1] and "position" not in line_parts[1]: + values = line_parts[2].replace("[", "").replace("]", "").split(",") + accession = values[1].strip() + name = values[2].strip() + index = line_parts[1].split("[")[1].split("]")[0] + _modifications[accession] = [name, index, None, None] + elif "site" in line_parts[1]: + index = line_parts[1].split("[")[1].split("]")[0] + accession = None + for ( + key, + value, + ) in _modifications.items(): # for name, age in dictionary.iteritems(): (for Python 2.x) + if value[1] == index: + accession = key + if accession is None: + raise Exception("The accession for the modification is None") + _modifications[accession][2] = line_parts[2] + elif "position" in line_parts[1]: + index = line_parts[1].split("[")[1].split("]")[0] + accession = None + for key, value in _modifications.items(): + if value[1] == index: + accession = key + if accession is None: + raise Exception("The accession for the modification is None") + _modifications[accession][3] = line_parts[2] + return _modifications + +class MzTab: + def __init__(self,mzTab_path: str) -> None: + self.mztab_path = mzTab_path + # psm pos + self._psm_pos = None + # psm len + self._psm_len = None + # pep pos + self._pep_pos = None + # pep len + self._pep_len = None + # prt pos + self._prt_pos = None + # prt len + self._prt_len = None + # load psms columns + self._psms_columns = None + # load pep columns + self._pep_columns = None + + def __get_pos(self, header): + if header == "PSH" and self._pep_pos is not None: + return self._pep_pos + self._pep_len - 1 + elif header == "PEH" and self._prt_pos is not None: + return self._prt_pos + self._prt_len - 1 + else: + return 0 + + def __extract_len(self, header): + map_tag = {"PSH": "PSM", "PEH": "PEP", "PRH": "PRT"} + if os.stat(self.mztab_path).st_size == 0: + raise ValueError("File is empty") + f = open(self.mztab_path) + pos = self.__get_pos(header) + f.seek(pos) + line = f.readline() + while not line.startswith(header): + pos = f.tell() + line = f.readline() + + if header == "PSH": + self._psms_columns = line.split("\n")[0].split("\t") + if header == 'PEH': + self._pep_columns = line.split('\n')[0].split('\t') + + line = f.readline() + fle_len = 0 + while line.startswith(map_tag[header]): + fle_len += 1 + line = f.readline() + f.close() + return fle_len, pos + + def __load_second(self, header, **kwargs): + f = open(self.mztab_path) + if header == "PSH": + f.seek(self._psm_pos) + return pd.read_csv(f, nrows=self._psm_len, **kwargs) + elif header == "PEH": + f.seek(self._pep_pos) + return pd.read_csv(f, nrows=self._pep_len, **kwargs) + else: + f.seek(self._prt_pos) + return pd.read_csv(f, nrows=self._prt_len, **kwargs) + + def __set_table_config(self, header, length, pos): + if header == "PSH": + self._psm_pos = pos + self._psm_len = length + elif header == "PEH": + self._pep_pos = pos + self._pep_len = length + else: + self._prt_pos = pos + self._prt_len = length + + def skip_and_load_csv(self, header, **kwargs): + if self._psm_pos is not None and header == "PSH": + return self.__load_second(header, **kwargs) + if self._pep_pos is not None and header == "PEH": + return self.__load_second(header, **kwargs) + if self._prt_pos is not None and header == "PRH": + return self.__load_second(header, **kwargs) + fle_len, pos = self.__extract_len(header) + if os.stat(self.mztab_path).st_size == 0: + raise ValueError("File is empty") + f = open(self.mztab_path) + f.seek(pos) + self.__set_table_config(header, fle_len, pos) + return pd.read_csv(f, nrows=fle_len, sep='\t', **kwargs) + + def extract_ms_runs(self): + if os.stat(self.mztab_path).st_size == 0: + raise ValueError("File is empty") + f = codecs.open(self.mztab_path, "r", "utf-8") + line = f.readline() + ms_runs = {} + while line.split("\t")[0] == "MTD": + if line.split("\t")[1].split("-")[-1] == "location": + ms_runs[line.split("\t")[1].split("-")[0]] = line.split("\t")[2].split("//")[-1].split(".")[0] + line = f.readline() + f.close() + return ms_runs + + def get_protein_map(self, protein_str=None): + """ + return: a dict about protein score + """ + prt = self.skip_and_load_csv( + "PRH", + usecols=["ambiguity_members", "best_search_engine_score[1]"], + ) + if protein_str: + prt = prt[prt["ambiguity_members"].str.contains(f"{protein_str}", na=False)] + prt_score = prt.groupby("ambiguity_members").min() + protein_map = prt_score.to_dict()["best_search_engine_score[1]"] + return protein_map + + def get_software_score(self,name,score) -> dict: + return {'name':name,'score':score} + + def generate_positions(self,start,end) -> list: + start = start.split(',') + end = end.split(',') + return [start + ':' + end for start,end in zip(start,end)] + + def get_modifications(self): + if os.stat(self.mztab_path).st_size == 0: + raise ValueError("File is empty") + f = codecs.open(self.mztab_path, "r", "utf-8") + line = f.readline() + mod_dict = {} + while line.split("\t")[0] == "MTD": + if "_mod[" in line: + mod_dict = fetch_modifications_from_mztab_line(line, mod_dict) + line = f.readline() + f.close() + return mod_dict + + def _generate_modification_list(self, modification_str: str): + + if pd.isna(modification_str): + return None + modifications = get_quantmsio_modifications(modification_str, self._modifications) + modifications_string = "" + for key, value in modifications.items(): + modifications_string += "|".join(map(str, value["position"])) + modifications_string = modifications_string + "-" + value["unimod_accession"] + "," + modifications_string = modifications_string[:-1] # Remove last comma + modification_list = modifications_string.split(",") + + return modification_list \ No newline at end of file diff --git a/quantmsio/temp_core/psm.py b/quantmsio/temp_core/psm.py new file mode 100644 index 0000000..a90f4c1 --- /dev/null +++ b/quantmsio/temp_core/psm.py @@ -0,0 +1,98 @@ +import pyarrow as pa +import pyarrow.parquet as pq +from quantmsio.utils.file_utils import extract_protein_list +from quantmsio.utils.pride_utils import generate_scan_number +from quantmsio.utils.pride_utils import get_peptidoform_proforma_version_in_mztab +from quantmsio.temp_core.common import PSM_USECOLS,PSM_MAP +from quantmsio.temp_core.mzTab import MzTab +from quantmsio.temp_core.format import PSM_FIELDS +import pandas as pd + +PSM_SCHEMA = pa.schema( + PSM_FIELDS, + metadata={"description": "psm file in quantms.io format"}, +) + +class PsmInMemory(MzTab): + def __init__(self,mzTab_path): + super(PsmInMemory,self).__init__(mzTab_path) + self._ms_runs = self.extract_ms_runs() + self._protein_global_qvalue_map = self.get_protein_map() + self._modifications = self.get_modifications() + + def generate_report(self,chunksize=1000000,protein_str=None): + for df in self.skip_and_load_csv('PSH',chunksize=chunksize): + if protein_str: + df = df[df["accession"].str.contains(f"{protein_str}", na=False)] + no_cols = set(PSM_USECOLS) - set(df.columns) + for col in no_cols: + if col=='unique': + df.loc[:,col] = df['accession'].apply(lambda x: 0 if ';' in x else 1) + else: + df.loc[:,col] = None + df = df[PSM_USECOLS] + df.rename(columns=PSM_MAP,inplace=True) + self.transform_psm(df) + self.add_addition_msg(df) + df = self.convert_to_parquet(df) + yield df + + def transform_psm(self,df): + df.loc[:, 'pg_positions'] = df[['start','end']].apply( + lambda row: self.generate_positions(row["start"], row["end"]), + axis=1 + ) + df.loc[:, "scan_number"] = df["spectra_ref"].apply(lambda x: generate_scan_number(x)) + + df.loc[:, "reference_file_name"] = df["spectra_ref"].apply(lambda x: self._ms_runs[x[:x.index(':')]]) + df.loc[:, "additional_scores"] = None + df.loc[:,'peptidoform'] = df[["modifications", "sequence"]].apply( + lambda row: get_peptidoform_proforma_version_in_mztab(row["sequence"], row["modifications"], self._modifications), + axis=1 + ) + df.drop(['start','end',"spectra_ref","search_engine","search_engine_score[1]"], inplace=True, axis=1) + + def add_addition_msg(self,df): + df.loc[:,"protein_global_qvalue"] = df['pg_accessions'].map(self._protein_global_qvalue_map) + df.loc[:,"modification_details"] = None + df.loc[:,"predicted_rt"] = None + df.loc[:,"gg_accessions"] = None + df.loc[:,"gg_names"] = None + df.loc[:,"ion_mobility"] = None + df.loc[:,"num_peaks"] = None + df.loc[:,"mz_array"] = None + df.loc[:,"intensity_array"] = None + df.loc[:,"rank"] = None + df.loc[:,"cv_params"] = None + df.loc[:,"quantmsio_version"] = None + + def write_feature_to_file(self,output_path, chunksize=1000000, protein_file=None): + protein_list = extract_protein_list(protein_file) if protein_file else None + protein_str = "|".join(protein_list) if protein_list else None + pqwriter = None + for p in self.generate_report(chunksize=chunksize,protein_str=protein_str): + if not pqwriter: + pqwriter = pq.ParquetWriter(output_path, p.schema) + pqwriter.write_table(p) + if pqwriter: + pqwriter.close() + + def convert_to_parquet(self, res): + res["pg_accessions"] = res["pg_accessions"].str.split(";") + res["protein_global_qvalue"] = res["protein_global_qvalue"].astype(float) + res["unique"] = res["unique"].astype("Int32") + res["modifications"] = res["modifications"].apply(lambda x: self._generate_modification_list(x)) + res["precursor_charge"] = res["precursor_charge"].map(lambda x: None if pd.isna(x) else int(x)).astype("Int32") + res["calculated_mz"] = res["calculated_mz"].astype(float) + res["observed_mz"] = res["observed_mz"].astype(float) + res["posterior_error_probability"] = res["posterior_error_probability"].astype(float) + res["global_qvalue"] = res["global_qvalue"].astype(float) + res["is_decoy"] = res["is_decoy"].map(lambda x: None if pd.isna(x) else int(x)).astype("Int32") + + res["scan_number"] = res["scan_number"].astype(str) + + if "rt" in res.columns: + res["rt"] = res["rt"].astype(float) + else: + res.loc[:, "rt"] = None + return pa.Table.from_pandas(res, schema=PSM_SCHEMA) \ No newline at end of file diff --git a/tests/test_new_psm.py b/tests/test_new_psm.py new file mode 100644 index 0000000..7274e06 --- /dev/null +++ b/tests/test_new_psm.py @@ -0,0 +1,10 @@ +from .common import datafile +from unittest import TestCase +from quantmsio.temp_core.psm import PsmInMemory +class TestPSMHandler(TestCase): + + def test_convert_mztab_to_feature(self): + mztab_path = datafile("DDA-plex/MSV000079033.mzTab") + psm = PsmInMemory(mztab_path) + for _ in psm.generate_report(): + print("ok") \ No newline at end of file