Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/dev' into dev
Browse files Browse the repository at this point in the history
  • Loading branch information
ypriverol committed Oct 23, 2024
2 parents b99477c + 3234963 commit 1a279ef
Show file tree
Hide file tree
Showing 8 changed files with 130 additions and 97 deletions.
2 changes: 1 addition & 1 deletion quantmsio/commands/psm_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def convert_psm_file(

psm_manager = Psm(mzTab_path=mztab_file)
output_path = output_folder + "/" + create_uuid_filename(output_prefix_file, ".psm.parquet")
psm_manager.write_feature_to_file(output_path=output_path, chunksize=chunksize, protein_file=protein_file)
psm_manager.write_psm_to_file(output_path=output_path, chunksize=chunksize, protein_file=protein_file)


@click.command("compare-set-psms", short_help="plot venn for a set of Psms parquet")
Expand Down
17 changes: 14 additions & 3 deletions quantmsio/core/common.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from quantmsio import __version__

from quantmsio.core.format import PSM_FIELDS, FEATURE_FIELDS
import pyarrow as pa
PSM_MAP = {
"sequence": "sequence",
"modifications": "modifications",
"opt_global_cv_MS:1000889_peptidoform_sequence": "peptidoform",
"opt_global_Posterior_Error_Probability_score": "posterior_error_probability",
"opt_global_q-value": "global_qvalue",
#"opt_global_q-value": "global_qvalue",
"opt_global_cv_MS:1002217_decoy_peptide": "is_decoy",
"calc_mass_to_charge": "calculated_mz",
"accession": "mp_accessions",
"unique": "unique",
#"unique": "unique",
"charge": "precursor_charge",
"exp_mass_to_charge": "observed_mz",
"retention_time": "rt",
Expand Down Expand Up @@ -75,3 +77,12 @@
MAXQUANT_USECOLS = list(MAXQUANT_MAP.keys())

QUANTMSIO_VERSION = __version__

PSM_SCHEMA = pa.schema(
PSM_FIELDS,
metadata={"description": "psm file in quantms.io format"},
)
FEATURE_SCHEMA = pa.schema(
FEATURE_FIELDS,
metadata={"description": "feature file in quantms.io format"},
)
6 changes: 0 additions & 6 deletions quantmsio/core/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,6 @@
)
from quantmsio.utils.constants import ITRAQ_CHANNEL, TMT_CHANNELS
from quantmsio.core.common import MSSTATS_MAP, MSSTATS_USECOLS, SDRF_USECOLS, SDRF_MAP, QUANTMSIO_VERSION
from quantmsio.core.format import FEATURE_FIELDS

FEATURE_SCHEMA = pa.schema(
FEATURE_FIELDS,
metadata={"description": "feature file in quantms.io format"},
)


class Feature(MzTab):
Expand Down
64 changes: 32 additions & 32 deletions quantmsio/core/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,6 @@
),
pa.field(
"modifications",
pa.list_(pa.string()),
metadata={"description": "List of modifications as string array, easy for search and filter"},
),
pa.field(
"modification_details",
pa.list_(
pa.struct(
[
Expand All @@ -43,11 +38,6 @@
pa.float32(),
metadata={"description": "Posterior error probability for the given peptide spectrum match"},
),
pa.field(
"global_qvalue",
pa.float32(),
metadata={"description": "Global q-value of the peptide or psm at the level of the experiment"},
),
pa.field(
"is_decoy",
pa.int32(),
Expand Down Expand Up @@ -77,18 +67,6 @@
pa.list_(pa.struct([("name", pa.string()), ("value", pa.float32())])),
metadata={"description": "List of structures, each structure contains two fields: name and value"},
),
pa.field(
"unique",
pa.int32(),
metadata={
"description": "Unique peptide indicator, if the peptide maps to a single protein, the value is 1, otherwise 0"
},
),
pa.field(
"pg_global_qvalue",
pa.float32(),
metadata={"description": "Global q-value of the protein group at the experiment level"},
),
pa.field(
"mp_accessions",
pa.list_(pa.string()),
Expand Down Expand Up @@ -117,21 +95,11 @@
]

PSM_UNIQUE_FIELDS = [
pa.field(
"consensus_support",
pa.float32(),
metadata={
"description": "Consensus support for the given peptide spectrum match, when multiple search engines are used"
},
),
pa.field(
"rt",
pa.float32(),
metadata={"description": "MS2 scan’s precursor retention time (in seconds)"},
),
pa.field(
"rank", pa.int32(), metadata={"description": "Rank of the peptide spectrum match in the search engine output"}
),
pa.field(
"ion_mobility",
pa.float32(),
Expand Down Expand Up @@ -278,3 +246,35 @@
PSM_FIELDS = PEPTIDE_FIELDS + PSM_UNIQUE_FIELDS

FEATURE_FIELDS = PEPTIDE_FIELDS + FEATURE_UNIQUE_FIELDS

# pa.field(
# "modifications",
# pa.list_(pa.string()),
# metadata={"description": "List of modifications as string array, easy for search and filter"},
# ),

# pa.field(
# "pg_global_qvalue",
# pa.float32(),
# metadata={"description": "Global q-value of the protein group at the experiment level"},
# ),

# pa.field(
# "consensus_support",
# pa.float32(),
# metadata={
# "description": "Consensus support for the given peptide spectrum match, when multiple search engines are used"
# },
# ),

# pa.field(
# "unique",
# pa.int32(),
# metadata={
# "description": "Unique peptide indicator, if the peptide maps to a single protein, the value is 1, otherwise 0"
# },
# ),

# pa.field(
# "rank", pa.int32(), metadata={"description": "Rank of the peptide spectrum match in the search engine output"}
# ),
20 changes: 1 addition & 19 deletions quantmsio/core/maxquant.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pyarrow.parquet as pq
from quantmsio.core.sdrf import SDRFHandler
from pyopenms import AASequence
from quantmsio.operate.tools import get_ahocorasick, get_modification_details
from quantmsio.operate.tools import get_ahocorasick, get_modification_details, get_mod_map
from pyopenms.Constants import PROTON_MASS_U
from quantmsio.utils.pride_utils import get_peptidoform_proforma_version_in_mztab
from quantmsio.core.common import MAXQUANT_MAP, MAXQUANT_USECOLS
Expand Down Expand Up @@ -49,24 +49,6 @@ def find_modification(peptide):
return original_mods


def get_mod_map(sdrf_path):
sdrf = pd.read_csv(sdrf_path, sep="\t", nrows=1)
mod_cols = [col for col in sdrf.columns if col.startswith("comment[modification parameters]")]
mod_map = {}
for col in mod_cols:
mod_msg = sdrf[col].values[0].split(";")
mod_dict = {k.split("=")[0]: k.split("=")[1] for k in mod_msg}
if "TA" in mod_dict:
mod = f"{mod_dict['NT']} ({mod_dict['TA']})"
elif "PP" in mod_dict:
mod = f"{mod_dict['NT']} ({mod_dict['PP']})"
else:
mod = mod_dict["NT"]
mod_map[mod] = mod_dict["AC"]

return mod_map


def generate_mods(row, mod_map):
mod_seq = row["Modified sequence"]
mod_p = find_modification(mod_seq)
Expand Down
39 changes: 38 additions & 1 deletion quantmsio/core/mztab.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import codecs
import os
import pandas as pd
import re
from pyopenms import ModificationsDB
from quantmsio.utils.pride_utils import get_quantmsio_modifications

from quantmsio.operate.tools import get_modification_details

def generate_modification_list(modification_str: str, modifications):

Expand Down Expand Up @@ -221,3 +223,38 @@ def get_modifications(self):
line = f.readline()
f.close()
return mod_dict

def get_mods_map(self):
if os.stat(self.mztab_path).st_size == 0:
raise ValueError("File is empty")
f = codecs.open(self.mztab_path, "r", "utf-8")
line = f.readline()
mods_map = {}
modifications_db = ModificationsDB()
while line.startswith("MTD"):
if "software" in line and "_modifications" in line:
parts = line.split("\t")
mods = parts[2].split(":")[1].strip().split(",")
for mod in mods:
match = re.search(r'\((.*?)\)', mod)
mod = re.search(r'^[a-zA-Z]+', mod)
if match:
site = match.group(1)
else:
site = "X"
mod = mod.group(0)
Mod = modifications_db.getModification(mod)
unimod = Mod.getUniModAccession()
mods_map[mod] = [unimod.upper(), site]
mods_map[unimod.upper()] = [mod, site]
line = f.readline()
f.close()
return mods_map

def generate_modifications_details(self, seq, mods_map, automaton):
seq = seq.replace('.','')
modification_details = get_modification_details(seq, mods_map, automaton)
if(len(modification_details) == 0):
return None
else:
return modification_details
46 changes: 17 additions & 29 deletions quantmsio/core/psm.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,35 +3,27 @@
from quantmsio.utils.file_utils import extract_protein_list
from quantmsio.utils.pride_utils import generate_scan_number
from quantmsio.utils.pride_utils import get_peptidoform_proforma_version_in_mztab
from quantmsio.core.common import PSM_USECOLS, PSM_MAP, QUANTMSIO_VERSION
from quantmsio.operate.tools import get_ahocorasick, get_modification_details, get_mod_map
from quantmsio.core.common import PSM_USECOLS, PSM_MAP, PSM_SCHEMA
from quantmsio.core.mztab import MzTab, generate_modification_list
from quantmsio.core.format import PSM_FIELDS
import pandas as pd

PSM_SCHEMA = pa.schema(
PSM_FIELDS,
metadata={"description": "psm file in quantms.io format"},
)


class Psm(MzTab):
def __init__(self, mzTab_path):
super(Psm, self).__init__(mzTab_path)
self._ms_runs = self.extract_ms_runs()
self._protein_global_qvalue_map = self.get_protein_map()
self._modifications = self.get_modifications()
self._score_names = self.get_score_names()
self._mods_map = self.get_mods_map()
self._automaton = get_ahocorasick(self._mods_map)

def iter_psm_table(self, chunksize=1000000, protein_str=None):
for df in self.skip_and_load_csv("PSH", chunksize=chunksize):
if protein_str:
df = df[df["accession"].str.contains(f"{protein_str}", na=False)]
no_cols = set(PSM_USECOLS) - set(df.columns)
for col in no_cols:
if col == "unique":
df.loc[:, col] = df["accession"].apply(lambda x: 0 if ";" in x else 1)
else:
df.loc[:, col] = None
df.loc[:, col] = None
df.rename(columns=PSM_MAP, inplace=True)
yield df

Expand All @@ -52,11 +44,12 @@ def generate_report(self, chunksize=1000000, protein_str=None):
for df in self.iter_psm_table(chunksize=chunksize, protein_str=protein_str):
self.transform_psm(df)
self.add_addition_msg(df)
self.convert_to_parquet_format(df, self._modifications)
self.convert_to_parquet_format(df)
df = self.transform_parquet(df)
yield df

def transform_psm(self, df):
modifications = df["peptidoform"].apply(lambda seq: self.generate_modifications_details(seq, self._mods_map, self._automaton))
df.loc[:, "scan"] = df["spectra_ref"].apply(generate_scan_number)

df.loc[:, "reference_file_name"] = df["spectra_ref"].apply(lambda x: self._ms_runs[x[: x.index(":")]])
Expand All @@ -65,10 +58,11 @@ def transform_psm(self, df):
)
df.loc[:, "peptidoform"] = df[["modifications", "sequence"]].apply(
lambda row: get_peptidoform_proforma_version_in_mztab(
row["sequence"], row["modifications"], self._modifications
row["sequence"], row["modifications"], self._mods_map
),
axis=1,
)
df.loc[:, "modifications"] = modifications
df.drop(["spectra_ref", "search_engine", "search_engine_score[1]"], inplace=True, axis=1)

@staticmethod
Expand All @@ -83,19 +77,15 @@ def _genarate_additional_scores(self, cols):
return struct_list

def add_addition_msg(self, df):
df.loc[:, "pg_global_qvalue"] = df["mp_accessions"].map(self._protein_global_qvalue_map)
df.loc[:, "cv_params"] = None
df.loc[:, "best_id_score"] = None
df.loc[:, "consensus_support"] = None
df.loc[:, "modification_details"] = None
df.loc[:, "predicted_rt"] = None
df.loc[:, "ion_mobility"] = None
df.loc[:, "number_peaks"] = None
df.loc[:, "mz_array"] = None
df.loc[:, "intensity_array"] = None
df.loc[:, "rank"] = None
df.loc[:, "cv_params"] = None

def write_feature_to_file(self, output_path, chunksize=1000000, protein_file=None):
def write_psm_to_file(self, output_path, chunksize=1000000, protein_file=None):
protein_list = extract_protein_list(protein_file) if protein_file else None
protein_str = "|".join(protein_list) if protein_list else None
pqwriter = None
Expand All @@ -107,22 +97,20 @@ def write_feature_to_file(self, output_path, chunksize=1000000, protein_file=Non
pqwriter.close()

@staticmethod
def convert_to_parquet_format(res, modifications):
def convert_to_parquet_format(res):
res["mp_accessions"] = res["mp_accessions"].str.split(";")
res["pg_global_qvalue"] = res["pg_global_qvalue"].astype(float)
res["unique"] = res["unique"].astype("Int32")
res["modifications"] = res["modifications"].apply(lambda x: generate_modification_list(x, modifications))
res["precursor_charge"] = res["precursor_charge"].map(lambda x: None if pd.isna(x) else int(x)).astype("Int32")
res["calculated_mz"] = res["calculated_mz"].astype(float)
res["observed_mz"] = res["observed_mz"].astype(float)
res["posterior_error_probability"] = res["posterior_error_probability"].astype(float)
res["global_qvalue"] = res["global_qvalue"].astype(float)
res["is_decoy"] = res["is_decoy"].map(lambda x: None if pd.isna(x) else int(x)).astype("Int32")

res["scan"] = res["scan"].astype(str)

if "rt" in res.columns:
res["rt"] = res["rt"].astype(float)
else:
res.loc[:, "rt"] = None
# return pa.Table.from_pandas(res, schema=PSM_SCHEMA)

#df.loc[:, "pg_global_qvalue"] = df["mp_accessions"].map(self._protein_global_qvalue_map)
#res["pg_global_qvalue"] = res["pg_global_qvalue"].astype(float)
#res["unique"] = res["unique"].astype("Int32")
#res["global_qvalue"] = res["global_qvalue"].astype(float)
Loading

0 comments on commit 1a279ef

Please sign in to comment.