Skip to content

Commit

Permalink
update: mq
Browse files Browse the repository at this point in the history
  • Loading branch information
zprobot committed Oct 17, 2024
1 parent 9f990ce commit 8c451e0
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 113 deletions.
65 changes: 25 additions & 40 deletions docs/README.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -103,48 +103,33 @@ Those three properties can be combined, for example, in a string like one string
When represented in parquet files <<psm>>, <<feature>>, modification details will be a list of struct:

```json
{
"name": "modifications",
"type": {
"type": "array",
"items": {
"type": "record",
"name": "modification_details",
"fields": [
{
"name": "accession",
"type": "string",
"doc": "Accession number of the modification (e.g., UNIMOD:35)"
},
{
"name": "positions",
"type": {
"type": "array",
"items": {
"type": "record",
"name": "PositionDetails",
"fields": [
{
"name": "position",
"type": "int",
"doc": "Position of the modification on the peptide"
},
{
"name": "localization_probability",
"type": "float",
"doc": "Probability that the modification is localized at this position"
}
]
}
},
"doc": "Positions and corresponding localization probabilitie"
}
]
[{
"name": "UNIMOD:35",
"fields": [
{
"position": 2,
"localization_probability": 0.94
},
{
"position": 12,
"localization_probability": 0.06
}
  ]
},
{
"name": "UNIMOD:0",
"fields": [
{
"position": 0,
"localization_probability": 0.92
},
{
"position": 16,
"localization_probability": 0.08
}
]
}

]
```

[[scan]]
Expand Down
11 changes: 4 additions & 7 deletions quantmsio/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,20 +61,17 @@
}
MAXQUANT_MAP = {
"Sequence": "sequence",
"Proteins": "pg_accessions",
"Proteins": "mp_accessions",
"PEP": "posterior_error_probability",
"Modifications": "modifications",
"Reverse": "is_decoy",
"m/z": "calculated_mz",
"MS/MS m/z": "observed_mz",
"Gene names": "gg_names",
"Calibrated retention time": "rt",
"Calibrated retention time start": "rt_start",
"Calibrated retention time finish": "rt_stop",
"Scan number": "scan",
"Retention time": "rt",
"Charge": "precursor_charge",
"Modified sequence": "peptidoform",
"Raw file": "reference_file_name",
"Intensity": "intensity",
"Score": "additional_scores"
}


Expand Down
80 changes: 45 additions & 35 deletions quantmsio/core/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
),
pa.field(
"modification_details",
pa.list_(pa.string()),
pa.list_(pa.struct([("name", pa.string()), ("fields", pa.list_(pa.struct([("position", pa.int32()), ("localization_probability", pa.float32())])))])),
metadata={
"description": "List of alternative site probabilities for the modification format: read the specification for more details"
},
Expand Down Expand Up @@ -46,20 +46,30 @@
},
),
pa.field(
"additional_scores",
"best_id_score",
pa.list_(pa.struct([("name", pa.string()), ("value", pa.float32())])),
metadata={"description": "List of structures, each structure contains two fields: name and value"},
metadata={"description": "A named score type and value representing an identification's measure of confidence or input feature"},
),
pa.field(
"pg_accessions",
pa.list_(pa.string()),
metadata={"description": "Protein group accessions of all the proteins that the peptide maps to"},
"additional_scores",
pa.list_(pa.struct([("name", pa.string()), ("value", pa.float32())])),
metadata={"description": "List of structures, each structure contains two fields: name and value"},
),
pa.field(
"pg_positions",
pa.list_(pa.string()),
metadata={"description": "Protein start and end positions written as start_post:end_post"},
),
"consensus_support",
pa.float32(),
metadata={"description": "Consensus support for the given peptide spectrum match, when multiple search engines are used"},
),
# pa.field(
# "pg_accessions",
# pa.list_(pa.string()),
# metadata={"description": "Protein group accessions of all the proteins that the peptide maps to"},
# ),
# pa.field(
# "pg_positions",
# pa.list_(pa.string()),
# metadata={"description": "Protein start and end positions written as start_post:end_post"},
# ),
pa.field(
"unique",
pa.int32(),
Expand All @@ -68,20 +78,25 @@
},
),
pa.field(
"protein_global_qvalue",
"pg_global_qvalue",
pa.float32(),
metadata={"description": "Global q-value of the protein group at the experiment level"},
),
pa.field(
"gg_accessions",
pa.list_(pa.string()),
metadata={"description": "Gene accessions, as string array"},
),
pa.field(
"gg_names",
"mp_accessions",
pa.list_(pa.string()),
metadata={"description": "Gene names, as string array"},
),
metadata={"description": "Protein accessions of all the proteins that the peptide maps to"},
),
# pa.field(
# "gg_accessions",
# pa.list_(pa.string()),
# metadata={"description": "Gene accessions, as string array"},
# ),
# pa.field(
# "gg_names",
# pa.list_(pa.string()),
# metadata={"description": "Gene names, as string array"},
# ),
pa.field(
"precursor_charge",
pa.int32(),
Expand All @@ -102,11 +117,6 @@
pa.float32(),
metadata={"description": "Predicted retention time of the peptide (in seconds)"},
),
pa.field(
"quantmsio_version",
pa.string(),
metadata={"description": "The version of quantms.io"},
),
]

PSM_UNIQUE_FIELDS = [
Expand All @@ -116,17 +126,25 @@
metadata={"description": "Spectrum file name with no path information and not including the file extension"},
),
pa.field(
"scan_number",
"scan",
pa.string(),
metadata={"description": "Scan number of the spectrum"},
metadata={"description": "Scan index (number of nativeId) of the spectrum identified"},
),
pa.field(
"rank", pa.int32(), metadata={"description": "Rank of the peptide spectrum match in the search engine output"}
),
pa.field(
"cv_params",
pa.list_(pa.struct([("name", pa.string()), ("value", pa.string())])),
metadata={"description": "Optional list of CV parameters for additional metadata"},
),
pa.field(
"ion_mobility",
pa.float32(),
metadata={"description": "Ion mobility value for the precursor ion"},
),
pa.field(
"num_peaks",
"number_peaks",
pa.int32(),
metadata={"description": "Number of peaks in the spectrum used for the peptide spectrum match"},
),
Expand All @@ -140,14 +158,6 @@
pa.list_(pa.float32()),
metadata={"description": "Array of intensity values for the spectrum used for the peptide spectrum match"},
),
pa.field(
"rank", pa.int32(), metadata={"description": "Rank of the peptide spectrum match in the search engine output"}
),
pa.field(
"cv_params",
pa.list_(pa.struct([("name", pa.string()), ("value", pa.string())])),
metadata={"description": "Optional list of CV parameters for additional metadata"},
),
]


Expand Down
109 changes: 86 additions & 23 deletions quantmsio/core/maxquant.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from quantmsio.utils.pride_utils import get_peptidoform_proforma_version_in_mztab
from quantmsio.core.common import QUANTMSIO_VERSION, MAXQUANT_MAP, MAXQUANT_USECOLS
from quantmsio.core.feature import Feature

from quantmsio.core.psm import Psm
# format the log entries
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)

Expand Down Expand Up @@ -76,17 +76,23 @@ def generate_mods(row, mod_map):


class MaxQuant:
def __init__(self, sdrf_path, evidence_path):
def __init__(self, sdrf_path, msms_path):
self._modifications = SDRFHandler(sdrf_path).get_mods_dict()
self._sdrf_path = sdrf_path
self._evidence_path = evidence_path
self._msms_path = msms_path
self.mods_map = get_mod_map(sdrf_path)

def iter_batch(self, chunksize: int = 100000):
col_df = pd.read_csv(self._msms_path, sep="\t", nrows=1)
cols = []
for key in self.mods_map.keys():
col = f"{key} Probabilities"
if col in col_df.columns:
cols.append(col)
for df in pd.read_csv(
self._evidence_path,
self._msms_path,
sep="\t",
usecols=MAXQUANT_USECOLS + ["Potential contaminant"],
usecols=MAXQUANT_USECOLS + cols,
low_memory=False,
chunksize=chunksize,
):
Expand All @@ -97,15 +103,49 @@ def open_from_zip_archive(self, zip_file, file_name):
"""Open file from zip archive."""
with zipfile.ZipFile(zip_file) as z:
with z.open(file_name) as f:
df = pd.read_csv(f, sep="\t", usecols=MAXQUANT_USECOLS + ["Potential contaminant"], low_memory=False)
df = pd.read_csv(f, sep="\t", usecols=MAXQUANT_USECOLS, low_memory=False)
return df

def generate_modification_details(self, df):
keys = {}
pattern = r'\((\d+\.?\d*)\)'
for key in self.mods_map.keys():
col = f"{key} Probabilities"
if col in df.columns:
keys[key] = col
def get_details(rows):
modification_details = []
for key, col in keys.items():
modification_map = {
'name': self.mods_map[key]
}
details = []
seq = rows[col]
if(not isinstance(seq, str)):
continue
match_obj = re.search(pattern, seq)
while(match_obj):
details.append({
"position": match_obj.start(0),
"localization_probability": float(match_obj.group(1))
})
seq = seq.replace(match_obj.group(0),'',1)
match_obj = re.search(pattern, seq)
modification_map['fields'] = details
modification_details.append(modification_map)
if len(modification_details) == 0:
return None
else:
return modification_details
if len(keys.values()) != 0:
df.loc[:, 'modification_details'] = df[keys.values()].apply(get_details,axis=1)
else:
df.loc[:, 'modification_details'] = None

def main_operate(self, df: pd.DataFrame):
df.loc[:, "Modifications"] = df[["Modified sequence", "Modifications"]].apply(
lambda row: generate_mods(row, self.mods_map), axis=1
)
df = df.query('`Potential contaminant`!="+"')
df = df.drop("Potential contaminant", axis=1)
df = df[df["PEP"] < 0.05]
df = df.rename(columns=MAXQUANT_MAP)
df.loc[:, "peptidoform"] = df[["sequence", "modifications"]].apply(
Expand All @@ -114,18 +154,27 @@ def main_operate(self, df: pd.DataFrame):
),
axis=1,
)
df.loc[:, "is_decoy"] = df["is_decoy"].map({None: "0", np.nan: "0", "+": "1"})
df["unique"] = df["pg_accessions"].apply(lambda x: "0" if ";" in str(x) else "1")
df.loc[:, "gg_names"] = df["gg_names"].str.split(",")
df.loc[:, "additional_scores"] = None
df.loc[:, "modification_details"] = None
df["is_decoy"] = df["is_decoy"].map({None: "0", np.nan: "0", "+": "1"})
self.generate_modification_details(df)
df.loc[:, 'unique'] = df["mp_accessions"].apply(lambda x: "0" if ";" in str(x) else "1")
df["additional_scores"] = df["additional_scores"].apply(lambda x: [{"name": "maxquant", "value": np.float32(x)}])
df.loc[:, "best_id_score"] = None
df.loc[:, "cv_params"] = None
df.loc[:, "quantmsio_version"] = QUANTMSIO_VERSION
df.loc[:, "gg_accessions"] = None
df.loc[:, "consensus_support"] = None
df.loc[:, "predicted_rt"] = None
df.loc[:, "channel"] = "LFQ"
df.loc[:, "global_qvalue"] = None
df.loc[:, "observed_mz"] = None
df.loc[:, "pg_global_qvalue"] = None
return df

@staticmethod
def transform_psm(df: pd.DataFrame):
df.loc[:, "intensity_array"] = None
df.loc[:, "ion_mobility"] = None
df.loc[:, "mz_array"] = None
df.loc[:, "number_peaks"] = None
df.loc[:, "rank"] = None

def transform_feature(self, df: pd.DataFrame):
df = self.merge_sdrf(df)
df.loc[:, "global_qvalue"] = None
Expand Down Expand Up @@ -157,11 +206,25 @@ def merge_sdrf(self, df: pd.DataFrame):
def convert_to_parquet(self, output_path: str, chunksize: int = None):
pqwriter = None
for df in self.iter_batch(chunksize=chunksize):
df = self.transform_feature(df)
Feature.convert_to_parquet_format(df, self._modifications)
feature = Feature.transform_feature(df)
self.transform_psm(df)
Psm.convert_to_parquet_format(df,self._modifications)
parquet = Psm.transform_parquet(df)
if not pqwriter:
pqwriter = pq.ParquetWriter(output_path, feature.schema)
pqwriter.write_table(feature)
if pqwriter:
pqwriter.close()
pqwriter = pq.ParquetWriter(output_path, parquet.schema)
pqwriter.write_table(parquet)
if pqwriter:
pqwriter.close()


# def convert_to_parquet(self, output_path: str, chunksize: int = None):
# pqwriter = None
# for df in self.iter_batch(chunksize=chunksize):
# df = self.transform_feature(df)
# Feature.convert_to_parquet_format(df, self._modifications)
# feature = Feature.transform_feature(df)
# if not pqwriter:
# pqwriter = pq.ParquetWriter(output_path, feature.schema)
# pqwriter.write_table(feature)
# if pqwriter:
# pqwriter.close()

Loading

0 comments on commit 8c451e0

Please sign in to comment.