diff --git a/docs/include/example.feature.parquet b/docs/include/example.feature.parquet new file mode 100644 index 0000000..f81bb98 Binary files /dev/null and b/docs/include/example.feature.parquet differ diff --git a/docs/score_map_msg.tsv b/docs/score_map_msg.tsv index 5b30cd2..4a32d65 100644 --- a/docs/score_map_msg.tsv +++ b/docs/score_map_msg.tsv @@ -1,3 +1,6 @@ -type field desc map accession -maxquant Score Andromeda:score andromeda_score MS:1002338 -maxquant Delta score Andromeda: delta score andromeda_delta_score MS:1003433 +software field desc map accession +andromeda Score Andromeda:score andromeda_score MS:1002338 +andromeda Delta score Andromeda: delta score andromeda_delta_score MS:1003433 +percolator percolator:score percolator_score MS:1001492 +openMS OpenMS:Best PSM Score openms_score MS:1003114 +MS-GF MS-GF:RawScore msgf_score MS:1002049 diff --git a/quantmsio/core/common.py b/quantmsio/core/common.py index 85fe522..6304ece 100644 --- a/quantmsio/core/common.py +++ b/quantmsio/core/common.py @@ -58,11 +58,16 @@ "Q.Value": "qvalue", "PG.Q.Value": "pg_qvalue", "Precursor.Normalised": "normalize_intensity", + "PG.MaxLFQ": "lfq", + "Quantity.Quality": "quantitative_score", "Precursor.Charge": "precursor_charge", "Stripped.Sequence": "sequence", "Modified.Sequence": "peptidoform", "Genes": "gg_names", + "Run": "run" } +DIANN_USECOLS = list(DIANN_MAP.keys()) + MAXQUANT_PSM_MAP = { "Sequence": "sequence", "Proteins": "mp_accessions", diff --git a/quantmsio/core/diann.py b/quantmsio/core/diann.py index 805b789..11b0360 100644 --- a/quantmsio/core/diann.py +++ b/quantmsio/core/diann.py @@ -15,9 +15,9 @@ from quantmsio.core.feature import Feature from quantmsio.core.duckdb import DuckDB from quantmsio.utils.pride_utils import generate_scan_number -from quantmsio.core.common import DIANN_MAP - +from quantmsio.core.common import DIANN_MAP, DIANN_USECOLS +DIANN_SQL = ', '.join([f'"{name}"' for name in DIANN_USECOLS]) class DiaNNConvert(DuckDB): def __init__(self, diann_report, sdrf_path, duckdb_max_memory="16GB", duckdb_threads=4): @@ -42,11 +42,11 @@ def get_report_from_database(self, runs: list) -> pd.DataFrame: s = time.time() database = self._duckdb.query( """ - select "File.Name", "Run", "Protein.Group", "Protein.Ids", "Genes", "RT", "Predicted.RT", "RT.Start", "RT.Stop", "Precursor.Id", "Q.Value", "Global.Q.Value", "PEP", - "PG.Q.Value", "Global.PG.Q.Value", "Modified.Sequence", "Stripped.Sequence", "Precursor.Charge", "Precursor.Quantity", "Precursor.Normalised" - from report + select {} + from report where Run IN {} """.format( + DIANN_SQL, tuple(runs) ) ) @@ -101,7 +101,7 @@ def intergrate_msg(n): sep="\t", usecols=["Retention_Time", "SpectrumID", "Exp_Mass_To_Charge"], ) - group = report[report["Run"] == n].copy() + group = report[report["run"] == n].copy() group.sort_values(by="rt_start", inplace=True) target.rename( columns={ @@ -115,8 +115,6 @@ def intergrate_msg(n): res = pd.merge_asof(group, target, on="rt_start", direction="nearest") return res - # query duckdb - # best_ref_map = self.get_peptide_map_from_database() masses_map, modifications_map = self.get_masses_and_modifications_map() info_list = [ @@ -183,31 +181,36 @@ def add_additional_msg(self, report: pd.DataFrame): report["pg_accessions"] = report["pg_accessions"].str.split(";") report.loc[:, "anchor_protein"] = report["pg_accessions"].str[0] report.loc[:, "gg_names"] = report["gg_names"].str.split(",") - report.loc[:, "additional_intensities"] = report[ - ["reference_file_name", "channel", "normalize_intensity"] + ["reference_file_name", "channel", "normalize_intensity", "lfq"] ].apply( lambda rows: [ { "sample_accession": self._sample_map[rows["reference_file_name"] + "-" + rows["channel"]], "channel": rows["channel"], "additional_intensity": [ - {"intensity_name": "normalize_intensity", "intensity_value": rows["normalize_intensity"]} + {"intensity_name": "normalize_intensity", "intensity_value": rows["normalize_intensity"]}, + {"intensity_name": "lfq", "intensity_value": rows["lfq"]} ], } ], axis=1, ) - report.loc[:, "additional_scores"] = report[["qvalue", "pg_qvalue"]].apply( + report.loc[:, "additional_scores"] = report[["qvalue", "pg_qvalue", "global_qvalue"]].apply( lambda row: [ {"score_name": "qvalue", "score_value": row["qvalue"]}, {"score_name": "pg_qvalue", "score_value": row["pg_qvalue"]}, + {"score_name": "global_qvalue", "score_value": row["global_qvalue"]} ], axis=1, ) + report.loc[:, "cv_params"] = report[["quantitative_score"]].apply( + lambda rows: [ + {"cv_name": "quantitative_score", "cv_value": str(rows["quantitative_score"])} + ], + axis=1 + ) report.loc[:, "scan_reference_file_name"] = None - report.loc[:, "scan"] = None - report.loc[:, "cv_params"] = None report.loc[:, "gg_accessions"] = None report.loc[:, "ion_mobility"] = None report.loc[:, "start_ion_mobility"] = None diff --git a/quantmsio/core/psm.py b/quantmsio/core/psm.py index f7402ca..163bc21 100644 --- a/quantmsio/core/psm.py +++ b/quantmsio/core/psm.py @@ -1,3 +1,4 @@ +import re import pyarrow as pa import pyarrow.parquet as pq from quantmsio.utils.file_utils import extract_protein_list @@ -87,7 +88,9 @@ def transform_parquet(df): def _genarate_additional_scores(self, cols): struct_list = [] for software, score in self._score_names.items(): - struct = {"score_name": f"search_engine_score({software})", "score_value": cols[score]} + software = re.sub(r"[^a-zA-Z0-9\s]", "", software) + software = software.lower() + struct = {"score_name": f"{software}_score", "score_value": cols[score]} struct_list.append(struct) if cols["global_qvalue"]: struct = {"score_name": "global_qvalue", "score_value": cols["global_qvalue"]} @@ -95,7 +98,6 @@ def _genarate_additional_scores(self, cols): return struct_list def add_addition_msg(self, df): - df.loc[:, "cv_params"] = None df.loc[:, "predicted_rt"] = None df.loc[:, "ion_mobility"] = None df.loc[:, "number_peaks"] = None diff --git a/quantmsio/operate/tools.py b/quantmsio/operate/tools.py index 7e4b32c..940f10e 100644 --- a/quantmsio/operate/tools.py +++ b/quantmsio/operate/tools.py @@ -79,7 +79,7 @@ def generate_feature_of_gene( pqwriters, pqwriter_no_part = save_parquet_file( partitions, table, output_folder, filename, pqwriters, pqwriter_no_part ) - close_file(partitions, pqwriters, pqwriter_no_part) + close_file(pqwriters, pqwriter_no_part) def map_protein_for_tsv(path: str, fasta: str, output_path: str, map_parameter: str):