Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
zprobot committed Dec 12, 2024
1 parent 8ae62c6 commit ae65a4d
Show file tree
Hide file tree
Showing 9 changed files with 305 additions and 28 deletions.
59 changes: 59 additions & 0 deletions quantmsio/commands/diann_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,62 @@ def diann_convert_to_parquet(
file_num=file_num,
protein_file=protein_file,
)


@click.command(
    "convert-diann-pg",
    short_help="Convert diann_report to pg file of quantms.io format",
)
@click.option(
    "--report_path",
    help="the diann report file path",
    required=True,
)
@click.option(
    "--output_folder",
    help="Folder where the Json file will be generated",
    required=True,
)
@click.option(
    "--output_prefix_file",
    help="Prefix of the Json file needed to generate the file name",
    required=False,
)
@click.option(
    "--duckdb_max_memory", help="The maximum amount of memory allocated by the DuckDB engine (e.g 4GB)", required=False
)
@click.option("--duckdb_threads", help="The number of threads for the DuckDB engine (e.g 4)", required=False)
@click.option(
    "--file_num",
    help="The number of files being processed at the same time",
    default=100,
)
def diann_pg_convert_to_parquet(
    report_path: str,
    output_folder: str,
    output_prefix_file: str,
    duckdb_max_memory: str,
    duckdb_threads: int,
    file_num: int,
):
    """Convert a DIA-NN report into a protein-group (pg) parquet file in quantms.io format.

    :param report_path: path to the DIA-NN report file
    :param output_folder: folder where the parquet file will be written
    :param output_prefix_file: optional prefix for the output file name (defaults to "pg")
    :param duckdb_max_memory: maximum memory allocated to the DuckDB engine (e.g. "4GB")
    :param duckdb_threads: number of threads for the DuckDB engine
    :param file_num: number of run files processed per batch
    """
    # BUG FIX: the original condition read `report_path is None is None`, a
    # chained comparison that only worked by accident (it reduces to
    # `report_path is None`).
    if report_path is None or output_folder is None:
        raise click.UsageError("Please provide all the required parameters")

    # exist_ok avoids a race between the exists() check and makedirs().
    os.makedirs(output_folder, exist_ok=True)

    if not output_prefix_file:
        output_prefix_file = "pg"
    filename = create_uuid_filename(output_prefix_file, ".pg.parquet")
    # os.path.join is safer than manual "/" concatenation across platforms.
    pg_output_path = os.path.join(output_folder, filename)

    # sdrf_path is not needed for the pg matrix; DiaNNConvert tolerates None.
    dia_nn = DiaNNConvert(
        diann_report=report_path,
        sdrf_path=None,
        duckdb_max_memory=duckdb_max_memory,
        duckdb_threads=duckdb_threads,
    )
    dia_nn.write_pg_matrix_to_file(
        output_path=pg_output_path,
        file_num=file_num,
    )
4 changes: 2 additions & 2 deletions quantmsio/core/ae.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
def get_ibaq_columns(path):
    """Return the column names from the header line of a tab-separated ibaq file.

    :param path: path to the ibaq file
    :return: list of column names parsed from the first line
    """
    with open(path) as f:
        line = f.readline()
    # Strip the trailing newline, then split on tabs (ibaq files are TSV;
    # the diff artifact duplicating an old comma-split return line is removed).
    return line.split("\n")[0].split("\t")


class AbsoluteExpressionHander:
Expand Down Expand Up @@ -66,7 +66,7 @@ def load_ibaq_file(self, path, protein_str=None):
for col in usecols:
if col not in ibaq_columns:
raise Exception(f"Not found {col} in ibaq file")
ibaqs = pd.read_csv(path, usecols=usecols)
ibaqs = pd.read_csv(path, usecols=usecols, sep="\t")
ibaqs.rename(columns=AbsoluteExpressionHander.LABEL_MAP, inplace=True)
if protein_str:
ibaqs = ibaqs[ibaqs["protein"].str.contains(f"{protein_str}", na=False)]
Expand Down
19 changes: 18 additions & 1 deletion quantmsio/core/common.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from quantmsio import __version__
from quantmsio.core.format import PSM_FIELDS, FEATURE_FIELDS, IBAQ_FIELDS
from quantmsio.core.format import PSM_FIELDS, FEATURE_FIELDS, IBAQ_FIELDS, PG_FIELDS
import pyarrow as pa

PSM_MAP = {
Expand Down Expand Up @@ -66,7 +66,19 @@
"Genes": "gg_names",
"Run": "run",
}
# Mapping from DIA-NN protein-group report columns to quantms.io pg field names.
# CONSISTENCY FIX: use double quotes throughout, matching DIANN_MAP and the
# rest of this module; added trailing comma per the file's dict style.
DIANN_PG_MAP = {
    "Protein.Group": "pg_accessions",
    "Protein.Names": "pg_names",
    "Genes": "gg_accessions",
    "Run": "reference_file_name",
    "Global.PG.Q.Value": "global_qvalue",
    "PG.Quantity": "intensity",
    "PG.Normalised": "normalize_intensity",
    "PG.MaxLFQ": "lfq",
    "PG.Q.Value": "qvalue",
}
# Column lists used to build the SELECT clauses of the DuckDB report queries.
DIANN_USECOLS = list(DIANN_MAP.keys())
DIANN_PG_USECOLS = list(DIANN_PG_MAP.keys())

MAXQUANT_PSM_MAP = {
"Sequence": "sequence",
Expand Down Expand Up @@ -135,3 +147,8 @@
IBAQ_FIELDS,
metadata={"description": "ibaq file in quantms.io format"},
)
# Arrow schema for protein-group (pg) parquet files in quantms.io format.
PG_SCHEMA = pa.schema(
    PG_FIELDS,
    metadata={"description": "PG file in quantms.io format"},
)

85 changes: 70 additions & 15 deletions quantmsio/core/diann.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
import numpy as np
import pandas as pd
import os
import pyarrow as pa
import pyarrow.parquet as pq
import concurrent.futures
from pathlib import Path
from collections import defaultdict
from pyopenms import AASequence
from pyopenms.Constants import PROTON_MASS_U
from quantmsio.operate.tools import get_ahocorasick
Expand All @@ -15,27 +17,22 @@
from quantmsio.core.feature import Feature
from quantmsio.core.duckdb import DuckDB
from quantmsio.utils.pride_utils import generate_scan_number
from quantmsio.core.common import DIANN_MAP, DIANN_USECOLS
from quantmsio.core.common import DIANN_MAP, DIANN_USECOLS, DIANN_PG_MAP, DIANN_PG_USECOLS, PG_SCHEMA

DIANN_SQL = ", ".join([f'"{name}"' for name in DIANN_USECOLS])

DIANN_PG_SQL = ", ".join([f'"{name}"' for name in DIANN_PG_USECOLS])

class DiaNNConvert(DuckDB):

def __init__(self, diann_report, sdrf_path, duckdb_max_memory="16GB", duckdb_threads=4):
def __init__(self, diann_report, sdrf_path=None, duckdb_max_memory="16GB", duckdb_threads=4):
super(DiaNNConvert, self).__init__(diann_report, duckdb_max_memory, duckdb_threads)
self.init_duckdb()
self._sdrf = SDRFHandler(sdrf_path)
self._mods_map = self._sdrf.get_mods_dict()
self._automaton = get_ahocorasick(self._mods_map)
self._sample_map = self._sdrf.get_sample_map_run()

def init_duckdb(self):

self._duckdb.execute("""CREATE INDEX idx_precursor_q ON report ("Precursor.Id", "Q.Value")""")
self._duckdb.execute("""CREATE INDEX idx_run ON report ("Run")""")
if sdrf_path:
self._sdrf = SDRFHandler(sdrf_path)
self._mods_map = self._sdrf.get_mods_dict()
self._automaton = get_ahocorasick(self._mods_map)
self._sample_map = self._sdrf.get_sample_map_run()

def get_report_from_database(self, runs: list) -> pd.DataFrame:
def get_report_from_database(self, runs: list, sql:str = DIANN_SQL) -> pd.DataFrame:
"""
This function loads the report from the duckdb database for a group of ms_runs.
:param runs: A list of ms_runs
Expand All @@ -48,7 +45,7 @@ def get_report_from_database(self, runs: list) -> pd.DataFrame:
from report
where Run IN {}
""".format(
DIANN_SQL, tuple(runs)
sql, tuple(runs)
)
)
report = database.df()
Expand Down Expand Up @@ -90,6 +87,46 @@ def get_peptide_map_from_database(self):
logging.info("Time to load peptide map {} seconds".format(et))
return best_ref_map

def get_peptide_count(self, df):
    """Count, per protein accession, how many report rows reference it.

    :param df: frame with a "pg_accessions" column of ";"-joined accessions
    :return: defaultdict mapping protein accession -> number of rows mentioning it
    """
    counts = defaultdict(int)
    # Flatten every ";"-joined accession group into individual accessions.
    all_accessions = (
        accession
        for group in df["pg_accessions"]
        for accession in group.split(";")
    )
    for accession in all_accessions:
        counts[accession] += 1
    return counts

def generate_pg_matrix(self, report):
    """Transform a renamed DIA-NN pg report slice into quantms.io pg rows.

    Expects `report` to already use quantms.io column names (see DIANN_PG_MAP).
    Mutates `report` in place and returns it.

    :param report: pandas DataFrame slice for one reference file
    :return: the same DataFrame with pg columns reshaped for PG_SCHEMA
    """
    # Count rows per protein BEFORE deduplicating, so every row of the
    # report contributes to the per-protein count.
    peptide_count = self.get_peptide_count(report)
    report.drop_duplicates(subset=["pg_accessions"],inplace=True)
    # ";"-joined accession strings become string arrays (list columns).
    report["gg_accessions"] = report["gg_accessions"].str.split(";")
    report["pg_names"] = report["pg_names"].str.split(";")
    # Keep only the part of the run name before the first "." (drops extension).
    report["reference_file_name"] = report["reference_file_name"].apply(lambda x: x.split(".")[0])
    # NOTE: split pg_accessions last — peptide_count keys were built from the
    # joined string form above.
    report["pg_accessions"] = report["pg_accessions"].str.split(";")
    # One {protein_name, peptide_count} struct per protein in the group.
    report.loc[:,"peptides"] = report["pg_accessions"].apply(
        lambda proteins: [{
            "protein_name": protein,
            "peptide_count": peptide_count[protein]
        } for protein in proteins]
    )
    # DIA-NN reports targets only here, so is_decoy is constant 0.
    report.loc[:, "is_decoy"] = 0
    # Pack the normalized and MaxLFQ intensities into the
    # additional_intensities struct-list column expected by PG_SCHEMA.
    report.loc[:, "additional_intensities"] = report[["normalize_intensity","lfq"]].apply(
        lambda rows: [
            {
                "intensity_name": name,
                "intensity_value": rows[name]
            } for name in ["normalize_intensity","lfq"]
        ],
        axis=1,
    )
    # Per-run PG q-value goes into the generic additional_scores column.
    report.loc[:, "additional_scores"] = report["qvalue"].apply(
        lambda value: [{
            "score_name": "qvalue",
            "score_value": value
        }]
    )
    # Not provided by the DIA-NN report; filled with nulls to satisfy the schema.
    report.loc[:, "contaminant"] = None
    report.loc[:, "anchor_protein"] = None
    return report

def main_report_df(self, qvalue_threshold: float, mzml_info_folder: str, file_num: int, protein_str: str = None):
def intergrate_msg(n):
nonlocal report
Expand Down Expand Up @@ -127,6 +164,7 @@ def intergrate_msg(n):
for refs in info_list:
report = self.get_report_from_database(refs)
report.rename(columns=DIANN_MAP, inplace=True)
report.dropna(subset=["pg_accessions"], inplace=True)
if protein_str:
report = report[report["pg_accessions"].str.contains(f"{protein_str}", na=False)]
# restrict
Expand Down Expand Up @@ -228,6 +266,23 @@ def generate_feature(
logging.info("Time to generate psm and feature file {} seconds".format(et))
yield report

def write_pg_matrix_to_file(self, output_path: str, file_num=20):
    """Stream per-run protein-group matrices into a single parquet file.

    :param output_path: destination parquet file path
    :param file_num: number of runs fetched from DuckDB per batch
    """
    runs = self.get_unique_references("Run")
    # Query DuckDB in batches of `file_num` runs to bound memory usage.
    batches = [runs[start : start + file_num] for start in range(0, len(runs), file_num)]
    writer = None
    for batch in batches:
        batch_report = self.get_report_from_database(batch, DIANN_PG_SQL)
        batch_report.rename(columns=DIANN_PG_MAP, inplace=True)
        batch_report.dropna(subset=["pg_accessions"], inplace=True)
        for run in batch:
            run_df = self.generate_pg_matrix(
                batch_report[batch_report["reference_file_name"] == run].copy()
            )
            table = pa.Table.from_pandas(run_df, schema=PG_SCHEMA)
            if writer is None:
                # Open the writer lazily so its schema comes from the first table.
                writer = pq.ParquetWriter(output_path, table.schema)
            writer.write_table(table)
    close_file(pqwriter=writer)
    self.destroy_duckdb_database()
def write_feature_to_file(
self,
qvalue_threshold: float,
Expand Down
62 changes: 62 additions & 0 deletions quantmsio/core/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,68 @@
pa.field("intensity", pa.float32(), metadata={"description": "intensity value"}),
]

# Arrow fields for the protein-group (pg) quantms.io table.
# FIX: several metadata descriptions were copy-pasted from the peptide/PSM
# schemas and wrongly referred to "the peptide" / "the PSM"; they now
# describe the protein group this schema actually models.
PG_FIELDS = [
    pa.field(
        "pg_accessions",
        pa.list_(pa.string()),
        metadata={"description": "Protein group accessions of all the proteins in the group"},
    ),
    pa.field(
        "pg_names",
        pa.list_(pa.string()),
        metadata={"description": "Protein group names"},
    ),
    pa.field(
        "gg_accessions",
        pa.list_(pa.string()),
        metadata={"description": "Gene group accessions, as a string array"},
    ),
    pa.field(
        "reference_file_name",
        pa.string(),
        metadata={"description": "Spectrum file name with no path information and not including the file extension"},
    ),
    pa.field(
        "global_qvalue",
        pa.float32(),
        metadata={"description": "Global q-value of the protein group at the experiment level"},
    ),
    pa.field(
        "intensity",
        pa.float32(),
        metadata={"description": "the intensity-based abundance of the protein group in the reference file"},
    ),
    pa.field(
        "additional_intensities",
        pa.list_(pa.struct([("intensity_name", pa.string()), ("intensity_value", pa.float32())])),
        metadata={"description": "Additional intensity-based abundances of the protein group (e.g. normalized, MaxLFQ)"},
    ),
    pa.field(
        "is_decoy",
        pa.int32(),
        metadata={"description": "Decoy indicator, 1 if the protein group is a decoy, 0 target"},
    ),
    pa.field(
        "contaminant",
        pa.int32(),
        metadata={"description": "If the protein is a contaminant"},
    ),
    pa.field(
        "peptides",
        pa.list_(pa.struct([("protein_name", pa.string()), ("peptide_count", pa.int32())])),
        metadata={"description": "Number of peptides per protein in the protein group"},
    ),
    pa.field(
        "anchor_protein",
        pa.string(),
        metadata={"description": "The anchor protein of the protein group, leading protein or representative"},
    ),
    pa.field(
        "additional_scores",
        pa.list_(pa.struct([("score_name", pa.string()), ("score_value", pa.float32())])),
        metadata={"description": "List of structures, each structure contains two fields: name and value"},
    ),
]

PSM_FIELDS = PEPTIDE_FIELDS + PSM_UNIQUE_FIELDS

Expand Down
Loading

0 comments on commit ae65a4d

Please sign in to comment.