Skip to content

Commit

Permalink
update: psm
Browse files Browse the repository at this point in the history
  • Loading branch information
zprobot committed Sep 15, 2024
1 parent e01623b commit 14090e9
Show file tree
Hide file tree
Showing 10 changed files with 534 additions and 9 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ build
.idea
data
docs/.vscode
__pycache__
__pycache__
.vscode
4 changes: 2 additions & 2 deletions docs/feature.avsc
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
{"name": "pg_positions", "type": ["null", {"type": "array","items": "string"}], "doc": "Protein start and end positions written as start_post:end_post"},
{"name": "unique", "type": ["null", "int"], "doc": "Unique peptide indicator, if the peptide maps to a single protein, the value is 1, otherwise 0"},
{"name": "protein_global_qvalue", "type": ["null", "float32"], "doc": "Global q-value of the protein group at the experiment level"},
{"name": "gene_accessions", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene accessions, as string array"},
{"name": "gene_names", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene names, as string array"},
{"name": "gg_accessions", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene accessions, as string array"},
{"name": "gg_names", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene names, as string array"},

{"name": "precursor_charge", "type": "int", "doc": "Precursor charge"},
{"name": "observed_mz", "type": "float32", "doc": "Experimental peptide mass-to-charge ratio of identified peptide (in Da)"},
Expand Down
4 changes: 2 additions & 2 deletions docs/peptide.avsc
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
{"name": "pg_positions", "type": ["null",{"type": "array","items": "string"}], "doc": "Protein start and end positions written as start_post:end_post"},
{"name": "unique", "type": ["null", "int"], "doc": "Unique peptide indicator, if the peptide maps to a single protein, the value is 1, otherwise 0"},
{"name": "protein_global_qvalue", "type": ["null", "float32"], "doc": "Global q-value of the protein group at the experiment level"},
{"name": "gene_accessions", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene accessions, as string array"},
{"name": "gene_names", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene names, as string array"},
{"name": "gg_accessions", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene accessions, as string array"},
{"name": "gg_names", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene names, as string array"},

{"name": "precursor_charge", "type": "int", "doc": "Precursor charge"},
{"name": "observed_mz", "type": "float32", "doc": "Experimental peptide mass-to-charge ratio of identified peptide (in Da)"},
Expand Down
4 changes: 2 additions & 2 deletions docs/protein.avsc
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
{"name": "global_qvalue","type": ["null", "float32"], "doc": "The global qvalue for a given protein or protein groups"},
{"name": "is_decoy","type": ["null", "int"], "doc": "If the protein is decoy"},
{"name": "best_id_score", "type": "string", "doc": "The best search engine score for the identification"},
{"name": "gene_accessions", "type": ["null", {"type": "array","items": "string"}], "doc": "The gene accessions corresponding to every protein"},
{"name": "gene_names", "type": ["null", {"type": "array","items": "string"}], "doc": "The gene names corresponding to every protein"},
{"name": "gg_accessions", "type": ["null", {"type": "array","items": "string"}], "doc": "The gene accessions corresponding to every protein"},
{"name": "gg_names", "type": ["null", {"type": "array","items": "string"}], "doc": "The gene names corresponding to every protein"},
{"name": "number_peptides","type": ["null", "int"], "doc": "The total number of peptides for a given protein"},
{"name": "number_psms","type": ["null", "int"], "doc": "The total number of peptide spectrum matches"},
{"name": "number_unique_peptides","type": ["null", "int"], "doc": "The total number of unique peptides"},
Expand Down
4 changes: 2 additions & 2 deletions docs/psm.avsc
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
{"name": "pg_positions", "type": ["null",{"type": "array","items": "string"}], "doc": "Protein start and end positions written as start_post:end_post"},
{"name": "unique", "type": ["null", "int"], "doc": "Unique peptide indicator, if the peptide maps to a single protein, the value is 1, otherwise 0"},
{"name": "protein_global_qvalue", "type": ["null", "float32"], "doc": "Global q-value of the protein group at the experiment level"},
{"name": "gene_accessions", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene accessions, as string array"},
{"name": "gene_names", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene names, as string array"},
{"name": "gg_accessions", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene accessions, as string array"},
{"name": "gg_names", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene names, as string array"},

{"name": "precursor_charge", "type": "int", "doc": "Precursor charge"},
{"name": "observed_mz", "type": "float32", "doc": "Experimental peptide mass-to-charge ratio of identified peptide (in Da)"},
Expand Down
42 changes: 42 additions & 0 deletions quantmsio/temp_core/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Maps a source PSM column name (key) to the output field name (value).
# Insertion order matters: PSM_USECOLS is built from these keys in order.
PSM_MAP = {
    # Peptide identity.
    "sequence": "sequence",
    # Mapping to "peptidoform" is intentionally disabled here; "peptidoform"
    # is listed in ADDITIONS instead.
    # "opt_global_cv_MS:1000889_peptidoform_sequence": "peptidoform",
    "modifications": "modifications",

    # Scores and decoy flag.
    "opt_global_Posterior_Error_Probability_score": "posterior_error_probability",
    "opt_global_q-value": "global_qvalue",
    "opt_global_cv_MS:1002217_decoy_peptide": "is_decoy",

    # Masses, protein mapping and precursor information.
    "calc_mass_to_charge": "calculated_mz",
    "accession": "pg_accessions",
    "unique": "unique",
    "charge": "precursor_charge",
    "exp_mass_to_charge": "observed_mz",
    "retention_time": "rt",
}

# Every source column named in PSM_MAP (in mapping order), followed by extra
# columns that PSM_MAP does not rename.
# NOTE(review): presumably used as the column selection when loading the PSM
# table -- confirm against the reader.
PSM_USECOLS = [
    *PSM_MAP,
    "spectra_ref",
    "start",
    "end",
    "search_engine",
    "search_engine_score[1]",
]

# Output field names that are not produced by a direct PSM_MAP rename.
# NOTE(review): presumably these are filled in by later processing steps --
# confirm against the code that consumes this list.
#
# Every entry needs its own trailing comma: adjacent string literals without
# a comma are silently concatenated by Python into a single string.
ADDITIONS = [
    "peptidoform",
    "modification_details",
    "additional_scores",
    "pg_positions",
    "protein_global_qvalue",
    "gg_accessions",
    "gg_names",
    "predicted_rt",
    "reference_file_name",
    "scan_number",
    "ion_mobility",
    "num_peaks",
    "mz_array",
    "intensity_array",
    "rank",
    "cv_params",
]
165 changes: 165 additions & 0 deletions quantmsio/temp_core/format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import pyarrow as pa
# Arrow fields shared by peptide-level records. Each row below is
# (field name, Arrow data type, human-readable description); the description
# is stored in the field metadata under the "description" key.
PEPTIDE_FIELDS = [
    pa.field(name, dtype, metadata={"description": description})
    for name, dtype, description in (
        # --- peptide identity and modifications ---
        ("sequence", pa.string(), "The peptide’s sequence corresponding to the PSM"),
        (
            "peptidoform",
            pa.string(),
            "Peptide sequence with modifications: Read the specification for more details",
        ),
        (
            "modifications",
            pa.list_(pa.string()),
            "List of modifications as string array, easy for search and filter",
        ),
        (
            "modification_details",
            pa.list_(pa.string()),
            "List of alternative site probabilities for the modification format: read the specification for more details",
        ),
        # --- identification scores ---
        (
            "posterior_error_probability",
            pa.float32(),
            "Posterior error probability for the given peptide spectrum match",
        ),
        (
            "global_qvalue",
            pa.float32(),
            "Global q-value of the peptide or psm at the level of the experiment",
        ),
        ("is_decoy", pa.int32(), "Decoy indicator, 1 if the PSM is a decoy, 0 target"),
        (
            "calculated_mz",
            pa.float32(),
            "Theoretical peptide mass-to-charge ratio based on identified sequence and modifications",
        ),
        # NOTE(review): the description says "name and value", but the struct's
        # second member is actually called "score".
        (
            "additional_scores",
            pa.list_(pa.struct([("name", pa.string()), ("score", pa.float32())])),
            "List of structures, each structure contains two fields: name and value",
        ),
        # --- protein / gene mapping ---
        (
            "pg_accessions",
            pa.list_(pa.string()),
            "Protein group accessions of all the proteins that the peptide maps to",
        ),
        (
            "pg_positions",
            pa.list_(pa.string()),
            "Protein start and end positions written as start_post:end_post",
        ),
        (
            "unique",
            pa.int32(),
            "Unique peptide indicator, if the peptide maps to a single protein, the value is 1, otherwise 0",
        ),
        (
            "protein_global_qvalue",
            pa.float32(),
            "Global q-value of the protein group at the experiment level",
        ),
        ("gg_accessions", pa.list_(pa.string()), "Gene accessions, as string array"),
        ("gg_names", pa.list_(pa.string()), "Gene names, as string array"),
        # --- precursor measurements ---
        ("precursor_charge", pa.int32(), "Precursor charge"),
        (
            "observed_mz",
            pa.float32(),
            "Experimental peptide mass-to-charge ratio of identified peptide (in Da)",
        ),
        ("rt", pa.float32(), "MS2 scan’s precursor retention time (in seconds)"),
        (
            "predicted_rt",
            pa.float32(),
            "Predicted retention time of the peptide (in seconds)",
        ),
        # --- bookkeeping ---
        ("quantmsio_version", pa.string(), "The version of quantms.io"),
    )
]

# Arrow fields that exist only on PSM records (appended to PEPTIDE_FIELDS to
# form the full PSM field list). Each row is (field name, Arrow data type,
# description); the description goes into the field metadata under
# "description".
PSM_UNIQUE_FIELDS = [
    pa.field(name, dtype, metadata={"description": description})
    for name, dtype, description in (
        # --- spectrum reference ---
        (
            "reference_file_name",
            pa.string(),
            "Spectrum file name with no path information and not including the file extension",
        ),
        ("scan_number", pa.string(), "Scan number of the spectrum"),
        ("ion_mobility", pa.float32(), "Ion mobility value for the precursor ion"),
        # --- spectrum peaks ---
        (
            "num_peaks",
            pa.int32(),
            "Number of peaks in the spectrum used for the peptide spectrum match",
        ),
        (
            "mz_array",
            pa.list_(pa.float32()),
            "Array of m/z values for the spectrum used for the peptide spectrum match",
        ),
        (
            "intensity_array",
            pa.list_(pa.float32()),
            "Array of intensity values for the spectrum used for the peptide spectrum match",
        ),
        # --- match ranking and free-form metadata ---
        (
            "rank",
            pa.int32(),
            "Rank of the peptide spectrum match in the search engine output",
        ),
        (
            "cv_params",
            pa.list_(pa.struct([("name", pa.string()), ("value", pa.string())])),
            "Optional list of CV parameters for additional metadata",
        ),
    )
]
















# Full PSM field list: the shared peptide fields first, then the PSM-only ones.
PSM_FIELDS = [*PEPTIDE_FIELDS, *PSM_UNIQUE_FIELDS]
Loading

0 comments on commit 14090e9

Please sign in to comment.