Skip to content

Commit

Permalink
Merge branch 'dev' of https://github.com/bigbio/quantms.io into dev
Browse files Browse the repository at this point in the history
  • Loading branch information
ypriverol committed Sep 18, 2024
2 parents f6fab5e + 6f3d3e3 commit 8a4f9d3
Show file tree
Hide file tree
Showing 12 changed files with 675 additions and 23 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ build
.idea
data
docs/.vscode
__pycache__
__pycache__
.vscode
29 changes: 15 additions & 14 deletions docs/README.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -549,15 +549,15 @@ The following fields are shared among Peptide-based views: <<psm>>, <<feature>>,
| Modified.Sequence
| Modified Peptide
| Modified sequence
| -
| opt_global_cv_MS:1000889_peptidoform_sequence

| `modifications`
| List of modifications as a string array, easy for search and filter
| array[string], null
| -
| -
| Modifications
| -
| modifications

| `modification_details`
| List of alternative site probabilities for the modification format: read <<modifications>>
Expand All @@ -573,15 +573,15 @@ The following fields are shared among Peptide-based views: <<psm>>, <<feature>>,
| PEP
| x
| PEP
| modifications
| opt_global_Posterior_Error_Probability_score

| `global_qvalue`
| Global q-value for the peptide or psm at the level of the experiment
| float, null
| Global.Q.Value
| x
| -
| -
| opt_global_q-value

7+^| Peptide fields shared by <<feature>> <<psm>>
| `is_decoy`
Expand All @@ -590,12 +590,12 @@ The following fields are shared among Peptide-based views: <<psm>>, <<feature>>,
| -
| -
| Reverse
| -
| opt_global_cv_MS:1002217_decoy_peptide

| `calculated_mz`
| Theoretical peptide mass-to-charge ratio based on an identified sequence and modifications
| float
| -
| Calculate.Precursor.Mz
| Calculated M/Z
| m/z
| calc_mass_to_charge
Expand All @@ -606,15 +606,15 @@ The following fields are shared among Peptide-based views: <<psm>>, <<feature>>,
| DIA-NN Scores
| FragPipe Scores
| MaxQuant Scores
| -
| search_engine_score

7+^| Protein fields shared by <<feature>> <<psm>> <<peptide>>
| `pg_accessions`
| Protein group accessions of all the proteins that the peptide maps to
| array[string], null
| Protein.Ids
| x
| x
| x
| Proteins
| accession

| `pg_positions`
Expand All @@ -624,6 +624,7 @@ The following fields are shared among Peptide-based views: <<psm>>, <<feature>>,
| x
| x
| Combination of start and end positions

| `unique`
| Unique peptide indicator, if the peptide maps to a single protein, the value is 1, otherwise 0
| int32, null
Expand All @@ -638,7 +639,7 @@ The following fields are shared among Peptide-based views: <<psm>>, <<feature>>,
| Global.PG.Q.Value
| x
| x
| -
| best_search_engine_score

| `gg_accessions`
| Gene group accessions, as a string array
Expand All @@ -651,9 +652,9 @@ The following fields are shared among Peptide-based views: <<psm>>, <<feature>>,
| `gg_names`
| Gene names, as a string array
| array[string], null
| -
| x
| x
| x
| -
| -

7+^| Spectra fields shared by <<feature>> <<psm>>
Expand All @@ -678,7 +679,7 @@ The following fields are shared among Peptide-based views: <<psm>>, <<feature>>,
| float, null
| RT
| x
| x
| Retention time
| retention_time

| `predicted_rt`
Expand Down Expand Up @@ -737,7 +738,7 @@ The fields that are unique to the psm view are:
| string
| <<scan-diann>>
| Spectrum
| x
| MS/MS scan number
| spectra_ref

| `ion_mobility`
Expand Down
4 changes: 2 additions & 2 deletions docs/feature.avsc
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
{"name": "pg_positions", "type": ["null", {"type": "array","items": "string"}], "doc": "Protein start and end positions written as start_post:end_post"},
{"name": "unique", "type": ["null", "int"], "doc": "Unique peptide indicator, if the peptide maps to a single protein, the value is 1, otherwise 0"},
{"name": "protein_global_qvalue", "type": ["null", "float32"], "doc": "Global q-value of the protein group at the experiment level"},
{"name": "gene_accessions", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene accessions, as string array"},
{"name": "gene_names", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene names, as string array"},
{"name": "gg_accessions", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene accessions, as string array"},
{"name": "gg_names", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene names, as string array"},

{"name": "precursor_charge", "type": "int", "doc": "Precursor charge"},
{"name": "observed_mz", "type": "float32", "doc": "Experimental peptide mass-to-charge ratio of identified peptide (in Da)"},
Expand Down
107 changes: 107 additions & 0 deletions docs/include/psm_parquet_example.csv

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions docs/peptide.avsc
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
{"name": "pg_positions", "type": ["null",{"type": "array","items": "string"}], "doc": "Protein start and end positions written as start_post:end_post"},
{"name": "unique", "type": ["null", "int"], "doc": "Unique peptide indicator, if the peptide maps to a single protein, the value is 1, otherwise 0"},
{"name": "protein_global_qvalue", "type": ["null", "float32"], "doc": "Global q-value of the protein group at the experiment level"},
{"name": "gene_accessions", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene accessions, as string array"},
{"name": "gene_names", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene names, as string array"},
{"name": "gg_accessions", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene accessions, as string array"},
{"name": "gg_names", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene names, as string array"},

{"name": "precursor_charge", "type": "int", "doc": "Precursor charge"},
{"name": "observed_mz", "type": "float32", "doc": "Experimental peptide mass-to-charge ratio of identified peptide (in Da)"},
Expand Down
4 changes: 2 additions & 2 deletions docs/protein.avsc
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
{"name": "global_qvalue","type": ["null", "float32"], "doc": "The global qvalue for a given protein or protein groups"},
{"name": "is_decoy","type": ["null", "int"], "doc": "If the protein is decoy"},
{"name": "best_id_score", "type": "string", "doc": "The best search engine score for the identification"},
{"name": "gene_accessions", "type": ["null", {"type": "array","items": "string"}], "doc": "The gene accessions corresponding to every protein"},
{"name": "gene_names", "type": ["null", {"type": "array","items": "string"}], "doc": "The gene names corresponding to every protein"},
{"name": "gg_accessions", "type": ["null", {"type": "array","items": "string"}], "doc": "The gene accessions corresponding to every protein"},
{"name": "gg_names", "type": ["null", {"type": "array","items": "string"}], "doc": "The gene names corresponding to every protein"},
{"name": "number_peptides","type": ["null", "int"], "doc": "The total number of peptides for a given protein"},
{"name": "number_psms","type": ["null", "int"], "doc": "The total number of peptide spectrum matches"},
{"name": "number_unique_peptides","type": ["null", "int"], "doc": "The total number of unique peptides"},
Expand Down
4 changes: 2 additions & 2 deletions docs/psm.avsc
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
{"name": "pg_positions", "type": ["null",{"type": "array","items": "string"}], "doc": "Protein start and end positions written as start_post:end_post"},
{"name": "unique", "type": ["null", "int"], "doc": "Unique peptide indicator, if the peptide maps to a single protein, the value is 1, otherwise 0"},
{"name": "protein_global_qvalue", "type": ["null", "float32"], "doc": "Global q-value of the protein group at the experiment level"},
{"name": "gene_accessions", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene accessions, as string array"},
{"name": "gene_names", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene names, as string array"},
{"name": "gg_accessions", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene accessions, as string array"},
{"name": "gg_names", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene names, as string array"},

{"name": "precursor_charge", "type": "int", "doc": "Precursor charge"},
{"name": "observed_mz", "type": "float32", "doc": "Experimental peptide mass-to-charge ratio of identified peptide (in Da)"},
Expand Down
39 changes: 39 additions & 0 deletions quantmsio/temp_core/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Rename table for PSM columns: each key is an input column name and each
# value is the field name it is stored under (the values match field names
# declared in format.py).
# NOTE(review): the keys look like mzTab PSM section column names - confirm.
# Insertion order is significant because PSM_USECOLS is derived from the keys.
PSM_MAP = dict(
    (
        # peptide-level columns
        ("sequence", "sequence"),
        ("modifications", "modifications"),
        ("opt_global_Posterior_Error_Probability_score", "posterior_error_probability"),
        ("opt_global_q-value", "global_qvalue"),
        ("opt_global_cv_MS:1002217_decoy_peptide", "is_decoy"),
        ("calc_mass_to_charge", "calculated_mz"),
        # protein-level columns
        ("accession", "pg_accessions"),
        ("unique", "unique"),
        # spectrum-level columns
        ("charge", "precursor_charge"),
        ("exp_mass_to_charge", "observed_mz"),
        ("retention_time", "rt"),
    )
)

# Every PSM_MAP key (in mapping order) followed by three extra column names
# that have no one-to-one entry in PSM_MAP.
# NOTE(review): presumably the column subset to load from the input table - confirm.
PSM_USECOLS = [*PSM_MAP, "spectra_ref", "start", "end"]

# PSM field names that are not produced by the PSM_MAP rename table.
# NOTE(review): how these are populated is not visible in this file - confirm
# against the code that consumes ADDITIONS.
#
# Every entry must be followed by a comma: adjacent string literals without a
# separating comma are silently concatenated by Python into a single string.
ADDITIONS = [
    "peptidoform",
    "modification_details",
    "additional_scores",
    "pg_positions",
    "protein_global_qvalue",
    "gg_accessions",
    "gg_names",
    "predicted_rt",
    "reference_file_name",
    "scan_number",
    "ion_mobility",
    "num_peaks",
    "mz_array",
    "intensity_array",
    "rank",
    "cv_params",
]
165 changes: 165 additions & 0 deletions quantmsio/temp_core/format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import pyarrow as pa
# Arrow field definitions for the peptide-level columns. Each row below is
# (field name, Arrow data type, human-readable description); the description
# is stored in the field metadata under the "description" key.
PEPTIDE_FIELDS = [
    pa.field(name, dtype, metadata={"description": text})
    for name, dtype, text in (
        # --- peptide identification ---
        (
            "sequence",
            pa.string(),
            "The peptide’s sequence corresponding to the PSM",
        ),
        (
            "peptidoform",
            pa.string(),
            "Peptide sequence with modifications: Read the specification for more details",
        ),
        (
            "modifications",
            pa.list_(pa.string()),
            "List of modifications as string array, easy for search and filter",
        ),
        (
            "modification_details",
            pa.list_(pa.string()),
            "List of alternative site probabilities for the modification format: read the specification for more details",
        ),
        (
            "posterior_error_probability",
            pa.float32(),
            "Posterior error probability for the given peptide spectrum match",
        ),
        (
            "global_qvalue",
            pa.float32(),
            "Global q-value of the peptide or psm at the level of the experiment",
        ),
        (
            "is_decoy",
            pa.int32(),
            "Decoy indicator, 1 if the PSM is a decoy, 0 target",
        ),
        (
            "calculated_mz",
            pa.float32(),
            "Theoretical peptide mass-to-charge ratio based on identified sequence and modifications",
        ),
        (
            "additional_scores",
            pa.list_(pa.struct([("name", pa.string()), ("value", pa.float32())])),
            "List of structures, each structure contains two fields: name and value",
        ),
        # --- protein / gene mapping ---
        (
            "pg_accessions",
            pa.list_(pa.string()),
            "Protein group accessions of all the proteins that the peptide maps to",
        ),
        (
            "pg_positions",
            pa.list_(pa.string()),
            "Protein start and end positions written as start_post:end_post",
        ),
        (
            "unique",
            pa.int32(),
            "Unique peptide indicator, if the peptide maps to a single protein, the value is 1, otherwise 0",
        ),
        (
            "protein_global_qvalue",
            pa.float32(),
            "Global q-value of the protein group at the experiment level",
        ),
        (
            "gg_accessions",
            pa.list_(pa.string()),
            "Gene accessions, as string array",
        ),
        (
            "gg_names",
            pa.list_(pa.string()),
            "Gene names, as string array",
        ),
        # --- spectrum / precursor ---
        (
            "precursor_charge",
            pa.int32(),
            "Precursor charge",
        ),
        (
            "observed_mz",
            pa.float32(),
            "Experimental peptide mass-to-charge ratio of identified peptide (in Da)",
        ),
        (
            "rt",
            pa.float32(),
            "MS2 scan’s precursor retention time (in seconds)",
        ),
        (
            "predicted_rt",
            pa.float32(),
            "Predicted retention time of the peptide (in seconds)",
        ),
        # --- bookkeeping ---
        (
            "quantmsio_version",
            pa.string(),
            "The version of quantms.io",
        ),
    )
]

# Arrow field definitions that are appended to PEPTIDE_FIELDS to form the PSM
# field list. Each row is (field name, Arrow data type, description); the
# description is stored in the field metadata under the "description" key.
PSM_UNIQUE_FIELDS = [
    pa.field(name, dtype, metadata={"description": text})
    for name, dtype, text in (
        (
            "reference_file_name",
            pa.string(),
            "Spectrum file name with no path information and not including the file extension",
        ),
        (
            "scan_number",
            pa.string(),
            "Scan number of the spectrum",
        ),
        (
            "ion_mobility",
            pa.float32(),
            "Ion mobility value for the precursor ion",
        ),
        (
            "num_peaks",
            pa.int32(),
            "Number of peaks in the spectrum used for the peptide spectrum match",
        ),
        (
            "mz_array",
            pa.list_(pa.float32()),
            "Array of m/z values for the spectrum used for the peptide spectrum match",
        ),
        (
            "intensity_array",
            pa.list_(pa.float32()),
            "Array of intensity values for the spectrum used for the peptide spectrum match",
        ),
        (
            "rank",
            pa.int32(),
            "Rank of the peptide spectrum match in the search engine output",
        ),
        (
            "cv_params",
            pa.list_(pa.struct([("name", pa.string()), ("value", pa.string())])),
            "Optional list of CV parameters for additional metadata",
        ),
    )
]
















# Complete PSM field list: the shared peptide fields first, then the PSM-only fields.
PSM_FIELDS = [*PEPTIDE_FIELDS, *PSM_UNIQUE_FIELDS]
Loading

0 comments on commit 8a4f9d3

Please sign in to comment.