From e7523031567a9a92e3752d390c0c485d9657d3d9 Mon Sep 17 00:00:00 2001 From: Kim Rutherford Date: Fri, 2 Aug 2024 10:04:42 +1200 Subject: [PATCH] Fix parsing of multiple binding and active sites Refs pombase/website#2115 Refs pombase/pombase-chado#52 Refs pombase/pombase-chado#1172 --- src/pombase/bio/util.rs | 2 + src/pombase/uniprot.rs | 83 ++++++++++++++++++++++++++----------- tests/test_db_json.rs | 7 ++++ tests/test_uniprot_data.tsv | 2 +- tests/util/mod.rs | 2 + 5 files changed, 70 insertions(+), 26 deletions(-) diff --git a/src/pombase/bio/util.rs b/src/pombase/bio/util.rs index 73a5dd64..3a8d63dc 100644 --- a/src/pombase/bio/util.rs +++ b/src/pombase/bio/util.rs @@ -428,6 +428,8 @@ fn make_test_gene() -> GeneDetails { coiled_coil_coords: vec![], signal_peptide: None, transit_peptide: None, + binding_sites: vec![], + active_sites: vec![], has_protein_features: false, rfam_annotations: vec![], orfeome_identifier: None, diff --git a/src/pombase/uniprot.rs b/src/pombase/uniprot.rs index 103d7078..508d43be 100644 --- a/src/pombase/uniprot.rs +++ b/src/pombase/uniprot.rs @@ -45,13 +45,14 @@ struct UniProtDataRecord { } lazy_static! { - static ref RANGE_RE: Regex = Regex::new(r"^(SIGNAL|TRANSIT|BINDING|ACT_SITE) (\d+)(?:\.\.([\?\d]+))?(;.*)?").unwrap(); + static ref SPLIT_RE: Regex = Regex::new(r"(SIGNAL|TRANSIT|BINDING|ACT_SITE) ").unwrap(); + static ref RANGE_RE: Regex = Regex::new(r"^(\d+)(?:\.\.([\?\d]+))?.*?(;.*)?").unwrap(); static ref LIGAND_RE: Regex = Regex::new(r#"/ligand="([^"]+)""#).unwrap(); } fn get_range(cap: Captures) -> PeptideRange { - let start: usize = cap.get(2).unwrap().as_str().parse().unwrap(); - if let Some(end_cap) = cap.get(3) { + let start: usize = cap.get(1).unwrap().as_str().parse().unwrap(); + if let Some(end_cap) = cap.get(2) { if let Ok(end) = end_cap.as_str().parse() { PeptideRange { start, @@ -82,6 +83,12 @@ fn get_ligand(rest: &str) -> FlexStr { flex_str!("unknown ligand") } +fn first_field_part(field: &str) -> Option { + let mut parts_iter = SPLIT_RE.split(field); + parts_iter.next(); + parts_iter.next().map(|s| s.to_owned()) +} + fn process_record(uniprot_record: UniProtDataRecord) -> UniProtDataEntry { let gene_uniquename = if uniprot_record.gene_uniquename.ends_with(";") { @@ -93,39 +100,65 @@ fn process_record(uniprot_record: UniProtDataRecord) -> UniProtDataEntry { }; let signal_peptide = - RANGE_RE.captures_iter(&uniprot_record.signal_peptide).next() - .map(|cap| SignalPeptide { - range: get_range(cap), - }); + if let Some(field_part) = first_field_part(&uniprot_record.signal_peptide) { + eprintln!("field_part: {field_part}"); + RANGE_RE.captures_iter(&field_part).next() + .map(|cap| SignalPeptide { + range: get_range(cap), + }) + } else { + None + }; let transit_peptide = - RANGE_RE.captures_iter(&uniprot_record.transit_peptide).next() - .map(|cap| TransitPeptide { - range: get_range(cap), - }); + if let Some(field_part) = first_field_part(&uniprot_record.transit_peptide) { + eprintln!("field_part: {field_part}"); + RANGE_RE.captures_iter(&field_part).next() + .map(|cap| TransitPeptide { + range: get_range(cap), + }) + } else { + None + }; + + let mut binding_sites_parts_iter = + SPLIT_RE.split(&uniprot_record.binding_sites); + binding_sites_parts_iter.next(); // remove blank let binding_sites = - RANGE_RE.captures_iter(&uniprot_record.binding_sites) - .map(|cap| { + binding_sites_parts_iter.map(|field_part| { + if let Some(cap) = RANGE_RE.captures_iter(field_part).next() { let ligand = - if let Some(rest) = cap.get(4) { - get_ligand(rest.as_str()) - } else { - flex_str!("unknown ligand") - }; + if let Some(rest) = cap.get(3) { + get_ligand(rest.as_str()) + } else { + flex_str!("unknown ligand") + }; BindingSite { ligand, range: get_range(cap), } - }) - .collect(); + } else { + panic!("failed to parse UniProt data file, no range in {}", field_part); + } + }) + .collect(); + + let mut active_sites_parts_iter = + SPLIT_RE.split(&uniprot_record.active_sites); + active_sites_parts_iter.next(); // remove blank let active_sites = - RANGE_RE.captures_iter(&uniprot_record.active_sites) - .map(|cap| ActiveSite { - range: get_range(cap), - }) - .collect(); + active_sites_parts_iter.map(|field_part| { + if let Some(cap) = RANGE_RE.captures_iter(field_part).next() { + ActiveSite { + range: get_range(cap), + } + } else { + panic!("failed to parse UniProt data file, no range in {}", field_part); + } + }) + .collect(); UniProtDataEntry { gene_uniquename: gene_uniquename.into(), diff --git a/tests/test_db_json.rs b/tests/test_db_json.rs index 19953999..731cefb5 100644 --- a/tests/test_db_json.rs +++ b/tests/test_db_json.rs @@ -792,6 +792,13 @@ fn test_gene_details() { assert_eq!(par1_gene.signal_peptide.as_ref().unwrap().range.end, 22); assert_eq!(par1_gene.transit_peptide.as_ref().unwrap().range.start, 1); assert_eq!(par1_gene.transit_peptide.as_ref().unwrap().range.end, 24); + + + assert_eq!(par1_gene.binding_sites.len(), 2); + assert_eq!(par1_gene.binding_sites.iter().next().unwrap().range.end, 90); + assert_eq!(par1_gene.active_sites.len(), 1); + assert_eq!(par1_gene.active_sites.iter().next().unwrap().range.end, 159); + assert_eq!(par1_gene.active_sites.iter().next().unwrap().range.end, 159); } #[test] diff --git a/tests/test_uniprot_data.tsv b/tests/test_uniprot_data.tsv index 0cceb9a7..9f32743a 100644 --- a/tests/test_uniprot_data.tsv +++ b/tests/test_uniprot_data.tsv @@ -1,2 +1,2 @@ Entry Signal peptide Transit peptide PomBase Binding site Active site Catalytic activity Gene Names (synonym) Post-translational modification Modified residue Cofactor Kinetics -O13692 SIGNAL 1..22; /evidence="ECO:0000255" TRANSIT 1..24; /note="Mitochondrion"; /evidence="ECO:0000255" SPCC188.02; BINDING 88; /ligand="(1,3-beta-D-glucosyl)n"; /ligand_id="ChEBI:CHEBI:37671"; /ligand_label="1"; /ligand_note="donor substrate"; /evidence="ECO:0000250|UniProtKB:Q06135"; BINDING 158; /ligand="(1,3-beta-D-glucosyl)n"; /ligand_id="ChEBI:CHEBI:37671"; /ligand_label="1"; /ligand_note="donor substrate"; /evidence="ECO:0000250|UniProtKB:Q06135"; BINDING 159; /ligand="(1,3-beta-D-glucosyl)n"; /ligand_id="ChEBI:CHEBI:37671"; /ligand_label="2"; /ligand_note="acceptor substrate"; /evidence="ECO:0000250|UniProtKB:Q06135"; BINDING 200; /ligand="(1,3-beta-D-glucosyl)n"; /ligand_id="ChEBI:CHEBI:37671"; /ligand_label="2"; /ligand_note="acceptor substrate"; /evidence="ECO:0000250|UniProtKB:Q06135"; BINDING 205; /ligand="(1,3-beta-D-glucosyl)n"; /ligand_id="ChEBI:CHEBI:37671"; /ligand_label="2"; /ligand_note="acceptor substrate"; /evidence="ECO:0000250|UniProtKB:Q06135"; BINDING 300; /ligand="(1,3-beta-D-glucosyl)n"; /ligand_id="ChEBI:CHEBI:37671"; /ligand_label="1"; /ligand_note="donor substrate"; /evidence="ECO:0000250|UniProtKB:Q06135" ACT_SITE 159; /note="Proton donor"; /evidence="ECO:0000250"; ACT_SITE 261; /note="Nucleophile"; /evidence="ECO:0000250" PTM: The GPI-anchor is attached to the protein in the endoplasmic reticulum and serves to target the protein to the cell surface. There, the glucosamine-inositol phospholipid moiety is cleaved off and the GPI-modified mannoprotein is covalently attached via its lipidless GPI glycan remnant to the 1,6-beta-glucan of the outer cell wall layer. +O13692 SIGNAL 1..22; /evidence="ECO:0000255" TRANSIT 1..24; /note="Mitochondrion"; /evidence="ECO:0000255" SPCC188.02; BINDING 82..90; /ligand="(1,3-beta-D-glucosyl)n"; /ligand_id="ChEBI:CHEBI:37671"; /ligand_label="1"; /ligand_note="donor substrate"; /evidence="ECO:0000250|UniProtKB:Q06135"; BINDING 158; /ligand="(1,3-beta-D-glucosyl)n"; /ligand_id="ChEBI:CHEBI:37671"; /ligand_label="1"; /ligand_note="donor substrate"; /evidence="ECO:0000250|UniProtKB:Q06135" ACT_SITE 159; /note="Proton donor"; /evidence="ECO:0000250" PTM: The GPI-anchor is attached to the protein in the endoplasmic reticulum and serves to target the protein to the cell surface. There, the glucosamine-inositol phospholipid moiety is cleaved off and the GPI-modified mannoprotein is covalently attached via its lipidless GPI glycan remnant to the 1,6-beta-glucan of the outer cell wall layer. diff --git a/tests/util/mod.rs b/tests/util/mod.rs index facef4b9..a220aea7 100644 --- a/tests/util/mod.rs +++ b/tests/util/mod.rs @@ -92,6 +92,8 @@ pub fn make_test_gene(uniquename: &str, name: Option<&str>) -> GeneDetails { coiled_coil_coords: vec![], signal_peptide: None, transit_peptide: None, + binding_sites: vec![], + active_sites: vec![], has_protein_features: false, rfam_annotations: vec![], orfeome_identifier: None,