Skip to content

Commit

Permalink
Fix parsing of multiple binding and active sites
Browse files Browse the repository at this point in the history
  • Loading branch information
kimrutherford committed Aug 1, 2024
1 parent 6ace4c4 commit e752303
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 26 deletions.
2 changes: 2 additions & 0 deletions src/pombase/bio/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,8 @@ fn make_test_gene() -> GeneDetails {
coiled_coil_coords: vec![],
signal_peptide: None,
transit_peptide: None,
binding_sites: vec![],
active_sites: vec![],
has_protein_features: false,
rfam_annotations: vec![],
orfeome_identifier: None,
Expand Down
83 changes: 58 additions & 25 deletions src/pombase/uniprot.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,14 @@ struct UniProtDataRecord {
}

// Lazily compiled regular expressions used when parsing UniProt feature fields.
lazy_static! {
// NOTE(review): RANGE_RE is declared twice in this view (here and two lines
// below).  This looks like the removed and the added line of a diff shown
// together - confirm that only one declaration exists in the real file.
//
// This form is anchored at the start of the text and expects the feature
// keyword itself: group 1 is the keyword, group 2 the start position,
// optional group 3 the end position after ".." (digits and/or "?"), and
// optional group 4 the remainder beginning at the first ";".
static ref RANGE_RE: Regex = Regex::new(r"^(SIGNAL|TRANSIT|BINDING|ACT_SITE) (\d+)(?:\.\.([\?\d]+))?(;.*)?").unwrap();
// Matches a feature keyword followed by one space.  It is passed to
// `.split(...)` so that a field is cut into the pieces of text that follow
// each keyword.
static ref SPLIT_RE: Regex = Regex::new(r"(SIGNAL|TRANSIT|BINDING|ACT_SITE) ").unwrap();
// Keyword-less form, anchored at the start of the text: group 1 is the start
// position, optional group 2 the end position after ".." (digits and/or "?"),
// and optional group 3 the remainder beginning at a ";".
// NOTE(review): the lazy `.*?` may match the empty string and the trailing
// group is optional, so it looks like text is never skipped to reach a later
// ";" - confirm this is the intended behaviour.
static ref RANGE_RE: Regex = Regex::new(r"^(\d+)(?:\.\.([\?\d]+))?.*?(;.*)?").unwrap();
// Captures, as group 1, the non-empty quoted value of a `/ligand="..."`
// qualifier.
static ref LIGAND_RE: Regex = Regex::new(r#"/ligand="([^"]+)""#).unwrap();
}

fn get_range(cap: Captures) -> PeptideRange {
let start: usize = cap.get(2).unwrap().as_str().parse().unwrap();
if let Some(end_cap) = cap.get(3) {
let start: usize = cap.get(1).unwrap().as_str().parse().unwrap();
if let Some(end_cap) = cap.get(2) {
if let Ok(end) = end_cap.as_str().parse() {
PeptideRange {
start,
Expand Down Expand Up @@ -82,6 +83,12 @@ fn get_ligand(rest: &str) -> FlexStr {
flex_str!("unknown ligand")
}

/// Returns an owned copy of the text that follows the first `SPLIT_RE` match
/// in `field`, up to the next match or the end of the string.
///
/// Yields `None` when splitting `field` on `SPLIT_RE` produces fewer than two
/// pieces, i.e. when the pattern does not occur in `field`.
fn first_field_part(field: &str) -> Option<String> {
    // Piece 0 is whatever precedes the first match; piece 1 is the text
    // between the first match and the next one (or the end of the field).
    SPLIT_RE.split(field).nth(1).map(str::to_owned)
}

fn process_record(uniprot_record: UniProtDataRecord) -> UniProtDataEntry {
let gene_uniquename =
if uniprot_record.gene_uniquename.ends_with(";") {
Expand All @@ -93,39 +100,65 @@ fn process_record(uniprot_record: UniProtDataRecord) -> UniProtDataEntry {
};

let signal_peptide =
RANGE_RE.captures_iter(&uniprot_record.signal_peptide).next()
.map(|cap| SignalPeptide {
range: get_range(cap),
});
if let Some(field_part) = first_field_part(&uniprot_record.signal_peptide) {
eprintln!("field_part: {field_part}");
RANGE_RE.captures_iter(&field_part).next()
.map(|cap| SignalPeptide {
range: get_range(cap),
})
} else {
None
};

let transit_peptide =
RANGE_RE.captures_iter(&uniprot_record.transit_peptide).next()
.map(|cap| TransitPeptide {
range: get_range(cap),
});
if let Some(field_part) = first_field_part(&uniprot_record.transit_peptide) {
eprintln!("field_part: {field_part}");
RANGE_RE.captures_iter(&field_part).next()
.map(|cap| TransitPeptide {
range: get_range(cap),
})
} else {
None
};

let mut binding_sites_parts_iter =
SPLIT_RE.split(&uniprot_record.binding_sites);
binding_sites_parts_iter.next(); // remove blank

let binding_sites =
RANGE_RE.captures_iter(&uniprot_record.binding_sites)
.map(|cap| {
binding_sites_parts_iter.map(|field_part| {
if let Some(cap) = RANGE_RE.captures_iter(field_part).next() {
let ligand =
if let Some(rest) = cap.get(4) {
get_ligand(rest.as_str())
} else {
flex_str!("unknown ligand")
};
if let Some(rest) = cap.get(3) {
get_ligand(rest.as_str())
} else {
flex_str!("unknown ligand")
};
BindingSite {
ligand,
range: get_range(cap),
}
})
.collect();
} else {
panic!("failed to parse UniProt data file, no range in {}", field_part);
}
})
.collect();

let mut active_sites_parts_iter =
SPLIT_RE.split(&uniprot_record.active_sites);
active_sites_parts_iter.next(); // remove blank

let active_sites =
RANGE_RE.captures_iter(&uniprot_record.active_sites)
.map(|cap| ActiveSite {
range: get_range(cap),
})
.collect();
active_sites_parts_iter.map(|field_part| {
if let Some(cap) = RANGE_RE.captures_iter(field_part).next() {
ActiveSite {
range: get_range(cap),
}
} else {
panic!("failed to parse UniProt data file, no range in {}", field_part);
}
})
.collect();

UniProtDataEntry {
gene_uniquename: gene_uniquename.into(),
Expand Down
7 changes: 7 additions & 0 deletions tests/test_db_json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -792,6 +792,13 @@ fn test_gene_details() {
assert_eq!(par1_gene.signal_peptide.as_ref().unwrap().range.end, 22);
assert_eq!(par1_gene.transit_peptide.as_ref().unwrap().range.start, 1);
assert_eq!(par1_gene.transit_peptide.as_ref().unwrap().range.end, 24);


assert_eq!(par1_gene.binding_sites.len(), 2);
assert_eq!(par1_gene.binding_sites.iter().next().unwrap().range.end, 90);
assert_eq!(par1_gene.active_sites.len(), 1);
assert_eq!(par1_gene.active_sites.iter().next().unwrap().range.end, 159);
assert_eq!(par1_gene.active_sites.iter().next().unwrap().range.end, 159);
}

#[test]
Expand Down
2 changes: 1 addition & 1 deletion tests/test_uniprot_data.tsv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
Entry Signal peptide Transit peptide PomBase Binding site Active site Catalytic activity Gene Names (synonym) Post-translational modification Modified residue Cofactor Kinetics
O13692 SIGNAL 1..22; /evidence="ECO:0000255" TRANSIT 1..24; /note="Mitochondrion"; /evidence="ECO:0000255" SPCC188.02; BINDING 88; /ligand="(1,3-beta-D-glucosyl)n"; /ligand_id="ChEBI:CHEBI:37671"; /ligand_label="1"; /ligand_note="donor substrate"; /evidence="ECO:0000250|UniProtKB:Q06135"; BINDING 158; /ligand="(1,3-beta-D-glucosyl)n"; /ligand_id="ChEBI:CHEBI:37671"; /ligand_label="1"; /ligand_note="donor substrate"; /evidence="ECO:0000250|UniProtKB:Q06135"; BINDING 159; /ligand="(1,3-beta-D-glucosyl)n"; /ligand_id="ChEBI:CHEBI:37671"; /ligand_label="2"; /ligand_note="acceptor substrate"; /evidence="ECO:0000250|UniProtKB:Q06135"; BINDING 200; /ligand="(1,3-beta-D-glucosyl)n"; /ligand_id="ChEBI:CHEBI:37671"; /ligand_label="2"; /ligand_note="acceptor substrate"; /evidence="ECO:0000250|UniProtKB:Q06135"; BINDING 205; /ligand="(1,3-beta-D-glucosyl)n"; /ligand_id="ChEBI:CHEBI:37671"; /ligand_label="2"; /ligand_note="acceptor substrate"; /evidence="ECO:0000250|UniProtKB:Q06135"; BINDING 300; /ligand="(1,3-beta-D-glucosyl)n"; /ligand_id="ChEBI:CHEBI:37671"; /ligand_label="1"; /ligand_note="donor substrate"; /evidence="ECO:0000250|UniProtKB:Q06135" ACT_SITE 159; /note="Proton donor"; /evidence="ECO:0000250"; ACT_SITE 261; /note="Nucleophile"; /evidence="ECO:0000250" PTM: The GPI-anchor is attached to the protein in the endoplasmic reticulum and serves to target the protein to the cell surface. There, the glucosamine-inositol phospholipid moiety is cleaved off and the GPI-modified mannoprotein is covalently attached via its lipidless GPI glycan remnant to the 1,6-beta-glucan of the outer cell wall layer.
O13692 SIGNAL 1..22; /evidence="ECO:0000255" TRANSIT 1..24; /note="Mitochondrion"; /evidence="ECO:0000255" SPCC188.02; BINDING 82..90; /ligand="(1,3-beta-D-glucosyl)n"; /ligand_id="ChEBI:CHEBI:37671"; /ligand_label="1"; /ligand_note="donor substrate"; /evidence="ECO:0000250|UniProtKB:Q06135"; BINDING 158; /ligand="(1,3-beta-D-glucosyl)n"; /ligand_id="ChEBI:CHEBI:37671"; /ligand_label="1"; /ligand_note="donor substrate"; /evidence="ECO:0000250|UniProtKB:Q06135" ACT_SITE 159; /note="Proton donor"; /evidence="ECO:0000250" PTM: The GPI-anchor is attached to the protein in the endoplasmic reticulum and serves to target the protein to the cell surface. There, the glucosamine-inositol phospholipid moiety is cleaved off and the GPI-modified mannoprotein is covalently attached via its lipidless GPI glycan remnant to the 1,6-beta-glucan of the outer cell wall layer.
2 changes: 2 additions & 0 deletions tests/util/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ pub fn make_test_gene(uniquename: &str, name: Option<&str>) -> GeneDetails {
coiled_coil_coords: vec![],
signal_peptide: None,
transit_peptide: None,
binding_sites: vec![],
active_sites: vec![],
has_protein_features: false,
rfam_annotations: vec![],
orfeome_identifier: None,
Expand Down

0 comments on commit e752303

Please sign in to comment.