From 05bff693ebdf101f6992c7a41aaebf5f072754f2 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 6 Sep 2024 16:03:43 -0400 Subject: [PATCH] feat(ontologies): add ontology terms/resources models/types/instances --- README.md | 6 ++ bento_lib/__init__.py | 13 ++- bento_lib/ontologies/__init__.py | 0 bento_lib/ontologies/common_resources.py | 120 +++++++++++++++++++++++ bento_lib/ontologies/common_terms.py | 31 ++++++ bento_lib/ontologies/models.py | 58 +++++++++++ bento_lib/ontologies/types.py | 20 ++++ 7 files changed, 247 insertions(+), 1 deletion(-) create mode 100644 bento_lib/ontologies/__init__.py create mode 100644 bento_lib/ontologies/common_resources.py create mode 100644 bento_lib/ontologies/common_terms.py create mode 100644 bento_lib/ontologies/models.py create mode 100644 bento_lib/ontologies/types.py diff --git a/README.md b/README.md index 509683c..c94ef99 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,12 @@ All Bento channels are prefixed with `bento.`. `logging` contains helper functions for standardized Bento logging configuration and formatting. +### `ontologies` + +`ontologies` contains models, types, and helpers for working with ontology terms, +especially in the context of terms which must be eventually ingested into +[Katsu](https://github.com/bento-platform/katsu). + ### `responses` `responses` contains standardized error message-generating functions diff --git a/bento_lib/__init__.py b/bento_lib/__init__.py index 7053863..0d1416f 100644 --- a/bento_lib/__init__.py +++ b/bento_lib/__init__.py @@ -4,6 +4,7 @@ from . import auth from . import drs from . import events +from . import ontologies from . import schemas from . import search from . 
import service_info
@@ -12,5 +13,15 @@
 __version__ = metadata.version(__name__)
 
 __all__ = [
-    "__version__", "apps", "auth", "drs", "events", "schemas", "search", "service_info", "streaming", "workflows"
+    "__version__",
+    "apps",
+    "auth",
+    "drs",
+    "events",
+    "ontologies",
+    "schemas",
+    "search",
+    "service_info",
+    "streaming",
+    "workflows",
 ]
diff --git a/bento_lib/ontologies/__init__.py b/bento_lib/ontologies/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/bento_lib/ontologies/common_resources.py b/bento_lib/ontologies/common_resources.py
new file mode 100644
index 0000000..1cf4360
--- /dev/null
+++ b/bento_lib/ontologies/common_resources.py
@@ -0,0 +1,125 @@
+from .models import OntologyResource, VersionedOntologyResource
+
+__all__ = [
+    # EFO
+    "EFO",
+    "EFO_3_69_0",
+    # MONDO
+    "MONDO",
+    "MONDO_2024_09_03",
+    # NCBITaxon
+    "NCBI_TAXON",
+    "NCBI_TAXON_2024_07_03",
+    # NCIT
+    "NCIT",
+    "NCIT_2024_05_07",
+    # OBI
+    "OBI",
+    "OBI_2024_06_10",
+    # SO
+    "SO",
+    "SO_2024_06_05",
+    # UBERON
+    "UBERON",
+]
+
+
+def _versioned(ont: OntologyResource, url: str, version: str) -> VersionedOntologyResource:
+    """
+    Creates a VersionedOntologyResource from ont, reusing its id, name, namespace_prefix, and iri_prefix.
+    The unversioned URL of ont is replaced by the passed release-specific url, and version is set as given.
+    """
+    return VersionedOntologyResource(
+        **ont.model_dump(include={"id", "name", "namespace_prefix", "iri_prefix"}),
+        url=url,
+        version=version,
+    )
+
+
+# === EFO ==============================================================================================================
+
+EFO = OntologyResource(
+    id="efo",
+    name="Experimental Factor Ontology",
+    namespace_prefix="EFO",
+    iri_prefix="http://www.ebi.ac.uk/efo/EFO_",
+    url="http://www.ebi.ac.uk/efo/efo.owl",
+)
+EFO_3_69_0 = _versioned(EFO, url="http://www.ebi.ac.uk/efo/releases/v3.69.0/efo.owl", version="3.69.0")
+
+# === MONDO ============================================================================================================
+
+MONDO = OntologyResource(
+    id="mondo",
+    name="Mondo Disease Ontology",
+    namespace_prefix="MONDO",
+    iri_prefix="http://purl.obolibrary.org/obo/MONDO_",
+    url="http://purl.obolibrary.org/obo/mondo.owl",
+)
+MONDO_2024_09_03 = _versioned(
+    MONDO,
+    url="http://purl.obolibrary.org/obo/mondo/releases/2024-09-03/mondo.owl",
+    version="2024-09-03",
+)
+
+# === NCBITaxon ========================================================================================================
+
+NCBI_TAXON = OntologyResource(
+    id="ncbitaxon",
+    name="NCBI organismal classification",
+    namespace_prefix="NCBITaxon",
+    iri_prefix="http://purl.obolibrary.org/obo/NCBITaxon_",
+    url="http://purl.obolibrary.org/obo/ncbitaxon.owl",
+)
+NCBI_TAXON_2024_07_03 = _versioned(
+    NCBI_TAXON,
+    url="http://purl.obolibrary.org/obo/ncbitaxon/2024-07-03/ncbitaxon.owl",
+    version="2024-07-03",
+)
+
+# === NCIT =============================================================================================================
+
+NCIT = OntologyResource(
+    id="ncit",
+    name="NCI Thesaurus OBO Edition",
+    namespace_prefix="NCIT",
+    iri_prefix="http://purl.obolibrary.org/obo/NCIT_",
+    url="http://purl.obolibrary.org/obo/ncit.owl",
+)
+NCIT_2024_05_07 = _versioned(
+    NCIT,
+    url="http://purl.obolibrary.org/obo/ncit/releases/2024-05-07/ncit.owl",
+    version="2024-05-07",
+)
+
+# === OBI ==============================================================================================================
+
+OBI = OntologyResource(
+    id="obi",
+    name="Ontology for Biomedical Investigations",
+    namespace_prefix="OBI",
+    iri_prefix="http://purl.obolibrary.org/obo/OBI_",
+    url="http://purl.obolibrary.org/obo/obi.owl",
+)
+OBI_2024_06_10 = _versioned(OBI, url="http://purl.obolibrary.org/obo/obi/2024-06-10/obi.owl", version="2024-06-10")
+
+# === SO ===============================================================================================================
+
+SO = OntologyResource(
+    id="so",
+    name="Sequence types and features ontology",
+    namespace_prefix="SO",
+    iri_prefix="http://purl.obolibrary.org/obo/SO_",
+    url="http://purl.obolibrary.org/obo/so.owl",
+)
+SO_2024_06_05 = _versioned(SO, url="http://purl.obolibrary.org/obo/so/2024-06-05/so.owl", version="2024-06-05")
+
+# === UBERON ===========================================================================================================
+
+UBERON = OntologyResource(
+    id="uberon",
+    name="Uberon multi-species anatomy ontology",
+    namespace_prefix="UBERON",
+    iri_prefix="http://purl.obolibrary.org/obo/UBERON_",
+    url="http://purl.obolibrary.org/obo/uberon.owl",
+)
diff --git a/bento_lib/ontologies/common_terms.py b/bento_lib/ontologies/common_terms.py
new file mode 100644
index 0000000..aa75b77
--- /dev/null
+++ b/bento_lib/ontologies/common_terms.py
@@ -0,0 +1,31 @@
+from .common_resources import NCBI_TAXON, OBI, SO
+
+__all__ = [
+    # NCBITaxon
+    "NCBI_TAXON_HOMO_SAPIENS",
+    "NCBI_TAXON_MUS_MUSCULUS",
+    # OBI
+    "OBI_16S_RRNA_ASSAY",
+    "OBI_RNA_SEQ_ASSAY",
+    "OBI_PROTEOMIC_PROFILING_BY_ARRAY_ASSAY",
+    "OBI_WHOLE_GENOME_SEQUENCING_ASSAY",
+    # SO
+    "SO_GENOMIC_DNA",
+]
+
+
+# === NCBITaxon ========================================================================================================
+
+NCBI_TAXON_HOMO_SAPIENS = NCBI_TAXON.make_term("NCBITaxon:9606", "Homo sapiens")
+NCBI_TAXON_MUS_MUSCULUS = NCBI_TAXON.make_term("NCBITaxon:10090", "Mus musculus")
+
+# === OBI ==============================================================================================================
+
+OBI_16S_RRNA_ASSAY = OBI.make_term("OBI:0002763", "16s ribosomal gene sequencing assay")
+OBI_RNA_SEQ_ASSAY = OBI.make_term("OBI:0001271", "RNA-seq assay")
+OBI_PROTEOMIC_PROFILING_BY_ARRAY_ASSAY = OBI.make_term("OBI:0001318", "proteomic profiling by array assay")
+OBI_WHOLE_GENOME_SEQUENCING_ASSAY = OBI.make_term("OBI:0002117", "whole genome sequencing assay")
+
+# === SO ===============================================================================================================
+
+SO_GENOMIC_DNA = SO.make_term("SO:0000991", "genomic DNA")
diff --git a/bento_lib/ontologies/models.py
b/bento_lib/ontologies/models.py
new file mode 100644
index 0000000..98c577a
--- /dev/null
+++ b/bento_lib/ontologies/models.py
@@ -0,0 +1,77 @@
+from pydantic import BaseModel, Field, HttpUrl
+from typing import Annotated
+
+from .types import PhenoV2Resource, PhenoV2OntologyClassDict
+
+NC_NAME_PATTERN = r"^[a-zA-Z_][a-zA-Z0-9.\-_]*$"
+CURIE_PATTERN = r"^[a-zA-Z_][a-zA-Z0-9.\-_]*:[a-zA-Z0-9.\-_]+$"
+
+
+class OntologyResource(BaseModel):
+    """
+    Inspired by the Phenopackets v2 Resource model:
+    https://phenopacket-schema.readthedocs.io/en/latest/resource.html
+    """
+
+    # From Phenopackets v2: "For OBO ontologies, the value of this string MUST always be the official OBO ID, which is
+    # always equivalent to the ID prefix in lower case. Examples: hp, go, mp, mondo Consult http://obofoundry.org for
+    # a complete list. For other resources which do not use native CURIE identifiers (e.g. SNOMED, UniProt, ClinVar),
+    # use the prefix in identifiers.org."
+    id: str
+
+    # From Phenopackets v2: "The name of the ontology referred to by the id element, for example, The Human Phenotype
+    # Ontology. For OBO Ontologies, the value of this string SHOULD be the same as the title field on
+    # http://obofoundry.org. Other resources should use the official title for that resource. Note that this field is
+    # purely for information purposes and software should not encode any assumptions."
+    name: str
+    url: HttpUrl
+    # From Phenopackets v2: "The prefix used in the CURIE of an OntologyClass e.g. HP, MP, ECO for example an HPO term
+    # will have a CURIE like this - HP:0012828 which should be used in combination with the iri_prefix to form a
+    # fully-resolvable IRI."
+    # Since we use it in a CURIE prefix context, it must match a valid NCName:
+    # https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName
+    namespace_prefix: Annotated[str, Field(pattern=NC_NAME_PATTERN)]
+    iri_prefix: HttpUrl
+
+    def make_term(self, id_: str, label: str) -> "OntologyTerm":
+        """
+        Creates an OntologyTerm which belongs to this resource.
+        :param id_: The term's identifier; OntologyTerm validates it against CURIE_PATTERN, e.g., "NCBITaxon:9606".
+        :param label: The term's label, e.g., "Homo sapiens".
+        :return: A new OntologyTerm with this resource as its ontology.
+        """
+        return OntologyTerm(ontology=self, id=id_, label=label)
+
+
+class VersionedOntologyResource(OntologyResource):
+    """
+    An OntologyResource with an additional required version field, as present in the PhenoV2Resource dictionary type.
+    """
+
+    version: str
+
+    def to_phenopackets_repr(self) -> PhenoV2Resource:
+        """
+        Returns a dictionary with this resource's id, version, name, url, namespace_prefix, and iri_prefix, dumped in
+        Pydantic's "json" mode (so that URL fields are rendered as strings).
+        """
+        return self.model_dump(mode="json", include={"id", "version", "name", "url", "namespace_prefix", "iri_prefix"})
+
+
+class OntologyTerm(BaseModel):
+    """
+    Inspired by the Phenopackets v2 OntologyClass model:
+    https://phenopacket-schema.readthedocs.io/en/latest/ontologyclass.html
+    """
+
+    # Typed as the base OntologyResource rather than VersionedOntologyResource: OntologyResource.make_term() passes
+    # itself here, and an unversioned instance would fail validation otherwise. Versioned resources remain accepted.
+    ontology: OntologyResource
+    id: Annotated[str, Field(pattern=CURIE_PATTERN)]
+    label: str
+
+    def to_phenopackets_repr(self) -> PhenoV2OntologyClassDict:
+        """
+        Returns a dictionary with only this term's id and label, dumped in "json" mode; ontology is not included.
+        """
+        return self.model_dump(mode="json", include={"id", "label"})
diff --git a/bento_lib/ontologies/types.py b/bento_lib/ontologies/types.py
new file mode 100644
index 0000000..ab9e4ba
--- /dev/null
+++ b/bento_lib/ontologies/types.py
@@ -0,0 +1,20 @@
+from typing import TypedDict
+
+__all__ = [
+    "PhenoV2Resource",
+    "PhenoV2OntologyClassDict",
+]
+
+
+class PhenoV2Resource(TypedDict):
+    id: str
+    name: str
+    url: str
+    version: str
+    namespace_prefix: str
+    iri_prefix: str
+
+
+class PhenoV2OntologyClassDict(TypedDict):
+    id: str
+    label: str