diff --git a/bam_masterdata/cli/cli.py b/bam_masterdata/cli/cli.py index 529bd9f..f4e6e80 100644 --- a/bam_masterdata/cli/cli.py +++ b/bam_masterdata/cli/cli.py @@ -6,13 +6,16 @@ import click from decouple import config as environ from openpyxl import Workbook +from rdflib import Graph from bam_masterdata.cli.entities_to_excel import entities_to_excel from bam_masterdata.cli.entities_to_json import entities_to_json +from bam_masterdata.cli.entities_to_rdf import entities_to_rdf from bam_masterdata.cli.fill_masterdata import MasterdataCodeGenerator from bam_masterdata.logger import logger from bam_masterdata.utils import ( delete_and_create_dir, + duplicated_property_types, import_module, listdir_py_modules, ) @@ -164,6 +167,12 @@ def export_to_json(force_delete, python_path): # Process each module using the `model_to_json` method of each entity for module_path in py_modules: + if module_path.endswith("property_types.py"): + if duplicated_property_types(module_path=module_path, logger=logger): + click.echo( + "Please fix the duplicated property types before exporting to JSON." + ) + return entities_to_json(module_path=module_path, export_dir=export_dir, logger=logger) click.echo(f"All entity artifacts have been generated and saved to {export_dir}") @@ -211,9 +220,15 @@ def export_to_excel(force_delete, python_path): definitions_module = import_module(module_path=str(definitions_path.resolve())) # Process the modules and save the entities to the openBIS masterdata Excel file - masterdata_file = os.path.join(".", "artifacts", "masterdata.xlsx") + masterdata_file = os.path.join(export_dir, "masterdata.xlsx") wb = Workbook() for i, module_path in enumerate(py_modules): + if module_path.endswith("property_types.py"): + if duplicated_property_types(module_path=module_path, logger=logger): + click.echo( + "Please fix the duplicated property types before exporting to Excel." 
+ ) + return if i == 0: ws = wb.active else: @@ -234,5 +249,68 @@ def export_to_excel(force_delete, python_path): click.echo(f"All masterdata have been generated and saved to {masterdata_file}") +@cli.command( + name="export_to_rdf", + help="Export entities to a RDF/XML file in the path `./artifacts/masterdata.owl`.", +) +@click.option( + "--force-delete", + type=bool, + required=False, + default=False, + help=""" + (Optional) If set to `True`, it will delete the current `./artifacts/` folder and create a new one. Default is `False`. + """, +) +@click.option( + "--python-path", + type=str, + required=False, + default=DATAMODEL_DIR, + help=""" + (Optional) The path to the individual Python module or the directory containing the Python modules to process the datamodel. + Default is `./bam_masterdata/datamodel/`. + """, +) +def export_to_rdf(force_delete, python_path): + # Get the directories from the Python modules and the export directory for the static artifacts + export_dir = os.path.join(".", "artifacts") + + # Delete and create the export directory + delete_and_create_dir( + directory_path=export_dir, + logger=logger, + force_delete=force_delete, + ) + + # Get the Python modules to process the datamodel + py_modules = listdir_py_modules(directory_path=python_path, logger=logger) + # ! Remove the module containing 'vocabulary_types.py' + py_modules = [ + module for module in py_modules if "vocabulary_types.py" not in module + ] + + # Process each module using the `model_to_rdf` method of each entity + graph = Graph() + for module_path in py_modules: + if module_path.endswith("property_types.py"): + if duplicated_property_types(module_path=module_path, logger=logger): + click.echo( + "Please fix the duplicated property types before exporting to RDF/XML." 
+ ) + return + entities_to_rdf(graph=graph, module_path=module_path, logger=logger) + + # Saving RDF/XML to file + rdf_output = graph.serialize(format="pretty-xml") + masterdata_file = os.path.join(export_dir, "masterdata.owl") + with open(masterdata_file, "w", encoding="utf-8") as f: + f.write(rdf_output) + + click.echo( + f"All masterdata has been generated in RDF/XML format and saved to {masterdata_file}" + ) + + if __name__ == "__main__": cli() diff --git a/bam_masterdata/cli/entities_to_rdf.py b/bam_masterdata/cli/entities_to_rdf.py new file mode 100644 index 0000000..0f953bc --- /dev/null +++ b/bam_masterdata/cli/entities_to_rdf.py @@ -0,0 +1,236 @@ +import inspect +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from rdflib import Graph + from structlog._config import BoundLoggerLazyProxy + +import click +from rdflib import BNode, Literal, Namespace +from rdflib.namespace import DC, OWL, RDF, RDFS + +from bam_masterdata.utils import code_to_class_name, import_module + +BAM = Namespace("https://bamresearch.github.io/bam-masterdata/") +PROV = Namespace("http://www.w3.org/ns/prov#") + + +def rdf_graph_init(g: "Graph") -> None: + """ + Initialize the RDF graph with base namespaces, annotation properties, and internal BAM properties. This + function also creates placeholders for PropertyType and other entity types. The graph is to be printed out + in RDF/XML format in the `entities_to_rdf` function. + + Args: + g (Graph): The RDF graph to be initialized. 
+ """ + # Adding base namespaces + g.bind("dc", DC) + g.bind("owl", OWL) + g.bind("rdf", RDF) + g.bind("rdfs", RDFS) + g.bind("bam", BAM) + g.bind("prov", PROV) + + # Adding annotation properties from base namespaces + annotation_props = [ + RDFS.label, + RDFS.comment, + DC.identifier, + ] + for prop in annotation_props: + g.add((prop, RDF.type, OWL.AnnotationProperty)) + + # Custom annotation properties from openBIS: `dataType`, `propertyLabel + custom_annotation_props = { + BAM[ + "dataType" + ]: """Represents the data type of a property as defined in the openBIS platform. + This annotation is used to ensure alignment with the native data types in openBIS, + facilitating seamless integration and data exchange. + + The allowed values for this annotation correspond directly to the openBIS type system, + including BOOLEAN, CONTROLLEDVOCABULARY, DATE, HYPERLINK, INTEGER, MULTILINE_VARCHAR, OBJECT, + REAL, TIMESTAMP, VARCHAR, and XML. + + While `bam:dataType` is primarily intended for internal usage with openBIS, mappings to + standard vocabularies such as `xsd` (e.g., `xsd:boolean`, `xsd:string`) are possible to use and documented to + enhance external interoperability. The full mapping is: + - BOOLEAN: xsd:boolean + - CONTROLLEDVOCABULARY: xsd:string + - DATE: xsd:date + - HYPERLINK: xsd:anyURI + - INTEGER: xsd:integer + - MULTILINE_VARCHAR: xsd:string + - OBJECT: bam:ObjectType + - REAL: xsd:decimal + - TIMESTAMP: xsd:dateTime + - VARCHAR: xsd:string + - XML: xsd:string""", + BAM[ + "propertyLabel" + ]: """A UI-specific annotation used in openBIS to provide an alternative label for a property + displayed in the frontend. 
Not intended for semantic reasoning or interoperability beyond openBIS.""", + } + for custom_prop, custom_prop_def in custom_annotation_props.items(): + g.add((custom_prop, RDF.type, OWL.AnnotationProperty)) + g.add( + ( + custom_prop, + RDFS.label, + Literal(f"bam:{custom_prop.split('/')[-1]}", lang="en"), + ) + ) + g.add((custom_prop, RDFS.comment, Literal(custom_prop_def, lang="en"))) + + # Internal BAM properties + # ? `section`, `ordinal`, `show_in_edit_views`? + bam_props_uri = { + BAM["hasMandatoryProperty"]: [ + (RDF.type, OWL.ObjectProperty), + # (RDFS.domain, OWL.Class), + (RDFS.range, BAM.PropertyType), + (RDFS.label, Literal("hasMandatoryProperty", lang="en")), + ( + RDFS.comment, + Literal( + "The property must be mandatorily filled when creating the object in openBIS.", + lang="en", + ), + ), + ], + BAM["hasOptionalProperty"]: [ + (RDF.type, OWL.ObjectProperty), + # (RDFS.domain, OWL.Class), + (RDFS.range, BAM.PropertyType), + (RDFS.label, Literal("hasOptionalProperty", lang="en")), + ( + RDFS.comment, + Literal( + "The property is optionally filled when creating the object in openBIS.", + lang="en", + ), + ), + ], + BAM["referenceTo"]: [ + (RDF.type, OWL.ObjectProperty), + (RDFS.domain, BAM.PropertyType), # Restricting domain to PropertyType + # (RDFS.range, OWL.Class), # Explicitly setting range to ObjectType + (RDFS.label, Literal("referenceTo", lang="en")), + ( + RDFS.comment, + Literal( + "The property is referencing an object existing in openBIS.", + lang="en", + ), + ), + ], + } + for prop_uri, obj_properties in bam_props_uri.items(): + for prop in obj_properties: # type: ignore + g.add((prop_uri, prop[0], prop[1])) # type: ignore + + # Adding base PropertyType and other objects as placeholders + # ! add only PropertyType + prop_type_description = """A conceptual placeholder used to define and organize properties as first-class entities. 
+ PropertyType is used to place properties and define their metadata, separating properties from the + entities they describe. + + In integration scenarios: + - PropertyType can align with `BFO:Quality` for inherent attributes. + - PropertyType can represent `BFO:Role` if properties serve functional purposes. + - PropertyType can be treated as a `prov:Entity` when properties participate in provenance relationships.""" + for entity in ["PropertyType", "ObjectType", "CollectionType", "DatasetType"]: + entity_uri = BAM[entity] + g.add((entity_uri, RDF.type, OWL.Thing)) + g.add((entity_uri, RDFS.label, Literal(entity, lang="en"))) + if entity == "PropertyType": + g.add((entity_uri, RDFS.comment, Literal(prop_type_description, lang="en"))) + + +def entities_to_rdf( + graph: "Graph", module_path: str, logger: "BoundLoggerLazyProxy" +) -> None: + """ + Convert the entities defined in the specified module to RDF triples and add them to the graph. The function + uses the `model_to_rdf` method defined in each class to convert the class attributes to RDF triples. The + function also adds the PropertyType and other entity types as placeholders in the graph. + + Args: + graph (Graph): The RDF graph to which the entities are added. + module_path (str): The path to the module containing the entities to be converted. + logger (BoundLoggerLazyProxy): The logger to log messages. + """ + rdf_graph_init(graph) + + module = import_module(module_path=module_path) + + # Special case of `PropertyTypeDef` in `property_types.py` + # PROPERTY TYPES + # skos:prefLabel used for class names + # skos:definition used for `description` (en, de) + # skos:altLabel used for `property_label` + # dc:identifier used for `code` # ! 
only defined for internal codes with $ symbol + # dc:type used for `data_type` + if "property_types.py" in module_path: + for name, obj in inspect.getmembers(module): + if name.startswith("_") or name == "PropertyTypeDef": + continue + prop_uri = BAM[obj.id] + + # Define the property as an OWL class inheriting from PropertyType + graph.add((prop_uri, RDF.type, OWL.Thing)) + graph.add((prop_uri, RDFS.subClassOf, BAM.PropertyType)) + + # Add attributes like id, code, description in English and Deutsch, property_label, data_type + graph.add((prop_uri, RDFS.label, Literal(obj.id, lang="en"))) + graph.add((prop_uri, DC.identifier, Literal(obj.code))) + descriptions = obj.description.split("//") + if len(descriptions) > 1: + graph.add((prop_uri, RDFS.comment, Literal(descriptions[0], lang="en"))) + graph.add((prop_uri, RDFS.comment, Literal(descriptions[1], lang="de"))) + else: + graph.add((prop_uri, RDFS.comment, Literal(obj.description, lang="en"))) + graph.add( + (prop_uri, BAM.propertyLabel, Literal(obj.property_label, lang="en")) + ) + graph.add((prop_uri, BAM.dataType, Literal(obj.data_type.value))) + if obj.data_type.value == "OBJECT": + # entity_ref_uri = BAM[code_to_class_name(obj.object_code)] + # graph.add((prop_uri, BAM.referenceTo, entity_ref_uri)) + if not code_to_class_name(obj.object_code, logger): + logger.error( + f"Failed to identify the `object_code` for the property {obj.id}" + ) + continue + entity_ref_uri = BAM[code_to_class_name(obj.object_code, logger)] + + # Create a restriction with referenceTo + restriction = BNode() + graph.add((restriction, RDF.type, OWL.Restriction)) + graph.add((restriction, OWL.onProperty, BAM["referenceTo"])) + graph.add((restriction, OWL.someValuesFrom, entity_ref_uri)) + + # Add the restriction as a subclass of the property + graph.add((prop_uri, RDFS.subClassOf, restriction)) + return None + + # All other datamodel modules + # OBJECT/DATASET/COLLECTION TYPES + # skos:prefLabel used for class names + # skos:definition 
used for `description` (en, de) + # dc:identifier used for `code` # ! only defined for internal codes with $ symbol + # parents defined from `code` + # assigned properties can be Mandatory or Optional, can be PropertyType or ObjectType + # ? For OBJECT TYPES + # ? `generated_code_prefix`, `auto_generated_codes`? + for name, obj in inspect.getmembers(module, inspect.isclass): + # Ensure the class has the `model_to_rdf` method + if not hasattr(obj, "defs") or not callable(getattr(obj, "model_to_rdf")): + continue + try: + # Instantiate the class and call the method + entity = obj() + entity.model_to_rdf(namespace=BAM, graph=graph) + except Exception as err: + click.echo(f"Failed to process class {name} in {module_path}: {err}") diff --git a/bam_masterdata/cli/fill_masterdata.py b/bam_masterdata/cli/fill_masterdata.py index 05ac9e7..3b98586 100644 --- a/bam_masterdata/cli/fill_masterdata.py +++ b/bam_masterdata/cli/fill_masterdata.py @@ -55,7 +55,7 @@ class will inherit from `parent_class`. class_names (dict): A dictionary with the class names of the entities. default (str): The default parent class if the parent class does not exist. lines (list): A list of strings to be printed to the Python module. - + logger (BoundLoggerLazyProxy): The logger to log messages. Returns: tuple: The parent code, parent class, and class name of the entity. """ @@ -138,6 +138,9 @@ def generate_property_types(self) -> str: Generate Python code for the property types in the Openbis datamodel. The code is generated as a string which is then printed out to the specific Python module in `bam_masterdata/datamodel/property_types.py`. + Args: + logger (BoundLoggerLazyProxy): The logger to log messages. + Returns: str: Python code for the property types. 
""" @@ -154,7 +157,7 @@ def generate_property_types(self) -> str: continue # Format class name - class_name = code_to_class_name(code, entity_type="property") + class_name = code_to_class_name(code=code, entity_type="property") # Add class definition lines.append(f"{class_name} = PropertyTypeDef(") diff --git a/bam_masterdata/datamodel/property_types.py b/bam_masterdata/datamodel/property_types.py index 3762054..3510acc 100644 --- a/bam_masterdata/datamodel/property_types.py +++ b/bam_masterdata/datamodel/property_types.py @@ -3259,7 +3259,8 @@ ) -ProductCategory = PropertyTypeDef( +# ! Duplicated variable name for the property type definition (manually fixed) +ProductCategory1 = PropertyTypeDef( code="PRODUCT_CATEGORY", description="""Product Category (corresponds to field `Product Category` in the Hazardous Materials Inventory (GSM) of BAM)//Produktkategorie (entspricht Feld `Verwendungstypen/Produktkategorie` aus dem Gefahrstoffmanagement (GSM) der BAM))""", data_type="CONTROLLEDVOCABULARY", @@ -5897,7 +5898,8 @@ ) -ProductCategory = PropertyTypeDef( +# ! Duplicated variable name for the property type definition (manually fixed) +ProductCategory2 = PropertyTypeDef( code="PRODUCT.CATEGORY", description="""Category""", data_type="VARCHAR", diff --git a/bam_masterdata/metadata/definitions.py b/bam_masterdata/metadata/definitions.py index e909647..e5a4494 100644 --- a/bam_masterdata/metadata/definitions.py +++ b/bam_masterdata/metadata/definitions.py @@ -4,6 +4,8 @@ from pydantic import BaseModel, Field, field_validator, model_validator +from bam_masterdata.utils import code_to_class_name + class DataType(str, Enum): """Enumeration of the data types available in openBIS.""" @@ -74,6 +76,14 @@ class EntityDef(BaseModel): """, ) + id: Optional[str] = Field( + default=None, + description=""" + Identifier of the entity defined as the class name and used to serialize the entity definitions + in other formats. 
+ """, + ) + # TODO check ontology_id, ontology_version, ontology_annotation_id, internal (found in the openBIS docu) @field_validator("code") @@ -115,7 +125,29 @@ def excel_headers(self) -> list[str]: """ Returns the headers for the entity in a format suitable for the openBIS Excel file. """ - return [k.capitalize().replace("_", " ") for k in self.model_fields.keys()] + return [ + k.capitalize().replace("_", " ") + for k in self.model_fields.keys() + if k != "id" + ] + + @model_validator(mode="after") + @classmethod + def model_id(cls, data: Any) -> Any: + """ + Stores the model `id` as the class name from the `code` field. + + Args: + data (Any): The data containing the fields values to validate. + + Returns: + Any: The data with the validated fields. + """ + if "PropertyType" in data.name: + data.id = code_to_class_name(code=data.code, entity_type="property") + else: + data.id = code_to_class_name(code=data.code, entity_type="object") + return data class BaseObjectTypeDef(EntityDef): diff --git a/bam_masterdata/metadata/entities.py b/bam_masterdata/metadata/entities.py index fb97773..5943a33 100644 --- a/bam_masterdata/metadata/entities.py +++ b/bam_masterdata/metadata/entities.py @@ -1,7 +1,12 @@ import json -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional, no_type_check from pydantic import BaseModel, ConfigDict, Field, model_validator +from rdflib import BNode, Literal +from rdflib.namespace import DC, OWL, RDF, RDFS + +if TYPE_CHECKING: + from rdflib import Graph, Namespace from bam_masterdata.metadata.definitions import ( CollectionTypeDef, @@ -19,6 +24,33 @@ class BaseEntity(BaseModel): adding new methods that are useful for interfacing with openBIS. """ + @property + def cls_name(self) -> str: + """ + Returns the entity name of the class as a string to speed up checks. This is a property + to be overwritten by each of the abstract entity types. 
+ """ + return self.__class__.__name__ + + @property + def _base_attrs(self) -> list: + """ + List of base properties or terms assigned to an entity type. This are the direct properties or terms + assigned when defining a new entity type. + """ + cls_attrs = self.__class__.__dict__ + base_attrs = [ + attr_name + for attr_name in cls_attrs + if not ( + attr_name.startswith("_") + or callable(cls_attrs[attr_name]) + or attr_name + in ["defs", "model_config", "model_fields", "model_computed_fields"] + ) + ] + return [getattr(self, attr_name) for attr_name in base_attrs] + def model_to_json(self, indent: Optional[int] = None) -> str: """ Returns the model as a string in JSON format storing the data `defs` and the property or @@ -52,13 +84,63 @@ def model_to_dict(self) -> dict: dump_json = self.model_to_json() return json.loads(dump_json) - @property - def cls_name(self) -> str: - """ - Returns the entity name of the class as a string to speed up checks. This is a property - to be overwritten by each of the abstract entity types. - """ - return self.__class__.__name__ + # skos:prefLabel used for class names + # skos:definition used for `description` (en, de) + # dc:identifier used for `code` # ! only defined for internal codes with $ symbol + # parents defined from `code` + # assigned properties can be Mandatory or Optional, can be PropertyType or ObjectType + # ? For OBJECT TYPES + # ? `generated_code_prefix`, `auto_generated_codes`? + @no_type_check + def model_to_rdf(self, namespace: "Namespace", graph: "Graph") -> None: + entity_uri = namespace[self.defs.id] + + # Define the entity as an OWL class inheriting from the specific namespace type + graph.add((entity_uri, RDF.type, OWL.Thing)) + parent_classes = self.__class__.__bases__ + for parent_class in parent_classes: + if issubclass(parent_class, BaseEntity) and parent_class != BaseEntity: + # if parent_class.__name__ in [ + # "ObjectType", + # "CollectionType", + # "DatasetType", + # ]: + # # ! 
add here logic of subClassOf connecting with PROV-O or BFO + # # ! maybe via classes instead of ObjectType/CollectionType/DatasetType? + # # ! Example: + # # ! graph.add((entity_uri, RDFS.subClassOf, "http://www.w3.org/ns/prov#Entity")) + # continue + parent_uri = namespace[parent_class.__name__] + graph.add((entity_uri, RDFS.subClassOf, parent_uri)) + + # Add attributes like id, code, description in English and Deutsch, property_label, data_type + graph.add((entity_uri, RDFS.label, Literal(self.defs.id, lang="en"))) + graph.add((entity_uri, DC.identifier, Literal(self.defs.code))) + descriptions = self.defs.description.split("//") + if len(descriptions) > 1: + graph.add((entity_uri, RDFS.comment, Literal(descriptions[0], lang="en"))) + graph.add((entity_uri, RDFS.comment, Literal(descriptions[1], lang="de"))) + else: + graph.add( + (entity_uri, RDFS.comment, Literal(self.defs.description, lang="en")) + ) + # Adding properties relationships to the entities + for assigned_prop in self._base_attrs: + prop_uri = namespace[assigned_prop.id] + restriction = BNode() + graph.add((restriction, RDF.type, OWL.Restriction)) + if assigned_prop.mandatory: + graph.add( + (restriction, OWL.onProperty, namespace["hasMandatoryProperty"]) + ) + else: + graph.add( + (restriction, OWL.onProperty, namespace["hasOptionalProperty"]) + ) + graph.add((restriction, OWL.someValuesFrom, prop_uri)) + + # Add the restriction as a subclass of the entity + graph.add((entity_uri, RDFS.subClassOf, restriction)) class ObjectType(BaseEntity): @@ -89,6 +171,13 @@ class ObjectType(BaseEntity): """, ) + @property + def cls_name(self) -> str: + """ + Returns the entity name of the class as a string. 
+ """ + return "ObjectType" + @model_validator(mode="after") @classmethod def model_validator_after_init(cls, data: Any) -> Any: @@ -109,13 +198,6 @@ def model_validator_after_init(cls, data: Any) -> Any: return data - @property - def cls_name(self) -> str: - """ - Returns the entity name of the class as a string. - """ - return "ObjectType" - class VocabularyType(BaseEntity): """ @@ -135,6 +217,13 @@ class VocabularyType(BaseEntity): """, ) + @property + def cls_name(self) -> str: + """ + Returns the entity name of the class as a string. + """ + return "VocabularyType" + @model_validator(mode="after") @classmethod def model_validator_after_init(cls, data: Any) -> Any: @@ -155,13 +244,6 @@ def model_validator_after_init(cls, data: Any) -> Any: return data - @property - def cls_name(self) -> str: - """ - Returns the entity name of the class as a string. - """ - return "VocabularyType" - class CollectionType(ObjectType): @property diff --git a/bam_masterdata/utils/__init__.py b/bam_masterdata/utils/__init__.py index 6f16e11..286b484 100644 --- a/bam_masterdata/utils/__init__.py +++ b/bam_masterdata/utils/__init__.py @@ -1,6 +1,7 @@ from .utils import ( code_to_class_name, delete_and_create_dir, + duplicated_property_types, import_module, listdir_py_modules, load_validation_rules, diff --git a/bam_masterdata/utils/utils.py b/bam_masterdata/utils/utils.py index d3c1d49..5139a12 100644 --- a/bam_masterdata/utils/utils.py +++ b/bam_masterdata/utils/utils.py @@ -1,24 +1,30 @@ import glob import importlib.util +import inspect import json import os +import re import shutil from itertools import chain -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Optional + +from bam_masterdata.logger import logger if TYPE_CHECKING: from structlog._config import BoundLoggerLazyProxy def delete_and_create_dir( - directory_path: str, logger: "BoundLoggerLazyProxy", force_delete: bool = False + directory_path: str, + logger: "BoundLoggerLazyProxy" = logger, + 
force_delete: bool = False, ) -> None: """ Deletes the directory at `directory_path` and creates a new one in the same path. Args: directory_path (str): The directory path to delete and create the folder. - logger (BoundLoggerLazyProxy): The logger to log messages.. + logger (BoundLoggerLazyProxy): The logger to log messages. Default is `logger`. force_delete (bool): If True, the directory will be forcibly deleted if it exists. """ if not directory_path: @@ -45,7 +51,7 @@ def delete_and_create_dir( def listdir_py_modules( - directory_path: str, logger: "BoundLoggerLazyProxy" + directory_path: str, logger: "BoundLoggerLazyProxy" = logger ) -> list[str]: """ Recursively goes through the `directory_path` and returns a list of all .py files that do not start with '_'. If @@ -53,7 +59,7 @@ def listdir_py_modules( Args: directory_path (str): The directory path to search through. - logger (BoundLoggerLazyProxy): The logger to log messages. + logger (BoundLoggerLazyProxy): The logger to log messages. Default is `logger`. Returns: list[str]: A list of all .py files that do not start with '_' @@ -96,7 +102,11 @@ def import_module(module_path: str) -> Any: return module -def code_to_class_name(code: str, entity_type: str = "object") -> str: +def code_to_class_name( + code: Optional[str], + logger: "BoundLoggerLazyProxy" = logger, + entity_type: str = "object", +) -> str: """ Converts an openBIS `code` to a class name by capitalizing each word and removing special characters. In the special case the entity is a property type, it retains the full name separated by points instead of @@ -104,10 +114,17 @@ def code_to_class_name(code: str, entity_type: str = "object") -> str: Args: code (str): The openBIS code to convert to a class name. + logger (BoundLoggerLazyProxy): The logger to log messages. Default is `logger`. entity_type (str): The type of entity to convert. Default is "object". Returns: str: The class name derived from the openBIS code. 
""" + if not code: + logger.error( + "The `code` is empty. Please, provide a proper input to the function." + ) + return "" + if entity_type == "property": code_names = chain.from_iterable( [c.split("_") for c in code.lstrip("$").split(".")] @@ -145,3 +162,44 @@ def load_validation_rules( except json.JSONDecodeError as e: logger.error(f"Error parsing validation rules JSON: {e}") raise ValueError(f"Error parsing validation rules JSON: {e}") + + +from pathlib import Path + + +def duplicated_property_types(module_path: str, logger: "BoundLoggerLazyProxy") -> dict: + """ + Find the duplicated property types in a module specified by `module_path` and returns a dictionary + containing the duplicated property types class names as keys and the lines where they matched as values. + + Args: + module_path (str): The path to the module containing the property types. + logger (BoundLoggerLazyProxy): The logger to log messages. + + Returns: + dict: A dictionary containing the duplicated property types class names as keys and the + lines where they matched as values. + """ + duplicated_props: dict = {} + module = import_module(module_path=module_path) + source_code = inspect.getsource(module) + for name, _ in inspect.getmembers(module): + if name.startswith("_") or name == "PropertyTypeDef": + continue + + pattern = rf"^\s*{name} *= *PropertyTypeDef" + + # Find all matching line numbers + matches = [ + i + 1 # Convert to 1-based index + for i, line in enumerate(source_code.splitlines()) + if re.match(pattern, line) + ] + if len(matches) > 1: + duplicated_props[name] = matches + if duplicated_props: + logger.critical( + f"Found {len(duplicated_props)} duplicated property types. 
These are stored in a dictionary " + f"where the keys are the names of the variables in property_types.py and the values are the lines in the module: {duplicated_props}" + ) + return duplicated_props diff --git a/pyproject.toml b/pyproject.toml index 29ee93a..b49c0d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,8 @@ dependencies = [ "pybis~=1.37.1rc4", "openpyxl", "click", - "pydantic", + "pydantic~=2.10.5", + "rdflib", ] [project.urls] diff --git a/tests/cli/test_entities_to_rdf.py b/tests/cli/test_entities_to_rdf.py new file mode 100644 index 0000000..8a6e08b --- /dev/null +++ b/tests/cli/test_entities_to_rdf.py @@ -0,0 +1,111 @@ +import os + +from rdflib import Graph, Literal +from rdflib.namespace import DC, OWL, RDF, RDFS + +from bam_masterdata.cli.entities_to_rdf import BAM, entities_to_rdf, rdf_graph_init +from bam_masterdata.logger import logger + + +def test_rdf_init(): + """ + Test the `rdf_graph_init` function. + """ + graph = Graph() + rdf_graph_init(graph) + + # Test how many nodes initialize in the graph + assert len(graph) == 30 + + # Check if base namespaces are bound correctly. + expected_namespaces = {"dc", "owl", "rdf", "rdfs", "bam", "prov"} + bound_namespaces = {prefix for prefix, _ in graph.namespaces()} + expected_namespaces.issubset(bound_namespaces) + + # Ensure standard annotation properties exist with correct types. + annotation_props = [RDFS.label, RDFS.comment, DC.identifier] + for prop in annotation_props: + assert (prop, RDF.type, OWL.AnnotationProperty) in graph + + # Verify bam:dataType and bam:propertyLabel exist with labels and comments. 
+ custom_props = { + BAM["dataType"]: "Represents the data type of a property", + BAM["propertyLabel"]: "A UI-specific annotation used in openBIS", + } + for prop, comment_start in custom_props.items(): + assert (prop, RDF.type, OWL.AnnotationProperty) in graph + assert ( + prop, + RDFS.label, + Literal(f"bam:{prop.split('/')[-1]}", lang="en"), + ) in graph + assert any( + o.startswith(comment_start) + for _, _, o in graph.triples((prop, RDFS.comment, None)) + ) + + # Check that BAM object properties exist and have correct characteristics. + bam_props = { + BAM["hasMandatoryProperty"]: "The property must be mandatorily filled", + BAM["hasOptionalProperty"]: "The property is optionally filled", + BAM["referenceTo"]: "The property is referencing an object", + } + for prop, comment_start in bam_props.items(): + assert (prop, RDF.type, OWL.ObjectProperty) in graph + assert any( + o.startswith(comment_start) + for _, _, o in graph.triples((prop, RDFS.comment, None)) + ) + + # Ensure PropertyType and related objects exist with labels and comments. + prop_type_uri = BAM["PropertyType"] + assert (prop_type_uri, RDF.type, OWL.Thing) in graph + assert (prop_type_uri, RDFS.label, Literal("PropertyType", lang="en")) in graph + assert any( + o.startswith("A conceptual placeholder used to define") + for _, _, o in graph.triples((prop_type_uri, RDFS.comment, None)) + ) + + +def test_entities_to_rdf(): + module_name = "object_types" # ! only one module for testing + module_path = os.path.join("./bam_masterdata/datamodel", f"{module_name}.py") + + graph = Graph() + rdf_graph_init(graph) + entities_to_rdf(graph=graph, module_path=module_path, logger=logger) + + # Testing + # ! 
this number is subject to change as the datamodel evolves + assert len(graph) == 5794 + + # Check Instrument entity + instrument_uri = BAM["Instrument"] + assert (instrument_uri, RDF.type, OWL.Thing) in graph + assert (instrument_uri, RDFS.label, Literal("Instrument", lang="en")) in graph + assert ( + instrument_uri, + RDFS.comment, + Literal("Measuring Instrument", lang="en"), + ) in graph + assert ( + instrument_uri, + RDFS.comment, + Literal("Messgerät", lang="de"), + ) in graph + + # Check Camera entity (subclass of Instrument) + camera_uri = BAM["Camera"] + assert (camera_uri, RDF.type, OWL.Thing) in graph + assert (camera_uri, RDFS.subClassOf, instrument_uri) in graph + assert (camera_uri, RDFS.label, Literal("Camera", lang="en")) in graph + assert ( + camera_uri, + RDFS.comment, + Literal("A generic camera device for recording video or photos", lang="en"), + ) in graph + assert ( + camera_uri, + RDFS.comment, + Literal("Eine generische Kamera für Video- oder Fotoaufnahmen", lang="de"), + ) in graph diff --git a/tests/data/utils/example_prop_types_1.py b/tests/data/utils/example_prop_types_1.py new file mode 100644 index 0000000..af69407 --- /dev/null +++ b/tests/data/utils/example_prop_types_1.py @@ -0,0 +1,23 @@ +from bam_masterdata.metadata.definitions import PropertyTypeDef + +PropA = PropertyTypeDef( + code="PROPA", + description="""repeated property""", + data_type="VARCHAR", + property_label="A1", +) + + +PropB = PropertyTypeDef( + code="PROPB", + description="""non-repeated property""", + data_type="VARCHAR", + property_label="B", +) + +PropA = PropertyTypeDef( + code="PROPA", + description="""repeated property""", + data_type="VARCHAR", + property_label="A2", +) diff --git a/tests/data/utils/example_prop_types_2.py b/tests/data/utils/example_prop_types_2.py new file mode 100644 index 0000000..c8c98be --- /dev/null +++ b/tests/data/utils/example_prop_types_2.py @@ -0,0 +1,16 @@ +from bam_masterdata.metadata.definitions import PropertyTypeDef + +PropA 
= PropertyTypeDef( + code="PROPA", + description="""non-repeated property""", + data_type="VARCHAR", + property_label="A", +) + + +PropB = PropertyTypeDef( + code="PROPB", + description="""non-repeated property""", + data_type="VARCHAR", + property_label="B", +) diff --git a/tests/metadata/test_definitions.py b/tests/metadata/test_definitions.py index 88f55dc..ba20ff8 100644 --- a/tests/metadata/test_definitions.py +++ b/tests/metadata/test_definitions.py @@ -47,32 +47,33 @@ def test_fields(self): """Test the existing defined fields of the `EntityDef` class.""" names = list(EntityDef.model_fields.keys()) field_types = [val.annotation for val in list(EntityDef.model_fields.values())] - assert names == ["code", "description"] - assert field_types == [str, str] + assert names == ["code", "description", "id"] + assert field_types == [str, str, Optional[str]] @pytest.mark.parametrize( - "code, description, is_valid", + "code, description, id, is_valid", [ # `code` in capital and separated by underscores - ("EXPERIMENTAL_STEP", "Valid description", True), + ("EXPERIMENTAL_STEP", "Valid description", "ExperimentalStep", True), # `code` starting with $ symbol - ("$NAME", "Valid description", True), + ("$NAME", "Valid description", "Name", True), # `code` separating inheritance with points - ("WELDING_EQUIPMENT.INSTRUMENT", "Valid description", True), + ("WELDING_EQUIPMENT.INSTRUMENT", "Valid description", "Instrument", True), # Invalid `code` - ("INVALID CODE", "Valid description", False), + ("INVALID CODE", "Valid description", None, False), # `description` is not a string - ("EXPERIMENTAL_STEP", 2, False), + ("EXPERIMENTAL_STEP", 2, None, False), # Empty `code` - ("", "Valid description", False), + ("", "Valid description", "", False), ], ) - def test_entity_def(self, code: str, description: str, is_valid: bool): + def test_entity_def(self, code: str, description: str, id: str, is_valid: bool): """Test creation of `EntityDef` and field validation.""" if is_valid: entity 
= EntityDef(code=code, description=description) assert entity.code == code assert entity.description == description + assert entity.id == id else: with pytest.raises(ValueError): EntityDef(code=code, description=description) @@ -116,8 +117,8 @@ def test_fields(self): field_types = [ val.annotation for val in list(BaseObjectTypeDef.model_fields.values()) ] - assert names == ["code", "description", "validation_script"] - assert field_types == [str, str, Optional[str]] + assert names == ["code", "description", "id", "validation_script"] + assert field_types == [str, str, Optional[str], Optional[str]] class TestCollectionTypeDef: @@ -127,8 +128,8 @@ def test_fields(self): field_types = [ val.annotation for val in list(CollectionTypeDef.model_fields.values()) ] - assert names == ["code", "description", "validation_script"] - assert field_types == [str, str, Optional[str]] + assert names == ["code", "description", "id", "validation_script"] + assert field_types == [str, str, Optional[str], Optional[str]] class TestDatasetTypeDef: @@ -141,6 +142,7 @@ def test_fields(self): assert names == [ "code", "description", + "id", "validation_script", "main_dataset_pattern", "main_dataset_path", @@ -151,6 +153,7 @@ def test_fields(self): Optional[str], Optional[str], Optional[str], + Optional[str], ] @@ -164,11 +167,19 @@ def test_fields(self): assert names == [ "code", "description", + "id", "validation_script", "generated_code_prefix", "auto_generated_codes", ] - assert field_types == [str, str, Optional[str], Optional[str], bool] + assert field_types == [ + str, + str, + Optional[str], + Optional[str], + Optional[str], + bool, + ] @pytest.mark.parametrize( "code, generated_code_prefix, result", @@ -202,6 +213,7 @@ def test_fields(self): assert names == [ "code", "description", + "id", "property_label", "data_type", "vocabulary_code", @@ -212,6 +224,7 @@ def test_fields(self): assert field_types == [ str, str, + Optional[str], str, DataType, Optional[str], @@ -231,6 +244,7 @@ def 
test_fields(self): assert names == [ "code", "description", + "id", "property_label", "data_type", "vocabulary_code", @@ -246,6 +260,7 @@ def test_fields(self): assert field_types == [ str, str, + Optional[str], str, DataType, Optional[str], @@ -267,8 +282,8 @@ def test_fields(self): field_types = [ val.annotation for val in list(VocabularyTypeDef.model_fields.values()) ] - assert names == ["code", "description", "url_template"] - assert field_types == [str, str, Optional[str]] + assert names == ["code", "description", "id", "url_template"] + assert field_types == [str, str, Optional[str], Optional[str]] class TestVocabularyTerm: @@ -281,8 +296,9 @@ def test_fields(self): assert names == [ "code", "description", + "id", "url_template", "label", "official", ] - assert field_types == [str, str, Optional[str], str, bool] + assert field_types == [str, str, Optional[str], Optional[str], str, bool] diff --git a/tests/metadata/test_entities.py b/tests/metadata/test_entities.py index fefe255..f64e642 100644 --- a/tests/metadata/test_entities.py +++ b/tests/metadata/test_entities.py @@ -12,7 +12,7 @@ def test_model_to_json(self): entity = generate_base_entity() assert ( entity.model_to_json() - == '{"defs": {"code": "MOCKED_ENTITY", "description": "Mockup for an entity definition//Mockup f\\u00fcr eine Entit\\u00e4tsdefinition", "validation_script": null, "generated_code_prefix": "MOCKENT", "auto_generated_codes": true}}' + == '{"defs": {"code": "MOCKED_ENTITY", "description": "Mockup for an entity definition//Mockup f\\u00fcr eine Entit\\u00e4tsdefinition", "id": "MockedEntity", "validation_script": null, "generated_code_prefix": "MOCKENT", "auto_generated_codes": true}}' ) def test_model_to_dict(self): @@ -22,6 +22,7 @@ def test_model_to_dict(self): "defs": { "code": "MOCKED_ENTITY", "description": "Mockup for an entity definition//Mockup für eine Entitätsdefinition", + "id": "MockedEntity", "validation_script": None, "generated_code_prefix": "MOCKENT", 
"auto_generated_codes": True, diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index a11dfe2..30cde06 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -10,6 +10,7 @@ from bam_masterdata.utils import ( code_to_class_name, delete_and_create_dir, + duplicated_property_types, import_module, listdir_py_modules, load_validation_rules, @@ -62,7 +63,12 @@ def test_delete_and_create_dir( "warning", ), # No Python files found in the directory - ("./tests/data", [], "No Python files found in the directory.", "info"), + ( + "./tests/data/empty", + [], + "No Python files found in the directory.", + "info", + ), # Python files found in the directory ( "./tests/utils", @@ -119,6 +125,9 @@ def test_import_module(): @pytest.mark.parametrize( "code, entity_type, result", [ + # No code + (None, "object", ""), + ("", "object", ""), # for entities which are objects # normal code ("NORMAL", "object", "Normal"), @@ -156,7 +165,7 @@ def test_import_module(): ], ) def test_code_to_class_name(code: str, entity_type: str, result: str): - assert code_to_class_name(code, entity_type) == result + assert code_to_class_name(code, logger, entity_type) == result @pytest.mark.parametrize( @@ -267,3 +276,19 @@ def test_load_validation_rules( assert result == expected_output assert cleared_log_storage[-1]["event"] == expected_log assert cleared_log_storage[-1]["level"] == "info" + + +@pytest.mark.parametrize( + "path, result", + [ + # PropA appears twice + ("tests/data/utils/example_prop_types_1.py", {"PropA": [3, 18]}), + # None duplicated + ("tests/data/utils/example_prop_types_2.py", {}), + ], +) +def test_duplicated_property_types(cleared_log_storage: list, path: str, result: dict): + assert result == duplicated_property_types(path, logger) + if result: + assert cleared_log_storage[0]["level"] == "critical" + assert "Found 1 duplicated property types" in cleared_log_storage[0]["event"]