Automate loading the RDF into Neo4j and cleaning the graph database

RomanoLab · Apr 28, 2022 · 680232f · 680232f
1 parent 0b24950
commit 680232f
Show file tree

Hide file tree

Showing 9 changed files with 2,658 additions and 5 deletions.
diff --git a/.gitignore b/.gitignore
@@ -127,3 +127,6 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+
+tests/projects/*
diff --git a/README.md b/README.md
@@ -1,5 +1,5 @@
 # ista
 
-> **ista-** _N._ [ˈistɑ] — (Sindarin) to have knowledge
+> **ista** _N._ [ˈistɑ] — (Sindarin) Knowledge
 
 A toolkit for building semantic graph knowledge bases.
diff --git a/ista/__init__.py b/ista/__init__.py
@@ -2,4 +2,5 @@
 
 _OWL = owlready2.get_ontology("http://www.w3.org/2002/07/owl#")
 
-from .database_parser import FlatFileDatabaseParser, MySQLDatabaseParser
+from .database_parser import FlatFileDatabaseParser, MySQLDatabaseParser
+from .load_kb import load_kb
diff --git a/ista/database_parser.py b/ista/database_parser.py
@@ -629,4 +629,6 @@ def parse_node_type(
         c.close()
 
         if no_node_added:
-            print("WARNING: NO NODES/PROPERTIES ADDED TO ONTOLOGY")
+            print("WARNING: NO NODES/PROPERTIES ADDED TO ONTOLOGY")
+
+# class XMLDatabaseParser(DatabaseParser):
diff --git a/ista/load_kb.py b/ista/load_kb.py
@@ -0,0 +1,119 @@
+import neo4j
+import os, sys
+
+def _print_header_info():
+    header = """
+This function loads an `ista` knowledge base into Neo4j.
+
+Before continuing, you need to prepare an empty graph database with the correct
+configuration and plugins.
+
+Press Enter/Return to continue.
+    """[1:]
+    print(header)
+    input()
+
+    install = """
+If you haven't done so already, create a new (empty) Neo4j database. v4.4.0 is
+recommended, but other versions may work - just be aware that they have not been
+tested yet. If you are using Neo4j Desktop, you'll probably want to make this
+new database inside of a new Project.
+
+IF YOU SET A PASSWORD, MAKE SURE TO REMEMBER WHAT IT IS!
+
+(OPTIONAL)
+Before starting the database for the first time, open the settings.json file and
+make the following changes, which increase Neo4j's memory limits:
+    dbms.memory.heap.initial_size=2048m
+    dbms.memory.heap.max_size=4G
+    dbms.memory.pagecache.size=2048m
+For smaller knowledge bases, these may not be necessary.
+
+Install the `Neosemantics (n10s)` library. If you are doing so manually (e.g.,
+when running Neo4j Server rather than Neo4j Desktop), make sure you are using a
+compatible version. You might also want to add the Graph Data Science Library
+and APOC library, both of which add nice features to the graph database.
+
+Now, start the graph database.
+
+When you are sure it is running, press Enter/Return to continue.
+    """[1:]
+    print(install)
+    input()
+
+class SimpleGraphDBDriver():
+
+    def __init__(self, uri, user, password):
+        self.driver = neo4j.GraphDatabase.driver(uri, auth=(user, password))
+
+    def close(self):
+        self.driver.close()
+
+    def query(self, cypher_query: str):
+        print("  RUNNING QUERY:")
+        print(f"   {cypher_query}")
+        with self.driver.session() as session:
+            res = session.write_transaction(self._run_transaction, cypher_query)
+            return res
+
+    @staticmethod
+    def _run_transaction(tx, qry):
+        result = tx.run(qry)
+        return result
+
+def load_kb(rdf_filepath: str):
+    """Load a graph knowledge base created using `ista` into Neoj4.
+
+    This function will provide a list of prerequisite steps to make sure the
+    use has a clean/empty database running with the correct configuration and
+    plugins installed.
+    """
+    _print_header_info()
+
+    print("Python will now connect to Neo4j and perform the rest of the setup")
+    print("automatically.")
+    print()
+    print("Please provide authentication details for the (new) database:")
+    my_uri = input("Neo4j URI [default: `bolt://localhost:7687`]: ")
+    my_username = input("Neo4j username [default: `neo4j`]: ")
+    my_password = input("Neo4j password: ")
+
+    if my_uri == '':
+        my_uri = 'bolt://localhost:7687'
+    if my_username == '':
+        my_username = 'neo4j'
+
+    db = SimpleGraphDBDriver(my_uri, my_username, my_password)
+
+    if os.name == 'nt':
+        fname_format = rdf_filepath.replace('\\', '\\\\')
+    else:
+        fname_format = rdf_filepath
+
+    db.query("MATCH (n) DETACH DELETE n;")
+
+    try:
+        db.query("CREATE CONSTRAINT n10s_unique_uri ON (r:Resource) ASSERT r.uri IS UNIQUE;")
+    except neo4j.exceptions.ClientError:
+        print("Constraint already exists - skipping.")
+    db.query("CALL n10s.graphconfig.init();")
+    db.query("CALL n10s.graphconfig.set({applyNeo4jNaming: true, handleVocabUris: 'IGNORE'});")
+    db.query(f"CALL n10s.rdf.import.fetch(\"file:///{fname_format}\", \"RDF/XML\");")
+
+    print()
+    print("Database contents loaded; performing post-install cleanup...")
+
+    db.query("MATCH (n:Resource) REMOVE n:Resource;")
+    db.query("MATCH (n:NamedIndividual) REMOVE n:NamedIndividual;")
+    db.query("MATCH (n:AllDisjointClasses) REMOVE n:AllDisjointClasses;")
+    db.query("MATCH (n:AllDisjointProperties) REMOVE n:AllDisjointProperties;")
+    db.query("MATCH (n:DatatypeProperty) REMOVE n:DatatypeProperty;")
+    db.query("MATCH (n:FunctionalProperty) REMOVE n:FunctionalProperty;")
+    db.query("MATCH (n:ObjectProperty) REMOVE n:ObjectProperty;")
+    db.query("MATCH (n:AnnotationProperty) REMOVE n:AnnotationProperty;")
+    db.query("MATCH (n:SymmetricProperty) REMOVE n:SymmetricProperty;")
+    db.query("MATCH (n:_GraphConfig) REMOVE n:_GraphConfig;")
+    db.query("MATCH (n:Ontology) REMOVE n:Ontology;")
+    db.query("MATCH (n:Restriction) REMOVE n:Restriction;")
+    db.query("MATCH (n:Class) REMOVE n:Class;")
+    db.query("MATCH (n) WHERE size(labels(n)) = 0 DETACH DELETE n;") # Removes nodes without labels
diff --git a/ista/util.py b/ista/util.py
@@ -1,5 +1,7 @@
 import owlready2
 
+import ipdb
+
 _OWL = owlready2.get_ontology("http://www.w3.org/2002/07/owl#")
 
 def safe_add_property(entity, prop, value):
@@ -35,7 +37,7 @@ def get_onto_class_by_node_type(ont: owlready2.namespace.Ontology, node_label: s
     -----
     This should be refactored if/when a better solution is available!
     """
-    matches = [c for c in onto.classes() if str(c).split(".")[-1] == node_label]
+    matches = [c for c in ont.classes() if str(c).split(".")[-1] == node_label]
     if len(matches) == 1:
         return matches[0]
     elif len(matches) == 0:
@@ -66,4 +68,35 @@ def safe_make_individual_name(
         nm = indiv_name.strip().replace(" ", "_").lower()
     else:
         nm = indiv_name
-    return "{0}_{1}".format(cl, nm)
+    return "{0}_{1}".format(cl, nm)
+
+def print_onto_stats(onto: owlready2.Ontology):
+    """Print summary statistics for an OWL2 ontology loaded into `owlready2`.
+    """
+
+    print()
+    print("*******************")
+    print("ONTOLOGY STATISTICS")
+    print("*******************")
+    print()
+
+    # classes, etc.
+    ont_classes = [x.name for x in onto.classes()]
+    ont_object_properties = [y.name for y in onto.object_properties()]
+
+    # individuals
+    print("Individual counts:")
+    for cl in onto.classes():
+        name = cl.name
+        this_class_count = len(onto.get_instances_of(cl))
+        if this_class_count > 0:
+            print(f"{name}: {this_class_count}")
+
+    # relationships
+    print()
+    print("Relationship counts:")
+    for op in onto.object_properties():
+        name = op.name
+        this_op_count = len(list(op.get_relations()))
+        if this_op_count > 0:
+            print(f"{name}: {this_op_count}")
diff --git a/test.rdf b/test.rdf
diff --git a/tests/config/comptoxai.py b/tests/config/comptoxai.py
@@ -0,0 +1,116 @@
+from ista import FlatFileDatabaseParser, MySQLDatabaseParser
+
+epa = FlatFileDatabaseParser()
+
+ncbigene = FlatFileDatabaseParser("ncbigene", onto)
+drugbank = FlatFileDatabaseParser("drugbank", onto)
+hetionet = FlatFileDatabaseParser("hetionet", onto)
+aopdb = MySQLDatabaseParser("aopdb", onto, mysql_config)
+aopwiki = FlatFileDatabaseParser("aopwiki", onto)
+tox21 = FlatFileDatabaseParser("tox21", onto)
+disgenet = FlatFileDatabaseParser("disgenet", onto)
+
+
+#####################
+# EPA COMPTOX NODES #
+#####################
+# Three passes over "Chemical" nodes, each reading a different source file.
+# The first pass is run with merge=False; the second and third are run with
+# merge=True and name xrefDTXSID as the merge property.
+# NOTE(review): presumably merge=True matches rows to existing individuals via
+# "merge_column" - confirm against parse_node_type.
+
+# Pass 1: PubChem CID/SID <-> DTXSID mapping (TSV with a header row).
+epa.parse_node_type(
+    node_type="Chemical",
+    source_filename="PubChem_DTXSID_mapping_file.txt",
+    fmt="tsv",
+    parse_config={
+        "iri_column_name": "DTXSID",
+        "headers": True,
+        "data_property_map": {
+            "CID": onto.xrefPubchemCID,
+            "SID": onto.xrefPubchemSID,
+            "DTXSID": onto.xrefDTXSID,
+        },
+    },
+    merge=False,
+    skip=False
+)
+# Pass 2: CAS numbers and preferred names (CSV with a header row), merged on
+# the dsstox_substance_id column / xrefDTXSID property.
+epa.parse_node_type(
+    node_type="Chemical",
+    source_filename="Dsstox_CAS_number_name.csv",
+    fmt="csv",
+    parse_config={
+        "iri_column_name": "dsstox_substance_id",
+        "headers": True,
+        "data_property_map": {
+            "casrn": onto.xrefCasRN,
+            "preferred_name": onto.commonName,
+            "dsstox_substance_id": onto.xrefDTXSID,
+        },
+        "merge_column": {
+            "source_column_name": "dsstox_substance_id",
+            "data_property": onto.xrefDTXSID,
+        },
+    },
+    merge=True,
+    skip=False
+)
+# Pass 3: MACCS fingerprints (TSV with a header row), merged on the DTXSID
+# column / xrefDTXSID property.
+epa.parse_node_type(
+    node_type="Chemical",
+    source_filename="CUSTOM/chemical_maccs_fingerprints.tsv",
+    fmt="tsv",
+    parse_config={
+        "iri_column_name": "DTXSID",
+        "headers": True,
+        "data_property_map": {
+            "DTXSID": onto.xrefDTXSID,
+            "MACCS": onto.maccs
+        },
+        "merge_column": {
+            "source_column_name": "DTXSID",
+            "data_property": onto.xrefDTXSID
+        }
+    },
+    merge=True,
+    skip_create_new_node=True,  # Don't create an empty chemical node with just a MACCS property if the chemical isn't already in the ontology (the merge column here is DTXSID)
+    skip=False
+)
+
+##################
+# CHEMICAL LISTS #
+##################
+# "ChemicalList" nodes read from a TSV with a header row; run with merge=False.
+epa.parse_node_type(
+    node_type="ChemicalList",
+    source_filename="CUSTOM/Chemical Lists.tsv",
+    fmt="tsv",
+    parse_config={
+        "iri_column_name": "LIST_ACRONYM",
+        "headers": True,
+        "data_property_map": {
+            "LIST_ACRONYM": onto.listAcronym,
+            "LIST_NAME": onto.commonName,
+            "LIST_DESCRIPTION": onto.listDescription
+        },
+        "data_transforms": {
+            # Keep only the text after the last '/' in the LIST_ACRONYM value.
+            "LIST_ACRONYM": lambda x: x.split('/')[-1]
+        }
+    },
+    merge=False,
+    skip=False
+)
+
+###############################
+# Chemical List relationships #
+###############################
+# listIncludesChemical relationships (inverse: chemicalInList) read from a TSV
+# with a header row. Subjects are ChemicalList nodes identified by the
+# "list_acronym" column / listAcronym property; objects are Chemical nodes
+# identified by the "casrn" column / xrefCasRN property.
+# NOTE(review): presumably rows whose subject or object cannot be matched are
+# skipped - confirm against parse_relationship_type.
+epa.parse_relationship_type(
+    relationship_type=onto.listIncludesChemical,
+    inverse_relationship_type=onto.chemicalInList,
+    source_filename="CUSTOM/chemical_lists_relationships.tsv",
+    fmt="tsv",
+    parse_config = {
+        "subject_node_type": onto.ChemicalList,
+        "subject_column_name": "list_acronym",
+        "subject_match_property": onto.listAcronym,
+        "object_node_type": onto.Chemical,
+        "object_column_name": "casrn",
+        "object_match_property": onto.xrefCasRN,
+        "headers": True
+    },
+    merge=True,
+    skip=False
+)