Skip to content

Commit

Permalink
Automate loading the RDF into Neo4j and cleaning the graph database
Browse files Browse the repository at this point in the history
  • Loading branch information
JDRomano2 committed Apr 28, 2022
1 parent 0b24950 commit 680232f
Show file tree
Hide file tree
Showing 9 changed files with 2,658 additions and 5 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,6 @@ dmypy.json

# Pyre type checker
.pyre/


tests/projects/*
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# ista

> **ista-** _N._ [ˈistɑ] — (Sindarin) to have knowledge
> **ista** _N._ [ˈistɑ] — (Sindarin) Knowledge
A toolkit for building semantic graph knowledge bases.
3 changes: 2 additions & 1 deletion ista/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@

_OWL = owlready2.get_ontology("http://www.w3.org/2002/07/owl#")

from .database_parser import FlatFileDatabaseParser, MySQLDatabaseParser
from .database_parser import FlatFileDatabaseParser, MySQLDatabaseParser
from .load_kb import load_kb
4 changes: 3 additions & 1 deletion ista/database_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,4 +629,6 @@ def parse_node_type(
c.close()

if no_node_added:
print("WARNING: NO NODES/PROPERTIES ADDED TO ONTOLOGY")
print("WARNING: NO NODES/PROPERTIES ADDED TO ONTOLOGY")

# class XMLDatabaseParser(DatabaseParser):
119 changes: 119 additions & 0 deletions ista/load_kb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import neo4j
import os, sys

def _print_header_info():
    """Show the manual Neo4j preparation steps, pausing after each part.

    Two blocks of instructions are printed in order. After each one the
    function waits on ``input()`` until the user presses Enter/Return.
    """
    # Each literal opens with a newline purely for source layout; the `[1:]`
    # slice drops that first character so output starts on the first text line.
    intro_text = """
This function loads an `ista` knowledge base into Neo4j.
Before continuing, you need to prepare an empty graph database with the correct
configuration and plugins.
Press Enter/Return to continue.
"""[1:]

    setup_text = """
If you haven't done so already, create a new (empty) Neo4j database. v4.4.0 is
recommended, but other versions may work - just be aware that they have not been
tested yet. If you are using Neo4j Desktop, you'll probably want to make this
new database inside of a new Project.
IF YOU SET A PASSWORD, MAKE SURE TO REMEMBER WHAT IT IS!
(OPTIONAL)
Before starting the database for the first time, open the settings.json file and
make the following changes, which increase Neo4j's memory limits:
dbms.memory.heap.initial_size=2048m
dbms.memory.heap.max_size=4G
dbms.memory.pagecache.size=2048m
For smaller knowledge bases, these may not be necessary.
Install the `Neosemantics (n10s)` library. If you are doing so manually (e.g.,
when running Neo4j Server rather than Neo4j Desktop), make sure you are using a
compatible version. You might also want to add the Graph Data Science Library
and APOC library, both of which add nice features to the graph database.
Now, start the graph database.
When you are sure it is running, press Enter/Return to continue.
"""[1:]

    for message in (intro_text, setup_text):
        print(message)
        input()  # block until the user confirms with Enter/Return

class SimpleGraphDBDriver():
    """Thin wrapper around a Neo4j driver for running single Cypher statements.

    Parameters
    ----------
    uri : str
        Connection URI handed to ``neo4j.GraphDatabase.driver``.
    user : str
        Username used for authentication.
    password : str
        Password used for authentication.
    """

    def __init__(self, uri, user, password):
        self.driver = neo4j.GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def query(self, cypher_query: str):
        """Echo `cypher_query` to stdout and run it in a write transaction.

        Parameters
        ----------
        cypher_query : str
            A complete Cypher statement.

        Returns
        -------
        list
            The records produced by the statement, in order (empty when the
            statement yields no rows).
        """
        print(" RUNNING QUERY:")
        print(f" {cypher_query}")
        with self.driver.session() as session:
            return session.write_transaction(self._run_transaction, cypher_query)

    @staticmethod
    def _run_transaction(tx, qry):
        """Run `qry` on `tx` and return its records as a list."""
        # The object returned by `tx.run` belongs to the open transaction and
        # must not escape the transaction function, so the records are
        # collected here, while the transaction is still open.
        return list(tx.run(qry))

# Labels removed from every node once the RDF import has finished. The order
# is the order in which the cleanup statements are issued.
_N10S_CLEANUP_LABELS = (
    "Resource",
    "NamedIndividual",
    "AllDisjointClasses",
    "AllDisjointProperties",
    "DatatypeProperty",
    "FunctionalProperty",
    "ObjectProperty",
    "AnnotationProperty",
    "SymmetricProperty",
    "_GraphConfig",
    "Ontology",
    "Restriction",
    "Class",
)


def _rdf_file_uri(rdf_filepath):
    """Return an absolute ``file://`` URI for `rdf_filepath`."""
    from pathlib import Path

    # `as_uri` needs an absolute path; relative paths are anchored at the
    # current working directory. It also percent-encodes characters such as
    # spaces, quotes and backslashes, so the result can be embedded in a
    # double-quoted Cypher string literal without further escaping.
    return Path(os.path.abspath(rdf_filepath)).as_uri()


def load_kb(rdf_filepath: str):
    """Load a graph knowledge base created using `ista` into Neo4j.

    This function will provide a list of prerequisite steps to make sure the
    user has a clean/empty database running with the correct configuration and
    plugins installed. It then prompts for connection details, imports the
    RDF file through the n10s procedures, and strips the import-related labels
    from the resulting graph.

    WARNING: every node and relationship already present in the target
    database is deleted before the import starts.

    Parameters
    ----------
    rdf_filepath : str
        Path to the RDF/XML file to import. It is passed to
        ``n10s.rdf.import.fetch`` as a ``file://`` URI, so it presumably has
        to be readable from the machine running Neo4j - confirm for remote
        servers.
    """
    _print_header_info()

    print("Python will now connect to Neo4j and perform the rest of the setup")
    print("automatically.")
    print()
    print("Please provide authentication details for the (new) database:")
    # An empty answer selects the default shown in the prompt.
    my_uri = input("Neo4j URI [default: `bolt://localhost:7687`]: ") or 'bolt://localhost:7687'
    my_username = input("Neo4j username [default: `neo4j`]: ") or 'neo4j'
    my_password = input("Neo4j password: ")

    rdf_uri = _rdf_file_uri(rdf_filepath)

    db = SimpleGraphDBDriver(my_uri, my_username, my_password)
    try:
        # Start from an empty graph.
        db.query("MATCH (n) DETACH DELETE n;")

        try:
            db.query("CREATE CONSTRAINT n10s_unique_uri ON (r:Resource) ASSERT r.uri IS UNIQUE;")
        except neo4j.exceptions.ClientError:
            print("Constraint already exists - skipping.")
        db.query("CALL n10s.graphconfig.init();")
        db.query("CALL n10s.graphconfig.set({applyNeo4jNaming: true, handleVocabUris: 'IGNORE'});")
        db.query(f'CALL n10s.rdf.import.fetch("{rdf_uri}", "RDF/XML");')

        print()
        print("Database contents loaded; performing post-install cleanup...")

        for label in _N10S_CLEANUP_LABELS:
            db.query(f"MATCH (n:{label}) REMOVE n:{label};")
        # Nodes left with no label at all after the removals above are dropped.
        db.query("MATCH (n) WHERE size(labels(n)) = 0 DETACH DELETE n;")
    finally:
        # Release the driver's connections even when a statement fails.
        db.close()
37 changes: 35 additions & 2 deletions ista/util.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import owlready2

import ipdb

_OWL = owlready2.get_ontology("http://www.w3.org/2002/07/owl#")

def safe_add_property(entity, prop, value):
Expand Down Expand Up @@ -35,7 +37,7 @@ def get_onto_class_by_node_type(ont: owlready2.namespace.Ontology, node_label: s
-----
This should be refactored if/when a better solution is available!
"""
matches = [c for c in onto.classes() if str(c).split(".")[-1] == node_label]
matches = [c for c in ont.classes() if str(c).split(".")[-1] == node_label]
if len(matches) == 1:
return matches[0]
elif len(matches) == 0:
Expand Down Expand Up @@ -66,4 +68,35 @@ def safe_make_individual_name(
nm = indiv_name.strip().replace(" ", "_").lower()
else:
nm = indiv_name
return "{0}_{1}".format(cl, nm)
return "{0}_{1}".format(cl, nm)

def print_onto_stats(onto: owlready2.Ontology):
    """Print summary statistics for an OWL2 ontology loaded into `owlready2`.

    Output goes to stdout: a banner, then one ``name: count`` line per class
    that has at least one instance, then one ``name: count`` line per object
    property that has at least one relation. Entries with a count of zero are
    omitted.

    Parameters
    ----------
    onto : owlready2.Ontology
        The ontology to summarize.
    """
    print()
    print("*******************")
    print("ONTOLOGY STATISTICS")
    print("*******************")
    print()

    # individuals
    print("Individual counts:")
    for cl in onto.classes():
        instance_count = len(onto.get_instances_of(cl))
        if instance_count > 0:
            print(f"{cl.name}: {instance_count}")

    # relationships
    print()
    print("Relationship counts:")
    for op in onto.object_properties():
        # Count lazily instead of building a list only to take its length.
        relation_count = sum(1 for _ in op.get_relations())
        if relation_count > 0:
            print(f"{op.name}: {relation_count}")
Empty file added test.rdf
Empty file.
116 changes: 116 additions & 0 deletions tests/config/comptoxai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
from ista import FlatFileDatabaseParser, MySQLDatabaseParser

# One parser object per upstream data source.
# NOTE(review): `onto` and `mysql_config` are not defined or imported in the
# visible part of this file - confirm where they are meant to come from.
# NOTE(review): `epa` is built with no arguments, while every other flat-file
# parser below receives a source name and `onto` - confirm whether that is
# intentional.
epa = FlatFileDatabaseParser()

ncbigene = FlatFileDatabaseParser("ncbigene", onto)
drugbank = FlatFileDatabaseParser("drugbank", onto)
hetionet = FlatFileDatabaseParser("hetionet", onto)
aopdb = MySQLDatabaseParser("aopdb", onto, mysql_config)
aopwiki = FlatFileDatabaseParser("aopwiki", onto)
tox21 = FlatFileDatabaseParser("tox21", onto)
disgenet = FlatFileDatabaseParser("disgenet", onto)


#####################
# EPA COMPTOX NODES #
#####################
# Chemical nodes from the PubChem/DTXSID mapping file (fmt="tsv").
# The DTXSID column is configured as the IRI column; the CID, SID and DTXSID
# columns are mapped to xrefPubchemCID, xrefPubchemSID and xrefDTXSID.
# NOTE(review): merge=False presumably means rows create nodes rather than
# being matched against existing ones - confirm against parse_node_type.
epa.parse_node_type(
node_type="Chemical",
source_filename="PubChem_DTXSID_mapping_file.txt",
fmt="tsv",
parse_config={
"iri_column_name": "DTXSID",
"headers": True,
"data_property_map": {
"CID": onto.xrefPubchemCID,
"SID": onto.xrefPubchemSID,
"DTXSID": onto.xrefDTXSID,
},
},
merge=False,
skip=False
)
# Chemical properties from the DSSTox CAS number/name file (fmt="csv").
# Maps casrn, preferred_name and dsstox_substance_id to xrefCasRN, commonName
# and xrefDTXSID. This call passes merge=True together with a "merge_column"
# entry naming dsstox_substance_id / onto.xrefDTXSID.
# NOTE(review): presumably rows are matched to existing Chemical nodes through
# that property - confirm against parse_node_type.
epa.parse_node_type(
node_type="Chemical",
source_filename="Dsstox_CAS_number_name.csv",
fmt="csv",
parse_config={
"iri_column_name": "dsstox_substance_id",
"headers": True,
"data_property_map": {
"casrn": onto.xrefCasRN,
"preferred_name": onto.commonName,
"dsstox_substance_id": onto.xrefDTXSID,
},
"merge_column": {
"source_column_name": "dsstox_substance_id",
"data_property": onto.xrefDTXSID,
},
},
merge=True,
skip=False
)
# MACCS fingerprints for Chemical nodes from a custom TSV file.
# Maps the DTXSID and MACCS columns to xrefDTXSID and maccs, with DTXSID /
# onto.xrefDTXSID as the merge column.
# NOTE(review): the inline comment below mentions the CID, but the merge
# column configured here is DTXSID - confirm which identifier is meant.
epa.parse_node_type(
node_type="Chemical",
source_filename="CUSTOM/chemical_maccs_fingerprints.tsv",
fmt="tsv",
parse_config={
"iri_column_name": "DTXSID",
"headers": True,
"data_property_map": {
"DTXSID": onto.xrefDTXSID,
"MACCS": onto.maccs
},
"merge_column": {
"source_column_name": "DTXSID",
"data_property": onto.xrefDTXSID
}
},
merge=True,
skip_create_new_node=True, # Don't create an empty chemical node with just a MACCS property if the CID isn't already in the ontology
skip=False
)

##################
# CHEMICAL LISTS #
##################
# ChemicalList nodes from a custom TSV file.
# LIST_ACRONYM is the IRI column; LIST_ACRONYM, LIST_NAME and LIST_DESCRIPTION
# are mapped to listAcronym, commonName and listDescription.
# The "data_transforms" entry keeps only the text after the last '/' of each
# LIST_ACRONYM value.
epa.parse_node_type(
node_type="ChemicalList",
source_filename="CUSTOM/Chemical Lists.tsv",
fmt="tsv",
parse_config={
"iri_column_name": "LIST_ACRONYM",
"headers": True,
"data_property_map": {
"LIST_ACRONYM": onto.listAcronym,
"LIST_NAME": onto.commonName,
"LIST_DESCRIPTION": onto.listDescription
},
"data_transforms": {
"LIST_ACRONYM": lambda x: x.split('/')[-1]
}
},
merge=False,
skip=False
)

###############################
# Chemical List relationships #
###############################
# listIncludesChemical relationships (inverse: chemicalInList) from a custom
# TSV file. The subject is a ChemicalList looked up by the list_acronym column
# via onto.listAcronym; the object is a Chemical looked up by the casrn column
# via onto.xrefCasRN.
# NOTE(review): how unmatched subjects/objects are handled is decided inside
# parse_relationship_type, which is not visible here.
epa.parse_relationship_type(
relationship_type=onto.listIncludesChemical,
inverse_relationship_type=onto.chemicalInList,
source_filename="CUSTOM/chemical_lists_relationships.tsv",
fmt="tsv",
parse_config = {
"subject_node_type": onto.ChemicalList,
"subject_column_name": "list_acronym",
"subject_match_property": onto.listAcronym,
"object_node_type": onto.Chemical,
"object_column_name": "casrn",
"object_match_property": onto.xrefCasRN,
"headers": True
},
merge=True,
skip=False
)
Loading

0 comments on commit 680232f

Please sign in to comment.