Added support for REDFS to cim-diff.py
kristjan.vilgo committed Aug 31, 2021
1 parent 16b4d1d commit 53db137
Showing 2 changed files with 59 additions and 25 deletions.
79 changes: 56 additions & 23 deletions Tools/RDF_PARSER/RDF_parser.py
@@ -17,6 +17,7 @@
from lxml.etree import QName

import pandas
#import dask as pandas
import datetime
import zipfile
import uuid
@@ -90,7 +91,7 @@ def load_RDF_objects_from_XML(path_or_fileobject, debug=False):

# Get unique ID for loaded instance
# instance_id = clean_ID(parsed_xml.find("./").attrib.values()[0]) # Lets asume that the first RDF element describes the whole document - TODO replace it with hash of whole XML
instance_id = str(uuid.uuid4()) # Guarantees unique ID for each loaded instance of data
instance_id = str(uuid.uuid4()) # Guarantees unique ID for each loaded instance of data; in erroneous data it happens that the same UUID is used for multiple files

if debug:
_, start_time = print_duration("XML loaded to tree object", start_time)
@@ -158,7 +159,7 @@ def find_all_xml(list_of_paths_to_zip_globalzip_xml, debug=False):
return xml_files_list


def load_RDF_to_list(path_or_fileobject, debug=False):
def load_RDF_to_list(path_or_fileobject, debug=False, keep_ns=False):
"""Parse single file to triplestore list"""

file_name = path_or_fileobject
@@ -173,31 +174,38 @@ def load_RDF_to_list(path_or_fileobject, debug=False):
if debug:
start_time = datetime.datetime.now()

# Lets generate list for RDF data and store the original filename under rdf:label
data_list = [(str(uuid.uuid4()), "label", file_name, INSTANCE_ID)]
# Lets generate list for RDF data and store the original filename under rdf:label in dcat:Distribution object
ID = str(uuid.uuid4())
data_list = [
(ID, "Type", "Distribution", INSTANCE_ID),
(ID, "label", file_name, INSTANCE_ID)
]

# lets create all variables, so that in loops they are reused, rather than new ones are created, green thinking
ID = ""
#ID = ""
KEY = ""
VALUE = ""
NS = ""

for RDF_object in RDF_objects:

ID = clean_ID(RDF_object.attrib.values()[0])
# KEY = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Type' # If we would like to keep all with correct namespace
# KEY = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}type' # If we would like to keep all with correct namespace
KEY = 'Type'
VALUE = RDF_object.tag.split("}")[1]
KEY_NS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
VALUE_NS, VALUE = RDF_object.tag.split("}") #TODO - case where there is no namespace will fail, but is it realistic for RDF file?
# VALUE = etree.QName(object).localname
# ID_TYPE = object.attrib.keys()[0].split("}")[1] # Adds column to identifi "ID" and "about" types of ID
# ID_TYPE = object.attrib.keys()[0].split("}")[1] # Adds column to identify "ID" and "about" types of ID

# data_list.append([ID, ID_TYPE, KEY, VALUE]) # If using ID TYPE, maybe also namespace should be kept?
data_list.append((ID, KEY, VALUE, INSTANCE_ID))

for element in RDF_object.iterchildren():

KEY = element.tag.split("}")[1]
KEY_NS, KEY = element.tag.split("}") #TODO - case where there is no namespace will fail, but is it realistic for RDF file?
# KEY = etree.QName(element).localname
VALUE = element.text
VALUE_NS = ""

if VALUE is None and len(element.attrib.values()) > 0:
VALUE = clean_ID(element.attrib.values()[0])
@@ -809,24 +817,49 @@ def update_triplet_from_tableview(data, tableview, update=True, add=True, instan
pandas.DataFrame.update_triplet_from_tableview = update_triplet_from_tableview


def get_diff(left_data, right_data, print_diff=False, file_id_key="label"):
diff = left_data.merge(right_data, on=["ID", "KEY", "VALUE"], how='outer', indicator=True, suffixes=("_OLD", "_NEW"), sort=False).query("_merge != 'both'")
def remove_triplet_from_triplet(from_triplet, what_triplet, columns=["ID", "KEY", "VALUE"]):
"""Retuns from_triplet - what_triplet"""
return from_triplet.drop(from_triplet.reset_index().merge(what_triplet[columns], on=columns, how="inner")["index"], axis=0)

if print_diff:
changes = diff.replace({'_merge': {"left_only": "-", "right_only": "+"}}).sort_values(by=['ID', 'KEY']).query("KEY != 'label'")
changes_on_left = len(changes.query("_merge == '-'"))
changes_on_right = len(changes.query("_merge == '+'"))

for _, file_id in left_data.query("KEY == @file_id_key").VALUE.iteritems():
print(f"--- {file_id}")# from-file-modification-time")
def filter_triplet_by_type(triplet, type):
"""Filter out all objects data by rdf:type"""
return triplet.merge(triplet.query("KEY == 'Type' and VALUE == @type").ID)

for _, file_id in right_data.query("KEY == @file_id_key").VALUE.iteritems():
print(f"+++ {file_id}")# to-file-modification-time")
print(f"@@ -1,{changes_on_left} +1,{changes_on_right} @@")
for _, change in (changes._merge + changes.ID + " " + changes.KEY + " " + changes.VALUE).iteritems():
print(change)

return diff
def triplet_diff(left_data, right_data):

return left_data.merge(right_data, on=["ID", "KEY", "VALUE"], how='outer', indicator=True, suffixes=("_OLD", "_NEW"), sort=False).query("_merge != 'both'")


def print_triplet_diff(left_data, right_data, file_id_object="Distribution", file_id_key="label", exclude_objects=[]):

diff = triplet_diff(left_data, right_data)

changes = diff.replace({'_merge': {"left_only": "-", "right_only": "+"}}).sort_values(by=['ID', 'KEY'])

file_id_data = filter_triplet_by_type(changes, file_id_object)
changes = remove_triplet_from_triplet(changes, file_id_data)
print(f"INFO - removed {file_id_object} from diff")

if exclude_objects:
for object_name in exclude_objects:
excluded_data = filter_triplet_by_type(changes, object_name)
changes = remove_triplet_from_triplet(changes, excluded_data)
print(f"INFO - removed {object_name} from diff")

for _, file_id in file_id_data.query("KEY == @file_id_key and _merge == '-'").VALUE.iteritems():
print(f"--- {file_id}")# from-file-modification-time")

for _, file_id in file_id_data.query("KEY == @file_id_key and _merge == '+'").VALUE.iteritems():
print(f"+++ {file_id}")# to-file-modification-time")

changes_on_left = len(changes.query("_merge == '-'"))
changes_on_right = len(changes.query("_merge == '+'"))
print(f"@@ -1,{changes_on_left} +1,{changes_on_right} @@")
for _, change in (changes._merge + changes.ID + " " + changes.KEY + " " + changes.VALUE).iteritems():
print(change)

# changes = changes.replace({'_merge': {"left_only": "-", "right_only": "+"}})

def export_to_networkx(data):
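For orientation, here is a minimal sketch (not part of the commit) of what the new triplet helpers do, run on toy data in the (ID, KEY, VALUE, INSTANCE_ID) layout that load_RDF_to_list produces; every ID, label and value below is invented for illustration:

import pandas

# Two toy triplestores representing an old and a new version of the same model
old = pandas.DataFrame([("d1", "Type", "Distribution", "i1"),
                        ("d1", "label", "model_v1.xml", "i1"),
                        ("a1", "Type", "ACLineSegment", "i1"),
                        ("a1", "name", "Line 1", "i1")],
                       columns=["ID", "KEY", "VALUE", "INSTANCE_ID"])
new = pandas.DataFrame([("d2", "Type", "Distribution", "i2"),
                        ("d2", "label", "model_v2.xml", "i2"),
                        ("a1", "Type", "ACLineSegment", "i2"),
                        ("a1", "name", "Line 1 renamed", "i2")],
                       columns=["ID", "KEY", "VALUE", "INSTANCE_ID"])

# Same outer merge as triplet_diff: rows present on only one side are the changes
diff = old.merge(new, on=["ID", "KEY", "VALUE"], how="outer", indicator=True,
                 suffixes=("_OLD", "_NEW"), sort=False).query("_merge != 'both'")

# Same idea as filter_triplet_by_type + remove_triplet_from_triplet:
# drop the Distribution (file label) objects so only model changes remain
distribution_ids = diff.query("KEY == 'Type' and VALUE == 'Distribution'").ID
diff = diff[~diff.ID.isin(distribution_ids)]

print(diff[["_merge", "ID", "KEY", "VALUE"]])  # left_only = removed triplet, right_only = added triplet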
5 changes: 3 additions & 2 deletions Tools/RDF_PARSER/examples/cim-diff.py
@@ -3,16 +3,17 @@
import argparse

sys.path.append("..")
from RDF_parser import get_diff, load_all_to_dataframe
from RDF_parser import print_triplet_diff, load_all_to_dataframe

parser = argparse.ArgumentParser(description="""Create diff in Unified format for XML RDF CIM files. Diff is per object (ID KEY VALUE) not per XML line in file. The input can be xml, zip(xml), zip(zip(xml))""",
epilog="""Copyright (c) Kristjan Vilgo 2021; Licence: GPL 2.0""")
parser.add_argument('original_file', type=str, help='Original file path')
parser.add_argument('changed_file', type=str, help='Changed file path')
parser.add_argument('-ex', '--exclude_objects', nargs='+', help='Names of rdf:Description rdf:type-s without namespace or prefix to be excluded from diff')

arg = parser.parse_args()

get_diff(load_all_to_dataframe([arg.original_file]), load_all_to_dataframe([arg.changed_file]), print_diff=True)
print_triplet_diff(load_all_to_dataframe([arg.original_file]), load_all_to_dataframe([arg.changed_file]), exclude_objects=arg.exclude_objects)

# Example Use
# python cim-diff.py K:\PROJEKT\ER_EJK_FSYSTEM\TSM_models\eq\20210512T2330Z_ELERING_EQ_001.zip K:\PROJEKT\ER_EJK_FSYSTEM\TSM_models\eq\20210516T2330Z_ELERING_EQ_001.zip
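A hypothetical invocation of the new -ex / --exclude_objects switch is sketched below, with a made-up rdf:type name, plus a rough illustration of the unified-style output that print_triplet_diff produces (--- and +++ carry the file labels, @@ summarises the change counts, and each changed triplet is printed as marker, ID, KEY, VALUE); the UUIDs and values are invented:

# python cim-diff.py 20210512T2330Z_ELERING_EQ_001.zip 20210516T2330Z_ELERING_EQ_001.zip -ex FullModel
#
# Illustrative output (values invented):
# --- 20210512T2330Z_ELERING_EQ_001.zip
# +++ 20210516T2330Z_ELERING_EQ_001.zip
# @@ -1,1 +1,1 @@
# -<uuid> name Line 1
# +<uuid> name Line 1 renamed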
