-
Notifications
You must be signed in to change notification settings - Fork 0
/
import_script.py
204 lines (173 loc) · 9.23 KB
/
import_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import warnings
from pathlib import Path
import pandas as pd
import regex
from dsp_tools import excel2xml
def main() -> None:
    """
    Build the DSP XML data file for the "import" project from the raw data.

    main method: all code must be inside this method

    Inputs (all relative to the current working directory):
      - "data_raw.csv": one row per ":Object" resource
      - "import_project.json": project definition, used to resolve the nodes of the list "category"
      - "images/": every file in this folder becomes an ":Image2D" resource
      - "videos/my_video.mp4": referenced by the example ":VideoObject" resource

    Output:
      - "data-processed.xml"

    Raises:
        KeyError: if the objects "Anubis", "BM1888-0601-716", "Horohoroto"
            or the image "GibeonMeteorite.jpg" are not among the created resources,
            because the example annotation/region/link/segment resources refer to them.
    """
    # general preparation
    # -------------------
    path_to_json = "import_project.json"
    main_df = pd.read_csv("data_raw.csv", dtype="str", sep=",")  # or: pd.read_excel("*.xls(x)", dtype="str")

    # remove rows without usable values (prevents Errors when there are empty rows at the end of the file):
    # a cell is only kept if it contains at least one letter, digit, or one of the characters "_!?"
    main_df = main_df.map(
        lambda x: x if pd.notna(x) and regex.search(r"[\p{L}\d_!?]", str(x), flags=regex.U) else pd.NA
    )
    main_df = main_df.dropna(axis="index", how="all")

    # create the root tag <knora> and append the permissions
    root = excel2xml.make_root(shortcode="00A1", default_ontology="import")
    root = excel2xml.append_permissions(root)

    # create list mappings
    # --------------------
    # For every node of the list "category", this dictionary maps the German label to the node name.
    category_labels_to_names = excel2xml.create_json_list_mapping(
        path_to_json=path_to_json,
        list_name="category",
        language_label="de",
    )
    # For every node of the list "category", this dictionary maps similar entries of the Excel column to the node name.
    # When you run this script, two warnings appear, saying that "Säugetiere" and "Kunstwerk" couldn't be matched.
    # Luckily, these two are covered in category_labels_to_names.
    category_excel_values_to_names = excel2xml.create_json_excel_list_mapping(
        path_to_json=path_to_json,
        list_name="category",
        excel_values=main_df["Category"],
        sep=",",
    )

    # create resources of type ":Image2D"
    # -----------------------------------
    # create a dict that keeps the IDs of the created resources,
    # retrieve all files from the "images" folder that don't start with "~$" or "."
    # (a plain prefix test is used on purpose: in a regex, an unescaped "$" is an end-of-string anchor,
    # so a pattern like "^~$" would only match a file named exactly "~" and let "~$..." lock files through)
    image2d_labels_to_ids = {}
    all_images = [x for x in Path("images").glob("*") if not x.name.startswith(("~$", "."))]

    # iterate through all images, and create an ":Image2D" for every image file
    for img in all_images:
        # keep a reference to this ID in the dict
        resource_label = img.name
        resource_id = excel2xml.make_xsd_id_compatible(resource_label)
        image2d_labels_to_ids[resource_label] = resource_id
        resource = excel2xml.make_resource(label=resource_label, restype=":Image2D", id=resource_id)
        resource.append(excel2xml.make_bitstream_prop(img, check=True))
        resource.append(excel2xml.make_text_prop(":hasTitle", resource_label))
        root.append(resource)

    # create resources of type ":Object"
    # ----------------------------------
    # create a dict that keeps the IDs of the created resources
    object_labels_to_ids = {}

    # iterate through all rows of your data source, in pairs of (row-number, row)
    for i, row in main_df.iterrows():
        index = int(str(i))  # convert the index to an integer

        # keep a reference to this ID in the dict
        resource_label = row["Object"]
        resource_id = excel2xml.make_xsd_id_compatible(resource_label)
        object_labels_to_ids[resource_label] = resource_id
        resource = excel2xml.make_resource(label=resource_label, restype=":Object", id=resource_id)

        # check every existing ":Image2D" resource, if there is an image that belongs to this object
        for img_label, img_id in image2d_labels_to_ids.items():
            # check if the label of ":Object" is contained in the label of ":Image2D"
            if resource_label in img_label:
                # create a resptr-link to the ID of the ":Image2D" resource
                resource.append(excel2xml.make_resptr_prop(":hasImage", img_id))

        # add a text property with the simple approach
        resource.append(excel2xml.make_text_prop(":hasName", row["Title"]))

        # add a text property, overriding the default values for "permissions" and "encoding"
        resource.append(
            excel2xml.make_text_prop(
                ":hasDescription",
                excel2xml.PropertyElement(
                    value=row["Description"],
                    permissions="prop-restricted",
                    comment="comment to 'Description'",
                    encoding="xml",
                ),
            )
        )

        # get "category" list nodes: split the cell into a list of values...
        category_labels = [x.strip() for x in row["Category"].split(",")]
        # ...look up every value in "category_labels_to_names",
        # and if it's not there, in "category_excel_values_to_names"...
        category_names = [
            category_labels_to_names.get(x, category_excel_values_to_names.get(x)) for x in category_labels
        ]
        # ...filter out the None values...
        category_names_str = [x for x in category_names if x is not None]
        # ...create the <list-prop> with the correct names of the list nodes
        resource.append(excel2xml.make_list_prop("category", ":hasCategory", category_names_str))

        # the following properties are optional: they are only added if the cell contains a value
        if excel2xml.check_notna(row["Public"]):
            resource.append(excel2xml.make_boolean_prop(":isPublic", row["Public"]))
        if excel2xml.check_notna(row["Color"]):
            resource.append(excel2xml.make_color_prop(":hasColor", row["Color"]))

        # the date is expected in every row: warn if none can be found in the cell
        potential_date = excel2xml.find_date_in_string(row["Date"])
        if potential_date:
            resource.append(excel2xml.make_date_prop(":hasDate", potential_date))
        else:
            # "+ 2": the DataFrame index starts at 0 and the first line of the file is the header
            warnings.warn(f"Error in row {index + 2}: The column 'Date' should contain a date!")

        if excel2xml.check_notna(row["Time"]):
            resource.append(excel2xml.make_time_prop(":hasTime", row["Time"]))
        if excel2xml.check_notna(row["Weight (kg)"]):
            resource.append(excel2xml.make_decimal_prop(":hasWeight", row["Weight (kg)"]))
        if excel2xml.check_notna(row["Location"]):
            resource.append(excel2xml.make_geoname_prop(":hasLocation", row["Location"]))
        if excel2xml.check_notna(row["URL"]):
            resource.append(excel2xml.make_uri_prop(":hasExternalLink", row["URL"]))

        root.append(resource)

    # Annotation, Region, Link
    # ------------------------
    # These special resource classes are DSP base resources,
    # that's why they use DSP base properties without prepended colon.
    # See the docs for more details:
    # https://docs.dasch.swiss/latest/DSP-TOOLS/file-formats/xml-data-file/#dsp-base-resources-and-base-properties-to-be-used-directly-in-the-xml-file
    annotation = excel2xml.make_annotation("Annotation to Anubis", "annotation_to_anubis")
    annotation.append(
        excel2xml.make_text_prop(
            "hasComment",
            excel2xml.PropertyElement("Date and time are invented, like for the other resources.", encoding="xml"),
        )
    )
    annotation.append(excel2xml.make_resptr_prop("isAnnotationOf", object_labels_to_ids["Anubis"]))
    root.append(annotation)

    region = excel2xml.make_region("Region of the Meteorite image", "region_of_meteorite")
    region.append(excel2xml.make_text_prop("hasComment", excel2xml.PropertyElement("This is a comment", encoding="xml")))
    region.append(excel2xml.make_color_prop("hasColor", "#5d1f1e"))
    region.append(excel2xml.make_resptr_prop("isRegionOf", image2d_labels_to_ids["GibeonMeteorite.jpg"]))
    region.append(
        excel2xml.make_geometry_prop(
            "hasGeometry",
            '{"type": "rectangle", "lineColor": "#ff3333", "lineWidth": 2, '
            '"points": [{"x": 0.08, "y": 0.16}, {"x": 0.73, "y": 0.72}], "original_index": 0}',
        )
    )
    root.append(region)

    link = excel2xml.make_link("Link between BM1888-0601-716 and Horohoroto", "link_BM1888-0601-716_horohoroto")
    link.append(excel2xml.make_text_prop("hasComment", excel2xml.PropertyElement("This is a comment", encoding="xml")))
    link.append(
        excel2xml.make_resptr_prop(
            "hasLinkTo",
            [object_labels_to_ids["BM1888-0601-716"], object_labels_to_ids["Horohoroto"]],
        )
    )
    root.append(link)

    # Video with a Segment
    # --------------------
    # Videos and audios are normal resources...
    video = excel2xml.make_resource("Publicly available video", ":VideoObject", "video_1")
    video.append(excel2xml.make_bitstream_prop("videos/my_video.mp4"))
    root.append(video)

    # ... but the segments behave differently:
    segment = excel2xml.make_video_segment("The first 5 seconds of my video", "segment_1")
    segment.append(excel2xml.make_isSegmentOf_prop("video_1"))
    segment.append(excel2xml.make_hasSegmentBounds_prop(segment_start=0, segment_end=5))
    segment.append(excel2xml.make_hasTitle_prop("Intro of my video"))
    segment.append(excel2xml.make_hasComment_prop("Video segments can also have comments"))
    segment.append(excel2xml.make_hasDescription_prop("This segments spans the first 5 seconds of my video"))
    segment.append(excel2xml.make_hasKeyword_prop("publicly available video"))
    segment.append(excel2xml.make_relatesTo_prop(object_labels_to_ids["Horohoroto"]))
    root.append(segment)
    # audio segments are identical, just replace "video" by "audio"

    # write file
    # ----------
    excel2xml.write_xml(root, "data-processed.xml")


# run the conversion only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()