-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind_facsimiles_new_signum.py
253 lines (233 loc) · 10.7 KB
/
find_facsimiles_new_signum.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
# This script finds out which images belong to which document,
# and then saves the file paths to those images.
# The starting point is a csv file with info about the publication
# and its images. The first and last image signum of each publication's
# facsimile is in the file, but we need to modify the signums a bit.
# We have image folders named after the archive signums,
# and images named after the image signums, but we don't want to
# have to go through those folders and images by hand.
# Furthermore, the archive which digitized the images has changed
# its way of creating folder signums. These publications use the
# new signum, others use the old. Both signums should be registered
# for each publication. We already made a dictionary of old and new
# signums.
# The output is a csv containing more info than before for each
# publication: the old and new folder signum, the first and last
# image signum and the file paths to the images. This csv will be
# used for populating table publication as well as for creating the
# facsimile folder and its images and connecting the folder to the
# publication.
# We can either create the list of all possible image file paths
# from the given facsimile folder path, and then save the newly created
# list as a txt file for later use, or we can use a txt file created
# earlier and read it into a list. If no images have been added to the
# facsimile folder recently, reading the file paths from file instead of
# creating the list from scratch speeds up this script a lot.
# Sample input and output (CSV) at end of file.
import re
import json
from pathlib import Path
# Semicolon-separated input file with one publication per line.
CSV_IN = "csv/documents_2.csv"
# Output file receiving the publication rows extended with signums
# and image file paths.
CSV_OUT = "csv/documents_2_facsimiles.csv"
# Text file holding one facsimile file path per line. Set this to ""
# to build the path list by scanning FACSIMILE_FOLDERS instead.
FACSIMILE_FILE_PATHS = "new_signum_facsimile_file_paths.txt"
# Root folders that are searched recursively for facsimile images.
FACSIMILE_FOLDERS = [
    "M:/Facsimiles/Mechelin_1",
    "M:/Facsimiles/Mechelin_2",
    "M:/Facsimiles/Mechelin_3",
]
# create a list from the original csv file
def create_list_from_csv(filename):
    """Read a semicolon-separated text file into a list of rows.

    Args:
        filename: Path of the file to read. It is decoded as UTF-8 and
            a leading byte order mark is dropped.

    Returns:
        One list of field strings per line, in file order. Trailing
        whitespace (including the line break) is stripped from each
        line before it is split on ";". No quoting rules are applied,
        so every ";" starts a new field. A blank line yields [""].
    """
    with open(filename, "r", encoding="utf-8-sig") as source_file:
        # built with a comprehension; the former accumulator was named
        # "list" and hid the builtin of the same name
        return [line.rstrip().split(";") for line in source_file]
# get dictionary content from file
def read_dict_from_file(filename):
    """Parse the JSON document stored in *filename* and return it.

    The file is decoded as UTF-8; a leading byte order mark is ignored.
    """
    with open(filename, encoding="utf-8-sig") as json_file:
        return json.load(json_file)
# create a csv file
def write_list_to_csv(list, filename):
    """Write rows to *filename* as semicolon-terminated fields.

    Every item of a row is converted with str() and followed by ";",
    so each output line ends with a trailing ";" before the line break.
    The file is written as UTF-8 with a byte order mark.

    Args:
        list: Iterable of rows, each row an iterable of items.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, "w", encoding="utf-8-sig") as output_file:
        for fields in list:
            line = "".join(str(field) + ";" for field in fields)
            output_file.write(line + "\n")
    print("List written to file", filename)
# create path object for folder from given filepath string,
# save all paths to files found in this folder or subfolders in a list
def create_file_list(folder_path):
    """Collect the paths of all files below the given folders.

    Args:
        folder_path: Iterable of folder path strings. Each one is
            turned into a Path and searched recursively.

    Returns:
        A list with one Path per file found, folder by folder.
    """
    collected = []
    for folder in map(Path, folder_path):
        iterate_through_folders(folder, collected)
    print("List of image file paths created.")
    return collected
# iterate through folders recursively and append filepaths to list
def iterate_through_folders(path, file_list):
    """Append every file below *path* to *file_list*, depth first.

    Args:
        path: Path object of the folder to walk.
        file_list: List that receives the Path of each non-directory
            entry; it is modified in place.
    """
    for entry in path.iterdir():
        if not entry.is_dir():
            file_list.append(entry)
            continue
        # descend into the subfolder before moving on to the next entry
        iterate_through_folders(entry, file_list)
# save the list of all facsimile file paths as a file
# thus the script runs faster next time
def write_list_to_text_file(file_list, filename):
    """Store the given paths in *filename*, one per line.

    Each path is written in its forward-slash form and followed by a
    line break. The file is written as UTF-8 with a byte order mark.

    Args:
        file_list: Iterable of Path objects.
        filename: Path of the text file to create or overwrite.
    """
    with open(filename, "w", encoding="utf-8-sig") as output_file:
        output_file.writelines(entry.as_posix() + "\n" for entry in file_list)
    print("List of facsimile file paths written to file", filename)
# if a file containing all facsimile file paths
# has already been created:
# read that file into a list
def read_list_from_file(filename):
    """Read a text file of file paths into a list of Path objects.

    Args:
        filename: Path of a text file with one file path per line,
            decoded as UTF-8 (a leading byte order mark is ignored).

    Returns:
        One Path per "\\n"-separated piece of the file content. A file
        that ends with a line break therefore yields a final Path("")
        entry; it is kept so the returned list keeps the shape existing
        callers receive.
    """
    # the with block closes the file; no explicit close() is needed
    with open(filename, encoding="utf-8-sig") as source_file:
        data = source_file.read()
    file_list = [Path(line) for line in data.split("\n")]
    print("List of facsimile file paths created from file.")
    return file_list
# we need to find the folder signum as well as
# the signum/number of the first and last image
# for each publication
def find_signums(info_list, file_list, signum_dictionary):
    """Split each publication's combined signums and attach image paths.

    Columns of a row that this function reads or writes (0-based):
        10  new folder signum (written)
        11  combined "<10 digits>_<4 digits>" signum of the first
            image on input; replaced by the 4-digit image signum
        12  combined signum of the last image on input, or a value
            without such a signum; replaced by the last image signum
        13  old folder signum (written)
        15  folder signum of an alternative version, or ""
        16  first image signum of that version
        17  last image signum of that version, or ""

    Args:
        info_list: List of rows (lists of strings). Rows are modified
            in place. A row without a combined signum in column 11 is
            reported and emptied. Rows with fewer than 18 fields are
            padded with "" instead of raising IndexError.
        file_list: Paths of all facsimile images, handed on to
            find_image_paths.
        signum_dictionary: Mapping handed on to find_old_folder_signum.

    Returns:
        A new list holding the rows of info_list that were not emptied.
        Each of them has the list of image paths (or "" if none were
        found) appended, followed by the list of version image paths
        when one was found.
    """
    min_columns = 18
    # compiled once instead of once per row
    combined_signum = re.compile(r"(\d{10})_(\d{4})")
    for row in info_list:
        # a blank csv line produces a row too short to hold a signum
        whole_first_signum = row[11] if len(row) > 11 else ""
        first_match = combined_signum.search(whole_first_signum)
        # without a signum this script can't find the facsimile
        if first_match is None:
            print("No facsimiles found for:")
            print(row)
            row.clear()
            continue
        if len(row) < min_columns:
            row.extend([""] * (min_columns - len(row)))
        folder_signum, first_image_signum = first_match.groups()
        row[10] = folder_signum
        row[11] = first_image_signum
        # no second signum means the facsimile is just one image
        last_match = combined_signum.search(row[12])
        if last_match is None:
            last_image_signum = first_image_signum
        else:
            last_image_signum = last_match.group(2)
        row[12] = last_image_signum
        # alternative images are registered as a second facsimile
        if row[15] != "":
            version_signum = row[15]
            version_first_image_signum = row[16]
            if row[17] != "":
                version_last_image_signum = row[17]
            else:
                version_last_image_signum = version_first_image_signum
        else:
            version_signum = ""
            version_first_image_signum = ""
            version_last_image_signum = ""
        match_list, match_list_2 = find_image_paths(
            folder_signum,
            first_image_signum,
            last_image_signum,
            version_signum,
            version_first_image_signum,
            version_last_image_signum,
            file_list,
        )
        row.append(match_list)
        if match_list_2 != "":
            row.append(match_list_2)
        row[13] = find_old_folder_signum(folder_signum, signum_dictionary)
        print(row)
    # get rid of the rows that were emptied above
    return [row for row in info_list if row != []]
# we've got a list of all file paths to images
# we have to find the right path(s) for this publication's images
def find_image_paths(
    folder_signum,
    first_image_signum,
    last_image_signum,
    version_signum,
    version_first_image_signum,
    version_last_image_signum,
    file_list,
):
    """Find the image file paths of a publication's facsimile(s).

    Args:
        folder_signum: Folder signum that must occur in the file path.
        first_image_signum: Image signum (file name without suffix) of
            the first image, e.g. "0147".
        last_image_signum: Image signum of the last image.
        version_signum: Folder signum of an alternative version, or ""
            if there is none.
        version_first_image_signum: First image signum of the version.
        version_last_image_signum: Last image signum of the version.
        file_list: Sequence of Path objects for all known image files.

    Returns:
        A tuple (match_list, match_list_2). Each element is a list of
        forward-slash path strings, or "" when nothing was found (and,
        for match_list_2, when version_signum is "").
    """
    match_list = _collect_image_paths(
        folder_signum, first_image_signum, last_image_signum, file_list
    )
    if version_signum != "":
        match_list_2 = _collect_image_paths(
            version_signum,
            version_first_image_signum,
            version_last_image_signum,
            file_list,
            label="Version ",
        )
    else:
        match_list_2 = ""
    return match_list, match_list_2


def _collect_image_paths(folder_signum, first_image_signum, last_image_signum, file_list, label=""):
    """Return the image paths of one facsimile, or "" if it is not found."""
    try:
        # int() copes with leading zeros, also for an all-zero signum
        first_image = int(first_image_signum)
        last_image = int(last_image_signum)
    except ValueError:
        print(label + folder_signum + ": image signums " + repr(first_image_signum)
              + " and " + repr(last_image_signum) + " are not numbers.")
        return ""
    # every entry is inspected, including the last one
    for file_path in file_list:
        # the signum is matched as literal text, not as a regex pattern
        if folder_signum in str(file_path) and file_path.stem == first_image_signum:
            paths = [file_path.as_posix()]
            # the remaining images are assumed to sit next to the first
            # one, named with consecutive 4-digit numbers
            for number in range(first_image + 1, last_image + 1):
                image_name = str(number).zfill(4) + ".jpg"
                paths.append(file_path.with_name(image_name).as_posix())
            return paths
    print(label + folder_signum + " and " + str(first_image_signum) + " not found among file paths.")
    return ""
# use new folder signum to find its old equivalent from dictionary
# add old signum to list
def find_old_folder_signum(folder_signum, signum_dictionary):
    """Look up the old folder signum that belongs to a new one.

    Args:
        folder_signum: The new folder signum to look for among the
            dictionary's values.
        signum_dictionary: Mapping of old folder signum (key) to new
            folder signum (value).

    Returns:
        The key of the first entry whose value equals folder_signum.
        If there is no such entry, or its key is empty, a message is
        printed and "" is returned.
    """
    # single pass; the first matching entry decides the result
    for key, value in signum_dictionary.items():
        if folder_signum == value:
            if key:
                return key
            break
    print(folder_signum + " not found in signum_dictionary.")
    return ""
def main():
    """Run the whole conversion from CSV_IN to CSV_OUT.

    Reads the publication rows, obtains the list of facsimile file
    paths, reads the signum dictionary, lets find_signums extend the
    rows and writes the result to CSV_OUT.
    """
    # info about publications
    info_list = create_list_from_csv(CSV_IN)
    # Use the stored list of facsimile file paths when it is configured
    # and present. Otherwise (first run, or FACSIMILE_FILE_PATHS set to
    # "") scan the folders and store the list for later runs, rather
    # than failing on the missing file.
    if FACSIMILE_FILE_PATHS != "" and Path(FACSIMILE_FILE_PATHS).is_file():
        file_list = read_list_from_file(FACSIMILE_FILE_PATHS)
    else:
        file_list = create_file_list(FACSIMILE_FOLDERS)
        # write to the configured file; fall back to the default name
        # when none is configured
        cache_file = FACSIMILE_FILE_PATHS or "new_signum_facsimile_file_paths.txt"
        write_list_to_text_file(file_list, cache_file)
    # old and new folder signums from dictionary
    signum_dictionary = read_dict_from_file("dictionaries/signum_dictionary.json")
    # append folder signum, first and last image signum,
    # file paths to images, old folder signum
    result_list = find_signums(info_list, file_list, signum_dictionary)
    write_list_to_csv(result_list, CSV_OUT)
# run the conversion only when this file is executed as a script,
# not when it is imported from another module
if __name__ == "__main__":
    main()
'''
sample input:
1.10.1878;luento;x;;Finlands statsrätt;8 föreläsningen;;ruotsi;64 LM/KA;2;;1441481490_0147;1441481490_0148;;;;;
sample output:
1.10.1878;luento;x;;Finlands statsrätt;8 föreläsningen;;ruotsi;64 LM/KA;2;1441481490;0147;0148;1909976.KA;;;;;['M:/Faksimiili/Mechelin_11/1441481490/master/0147.jpg', 'M:/Faksimiili/Mechelin_11/1441481490/master/0148.jpg'];
'''