-
Notifications
You must be signed in to change notification settings - Fork 0
/
dspacearchive.py
162 lines (133 loc) · 6.52 KB
/
dspacearchive.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""
This module provides a class that handles the creation of a DSpace Simple Archive suitable for import into a DSpace repository.
For more information about the DSpace Simple Archive Format,
see: https://wiki.lyrasis.org/display/DSDOC7x/Importing+and+Exporting+Items+via+Simple+Archive+Format
"""
import os
import re
from item import Item
from saxonche import *
import xml.etree.ElementTree as ET
from shutil import copy, make_archive
class DspaceArchive:
    """Collect items from an input directory and write them as a DSpace Simple Archive.

    The input directory is expected to contain an ``XML`` sub-directory with
    one metadata file per item and a ``PDF-A`` sub-directory with the
    bitstreams. A metadata file and its bitstreams are matched by the file
    basename (the file name without its extension).
    """

    # Handles of all collections in lx86, keyed by the item's dc.type value.
    COLLECTION_HANDLES = {
        'article': '20.500.12034/6', 'bachelorThesis': '20.500.12034/723',
        'book': '20.500.12034/4', 'bookPart': '20.500.12034/586',
        'code': '20.500.12034/721', 'conferenceObject': '20.500.12034/7',
        'courseMaterial': '20.500.12034/8', 'doctoralThesis': '20.500.12034/724',
        'habilitationTreatise': '20.500.12034/725', 'image': '20.500.12034/627',
        'masterThesis': '20.500.12034/10', 'movingImage': '20.500.12034/626',
        'other': '20.500.12034/720', 'preprint': '20.500.12034/718',
        'preregistration': '20.500.12034/736', 'report': '20.500.12034/9',
        'researchData': '20.500.12034/576', 'review': '20.500.12034/719',
        'sound': '20.500.12034/722', 'test': '20.500.12034/12',
    }

    # Namespace map used to locate the dc.type element in dublin_core.xml.
    DC_NAMESPACE = {'dc': 'http://purl.org/dc/elements/1.1/'}

    def __init__(self, input_base_path, xsl_files):
        """Scan ``input_base_path`` and create one item per file in ``XML``.

        Args:
            input_base_path: Directory containing the ``XML`` and ``PDF-A``
                sub-directories.
            xsl_files: Mapping of schema name to XSLT stylesheet path; it is
                iterated with ``.items()`` when the metadata is written.

        If ``input_base_path`` has no ``XML`` entry the archive stays empty.
        """
        self.items = []
        self.input_base_path = input_base_path
        self.xsl_files = xsl_files

        xml_dir = os.path.join(self.input_base_path, 'XML')
        if not os.path.exists(xml_dir):
            return

        # Sorted so that item order (and the Item_NNN numbering printed by
        # write()) does not depend on the file system's listing order.
        xml_filenames = sorted(os.listdir(xml_dir))
        if not xml_filenames:
            return

        # The bitstream directory is listed once and reused for every item.
        pdf_filenames = sorted(os.listdir(os.path.join(self.input_base_path, 'PDF-A')))

        for xml_filename in xml_filenames:
            # splitext handles '.html' and '.xml' (or any other extension)
            # instead of chopping a fixed number of characters.
            file_basename = os.path.splitext(xml_filename)[0]
            pdf_files = self._matching_pdfs(file_basename, pdf_filenames)
            self.addItem(Item(file_basename, pdf_files, xml_filename))

    @staticmethod
    def _matching_pdfs(file_basename, pdf_filenames):
        """Return the names in ``pdf_filenames`` whose stem contains ``file_basename``.

        An item can have more than one bitstream. Matching is by substring so
        that names with a prefix, e.g. '070-089 34088.pdf', still match the
        basename '34088'.
        NOTE(review): a basename that is a substring of another item's PDF
        name (e.g. '1' and '11.pdf') matches as well.
        """
        return [
            pdf_filename
            for pdf_filename in pdf_filenames
            if file_basename in os.path.splitext(pdf_filename)[0]
        ]

    def addItem(self, item):
        """Add an item to the archive."""
        self.items.append(item)

    def getItem(self, index):
        """Return the item stored at position ``index``."""
        return self.items[index]

    def write(self, dir="."):
        """Write the archive to ``dir`` in the DSpace Simple Archive format.

        One sub-directory named after the item's basename is created per item
        and filled with the metadata file(s), the bitstreams, the
        ``collections`` file and the ``contents`` file.
        """
        self.create_directory(dir)

        for index, item in enumerate(self.items):
            # item directory
            item_path = os.path.join(dir, item.basename)
            self.create_directory(item_path)
            print("====== Writing Item_%03d: " % (index + 1), item_path, " ======")

            # metadata file(s)
            self.writeMetadata(item, item_path)

            # content files (aka bitstreams)
            self.copyFiles(item, item_path)

            # collections file; reads the dublin_core.xml written by
            # writeMetadata, so it has to run after it
            self.writeCollectionsFile(item_path)

            # contents file
            self.writeContentsFile(item, item_path)

    def zip(self, output_filename, dir_name=None):
        """Create the zip file ``output_filename`` + '.zip' from ``dir_name``."""
        make_archive(output_filename, 'zip', dir_name)

    def create_directory(self, path):
        """Create ``path``, including missing parent directories, if it doesn't already exist."""
        os.makedirs(path, exist_ok=True)

    def writeContentsFile(self, item, item_path):
        """Write the ``contents`` file listing the item's bitstreams, one per line.

        The file is rewritten on every call, so writing the same archive
        twice does not duplicate entries. An item without bitstreams gets an
        empty file.
        """
        with open(os.path.join(item_path, 'contents'), "w") as contents_file:
            for pdf_filename in item.pdf:
                contents_file.write(pdf_filename + "\n")

    def writeCollectionsFile(self, item_path):
        """Write the ``collections`` file holding the handle of the item's collection.

        The collection is chosen from the FIRST dc.type value found in the
        item's ``dublin_core.xml``, which must already exist in ``item_path``.

        Raises:
            AttributeError: If ``dublin_core.xml`` has no dc.type element.
            KeyError: If no collection handle is known for the dc.type value.
        """
        dublin_core_path = os.path.join(item_path, 'dublin_core.xml')
        root = ET.parse(dublin_core_path).getroot()

        # find() returns the first match only.
        dc_type = root.find("./dc:dcvalue[@element='type']", self.DC_NAMESPACE)
        if dc_type is None:
            raise AttributeError("No dc.type element found in %s" % dublin_core_path)

        # Tolerate whitespace around the value, e.g. from indented XSLT output.
        type_name = (dc_type.text or "").strip()
        try:
            collection = self.COLLECTION_HANDLES[type_name]
        except KeyError:
            raise KeyError(
                "No collection handle known for dc.type %r in %s"
                % (type_name, dublin_core_path)
            ) from None

        with open(os.path.join(item_path, 'collections'), "w") as collections_file:
            collections_file.write(collection)

    def copyFiles(self, item, item_path):
        """Copy the item's bitstreams from ``PDF-A`` into the item directory."""
        pdf_dir = os.path.join(self.input_base_path, 'PDF-A')
        for pdf_filename in item.pdf:
            copy(os.path.join(pdf_dir, pdf_filename), item_path)

    def writeMetadata(self, item, item_path):
        """Transform the item's input XML with every configured stylesheet.

        The stylesheet registered under the schema name 'dc' produces
        ``dublin_core.xml``; any other schema produces
        ``metadata_<schema>.xml``.
        """
        source_xml = os.path.join(self.input_base_path, 'XML', item.xml)

        with PySaxonProcessor(license=False) as proc:
            xsltproc = proc.new_xslt30_processor()
            for schema, stylesheet in self.xsl_files.items():
                executable = xsltproc.compile_stylesheet(stylesheet_file=stylesheet, version='2.0')
                executable.set_initial_match_selection(file_name=source_xml)
                # Set the global context item for the transformation.
                # (Enable the use of global variables in XSLT)
                executable.set_global_context_item(file_name=source_xml)

                # It is possible to have Dublin Core and further schemas
                if schema == 'dc':
                    xml_filename = 'dublin_core.xml'
                else:
                    xml_filename = 'metadata_' + schema + '.xml'
                executable.apply_templates_returning_file(output_file=os.path.join(item_path, xml_filename))