Simplify the oai pmh update process for Sigel data #462
TobiasNx committed Sep 18, 2023
1 parent af05255 commit 36e1c1d
Showing 5 changed files with 14 additions and 145 deletions.
4 changes: 2 additions & 2 deletions README.textile
@@ -131,9 +131,9 @@ h2. Transform

The transformation is triggered when the application starts, but it can also be started separately while the application is running (this only works internally at hbz).

If you run the transformation with the full data (see above for downloads), the application will download additional updates for the Sigel data. These downloads comprise the data from a given date until today. They are split into smaller intervals of several days; you can specify the size of these intervals.
If you run the transformation with the full data (see above for downloads), the application will download additional updates for the Sigel data.

Thus, you will have to specify two parameters in @conf/application.conf@: (1) the date from which the updates start (usually the date of the base dump creation, e.g. 2013-06-01) and (2) the interval size in days (must not be too large).
Thus, you will have to specify one parameter in @conf/application.conf@: the date from which the updates start (usually the date of the base dump creation, e.g. 2013-06-01).
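
For reference, this single entry in @conf/application.conf@ is all that is needed; the value shown here is only an example, use the creation date of your own base dump:

transformation.updates.start="2023-06-01"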

You can run the transformation of the full data using the following command:

5 changes: 1 addition & 4 deletions app/controllers/Transformation.java
@@ -40,13 +40,10 @@ public static Result transformSet() throws IOException {
try {
String startOfUpdates =
Application.CONFIG.getString("transformation.updates.start");
String intervalSize =
Application.CONFIG.getString("transformation.updates.interval.size");
String geoLookupServer =
Application.CONFIG.getString("transformation.geo.lookup.server");
String outputPath = TransformAll.DATA_OUTPUT_FILE;
TransformAll.process(startOfUpdates, Integer.parseInt(intervalSize), outputPath,
geoLookupServer);
TransformAll.process(startOfUpdates, outputPath, geoLookupServer);
} catch (Exception e) {
Logger.root().error("Transformation failed", e);
return internalServerError("Transformation failed");
5 changes: 2 additions & 3 deletions app/transformation/TransformAll.java
@@ -42,17 +42,16 @@ public class TransformAll {

/**
* @param startOfUpdates Date from which updates should start
* @param intervalSize Days to load update for at once
* @param outputPath The path to which the output of transform should go
* @param geoServer The lookup server for geo data
* @throws IOException If dump and temp files cannot be read
*/
public static void process(String startOfUpdates, int intervalSize,
public static void process(String startOfUpdates,
final String outputPath, String geoServer) throws IOException {
String dbsOutput = outputPath + "-dbs";
String sigelOutput = outputPath + "-sigel";
TransformSigel.processBulk(sigelOutput, geoServer); //Start processing Sigel pica binary bulk.
TransformSigel.processUpdates(startOfUpdates, intervalSize, sigelOutput, geoServer); //Start processing Sigel Pica XML updates via OAI-PMH.
TransformSigel.processUpdates(startOfUpdates, sigelOutput, geoServer); //Start processing Sigel Pica XML updates via OAI-PMH.
TransformDbs.process(dbsOutput, geoServer); //Start processing DBS data.

// DBS-Data, Sigel Bulk and Updates are joined in a single ES-Bulk-file.
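
As an illustration of the simplified entry point, here is a minimal sketch of a caller (the class name TransformExample is hypothetical; the date and geo lookup URL are only example values taken from conf/application.conf in this commit):

import transformation.TransformAll;

public class TransformExample {
	public static void main(String[] args) throws java.io.IOException {
		// Example values; in the application they come from conf/application.conf.
		String startOfUpdates = "2023-06-01"; // transformation.updates.start
		String geoLookupServer = "http://gaia.hbz-nrw.de:4000/v1/search"; // transformation.geo.lookup.server
		// Writes the combined DBS and Sigel ES bulk data to the default output file.
		TransformAll.process(startOfUpdates, TransformAll.DATA_OUTPUT_FILE, geoLookupServer);
	}
}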
143 changes: 9 additions & 134 deletions app/transformation/TransformSigel.java
@@ -36,16 +36,10 @@
/**
* Transformation from Sigel PicaPlus-XML to JSON.
*
* @author Fabian Steeg (fsteeg)
* @author Fabian Steeg (fsteeg), Tobias Bülte (@TobiasNx)
*
*/
public class TransformSigel {
static final String UPDATE_TOP_LEVEL_TAG = "harvest";
static final String DUMP_TOP_LEVEL_TAG = "collection";
static final String UPDATE_ENTITY = "metadata";
static final String XPATH =
"/*[local-name() = 'record']/*[local-name() = 'global']/*[local-name() = 'tag'][@id='008H']/*[local-name() = 'subf'][@id='e']";
static final String DUMP_XPATH = "/" + DUMP_TOP_LEVEL_TAG + "/" + XPATH;

// This opens the Pica binary bulk files we have, transforms them and saves them as a JSON ES bulk file.
static void processBulk(final String outputPath, String geoLookupServer) throws IOException {
@@ -67,144 +61,25 @@ static void processBulk(final String outputPath, String geoLookupServer) throws
}

// This opens the updates, transforms them and appends them to the JSON ES bulk file from the bulk transformation.
static void processUpdates(String startOfUpdates, int intervalSize,
static void processUpdates(String startOfUpdates,
final String outputPath, String geoLookupServer) throws IOException {
final FileOpener splitFileOpener = new FileOpener();
OaiPmhOpener sigelOaiPmhUpdates = new OaiPmhOpener();
sigelOaiPmhUpdates.setDateFrom(startOfUpdates);
sigelOaiPmhUpdates.setMetadataPrefix("PicaPlus-xml");
sigelOaiPmhUpdates.setSetSpec("bib");
JsonEncoder encodeJson = new JsonEncoder();
encodeJson.setPrettyPrinting(true);
ObjectWriter objectWriter = new ObjectWriter<>(outputPath);
objectWriter.setAppendIfFileExists(true);
splitFileOpener//
sigelOaiPmhUpdates//
.setReceiver(new XmlDecoder())//
.setReceiver(new PicaXmlHandler())//
.setReceiver(new Metafix("conf/fix-sigel.fix")) // Preprocess Sigel data; the fix skips all records that have no "inr" and "isil"
.setReceiver(TransformAll.fixEnriched(geoLookupServer))// Process and enrich Sigel-Data.
.setReceiver(encodeJson)//
.setReceiver(TransformAll.esBulk())//
.setReceiver(objectWriter);
if (!startOfUpdates.isEmpty()) {
processSigelUpdates(startOfUpdates, intervalSize);
}
Files.walk(Paths.get(TransformAll.DATA_OUTPUT_DIR))//
.filter(Files::isRegularFile)//
.filter(file -> file.toString().endsWith(".xml"))//
.collect(Collectors.toList()).forEach(path -> {
splitFileOpener.process(path.toString());
});
splitFileOpener.closeStream();
sigelOaiPmhUpdates.process(Application.CONFIG.getString("transformation.sigel.repository"));
sigelOaiPmhUpdates.closeStream();
}
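
For orientation: with the settings above, the opener harvests from the repository configured as transformation.sigel.repository, which corresponds roughly to a standard OAI-PMH ListRecords request such as the following (URL assembled by hand for illustration, not taken from the code):

http://gnd-proxy.lobid.org/oai/repository?verb=ListRecords&metadataPrefix=PicaPlus-xml&set=bib&from=2023-06-01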

private static void processSigelUpdates(String startOfUpdates,
int intervalSize) {
int updateIntervals =
calculateIntervals(startOfUpdates, getToday(), intervalSize);
ArrayList<OaiPmhOpener> updateOpenerList =
buildUpdatePipes(intervalSize, startOfUpdates, updateIntervals);
for (OaiPmhOpener updateOpener : updateOpenerList) {
updateOpener.process(
Application.CONFIG.getString("transformation.sigel.repository"));
updateOpener.closeStream();
}
}


private static ArrayList<OaiPmhOpener> buildUpdatePipes(int intervalSize,
String startOfUpdates, int updateIntervals) {
String start = startOfUpdates;
String end = addDays(start, intervalSize);
final ArrayList<OaiPmhOpener> updateOpenerList = new ArrayList<>();

// There has to be at least one interval
int intervals;
if (updateIntervals == 0)
intervals = 1;
else
intervals = updateIntervals;

for (int i = 0; i < intervals; i++) {
final OaiPmhOpener openSigelUpdates = createOaiPmhOpener(start, end);
final XmlElementSplitter xmlSplitter =
new XmlElementSplitter(UPDATE_TOP_LEVEL_TAG, UPDATE_ENTITY);
final String updateXPath =
"/" + UPDATE_TOP_LEVEL_TAG + "/" + UPDATE_ENTITY + "/" + XPATH;
setupSigelSplitting(openSigelUpdates, xmlSplitter, updateXPath,
TransformAll.DATA_OUTPUT_DIR);

updateOpenerList.add(openSigelUpdates);
start = addDays(start, intervalSize);
if (i == intervals - 2)
end = getToday();
else
end = addDays(end, intervalSize);
}

return updateOpenerList;
}

/**
* @param start the start of updates formatted in yyyy-MM-dd
* @param end the end of updates formatted in yyyy-MM-dd
* @return a new OaiPmhOpener
*/
private static OaiPmhOpener createOaiPmhOpener(String start, String end) {
OaiPmhOpener opener = new OaiPmhOpener();
opener.setDateFrom(start);
opener.setDateUntil(end);
opener.setMetadataPrefix("PicaPlus-xml");
opener.setSetSpec("bib");
return opener;
}

private static String addDays(String start, int intervalSize) {
final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
String result = null;
try {
final Date startDate = dateFormat.parse(start);
final Calendar calender = Calendar.getInstance();
calender.setTime(startDate);
calender.add(Calendar.DATE, intervalSize);
result = dateFormat.format(calender.getTime());
} catch (ParseException e) {
Logger.warn("Couldn't add days", e);
}
return result;
}

private static int calculateIntervals(String startOfUpdates, String end,
int intervalSize) {
final LocalDate startDate = LocalDate.parse(startOfUpdates);
final LocalDate endDate = LocalDate.parse(end);
long timeSpan = startDate.until(endDate, ChronoUnit.DAYS);
return (int) timeSpan / intervalSize;
}

private static String getToday() {
String dateFormat = "yyyy-MM-dd";
Calendar calender = Calendar.getInstance();
SimpleDateFormat simpleDate = new SimpleDateFormat(dateFormat);
return simpleDate.format(calender.getTime());
}

static XmlFilenameWriter setupSigelSplitting(final DefaultObjectPipe<String, ObjectReceiver<Reader>> opener,
final XmlElementSplitter splitter, String xPath,
final String outputPath) {
final XmlDecoder xmlDecoder = new XmlDecoder();
final XmlFilenameWriter xmlFilenameWriter =
xmlFilenameWriter(outputPath, xPath);
return opener//
.setReceiver(xmlDecoder)//
.setReceiver(splitter)//
.setReceiver(xmlFilenameWriter);
}

private static XmlFilenameWriter xmlFilenameWriter(String outputPath,
String xPath) {
final XmlFilenameWriter xmlFilenameWriter = new XmlFilenameWriter();
xmlFilenameWriter.setStartIndex(0);
xmlFilenameWriter.setEndIndex(2);
xmlFilenameWriter.setTarget(outputPath);
xmlFilenameWriter.setProperty(xPath);
return xmlFilenameWriter;
}

}
2 changes: 0 additions & 2 deletions conf/application.conf
@@ -16,8 +16,6 @@ index.es.port.tcp=7310
index.remote=[10.1.1.106,127.0.0.1]

transformation.updates.start="2023-06-01"
# Due to complications with the OAI-PMH interval updates we increase the interval size until #487 is properly fixed.
transformation.updates.interval.size=2000
transformation.geo.lookup.server="http://gaia.hbz-nrw.de:4000/v1/search"
transformation.geo.lookup.threshold=0.675
transformation.sigel.repository="http://gnd-proxy.lobid.org/oai/repository"