From 36e1c1d4a1be4c835fd7aa793a7728131ef71cc7 Mon Sep 17 00:00:00 2001 From: TobiasNx Date: Tue, 22 Aug 2023 16:06:54 +0200 Subject: [PATCH] Simplify the oai pmh update process for Sigel data #462 --- README.textile | 4 +- app/controllers/Transformation.java | 5 +- app/transformation/TransformAll.java | 5 +- app/transformation/TransformSigel.java | 143 ++----------------------- conf/application.conf | 2 - 5 files changed, 14 insertions(+), 145 deletions(-) diff --git a/README.textile b/README.textile index 72855e88..51a46c98 100644 --- a/README.textile +++ b/README.textile @@ -131,9 +131,9 @@ h2. Transform The transformation is triggered when the application starts but it can also be started separately when the application is running (only works hbz internally). -If you run the transformation with the full data (see above for downloads), the application will download additional updates for the Sigel data. These downloads comprise the data from a given date until today. They are split into smaller intervals of several days, you can specify the size of these intervals. +If you run the transformation with the full data (see above for downloads), the application will download additional updates for the Sigel data. -Thus, you will have specify two parameters in @conf/application.conf@ : (1) the date from which the updates start (usually the date of the base dump creation, e.g. 2013-06-01) and (2) the interval size in days (must not be too large). +Thus, you will have to specify one parameter in @conf/application.conf@ : the date from which the updates start (usually the date of the base dump creation, e.g. 2013-06-01). 
You can run the transformation of the full data using the following command: diff --git a/app/controllers/Transformation.java b/app/controllers/Transformation.java index 6cdce6bd..b5a88e4d 100644 --- a/app/controllers/Transformation.java +++ b/app/controllers/Transformation.java @@ -40,13 +40,10 @@ public static Result transformSet() throws IOException { try { String startOfUpdates = Application.CONFIG.getString("transformation.updates.start"); - String intervalSize = - Application.CONFIG.getString("transformation.updates.interval.size"); String geoLookupServer = Application.CONFIG.getString("transformation.geo.lookup.server"); String outputPath = TransformAll.DATA_OUTPUT_FILE; - TransformAll.process(startOfUpdates, Integer.parseInt(intervalSize), outputPath, - geoLookupServer); + TransformAll.process(startOfUpdates, outputPath, geoLookupServer); } catch (Exception e) { Logger.root().error("Transformation failed", e); return internalServerError("Transformation failed"); diff --git a/app/transformation/TransformAll.java b/app/transformation/TransformAll.java index 99db77c4..be15d591 100644 --- a/app/transformation/TransformAll.java +++ b/app/transformation/TransformAll.java @@ -42,17 +42,16 @@ public class TransformAll { /** * @param startOfUpdates Date from which updates should start - * @param intervalSize Days to load update for at once * @param outputPath The path to which the output of transform should go * @param geoServer The lookup server for geo data * @throws IOException If dump and temp files cannot be read */ - public static void process(String startOfUpdates, int intervalSize, + public static void process(String startOfUpdates, final String outputPath, String geoServer) throws IOException { String dbsOutput = outputPath + "-dbs"; String sigelOutput = outputPath + "-sigel"; TransformSigel.processBulk(sigelOutput, geoServer); //Start processing Sigel pica binary bulk. 
- TransformSigel.processUpdates(startOfUpdates, intervalSize, sigelOutput, geoServer); //Start process Sigel Pica XML Updates via OAI-PMH. + TransformSigel.processUpdates(startOfUpdates, sigelOutput, geoServer); //Start process Sigel Pica XML Updates via OAI-PMH. TransformDbs.process(dbsOutput, geoServer); //Start process DBS data. // DBS-Data, Sigel Bulk and Updates are joined in a single ES-Bulk-file. diff --git a/app/transformation/TransformSigel.java b/app/transformation/TransformSigel.java index 84ebeaf6..fe6e64fe 100644 --- a/app/transformation/TransformSigel.java +++ b/app/transformation/TransformSigel.java @@ -36,16 +36,10 @@ /** * Transformation from Sigel PicaPlus-XML to JSON. * - * @author Fabian Steeg (fsteeg) + * @author Fabian Steeg (fsteeg), Tobias Bülte (@TobiasNx) * */ public class TransformSigel { - static final String UPDATE_TOP_LEVEL_TAG = "harvest"; - static final String DUMP_TOP_LEVEL_TAG = "collection"; - static final String UPDATE_ENTITY = "metadata"; - static final String XPATH = - "/*[local-name() = 'record']/*[local-name() = 'global']/*[local-name() = 'tag'][@id='008H']/*[local-name() = 'subf'][@id='e']"; - static final String DUMP_XPATH = "/" + DUMP_TOP_LEVEL_TAG + "/" + XPATH; // This opens the pica binary bulk we have, transforms them and saves them as JSON ES Bulk. static void processBulk(final String outputPath, String geoLookupServer) throws IOException { @@ -67,14 +61,17 @@ static void processBulk(final String outputPath, String geoLookupServer) throws } // This opens the updates and transforms them and appends them to the JSON ES Bulk of the bulk transformation. 
- static void processUpdates(String startOfUpdates, int intervalSize, + static void processUpdates(String startOfUpdates, final String outputPath, String geoLookupServer) throws IOException { - final FileOpener splitFileOpener = new FileOpener(); + OaiPmhOpener sigelOaiPmhUpdates = new OaiPmhOpener(); + sigelOaiPmhUpdates.setDateFrom(startOfUpdates); + sigelOaiPmhUpdates.setMetadataPrefix("PicaPlus-xml"); + sigelOaiPmhUpdates.setSetSpec("bib"); JsonEncoder encodeJson = new JsonEncoder(); encodeJson.setPrettyPrinting(true); ObjectWriter objectWriter = new ObjectWriter<>(outputPath); objectWriter.setAppendIfFileExists(true); - splitFileOpener// + sigelOaiPmhUpdates// .setReceiver(new XmlDecoder())// .setReceiver(new PicaXmlHandler())// .setReceiver(new Metafix("conf/fix-sigel.fix")) // Preprocess Sigel-Data and fix skips all records that have no "inr" and "isil" @@ -82,129 +79,7 @@ static void processUpdates(String startOfUpdates, int intervalSize, .setReceiver(encodeJson)// .setReceiver(TransformAll.esBulk())// .setReceiver(objectWriter); - if (!startOfUpdates.isEmpty()) { - processSigelUpdates(startOfUpdates, intervalSize); - } - Files.walk(Paths.get(TransformAll.DATA_OUTPUT_DIR))// - .filter(Files::isRegularFile)// - .filter(file -> file.toString().endsWith(".xml"))// - .collect(Collectors.toList()).forEach(path -> { - splitFileOpener.process(path.toString()); - }); - splitFileOpener.closeStream(); + sigelOaiPmhUpdates.process(Application.CONFIG.getString("transformation.sigel.repository")); + sigelOaiPmhUpdates.closeStream(); } - - private static void processSigelUpdates(String startOfUpdates, - int intervalSize) { - int updateIntervals = - calculateIntervals(startOfUpdates, getToday(), intervalSize); - ArrayList updateOpenerList = - buildUpdatePipes(intervalSize, startOfUpdates, updateIntervals); - for (OaiPmhOpener updateOpener : updateOpenerList) { - updateOpener.process( - Application.CONFIG.getString("transformation.sigel.repository")); - 
updateOpener.closeStream(); - } - } - - - private static ArrayList buildUpdatePipes(int intervalSize, - String startOfUpdates, int updateIntervals) { - String start = startOfUpdates; - String end = addDays(start, intervalSize); - final ArrayList updateOpenerList = new ArrayList<>(); - - // There has to be at least one interval - int intervals; - if (updateIntervals == 0) - intervals = 1; - else - intervals = updateIntervals; - - for (int i = 0; i < intervals; i++) { - final OaiPmhOpener openSigelUpdates = createOaiPmhOpener(start, end); - final XmlElementSplitter xmlSplitter = - new XmlElementSplitter(UPDATE_TOP_LEVEL_TAG, UPDATE_ENTITY); - final String updateXPath = - "/" + UPDATE_TOP_LEVEL_TAG + "/" + UPDATE_ENTITY + "/" + XPATH; - setupSigelSplitting(openSigelUpdates, xmlSplitter, updateXPath, - TransformAll.DATA_OUTPUT_DIR); - - updateOpenerList.add(openSigelUpdates); - start = addDays(start, intervalSize); - if (i == intervals - 2) - end = getToday(); - else - end = addDays(end, intervalSize); - } - - return updateOpenerList; - } - - /** - * @param start the start of updates formatted in yyyy-MM-dd - * @param end the end of updates formatted in yyyy-MM-dd - * @return a new OaiPmhOpener - */ - private static OaiPmhOpener createOaiPmhOpener(String start, String end) { - OaiPmhOpener opener = new OaiPmhOpener(); - opener.setDateFrom(start); - opener.setDateUntil(end); - opener.setMetadataPrefix("PicaPlus-xml"); - opener.setSetSpec("bib"); - return opener; - } - - private static String addDays(String start, int intervalSize) { - final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd"); - String result = null; - try { - final Date startDate = dateFormat.parse(start); - final Calendar calender = Calendar.getInstance(); - calender.setTime(startDate); - calender.add(Calendar.DATE, intervalSize); - result = dateFormat.format(calender.getTime()); - } catch (ParseException e) { - Logger.warn("Couldn't add days", e); - } - return result; - } - - private 
static int calculateIntervals(String startOfUpdates, String end, - int intervalSize) { - final LocalDate startDate = LocalDate.parse(startOfUpdates); - final LocalDate endDate = LocalDate.parse(end); - long timeSpan = startDate.until(endDate, ChronoUnit.DAYS); - return (int) timeSpan / intervalSize; - } - - private static String getToday() { - String dateFormat = "yyyy-MM-dd"; - Calendar calender = Calendar.getInstance(); - SimpleDateFormat simpleDate = new SimpleDateFormat(dateFormat); - return simpleDate.format(calender.getTime()); - } - - static XmlFilenameWriter setupSigelSplitting(final DefaultObjectPipe> opener, - final XmlElementSplitter splitter, String xPath, - final String outputPath) { - final XmlDecoder xmlDecoder = new XmlDecoder(); - final XmlFilenameWriter xmlFilenameWriter = - xmlFilenameWriter(outputPath, xPath); - return opener// - .setReceiver(xmlDecoder)// - .setReceiver(splitter)// - .setReceiver(xmlFilenameWriter); - } - - private static XmlFilenameWriter xmlFilenameWriter(String outputPath, - String xPath) { - final XmlFilenameWriter xmlFilenameWriter = new XmlFilenameWriter(); - xmlFilenameWriter.setStartIndex(0); - xmlFilenameWriter.setEndIndex(2); - xmlFilenameWriter.setTarget(outputPath); - xmlFilenameWriter.setProperty(xPath); - return xmlFilenameWriter; - } - } diff --git a/conf/application.conf b/conf/application.conf index 46d66f0b..129bdaa8 100644 --- a/conf/application.conf +++ b/conf/application.conf @@ -16,8 +16,6 @@ index.es.port.tcp=7310 index.remote=[10.1.1.106,127.0.0.1] transformation.updates.start="2023-06-01" -# due to complications with the oai-pmh interval updates we increase the interval until #487 is properly fixed. -transformation.updates.interval.size=2000 transformation.geo.lookup.server="http://gaia.hbz-nrw.de:4000/v1/search" transformation.geo.lookup.threshold=0.675 transformation.sigel.repository="http://gnd-proxy.lobid.org/oai/repository"