Skip to content

Commit

Permalink
Use pica dat instead of xml dump #462
Browse files Browse the repository at this point in the history
  • Loading branch information
TobiasNx committed Aug 7, 2023
1 parent 6f850da commit fc8c5b1
Show file tree
Hide file tree
Showing 12 changed files with 57 additions and 1,264 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ app/transformation/input/*.csv
.cache*
/bin/
application-log*.gz
app/transformation/input/*.dat
41 changes: 28 additions & 13 deletions app/transformation/TransformSigel.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import org.metafacture.json.JsonEncoder;
import org.metafacture.metafix.Metafix;
import org.metafacture.biblio.pica.PicaXmlHandler;
import org.metafacture.io.LineReader;
import org.metafacture.biblio.pica.PicaDecoder;
import org.metafacture.xml.XmlDecoder;
import org.metafacture.xml.XmlElementSplitter;
import org.metafacture.io.ObjectWriter;
Expand Down Expand Up @@ -49,18 +51,30 @@ public class TransformSigel {

static void process(String startOfUpdates, int intervalSize,
final String outputPath, String geoLookupServer) throws IOException {
splitUpSigelDump();
final FileOpener splitFileOpener = new FileOpener();
final FileOpener dumpOpener = new FileOpener();
JsonEncoder encodeJson = new JsonEncoder();
encodeJson.setPrettyPrinting(true);
dumpOpener//
.setReceiver(new LineReader())//
.setReceiver(new PicaDecoder())//
.setReceiver(new Metafix("conf/fix-sigel.fix"))//
.setReceiver(TransformAll.fixEnriched(geoLookupServer))//
.setReceiver(encodeJson)//
.setReceiver(TransformAll.esBulk())//
.setReceiver(new ObjectWriter<>(outputPath));
dumpOpener.process(TransformAll.DATA_INPUT_DIR + "sigil.dat");

ObjectWriter objectWriter = new ObjectWriter<>(outputPath);
objectWriter.setAppendIfFileExists(true);
splitFileOpener//
.setReceiver(new XmlDecoder())//
.setReceiver(new PicaXmlHandler())//
.setReceiver(new Metafix("conf/fix-sigel.fix"))//fix also kicks out all records without _id
.setReceiver(TransformAll.fixEnriched(geoLookupServer))//
.setReceiver(encodeJson)//
.setReceiver(TransformAll.esBulk())//
.setReceiver(new ObjectWriter<>(outputPath));
.setReceiver(objectWriter);
if (!startOfUpdates.isEmpty()) {
processSigelUpdates(startOfUpdates, intervalSize);
}
Expand All @@ -70,19 +84,20 @@ static void process(String startOfUpdates, int intervalSize,
.collect(Collectors.toList()).forEach(path -> {
splitFileOpener.process(path.toString());
});
splitFileOpener.closeStream();


}

/**
 * Runs the Sigel XML dump through a splitting pipeline.
 *
 * The dump file {@code sigel.xml} in {@code TransformAll.DATA_INPUT_DIR} is
 * decoded as XML, split with an {@code XmlElementSplitter} configured with
 * {@code DUMP_TOP_LEVEL_TAG} and {@code DUMP_ENTITY}, and handed to the
 * writer created by {@code xmlFilenameWriter} for
 * {@code TransformAll.DATA_OUTPUT_DIR} and {@code DUMP_XPATH}. The stream is
 * closed once the dump has been processed.
 */
private static void splitUpSigelDump() {
	final FileOpener sigelDumpOpener = new FileOpener();
	// Assemble the pipeline: XML decoding -> element splitting -> file writer.
	sigelDumpOpener//
			.setReceiver(new XmlDecoder())//
			.setReceiver(new XmlElementSplitter(DUMP_TOP_LEVEL_TAG, DUMP_ENTITY))//
			.setReceiver(
					xmlFilenameWriter(TransformAll.DATA_OUTPUT_DIR, DUMP_XPATH));
	// Resolve the dump location only after the pipeline has been wired up.
	final String sigelDumpPath = TransformAll.DATA_INPUT_DIR + "sigel.xml";
	sigelDumpOpener.process(sigelDumpPath);
	// Signal end of input to the downstream modules.
	sigelDumpOpener.closeStream();
}
// private static void splitUpSigelDump() {
// final FileOpener dumpFileOpener = new FileOpener();
// dumpFileOpener//
// .setReceiver(new XmlDecoder())//
// .setReceiver(new XmlElementSplitter(DUMP_TOP_LEVEL_TAG, DUMP_ENTITY))//
// .setReceiver(
// xmlFilenameWriter(TransformAll.DATA_OUTPUT_DIR, DUMP_XPATH));
// dumpFileOpener.process(TransformAll.DATA_INPUT_DIR + "sigel.xml");
// dumpFileOpener.closeStream();
// }

private static void processSigelUpdates(String startOfUpdates,
int intervalSize) {
Expand Down
2 changes: 1 addition & 1 deletion conf/application.conf
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ index.es.port.tcp=7310

index.remote=[10.1.1.106,127.0.0.1]

transformation.updates.start="2013-06-01"
transformation.updates.start="2023-06-01"
transformation.updates.interval.size=50
transformation.geo.lookup.server="http://gaia.hbz-nrw.de:4000/v1/search"
transformation.geo.lookup.threshold=0.675
Expand Down
Loading

0 comments on commit fc8c5b1

Please sign in to comment.