Skip to content

Commit

Permalink
Differentiate between test and prod (#482)
Browse files Browse the repository at this point in the history
Testing the production Fix, but with a test lookup table.

- use variable in fix
  • Loading branch information
dr0i committed Sep 25, 2023
1 parent 02f0e42 commit 6285e49
Show file tree
Hide file tree
Showing 8 changed files with 41,802 additions and 23 deletions.
2 changes: 1 addition & 1 deletion app/controllers/Transformation.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public static Result transformSet() throws IOException {
Application.CONFIG.getString("transformation.geo.lookup.server");
String outputPath = TransformAll.DATA_OUTPUT_FILE;
TransformAll.process(startOfUpdates, Integer.parseInt(intervalSize), outputPath,
geoLookupServer);
geoLookupServer, "./wikidataLookup.tsv");
} catch (Exception e) {
Logger.root().error("Transformation failed", e);
return internalServerError("Transformation failed");
Expand Down
20 changes: 13 additions & 7 deletions app/transformation/TransformAll.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap;

import org.metafacture.metafix.Metafix;
import org.metafacture.elasticsearch.JsonToElasticsearchBulk;
Expand Down Expand Up @@ -47,13 +48,13 @@ public class TransformAll {
* @param geoServer The lookup server for geo data
* @throws IOException If dump and temp files cannot be read
*/
public static void process(String startOfUpdates, int intervalSize,
final String outputPath, String geoServer) throws IOException {
public static void process(final String startOfUpdates, final int intervalSize,
String outputPath, final String geoServer, final String wikidataLookupFilename) throws IOException {
String dbsOutput = outputPath + "-dbs";
String sigelOutput = outputPath + "-sigel";
TransformSigel.processBulk(sigelOutput, geoServer); //Start processing Sigel pica binary bulk.
TransformSigel.processUpdates(startOfUpdates, intervalSize, sigelOutput, geoServer); //Start process Sigel Pica XML Updates via OAI-PMH.
TransformDbs.process(dbsOutput, geoServer); //Start process DBS data.
TransformSigel.processBulk(sigelOutput, geoServer, wikidataLookupFilename); //Start processing Sigel pica binary bulk.
TransformSigel.processUpdates(startOfUpdates, intervalSize, sigelOutput, geoServer, wikidataLookupFilename); //Start process Sigel Pica XML Updates via OAI-PMH.
TransformDbs.process(dbsOutput, geoServer,wikidataLookupFilename); //Start process DBS data.

// DBS-Data, Sigel Bulk and Updates are joined in a single ES-Bulk-file.
// DBS data first, so that ES prefers Sigel entries that come later and overwrite DBS entries if available.
Expand Down Expand Up @@ -81,8 +82,13 @@ static JsonToElasticsearchBulk esBulk() {
Application.CONFIG.getString("index.es.name"));
}

static Metafix fixEnriched(String geoLookupServer) throws FileNotFoundException {
final Metafix fixEnriched = new Metafix("conf/fix-enriched.fix");
static Metafix fixEnriched(final String geoLookupServer, final String wikidataLookupFilename) throws FileNotFoundException {
final HashMap<String, String> fixVariables = new HashMap<>();
fixVariables.put("isil2wikidata", wikidataLookupFilename);
fixVariables.put("dbsId2wikidata", wikidataLookupFilename);
fixVariables.put("wikidata2gndIdentifier", wikidataLookupFilename);
Metafix fixEnriched = new Metafix("conf/fix-enriched.fix", fixVariables);

if (geoLookupServer != null && !geoLookupServer.isEmpty()) {
fixEnriched.putMap("addLatMap", new GeoLookupMap(LookupType.LAT));
fixEnriched.putMap("addLongMap", new GeoLookupMap(LookupType.LON));
Expand Down
4 changes: 2 additions & 2 deletions app/transformation/TransformDbs.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
*
*/
public class TransformDbs {
static void process(final String outputPath, String geoLookupServer) throws FileNotFoundException {
static void process(final String outputPath, String geoLookupServer, final String wikidataLookupFilename) throws FileNotFoundException {
final FileOpener opener = new FileOpener();
opener.setEncoding("UTF-8");
final StringMatcher matcher = new StringMatcher();
Expand All @@ -33,7 +33,7 @@ static void process(final String outputPath, String geoLookupServer) throws File
.setReceiver(matcher)//
.setReceiver(decoder)//
.setReceiver(new Metafix("conf/fix-dbs.fix"))// Fix skips all records that have no "inr"
.setReceiver(TransformAll.fixEnriched(geoLookupServer))//
.setReceiver(TransformAll.fixEnriched(geoLookupServer, wikidataLookupFilename))//
.setReceiver(encodeJson)//
.setReceiver(TransformAll.esBulk())//
.setReceiver(new ObjectWriter<>(outputPath));
Expand Down
8 changes: 4 additions & 4 deletions app/transformation/TransformSigel.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public class TransformSigel {
static final String DUMP_XPATH = "/" + DUMP_TOP_LEVEL_TAG + "/" + XPATH;

// This opens the pica binary bulk we have, transforms them and saves them as JSON ES Bulk.
static void processBulk(final String outputPath, String geoLookupServer) throws IOException {
static void processBulk(final String outputPath, final String geoLookupServer, final String wikidataLookupFilename) throws IOException {
final FileOpener dumpOpener = new FileOpener();
PicaDecoder picaDecoder = new PicaDecoder();
picaDecoder.setNormalizeUTF8(true);
Expand All @@ -58,7 +58,7 @@ static void processBulk(final String outputPath, String geoLookupServer) throws
.setReceiver(new LineReader())//
.setReceiver(picaDecoder)//
.setReceiver(new Metafix("conf/fix-sigel.fix"))//
.setReceiver(TransformAll.fixEnriched(geoLookupServer))//
.setReceiver(TransformAll.fixEnriched(geoLookupServer, wikidataLookupFilename))//
.setReceiver(encodeJson)//
.setReceiver(TransformAll.esBulk())//
.setReceiver(new ObjectWriter<>(outputPath));
Expand All @@ -68,7 +68,7 @@ static void processBulk(final String outputPath, String geoLookupServer) throws

// This opens the updates and transforms them and appends them to the JSON ES Bulk of the bulk transformation.
static void processUpdates(String startOfUpdates, int intervalSize,
final String outputPath, String geoLookupServer) throws IOException {
final String outputPath, final String geoLookupServer, final String wikidataLookupFilename) throws IOException {
final FileOpener splitFileOpener = new FileOpener();
JsonEncoder encodeJson = new JsonEncoder();
encodeJson.setPrettyPrinting(true);
Expand All @@ -78,7 +78,7 @@ static void processUpdates(String startOfUpdates, int intervalSize,
.setReceiver(new XmlDecoder())//
.setReceiver(new PicaXmlHandler())//
.setReceiver(new Metafix("conf/fix-sigel.fix")) // Preprocess Sigel-Data and fix skips all records that have no "inr" and "isil"
.setReceiver(TransformAll.fixEnriched(geoLookupServer))// Process and enrich Sigel-Data.
.setReceiver(TransformAll.fixEnriched(geoLookupServer, wikidataLookupFilename))// Process and enrich Sigel-Data.
.setReceiver(encodeJson)//
.setReceiver(TransformAll.esBulk())//
.setReceiver(objectWriter);
Expand Down
6 changes: 3 additions & 3 deletions conf/fix-enriched.fix
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
do once("mapsAndMacros")
put_filemap("conf/wikidataLookup.tsv","isil2wikidata", sep_char:"\t",key_column:"2",value_column:"0",expected_columns:"-1")
put_filemap("conf/wikidataLookup.tsv","dbsId2wikidata", sep_char:"\t",key_column:"4",value_column:"0",expected_columns:"-1")
put_filemap("conf/wikidataLookup.tsv","wikidata2gndIdentifier", sep_char:"\t",key_column:"0",value_column:"3",expected_columns:"-1")
put_filemap("$[isil2wikidata]","isil2wikidata", sep_char:"\t",key_column:"2",value_column:"0",expected_columns:"-1")
put_filemap("$[dbsId2wikidata]","dbsId2wikidata", sep_char:"\t",key_column:"4",value_column:"0",expected_columns:"-1")
put_filemap("$[wikidata2gndIdentifier]","wikidata2gndIdentifier", sep_char:"\t",key_column:"0",value_column:"3",expected_columns:"-1")
put_filemap("conf/libtype-map.csv","libtype_map", sep_char:"\t")
put_filemap("conf/plz-ags-map.csv","ags_map", sep_char:"\t")
put_filemap("conf/ags-rs-map.csv","rs_map", sep_char:"\t")
Expand Down
Loading

0 comments on commit 6285e49

Please sign in to comment.