Merge #498 from remote-tracking branch 'origin/482-updateWikidataLookup'
dr0i committed Sep 26, 2023
2 parents 73d1f1a + 2d10df0 commit 9bbb4c6
Showing 14 changed files with 40,589 additions and 40,321 deletions.
3 changes: 3 additions & 0 deletions README.textile
@@ -43,6 +43,9 @@ h2. Deployment

_Short instructions for a clean deployment, including hbz-internal instructions that won't work outside the hbz network. Find detailed developer documentation further below._

+ To get the lookup table @conf/wikidataLookup.tsv@:
+ * @bash getWikidataLookupTableViaSparql.sh@

After the build steps above, edit @conf/application.conf@ as required (e.g. ports to be used by the embedded Elasticsearch), download the full data dumps, and start the application:

* @cd app/transformation/input/@
2 changes: 1 addition & 1 deletion app/controllers/Transformation.java
@@ -46,7 +46,7 @@ public static Result transformSet() throws IOException {
Application.CONFIG.getString("transformation.geo.lookup.server");
String outputPath = TransformAll.DATA_OUTPUT_FILE;
TransformAll.process(startOfUpdates, Integer.parseInt(intervalSize), outputPath,
- geoLookupServer);
+ geoLookupServer, "./wikidataLookup.tsv");
} catch (Exception e) {
Logger.root().error("Transformation failed", e);
return internalServerError("Transformation failed");
20 changes: 13 additions & 7 deletions app/transformation/TransformAll.java
@@ -8,6 +8,7 @@
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
+ import java.util.HashMap;

import org.metafacture.metafix.Metafix;
import org.metafacture.elasticsearch.JsonToElasticsearchBulk;
@@ -47,13 +48,13 @@ public class TransformAll {
* @param geoServer The lookup server for geo data
* @throws IOException If dump and temp files cannot be read
*/
- public static void process(String startOfUpdates, int intervalSize,
-     final String outputPath, String geoServer) throws IOException {
+ public static void process(final String startOfUpdates, final int intervalSize,
+     String outputPath, final String geoServer, final String wikidataLookupFilename) throws IOException {
String dbsOutput = outputPath + "-dbs";
String sigelOutput = outputPath + "-sigel";
- TransformSigel.processBulk(sigelOutput, geoServer); //Start processing Sigel pica binary bulk.
- TransformSigel.processUpdates(startOfUpdates, intervalSize, sigelOutput, geoServer); //Start process Sigel Pica XML Updates via OAI-PMH.
- TransformDbs.process(dbsOutput, geoServer); //Start process DBS data.
+ TransformSigel.processBulk(sigelOutput, geoServer, wikidataLookupFilename); // Start processing the Sigel pica binary bulk.
+ TransformSigel.processUpdates(startOfUpdates, intervalSize, sigelOutput, geoServer, wikidataLookupFilename); // Start processing Sigel Pica XML updates via OAI-PMH.
+ TransformDbs.process(dbsOutput, geoServer, wikidataLookupFilename); // Start processing DBS data.

// DBS-Data, Sigel Bulk and Updates are joined in a single ES-Bulk-file.
// DBS data first, so that ES prefers Sigel entries that come later and overwrite DBS entries if available.
@@ -81,8 +82,13 @@ static JsonToElasticsearchBulk esBulk() {
Application.CONFIG.getString("index.es.name"));
}

- static Metafix fixEnriched(String geoLookupServer) throws FileNotFoundException {
-     final Metafix fixEnriched = new Metafix("conf/fix-enriched.fix");
+ static Metafix fixEnriched(final String geoLookupServer, final String wikidataLookupFilename) throws FileNotFoundException {
+     final HashMap<String, String> fixVariables = new HashMap<>();
+     fixVariables.put("isil2wikidata", wikidataLookupFilename);
+     fixVariables.put("dbsId2wikidata", wikidataLookupFilename);
+     fixVariables.put("wikidata2gndIdentifier", wikidataLookupFilename);
+     Metafix fixEnriched = new Metafix("conf/fix-enriched.fix", fixVariables);

if (geoLookupServer != null && !geoLookupServer.isEmpty()) {
fixEnriched.putMap("addLatMap", new GeoLookupMap(LookupType.LAT));
fixEnriched.putMap("addLongMap", new GeoLookupMap(LookupType.LON));
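For orientation, a minimal sketch of the wiring introduced above. It assumes, based on the $[...] references added to conf/fix-enriched.fix below, that Metafix substitutes $[name] placeholders in the Fix file with the values passed in the constructor's variable map; the class name is invented for illustration:

import java.io.FileNotFoundException;
import java.util.HashMap;
import java.util.Map;

import org.metafacture.metafix.Metafix;

class WikidataLookupWiringSketch {
    // All three Fix variables point at the same TSV file in this commit,
    // since ISIL, DBS ID and GND ID live in different columns of one table.
    static Metafix enrich(final String wikidataLookupFilename) throws FileNotFoundException {
        final Map<String, String> vars = new HashMap<>();
        vars.put("isil2wikidata", wikidataLookupFilename);
        vars.put("dbsId2wikidata", wikidataLookupFilename);
        vars.put("wikidata2gndIdentifier", wikidataLookupFilename);
        // Metafix replaces $[isil2wikidata] etc. in the Fix file with these values.
        return new Metafix("conf/fix-enriched.fix", vars);
    }
}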
4 changes: 2 additions & 2 deletions app/transformation/TransformDbs.java
@@ -18,7 +18,7 @@
*
*/
public class TransformDbs {
- static void process(final String outputPath, String geoLookupServer) throws FileNotFoundException {
+ static void process(final String outputPath, String geoLookupServer, final String wikidataLookupFilename) throws FileNotFoundException {
final FileOpener opener = new FileOpener();
opener.setEncoding("UTF-8");
final StringMatcher matcher = new StringMatcher();
@@ -33,7 +33,7 @@ static void process(final String outputPath, String geoLookupServer) throws FileNotFoundException {
.setReceiver(matcher)//
.setReceiver(decoder)//
.setReceiver(new Metafix("conf/fix-dbs.fix"))// Fix skips all records that have no "inr"
- .setReceiver(TransformAll.fixEnriched(geoLookupServer))//
+ .setReceiver(TransformAll.fixEnriched(geoLookupServer, wikidataLookupFilename))//
.setReceiver(encodeJson)//
.setReceiver(TransformAll.esBulk())//
.setReceiver(new ObjectWriter<>(outputPath));
8 changes: 4 additions & 4 deletions app/transformation/TransformSigel.java
@@ -48,7 +48,7 @@ public class TransformSigel {
static final String DUMP_XPATH = "/" + DUMP_TOP_LEVEL_TAG + "/" + XPATH;

// This opens the pica binary bulk dump, transforms it and saves it as a JSON ES bulk file.
- static void processBulk(final String outputPath, String geoLookupServer) throws IOException {
+ static void processBulk(final String outputPath, final String geoLookupServer, final String wikidataLookupFilename) throws IOException {
final FileOpener dumpOpener = new FileOpener();
PicaDecoder picaDecoder = new PicaDecoder();
picaDecoder.setNormalizeUTF8(true);
@@ -58,7 +58,7 @@ static void processBulk(final String outputPath, String geoLookupServer) throws IOException {
.setReceiver(new LineReader())//
.setReceiver(picaDecoder)//
.setReceiver(new Metafix("conf/fix-sigel.fix"))//
- .setReceiver(TransformAll.fixEnriched(geoLookupServer))//
+ .setReceiver(TransformAll.fixEnriched(geoLookupServer, wikidataLookupFilename))//
.setReceiver(encodeJson)//
.setReceiver(TransformAll.esBulk())//
.setReceiver(new ObjectWriter<>(outputPath));
@@ -68,7 +68,7 @@ static void processBulk(final String outputPath, String geoLookupServer) throws IOException {

// This opens the updates, transforms them, and appends them to the JSON ES bulk file from the bulk transformation.
static void processUpdates(String startOfUpdates, int intervalSize,
- final String outputPath, String geoLookupServer) throws IOException {
+ final String outputPath, final String geoLookupServer, final String wikidataLookupFilename) throws IOException {
final FileOpener splitFileOpener = new FileOpener();
JsonEncoder encodeJson = new JsonEncoder();
encodeJson.setPrettyPrinting(true);
@@ -78,7 +78,7 @@ static void processUpdates(String startOfUpdates, int intervalSize,
.setReceiver(new XmlDecoder())//
.setReceiver(new PicaXmlHandler())//
.setReceiver(new Metafix("conf/fix-sigel.fix")) // Preprocess Sigel-Data and fix skips all records that have no "inr" and "isil"
- .setReceiver(TransformAll.fixEnriched(geoLookupServer))// Process and enrich Sigel-Data.
+ .setReceiver(TransformAll.fixEnriched(geoLookupServer, wikidataLookupFilename))// Process and enrich Sigel-Data.
.setReceiver(encodeJson)//
.setReceiver(TransformAll.esBulk())//
.setReceiver(objectWriter);
6 changes: 3 additions & 3 deletions conf/fix-enriched.fix
@@ -1,7 +1,7 @@
do once("mapsAndMacros")
put_filemap("conf/wikidataLookup.tsv","isil2wikidata", sep_char:"\t",key_column:"2",value_column:"0",expected_columns:"-1")
put_filemap("conf/wikidataLookup.tsv","dbsId2wikidata", sep_char:"\t",key_column:"4",value_column:"0",expected_columns:"-1")
put_filemap("conf/wikidataLookup.tsv","wikidata2gndIdentifier", sep_char:"\t",key_column:"0",value_column:"3",expected_columns:"-1")
put_filemap("$[isil2wikidata]","isil2wikidata", sep_char:"\t",key_column:"2",value_column:"0",expected_columns:"-1")
put_filemap("$[dbsId2wikidata]","dbsId2wikidata", sep_char:"\t",key_column:"4",value_column:"0",expected_columns:"-1")
put_filemap("$[wikidata2gndIdentifier]","wikidata2gndIdentifier", sep_char:"\t",key_column:"0",value_column:"3",expected_columns:"-1")
put_filemap("conf/libtype-map.csv","libtype_map", sep_char:"\t")
put_filemap("conf/plz-ags-map.csv","ags_map", sep_char:"\t")
put_filemap("conf/ags-rs-map.csv","rs_map", sep_char:"\t")
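The put_filemap calls above only declare the named maps. Purely as a hypothetical illustration (the enrichment rules that use them sit in the truncated remainder of the file and are not part of this diff), a lookup against one of these maps could look like:

# Hypothetical illustration only - not taken from this commit:
copy_field("isil", "wikidataEntity")       # e.g. DE-605
lookup("wikidataEntity", "isil2wikidata")  # -> http://www.wikidata.org/entity/Q...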
10 changes: 0 additions & 10 deletions conf/wikidataLookup.sparql

This file was deleted.

12 changes: 12 additions & 0 deletions getWikidataLookupTableViaSparql.sh
@@ -0,0 +1,12 @@
# Items that have an ISIL or DBS ID, add GND ID if existing
curl --header "Accept: text/tab-separated-values" -G 'https://query.wikidata.org/sparql' --data-urlencode query='
SELECT ?item ?itemLabel ?isil ?gndId ?dbsId
WHERE
{
{ ?item wdt:P791 ?isil } # Give back entries that either have an ISIL
UNION # or
{ ?item wdt:P4007 ?dbsId . } # a DBS ID
OPTIONAL { ?item wdt:P227 ?gndId . } # Add GND ID if in Wikidata.
SERVICE wikibase:label { bd:serviceParam wikibase:language "de,en". }
}
' |sed 's#<##g' | sed 's#^"##g'|sed 's#"\t<#\t#g'|sed 's#>\t"#\t#g' |sed 's#"@..#\t#g' |sed 's#\t"#\t#g' |sed 's#"\t#\t#g' |sed 's#"\^\^.*##g' |sed 's#\t\t#\t#g' |sed 's#"$##g' > ./conf/wikidataLookup.tsv
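For reference, a hypothetical cleaned output row (tab-separated; all values invented for illustration). The column order follows the SELECT clause, which is what the key_column/value_column indices in conf/fix-enriched.fix refer to: 0=item, 1=itemLabel, 2=isil, 3=gndId, 4=dbsId.

http://www.wikidata.org/entity/Q99999999	Example Library	DE-9999	1234567-8	AB123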