Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/useFixInsteadOfMorph' into 462-simplifyOaiPmhProcess
Browse files Browse the repository at this point in the history
  • Loading branch information
TobiasNx committed Aug 22, 2023
2 parents 2c4734a + 814c2e9 commit cf18296
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 12 deletions.
28 changes: 18 additions & 10 deletions app/controllers/Index.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.Version;
import org.elasticsearch.action.admin.indices.create.CreateIndexRequestBuilder;
import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest;
import org.elasticsearch.action.admin.indices.refresh.RefreshRequest;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
Expand Down Expand Up @@ -106,17 +108,20 @@ public static Result start() throws IOException {

/**
* @param pathToJson Path to the JSON file to index
* @throws IOException if json file cannot be found
* @throws IOException if json file cannot be found
* @throws IllegalArgumentException if to be indexed file is smaller than expected
* @throws ElasticsearchException if bulk indexing fails
*/
public static void initialize(String pathToJson) throws IOException {
public static void initialize(String pathToJson) throws IOException, ElasticsearchException {
long minimumSize =
Long.parseLong(Application.CONFIG.getString("index.file.minsize"));
if (new File(pathToJson).length() >= minimumSize) {
createEmptyIndex();
indexData(pathToJson);
} else {
indexData(CLIENT, pathToJson, INDEX_NAME);
}
else {
throw new IllegalArgumentException(
"File not large enough: " + pathToJson);
"File " + pathToJson + " is not large enough - should be >='" + minimumSize + "' but is " + pathToJson.length());
}
}

Expand Down Expand Up @@ -195,19 +200,22 @@ static void createEmptyIndex() throws IOException {
Index.CLIENT.admin().indices().refresh(new RefreshRequest()).actionGet();
}

static void indexData(final String aPath) throws IOException {
static void indexData(final Client aClient, final String aPath, final String aIndex) throws IOException, ElasticsearchException {
final BulkRequestBuilder bulkRequest = Index.CLIENT.prepareBulk();
try (BufferedReader br =
new BufferedReader(new InputStreamReader(Files.newInputStream(Paths.get(aPath)),
StandardCharsets.UTF_8))) {
readData(bulkRequest, br);
readData(bulkRequest, br, aClient, aIndex);
}
bulkRequest.execute().actionGet();
Index.CLIENT.admin().indices().refresh(new RefreshRequest()).actionGet();
BulkResponse bulkResponse = bulkRequest.execute().actionGet();
if (bulkResponse.hasFailures()) {
throw new ElasticsearchException("Bulk insert failed: " + bulkResponse.buildFailureMessage());
}
aClient.admin().indices().refresh(new RefreshRequest()).actionGet();
}

private static void readData(final BulkRequestBuilder bulkRequest,
final BufferedReader br)
final BufferedReader br, final Client client, final String aIndex)
throws IOException {
final ObjectMapper mapper = new ObjectMapper();
String line;
Expand Down
4 changes: 2 additions & 2 deletions conf/dataset.jsonld
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@
"en": "Memory institutions in German-speaking countries"
},
"description": {
"de": "<p class='lead'>lobid-organisations ist ein umfassendes Verzeichnis von über 20.000 Gedächtnisinstitutionen (Bibliotheken, Archiven und Museen) im deutschsprachigen Raum.</p><p>Die Daten werden in einem strukturierten Format (JSON-LD) über eine <a href='/organisations/api'>webbasierte Programmierschnittstelle (API)</a> mit einer <a href='/organisations/search'>intuitiven Benutzeroberfläche</a> bereitgestellt. Vielfältige Möglichkeiten der Datenabfrage werden unterstützt.</p><p>Die Datenquellen dieses Dienstes sind das <a href=\"http://sigel.staatsbibliothek-berlin.de\">Deutsche ISIL-Verzeichnis</a> und die Stammdaten der <a href=\"https://www.hbz-nrw.de/produkte/bibliotheksstatistik\">Deutschen Bibliotheksstatistik (DBS)</a>.</p>",
"en": "<p class='lead'>lobid-organisations is a comprehensive directory of over 20,000 memory institutions (libraries, archives and museums) in Germany, Austria, and Switzerland.</p><p>The data is provided as structured data (JSON-LD) via a <a href='/organisations/api'>web application programming interface (API)</a> with an <a href='/organisations/search'>intuitive user interface</a> on top. Multiple options for querying the data are supported.</p><p>The source data sets for this service are the <a href=\"http://sigel.staatsbibliothek-berlin.de\">German ISIL registry</a> and the base data from the <a href=\"https://www.hbz-nrw.de/produkte/bibliotheksstatistik\">German Library Statistics (DBS)</a>.</p>"
"de": "<p class='lead'>lobid-organisations ist ein umfassendes Verzeichnis von über 20.000 Gedächtnisinstitutionen (Bibliotheken, Archiven und Museen) im deutschsprachigen Raum.</p><p>Die Daten werden in einem strukturierten Format (JSON-LD) über eine <a href='/organisations/api'>webbasierte Programmierschnittstelle (API)</a> mit einer <a href='/organisations/search'>intuitiven Benutzeroberfläche</a> bereitgestellt. Vielfältige Möglichkeiten der Datenabfrage werden unterstützt.</p><p>Die Datenquellen dieses Dienstes sind das <a href=\"http://sigel.staatsbibliothek-berlin.de\">Deutsche ISIL-Verzeichnis</a> und die Stammdaten der <a href=\"https://www.hbz-nrw.de/produkte/bibliotheksstatistik\">Bibliotheksstatistik</a>.</p>",
"en": "<p class='lead'>lobid-organisations is a comprehensive directory of over 20,000 memory institutions (libraries, archives and museums) in Germany, Austria, and Switzerland.</p><p>The data is provided as structured data (JSON-LD) via a <a href='/organisations/api'>web application programming interface (API)</a> with an <a href='/organisations/search'>intuitive user interface</a> on top. Multiple options for querying the data are supported.</p><p>The source data sets for this service are the <a href=\"http://sigel.staatsbibliothek-berlin.de\">German ISIL registry</a> and the base data from the <a href=\"https://www.hbz-nrw.de/produkte/bibliotheksstatistik\">Library Statistics</a>.</p>"
},
"keywords": [
"libraries",
Expand Down
26 changes: 26 additions & 0 deletions test/index/TestBadDocuments.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/* Copyright 2023, hbz. Licensed under the EPL 2.0 */

package index;

import java.io.IOException;

import controllers.Index;
import org.elasticsearch.ElasticsearchException;
import org.junit.Test;

import static org.junit.Assert.assertTrue;

@SuppressWarnings("javadoc")
public class TestBadDocuments {

	/**
	 * Indexes a fixture file whose document is marked as corrupt and checks
	 * that {@link Index#initialize(String)} reports the problem with an
	 * {@link ElasticsearchException} instead of finishing silently.
	 */
	@Test
	public void logIndexFailure() {
		System.setProperty("config.resource", "test.conf");
		Exception thrown = null;
		try {
			Index.initialize("test/index/corruptDocument.json");
		} catch (ElasticsearchException | IOException e) {
			thrown = e;
		}
		// Assert outside the catch block: the test must fail when no exception
		// is thrown at all, and when something other than a plain
		// ElasticsearchException (e.g. an IOException) is thrown.
		assertTrue(
				"Expected an ElasticsearchException for the corrupt document but got: "
						+ thrown,
				thrown != null
						&& ElasticsearchException.class.equals(thrown.getClass()));
	}
}
2 changes: 2 additions & 0 deletions test/index/corruptDocument.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"index":{"_index":"organisations","_type":"organisation","_id":""}}
{"rs":"130750039039","type":{"corrupt":true},"classification":{"id":"http://purl.org/lobid/libtype#n60","type":"Concept","label":{"de":"Zentrale Universitätsbibliothek","en":"Central University Library"}},"@context":"http://lobid.org/organisations/context.jsonld","url":"http://www.uni-greifswald.de/bibliothek/html","provides":"http://www.ub.uni-greifswald.de:2324/","name":"Universitätsbibliothek Greifswald","containedIn":"http://sws.geonames.org/6551180/","location":[{"type":"Place","address":{"postalCode":"17489","streetAddress":"Felix-Hausdorff-Str. 10","addressLocality":"Greifswald","addressCountry":"DE","type":"PostalAddress"},"openingHoursSpecification":{"description":"Mo-Fr.: 8-24, Sa: 9-24 Uhr"}}],"id":"http://lobid.org/organisations/DE-9#!","isil":"DE-9","fundertype":{"id":"http://purl.org/lobid/fundertype#n02","type":"Concept","label":{"de":"Land","en":"Federal State"}},"collects":{"type":"Collection","extent":{"id":"http://purl.org/lobid/stocksize#n10","type":"Concept","label":{"de":"1.000.001 und mehr","en":"1,000,001 and more"}}},"dbsID":"AA009","sameAs":["http://www.wikidata.org/entity/Q2496314","http://ld.zdb-services.de/resource/organisations/DE-9"]}

0 comments on commit cf18296

Please sign in to comment.