diff --git a/app/controllers/Index.java b/app/controllers/Index.java index b9788888..665beeca 100644 --- a/app/controllers/Index.java +++ b/app/controllers/Index.java @@ -4,11 +4,13 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import org.elasticsearch.ElasticsearchException; import org.elasticsearch.Version; import org.elasticsearch.action.admin.indices.create.CreateIndexRequestBuilder; import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest; import org.elasticsearch.action.admin.indices.refresh.RefreshRequest; import org.elasticsearch.action.bulk.BulkRequestBuilder; +import org.elasticsearch.action.bulk.BulkResponse; import org.elasticsearch.action.get.GetResponse; import org.elasticsearch.action.search.SearchRequestBuilder; import org.elasticsearch.action.search.SearchResponse; @@ -106,17 +108,20 @@ public static Result start() throws IOException { /** * @param pathToJson Path to the JSON file to index - * @throws IOException if json file cannot be found + * @throws IOException if json file cannot be found + * @throws IllegalArgumentException if the file to be indexed is smaller than the configured minimum size + * @throws ElasticsearchException if bulk indexing fails */ - public static void initialize(String pathToJson) throws IOException { + public static void initialize(String pathToJson) throws IOException, ElasticsearchException { long minimumSize = Long.parseLong(Application.CONFIG.getString("index.file.minsize")); if (new File(pathToJson).length() >= minimumSize) { createEmptyIndex(); - indexData(pathToJson); - } else { + indexData(CLIENT, pathToJson, INDEX_NAME); + } + else { throw new IllegalArgumentException( - "File not large enough: " + pathToJson); + "File " + pathToJson + " is not large enough - should be >='" + minimumSize + "' but is " + new File(pathToJson).length()); } } @@ -195,19 +200,22 @@ static void createEmptyIndex() throws IOException { Index.CLIENT.admin().indices().refresh(new 
RefreshRequest()).actionGet(); } - static void indexData(final String aPath) throws IOException { - final BulkRequestBuilder bulkRequest = Index.CLIENT.prepareBulk(); + static void indexData(final Client aClient, final String aPath, final String aIndex) throws IOException, ElasticsearchException { + final BulkRequestBuilder bulkRequest = aClient.prepareBulk(); try (BufferedReader br = new BufferedReader(new InputStreamReader(Files.newInputStream(Paths.get(aPath)), StandardCharsets.UTF_8))) { - readData(bulkRequest, br); + readData(bulkRequest, br, aClient, aIndex); } - bulkRequest.execute().actionGet(); - Index.CLIENT.admin().indices().refresh(new RefreshRequest()).actionGet(); + BulkResponse bulkResponse = bulkRequest.execute().actionGet(); + if (bulkResponse.hasFailures()) { + throw new ElasticsearchException("Bulk insert failed: " + bulkResponse.buildFailureMessage()); + } + aClient.admin().indices().refresh(new RefreshRequest()).actionGet(); } private static void readData(final BulkRequestBuilder bulkRequest, - final BufferedReader br) + final BufferedReader br, final Client client, final String aIndex) throws IOException { final ObjectMapper mapper = new ObjectMapper(); String line; diff --git a/conf/dataset.jsonld b/conf/dataset.jsonld index bc994513..ee821109 100644 --- a/conf/dataset.jsonld +++ b/conf/dataset.jsonld @@ -34,8 +34,8 @@ "en": "Memory institutions in German-speaking countries" }, "description": { - "de": "

lobid-organisations ist ein umfassendes Verzeichnis von über 20.000 Gedächtnisinstitutionen (Bibliotheken, Archiven und Museen) im deutschsprachigen Raum.

Die Daten werden in einem strukturierten Format (JSON-LD) über eine webbasierte Programmierschnittstelle (API) mit einer intuitiven Benutzeroberfläche bereitgestellt. Vielfältige Möglichkeiten der Datenabfrage werden unterstützt.

Die Datenquellen dieses Dienstes sind das Deutsche ISIL-Verzeichnis und die Stammdaten der Deutschen Bibliotheksstatistik (DBS).

", - "en": "

lobid-organisations is a comprehensive directory of over 20,000 memory institutions (libraries, archives and museums) in Germany, Austria, and Switzerland.

The data is provided as structured data (JSON-LD) via a web application programming interface (API) with an intuitive user interface on top. Multiple options for querying the data are supported.

The source data sets for this service are the German ISIL registry and the base data from the German Library Statistics (DBS).

" + "de": "

lobid-organisations ist ein umfassendes Verzeichnis von über 20.000 Gedächtnisinstitutionen (Bibliotheken, Archiven und Museen) im deutschsprachigen Raum.

Die Daten werden in einem strukturierten Format (JSON-LD) über eine webbasierte Programmierschnittstelle (API) mit einer intuitiven Benutzeroberfläche bereitgestellt. Vielfältige Möglichkeiten der Datenabfrage werden unterstützt.

Die Datenquellen dieses Dienstes sind das Deutsche ISIL-Verzeichnis und die Stammdaten der Bibliotheksstatistik.

", + "en": "

lobid-organisations is a comprehensive directory of over 20,000 memory institutions (libraries, archives and museums) in Germany, Austria, and Switzerland.

The data is provided as structured data (JSON-LD) via a web application programming interface (API) with an intuitive user interface on top. Multiple options for querying the data are supported.

The source data sets for this service are the German ISIL registry and the base data from the Library Statistics.

" }, "keywords": [ "libraries", diff --git a/test/index/TestBadDocuments.java b/test/index/TestBadDocuments.java new file mode 100644 index 00000000..dda30a79 --- /dev/null +++ b/test/index/TestBadDocuments.java @@ -0,0 +1,28 @@ +/* Copyright 2023, hbz. Licensed under the EPL 2.0 */ + +package index; + +import java.io.IOException; + +import controllers.Index; +import org.elasticsearch.ElasticsearchException; +import org.junit.Test; + +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +@SuppressWarnings("javadoc") +public class TestBadDocuments { + + @Test + public void logIndexFailure() { + System.setProperty("config.resource", "test.conf"); + try { + Index.initialize("test/index/corruptDocument.json"); + fail("Expected an ElasticsearchException when indexing the corrupt document"); + } catch (ElasticsearchException | IOException e) { + // Compare class names with equals(): == only checks reference identity + assertTrue("org.elasticsearch.ElasticsearchException".equals(e.getClass().getName())); + } + } +} diff --git a/test/index/corruptDocument.json b/test/index/corruptDocument.json new file mode 100644 index 00000000..49744ff5 --- /dev/null +++ b/test/index/corruptDocument.json @@ -0,0 +1,2 @@ +{"index":{"_index":"organisations","_type":"organisation","_id":""}} +{"rs":"130750039039","type":{"corrupt":true},"classification":{"id":"http://purl.org/lobid/libtype#n60","type":"Concept","label":{"de":"Zentrale Universitätsbibliothek","en":"Central University Library"}},"@context":"http://lobid.org/organisations/context.jsonld","url":"http://www.uni-greifswald.de/bibliothek/html","provides":"http://www.ub.uni-greifswald.de:2324/","name":"Universitätsbibliothek Greifswald","containedIn":"http://sws.geonames.org/6551180/","location":[{"type":"Place","address":{"postalCode":"17489","streetAddress":"Felix-Hausdorff-Str. 
10","addressLocality":"Greifswald","addressCountry":"DE","type":"PostalAddress"},"openingHoursSpecification":{"description":"Mo-Fr.: 8-24, Sa: 9-24 Uhr"}}],"id":"http://lobid.org/organisations/DE-9#!","isil":"DE-9","fundertype":{"id":"http://purl.org/lobid/fundertype#n02","type":"Concept","label":{"de":"Land","en":"Federal State"}},"collects":{"type":"Collection","extent":{"id":"http://purl.org/lobid/stocksize#n10","type":"Concept","label":{"de":"1.000.001 und mehr","en":"1,000,001 and more"}}},"dbsID":"AA009","sameAs":["http://www.wikidata.org/entity/Q2496314","http://ld.zdb-services.de/resource/organisations/DE-9"]}