From fc4cf8f91f7464eeedf08839086981c56e2b40b4 Mon Sep 17 00:00:00 2001 From: Brian Keese Date: Tue, 15 Oct 2024 11:38:54 -0500 Subject: [PATCH] Fix full-text indexing for files over the character limit The error handler for files over the limit logged the correct message, but never actually added the full text to the index doc. (cherry picked from commit 4a4a8bcb22796e5b22c9bbb1796e3458b48f1c07) --- .../indexobject/IndexFactoryImpl.java | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/dspace-api/src/main/java/org/dspace/discovery/indexobject/IndexFactoryImpl.java b/dspace-api/src/main/java/org/dspace/discovery/indexobject/IndexFactoryImpl.java index f1ae137b9163..c9a865ec85b2 100644 --- a/dspace-api/src/main/java/org/dspace/discovery/indexobject/IndexFactoryImpl.java +++ b/dspace-api/src/main/java/org/dspace/discovery/indexobject/IndexFactoryImpl.java @@ -118,20 +118,10 @@ protected void writeDocument(SolrInputDocument doc, FullTextContentStreams strea ParseContext tikaContext = new ParseContext(); // Use Apache Tika to parse the full text stream(s) + boolean extractionSucceeded = false; try (InputStream fullTextStreams = streams.getStream()) { tikaParser.parse(fullTextStreams, tikaHandler, tikaMetadata, tikaContext); - - // Write Tika metadata to "tika_meta_*" fields. - // This metadata is not very useful right now, - // but we'll keep it just in case it becomes more useful. - for (String name : tikaMetadata.names()) { - for (String value : tikaMetadata.getValues(name)) { - doc.addField("tika_meta_" + name, value); - } - } - - // Save (parsed) full text to "fulltext" field - doc.addField("fulltext", tikaHandler.toString()); + extractionSucceeded = true; } catch (SAXException saxe) { // Check if this SAXException is just a notice that this file was longer than the character limit. // Unfortunately there is not a unique, public exception type to catch here. This error is thrown @@ -141,6 +131,7 @@ protected void writeDocument(SolrInputDocument doc, FullTextContentStreams strea // log that we only indexed up to that configured limit log.info("Full text is larger than the configured limit (discovery.solr.fulltext.charLimit)." + " Only the first {} characters were indexed.", charLimit); + extractionSucceeded = true; } else { log.error("Tika parsing error. Could not index full text.", saxe); throw new IOException("Tika parsing error. Could not index full text.", saxe); @@ -148,11 +139,19 @@ protected void writeDocument(SolrInputDocument doc, FullTextContentStreams strea } catch (TikaException | IOException ex) { log.error("Tika parsing error. Could not index full text.", ex); throw new IOException("Tika parsing error. Could not index full text.", ex); - } finally { - // Add document to index - solr.add(doc); } - return; + if (extractionSucceeded) { + // Write Tika metadata to "tika_meta_*" fields. + // This metadata is not very useful right now, + // but we'll keep it just in case it becomes more useful. + for (String name : tikaMetadata.names()) { + for (String value : tikaMetadata.getValues(name)) { + doc.addField("tika_meta_" + name, value); + } + } + // Save (parsed) full text to "fulltext" field + doc.addField("fulltext", tikaHandler.toString()); + } } // Add document to index solr.add(doc);