diff --git a/dspace-api/src/main/java/org/dspace/app/mediafilter/TikaTextExtractionFilter.java b/dspace-api/src/main/java/org/dspace/app/mediafilter/TikaTextExtractionFilter.java index e83bf706ed0..17e7b85e9bf 100644 --- a/dspace-api/src/main/java/org/dspace/app/mediafilter/TikaTextExtractionFilter.java +++ b/dspace-api/src/main/java/org/dspace/app/mediafilter/TikaTextExtractionFilter.java @@ -18,6 +18,7 @@ import org.apache.commons.lang.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.apache.poi.util.IOUtils; import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -72,21 +73,23 @@ public InputStream getDestinationStream(Item currentItem, InputStream source, bo // Not using temporary file. We'll use Tika's default in-memory parsing. // Get maximum characters to extract. Default is 100,000 chars, which is also Tika's default setting. String extractedText; - int maxChars = configurationService.getIntProperty("textextractor.max-chars", 100000); + int maxChars = configurationService.getIntProperty("textextractor.max-chars", 100_000); try { // Use Tika to extract text from input. Tika will automatically detect the file type. Tika tika = new Tika(); tika.setMaxStringLength(maxChars); // Tell Tika the maximum number of characters to extract + IOUtils.setByteArrayMaxOverride( + configurationService.getIntProperty("textextractor.max-array", 100_000_000)); extractedText = tika.parseToString(source); } catch (IOException e) { System.err.format("Unable to extract text from bitstream in Item %s%n", currentItem.getID().toString()); - e.printStackTrace(); + e.printStackTrace(System.err); log.error("Unable to extract text from bitstream in Item {}", currentItem.getID().toString(), e); throw e; } catch (OutOfMemoryError oe) { System.err.format("OutOfMemoryError occurred when extracting text from bitstream in Item %s. " + "You may wish to enable 'textextractor.use-temp-file'.%n", currentItem.getID().toString()); - oe.printStackTrace(); + oe.printStackTrace(System.err); log.error("OutOfMemoryError occurred when extracting text from bitstream in Item {}. " + "You may wish to enable 'textextractor.use-temp-file'.", currentItem.getID().toString(), oe); throw oe; diff --git a/dspace/config/dspace.cfg b/dspace/config/dspace.cfg index 0d8086751d0..675e2480c4b 100644 --- a/dspace/config/dspace.cfg +++ b/dspace/config/dspace.cfg @@ -519,6 +519,13 @@ filter.org.dspace.app.mediafilter.PDFBoxThumbnail.inputFormats = Adobe PDF # text ("filter-media -f" ) and then reindex your site ("index-discovery -b"). #textextractor.use-temp-file = false +# Maximum size of a record buffer for text extraction. Set this if you are +# seeing RecordFormatException calling out excessive array length from +# 'dspace filter-media'. It is likely that you will need to increase the +# size of the Java heap if you greatly increase this value -- see JAVA_OPTS +# in 'bin/dspace' or 'bin/dspace/bat'. +#textextractor.max-array = 100000000 + # Custom settings for ImageMagick Thumbnail Filters # ImageMagick and GhostScript must be installed on the server, set the path to ImageMagick and GhostScript executable # http://www.imagemagick.org/