Merge pull request DSpace#9733 from mwoodiupui/tika-array-limit

filter-media: make POI record buffer size adjustable.
4Science · Nov 14, 2024 · eb07d45 · eb07d45
2 parents 112f014 + a5a7af9
commit eb07d45
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 3 deletions.
diff --git a/dspace-api/src/main/java/org/dspace/app/mediafilter/TikaTextExtractionFilter.java b/dspace-api/src/main/java/org/dspace/app/mediafilter/TikaTextExtractionFilter.java
@@ -18,6 +18,7 @@
 import org.apache.commons.lang.StringUtils;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
+import org.apache.poi.util.IOUtils;
 import org.apache.tika.Tika;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
@@ -72,21 +73,23 @@ public InputStream getDestinationStream(Item currentItem, InputStream source, bo
         // Not using temporary file. We'll use Tika's default in-memory parsing.
         // Get maximum characters to extract. Default is 100,000 chars, which is also Tika's default setting.
         String extractedText;
-        int maxChars = configurationService.getIntProperty("textextractor.max-chars", 100000);
+        int maxChars = configurationService.getIntProperty("textextractor.max-chars", 100_000);
         try {
             // Use Tika to extract text from input. Tika will automatically detect the file type.
             Tika tika = new Tika();
             tika.setMaxStringLength(maxChars); // Tell Tika the maximum number of characters to extract
+            IOUtils.setByteArrayMaxOverride(
+                    configurationService.getIntProperty("textextractor.max-array", 100_000_000));
             extractedText = tika.parseToString(source);
         } catch (IOException e) {
             System.err.format("Unable to extract text from bitstream in Item %s%n", currentItem.getID().toString());
-            e.printStackTrace();
+            e.printStackTrace(System.err);
             log.error("Unable to extract text from bitstream in Item {}", currentItem.getID().toString(), e);
             throw e;
         } catch (OutOfMemoryError oe) {
             System.err.format("OutOfMemoryError occurred when extracting text from bitstream in Item %s. " +
                 "You may wish to enable 'textextractor.use-temp-file'.%n", currentItem.getID().toString());
-            oe.printStackTrace();
+            oe.printStackTrace(System.err);
             log.error("OutOfMemoryError occurred when extracting text from bitstream in Item {}. " +
                           "You may wish to enable 'textextractor.use-temp-file'.", currentItem.getID().toString(), oe);
             throw oe;

diff --git a/dspace/config/dspace.cfg b/dspace/config/dspace.cfg
@@ -519,6 +519,13 @@ filter.org.dspace.app.mediafilter.PDFBoxThumbnail.inputFormats = Adobe PDF
 # text ("filter-media -f" ) and then reindex your site ("index-discovery -b").
 #textextractor.use-temp-file = false
 
+# Maximum size, in bytes, of a record buffer for text extraction.  Set this
+# if 'dspace filter-media' fails with a RecordFormatException reporting an
+# excessive array length.  If you greatly increase this value you will
+# probably also need to increase the size of the Java heap -- see JAVA_OPTS
+# in 'bin/dspace' or 'bin/dspace.bat'.
+#textextractor.max-array = 100000000
+
 # Custom settings for ImageMagick Thumbnail Filters
 # ImageMagick and GhostScript must be installed on the server, set the path to ImageMagick and GhostScript executable
 #   http://www.imagemagick.org/