Skip to content

Commit

Permalink
Merge pull request DSpace#9733 from mwoodiupui/tika-array-limit
Browse files Browse the repository at this point in the history
filter-media:  make POI record buffer size adjustable.
  • Loading branch information
tdonohue authored Nov 14, 2024
2 parents 112f014 + a5a7af9 commit eb07d45
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import org.apache.commons.lang.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.poi.util.IOUtils;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
Expand Down Expand Up @@ -72,21 +73,23 @@ public InputStream getDestinationStream(Item currentItem, InputStream source, bo
// Not using temporary file. We'll use Tika's default in-memory parsing.
// Get maximum characters to extract. Default is 100,000 chars, which is also Tika's default setting.
String extractedText;
int maxChars = configurationService.getIntProperty("textextractor.max-chars", 100000);
int maxChars = configurationService.getIntProperty("textextractor.max-chars", 100_000);
try {
// Use Tika to extract text from input. Tika will automatically detect the file type.
Tika tika = new Tika();
tika.setMaxStringLength(maxChars); // Tell Tika the maximum number of characters to extract
IOUtils.setByteArrayMaxOverride(
configurationService.getIntProperty("textextractor.max-array", 100_000_000));
extractedText = tika.parseToString(source);
} catch (IOException e) {
System.err.format("Unable to extract text from bitstream in Item %s%n", currentItem.getID().toString());
e.printStackTrace();
e.printStackTrace(System.err);
log.error("Unable to extract text from bitstream in Item {}", currentItem.getID().toString(), e);
throw e;
} catch (OutOfMemoryError oe) {
System.err.format("OutOfMemoryError occurred when extracting text from bitstream in Item %s. " +
"You may wish to enable 'textextractor.use-temp-file'.%n", currentItem.getID().toString());
oe.printStackTrace();
oe.printStackTrace(System.err);
log.error("OutOfMemoryError occurred when extracting text from bitstream in Item {}. " +
"You may wish to enable 'textextractor.use-temp-file'.", currentItem.getID().toString(), oe);
throw oe;
Expand Down
7 changes: 7 additions & 0 deletions dspace/config/dspace.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,13 @@ filter.org.dspace.app.mediafilter.PDFBoxThumbnail.inputFormats = Adobe PDF
# text ("filter-media -f" ) and then reindex your site ("index-discovery -b").
#textextractor.use-temp-file = false

# Maximum size of a record buffer for text extraction. Set this if you are
# seeing RecordFormatException calling out excessive array length from
# 'dspace filter-media'. It is likely that you will need to increase the
# size of the Java heap if you greatly increase this value -- see JAVA_OPTS
# in 'bin/dspace' or 'bin/dspace.bat'.
#textextractor.max-array = 100000000

# Custom settings for ImageMagick Thumbnail Filters
# ImageMagick and GhostScript must be installed on the server, set the path to ImageMagick and GhostScript executable
# http://www.imagemagick.org/
Expand Down

0 comments on commit eb07d45

Please sign in to comment.