Skip to content

Commit

Permalink
Added support for post-processing in getText method by introducing a …
Browse files Browse the repository at this point in the history
…BiConsumer parameter.
  • Loading branch information
marevol committed Jun 18, 2024
1 parent e3fcc5d commit 5dade31
Showing 1 changed file with 26 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.BiConsumer;
import java.util.function.BiFunction;
import java.util.stream.Collectors;

import javax.annotation.PostConstruct;
Expand Down Expand Up @@ -144,21 +146,29 @@ public void init() {

/**
 * Extracts text and metadata from the given stream without any post-processing.
 *
 * <p>This is a convenience overload: it forwards to the three-argument
 * {@code getText} with a {@code null} post filter, so the post-filter step
 * of that method is not executed.</p>
 *
 * @param inputStream the content to extract from; the three-argument overload
 *                    rejects {@code null} with a {@code CrawlerSystemException}
 * @param params      extraction parameters, passed through unchanged
 * @return the extraction result produced by the three-argument overload
 */
@Override
public ExtractData getText(final InputStream inputStream, final Map<String, String> params) {
    // A typed null makes it explicit that no post filter is requested.
    final BiConsumer<ExtractData, InputStream> noPostFilter = null;
    return getText(inputStream, params, noPostFilter);
}

protected ExtractData getText(final InputStream inputStream, final Map<String, String> params,
final BiConsumer<ExtractData, InputStream> postFilter) {
if (inputStream == null) {
throw new CrawlerSystemException("The inputstream is null.");
}

final long contentLength;
final File tempFile;
final boolean isByteStream = inputStream instanceof ByteArrayInputStream;
if (isByteStream) {
inputStream.mark(0);
tempFile = null;
contentLength = (long) ((ByteArrayInputStream) inputStream).available();
} else {
try {
tempFile = File.createTempFile("tikaExtractor-", ".out");
} catch (final IOException e) {
throw new ExtractException("Could not create a temp file.", e);
}
contentLength = tempFile.length();
}

try {
Expand Down Expand Up @@ -284,13 +294,29 @@ public ExtractData getText(final InputStream inputStream, final Map<String, Stri
}
}
final ExtractData extractData = new ExtractData(content);
extractData.putValue("Content-Length", Long.toString(contentLength));

final String[] names = metadata.names();
Arrays.sort(names);
for (final String name : names) {
extractData.putValues(name, metadata.getValues(name));
}

if (postFilter != null) {
InputStream in = null;
try {
if (isByteStream) {
inputStream.reset();
in = inputStream;
} else {
in = new FileInputStream(tempFile);
}
postFilter.accept(extractData, inputStream);
} finally {
CloseableUtil.closeQuietly(in);
}
}

if (logger.isDebugEnabled()) {
logger.debug("Result: metadata: {}", metadata);
}
Expand Down

0 comments on commit 5dade31

Please sign in to comment.