Skip to content

Commit

Permalink
Fix full-text indexing for files over the character limit
Browse files Browse the repository at this point in the history
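When a file exceeded the configured character limit, the SAXException handler logged the correct message, but the extracted full text (and the Tika metadata) was never actually added to the index doc, because those fields were only added inside the try block after a successful parse.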

(cherry picked from commit 4a4a8bc)
  • Loading branch information
bkeese authored and github-actions[bot] committed Dec 19, 2024
1 parent 1abf89f commit fc4cf8f
Showing 1 changed file with 15 additions and 16 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -118,20 +118,10 @@ protected void writeDocument(SolrInputDocument doc, FullTextContentStreams strea
ParseContext tikaContext = new ParseContext();

// Use Apache Tika to parse the full text stream(s)
boolean extractionSucceeded = false;
try (InputStream fullTextStreams = streams.getStream()) {
tikaParser.parse(fullTextStreams, tikaHandler, tikaMetadata, tikaContext);

// Write Tika metadata to "tika_meta_*" fields.
// This metadata is not very useful right now,
// but we'll keep it just in case it becomes more useful.
for (String name : tikaMetadata.names()) {
for (String value : tikaMetadata.getValues(name)) {
doc.addField("tika_meta_" + name, value);
}
}

// Save (parsed) full text to "fulltext" field
doc.addField("fulltext", tikaHandler.toString());
extractionSucceeded = true;
} catch (SAXException saxe) {
// Check if this SAXException is just a notice that this file was longer than the character limit.
// Unfortunately there is not a unique, public exception type to catch here. This error is thrown
Expand All @@ -141,18 +131,27 @@ protected void writeDocument(SolrInputDocument doc, FullTextContentStreams strea
// log that we only indexed up to that configured limit
log.info("Full text is larger than the configured limit (discovery.solr.fulltext.charLimit)."
+ " Only the first {} characters were indexed.", charLimit);
extractionSucceeded = true;
} else {
log.error("Tika parsing error. Could not index full text.", saxe);
throw new IOException("Tika parsing error. Could not index full text.", saxe);
}
} catch (TikaException | IOException ex) {
log.error("Tika parsing error. Could not index full text.", ex);
throw new IOException("Tika parsing error. Could not index full text.", ex);
} finally {
// Add document to index
solr.add(doc);
}
return;
if (extractionSucceeded) {
// Write Tika metadata to "tika_meta_*" fields.
// This metadata is not very useful right now,
// but we'll keep it just in case it becomes more useful.
for (String name : tikaMetadata.names()) {
for (String value : tikaMetadata.getValues(name)) {
doc.addField("tika_meta_" + name, value);
}
}
// Save (parsed) full text to "fulltext" field
doc.addField("fulltext", tikaHandler.toString());
}
}
// Add document to index
solr.add(doc);
Expand Down

0 comments on commit fc4cf8f

Please sign in to comment.