Skip to content

Commit

Permalink
TIKA-3304 (#420)
Browse files Browse the repository at this point in the history
* fix NPE when OCR is available and fix incorrect mark offset exception

* fix indentation

* Avoid reporting of temporary ocr-based mime type in xhtml output

* TIKA-3304 -- experimental... WIP -- not to be merged

* allow instance credentials in s3 components

* WIP -- do not merge...allow instance credentials in s3 components

* WIP -- do not merge...still a bunch to do

* WIP -- do not merge...still a bunch to do

* merge from main and required updates/conflict resolution

* TIKA-3304 -- basically works...lots more remains

* checkstyle
  • Loading branch information
tballison authored Mar 26, 2021
1 parent d87ac65 commit e47c6cd
Show file tree
Hide file tree
Showing 78 changed files with 4,139 additions and 858 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
.svn
target
dependency-reduced-pom.xml
.editorconfig
.idea
.classpath
.project
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@
<modules>
<module>tika-parent</module>
<module>tika-core</module>
<module>tika-serialization</module>
<module>tika-pipes</module>
<module>tika-parsers</module>
<module>tika-bundles</module>
<module>tika-xmp</module>
<module>tika-serialization</module>
<module>tika-batch</module>
<module>tika-langdetect</module>
<module>tika-app</module>
Expand Down
188 changes: 96 additions & 92 deletions tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DelegatingParser;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
Expand Down Expand Up @@ -99,7 +100,7 @@ public class TikaGUI extends JFrame
implements ActionListener, HyperlinkListener {

//maximum length to allow for mark for reparse to get JSON
private static final int MAX_MARK = 20*1024*1024;//20MB
private static final int MAX_MARK = 20 * 1024 * 1024;//20MB

/**
* Serial version UID.
Expand Down Expand Up @@ -129,7 +130,7 @@ public void run() {
new CommonsDigester(MAX_MARK,
CommonsDigester.DigestAlgorithm.MD5,
CommonsDigester.DigestAlgorithm.SHA256)
)).setVisible(true);
)).setVisible(true);
}
});
}
Expand All @@ -143,7 +144,7 @@ public void run() {
* Configured parser instance.
*/
private final Parser parser;

/**
* Captures requested embedded images
*/
Expand Down Expand Up @@ -173,7 +174,7 @@ public void run() {
* Main content output.
*/
private final JEditorPane textMain;

/**
* Raw XHTML source.
*/
Expand Down Expand Up @@ -341,14 +342,13 @@ private void handleStream(InputStream input, Metadata md)

context.set(DocumentSelector.class, new ImageDocumentSelector());

input = TikaInputStream.get(new ProgressMonitorInputStream(
this, "Parsing stream", input));
input = TikaInputStream.get(input);

if (input.markSupported()) {
int mark = -1;
if (input instanceof TikaInputStream) {
if (((TikaInputStream)input).hasFile()) {
mark = (int)((TikaInputStream)input).getLength();
if (((TikaInputStream) input).hasFile()) {
mark = (int) ((TikaInputStream) input).getLength();
}
}
if (mark == -1) {
Expand Down Expand Up @@ -391,8 +391,8 @@ private void handleStream(InputStream input, Metadata md)
input.reset();
isReset = true;
} catch (IOException e) {
setText(json, "Error during stream reset.\n"+
"There's a limit of "+MAX_MARK + " bytes for this type of processing in the GUI.\n"+
setText(json, "Error during stream reset.\n" +
"There's a limit of " + MAX_MARK + " bytes for this type of processing in the GUI.\n" +
"Try the app with command line argument of -J."
);
}
Expand Down Expand Up @@ -420,7 +420,7 @@ private void handleError(String name, Throwable t) {
t.printStackTrace(new PrintWriter(writer));

JEditorPane editor =
new JEditorPane("text/plain", writer.toString());
new JEditorPane("text/plain", writer.toString());
editor.setEditable(false);
editor.setBackground(Color.WHITE);
editor.setCaretPosition(0);
Expand All @@ -435,7 +435,7 @@ private void handleError(String name, Throwable t) {
private void addWelcomeCard(JPanel panel, String name) {
try {
JEditorPane editor =
new JEditorPane(TikaGUI.class.getResource("welcome.html"));
new JEditorPane(TikaGUI.class.getResource("welcome.html"));
editor.setContentType("text/html");
editor.setEditable(false);
editor.setBackground(Color.WHITE);
Expand Down Expand Up @@ -480,7 +480,7 @@ public void hyperlinkUpdate(HyperlinkEvent e) {
URL url = e.getURL();
try (InputStream stream = url.openStream()) {
JEditorPane editor =
new JEditorPane("text/plain", IOUtils.toString(stream, UTF_8));
new JEditorPane("text/plain", IOUtils.toString(stream, UTF_8));
editor.setEditable(false);
editor.setBackground(Color.WHITE);
editor.setCaretPosition(0);
Expand Down Expand Up @@ -519,7 +519,7 @@ private void setText(JEditorPane editor, String text) {
* {@link JEditorPane} fail thinking that the document character set
* is inconsistent.
* <p>
* Additionally, it will use ImageSavingParser to re-write embedded:(image)
* Additionally, it will use ImageSavingParser to re-write embedded:(image)
* image links to be file:///(temporary file) so that they can be loaded.
*
* @param writer output writer
Expand All @@ -529,7 +529,7 @@ private void setText(JEditorPane editor, String text) {
private ContentHandler getHtmlHandler(Writer writer)
throws TransformerConfigurationException {
SAXTransformerFactory factory = (SAXTransformerFactory)
SAXTransformerFactory.newInstance();
SAXTransformerFactory.newInstance();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
handler.setResult(new StreamResult(writer));
Expand All @@ -542,36 +542,37 @@ public void startElement(
uri = null;
}
if (!"head".equals(localName)) {
if("img".equals(localName)) {
AttributesImpl newAttrs;
if(atts instanceof AttributesImpl) {
newAttrs = (AttributesImpl)atts;
} else {
newAttrs = new AttributesImpl(atts);
}

for(int i=0; i<newAttrs.getLength(); i++) {
if("src".equals(newAttrs.getLocalName(i))) {
String src = newAttrs.getValue(i);
if(src.startsWith("embedded:")) {
String filename = src.substring(src.indexOf(':')+1);
try {
File img = imageParser.requestSave(filename);
String newSrc = img.toURI().toString();
newAttrs.setValue(i, newSrc);
} catch(IOException e) {
System.err.println("Error creating temp image file " + filename);
// The html viewer will show a broken image too to alert them
if ("img".equals(localName)) {
AttributesImpl newAttrs;
if (atts instanceof AttributesImpl) {
newAttrs = (AttributesImpl) atts;
} else {
newAttrs = new AttributesImpl(atts);
}

for (int i = 0; i < newAttrs.getLength(); i++) {
if ("src".equals(newAttrs.getLocalName(i))) {
String src = newAttrs.getValue(i);
if (src.startsWith("embedded:")) {
String filename = src.substring(src.indexOf(':') + 1);
try {
File img = imageParser.requestSave(filename);
String newSrc = img.toURI().toString();
newAttrs.setValue(i, newSrc);
} catch (IOException e) {
System.err.println("Error creating temp image file " + filename);
// The html viewer will show a broken image too to alert them
}
}
}
}
}
super.startElement(uri, localName, name, newAttrs);
}
}
super.startElement(uri, localName, name, newAttrs);
} else {
super.startElement(uri, localName, name, atts);
super.startElement(uri, localName, name, atts);
}
}
}

@Override
public void endElement(String uri, String localName, String name)
throws SAXException {
Expand All @@ -582,9 +583,11 @@ public void endElement(String uri, String localName, String name)
super.endElement(uri, localName, name);
}
}

/**
 * Intentionally empty: this override does nothing with the prefix mapping
 * and does not call the superclass implementation.
 *
 * @param prefix the namespace prefix being declared (ignored)
 * @param uri    the namespace URI the prefix maps to (ignored)
 */
@Override
public void startPrefixMapping(String prefix, String uri) {
}

/**
 * Intentionally empty: this override does nothing when a prefix mapping
 * ends and does not call the superclass implementation.
 *
 * @param prefix the namespace prefix going out of scope (ignored)
 */
@Override
public void endPrefixMapping(String prefix) {
}
Expand All @@ -594,14 +597,15 @@ public void endPrefixMapping(String prefix) {
/**
 * Creates a {@link BodyContentHandler} that writes to the given writer.
 *
 * @param writer output writer
 * @return a new BodyContentHandler wrapping the writer
 */
private ContentHandler getTextContentHandler(Writer writer) {
return new BodyContentHandler(writer);
}

/**
 * Creates a {@link BoilerpipeContentHandler} that writes to the given writer.
 *
 * @param writer output writer
 * @return a new BoilerpipeContentHandler wrapping the writer
 */
private ContentHandler getTextMainContentHandler(Writer writer) {
return new BoilerpipeContentHandler(writer);
}

private ContentHandler getXmlContentHandler(Writer writer)
throws TransformerConfigurationException {
SAXTransformerFactory factory = (SAXTransformerFactory)
SAXTransformerFactory.newInstance();
SAXTransformerFactory.newInstance();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
handler.setResult(new StreamResult(writer));
Expand All @@ -612,63 +616,63 @@ private ContentHandler getXmlContentHandler(Writer writer)
* A {@link DocumentSelector} that accepts only images.
*/
private static class ImageDocumentSelector implements DocumentSelector {
public boolean select(Metadata metadata) {
String type = metadata.get(Metadata.CONTENT_TYPE);
return type != null && type.startsWith("image/");
}
/**
 * Accepts a document only when its declared content type is an image type.
 *
 * @param metadata metadata of the candidate document
 * @return {@code true} if the content type is present and starts with
 *         "image/", {@code false} otherwise
 */
public boolean select(Metadata metadata) {
    final String contentType = metadata.get(Metadata.CONTENT_TYPE);
    if (contentType == null) {
        return false;
    }
    return contentType.startsWith("image/");
}
}

/**
* A recursive parser that saves certain images into the temporary
* directory, and delegates everything else to another downstream
* parser.
* directory, and delegates everything else to another downstream
* parser.
*/
private static class ImageSavingParser extends AbstractParser {
private Map<String,File> wanted = new HashMap<String,File>();
private Parser downstreamParser;
private File tmpDir;

private ImageSavingParser(Parser downstreamParser) {
this.downstreamParser = downstreamParser;

try {
File t = File.createTempFile("tika", ".test");
tmpDir = t.getParentFile();
} catch(IOException e) {}
}

public File requestSave(String embeddedName) throws IOException {
String suffix = ".tika";

int splitAt = embeddedName.lastIndexOf('.');
if (splitAt > 0) {
embeddedName.substring(splitAt);
}

File tmp = File.createTempFile("tika-embedded-", suffix);
wanted.put(embeddedName, tmp);
return tmp;
}

public Set<MediaType> getSupportedTypes(ParseContext context) {
// Never used in an auto setup
return null;
}

public void parse(InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context) throws IOException,
SAXException, TikaException {
String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
if(name != null && wanted.containsKey(name)) {
FileOutputStream out = new FileOutputStream(wanted.get(name));
IOUtils.copy(stream, out);
out.close();
} else {
if(downstreamParser != null) {
downstreamParser.parse(stream, handler, metadata, context);
private Map<String, File> wanted = new HashMap<String, File>();
private Parser downstreamParser;
private File tmpDir;

/**
 * Creates a parser that forwards to the given downstream parser and
 * records the directory in which temporary files are created.
 *
 * @param downstreamParser parser to delegate to
 */
private ImageSavingParser(Parser downstreamParser) {
    this.downstreamParser = downstreamParser;

    try {
        // A probe file is the only way used here to discover the temp
        // directory; remove it again so it does not accumulate on disk.
        File probe = File.createTempFile("tika", ".test");
        tmpDir = probe.getParentFile();
        if (!probe.delete()) {
            probe.deleteOnExit();
        }
    } catch (IOException ignored) {
        // Best effort only: tmpDir stays unset if no temp file can be created.
    }
}
}
}

/**
 * Reserves a temporary file for an embedded image and remembers that the
 * image should be written there when it is parsed.
 * <p>
 * The temporary file keeps the extension of the embedded resource when
 * that extension is short and purely alphanumeric; otherwise the default
 * suffix ".tika" is used.
 *
 * @param embeddedName name of the embedded resource to save
 * @return the temporary file reserved for the image
 * @throws IOException if the temporary file cannot be created
 */
public File requestSave(String embeddedName) throws IOException {
    String suffix = ".tika";

    int splitAt = embeddedName.lastIndexOf('.');
    if (splitAt > 0) {
        // Previously the substring() result was thrown away, so the
        // extension was never applied to the temporary file.
        String extension = embeddedName.substring(splitAt);
        if (isSafeSuffix(extension)) {
            suffix = extension;
        }
    }

    File tmp = File.createTempFile("tika-embedded-", suffix);
    wanted.put(embeddedName, tmp);
    return tmp;
}

/**
 * Checks that an extension (including its leading dot) is short and made
 * of ASCII letters and digits only. The name originates from the parsed
 * document, so path separators and other special characters must not end
 * up in the temporary file name.
 */
private static boolean isSafeSuffix(String extension) {
    if (extension.length() < 2 || extension.length() > 16) {
        return false;
    }
    for (int i = 1; i < extension.length(); i++) {
        char c = extension.charAt(i);
        boolean alphanumeric = (c >= 'a' && c <= 'z')
                || (c >= 'A' && c <= 'Z')
                || (c >= '0' && c <= '9');
        if (!alphanumeric) {
            return false;
        }
    }
    return true;
}

/**
 * Reports the media types supported by the downstream parser.
 *
 * @param context parse context passed through to the downstream parser
 * @return the downstream parser's supported types, or an empty set when
 *         no downstream parser is configured
 */
public Set<MediaType> getSupportedTypes(ParseContext context) {
    // parse() tolerates a missing downstream parser, so do the same here
    // instead of failing with a NullPointerException.
    if (downstreamParser == null) {
        return java.util.Collections.<MediaType>emptySet();
    }
    return downstreamParser.getSupportedTypes(context);
}

/**
 * Saves the stream to its reserved temporary file when the resource name
 * in the metadata was previously requested via requestSave; otherwise
 * delegates to the downstream parser, if there is one.
 *
 * @param stream   content of the document
 * @param handler  handler passed on to the downstream parser
 * @param metadata document metadata; the resource name is read from it
 * @param context  parse context passed on to the downstream parser
 * @throws IOException   if the image cannot be written or the downstream
 *                       parser fails to read the stream
 * @throws SAXException  if thrown by the downstream parser
 * @throws TikaException if thrown by the downstream parser
 */
public void parse(InputStream stream, ContentHandler handler,
                  Metadata metadata, ParseContext context) throws IOException,
        SAXException, TikaException {
    String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
    if (name != null && wanted.containsKey(name)) {
        // try-with-resources closes the file even if the copy fails
        try (FileOutputStream out = new FileOutputStream(wanted.get(name))) {
            IOUtils.copy(stream, out);
        }
    } else if (downstreamParser != null) {
        downstreamParser.parse(stream, handler, metadata, context);
    }
}

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -467,7 +467,7 @@ private static ServiceLoader serviceLoaderFromDomElement(Element element, ClassL

if (serviceLoaderElement != null) {
boolean dynamic = Boolean.parseBoolean(serviceLoaderElement.getAttribute("dynamic"));
LoadErrorHandler loadErrorHandler = LoadErrorHandler.IGNORE;
LoadErrorHandler loadErrorHandler = LoadErrorHandler.THROW;
String loadErrorHandleConfig = serviceLoaderElement.getAttribute("loadErrorHandler");
if (LoadErrorHandler.WARN.toString().equalsIgnoreCase(loadErrorHandleConfig)) {
loadErrorHandler = LoadErrorHandler.WARN;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.exception;

/**
 * Runtime/unchecked version of {@link java.util.concurrent.TimeoutException}.
 */
public class TikaTimeoutException extends RuntimeException {

    /**
     * Creates a timeout exception with the given detail message.
     *
     * @param message description of what timed out
     */
    public TikaTimeoutException(String message) {
        super(message);
    }

    /**
     * Creates a timeout exception that preserves the underlying cause,
     * for example a checked {@link java.util.concurrent.TimeoutException}.
     *
     * @param message description of what timed out
     * @param cause   the exception that triggered this one; may be null
     */
    public TikaTimeoutException(String message, Throwable cause) {
        super(message, cause);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ public void setName(String name) {
* @throws TikaEmitterException
*/
@Override
public void emit(List<EmitData> emitData) throws IOException, TikaEmitterException {
public void emit(List<? extends EmitData> emitData) throws IOException, TikaEmitterException {
for (EmitData d : emitData) {
emit(d.getEmitKey().getKey(), d.getMetadataList());
emit(d.getEmitKey().getEmitKey(), d.getMetadataList());
}
}
}
Loading

0 comments on commit e47c6cd

Please sign in to comment.